Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/acl.c | 6
-rw-r--r--  fs/9p/acl.h | 4
-rw-r--r--  fs/9p/v9fs_vfs.h | 6
-rw-r--r--  fs/9p/vfs_file.c | 36
-rw-r--r--  fs/9p/vfs_inode.c | 139
-rw-r--r--  fs/9p/vfs_inode_dotl.c | 92
-rw-r--r--  fs/9p/vfs_super.c | 2
-rw-r--r--  fs/Kconfig | 15
-rw-r--r--  fs/anon_inodes.c | 2
-rw-r--r--  fs/autofs4/autofs_i.h | 26
-rw-r--r--  fs/autofs4/waitq.c | 2
-rw-r--r--  fs/befs/linuxvfs.c | 23
-rw-r--r--  fs/block_dev.c | 28
-rw-r--r--  fs/btrfs/Makefile | 4
-rw-r--r--  fs/btrfs/acl.c | 27
-rw-r--r--  fs/btrfs/btrfs_inode.h | 22
-rw-r--r--  fs/btrfs/compression.c | 14
-rw-r--r--  fs/btrfs/ctree.c | 457
-rw-r--r--  fs/btrfs/ctree.h | 54
-rw-r--r--  fs/btrfs/delayed-inode.c | 2
-rw-r--r--  fs/btrfs/delayed-inode.h | 2
-rw-r--r--  fs/btrfs/dir-item.c | 39
-rw-r--r--  fs/btrfs/disk-io.c | 116
-rw-r--r--  fs/btrfs/disk-io.h | 10
-rw-r--r--  fs/btrfs/extent-tree.c | 401
-rw-r--r--  fs/btrfs/extent_io.c | 309
-rw-r--r--  fs/btrfs/extent_io.h | 55
-rw-r--r--  fs/btrfs/extent_map.c | 155
-rw-r--r--  fs/btrfs/file-item.c | 50
-rw-r--r--  fs/btrfs/file.c | 76
-rw-r--r--  fs/btrfs/free-space-cache.c | 193
-rw-r--r--  fs/btrfs/inode.c | 259
-rw-r--r--  fs/btrfs/ioctl.c | 34
-rw-r--r--  fs/btrfs/locking.c | 280
-rw-r--r--  fs/btrfs/locking.h | 36
-rw-r--r--  fs/btrfs/ref-cache.c | 68
-rw-r--r--  fs/btrfs/ref-cache.h | 52
-rw-r--r--  fs/btrfs/relocation.c | 3
-rw-r--r--  fs/btrfs/root-tree.c | 5
-rw-r--r--  fs/btrfs/struct-funcs.c | 100
-rw-r--r--  fs/btrfs/transaction.c | 116
-rw-r--r--  fs/btrfs/tree-log.c | 46
-rw-r--r--  fs/btrfs/volumes.c | 65
-rw-r--r--  fs/btrfs/volumes.h | 2
-rw-r--r--  fs/btrfs/xattr.c | 73
-rw-r--r--  fs/ceph/debugfs.c | 2
-rw-r--r--  fs/ceph/dir.c | 116
-rw-r--r--  fs/ceph/export.c | 24
-rw-r--r--  fs/ceph/file.c | 61
-rw-r--r--  fs/ceph/inode.c | 48
-rw-r--r--  fs/ceph/ioctl.c | 15
-rw-r--r--  fs/ceph/ioctl.h | 1
-rw-r--r--  fs/ceph/mds_client.c | 58
-rw-r--r--  fs/ceph/mds_client.h | 3
-rw-r--r--  fs/ceph/snap.c | 25
-rw-r--r--  fs/ceph/super.c | 11
-rw-r--r--  fs/ceph/super.h | 20
-rw-r--r--  fs/ceph/xattr.c | 8
-rw-r--r--  fs/cifs/cifs_debug.c | 2
-rw-r--r--  fs/cifs/cifs_dfs_ref.c | 5
-rw-r--r--  fs/cifs/cifsacl.c | 28
-rw-r--r--  fs/cifs/cifsencrypt.c | 126
-rw-r--r--  fs/cifs/cifsfs.c | 22
-rw-r--r--  fs/cifs/cifsfs.h | 6
-rw-r--r--  fs/cifs/cifsglob.h | 60
-rw-r--r--  fs/cifs/cifssmb.c | 6
-rw-r--r--  fs/cifs/connect.c | 662
-rw-r--r--  fs/cifs/dir.c | 9
-rw-r--r--  fs/cifs/dns_resolve.c | 4
-rw-r--r--  fs/cifs/file.c | 27
-rw-r--r--  fs/cifs/inode.c | 14
-rw-r--r--  fs/cifs/link.c | 8
-rw-r--r--  fs/cifs/misc.c | 11
-rw-r--r--  fs/cifs/readdir.c | 427
-rw-r--r--  fs/cifs/smbencrypt.c | 8
-rw-r--r--  fs/cifs/transport.c | 53
-rw-r--r--  fs/compat.c | 5
-rw-r--r--  fs/compat_ioctl.c | 1
-rw-r--r--  fs/dcache.c | 83
-rw-r--r--  fs/direct-io.c | 2
-rw-r--r--  fs/ecryptfs/Kconfig | 2
-rw-r--r--  fs/ecryptfs/ecryptfs_kernel.h | 150
-rw-r--r--  fs/ecryptfs/inode.c | 1
-rw-r--r--  fs/ecryptfs/keystore.c | 62
-rw-r--r--  fs/ecryptfs/main.c | 23
-rw-r--r--  fs/ecryptfs/read_write.c | 18
-rw-r--r--  fs/eventpoll.c | 2
-rw-r--r--  fs/exec.c | 77
-rw-r--r--  fs/exofs/Kbuild | 5
-rw-r--r--  fs/exofs/Kconfig | 4
-rw-r--r--  fs/exofs/exofs.h | 159
-rw-r--r--  fs/exofs/inode.c | 152
-rw-r--r--  fs/exofs/ore.c (renamed from fs/exofs/ios.c) | 370
-rw-r--r--  fs/exofs/pnfs.h | 45
-rw-r--r--  fs/exofs/super.c | 251
-rw-r--r--  fs/ext2/acl.c | 8
-rw-r--r--  fs/ext2/acl.h | 1
-rw-r--r--  fs/ext2/xattr.c | 10
-rw-r--r--  fs/ext3/acl.c | 9
-rw-r--r--  fs/ext3/balloc.c | 38
-rw-r--r--  fs/ext3/file.c | 1
-rw-r--r--  fs/ext3/fsync.c | 11
-rw-r--r--  fs/ext3/ialloc.c | 4
-rw-r--r--  fs/ext3/inode.c | 193
-rw-r--r--  fs/ext3/ioctl.c | 4
-rw-r--r--  fs/ext3/namei.c | 13
-rw-r--r--  fs/ext3/super.c | 13
-rw-r--r--  fs/ext3/xattr.c | 12
-rw-r--r--  fs/ext4/Makefile | 2
-rw-r--r--  fs/ext4/acl.c | 9
-rw-r--r--  fs/ext4/balloc.c | 48
-rw-r--r--  fs/ext4/block_validity.c | 21
-rw-r--r--  fs/ext4/ext4.h | 56
-rw-r--r--  fs/ext4/ext4_jbd2.h | 4
-rw-r--r--  fs/ext4/extents.c | 129
-rw-r--r--  fs/ext4/fsync.c | 26
-rw-r--r--  fs/ext4/ialloc.c | 2
-rw-r--r--  fs/ext4/indirect.c | 1487
-rw-r--r--  fs/ext4/inode.c | 1623
-rw-r--r--  fs/ext4/ioctl.c | 12
-rw-r--r--  fs/ext4/mballoc.c | 230
-rw-r--r--  fs/ext4/mballoc.h | 1
-rw-r--r--  fs/ext4/namei.c | 27
-rw-r--r--  fs/ext4/page-io.c | 30
-rw-r--r--  fs/ext4/resize.c | 199
-rw-r--r--  fs/ext4/super.c | 89
-rw-r--r--  fs/ext4/truncate.h | 43
-rw-r--r--  fs/fat/dir.c | 2
-rw-r--r--  fs/fat/inode.c | 7
-rw-r--r--  fs/file_table.c | 2
-rw-r--r--  fs/fs-writeback.c | 378
-rw-r--r--  fs/fuse/dev.c | 16
-rw-r--r--  fs/fuse/file.c | 84
-rw-r--r--  fs/fuse/fuse_i.h | 8
-rw-r--r--  fs/fuse/inode.c | 13
-rw-r--r--  fs/generic_acl.c | 13
-rw-r--r--  fs/gfs2/acl.c | 6
-rw-r--r--  fs/gfs2/main.c | 2
-rw-r--r--  fs/gfs2/ops_fstype.c | 4
-rw-r--r--  fs/hppfs/hppfs.c | 1
-rw-r--r--  fs/hugetlbfs/inode.c | 1
-rw-r--r--  fs/inode.c | 82
-rw-r--r--  fs/jbd/checkpoint.c | 37
-rw-r--r--  fs/jbd/commit.c | 57
-rw-r--r--  fs/jbd/journal.c | 99
-rw-r--r--  fs/jbd/transaction.c | 83
-rw-r--r--  fs/jbd2/checkpoint.c | 5
-rw-r--r--  fs/jbd2/journal.c | 67
-rw-r--r--  fs/jffs2/acl.c | 4
-rw-r--r--  fs/jffs2/acl.h | 2
-rw-r--r--  fs/jffs2/fs.c | 4
-rw-r--r--  fs/jffs2/os-linux.h | 2
-rw-r--r--  fs/jfs/acl.c | 4
-rw-r--r--  fs/jfs/jfs_dmap.c | 5
-rw-r--r--  fs/jfs/jfs_txnmgr.c | 6
-rw-r--r--  fs/jfs/jfs_umount.c | 4
-rw-r--r--  fs/jfs/namei.c | 3
-rw-r--r--  fs/jfs/xattr.c | 4
-rw-r--r--  fs/lockd/clntproc.c | 9
-rw-r--r--  fs/namei.c | 118
-rw-r--r--  fs/nfs/Kconfig | 15
-rw-r--r--  fs/nfs/Makefile | 1
-rw-r--r--  fs/nfs/blocklayout/Makefile | 5
-rw-r--r--  fs/nfs/blocklayout/blocklayout.c | 1020
-rw-r--r--  fs/nfs/blocklayout/blocklayout.h | 207
-rw-r--r--  fs/nfs/blocklayout/blocklayoutdev.c | 410
-rw-r--r--  fs/nfs/blocklayout/blocklayoutdm.c | 111
-rw-r--r--  fs/nfs/blocklayout/extents.c | 935
-rw-r--r--  fs/nfs/cache_lib.h | 2
-rw-r--r--  fs/nfs/callback.h | 2
-rw-r--r--  fs/nfs/callback_proc.c | 82
-rw-r--r--  fs/nfs/callback_xdr.c | 24
-rw-r--r--  fs/nfs/client.c | 18
-rw-r--r--  fs/nfs/delegation.c | 16
-rw-r--r--  fs/nfs/dir.c | 57
-rw-r--r--  fs/nfs/direct.c | 2
-rw-r--r--  fs/nfs/internal.h | 13
-rw-r--r--  fs/nfs/namespace.c | 2
-rw-r--r--  fs/nfs/nfs3acl.c | 2
-rw-r--r--  fs/nfs/nfs3proc.c | 6
-rw-r--r--  fs/nfs/nfs4_fs.h | 7
-rw-r--r--  fs/nfs/nfs4filelayout.c | 82
-rw-r--r--  fs/nfs/nfs4filelayout.h | 17
-rw-r--r--  fs/nfs/nfs4filelayoutdev.c | 452
-rw-r--r--  fs/nfs/nfs4proc.c | 277
-rw-r--r--  fs/nfs/nfs4state.c | 9
-rw-r--r--  fs/nfs/nfs4xdr.c | 480
-rw-r--r--  fs/nfs/objlayout/objio_osd.c | 48
-rw-r--r--  fs/nfs/objlayout/pnfs_osd_xdr_cli.c | 3
-rw-r--r--  fs/nfs/pagelist.c | 69
-rw-r--r--  fs/nfs/pnfs.c | 307
-rw-r--r--  fs/nfs/pnfs.h | 102
-rw-r--r--  fs/nfs/pnfs_dev.c | 64
-rw-r--r--  fs/nfs/read.c | 166
-rw-r--r--  fs/nfs/unlink.c | 37
-rw-r--r--  fs/nfs/write.c | 159
-rw-r--r--  fs/notify/group.c | 2
-rw-r--r--  fs/notify/inode_mark.c | 2
-rw-r--r--  fs/notify/mark.c | 2
-rw-r--r--  fs/notify/notification.c | 2
-rw-r--r--  fs/notify/vfsmount_mark.c | 2
-rw-r--r--  fs/ntfs/inode.h | 2
-rw-r--r--  fs/ocfs2/acl.c | 4
-rw-r--r--  fs/omfs/dir.c | 2
-rw-r--r--  fs/open.c | 78
-rw-r--r--  fs/pipe.c | 2
-rw-r--r--  fs/posix_acl.c | 18
-rw-r--r--  fs/proc/base.c | 28
-rw-r--r--  fs/proc/generic.c | 3
-rw-r--r--  fs/proc/inode.c | 2
-rw-r--r--  fs/proc/meminfo.c | 2
-rw-r--r--  fs/proc/proc_net.c | 4
-rw-r--r--  fs/proc/root.c | 2
-rw-r--r--  fs/pstore/inode.c | 12
-rw-r--r--  fs/pstore/internal.h | 2
-rw-r--r--  fs/pstore/platform.c | 30
-rw-r--r--  fs/read_write.c | 12
-rw-r--r--  fs/reiserfs/xattr_acl.c | 10
-rw-r--r--  fs/stack.c | 5
-rw-r--r--  fs/stat.c | 4
-rw-r--r--  fs/ubifs/debug.h | 6
-rw-r--r--  fs/xfs/Makefile | 119
-rw-r--r--  fs/xfs/kmem.c (renamed from fs/xfs/linux-2.6/kmem.c) | 0
-rw-r--r--  fs/xfs/kmem.h (renamed from fs/xfs/linux-2.6/kmem.h) | 0
-rw-r--r--  fs/xfs/mrlock.h (renamed from fs/xfs/linux-2.6/mrlock.h) | 0
-rw-r--r--  fs/xfs/time.h (renamed from fs/xfs/linux-2.6/time.h) | 0
-rw-r--r--  fs/xfs/uuid.c (renamed from fs/xfs/support/uuid.c) | 0
-rw-r--r--  fs/xfs/uuid.h (renamed from fs/xfs/support/uuid.h) | 0
-rw-r--r--  fs/xfs/xfs.h | 3
-rw-r--r--  fs/xfs/xfs_acl.c (renamed from fs/xfs/linux-2.6/xfs_acl.c) | 6
-rw-r--r--  fs/xfs/xfs_acl.h | 5
-rw-r--r--  fs/xfs/xfs_ag.h | 6
-rw-r--r--  fs/xfs/xfs_alloc.c | 7
-rw-r--r--  fs/xfs/xfs_aops.c (renamed from fs/xfs/linux-2.6/xfs_aops.c) | 3
-rw-r--r--  fs/xfs/xfs_aops.h (renamed from fs/xfs/linux-2.6/xfs_aops.h) | 0
-rw-r--r--  fs/xfs/xfs_attr.c | 3
-rw-r--r--  fs/xfs/xfs_bmap.c | 10
-rw-r--r--  fs/xfs/xfs_btree.c | 17
-rw-r--r--  fs/xfs/xfs_btree.h | 2
-rw-r--r--  fs/xfs/xfs_buf.c (renamed from fs/xfs/linux-2.6/xfs_buf.c) | 18
-rw-r--r--  fs/xfs/xfs_buf.h (renamed from fs/xfs/linux-2.6/xfs_buf.h) | 32
-rw-r--r--  fs/xfs/xfs_buf_item.c | 24
-rw-r--r--  fs/xfs/xfs_da_btree.c | 44
-rw-r--r--  fs/xfs/xfs_dinode.h | 2
-rw-r--r--  fs/xfs/xfs_dir2.c | 16
-rw-r--r--  fs/xfs/xfs_discard.c (renamed from fs/xfs/linux-2.6/xfs_discard.c) | 0
-rw-r--r--  fs/xfs/xfs_discard.h (renamed from fs/xfs/linux-2.6/xfs_discard.h) | 0
-rw-r--r--  fs/xfs/xfs_dquot.c (renamed from fs/xfs/quota/xfs_dquot.c) | 16
-rw-r--r--  fs/xfs/xfs_dquot.h (renamed from fs/xfs/quota/xfs_dquot.h) | 0
-rw-r--r--  fs/xfs/xfs_dquot_item.c (renamed from fs/xfs/quota/xfs_dquot_item.c) | 0
-rw-r--r--  fs/xfs/xfs_dquot_item.h (renamed from fs/xfs/quota/xfs_dquot_item.h) | 0
-rw-r--r--  fs/xfs/xfs_export.c (renamed from fs/xfs/linux-2.6/xfs_export.c) | 0
-rw-r--r--  fs/xfs/xfs_export.h (renamed from fs/xfs/linux-2.6/xfs_export.h) | 0
-rw-r--r--  fs/xfs/xfs_file.c (renamed from fs/xfs/linux-2.6/xfs_file.c) | 2
-rw-r--r--  fs/xfs/xfs_filestream.c | 14
-rw-r--r--  fs/xfs/xfs_fs_subr.c (renamed from fs/xfs/linux-2.6/xfs_fs_subr.c) | 0
-rw-r--r--  fs/xfs/xfs_globals.c (renamed from fs/xfs/linux-2.6/xfs_globals.c) | 0
-rw-r--r--  fs/xfs/xfs_ialloc.c | 5
-rw-r--r--  fs/xfs/xfs_inode.c | 20
-rw-r--r--  fs/xfs/xfs_inode.h | 2
-rw-r--r--  fs/xfs/xfs_ioctl.c (renamed from fs/xfs/linux-2.6/xfs_ioctl.c) | 6
-rw-r--r--  fs/xfs/xfs_ioctl.h (renamed from fs/xfs/linux-2.6/xfs_ioctl.h) | 0
-rw-r--r--  fs/xfs/xfs_ioctl32.c (renamed from fs/xfs/linux-2.6/xfs_ioctl32.c) | 0
-rw-r--r--  fs/xfs/xfs_ioctl32.h (renamed from fs/xfs/linux-2.6/xfs_ioctl32.h) | 0
-rw-r--r--  fs/xfs/xfs_iops.c (renamed from fs/xfs/linux-2.6/xfs_iops.c) | 23
-rw-r--r--  fs/xfs/xfs_iops.h (renamed from fs/xfs/linux-2.6/xfs_iops.h) | 0
-rw-r--r--  fs/xfs/xfs_linux.h (renamed from fs/xfs/linux-2.6/xfs_linux.h) | 27
-rw-r--r--  fs/xfs/xfs_log.c | 14
-rw-r--r--  fs/xfs/xfs_log_recover.c | 42
-rw-r--r--  fs/xfs/xfs_message.c (renamed from fs/xfs/linux-2.6/xfs_message.c) | 0
-rw-r--r--  fs/xfs/xfs_message.h (renamed from fs/xfs/linux-2.6/xfs_message.h) | 0
-rw-r--r--  fs/xfs/xfs_mount.c | 6
-rw-r--r--  fs/xfs/xfs_qm.c (renamed from fs/xfs/quota/xfs_qm.c) | 2
-rw-r--r--  fs/xfs/xfs_qm.h (renamed from fs/xfs/quota/xfs_qm.h) | 0
-rw-r--r--  fs/xfs/xfs_qm_bhv.c (renamed from fs/xfs/quota/xfs_qm_bhv.c) | 0
-rw-r--r--  fs/xfs/xfs_qm_stats.c (renamed from fs/xfs/quota/xfs_qm_stats.c) | 0
-rw-r--r--  fs/xfs/xfs_qm_stats.h (renamed from fs/xfs/quota/xfs_qm_stats.h) | 0
-rw-r--r--  fs/xfs/xfs_qm_syscalls.c (renamed from fs/xfs/quota/xfs_qm_syscalls.c) | 0
-rw-r--r--  fs/xfs/xfs_quota_priv.h (renamed from fs/xfs/quota/xfs_quota_priv.h) | 0
-rw-r--r--  fs/xfs/xfs_quotaops.c (renamed from fs/xfs/linux-2.6/xfs_quotaops.c) | 2
-rw-r--r--  fs/xfs/xfs_rename.c | 4
-rw-r--r--  fs/xfs/xfs_rtalloc.c | 32
-rw-r--r--  fs/xfs/xfs_rtalloc.h | 2
-rw-r--r--  fs/xfs/xfs_rw.c | 8
-rw-r--r--  fs/xfs/xfs_sb.h | 2
-rw-r--r--  fs/xfs/xfs_stats.c (renamed from fs/xfs/linux-2.6/xfs_stats.c) | 0
-rw-r--r--  fs/xfs/xfs_stats.h (renamed from fs/xfs/linux-2.6/xfs_stats.h) | 0
-rw-r--r--  fs/xfs/xfs_super.c (renamed from fs/xfs/linux-2.6/xfs_super.c) | 36
-rw-r--r--  fs/xfs/xfs_super.h (renamed from fs/xfs/linux-2.6/xfs_super.h) | 0
-rw-r--r--  fs/xfs/xfs_sync.c (renamed from fs/xfs/linux-2.6/xfs_sync.c) | 2
-rw-r--r--  fs/xfs/xfs_sync.h (renamed from fs/xfs/linux-2.6/xfs_sync.h) | 0
-rw-r--r--  fs/xfs/xfs_sysctl.c (renamed from fs/xfs/linux-2.6/xfs_sysctl.c) | 0
-rw-r--r--  fs/xfs/xfs_sysctl.h (renamed from fs/xfs/linux-2.6/xfs_sysctl.h) | 0
-rw-r--r--  fs/xfs/xfs_trace.c (renamed from fs/xfs/linux-2.6/xfs_trace.c) | 4
-rw-r--r--  fs/xfs/xfs_trace.h (renamed from fs/xfs/linux-2.6/xfs_trace.h) | 0
-rw-r--r--  fs/xfs/xfs_trans_ail.c | 67
-rw-r--r--  fs/xfs/xfs_trans_buf.c | 28
-rw-r--r--  fs/xfs/xfs_trans_dquot.c (renamed from fs/xfs/quota/xfs_trans_dquot.c) | 0
-rw-r--r--  fs/xfs/xfs_vnode.h (renamed from fs/xfs/linux-2.6/xfs_vnode.h) | 0
-rw-r--r--  fs/xfs/xfs_vnodeops.c | 22
-rw-r--r--  fs/xfs/xfs_xattr.c (renamed from fs/xfs/linux-2.6/xfs_xattr.c) | 0
301 files changed, 11478 insertions, 6949 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index e9cb57f07546..9a1d42630751 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -182,11 +182,11 @@ int v9fs_set_create_acl(struct dentry *dentry,
 	return 0;
 }
 
-int v9fs_acl_mode(struct inode *dir, mode_t *modep,
+int v9fs_acl_mode(struct inode *dir, umode_t *modep,
 		  struct posix_acl **dpacl, struct posix_acl **pacl)
 {
 	int retval = 0;
-	mode_t mode = *modep;
+	umode_t mode = *modep;
 	struct posix_acl *acl = NULL;
 
 	if (!S_ISLNK(mode)) {
@@ -319,7 +319,7 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
 	case ACL_TYPE_ACCESS:
 		name = POSIX_ACL_XATTR_ACCESS;
 		if (acl) {
-			mode_t mode = inode->i_mode;
+			umode_t mode = inode->i_mode;
 			retval = posix_acl_equiv_mode(acl, &mode);
 			if (retval < 0)
 				goto err_out;
diff --git a/fs/9p/acl.h b/fs/9p/acl.h
index ddb7ae19d971..559556411965 100644
--- a/fs/9p/acl.h
+++ b/fs/9p/acl.h
@@ -20,7 +20,7 @@ extern struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type);
 extern int v9fs_acl_chmod(struct dentry *);
 extern int v9fs_set_create_acl(struct dentry *,
 			       struct posix_acl **, struct posix_acl **);
-extern int v9fs_acl_mode(struct inode *dir, mode_t *modep,
+extern int v9fs_acl_mode(struct inode *dir, umode_t *modep,
 			 struct posix_acl **dpacl, struct posix_acl **pacl);
 #else
 #define v9fs_iop_get_acl NULL
@@ -38,7 +38,7 @@ static inline int v9fs_set_create_acl(struct dentry *dentry,
 {
 	return 0;
 }
-static inline int v9fs_acl_mode(struct inode *dir, mode_t *modep,
+static inline int v9fs_acl_mode(struct inode *dir, umode_t *modep,
 				struct posix_acl **dpacl,
 				struct posix_acl **pacl)
 {
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 46ce357ca1ab..410ffd6ceb5f 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -54,9 +54,9 @@ extern struct kmem_cache *v9fs_inode_cache;
 
 struct inode *v9fs_alloc_inode(struct super_block *sb);
 void v9fs_destroy_inode(struct inode *inode);
-struct inode *v9fs_get_inode(struct super_block *sb, int mode);
+struct inode *v9fs_get_inode(struct super_block *sb, int mode, dev_t);
 int v9fs_init_inode(struct v9fs_session_info *v9ses,
-		    struct inode *inode, int mode);
+		    struct inode *inode, int mode, dev_t);
 void v9fs_evict_inode(struct inode *inode);
 ino_t v9fs_qid2ino(struct p9_qid *qid);
 void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *);
@@ -83,4 +83,6 @@ static inline void v9fs_invalidate_inode_attr(struct inode *inode)
 	v9inode->cache_validity |= V9FS_INO_INVALID_ATTR;
 	return;
 }
+
+int v9fs_open_to_dotl_flags(int flags);
 #endif
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 3c173fcc2c5a..62857a810a79 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -65,7 +65,7 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 	v9inode = V9FS_I(inode);
 	v9ses = v9fs_inode2v9ses(inode);
 	if (v9fs_proto_dotl(v9ses))
-		omode = file->f_flags;
+		omode = v9fs_open_to_dotl_flags(file->f_flags);
 	else
 		omode = v9fs_uflags2omode(file->f_flags,
 					v9fs_proto_dotu(v9ses));
@@ -169,7 +169,18 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
 
 	/* convert posix lock to p9 tlock args */
 	memset(&flock, 0, sizeof(flock));
-	flock.type = fl->fl_type;
+	/* map the lock type */
+	switch (fl->fl_type) {
+	case F_RDLCK:
+		flock.type = P9_LOCK_TYPE_RDLCK;
+		break;
+	case F_WRLCK:
+		flock.type = P9_LOCK_TYPE_WRLCK;
+		break;
+	case F_UNLCK:
+		flock.type = P9_LOCK_TYPE_UNLCK;
+		break;
+	}
 	flock.start = fl->fl_start;
 	if (fl->fl_end == OFFSET_MAX)
 		flock.length = 0;
@@ -245,7 +256,7 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl)
 
 	/* convert posix lock to p9 tgetlock args */
 	memset(&glock, 0, sizeof(glock));
-	glock.type = fl->fl_type;
+	glock.type = P9_LOCK_TYPE_UNLCK;
 	glock.start = fl->fl_start;
 	if (fl->fl_end == OFFSET_MAX)
 		glock.length = 0;
@@ -257,17 +268,26 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl)
 	res = p9_client_getlock_dotl(fid, &glock);
 	if (res < 0)
 		return res;
-	if (glock.type != F_UNLCK) {
-		fl->fl_type = glock.type;
+	/* map 9p lock type to os lock type */
+	switch (glock.type) {
+	case P9_LOCK_TYPE_RDLCK:
+		fl->fl_type = F_RDLCK;
+		break;
+	case P9_LOCK_TYPE_WRLCK:
+		fl->fl_type = F_WRLCK;
+		break;
+	case P9_LOCK_TYPE_UNLCK:
+		fl->fl_type = F_UNLCK;
+		break;
+	}
+	if (glock.type != P9_LOCK_TYPE_UNLCK) {
 		fl->fl_start = glock.start;
 		if (glock.length == 0)
 			fl->fl_end = OFFSET_MAX;
 		else
 			fl->fl_end = glock.start + glock.length - 1;
 		fl->fl_pid = glock.proc_id;
-	} else
-		fl->fl_type = F_UNLCK;
-
+	}
 	return res;
 }
 
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 8bb5507e822f..e3c03db3c788 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -95,15 +95,18 @@ static int unixmode2p9mode(struct v9fs_session_info *v9ses, int mode)
 /**
  * p9mode2unixmode- convert plan9 mode bits to unix mode bits
  * @v9ses: v9fs session information
- * @mode: mode to convert
+ * @stat: p9_wstat from which mode need to be derived
+ * @rdev: major number, minor number in case of device files.
  *
  */
-
-static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode)
+static int p9mode2unixmode(struct v9fs_session_info *v9ses,
+			   struct p9_wstat *stat, dev_t *rdev)
 {
 	int res;
+	int mode = stat->mode;
 
-	res = mode & 0777;
+	res = mode & S_IALLUGO;
+	*rdev = 0;
 
 	if ((mode & P9_DMDIR) == P9_DMDIR)
 		res |= S_IFDIR;
@@ -116,9 +119,26 @@ static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode)
 		 && (v9ses->nodev == 0))
 		res |= S_IFIFO;
 	else if ((mode & P9_DMDEVICE) && (v9fs_proto_dotu(v9ses))
-		 && (v9ses->nodev == 0))
-		res |= S_IFBLK;
-	else
+		 && (v9ses->nodev == 0)) {
+		char type = 0, ext[32];
+		int major = -1, minor = -1;
+
+		strncpy(ext, stat->extension, sizeof(ext));
+		sscanf(ext, "%c %u %u", &type, &major, &minor);
+		switch (type) {
+		case 'c':
+			res |= S_IFCHR;
+			break;
+		case 'b':
+			res |= S_IFBLK;
+			break;
+		default:
+			P9_DPRINTK(P9_DEBUG_ERROR,
+				   "Unknown special type %c %s\n", type,
+				   stat->extension);
+		};
+		*rdev = MKDEV(major, minor);
+	} else
 		res |= S_IFREG;
 
 	if (v9fs_proto_dotu(v9ses)) {
@@ -131,7 +151,6 @@ static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode)
 		if ((mode & P9_DMSETVTX) == P9_DMSETVTX)
 			res |= S_ISVTX;
 	}
-
 	return res;
 }
 
@@ -242,13 +261,13 @@ void v9fs_destroy_inode(struct inode *inode)
 }
 
 int v9fs_init_inode(struct v9fs_session_info *v9ses,
-		    struct inode *inode, int mode)
+		    struct inode *inode, int mode, dev_t rdev)
 {
 	int err = 0;
 
 	inode_init_owner(inode, NULL, mode);
 	inode->i_blocks = 0;
-	inode->i_rdev = 0;
+	inode->i_rdev = rdev;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 	inode->i_mapping->a_ops = &v9fs_addr_operations;
 
@@ -335,7 +354,7 @@ error:
  *
  */
 
-struct inode *v9fs_get_inode(struct super_block *sb, int mode)
+struct inode *v9fs_get_inode(struct super_block *sb, int mode, dev_t rdev)
 {
 	int err;
 	struct inode *inode;
@@ -348,7 +367,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
 		P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n");
 		return ERR_PTR(-ENOMEM);
 	}
-	err = v9fs_init_inode(v9ses, inode, mode);
+	err = v9fs_init_inode(v9ses, inode, mode, rdev);
 	if (err) {
 		iput(inode);
 		return ERR_PTR(err);
@@ -435,11 +454,12 @@ void v9fs_evict_inode(struct inode *inode)
 static int v9fs_test_inode(struct inode *inode, void *data)
 {
 	int umode;
+	dev_t rdev;
 	struct v9fs_inode *v9inode = V9FS_I(inode);
 	struct p9_wstat *st = (struct p9_wstat *)data;
 	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode);
 
-	umode = p9mode2unixmode(v9ses, st->mode);
+	umode = p9mode2unixmode(v9ses, st, &rdev);
 	/* don't match inode of different type */
 	if ((inode->i_mode & S_IFMT) != (umode & S_IFMT))
 		return 0;
@@ -473,6 +493,7 @@ static struct inode *v9fs_qid_iget(struct super_block *sb,
 					   struct p9_wstat *st,
 					   int new)
 {
+	dev_t rdev;
 	int retval, umode;
 	unsigned long i_ino;
 	struct inode *inode;
@@ -496,8 +517,8 @@ static struct inode *v9fs_qid_iget(struct super_block *sb,
 	 * later.
 	 */
 	inode->i_ino = i_ino;
-	umode = p9mode2unixmode(v9ses, st->mode);
-	retval = v9fs_init_inode(v9ses, inode, umode);
+	umode = p9mode2unixmode(v9ses, st, &rdev);
+	retval = v9fs_init_inode(v9ses, inode, umode, rdev);
 	if (retval)
 		goto error;
 
@@ -532,6 +553,19 @@ v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
 }
 
 /**
+ * v9fs_at_to_dotl_flags- convert Linux specific AT flags to
+ * plan 9 AT flag.
+ * @flags: flags to convert
+ */
+static int v9fs_at_to_dotl_flags(int flags)
+{
+	int rflags = 0;
+	if (flags & AT_REMOVEDIR)
+		rflags |= P9_DOTL_AT_REMOVEDIR;
+	return rflags;
+}
+
+/**
  * v9fs_remove - helper function to remove files and directories
  * @dir: directory inode that is being deleted
  * @dentry: dentry that is being deleted
@@ -558,7 +592,8 @@ static int v9fs_remove(struct inode *dir, struct dentry *dentry, int flags)
 		return retval;
 	}
 	if (v9fs_proto_dotl(v9ses))
-		retval = p9_client_unlinkat(dfid, dentry->d_name.name, flags);
+		retval = p9_client_unlinkat(dfid, dentry->d_name.name,
+					    v9fs_at_to_dotl_flags(flags));
 	if (retval == -EOPNOTSUPP) {
 		/* Try the one based on path */
 		v9fid = v9fs_fid_clone(dentry);
@@ -645,13 +680,11 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 		P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
 		goto error;
 	}
-	d_instantiate(dentry, inode);
 	err = v9fs_fid_add(dentry, fid);
 	if (err < 0)
 		goto error;
-
+	d_instantiate(dentry, inode);
 	return ofid;
-
 error:
 	if (ofid)
 		p9_client_clunk(ofid);
@@ -792,6 +825,7 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 				      struct nameidata *nameidata)
 {
+	struct dentry *res;
 	struct super_block *sb;
 	struct v9fs_session_info *v9ses;
 	struct p9_fid *dfid, *fid;
@@ -823,22 +857,35 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 
 		return ERR_PTR(result);
 	}
-
-	inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
+	/*
+	 * Make sure we don't use a wrong inode due to parallel
+	 * unlink. For cached mode create calls request for new
+	 * inode. But with cache disabled, lookup should do this.
+	 */
+	if (v9ses->cache)
+		inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
+	else
+		inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
 	if (IS_ERR(inode)) {
 		result = PTR_ERR(inode);
 		inode = NULL;
 		goto error;
 	}
-
 	result = v9fs_fid_add(dentry, fid);
 	if (result < 0)
 		goto error_iput;
-
 inst_out:
-	d_add(dentry, inode);
-	return NULL;
-
+	/*
+	 * If we had a rename on the server and a parallel lookup
+	 * for the new name, then make sure we instantiate with
+	 * the new name. ie look up for a/b, while on server somebody
+	 * moved b under k and client parallely did a lookup for
+	 * k/b.
+	 */
+	res = d_materialise_unique(dentry, inode);
+	if (!IS_ERR(res))
+		return res;
+	result = PTR_ERR(res);
 error_iput:
 	iput(inode);
 error:
@@ -1002,7 +1049,7 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 		return PTR_ERR(st);
 
 	v9fs_stat2inode(st, dentry->d_inode, dentry->d_inode->i_sb);
-		generic_fillattr(dentry->d_inode, stat);
+	generic_fillattr(dentry->d_inode, stat);
 
 	p9stat_free(st);
 	kfree(st);
@@ -1086,6 +1133,7 @@ void
 v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
 	struct super_block *sb)
 {
+	mode_t mode;
 	char ext[32];
 	char tag_name[14];
 	unsigned int i_nlink;
@@ -1121,31 +1169,9 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
 			inode->i_nlink = i_nlink;
 		}
 	}
-	inode->i_mode = p9mode2unixmode(v9ses, stat->mode);
-	if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode))) {
-		char type = 0;
-		int major = -1;
-		int minor = -1;
-
-		strncpy(ext, stat->extension, sizeof(ext));
-		sscanf(ext, "%c %u %u", &type, &major, &minor);
-		switch (type) {
-		case 'c':
-			inode->i_mode &= ~S_IFBLK;
-			inode->i_mode |= S_IFCHR;
-			break;
-		case 'b':
-			break;
-		default:
-			P9_DPRINTK(P9_DEBUG_ERROR,
-				   "Unknown special type %c %s\n", type,
-				   stat->extension);
-		};
-		inode->i_rdev = MKDEV(major, minor);
-		init_special_inode(inode, inode->i_mode, inode->i_rdev);
-	} else
-		inode->i_rdev = 0;
-
+	mode = stat->mode & S_IALLUGO;
+	mode |= inode->i_mode & ~S_IALLUGO;
+	inode->i_mode = mode;
 	i_size_write(inode, stat->length);
 
 	/* not real number of blocks, but 512 byte ones ... */
@@ -1411,6 +1437,8 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
 
 int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode)
 {
+	int umode;
+	dev_t rdev;
 	loff_t i_size;
 	struct p9_wstat *st;
 	struct v9fs_session_info *v9ses;
@@ -1419,6 +1447,12 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode)
 	st = p9_client_stat(fid);
 	if (IS_ERR(st))
 		return PTR_ERR(st);
+	/*
+	 * Don't update inode if the file type is different
+	 */
+	umode = p9mode2unixmode(v9ses, st, &rdev);
+	if ((inode->i_mode & S_IFMT) != (umode & S_IFMT))
+		goto out;
 
 	spin_lock(&inode->i_lock);
 	/*
@@ -1430,6 +1464,7 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode)
 	if (v9ses->cache)
 		inode->i_size = i_size;
 	spin_unlock(&inode->i_lock);
+out:
 	p9stat_free(st);
 	kfree(st);
 	return 0;
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 9a26dce5a99f..aded79fcd5cf 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -153,7 +153,8 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb,
 	 * later.
 	 */
 	inode->i_ino = i_ino;
-	retval = v9fs_init_inode(v9ses, inode, st->st_mode);
+	retval = v9fs_init_inode(v9ses, inode,
+				 st->st_mode, new_decode_dev(st->st_rdev));
 	if (retval)
 		goto error;
 
@@ -190,6 +191,58 @@ v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid,
 	return inode;
 }
 
+struct dotl_openflag_map {
+	int open_flag;
+	int dotl_flag;
+};
+
+static int v9fs_mapped_dotl_flags(int flags)
+{
+	int i;
+	int rflags = 0;
+	struct dotl_openflag_map dotl_oflag_map[] = {
+		{ O_CREAT,	P9_DOTL_CREATE },
+		{ O_EXCL,	P9_DOTL_EXCL },
+		{ O_NOCTTY,	P9_DOTL_NOCTTY },
+		{ O_TRUNC,	P9_DOTL_TRUNC },
+		{ O_APPEND,	P9_DOTL_APPEND },
+		{ O_NONBLOCK,	P9_DOTL_NONBLOCK },
+		{ O_DSYNC,	P9_DOTL_DSYNC },
+		{ FASYNC,	P9_DOTL_FASYNC },
+		{ O_DIRECT,	P9_DOTL_DIRECT },
+		{ O_LARGEFILE,	P9_DOTL_LARGEFILE },
+		{ O_DIRECTORY,	P9_DOTL_DIRECTORY },
+		{ O_NOFOLLOW,	P9_DOTL_NOFOLLOW },
+		{ O_NOATIME,	P9_DOTL_NOATIME },
+		{ O_CLOEXEC,	P9_DOTL_CLOEXEC },
+		{ O_SYNC,	P9_DOTL_SYNC},
+	};
+	for (i = 0; i < ARRAY_SIZE(dotl_oflag_map); i++) {
+		if (flags & dotl_oflag_map[i].open_flag)
+			rflags |= dotl_oflag_map[i].dotl_flag;
+	}
+	return rflags;
+}
+
+/**
+ * v9fs_open_to_dotl_flags- convert Linux specific open flags to
+ * plan 9 open flag.
+ * @flags: flags to convert
+ */
+int v9fs_open_to_dotl_flags(int flags)
+{
+	int rflags = 0;
+
+	/*
+	 * We have same bits for P9_DOTL_READONLY, P9_DOTL_WRONLY
+	 * and P9_DOTL_NOACCESS
+	 */
+	rflags |= flags & O_ACCMODE;
+	rflags |= v9fs_mapped_dotl_flags(flags);
+
+	return rflags;
+}
+
 /**
  * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol.
  * @dir: directory inode that is being created
@@ -206,7 +259,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
 	int err = 0;
 	gid_t gid;
 	int flags;
-	mode_t mode;
+	umode_t mode;
 	char *name = NULL;
 	struct file *filp;
 	struct p9_qid qid;
@@ -258,7 +311,8 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
 			   "Failed to get acl values in creat %d\n", err);
 		goto error;
 	}
-	err = p9_client_create_dotl(ofid, name, flags, mode, gid, &qid);
+	err = p9_client_create_dotl(ofid, name, v9fs_open_to_dotl_flags(flags),
+				    mode, gid, &qid);
 	if (err < 0) {
 		P9_DPRINTK(P9_DEBUG_VFS,
 			   "p9_client_open_dotl failed in creat %d\n",
@@ -281,10 +335,10 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
 		P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
 		goto error;
 	}
-	d_instantiate(dentry, inode);
 	err = v9fs_fid_add(dentry, fid);
 	if (err < 0)
 		goto error;
+	d_instantiate(dentry, inode);
 
 	/* Now set the ACL based on the default value */
 	v9fs_set_create_acl(dentry, &dacl, &pacl);
@@ -348,7 +402,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
 	struct p9_fid *fid = NULL, *dfid = NULL;
 	gid_t gid;
 	char *name;
-	mode_t mode;
+	umode_t mode;
 	struct inode *inode;
 	struct p9_qid qid;
 	struct dentry *dir_dentry;
@@ -403,10 +457,10 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
 				 err);
 			goto error;
 		}
-		d_instantiate(dentry, inode);
 		err = v9fs_fid_add(dentry, fid);
 		if (err < 0)
 			goto error;
+		d_instantiate(dentry, inode);
 		fid = NULL;
 	} else {
 		/*
@@ -414,7 +468,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
 		 * inode with stat. We need to get an inode
 		 * so that we can set the acl with dentry
 		 */
-		inode = v9fs_get_inode(dir->i_sb, mode);
+		inode = v9fs_get_inode(dir->i_sb, mode, 0);
 		if (IS_ERR(inode)) {
 			err = PTR_ERR(inode);
 			goto error;
@@ -540,6 +594,7 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
 void
 v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
 {
+	mode_t mode;
 	struct v9fs_inode *v9inode = V9FS_I(inode);
 
 	if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) {
@@ -552,11 +607,10 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
 		inode->i_uid = stat->st_uid;
 		inode->i_gid = stat->st_gid;
 		inode->i_nlink = stat->st_nlink;
-		inode->i_mode = stat->st_mode;
-		inode->i_rdev = new_decode_dev(stat->st_rdev);
 
-		if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode)))
-			init_special_inode(inode, inode->i_mode, inode->i_rdev);
+		mode = stat->st_mode & S_IALLUGO;
+		mode |= inode->i_mode & ~S_IALLUGO;
+		inode->i_mode = mode;
 
 		i_size_write(inode, stat->st_size);
 		inode->i_blocks = stat->st_blocks;
@@ -657,14 +711,14 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
 				 err);
 			goto error;
 		}
-		d_instantiate(dentry, inode);
 		err = v9fs_fid_add(dentry, fid);
 		if (err < 0)
 			goto error;
+		d_instantiate(dentry, inode);
 		fid = NULL;
 	} else {
 		/* Not in cached mode. No need to populate inode with stat */
-		inode = v9fs_get_inode(dir->i_sb, S_IFLNK);
+		inode = v9fs_get_inode(dir->i_sb, S_IFLNK, 0);
 		if (IS_ERR(inode)) {
 			err = PTR_ERR(inode);
 			goto error;
@@ -751,7 +805,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
 	int err;
 	gid_t gid;
 	char *name;
-	mode_t mode;
+	umode_t mode;
 	struct v9fs_session_info *v9ses;
 	struct p9_fid *fid = NULL, *dfid = NULL;
 	struct inode *inode;
@@ -810,17 +864,17 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
 				 err);
 			goto error;
 		}
-		d_instantiate(dentry, inode);
 		err = v9fs_fid_add(dentry, fid);
 		if (err < 0)
 			goto error;
+		d_instantiate(dentry, inode);
 		fid = NULL;
 	} else {
 		/*
 		 * Not in cached mode. No need to populate inode with stat.
 		 * socket syscall returns a fd, so we need instantiate
 		 */
-		inode = v9fs_get_inode(dir->i_sb, mode);
+		inode = v9fs_get_inode(dir->i_sb, mode, rdev);
 		if (IS_ERR(inode)) {
 			err = PTR_ERR(inode);
 			goto error;
@@ -886,6 +940,11 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode)
 	st = p9_client_getattr_dotl(fid, P9_STATS_ALL);
 	if (IS_ERR(st))
 		return PTR_ERR(st);
+	/*
+	 * Don't update inode if the file type is different
+	 */
+	if ((inode->i_mode & S_IFMT) != (st->st_mode & S_IFMT))
+		goto out;
 
 	spin_lock(&inode->i_lock);
 	/*
@@ -897,6 +956,7 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode)
 	if (v9ses->cache)
 		inode->i_size = i_size;
 	spin_unlock(&inode->i_lock);
+out:
 	kfree(st);
 	return 0;
 }
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index feef6cdc1fd2..c70251d47ed1 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -149,7 +149,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
 	else
 		sb->s_d_op = &v9fs_dentry_operations;
 
-	inode = v9fs_get_inode(sb, S_IFDIR | mode);
+	inode = v9fs_get_inode(sb, S_IFDIR | mode, 0);
 	if (IS_ERR(inode)) {
 		retval = PTR_ERR(inode);
 		goto release_sb;
diff --git a/fs/Kconfig b/fs/Kconfig
index 19891aab9c6e..9fe0b349f4cd 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -127,14 +127,21 @@ config TMPFS_POSIX_ACL
 	select TMPFS_XATTR
 	select GENERIC_ACL
 	help
-	  POSIX Access Control Lists (ACLs) support permissions for users and
-	  groups beyond the owner/group/world scheme.
+	  POSIX Access Control Lists (ACLs) support additional access rights
+	  for users and groups beyond the standard owner/group/world scheme,
+	  and this option selects support for ACLs specifically for tmpfs
+	  filesystems.
+
+	  If you've selected TMPFS, it's possible that you'll also need
+	  this option as there are a number of Linux distros that require
+	  POSIX ACL support under /dev for certain features to work properly.
+	  For example, some distros need this feature for ALSA-related /dev
+	  files for sound to work properly. In short, if you're not sure,
+	  say Y.
 
 	  To learn more about Access Control Lists, visit the POSIX ACLs for
 	  Linux website <http://acl.bestbits.at/>.
 
-	  If you don't know what Access Control Lists are, say N.
-
 config TMPFS_XATTR
 	bool "Tmpfs extended attributes"
 	depends on TMPFS
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 4d433d34736f..f11e43ed907d 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -187,7 +187,7 @@ EXPORT_SYMBOL_GPL(anon_inode_getfd);
  */
 static struct inode *anon_inode_mkinode(void)
 {
-	struct inode *inode = new_inode(anon_inode_mnt->mnt_sb);
+	struct inode *inode = new_inode_pseudo(anon_inode_mnt->mnt_sb);
 
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 475f9c597cb7..326dc08d3e3f 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -39,27 +39,17 @@
 
 /* #define DEBUG */
 
-#ifdef DEBUG
-#define DPRINTK(fmt, args...) \
-do { \
-	printk(KERN_DEBUG "pid %d: %s: " fmt "\n", \
-		current->pid, __func__, ##args); \
-} while (0)
-#else
-#define DPRINTK(fmt, args...) do {} while (0)
-#endif
-
-#define AUTOFS_WARN(fmt, args...) \
-do { \
+#define DPRINTK(fmt, ...) \
+	pr_debug("pid %d: %s: " fmt "\n", \
+		current->pid, __func__, ##__VA_ARGS__)
+
+#define AUTOFS_WARN(fmt, ...) \
 	printk(KERN_WARNING "pid %d: %s: " fmt "\n", \
-		current->pid, __func__, ##args); \
-} while (0)
+		current->pid, __func__, ##__VA_ARGS__)
 
-#define AUTOFS_ERROR(fmt, args...) \
-do { \
+#define AUTOFS_ERROR(fmt, ...) \
 	printk(KERN_ERR "pid %d: %s: " fmt "\n", \
-		current->pid, __func__, ##args); \
-} while (0)
+		current->pid, __func__, ##__VA_ARGS__)
 
 /* Unified info structure.  This is pointed to by both the dentry and
    inode structures.  Each file in the filesystem has an instance of this
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 25435987d6ae..e1fbdeef85db 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -104,7 +104,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
 	size_t pktsz;
 
 	DPRINTK("wait id = 0x%08lx, name = %.*s, type=%d",
-		wq->wait_queue_token, wq->name.len, wq->name.name, type);
+		(unsigned long) wq->wait_queue_token, wq->name.len, wq->name.name, type);
 
 	memset(&pkt,0,sizeof pkt); /* For security reasons */
 
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 54b8c28bebc8..720d885e8dca 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -474,17 +474,22 @@ befs_follow_link(struct dentry *dentry, struct nameidata *nd)
 		befs_data_stream *data = &befs_ino->i_data.ds;
 		befs_off_t len = data->size;
 
-		befs_debug(sb, "Follow long symlink");
-
-		link = kmalloc(len, GFP_NOFS);
-		if (!link) {
-			link = ERR_PTR(-ENOMEM);
-		} else if (befs_read_lsymlink(sb, data, link, len) != len) {
-			kfree(link);
-			befs_error(sb, "Failed to read entire long symlink");
+		if (len == 0) {
+			befs_error(sb, "Long symlink with illegal length");
 			link = ERR_PTR(-EIO);
 		} else {
-			link[len - 1] = '\0';
+			befs_debug(sb, "Follow long symlink");
+
+			link = kmalloc(len, GFP_NOFS);
+			if (!link) {
+				link = ERR_PTR(-ENOMEM);
+			} else if (befs_read_lsymlink(sb, data, link, len) != len) {
+				kfree(link);
+				befs_error(sb, "Failed to read entire long symlink");
+				link = ERR_PTR(-EIO);
+			} else {
+				link[len - 1] = '\0';
+			}
 		}
 	} else {
 		link = befs_ino->i_data.symlink;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index c62fb84944d5..95f786ec7f08 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -44,24 +44,28 @@ inline struct block_device *I_BDEV(struct inode *inode)
 {
 	return &BDEV_I(inode)->bdev;
 }
-
 EXPORT_SYMBOL(I_BDEV);
 
 /*
- * move the inode from it's current bdi to the a new bdi. if the inode is dirty
- * we need to move it onto the dirty list of @dst so that the inode is always
- * on the right list.
+ * Move the inode from its current bdi to a new bdi. If the inode is dirty we
+ * need to move it onto the dirty list of @dst so that the inode is always on
+ * the right list.
  */
 static void bdev_inode_switch_bdi(struct inode *inode,
 			struct backing_dev_info *dst)
 {
-	spin_lock(&inode_wb_list_lock);
+	struct backing_dev_info *old = inode->i_data.backing_dev_info;
+
+	if (unlikely(dst == old))		/* deadlock avoidance */
+		return;
+	bdi_lock_two(&old->wb, &dst->wb);
 	spin_lock(&inode->i_lock);
 	inode->i_data.backing_dev_info = dst;
 	if (inode->i_state & I_DIRTY)
 		list_move(&inode->i_wb_list, &dst->wb.b_dirty);
 	spin_unlock(&inode->i_lock);
-	spin_unlock(&inode_wb_list_lock);
+	spin_unlock(&old->wb.list_lock);
+	spin_unlock(&dst->wb.list_lock);
 }
 
 static sector_t max_block(struct block_device *bdev)
@@ -383,6 +387,10 @@ int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
 	struct inode *bd_inode = filp->f_mapping->host;
 	struct block_device *bdev = I_BDEV(bd_inode);
 	int error;
+
+	error = filemap_write_and_wait_range(filp->f_mapping, start, end);
+	if (error)
+		return error;
 
 	/*
 	 * There is no need to serialise calls to blkdev_issue_flush with
@@ -548,6 +556,7 @@ struct block_device *bdget(dev_t dev)
 
 	if (inode->i_state & I_NEW) {
 		bdev->bd_contains = NULL;
+		bdev->bd_super = NULL;
 		bdev->bd_inode = inode;
 		bdev->bd_block_size = (1 << inode->i_blkbits);
 		bdev->bd_part_count = 0;
@@ -1420,6 +1429,11 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
 		WARN_ON_ONCE(bdev->bd_holders);
 		sync_blockdev(bdev);
 		kill_bdev(bdev);
+		/* ->release can cause the old bdi to disappear,
+		 * so must switch it out first
+		 */
+		bdev_inode_switch_bdi(bdev->bd_inode,
+					&default_backing_dev_info);
 	}
 	if (bdev->bd_contains == bdev) {
 		if (disk->fops->release)
@@ -1433,8 +1447,6 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
 		disk_put_part(bdev->bd_part);
 		bdev->bd_part = NULL;
 		bdev->bd_disk = NULL;
-		bdev_inode_switch_bdi(bdev->bd_inode,
-					&default_backing_dev_info);
 		if (bdev != bdev->bd_contains)
 			victim = bdev->bd_contains;
 		bdev->bd_contains = NULL;
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 9b72dcf1cd25..40e6ac08c21f 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -6,5 +6,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   transaction.o inode.o file.o tree-defrag.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
-	   export.o tree-log.o acl.o free-space-cache.o zlib.o lzo.o \
+	   export.o tree-log.o free-space-cache.o zlib.o lzo.o \
 	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o
+
+btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 65a735d8f6e4..eb159aaa5a11 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -28,8 +28,6 @@
 #include "btrfs_inode.h"
 #include "xattr.h"
 
-#ifdef CONFIG_BTRFS_FS_POSIX_ACL
-
 struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
 {
 	int size;
@@ -111,7 +109,6 @@ static int btrfs_set_acl(struct btrfs_trans_handle *trans,
 	int ret, size = 0;
 	const char *name;
 	char *value = NULL;
-	mode_t mode;
 
 	if (acl) {
 		ret = posix_acl_valid(acl);
@@ -122,13 +119,11 @@ static int btrfs_set_acl(struct btrfs_trans_handle *trans,
 
 	switch (type) {
 	case ACL_TYPE_ACCESS:
-		mode = inode->i_mode;
 		name = POSIX_ACL_XATTR_ACCESS;
 		if (acl) {
-			ret = posix_acl_equiv_mode(acl, &mode);
+			ret = posix_acl_equiv_mode(acl, &inode->i_mode);
 			if (ret < 0)
 				return ret;
-			inode->i_mode = mode;
 		}
 		ret = 0;
 		break;
@@ -222,19 +217,16 @@ int btrfs_init_acl(struct btrfs_trans_handle *trans,
 	}
 
 	if (IS_POSIXACL(dir) && acl) {
-		mode_t mode = inode->i_mode;
-
 		if (S_ISDIR(inode->i_mode)) {
 			ret = btrfs_set_acl(trans, inode, acl,
 					    ACL_TYPE_DEFAULT);
 			if (ret)
 				goto failed;
 		}
-		ret = posix_acl_create(&acl, GFP_NOFS, &mode);
+		ret = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode);
 		if (ret < 0)
 			return ret;
 
-		inode->i_mode = mode;
 		if (ret > 0) {
 			/* we need an acl */
 			ret = btrfs_set_acl(trans, inode, acl, ACL_TYPE_ACCESS);
@@ -282,18 +274,3 @@ const struct xattr_handler btrfs_xattr_acl_access_handler = {
 	.get = btrfs_xattr_acl_get,
 	.set = btrfs_xattr_acl_set,
 };
-
-#else /* CONFIG_BTRFS_FS_POSIX_ACL */
-
-int btrfs_acl_chmod(struct inode *inode)
-{
-	return 0;
-}
-
-int btrfs_init_acl(struct btrfs_trans_handle *trans,
-		   struct inode *inode, struct inode *dir)
-{
-	return 0;
-}
-
-#endif /* CONFIG_BTRFS_FS_POSIX_ACL */
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 52d7eca8c7bf..d9f99a16edd6 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -34,6 +34,9 @@ struct btrfs_inode {
 	 */
 	struct btrfs_key location;
 
+	/* Lock for counters */
+	spinlock_t lock;
+
 	/* the extent_tree has caches of all the extent mappings to disk */
 	struct extent_map_tree extent_tree;
 
@@ -134,8 +137,8 @@ struct btrfs_inode {
 	 * items we think we'll end up using, and reserved_extents is the number
 	 * of extent items we've reserved metadata for.
 	 */
-	atomic_t outstanding_extents;
-	atomic_t reserved_extents;
+	unsigned outstanding_extents;
+	unsigned reserved_extents;
 
 	/*
 	 * ordered_data_close is set by truncate when a file that used
@@ -173,7 +176,11 @@ static inline u64 btrfs_ino(struct inode *inode)
 {
 	u64 ino = BTRFS_I(inode)->location.objectid;
 
-	if (ino <= BTRFS_FIRST_FREE_OBJECTID)
+	/*
+	 * !ino: btree_inode
+	 * type == BTRFS_ROOT_ITEM_KEY: subvol dir
+	 */
+	if (!ino || BTRFS_I(inode)->location.type == BTRFS_ROOT_ITEM_KEY)
 		ino = inode->i_ino;
 	return ino;
 }
@@ -184,4 +191,13 @@ static inline void btrfs_i_size_write(struct inode *inode, u64 size)
 	BTRFS_I(inode)->disk_i_size = size;
 }
 
+static inline bool btrfs_is_free_space_inode(struct btrfs_root *root,
+					     struct inode *inode)
+{
+	if (root == root->fs_info->tree_root ||
+	    BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID)
+		return true;
+	return false;
+}
+
 #endif
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index bfe42b03eaf9..8ec5d86f1734 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -338,6 +338,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 	u64 first_byte = disk_start;
 	struct block_device *bdev;
 	int ret;
+	int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
 
 	WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1));
 	cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
@@ -392,8 +393,11 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
392 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); 393 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
393 BUG_ON(ret); 394 BUG_ON(ret);
394 395
395 ret = btrfs_csum_one_bio(root, inode, bio, start, 1); 396 if (!skip_sum) {
396 BUG_ON(ret); 397 ret = btrfs_csum_one_bio(root, inode, bio,
398 start, 1);
399 BUG_ON(ret);
400 }
397 401
398 ret = btrfs_map_bio(root, WRITE, bio, 0, 1); 402 ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
399 BUG_ON(ret); 403 BUG_ON(ret);
@@ -418,8 +422,10 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
418 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); 422 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
419 BUG_ON(ret); 423 BUG_ON(ret);
420 424
421 ret = btrfs_csum_one_bio(root, inode, bio, start, 1); 425 if (!skip_sum) {
422 BUG_ON(ret); 426 ret = btrfs_csum_one_bio(root, inode, bio, start, 1);
427 BUG_ON(ret);
428 }
423 429
424 ret = btrfs_map_bio(root, WRITE, bio, 0, 1); 430 ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
425 BUG_ON(ret); 431 BUG_ON(ret);
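Both submission sites above now consult a cached NODATASUM test instead of unconditionally checksumming. The shape of the change, reduced to a sketch (csum_bio and map_bio are illustrative stand-ins for the btrfs helpers):

static int submit_one(struct inode *inode, struct bio *bio)
{
	/* read the per-inode flag once; it cannot change mid-submission */
	int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
	int ret;

	if (!skip_sum) {
		ret = csum_bio(inode, bio);	/* btrfs_csum_one_bio in the patch */
		if (ret)
			return ret;
	}
	return map_bio(bio);			/* btrfs_map_bio in the patch */
}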
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 2e667868e0d2..011cab3aca8d 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -54,8 +54,13 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p)
54{ 54{
55 int i; 55 int i;
56 for (i = 0; i < BTRFS_MAX_LEVEL; i++) { 56 for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
57 if (p->nodes[i] && p->locks[i]) 57 if (!p->nodes[i] || !p->locks[i])
58 btrfs_set_lock_blocking(p->nodes[i]); 58 continue;
59 btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
60 if (p->locks[i] == BTRFS_READ_LOCK)
61 p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
62 else if (p->locks[i] == BTRFS_WRITE_LOCK)
63 p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
59 } 64 }
60} 65}
61 66
@@ -68,7 +73,7 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p)
68 * for held 73 * for held
69 */ 74 */
70noinline void btrfs_clear_path_blocking(struct btrfs_path *p, 75noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
71 struct extent_buffer *held) 76 struct extent_buffer *held, int held_rw)
72{ 77{
73 int i; 78 int i;
74 79
@@ -79,19 +84,29 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
79 * really sure by forcing the path to blocking before we clear 84 * really sure by forcing the path to blocking before we clear
80 * the path blocking. 85 * the path blocking.
81 */ 86 */
82 if (held) 87 if (held) {
83 btrfs_set_lock_blocking(held); 88 btrfs_set_lock_blocking_rw(held, held_rw);
89 if (held_rw == BTRFS_WRITE_LOCK)
90 held_rw = BTRFS_WRITE_LOCK_BLOCKING;
91 else if (held_rw == BTRFS_READ_LOCK)
92 held_rw = BTRFS_READ_LOCK_BLOCKING;
93 }
84 btrfs_set_path_blocking(p); 94 btrfs_set_path_blocking(p);
85#endif 95#endif
86 96
87 for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) { 97 for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
88 if (p->nodes[i] && p->locks[i]) 98 if (p->nodes[i] && p->locks[i]) {
89 btrfs_clear_lock_blocking(p->nodes[i]); 99 btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
100 if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
101 p->locks[i] = BTRFS_WRITE_LOCK;
102 else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
103 p->locks[i] = BTRFS_READ_LOCK;
104 }
90 } 105 }
91 106
92#ifdef CONFIG_DEBUG_LOCK_ALLOC 107#ifdef CONFIG_DEBUG_LOCK_ALLOC
93 if (held) 108 if (held)
94 btrfs_clear_lock_blocking(held); 109 btrfs_clear_lock_blocking_rw(held, held_rw);
95#endif 110#endif
96} 111}
97 112
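The boolean locks[] array has become a small per-level state machine: each level is unlocked, read- or write-locked, and either spinning or blocking, and the two functions above promote or demote every held lock in step. The transitions in isolation (constants are those defined by fs/btrfs/locking.h in this series):

/* spinning -> blocking, preserving the read/write flavor */
static int to_blocking(int lock)
{
	if (lock == BTRFS_READ_LOCK)
		return BTRFS_READ_LOCK_BLOCKING;
	if (lock == BTRFS_WRITE_LOCK)
		return BTRFS_WRITE_LOCK_BLOCKING;
	return lock;	/* 0 (unlocked) or already blocking */
}

/* blocking -> spinning is the exact inverse */
static int to_spinning(int lock)
{
	if (lock == BTRFS_READ_LOCK_BLOCKING)
		return BTRFS_READ_LOCK;
	if (lock == BTRFS_WRITE_LOCK_BLOCKING)
		return BTRFS_WRITE_LOCK;
	return lock;
}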
@@ -119,7 +134,7 @@ noinline void btrfs_release_path(struct btrfs_path *p)
119 if (!p->nodes[i]) 134 if (!p->nodes[i])
120 continue; 135 continue;
121 if (p->locks[i]) { 136 if (p->locks[i]) {
122 btrfs_tree_unlock(p->nodes[i]); 137 btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
123 p->locks[i] = 0; 138 p->locks[i] = 0;
124 } 139 }
125 free_extent_buffer(p->nodes[i]); 140 free_extent_buffer(p->nodes[i]);
@@ -167,6 +182,25 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
167 return eb; 182 return eb;
168} 183}
169 184
185/* loop around taking references on and locking the root node of the
186 * tree until you end up with a lock on the root. A locked buffer
187 * is returned, with a reference held.
188 */
189struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
190{
191 struct extent_buffer *eb;
192
193 while (1) {
194 eb = btrfs_root_node(root);
195 btrfs_tree_read_lock(eb);
196 if (eb == root->node)
197 break;
198 btrfs_tree_read_unlock(eb);
199 free_extent_buffer(eb);
200 }
201 return eb;
202}
203
170/* cowonly root (everything not a reference counted cow subvolume), just get 204/* cowonly root (everything not a reference counted cow subvolume), just get
171 * put onto a simple dirty list. transaction.c walks this to make sure they 205 * put onto a simple dirty list. transaction.c walks this to make sure they
172 * get properly updated on disk. 206 * get properly updated on disk.
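btrfs_read_lock_root_node is the read-mode twin of btrfs_lock_root_node just above it, and both are instances of lock-then-revalidate: the root pointer may be swapped (by COW) between reading it and locking it, so after taking the lock the buffer must be re-checked against root->node. The idiom in isolation:

for (;;) {
	eb = btrfs_root_node(root);	/* snapshot the pointer, takes a ref */
	btrfs_tree_read_lock(eb);
	if (eb == root->node)		/* still the root: we locked the right eb */
		break;
	/* lost the race: someone replaced the root; unlock, drop, retry */
	btrfs_tree_read_unlock(eb);
	free_extent_buffer(eb);
}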
@@ -626,14 +660,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
626 for (i = start_slot; i < end_slot; i++) { 660 for (i = start_slot; i < end_slot; i++) {
627 int close = 1; 661 int close = 1;
628 662
629 if (!parent->map_token) {
630 map_extent_buffer(parent,
631 btrfs_node_key_ptr_offset(i),
632 sizeof(struct btrfs_key_ptr),
633 &parent->map_token, &parent->kaddr,
634 &parent->map_start, &parent->map_len,
635 KM_USER1);
636 }
637 btrfs_node_key(parent, &disk_key, i); 663 btrfs_node_key(parent, &disk_key, i);
638 if (!progress_passed && comp_keys(&disk_key, progress) < 0) 664 if (!progress_passed && comp_keys(&disk_key, progress) < 0)
639 continue; 665 continue;
@@ -656,11 +682,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
656 last_block = blocknr; 682 last_block = blocknr;
657 continue; 683 continue;
658 } 684 }
659 if (parent->map_token) {
660 unmap_extent_buffer(parent, parent->map_token,
661 KM_USER1);
662 parent->map_token = NULL;
663 }
664 685
665 cur = btrfs_find_tree_block(root, blocknr, blocksize); 686 cur = btrfs_find_tree_block(root, blocknr, blocksize);
666 if (cur) 687 if (cur)
@@ -701,11 +722,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
701 btrfs_tree_unlock(cur); 722 btrfs_tree_unlock(cur);
702 free_extent_buffer(cur); 723 free_extent_buffer(cur);
703 } 724 }
704 if (parent->map_token) {
705 unmap_extent_buffer(parent, parent->map_token,
706 KM_USER1);
707 parent->map_token = NULL;
708 }
709 return err; 725 return err;
710} 726}
711 727
@@ -746,7 +762,6 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
746 struct btrfs_disk_key *tmp = NULL; 762 struct btrfs_disk_key *tmp = NULL;
747 struct btrfs_disk_key unaligned; 763 struct btrfs_disk_key unaligned;
748 unsigned long offset; 764 unsigned long offset;
749 char *map_token = NULL;
750 char *kaddr = NULL; 765 char *kaddr = NULL;
751 unsigned long map_start = 0; 766 unsigned long map_start = 0;
752 unsigned long map_len = 0; 767 unsigned long map_len = 0;
@@ -756,18 +771,13 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
756 mid = (low + high) / 2; 771 mid = (low + high) / 2;
757 offset = p + mid * item_size; 772 offset = p + mid * item_size;
758 773
759 if (!map_token || offset < map_start || 774 if (!kaddr || offset < map_start ||
760 (offset + sizeof(struct btrfs_disk_key)) > 775 (offset + sizeof(struct btrfs_disk_key)) >
761 map_start + map_len) { 776 map_start + map_len) {
762 if (map_token) {
763 unmap_extent_buffer(eb, map_token, KM_USER0);
764 map_token = NULL;
765 }
766 777
767 err = map_private_extent_buffer(eb, offset, 778 err = map_private_extent_buffer(eb, offset,
768 sizeof(struct btrfs_disk_key), 779 sizeof(struct btrfs_disk_key),
769 &map_token, &kaddr, 780 &kaddr, &map_start, &map_len);
770 &map_start, &map_len, KM_USER0);
771 781
772 if (!err) { 782 if (!err) {
773 tmp = (struct btrfs_disk_key *)(kaddr + offset - 783 tmp = (struct btrfs_disk_key *)(kaddr + offset -
@@ -790,14 +800,10 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
790 high = mid; 800 high = mid;
791 else { 801 else {
792 *slot = mid; 802 *slot = mid;
793 if (map_token)
794 unmap_extent_buffer(eb, map_token, KM_USER0);
795 return 0; 803 return 0;
796 } 804 }
797 } 805 }
798 *slot = low; 806 *slot = low;
799 if (map_token)
800 unmap_extent_buffer(eb, map_token, KM_USER0);
801 return 1; 807 return 1;
802} 808}
803 809
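With map_token gone, generic_bin_search tracks only the currently mapped window (kaddr, map_start, map_len) and re-maps lazily whenever the probe key falls outside it. The core loop, condensed to a sketch (map() stands in for map_private_extent_buffer):

while (low < high) {
	mid = (low + high) / 2;
	offset = base + mid * item_size;

	/* remap only when the key lies outside the mapped window */
	if (!kaddr || offset < map_start ||
	    offset + sizeof(struct btrfs_disk_key) > map_start + map_len)
		kaddr = map(eb, offset, &map_start, &map_len);

	cmp = comp_keys((struct btrfs_disk_key *)(kaddr + offset - map_start), key);
	if (cmp < 0)
		low = mid + 1;
	else if (cmp > 0)
		high = mid;
	else
		return mid;	/* exact match */
}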
@@ -890,7 +896,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
890 896
891 mid = path->nodes[level]; 897 mid = path->nodes[level];
892 898
893 WARN_ON(!path->locks[level]); 899 WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
900 path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
894 WARN_ON(btrfs_header_generation(mid) != trans->transid); 901 WARN_ON(btrfs_header_generation(mid) != trans->transid);
895 902
896 orig_ptr = btrfs_node_blockptr(mid, orig_slot); 903 orig_ptr = btrfs_node_blockptr(mid, orig_slot);
@@ -1228,7 +1235,6 @@ static void reada_for_search(struct btrfs_root *root,
1228 u32 nr; 1235 u32 nr;
1229 u32 blocksize; 1236 u32 blocksize;
1230 u32 nscan = 0; 1237 u32 nscan = 0;
1231 bool map = true;
1232 1238
1233 if (level != 1) 1239 if (level != 1)
1234 return; 1240 return;
@@ -1250,19 +1256,8 @@ static void reada_for_search(struct btrfs_root *root,
1250 1256
1251 nritems = btrfs_header_nritems(node); 1257 nritems = btrfs_header_nritems(node);
1252 nr = slot; 1258 nr = slot;
1253 if (node->map_token || path->skip_locking)
1254 map = false;
1255 1259
1256 while (1) { 1260 while (1) {
1257 if (map && !node->map_token) {
1258 unsigned long offset = btrfs_node_key_ptr_offset(nr);
1259 map_private_extent_buffer(node, offset,
1260 sizeof(struct btrfs_key_ptr),
1261 &node->map_token,
1262 &node->kaddr,
1263 &node->map_start,
1264 &node->map_len, KM_USER1);
1265 }
1266 if (direction < 0) { 1261 if (direction < 0) {
1267 if (nr == 0) 1262 if (nr == 0)
1268 break; 1263 break;
@@ -1281,11 +1276,6 @@ static void reada_for_search(struct btrfs_root *root,
1281 if ((search <= target && target - search <= 65536) || 1276 if ((search <= target && target - search <= 65536) ||
1282 (search > target && search - target <= 65536)) { 1277 (search > target && search - target <= 65536)) {
1283 gen = btrfs_node_ptr_generation(node, nr); 1278 gen = btrfs_node_ptr_generation(node, nr);
1284 if (map && node->map_token) {
1285 unmap_extent_buffer(node, node->map_token,
1286 KM_USER1);
1287 node->map_token = NULL;
1288 }
1289 readahead_tree_block(root, search, blocksize, gen); 1279 readahead_tree_block(root, search, blocksize, gen);
1290 nread += blocksize; 1280 nread += blocksize;
1291 } 1281 }
@@ -1293,10 +1283,6 @@ static void reada_for_search(struct btrfs_root *root,
1293 if ((nread > 65536 || nscan > 32)) 1283 if ((nread > 65536 || nscan > 32))
1294 break; 1284 break;
1295 } 1285 }
1296 if (map && node->map_token) {
1297 unmap_extent_buffer(node, node->map_token, KM_USER1);
1298 node->map_token = NULL;
1299 }
1300} 1286}
1301 1287
1302/* 1288/*
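Stripped of the mapping bookkeeping, reada_for_search reduces to its heuristic: fan out from the current slot and issue readahead only for blocks within 64KiB of the target, giving up after 64KiB issued or 32 slots examined. Its decision core:

if ((search <= target && target - search <= 65536) ||
    (search > target && search - target <= 65536)) {
	gen = btrfs_node_ptr_generation(node, nr);
	readahead_tree_block(root, search, blocksize, gen);
	nread += blocksize;
}
nscan++;
if (nread > 65536 || nscan > 32)	/* bound the work per invocation */
	break;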
@@ -1409,7 +1395,7 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
1409 1395
1410 t = path->nodes[i]; 1396 t = path->nodes[i];
1411 if (i >= lowest_unlock && i > skip_level && path->locks[i]) { 1397 if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
1412 btrfs_tree_unlock(t); 1398 btrfs_tree_unlock_rw(t, path->locks[i]);
1413 path->locks[i] = 0; 1399 path->locks[i] = 0;
1414 } 1400 }
1415 } 1401 }
@@ -1436,7 +1422,7 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
1436 continue; 1422 continue;
1437 if (!path->locks[i]) 1423 if (!path->locks[i])
1438 continue; 1424 continue;
1439 btrfs_tree_unlock(path->nodes[i]); 1425 btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
1440 path->locks[i] = 0; 1426 path->locks[i] = 0;
1441 } 1427 }
1442} 1428}
@@ -1485,6 +1471,8 @@ read_block_for_search(struct btrfs_trans_handle *trans,
1485 * we can trust our generation number 1471 * we can trust our generation number
1486 */ 1472 */
1487 free_extent_buffer(tmp); 1473 free_extent_buffer(tmp);
1474 btrfs_set_path_blocking(p);
1475
1488 tmp = read_tree_block(root, blocknr, blocksize, gen); 1476 tmp = read_tree_block(root, blocknr, blocksize, gen);
1489 if (tmp && btrfs_buffer_uptodate(tmp, gen)) { 1477 if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
1490 *eb_ret = tmp; 1478 *eb_ret = tmp;
@@ -1540,20 +1528,27 @@ read_block_for_search(struct btrfs_trans_handle *trans,
1540static int 1528static int
1541setup_nodes_for_search(struct btrfs_trans_handle *trans, 1529setup_nodes_for_search(struct btrfs_trans_handle *trans,
1542 struct btrfs_root *root, struct btrfs_path *p, 1530 struct btrfs_root *root, struct btrfs_path *p,
1543 struct extent_buffer *b, int level, int ins_len) 1531 struct extent_buffer *b, int level, int ins_len,
1532 int *write_lock_level)
1544{ 1533{
1545 int ret; 1534 int ret;
1546 if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >= 1535 if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
1547 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { 1536 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
1548 int sret; 1537 int sret;
1549 1538
1539 if (*write_lock_level < level + 1) {
1540 *write_lock_level = level + 1;
1541 btrfs_release_path(p);
1542 goto again;
1543 }
1544
1550 sret = reada_for_balance(root, p, level); 1545 sret = reada_for_balance(root, p, level);
1551 if (sret) 1546 if (sret)
1552 goto again; 1547 goto again;
1553 1548
1554 btrfs_set_path_blocking(p); 1549 btrfs_set_path_blocking(p);
1555 sret = split_node(trans, root, p, level); 1550 sret = split_node(trans, root, p, level);
1556 btrfs_clear_path_blocking(p, NULL); 1551 btrfs_clear_path_blocking(p, NULL, 0);
1557 1552
1558 BUG_ON(sret > 0); 1553 BUG_ON(sret > 0);
1559 if (sret) { 1554 if (sret) {
@@ -1565,13 +1560,19 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
1565 BTRFS_NODEPTRS_PER_BLOCK(root) / 2) { 1560 BTRFS_NODEPTRS_PER_BLOCK(root) / 2) {
1566 int sret; 1561 int sret;
1567 1562
1563 if (*write_lock_level < level + 1) {
1564 *write_lock_level = level + 1;
1565 btrfs_release_path(p);
1566 goto again;
1567 }
1568
1568 sret = reada_for_balance(root, p, level); 1569 sret = reada_for_balance(root, p, level);
1569 if (sret) 1570 if (sret)
1570 goto again; 1571 goto again;
1571 1572
1572 btrfs_set_path_blocking(p); 1573 btrfs_set_path_blocking(p);
1573 sret = balance_level(trans, root, p, level); 1574 sret = balance_level(trans, root, p, level);
1574 btrfs_clear_path_blocking(p, NULL); 1575 btrfs_clear_path_blocking(p, NULL, 0);
1575 1576
1576 if (sret) { 1577 if (sret) {
1577 ret = sret; 1578 ret = sret;
@@ -1615,27 +1616,78 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
1615 int err; 1616 int err;
1616 int level; 1617 int level;
1617 int lowest_unlock = 1; 1618 int lowest_unlock = 1;
1619 int root_lock;
1620 /* everything at write_lock_level or lower must be write locked */
1621 int write_lock_level = 0;
1618 u8 lowest_level = 0; 1622 u8 lowest_level = 0;
1619 1623
1620 lowest_level = p->lowest_level; 1624 lowest_level = p->lowest_level;
1621 WARN_ON(lowest_level && ins_len > 0); 1625 WARN_ON(lowest_level && ins_len > 0);
1622 WARN_ON(p->nodes[0] != NULL); 1626 WARN_ON(p->nodes[0] != NULL);
1623 1627
1624 if (ins_len < 0) 1628 if (ins_len < 0) {
1625 lowest_unlock = 2; 1629 lowest_unlock = 2;
1626 1630
 1631 /* when we are removing items, we might have to go up to level
 1632 * two as we update tree pointers. Make sure we keep write
 1633 * locks on those levels as well
 1634 */
1635 write_lock_level = 2;
1636 } else if (ins_len > 0) {
1637 /*
1638 * for inserting items, make sure we have a write lock on
1639 * level 1 so we can update keys
1640 */
1641 write_lock_level = 1;
1642 }
1643
1644 if (!cow)
1645 write_lock_level = -1;
1646
1647 if (cow && (p->keep_locks || p->lowest_level))
1648 write_lock_level = BTRFS_MAX_LEVEL;
1649
1627again: 1650again:
1651 /*
1652 * we try very hard to do read locks on the root
1653 */
1654 root_lock = BTRFS_READ_LOCK;
1655 level = 0;
1628 if (p->search_commit_root) { 1656 if (p->search_commit_root) {
1657 /*
1658 * the commit roots are read only
1659 * so we always do read locks
1660 */
1629 b = root->commit_root; 1661 b = root->commit_root;
1630 extent_buffer_get(b); 1662 extent_buffer_get(b);
1663 level = btrfs_header_level(b);
1631 if (!p->skip_locking) 1664 if (!p->skip_locking)
1632 btrfs_tree_lock(b); 1665 btrfs_tree_read_lock(b);
1633 } else { 1666 } else {
1634 if (p->skip_locking) 1667 if (p->skip_locking) {
1635 b = btrfs_root_node(root); 1668 b = btrfs_root_node(root);
1636 else 1669 level = btrfs_header_level(b);
1637 b = btrfs_lock_root_node(root); 1670 } else {
1671 /* we don't know the level of the root node
1672 * until we actually have it read locked
1673 */
1674 b = btrfs_read_lock_root_node(root);
1675 level = btrfs_header_level(b);
1676 if (level <= write_lock_level) {
1677 /* whoops, must trade for write lock */
1678 btrfs_tree_read_unlock(b);
1679 free_extent_buffer(b);
1680 b = btrfs_lock_root_node(root);
1681 root_lock = BTRFS_WRITE_LOCK;
1682
1683 /* the level might have changed, check again */
1684 level = btrfs_header_level(b);
1685 }
1686 }
1638 } 1687 }
1688 p->nodes[level] = b;
1689 if (!p->skip_locking)
1690 p->locks[level] = root_lock;
1639 1691
1640 while (b) { 1692 while (b) {
1641 level = btrfs_header_level(b); 1693 level = btrfs_header_level(b);
@@ -1644,10 +1696,6 @@ again:
1644 * setup the path here so we can release it under lock 1696 * setup the path here so we can release it under lock
1645 * contention with the cow code 1697 * contention with the cow code
1646 */ 1698 */
1647 p->nodes[level] = b;
1648 if (!p->skip_locking)
1649 p->locks[level] = 1;
1650
1651 if (cow) { 1699 if (cow) {
1652 /* 1700 /*
1653 * if we don't really need to cow this block 1701 * if we don't really need to cow this block
@@ -1659,6 +1707,16 @@ again:
1659 1707
1660 btrfs_set_path_blocking(p); 1708 btrfs_set_path_blocking(p);
1661 1709
1710 /*
1711 * must have write locks on this node and the
1712 * parent
1713 */
1714 if (level + 1 > write_lock_level) {
1715 write_lock_level = level + 1;
1716 btrfs_release_path(p);
1717 goto again;
1718 }
1719
1662 err = btrfs_cow_block(trans, root, b, 1720 err = btrfs_cow_block(trans, root, b,
1663 p->nodes[level + 1], 1721 p->nodes[level + 1],
1664 p->slots[level + 1], &b); 1722 p->slots[level + 1], &b);
@@ -1671,10 +1729,7 @@ cow_done:
1671 BUG_ON(!cow && ins_len); 1729 BUG_ON(!cow && ins_len);
1672 1730
1673 p->nodes[level] = b; 1731 p->nodes[level] = b;
1674 if (!p->skip_locking) 1732 btrfs_clear_path_blocking(p, NULL, 0);
1675 p->locks[level] = 1;
1676
1677 btrfs_clear_path_blocking(p, NULL);
1678 1733
1679 /* 1734 /*
1680 * we have a lock on b and as long as we aren't changing 1735 * we have a lock on b and as long as we aren't changing
@@ -1700,7 +1755,7 @@ cow_done:
1700 } 1755 }
1701 p->slots[level] = slot; 1756 p->slots[level] = slot;
1702 err = setup_nodes_for_search(trans, root, p, b, level, 1757 err = setup_nodes_for_search(trans, root, p, b, level,
1703 ins_len); 1758 ins_len, &write_lock_level);
1704 if (err == -EAGAIN) 1759 if (err == -EAGAIN)
1705 goto again; 1760 goto again;
1706 if (err) { 1761 if (err) {
@@ -1710,6 +1765,19 @@ cow_done:
1710 b = p->nodes[level]; 1765 b = p->nodes[level];
1711 slot = p->slots[level]; 1766 slot = p->slots[level];
1712 1767
1768 /*
 1769 * slot 0 is special: if we change the key
1770 * we have to update the parent pointer
1771 * which means we must have a write lock
1772 * on the parent
1773 */
1774 if (slot == 0 && cow &&
1775 write_lock_level < level + 1) {
1776 write_lock_level = level + 1;
1777 btrfs_release_path(p);
1778 goto again;
1779 }
1780
1713 unlock_up(p, level, lowest_unlock); 1781 unlock_up(p, level, lowest_unlock);
1714 1782
1715 if (level == lowest_level) { 1783 if (level == lowest_level) {
@@ -1728,23 +1796,42 @@ cow_done:
1728 } 1796 }
1729 1797
1730 if (!p->skip_locking) { 1798 if (!p->skip_locking) {
1731 btrfs_clear_path_blocking(p, NULL); 1799 level = btrfs_header_level(b);
1732 err = btrfs_try_spin_lock(b); 1800 if (level <= write_lock_level) {
1733 1801 err = btrfs_try_tree_write_lock(b);
1734 if (!err) { 1802 if (!err) {
1735 btrfs_set_path_blocking(p); 1803 btrfs_set_path_blocking(p);
1736 btrfs_tree_lock(b); 1804 btrfs_tree_lock(b);
1737 btrfs_clear_path_blocking(p, b); 1805 btrfs_clear_path_blocking(p, b,
1806 BTRFS_WRITE_LOCK);
1807 }
1808 p->locks[level] = BTRFS_WRITE_LOCK;
1809 } else {
1810 err = btrfs_try_tree_read_lock(b);
1811 if (!err) {
1812 btrfs_set_path_blocking(p);
1813 btrfs_tree_read_lock(b);
1814 btrfs_clear_path_blocking(p, b,
1815 BTRFS_READ_LOCK);
1816 }
1817 p->locks[level] = BTRFS_READ_LOCK;
1738 } 1818 }
1819 p->nodes[level] = b;
1739 } 1820 }
1740 } else { 1821 } else {
1741 p->slots[level] = slot; 1822 p->slots[level] = slot;
1742 if (ins_len > 0 && 1823 if (ins_len > 0 &&
1743 btrfs_leaf_free_space(root, b) < ins_len) { 1824 btrfs_leaf_free_space(root, b) < ins_len) {
1825 if (write_lock_level < 1) {
1826 write_lock_level = 1;
1827 btrfs_release_path(p);
1828 goto again;
1829 }
1830
1744 btrfs_set_path_blocking(p); 1831 btrfs_set_path_blocking(p);
1745 err = split_leaf(trans, root, key, 1832 err = split_leaf(trans, root, key,
1746 p, ins_len, ret == 0); 1833 p, ins_len, ret == 0);
1747 btrfs_clear_path_blocking(p, NULL); 1834 btrfs_clear_path_blocking(p, NULL, 0);
1748 1835
1749 BUG_ON(err > 0); 1836 BUG_ON(err > 0);
1750 if (err) { 1837 if (err) {
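The thread running through the whole btrfs_search_slot rework is promotion by restart: the search optimistically read-locks, and the moment some level turns out to need modification (COW, split, balance, or a slot-0 key update), write_lock_level is raised, every lock is dropped, and the descent restarts from the root. Locks are never upgraded in place, which could deadlock against other readers. The skeleton, heavily condensed (must_modify and descend are illustrative):

again:
	b = read_lock_root(root);
	while (b) {
		level = header_level(b);
		if (must_modify(b, level) && write_lock_level < level + 1) {
			/* remember how deep write locks are needed, then
			 * drop everything and redo with stronger locking */
			write_lock_level = level + 1;
			release_path(p);
			goto again;
		}
		b = descend(p, b, level, write_lock_level);
	}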
@@ -2025,7 +2112,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2025 add_root_to_dirty_list(root); 2112 add_root_to_dirty_list(root);
2026 extent_buffer_get(c); 2113 extent_buffer_get(c);
2027 path->nodes[level] = c; 2114 path->nodes[level] = c;
2028 path->locks[level] = 1; 2115 path->locks[level] = BTRFS_WRITE_LOCK;
2029 path->slots[level] = 0; 2116 path->slots[level] = 0;
2030 return 0; 2117 return 0;
2031} 2118}
@@ -2253,14 +2340,6 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
2253 if (path->slots[0] == i) 2340 if (path->slots[0] == i)
2254 push_space += data_size; 2341 push_space += data_size;
2255 2342
2256 if (!left->map_token) {
2257 map_extent_buffer(left, (unsigned long)item,
2258 sizeof(struct btrfs_item),
2259 &left->map_token, &left->kaddr,
2260 &left->map_start, &left->map_len,
2261 KM_USER1);
2262 }
2263
2264 this_item_size = btrfs_item_size(left, item); 2343 this_item_size = btrfs_item_size(left, item);
2265 if (this_item_size + sizeof(*item) + push_space > free_space) 2344 if (this_item_size + sizeof(*item) + push_space > free_space)
2266 break; 2345 break;
@@ -2271,10 +2350,6 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
2271 break; 2350 break;
2272 i--; 2351 i--;
2273 } 2352 }
2274 if (left->map_token) {
2275 unmap_extent_buffer(left, left->map_token, KM_USER1);
2276 left->map_token = NULL;
2277 }
2278 2353
2279 if (push_items == 0) 2354 if (push_items == 0)
2280 goto out_unlock; 2355 goto out_unlock;
@@ -2316,21 +2391,10 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
2316 push_space = BTRFS_LEAF_DATA_SIZE(root); 2391 push_space = BTRFS_LEAF_DATA_SIZE(root);
2317 for (i = 0; i < right_nritems; i++) { 2392 for (i = 0; i < right_nritems; i++) {
2318 item = btrfs_item_nr(right, i); 2393 item = btrfs_item_nr(right, i);
2319 if (!right->map_token) {
2320 map_extent_buffer(right, (unsigned long)item,
2321 sizeof(struct btrfs_item),
2322 &right->map_token, &right->kaddr,
2323 &right->map_start, &right->map_len,
2324 KM_USER1);
2325 }
2326 push_space -= btrfs_item_size(right, item); 2394 push_space -= btrfs_item_size(right, item);
2327 btrfs_set_item_offset(right, item, push_space); 2395 btrfs_set_item_offset(right, item, push_space);
2328 } 2396 }
2329 2397
2330 if (right->map_token) {
2331 unmap_extent_buffer(right, right->map_token, KM_USER1);
2332 right->map_token = NULL;
2333 }
2334 left_nritems -= push_items; 2398 left_nritems -= push_items;
2335 btrfs_set_header_nritems(left, left_nritems); 2399 btrfs_set_header_nritems(left, left_nritems);
2336 2400
@@ -2467,13 +2531,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2467 2531
2468 for (i = 0; i < nr; i++) { 2532 for (i = 0; i < nr; i++) {
2469 item = btrfs_item_nr(right, i); 2533 item = btrfs_item_nr(right, i);
2470 if (!right->map_token) {
2471 map_extent_buffer(right, (unsigned long)item,
2472 sizeof(struct btrfs_item),
2473 &right->map_token, &right->kaddr,
2474 &right->map_start, &right->map_len,
2475 KM_USER1);
2476 }
2477 2534
2478 if (!empty && push_items > 0) { 2535 if (!empty && push_items > 0) {
2479 if (path->slots[0] < i) 2536 if (path->slots[0] < i)
@@ -2496,11 +2553,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2496 push_space += this_item_size + sizeof(*item); 2553 push_space += this_item_size + sizeof(*item);
2497 } 2554 }
2498 2555
2499 if (right->map_token) {
2500 unmap_extent_buffer(right, right->map_token, KM_USER1);
2501 right->map_token = NULL;
2502 }
2503
2504 if (push_items == 0) { 2556 if (push_items == 0) {
2505 ret = 1; 2557 ret = 1;
2506 goto out; 2558 goto out;
@@ -2530,23 +2582,12 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2530 u32 ioff; 2582 u32 ioff;
2531 2583
2532 item = btrfs_item_nr(left, i); 2584 item = btrfs_item_nr(left, i);
2533 if (!left->map_token) {
2534 map_extent_buffer(left, (unsigned long)item,
2535 sizeof(struct btrfs_item),
2536 &left->map_token, &left->kaddr,
2537 &left->map_start, &left->map_len,
2538 KM_USER1);
2539 }
2540 2585
2541 ioff = btrfs_item_offset(left, item); 2586 ioff = btrfs_item_offset(left, item);
2542 btrfs_set_item_offset(left, item, 2587 btrfs_set_item_offset(left, item,
2543 ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size)); 2588 ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size));
2544 } 2589 }
2545 btrfs_set_header_nritems(left, old_left_nritems + push_items); 2590 btrfs_set_header_nritems(left, old_left_nritems + push_items);
2546 if (left->map_token) {
2547 unmap_extent_buffer(left, left->map_token, KM_USER1);
2548 left->map_token = NULL;
2549 }
2550 2591
2551 /* fixup right node */ 2592 /* fixup right node */
2552 if (push_items > right_nritems) { 2593 if (push_items > right_nritems) {
@@ -2574,21 +2615,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2574 for (i = 0; i < right_nritems; i++) { 2615 for (i = 0; i < right_nritems; i++) {
2575 item = btrfs_item_nr(right, i); 2616 item = btrfs_item_nr(right, i);
2576 2617
2577 if (!right->map_token) {
2578 map_extent_buffer(right, (unsigned long)item,
2579 sizeof(struct btrfs_item),
2580 &right->map_token, &right->kaddr,
2581 &right->map_start, &right->map_len,
2582 KM_USER1);
2583 }
2584
2585 push_space = push_space - btrfs_item_size(right, item); 2618 push_space = push_space - btrfs_item_size(right, item);
2586 btrfs_set_item_offset(right, item, push_space); 2619 btrfs_set_item_offset(right, item, push_space);
2587 } 2620 }
2588 if (right->map_token) {
2589 unmap_extent_buffer(right, right->map_token, KM_USER1);
2590 right->map_token = NULL;
2591 }
2592 2621
2593 btrfs_mark_buffer_dirty(left); 2622 btrfs_mark_buffer_dirty(left);
2594 if (right_nritems) 2623 if (right_nritems)
@@ -2729,23 +2758,10 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans,
2729 struct btrfs_item *item = btrfs_item_nr(right, i); 2758 struct btrfs_item *item = btrfs_item_nr(right, i);
2730 u32 ioff; 2759 u32 ioff;
2731 2760
2732 if (!right->map_token) {
2733 map_extent_buffer(right, (unsigned long)item,
2734 sizeof(struct btrfs_item),
2735 &right->map_token, &right->kaddr,
2736 &right->map_start, &right->map_len,
2737 KM_USER1);
2738 }
2739
2740 ioff = btrfs_item_offset(right, item); 2761 ioff = btrfs_item_offset(right, item);
2741 btrfs_set_item_offset(right, item, ioff + rt_data_off); 2762 btrfs_set_item_offset(right, item, ioff + rt_data_off);
2742 } 2763 }
2743 2764
2744 if (right->map_token) {
2745 unmap_extent_buffer(right, right->map_token, KM_USER1);
2746 right->map_token = NULL;
2747 }
2748
2749 btrfs_set_header_nritems(l, mid); 2765 btrfs_set_header_nritems(l, mid);
2750 ret = 0; 2766 ret = 0;
2751 btrfs_item_key(right, &disk_key, 0); 2767 btrfs_item_key(right, &disk_key, 0);
@@ -3264,23 +3280,10 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
3264 u32 ioff; 3280 u32 ioff;
3265 item = btrfs_item_nr(leaf, i); 3281 item = btrfs_item_nr(leaf, i);
3266 3282
3267 if (!leaf->map_token) {
3268 map_extent_buffer(leaf, (unsigned long)item,
3269 sizeof(struct btrfs_item),
3270 &leaf->map_token, &leaf->kaddr,
3271 &leaf->map_start, &leaf->map_len,
3272 KM_USER1);
3273 }
3274
3275 ioff = btrfs_item_offset(leaf, item); 3283 ioff = btrfs_item_offset(leaf, item);
3276 btrfs_set_item_offset(leaf, item, ioff + size_diff); 3284 btrfs_set_item_offset(leaf, item, ioff + size_diff);
3277 } 3285 }
3278 3286
3279 if (leaf->map_token) {
3280 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3281 leaf->map_token = NULL;
3282 }
3283
3284 /* shift the data */ 3287 /* shift the data */
3285 if (from_end) { 3288 if (from_end) {
3286 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + 3289 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
@@ -3377,22 +3380,10 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,
3377 u32 ioff; 3380 u32 ioff;
3378 item = btrfs_item_nr(leaf, i); 3381 item = btrfs_item_nr(leaf, i);
3379 3382
3380 if (!leaf->map_token) {
3381 map_extent_buffer(leaf, (unsigned long)item,
3382 sizeof(struct btrfs_item),
3383 &leaf->map_token, &leaf->kaddr,
3384 &leaf->map_start, &leaf->map_len,
3385 KM_USER1);
3386 }
3387 ioff = btrfs_item_offset(leaf, item); 3383 ioff = btrfs_item_offset(leaf, item);
3388 btrfs_set_item_offset(leaf, item, ioff - data_size); 3384 btrfs_set_item_offset(leaf, item, ioff - data_size);
3389 } 3385 }
3390 3386
3391 if (leaf->map_token) {
3392 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3393 leaf->map_token = NULL;
3394 }
3395
3396 /* shift the data */ 3387 /* shift the data */
3397 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + 3388 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3398 data_end - data_size, btrfs_leaf_data(leaf) + 3389 data_end - data_size, btrfs_leaf_data(leaf) +
@@ -3494,27 +3485,13 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
3494 * item0..itemN ... dataN.offset..dataN.size .. data0.size 3485 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3495 */ 3486 */
3496 /* first correct the data pointers */ 3487 /* first correct the data pointers */
3497 WARN_ON(leaf->map_token);
3498 for (i = slot; i < nritems; i++) { 3488 for (i = slot; i < nritems; i++) {
3499 u32 ioff; 3489 u32 ioff;
3500 3490
3501 item = btrfs_item_nr(leaf, i); 3491 item = btrfs_item_nr(leaf, i);
3502 if (!leaf->map_token) {
3503 map_extent_buffer(leaf, (unsigned long)item,
3504 sizeof(struct btrfs_item),
3505 &leaf->map_token, &leaf->kaddr,
3506 &leaf->map_start, &leaf->map_len,
3507 KM_USER1);
3508 }
3509
3510 ioff = btrfs_item_offset(leaf, item); 3492 ioff = btrfs_item_offset(leaf, item);
3511 btrfs_set_item_offset(leaf, item, ioff - total_data); 3493 btrfs_set_item_offset(leaf, item, ioff - total_data);
3512 } 3494 }
3513 if (leaf->map_token) {
3514 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3515 leaf->map_token = NULL;
3516 }
3517
3518 /* shift the items */ 3495 /* shift the items */
3519 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), 3496 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
3520 btrfs_item_nr_offset(slot), 3497 btrfs_item_nr_offset(slot),
@@ -3608,27 +3585,13 @@ int setup_items_for_insert(struct btrfs_trans_handle *trans,
3608 * item0..itemN ... dataN.offset..dataN.size .. data0.size 3585 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3609 */ 3586 */
3610 /* first correct the data pointers */ 3587 /* first correct the data pointers */
3611 WARN_ON(leaf->map_token);
3612 for (i = slot; i < nritems; i++) { 3588 for (i = slot; i < nritems; i++) {
3613 u32 ioff; 3589 u32 ioff;
3614 3590
3615 item = btrfs_item_nr(leaf, i); 3591 item = btrfs_item_nr(leaf, i);
3616 if (!leaf->map_token) {
3617 map_extent_buffer(leaf, (unsigned long)item,
3618 sizeof(struct btrfs_item),
3619 &leaf->map_token, &leaf->kaddr,
3620 &leaf->map_start, &leaf->map_len,
3621 KM_USER1);
3622 }
3623
3624 ioff = btrfs_item_offset(leaf, item); 3592 ioff = btrfs_item_offset(leaf, item);
3625 btrfs_set_item_offset(leaf, item, ioff - total_data); 3593 btrfs_set_item_offset(leaf, item, ioff - total_data);
3626 } 3594 }
3627 if (leaf->map_token) {
3628 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3629 leaf->map_token = NULL;
3630 }
3631
3632 /* shift the items */ 3595 /* shift the items */
3633 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), 3596 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
3634 btrfs_item_nr_offset(slot), 3597 btrfs_item_nr_offset(slot),
@@ -3840,22 +3803,10 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3840 u32 ioff; 3803 u32 ioff;
3841 3804
3842 item = btrfs_item_nr(leaf, i); 3805 item = btrfs_item_nr(leaf, i);
3843 if (!leaf->map_token) {
3844 map_extent_buffer(leaf, (unsigned long)item,
3845 sizeof(struct btrfs_item),
3846 &leaf->map_token, &leaf->kaddr,
3847 &leaf->map_start, &leaf->map_len,
3848 KM_USER1);
3849 }
3850 ioff = btrfs_item_offset(leaf, item); 3806 ioff = btrfs_item_offset(leaf, item);
3851 btrfs_set_item_offset(leaf, item, ioff + dsize); 3807 btrfs_set_item_offset(leaf, item, ioff + dsize);
3852 } 3808 }
3853 3809
3854 if (leaf->map_token) {
3855 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3856 leaf->map_token = NULL;
3857 }
3858
3859 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot), 3810 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
3860 btrfs_item_nr_offset(slot + nr), 3811 btrfs_item_nr_offset(slot + nr),
3861 sizeof(struct btrfs_item) * 3812 sizeof(struct btrfs_item) *
@@ -4004,11 +3955,11 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
4004 3955
4005 WARN_ON(!path->keep_locks); 3956 WARN_ON(!path->keep_locks);
4006again: 3957again:
4007 cur = btrfs_lock_root_node(root); 3958 cur = btrfs_read_lock_root_node(root);
4008 level = btrfs_header_level(cur); 3959 level = btrfs_header_level(cur);
4009 WARN_ON(path->nodes[level]); 3960 WARN_ON(path->nodes[level]);
4010 path->nodes[level] = cur; 3961 path->nodes[level] = cur;
4011 path->locks[level] = 1; 3962 path->locks[level] = BTRFS_READ_LOCK;
4012 3963
4013 if (btrfs_header_generation(cur) < min_trans) { 3964 if (btrfs_header_generation(cur) < min_trans) {
4014 ret = 1; 3965 ret = 1;
@@ -4098,12 +4049,12 @@ find_next_key:
4098 cur = read_node_slot(root, cur, slot); 4049 cur = read_node_slot(root, cur, slot);
4099 BUG_ON(!cur); 4050 BUG_ON(!cur);
4100 4051
4101 btrfs_tree_lock(cur); 4052 btrfs_tree_read_lock(cur);
4102 4053
4103 path->locks[level - 1] = 1; 4054 path->locks[level - 1] = BTRFS_READ_LOCK;
4104 path->nodes[level - 1] = cur; 4055 path->nodes[level - 1] = cur;
4105 unlock_up(path, level, 1); 4056 unlock_up(path, level, 1);
4106 btrfs_clear_path_blocking(path, NULL); 4057 btrfs_clear_path_blocking(path, NULL, 0);
4107 } 4058 }
4108out: 4059out:
4109 if (ret == 0) 4060 if (ret == 0)
@@ -4218,30 +4169,21 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
4218 u32 nritems; 4169 u32 nritems;
4219 int ret; 4170 int ret;
4220 int old_spinning = path->leave_spinning; 4171 int old_spinning = path->leave_spinning;
4221 int force_blocking = 0; 4172 int next_rw_lock = 0;
4222 4173
4223 nritems = btrfs_header_nritems(path->nodes[0]); 4174 nritems = btrfs_header_nritems(path->nodes[0]);
4224 if (nritems == 0) 4175 if (nritems == 0)
4225 return 1; 4176 return 1;
4226 4177
4227 /*
4228 * we take the blocks in an order that upsets lockdep. Using
4229 * blocking mode is the only way around it.
4230 */
4231#ifdef CONFIG_DEBUG_LOCK_ALLOC
4232 force_blocking = 1;
4233#endif
4234
4235 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); 4178 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
4236again: 4179again:
4237 level = 1; 4180 level = 1;
4238 next = NULL; 4181 next = NULL;
4182 next_rw_lock = 0;
4239 btrfs_release_path(path); 4183 btrfs_release_path(path);
4240 4184
4241 path->keep_locks = 1; 4185 path->keep_locks = 1;
4242 4186 path->leave_spinning = 1;
4243 if (!force_blocking)
4244 path->leave_spinning = 1;
4245 4187
4246 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4188 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4247 path->keep_locks = 0; 4189 path->keep_locks = 0;
@@ -4281,11 +4223,12 @@ again:
4281 } 4223 }
4282 4224
4283 if (next) { 4225 if (next) {
4284 btrfs_tree_unlock(next); 4226 btrfs_tree_unlock_rw(next, next_rw_lock);
4285 free_extent_buffer(next); 4227 free_extent_buffer(next);
4286 } 4228 }
4287 4229
4288 next = c; 4230 next = c;
4231 next_rw_lock = path->locks[level];
4289 ret = read_block_for_search(NULL, root, path, &next, level, 4232 ret = read_block_for_search(NULL, root, path, &next, level,
4290 slot, &key); 4233 slot, &key);
4291 if (ret == -EAGAIN) 4234 if (ret == -EAGAIN)
@@ -4297,15 +4240,14 @@ again:
4297 } 4240 }
4298 4241
4299 if (!path->skip_locking) { 4242 if (!path->skip_locking) {
4300 ret = btrfs_try_spin_lock(next); 4243 ret = btrfs_try_tree_read_lock(next);
4301 if (!ret) { 4244 if (!ret) {
4302 btrfs_set_path_blocking(path); 4245 btrfs_set_path_blocking(path);
4303 btrfs_tree_lock(next); 4246 btrfs_tree_read_lock(next);
4304 if (!force_blocking) 4247 btrfs_clear_path_blocking(path, next,
4305 btrfs_clear_path_blocking(path, next); 4248 BTRFS_READ_LOCK);
4306 } 4249 }
4307 if (force_blocking) 4250 next_rw_lock = BTRFS_READ_LOCK;
4308 btrfs_set_lock_blocking(next);
4309 } 4251 }
4310 break; 4252 break;
4311 } 4253 }
@@ -4314,14 +4256,13 @@ again:
4314 level--; 4256 level--;
4315 c = path->nodes[level]; 4257 c = path->nodes[level];
4316 if (path->locks[level]) 4258 if (path->locks[level])
4317 btrfs_tree_unlock(c); 4259 btrfs_tree_unlock_rw(c, path->locks[level]);
4318 4260
4319 free_extent_buffer(c); 4261 free_extent_buffer(c);
4320 path->nodes[level] = next; 4262 path->nodes[level] = next;
4321 path->slots[level] = 0; 4263 path->slots[level] = 0;
4322 if (!path->skip_locking) 4264 if (!path->skip_locking)
4323 path->locks[level] = 1; 4265 path->locks[level] = next_rw_lock;
4324
4325 if (!level) 4266 if (!level)
4326 break; 4267 break;
4327 4268
@@ -4336,16 +4277,14 @@ again:
4336 } 4277 }
4337 4278
4338 if (!path->skip_locking) { 4279 if (!path->skip_locking) {
4339 btrfs_assert_tree_locked(path->nodes[level]); 4280 ret = btrfs_try_tree_read_lock(next);
4340 ret = btrfs_try_spin_lock(next);
4341 if (!ret) { 4281 if (!ret) {
4342 btrfs_set_path_blocking(path); 4282 btrfs_set_path_blocking(path);
4343 btrfs_tree_lock(next); 4283 btrfs_tree_read_lock(next);
4344 if (!force_blocking) 4284 btrfs_clear_path_blocking(path, next,
4345 btrfs_clear_path_blocking(path, next); 4285 BTRFS_READ_LOCK);
4346 } 4286 }
4347 if (force_blocking) 4287 next_rw_lock = BTRFS_READ_LOCK;
4348 btrfs_set_lock_blocking(next);
4349 } 4288 }
4350 } 4289 }
4351 ret = 0; 4290 ret = 0;
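Both descent sites in btrfs_next_leaf use the same acquisition fallback, which also appears in btrfs_search_slot above: try the spinning read lock first and, only on contention, flip the already-held path locks to blocking before sleeping on the lock. In isolation:

if (!btrfs_try_tree_read_lock(next)) {
	btrfs_set_path_blocking(path);	/* waiters on our locks may sleep */
	btrfs_tree_read_lock(next);	/* blocking acquisition */
	btrfs_clear_path_blocking(path, next, BTRFS_READ_LOCK);
}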
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index fe9287b06496..03912c5c6f49 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -755,6 +755,8 @@ struct btrfs_space_info {
755 chunks for this space */ 755 chunks for this space */
756 unsigned int chunk_alloc:1; /* set if we are allocating a chunk */ 756 unsigned int chunk_alloc:1; /* set if we are allocating a chunk */
757 757
758 unsigned int flush:1; /* set if we are trying to make space */
759
758 unsigned int force_alloc; /* set if we need to force a chunk 760 unsigned int force_alloc; /* set if we need to force a chunk
759 alloc for this space */ 761 alloc for this space */
760 762
@@ -764,7 +766,7 @@ struct btrfs_space_info {
764 struct list_head block_groups[BTRFS_NR_RAID_TYPES]; 766 struct list_head block_groups[BTRFS_NR_RAID_TYPES];
765 spinlock_t lock; 767 spinlock_t lock;
766 struct rw_semaphore groups_sem; 768 struct rw_semaphore groups_sem;
767 atomic_t caching_threads; 769 wait_queue_head_t wait;
768}; 770};
769 771
770struct btrfs_block_rsv { 772struct btrfs_block_rsv {
@@ -824,6 +826,7 @@ struct btrfs_caching_control {
824 struct list_head list; 826 struct list_head list;
825 struct mutex mutex; 827 struct mutex mutex;
826 wait_queue_head_t wait; 828 wait_queue_head_t wait;
829 struct btrfs_work work;
827 struct btrfs_block_group_cache *block_group; 830 struct btrfs_block_group_cache *block_group;
828 u64 progress; 831 u64 progress;
829 atomic_t count; 832 atomic_t count;
@@ -1032,6 +1035,8 @@ struct btrfs_fs_info {
1032 struct btrfs_workers endio_write_workers; 1035 struct btrfs_workers endio_write_workers;
1033 struct btrfs_workers endio_freespace_worker; 1036 struct btrfs_workers endio_freespace_worker;
1034 struct btrfs_workers submit_workers; 1037 struct btrfs_workers submit_workers;
1038 struct btrfs_workers caching_workers;
1039
1035 /* 1040 /*
1036 * fixup workers take dirty pages that didn't properly go through 1041 * fixup workers take dirty pages that didn't properly go through
1037 * the cow mechanism and make them safe to write. It happens 1042 * the cow mechanism and make them safe to write. It happens
@@ -1410,17 +1415,15 @@ void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val);
1410#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ 1415#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \
1411static inline u##bits btrfs_##name(struct extent_buffer *eb) \ 1416static inline u##bits btrfs_##name(struct extent_buffer *eb) \
1412{ \ 1417{ \
1413 type *p = kmap_atomic(eb->first_page, KM_USER0); \ 1418 type *p = page_address(eb->first_page); \
1414 u##bits res = le##bits##_to_cpu(p->member); \ 1419 u##bits res = le##bits##_to_cpu(p->member); \
1415 kunmap_atomic(p, KM_USER0); \
1416 return res; \ 1420 return res; \
1417} \ 1421} \
1418static inline void btrfs_set_##name(struct extent_buffer *eb, \ 1422static inline void btrfs_set_##name(struct extent_buffer *eb, \
1419 u##bits val) \ 1423 u##bits val) \
1420{ \ 1424{ \
1421 type *p = kmap_atomic(eb->first_page, KM_USER0); \ 1425 type *p = page_address(eb->first_page); \
1422 p->member = cpu_to_le##bits(val); \ 1426 p->member = cpu_to_le##bits(val); \
1423 kunmap_atomic(p, KM_USER0); \
1424} 1427}
1425 1428
1426#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \ 1429#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \
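Replacing kmap_atomic with page_address here assumes btrfs metadata pages are never allocated from highmem, so the page always has a permanent kernel mapping and no temporary KM_USER0 window (with its preemption restrictions) is needed. The accessor after the change, in outline (struct header stands in for the macro-generated type):

static inline u32 read_member(struct extent_buffer *eb)
{
	/* valid only for lowmem pages: no kmap/kunmap bracket required */
	struct header *p = page_address(eb->first_page);

	return le32_to_cpu(p->member);
}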
@@ -2128,7 +2131,7 @@ static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
2128 2131
2129/* extent-tree.c */ 2132/* extent-tree.c */
2130static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root, 2133static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
2131 int num_items) 2134 unsigned num_items)
2132{ 2135{
2133 return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) * 2136 return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
2134 3 * num_items; 2137 3 * num_items;
@@ -2222,9 +2225,6 @@ void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
2222void btrfs_clear_space_info_full(struct btrfs_fs_info *info); 2225void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
2223int btrfs_check_data_free_space(struct inode *inode, u64 bytes); 2226int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
2224void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); 2227void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
2225int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
2226 struct btrfs_root *root,
2227 int num_items);
2228void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, 2228void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
2229 struct btrfs_root *root); 2229 struct btrfs_root *root);
2230int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, 2230int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
@@ -2330,7 +2330,7 @@ struct btrfs_path *btrfs_alloc_path(void);
2330void btrfs_free_path(struct btrfs_path *p); 2330void btrfs_free_path(struct btrfs_path *p);
2331void btrfs_set_path_blocking(struct btrfs_path *p); 2331void btrfs_set_path_blocking(struct btrfs_path *p);
2332void btrfs_clear_path_blocking(struct btrfs_path *p, 2332void btrfs_clear_path_blocking(struct btrfs_path *p,
2333 struct extent_buffer *held); 2333 struct extent_buffer *held, int held_rw);
2334void btrfs_unlock_up_safe(struct btrfs_path *p, int level); 2334void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
2335 2335
2336int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2336int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
@@ -2365,8 +2365,8 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
2365int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); 2365int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
2366int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); 2366int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
2367int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); 2367int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
2368int btrfs_drop_snapshot(struct btrfs_root *root, 2368void btrfs_drop_snapshot(struct btrfs_root *root,
2369 struct btrfs_block_rsv *block_rsv, int update_ref); 2369 struct btrfs_block_rsv *block_rsv, int update_ref);
2370int btrfs_drop_subtree(struct btrfs_trans_handle *trans, 2370int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
2371 struct btrfs_root *root, 2371 struct btrfs_root *root,
2372 struct extent_buffer *node, 2372 struct extent_buffer *node,
@@ -2404,8 +2404,8 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
2404 btrfs_root_item *item, struct btrfs_key *key); 2404 btrfs_root_item *item, struct btrfs_key *key);
2405int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid); 2405int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid);
2406int btrfs_find_orphan_roots(struct btrfs_root *tree_root); 2406int btrfs_find_orphan_roots(struct btrfs_root *tree_root);
2407int btrfs_set_root_node(struct btrfs_root_item *item, 2407void btrfs_set_root_node(struct btrfs_root_item *item,
2408 struct extent_buffer *node); 2408 struct extent_buffer *node);
2409void btrfs_check_and_init_root_item(struct btrfs_root_item *item); 2409void btrfs_check_and_init_root_item(struct btrfs_root_item *item);
2410 2410
2411/* dir-item.c */ 2411/* dir-item.c */
@@ -2521,6 +2521,14 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag
2521#define PageChecked PageFsMisc 2521#define PageChecked PageFsMisc
2522#endif 2522#endif
2523 2523
2524/* This forces readahead on a given range of bytes in an inode */
2525static inline void btrfs_force_ra(struct address_space *mapping,
2526 struct file_ra_state *ra, struct file *file,
2527 pgoff_t offset, unsigned long req_size)
2528{
2529 page_cache_sync_readahead(mapping, ra, file, offset, req_size);
2530}
2531
2524struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); 2532struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry);
2525int btrfs_set_inode_index(struct inode *dir, u64 *index); 2533int btrfs_set_inode_index(struct inode *dir, u64 *index);
2526int btrfs_unlink_inode(struct btrfs_trans_handle *trans, 2534int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
@@ -2549,9 +2557,6 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
2549int btrfs_merge_bio_hook(struct page *page, unsigned long offset, 2557int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
2550 size_t size, struct bio *bio, unsigned long bio_flags); 2558 size_t size, struct bio *bio, unsigned long bio_flags);
2551 2559
2552unsigned long btrfs_force_ra(struct address_space *mapping,
2553 struct file_ra_state *ra, struct file *file,
2554 pgoff_t offset, pgoff_t last_index);
2555int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); 2560int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
2556int btrfs_readpage(struct file *file, struct page *page); 2561int btrfs_readpage(struct file *file, struct page *page);
2557void btrfs_evict_inode(struct inode *inode); 2562void btrfs_evict_inode(struct inode *inode);
@@ -2646,12 +2651,21 @@ do { \
2646/* acl.c */ 2651/* acl.c */
2647#ifdef CONFIG_BTRFS_FS_POSIX_ACL 2652#ifdef CONFIG_BTRFS_FS_POSIX_ACL
2648struct posix_acl *btrfs_get_acl(struct inode *inode, int type); 2653struct posix_acl *btrfs_get_acl(struct inode *inode, int type);
2649#else
2650#define btrfs_get_acl NULL
2651#endif
2652int btrfs_init_acl(struct btrfs_trans_handle *trans, 2654int btrfs_init_acl(struct btrfs_trans_handle *trans,
2653 struct inode *inode, struct inode *dir); 2655 struct inode *inode, struct inode *dir);
2654int btrfs_acl_chmod(struct inode *inode); 2656int btrfs_acl_chmod(struct inode *inode);
2657#else
2658#define btrfs_get_acl NULL
2659static inline int btrfs_init_acl(struct btrfs_trans_handle *trans,
2660 struct inode *inode, struct inode *dir)
2661{
2662 return 0;
2663}
2664static inline int btrfs_acl_chmod(struct inode *inode)
2665{
2666 return 0;
2667}
2668#endif
2655 2669
2656/* relocation.c */ 2670/* relocation.c */
2657int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start); 2671int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 98c68e658a9b..b52c672f4c18 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -735,7 +735,7 @@ static int btrfs_batch_insert_items(struct btrfs_trans_handle *trans,
735 } 735 }
736 736
 737 /* reset all the locked nodes in the path to spinning locks. */ 737 /* reset all the locked nodes in the path to spinning locks. */
738 btrfs_clear_path_blocking(path, NULL); 738 btrfs_clear_path_blocking(path, NULL, 0);
739 739
740 /* insert the keys of the items */ 740 /* insert the keys of the items */
741 ret = setup_items_for_insert(trans, root, path, keys, data_size, 741 ret = setup_items_for_insert(trans, root, path, keys, data_size,
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 8d27af4bd8b9..7083d08b2a21 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -25,7 +25,7 @@
25#include <linux/mutex.h> 25#include <linux/mutex.h>
26#include <linux/list.h> 26#include <linux/list.h>
27#include <linux/wait.h> 27#include <linux/wait.h>
28#include <asm/atomic.h> 28#include <linux/atomic.h>
29 29
30#include "ctree.h" 30#include "ctree.h"
31 31
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 685f2593c4f0..31d84e78129b 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -89,13 +89,8 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
89 data_size = sizeof(*dir_item) + name_len + data_len; 89 data_size = sizeof(*dir_item) + name_len + data_len;
90 dir_item = insert_with_overflow(trans, root, path, &key, data_size, 90 dir_item = insert_with_overflow(trans, root, path, &key, data_size,
91 name, name_len); 91 name, name_len);
92 /* 92 if (IS_ERR(dir_item))
93 * FIXME: at some point we should handle xattr's that are larger than 93 return PTR_ERR(dir_item);
94 * what we can fit in our leaf. We set location to NULL b/c we arent
95 * pointing at anything else, that will change if we store the xattr
96 * data in a separate inode.
97 */
98 BUG_ON(IS_ERR(dir_item));
99 memset(&location, 0, sizeof(location)); 94 memset(&location, 0, sizeof(location));
100 95
101 leaf = path->nodes[0]; 96 leaf = path->nodes[0];
@@ -203,8 +198,6 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
203 struct btrfs_key key; 198 struct btrfs_key key;
204 int ins_len = mod < 0 ? -1 : 0; 199 int ins_len = mod < 0 ? -1 : 0;
205 int cow = mod != 0; 200 int cow = mod != 0;
206 struct btrfs_key found_key;
207 struct extent_buffer *leaf;
208 201
209 key.objectid = dir; 202 key.objectid = dir;
210 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); 203 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
@@ -214,18 +207,7 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
214 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); 207 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
215 if (ret < 0) 208 if (ret < 0)
216 return ERR_PTR(ret); 209 return ERR_PTR(ret);
217 if (ret > 0) { 210 if (ret > 0)
218 if (path->slots[0] == 0)
219 return NULL;
220 path->slots[0]--;
221 }
222
223 leaf = path->nodes[0];
224 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
225
226 if (found_key.objectid != dir ||
227 btrfs_key_type(&found_key) != BTRFS_DIR_ITEM_KEY ||
228 found_key.offset != key.offset)
229 return NULL; 211 return NULL;
230 212
231 return btrfs_match_dir_item_name(root, path, name, name_len); 213 return btrfs_match_dir_item_name(root, path, name, name_len);
@@ -320,8 +302,6 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
320 struct btrfs_key key; 302 struct btrfs_key key;
321 int ins_len = mod < 0 ? -1 : 0; 303 int ins_len = mod < 0 ? -1 : 0;
322 int cow = mod != 0; 304 int cow = mod != 0;
323 struct btrfs_key found_key;
324 struct extent_buffer *leaf;
325 305
326 key.objectid = dir; 306 key.objectid = dir;
327 btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); 307 btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
@@ -329,18 +309,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
329 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); 309 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
330 if (ret < 0) 310 if (ret < 0)
331 return ERR_PTR(ret); 311 return ERR_PTR(ret);
332 if (ret > 0) { 312 if (ret > 0)
333 if (path->slots[0] == 0)
334 return NULL;
335 path->slots[0]--;
336 }
337
338 leaf = path->nodes[0];
339 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
340
341 if (found_key.objectid != dir ||
342 btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY ||
343 found_key.offset != key.offset)
344 return NULL; 313 return NULL;
345 314
346 return btrfs_match_dir_item_name(root, path, name, name_len); 315 return btrfs_match_dir_item_name(root, path, name, name_len);
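Both lookups shed the back-up-one-slot dance because dir items and xattr items are searched with a fully specified key (objectid, BTRFS_DIR_ITEM_KEY or BTRFS_XATTR_ITEM_KEY, offset = name hash): if btrfs_search_slot returns a positive value the exact key is simply absent, so re-reading and comparing the previous slot's key could never succeed. The surviving shape:

ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
if (ret < 0)
	return ERR_PTR(ret);	/* real error */
if (ret > 0)
	return NULL;		/* exact key not present */
return btrfs_match_dir_item_name(root, path, name, name_len);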
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b231ae13b269..07b3ac662e19 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -100,38 +100,83 @@ struct async_submit_bio {
100 struct btrfs_work work; 100 struct btrfs_work work;
101}; 101};
102 102
103/* These are used to set the lockdep class on the extent buffer locks. 103/*
104 * The class is set by the readpage_end_io_hook after the buffer has 104 * Lockdep class keys for extent_buffer->lock's in this root. For a given
105 * passed csum validation but before the pages are unlocked. 105 * eb, the lockdep key is determined by the btrfs_root it belongs to and
106 * the level the eb occupies in the tree.
107 *
108 * Different roots are used for different purposes and may nest inside each
109 * other and they require separate keysets. As lockdep keys should be
110 * static, assign keysets according to the purpose of the root as indicated
111 * by btrfs_root->objectid. This ensures that all special purpose roots
112 * have separate keysets.
106 * 113 *
107 * The lockdep class is also set by btrfs_init_new_buffer on freshly 114 * Lock-nesting across peer nodes is always done with the immediate parent
108 * allocated blocks. 115 * node locked thus preventing deadlock. As lockdep doesn't know this, use
116 * subclass to avoid triggering lockdep warning in such cases.
109 * 117 *
110 * The class is based on the level in the tree block, which allows lockdep 118 * The key is set by the readpage_end_io_hook after the buffer has passed
111 * to know that lower nodes nest inside the locks of higher nodes. 119 * csum validation but before the pages are unlocked. It is also set by
120 * btrfs_init_new_buffer on freshly allocated blocks.
112 * 121 *
113 * We also add a check to make sure the highest level of the tree is 122 * We also add a check to make sure the highest level of the tree is the
114 * the same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this 123 * same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this code
115 * code needs update as well. 124 * needs update as well.
116 */ 125 */
117#ifdef CONFIG_DEBUG_LOCK_ALLOC 126#ifdef CONFIG_DEBUG_LOCK_ALLOC
118# if BTRFS_MAX_LEVEL != 8 127# if BTRFS_MAX_LEVEL != 8
119# error 128# error
120# endif 129# endif
121static struct lock_class_key btrfs_eb_class[BTRFS_MAX_LEVEL + 1]; 130
122static const char *btrfs_eb_name[BTRFS_MAX_LEVEL + 1] = { 131static struct btrfs_lockdep_keyset {
123 /* leaf */ 132 u64 id; /* root objectid */
124 "btrfs-extent-00", 133 const char *name_stem; /* lock name stem */
125 "btrfs-extent-01", 134 char names[BTRFS_MAX_LEVEL + 1][20];
126 "btrfs-extent-02", 135 struct lock_class_key keys[BTRFS_MAX_LEVEL + 1];
127 "btrfs-extent-03", 136} btrfs_lockdep_keysets[] = {
128 "btrfs-extent-04", 137 { .id = BTRFS_ROOT_TREE_OBJECTID, .name_stem = "root" },
129 "btrfs-extent-05", 138 { .id = BTRFS_EXTENT_TREE_OBJECTID, .name_stem = "extent" },
130 "btrfs-extent-06", 139 { .id = BTRFS_CHUNK_TREE_OBJECTID, .name_stem = "chunk" },
131 "btrfs-extent-07", 140 { .id = BTRFS_DEV_TREE_OBJECTID, .name_stem = "dev" },
132 /* highest possible level */ 141 { .id = BTRFS_FS_TREE_OBJECTID, .name_stem = "fs" },
133 "btrfs-extent-08", 142 { .id = BTRFS_CSUM_TREE_OBJECTID, .name_stem = "csum" },
143 { .id = BTRFS_ORPHAN_OBJECTID, .name_stem = "orphan" },
144 { .id = BTRFS_TREE_LOG_OBJECTID, .name_stem = "log" },
145 { .id = BTRFS_TREE_RELOC_OBJECTID, .name_stem = "treloc" },
146 { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, .name_stem = "dreloc" },
147 { .id = 0, .name_stem = "tree" },
134}; 148};
149
150void __init btrfs_init_lockdep(void)
151{
152 int i, j;
153
154 /* initialize lockdep class names */
155 for (i = 0; i < ARRAY_SIZE(btrfs_lockdep_keysets); i++) {
156 struct btrfs_lockdep_keyset *ks = &btrfs_lockdep_keysets[i];
157
158 for (j = 0; j < ARRAY_SIZE(ks->names); j++)
159 snprintf(ks->names[j], sizeof(ks->names[j]),
160 "btrfs-%s-%02d", ks->name_stem, j);
161 }
162}
163
164void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
165 int level)
166{
167 struct btrfs_lockdep_keyset *ks;
168
169 BUG_ON(level >= ARRAY_SIZE(ks->keys));
170
171 /* find the matching keyset, id 0 is the default entry */
172 for (ks = btrfs_lockdep_keysets; ks->id; ks++)
173 if (ks->id == objectid)
174 break;
175
176 lockdep_set_class_and_name(&eb->lock,
177 &ks->keys[level], ks->names[level]);
178}
179
135#endif 180#endif
136 181
137/* 182/*
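
The table above trades the old flat per-level key array for one keyset per
special-purpose root, with the lock names generated once at init time. A
quick user-space check of what btrfs_init_lockdep() produces (a sketch; the
stems are copied from the keyset table, so e.g. "btrfs-extent-03" is the
extent tree at level 3):

    #include <stdio.h>

    int main(void)
    {
            const char *stems[] = { "root", "extent", "chunk", "dev", "fs",
                                    "csum", "orphan", "log", "treloc",
                                    "dreloc", "tree" };
            char name[20];
            int i, j;

            for (i = 0; i < 11; i++)
                    for (j = 0; j <= 8; j++) {  /* BTRFS_MAX_LEVEL == 8 */
                            snprintf(name, sizeof(name), "btrfs-%s-%02d",
                                     stems[i], j);
                            puts(name);
                    }
            return 0;
    }
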
@@ -217,7 +262,6 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
217 unsigned long len; 262 unsigned long len;
218 unsigned long cur_len; 263 unsigned long cur_len;
219 unsigned long offset = BTRFS_CSUM_SIZE; 264 unsigned long offset = BTRFS_CSUM_SIZE;
220 char *map_token = NULL;
221 char *kaddr; 265 char *kaddr;
222 unsigned long map_start; 266 unsigned long map_start;
223 unsigned long map_len; 267 unsigned long map_len;
@@ -228,8 +272,7 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
228 len = buf->len - offset; 272 len = buf->len - offset;
229 while (len > 0) { 273 while (len > 0) {
230 err = map_private_extent_buffer(buf, offset, 32, 274 err = map_private_extent_buffer(buf, offset, 32,
231 &map_token, &kaddr, 275 &kaddr, &map_start, &map_len);
232 &map_start, &map_len, KM_USER0);
233 if (err) 276 if (err)
234 return 1; 277 return 1;
235 cur_len = min(len, map_len - (offset - map_start)); 278 cur_len = min(len, map_len - (offset - map_start));
@@ -237,7 +280,6 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
237 crc, cur_len); 280 crc, cur_len);
238 len -= cur_len; 281 len -= cur_len;
239 offset += cur_len; 282 offset += cur_len;
240 unmap_extent_buffer(buf, map_token, KM_USER0);
241 } 283 }
242 if (csum_size > sizeof(inline_result)) { 284 if (csum_size > sizeof(inline_result)) {
243 result = kzalloc(csum_size * sizeof(char), GFP_NOFS); 285 result = kzalloc(csum_size * sizeof(char), GFP_NOFS);
@@ -494,15 +536,6 @@ static noinline int check_leaf(struct btrfs_root *root,
494 return 0; 536 return 0;
495} 537}
496 538
497#ifdef CONFIG_DEBUG_LOCK_ALLOC
498void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level)
499{
500 lockdep_set_class_and_name(&eb->lock,
501 &btrfs_eb_class[level],
502 btrfs_eb_name[level]);
503}
504#endif
505
506static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, 539static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
507 struct extent_state *state) 540 struct extent_state *state)
508{ 541{
@@ -553,7 +586,8 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
553 } 586 }
554 found_level = btrfs_header_level(eb); 587 found_level = btrfs_header_level(eb);
555 588
556 btrfs_set_buffer_lockdep_class(eb, found_level); 589 btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb),
590 eb, found_level);
557 591
558 ret = csum_tree_block(root, eb, 1); 592 ret = csum_tree_block(root, eb, 1);
559 if (ret) { 593 if (ret) {
@@ -1598,7 +1632,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1598 goto fail_bdi; 1632 goto fail_bdi;
1599 } 1633 }
1600 1634
1601 fs_info->btree_inode->i_mapping->flags &= ~__GFP_FS; 1635 mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
1602 1636
1603 INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); 1637 INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
1604 INIT_LIST_HEAD(&fs_info->trans_list); 1638 INIT_LIST_HEAD(&fs_info->trans_list);
@@ -1802,6 +1836,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1802 fs_info->thread_pool_size), 1836 fs_info->thread_pool_size),
1803 &fs_info->generic_worker); 1837 &fs_info->generic_worker);
1804 1838
1839 btrfs_init_workers(&fs_info->caching_workers, "cache",
1840 2, &fs_info->generic_worker);
1841
1805 /* a higher idle thresh on the submit workers makes it much more 1842 /* a higher idle thresh on the submit workers makes it much more
1806 * likely that bios will be sent down in a sane order to the 1843
1807 * devices 1844 * devices
@@ -1855,6 +1892,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1855 btrfs_start_workers(&fs_info->endio_write_workers, 1); 1892 btrfs_start_workers(&fs_info->endio_write_workers, 1);
1856 btrfs_start_workers(&fs_info->endio_freespace_worker, 1); 1893 btrfs_start_workers(&fs_info->endio_freespace_worker, 1);
1857 btrfs_start_workers(&fs_info->delayed_workers, 1); 1894 btrfs_start_workers(&fs_info->delayed_workers, 1);
1895 btrfs_start_workers(&fs_info->caching_workers, 1);
1858 1896
1859 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); 1897 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
1860 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, 1898 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -2112,6 +2150,7 @@ fail_sb_buffer:
2112 btrfs_stop_workers(&fs_info->endio_freespace_worker); 2150 btrfs_stop_workers(&fs_info->endio_freespace_worker);
2113 btrfs_stop_workers(&fs_info->submit_workers); 2151 btrfs_stop_workers(&fs_info->submit_workers);
2114 btrfs_stop_workers(&fs_info->delayed_workers); 2152 btrfs_stop_workers(&fs_info->delayed_workers);
2153 btrfs_stop_workers(&fs_info->caching_workers);
2115fail_alloc: 2154fail_alloc:
2116 kfree(fs_info->delayed_root); 2155 kfree(fs_info->delayed_root);
2117fail_iput: 2156fail_iput:
@@ -2577,6 +2616,7 @@ int close_ctree(struct btrfs_root *root)
2577 btrfs_stop_workers(&fs_info->endio_freespace_worker); 2616 btrfs_stop_workers(&fs_info->endio_freespace_worker);
2578 btrfs_stop_workers(&fs_info->submit_workers); 2617 btrfs_stop_workers(&fs_info->submit_workers);
2579 btrfs_stop_workers(&fs_info->delayed_workers); 2618 btrfs_stop_workers(&fs_info->delayed_workers);
2619 btrfs_stop_workers(&fs_info->caching_workers);
2580 2620
2581 btrfs_close_devices(fs_info->fs_devices); 2621 btrfs_close_devices(fs_info->fs_devices);
2582 btrfs_mapping_tree_free(&fs_info->mapping_tree); 2622 btrfs_mapping_tree_free(&fs_info->mapping_tree);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index a0b610a67aae..bec3ea4bd67f 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -87,10 +87,14 @@ int btree_lock_page_hook(struct page *page);
87 87
88 88
89#ifdef CONFIG_DEBUG_LOCK_ALLOC 89#ifdef CONFIG_DEBUG_LOCK_ALLOC
90void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level); 90void btrfs_init_lockdep(void);
91void btrfs_set_buffer_lockdep_class(u64 objectid,
92 struct extent_buffer *eb, int level);
91#else 93#else
92static inline void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, 94static inline void btrfs_init_lockdep(void)
93 int level) 95{ }
96static inline void btrfs_set_buffer_lockdep_class(u64 objectid,
97 struct extent_buffer *eb, int level)
94{ 98{
95} 99}
96#endif 100#endif
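
With the header change both helpers gain a root-objectid parameter and an
init hook, and the !CONFIG_DEBUG_LOCK_ALLOC stubs keep callers
unconditional. The two call sites as they appear elsewhere in this diff
(read side in btree_readpage_end_io_hook, allocation side in
btrfs_init_new_buffer):

    btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb, found_level);
    btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
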
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 71cd456fdb60..f5be06a2462f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -320,12 +320,12 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
320 return total_added; 320 return total_added;
321} 321}
322 322
323static int caching_kthread(void *data) 323static noinline void caching_thread(struct btrfs_work *work)
324{ 324{
325 struct btrfs_block_group_cache *block_group = data; 325 struct btrfs_block_group_cache *block_group;
326 struct btrfs_fs_info *fs_info = block_group->fs_info; 326 struct btrfs_fs_info *fs_info;
327 struct btrfs_caching_control *caching_ctl = block_group->caching_ctl; 327 struct btrfs_caching_control *caching_ctl;
328 struct btrfs_root *extent_root = fs_info->extent_root; 328 struct btrfs_root *extent_root;
329 struct btrfs_path *path; 329 struct btrfs_path *path;
330 struct extent_buffer *leaf; 330 struct extent_buffer *leaf;
331 struct btrfs_key key; 331 struct btrfs_key key;
@@ -334,9 +334,14 @@ static int caching_kthread(void *data)
334 u32 nritems; 334 u32 nritems;
335 int ret = 0; 335 int ret = 0;
336 336
337 caching_ctl = container_of(work, struct btrfs_caching_control, work);
338 block_group = caching_ctl->block_group;
339 fs_info = block_group->fs_info;
340 extent_root = fs_info->extent_root;
341
337 path = btrfs_alloc_path(); 342 path = btrfs_alloc_path();
338 if (!path) 343 if (!path)
339 return -ENOMEM; 344 goto out;
340 345
341 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); 346 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
342 347
@@ -433,13 +438,11 @@ err:
433 free_excluded_extents(extent_root, block_group); 438 free_excluded_extents(extent_root, block_group);
434 439
435 mutex_unlock(&caching_ctl->mutex); 440 mutex_unlock(&caching_ctl->mutex);
441out:
436 wake_up(&caching_ctl->wait); 442 wake_up(&caching_ctl->wait);
437 443
438 put_caching_control(caching_ctl); 444 put_caching_control(caching_ctl);
439 atomic_dec(&block_group->space_info->caching_threads);
440 btrfs_put_block_group(block_group); 445 btrfs_put_block_group(block_group);
441
442 return 0;
443} 446}
444 447
445static int cache_block_group(struct btrfs_block_group_cache *cache, 448static int cache_block_group(struct btrfs_block_group_cache *cache,
@@ -449,7 +452,6 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
449{ 452{
450 struct btrfs_fs_info *fs_info = cache->fs_info; 453 struct btrfs_fs_info *fs_info = cache->fs_info;
451 struct btrfs_caching_control *caching_ctl; 454 struct btrfs_caching_control *caching_ctl;
452 struct task_struct *tsk;
453 int ret = 0; 455 int ret = 0;
454 456
455 smp_mb(); 457 smp_mb();
@@ -501,6 +503,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
501 caching_ctl->progress = cache->key.objectid; 503 caching_ctl->progress = cache->key.objectid;
502 /* one for caching kthread, one for caching block group list */ 504 /* one for caching kthread, one for caching block group list */
503 atomic_set(&caching_ctl->count, 2); 505 atomic_set(&caching_ctl->count, 2);
506 caching_ctl->work.func = caching_thread;
504 507
505 spin_lock(&cache->lock); 508 spin_lock(&cache->lock);
506 if (cache->cached != BTRFS_CACHE_NO) { 509 if (cache->cached != BTRFS_CACHE_NO) {
@@ -516,16 +519,9 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
516 list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); 519 list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
517 up_write(&fs_info->extent_commit_sem); 520 up_write(&fs_info->extent_commit_sem);
518 521
519 atomic_inc(&cache->space_info->caching_threads);
520 btrfs_get_block_group(cache); 522 btrfs_get_block_group(cache);
521 523
522 tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n", 524 btrfs_queue_worker(&fs_info->caching_workers, &caching_ctl->work);
523 cache->key.objectid);
524 if (IS_ERR(tsk)) {
525 ret = PTR_ERR(tsk);
526 printk(KERN_ERR "error running thread %d\n", ret);
527 BUG();
528 }
529 525
530 return ret; 526 return ret;
531} 527}
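
The kthread-per-block-group model is replaced with the standard btrfs
work-queue pattern: the work item is embedded in the caching control, so the
handler recovers its context with container_of() and no longer needs a data
pointer or a return value. The pattern, reduced to its two halves (a sketch
of the code in this hunk):

    /* consumer: recover the context from the embedded work item */
    static noinline void caching_thread(struct btrfs_work *work)
    {
            struct btrfs_caching_control *ctl =
                    container_of(work, struct btrfs_caching_control, work);
            /* ... scan the extent tree for ctl->block_group ... */
    }

    /* producer, from cache_block_group() */
    caching_ctl->work.func = caching_thread;
    btrfs_queue_worker(&fs_info->caching_workers, &caching_ctl->work);

This also removes the kthread_run() failure path, since queueing a work
item cannot fail the way spawning a thread can.
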
@@ -667,7 +663,9 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
667 struct btrfs_path *path; 663 struct btrfs_path *path;
668 664
669 path = btrfs_alloc_path(); 665 path = btrfs_alloc_path();
670 BUG_ON(!path); 666 if (!path)
667 return -ENOMEM;
668
671 key.objectid = start; 669 key.objectid = start;
672 key.offset = len; 670 key.offset = len;
673 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); 671 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
@@ -1784,6 +1782,9 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1784 1782
1785 1783
1786 for (i = 0; i < multi->num_stripes; i++, stripe++) { 1784 for (i = 0; i < multi->num_stripes; i++, stripe++) {
1785 if (!stripe->dev->can_discard)
1786 continue;
1787
1787 ret = btrfs_issue_discard(stripe->dev->bdev, 1788 ret = btrfs_issue_discard(stripe->dev->bdev,
1788 stripe->physical, 1789 stripe->physical,
1789 stripe->length); 1790 stripe->length);
@@ -1791,11 +1792,16 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1791 discarded_bytes += stripe->length; 1792 discarded_bytes += stripe->length;
1792 else if (ret != -EOPNOTSUPP) 1793 else if (ret != -EOPNOTSUPP)
1793 break; 1794 break;
1795
1796 /*
1797 * If we get back EOPNOTSUPP for some reason, ignore the
1798 * return value so we don't break the callers of
1799 * discard_extent.
1800 */
1801 ret = 0;
1794 } 1802 }
1795 kfree(multi); 1803 kfree(multi);
1796 } 1804 }
1797 if (discarded_bytes && ret == -EOPNOTSUPP)
1798 ret = 0;
1799 1805
1800 if (actual_bytes) 1806 if (actual_bytes)
1801 *actual_bytes = discarded_bytes; 1807 *actual_bytes = discarded_bytes;
@@ -2932,9 +2938,10 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2932 found->full = 0; 2938 found->full = 0;
2933 found->force_alloc = CHUNK_ALLOC_NO_FORCE; 2939 found->force_alloc = CHUNK_ALLOC_NO_FORCE;
2934 found->chunk_alloc = 0; 2940 found->chunk_alloc = 0;
2941 found->flush = 0;
2942 init_waitqueue_head(&found->wait);
2935 *space_info = found; 2943 *space_info = found;
2936 list_add_rcu(&found->list, &info->space_info); 2944 list_add_rcu(&found->list, &info->space_info);
2937 atomic_set(&found->caching_threads, 0);
2938 return 0; 2945 return 0;
2939} 2946}
2940 2947
@@ -3275,6 +3282,9 @@ again:
3275 } 3282 }
3276 3283
3277 ret = btrfs_alloc_chunk(trans, extent_root, flags); 3284 ret = btrfs_alloc_chunk(trans, extent_root, flags);
3285 if (ret < 0 && ret != -ENOSPC)
3286 goto out;
3287
3278 spin_lock(&space_info->lock); 3288 spin_lock(&space_info->lock);
3279 if (ret) 3289 if (ret)
3280 space_info->full = 1; 3290 space_info->full = 1;
@@ -3284,6 +3294,7 @@ again:
3284 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; 3294 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
3285 space_info->chunk_alloc = 0; 3295 space_info->chunk_alloc = 0;
3286 spin_unlock(&space_info->lock); 3296 spin_unlock(&space_info->lock);
3297out:
3287 mutex_unlock(&extent_root->fs_info->chunk_mutex); 3298 mutex_unlock(&extent_root->fs_info->chunk_mutex);
3288 return ret; 3299 return ret;
3289} 3300}
@@ -3314,6 +3325,14 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3314 if (reserved == 0) 3325 if (reserved == 0)
3315 return 0; 3326 return 0;
3316 3327
3328 smp_mb();
3329 if (root->fs_info->delalloc_bytes == 0) {
3330 if (trans)
3331 return 0;
3332 btrfs_wait_ordered_extents(root, 0, 0);
3333 return 0;
3334 }
3335
3317 max_reclaim = min(reserved, to_reclaim); 3336 max_reclaim = min(reserved, to_reclaim);
3318 3337
3319 while (loops < 1024) { 3338 while (loops < 1024) {
@@ -3356,6 +3375,8 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3356 } 3375 }
3357 3376
3358 } 3377 }
3378 if (reclaimed >= to_reclaim && !trans)
3379 btrfs_wait_ordered_extents(root, 0, 0);
3359 return reclaimed >= to_reclaim; 3380 return reclaimed >= to_reclaim;
3360} 3381}
3361 3382
@@ -3380,15 +3401,36 @@ static int reserve_metadata_bytes(struct btrfs_trans_handle *trans,
3380 u64 num_bytes = orig_bytes; 3401 u64 num_bytes = orig_bytes;
3381 int retries = 0; 3402 int retries = 0;
3382 int ret = 0; 3403 int ret = 0;
3383 bool reserved = false;
3384 bool committed = false; 3404 bool committed = false;
3405 bool flushing = false;
3385 3406
3386again: 3407again:
3387 ret = -ENOSPC; 3408 ret = 0;
3388 if (reserved)
3389 num_bytes = 0;
3390
3391 spin_lock(&space_info->lock); 3409 spin_lock(&space_info->lock);
3410 /*
3411 * We only want to wait if somebody other than us is flushing and we are
3412 * actually allowed to flush.
3413 */
3414 while (flush && !flushing && space_info->flush) {
3415 spin_unlock(&space_info->lock);
3416 /*
3417 * If we have a trans handle we can't wait because the flusher
3418 * may have to commit the transaction, which would mean we would
3419 * deadlock since we are waiting for the flusher to finish, but
3420 * hold the current transaction open.
3421 */
3422 if (trans)
3423 return -EAGAIN;
3424 ret = wait_event_interruptible(space_info->wait,
3425 !space_info->flush);
3426 /* Must have been interrupted, return */
3427 if (ret)
3428 return -EINTR;
3429
3430 spin_lock(&space_info->lock);
3431 }
3432
3433 ret = -ENOSPC;
3392 unused = space_info->bytes_used + space_info->bytes_reserved + 3434 unused = space_info->bytes_used + space_info->bytes_reserved +
3393 space_info->bytes_pinned + space_info->bytes_readonly + 3435 space_info->bytes_pinned + space_info->bytes_readonly +
3394 space_info->bytes_may_use; 3436 space_info->bytes_may_use;
@@ -3403,8 +3445,7 @@ again:
3403 if (unused <= space_info->total_bytes) { 3445 if (unused <= space_info->total_bytes) {
3404 unused = space_info->total_bytes - unused; 3446 unused = space_info->total_bytes - unused;
3405 if (unused >= num_bytes) { 3447 if (unused >= num_bytes) {
3406 if (!reserved) 3448 space_info->bytes_reserved += orig_bytes;
3407 space_info->bytes_reserved += orig_bytes;
3408 ret = 0; 3449 ret = 0;
3409 } else { 3450 } else {
3410 /* 3451 /*
@@ -3429,17 +3470,14 @@ again:
3429 * to reclaim space we can actually use it instead of somebody else 3470 * to reclaim space we can actually use it instead of somebody else
3430 * stealing it from us. 3471 * stealing it from us.
3431 */ 3472 */
3432 if (ret && !reserved) { 3473 if (ret && flush) {
3433 space_info->bytes_reserved += orig_bytes; 3474 flushing = true;
3434 reserved = true; 3475 space_info->flush = 1;
3435 } 3476 }
3436 3477
3437 spin_unlock(&space_info->lock); 3478 spin_unlock(&space_info->lock);
3438 3479
3439 if (!ret) 3480 if (!ret || !flush)
3440 return 0;
3441
3442 if (!flush)
3443 goto out; 3481 goto out;
3444 3482
3445 /* 3483 /*
@@ -3447,11 +3485,11 @@ again:
3447 * metadata until after the IO is completed. 3485 * metadata until after the IO is completed.
3448 */ 3486 */
3449 ret = shrink_delalloc(trans, root, num_bytes, 1); 3487 ret = shrink_delalloc(trans, root, num_bytes, 1);
3450 if (ret > 0) 3488 if (ret < 0)
3451 return 0;
3452 else if (ret < 0)
3453 goto out; 3489 goto out;
3454 3490
3491 ret = 0;
3492
3455 /* 3493 /*
3456 * So if we were overcommitted it's possible that somebody else flushed 3494 * So if we were overcommitted it's possible that somebody else flushed
3457 * out enough space and we simply didn't have enough space to reclaim, 3495 * out enough space and we simply didn't have enough space to reclaim,
@@ -3462,11 +3500,11 @@ again:
3462 goto again; 3500 goto again;
3463 } 3501 }
3464 3502
3465 spin_lock(&space_info->lock);
3466 /* 3503 /*
3467 * Not enough space to be reclaimed, don't bother committing the 3504 * Not enough space to be reclaimed, don't bother committing the
3468 * transaction. 3505 * transaction.
3469 */ 3506 */
3507 spin_lock(&space_info->lock);
3470 if (space_info->bytes_pinned < orig_bytes) 3508 if (space_info->bytes_pinned < orig_bytes)
3471 ret = -ENOSPC; 3509 ret = -ENOSPC;
3472 spin_unlock(&space_info->lock); 3510 spin_unlock(&space_info->lock);
@@ -3474,10 +3512,13 @@ again:
3474 goto out; 3512 goto out;
3475 3513
3476 ret = -EAGAIN; 3514 ret = -EAGAIN;
3477 if (trans || committed) 3515 if (trans)
3478 goto out; 3516 goto out;
3479 3517
3480 ret = -ENOSPC; 3518 ret = -ENOSPC;
3519 if (committed)
3520 goto out;
3521
3481 trans = btrfs_join_transaction(root); 3522 trans = btrfs_join_transaction(root);
3482 if (IS_ERR(trans)) 3523 if (IS_ERR(trans))
3483 goto out; 3524 goto out;
@@ -3489,12 +3530,12 @@ again:
3489 } 3530 }
3490 3531
3491out: 3532out:
3492 if (reserved) { 3533 if (flushing) {
3493 spin_lock(&space_info->lock); 3534 spin_lock(&space_info->lock);
3494 space_info->bytes_reserved -= orig_bytes; 3535 space_info->flush = 0;
3536 wake_up_all(&space_info->wait);
3495 spin_unlock(&space_info->lock); 3537 spin_unlock(&space_info->lock);
3496 } 3538 }
3497
3498 return ret; 3539 return ret;
3499} 3540}
3500 3541
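
The new flush/wait dance serializes flushers per space_info: the first task
to fail its reservation sets space_info->flush under the lock, everyone else
sleeps on space_info->wait, and anyone holding a transaction handle bails
out with -EAGAIN instead of sleeping, so it cannot deadlock against a
flusher that needs to commit. In user-space terms it is a plain
mutex/condvar handshake; a minimal model (an analogy only, not the kernel
code):

    #include <pthread.h>
    #include <stdbool.h>

    struct space_info {
            pthread_mutex_t lock;
            pthread_cond_t wait;
            bool flush;
    };

    static struct space_info si = {
            .lock = PTHREAD_MUTEX_INITIALIZER,
            .wait = PTHREAD_COND_INITIALIZER,
    };

    static void begin_flush(struct space_info *s)
    {
            pthread_mutex_lock(&s->lock);
            while (s->flush)                /* somebody else is flushing */
                    pthread_cond_wait(&s->wait, &s->lock);
            s->flush = true;                /* we are the flusher now */
            pthread_mutex_unlock(&s->lock);
    }

    static void end_flush(struct space_info *s)
    {
            pthread_mutex_lock(&s->lock);
            s->flush = false;
            pthread_cond_broadcast(&s->wait);  /* wake_up_all() */
            pthread_mutex_unlock(&s->lock);
    }
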
@@ -3704,7 +3745,6 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
3704 if (commit_trans) { 3745 if (commit_trans) {
3705 if (trans) 3746 if (trans)
3706 return -EAGAIN; 3747 return -EAGAIN;
3707
3708 trans = btrfs_join_transaction(root); 3748 trans = btrfs_join_transaction(root);
3709 BUG_ON(IS_ERR(trans)); 3749 BUG_ON(IS_ERR(trans));
3710 ret = btrfs_commit_transaction(trans, root); 3750 ret = btrfs_commit_transaction(trans, root);
@@ -3874,26 +3914,6 @@ int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
3874 return 0; 3914 return 0;
3875} 3915}
3876 3916
3877int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
3878 struct btrfs_root *root,
3879 int num_items)
3880{
3881 u64 num_bytes;
3882 int ret;
3883
3884 if (num_items == 0 || root->fs_info->chunk_root == root)
3885 return 0;
3886
3887 num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
3888 ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv,
3889 num_bytes);
3890 if (!ret) {
3891 trans->bytes_reserved += num_bytes;
3892 trans->block_rsv = &root->fs_info->trans_block_rsv;
3893 }
3894 return ret;
3895}
3896
3897void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, 3917void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
3898 struct btrfs_root *root) 3918 struct btrfs_root *root)
3899{ 3919{
@@ -3944,6 +3964,30 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
3944 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 3964 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3945} 3965}
3946 3966
3967static unsigned drop_outstanding_extent(struct inode *inode)
3968{
3969 unsigned dropped_extents = 0;
3970
3971 spin_lock(&BTRFS_I(inode)->lock);
3972 BUG_ON(!BTRFS_I(inode)->outstanding_extents);
3973 BTRFS_I(inode)->outstanding_extents--;
3974
3975 /*
3976 * If we have at least as many outstanding extents as we have
3977 * reserved, leave the reserved extent count alone.
3978 */
3979 if (BTRFS_I(inode)->outstanding_extents >=
3980 BTRFS_I(inode)->reserved_extents)
3981 goto out;
3982
3983 dropped_extents = BTRFS_I(inode)->reserved_extents -
3984 BTRFS_I(inode)->outstanding_extents;
3985 BTRFS_I(inode)->reserved_extents -= dropped_extents;
3986out:
3987 spin_unlock(&BTRFS_I(inode)->lock);
3988 return dropped_extents;
3989}
3990
3947static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes) 3991static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes)
3948{ 3992{
3949 return num_bytes >>= 3; 3993 return num_bytes >>= 3;
@@ -3953,9 +3997,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
3953{ 3997{
3954 struct btrfs_root *root = BTRFS_I(inode)->root; 3998 struct btrfs_root *root = BTRFS_I(inode)->root;
3955 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; 3999 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
3956 u64 to_reserve; 4000 u64 to_reserve = 0;
3957 int nr_extents; 4001 unsigned nr_extents = 0;
3958 int reserved_extents;
3959 int ret; 4002 int ret;
3960 4003
3961 if (btrfs_transaction_in_commit(root->fs_info)) 4004 if (btrfs_transaction_in_commit(root->fs_info))
@@ -3963,66 +4006,49 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
3963 4006
3964 num_bytes = ALIGN(num_bytes, root->sectorsize); 4007 num_bytes = ALIGN(num_bytes, root->sectorsize);
3965 4008
3966 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1; 4009 spin_lock(&BTRFS_I(inode)->lock);
3967 reserved_extents = atomic_read(&BTRFS_I(inode)->reserved_extents); 4010 BTRFS_I(inode)->outstanding_extents++;
4011
4012 if (BTRFS_I(inode)->outstanding_extents >
4013 BTRFS_I(inode)->reserved_extents) {
4014 nr_extents = BTRFS_I(inode)->outstanding_extents -
4015 BTRFS_I(inode)->reserved_extents;
4016 BTRFS_I(inode)->reserved_extents += nr_extents;
3968 4017
3969 if (nr_extents > reserved_extents) {
3970 nr_extents -= reserved_extents;
3971 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); 4018 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
3972 } else {
3973 nr_extents = 0;
3974 to_reserve = 0;
3975 } 4019 }
4020 spin_unlock(&BTRFS_I(inode)->lock);
3976 4021
3977 to_reserve += calc_csum_metadata_size(inode, num_bytes); 4022 to_reserve += calc_csum_metadata_size(inode, num_bytes);
3978 ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1); 4023 ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
3979 if (ret) 4024 if (ret) {
4025 unsigned dropped;
4026 /*
4027 * We don't need the return value since our reservation failed,
4028 * we just need to clean up our counter.
4029 */
4030 dropped = drop_outstanding_extent(inode);
4031 WARN_ON(dropped > 1);
3980 return ret; 4032 return ret;
3981 4033 }
3982 atomic_add(nr_extents, &BTRFS_I(inode)->reserved_extents);
3983 atomic_inc(&BTRFS_I(inode)->outstanding_extents);
3984 4034
3985 block_rsv_add_bytes(block_rsv, to_reserve, 1); 4035 block_rsv_add_bytes(block_rsv, to_reserve, 1);
3986 4036
3987 if (block_rsv->size > 512 * 1024 * 1024)
3988 shrink_delalloc(NULL, root, to_reserve, 0);
3989
3990 return 0; 4037 return 0;
3991} 4038}
3992 4039
3993void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) 4040void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
3994{ 4041{
3995 struct btrfs_root *root = BTRFS_I(inode)->root; 4042 struct btrfs_root *root = BTRFS_I(inode)->root;
3996 u64 to_free; 4043 u64 to_free = 0;
3997 int nr_extents; 4044 unsigned dropped;
3998 int reserved_extents;
3999 4045
4000 num_bytes = ALIGN(num_bytes, root->sectorsize); 4046 num_bytes = ALIGN(num_bytes, root->sectorsize);
4001 atomic_dec(&BTRFS_I(inode)->outstanding_extents); 4047 dropped = drop_outstanding_extent(inode);
4002 WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents) < 0);
4003
4004 reserved_extents = atomic_read(&BTRFS_I(inode)->reserved_extents);
4005 do {
4006 int old, new;
4007
4008 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents);
4009 if (nr_extents >= reserved_extents) {
4010 nr_extents = 0;
4011 break;
4012 }
4013 old = reserved_extents;
4014 nr_extents = reserved_extents - nr_extents;
4015 new = reserved_extents - nr_extents;
4016 old = atomic_cmpxchg(&BTRFS_I(inode)->reserved_extents,
4017 reserved_extents, new);
4018 if (likely(old == reserved_extents))
4019 break;
4020 reserved_extents = old;
4021 } while (1);
4022 4048
4023 to_free = calc_csum_metadata_size(inode, num_bytes); 4049 to_free = calc_csum_metadata_size(inode, num_bytes);
4024 if (nr_extents > 0) 4050 if (dropped > 0)
4025 to_free += btrfs_calc_trans_metadata_size(root, nr_extents); 4051 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4026 4052
4027 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, 4053 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
4028 to_free); 4054 to_free);
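
The cmpxchg loop is gone: both counters now sit behind
BTRFS_I(inode)->lock, and the reserve and release paths reduce to a few
lines of arithmetic. A user-space model of the two paths (a sketch; the
real code takes the spinlock around each function body):

    struct acct {
            unsigned outstanding;   /* extents currently in flight */
            unsigned reserved;      /* extents with metadata reserved */
    };

    /* reserve side: how many new extents need metadata reserved */
    static unsigned reserve_one(struct acct *a)
    {
            unsigned nr = 0;

            a->outstanding++;
            if (a->outstanding > a->reserved) {
                    nr = a->outstanding - a->reserved;
                    a->reserved += nr;
            }
            return nr;
    }

    /* release side: how many reservations can now be dropped */
    static unsigned drop_one(struct acct *a)
    {
            unsigned dropped = 0;

            a->outstanding--;
            if (a->outstanding < a->reserved) {
                    dropped = a->reserved - a->outstanding;
                    a->reserved -= dropped;
            }
            return dropped;
    }

Since the reserve side bumps outstanding by exactly one, a failed
reservation can only ever have added one extent, which is what the
WARN_ON(dropped > 1) above checks.
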
@@ -4444,7 +4470,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
4444 printk(KERN_ERR "umm, got %d back from search" 4470 printk(KERN_ERR "umm, got %d back from search"
4445 ", was looking for %llu\n", ret, 4471 ", was looking for %llu\n", ret,
4446 (unsigned long long)bytenr); 4472 (unsigned long long)bytenr);
4447 btrfs_print_leaf(extent_root, path->nodes[0]); 4473 if (ret > 0)
4474 btrfs_print_leaf(extent_root,
4475 path->nodes[0]);
4448 } 4476 }
4449 BUG_ON(ret); 4477 BUG_ON(ret);
4450 extent_slot = path->slots[0]; 4478 extent_slot = path->slots[0];
@@ -4990,14 +5018,10 @@ have_block_group:
4990 } 5018 }
4991 5019
4992 /* 5020 /*
4993 * We only want to start kthread caching if we are at 5021 * The caching workers are limited to 2 threads, so we
4994 * the point where we will wait for caching to make 5022 * can queue as much work as we care to.
4995 * progress, or if our ideal search is over and we've
4996 * found somebody to start caching.
4997 */ 5023 */
4998 if (loop > LOOP_CACHING_NOWAIT || 5024 if (loop > LOOP_FIND_IDEAL) {
4999 (loop > LOOP_FIND_IDEAL &&
5000 atomic_read(&space_info->caching_threads) < 2)) {
5001 ret = cache_block_group(block_group, trans, 5025 ret = cache_block_group(block_group, trans,
5002 orig_root, 0); 5026 orig_root, 0);
5003 BUG_ON(ret); 5027 BUG_ON(ret);
@@ -5065,7 +5089,9 @@ have_block_group:
5065 * group it does point to and try again 5089 * group it does point to and try again
5066 */ 5090 */
5067 if (!last_ptr_loop && last_ptr->block_group && 5091 if (!last_ptr_loop && last_ptr->block_group &&
5068 last_ptr->block_group != block_group) { 5092 last_ptr->block_group != block_group &&
5093 index <=
5094 get_block_group_index(last_ptr->block_group)) {
5069 5095
5070 btrfs_put_block_group(block_group); 5096 btrfs_put_block_group(block_group);
5071 block_group = last_ptr->block_group; 5097 block_group = last_ptr->block_group;
@@ -5219,8 +5245,7 @@ loop:
5219 if (loop == LOOP_FIND_IDEAL && found_uncached_bg) { 5245 if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
5220 found_uncached_bg = false; 5246 found_uncached_bg = false;
5221 loop++; 5247 loop++;
5222 if (!ideal_cache_percent && 5248 if (!ideal_cache_percent)
5223 atomic_read(&space_info->caching_threads))
5224 goto search; 5249 goto search;
5225 5250
5226 /* 5251 /*
@@ -5494,7 +5519,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
5494 u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref); 5519 u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref);
5495 5520
5496 path = btrfs_alloc_path(); 5521 path = btrfs_alloc_path();
5497 BUG_ON(!path); 5522 if (!path)
5523 return -ENOMEM;
5498 5524
5499 path->leave_spinning = 1; 5525 path->leave_spinning = 1;
5500 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, 5526 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
@@ -5623,7 +5649,7 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
5623 if (!buf) 5649 if (!buf)
5624 return ERR_PTR(-ENOMEM); 5650 return ERR_PTR(-ENOMEM);
5625 btrfs_set_header_generation(buf, trans->transid); 5651 btrfs_set_header_generation(buf, trans->transid);
5626 btrfs_set_buffer_lockdep_class(buf, level); 5652 btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
5627 btrfs_tree_lock(buf); 5653 btrfs_tree_lock(buf);
5628 clean_tree_block(trans, root, buf); 5654 clean_tree_block(trans, root, buf);
5629 5655
@@ -5910,7 +5936,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
5910 return 1; 5936 return 1;
5911 5937
5912 if (path->locks[level] && !wc->keep_locks) { 5938 if (path->locks[level] && !wc->keep_locks) {
5913 btrfs_tree_unlock(eb); 5939 btrfs_tree_unlock_rw(eb, path->locks[level]);
5914 path->locks[level] = 0; 5940 path->locks[level] = 0;
5915 } 5941 }
5916 return 0; 5942 return 0;
@@ -5934,7 +5960,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
5934 * keep the tree lock 5960 * keep the tree lock
5935 */ 5961 */
5936 if (path->locks[level] && level > 0) { 5962 if (path->locks[level] && level > 0) {
5937 btrfs_tree_unlock(eb); 5963 btrfs_tree_unlock_rw(eb, path->locks[level]);
5938 path->locks[level] = 0; 5964 path->locks[level] = 0;
5939 } 5965 }
5940 return 0; 5966 return 0;
@@ -6047,7 +6073,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
6047 BUG_ON(level != btrfs_header_level(next)); 6073 BUG_ON(level != btrfs_header_level(next));
6048 path->nodes[level] = next; 6074 path->nodes[level] = next;
6049 path->slots[level] = 0; 6075 path->slots[level] = 0;
6050 path->locks[level] = 1; 6076 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6051 wc->level = level; 6077 wc->level = level;
6052 if (wc->level == 1) 6078 if (wc->level == 1)
6053 wc->reada_slot = 0; 6079 wc->reada_slot = 0;
@@ -6118,7 +6144,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
6118 BUG_ON(level == 0); 6144 BUG_ON(level == 0);
6119 btrfs_tree_lock(eb); 6145 btrfs_tree_lock(eb);
6120 btrfs_set_lock_blocking(eb); 6146 btrfs_set_lock_blocking(eb);
6121 path->locks[level] = 1; 6147 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6122 6148
6123 ret = btrfs_lookup_extent_info(trans, root, 6149 ret = btrfs_lookup_extent_info(trans, root,
6124 eb->start, eb->len, 6150 eb->start, eb->len,
@@ -6127,8 +6153,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
6127 BUG_ON(ret); 6153 BUG_ON(ret);
6128 BUG_ON(wc->refs[level] == 0); 6154 BUG_ON(wc->refs[level] == 0);
6129 if (wc->refs[level] == 1) { 6155 if (wc->refs[level] == 1) {
6130 btrfs_tree_unlock(eb); 6156 btrfs_tree_unlock_rw(eb, path->locks[level]);
6131 path->locks[level] = 0;
6132 return 1; 6157 return 1;
6133 } 6158 }
6134 } 6159 }
@@ -6150,7 +6175,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
6150 btrfs_header_generation(eb) == trans->transid) { 6175 btrfs_header_generation(eb) == trans->transid) {
6151 btrfs_tree_lock(eb); 6176 btrfs_tree_lock(eb);
6152 btrfs_set_lock_blocking(eb); 6177 btrfs_set_lock_blocking(eb);
6153 path->locks[level] = 1; 6178 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6154 } 6179 }
6155 clean_tree_block(trans, root, eb); 6180 clean_tree_block(trans, root, eb);
6156 } 6181 }
@@ -6229,7 +6254,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
6229 return 0; 6254 return 0;
6230 6255
6231 if (path->locks[level]) { 6256 if (path->locks[level]) {
6232 btrfs_tree_unlock(path->nodes[level]); 6257 btrfs_tree_unlock_rw(path->nodes[level],
6258 path->locks[level]);
6233 path->locks[level] = 0; 6259 path->locks[level] = 0;
6234 } 6260 }
6235 free_extent_buffer(path->nodes[level]); 6261 free_extent_buffer(path->nodes[level]);
@@ -6251,8 +6277,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
6251 * also make sure backrefs for the shared block and all lower level 6277 * also make sure backrefs for the shared block and all lower level
6252 * blocks are properly updated. 6278 * blocks are properly updated.
6253 */ 6279 */
6254int btrfs_drop_snapshot(struct btrfs_root *root, 6280void btrfs_drop_snapshot(struct btrfs_root *root,
6255 struct btrfs_block_rsv *block_rsv, int update_ref) 6281 struct btrfs_block_rsv *block_rsv, int update_ref)
6256{ 6282{
6257 struct btrfs_path *path; 6283 struct btrfs_path *path;
6258 struct btrfs_trans_handle *trans; 6284 struct btrfs_trans_handle *trans;
@@ -6265,10 +6291,17 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
6265 int level; 6291 int level;
6266 6292
6267 path = btrfs_alloc_path(); 6293 path = btrfs_alloc_path();
6268 BUG_ON(!path); 6294 if (!path) {
6295 err = -ENOMEM;
6296 goto out;
6297 }
6269 6298
6270 wc = kzalloc(sizeof(*wc), GFP_NOFS); 6299 wc = kzalloc(sizeof(*wc), GFP_NOFS);
6271 BUG_ON(!wc); 6300 if (!wc) {
6301 btrfs_free_path(path);
6302 err = -ENOMEM;
6303 goto out;
6304 }
6272 6305
6273 trans = btrfs_start_transaction(tree_root, 0); 6306 trans = btrfs_start_transaction(tree_root, 0);
6274 BUG_ON(IS_ERR(trans)); 6307 BUG_ON(IS_ERR(trans));
@@ -6281,7 +6314,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
6281 path->nodes[level] = btrfs_lock_root_node(root); 6314 path->nodes[level] = btrfs_lock_root_node(root);
6282 btrfs_set_lock_blocking(path->nodes[level]); 6315 btrfs_set_lock_blocking(path->nodes[level]);
6283 path->slots[level] = 0; 6316 path->slots[level] = 0;
6284 path->locks[level] = 1; 6317 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6285 memset(&wc->update_progress, 0, 6318 memset(&wc->update_progress, 0,
6286 sizeof(wc->update_progress)); 6319 sizeof(wc->update_progress));
6287 } else { 6320 } else {
@@ -6296,7 +6329,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
6296 path->lowest_level = 0; 6329 path->lowest_level = 0;
6297 if (ret < 0) { 6330 if (ret < 0) {
6298 err = ret; 6331 err = ret;
6299 goto out; 6332 goto out_free;
6300 } 6333 }
6301 WARN_ON(ret > 0); 6334 WARN_ON(ret > 0);
6302 6335
@@ -6403,11 +6436,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
6403 free_extent_buffer(root->commit_root); 6436 free_extent_buffer(root->commit_root);
6404 kfree(root); 6437 kfree(root);
6405 } 6438 }
6406out: 6439out_free:
6407 btrfs_end_transaction_throttle(trans, tree_root); 6440 btrfs_end_transaction_throttle(trans, tree_root);
6408 kfree(wc); 6441 kfree(wc);
6409 btrfs_free_path(path); 6442 btrfs_free_path(path);
6410 return err; 6443out:
6444 if (err)
6445 btrfs_std_error(root->fs_info, err);
6446 return;
6411} 6447}
6412 6448
6413/* 6449/*
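
btrfs_drop_snapshot() now returns void: allocation failures funnel to the
new out: label and are reported through btrfs_std_error() instead of
bubbling up a return code. Caller-side this looks like (a sketch; the
NULL/0 arguments are one plausible call, not quoted from this diff):

    btrfs_drop_snapshot(root, NULL, 0);  /* was: ret = btrfs_drop_snapshot(...) */
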
@@ -6449,7 +6485,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
6449 level = btrfs_header_level(node); 6485 level = btrfs_header_level(node);
6450 path->nodes[level] = node; 6486 path->nodes[level] = node;
6451 path->slots[level] = 0; 6487 path->slots[level] = 0;
6452 path->locks[level] = 1; 6488 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6453 6489
6454 wc->refs[parent_level] = 1; 6490 wc->refs[parent_level] = 1;
6455 wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF; 6491 wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
@@ -6524,30 +6560,48 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
6524 return flags; 6560 return flags;
6525} 6561}
6526 6562
6527static int set_block_group_ro(struct btrfs_block_group_cache *cache) 6563static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
6528{ 6564{
6529 struct btrfs_space_info *sinfo = cache->space_info; 6565 struct btrfs_space_info *sinfo = cache->space_info;
6530 u64 num_bytes; 6566 u64 num_bytes;
6567 u64 min_allocable_bytes;
6531 int ret = -ENOSPC; 6568 int ret = -ENOSPC;
6532 6569
6533 if (cache->ro) 6570
6534 return 0; 6571 /*
6572 * We need some metadata space and system metadata space for
6573 * allocating chunks in some corner cases, unless we are forced to set
6574 * the block group read-only.
6575 */
6576 if ((sinfo->flags &
6577 (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
6578 !force)
6579 min_allocable_bytes = 1 * 1024 * 1024;
6580 else
6581 min_allocable_bytes = 0;
6535 6582
6536 spin_lock(&sinfo->lock); 6583 spin_lock(&sinfo->lock);
6537 spin_lock(&cache->lock); 6584 spin_lock(&cache->lock);
6585
6586 if (cache->ro) {
6587 ret = 0;
6588 goto out;
6589 }
6590
6538 num_bytes = cache->key.offset - cache->reserved - cache->pinned - 6591 num_bytes = cache->key.offset - cache->reserved - cache->pinned -
6539 cache->bytes_super - btrfs_block_group_used(&cache->item); 6592 cache->bytes_super - btrfs_block_group_used(&cache->item);
6540 6593
6541 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + 6594 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
6542 sinfo->bytes_may_use + sinfo->bytes_readonly + 6595 sinfo->bytes_may_use + sinfo->bytes_readonly +
6543 cache->reserved_pinned + num_bytes <= sinfo->total_bytes) { 6596 cache->reserved_pinned + num_bytes + min_allocable_bytes <=
6597 sinfo->total_bytes) {
6544 sinfo->bytes_readonly += num_bytes; 6598 sinfo->bytes_readonly += num_bytes;
6545 sinfo->bytes_reserved += cache->reserved_pinned; 6599 sinfo->bytes_reserved += cache->reserved_pinned;
6546 cache->reserved_pinned = 0; 6600 cache->reserved_pinned = 0;
6547 cache->ro = 1; 6601 cache->ro = 1;
6548 ret = 0; 6602 ret = 0;
6549 } 6603 }
6550 6604out:
6551 spin_unlock(&cache->lock); 6605 spin_unlock(&cache->lock);
6552 spin_unlock(&sinfo->lock); 6606 spin_unlock(&sinfo->lock);
6553 return ret; 6607 return ret;
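
The force parameter exists because metadata and system space need a little
headroom left writable for chunk allocation; only a forced transition (as
in the btrfs_read_block_groups() call sites later in this diff) may consume
it. The admission check boils down to (a user-space model with simplified
names):

    static int can_set_ro(unsigned long long total,
                          unsigned long long accounted,
                          unsigned long long num_bytes,
                          int meta_or_sys, int force)
    {
            unsigned long long min_allocable =
                    (meta_or_sys && !force) ? 1024 * 1024 : 0;  /* 1 MiB */

            return accounted + num_bytes + min_allocable <= total;
    }
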
@@ -6571,7 +6625,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
6571 do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 6625 do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
6572 CHUNK_ALLOC_FORCE); 6626 CHUNK_ALLOC_FORCE);
6573 6627
6574 ret = set_block_group_ro(cache); 6628 ret = set_block_group_ro(cache, 0);
6575 if (!ret) 6629 if (!ret)
6576 goto out; 6630 goto out;
6577 alloc_flags = get_alloc_profile(root, cache->space_info->flags); 6631 alloc_flags = get_alloc_profile(root, cache->space_info->flags);
@@ -6579,7 +6633,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
6579 CHUNK_ALLOC_FORCE); 6633 CHUNK_ALLOC_FORCE);
6580 if (ret < 0) 6634 if (ret < 0)
6581 goto out; 6635 goto out;
6582 ret = set_block_group_ro(cache); 6636 ret = set_block_group_ro(cache, 0);
6583out: 6637out:
6584 btrfs_end_transaction(trans, root); 6638 btrfs_end_transaction(trans, root);
6585 return ret; 6639 return ret;
@@ -6680,6 +6734,10 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
6680 struct btrfs_space_info *space_info; 6734 struct btrfs_space_info *space_info;
6681 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 6735 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
6682 struct btrfs_device *device; 6736 struct btrfs_device *device;
6737 u64 min_free;
6738 u64 dev_min = 1;
6739 u64 dev_nr = 0;
6740 int index;
6683 int full = 0; 6741 int full = 0;
6684 int ret = 0; 6742 int ret = 0;
6685 6743
@@ -6689,8 +6747,10 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
6689 if (!block_group) 6747 if (!block_group)
6690 return -1; 6748 return -1;
6691 6749
6750 min_free = btrfs_block_group_used(&block_group->item);
6751
6692 /* no bytes used, we're good */ 6752 /* no bytes used, we're good */
6693 if (!btrfs_block_group_used(&block_group->item)) 6753 if (!min_free)
6694 goto out; 6754 goto out;
6695 6755
6696 space_info = block_group->space_info; 6756 space_info = block_group->space_info;
@@ -6706,10 +6766,9 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
6706 * all of the extents from this block group. If we can, we're good 6766 * all of the extents from this block group. If we can, we're good
6707 */ 6767 */
6708 if ((space_info->total_bytes != block_group->key.offset) && 6768 if ((space_info->total_bytes != block_group->key.offset) &&
6709 (space_info->bytes_used + space_info->bytes_reserved + 6769 (space_info->bytes_used + space_info->bytes_reserved +
6710 space_info->bytes_pinned + space_info->bytes_readonly + 6770 space_info->bytes_pinned + space_info->bytes_readonly +
6711 btrfs_block_group_used(&block_group->item) < 6771 min_free < space_info->total_bytes)) {
6712 space_info->total_bytes)) {
6713 spin_unlock(&space_info->lock); 6772 spin_unlock(&space_info->lock);
6714 goto out; 6773 goto out;
6715 } 6774 }
@@ -6726,9 +6785,31 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
6726 if (full) 6785 if (full)
6727 goto out; 6786 goto out;
6728 6787
6788 /*
6789 * index:
6790 * 0: raid10
6791 * 1: raid1
6792 * 2: dup
6793 * 3: raid0
6794 * 4: single
6795 */
6796 index = get_block_group_index(block_group);
6797 if (index == 0) {
6798 dev_min = 4;
6799 /* Divide by 2 */
6800 min_free >>= 1;
6801 } else if (index == 1) {
6802 dev_min = 2;
6803 } else if (index == 2) {
6804 /* Multiply by 2 */
6805 min_free <<= 1;
6806 } else if (index == 3) {
6807 dev_min = fs_devices->rw_devices;
6808 do_div(min_free, dev_min);
6809 }
6810
6729 mutex_lock(&root->fs_info->chunk_mutex); 6811 mutex_lock(&root->fs_info->chunk_mutex);
6730 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 6812 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
6731 u64 min_free = btrfs_block_group_used(&block_group->item);
6732 u64 dev_offset; 6813 u64 dev_offset;
6733 6814
6734 /* 6815 /*
@@ -6739,7 +6820,11 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
6739 ret = find_free_dev_extent(NULL, device, min_free, 6820 ret = find_free_dev_extent(NULL, device, min_free,
6740 &dev_offset, NULL); 6821 &dev_offset, NULL);
6741 if (!ret) 6822 if (!ret)
6823 dev_nr++;
6824
6825 if (dev_nr >= dev_min)
6742 break; 6826 break;
6827
6743 ret = -1; 6828 ret = -1;
6744 } 6829 }
6745 } 6830 }
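
The relocation check now scales with the block group's profile instead of
demanding the full used size on a single device. Plugging numbers into the
index mapping above, a worked example (a runnable sketch; the 1 GiB figure
and the device count are assumptions):

    #include <stdio.h>

    int main(void)
    {
            unsigned long long min_free = 1024ULL * 1024 * 1024; /* 1 GiB used */
            unsigned long long dev_min = 1;
            int index = 0;          /* 0: raid10 */
            int rw_devices = 6;     /* assumed rw device count */

            if (index == 0) {               /* raid10: 4 devs, half each */
                    dev_min = 4;
                    min_free >>= 1;
            } else if (index == 1) {        /* raid1: two full copies */
                    dev_min = 2;
            } else if (index == 2) {        /* dup: one dev, twice the space */
                    min_free <<= 1;
            } else if (index == 3) {        /* raid0: spread over all devs */
                    dev_min = rw_devices;
                    min_free /= dev_min;
            }                               /* index 4 (single): defaults */

            printf("need %llu device(s) with %llu bytes free each\n",
                   dev_min, min_free);
            return 0;
    }

For the raid10 case this prints 4 devices with 512 MiB each.
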
@@ -7016,7 +7101,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7016 7101
7017 set_avail_alloc_bits(root->fs_info, cache->flags); 7102 set_avail_alloc_bits(root->fs_info, cache->flags);
7018 if (btrfs_chunk_readonly(root, cache->key.objectid)) 7103 if (btrfs_chunk_readonly(root, cache->key.objectid))
7019 set_block_group_ro(cache); 7104 set_block_group_ro(cache, 1);
7020 } 7105 }
7021 7106
7022 list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) { 7107 list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
@@ -7030,9 +7115,9 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7030 * mirrored block groups. 7115 * mirrored block groups.
7031 */ 7116 */
7032 list_for_each_entry(cache, &space_info->block_groups[3], list) 7117 list_for_each_entry(cache, &space_info->block_groups[3], list)
7033 set_block_group_ro(cache); 7118 set_block_group_ro(cache, 1);
7034 list_for_each_entry(cache, &space_info->block_groups[4], list) 7119 list_for_each_entry(cache, &space_info->block_groups[4], list)
7035 set_block_group_ro(cache); 7120 set_block_group_ro(cache, 1);
7036 } 7121 }
7037 7122
7038 init_global_block_rsv(info); 7123 init_global_block_rsv(info);
@@ -7162,11 +7247,15 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7162 spin_unlock(&cluster->refill_lock); 7247 spin_unlock(&cluster->refill_lock);
7163 7248
7164 path = btrfs_alloc_path(); 7249 path = btrfs_alloc_path();
7165 BUG_ON(!path); 7250 if (!path) {
7251 ret = -ENOMEM;
7252 goto out;
7253 }
7166 7254
7167 inode = lookup_free_space_inode(root, block_group, path); 7255 inode = lookup_free_space_inode(root, block_group, path);
7168 if (!IS_ERR(inode)) { 7256 if (!IS_ERR(inode)) {
7169 btrfs_orphan_add(trans, inode); 7257 ret = btrfs_orphan_add(trans, inode);
7258 BUG_ON(ret);
7170 clear_nlink(inode); 7259 clear_nlink(inode);
7171 /* One for the block groups ref */ 7260 /* One for the block groups ref */
7172 spin_lock(&block_group->lock); 7261 spin_lock(&block_group->lock);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 7055d11c1efd..d418164a35f1 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -254,14 +254,14 @@ static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
254 * 254 *
255 * This should be called with the tree lock held. 255 * This should be called with the tree lock held.
256 */ 256 */
257static int merge_state(struct extent_io_tree *tree, 257static void merge_state(struct extent_io_tree *tree,
258 struct extent_state *state) 258 struct extent_state *state)
259{ 259{
260 struct extent_state *other; 260 struct extent_state *other;
261 struct rb_node *other_node; 261 struct rb_node *other_node;
262 262
263 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) 263 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
264 return 0; 264 return;
265 265
266 other_node = rb_prev(&state->rb_node); 266 other_node = rb_prev(&state->rb_node);
267 if (other_node) { 267 if (other_node) {
@@ -281,26 +281,19 @@ static int merge_state(struct extent_io_tree *tree,
281 if (other->start == state->end + 1 && 281 if (other->start == state->end + 1 &&
282 other->state == state->state) { 282 other->state == state->state) {
283 merge_cb(tree, state, other); 283 merge_cb(tree, state, other);
284 other->start = state->start; 284 state->end = other->end;
285 state->tree = NULL; 285 other->tree = NULL;
286 rb_erase(&state->rb_node, &tree->state); 286 rb_erase(&other->rb_node, &tree->state);
287 free_extent_state(state); 287 free_extent_state(other);
288 state = NULL;
289 } 288 }
290 } 289 }
291
292 return 0;
293} 290}
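
Note the direction change in the forward merge: the right-hand neighbour is
now absorbed into 'state' and freed, instead of 'state' being absorbed into
the neighbour. That keeps the pointer the caller (and any cached_state
user) holds valid. Reduced to the interval arithmetic (a sketch):

    struct range { unsigned long long start, end; };

    /* assumes other->start == state->end + 1 and identical flags */
    static void merge_forward(struct range *state, struct range *other)
    {
            state->end = other->end;  /* 'state' grows to the right */
            /* 'other' is then unlinked and freed, never 'state' */
    }
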
294 291
295static int set_state_cb(struct extent_io_tree *tree, 292static void set_state_cb(struct extent_io_tree *tree,
296 struct extent_state *state, int *bits) 293 struct extent_state *state, int *bits)
297{ 294{
298 if (tree->ops && tree->ops->set_bit_hook) { 295 if (tree->ops && tree->ops->set_bit_hook)
299 return tree->ops->set_bit_hook(tree->mapping->host, 296 tree->ops->set_bit_hook(tree->mapping->host, state, bits);
300 state, bits);
301 }
302
303 return 0;
304} 297}
305 298
306static void clear_state_cb(struct extent_io_tree *tree, 299static void clear_state_cb(struct extent_io_tree *tree,
@@ -310,6 +303,9 @@ static void clear_state_cb(struct extent_io_tree *tree,
310 tree->ops->clear_bit_hook(tree->mapping->host, state, bits); 303 tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
311} 304}
312 305
306static void set_state_bits(struct extent_io_tree *tree,
307 struct extent_state *state, int *bits);
308
313/* 309/*
314 * insert an extent_state struct into the tree. 'bits' are set on the 310 * insert an extent_state struct into the tree. 'bits' are set on the
315 * struct before it is inserted. 311 * struct before it is inserted.
@@ -325,8 +321,6 @@ static int insert_state(struct extent_io_tree *tree,
325 int *bits) 321 int *bits)
326{ 322{
327 struct rb_node *node; 323 struct rb_node *node;
328 int bits_to_set = *bits & ~EXTENT_CTLBITS;
329 int ret;
330 324
331 if (end < start) { 325 if (end < start) {
332 printk(KERN_ERR "btrfs end < start %llu %llu\n", 326 printk(KERN_ERR "btrfs end < start %llu %llu\n",
@@ -336,13 +330,9 @@ static int insert_state(struct extent_io_tree *tree,
336 } 330 }
337 state->start = start; 331 state->start = start;
338 state->end = end; 332 state->end = end;
339 ret = set_state_cb(tree, state, bits);
340 if (ret)
341 return ret;
342 333
343 if (bits_to_set & EXTENT_DIRTY) 334 set_state_bits(tree, state, bits);
344 tree->dirty_bytes += end - start + 1; 335
345 state->state |= bits_to_set;
346 node = tree_insert(&tree->state, end, &state->rb_node); 336 node = tree_insert(&tree->state, end, &state->rb_node);
347 if (node) { 337 if (node) {
348 struct extent_state *found; 338 struct extent_state *found;
@@ -351,7 +341,6 @@ static int insert_state(struct extent_io_tree *tree,
351 "%llu %llu\n", (unsigned long long)found->start, 341 "%llu %llu\n", (unsigned long long)found->start,
352 (unsigned long long)found->end, 342 (unsigned long long)found->end,
353 (unsigned long long)start, (unsigned long long)end); 343 (unsigned long long)start, (unsigned long long)end);
354 free_extent_state(state);
355 return -EEXIST; 344 return -EEXIST;
356 } 345 }
357 state->tree = tree; 346 state->tree = tree;
@@ -359,13 +348,11 @@ static int insert_state(struct extent_io_tree *tree,
359 return 0; 348 return 0;
360} 349}
361 350
362static int split_cb(struct extent_io_tree *tree, struct extent_state *orig, 351static void split_cb(struct extent_io_tree *tree, struct extent_state *orig,
363 u64 split) 352 u64 split)
364{ 353{
365 if (tree->ops && tree->ops->split_extent_hook) 354 if (tree->ops && tree->ops->split_extent_hook)
366 return tree->ops->split_extent_hook(tree->mapping->host, 355 tree->ops->split_extent_hook(tree->mapping->host, orig, split);
367 orig, split);
368 return 0;
369} 356}
370 357
371/* 358/*
@@ -500,7 +487,8 @@ again:
500 cached_state = NULL; 487 cached_state = NULL;
501 } 488 }
502 489
503 if (cached && cached->tree && cached->start == start) { 490 if (cached && cached->tree && cached->start <= start &&
491 cached->end > start) {
504 if (clear) 492 if (clear)
505 atomic_dec(&cached->refs); 493 atomic_dec(&cached->refs);
506 state = cached; 494 state = cached;
@@ -660,34 +648,25 @@ again:
660 if (start > end) 648 if (start > end)
661 break; 649 break;
662 650
663 if (need_resched()) { 651 cond_resched_lock(&tree->lock);
664 spin_unlock(&tree->lock);
665 cond_resched();
666 spin_lock(&tree->lock);
667 }
668 } 652 }
669out: 653out:
670 spin_unlock(&tree->lock); 654 spin_unlock(&tree->lock);
671 return 0; 655 return 0;
672} 656}
673 657
674static int set_state_bits(struct extent_io_tree *tree, 658static void set_state_bits(struct extent_io_tree *tree,
675 struct extent_state *state, 659 struct extent_state *state,
676 int *bits) 660 int *bits)
677{ 661{
678 int ret;
679 int bits_to_set = *bits & ~EXTENT_CTLBITS; 662 int bits_to_set = *bits & ~EXTENT_CTLBITS;
680 663
681 ret = set_state_cb(tree, state, bits); 664 set_state_cb(tree, state, bits);
682 if (ret)
683 return ret;
684 if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { 665 if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
685 u64 range = state->end - state->start + 1; 666 u64 range = state->end - state->start + 1;
686 tree->dirty_bytes += range; 667 tree->dirty_bytes += range;
687 } 668 }
688 state->state |= bits_to_set; 669 state->state |= bits_to_set;
689
690 return 0;
691} 670}
692 671
693static void cache_state(struct extent_state *state, 672static void cache_state(struct extent_state *state,
@@ -742,7 +721,8 @@ again:
742 spin_lock(&tree->lock); 721 spin_lock(&tree->lock);
743 if (cached_state && *cached_state) { 722 if (cached_state && *cached_state) {
744 state = *cached_state; 723 state = *cached_state;
745 if (state->start == start && state->tree) { 724 if (state->start <= start && state->end > start &&
725 state->tree) {
746 node = &state->rb_node; 726 node = &state->rb_node;
747 goto hit_next; 727 goto hit_next;
748 } 728 }
@@ -779,17 +759,15 @@ hit_next:
779 goto out; 759 goto out;
780 } 760 }
781 761
782 err = set_state_bits(tree, state, &bits); 762 set_state_bits(tree, state, &bits);
783 if (err)
784 goto out;
785 763
786 next_node = rb_next(node);
787 cache_state(state, cached_state); 764 cache_state(state, cached_state);
788 merge_state(tree, state); 765 merge_state(tree, state);
789 if (last_end == (u64)-1) 766 if (last_end == (u64)-1)
790 goto out; 767 goto out;
791 768
792 start = last_end + 1; 769 start = last_end + 1;
770 next_node = rb_next(&state->rb_node);
793 if (next_node && start < end && prealloc && !need_resched()) { 771 if (next_node && start < end && prealloc && !need_resched()) {
794 state = rb_entry(next_node, struct extent_state, 772 state = rb_entry(next_node, struct extent_state,
795 rb_node); 773 rb_node);
@@ -830,9 +808,7 @@ hit_next:
830 if (err) 808 if (err)
831 goto out; 809 goto out;
832 if (state->end <= end) { 810 if (state->end <= end) {
833 err = set_state_bits(tree, state, &bits); 811 set_state_bits(tree, state, &bits);
834 if (err)
835 goto out;
836 cache_state(state, cached_state); 812 cache_state(state, cached_state);
837 merge_state(tree, state); 813 merge_state(tree, state);
838 if (last_end == (u64)-1) 814 if (last_end == (u64)-1)
@@ -862,7 +838,6 @@ hit_next:
862 * Avoid to free 'prealloc' if it can be merged with 838 * Avoid to free 'prealloc' if it can be merged with
863 * the later extent. 839 * the later extent.
864 */ 840 */
865 atomic_inc(&prealloc->refs);
866 err = insert_state(tree, prealloc, start, this_end, 841 err = insert_state(tree, prealloc, start, this_end,
867 &bits); 842 &bits);
868 BUG_ON(err == -EEXIST); 843 BUG_ON(err == -EEXIST);
@@ -872,7 +847,6 @@ hit_next:
872 goto out; 847 goto out;
873 } 848 }
874 cache_state(prealloc, cached_state); 849 cache_state(prealloc, cached_state);
875 free_extent_state(prealloc);
876 prealloc = NULL; 850 prealloc = NULL;
877 start = this_end + 1; 851 start = this_end + 1;
878 goto search_again; 852 goto search_again;
@@ -895,11 +869,7 @@ hit_next:
895 err = split_state(tree, state, prealloc, end + 1); 869 err = split_state(tree, state, prealloc, end + 1);
896 BUG_ON(err == -EEXIST); 870 BUG_ON(err == -EEXIST);
897 871
898 err = set_state_bits(tree, prealloc, &bits); 872 set_state_bits(tree, prealloc, &bits);
899 if (err) {
900 prealloc = NULL;
901 goto out;
902 }
903 cache_state(prealloc, cached_state); 873 cache_state(prealloc, cached_state);
904 merge_state(tree, prealloc); 874 merge_state(tree, prealloc);
905 prealloc = NULL; 875 prealloc = NULL;
@@ -1061,46 +1031,6 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
 	return 0;
 }
 
-/*
- * find the first offset in the io tree with 'bits' set. zero is
- * returned if we find something, and *start_ret and *end_ret are
- * set to reflect the state struct that was found.
- *
- * If nothing was found, 1 is returned, < 0 on error
- */
-int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
-			  u64 *start_ret, u64 *end_ret, int bits)
-{
-	struct rb_node *node;
-	struct extent_state *state;
-	int ret = 1;
-
-	spin_lock(&tree->lock);
-	/*
-	 * this search will find all the extents that end after
-	 * our range starts.
-	 */
-	node = tree_search(tree, start);
-	if (!node)
-		goto out;
-
-	while (1) {
-		state = rb_entry(node, struct extent_state, rb_node);
-		if (state->end >= start && (state->state & bits)) {
-			*start_ret = state->start;
-			*end_ret = state->end;
-			ret = 0;
-			break;
-		}
-		node = rb_next(node);
-		if (!node)
-			break;
-	}
-out:
-	spin_unlock(&tree->lock);
-	return ret;
-}
-
 /* find the first state struct with 'bits' set after 'start', and
  * return it. tree->lock must be held. NULL will returned if
  * nothing was found after 'start'
@@ -1133,6 +1063,30 @@ out:
 }
 
 /*
+ * find the first offset in the io tree with 'bits' set. zero is
+ * returned if we find something, and *start_ret and *end_ret are
+ * set to reflect the state struct that was found.
+ *
+ * If nothing was found, 1 is returned, < 0 on error
+ */
+int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
+			  u64 *start_ret, u64 *end_ret, int bits)
+{
+	struct extent_state *state;
+	int ret = 1;
+
+	spin_lock(&tree->lock);
+	state = find_first_extent_bit_state(tree, start, bits);
+	if (state) {
+		*start_ret = state->start;
+		*end_ret = state->end;
+		ret = 0;
+	}
+	spin_unlock(&tree->lock);
+	return ret;
+}
+
+/*
 * find a contiguous range of bytes in the file marked as delalloc, not
 * more than 'max_bytes'. start and end are used to return the range,
 *
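
For reference, callers of the relocated find_first_extent_bit() are
unchanged; only the body now reuses find_first_extent_bit_state().
Illustrative usage (the variable names are made up):

        u64 found_start, found_end;
        int ret;

        /* returns 0 and fills the range when a state with the bit exists */
        ret = find_first_extent_bit(tree, 0, &found_start, &found_end,
                                    EXTENT_DIRTY);
        if (!ret)
            printk(KERN_DEBUG "first dirty range %llu-%llu\n",
                   (unsigned long long)found_start,
                   (unsigned long long)found_end);
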
@@ -1564,7 +1518,8 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
 	int bitset = 0;
 
 	spin_lock(&tree->lock);
-	if (cached && cached->tree && cached->start == start)
+	if (cached && cached->tree && cached->start <= start &&
+	    cached->end > start)
 		node = &cached->rb_node;
 	else
 		node = tree_search(tree, start);
@@ -2432,6 +2387,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
 	pgoff_t index;
 	pgoff_t end;		/* Inclusive */
 	int scanned = 0;
+	int tag;
 
 	pagevec_init(&pvec, 0);
 	if (wbc->range_cyclic) {
@@ -2442,11 +2398,16 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
 		end = wbc->range_end >> PAGE_CACHE_SHIFT;
 		scanned = 1;
 	}
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		tag = PAGECACHE_TAG_TOWRITE;
+	else
+		tag = PAGECACHE_TAG_DIRTY;
 retry:
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		tag_pages_for_writeback(mapping, index, end);
 	while (!done && !nr_to_write_done && (index <= end) &&
-	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
-			PAGECACHE_TAG_DIRTY, min(end - index,
-			(pgoff_t)PAGEVEC_SIZE-1) + 1))) {
+	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
+			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
 		unsigned i;
 
 		scanned = 1;
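
The tagging above mirrors the write_cache_pages() pattern in
mm/page-writeback.c: for WB_SYNC_ALL, pages dirty at the start of the sweep
are first re-tagged TOWRITE, so pages dirtied while writeback is in progress
cannot make the loop run forever. A condensed sketch of the pattern (not the
full btrfs loop):

        int tag = (wbc->sync_mode == WB_SYNC_ALL) ?
                  PAGECACHE_TAG_TOWRITE : PAGECACHE_TAG_DIRTY;

        if (wbc->sync_mode == WB_SYNC_ALL)
            tag_pages_for_writeback(mapping, index, end); /* snapshot */
        while ((nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
                                              PAGEVEC_SIZE))) {
            /* write out each page found under 'tag' ... */
        }
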
@@ -2541,7 +2502,6 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 			  struct writeback_control *wbc)
 {
 	int ret;
-	struct address_space *mapping = page->mapping;
 	struct extent_page_data epd = {
 		.bio = NULL,
 		.tree = tree,
@@ -2549,18 +2509,9 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 		.extent_locked = 0,
 		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
 	};
-	struct writeback_control wbc_writepages = {
-		.sync_mode	= wbc->sync_mode,
-		.older_than_this = NULL,
-		.nr_to_write	= 64,
-		.range_start	= page_offset(page) + PAGE_CACHE_SIZE,
-		.range_end	= (loff_t)-1,
-	};
 
 	ret = __extent_writepage(page, wbc, &epd);
 
-	extent_write_cache_pages(tree, mapping, &wbc_writepages,
-				 __extent_writepage, &epd, flush_write_bio);
 	flush_epd_write_bio(&epd);
 	return ret;
 }
@@ -2584,7 +2535,6 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
 	};
 	struct writeback_control wbc_writepages = {
 		.sync_mode	= mode,
-		.older_than_this = NULL,
 		.nr_to_write	= nr_pages * 2,
 		.range_start	= start,
 		.range_end	= end + 1,
@@ -3022,8 +2972,15 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
 		return NULL;
 	eb->start = start;
 	eb->len = len;
-	spin_lock_init(&eb->lock);
-	init_waitqueue_head(&eb->lock_wq);
+	rwlock_init(&eb->lock);
+	atomic_set(&eb->write_locks, 0);
+	atomic_set(&eb->read_locks, 0);
+	atomic_set(&eb->blocking_readers, 0);
+	atomic_set(&eb->blocking_writers, 0);
+	atomic_set(&eb->spinning_readers, 0);
+	atomic_set(&eb->spinning_writers, 0);
+	init_waitqueue_head(&eb->write_lock_wq);
+	init_waitqueue_head(&eb->read_lock_wq);
 
 #if LEAK_DEBUG
 	spin_lock_irqsave(&leak_lock, flags);
@@ -3119,7 +3076,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
 		i = 0;
 	}
 	for (; i < num_pages; i++, index++) {
-		p = find_or_create_page(mapping, index, GFP_NOFS | __GFP_HIGHMEM);
+		p = find_or_create_page(mapping, index, GFP_NOFS);
 		if (!p) {
 			WARN_ON(1);
 			goto free_eb;
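
Dropping __GFP_HIGHMEM here is what makes the kmap_atomic() removals later
in this file safe: page_address() only returns a usable kernel virtual
address for lowmem pages, so extent buffer pages must never come from
highmem once the code stops mapping them. Roughly:

        /* highmem page: must be mapped before use */
        kaddr = kmap_atomic(page, KM_USER0);
        /* ... */
        kunmap_atomic(kaddr, KM_USER0);

        /* lowmem page: permanently mapped, so this is always valid */
        kaddr = page_address(page);
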
@@ -3266,6 +3223,22 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree,
 	return was_dirty;
 }
 
+static int __eb_straddles_pages(u64 start, u64 len)
+{
+	if (len < PAGE_CACHE_SIZE)
+		return 1;
+	if (start & (PAGE_CACHE_SIZE - 1))
+		return 1;
+	if ((start + len) & (PAGE_CACHE_SIZE - 1))
+		return 1;
+	return 0;
+}
+
+static int eb_straddles_pages(struct extent_buffer *eb)
+{
+	return __eb_straddles_pages(eb->start, eb->len);
+}
+
 int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
 				 struct extent_buffer *eb,
 				 struct extent_state **cached_state)
@@ -3277,8 +3250,10 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
 	num_pages = num_extent_pages(eb->start, eb->len);
 	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
 
-	clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
-			      cached_state, GFP_NOFS);
+	if (eb_straddles_pages(eb)) {
+		clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
+				      cached_state, GFP_NOFS);
+	}
 	for (i = 0; i < num_pages; i++) {
 		page = extent_buffer_page(eb, i);
 		if (page)
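
__eb_straddles_pages() reports whether an extent buffer shares a page with
another buffer: any buffer smaller than a page, or not starting and ending
on page boundaries, straddles. Only straddling buffers still need the
per-range EXTENT_UPTODATE bits; page-aligned buffers can rely on
PageUptodate alone. For example, with 4K pages:

        __eb_straddles_pages(4096, 4096);   /* 0: exactly one page   */
        __eb_straddles_pages(8192, 16384);  /* 0: four whole pages   */
        __eb_straddles_pages(6144, 4096);   /* 1: crosses a boundary */
        __eb_straddles_pages(4096, 2048);   /* 1: sub-page buffer    */
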
@@ -3296,8 +3271,10 @@ int set_extent_buffer_uptodate(struct extent_io_tree *tree,
 
 	num_pages = num_extent_pages(eb->start, eb->len);
 
-	set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
-			    NULL, GFP_NOFS);
+	if (eb_straddles_pages(eb)) {
+		set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
+				    NULL, GFP_NOFS);
+	}
 	for (i = 0; i < num_pages; i++) {
 		page = extent_buffer_page(eb, i);
 		if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
@@ -3320,9 +3297,12 @@ int extent_range_uptodate(struct extent_io_tree *tree,
 	int uptodate;
 	unsigned long index;
 
-	ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL);
-	if (ret)
-		return 1;
+	if (__eb_straddles_pages(start, end - start + 1)) {
+		ret = test_range_bit(tree, start, end,
+				     EXTENT_UPTODATE, 1, NULL);
+		if (ret)
+			return 1;
+	}
 	while (start <= end) {
 		index = start >> PAGE_CACHE_SHIFT;
 		page = find_get_page(tree->mapping, index);
@@ -3350,10 +3330,12 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
 	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
 		return 1;
 
-	ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
-			     EXTENT_UPTODATE, 1, cached_state);
-	if (ret)
-		return ret;
+	if (eb_straddles_pages(eb)) {
+		ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
+				     EXTENT_UPTODATE, 1, cached_state);
+		if (ret)
+			return ret;
+	}
 
 	num_pages = num_extent_pages(eb->start, eb->len);
 	for (i = 0; i < num_pages; i++) {
@@ -3386,9 +3368,11 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
 		return 0;
 
-	if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
-			   EXTENT_UPTODATE, 1, NULL)) {
-		return 0;
+	if (eb_straddles_pages(eb)) {
+		if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
+				   EXTENT_UPTODATE, 1, NULL)) {
+			return 0;
+		}
 	}
 
 	if (start) {
@@ -3492,9 +3476,8 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
 		page = extent_buffer_page(eb, i);
 
 		cur = min(len, (PAGE_CACHE_SIZE - offset));
-		kaddr = kmap_atomic(page, KM_USER1);
+		kaddr = page_address(page);
 		memcpy(dst, kaddr + offset, cur);
-		kunmap_atomic(kaddr, KM_USER1);
 
 		dst += cur;
 		len -= cur;
@@ -3504,9 +3487,9 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
 }
 
 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
-			      unsigned long min_len, char **token, char **map,
+			      unsigned long min_len, char **map,
 			      unsigned long *map_start,
-			      unsigned long *map_len, int km)
+			      unsigned long *map_len)
 {
 	size_t offset = start & (PAGE_CACHE_SIZE - 1);
 	char *kaddr;
@@ -3536,42 +3519,12 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
 	}
 
 	p = extent_buffer_page(eb, i);
-	kaddr = kmap_atomic(p, km);
-	*token = kaddr;
+	kaddr = page_address(p);
 	*map = kaddr + offset;
 	*map_len = PAGE_CACHE_SIZE - offset;
 	return 0;
 }
 
-int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
-		      unsigned long min_len,
-		      char **token, char **map,
-		      unsigned long *map_start,
-		      unsigned long *map_len, int km)
-{
-	int err;
-	int save = 0;
-	if (eb->map_token) {
-		unmap_extent_buffer(eb, eb->map_token, km);
-		eb->map_token = NULL;
-		save = 1;
-	}
-	err = map_private_extent_buffer(eb, start, min_len, token, map,
-					map_start, map_len, km);
-	if (!err && save) {
-		eb->map_token = *token;
-		eb->kaddr = *map;
-		eb->map_start = *map_start;
-		eb->map_len = *map_len;
-	}
-	return err;
-}
-
-void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km)
-{
-	kunmap_atomic(token, km);
-}
-
 int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
 			 unsigned long start,
 			 unsigned long len)
@@ -3595,9 +3548,8 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
 
 		cur = min(len, (PAGE_CACHE_SIZE - offset));
 
-		kaddr = kmap_atomic(page, KM_USER0);
+		kaddr = page_address(page);
 		ret = memcmp(ptr, kaddr + offset, cur);
-		kunmap_atomic(kaddr, KM_USER0);
 		if (ret)
 			break;
 
@@ -3630,9 +3582,8 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
 		WARN_ON(!PageUptodate(page));
 
 		cur = min(len, PAGE_CACHE_SIZE - offset);
-		kaddr = kmap_atomic(page, KM_USER1);
+		kaddr = page_address(page);
 		memcpy(kaddr + offset, src, cur);
-		kunmap_atomic(kaddr, KM_USER1);
 
 		src += cur;
 		len -= cur;
@@ -3661,9 +3612,8 @@ void memset_extent_buffer(struct extent_buffer *eb, char c,
 		WARN_ON(!PageUptodate(page));
 
 		cur = min(len, PAGE_CACHE_SIZE - offset);
-		kaddr = kmap_atomic(page, KM_USER0);
+		kaddr = page_address(page);
 		memset(kaddr + offset, c, cur);
-		kunmap_atomic(kaddr, KM_USER0);
 
 		len -= cur;
 		offset = 0;
@@ -3694,9 +3644,8 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
 
 		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
 
-		kaddr = kmap_atomic(page, KM_USER0);
+		kaddr = page_address(page);
 		read_extent_buffer(src, kaddr + offset, src_offset, cur);
-		kunmap_atomic(kaddr, KM_USER0);
 
 		src_offset += cur;
 		len -= cur;
@@ -3709,20 +3658,17 @@ static void move_pages(struct page *dst_page, struct page *src_page,
 			unsigned long dst_off, unsigned long src_off,
 			unsigned long len)
 {
-	char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
+	char *dst_kaddr = page_address(dst_page);
 	if (dst_page == src_page) {
 		memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
 	} else {
-		char *src_kaddr = kmap_atomic(src_page, KM_USER1);
+		char *src_kaddr = page_address(src_page);
 		char *p = dst_kaddr + dst_off + len;
 		char *s = src_kaddr + src_off + len;
 
 		while (len--)
 			*--p = *--s;
-
-		kunmap_atomic(src_kaddr, KM_USER1);
 	}
-	kunmap_atomic(dst_kaddr, KM_USER0);
 }
 
 static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
@@ -3735,20 +3681,17 @@ static void copy_pages(struct page *dst_page, struct page *src_page,
 		       unsigned long dst_off, unsigned long src_off,
 		       unsigned long len)
 {
-	char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
+	char *dst_kaddr = page_address(dst_page);
 	char *src_kaddr;
 
 	if (dst_page != src_page) {
-		src_kaddr = kmap_atomic(src_page, KM_USER1);
+		src_kaddr = page_address(src_page);
 	} else {
 		src_kaddr = dst_kaddr;
 		BUG_ON(areas_overlap(src_off, dst_off, len));
 	}
 
 	memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
-	kunmap_atomic(dst_kaddr, KM_USER0);
-	if (dst_page != src_page)
-		kunmap_atomic(src_kaddr, KM_USER1);
 }
 
 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index a11a92ee2d30..7b2f0c3e7929 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -76,15 +76,15 @@ struct extent_io_ops {
 			      struct extent_state *state);
 	int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
 				      struct extent_state *state, int uptodate);
-	int (*set_bit_hook)(struct inode *inode, struct extent_state *state,
+	void (*set_bit_hook)(struct inode *inode, struct extent_state *state,
 			    int *bits);
-	int (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
+	void (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
 			      int *bits);
-	int (*merge_extent_hook)(struct inode *inode,
+	void (*merge_extent_hook)(struct inode *inode,
 				 struct extent_state *new,
 				 struct extent_state *other);
-	int (*split_extent_hook)(struct inode *inode,
+	void (*split_extent_hook)(struct inode *inode,
 				 struct extent_state *orig, u64 split);
 	int (*write_cache_pages_lock_hook)(struct page *page);
 };
 
@@ -108,8 +108,6 @@ struct extent_state {
 	wait_queue_head_t wq;
 	atomic_t refs;
 	unsigned long state;
-	u64 split_start;
-	u64 split_end;
 
 	/* for use by the FS */
 	u64 private;
@@ -120,8 +118,6 @@ struct extent_state {
 struct extent_buffer {
 	u64 start;
 	unsigned long len;
-	char *map_token;
-	char *kaddr;
 	unsigned long map_start;
 	unsigned long map_len;
 	struct page *first_page;
@@ -130,14 +126,26 @@ struct extent_buffer {
 	struct rcu_head rcu_head;
 	atomic_t refs;
 
-	/* the spinlock is used to protect most operations */
-	spinlock_t lock;
+	/* count of read lock holders on the extent buffer */
+	atomic_t write_locks;
+	atomic_t read_locks;
+	atomic_t blocking_writers;
+	atomic_t blocking_readers;
+	atomic_t spinning_readers;
+	atomic_t spinning_writers;
+
+	/* protects write locks */
+	rwlock_t lock;
 
-	/*
-	 * when we keep the lock held while blocking, waiters go onto
-	 * the wq
+	/* readers use lock_wq while they wait for the write
+	 * lock holders to unlock
 	 */
-	wait_queue_head_t lock_wq;
+	wait_queue_head_t write_lock_wq;
+
+	/* writers use read_lock_wq while they wait for readers
+	 * to unlock
+	 */
+	wait_queue_head_t read_lock_wq;
 };
 
 static inline void extent_set_compress_type(unsigned long *bio_flags,
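
The new counters split the old single spinlock into a blocking/spinning
reader-writer scheme (implemented in fs/btrfs/locking.c, per the diffstat).
A simplified sketch of how a reader might take the lock against these
fields, assuming the btrfs_tree_read_lock() semantics from that file; this
is an outline, not the real function:

        read_lock(&eb->lock);
        if (atomic_read(&eb->blocking_writers)) {
            /* a writer went blocking: drop the spinlock and sleep */
            read_unlock(&eb->lock);
            wait_event(eb->write_lock_wq,
                       atomic_read(&eb->blocking_writers) == 0);
            /* then retry from the top */
        }
        atomic_inc(&eb->read_locks);
        atomic_inc(&eb->spinning_readers);
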
@@ -279,15 +287,10 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
 int extent_buffer_uptodate(struct extent_io_tree *tree,
 			   struct extent_buffer *eb,
 			   struct extent_state *cached_state);
-int map_extent_buffer(struct extent_buffer *eb, unsigned long offset,
-		      unsigned long min_len, char **token, char **map,
-		      unsigned long *map_start,
-		      unsigned long *map_len, int km);
 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
-		      unsigned long min_len, char **token, char **map,
+		      unsigned long min_len, char **map,
 		      unsigned long *map_start,
-		      unsigned long *map_len, int km);
-void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km);
+		      unsigned long *map_len);
 int extent_range_uptodate(struct extent_io_tree *tree,
 			  u64 start, u64 end);
 int extent_clear_unlock_delalloc(struct inode *inode,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 2d0410344ea3..7c97b3301459 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -183,22 +183,10 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
 	return 0;
 }
 
-int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
+static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
 {
-	int ret = 0;
 	struct extent_map *merge = NULL;
 	struct rb_node *rb;
-	struct extent_map *em;
-
-	write_lock(&tree->lock);
-	em = lookup_extent_mapping(tree, start, len);
-
-	WARN_ON(!em || em->start != start);
-
-	if (!em)
-		goto out;
-
-	clear_bit(EXTENT_FLAG_PINNED, &em->flags);
 
 	if (em->start != 0) {
 		rb = rb_prev(&em->rb_node);
@@ -225,6 +213,24 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
 		merge->in_tree = 0;
 		free_extent_map(merge);
 	}
+}
+
+int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
+{
+	int ret = 0;
+	struct extent_map *em;
+
+	write_lock(&tree->lock);
+	em = lookup_extent_mapping(tree, start, len);
+
+	WARN_ON(!em || em->start != start);
+
+	if (!em)
+		goto out;
+
+	clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+
+	try_merge_map(tree, em);
 
 	free_extent_map(em);
 out:
@@ -247,7 +253,6 @@ int add_extent_mapping(struct extent_map_tree *tree,
 		       struct extent_map *em)
 {
 	int ret = 0;
-	struct extent_map *merge = NULL;
 	struct rb_node *rb;
 	struct extent_map *exist;
 
@@ -263,30 +268,8 @@ int add_extent_mapping(struct extent_map_tree *tree,
 		goto out;
 	}
 	atomic_inc(&em->refs);
-	if (em->start != 0) {
-		rb = rb_prev(&em->rb_node);
-		if (rb)
-			merge = rb_entry(rb, struct extent_map, rb_node);
-		if (rb && mergable_maps(merge, em)) {
-			em->start = merge->start;
-			em->len += merge->len;
-			em->block_len += merge->block_len;
-			em->block_start = merge->block_start;
-			merge->in_tree = 0;
-			rb_erase(&merge->rb_node, &tree->map);
-			free_extent_map(merge);
-		}
-	}
-	rb = rb_next(&em->rb_node);
-	if (rb)
-		merge = rb_entry(rb, struct extent_map, rb_node);
-	if (rb && mergable_maps(em, merge)) {
-		em->len += merge->len;
-		em->block_len += merge->len;
-		rb_erase(&merge->rb_node, &tree->map);
-		merge->in_tree = 0;
-		free_extent_map(merge);
-	}
+
+	try_merge_map(tree, em);
 out:
 	return ret;
 }
@@ -299,19 +282,8 @@ static u64 range_end(u64 start, u64 len)
 	return start + len;
 }
 
-/**
- * lookup_extent_mapping - lookup extent_map
- * @tree:	tree to lookup in
- * @start:	byte offset to start the search
- * @len:	length of the lookup range
- *
- * Find and return the first extent_map struct in @tree that intersects the
- * [start, len] range.  There may be additional objects in the tree that
- * intersect, so check the object returned carefully to make sure that no
- * additional lookups are needed.
- */
-struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
-					 u64 start, u64 len)
+struct extent_map *__lookup_extent_mapping(struct extent_map_tree *tree,
+					   u64 start, u64 len, int strict)
 {
 	struct extent_map *em;
 	struct rb_node *rb_node;
@@ -320,38 +292,42 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
 	u64 end = range_end(start, len);
 
 	rb_node = __tree_search(&tree->map, start, &prev, &next);
-	if (!rb_node && prev) {
-		em = rb_entry(prev, struct extent_map, rb_node);
-		if (end > em->start && start < extent_map_end(em))
-			goto found;
-	}
-	if (!rb_node && next) {
-		em = rb_entry(next, struct extent_map, rb_node);
-		if (end > em->start && start < extent_map_end(em))
-			goto found;
-	}
 	if (!rb_node) {
-		em = NULL;
-		goto out;
-	}
-	if (IS_ERR(rb_node)) {
-		em = ERR_CAST(rb_node);
-		goto out;
+		if (prev)
+			rb_node = prev;
+		else if (next)
+			rb_node = next;
+		else
+			return NULL;
 	}
+
 	em = rb_entry(rb_node, struct extent_map, rb_node);
-	if (end > em->start && start < extent_map_end(em))
-		goto found;
 
-	em = NULL;
-	goto out;
+	if (strict && !(end > em->start && start < extent_map_end(em)))
+		return NULL;
 
-found:
 	atomic_inc(&em->refs);
-out:
 	return em;
 }
 
 /**
+ * lookup_extent_mapping - lookup extent_map
+ * @tree:	tree to lookup in
+ * @start:	byte offset to start the search
+ * @len:	length of the lookup range
+ *
+ * Find and return the first extent_map struct in @tree that intersects the
+ * [start, len] range.  There may be additional objects in the tree that
+ * intersect, so check the object returned carefully to make sure that no
+ * additional lookups are needed.
+ */
+struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
+					 u64 start, u64 len)
+{
+	return __lookup_extent_mapping(tree, start, len, 1);
+}
+
+/**
  * search_extent_mapping - find a nearby extent map
  * @tree:	tree to lookup in
  * @start:	byte offset to start the search
@@ -365,38 +341,7 @@ out:
 struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
 					 u64 start, u64 len)
 {
-	struct extent_map *em;
-	struct rb_node *rb_node;
-	struct rb_node *prev = NULL;
-	struct rb_node *next = NULL;
-
-	rb_node = __tree_search(&tree->map, start, &prev, &next);
-	if (!rb_node && prev) {
-		em = rb_entry(prev, struct extent_map, rb_node);
-		goto found;
-	}
-	if (!rb_node && next) {
-		em = rb_entry(next, struct extent_map, rb_node);
-		goto found;
-	}
-	if (!rb_node) {
-		em = NULL;
-		goto out;
-	}
-	if (IS_ERR(rb_node)) {
-		em = ERR_CAST(rb_node);
-		goto out;
-	}
-	em = rb_entry(rb_node, struct extent_map, rb_node);
-	goto found;
-
-	em = NULL;
-	goto out;
-
-found:
-	atomic_inc(&em->refs);
-out:
-	return em;
+	return __lookup_extent_mapping(tree, start, len, 0);
 }
 
 /**
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 90d4ee52cd45..a1cb7821becd 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -177,6 +177,17 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
 
 	WARN_ON(bio->bi_vcnt <= 0);
 
+	/*
+	 * the free space stuff is only read when it hasn't been
+	 * updated in the current transaction. So, we can safely
+	 * read from the commit root and sidestep a nasty deadlock
+	 * between reading the free space cache and updating the csum tree.
+	 */
+	if (btrfs_is_free_space_inode(root, inode)) {
+		path->search_commit_root = 1;
+		path->skip_locking = 1;
+	}
+
 	disk_bytenr = (u64)bio->bi_sector << 9;
 	if (dio)
 		offset = logical_offset;
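
search_commit_root makes the path walk the last committed version of the
tree, and skip_locking is safe there because the commit root is immutable
until the next transaction commits. The same two-flag pattern appears in
the free-space cache loader; sketched:

        struct btrfs_path *path = btrfs_alloc_path();
        if (!path)
            return -ENOMEM;
        /* read-only view of the csum tree as of the last commit */
        path->search_commit_root = 1;
        path->skip_locking = 1;
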
@@ -282,7 +293,8 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
 	u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
 
 	path = btrfs_alloc_path();
-	BUG_ON(!path);
+	if (!path)
+		return -ENOMEM;
 
 	if (search_commit) {
 		path->skip_locking = 1;
@@ -664,15 +676,13 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
 	struct btrfs_sector_sum *sector_sum;
 	u32 nritems;
 	u32 ins_size;
-	char *eb_map;
-	char *eb_token;
-	unsigned long map_len;
-	unsigned long map_start;
 	u16 csum_size =
 		btrfs_super_csum_size(&root->fs_info->super_copy);
 
 	path = btrfs_alloc_path();
-	BUG_ON(!path);
+	if (!path)
+		return -ENOMEM;
+
 	sector_sum = sums->sums;
 again:
 	next_offset = (u64)-1;
@@ -814,30 +824,9 @@ found:
 	item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
 	item_end = (struct btrfs_csum_item *)((unsigned char *)item_end +
 				      btrfs_item_size_nr(leaf, path->slots[0]));
-	eb_token = NULL;
 next_sector:
 
-	if (!eb_token ||
-	    (unsigned long)item + csum_size >= map_start + map_len) {
-		int err;
-
-		if (eb_token)
-			unmap_extent_buffer(leaf, eb_token, KM_USER1);
-		eb_token = NULL;
-		err = map_private_extent_buffer(leaf, (unsigned long)item,
-						csum_size,
-						&eb_token, &eb_map,
-						&map_start, &map_len, KM_USER1);
-		if (err)
-			eb_token = NULL;
-	}
-	if (eb_token) {
-		memcpy(eb_token + ((unsigned long)item & (PAGE_CACHE_SIZE - 1)),
-		       &sector_sum->sum, csum_size);
-	} else {
-		write_extent_buffer(leaf, &sector_sum->sum,
-				    (unsigned long)item, csum_size);
-	}
+	write_extent_buffer(leaf, &sector_sum->sum, (unsigned long)item, csum_size);
 
 	total_bytes += root->sectorsize;
 	sector_sum++;
@@ -850,10 +839,7 @@ next_sector:
 			goto next_sector;
 		}
 	}
-	if (eb_token) {
-		unmap_extent_buffer(leaf, eb_token, KM_USER1);
-		eb_token = NULL;
-	}
+
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 	if (total_bytes < sums->len) {
 		btrfs_release_path(path);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 59cbdb120ad0..3c3abff731a7 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -74,7 +74,7 @@ struct inode_defrag {
  * If an existing record is found the defrag item you
  * pass in is freed
  */
-static int __btrfs_add_inode_defrag(struct inode *inode,
+static void __btrfs_add_inode_defrag(struct inode *inode,
 				    struct inode_defrag *defrag)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -106,11 +106,11 @@ static int __btrfs_add_inode_defrag(struct inode *inode,
 	BTRFS_I(inode)->in_defrag = 1;
 	rb_link_node(&defrag->rb_node, parent, p);
 	rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
-	return 0;
+	return;
 
 exists:
 	kfree(defrag);
-	return 0;
+	return;
 
 }
 
@@ -123,7 +123,6 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct inode_defrag *defrag;
-	int ret = 0;
 	u64 transid;
 
 	if (!btrfs_test_opt(root, AUTO_DEFRAG))
@@ -150,9 +149,11 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
 
 	spin_lock(&root->fs_info->defrag_inodes_lock);
 	if (!BTRFS_I(inode)->in_defrag)
-		ret = __btrfs_add_inode_defrag(inode, defrag);
+		__btrfs_add_inode_defrag(inode, defrag);
+	else
+		kfree(defrag);
 	spin_unlock(&root->fs_info->defrag_inodes_lock);
-	return ret;
+	return 0;
 }
 
 /*
@@ -855,7 +856,8 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
 	btrfs_drop_extent_cache(inode, start, end - 1, 0);
 
 	path = btrfs_alloc_path();
-	BUG_ON(!path);
+	if (!path)
+		return -ENOMEM;
 again:
 	recow = 0;
 	split = start;
@@ -1059,7 +1061,7 @@ static int prepare_uptodate_page(struct page *page, u64 pos)
 static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
 			 struct page **pages, size_t num_pages,
 			 loff_t pos, unsigned long first_index,
-			 unsigned long last_index, size_t write_bytes)
+			 size_t write_bytes)
 {
 	struct extent_state *cached_state = NULL;
 	int i;
@@ -1073,15 +1075,10 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
 	start_pos = pos & ~((u64)root->sectorsize - 1);
 	last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;
 
-	if (start_pos > inode->i_size) {
-		err = btrfs_cont_expand(inode, i_size_read(inode), start_pos);
-		if (err)
-			return err;
-	}
-
 again:
 	for (i = 0; i < num_pages; i++) {
-		pages[i] = grab_cache_page(inode->i_mapping, index + i);
+		pages[i] = find_or_create_page(inode->i_mapping, index + i,
+					       GFP_NOFS);
 		if (!pages[i]) {
 			faili = i - 1;
 			err = -ENOMEM;
@@ -1158,7 +1155,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct page **pages = NULL;
 	unsigned long first_index;
-	unsigned long last_index;
 	size_t num_written = 0;
 	int nrptrs;
 	int ret = 0;
@@ -1171,7 +1167,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 		return -ENOMEM;
 
 	first_index = pos >> PAGE_CACHE_SHIFT;
-	last_index = (pos + iov_iter_count(i)) >> PAGE_CACHE_SHIFT;
 
 	while (iov_iter_count(i) > 0) {
 		size_t offset = pos & (PAGE_CACHE_SIZE - 1);
@@ -1205,8 +1200,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 		 * contents of pages from loop to loop
 		 */
 		ret = prepare_pages(root, file, pages, num_pages,
-				    pos, first_index, last_index,
-				    write_bytes);
+				    pos, first_index, write_bytes);
 		if (ret) {
 			btrfs_delalloc_release_space(inode,
 					num_pages << PAGE_CACHE_SHIFT);
@@ -1238,9 +1232,11 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 		 * managed to copy.
 		 */
 		if (num_pages > dirty_pages) {
-			if (copied > 0)
-				atomic_inc(
-					&BTRFS_I(inode)->outstanding_extents);
+			if (copied > 0) {
+				spin_lock(&BTRFS_I(inode)->lock);
+				BTRFS_I(inode)->outstanding_extents++;
+				spin_unlock(&BTRFS_I(inode)->lock);
+			}
 			btrfs_delalloc_release_space(inode,
 					(num_pages - dirty_pages) <<
 					PAGE_CACHE_SHIFT);
@@ -1336,6 +1332,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 	struct inode *inode = fdentry(file)->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	loff_t *ppos = &iocb->ki_pos;
+	u64 start_pos;
 	ssize_t num_written = 0;
 	ssize_t err = 0;
 	size_t count, ocount;
@@ -1384,6 +1381,15 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 	file_update_time(file);
 	BTRFS_I(inode)->sequence++;
 
+	start_pos = round_down(pos, root->sectorsize);
+	if (start_pos > i_size_read(inode)) {
+		err = btrfs_cont_expand(inode, i_size_read(inode), start_pos);
+		if (err) {
+			mutex_unlock(&inode->i_mutex);
+			goto out;
+		}
+	}
+
 	if (unlikely(file->f_flags & O_DIRECT)) {
 		num_written = __btrfs_direct_write(iocb, iov, nr_segs,
 						   pos, ppos, count, ocount);
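
Moving the expansion here (out of prepare_pages(), removed above) means the
i_size check happens once per write, with i_mutex held. round_down() clamps
pos to a sector boundary before comparing; for example, with a 4096-byte
sectorsize and a write at pos 10000:

        start_pos = round_down(10000, 4096);    /* = 8192 */
        /* expand only if 8192 lands beyond the current i_size */
        if (start_pos > i_size_read(inode))
            err = btrfs_cont_expand(inode, i_size_read(inode), start_pos);
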
@@ -1638,11 +1644,15 @@ static long btrfs_fallocate(struct file *file, int mode,
 
 	cur_offset = alloc_start;
 	while (1) {
+		u64 actual_end;
+
 		em = btrfs_get_extent(inode, NULL, 0, cur_offset,
 				      alloc_end - cur_offset, 0);
 		BUG_ON(IS_ERR_OR_NULL(em));
 		last_byte = min(extent_map_end(em), alloc_end);
+		actual_end = min_t(u64, extent_map_end(em), offset + len);
 		last_byte = (last_byte + mask) & ~mask;
+
 		if (em->block_start == EXTENT_MAP_HOLE ||
 		    (cur_offset >= inode->i_size &&
 		     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
@@ -1655,6 +1665,16 @@ static long btrfs_fallocate(struct file *file, int mode,
 				free_extent_map(em);
 				break;
 			}
+		} else if (actual_end > inode->i_size &&
+			   !(mode & FALLOC_FL_KEEP_SIZE)) {
+			/*
+			 * We didn't need to allocate any more space, but we
+			 * still extended the size of the file so we need to
+			 * update i_size.
+			 */
+			inode->i_ctime = CURRENT_TIME;
+			i_size_write(inode, actual_end);
+			btrfs_ordered_update_i_size(inode, actual_end, NULL);
 		}
 		free_extent_map(em);
 
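
This branch covers a fallocate() without FALLOC_FL_KEEP_SIZE that lands
entirely inside already-preallocated extents: no new allocation happens, yet
the file must still grow. actual_end clamps to the requested range; for
example, for offset=0 and len=10000 over a 16K preallocated extent:

        actual_end = min_t(u64, extent_map_end(em), offset + len);
        /* = min(16384, 10000) = 10000: i_size becomes exactly the
         * requested end, not the end of the preallocated extent */
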
@@ -1804,10 +1824,14 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin)
 		}
 	}
 
-	if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
-		return -EINVAL;
-	if (offset > inode->i_sb->s_maxbytes)
-		return -EINVAL;
+	if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) {
+		ret = -EINVAL;
+		goto out;
+	}
+	if (offset > inode->i_sb->s_maxbytes) {
+		ret = -EINVAL;
+		goto out;
+	}
 
 	/* Special lock needed here? */
 	if (offset != file->f_pos) {
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index bf0d61567f3d..41ac927401d0 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -98,6 +98,12 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
 		return inode;
 
 	spin_lock(&block_group->lock);
+	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) {
+		printk(KERN_INFO "Old style space inode found, converting.\n");
+		BTRFS_I(inode)->flags &= ~BTRFS_INODE_NODATASUM;
+		block_group->disk_cache_state = BTRFS_DC_CLEAR;
+	}
+
 	if (!btrfs_fs_closing(root->fs_info)) {
 		block_group->inode = igrab(inode);
 		block_group->iref = 1;
@@ -135,7 +141,7 @@ int __create_free_space_inode(struct btrfs_root *root,
 	btrfs_set_inode_gid(leaf, inode_item, 0);
 	btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
 	btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS |
-			      BTRFS_INODE_PREALLOC | BTRFS_INODE_NODATASUM);
+			      BTRFS_INODE_PREALLOC);
 	btrfs_set_inode_nlink(leaf, inode_item, 1);
 	btrfs_set_inode_transid(leaf, inode_item, trans->transid);
 	btrfs_set_inode_block_group(leaf, inode_item, offset);
@@ -184,9 +190,11 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
 				  struct btrfs_path *path,
 				  struct inode *inode)
 {
+	struct btrfs_block_rsv *rsv;
 	loff_t oldsize;
 	int ret = 0;
 
+	rsv = trans->block_rsv;
 	trans->block_rsv = root->orphan_block_rsv;
 	ret = btrfs_block_rsv_check(trans, root,
 				    root->orphan_block_rsv,
@@ -204,6 +212,8 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
 	 */
 	ret = btrfs_truncate_inode_items(trans, root, inode,
 					 0, BTRFS_EXTENT_DATA_KEY);
+
+	trans->block_rsv = rsv;
 	if (ret) {
 		WARN_ON(1);
 		return ret;
@@ -239,17 +249,12 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
 	struct btrfs_free_space_header *header;
 	struct extent_buffer *leaf;
 	struct page *page;
-	u32 *checksums = NULL, *crc;
-	char *disk_crcs = NULL;
 	struct btrfs_key key;
 	struct list_head bitmaps;
 	u64 num_entries;
 	u64 num_bitmaps;
 	u64 generation;
-	u32 cur_crc = ~(u32)0;
 	pgoff_t index = 0;
-	unsigned long first_page_offset;
-	int num_checksums;
 	int ret = 0;
 
 	INIT_LIST_HEAD(&bitmaps);
@@ -292,16 +297,6 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
 	if (!num_entries)
 		goto out;
 
-	/* Setup everything for doing checksumming */
-	num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE;
-	checksums = crc = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS);
-	if (!checksums)
-		goto out;
-	first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64);
-	disk_crcs = kzalloc(first_page_offset, GFP_NOFS);
-	if (!disk_crcs)
-		goto out;
-
 	ret = readahead_cache(inode);
 	if (ret)
 		goto out;
@@ -311,18 +306,12 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
 		struct btrfs_free_space *e;
 		void *addr;
 		unsigned long offset = 0;
-		unsigned long start_offset = 0;
 		int need_loop = 0;
 
 		if (!num_entries && !num_bitmaps)
 			break;
 
-		if (index == 0) {
-			start_offset = first_page_offset;
-			offset = start_offset;
-		}
-
-		page = grab_cache_page(inode->i_mapping, index);
+		page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
 		if (!page)
 			goto free_cache;
 
@@ -342,8 +331,15 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
 		if (index == 0) {
 			u64 *gen;
 
-			memcpy(disk_crcs, addr, first_page_offset);
-			gen = addr + (sizeof(u32) * num_checksums);
+			/*
+			 * We put a bogus crc in the front of the first page in
+			 * case old kernels try to mount a fs with the new
+			 * format to make sure they discard the cache.
+			 */
+			addr += sizeof(u64);
+			offset += sizeof(u64);
+
+			gen = addr;
 			if (*gen != BTRFS_I(inode)->generation) {
 				printk(KERN_ERR "btrfs: space cache generation"
 				       " (%llu) does not match inode (%llu)\n",
@@ -355,24 +351,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
 				page_cache_release(page);
 				goto free_cache;
 			}
-			crc = (u32 *)disk_crcs;
+			addr += sizeof(u64);
+			offset += sizeof(u64);
 		}
-		entry = addr + start_offset;
-
-		/* First lets check our crc before we do anything fun */
-		cur_crc = ~(u32)0;
-		cur_crc = btrfs_csum_data(root, addr + start_offset, cur_crc,
-					  PAGE_CACHE_SIZE - start_offset);
-		btrfs_csum_final(cur_crc, (char *)&cur_crc);
-		if (cur_crc != *crc) {
-			printk(KERN_ERR "btrfs: crc mismatch for page %lu\n",
-			       index);
-			kunmap(page);
-			unlock_page(page);
-			page_cache_release(page);
-			goto free_cache;
-		}
-		crc++;
+		entry = addr;
 
 		while (1) {
 			if (!num_entries)
@@ -470,8 +452,6 @@ next:
 
 	ret = 1;
 out:
-	kfree(checksums);
-	kfree(disk_crcs);
 	return ret;
 free_cache:
 	__btrfs_remove_free_space_cache(ctl);
@@ -569,8 +549,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 	struct btrfs_key key;
 	u64 start, end, len;
 	u64 bytes = 0;
-	u32 *crc, *checksums;
-	unsigned long first_page_offset;
+	u32 crc = ~(u32)0;
 	int index = 0, num_pages = 0;
 	int entries = 0;
 	int bitmaps = 0;
@@ -590,34 +569,13 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 	num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
 		PAGE_CACHE_SHIFT;
 
-	/* Since the first page has all of our checksums and our generation we
-	 * need to calculate the offset into the page that we can start writing
-	 * our entries.
-	 */
-	first_page_offset = (sizeof(u32) * num_pages) + sizeof(u64);
-
 	filemap_write_and_wait(inode->i_mapping);
 	btrfs_wait_ordered_range(inode, inode->i_size &
 				 ~(root->sectorsize - 1), (u64)-1);
 
-	/* make sure we don't overflow that first page */
-	if (first_page_offset + sizeof(struct btrfs_free_space_entry) >= PAGE_CACHE_SIZE) {
-		/* this is really the same as running out of space, where we also return 0 */
-		printk(KERN_CRIT "Btrfs: free space cache was too big for the crc page\n");
-		ret = 0;
-		goto out_update;
-	}
-
-	/* We need a checksum per page. */
-	crc = checksums = kzalloc(sizeof(u32) * num_pages, GFP_NOFS);
-	if (!crc)
-		return -1;
-
 	pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS);
-	if (!pages) {
-		kfree(crc);
+	if (!pages)
 		return -1;
-	}
 
 	/* Get the cluster for this block_group if it exists */
 	if (block_group && !list_empty(&block_group->cluster_list))
@@ -640,7 +598,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 	 * know and don't freak out.
 	 */
 	while (index < num_pages) {
-		page = grab_cache_page(inode->i_mapping, index);
+		page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
 		if (!page) {
 			int i;
 
@@ -648,7 +606,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 				unlock_page(pages[i]);
 				page_cache_release(pages[i]);
 			}
-			goto out_free;
+			goto out;
 		}
 		pages[index] = page;
 		index++;
@@ -668,17 +626,11 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 	/* Write out the extent entries */
 	do {
 		struct btrfs_free_space_entry *entry;
-		void *addr;
+		void *addr, *orig;
 		unsigned long offset = 0;
-		unsigned long start_offset = 0;
 
 		next_page = false;
 
-		if (index == 0) {
-			start_offset = first_page_offset;
-			offset = start_offset;
-		}
-
 		if (index >= num_pages) {
 			out_of_space = true;
 			break;
@@ -686,10 +638,26 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 
 		page = pages[index];
 
-		addr = kmap(page);
-		entry = addr + start_offset;
+		orig = addr = kmap(page);
+		if (index == 0) {
+			u64 *gen;
 
-		memset(addr, 0, PAGE_CACHE_SIZE);
+			/*
+			 * We're going to put in a bogus crc for this page to
+			 * make sure that old kernels who aren't aware of this
+			 * format will be sure to discard the cache.
+			 */
+			addr += sizeof(u64);
+			offset += sizeof(u64);
+
+			gen = addr;
+			*gen = trans->transid;
+			addr += sizeof(u64);
+			offset += sizeof(u64);
+		}
+		entry = addr;
+
+		memset(addr, 0, PAGE_CACHE_SIZE - offset);
 		while (node && !next_page) {
 			struct btrfs_free_space *e;
 
@@ -752,13 +720,19 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 				next_page = true;
 				entry++;
 			}
-			*crc = ~(u32)0;
-			*crc = btrfs_csum_data(root, addr + start_offset, *crc,
-					       PAGE_CACHE_SIZE - start_offset);
-			kunmap(page);
 
-			btrfs_csum_final(*crc, (char *)crc);
-			crc++;
+		/* Generate bogus crc value */
+		if (index == 0) {
+			u32 *tmp;
+			crc = btrfs_csum_data(root, orig + sizeof(u64), crc,
+					      PAGE_CACHE_SIZE - sizeof(u64));
+			btrfs_csum_final(crc, (char *)&crc);
+			crc++;
+			tmp = orig;
+			*tmp = crc;
+		}
+
+		kunmap(page);
 
 		bytes += PAGE_CACHE_SIZE;
 
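
With the per-page checksum array gone, the first cache page now carries
only two 8-byte headers before the entries. The crc slot is deliberately
written with a value old kernels will fail to verify, forcing them to
discard the cache rather than misparse the new layout. Sketch of the first
page (illustrative, not a real on-disk struct):

        struct first_page_hdr {
            u64 crc_slot;   /* offset 0: bogus crc old kernels reject */
            u64 generation; /* offset 8: must match the inode's generation */
        };
        /* offset 16 onward: btrfs_free_space_entry records */
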
@@ -779,11 +753,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 
 		addr = kmap(page);
 		memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE);
-		*crc = ~(u32)0;
-		*crc = btrfs_csum_data(root, addr, *crc, PAGE_CACHE_SIZE);
 		kunmap(page);
-		btrfs_csum_final(*crc, (char *)crc);
-		crc++;
 		bytes += PAGE_CACHE_SIZE;
 
 		list_del_init(&entry->list);
@@ -796,7 +766,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 				 i_size_read(inode) - 1, &cached_state,
 				 GFP_NOFS);
 		ret = 0;
-		goto out_free;
+		goto out;
 	}
 
 	/* Zero out the rest of the pages just to make sure */
@@ -811,20 +781,6 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
811 index++; 781 index++;
812 } 782 }
813 783
814 /* Write the checksums and trans id to the first page */
815 {
816 void *addr;
817 u64 *gen;
818
819 page = pages[0];
820
821 addr = kmap(page);
822 memcpy(addr, checksums, sizeof(u32) * num_pages);
823 gen = addr + (sizeof(u32) * num_pages);
824 *gen = trans->transid;
825 kunmap(page);
826 }
827
828 ret = btrfs_dirty_pages(root, inode, pages, num_pages, 0, 784 ret = btrfs_dirty_pages(root, inode, pages, num_pages, 0,
829 bytes, &cached_state); 785 bytes, &cached_state);
830 btrfs_drop_pages(pages, num_pages); 786 btrfs_drop_pages(pages, num_pages);
@@ -833,7 +789,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
833 789
834 if (ret) { 790 if (ret) {
835 ret = 0; 791 ret = 0;
836 goto out_free; 792 goto out;
837 } 793 }
838 794
839 BTRFS_I(inode)->generation = trans->transid; 795 BTRFS_I(inode)->generation = trans->transid;
@@ -850,7 +806,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
850 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, 806 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1,
851 EXTENT_DIRTY | EXTENT_DELALLOC | 807 EXTENT_DIRTY | EXTENT_DELALLOC |
852 EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS); 808 EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS);
853 goto out_free; 809 goto out;
854 } 810 }
855 leaf = path->nodes[0]; 811 leaf = path->nodes[0];
856 if (ret > 0) { 812 if (ret > 0) {
@@ -866,7 +822,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
866 EXTENT_DO_ACCOUNTING, 0, 0, NULL, 822 EXTENT_DO_ACCOUNTING, 0, 0, NULL,
867 GFP_NOFS); 823 GFP_NOFS);
868 btrfs_release_path(path); 824 btrfs_release_path(path);
869 goto out_free; 825 goto out;
870 } 826 }
871 } 827 }
872 header = btrfs_item_ptr(leaf, path->slots[0], 828 header = btrfs_item_ptr(leaf, path->slots[0],
@@ -879,11 +835,8 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
879 835
880 ret = 1; 836 ret = 1;
881 837
882out_free: 838out:
883 kfree(checksums);
884 kfree(pages); 839 kfree(pages);
885
886out_update:
887 if (ret != 1) { 840 if (ret != 1) {
888 invalidate_inode_pages2_range(inode->i_mapping, 0, index); 841 invalidate_inode_pages2_range(inode->i_mapping, 0, index);
889 BTRFS_I(inode)->generation = 0; 842 BTRFS_I(inode)->generation = 0;
@@ -1219,9 +1172,9 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
1219 div64_u64(extent_bytes, (sizeof(struct btrfs_free_space))); 1172 div64_u64(extent_bytes, (sizeof(struct btrfs_free_space)));
1220} 1173}
1221 1174
1222static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, 1175static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
1223 struct btrfs_free_space *info, u64 offset, 1176 struct btrfs_free_space *info,
1224 u64 bytes) 1177 u64 offset, u64 bytes)
1225{ 1178{
1226 unsigned long start, count; 1179 unsigned long start, count;
1227 1180
@@ -1232,6 +1185,13 @@ static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
1232 bitmap_clear(info->bitmap, start, count); 1185 bitmap_clear(info->bitmap, start, count);
1233 1186
1234 info->bytes -= bytes; 1187 info->bytes -= bytes;
1188}
1189
1190static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
1191 struct btrfs_free_space *info, u64 offset,
1192 u64 bytes)
1193{
1194 __bitmap_clear_bits(ctl, info, offset, bytes);
1235 ctl->free_space -= bytes; 1195 ctl->free_space -= bytes;
1236} 1196}
1237 1197
@@ -2035,7 +1995,7 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
2035 return 0; 1995 return 0;
2036 1996
2037 ret = search_start; 1997 ret = search_start;
2038 bitmap_clear_bits(ctl, entry, ret, bytes); 1998 __bitmap_clear_bits(ctl, entry, ret, bytes);
2039 1999
2040 return ret; 2000 return ret;
2041} 2001}
@@ -2090,7 +2050,6 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
2090 continue; 2050 continue;
2091 } 2051 }
2092 } else { 2052 } else {
2093
2094 ret = entry->offset; 2053 ret = entry->offset;
2095 2054
2096 entry->offset += bytes; 2055 entry->offset += bytes;
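
A note on the free-space-cache.c rewrite above: the separate checksum array (the old checksums allocation, the per-page btrfs_csum_data() calls, and the final copy onto page 0) is gone. Instead the first page carries an inline header: a u64 slot whose low u32 holds a deliberately wrong checksum, then the u64 transaction generation. Old kernels parse that slot as the start of their crc array, see a mismatch, and throw the cache away. Below is a minimal user-space sketch of the layout as this hunk defines it; csum32 is only a stand-in for btrfs_csum_data()/btrfs_csum_final() (crc32c in the kernel), and PAGE_SIZE is assumed to be 4096.

#include <stdint.h>
#include <string.h>

#define PAGE_SIZE 4096          /* assumption; PAGE_CACHE_SIZE in the kernel */

/* placeholder checksum, NOT the kernel's crc32c */
static uint32_t csum32(const uint8_t *data, size_t len)
{
        uint32_t sum = ~0u;

        while (len--)
                sum = (sum << 5) + sum + *data++;
        return ~sum;
}

/* first-page layout from the hunk: [u64 crc slot][u64 generation][entries] */
static void write_first_page(uint8_t *page, uint64_t transid)
{
        uint32_t crc;

        memset(page, 0, PAGE_SIZE);
        memcpy(page + sizeof(uint64_t), &transid, sizeof(transid));

        /* ... free-space entries would be appended after the generation ... */

        /* checksum everything past the reserved slot, then bump it by one
         * (the crc++ in the hunk) so pre-format kernels reject the cache */
        crc = csum32(page + sizeof(uint64_t), PAGE_SIZE - sizeof(uint64_t));
        crc++;
        memcpy(page, &crc, sizeof(crc));
}
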
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e91b097e7252..4d14de6d121b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -750,15 +750,6 @@ static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
750 return alloc_hint; 750 return alloc_hint;
751} 751}
752 752
753static inline bool is_free_space_inode(struct btrfs_root *root,
754 struct inode *inode)
755{
756 if (root == root->fs_info->tree_root ||
757 BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID)
758 return true;
759 return false;
760}
761
762/* 753/*
763 * when extent_io.c finds a delayed allocation range in the file, 754 * when extent_io.c finds a delayed allocation range in the file,
764 * the call backs end up in this code. The basic idea is to 755 * the call backs end up in this code. The basic idea is to
@@ -791,7 +782,7 @@ static noinline int cow_file_range(struct inode *inode,
791 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 782 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
792 int ret = 0; 783 int ret = 0;
793 784
794 BUG_ON(is_free_space_inode(root, inode)); 785 BUG_ON(btrfs_is_free_space_inode(root, inode));
795 trans = btrfs_join_transaction(root); 786 trans = btrfs_join_transaction(root);
796 BUG_ON(IS_ERR(trans)); 787 BUG_ON(IS_ERR(trans));
797 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 788 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
@@ -1070,9 +1061,10 @@ static noinline int run_delalloc_nocow(struct inode *inode,
1070 u64 ino = btrfs_ino(inode); 1061 u64 ino = btrfs_ino(inode);
1071 1062
1072 path = btrfs_alloc_path(); 1063 path = btrfs_alloc_path();
1073 BUG_ON(!path); 1064 if (!path)
1065 return -ENOMEM;
1074 1066
1075 nolock = is_free_space_inode(root, inode); 1067 nolock = btrfs_is_free_space_inode(root, inode);
1076 1068
1077 if (nolock) 1069 if (nolock)
1078 trans = btrfs_join_transaction_nolock(root); 1070 trans = btrfs_join_transaction_nolock(root);
@@ -1291,15 +1283,16 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1291 return ret; 1283 return ret;
1292} 1284}
1293 1285
1294static int btrfs_split_extent_hook(struct inode *inode, 1286static void btrfs_split_extent_hook(struct inode *inode,
1295 struct extent_state *orig, u64 split) 1287 struct extent_state *orig, u64 split)
1296{ 1288{
1297 /* not delalloc, ignore it */ 1289 /* not delalloc, ignore it */
1298 if (!(orig->state & EXTENT_DELALLOC)) 1290 if (!(orig->state & EXTENT_DELALLOC))
1299 return 0; 1291 return;
1300 1292
1301 atomic_inc(&BTRFS_I(inode)->outstanding_extents); 1293 spin_lock(&BTRFS_I(inode)->lock);
1302 return 0; 1294 BTRFS_I(inode)->outstanding_extents++;
1295 spin_unlock(&BTRFS_I(inode)->lock);
1303} 1296}
1304 1297
1305/* 1298/*
@@ -1308,16 +1301,17 @@ static int btrfs_split_extent_hook(struct inode *inode,
1308 * extents, such as when we are doing sequential writes, so we can properly 1301 * extents, such as when we are doing sequential writes, so we can properly
1309 * account for the metadata space we'll need. 1302 * account for the metadata space we'll need.
1310 */ 1303 */
1311static int btrfs_merge_extent_hook(struct inode *inode, 1304static void btrfs_merge_extent_hook(struct inode *inode,
1312 struct extent_state *new, 1305 struct extent_state *new,
1313 struct extent_state *other) 1306 struct extent_state *other)
1314{ 1307{
1315 /* not delalloc, ignore it */ 1308 /* not delalloc, ignore it */
1316 if (!(other->state & EXTENT_DELALLOC)) 1309 if (!(other->state & EXTENT_DELALLOC))
1317 return 0; 1310 return;
1318 1311
1319 atomic_dec(&BTRFS_I(inode)->outstanding_extents); 1312 spin_lock(&BTRFS_I(inode)->lock);
1320 return 0; 1313 BTRFS_I(inode)->outstanding_extents--;
1314 spin_unlock(&BTRFS_I(inode)->lock);
1321} 1315}
1322 1316
1323/* 1317/*
@@ -1325,8 +1319,8 @@ static int btrfs_merge_extent_hook(struct inode *inode,
1325 * bytes in this file, and to maintain the list of inodes that 1319 * bytes in this file, and to maintain the list of inodes that
1326 * have pending delalloc work to be done. 1320 * have pending delalloc work to be done.
1327 */ 1321 */
1328static int btrfs_set_bit_hook(struct inode *inode, 1322static void btrfs_set_bit_hook(struct inode *inode,
1329 struct extent_state *state, int *bits) 1323 struct extent_state *state, int *bits)
1330{ 1324{
1331 1325
1332 /* 1326 /*
@@ -1337,12 +1331,15 @@ static int btrfs_set_bit_hook(struct inode *inode,
1337 if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { 1331 if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1338 struct btrfs_root *root = BTRFS_I(inode)->root; 1332 struct btrfs_root *root = BTRFS_I(inode)->root;
1339 u64 len = state->end + 1 - state->start; 1333 u64 len = state->end + 1 - state->start;
1340 bool do_list = !is_free_space_inode(root, inode); 1334 bool do_list = !btrfs_is_free_space_inode(root, inode);
1341 1335
1342 if (*bits & EXTENT_FIRST_DELALLOC) 1336 if (*bits & EXTENT_FIRST_DELALLOC) {
1343 *bits &= ~EXTENT_FIRST_DELALLOC; 1337 *bits &= ~EXTENT_FIRST_DELALLOC;
1344 else 1338 } else {
1345 atomic_inc(&BTRFS_I(inode)->outstanding_extents); 1339 spin_lock(&BTRFS_I(inode)->lock);
1340 BTRFS_I(inode)->outstanding_extents++;
1341 spin_unlock(&BTRFS_I(inode)->lock);
1342 }
1346 1343
1347 spin_lock(&root->fs_info->delalloc_lock); 1344 spin_lock(&root->fs_info->delalloc_lock);
1348 BTRFS_I(inode)->delalloc_bytes += len; 1345 BTRFS_I(inode)->delalloc_bytes += len;
@@ -1353,14 +1350,13 @@ static int btrfs_set_bit_hook(struct inode *inode,
1353 } 1350 }
1354 spin_unlock(&root->fs_info->delalloc_lock); 1351 spin_unlock(&root->fs_info->delalloc_lock);
1355 } 1352 }
1356 return 0;
1357} 1353}
1358 1354
1359/* 1355/*
1360 * extent_io.c clear_bit_hook, see set_bit_hook for why 1356 * extent_io.c clear_bit_hook, see set_bit_hook for why
1361 */ 1357 */
1362static int btrfs_clear_bit_hook(struct inode *inode, 1358static void btrfs_clear_bit_hook(struct inode *inode,
1363 struct extent_state *state, int *bits) 1359 struct extent_state *state, int *bits)
1364{ 1360{
1365 /* 1361 /*
1366 * set_bit and clear bit hooks normally require _irqsave/restore 1362 * set_bit and clear bit hooks normally require _irqsave/restore
@@ -1370,12 +1366,15 @@ static int btrfs_clear_bit_hook(struct inode *inode,
1370 if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { 1366 if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1371 struct btrfs_root *root = BTRFS_I(inode)->root; 1367 struct btrfs_root *root = BTRFS_I(inode)->root;
1372 u64 len = state->end + 1 - state->start; 1368 u64 len = state->end + 1 - state->start;
1373 bool do_list = !is_free_space_inode(root, inode); 1369 bool do_list = !btrfs_is_free_space_inode(root, inode);
1374 1370
1375 if (*bits & EXTENT_FIRST_DELALLOC) 1371 if (*bits & EXTENT_FIRST_DELALLOC) {
1376 *bits &= ~EXTENT_FIRST_DELALLOC; 1372 *bits &= ~EXTENT_FIRST_DELALLOC;
1377 else if (!(*bits & EXTENT_DO_ACCOUNTING)) 1373 } else if (!(*bits & EXTENT_DO_ACCOUNTING)) {
1378 atomic_dec(&BTRFS_I(inode)->outstanding_extents); 1374 spin_lock(&BTRFS_I(inode)->lock);
1375 BTRFS_I(inode)->outstanding_extents--;
1376 spin_unlock(&BTRFS_I(inode)->lock);
1377 }
1379 1378
1380 if (*bits & EXTENT_DO_ACCOUNTING) 1379 if (*bits & EXTENT_DO_ACCOUNTING)
1381 btrfs_delalloc_release_metadata(inode, len); 1380 btrfs_delalloc_release_metadata(inode, len);
@@ -1394,7 +1393,6 @@ static int btrfs_clear_bit_hook(struct inode *inode,
1394 } 1393 }
1395 spin_unlock(&root->fs_info->delalloc_lock); 1394 spin_unlock(&root->fs_info->delalloc_lock);
1396 } 1395 }
1397 return 0;
1398} 1396}
1399 1397
1400/* 1398/*
@@ -1477,7 +1475,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1477 1475
1478 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 1476 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
1479 1477
1480 if (is_free_space_inode(root, inode)) 1478 if (btrfs_is_free_space_inode(root, inode))
1481 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 2); 1479 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 2);
1482 else 1480 else
1483 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); 1481 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
@@ -1644,7 +1642,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1644 int ret; 1642 int ret;
1645 1643
1646 path = btrfs_alloc_path(); 1644 path = btrfs_alloc_path();
1647 BUG_ON(!path); 1645 if (!path)
1646 return -ENOMEM;
1648 1647
1649 path->leave_spinning = 1; 1648 path->leave_spinning = 1;
1650 1649
@@ -1726,7 +1725,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1726 return 0; 1725 return 0;
1727 BUG_ON(!ordered_extent); 1726 BUG_ON(!ordered_extent);
1728 1727
1729 nolock = is_free_space_inode(root, inode); 1728 nolock = btrfs_is_free_space_inode(root, inode);
1730 1729
1731 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { 1730 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
1732 BUG_ON(!list_empty(&ordered_extent->list)); 1731 BUG_ON(!list_empty(&ordered_extent->list));
@@ -1787,7 +1786,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1787 &ordered_extent->list); 1786 &ordered_extent->list);
1788 1787
1789 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1788 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1790 if (!ret) { 1789 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
1791 ret = btrfs_update_inode(trans, root, inode); 1790 ret = btrfs_update_inode(trans, root, inode);
1792 BUG_ON(ret); 1791 BUG_ON(ret);
1793 } 1792 }
@@ -2214,7 +2213,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2214 2213
2215 if (!root->orphan_block_rsv) { 2214 if (!root->orphan_block_rsv) {
2216 block_rsv = btrfs_alloc_block_rsv(root); 2215 block_rsv = btrfs_alloc_block_rsv(root);
2217 BUG_ON(!block_rsv); 2216 if (!block_rsv)
2217 return -ENOMEM;
2218 } 2218 }
2219 2219
2220 spin_lock(&root->orphan_lock); 2220 spin_lock(&root->orphan_lock);
@@ -2516,7 +2516,9 @@ static void btrfs_read_locked_inode(struct inode *inode)
2516 filled = true; 2516 filled = true;
2517 2517
2518 path = btrfs_alloc_path(); 2518 path = btrfs_alloc_path();
2519 BUG_ON(!path); 2519 if (!path)
2520 goto make_bad;
2521
2520 path->leave_spinning = 1; 2522 path->leave_spinning = 1;
2521 memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); 2523 memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
2522 2524
@@ -2531,13 +2533,6 @@ static void btrfs_read_locked_inode(struct inode *inode)
2531 2533
2532 inode_item = btrfs_item_ptr(leaf, path->slots[0], 2534 inode_item = btrfs_item_ptr(leaf, path->slots[0],
2533 struct btrfs_inode_item); 2535 struct btrfs_inode_item);
2534 if (!leaf->map_token)
2535 map_private_extent_buffer(leaf, (unsigned long)inode_item,
2536 sizeof(struct btrfs_inode_item),
2537 &leaf->map_token, &leaf->kaddr,
2538 &leaf->map_start, &leaf->map_len,
2539 KM_USER1);
2540
2541 inode->i_mode = btrfs_inode_mode(leaf, inode_item); 2536 inode->i_mode = btrfs_inode_mode(leaf, inode_item);
2542 inode->i_nlink = btrfs_inode_nlink(leaf, inode_item); 2537 inode->i_nlink = btrfs_inode_nlink(leaf, inode_item);
2543 inode->i_uid = btrfs_inode_uid(leaf, inode_item); 2538 inode->i_uid = btrfs_inode_uid(leaf, inode_item);
@@ -2575,11 +2570,6 @@ cache_acl:
2575 if (!maybe_acls) 2570 if (!maybe_acls)
2576 cache_no_acl(inode); 2571 cache_no_acl(inode);
2577 2572
2578 if (leaf->map_token) {
2579 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
2580 leaf->map_token = NULL;
2581 }
2582
2583 btrfs_free_path(path); 2573 btrfs_free_path(path);
2584 2574
2585 switch (inode->i_mode & S_IFMT) { 2575 switch (inode->i_mode & S_IFMT) {
@@ -2624,13 +2614,6 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
2624 struct btrfs_inode_item *item, 2614 struct btrfs_inode_item *item,
2625 struct inode *inode) 2615 struct inode *inode)
2626{ 2616{
2627 if (!leaf->map_token)
2628 map_private_extent_buffer(leaf, (unsigned long)item,
2629 sizeof(struct btrfs_inode_item),
2630 &leaf->map_token, &leaf->kaddr,
2631 &leaf->map_start, &leaf->map_len,
2632 KM_USER1);
2633
2634 btrfs_set_inode_uid(leaf, item, inode->i_uid); 2617 btrfs_set_inode_uid(leaf, item, inode->i_uid);
2635 btrfs_set_inode_gid(leaf, item, inode->i_gid); 2618 btrfs_set_inode_gid(leaf, item, inode->i_gid);
2636 btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); 2619 btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size);
@@ -2659,11 +2642,6 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
2659 btrfs_set_inode_rdev(leaf, item, inode->i_rdev); 2642 btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
2660 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); 2643 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
2661 btrfs_set_inode_block_group(leaf, item, 0); 2644 btrfs_set_inode_block_group(leaf, item, 0);
2662
2663 if (leaf->map_token) {
2664 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
2665 leaf->map_token = NULL;
2666 }
2667} 2645}
2668 2646
2669/* 2647/*
@@ -2684,7 +2662,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
2684 * The data relocation inode should also be directly updated 2662 * The data relocation inode should also be directly updated
2685 * without delay 2663 * without delay
2686 */ 2664 */
2687 if (!is_free_space_inode(root, inode) 2665 if (!btrfs_is_free_space_inode(root, inode)
2688 && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) { 2666 && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
2689 ret = btrfs_delayed_update_inode(trans, root, inode); 2667 ret = btrfs_delayed_update_inode(trans, root, inode);
2690 if (!ret) 2668 if (!ret)
@@ -3021,13 +2999,16 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
3021 2999
3022 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, 3000 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
3023 dentry->d_name.name, dentry->d_name.len); 3001 dentry->d_name.name, dentry->d_name.len);
3024 BUG_ON(ret); 3002 if (ret)
3003 goto out;
3025 3004
3026 if (inode->i_nlink == 0) { 3005 if (inode->i_nlink == 0) {
3027 ret = btrfs_orphan_add(trans, inode); 3006 ret = btrfs_orphan_add(trans, inode);
3028 BUG_ON(ret); 3007 if (ret)
3008 goto out;
3029 } 3009 }
3030 3010
3011out:
3031 nr = trans->blocks_used; 3012 nr = trans->blocks_used;
3032 __unlink_end_trans(trans, root); 3013 __unlink_end_trans(trans, root);
3033 btrfs_btree_balance_dirty(root, nr); 3014 btrfs_btree_balance_dirty(root, nr);
@@ -3170,6 +3151,11 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
3170 3151
3171 BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); 3152 BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
3172 3153
3154 path = btrfs_alloc_path();
3155 if (!path)
3156 return -ENOMEM;
3157 path->reada = -1;
3158
3173 if (root->ref_cows || root == root->fs_info->tree_root) 3159 if (root->ref_cows || root == root->fs_info->tree_root)
3174 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); 3160 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
3175 3161
@@ -3182,10 +3168,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
3182 if (min_type == 0 && root == BTRFS_I(inode)->root) 3168 if (min_type == 0 && root == BTRFS_I(inode)->root)
3183 btrfs_kill_delayed_inode_items(inode); 3169 btrfs_kill_delayed_inode_items(inode);
3184 3170
3185 path = btrfs_alloc_path();
3186 BUG_ON(!path);
3187 path->reada = -1;
3188
3189 key.objectid = ino; 3171 key.objectid = ino;
3190 key.offset = (u64)-1; 3172 key.offset = (u64)-1;
3191 key.type = (u8)-1; 3173 key.type = (u8)-1;
@@ -3398,7 +3380,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3398 3380
3399 ret = -ENOMEM; 3381 ret = -ENOMEM;
3400again: 3382again:
3401 page = grab_cache_page(mapping, index); 3383 page = find_or_create_page(mapping, index, GFP_NOFS);
3402 if (!page) { 3384 if (!page) {
3403 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 3385 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
3404 goto out; 3386 goto out;
@@ -3528,15 +3510,19 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3528 err = btrfs_drop_extents(trans, inode, cur_offset, 3510 err = btrfs_drop_extents(trans, inode, cur_offset,
3529 cur_offset + hole_size, 3511 cur_offset + hole_size,
3530 &hint_byte, 1); 3512 &hint_byte, 1);
3531 if (err) 3513 if (err) {
3514 btrfs_end_transaction(trans, root);
3532 break; 3515 break;
3516 }
3533 3517
3534 err = btrfs_insert_file_extent(trans, root, 3518 err = btrfs_insert_file_extent(trans, root,
3535 btrfs_ino(inode), cur_offset, 0, 3519 btrfs_ino(inode), cur_offset, 0,
3536 0, hole_size, 0, hole_size, 3520 0, hole_size, 0, hole_size,
3537 0, 0, 0); 3521 0, 0, 0);
3538 if (err) 3522 if (err) {
3523 btrfs_end_transaction(trans, root);
3539 break; 3524 break;
3525 }
3540 3526
3541 btrfs_drop_extent_cache(inode, hole_start, 3527 btrfs_drop_extent_cache(inode, hole_start,
3542 last_byte - 1, 0); 3528 last_byte - 1, 0);
@@ -3634,7 +3620,7 @@ void btrfs_evict_inode(struct inode *inode)
3634 3620
3635 truncate_inode_pages(&inode->i_data, 0); 3621 truncate_inode_pages(&inode->i_data, 0);
3636 if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 || 3622 if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 ||
3637 is_free_space_inode(root, inode))) 3623 btrfs_is_free_space_inode(root, inode)))
3638 goto no_delete; 3624 goto no_delete;
3639 3625
3640 if (is_bad_inode(inode)) { 3626 if (is_bad_inode(inode)) {
@@ -3713,7 +3699,8 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
3713 int ret = 0; 3699 int ret = 0;
3714 3700
3715 path = btrfs_alloc_path(); 3701 path = btrfs_alloc_path();
3716 BUG_ON(!path); 3702 if (!path)
3703 return -ENOMEM;
3717 3704
3718 di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name, 3705 di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name,
3719 namelen, 0); 3706 namelen, 0);
@@ -3978,10 +3965,16 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
3978 BTRFS_I(inode)->root = root; 3965 BTRFS_I(inode)->root = root;
3979 memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); 3966 memcpy(&BTRFS_I(inode)->location, location, sizeof(*location));
3980 btrfs_read_locked_inode(inode); 3967 btrfs_read_locked_inode(inode);
3981 inode_tree_add(inode); 3968 if (!is_bad_inode(inode)) {
3982 unlock_new_inode(inode); 3969 inode_tree_add(inode);
3983 if (new) 3970 unlock_new_inode(inode);
3984 *new = 1; 3971 if (new)
3972 *new = 1;
3973 } else {
3974 unlock_new_inode(inode);
3975 iput(inode);
3976 inode = ERR_PTR(-ESTALE);
3977 }
3985 } 3978 }
3986 3979
3987 return inode; 3980 return inode;
@@ -4016,12 +4009,19 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
4016 struct btrfs_root *sub_root = root; 4009 struct btrfs_root *sub_root = root;
4017 struct btrfs_key location; 4010 struct btrfs_key location;
4018 int index; 4011 int index;
4019 int ret; 4012 int ret = 0;
4020 4013
4021 if (dentry->d_name.len > BTRFS_NAME_LEN) 4014 if (dentry->d_name.len > BTRFS_NAME_LEN)
4022 return ERR_PTR(-ENAMETOOLONG); 4015 return ERR_PTR(-ENAMETOOLONG);
4023 4016
4024 ret = btrfs_inode_by_name(dir, dentry, &location); 4017 if (unlikely(d_need_lookup(dentry))) {
4018 memcpy(&location, dentry->d_fsdata, sizeof(struct btrfs_key));
4019 kfree(dentry->d_fsdata);
4020 dentry->d_fsdata = NULL;
4021 d_clear_need_lookup(dentry);
4022 } else {
4023 ret = btrfs_inode_by_name(dir, dentry, &location);
4024 }
4025 4025
4026 if (ret < 0) 4026 if (ret < 0)
4027 return ERR_PTR(ret); 4027 return ERR_PTR(ret);
@@ -4076,6 +4076,12 @@ static int btrfs_dentry_delete(const struct dentry *dentry)
4076 return 0; 4076 return 0;
4077} 4077}
4078 4078
4079static void btrfs_dentry_release(struct dentry *dentry)
4080{
4081 if (dentry->d_fsdata)
4082 kfree(dentry->d_fsdata);
4083}
4084
4079static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, 4085static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
4080 struct nameidata *nd) 4086 struct nameidata *nd)
4081{ 4087{
@@ -4098,6 +4104,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
4098 struct btrfs_path *path; 4104 struct btrfs_path *path;
4099 struct list_head ins_list; 4105 struct list_head ins_list;
4100 struct list_head del_list; 4106 struct list_head del_list;
4107 struct qstr q;
4101 int ret; 4108 int ret;
4102 struct extent_buffer *leaf; 4109 struct extent_buffer *leaf;
4103 int slot; 4110 int slot;
@@ -4187,6 +4194,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
4187 4194
4188 while (di_cur < di_total) { 4195 while (di_cur < di_total) {
4189 struct btrfs_key location; 4196 struct btrfs_key location;
4197 struct dentry *tmp;
4190 4198
4191 if (verify_dir_item(root, leaf, di)) 4199 if (verify_dir_item(root, leaf, di))
4192 break; 4200 break;
@@ -4207,6 +4215,33 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
4207 d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)]; 4215 d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
4208 btrfs_dir_item_key_to_cpu(leaf, di, &location); 4216 btrfs_dir_item_key_to_cpu(leaf, di, &location);
4209 4217
4218 q.name = name_ptr;
4219 q.len = name_len;
4220 q.hash = full_name_hash(q.name, q.len);
4221 tmp = d_lookup(filp->f_dentry, &q);
4222 if (!tmp) {
4223 struct btrfs_key *newkey;
4224
4225 newkey = kzalloc(sizeof(struct btrfs_key),
4226 GFP_NOFS);
4227 if (!newkey)
4228 goto no_dentry;
4229 tmp = d_alloc(filp->f_dentry, &q);
4230 if (!tmp) {
4231 kfree(newkey);
4232 dput(tmp);
4233 goto no_dentry;
4234 }
4235 memcpy(newkey, &location,
4236 sizeof(struct btrfs_key));
4237 tmp->d_fsdata = newkey;
4238 tmp->d_flags |= DCACHE_NEED_LOOKUP;
4239 d_rehash(tmp);
4240 dput(tmp);
4241 } else {
4242 dput(tmp);
4243 }
4244no_dentry:
4210 /* is this a reference to our own snapshot? If so 4245 /* is this a reference to our own snapshot? If so
4211 * skip it 4246 * skip it
4212 */ 4247 */
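
Taken together with the btrfs_lookup_dentry() change earlier in this file, the readdir hunk above pre-seeds the dcache: each entry emitted gets a hashed dentry if one doesn't already exist, the child's btrfs_key is stashed in d_fsdata, and DCACHE_NEED_LOOKUP is set so a later lookup can consume the cached key instead of re-reading the dir item. A rough user-space model of that handoff, with hypothetical stand-in types:

#include <stdlib.h>
#include <string.h>

/* hypothetical stand-ins for struct btrfs_key and a dentry */
struct demo_key {
        unsigned long long objectid;
        unsigned char type;
        unsigned long long offset;
};

struct demo_dentry {
        void *d_fsdata;         /* stashed demo_key, as in tmp->d_fsdata */
        int need_lookup;        /* models DCACHE_NEED_LOOKUP */
};

/* readdir side: remember where the child inode lives */
static int seed_dentry(struct demo_dentry *d, const struct demo_key *loc)
{
        struct demo_key *k = malloc(sizeof(*k));

        if (!k)
                return -1;      /* fall back to a normal lookup later */
        memcpy(k, loc, sizeof(*k));
        d->d_fsdata = k;
        d->need_lookup = 1;
        return 0;
}

/* lookup side: consume the cached key instead of a dir-item search */
static int consume_dentry(struct demo_dentry *d, struct demo_key *loc)
{
        if (!d->need_lookup)
                return 0;       /* caller does btrfs_inode_by_name() */
        memcpy(loc, d->d_fsdata, sizeof(*loc));
        free(d->d_fsdata);
        d->d_fsdata = NULL;
        d->need_lookup = 0;
        return 1;
}
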
@@ -4271,7 +4306,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
4271 if (BTRFS_I(inode)->dummy_inode) 4306 if (BTRFS_I(inode)->dummy_inode)
4272 return 0; 4307 return 0;
4273 4308
4274 if (btrfs_fs_closing(root->fs_info) && is_free_space_inode(root, inode)) 4309 if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(root, inode))
4275 nolock = true; 4310 nolock = true;
4276 4311
4277 if (wbc->sync_mode == WB_SYNC_ALL) { 4312 if (wbc->sync_mode == WB_SYNC_ALL) {
@@ -4432,7 +4467,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4432 int owner; 4467 int owner;
4433 4468
4434 path = btrfs_alloc_path(); 4469 path = btrfs_alloc_path();
4435 BUG_ON(!path); 4470 if (!path)
4471 return ERR_PTR(-ENOMEM);
4436 4472
4437 inode = new_inode(root->fs_info->sb); 4473 inode = new_inode(root->fs_info->sb);
4438 if (!inode) { 4474 if (!inode) {
@@ -4467,7 +4503,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4467 inode->i_generation = BTRFS_I(inode)->generation; 4503 inode->i_generation = BTRFS_I(inode)->generation;
4468 btrfs_set_inode_space_info(root, inode); 4504 btrfs_set_inode_space_info(root, inode);
4469 4505
4470 if (mode & S_IFDIR) 4506 if (S_ISDIR(mode))
4471 owner = 0; 4507 owner = 0;
4472 else 4508 else
4473 owner = 1; 4509 owner = 1;
@@ -4512,7 +4548,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4512 4548
4513 btrfs_inherit_iflags(inode, dir); 4549 btrfs_inherit_iflags(inode, dir);
4514 4550
4515 if ((mode & S_IFREG)) { 4551 if (S_ISREG(mode)) {
4516 if (btrfs_test_opt(root, NODATASUM)) 4552 if (btrfs_test_opt(root, NODATASUM))
4517 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; 4553 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
4518 if (btrfs_test_opt(root, NODATACOW) || 4554 if (btrfs_test_opt(root, NODATACOW) ||
@@ -5787,7 +5823,7 @@ again:
5787 5823
5788 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list); 5824 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
5789 ret = btrfs_ordered_update_i_size(inode, 0, ordered); 5825 ret = btrfs_ordered_update_i_size(inode, 0, ordered);
5790 if (!ret) 5826 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags))
5791 btrfs_update_inode(trans, root, inode); 5827 btrfs_update_inode(trans, root, inode);
5792 ret = 0; 5828 ret = 0;
5793out_unlock: 5829out_unlock:
@@ -6692,19 +6728,6 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
6692 return 0; 6728 return 0;
6693} 6729}
6694 6730
6695/* helper function for file defrag and space balancing. This
6696 * forces readahead on a given range of bytes in an inode
6697 */
6698unsigned long btrfs_force_ra(struct address_space *mapping,
6699 struct file_ra_state *ra, struct file *file,
6700 pgoff_t offset, pgoff_t last_index)
6701{
6702 pgoff_t req_size = last_index - offset + 1;
6703
6704 page_cache_sync_readahead(mapping, ra, file, offset, req_size);
6705 return offset + req_size;
6706}
6707
6708struct inode *btrfs_alloc_inode(struct super_block *sb) 6731struct inode *btrfs_alloc_inode(struct super_block *sb)
6709{ 6732{
6710 struct btrfs_inode *ei; 6733 struct btrfs_inode *ei;
@@ -6728,8 +6751,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6728 ei->index_cnt = (u64)-1; 6751 ei->index_cnt = (u64)-1;
6729 ei->last_unlink_trans = 0; 6752 ei->last_unlink_trans = 0;
6730 6753
6731 atomic_set(&ei->outstanding_extents, 0); 6754 spin_lock_init(&ei->lock);
6732 atomic_set(&ei->reserved_extents, 0); 6755 ei->outstanding_extents = 0;
6756 ei->reserved_extents = 0;
6733 6757
6734 ei->ordered_data_close = 0; 6758 ei->ordered_data_close = 0;
6735 ei->orphan_meta_reserved = 0; 6759 ei->orphan_meta_reserved = 0;
@@ -6767,8 +6791,8 @@ void btrfs_destroy_inode(struct inode *inode)
6767 6791
6768 WARN_ON(!list_empty(&inode->i_dentry)); 6792 WARN_ON(!list_empty(&inode->i_dentry));
6769 WARN_ON(inode->i_data.nrpages); 6793 WARN_ON(inode->i_data.nrpages);
6770 WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents)); 6794 WARN_ON(BTRFS_I(inode)->outstanding_extents);
6771 WARN_ON(atomic_read(&BTRFS_I(inode)->reserved_extents)); 6795 WARN_ON(BTRFS_I(inode)->reserved_extents);
6772 6796
6773 /* 6797 /*
6774 * This can happen where we create an inode, but somebody else also 6798 * This can happen where we create an inode, but somebody else also
@@ -6823,7 +6847,7 @@ int btrfs_drop_inode(struct inode *inode)
6823 struct btrfs_root *root = BTRFS_I(inode)->root; 6847 struct btrfs_root *root = BTRFS_I(inode)->root;
6824 6848
6825 if (btrfs_root_refs(&root->root_item) == 0 && 6849 if (btrfs_root_refs(&root->root_item) == 0 &&
6826 !is_free_space_inode(root, inode)) 6850 !btrfs_is_free_space_inode(root, inode))
6827 return 1; 6851 return 1;
6828 else 6852 else
6829 return generic_drop_inode(inode); 6853 return generic_drop_inode(inode);
@@ -7186,7 +7210,11 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
7186 goto out_unlock; 7210 goto out_unlock;
7187 7211
7188 path = btrfs_alloc_path(); 7212 path = btrfs_alloc_path();
7189 BUG_ON(!path); 7213 if (!path) {
7214 err = -ENOMEM;
7215 drop_inode = 1;
7216 goto out_unlock;
7217 }
7190 key.objectid = btrfs_ino(inode); 7218 key.objectid = btrfs_ino(inode);
7191 key.offset = 0; 7219 key.offset = 0;
7192 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); 7220 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
@@ -7326,11 +7354,15 @@ static int btrfs_set_page_dirty(struct page *page)
7326static int btrfs_permission(struct inode *inode, int mask) 7354static int btrfs_permission(struct inode *inode, int mask)
7327{ 7355{
7328 struct btrfs_root *root = BTRFS_I(inode)->root; 7356 struct btrfs_root *root = BTRFS_I(inode)->root;
7357 umode_t mode = inode->i_mode;
7329 7358
7330 if (btrfs_root_readonly(root) && (mask & MAY_WRITE)) 7359 if (mask & MAY_WRITE &&
7331 return -EROFS; 7360 (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
7332 if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE)) 7361 if (btrfs_root_readonly(root))
7333 return -EACCES; 7362 return -EROFS;
7363 if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
7364 return -EACCES;
7365 }
7334 return generic_permission(inode, mask); 7366 return generic_permission(inode, mask);
7335} 7367}
7336 7368
@@ -7452,4 +7484,5 @@ static const struct inode_operations btrfs_symlink_inode_operations = {
7452 7484
7453const struct dentry_operations btrfs_dentry_operations = { 7485const struct dentry_operations btrfs_dentry_operations = {
7454 .d_delete = btrfs_dentry_delete, 7486 .d_delete = btrfs_dentry_delete,
7487 .d_release = btrfs_dentry_release,
7455}; 7488};
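
A recurring change in this file (and in the ioctl.c defrag hunk below): outstanding_extents and reserved_extents stop being atomic_t and become plain counters guarded by the new per-inode spinlock initialized in btrfs_alloc_inode(). The likely motivation, not spelled out in the diff itself, is that several reservation fields can then be read and updated together in one critical section instead of via independent atomics. A small user-space model of the pattern, with a pthread spinlock standing in for BTRFS_I(inode)->lock:

#include <pthread.h>
#include <stdint.h>

/* models struct btrfs_inode after this change */
struct model_inode {
        pthread_spinlock_t lock;        /* stands in for BTRFS_I(inode)->lock */
        uint64_t outstanding_extents;
        uint64_t reserved_extents;
};

static void model_inode_init(struct model_inode *mi)
{
        pthread_spin_init(&mi->lock, PTHREAD_PROCESS_PRIVATE);
        mi->outstanding_extents = 0;
        mi->reserved_extents = 0;
}

/* what the split/merge extent hooks now do */
static void on_split(struct model_inode *mi)
{
        pthread_spin_lock(&mi->lock);
        mi->outstanding_extents++;
        pthread_spin_unlock(&mi->lock);
}

static void on_merge(struct model_inode *mi)
{
        pthread_spin_lock(&mi->lock);
        mi->outstanding_extents--;
        pthread_spin_unlock(&mi->lock);
}
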
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 622543309eb2..3351b1b24574 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -859,8 +859,8 @@ again:
859 /* step one, lock all the pages */ 859 /* step one, lock all the pages */
860 for (i = 0; i < num_pages; i++) { 860 for (i = 0; i < num_pages; i++) {
861 struct page *page; 861 struct page *page;
862 page = grab_cache_page(inode->i_mapping, 862 page = find_or_create_page(inode->i_mapping,
863 start_index + i); 863 start_index + i, GFP_NOFS);
864 if (!page) 864 if (!page)
865 break; 865 break;
866 866
@@ -930,7 +930,9 @@ again:
930 GFP_NOFS); 930 GFP_NOFS);
931 931
932 if (i_done != num_pages) { 932 if (i_done != num_pages) {
933 atomic_inc(&BTRFS_I(inode)->outstanding_extents); 933 spin_lock(&BTRFS_I(inode)->lock);
934 BTRFS_I(inode)->outstanding_extents++;
935 spin_unlock(&BTRFS_I(inode)->lock);
934 btrfs_delalloc_release_space(inode, 936 btrfs_delalloc_release_space(inode,
935 (num_pages - i_done) << PAGE_CACHE_SHIFT); 937 (num_pages - i_done) << PAGE_CACHE_SHIFT);
936 } 938 }
@@ -1747,11 +1749,10 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
1747 key.objectid = key.offset; 1749 key.objectid = key.offset;
1748 key.offset = (u64)-1; 1750 key.offset = (u64)-1;
1749 dirid = key.objectid; 1751 dirid = key.objectid;
1750
1751 } 1752 }
1752 if (ptr < name) 1753 if (ptr < name)
1753 goto out; 1754 goto out;
1754 memcpy(name, ptr, total_len); 1755 memmove(name, ptr, total_len);
1755 name[total_len]='\0'; 1756 name[total_len]='\0';
1756 ret = 0; 1757 ret = 0;
1757out: 1758out:
@@ -2219,6 +2220,12 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2219 !IS_ALIGNED(destoff, bs)) 2220 !IS_ALIGNED(destoff, bs))
2220 goto out_unlock; 2221 goto out_unlock;
2221 2222
2223 if (destoff > inode->i_size) {
2224 ret = btrfs_cont_expand(inode, inode->i_size, destoff);
2225 if (ret)
2226 goto out_unlock;
2227 }
2228
2222 /* do any pending delalloc/csum calc on src, one way or 2229 /* do any pending delalloc/csum calc on src, one way or
2223 another, and lock file content */ 2230 another, and lock file content */
2224 while (1) { 2231 while (1) {
@@ -2235,6 +2242,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2235 btrfs_wait_ordered_range(src, off, len); 2242 btrfs_wait_ordered_range(src, off, len);
2236 } 2243 }
2237 2244
2245 /* truncate page cache pages from target inode range */
2246 truncate_inode_pages_range(&inode->i_data, off,
2247 ALIGN(off + len, PAGE_CACHE_SIZE) - 1);
2248
2238 /* clone data */ 2249 /* clone data */
2239 key.objectid = btrfs_ino(src); 2250 key.objectid = btrfs_ino(src);
2240 key.type = BTRFS_EXTENT_DATA_KEY; 2251 key.type = BTRFS_EXTENT_DATA_KEY;
@@ -2320,14 +2331,21 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2320 2331
2321 if (type == BTRFS_FILE_EXTENT_REG || 2332 if (type == BTRFS_FILE_EXTENT_REG ||
2322 type == BTRFS_FILE_EXTENT_PREALLOC) { 2333 type == BTRFS_FILE_EXTENT_PREALLOC) {
2334 /*
2335 * a | --- range to clone ---| b
2336 * | ------------- extent ------------- |
2337 */
2338
2339 /* subtract range b */
2340 if (key.offset + datal > off + len)
2341 datal = off + len - key.offset;
2342
2343 /* subtract range a */
2323 if (off > key.offset) { 2344 if (off > key.offset) {
2324 datao += off - key.offset; 2345 datao += off - key.offset;
2325 datal -= off - key.offset; 2346 datal -= off - key.offset;
2326 } 2347 }
2327 2348
2328 if (key.offset + datal > off + len)
2329 datal = off + len - key.offset;
2330
2331 ret = btrfs_drop_extents(trans, inode, 2349 ret = btrfs_drop_extents(trans, inode,
2332 new_key.offset, 2350 new_key.offset,
2333 new_key.offset + datal, 2351 new_key.offset + datal,
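
In the clone hunk just above, the tail clamp ("range b") moves ahead of the head clamp ("range a"), and the order matters: both clamps are written relative to the extent's original key.offset, so once the old code had shrunk datal for the head, the key.offset + datal tail test no longer described the extent. A self-contained sketch of the corrected arithmetic:

#include <stdint.h>

/* clamp an extent [key_offset, key_offset + *datal) to the clone
 * window [off, off + len), tail first as in the hunk */
static void clamp_clone_range(uint64_t off, uint64_t len,
                              uint64_t key_offset,
                              uint64_t *datao, uint64_t *datal)
{
        /* subtract range b: extent runs past the window */
        if (key_offset + *datal > off + len)
                *datal = off + len - key_offset;

        /* subtract range a: window starts inside the extent */
        if (off > key_offset) {
                *datao += off - key_offset;
                *datal -= off - key_offset;
        }
}

For an extent [0, 30) cloned into the window [10, 20), the old order left datal at 20 (the tail test saw 0 + 20 <= 20 and did nothing); clamping the tail first yields datal = 10 as intended.
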
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 66fa43dc3f0f..d77b67c4b275 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -24,185 +24,197 @@
24#include "extent_io.h" 24#include "extent_io.h"
25#include "locking.h" 25#include "locking.h"
26 26
27static inline void spin_nested(struct extent_buffer *eb) 27void btrfs_assert_tree_read_locked(struct extent_buffer *eb);
28{
29 spin_lock(&eb->lock);
30}
31 28
32/* 29/*
33 * Setting a lock to blocking will drop the spinlock and set the 30 * if we currently have a spinning reader or writer lock
34 * flag that forces other procs who want the lock to wait. After 31 * (indicated by the rw flag) this will bump the count
35 * this you can safely schedule with the lock held. 32 * of blocking holders and drop the spinlock.
36 */ 33 */
37void btrfs_set_lock_blocking(struct extent_buffer *eb) 34void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
38{ 35{
39 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) { 36 if (rw == BTRFS_WRITE_LOCK) {
40 set_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags); 37 if (atomic_read(&eb->blocking_writers) == 0) {
41 spin_unlock(&eb->lock); 38 WARN_ON(atomic_read(&eb->spinning_writers) != 1);
39 atomic_dec(&eb->spinning_writers);
40 btrfs_assert_tree_locked(eb);
41 atomic_inc(&eb->blocking_writers);
42 write_unlock(&eb->lock);
43 }
44 } else if (rw == BTRFS_READ_LOCK) {
45 btrfs_assert_tree_read_locked(eb);
46 atomic_inc(&eb->blocking_readers);
47 WARN_ON(atomic_read(&eb->spinning_readers) == 0);
48 atomic_dec(&eb->spinning_readers);
49 read_unlock(&eb->lock);
42 } 50 }
43 /* exit with the spin lock released and the bit set */ 51 return;
44} 52}
45 53
46/* 54/*
47 * clearing the blocking flag will take the spinlock again. 55 * if we currently have a blocking lock, take the spinlock
48 * After this you can't safely schedule 56 * and drop our blocking count
49 */ 57 */
50void btrfs_clear_lock_blocking(struct extent_buffer *eb) 58void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
51{ 59{
52 if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) { 60 if (rw == BTRFS_WRITE_LOCK_BLOCKING) {
53 spin_nested(eb); 61 BUG_ON(atomic_read(&eb->blocking_writers) != 1);
54 clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags); 62 write_lock(&eb->lock);
55 smp_mb__after_clear_bit(); 63 WARN_ON(atomic_read(&eb->spinning_writers));
64 atomic_inc(&eb->spinning_writers);
65 if (atomic_dec_and_test(&eb->blocking_writers))
66 wake_up(&eb->write_lock_wq);
67 } else if (rw == BTRFS_READ_LOCK_BLOCKING) {
68 BUG_ON(atomic_read(&eb->blocking_readers) == 0);
69 read_lock(&eb->lock);
70 atomic_inc(&eb->spinning_readers);
71 if (atomic_dec_and_test(&eb->blocking_readers))
72 wake_up(&eb->read_lock_wq);
56 } 73 }
57 /* exit with the spin lock held */ 74 return;
58} 75}
59 76
60/* 77/*
61 * unfortunately, many of the places that currently set a lock to blocking 78 * take a spinning read lock. This will wait for any blocking
62 * don't end up blocking for very long, and often they don't block 79 * writers
63 * at all. For a dbench 50 run, if we don't spin on the blocking bit
64 * at all, the context switch rate can jump up to 400,000/sec or more.
65 *
66 * So, we're still stuck with this crummy spin on the blocking bit,
67 * at least until the most common causes of the short blocks
68 * can be dealt with.
69 */ 80 */
70static int btrfs_spin_on_block(struct extent_buffer *eb) 81void btrfs_tree_read_lock(struct extent_buffer *eb)
71{ 82{
72 int i; 83again:
73 84 wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
74 for (i = 0; i < 512; i++) { 85 read_lock(&eb->lock);
75 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) 86 if (atomic_read(&eb->blocking_writers)) {
76 return 1; 87 read_unlock(&eb->lock);
77 if (need_resched()) 88 wait_event(eb->write_lock_wq,
78 break; 89 atomic_read(&eb->blocking_writers) == 0);
79 cpu_relax(); 90 goto again;
80 } 91 }
81 return 0; 92 atomic_inc(&eb->read_locks);
93 atomic_inc(&eb->spinning_readers);
82} 94}
83 95
84/* 96/*
85 * This is somewhat different from trylock. It will take the 97 * returns 1 if we get the read lock and 0 if we don't
86 * spinlock but if it finds the lock is set to blocking, it will 98 * this won't wait for blocking writers
87 * return without the lock held.
88 *
89 * returns 1 if it was able to take the lock and zero otherwise
90 *
91 * After this call, scheduling is not safe without first calling
92 * btrfs_set_lock_blocking()
93 */ 99 */
94int btrfs_try_spin_lock(struct extent_buffer *eb) 100int btrfs_try_tree_read_lock(struct extent_buffer *eb)
95{ 101{
96 int i; 102 if (atomic_read(&eb->blocking_writers))
103 return 0;
97 104
98 if (btrfs_spin_on_block(eb)) { 105 read_lock(&eb->lock);
99 spin_nested(eb); 106 if (atomic_read(&eb->blocking_writers)) {
100 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) 107 read_unlock(&eb->lock);
101 return 1; 108 return 0;
102 spin_unlock(&eb->lock);
103 } 109 }
104 /* spin for a bit on the BLOCKING flag */ 110 atomic_inc(&eb->read_locks);
105 for (i = 0; i < 2; i++) { 111 atomic_inc(&eb->spinning_readers);
106 cpu_relax(); 112 return 1;
107 if (!btrfs_spin_on_block(eb))
108 break;
109
110 spin_nested(eb);
111 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
112 return 1;
113 spin_unlock(&eb->lock);
114 }
115 return 0;
116} 113}
117 114
118/* 115/*
119 * the autoremove wake function will return 0 if it tried to wake up 116 * returns 1 if we get the read lock and 0 if we don't
120 * a process that was already awake, which means that process won't 117 * this won't wait for blocking writers or readers
121 * count as an exclusive wakeup. The waitq code will continue waking
122 * procs until it finds one that was actually sleeping.
123 *
124 * For btrfs, this isn't quite what we want. We want a single proc
125 * to be notified that the lock is ready for taking. If that proc
126 * already happens to be awake, great, it will loop around and try for
127 * the lock.
128 *
129 * So, btrfs_wake_function always returns 1, even when the proc that we
130 * tried to wake up was already awake.
131 */ 118 */
132static int btrfs_wake_function(wait_queue_t *wait, unsigned mode, 119int btrfs_try_tree_write_lock(struct extent_buffer *eb)
133 int sync, void *key)
134{ 120{
135 autoremove_wake_function(wait, mode, sync, key); 121 if (atomic_read(&eb->blocking_writers) ||
122 atomic_read(&eb->blocking_readers))
123 return 0;
124 write_lock(&eb->lock);
125 if (atomic_read(&eb->blocking_writers) ||
126 atomic_read(&eb->blocking_readers)) {
127 write_unlock(&eb->lock);
128 return 0;
129 }
130 atomic_inc(&eb->write_locks);
131 atomic_inc(&eb->spinning_writers);
136 return 1; 132 return 1;
137} 133}
138 134
139/* 135/*
140 * returns with the extent buffer spinlocked. 136 * drop a spinning read lock
141 * 137 */
142 * This will spin and/or wait as required to take the lock, and then 138void btrfs_tree_read_unlock(struct extent_buffer *eb)
143 * return with the spinlock held. 139{
144 * 140 btrfs_assert_tree_read_locked(eb);
145 * After this call, scheduling is not safe without first calling 141 WARN_ON(atomic_read(&eb->spinning_readers) == 0);
146 * btrfs_set_lock_blocking() 142 atomic_dec(&eb->spinning_readers);
143 atomic_dec(&eb->read_locks);
144 read_unlock(&eb->lock);
145}
146
147/*
148 * drop a blocking read lock
149 */
150void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
151{
152 btrfs_assert_tree_read_locked(eb);
153 WARN_ON(atomic_read(&eb->blocking_readers) == 0);
154 if (atomic_dec_and_test(&eb->blocking_readers))
155 wake_up(&eb->read_lock_wq);
156 atomic_dec(&eb->read_locks);
157}
158
159/*
160 * take a spinning write lock. This will wait for both
161 * blocking readers or writers
147 */ 162 */
148int btrfs_tree_lock(struct extent_buffer *eb) 163int btrfs_tree_lock(struct extent_buffer *eb)
149{ 164{
150 DEFINE_WAIT(wait); 165again:
151 wait.func = btrfs_wake_function; 166 wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0);
152 167 wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
153 if (!btrfs_spin_on_block(eb)) 168 write_lock(&eb->lock);
154 goto sleep; 169 if (atomic_read(&eb->blocking_readers)) {
155 170 write_unlock(&eb->lock);
156 while(1) { 171 wait_event(eb->read_lock_wq,
157 spin_nested(eb); 172 atomic_read(&eb->blocking_readers) == 0);
158 173 goto again;
159 /* nobody is blocking, exit with the spinlock held */
160 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
161 return 0;
162
163 /*
164 * we have the spinlock, but the real owner is blocking.
165 * wait for them
166 */
167 spin_unlock(&eb->lock);
168
169 /*
170 * spin for a bit, and if the blocking flag goes away,
171 * loop around
172 */
173 cpu_relax();
174 if (btrfs_spin_on_block(eb))
175 continue;
176sleep:
177 prepare_to_wait_exclusive(&eb->lock_wq, &wait,
178 TASK_UNINTERRUPTIBLE);
179
180 if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
181 schedule();
182
183 finish_wait(&eb->lock_wq, &wait);
184 } 174 }
175 if (atomic_read(&eb->blocking_writers)) {
176 write_unlock(&eb->lock);
177 wait_event(eb->write_lock_wq,
178 atomic_read(&eb->blocking_writers) == 0);
179 goto again;
180 }
181 WARN_ON(atomic_read(&eb->spinning_writers));
182 atomic_inc(&eb->spinning_writers);
183 atomic_inc(&eb->write_locks);
185 return 0; 184 return 0;
186} 185}
187 186
187/*
188 * drop a spinning or a blocking write lock.
189 */
188int btrfs_tree_unlock(struct extent_buffer *eb) 190int btrfs_tree_unlock(struct extent_buffer *eb)
189{ 191{
190 /* 192 int blockers = atomic_read(&eb->blocking_writers);
191 * if we were a blocking owner, we don't have the spinlock held 193
192 * just clear the bit and look for waiters 194 BUG_ON(blockers > 1);
193 */ 195
194 if (test_and_clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) 196 btrfs_assert_tree_locked(eb);
195 smp_mb__after_clear_bit(); 197 atomic_dec(&eb->write_locks);
196 else 198
197 spin_unlock(&eb->lock); 199 if (blockers) {
198 200 WARN_ON(atomic_read(&eb->spinning_writers));
199 if (waitqueue_active(&eb->lock_wq)) 201 atomic_dec(&eb->blocking_writers);
200 wake_up(&eb->lock_wq); 202 smp_wmb();
203 wake_up(&eb->write_lock_wq);
204 } else {
205 WARN_ON(atomic_read(&eb->spinning_writers) != 1);
206 atomic_dec(&eb->spinning_writers);
207 write_unlock(&eb->lock);
208 }
201 return 0; 209 return 0;
202} 210}
203 211
204void btrfs_assert_tree_locked(struct extent_buffer *eb) 212void btrfs_assert_tree_locked(struct extent_buffer *eb)
205{ 213{
206 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) 214 BUG_ON(!atomic_read(&eb->write_locks));
207 assert_spin_locked(&eb->lock); 215}
216
217void btrfs_assert_tree_read_locked(struct extent_buffer *eb)
218{
219 BUG_ON(!atomic_read(&eb->read_locks));
208} 220}
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index 5c33a560a2f1..17247ddb81a0 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -19,11 +19,43 @@
19#ifndef __BTRFS_LOCKING_ 19#ifndef __BTRFS_LOCKING_
20#define __BTRFS_LOCKING_ 20#define __BTRFS_LOCKING_
21 21
22#define BTRFS_WRITE_LOCK 1
23#define BTRFS_READ_LOCK 2
24#define BTRFS_WRITE_LOCK_BLOCKING 3
25#define BTRFS_READ_LOCK_BLOCKING 4
26
22int btrfs_tree_lock(struct extent_buffer *eb); 27int btrfs_tree_lock(struct extent_buffer *eb);
23int btrfs_tree_unlock(struct extent_buffer *eb); 28int btrfs_tree_unlock(struct extent_buffer *eb);
24int btrfs_try_spin_lock(struct extent_buffer *eb); 29int btrfs_try_spin_lock(struct extent_buffer *eb);
25 30
26void btrfs_set_lock_blocking(struct extent_buffer *eb); 31void btrfs_tree_read_lock(struct extent_buffer *eb);
27void btrfs_clear_lock_blocking(struct extent_buffer *eb); 32void btrfs_tree_read_unlock(struct extent_buffer *eb);
33void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb);
34void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw);
35void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw);
28void btrfs_assert_tree_locked(struct extent_buffer *eb); 36void btrfs_assert_tree_locked(struct extent_buffer *eb);
37int btrfs_try_tree_read_lock(struct extent_buffer *eb);
38int btrfs_try_tree_write_lock(struct extent_buffer *eb);
39
40static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw)
41{
42 if (rw == BTRFS_WRITE_LOCK || rw == BTRFS_WRITE_LOCK_BLOCKING)
43 btrfs_tree_unlock(eb);
44 else if (rw == BTRFS_READ_LOCK_BLOCKING)
45 btrfs_tree_read_unlock_blocking(eb);
46 else if (rw == BTRFS_READ_LOCK)
47 btrfs_tree_read_unlock(eb);
48 else
49 BUG();
50}
51
52static inline void btrfs_set_lock_blocking(struct extent_buffer *eb)
53{
54 btrfs_set_lock_blocking_rw(eb, BTRFS_WRITE_LOCK);
55}
56
57static inline void btrfs_clear_lock_blocking(struct extent_buffer *eb)
58{
59 btrfs_clear_lock_blocking_rw(eb, BTRFS_WRITE_LOCK_BLOCKING);
60}
29#endif 61#endif
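
btrfs_tree_unlock_rw() exists because a caller may end up holding any of the four lock flavours and only knows which one at runtime; it records the flavour when locking and hands the same value back when unlocking. A hypothetical illustration:

/* hypothetical: prefer a write lock, fall back to read, and let the
 * recorded flavour drive the unlock */
static void touch_eb(struct extent_buffer *eb)
{
        int rw;

        if (btrfs_try_tree_write_lock(eb)) {
                rw = BTRFS_WRITE_LOCK;
        } else {
                btrfs_tree_read_lock(eb);
                rw = BTRFS_READ_LOCK;
        }

        /* ... examine or modify eb according to rw ... */

        btrfs_tree_unlock_rw(eb, rw);
}
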
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
deleted file mode 100644
index 82d569cb6267..000000000000
--- a/fs/btrfs/ref-cache.c
+++ /dev/null
@@ -1,68 +0,0 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include <linux/slab.h>
21#include <linux/sort.h>
22#include "ctree.h"
23#include "ref-cache.h"
24#include "transaction.h"
25
26static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
27 struct rb_node *node)
28{
29 struct rb_node **p = &root->rb_node;
30 struct rb_node *parent = NULL;
31 struct btrfs_leaf_ref *entry;
32
33 while (*p) {
34 parent = *p;
35 entry = rb_entry(parent, struct btrfs_leaf_ref, rb_node);
36
37 if (bytenr < entry->bytenr)
38 p = &(*p)->rb_left;
39 else if (bytenr > entry->bytenr)
40 p = &(*p)->rb_right;
41 else
42 return parent;
43 }
44
45 entry = rb_entry(node, struct btrfs_leaf_ref, rb_node);
46 rb_link_node(node, parent, p);
47 rb_insert_color(node, root);
48 return NULL;
49}
50
51static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
52{
53 struct rb_node *n = root->rb_node;
54 struct btrfs_leaf_ref *entry;
55
56 while (n) {
57 entry = rb_entry(n, struct btrfs_leaf_ref, rb_node);
58 WARN_ON(!entry->in_tree);
59
60 if (bytenr < entry->bytenr)
61 n = n->rb_left;
62 else if (bytenr > entry->bytenr)
63 n = n->rb_right;
64 else
65 return n;
66 }
67 return NULL;
68}
diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h
deleted file mode 100644
index 24f7001f6387..000000000000
--- a/fs/btrfs/ref-cache.h
+++ /dev/null
@@ -1,52 +0,0 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18#ifndef __REFCACHE__
19#define __REFCACHE__
20
21struct btrfs_extent_info {
22 /* bytenr and num_bytes find the extent in the extent allocation tree */
23 u64 bytenr;
24 u64 num_bytes;
25
26 /* objectid and offset find the back reference for the file */
27 u64 objectid;
28 u64 offset;
29};
30
31struct btrfs_leaf_ref {
32 struct rb_node rb_node;
33 struct btrfs_leaf_ref_tree *tree;
34 int in_tree;
35 atomic_t usage;
36
37 u64 root_gen;
38 u64 bytenr;
39 u64 owner;
40 u64 generation;
41 int nritems;
42
43 struct list_head list;
44 struct btrfs_extent_info extents[];
45};
46
47static inline size_t btrfs_leaf_ref_size(int nr_extents)
48{
49 return sizeof(struct btrfs_leaf_ref) +
50 sizeof(struct btrfs_extent_info) * nr_extents;
51}
52#endif
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 5e0a3dc79a45..59bb1764273d 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2955,7 +2955,8 @@ static int relocate_file_extent_cluster(struct inode *inode,
2955 page_cache_sync_readahead(inode->i_mapping, 2955 page_cache_sync_readahead(inode->i_mapping,
2956 ra, NULL, index, 2956 ra, NULL, index,
2957 last_index + 1 - index); 2957 last_index + 1 - index);
2958 page = grab_cache_page(inode->i_mapping, index); 2958 page = find_or_create_page(inode->i_mapping, index,
2959 GFP_NOFS);
2959 if (!page) { 2960 if (!page) {
2960 btrfs_delalloc_release_metadata(inode, 2961 btrfs_delalloc_release_metadata(inode,
2961 PAGE_CACHE_SIZE); 2962 PAGE_CACHE_SIZE);
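
Same substitution as in inode.c and ioctl.c above: grab_cache_page() is just find_or_create_page(mapping, index, mapping_gfp_mask(mapping)), and a mapping's gfp mask can include __GFP_FS, letting reclaim re-enter the filesystem from a path that already holds reservations. Spelling the call out pins the mask to GFP_NOFS. The shape of the change, as a hypothetical wrapper (kernel context only):

static struct page *reloc_grab_page(struct inode *inode, pgoff_t index)
{
        /* was: grab_cache_page(inode->i_mapping, index), whose gfp mask
         * comes from the mapping and may allow fs re-entry */
        return find_or_create_page(inode->i_mapping, index, GFP_NOFS);
}
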
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index ebe45443de06..f4099904565a 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -71,13 +71,12 @@ out:
71 return ret; 71 return ret;
72} 72}
73 73
74int btrfs_set_root_node(struct btrfs_root_item *item, 74void btrfs_set_root_node(struct btrfs_root_item *item,
75 struct extent_buffer *node) 75 struct extent_buffer *node)
76{ 76{
77 btrfs_set_root_bytenr(item, node->start); 77 btrfs_set_root_bytenr(item, node->start);
78 btrfs_set_root_level(item, btrfs_header_level(node)); 78 btrfs_set_root_level(item, btrfs_header_level(node));
79 btrfs_set_root_generation(item, btrfs_header_generation(node)); 79 btrfs_set_root_generation(item, btrfs_header_generation(node));
80 return 0;
81} 80}
82 81
83/* 82/*
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
index c0f7ecaf1e79..bc1f6ad18442 100644
--- a/fs/btrfs/struct-funcs.c
+++ b/fs/btrfs/struct-funcs.c
@@ -50,36 +50,22 @@ u##bits btrfs_##name(struct extent_buffer *eb, \
50 unsigned long part_offset = (unsigned long)s; \ 50 unsigned long part_offset = (unsigned long)s; \
51 unsigned long offset = part_offset + offsetof(type, member); \ 51 unsigned long offset = part_offset + offsetof(type, member); \
52 type *p; \ 52 type *p; \
53 /* ugly, but we want the fast path here */ \ 53 int err; \
54 if (eb->map_token && offset >= eb->map_start && \ 54 char *kaddr; \
55 offset + sizeof(((type *)0)->member) <= eb->map_start + \ 55 unsigned long map_start; \
56 eb->map_len) { \ 56 unsigned long map_len; \
57 p = (type *)(eb->kaddr + part_offset - eb->map_start); \ 57 u##bits res; \
58 return le##bits##_to_cpu(p->member); \ 58 err = map_private_extent_buffer(eb, offset, \
59 } \ 59 sizeof(((type *)0)->member), \
60 { \ 60 &kaddr, &map_start, &map_len); \
61 int err; \ 61 if (err) { \
62 char *map_token; \ 62 __le##bits leres; \
63 char *kaddr; \ 63 read_eb_member(eb, s, type, member, &leres); \
64 int unmap_on_exit = (eb->map_token == NULL); \ 64 return le##bits##_to_cpu(leres); \
65 unsigned long map_start; \ 65 } \
66 unsigned long map_len; \ 66 p = (type *)(kaddr + part_offset - map_start); \
67 u##bits res; \ 67 res = le##bits##_to_cpu(p->member); \
68 err = map_extent_buffer(eb, offset, \ 68 return res; \
69 sizeof(((type *)0)->member), \
70 &map_token, &kaddr, \
71 &map_start, &map_len, KM_USER1); \
72 if (err) { \
73 __le##bits leres; \
74 read_eb_member(eb, s, type, member, &leres); \
75 return le##bits##_to_cpu(leres); \
76 } \
77 p = (type *)(kaddr + part_offset - map_start); \
78 res = le##bits##_to_cpu(p->member); \
79 if (unmap_on_exit) \
80 unmap_extent_buffer(eb, map_token, KM_USER1); \
81 return res; \
82 } \
83} \ 69} \
84void btrfs_set_##name(struct extent_buffer *eb, \ 70void btrfs_set_##name(struct extent_buffer *eb, \
85 type *s, u##bits val) \ 71 type *s, u##bits val) \
@@ -87,36 +73,21 @@ void btrfs_set_##name(struct extent_buffer *eb, \
 	unsigned long part_offset = (unsigned long)s;			\
 	unsigned long offset = part_offset + offsetof(type, member);	\
 	type *p;							\
-	/* ugly, but we want the fast path here */			\
-	if (eb->map_token && offset >= eb->map_start &&			\
-	    offset + sizeof(((type *)0)->member) <= eb->map_start +	\
-	    eb->map_len) {						\
-		p = (type *)(eb->kaddr + part_offset - eb->map_start);	\
-		p->member = cpu_to_le##bits(val);			\
-		return;							\
-	}								\
-	{								\
-		int err;						\
-		char *map_token;					\
-		char *kaddr;						\
-		int unmap_on_exit = (eb->map_token == NULL);		\
-		unsigned long map_start;				\
-		unsigned long map_len;					\
-		err = map_extent_buffer(eb, offset,			\
-				sizeof(((type *)0)->member),		\
-				&map_token, &kaddr,			\
-				&map_start, &map_len, KM_USER1);	\
-		if (err) {						\
-			__le##bits val2;				\
-			val2 = cpu_to_le##bits(val);			\
-			write_eb_member(eb, s, type, member, &val2);	\
-			return;						\
-		}							\
-		p = (type *)(kaddr + part_offset - map_start);		\
-		p->member = cpu_to_le##bits(val);			\
-		if (unmap_on_exit)					\
-			unmap_extent_buffer(eb, map_token, KM_USER1);	\
-	}								\
+	int err;							\
+	char *kaddr;							\
+	unsigned long map_start;					\
+	unsigned long map_len;						\
+	err = map_private_extent_buffer(eb, offset,			\
+			sizeof(((type *)0)->member),			\
+			&kaddr, &map_start, &map_len);			\
+	if (err) {							\
+		__le##bits val2;					\
+		val2 = cpu_to_le##bits(val);				\
+		write_eb_member(eb, s, type, member, &val2);		\
+		return;							\
+	}								\
+	p = (type *)(kaddr + part_offset - map_start);			\
+	p->member = cpu_to_le##bits(val);				\
 }
 
 #include "ctree.h"
@@ -125,15 +96,6 @@ void btrfs_node_key(struct extent_buffer *eb,
 		    struct btrfs_disk_key *disk_key, int nr)
 {
 	unsigned long ptr = btrfs_node_key_ptr_offset(nr);
-	if (eb->map_token && ptr >= eb->map_start &&
-	    ptr + sizeof(*disk_key) <= eb->map_start + eb->map_len) {
-		memcpy(disk_key, eb->kaddr + ptr - eb->map_start,
-		       sizeof(*disk_key));
-		return;
-	} else if (eb->map_token) {
-		unmap_extent_buffer(eb, eb->map_token, KM_USER1);
-		eb->map_token = NULL;
-	}
 	read_eb_member(eb, (struct btrfs_key_ptr *)ptr,
 		       struct btrfs_key_ptr, key, disk_key);
 }
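
To see what the simplified accessor macro now generates, here is a
hand-expanded getter for a hypothetical u64 member; btrfs_foo_bar and
struct btrfs_foo are stand-ins, not real ctree.h names -- a sketch only:

	u64 btrfs_foo_bar(struct extent_buffer *eb, struct btrfs_foo *s)
	{
		unsigned long part_offset = (unsigned long)s;
		unsigned long offset = part_offset +
				       offsetof(struct btrfs_foo, bar);
		struct btrfs_foo *p;
		char *kaddr;
		unsigned long map_start;
		unsigned long map_len;
		int err;

		/* one mapping attempt; no fast-path token or KM_USER1 unmap */
		err = map_private_extent_buffer(eb, offset, sizeof(p->bar),
						&kaddr, &map_start, &map_len);
		if (err) {
			/* member straddles a page: fall back to a byte copy */
			__le64 leres;

			read_eb_member(eb, s, struct btrfs_foo, bar, &leres);
			return le64_to_cpu(leres);
		}
		p = (struct btrfs_foo *)(kaddr + part_offset - map_start);
		return le64_to_cpu(p->bar);
	}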
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 51dcec86757f..e24b7964a155 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -216,17 +216,11 @@ static void wait_current_trans(struct btrfs_root *root)
 	spin_lock(&root->fs_info->trans_lock);
 	cur_trans = root->fs_info->running_transaction;
 	if (cur_trans && cur_trans->blocked) {
-		DEFINE_WAIT(wait);
 		atomic_inc(&cur_trans->use_count);
 		spin_unlock(&root->fs_info->trans_lock);
-		while (1) {
-			prepare_to_wait(&root->fs_info->transaction_wait, &wait,
-					TASK_UNINTERRUPTIBLE);
-			if (!cur_trans->blocked)
-				break;
-			schedule();
-		}
-		finish_wait(&root->fs_info->transaction_wait, &wait);
+
+		wait_event(root->fs_info->transaction_wait,
+			   !cur_trans->blocked);
 		put_transaction(cur_trans);
 	} else {
 		spin_unlock(&root->fs_info->trans_lock);
@@ -260,7 +254,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
 {
 	struct btrfs_trans_handle *h;
 	struct btrfs_transaction *cur_trans;
-	int retries = 0;
+	u64 num_bytes = 0;
 	int ret;
 
 	if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
@@ -274,6 +268,19 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
 		h->block_rsv = NULL;
 		goto got_it;
 	}
+
+	/*
+	 * Do the reservation before we join the transaction so we can do all
+	 * the appropriate flushing if need be.
+	 */
+	if (num_items > 0 && root != root->fs_info->chunk_root) {
+		num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
+		ret = btrfs_block_rsv_add(NULL, root,
+					  &root->fs_info->trans_block_rsv,
+					  num_bytes);
+		if (ret)
+			return ERR_PTR(ret);
+	}
 again:
 	h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
 	if (!h)
@@ -310,24 +317,9 @@ again:
 		goto again;
 	}
 
-	if (num_items > 0) {
-		ret = btrfs_trans_reserve_metadata(h, root, num_items);
-		if (ret == -EAGAIN && !retries) {
-			retries++;
-			btrfs_commit_transaction(h, root);
-			goto again;
-		} else if (ret == -EAGAIN) {
-			/*
-			 * We have already retried and got EAGAIN, so really we
-			 * don't have space, so set ret to -ENOSPC.
-			 */
-			ret = -ENOSPC;
-		}
-
-		if (ret < 0) {
-			btrfs_end_transaction(h, root);
-			return ERR_PTR(ret);
-		}
+	if (num_bytes) {
+		h->block_rsv = &root->fs_info->trans_block_rsv;
+		h->bytes_reserved = num_bytes;
 	}
 
 got_it:
@@ -359,19 +351,10 @@ struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root
 }
 
 /* wait for a transaction commit to be fully complete */
-static noinline int wait_for_commit(struct btrfs_root *root,
+static noinline void wait_for_commit(struct btrfs_root *root,
 				    struct btrfs_transaction *commit)
 {
-	DEFINE_WAIT(wait);
-	while (!commit->commit_done) {
-		prepare_to_wait(&commit->commit_wait, &wait,
-				TASK_UNINTERRUPTIBLE);
-		if (commit->commit_done)
-			break;
-		schedule();
-	}
-	finish_wait(&commit->commit_wait, &wait);
-	return 0;
+	wait_event(commit->commit_wait, commit->commit_done);
 }
 
 int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
@@ -499,10 +482,17 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 	}
 
 	if (lock && cur_trans->blocked && !cur_trans->in_commit) {
-		if (throttle)
+		if (throttle) {
+			/*
+			 * We may race with somebody else here so end up having
+			 * to call end_transaction on ourselves again, so inc
+			 * our use_count.
+			 */
+			trans->use_count++;
 			return btrfs_commit_transaction(trans, root);
-		else
+		} else {
 			wake_up_process(info->transaction_kthread);
+		}
 	}
 
 	WARN_ON(cur_trans != info->running_transaction);
@@ -894,6 +884,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	struct btrfs_root *tree_root = fs_info->tree_root;
 	struct btrfs_root *root = pending->root;
 	struct btrfs_root *parent_root;
+	struct btrfs_block_rsv *rsv;
 	struct inode *parent_inode;
 	struct dentry *parent;
 	struct dentry *dentry;
@@ -905,6 +896,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	u64 objectid;
 	u64 root_flags;
 
+	rsv = trans->block_rsv;
+
 	new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
 	if (!new_root_item) {
 		pending->error = -ENOMEM;
@@ -1012,6 +1005,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	btrfs_orphan_post_snapshot(trans, pending);
 fail:
 	kfree(new_root_item);
+	trans->block_rsv = rsv;
 	btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
 	return 0;
 }
@@ -1080,22 +1074,7 @@ int btrfs_transaction_blocked(struct btrfs_fs_info *info)
 static void wait_current_trans_commit_start(struct btrfs_root *root,
 					    struct btrfs_transaction *trans)
 {
-	DEFINE_WAIT(wait);
-
-	if (trans->in_commit)
-		return;
-
-	while (1) {
-		prepare_to_wait(&root->fs_info->transaction_blocked_wait, &wait,
-				TASK_UNINTERRUPTIBLE);
-		if (trans->in_commit) {
-			finish_wait(&root->fs_info->transaction_blocked_wait,
-				    &wait);
-			break;
-		}
-		schedule();
-		finish_wait(&root->fs_info->transaction_blocked_wait, &wait);
-	}
+	wait_event(root->fs_info->transaction_blocked_wait, trans->in_commit);
 }
 
 /*
@@ -1105,24 +1084,8 @@ static void wait_current_trans_commit_start(struct btrfs_root *root,
 static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
 					 struct btrfs_transaction *trans)
 {
-	DEFINE_WAIT(wait);
-
-	if (trans->commit_done || (trans->in_commit && !trans->blocked))
-		return;
-
-	while (1) {
-		prepare_to_wait(&root->fs_info->transaction_wait, &wait,
-				TASK_UNINTERRUPTIBLE);
-		if (trans->commit_done ||
-		    (trans->in_commit && !trans->blocked)) {
-			finish_wait(&root->fs_info->transaction_wait,
-				    &wait);
-			break;
-		}
-		schedule();
-		finish_wait(&root->fs_info->transaction_wait,
-			    &wait);
-	}
+	wait_event(root->fs_info->transaction_wait,
+		   trans->commit_done || (trans->in_commit && !trans->blocked));
 }
 
 /*
@@ -1229,8 +1192,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 		atomic_inc(&cur_trans->use_count);
 		btrfs_end_transaction(trans, root);
 
-		ret = wait_for_commit(root, cur_trans);
-		BUG_ON(ret);
+		wait_for_commit(root, cur_trans);
 
 		put_transaction(cur_trans);
 
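
All four conversions in this file follow one shape: an open-coded
DEFINE_WAIT()/prepare_to_wait()/schedule()/finish_wait() loop becomes a
wait_event() call, which wraps exactly that sequence. In generic form
(wq and condition are placeholders; a sketch):

	/* before: hand-rolled wait loop */
	DEFINE_WAIT(wait);
	while (1) {
		prepare_to_wait(&wq, &wait, TASK_UNINTERRUPTIBLE);
		if (condition)
			break;
		schedule();
	}
	finish_wait(&wq, &wait);

	/* after: same semantics in one line */
	wait_event(wq, condition);

wait_event() also tests the condition before sleeping, so the separate
"if (trans->in_commit) return;" style pre-checks could be dropped too.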
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 4ce8a9f41d1e..786639fca067 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -799,14 +799,15 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
 			      struct extent_buffer *eb, int slot,
 			      struct btrfs_key *key)
 {
-	struct inode *dir;
-	int ret;
 	struct btrfs_inode_ref *ref;
+	struct btrfs_dir_item *di;
+	struct inode *dir;
 	struct inode *inode;
-	char *name;
-	int namelen;
 	unsigned long ref_ptr;
 	unsigned long ref_end;
+	char *name;
+	int namelen;
+	int ret;
 	int search_done = 0;
 
 	/*
@@ -909,6 +910,25 @@ again:
 	}
 	btrfs_release_path(path);
 
+	/* look for a conflicting sequence number */
+	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
+					 btrfs_inode_ref_index(eb, ref),
+					 name, namelen, 0);
+	if (di && !IS_ERR(di)) {
+		ret = drop_one_dir_item(trans, root, path, dir, di);
+		BUG_ON(ret);
+	}
+	btrfs_release_path(path);
+
+	/* look for a conflicting name */
+	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
+				   name, namelen, 0);
+	if (di && !IS_ERR(di)) {
+		ret = drop_one_dir_item(trans, root, path, dir, di);
+		BUG_ON(ret);
+	}
+	btrfs_release_path(path);
+
 insert:
 	/* insert our name */
 	ret = btrfs_add_link(trans, dir, inode, name, namelen, 0,
@@ -1617,7 +1637,8 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
 		return 0;
 
 	path = btrfs_alloc_path();
-	BUG_ON(!path);
+	if (!path)
+		return -ENOMEM;
 
 	nritems = btrfs_header_nritems(eb);
 	for (i = 0; i < nritems; i++) {
@@ -1723,15 +1744,17 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 			return -ENOMEM;
 
 		if (*level == 1) {
-			wc->process_func(root, next, wc, ptr_gen);
+			ret = wc->process_func(root, next, wc, ptr_gen);
+			if (ret)
+				return ret;
 
 			path->slots[*level]++;
 			if (wc->free) {
 				btrfs_read_buffer(next, ptr_gen);
 
 				btrfs_tree_lock(next);
-				clean_tree_block(trans, root, next);
 				btrfs_set_lock_blocking(next);
+				clean_tree_block(trans, root, next);
 				btrfs_wait_tree_block_writeback(next);
 				btrfs_tree_unlock(next);
 
@@ -1788,16 +1811,19 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
 		parent = path->nodes[*level + 1];
 
 		root_owner = btrfs_header_owner(parent);
-		wc->process_func(root, path->nodes[*level], wc,
+		ret = wc->process_func(root, path->nodes[*level], wc,
 				 btrfs_header_generation(path->nodes[*level]));
+		if (ret)
+			return ret;
+
 		if (wc->free) {
 			struct extent_buffer *next;
 
 			next = path->nodes[*level];
 
 			btrfs_tree_lock(next);
-			clean_tree_block(trans, root, next);
 			btrfs_set_lock_blocking(next);
+			clean_tree_block(trans, root, next);
 			btrfs_wait_tree_block_writeback(next);
 			btrfs_tree_unlock(next);
 
@@ -1864,8 +1890,8 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
 	next = path->nodes[orig_level];
 
 	btrfs_tree_lock(next);
-	clean_tree_block(trans, log, next);
 	btrfs_set_lock_blocking(next);
+	clean_tree_block(trans, log, next);
 	btrfs_wait_tree_block_writeback(next);
 	btrfs_tree_unlock(next);
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 19450bc53632..f2a4cc79da61 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -142,6 +142,7 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
 	unsigned long limit;
 	unsigned long last_waited = 0;
 	int force_reg = 0;
+	int sync_pending = 0;
 	struct blk_plug plug;
 
 	/*
@@ -229,6 +230,22 @@ loop_lock:
 
 		BUG_ON(atomic_read(&cur->bi_cnt) == 0);
 
+		/*
+		 * if we're doing the sync list, record that our
+		 * plug has some sync requests on it
+		 *
+		 * If we're doing the regular list and there are
+		 * sync requests sitting around, unplug before
+		 * we add more
+		 */
+		if (pending_bios == &device->pending_sync_bios) {
+			sync_pending = 1;
+		} else if (sync_pending) {
+			blk_finish_plug(&plug);
+			blk_start_plug(&plug);
+			sync_pending = 0;
+		}
+
 		submit_bio(cur->bi_rw, cur);
 		num_run++;
 		batch_run++;
@@ -500,6 +517,9 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 			fs_devices->rw_devices--;
 		}
 
+		if (device->can_discard)
+			fs_devices->num_can_discard--;
+
 		new_device = kmalloc(sizeof(*new_device), GFP_NOFS);
 		BUG_ON(!new_device);
 		memcpy(new_device, device, sizeof(*new_device));
@@ -508,6 +528,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 		new_device->bdev = NULL;
 		new_device->writeable = 0;
 		new_device->in_fs_metadata = 0;
+		new_device->can_discard = 0;
 		list_replace_rcu(&device->dev_list, &new_device->dev_list);
 
 		call_rcu(&device->rcu, free_device);
@@ -547,6 +568,7 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
547static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, 568static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
548 fmode_t flags, void *holder) 569 fmode_t flags, void *holder)
549{ 570{
571 struct request_queue *q;
550 struct block_device *bdev; 572 struct block_device *bdev;
551 struct list_head *head = &fs_devices->devices; 573 struct list_head *head = &fs_devices->devices;
552 struct btrfs_device *device; 574 struct btrfs_device *device;
@@ -603,6 +625,12 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 			seeding = 0;
 		}
 
+		q = bdev_get_queue(bdev);
+		if (blk_queue_discard(q)) {
+			device->can_discard = 1;
+			fs_devices->num_can_discard++;
+		}
+
 		device->bdev = bdev;
 		device->in_fs_metadata = 0;
 		device->mode = flags;
@@ -835,6 +863,7 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
 
 	max_hole_start = search_start;
 	max_hole_size = 0;
+	hole_size = 0;
 
 	if (search_start >= search_end) {
 		ret = -ENOSPC;
@@ -917,7 +946,14 @@ next:
 		cond_resched();
 	}
 
-	hole_size = search_end- search_start;
+	/*
+	 * At this point, search_start should be the end of
+	 * allocated dev extents, and when shrinking the device,
+	 * search_end may be smaller than search_start.
+	 */
+	if (search_end > search_start)
+		hole_size = search_end - search_start;
+
 	if (hole_size > max_hole_size) {
 		max_hole_start = search_start;
 		max_hole_size = hole_size;
@@ -1037,7 +1073,8 @@ static noinline int find_next_chunk(struct btrfs_root *root,
 	struct btrfs_key found_key;
 
 	path = btrfs_alloc_path();
-	BUG_ON(!path);
+	if (!path)
+		return -ENOMEM;
 
 	key.objectid = objectid;
 	key.offset = (u64)-1;
@@ -1542,6 +1579,7 @@ error:
 
 int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 {
+	struct request_queue *q;
 	struct btrfs_trans_handle *trans;
 	struct btrfs_device *device;
 	struct block_device *bdev;
@@ -1611,6 +1649,9 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 
 	lock_chunks(root);
 
+	q = bdev_get_queue(bdev);
+	if (blk_queue_discard(q))
+		device->can_discard = 1;
 	device->writeable = 1;
 	device->work.func = pending_bios_fn;
 	generate_random_uuid(device->uuid);
@@ -1646,6 +1687,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	root->fs_info->fs_devices->num_devices++;
 	root->fs_info->fs_devices->open_devices++;
 	root->fs_info->fs_devices->rw_devices++;
+	if (device->can_discard)
+		root->fs_info->fs_devices->num_can_discard++;
 	root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
 
 	if (!blk_queue_nonrot(bdev_get_queue(bdev)))
@@ -2061,8 +2104,10 @@ int btrfs_balance(struct btrfs_root *dev_root)
 
 	/* step two, relocate all the chunks */
 	path = btrfs_alloc_path();
-	BUG_ON(!path);
-
+	if (!path) {
+		ret = -ENOMEM;
+		goto error;
+	}
 	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
 	key.offset = (u64)-1;
 	key.type = BTRFS_CHUNK_ITEM_KEY;
@@ -2410,9 +2455,10 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 			total_avail = device->total_bytes - device->bytes_used;
 		else
 			total_avail = 0;
-		/* avail is off by max(alloc_start, 1MB), but that is the same
-		 * for all devices, so it doesn't hurt the sorting later on
-		 */
+
+		/* If there is no space on this device, skip it. */
+		if (total_avail == 0)
+			continue;
 
 		ret = find_free_dev_extent(trans, device,
 					   max_stripe_size * dev_stripes,
@@ -2661,7 +2707,8 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
 
 	ret = find_next_chunk(fs_info->chunk_root,
 			      BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
-	BUG_ON(ret);
+	if (ret)
+		return ret;
 
 	alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
 			(fs_info->metadata_alloc_profile &
@@ -3595,7 +3642,7 @@ int btrfs_read_sys_array(struct btrfs_root *root)
 	if (!sb)
 		return -ENOMEM;
 	btrfs_set_buffer_uptodate(sb);
-	btrfs_set_buffer_lockdep_class(sb, 0);
+	btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
 
 	write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
 	array_size = btrfs_super_sys_array_size(super_copy);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 7c12d61ae7ae..6d866db4e177 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -48,6 +48,7 @@ struct btrfs_device {
 	int writeable;
 	int in_fs_metadata;
 	int missing;
+	int can_discard;
 
 	spinlock_t io_lock;
 
@@ -104,6 +105,7 @@ struct btrfs_fs_devices {
 	u64 rw_devices;
 	u64 missing_devices;
 	u64 total_rw_bytes;
+	u64 num_can_discard;
 	struct block_device *latest_bdev;
 
 	/* all of the devices in the FS, protected by a mutex
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 5366fe452ab0..69565e5fc6a0 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -102,48 +102,71 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 
-	/* first lets see if we already have this xattr */
-	di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), name,
-				strlen(name), -1);
-	if (IS_ERR(di)) {
-		ret = PTR_ERR(di);
-		goto out;
-	}
-
-	/* ok we already have this xattr, lets remove it */
-	if (di) {
-		/* if we want create only exit */
-		if (flags & XATTR_CREATE) {
-			ret = -EEXIST;
-			goto out;
-		}
-
-		ret = btrfs_delete_one_dir_name(trans, root, path, di);
-		BUG_ON(ret);
-		btrfs_release_path(path);
-
-		/* if we don't have a value then we are removing the xattr */
-		if (!value)
-			goto out;
-	} else {
-		btrfs_release_path(path);
-
-		if (flags & XATTR_REPLACE) {
-			/* we couldn't find the attr to replace */
-			ret = -ENODATA;
-			goto out;
-		}
-	}
-
-	/* ok we have to create a completely new xattr */
-	ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
-				      name, name_len, value, size);
-	BUG_ON(ret);
+	if (flags & XATTR_REPLACE) {
+		di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), name,
+					name_len, -1);
+		if (IS_ERR(di)) {
+			ret = PTR_ERR(di);
+			goto out;
+		} else if (!di) {
+			ret = -ENODATA;
+			goto out;
+		}
+		ret = btrfs_delete_one_dir_name(trans, root, path, di);
+		if (ret)
+			goto out;
+		btrfs_release_path(path);
+
+		/*
+		 * remove the attribute
+		 */
+		if (!value)
+			goto out;
+	}
+
+again:
+	ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
+				      name, name_len, value, size);
+	if (ret == -EEXIST) {
+		if (flags & XATTR_CREATE)
+			goto out;
+		/*
+		 * We can't use the path we already have since we won't have the
+		 * proper locking for a delete, so release the path and
+		 * re-lookup to delete the thing.
+		 */
+		btrfs_release_path(path);
+		di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode),
+					name, name_len, -1);
+		if (IS_ERR(di)) {
+			ret = PTR_ERR(di);
+			goto out;
+		} else if (!di) {
+			/* Shouldn't happen but just in case... */
+			btrfs_release_path(path);
+			goto again;
+		}
+
+		ret = btrfs_delete_one_dir_name(trans, root, path, di);
+		if (ret)
+			goto out;
+
+		/*
+		 * We have a value to set, so go back and try to insert it now.
+		 */
+		if (value) {
+			btrfs_release_path(path);
+			goto again;
+		}
+	}
 out:
 	btrfs_free_path(path);
 	return ret;
 }
 
+/*
+ * @value: "" makes the attribute to empty, NULL removes it
+ */
 int __btrfs_setxattr(struct btrfs_trans_handle *trans,
 		     struct inode *inode, const char *name,
 		     const void *value, size_t size, int flags)
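
The rewritten do_setxattr() makes insertion the primary path and resolves
collisions on -EEXIST, which is what yields the standard setxattr(2) flag
errors. A userspace sketch of those semantics (the file path is a
placeholder):

	#include <errno.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/xattr.h>

	int main(void)
	{
		const char *path = "/mnt/btrfs/file";	/* placeholder */
		const char *val = "bar";

		/* XATTR_CREATE: fails with EEXIST if "user.foo" exists */
		if (setxattr(path, "user.foo", val, strlen(val),
			     XATTR_CREATE) < 0)
			perror("XATTR_CREATE");

		/* XATTR_REPLACE: fails with ENODATA if it is missing */
		if (setxattr(path, "user.foo", val, strlen(val),
			     XATTR_REPLACE) < 0)
			perror("XATTR_REPLACE");

		/* flags == 0: create or replace, whichever applies */
		if (setxattr(path, "user.foo", val, strlen(val), 0) < 0)
			perror("setxattr");
		return 0;
	}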
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 0dba6915712b..fb962efdacee 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -102,7 +102,7 @@ static int mdsc_show(struct seq_file *s, void *p)
 			path = NULL;
 		spin_lock(&req->r_old_dentry->d_lock);
 		seq_printf(s, " #%llx/%.*s (%s)",
-			   ceph_ino(req->r_old_dentry->d_parent->d_inode),
+			   ceph_ino(req->r_old_dentry_dir),
 			   req->r_old_dentry->d_name.len,
 			   req->r_old_dentry->d_name.name,
 			   path ? path : "");
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 1065ac779840..382abc9a6a54 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -40,14 +40,6 @@ int ceph_init_dentry(struct dentry *dentry)
 	if (dentry->d_fsdata)
 		return 0;
 
-	if (dentry->d_parent == NULL ||   /* nfs fh_to_dentry */
-	    ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
-		d_set_d_op(dentry, &ceph_dentry_ops);
-	else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR)
-		d_set_d_op(dentry, &ceph_snapdir_dentry_ops);
-	else
-		d_set_d_op(dentry, &ceph_snap_dentry_ops);
-
 	di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO);
 	if (!di)
 		return -ENOMEM;          /* oh well */
@@ -58,16 +50,42 @@ int ceph_init_dentry(struct dentry *dentry)
 		kmem_cache_free(ceph_dentry_cachep, di);
 		goto out_unlock;
 	}
+
+	if (dentry->d_parent == NULL ||   /* nfs fh_to_dentry */
+	    ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
+		d_set_d_op(dentry, &ceph_dentry_ops);
+	else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR)
+		d_set_d_op(dentry, &ceph_snapdir_dentry_ops);
+	else
+		d_set_d_op(dentry, &ceph_snap_dentry_ops);
+
 	di->dentry = dentry;
 	di->lease_session = NULL;
-	dentry->d_fsdata = di;
 	dentry->d_time = jiffies;
+	/* avoid reordering d_fsdata setup so that the check above is safe */
+	smp_mb();
+	dentry->d_fsdata = di;
 	ceph_dentry_lru_add(dentry);
 out_unlock:
 	spin_unlock(&dentry->d_lock);
 	return 0;
 }
 
+struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry)
+{
+	struct inode *inode = NULL;
+
+	if (!dentry)
+		return NULL;
+
+	spin_lock(&dentry->d_lock);
+	if (dentry->d_parent) {
+		inode = dentry->d_parent->d_inode;
+		ihold(inode);
+	}
+	spin_unlock(&dentry->d_lock);
+	return inode;
+}
 
 
 /*
@@ -133,7 +151,7 @@ more:
 	     d_unhashed(dentry) ? "!hashed" : "hashed",
 	     parent->d_subdirs.prev, parent->d_subdirs.next);
 	if (p == &parent->d_subdirs) {
-		fi->at_end = 1;
+		fi->flags |= CEPH_F_ATEND;
 		goto out_unlock;
 	}
 	spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
@@ -234,7 +252,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	const int max_bytes = fsc->mount_options->max_readdir_bytes;
 
 	dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
-	if (fi->at_end)
+	if (fi->flags & CEPH_F_ATEND)
 		return 0;
 
 	/* always start with . and .. */
@@ -403,7 +421,7 @@ more:
 		dout("readdir next frag is %x\n", frag);
 		goto more;
 	}
-	fi->at_end = 1;
+	fi->flags |= CEPH_F_ATEND;
 
 	/*
 	 * if dir_release_count still matches the dir, no dentries
@@ -435,7 +453,7 @@ static void reset_readdir(struct ceph_file_info *fi)
 		dput(fi->dentry);
 		fi->dentry = NULL;
 	}
-	fi->at_end = 0;
+	fi->flags &= ~CEPH_F_ATEND;
 }
 
 static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
@@ -463,7 +481,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
 	if (offset != file->f_pos) {
 		file->f_pos = offset;
 		file->f_version = 0;
-		fi->at_end = 0;
+		fi->flags &= ~CEPH_F_ATEND;
 	}
 	retval = offset;
 
@@ -488,21 +506,13 @@ out:
 }
 
 /*
- * Process result of a lookup/open request.
- *
- * Mainly, make sure we return the final req->r_dentry (if it already
- * existed) in place of the original VFS-provided dentry when they
- * differ.
- *
- * Gracefully handle the case where the MDS replies with -ENOENT and
- * no trace (which it may do, at its discretion, e.g., if it doesn't
- * care to issue a lease on the negative dentry).
+ * Handle lookups for the hidden .snap directory.
  */
-struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
-				  struct dentry *dentry, int err)
+int ceph_handle_snapdir(struct ceph_mds_request *req,
+			struct dentry *dentry, int err)
 {
 	struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
-	struct inode *parent = dentry->d_parent->d_inode;
+	struct inode *parent = dentry->d_parent->d_inode; /* we hold i_mutex */
 
 	/* .snap dir? */
 	if (err == -ENOENT &&
@@ -516,7 +526,23 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
 		d_add(dentry, inode);
 		err = 0;
 	}
+	return err;
+}
 
+/*
+ * Figure out final result of a lookup/open request.
+ *
+ * Mainly, make sure we return the final req->r_dentry (if it already
+ * existed) in place of the original VFS-provided dentry when they
+ * differ.
+ *
+ * Gracefully handle the case where the MDS replies with -ENOENT and
+ * no trace (which it may do, at its discretion, e.g., if it doesn't
+ * care to issue a lease on the negative dentry).
+ */
+struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
+				  struct dentry *dentry, int err)
+{
 	if (err == -ENOENT) {
 		/* no trace? */
 		err = 0;
@@ -610,6 +636,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
 	req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
 	req->r_locked_dir = dir;
 	err = ceph_mdsc_do_request(mdsc, NULL, req);
+	err = ceph_handle_snapdir(req, dentry, err);
 	dentry = ceph_finish_lookup(req, dentry, err);
 	ceph_mdsc_put_request(req);  /* will dput(dentry) */
 	dout("lookup result=%p\n", dentry);
@@ -789,6 +816,7 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
 	req->r_dentry = dget(dentry);
 	req->r_num_caps = 2;
 	req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */
+	req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry);
 	req->r_locked_dir = dir;
 	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
 	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
@@ -887,6 +915,7 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
 	req->r_dentry = dget(new_dentry);
 	req->r_num_caps = 2;
 	req->r_old_dentry = dget(old_dentry);
+	req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry);
 	req->r_locked_dir = new_dir;
 	req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED;
 	req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL;
@@ -1002,36 +1031,38 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
  */
 static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
+	int valid = 0;
 	struct inode *dir;
 
 	if (nd && nd->flags & LOOKUP_RCU)
 		return -ECHILD;
 
-	dir = dentry->d_parent->d_inode;
-
 	dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry,
 	     dentry->d_name.len, dentry->d_name.name, dentry->d_inode,
 	     ceph_dentry(dentry)->offset);
 
+	dir = ceph_get_dentry_parent_inode(dentry);
+
 	/* always trust cached snapped dentries, snapdir dentry */
 	if (ceph_snap(dir) != CEPH_NOSNAP) {
 		dout("d_revalidate %p '%.*s' inode %p is SNAPPED\n", dentry,
 		     dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
-		goto out_touch;
+		valid = 1;
+	} else if (dentry->d_inode &&
+		   ceph_snap(dentry->d_inode) == CEPH_SNAPDIR) {
+		valid = 1;
+	} else if (dentry_lease_is_valid(dentry) ||
+		   dir_lease_is_valid(dir, dentry)) {
+		valid = 1;
 	}
-	if (dentry->d_inode && ceph_snap(dentry->d_inode) == CEPH_SNAPDIR)
-		goto out_touch;
-
-	if (dentry_lease_is_valid(dentry) ||
-	    dir_lease_is_valid(dir, dentry))
-		goto out_touch;
 
-	dout("d_revalidate %p invalid\n", dentry);
-	d_drop(dentry);
-	return 0;
-out_touch:
-	ceph_dentry_lru_touch(dentry);
-	return 1;
+	dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
+	if (valid)
+		ceph_dentry_lru_touch(dentry);
+	else
+		d_drop(dentry);
+	iput(dir);
+	return valid;
 }
 
 /*
@@ -1228,9 +1259,8 @@ void ceph_dentry_lru_del(struct dentry *dn)
  * Return name hash for a given dentry.  This is dependent on
  * the parent directory's hash function.
  */
-unsigned ceph_dentry_hash(struct dentry *dn)
+unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn)
 {
-	struct inode *dir = dn->d_parent->d_inode;
 	struct ceph_inode_info *dci = ceph_inode(dir);
 
 	switch (dci->i_dir_layout.dl_dir_hash) {
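
The new ceph_get_dentry_parent_inode() helper returns the parent inode with
an extra reference taken under d_lock (via ihold()), or NULL. Callers in
this series therefore pair it with iput(), which is a no-op on NULL. The
recurring caller pattern, as a sketch:

	struct inode *parent_inode;

	parent_inode = ceph_get_dentry_parent_inode(dentry); /* may be NULL */
	err = ceph_mdsc_do_request(mdsc, parent_inode, req);
	iput(parent_inode);	/* drop the reference the helper took */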
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index f67b687550de..9fbcdecaaccd 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -46,7 +46,7 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
 	int type;
 	struct ceph_nfs_fh *fh = (void *)rawfh;
 	struct ceph_nfs_confh *cfh = (void *)rawfh;
-	struct dentry *parent = dentry->d_parent;
+	struct dentry *parent;
 	struct inode *inode = dentry->d_inode;
 	int connected_handle_length = sizeof(*cfh)/4;
 	int handle_length = sizeof(*fh)/4;
@@ -55,26 +55,33 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
 	if (ceph_snap(inode) != CEPH_NOSNAP)
 		return -EINVAL;
 
+	spin_lock(&dentry->d_lock);
+	parent = dget(dentry->d_parent);
+	spin_unlock(&dentry->d_lock);
+
 	if (*max_len >= connected_handle_length) {
 		dout("encode_fh %p connectable\n", dentry);
 		cfh->ino = ceph_ino(dentry->d_inode);
 		cfh->parent_ino = ceph_ino(parent->d_inode);
-		cfh->parent_name_hash = ceph_dentry_hash(parent);
+		cfh->parent_name_hash = ceph_dentry_hash(parent->d_inode,
+							 dentry);
 		*max_len = connected_handle_length;
 		type = 2;
 	} else if (*max_len >= handle_length) {
 		if (connectable) {
 			*max_len = connected_handle_length;
-			return 255;
+			type = 255;
+		} else {
+			dout("encode_fh %p\n", dentry);
+			fh->ino = ceph_ino(dentry->d_inode);
+			*max_len = handle_length;
+			type = 1;
 		}
-		dout("encode_fh %p\n", dentry);
-		fh->ino = ceph_ino(dentry->d_inode);
-		*max_len = handle_length;
-		type = 1;
 	} else {
 		*max_len = handle_length;
-		return 255;
+		type = 255;
 	}
+	dput(parent);
 	return type;
 }
 
@@ -123,7 +130,6 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
 		return dentry;
 	}
 	err = ceph_init_dentry(dentry);
-
 	if (err < 0) {
 		iput(inode);
 		return ERR_PTR(err);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 0d0eae05598f..ce549d31eeb7 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -122,7 +122,7 @@ int ceph_open(struct inode *inode, struct file *file)
 	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	struct ceph_file_info *cf = file->private_data;
-	struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
+	struct inode *parent_inode = NULL;
 	int err;
 	int flags, fmode, wanted;
 
@@ -194,7 +194,10 @@ int ceph_open(struct inode *inode, struct file *file)
 	req->r_inode = inode;
 	ihold(inode);
 	req->r_num_caps = 1;
+	if (flags & (O_CREAT|O_TRUNC))
+		parent_inode = ceph_get_dentry_parent_inode(file->f_dentry);
 	err = ceph_mdsc_do_request(mdsc, parent_inode, req);
+	iput(parent_inode);
 	if (!err)
 		err = ceph_init_file(inode, file, req->r_fmode);
 	ceph_mdsc_put_request(req);
@@ -222,9 +225,9 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
 {
 	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
 	struct ceph_mds_client *mdsc = fsc->mdsc;
-	struct file *file = nd->intent.open.file;
-	struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry);
+	struct file *file;
 	struct ceph_mds_request *req;
+	struct dentry *ret;
 	int err;
 	int flags = nd->intent.open.flags;
 
@@ -242,16 +245,24 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
 		req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
 	}
 	req->r_locked_dir = dir;           /* caller holds dir->i_mutex */
-	err = ceph_mdsc_do_request(mdsc, parent_inode, req);
-	dentry = ceph_finish_lookup(req, dentry, err);
-	if (!err && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
+	err = ceph_mdsc_do_request(mdsc,
+				   (flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
+				   req);
+	err = ceph_handle_snapdir(req, dentry, err);
+	if (err)
+		goto out;
+	if ((flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
 		err = ceph_handle_notrace_create(dir, dentry);
-	if (!err)
-		err = ceph_init_file(req->r_dentry->d_inode, file,
-				     req->r_fmode);
+	if (err)
+		goto out;
+	file = lookup_instantiate_filp(nd, req->r_dentry, ceph_open);
+	if (IS_ERR(file))
+		err = PTR_ERR(file);
+out:
+	ret = ceph_finish_lookup(req, dentry, err);
 	ceph_mdsc_put_request(req);
-	dout("ceph_lookup_open result=%p\n", dentry);
-	return dentry;
+	dout("ceph_lookup_open result=%p\n", ret);
+	return ret;
 }
 
 int ceph_release(struct inode *inode, struct file *file)
@@ -643,7 +654,8 @@ again:
 
 	if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 ||
 	    (iocb->ki_filp->f_flags & O_DIRECT) ||
-	    (inode->i_sb->s_flags & MS_SYNCHRONOUS))
+	    (inode->i_sb->s_flags & MS_SYNCHRONOUS) ||
+	    (fi->flags & CEPH_F_SYNC))
 		/* hmm, this isn't really async... */
 		ret = ceph_sync_read(filp, base, len, ppos, &checkeof);
 	else
@@ -712,7 +724,7 @@ retry_snap:
 		want = CEPH_CAP_FILE_BUFFER;
 	ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff);
 	if (ret < 0)
-		goto out;
+		goto out_put;
 
 	dout("aio_write %p %llx.%llx %llu~%u  got cap refs on %s\n",
 	     inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
@@ -720,12 +732,23 @@ retry_snap:
 
 	if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
 	    (iocb->ki_filp->f_flags & O_DIRECT) ||
-	    (inode->i_sb->s_flags & MS_SYNCHRONOUS)) {
+	    (inode->i_sb->s_flags & MS_SYNCHRONOUS) ||
+	    (fi->flags & CEPH_F_SYNC)) {
 		ret = ceph_sync_write(file, iov->iov_base, iov->iov_len,
 			&iocb->ki_pos);
 	} else {
-		ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
+		/*
+		 * buffered write; drop Fw early to avoid slow
+		 * revocation if we get stuck on balance_dirty_pages
+		 */
+		int dirty;
 
+		spin_lock(&inode->i_lock);
+		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
+		spin_unlock(&inode->i_lock);
+		ceph_put_cap_refs(ci, got);
+
+		ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
 		if ((ret >= 0 || ret == -EIOCBQUEUED) &&
 		    ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host)
 		     || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) {
@@ -733,7 +756,12 @@ retry_snap:
 			if (err < 0)
 				ret = err;
 		}
+
+		if (dirty)
+			__mark_inode_dirty(inode, dirty);
+		goto out;
 	}
+
 	if (ret >= 0) {
 		int dirty;
 		spin_lock(&inode->i_lock);
@@ -743,12 +771,13 @@ retry_snap:
 		__mark_inode_dirty(inode, dirty);
 	}
 
-out:
+out_put:
 	dout("aio_write %p %llx.%llx %llu~%u  dropping cap refs on %s\n",
 	     inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
 	     ceph_cap_string(got));
 	ceph_put_cap_refs(ci, got);
 
+out:
 	if (ret == -EOLDSNAPC) {
 		dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n",
 		     inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index dfb2831d8d85..095799ba9dd1 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -560,7 +560,8 @@ static int fill_inode(struct inode *inode,
 	struct ceph_mds_reply_inode *info = iinfo->in;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int i;
-	int issued, implemented;
+	int issued = 0, implemented;
+	int updating_inode = 0;
 	struct timespec mtime, atime, ctime;
 	u32 nsplits;
 	struct ceph_buffer *xattr_blob = NULL;
@@ -599,7 +600,8 @@ static int fill_inode(struct inode *inode,
 	if (le64_to_cpu(info->version) > 0 &&
 	    (ci->i_version & ~1) >= le64_to_cpu(info->version))
 		goto no_change;
 
+	updating_inode = 1;
 	issued = __ceph_caps_issued(ci, &implemented);
 	issued |= implemented | __ceph_caps_dirty(ci);
 
@@ -707,17 +709,6 @@ static int fill_inode(struct inode *inode,
 		ci->i_rfiles = le64_to_cpu(info->rfiles);
 		ci->i_rsubdirs = le64_to_cpu(info->rsubdirs);
 		ceph_decode_timespec(&ci->i_rctime, &info->rctime);
-
-		/* set dir completion flag? */
-		if (ci->i_files == 0 && ci->i_subdirs == 0 &&
-		    ceph_snap(inode) == CEPH_NOSNAP &&
-		    (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) &&
-		    (issued & CEPH_CAP_FILE_EXCL) == 0 &&
-		    (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
-			dout(" marking %p complete (empty)\n", inode);
-			/* ci->i_ceph_flags |= CEPH_I_COMPLETE; */
-			ci->i_max_offset = 2;
-		}
 		break;
 	default:
 		pr_err("fill_inode %llx.%llx BAD mode 0%o\n",
@@ -774,6 +765,19 @@ no_change:
 		__ceph_get_fmode(ci, cap_fmode);
 	}
 
+	/* set dir completion flag? */
+	if (S_ISDIR(inode->i_mode) &&
+	    updating_inode &&                 /* didn't jump to no_change */
+	    ci->i_files == 0 && ci->i_subdirs == 0 &&
+	    ceph_snap(inode) == CEPH_NOSNAP &&
+	    (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) &&
+	    (issued & CEPH_CAP_FILE_EXCL) == 0 &&
+	    (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
+		dout(" marking %p complete (empty)\n", inode);
+		/* ci->i_ceph_flags |= CEPH_I_COMPLETE; */
+		ci->i_max_offset = 2;
+	}
+
 	/* update delegation info? */
 	if (dirinfo)
 		ceph_fill_dirfrag(inode, dirinfo);
@@ -805,14 +809,14 @@ static void update_dentry_lease(struct dentry *dentry,
 		return;
 
 	spin_lock(&dentry->d_lock);
-	dout("update_dentry_lease %p mask %d duration %lu ms ttl %lu\n",
-	     dentry, le16_to_cpu(lease->mask), duration, ttl);
+	dout("update_dentry_lease %p duration %lu ms ttl %lu\n",
+	     dentry, duration, ttl);
 
 	/* make lease_rdcache_gen match directory */
 	dir = dentry->d_parent->d_inode;
 	di->lease_shared_gen = ceph_inode(dir)->i_shared_gen;
 
-	if (lease->mask == 0)
+	if (duration == 0)
 		goto out_unlock;
 
 	if (di->lease_gen == session->s_cap_gen &&
@@ -839,11 +843,13 @@ out_unlock:
 /*
  * Set dentry's directory position based on the current dir's max, and
  * order it in d_subdirs, so that dcache_readdir behaves.
+ *
+ * Always called under directory's i_mutex.
  */
 static void ceph_set_dentry_offset(struct dentry *dn)
 {
 	struct dentry *dir = dn->d_parent;
-	struct inode *inode = dn->d_parent->d_inode;
+	struct inode *inode = dir->d_inode;
 	struct ceph_dentry_info *di;
 
 	BUG_ON(!inode);
@@ -1022,9 +1028,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 
 	/* do we have a dn lease? */
 	have_lease = have_dir_cap ||
-		(le16_to_cpu(rinfo->dlease->mask) &
-		 CEPH_LOCK_DN);
-
+		le32_to_cpu(rinfo->dlease->duration_ms);
 	if (!have_lease)
 		dout("fill_trace no dentry lease or dir cap\n");
 
@@ -1560,7 +1564,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct inode *parent_inode = dentry->d_parent->d_inode;
+	struct inode *parent_inode;
 	const unsigned int ia_valid = attr->ia_valid;
 	struct ceph_mds_request *req;
 	struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
@@ -1743,7 +1747,9 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
 		req->r_inode_drop = release;
 		req->r_args.setattr.mask = cpu_to_le32(mask);
 		req->r_num_caps = 1;
+		parent_inode = ceph_get_dentry_parent_inode(dentry);
 		err = ceph_mdsc_do_request(mdsc, parent_inode, req);
+		iput(parent_inode);
 	}
 	dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err,
 	     ceph_cap_string(dirtied), mask);
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index ef0b5f48e13a..3b256b50f7d8 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -38,7 +38,7 @@ static long ceph_ioctl_get_layout(struct file *file, void __user *arg)
 static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
 {
 	struct inode *inode = file->f_dentry->d_inode;
-	struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
+	struct inode *parent_inode;
 	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
 	struct ceph_mds_request *req;
 	struct ceph_ioctl_layout l;
@@ -87,7 +87,9 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
87 req->r_args.setlayout.layout.fl_pg_preferred = 87 req->r_args.setlayout.layout.fl_pg_preferred =
88 cpu_to_le32(l.preferred_osd); 88 cpu_to_le32(l.preferred_osd);
89 89
90 parent_inode = ceph_get_dentry_parent_inode(file->f_dentry);
90 err = ceph_mdsc_do_request(mdsc, parent_inode, req); 91 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
92 iput(parent_inode);
91 ceph_mdsc_put_request(req); 93 ceph_mdsc_put_request(req);
92 return err; 94 return err;
93} 95}
@@ -231,6 +233,14 @@ static long ceph_ioctl_lazyio(struct file *file)
231 return 0; 233 return 0;
232} 234}
233 235
236static long ceph_ioctl_syncio(struct file *file)
237{
238 struct ceph_file_info *fi = file->private_data;
239
240 fi->flags |= CEPH_F_SYNC;
241 return 0;
242}
243
234long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 244long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
235{ 245{
236 dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg); 246 dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg);
@@ -249,6 +259,9 @@ long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
249 259
250 case CEPH_IOC_LAZYIO: 260 case CEPH_IOC_LAZYIO:
251 return ceph_ioctl_lazyio(file); 261 return ceph_ioctl_lazyio(file);
262
263 case CEPH_IOC_SYNCIO:
264 return ceph_ioctl_syncio(file);
252 } 265 }
253 266
254 return -ENOTTY; 267 return -ENOTTY;
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
index 52e8fd74d450..0c5167e43180 100644
--- a/fs/ceph/ioctl.h
+++ b/fs/ceph/ioctl.h
@@ -40,5 +40,6 @@ struct ceph_ioctl_dataloc {
40 struct ceph_ioctl_dataloc) 40 struct ceph_ioctl_dataloc)
41 41
42#define CEPH_IOC_LAZYIO _IO(CEPH_IOCTL_MAGIC, 4) 42#define CEPH_IOC_LAZYIO _IO(CEPH_IOCTL_MAGIC, 4)
43#define CEPH_IOC_SYNCIO _IO(CEPH_IOCTL_MAGIC, 5)
43 44
44#endif 45#endif
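For completeness, a minimal userspace sketch of exercising the new ioctl. The CEPH_IOCTL_MAGIC value is an assumption taken from the (unshown) top of ioctl.h:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

#define CEPH_IOCTL_MAGIC 0x97	/* assumed; see fs/ceph/ioctl.h */
#define CEPH_IOC_SYNCIO _IO(CEPH_IOCTL_MAGIC, 5)

int main(int argc, char **argv)
{
	int fd;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_RDWR);
	if (fd < 0)
		return 1;
	/* subsequent I/O on fd is done synchronously (CEPH_F_SYNC) */
	if (ioctl(fd, CEPH_IOC_SYNCIO) < 0)
		perror("CEPH_IOC_SYNCIO");
	close(fd);
	return 0;
}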
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 0c1d91756528..86c59e16ba74 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -483,22 +483,26 @@ void ceph_mdsc_release_request(struct kref *kref)
483 destroy_reply_info(&req->r_reply_info); 483 destroy_reply_info(&req->r_reply_info);
484 } 484 }
485 if (req->r_inode) { 485 if (req->r_inode) {
486 ceph_put_cap_refs(ceph_inode(req->r_inode), 486 ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
487 CEPH_CAP_PIN);
488 iput(req->r_inode); 487 iput(req->r_inode);
489 } 488 }
490 if (req->r_locked_dir) 489 if (req->r_locked_dir)
491 ceph_put_cap_refs(ceph_inode(req->r_locked_dir), 490 ceph_put_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
492 CEPH_CAP_PIN);
493 if (req->r_target_inode) 491 if (req->r_target_inode)
494 iput(req->r_target_inode); 492 iput(req->r_target_inode);
495 if (req->r_dentry) 493 if (req->r_dentry)
496 dput(req->r_dentry); 494 dput(req->r_dentry);
497 if (req->r_old_dentry) { 495 if (req->r_old_dentry) {
498 ceph_put_cap_refs( 496 /*
499 ceph_inode(req->r_old_dentry->d_parent->d_inode), 497 * track (and drop pins for) r_old_dentry_dir
500 CEPH_CAP_PIN); 498 * separately, since r_old_dentry's d_parent may have
499 * changed between the dir mutex being dropped and
500 * this request being freed.
501 */
502 ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir),
503 CEPH_CAP_PIN);
501 dput(req->r_old_dentry); 504 dput(req->r_old_dentry);
505 iput(req->r_old_dentry_dir);
502 } 506 }
503 kfree(req->r_path1); 507 kfree(req->r_path1);
504 kfree(req->r_path2); 508 kfree(req->r_path2);
@@ -617,6 +621,12 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
617 */ 621 */
618struct dentry *get_nonsnap_parent(struct dentry *dentry) 622struct dentry *get_nonsnap_parent(struct dentry *dentry)
619{ 623{
624 /*
625 * we don't need to worry about protecting the d_parent access
 626	 * here because we never rename inside the snapped namespace
627 * except to resplice to another snapdir, and either the old or new
628 * result is a valid result.
629 */
620 while (!IS_ROOT(dentry) && ceph_snap(dentry->d_inode) != CEPH_NOSNAP) 630 while (!IS_ROOT(dentry) && ceph_snap(dentry->d_inode) != CEPH_NOSNAP)
621 dentry = dentry->d_parent; 631 dentry = dentry->d_parent;
622 return dentry; 632 return dentry;
@@ -652,7 +662,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
652 if (req->r_inode) { 662 if (req->r_inode) {
653 inode = req->r_inode; 663 inode = req->r_inode;
654 } else if (req->r_dentry) { 664 } else if (req->r_dentry) {
655 struct inode *dir = req->r_dentry->d_parent->d_inode; 665 /* ignore race with rename; old or new d_parent is okay */
666 struct dentry *parent = req->r_dentry->d_parent;
667 struct inode *dir = parent->d_inode;
656 668
657 if (dir->i_sb != mdsc->fsc->sb) { 669 if (dir->i_sb != mdsc->fsc->sb) {
658 /* not this fs! */ 670 /* not this fs! */
@@ -660,8 +672,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
660 } else if (ceph_snap(dir) != CEPH_NOSNAP) { 672 } else if (ceph_snap(dir) != CEPH_NOSNAP) {
661 /* direct snapped/virtual snapdir requests 673 /* direct snapped/virtual snapdir requests
662 * based on parent dir inode */ 674 * based on parent dir inode */
663 struct dentry *dn = 675 struct dentry *dn = get_nonsnap_parent(parent);
664 get_nonsnap_parent(req->r_dentry->d_parent);
665 inode = dn->d_inode; 676 inode = dn->d_inode;
666 dout("__choose_mds using nonsnap parent %p\n", inode); 677 dout("__choose_mds using nonsnap parent %p\n", inode);
667 } else if (req->r_dentry->d_inode) { 678 } else if (req->r_dentry->d_inode) {
@@ -670,7 +681,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
670 } else { 681 } else {
671 /* dir + name */ 682 /* dir + name */
672 inode = dir; 683 inode = dir;
673 hash = ceph_dentry_hash(req->r_dentry); 684 hash = ceph_dentry_hash(dir, req->r_dentry);
674 is_hash = true; 685 is_hash = true;
675 } 686 }
676 } 687 }
@@ -1584,7 +1595,7 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
1584 r = build_dentry_path(rdentry, ppath, pathlen, ino, freepath); 1595 r = build_dentry_path(rdentry, ppath, pathlen, ino, freepath);
1585 dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, 1596 dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
1586 *ppath); 1597 *ppath);
1587 } else if (rpath) { 1598 } else if (rpath || rino) {
1588 *ino = rino; 1599 *ino = rino;
1589 *ppath = rpath; 1600 *ppath = rpath;
1590 *pathlen = strlen(rpath); 1601 *pathlen = strlen(rpath);
@@ -1931,9 +1942,8 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
1931 if (req->r_locked_dir) 1942 if (req->r_locked_dir)
1932 ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN); 1943 ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
1933 if (req->r_old_dentry) 1944 if (req->r_old_dentry)
1934 ceph_get_cap_refs( 1945 ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
1935 ceph_inode(req->r_old_dentry->d_parent->d_inode), 1946 CEPH_CAP_PIN);
1936 CEPH_CAP_PIN);
1937 1947
1938 /* issue */ 1948 /* issue */
1939 mutex_lock(&mdsc->mutex); 1949 mutex_lock(&mdsc->mutex);
@@ -2714,7 +2724,6 @@ static void handle_lease(struct ceph_mds_client *mdsc,
2714 struct ceph_mds_lease *h = msg->front.iov_base; 2724 struct ceph_mds_lease *h = msg->front.iov_base;
2715 u32 seq; 2725 u32 seq;
2716 struct ceph_vino vino; 2726 struct ceph_vino vino;
2717 int mask;
2718 struct qstr dname; 2727 struct qstr dname;
2719 int release = 0; 2728 int release = 0;
2720 2729
@@ -2725,7 +2734,6 @@ static void handle_lease(struct ceph_mds_client *mdsc,
2725 goto bad; 2734 goto bad;
2726 vino.ino = le64_to_cpu(h->ino); 2735 vino.ino = le64_to_cpu(h->ino);
2727 vino.snap = CEPH_NOSNAP; 2736 vino.snap = CEPH_NOSNAP;
2728 mask = le16_to_cpu(h->mask);
2729 seq = le32_to_cpu(h->seq); 2737 seq = le32_to_cpu(h->seq);
2730 dname.name = (void *)h + sizeof(*h) + sizeof(u32); 2738 dname.name = (void *)h + sizeof(*h) + sizeof(u32);
2731 dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32); 2739 dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32);
@@ -2737,8 +2745,8 @@ static void handle_lease(struct ceph_mds_client *mdsc,
2737 2745
2738 /* lookup inode */ 2746 /* lookup inode */
2739 inode = ceph_find_inode(sb, vino); 2747 inode = ceph_find_inode(sb, vino);
2740 dout("handle_lease %s, mask %d, ino %llx %p %.*s\n", 2748 dout("handle_lease %s, ino %llx %p %.*s\n",
2741 ceph_lease_op_name(h->action), mask, vino.ino, inode, 2749 ceph_lease_op_name(h->action), vino.ino, inode,
2742 dname.len, dname.name); 2750 dname.len, dname.name);
2743 if (inode == NULL) { 2751 if (inode == NULL) {
2744 dout("handle_lease no inode %llx\n", vino.ino); 2752 dout("handle_lease no inode %llx\n", vino.ino);
@@ -2828,7 +2836,6 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
2828 return; 2836 return;
2829 lease = msg->front.iov_base; 2837 lease = msg->front.iov_base;
2830 lease->action = action; 2838 lease->action = action;
2831 lease->mask = cpu_to_le16(1);
2832 lease->ino = cpu_to_le64(ceph_vino(inode).ino); 2839 lease->ino = cpu_to_le64(ceph_vino(inode).ino);
2833 lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap); 2840 lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap);
2834 lease->seq = cpu_to_le32(seq); 2841 lease->seq = cpu_to_le32(seq);
@@ -2850,7 +2857,7 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
2850 * Pass @inode always, @dentry is optional. 2857 * Pass @inode always, @dentry is optional.
2851 */ 2858 */
2852void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode, 2859void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
2853 struct dentry *dentry, int mask) 2860 struct dentry *dentry)
2854{ 2861{
2855 struct ceph_dentry_info *di; 2862 struct ceph_dentry_info *di;
2856 struct ceph_mds_session *session; 2863 struct ceph_mds_session *session;
@@ -2858,7 +2865,6 @@ void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
2858 2865
2859 BUG_ON(inode == NULL); 2866 BUG_ON(inode == NULL);
2860 BUG_ON(dentry == NULL); 2867 BUG_ON(dentry == NULL);
2861 BUG_ON(mask == 0);
2862 2868
2863 /* is dentry lease valid? */ 2869 /* is dentry lease valid? */
2864 spin_lock(&dentry->d_lock); 2870 spin_lock(&dentry->d_lock);
@@ -2868,8 +2874,8 @@ void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
2868 di->lease_gen != di->lease_session->s_cap_gen || 2874 di->lease_gen != di->lease_session->s_cap_gen ||
2869 !time_before(jiffies, dentry->d_time)) { 2875 !time_before(jiffies, dentry->d_time)) {
2870 dout("lease_release inode %p dentry %p -- " 2876 dout("lease_release inode %p dentry %p -- "
2871 "no lease on %d\n", 2877 "no lease\n",
2872 inode, dentry, mask); 2878 inode, dentry);
2873 spin_unlock(&dentry->d_lock); 2879 spin_unlock(&dentry->d_lock);
2874 return; 2880 return;
2875 } 2881 }
@@ -2880,8 +2886,8 @@ void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
2880 __ceph_mdsc_drop_dentry_lease(dentry); 2886 __ceph_mdsc_drop_dentry_lease(dentry);
2881 spin_unlock(&dentry->d_lock); 2887 spin_unlock(&dentry->d_lock);
2882 2888
2883 dout("lease_release inode %p dentry %p mask %d to mds%d\n", 2889 dout("lease_release inode %p dentry %p to mds%d\n",
2884 inode, dentry, mask, session->s_mds); 2890 inode, dentry, session->s_mds);
2885 ceph_mdsc_lease_send_msg(session, inode, dentry, 2891 ceph_mdsc_lease_send_msg(session, inode, dentry,
2886 CEPH_MDS_LEASE_RELEASE, seq); 2892 CEPH_MDS_LEASE_RELEASE, seq);
2887 ceph_put_mds_session(session); 2893 ceph_put_mds_session(session);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 7d8a0d662d56..4bb239921dbd 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -171,6 +171,7 @@ struct ceph_mds_request {
171 struct inode *r_inode; /* arg1 */ 171 struct inode *r_inode; /* arg1 */
172 struct dentry *r_dentry; /* arg1 */ 172 struct dentry *r_dentry; /* arg1 */
173 struct dentry *r_old_dentry; /* arg2: rename from or link from */ 173 struct dentry *r_old_dentry; /* arg2: rename from or link from */
174 struct inode *r_old_dentry_dir; /* arg2: old dentry's parent dir */
174 char *r_path1, *r_path2; 175 char *r_path1, *r_path2;
175 struct ceph_vino r_ino1, r_ino2; 176 struct ceph_vino r_ino1, r_ino2;
176 177
@@ -333,7 +334,7 @@ extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
333 334
334extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, 335extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
335 struct inode *inode, 336 struct inode *inode,
336 struct dentry *dn, int mask); 337 struct dentry *dn);
337 338
338extern void ceph_invalidate_dir_request(struct ceph_mds_request *req); 339extern void ceph_invalidate_dir_request(struct ceph_mds_request *req);
339 340
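The new r_old_dentry_dir field only helps if request setup pins the old parent explicitly. A sketch of what a rename/link call site is expected to do (the actual call sites live in fs/ceph/dir.c, outside this diff; names are illustrative):

	/* at request-build time, under the directory i_mutex */
	req->r_old_dentry = dget(old_dentry);
	req->r_old_dentry_dir = igrab(old_dir);	/* pin the parent inode */

	/* at release time (ceph_mdsc_release_request above), the cap pin
	 * and the inode reference are dropped against the saved dir, so
	 * a rename that changes d_parent in between cannot misdirect
	 * the put */
	ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir), CEPH_CAP_PIN);
	dput(req->r_old_dentry);
	iput(req->r_old_dentry_dir);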
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 54b14de2e729..e26437191333 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -449,6 +449,15 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
449 spin_lock(&inode->i_lock); 449 spin_lock(&inode->i_lock);
450 used = __ceph_caps_used(ci); 450 used = __ceph_caps_used(ci);
451 dirty = __ceph_caps_dirty(ci); 451 dirty = __ceph_caps_dirty(ci);
452
453 /*
454 * If there is a write in progress, treat that as a dirty Fw,
455 * even though it hasn't completed yet; by the time we finish
456 * up this capsnap it will be.
457 */
458 if (used & CEPH_CAP_FILE_WR)
459 dirty |= CEPH_CAP_FILE_WR;
460
452 if (__ceph_have_pending_cap_snap(ci)) { 461 if (__ceph_have_pending_cap_snap(ci)) {
453 /* there is no point in queuing multiple "pending" cap_snaps, 462 /* there is no point in queuing multiple "pending" cap_snaps,
454 as no new writes are allowed to start when pending, so any 463 as no new writes are allowed to start when pending, so any
@@ -456,13 +465,19 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
456 cap_snap. lucky us. */ 465 cap_snap. lucky us. */
457 dout("queue_cap_snap %p already pending\n", inode); 466 dout("queue_cap_snap %p already pending\n", inode);
458 kfree(capsnap); 467 kfree(capsnap);
459 } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR) || 468 } else if (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL|
460 (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL| 469 CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR)) {
461 CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR))) {
462 struct ceph_snap_context *snapc = ci->i_head_snapc; 470 struct ceph_snap_context *snapc = ci->i_head_snapc;
463 471
464 dout("queue_cap_snap %p cap_snap %p queuing under %p\n", inode, 472 /*
465 capsnap, snapc); 473 * if we are a sync write, we may need to go to the snaprealm
474 * to get the current snapc.
475 */
476 if (!snapc)
477 snapc = ci->i_snap_realm->cached_context;
478
479 dout("queue_cap_snap %p cap_snap %p queuing under %p %s\n",
480 inode, capsnap, snapc, ceph_cap_string(dirty));
466 ihold(inode); 481 ihold(inode);
467 482
468 atomic_set(&capsnap->nref, 1); 483 atomic_set(&capsnap->nref, 1);
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index f2f77fd3c14c..88bacaf385d9 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -73,8 +73,7 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
73 */ 73 */
74 buf->f_bsize = 1 << CEPH_BLOCK_SHIFT; 74 buf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
75 buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10); 75 buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10);
76 buf->f_bfree = (le64_to_cpu(st.kb) - le64_to_cpu(st.kb_used)) >> 76 buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
77 (CEPH_BLOCK_SHIFT-10);
78 buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); 77 buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
79 78
80 buf->f_files = le64_to_cpu(st.num_objects); 79 buf->f_files = le64_to_cpu(st.num_objects);
@@ -780,6 +779,10 @@ static int ceph_register_bdi(struct super_block *sb,
780 fsc->backing_dev_info.ra_pages = 779 fsc->backing_dev_info.ra_pages =
781 (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1) 780 (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1)
782 >> PAGE_SHIFT; 781 >> PAGE_SHIFT;
782 else
783 fsc->backing_dev_info.ra_pages =
784 default_backing_dev_info.ra_pages;
785
783 err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%d", 786 err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%d",
784 atomic_long_inc_return(&bdi_seq)); 787 atomic_long_inc_return(&bdi_seq));
785 if (!err) 788 if (!err)
@@ -810,8 +813,8 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
810 fsc = create_fs_client(fsopt, opt); 813 fsc = create_fs_client(fsopt, opt);
811 if (IS_ERR(fsc)) { 814 if (IS_ERR(fsc)) {
812 res = ERR_CAST(fsc); 815 res = ERR_CAST(fsc);
813 kfree(fsopt); 816 destroy_mount_options(fsopt);
814 kfree(opt); 817 ceph_destroy_options(opt);
815 goto out_final; 818 goto out_final;
816 } 819 }
817 820
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 30446b144e3d..a23eed526f05 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -543,13 +543,16 @@ extern void ceph_reservation_status(struct ceph_fs_client *client,
543/* 543/*
544 * we keep buffered readdir results attached to file->private_data 544 * we keep buffered readdir results attached to file->private_data
545 */ 545 */
546#define CEPH_F_SYNC 1
547#define CEPH_F_ATEND 2
548
546struct ceph_file_info { 549struct ceph_file_info {
547 int fmode; /* initialized on open */ 550 short fmode; /* initialized on open */
551 short flags; /* CEPH_F_* */
548 552
549 /* readdir: position within the dir */ 553 /* readdir: position within the dir */
550 u32 frag; 554 u32 frag;
551 struct ceph_mds_request *last_readdir; 555 struct ceph_mds_request *last_readdir;
552 int at_end;
553 556
554 /* readdir: position within a frag */ 557 /* readdir: position within a frag */
555 unsigned offset; /* offset of last chunk, adjusted for . and .. */ 558 unsigned offset; /* offset of last chunk, adjusted for . and .. */
@@ -789,6 +792,8 @@ extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops,
789 ceph_snapdir_dentry_ops; 792 ceph_snapdir_dentry_ops;
790 793
791extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry); 794extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry);
795extern int ceph_handle_snapdir(struct ceph_mds_request *req,
796 struct dentry *dentry, int err);
792extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, 797extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
793 struct dentry *dentry, int err); 798 struct dentry *dentry, int err);
794 799
@@ -796,7 +801,8 @@ extern void ceph_dentry_lru_add(struct dentry *dn);
796extern void ceph_dentry_lru_touch(struct dentry *dn); 801extern void ceph_dentry_lru_touch(struct dentry *dn);
797extern void ceph_dentry_lru_del(struct dentry *dn); 802extern void ceph_dentry_lru_del(struct dentry *dn);
798extern void ceph_invalidate_dentry_lease(struct dentry *dentry); 803extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
799extern unsigned ceph_dentry_hash(struct dentry *dn); 804extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn);
805extern struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry);
800 806
801/* 807/*
802 * our d_ops vary depending on whether the inode is live, 808 * our d_ops vary depending on whether the inode is live,
@@ -819,14 +825,6 @@ extern int ceph_encode_locks(struct inode *i, struct ceph_pagelist *p,
819 int p_locks, int f_locks); 825 int p_locks, int f_locks);
820extern int lock_to_ceph_filelock(struct file_lock *fl, struct ceph_filelock *c); 826extern int lock_to_ceph_filelock(struct file_lock *fl, struct ceph_filelock *c);
821 827
822static inline struct inode *get_dentry_parent_inode(struct dentry *dentry)
823{
824 if (dentry && dentry->d_parent)
825 return dentry->d_parent->d_inode;
826
827 return NULL;
828}
829
830/* debugfs.c */ 828/* debugfs.c */
831extern int ceph_fs_debugfs_init(struct ceph_fs_client *client); 829extern int ceph_fs_debugfs_init(struct ceph_fs_client *client);
832extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client); 830extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client);
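With fmode and flags packed into two shorts, and at_end folded into the CEPH_F_ATEND bit, consumers now test bits instead of separate fields. A hedged sketch of the expected usage (the real readdir/read paths are in fs/ceph/dir.c and fs/ceph/file.c, not shown here; do_sync_io is an illustrative name):

	struct ceph_file_info *fi = file->private_data;

	if (fi->flags & CEPH_F_ATEND)
		return 0;		/* readdir already hit the end */

	if (fi->flags & CEPH_F_SYNC)	/* set by CEPH_IOC_SYNCIO */
		return do_sync_io(file, buf, len, off);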
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index f42d730f1b66..96c6739a0280 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -629,7 +629,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
629 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); 629 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
630 struct inode *inode = dentry->d_inode; 630 struct inode *inode = dentry->d_inode;
631 struct ceph_inode_info *ci = ceph_inode(inode); 631 struct ceph_inode_info *ci = ceph_inode(inode);
632 struct inode *parent_inode = dentry->d_parent->d_inode; 632 struct inode *parent_inode;
633 struct ceph_mds_request *req; 633 struct ceph_mds_request *req;
634 struct ceph_mds_client *mdsc = fsc->mdsc; 634 struct ceph_mds_client *mdsc = fsc->mdsc;
635 int err; 635 int err;
@@ -677,7 +677,9 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
677 req->r_data_len = size; 677 req->r_data_len = size;
678 678
679 dout("xattr.ver (before): %lld\n", ci->i_xattrs.version); 679 dout("xattr.ver (before): %lld\n", ci->i_xattrs.version);
680 parent_inode = ceph_get_dentry_parent_inode(dentry);
680 err = ceph_mdsc_do_request(mdsc, parent_inode, req); 681 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
682 iput(parent_inode);
681 ceph_mdsc_put_request(req); 683 ceph_mdsc_put_request(req);
682 dout("xattr.ver (after): %lld\n", ci->i_xattrs.version); 684 dout("xattr.ver (after): %lld\n", ci->i_xattrs.version);
683 685
@@ -788,7 +790,7 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name)
788 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); 790 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
789 struct ceph_mds_client *mdsc = fsc->mdsc; 791 struct ceph_mds_client *mdsc = fsc->mdsc;
790 struct inode *inode = dentry->d_inode; 792 struct inode *inode = dentry->d_inode;
791 struct inode *parent_inode = dentry->d_parent->d_inode; 793 struct inode *parent_inode;
792 struct ceph_mds_request *req; 794 struct ceph_mds_request *req;
793 int err; 795 int err;
794 796
@@ -802,7 +804,9 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name)
802 req->r_num_caps = 1; 804 req->r_num_caps = 1;
803 req->r_path2 = kstrdup(name, GFP_NOFS); 805 req->r_path2 = kstrdup(name, GFP_NOFS);
804 806
807 parent_inode = ceph_get_dentry_parent_inode(dentry);
805 err = ceph_mdsc_do_request(mdsc, parent_inode, req); 808 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
809 iput(parent_inode);
806 ceph_mdsc_put_request(req); 810 ceph_mdsc_put_request(req);
807 return err; 811 return err;
808} 812}
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 2fe3cf13b2e9..6d40656e1e29 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -176,7 +176,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
176 176
177#ifdef CONFIG_CIFS_STATS2 177#ifdef CONFIG_CIFS_STATS2
178 seq_printf(m, " In Send: %d In MaxReq Wait: %d", 178 seq_printf(m, " In Send: %d In MaxReq Wait: %d",
179 atomic_read(&server->inSend), 179 atomic_read(&server->in_send),
180 atomic_read(&server->num_waiters)); 180 atomic_read(&server->num_waiters));
181#endif 181#endif
182 182
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 8d8f28c94c0f..6873bb634a97 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -141,10 +141,11 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
141 141
142 rc = dns_resolve_server_name_to_ip(*devname, &srvIP); 142 rc = dns_resolve_server_name_to_ip(*devname, &srvIP);
143 if (rc < 0) { 143 if (rc < 0) {
144 cERROR(1, "%s: Failed to resolve server part of %s to IP: %d", 144 cFYI(1, "%s: Failed to resolve server part of %s to IP: %d",
145 __func__, *devname, rc); 145 __func__, *devname, rc);
146 goto compose_mount_options_err; 146 goto compose_mount_options_err;
147 } 147 }
148
148 /* md_len = strlen(...) + 12 for 'sep+prefixpath=' 149 /* md_len = strlen(...) + 12 for 'sep+prefixpath='
149 * assuming that we have 'unc=' and 'ip=' in 150 * assuming that we have 'unc=' and 'ip=' in
150 * the original sb_mountdata 151 * the original sb_mountdata
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 21de1d6d5849..d0f59faefb78 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -991,24 +991,6 @@ struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
991 return pntsd; 991 return pntsd;
992} 992}
993 993
994static int set_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, __u16 fid,
995 struct cifs_ntsd *pnntsd, u32 acllen)
996{
997 int xid, rc;
998 struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
999
1000 if (IS_ERR(tlink))
1001 return PTR_ERR(tlink);
1002
1003 xid = GetXid();
1004 rc = CIFSSMBSetCIFSACL(xid, tlink_tcon(tlink), fid, pnntsd, acllen);
1005 FreeXid(xid);
1006 cifs_put_tlink(tlink);
1007
1008 cFYI(DBG2, "SetCIFSACL rc = %d", rc);
1009 return rc;
1010}
1011
1012static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path, 994static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path,
1013 struct cifs_ntsd *pnntsd, u32 acllen) 995 struct cifs_ntsd *pnntsd, u32 acllen)
1014{ 996{
@@ -1047,18 +1029,10 @@ int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
1047 struct inode *inode, const char *path) 1029 struct inode *inode, const char *path)
1048{ 1030{
1049 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 1031 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
1050 struct cifsFileInfo *open_file;
1051 int rc;
1052 1032
1053 cFYI(DBG2, "set ACL for %s from mode 0x%x", path, inode->i_mode); 1033 cFYI(DBG2, "set ACL for %s from mode 0x%x", path, inode->i_mode);
1054 1034
1055 open_file = find_readable_file(CIFS_I(inode), true); 1035 return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen);
1056 if (!open_file)
1057 return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen);
1058
1059 rc = set_cifs_acl_by_fid(cifs_sb, open_file->netfid, pnntsd, acllen);
1060 cifsFileInfo_put(open_file);
1061 return rc;
1062} 1036}
1063 1037
1064/* Translate the CIFS ACL (similar to NTFS ACL) for a file into mode bits */ 1038
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 5a0ee7f2af06..e76bfeb68267 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -52,19 +52,29 @@ static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu,
52 52
53 rc = crypto_shash_init(&server->secmech.sdescmd5->shash); 53 rc = crypto_shash_init(&server->secmech.sdescmd5->shash);
54 if (rc) { 54 if (rc) {
55 cERROR(1, "%s: Oould not init md5\n", __func__); 55 cERROR(1, "%s: Could not init md5\n", __func__);
56 return rc; 56 return rc;
57 } 57 }
58 58
59 crypto_shash_update(&server->secmech.sdescmd5->shash, 59 rc = crypto_shash_update(&server->secmech.sdescmd5->shash,
60 server->session_key.response, server->session_key.len); 60 server->session_key.response, server->session_key.len);
61 if (rc) {
62 cERROR(1, "%s: Could not update with response\n", __func__);
63 return rc;
64 }
61 65
62 crypto_shash_update(&server->secmech.sdescmd5->shash, 66 rc = crypto_shash_update(&server->secmech.sdescmd5->shash,
63 cifs_pdu->Protocol, be32_to_cpu(cifs_pdu->smb_buf_length)); 67 cifs_pdu->Protocol, be32_to_cpu(cifs_pdu->smb_buf_length));
68 if (rc) {
69 cERROR(1, "%s: Could not update with payload\n", __func__);
70 return rc;
71 }
64 72
65 rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature); 73 rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature);
74 if (rc)
75 cERROR(1, "%s: Could not generate md5 hash\n", __func__);
66 76
67 return 0; 77 return rc;
68} 78}
69 79
70/* must be called with server->srv_mutex held */ 80/* must be called with server->srv_mutex held */
@@ -77,9 +87,15 @@ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
77 if ((cifs_pdu == NULL) || (server == NULL)) 87 if ((cifs_pdu == NULL) || (server == NULL))
78 return -EINVAL; 88 return -EINVAL;
79 89
80 if ((cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) == 0) 90 if (!(cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) ||
91 server->tcpStatus == CifsNeedNegotiate)
81 return rc; 92 return rc;
82 93
94 if (!server->session_estab) {
95 strncpy(cifs_pdu->Signature.SecuritySignature, "BSRSPYL", 8);
96 return rc;
97 }
98
83 cifs_pdu->Signature.Sequence.SequenceNumber = 99 cifs_pdu->Signature.Sequence.SequenceNumber =
84 cpu_to_le32(server->sequence_number); 100 cpu_to_le32(server->sequence_number);
85 cifs_pdu->Signature.Sequence.Reserved = 0; 101 cifs_pdu->Signature.Sequence.Reserved = 0;
@@ -112,12 +128,16 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
112 128
113 rc = crypto_shash_init(&server->secmech.sdescmd5->shash); 129 rc = crypto_shash_init(&server->secmech.sdescmd5->shash);
114 if (rc) { 130 if (rc) {
115 cERROR(1, "%s: Oould not init md5\n", __func__); 131 cERROR(1, "%s: Could not init md5\n", __func__);
116 return rc; 132 return rc;
117 } 133 }
118 134
119 crypto_shash_update(&server->secmech.sdescmd5->shash, 135 rc = crypto_shash_update(&server->secmech.sdescmd5->shash,
120 server->session_key.response, server->session_key.len); 136 server->session_key.response, server->session_key.len);
137 if (rc) {
138 cERROR(1, "%s: Could not update with response\n", __func__);
139 return rc;
140 }
121 141
122 for (i = 0; i < n_vec; i++) { 142 for (i = 0; i < n_vec; i++) {
123 if (iov[i].iov_len == 0) 143 if (iov[i].iov_len == 0)
@@ -131,14 +151,24 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
131 if (i == 0) { 151 if (i == 0) {
132 if (iov[0].iov_len <= 8) /* cmd field at offset 9 */ 152 if (iov[0].iov_len <= 8) /* cmd field at offset 9 */
133 break; /* nothing to sign or corrupt header */ 153 break; /* nothing to sign or corrupt header */
154 rc =
134 crypto_shash_update(&server->secmech.sdescmd5->shash, 155 crypto_shash_update(&server->secmech.sdescmd5->shash,
135 iov[i].iov_base + 4, iov[i].iov_len - 4); 156 iov[i].iov_base + 4, iov[i].iov_len - 4);
136 } else 157 } else {
158 rc =
137 crypto_shash_update(&server->secmech.sdescmd5->shash, 159 crypto_shash_update(&server->secmech.sdescmd5->shash,
138 iov[i].iov_base, iov[i].iov_len); 160 iov[i].iov_base, iov[i].iov_len);
161 }
162 if (rc) {
163 cERROR(1, "%s: Could not update with payload\n",
164 __func__);
165 return rc;
166 }
139 } 167 }
140 168
141 rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature); 169 rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature);
170 if (rc)
171 cERROR(1, "%s: Could not generate md5 hash\n", __func__);
142 172
143 return rc; 173 return rc;
144} 174}
@@ -154,8 +184,14 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
154 if ((cifs_pdu == NULL) || (server == NULL)) 184 if ((cifs_pdu == NULL) || (server == NULL))
155 return -EINVAL; 185 return -EINVAL;
156 186
157 if ((cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) == 0) 187 if (!(cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) ||
188 server->tcpStatus == CifsNeedNegotiate)
189 return rc;
190
191 if (!server->session_estab) {
192 strncpy(cifs_pdu->Signature.SecuritySignature, "BSRSPYL", 8);
158 return rc; 193 return rc;
194 }
159 195
160 cifs_pdu->Signature.Sequence.SequenceNumber = 196 cifs_pdu->Signature.Sequence.SequenceNumber =
161 cpu_to_le32(server->sequence_number); 197 cpu_to_le32(server->sequence_number);
@@ -463,8 +499,12 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
463 /* calculate md4 hash of password */ 499 /* calculate md4 hash of password */
464 E_md4hash(ses->password, nt_hash); 500 E_md4hash(ses->password, nt_hash);
465 501
466 crypto_shash_setkey(ses->server->secmech.hmacmd5, nt_hash, 502 rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, nt_hash,
467 CIFS_NTHASH_SIZE); 503 CIFS_NTHASH_SIZE);
504 if (rc) {
505 cERROR(1, "%s: Could not set NT Hash as a key", __func__);
506 return rc;
507 }
468 508
469 rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); 509 rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash);
470 if (rc) { 510 if (rc) {
@@ -478,13 +518,18 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
478 if (user == NULL) { 518 if (user == NULL) {
479 cERROR(1, "calc_ntlmv2_hash: user mem alloc failure\n"); 519 cERROR(1, "calc_ntlmv2_hash: user mem alloc failure\n");
480 rc = -ENOMEM; 520 rc = -ENOMEM;
481 goto calc_exit_2; 521 return rc;
482 } 522 }
483 len = cifs_strtoUCS((__le16 *)user, ses->user_name, len, nls_cp); 523 len = cifs_strtoUCS((__le16 *)user, ses->user_name, len, nls_cp);
484 UniStrupr(user); 524 UniStrupr(user);
485 525
486 crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, 526 rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
487 (char *)user, 2 * len); 527 (char *)user, 2 * len);
528 kfree(user);
529 if (rc) {
530 cERROR(1, "%s: Could not update with user\n", __func__);
531 return rc;
532 }
488 533
489 /* convert ses->domainName to unicode and uppercase */ 534 /* convert ses->domainName to unicode and uppercase */
490 if (ses->domainName) { 535 if (ses->domainName) {
@@ -494,13 +539,19 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
494 if (domain == NULL) { 539 if (domain == NULL) {
495 cERROR(1, "calc_ntlmv2_hash: domain mem alloc failure"); 540 cERROR(1, "calc_ntlmv2_hash: domain mem alloc failure");
496 rc = -ENOMEM; 541 rc = -ENOMEM;
497 goto calc_exit_1; 542 return rc;
498 } 543 }
499 len = cifs_strtoUCS((__le16 *)domain, ses->domainName, len, 544 len = cifs_strtoUCS((__le16 *)domain, ses->domainName, len,
500 nls_cp); 545 nls_cp);
546 rc =
501 crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, 547 crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
502 (char *)domain, 2 * len); 548 (char *)domain, 2 * len);
503 kfree(domain); 549 kfree(domain);
550 if (rc) {
551 cERROR(1, "%s: Could not update with domain\n",
552 __func__);
553 return rc;
554 }
504 } else if (ses->serverName) { 555 } else if (ses->serverName) {
505 len = strlen(ses->serverName); 556 len = strlen(ses->serverName);
506 557
@@ -508,21 +559,26 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
508 if (server == NULL) { 559 if (server == NULL) {
509 cERROR(1, "calc_ntlmv2_hash: server mem alloc failure"); 560 cERROR(1, "calc_ntlmv2_hash: server mem alloc failure");
510 rc = -ENOMEM; 561 rc = -ENOMEM;
511 goto calc_exit_1; 562 return rc;
512 } 563 }
513 len = cifs_strtoUCS((__le16 *)server, ses->serverName, len, 564 len = cifs_strtoUCS((__le16 *)server, ses->serverName, len,
514 nls_cp); 565 nls_cp);
566 rc =
515 crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, 567 crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
516 (char *)server, 2 * len); 568 (char *)server, 2 * len);
517 kfree(server); 569 kfree(server);
570 if (rc) {
571 cERROR(1, "%s: Could not update with server\n",
572 __func__);
573 return rc;
574 }
518 } 575 }
519 576
520 rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, 577 rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash,
521 ntlmv2_hash); 578 ntlmv2_hash);
579 if (rc)
580 cERROR(1, "%s: Could not generate md5 hash\n", __func__);
522 581
523calc_exit_1:
524 kfree(user);
525calc_exit_2:
526 return rc; 582 return rc;
527} 583}
528 584
@@ -537,8 +593,12 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash)
537 return -1; 593 return -1;
538 } 594 }
539 595
540 crypto_shash_setkey(ses->server->secmech.hmacmd5, 596 rc = crypto_shash_setkey(ses->server->secmech.hmacmd5,
541 ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); 597 ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
598 if (rc) {
599 cERROR(1, "%s: Could not set NTLMV2 Hash as a key", __func__);
600 return rc;
601 }
542 602
543 rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); 603 rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash);
544 if (rc) { 604 if (rc) {
@@ -552,11 +612,17 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash)
552 else 612 else
553 memcpy(ses->auth_key.response + offset, 613 memcpy(ses->auth_key.response + offset,
554 ses->server->cryptkey, CIFS_SERVER_CHALLENGE_SIZE); 614 ses->server->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
555 crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, 615 rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
556 ses->auth_key.response + offset, ses->auth_key.len - offset); 616 ses->auth_key.response + offset, ses->auth_key.len - offset);
617 if (rc) {
618 cERROR(1, "%s: Could not update with response\n", __func__);
619 return rc;
620 }
557 621
558 rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, 622 rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash,
559 ses->auth_key.response + CIFS_SESS_KEY_SIZE); 623 ses->auth_key.response + CIFS_SESS_KEY_SIZE);
624 if (rc)
625 cERROR(1, "%s: Could not generate md5 hash\n", __func__);
560 626
561 return rc; 627 return rc;
562} 628}
@@ -626,8 +692,12 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
626 } 692 }
627 693
628 /* now calculate the session key for NTLMv2 */ 694 /* now calculate the session key for NTLMv2 */
629 crypto_shash_setkey(ses->server->secmech.hmacmd5, 695 rc = crypto_shash_setkey(ses->server->secmech.hmacmd5,
630 ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); 696 ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
697 if (rc) {
698 cERROR(1, "%s: Could not set NTLMV2 Hash as a key", __func__);
699 goto setup_ntlmv2_rsp_ret;
700 }
631 701
632 rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); 702 rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash);
633 if (rc) { 703 if (rc) {
@@ -635,12 +705,18 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
635 goto setup_ntlmv2_rsp_ret; 705 goto setup_ntlmv2_rsp_ret;
636 } 706 }
637 707
638 crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, 708 rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
639 ses->auth_key.response + CIFS_SESS_KEY_SIZE, 709 ses->auth_key.response + CIFS_SESS_KEY_SIZE,
640 CIFS_HMAC_MD5_HASH_SIZE); 710 CIFS_HMAC_MD5_HASH_SIZE);
711 if (rc) {
712 cERROR(1, "%s: Could not update with response\n", __func__);
713 goto setup_ntlmv2_rsp_ret;
714 }
641 715
642 rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, 716 rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash,
643 ses->auth_key.response); 717 ses->auth_key.response);
718 if (rc)
719 cERROR(1, "%s: Could not generate md5 hash\n", __func__);
644 720
645setup_ntlmv2_rsp_ret: 721setup_ntlmv2_rsp_ret:
646 kfree(tiblob); 722 kfree(tiblob);
@@ -668,8 +744,12 @@ calc_seckey(struct cifs_ses *ses)
668 744
669 desc.tfm = tfm_arc4; 745 desc.tfm = tfm_arc4;
670 746
671 crypto_blkcipher_setkey(tfm_arc4, ses->auth_key.response, 747 rc = crypto_blkcipher_setkey(tfm_arc4, ses->auth_key.response,
672 CIFS_SESS_KEY_SIZE); 748 CIFS_SESS_KEY_SIZE);
749 if (rc) {
750 cERROR(1, "%s: Could not set response as a key", __func__);
751 return rc;
752 }
673 753
674 sg_init_one(&sgin, sec_key, CIFS_SESS_KEY_SIZE); 754 sg_init_one(&sgin, sec_key, CIFS_SESS_KEY_SIZE);
675 sg_init_one(&sgout, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE); 755 sg_init_one(&sgout, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE);
@@ -688,7 +768,7 @@ calc_seckey(struct cifs_ses *ses)
688 768
689 crypto_free_blkcipher(tfm_arc4); 769 crypto_free_blkcipher(tfm_arc4);
690 770
691 return 0; 771 return rc;
692} 772}
693 773
694void 774void
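The common theme of the cifsencrypt.c hunks is that no crypto_shash_setkey/init/update/final return code is ignored any more. Distilled into one sketch (buffer and field names mirror the surrounding code; this is not a function from the patch):

static int example_md5(struct TCP_Server_Info *server,
		       const char *data, unsigned int len, char *signature)
{
	int rc;

	rc = crypto_shash_init(&server->secmech.sdescmd5->shash);
	if (rc) {
		cERROR(1, "%s: Could not init md5", __func__);
		return rc;
	}
	rc = crypto_shash_update(&server->secmech.sdescmd5->shash, data, len);
	if (rc) {
		cERROR(1, "%s: Could not update md5", __func__);
		return rc;
	}
	rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature);
	if (rc)
		cERROR(1, "%s: Could not generate md5 hash", __func__);
	/* propagate rc instead of returning a hardcoded 0 */
	return rc;
}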
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 865517470967..f93eb948d071 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -86,24 +86,6 @@ extern mempool_t *cifs_sm_req_poolp;
86extern mempool_t *cifs_req_poolp; 86extern mempool_t *cifs_req_poolp;
87extern mempool_t *cifs_mid_poolp; 87extern mempool_t *cifs_mid_poolp;
88 88
89void
90cifs_sb_active(struct super_block *sb)
91{
92 struct cifs_sb_info *server = CIFS_SB(sb);
93
94 if (atomic_inc_return(&server->active) == 1)
95 atomic_inc(&sb->s_active);
96}
97
98void
99cifs_sb_deactive(struct super_block *sb)
100{
101 struct cifs_sb_info *server = CIFS_SB(sb);
102
103 if (atomic_dec_and_test(&server->active))
104 deactivate_super(sb);
105}
106
107static int 89static int
108cifs_read_super(struct super_block *sb) 90cifs_read_super(struct super_block *sb)
109{ 91{
@@ -581,6 +563,10 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
581 mutex_unlock(&dir->i_mutex); 563 mutex_unlock(&dir->i_mutex);
582 dput(dentry); 564 dput(dentry);
583 dentry = child; 565 dentry = child;
566 if (!dentry->d_inode) {
567 dput(dentry);
568 dentry = ERR_PTR(-ENOENT);
569 }
584 } while (!IS_ERR(dentry)); 570 } while (!IS_ERR(dentry));
585 _FreeXid(xid); 571 _FreeXid(xid);
586 kfree(full_path); 572 kfree(full_path);
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index fbd050c8d52a..95da8027983d 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -41,10 +41,6 @@ extern struct file_system_type cifs_fs_type;
41extern const struct address_space_operations cifs_addr_ops; 41extern const struct address_space_operations cifs_addr_ops;
42extern const struct address_space_operations cifs_addr_ops_smallbuf; 42extern const struct address_space_operations cifs_addr_ops_smallbuf;
43 43
44/* Functions related to super block operations */
45extern void cifs_sb_active(struct super_block *sb);
46extern void cifs_sb_deactive(struct super_block *sb);
47
48/* Functions related to inodes */ 44/* Functions related to inodes */
49extern const struct inode_operations cifs_dir_inode_ops; 45extern const struct inode_operations cifs_dir_inode_ops;
50extern struct inode *cifs_root_iget(struct super_block *); 46extern struct inode *cifs_root_iget(struct super_block *);
@@ -129,5 +125,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
129extern const struct export_operations cifs_export_ops; 125extern const struct export_operations cifs_export_ops;
130#endif /* CIFS_NFSD_EXPORT */ 126#endif /* CIFS_NFSD_EXPORT */
131 127
132#define CIFS_VERSION "1.74" 128#define CIFS_VERSION "1.75"
133#endif /* _CIFSFS_H */ 129#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 6255fa812c7a..95dad9d14cf1 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -291,7 +291,7 @@ struct TCP_Server_Info {
291 struct fscache_cookie *fscache; /* client index cache cookie */ 291 struct fscache_cookie *fscache; /* client index cache cookie */
292#endif 292#endif
293#ifdef CONFIG_CIFS_STATS2 293#ifdef CONFIG_CIFS_STATS2
294 atomic_t inSend; /* requests trying to send */ 294 atomic_t in_send; /* requests trying to send */
295 atomic_t num_waiters; /* blocked waiting to get in sendrecv */ 295 atomic_t num_waiters; /* blocked waiting to get in sendrecv */
296#endif 296#endif
297}; 297};
@@ -501,7 +501,7 @@ struct cifs_search_info {
501 char *ntwrk_buf_start; 501 char *ntwrk_buf_start;
502 char *srch_entries_start; 502 char *srch_entries_start;
503 char *last_entry; 503 char *last_entry;
504 char *presume_name; 504 const char *presume_name;
505 unsigned int resume_name_len; 505 unsigned int resume_name_len;
506 bool endOfSearch:1; 506 bool endOfSearch:1;
507 bool emptyDir:1; 507 bool emptyDir:1;
@@ -672,12 +672,54 @@ struct mid_q_entry {
672 bool multiEnd:1; /* both received */ 672 bool multiEnd:1; /* both received */
673}; 673};
674 674
675struct oplock_q_entry { 675/* Make code in transport.c a little cleaner by moving
676 struct list_head qhead; 676 update of optional stats into function below */
677 struct inode *pinode; 677#ifdef CONFIG_CIFS_STATS2
678 struct cifs_tcon *tcon; 678
679 __u16 netfid; 679static inline void cifs_in_send_inc(struct TCP_Server_Info *server)
680}; 680{
681 atomic_inc(&server->in_send);
682}
683
684static inline void cifs_in_send_dec(struct TCP_Server_Info *server)
685{
686 atomic_dec(&server->in_send);
687}
688
689static inline void cifs_num_waiters_inc(struct TCP_Server_Info *server)
690{
691 atomic_inc(&server->num_waiters);
692}
693
694static inline void cifs_num_waiters_dec(struct TCP_Server_Info *server)
695{
696 atomic_dec(&server->num_waiters);
697}
698
699static inline void cifs_save_when_sent(struct mid_q_entry *mid)
700{
701 mid->when_sent = jiffies;
702}
703#else
704static inline void cifs_in_send_inc(struct TCP_Server_Info *server)
705{
706}
707static inline void cifs_in_send_dec(struct TCP_Server_Info *server)
708{
709}
710
711static inline void cifs_num_waiters_inc(struct TCP_Server_Info *server)
712{
713}
714
715static inline void cifs_num_waiters_dec(struct TCP_Server_Info *server)
716{
717}
718
719static inline void cifs_save_when_sent(struct mid_q_entry *mid)
720{
721}
722#endif
681 723
682/* for pending dnotify requests */ 724/* for pending dnotify requests */
683struct dir_notify_req { 725struct dir_notify_req {
@@ -942,8 +984,6 @@ GLOBAL_EXTERN spinlock_t siduidlock;
942GLOBAL_EXTERN spinlock_t sidgidlock; 984GLOBAL_EXTERN spinlock_t sidgidlock;
943 985
944void cifs_oplock_break(struct work_struct *work); 986void cifs_oplock_break(struct work_struct *work);
945void cifs_oplock_break_get(struct cifsFileInfo *cfile);
946void cifs_oplock_break_put(struct cifsFileInfo *cfile);
947 987
948extern const struct slow_work_ops cifs_oplock_break_ops; 988extern const struct slow_work_ops cifs_oplock_break_ops;
949 989
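The inline helpers above compile to nothing when CONFIG_CIFS_STATS2 is off, so call sites need no #ifdef clutter. A sketch of the intended transport.c usage (the exact call site is outside this section; smb_send stands in for the real send path):

	cifs_num_waiters_inc(server);
	/* ... wait for a free request slot ... */
	cifs_num_waiters_dec(server);

	cifs_in_send_inc(server);
	rc = smb_send(server, in_buf, len);
	cifs_in_send_dec(server);
	cifs_save_when_sent(mid);	/* stamps mid->when_sent */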
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 1a9fe7f816d1..aac37d99a487 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -107,7 +107,7 @@ static void mark_open_files_invalid(struct cifs_tcon *pTcon)
107static int 107static int
108cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) 108cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command)
109{ 109{
110 int rc = 0; 110 int rc;
111 struct cifs_ses *ses; 111 struct cifs_ses *ses;
112 struct TCP_Server_Info *server; 112 struct TCP_Server_Info *server;
113 struct nls_table *nls_codepage; 113 struct nls_table *nls_codepage;
@@ -5720,6 +5720,7 @@ CIFSSMBQAllEAs(const int xid, struct cifs_tcon *tcon,
5720 char *temp_ptr; 5720 char *temp_ptr;
5721 char *end_of_smb; 5721 char *end_of_smb;
5722 __u16 params, byte_count, data_offset; 5722 __u16 params, byte_count, data_offset;
5723 unsigned int ea_name_len = ea_name ? strlen(ea_name) : 0;
5723 5724
5724 cFYI(1, "In Query All EAs path %s", searchName); 5725 cFYI(1, "In Query All EAs path %s", searchName);
5725QAllEAsRetry: 5726QAllEAsRetry:
@@ -5837,7 +5838,8 @@ QAllEAsRetry:
5837 } 5838 }
5838 5839
5839 if (ea_name) { 5840 if (ea_name) {
5840 if (strncmp(ea_name, temp_ptr, name_len) == 0) { 5841 if (ea_name_len == name_len &&
5842 strncmp(ea_name, temp_ptr, name_len) == 0) {
5841 temp_ptr += name_len + 1; 5843 temp_ptr += name_len + 1;
5842 rc = value_len; 5844 rc = value_len;
5843 if (buf_size == 0) 5845 if (buf_size == 0)
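The added length check matters because strncmp() bounded by name_len only proves a prefix match. A standalone illustration of the failure mode the hunk fixes (userspace, compilable as-is):

#include <stdio.h>
#include <string.h>

static int ea_matches(const char *wanted, const char *entry)
{
	size_t name_len = strlen(entry);

	/* the old check was just strncmp(wanted, entry, name_len) == 0,
	 * which wrongly accepts wanted == "user.foobar" for entry
	 * "user.foo" */
	return strlen(wanted) == name_len &&
	       strncmp(wanted, entry, name_len) == 0;
}

int main(void)
{
	printf("%d\n", ea_matches("user.foobar", "user.foo")); /* 0 (was 1) */
	printf("%d\n", ea_matches("user.foo", "user.foo"));    /* 1 */
	return 0;
}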
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index e66297bad412..633c246b6775 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -319,25 +319,328 @@ requeue_echo:
319 queue_delayed_work(system_nrt_wq, &server->echo, SMB_ECHO_INTERVAL); 319 queue_delayed_work(system_nrt_wq, &server->echo, SMB_ECHO_INTERVAL);
320} 320}
321 321
322static bool
323allocate_buffers(char **bigbuf, char **smallbuf, unsigned int size,
324 bool is_large_buf)
325{
326 char *bbuf = *bigbuf, *sbuf = *smallbuf;
327
328 if (bbuf == NULL) {
329 bbuf = (char *)cifs_buf_get();
330 if (!bbuf) {
331 cERROR(1, "No memory for large SMB response");
332 msleep(3000);
333 /* retry will check if exiting */
334 return false;
335 }
336 } else if (is_large_buf) {
337 /* we are reusing a dirty large buf, clear its start */
338 memset(bbuf, 0, size);
339 }
340
341 if (sbuf == NULL) {
342 sbuf = (char *)cifs_small_buf_get();
343 if (!sbuf) {
344 cERROR(1, "No memory for SMB response");
345 msleep(1000);
346 /* retry will check if exiting */
347 return false;
348 }
349 /* beginning of smb buffer is cleared in our buf_get */
350 } else {
351 /* if existing small buf clear beginning */
352 memset(sbuf, 0, size);
353 }
354
355 *bigbuf = bbuf;
356 *smallbuf = sbuf;
357
358 return true;
359}
360
361static int
362read_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg,
363 struct kvec *iov, unsigned int to_read,
364 unsigned int *ptotal_read, bool is_header_read)
365{
366 int length, rc = 0;
367 unsigned int total_read;
368 char *buf = iov->iov_base;
369
370 for (total_read = 0; total_read < to_read; total_read += length) {
371 length = kernel_recvmsg(server->ssocket, smb_msg, iov, 1,
372 to_read - total_read, 0);
373 if (server->tcpStatus == CifsExiting) {
374 /* then will exit */
375 rc = 2;
376 break;
377 } else if (server->tcpStatus == CifsNeedReconnect) {
378 cifs_reconnect(server);
 379			/* Reconnect wakes up the response queue */
 380			/* Now we will reread from the socket */
381 rc = 1;
382 break;
383 } else if (length == -ERESTARTSYS ||
384 length == -EAGAIN ||
385 length == -EINTR) {
386 /*
387 * Minimum sleep to prevent looping, allowing socket
388 * to clear and app threads to set tcpStatus
389 * CifsNeedReconnect if server hung.
390 */
391 usleep_range(1000, 2000);
392 length = 0;
393 if (!is_header_read)
394 continue;
395 /* Special handling for header read */
396 if (total_read) {
397 iov->iov_base = (to_read - total_read) +
398 buf;
399 iov->iov_len = to_read - total_read;
400 smb_msg->msg_control = NULL;
401 smb_msg->msg_controllen = 0;
402 rc = 3;
403 } else
404 rc = 1;
405 break;
406 } else if (length <= 0) {
407 cERROR(1, "Received no data, expecting %d",
408 to_read - total_read);
409 cifs_reconnect(server);
410 rc = 1;
411 break;
412 }
413 }
414
415 *ptotal_read = total_read;
416 return rc;
417}
418
419static bool
420check_rfc1002_header(struct TCP_Server_Info *server, char *buf)
421{
422 char temp = *buf;
423 unsigned int pdu_length = be32_to_cpu(
424 ((struct smb_hdr *)buf)->smb_buf_length);
425
426 /*
 427	 * The first byte of the big-endian length field is not
 428	 * actually part of the length but the RFC 1002 frame type;
 429	 * the most common type, zero, marks regular session data.
430 */
431 if (temp == (char) RFC1002_SESSION_KEEP_ALIVE) {
432 return false;
433 } else if (temp == (char)RFC1002_POSITIVE_SESSION_RESPONSE) {
434 cFYI(1, "Good RFC 1002 session rsp");
435 return false;
436 } else if (temp == (char)RFC1002_NEGATIVE_SESSION_RESPONSE) {
437 /*
438 * We get this from Windows 98 instead of an error on
439 * SMB negprot response.
440 */
 441		cFYI(1, "Negative RFC1002 Session Response Error 0x%x",
442 pdu_length);
443 /* give server a second to clean up */
444 msleep(1000);
445 /*
 446	 * Always try 445 first on reconnect since we get a NACK
 447	 * on some servers if we ever connected to port 139 (the
 448	 * NACK comes because we do not begin with an RFC 1001
 449	 * session initialize frame).
450 */
451 cifs_set_port((struct sockaddr *)
452 &server->dstaddr, CIFS_PORT);
453 cifs_reconnect(server);
454 wake_up(&server->response_q);
455 return false;
456 } else if (temp != (char) 0) {
457 cERROR(1, "Unknown RFC 1002 frame");
458 cifs_dump_mem(" Received Data: ", buf, 4);
459 cifs_reconnect(server);
460 return false;
461 }
462
463 /* else we have an SMB response */
464 if ((pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) ||
465 (pdu_length < sizeof(struct smb_hdr) - 1 - 4)) {
466 cERROR(1, "Invalid size SMB length %d pdu_length %d",
467 4, pdu_length+4);
468 cifs_reconnect(server);
469 wake_up(&server->response_q);
470 return false;
471 }
472
473 return true;
474}
475
476static struct mid_q_entry *
477find_cifs_mid(struct TCP_Server_Info *server, struct smb_hdr *buf,
478 int *length, bool is_large_buf, bool *is_multi_rsp, char **bigbuf)
479{
480 struct mid_q_entry *mid = NULL, *tmp_mid, *ret = NULL;
481
482 spin_lock(&GlobalMid_Lock);
483 list_for_each_entry_safe(mid, tmp_mid, &server->pending_mid_q, qhead) {
484 if (mid->mid != buf->Mid ||
485 mid->midState != MID_REQUEST_SUBMITTED ||
486 mid->command != buf->Command)
487 continue;
488
489 if (*length == 0 && check2ndT2(buf, server->maxBuf) > 0) {
490 /* We have a multipart transact2 resp */
491 *is_multi_rsp = true;
492 if (mid->resp_buf) {
493 /* merge response - fix up 1st*/
494 *length = coalesce_t2(buf, mid->resp_buf);
495 if (*length > 0) {
496 *length = 0;
497 mid->multiRsp = true;
498 break;
499 }
500 /* All parts received or packet is malformed. */
501 mid->multiEnd = true;
502 goto multi_t2_fnd;
503 }
504 if (!is_large_buf) {
505 /*FIXME: switch to already allocated largebuf?*/
506 cERROR(1, "1st trans2 resp needs bigbuf");
507 } else {
508 /* Have first buffer */
509 mid->resp_buf = buf;
510 mid->largeBuf = true;
511 *bigbuf = NULL;
512 }
513 break;
514 }
515 mid->resp_buf = buf;
516 mid->largeBuf = is_large_buf;
517multi_t2_fnd:
518 if (*length == 0)
519 mid->midState = MID_RESPONSE_RECEIVED;
520 else
521 mid->midState = MID_RESPONSE_MALFORMED;
522#ifdef CONFIG_CIFS_STATS2
523 mid->when_received = jiffies;
524#endif
525 list_del_init(&mid->qhead);
526 ret = mid;
527 break;
528 }
529 spin_unlock(&GlobalMid_Lock);
530
531 return ret;
532}
533
534static void clean_demultiplex_info(struct TCP_Server_Info *server)
535{
536 int length;
537
538 /* take it off the list, if it's not already */
539 spin_lock(&cifs_tcp_ses_lock);
540 list_del_init(&server->tcp_ses_list);
541 spin_unlock(&cifs_tcp_ses_lock);
542
543 spin_lock(&GlobalMid_Lock);
544 server->tcpStatus = CifsExiting;
545 spin_unlock(&GlobalMid_Lock);
546 wake_up_all(&server->response_q);
547
548 /*
 549	 * Check if we have blocked requests that need to be freed up. Note that
550 * cifs_max_pending is normally 50, but can be set at module install
551 * time to as little as two.
552 */
553 spin_lock(&GlobalMid_Lock);
554 if (atomic_read(&server->inFlight) >= cifs_max_pending)
555 atomic_set(&server->inFlight, cifs_max_pending - 1);
556 /*
557 * We do not want to set the max_pending too low or we could end up
558 * with the counter going negative.
559 */
560 spin_unlock(&GlobalMid_Lock);
561 /*
 562	 * Although there should not be any requests blocked on this queue, it
 563	 * cannot hurt to be paranoid and try to wake up requests that may
 564	 * have been blocked when more than 50 at a time were on the wire to
 565	 * the same server - they will now see the session is in the exit
 566	 * state and get out of SendReceive.
567 */
568 wake_up_all(&server->request_q);
569 /* give those requests time to exit */
570 msleep(125);
571
572 if (server->ssocket) {
573 sock_release(server->ssocket);
574 server->ssocket = NULL;
575 }
576
577 if (!list_empty(&server->pending_mid_q)) {
578 struct list_head dispose_list;
579 struct mid_q_entry *mid_entry;
580 struct list_head *tmp, *tmp2;
581
582 INIT_LIST_HEAD(&dispose_list);
583 spin_lock(&GlobalMid_Lock);
584 list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
585 mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
586 cFYI(1, "Clearing mid 0x%x", mid_entry->mid);
587 mid_entry->midState = MID_SHUTDOWN;
588 list_move(&mid_entry->qhead, &dispose_list);
589 }
590 spin_unlock(&GlobalMid_Lock);
591
592 /* now walk dispose list and issue callbacks */
593 list_for_each_safe(tmp, tmp2, &dispose_list) {
594 mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
595 cFYI(1, "Callback mid 0x%x", mid_entry->mid);
596 list_del_init(&mid_entry->qhead);
597 mid_entry->callback(mid_entry);
598 }
599 /* 1/8th of sec is more than enough time for them to exit */
600 msleep(125);
601 }
602
603 if (!list_empty(&server->pending_mid_q)) {
604 /*
 605	 * mpx threads have not exited yet; give them at least the smb
 606	 * send timeout time for long ops.
607 *
608 * Due to delays on oplock break requests, we need to wait at
609 * least 45 seconds before giving up on a request getting a
610 * response and going ahead and killing cifsd.
611 */
612 cFYI(1, "Wait for exit from demultiplex thread");
613 msleep(46000);
614 /*
 615	 * If threads still have not exited, they are probably never
 616	 * coming home; not much else we can do but free the memory.
617 */
618 }
619
620 kfree(server->hostname);
621 kfree(server);
622
623 length = atomic_dec_return(&tcpSesAllocCount);
624 if (length > 0)
625 mempool_resize(cifs_req_poolp, length + cifs_min_rcv,
626 GFP_KERNEL);
627}
628
322static int 629static int
323cifs_demultiplex_thread(void *p) 630cifs_demultiplex_thread(void *p)
324{ 631{
325 int length; 632 int length;
326 struct TCP_Server_Info *server = p; 633 struct TCP_Server_Info *server = p;
327 unsigned int pdu_length, total_read; 634 unsigned int pdu_length, total_read;
635 char *buf = NULL, *bigbuf = NULL, *smallbuf = NULL;
328 struct smb_hdr *smb_buffer = NULL; 636 struct smb_hdr *smb_buffer = NULL;
329 struct smb_hdr *bigbuf = NULL;
330 struct smb_hdr *smallbuf = NULL;
331 struct msghdr smb_msg; 637 struct msghdr smb_msg;
332 struct kvec iov; 638 struct kvec iov;
333 struct socket *csocket = server->ssocket;
334 struct list_head *tmp, *tmp2;
335 struct task_struct *task_to_wake = NULL; 639 struct task_struct *task_to_wake = NULL;
336 struct mid_q_entry *mid_entry; 640 struct mid_q_entry *mid_entry;
337 char temp;
338 bool isLargeBuf = false; 641 bool isLargeBuf = false;
339 bool isMultiRsp; 642 bool isMultiRsp = false;
340 int reconnect; 643 int rc;
341 644
342 current->flags |= PF_MEMALLOC; 645 current->flags |= PF_MEMALLOC;
343 cFYI(1, "Demultiplex PID: %d", task_pid_nr(current)); 646 cFYI(1, "Demultiplex PID: %d", task_pid_nr(current));
@@ -351,35 +654,16 @@ cifs_demultiplex_thread(void *p)
351 while (server->tcpStatus != CifsExiting) { 654 while (server->tcpStatus != CifsExiting) {
352 if (try_to_freeze()) 655 if (try_to_freeze())
353 continue; 656 continue;
354 if (bigbuf == NULL) {
355 bigbuf = cifs_buf_get();
356 if (!bigbuf) {
357 cERROR(1, "No memory for large SMB response");
358 msleep(3000);
359 /* retry will check if exiting */
360 continue;
361 }
362 } else if (isLargeBuf) {
363 /* we are reusing a dirty large buf, clear its start */
364 memset(bigbuf, 0, sizeof(struct smb_hdr));
365 }
366 657
367 if (smallbuf == NULL) { 658 if (!allocate_buffers(&bigbuf, &smallbuf,
368 smallbuf = cifs_small_buf_get(); 659 sizeof(struct smb_hdr), isLargeBuf))
369 if (!smallbuf) { 660 continue;
370 cERROR(1, "No memory for SMB response");
371 msleep(1000);
372 /* retry will check if exiting */
373 continue;
374 }
375 /* beginning of smb buffer is cleared in our buf_get */
376 } else /* if existing small buf clear beginning */
377 memset(smallbuf, 0, sizeof(struct smb_hdr));
378 661
379 isLargeBuf = false; 662 isLargeBuf = false;
380 isMultiRsp = false; 663 isMultiRsp = false;
381 smb_buffer = smallbuf; 664 smb_buffer = (struct smb_hdr *)smallbuf;
382 iov.iov_base = smb_buffer; 665 buf = smallbuf;
666 iov.iov_base = buf;
383 iov.iov_len = 4; 667 iov.iov_len = 4;
384 smb_msg.msg_control = NULL; 668 smb_msg.msg_control = NULL;
385 smb_msg.msg_controllen = 0; 669 smb_msg.msg_controllen = 0;
@@ -393,158 +677,50 @@ incomplete_rcv:
393 "Reconnecting...", server->hostname, 677 "Reconnecting...", server->hostname,
394 (echo_retries * SMB_ECHO_INTERVAL / HZ)); 678 (echo_retries * SMB_ECHO_INTERVAL / HZ));
395 cifs_reconnect(server); 679 cifs_reconnect(server);
396 csocket = server->ssocket;
397 wake_up(&server->response_q); 680 wake_up(&server->response_q);
398 continue; 681 continue;
399 } 682 }
400 683
401 length = 684 rc = read_from_socket(server, &smb_msg, &iov, pdu_length,
402 kernel_recvmsg(csocket, &smb_msg, 685 &total_read, true /* header read */);
403 &iov, 1, pdu_length, 0 /* BB other flags? */); 686 if (rc == 3)
404 687 goto incomplete_rcv;
405 if (server->tcpStatus == CifsExiting) { 688 else if (rc == 2)
406 break; 689 break;
407 } else if (server->tcpStatus == CifsNeedReconnect) { 690 else if (rc == 1)
408 cFYI(1, "Reconnect after server stopped responding");
409 cifs_reconnect(server);
410 cFYI(1, "call to reconnect done");
411 csocket = server->ssocket;
412 continue;
413 } else if (length == -ERESTARTSYS ||
414 length == -EAGAIN ||
415 length == -EINTR) {
416 msleep(1); /* minimum sleep to prevent looping
417 allowing socket to clear and app threads to set
418 tcpStatus CifsNeedReconnect if server hung */
419 if (pdu_length < 4) {
420 iov.iov_base = (4 - pdu_length) +
421 (char *)smb_buffer;
422 iov.iov_len = pdu_length;
423 smb_msg.msg_control = NULL;
424 smb_msg.msg_controllen = 0;
425 goto incomplete_rcv;
426 } else
427 continue;
428 } else if (length <= 0) {
429 cFYI(1, "Reconnect after unexpected peek error %d",
430 length);
431 cifs_reconnect(server);
432 csocket = server->ssocket;
433 wake_up(&server->response_q);
434 continue; 691 continue;
435 } else if (length < pdu_length) {
436 cFYI(1, "requested %d bytes but only got %d bytes",
437 pdu_length, length);
438 pdu_length -= length;
439 msleep(1);
440 goto incomplete_rcv;
441 }
442
443 /* The right amount was read from socket - 4 bytes */
444 /* so we can now interpret the length field */
445 692
446 /* the first byte big endian of the length field, 693 /*
447 is actually not part of the length but the type 694 * The right amount was read from socket - 4 bytes,
448 with the most common, zero, as regular data */ 695 * so we can now interpret the length field.
449 temp = *((char *) smb_buffer); 696 */
450 697
451 /* Note that FC 1001 length is big endian on the wire, 698 /*
452 but we convert it here so it is always manipulated 699 * Note that RFC 1001 length is big endian on the wire,
453 as host byte order */ 700 * but we convert it here so it is always manipulated
701 * as host byte order.
702 */
454 pdu_length = be32_to_cpu(smb_buffer->smb_buf_length); 703 pdu_length = be32_to_cpu(smb_buffer->smb_buf_length);
455 704
456 cFYI(1, "rfc1002 length 0x%x", pdu_length+4); 705 cFYI(1, "rfc1002 length 0x%x", pdu_length+4);
457 706 if (!check_rfc1002_header(server, buf))
458 if (temp == (char) RFC1002_SESSION_KEEP_ALIVE) {
459 continue;
460 } else if (temp == (char)RFC1002_POSITIVE_SESSION_RESPONSE) {
461 cFYI(1, "Good RFC 1002 session rsp");
462 continue;
463 } else if (temp == (char)RFC1002_NEGATIVE_SESSION_RESPONSE) {
464 /* we get this from Windows 98 instead of
465 an error on SMB negprot response */
466 cFYI(1, "Negative RFC1002 Session Response Error 0x%x)",
467 pdu_length);
468 /* give server a second to clean up */
469 msleep(1000);
470 /* always try 445 first on reconnect since we get NACK
471 * on some if we ever connected to port 139 (the NACK
472 * is since we do not begin with RFC1001 session
473 * initialize frame)
474 */
475 cifs_set_port((struct sockaddr *)
476 &server->dstaddr, CIFS_PORT);
477 cifs_reconnect(server);
478 csocket = server->ssocket;
479 wake_up(&server->response_q);
480 continue;
481 } else if (temp != (char) 0) {
482 cERROR(1, "Unknown RFC 1002 frame");
483 cifs_dump_mem(" Received Data: ", (char *)smb_buffer,
484 length);
485 cifs_reconnect(server);
486 csocket = server->ssocket;
487 continue; 707 continue;
488 }
489
490 /* else we have an SMB response */
491 if ((pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) ||
492 (pdu_length < sizeof(struct smb_hdr) - 1 - 4)) {
493 cERROR(1, "Invalid size SMB length %d pdu_length %d",
494 length, pdu_length+4);
495 cifs_reconnect(server);
496 csocket = server->ssocket;
497 wake_up(&server->response_q);
498 continue;
499 }
500 708
501 /* else length ok */ 709 /* else length ok */
502 reconnect = 0;
503
504 if (pdu_length > MAX_CIFS_SMALL_BUFFER_SIZE - 4) { 710 if (pdu_length > MAX_CIFS_SMALL_BUFFER_SIZE - 4) {
505 isLargeBuf = true; 711 isLargeBuf = true;
506 memcpy(bigbuf, smallbuf, 4); 712 memcpy(bigbuf, smallbuf, 4);
507 smb_buffer = bigbuf; 713 smb_buffer = (struct smb_hdr *)bigbuf;
714 buf = bigbuf;
508 } 715 }
509 length = 0; 716
510 iov.iov_base = 4 + (char *)smb_buffer; 717 iov.iov_base = 4 + buf;
511 iov.iov_len = pdu_length; 718 iov.iov_len = pdu_length;
512 for (total_read = 0; total_read < pdu_length; 719 rc = read_from_socket(server, &smb_msg, &iov, pdu_length,
513 total_read += length) { 720 &total_read, false);
514 length = kernel_recvmsg(csocket, &smb_msg, &iov, 1, 721 if (rc == 2)
515 pdu_length - total_read, 0);
516 if (server->tcpStatus == CifsExiting) {
517 /* then will exit */
518 reconnect = 2;
519 break;
520 } else if (server->tcpStatus == CifsNeedReconnect) {
521 cifs_reconnect(server);
522 csocket = server->ssocket;
523 /* Reconnect wakes up rspns q */
524 /* Now we will reread sock */
525 reconnect = 1;
526 break;
527 } else if (length == -ERESTARTSYS ||
528 length == -EAGAIN ||
529 length == -EINTR) {
530 msleep(1); /* minimum sleep to prevent looping,
531 allowing socket to clear and app
532 threads to set tcpStatus
533 CifsNeedReconnect if server hung*/
534 length = 0;
535 continue;
536 } else if (length <= 0) {
537 cERROR(1, "Received no data, expecting %d",
538 pdu_length - total_read);
539 cifs_reconnect(server);
540 csocket = server->ssocket;
541 reconnect = 1;
542 break;
543 }
544 }
545 if (reconnect == 2)
546 break; 722 break;
547 else if (reconnect == 1) 723 else if (rc == 1)
548 continue; 724 continue;
549 725
550 total_read += 4; /* account for rfc1002 hdr */ 726 total_read += 4; /* account for rfc1002 hdr */
@@ -562,75 +738,13 @@ incomplete_rcv:
562 */ 738 */
563 length = checkSMB(smb_buffer, smb_buffer->Mid, total_read); 739 length = checkSMB(smb_buffer, smb_buffer->Mid, total_read);
564 if (length != 0) 740 if (length != 0)
565 cifs_dump_mem("Bad SMB: ", smb_buffer, 741 cifs_dump_mem("Bad SMB: ", buf,
566 min_t(unsigned int, total_read, 48)); 742 min_t(unsigned int, total_read, 48));
567 743
568 mid_entry = NULL;
569 server->lstrp = jiffies; 744 server->lstrp = jiffies;
570 745
571 spin_lock(&GlobalMid_Lock); 746 mid_entry = find_cifs_mid(server, smb_buffer, &length,
572 list_for_each_safe(tmp, tmp2, &server->pending_mid_q) { 747 isLargeBuf, &isMultiRsp, &bigbuf);
573 mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
574
575 if (mid_entry->mid != smb_buffer->Mid ||
576 mid_entry->midState != MID_REQUEST_SUBMITTED ||
577 mid_entry->command != smb_buffer->Command) {
578 mid_entry = NULL;
579 continue;
580 }
581
582 if (length == 0 &&
583 check2ndT2(smb_buffer, server->maxBuf) > 0) {
584 /* We have a multipart transact2 resp */
585 isMultiRsp = true;
586 if (mid_entry->resp_buf) {
587 /* merge response - fix up 1st*/
588 length = coalesce_t2(smb_buffer,
589 mid_entry->resp_buf);
590 if (length > 0) {
591 length = 0;
592 mid_entry->multiRsp = true;
593 break;
594 } else {
595 /* all parts received or
596 * packet is malformed
597 */
598 mid_entry->multiEnd = true;
599 goto multi_t2_fnd;
600 }
601 } else {
602 if (!isLargeBuf) {
603 /*
604 * FIXME: switch to already
605 * allocated largebuf?
606 */
607 cERROR(1, "1st trans2 resp "
608 "needs bigbuf");
609 } else {
610 /* Have first buffer */
611 mid_entry->resp_buf =
612 smb_buffer;
613 mid_entry->largeBuf = true;
614 bigbuf = NULL;
615 }
616 }
617 break;
618 }
619 mid_entry->resp_buf = smb_buffer;
620 mid_entry->largeBuf = isLargeBuf;
621multi_t2_fnd:
622 if (length == 0)
623 mid_entry->midState = MID_RESPONSE_RECEIVED;
624 else
625 mid_entry->midState = MID_RESPONSE_MALFORMED;
626#ifdef CONFIG_CIFS_STATS2
627 mid_entry->when_received = jiffies;
628#endif
629 list_del_init(&mid_entry->qhead);
630 break;
631 }
632 spin_unlock(&GlobalMid_Lock);
633
634 if (mid_entry != NULL) { 748 if (mid_entry != NULL) {
635 mid_entry->callback(mid_entry); 749 mid_entry->callback(mid_entry);
636 /* Was previous buf put in mpx struct for multi-rsp? */ 750 /* Was previous buf put in mpx struct for multi-rsp? */
@@ -648,7 +762,7 @@ multi_t2_fnd:
648 !isMultiRsp) { 762 !isMultiRsp) {
649 cERROR(1, "No task to wake, unknown frame received! " 763 cERROR(1, "No task to wake, unknown frame received! "
650 "NumMids %d", atomic_read(&midCount)); 764 "NumMids %d", atomic_read(&midCount));
651 cifs_dump_mem("Received Data is: ", (char *)smb_buffer, 765 cifs_dump_mem("Received Data is: ", buf,
652 sizeof(struct smb_hdr)); 766 sizeof(struct smb_hdr));
653#ifdef CONFIG_CIFS_DEBUG2 767#ifdef CONFIG_CIFS_DEBUG2
654 cifs_dump_detail(smb_buffer); 768 cifs_dump_detail(smb_buffer);
@@ -658,88 +772,13 @@ multi_t2_fnd:
658 } 772 }
659 } /* end while !EXITING */ 773 } /* end while !EXITING */
660 774
661 /* take it off the list, if it's not already */
662 spin_lock(&cifs_tcp_ses_lock);
663 list_del_init(&server->tcp_ses_list);
664 spin_unlock(&cifs_tcp_ses_lock);
665
666 spin_lock(&GlobalMid_Lock);
667 server->tcpStatus = CifsExiting;
668 spin_unlock(&GlobalMid_Lock);
669 wake_up_all(&server->response_q);
670
671 /* check if we have blocked requests that need to free */
672 /* Note that cifs_max_pending is normally 50, but
673 can be set at module install time to as little as two */
674 spin_lock(&GlobalMid_Lock);
675 if (atomic_read(&server->inFlight) >= cifs_max_pending)
676 atomic_set(&server->inFlight, cifs_max_pending - 1);
677 /* We do not want to set the max_pending too low or we
678 could end up with the counter going negative */
679 spin_unlock(&GlobalMid_Lock);
680 /* Although there should not be any requests blocked on
681 this queue it can not hurt to be paranoid and try to wake up requests
682 that may haven been blocked when more than 50 at time were on the wire
683 to the same server - they now will see the session is in exit state
684 and get out of SendReceive. */
685 wake_up_all(&server->request_q);
686 /* give those requests time to exit */
687 msleep(125);
688
689 if (server->ssocket) {
690 sock_release(csocket);
691 server->ssocket = NULL;
692 }
693 /* buffer usually freed in free_mid - need to free it here on exit */ 775 /* buffer usually freed in free_mid - need to free it here on exit */
694 cifs_buf_release(bigbuf); 776 cifs_buf_release(bigbuf);
695 if (smallbuf) /* no sense logging a debug message if NULL */ 777 if (smallbuf) /* no sense logging a debug message if NULL */
696 cifs_small_buf_release(smallbuf); 778 cifs_small_buf_release(smallbuf);
697 779
698 if (!list_empty(&server->pending_mid_q)) {
699 struct list_head dispose_list;
700
701 INIT_LIST_HEAD(&dispose_list);
702 spin_lock(&GlobalMid_Lock);
703 list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
704 mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
705 cFYI(1, "Clearing mid 0x%x", mid_entry->mid);
706 mid_entry->midState = MID_SHUTDOWN;
707 list_move(&mid_entry->qhead, &dispose_list);
708 }
709 spin_unlock(&GlobalMid_Lock);
710
711 /* now walk dispose list and issue callbacks */
712 list_for_each_safe(tmp, tmp2, &dispose_list) {
713 mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
714 cFYI(1, "Callback mid 0x%x", mid_entry->mid);
715 list_del_init(&mid_entry->qhead);
716 mid_entry->callback(mid_entry);
717 }
718 /* 1/8th of sec is more than enough time for them to exit */
719 msleep(125);
720 }
721
722 if (!list_empty(&server->pending_mid_q)) {
723 /* mpx threads have not exited yet give them
724 at least the smb send timeout time for long ops */
725 /* due to delays on oplock break requests, we need
726 to wait at least 45 seconds before giving up
727 on a request getting a response and going ahead
728 and killing cifsd */
729 cFYI(1, "Wait for exit from demultiplex thread");
730 msleep(46000);
731 /* if threads still have not exited they are probably never
732 coming home not much else we can do but free the memory */
733 }
734
735 kfree(server->hostname);
736 task_to_wake = xchg(&server->tsk, NULL); 780 task_to_wake = xchg(&server->tsk, NULL);
737 kfree(server); 781 clean_demultiplex_info(server);
738
739 length = atomic_dec_return(&tcpSesAllocCount);
740 if (length > 0)
741 mempool_resize(cifs_req_poolp, length + cifs_min_rcv,
742 GFP_KERNEL);
743 782
744 /* if server->tsk was NULL then wait for a signal before exiting */ 783 /* if server->tsk was NULL then wait for a signal before exiting */
745 if (!task_to_wake) { 784 if (!task_to_wake) {
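
The rewritten receive loop replaces two open-coded kernel_recvmsg() blocks with read_from_socket(), whose return value encodes the caller's next step: 3 means a short header read that resumes at incomplete_rcv, 2 means the server is exiting and the thread breaks out, 1 means a reconnect happened and the loop restarts, and 0 means the requested bytes all arrived. The helper itself is defined outside this hunk; a rough sketch of that contract, with the real helper's EAGAIN/ERESTARTSYS retry handling elided, might look like:

    /* Sketch only; the return codes match the call sites above. */
    static int read_from_socket(struct TCP_Server_Info *server,
                                struct msghdr *smb_msg, struct kvec *iov,
                                unsigned int to_read, unsigned int *total_read,
                                bool is_header_read)
    {
            int length = kernel_recvmsg(server->ssocket, smb_msg, iov, 1,
                                        to_read, 0);

            if (server->tcpStatus == CifsExiting)
                    return 2;               /* caller breaks out of the loop */
            if (server->tcpStatus == CifsNeedReconnect || length <= 0) {
                    cifs_reconnect(server);
                    return 1;               /* caller restarts the loop */
            }
            *total_read += length;
            if (is_header_read && *total_read < to_read)
                    return 3;               /* resume at incomplete_rcv */
            return 0;                       /* full read completed */
    }
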
@@ -2839,7 +2878,8 @@ cleanup_volume_info_contents(struct smb_vol *volume_info)
2839 kfree(volume_info->username); 2878 kfree(volume_info->username);
2840 kzfree(volume_info->password); 2879 kzfree(volume_info->password);
2841 kfree(volume_info->UNC); 2880 kfree(volume_info->UNC);
2842 kfree(volume_info->UNCip); 2881 if (volume_info->UNCip != volume_info->UNC + 2)
2882 kfree(volume_info->UNCip);
2843 kfree(volume_info->domainname); 2883 kfree(volume_info->domainname);
2844 kfree(volume_info->iocharset); 2884 kfree(volume_info->iocharset);
2845 kfree(volume_info->prepath); 2885 kfree(volume_info->prepath);
@@ -3193,15 +3233,9 @@ mount_fail_check:
3193 else 3233 else
3194 cifs_put_tcp_session(srvTcp); 3234 cifs_put_tcp_session(srvTcp);
3195 bdi_destroy(&cifs_sb->bdi); 3235 bdi_destroy(&cifs_sb->bdi);
3196 goto out;
3197 } 3236 }
3198 3237
3199 /* volume_info->password is freed above when existing session found
3200 (in which case it is not needed anymore) but when new sesion is created
3201 the password ptr is put in the new session structure (in which case the
3202 password will be freed at unmount time) */
3203out: 3238out:
3204 /* zero out password before freeing */
3205 FreeXid(xid); 3239 FreeXid(xid);
3206 return rc; 3240 return rc;
3207} 3241}
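
The new test in cleanup_volume_info_contents() guards against a double free: UNCip is not always its own allocation. When no ip= mount option is given, the parsing code can point UNCip into the UNC string itself, just past the leading two backslashes (hence UNC + 2), so only an independently allocated UNCip may be passed to kfree(). In miniature (illustrative values only):

    char *UNC = kstrdup("\\\\server\\share", GFP_KERNEL);
    char *UNCip = UNC + 2;      /* aliases "server\share"; not a new allocation */

    /* ... later, on cleanup ... */
    if (UNCip != UNC + 2)       /* free only what was allocated separately */
            kfree(UNCip);
    kfree(UNC);
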
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 499f27fc8576..72d448bf96ce 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -57,11 +57,6 @@ build_path_from_dentry(struct dentry *direntry)
57 struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); 57 struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
58 unsigned seq; 58 unsigned seq;
59 59
60 if (direntry == NULL)
61 return NULL; /* not much we can do if dentry is freed and
62 we need to reopen the file after it was closed implicitly
63 when the server crashed */
64
65 dirsep = CIFS_DIR_SEP(cifs_sb); 60 dirsep = CIFS_DIR_SEP(cifs_sb);
66 if (tcon->Flags & SMB_SHARE_IS_IN_DFS) 61 if (tcon->Flags & SMB_SHARE_IS_IN_DFS)
67 dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1); 62 dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1);
@@ -110,8 +105,8 @@ cifs_bp_rename_retry:
110 } 105 }
111 rcu_read_unlock(); 106 rcu_read_unlock();
112 if (namelen != dfsplen || read_seqretry(&rename_lock, seq)) { 107 if (namelen != dfsplen || read_seqretry(&rename_lock, seq)) {
113 cERROR(1, "did not end path lookup where expected namelen is %d", 108 cFYI(1, "did not end path lookup where expected. namelen=%d "
114 namelen); 109 "dfsplen=%d", namelen, dfsplen);
115 /* presumably this is only possible if racing with a rename 110 /* presumably this is only possible if racing with a rename
116 of one of the parent directories (we can not lock the dentries 111 of one of the parent directories (we can not lock the dentries
117 above us to prevent this, but retrying should be harmless) */ 112 above us to prevent this, but retrying should be harmless) */
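
The message demoted from cERROR to cFYI above fires in build_path_from_dentry() when a lockless walk up d_parent races with a concurrent rename; the function retries under rename_lock's sequence counter rather than treating the race as an error. The standard seqlock read-side pattern it relies on, as a generic sketch:

    unsigned seq;

    do {
            seq = read_seqbegin(&rename_lock);
            /* lockless walk up the dentry chain, assembling the path */
    } while (read_seqretry(&rename_lock, seq));  /* a rename ran: redo the walk */
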
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index 548f06230a6d..1d2d91d9bf65 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -79,8 +79,8 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
79 /* Perform the upcall */ 79 /* Perform the upcall */
80 rc = dns_query(NULL, hostname, len, NULL, ip_addr, NULL); 80 rc = dns_query(NULL, hostname, len, NULL, ip_addr, NULL);
81 if (rc < 0) 81 if (rc < 0)
82 cERROR(1, "%s: unable to resolve: %*.*s", 82 cFYI(1, "%s: unable to resolve: %*.*s",
83 __func__, len, len, hostname); 83 __func__, len, len, hostname);
84 else 84 else
85 cFYI(1, "%s: resolved: %*.*s to %s", 85 cFYI(1, "%s: resolved: %*.*s to %s",
86 __func__, len, len, hostname, *ip_addr); 86 __func__, len, len, hostname, *ip_addr);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 378acdafa356..9f41a10523a1 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -314,6 +314,8 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
314 } 314 }
315 spin_unlock(&cifs_file_list_lock); 315 spin_unlock(&cifs_file_list_lock);
316 316
317 cancel_work_sync(&cifs_file->oplock_break);
318
317 if (!tcon->need_reconnect && !cifs_file->invalidHandle) { 319 if (!tcon->need_reconnect && !cifs_file->invalidHandle) {
318 int xid, rc; 320 int xid, rc;
319 321
@@ -2418,31 +2420,6 @@ void cifs_oplock_break(struct work_struct *work)
2418 cinode->clientCanCacheRead ? 1 : 0); 2420 cinode->clientCanCacheRead ? 1 : 0);
2419 cFYI(1, "Oplock release rc = %d", rc); 2421 cFYI(1, "Oplock release rc = %d", rc);
2420 } 2422 }
2421
2422 /*
2423 * We might have kicked in before is_valid_oplock_break()
2424 * finished grabbing reference for us. Make sure it's done by
2425 * waiting for cifs_file_list_lock.
2426 */
2427 spin_lock(&cifs_file_list_lock);
2428 spin_unlock(&cifs_file_list_lock);
2429
2430 cifs_oplock_break_put(cfile);
2431}
2432
2433/* must be called while holding cifs_file_list_lock */
2434void cifs_oplock_break_get(struct cifsFileInfo *cfile)
2435{
2436 cifs_sb_active(cfile->dentry->d_sb);
2437 cifsFileInfo_get(cfile);
2438}
2439
2440void cifs_oplock_break_put(struct cifsFileInfo *cfile)
2441{
2442 struct super_block *sb = cfile->dentry->d_sb;
2443
2444 cifsFileInfo_put(cfile);
2445 cifs_sb_deactive(sb);
2446} 2423}
2447 2424
2448const struct address_space_operations cifs_addr_ops = { 2425const struct address_space_operations cifs_addr_ops = {
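
This change retires the cifs_oplock_break_get()/cifs_oplock_break_put() reference dance (and the matching queue_work() dance in misc.c below) in favor of one cancel_work_sync() call in cifsFileInfo_put(): once the file is unlinked from the lists, the put path waits for any queued or still-running oplock_break work before the structure can be freed, so the work item no longer needs to pin the file. A condensed sketch of the resulting ordering:

    spin_lock(&cifs_file_list_lock);
    list_del(&cifs_file->flist);        /* no new oplock break can find us */
    spin_unlock(&cifs_file_list_lock);

    cancel_work_sync(&cifs_file->oplock_break); /* wait out a running worker */

    /* server-side close and kfree(cifs_file) are now safe */
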
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 9b018c8334fa..a7b2dcd4a53e 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -764,20 +764,10 @@ char *cifs_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb,
764 if (full_path == NULL) 764 if (full_path == NULL)
765 return full_path; 765 return full_path;
766 766
767 if (dfsplen) { 767 if (dfsplen)
768 strncpy(full_path, tcon->treeName, dfsplen); 768 strncpy(full_path, tcon->treeName, dfsplen);
769 /* switch slash direction in prepath depending on whether
770 * windows or posix style path names
771 */
772 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) {
773 int i;
774 for (i = 0; i < dfsplen; i++) {
775 if (full_path[i] == '\\')
776 full_path[i] = '/';
777 }
778 }
779 }
780 strncpy(full_path + dfsplen, vol->prepath, pplen); 769 strncpy(full_path + dfsplen, vol->prepath, pplen);
770 convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb));
781 full_path[dfsplen + pplen] = 0; /* add trailing null */ 771 full_path[dfsplen + pplen] = 0; /* add trailing null */
782 return full_path; 772 return full_path;
783} 773}
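
cifs_build_path_to_root() now normalizes separators with convert_delimiter() over the whole assembled path instead of hand-flipping slashes in the DFS prefix only. Assuming the helper keeps its usual cifs shape (replace every occurrence of the opposite separator in place), it is roughly:

    /* Sketch of the helper this hunk relies on. */
    static inline void convert_delimiter(char *path, char delim)
    {
            char old = (delim == '/') ? '\\' : '/';
            char *pos;

            for (pos = path; *pos; pos++)
                    if (*pos == old)
                            *pos = delim;
    }

    /* usage, as above: pick the separator the mount is configured for */
    convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb));
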
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 556b1a0b54de..db3f18cdf024 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -74,8 +74,14 @@ symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash)
74 cERROR(1, "%s: Could not init md5 shash\n", __func__); 74 cERROR(1, "%s: Could not init md5 shash\n", __func__);
75 goto symlink_hash_err; 75 goto symlink_hash_err;
76 } 76 }
77 crypto_shash_update(&sdescmd5->shash, link_str, link_len); 77 rc = crypto_shash_update(&sdescmd5->shash, link_str, link_len);
78 if (rc) {
79 cERROR(1, "%s: Could not update with link_str\n", __func__);
80 goto symlink_hash_err;
81 }
78 rc = crypto_shash_final(&sdescmd5->shash, md5_hash); 82 rc = crypto_shash_final(&sdescmd5->shash, md5_hash);
83 if (rc)
84 cERROR(1, "%s: Could not generate md5 hash\n", __func__);
79 85
80symlink_hash_err: 86symlink_hash_err:
81 crypto_free_shash(md5); 87 crypto_free_shash(md5);
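
This fix, like the matching one in smbencrypt.c below, applies the rule that every step of a shash computation can fail and must be checked, not just crypto_shash_final(). Pulled out of the cifs context, the full allocate/init/update/final pattern looks roughly like this self-contained helper (a sketch; descriptor handling details vary by kernel version):

    #include <crypto/hash.h>
    #include <linux/err.h>
    #include <linux/slab.h>

    static int md5_digest(const u8 *data, unsigned int len, u8 *out)
    {
            struct crypto_shash *md5;
            struct shash_desc *desc;
            int rc;

            md5 = crypto_alloc_shash("md5", 0, 0);
            if (IS_ERR(md5))
                    return PTR_ERR(md5);

            desc = kzalloc(sizeof(*desc) + crypto_shash_descsize(md5),
                           GFP_KERNEL);
            if (!desc) {
                    rc = -ENOMEM;
                    goto free_tfm;
            }
            desc->tfm = md5;

            rc = crypto_shash_init(desc);
            if (rc)
                    goto free_desc;
            rc = crypto_shash_update(desc, data, len); /* checked, unlike before */
            if (rc)
                    goto free_desc;
            rc = crypto_shash_final(desc, out);

    free_desc:
            kfree(desc);
    free_tfm:
            crypto_free_shash(md5);
            return rc;
    }
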
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 03a1f491d39b..7c1693392598 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -585,15 +585,8 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
585 585
586 cifs_set_oplock_level(pCifsInode, 586 cifs_set_oplock_level(pCifsInode,
587 pSMB->OplockLevel ? OPLOCK_READ : 0); 587 pSMB->OplockLevel ? OPLOCK_READ : 0);
588 /* 588 queue_work(system_nrt_wq,
589 * cifs_oplock_break_put() can't be called 589 &netfile->oplock_break);
590 * from here. Get reference after queueing
591 * succeeded. cifs_oplock_break() will
592 * synchronize using cifs_file_list_lock.
593 */
594 if (queue_work(system_nrt_wq,
595 &netfile->oplock_break))
596 cifs_oplock_break_get(netfile);
597 netfile->oplock_break_cancelled = false; 590 netfile->oplock_break_cancelled = false;
598 591
599 spin_unlock(&cifs_file_list_lock); 592 spin_unlock(&cifs_file_list_lock);
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 965a3af186a1..5de03ec20144 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -4,6 +4,7 @@
4 * Directory search handling 4 * Directory search handling
5 * 5 *
6 * Copyright (C) International Business Machines Corp., 2004, 2008 6 * Copyright (C) International Business Machines Corp., 2004, 2008
7 * Copyright (C) Red Hat, Inc., 2011
7 * Author(s): Steve French (sfrench@us.ibm.com) 8 * Author(s): Steve French (sfrench@us.ibm.com)
8 * 9 *
9 * This library is free software; you can redistribute it and/or modify 10 * This library is free software; you can redistribute it and/or modify
@@ -290,10 +291,10 @@ error_exit:
290} 291}
291 292
292/* return length of unicode string in bytes */ 293/* return length of unicode string in bytes */
293static int cifs_unicode_bytelen(char *str) 294static int cifs_unicode_bytelen(const char *str)
294{ 295{
295 int len; 296 int len;
296 __le16 *ustr = (__le16 *)str; 297 const __le16 *ustr = (const __le16 *)str;
297 298
298 for (len = 0; len <= PATH_MAX; len++) { 299 for (len = 0; len <= PATH_MAX; len++) {
299 if (ustr[len] == 0) 300 if (ustr[len] == 0)
@@ -334,78 +335,128 @@ static char *nxt_dir_entry(char *old_entry, char *end_of_smb, int level)
334 335
335} 336}
336 337
338struct cifs_dirent {
339 const char *name;
340 size_t namelen;
341 u32 resume_key;
342 u64 ino;
343};
344
345static void cifs_fill_dirent_unix(struct cifs_dirent *de,
346 const FILE_UNIX_INFO *info, bool is_unicode)
347{
348 de->name = &info->FileName[0];
349 if (is_unicode)
350 de->namelen = cifs_unicode_bytelen(de->name);
351 else
352 de->namelen = strnlen(de->name, PATH_MAX);
353 de->resume_key = info->ResumeKey;
354 de->ino = le64_to_cpu(info->basic.UniqueId);
355}
356
357static void cifs_fill_dirent_dir(struct cifs_dirent *de,
358 const FILE_DIRECTORY_INFO *info)
359{
360 de->name = &info->FileName[0];
361 de->namelen = le32_to_cpu(info->FileNameLength);
362 de->resume_key = info->FileIndex;
363}
364
365static void cifs_fill_dirent_full(struct cifs_dirent *de,
366 const FILE_FULL_DIRECTORY_INFO *info)
367{
368 de->name = &info->FileName[0];
369 de->namelen = le32_to_cpu(info->FileNameLength);
370 de->resume_key = info->FileIndex;
371}
372
373static void cifs_fill_dirent_search(struct cifs_dirent *de,
374 const SEARCH_ID_FULL_DIR_INFO *info)
375{
376 de->name = &info->FileName[0];
377 de->namelen = le32_to_cpu(info->FileNameLength);
378 de->resume_key = info->FileIndex;
379 de->ino = le64_to_cpu(info->UniqueId);
380}
381
382static void cifs_fill_dirent_both(struct cifs_dirent *de,
383 const FILE_BOTH_DIRECTORY_INFO *info)
384{
385 de->name = &info->FileName[0];
386 de->namelen = le32_to_cpu(info->FileNameLength);
387 de->resume_key = info->FileIndex;
388}
389
390static void cifs_fill_dirent_std(struct cifs_dirent *de,
391 const FIND_FILE_STANDARD_INFO *info)
392{
393 de->name = &info->FileName[0];
394 /* one byte length, no endianness conversion */
395 de->namelen = info->FileNameLength;
396 de->resume_key = info->ResumeKey;
397}
398
399static int cifs_fill_dirent(struct cifs_dirent *de, const void *info,
400 u16 level, bool is_unicode)
401{
402 memset(de, 0, sizeof(*de));
403
404 switch (level) {
405 case SMB_FIND_FILE_UNIX:
406 cifs_fill_dirent_unix(de, info, is_unicode);
407 break;
408 case SMB_FIND_FILE_DIRECTORY_INFO:
409 cifs_fill_dirent_dir(de, info);
410 break;
411 case SMB_FIND_FILE_FULL_DIRECTORY_INFO:
412 cifs_fill_dirent_full(de, info);
413 break;
414 case SMB_FIND_FILE_ID_FULL_DIR_INFO:
415 cifs_fill_dirent_search(de, info);
416 break;
417 case SMB_FIND_FILE_BOTH_DIRECTORY_INFO:
418 cifs_fill_dirent_both(de, info);
419 break;
420 case SMB_FIND_FILE_INFO_STANDARD:
421 cifs_fill_dirent_std(de, info);
422 break;
423 default:
424 cFYI(1, "Unknown findfirst level %d", level);
425 return -EINVAL;
426 }
427
428 return 0;
429}
430
337#define UNICODE_DOT cpu_to_le16(0x2e) 431#define UNICODE_DOT cpu_to_le16(0x2e)
338 432
339/* return 0 if no match and 1 for . (current directory) and 2 for .. (parent) */ 433/* return 0 if no match and 1 for . (current directory) and 2 for .. (parent) */
340static int cifs_entry_is_dot(char *current_entry, struct cifsFileInfo *cfile) 434static int cifs_entry_is_dot(struct cifs_dirent *de, bool is_unicode)
341{ 435{
342 int rc = 0; 436 int rc = 0;
343 char *filename = NULL;
344 int len = 0;
345
346 if (cfile->srch_inf.info_level == SMB_FIND_FILE_UNIX) {
347 FILE_UNIX_INFO *pFindData = (FILE_UNIX_INFO *)current_entry;
348 filename = &pFindData->FileName[0];
349 if (cfile->srch_inf.unicode) {
350 len = cifs_unicode_bytelen(filename);
351 } else {
352 /* BB should we make this strnlen of PATH_MAX? */
353 len = strnlen(filename, 5);
354 }
355 } else if (cfile->srch_inf.info_level == SMB_FIND_FILE_DIRECTORY_INFO) {
356 FILE_DIRECTORY_INFO *pFindData =
357 (FILE_DIRECTORY_INFO *)current_entry;
358 filename = &pFindData->FileName[0];
359 len = le32_to_cpu(pFindData->FileNameLength);
360 } else if (cfile->srch_inf.info_level ==
361 SMB_FIND_FILE_FULL_DIRECTORY_INFO) {
362 FILE_FULL_DIRECTORY_INFO *pFindData =
363 (FILE_FULL_DIRECTORY_INFO *)current_entry;
364 filename = &pFindData->FileName[0];
365 len = le32_to_cpu(pFindData->FileNameLength);
366 } else if (cfile->srch_inf.info_level ==
367 SMB_FIND_FILE_ID_FULL_DIR_INFO) {
368 SEARCH_ID_FULL_DIR_INFO *pFindData =
369 (SEARCH_ID_FULL_DIR_INFO *)current_entry;
370 filename = &pFindData->FileName[0];
371 len = le32_to_cpu(pFindData->FileNameLength);
372 } else if (cfile->srch_inf.info_level ==
373 SMB_FIND_FILE_BOTH_DIRECTORY_INFO) {
374 FILE_BOTH_DIRECTORY_INFO *pFindData =
375 (FILE_BOTH_DIRECTORY_INFO *)current_entry;
376 filename = &pFindData->FileName[0];
377 len = le32_to_cpu(pFindData->FileNameLength);
378 } else if (cfile->srch_inf.info_level == SMB_FIND_FILE_INFO_STANDARD) {
379 FIND_FILE_STANDARD_INFO *pFindData =
380 (FIND_FILE_STANDARD_INFO *)current_entry;
381 filename = &pFindData->FileName[0];
382 len = pFindData->FileNameLength;
383 } else {
384 cFYI(1, "Unknown findfirst level %d",
385 cfile->srch_inf.info_level);
386 }
387 437
388 if (filename) { 438 if (!de->name)
389 if (cfile->srch_inf.unicode) { 439 return 0;
390 __le16 *ufilename = (__le16 *)filename; 440
391 if (len == 2) { 441 if (is_unicode) {
392 /* check for . */ 442 __le16 *ufilename = (__le16 *)de->name;
393 if (ufilename[0] == UNICODE_DOT) 443 if (de->namelen == 2) {
394 rc = 1; 444 /* check for . */
395 } else if (len == 4) { 445 if (ufilename[0] == UNICODE_DOT)
396 /* check for .. */ 446 rc = 1;
397 if ((ufilename[0] == UNICODE_DOT) 447 } else if (de->namelen == 4) {
398 && (ufilename[1] == UNICODE_DOT)) 448 /* check for .. */
399 rc = 2; 449 if (ufilename[0] == UNICODE_DOT &&
400 } 450 ufilename[1] == UNICODE_DOT)
401 } else /* ASCII */ { 451 rc = 2;
402 if (len == 1) { 452 }
403 if (filename[0] == '.') 453 } else /* ASCII */ {
404 rc = 1; 454 if (de->namelen == 1) {
405 } else if (len == 2) { 455 if (de->name[0] == '.')
406 if ((filename[0] == '.') && (filename[1] == '.')) 456 rc = 1;
407 rc = 2; 457 } else if (de->namelen == 2) {
408 } 458 if (de->name[0] == '.' && de->name[1] == '.')
459 rc = 2;
409 } 460 }
410 } 461 }
411 462
@@ -427,66 +478,18 @@ static int is_dir_changed(struct file *file)
427} 478}
428 479
429static int cifs_save_resume_key(const char *current_entry, 480static int cifs_save_resume_key(const char *current_entry,
430 struct cifsFileInfo *cifsFile) 481 struct cifsFileInfo *file_info)
431{ 482{
432 int rc = 0; 483 struct cifs_dirent de;
433 unsigned int len = 0; 484 int rc;
434 __u16 level;
435 char *filename;
436
437 if ((cifsFile == NULL) || (current_entry == NULL))
438 return -EINVAL;
439
440 level = cifsFile->srch_inf.info_level;
441
442 if (level == SMB_FIND_FILE_UNIX) {
443 FILE_UNIX_INFO *pFindData = (FILE_UNIX_INFO *)current_entry;
444 485
445 filename = &pFindData->FileName[0]; 486 rc = cifs_fill_dirent(&de, current_entry, file_info->srch_inf.info_level,
446 if (cifsFile->srch_inf.unicode) { 487 file_info->srch_inf.unicode);
447 len = cifs_unicode_bytelen(filename); 488 if (!rc) {
448 } else { 489 file_info->srch_inf.presume_name = de.name;
449 /* BB should we make this strnlen of PATH_MAX? */ 490 file_info->srch_inf.resume_name_len = de.namelen;
450 len = strnlen(filename, PATH_MAX); 491 file_info->srch_inf.resume_key = de.resume_key;
451 }
452 cifsFile->srch_inf.resume_key = pFindData->ResumeKey;
453 } else if (level == SMB_FIND_FILE_DIRECTORY_INFO) {
454 FILE_DIRECTORY_INFO *pFindData =
455 (FILE_DIRECTORY_INFO *)current_entry;
456 filename = &pFindData->FileName[0];
457 len = le32_to_cpu(pFindData->FileNameLength);
458 cifsFile->srch_inf.resume_key = pFindData->FileIndex;
459 } else if (level == SMB_FIND_FILE_FULL_DIRECTORY_INFO) {
460 FILE_FULL_DIRECTORY_INFO *pFindData =
461 (FILE_FULL_DIRECTORY_INFO *)current_entry;
462 filename = &pFindData->FileName[0];
463 len = le32_to_cpu(pFindData->FileNameLength);
464 cifsFile->srch_inf.resume_key = pFindData->FileIndex;
465 } else if (level == SMB_FIND_FILE_ID_FULL_DIR_INFO) {
466 SEARCH_ID_FULL_DIR_INFO *pFindData =
467 (SEARCH_ID_FULL_DIR_INFO *)current_entry;
468 filename = &pFindData->FileName[0];
469 len = le32_to_cpu(pFindData->FileNameLength);
470 cifsFile->srch_inf.resume_key = pFindData->FileIndex;
471 } else if (level == SMB_FIND_FILE_BOTH_DIRECTORY_INFO) {
472 FILE_BOTH_DIRECTORY_INFO *pFindData =
473 (FILE_BOTH_DIRECTORY_INFO *)current_entry;
474 filename = &pFindData->FileName[0];
475 len = le32_to_cpu(pFindData->FileNameLength);
476 cifsFile->srch_inf.resume_key = pFindData->FileIndex;
477 } else if (level == SMB_FIND_FILE_INFO_STANDARD) {
478 FIND_FILE_STANDARD_INFO *pFindData =
479 (FIND_FILE_STANDARD_INFO *)current_entry;
480 filename = &pFindData->FileName[0];
481 /* one byte length, no name conversion */
482 len = (unsigned int)pFindData->FileNameLength;
483 cifsFile->srch_inf.resume_key = pFindData->ResumeKey;
484 } else {
485 cFYI(1, "Unknown findfirst level %d", level);
486 return -EINVAL;
487 } 492 }
488 cifsFile->srch_inf.resume_name_len = len;
489 cifsFile->srch_inf.presume_name = filename;
490 return rc; 493 return rc;
491} 494}
492 495
@@ -605,136 +608,70 @@ static int find_cifs_entry(const int xid, struct cifs_tcon *pTcon,
605 return rc; 608 return rc;
606} 609}
607 610
608/* inode num, inode type and filename returned */ 611static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir,
609static int cifs_get_name_from_search_buf(struct qstr *pqst, 612 void *dirent, char *scratch_buf, unsigned int max_len)
610 char *current_entry, __u16 level, unsigned int unicode,
611 struct cifs_sb_info *cifs_sb, unsigned int max_len, __u64 *pinum)
612{ 613{
614 struct cifsFileInfo *file_info = file->private_data;
615 struct super_block *sb = file->f_path.dentry->d_sb;
616 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
617 struct cifs_dirent de = { NULL, };
618 struct cifs_fattr fattr;
619 struct dentry *dentry;
620 struct qstr name;
613 int rc = 0; 621 int rc = 0;
614 unsigned int len = 0; 622 ino_t ino;
615 char *filename;
616 struct nls_table *nlt = cifs_sb->local_nls;
617
618 *pinum = 0;
619
620 if (level == SMB_FIND_FILE_UNIX) {
621 FILE_UNIX_INFO *pFindData = (FILE_UNIX_INFO *)current_entry;
622
623 filename = &pFindData->FileName[0];
624 if (unicode) {
625 len = cifs_unicode_bytelen(filename);
626 } else {
627 /* BB should we make this strnlen of PATH_MAX? */
628 len = strnlen(filename, PATH_MAX);
629 }
630 623
631 *pinum = le64_to_cpu(pFindData->basic.UniqueId); 624 rc = cifs_fill_dirent(&de, find_entry, file_info->srch_inf.info_level,
632 } else if (level == SMB_FIND_FILE_DIRECTORY_INFO) { 625 file_info->srch_inf.unicode);
633 FILE_DIRECTORY_INFO *pFindData = 626 if (rc)
634 (FILE_DIRECTORY_INFO *)current_entry; 627 return rc;
635 filename = &pFindData->FileName[0];
636 len = le32_to_cpu(pFindData->FileNameLength);
637 } else if (level == SMB_FIND_FILE_FULL_DIRECTORY_INFO) {
638 FILE_FULL_DIRECTORY_INFO *pFindData =
639 (FILE_FULL_DIRECTORY_INFO *)current_entry;
640 filename = &pFindData->FileName[0];
641 len = le32_to_cpu(pFindData->FileNameLength);
642 } else if (level == SMB_FIND_FILE_ID_FULL_DIR_INFO) {
643 SEARCH_ID_FULL_DIR_INFO *pFindData =
644 (SEARCH_ID_FULL_DIR_INFO *)current_entry;
645 filename = &pFindData->FileName[0];
646 len = le32_to_cpu(pFindData->FileNameLength);
647 *pinum = le64_to_cpu(pFindData->UniqueId);
648 } else if (level == SMB_FIND_FILE_BOTH_DIRECTORY_INFO) {
649 FILE_BOTH_DIRECTORY_INFO *pFindData =
650 (FILE_BOTH_DIRECTORY_INFO *)current_entry;
651 filename = &pFindData->FileName[0];
652 len = le32_to_cpu(pFindData->FileNameLength);
653 } else if (level == SMB_FIND_FILE_INFO_STANDARD) {
654 FIND_FILE_STANDARD_INFO *pFindData =
655 (FIND_FILE_STANDARD_INFO *)current_entry;
656 filename = &pFindData->FileName[0];
657 /* one byte length, no name conversion */
658 len = (unsigned int)pFindData->FileNameLength;
659 } else {
660 cFYI(1, "Unknown findfirst level %d", level);
661 return -EINVAL;
662 }
663 628
664 if (len > max_len) { 629 if (de.namelen > max_len) {
665 cERROR(1, "bad search response length %d past smb end", len); 630 cERROR(1, "bad search response length %zd past smb end",
631 de.namelen);
666 return -EINVAL; 632 return -EINVAL;
667 } 633 }
668 634
669 if (unicode) {
670 pqst->len = cifs_from_ucs2((char *) pqst->name,
671 (__le16 *) filename,
672 UNICODE_NAME_MAX,
673 min(len, max_len), nlt,
674 cifs_sb->mnt_cifs_flags &
675 CIFS_MOUNT_MAP_SPECIAL_CHR);
676 pqst->len -= nls_nullsize(nlt);
677 } else {
678 pqst->name = filename;
679 pqst->len = len;
680 }
681 return rc;
682}
683
684static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir,
685 void *direntry, char *scratch_buf, unsigned int max_len)
686{
687 int rc = 0;
688 struct qstr qstring;
689 struct cifsFileInfo *pCifsF;
690 u64 inum;
691 ino_t ino;
692 struct super_block *sb;
693 struct cifs_sb_info *cifs_sb;
694 struct dentry *tmp_dentry;
695 struct cifs_fattr fattr;
696
697 /* get filename and len into qstring */
698 /* get dentry */
699 /* decide whether to create and populate ionde */
700 if ((direntry == NULL) || (file == NULL))
701 return -EINVAL;
702
703 pCifsF = file->private_data;
704
705 if ((scratch_buf == NULL) || (pfindEntry == NULL) || (pCifsF == NULL))
706 return -ENOENT;
707
708 rc = cifs_entry_is_dot(pfindEntry, pCifsF);
709 /* skip . and .. since we added them first */ 635 /* skip . and .. since we added them first */
710 if (rc != 0) 636 if (cifs_entry_is_dot(&de, file_info->srch_inf.unicode))
711 return 0; 637 return 0;
712 638
713 sb = file->f_path.dentry->d_sb; 639 if (file_info->srch_inf.unicode) {
714 cifs_sb = CIFS_SB(sb); 640 struct nls_table *nlt = cifs_sb->local_nls;
715
716 qstring.name = scratch_buf;
717 rc = cifs_get_name_from_search_buf(&qstring, pfindEntry,
718 pCifsF->srch_inf.info_level,
719 pCifsF->srch_inf.unicode, cifs_sb,
720 max_len, &inum /* returned */);
721 641
722 if (rc) 642 name.name = scratch_buf;
723 return rc; 643 name.len =
644 cifs_from_ucs2((char *)name.name, (__le16 *)de.name,
645 UNICODE_NAME_MAX,
646 min(de.namelen, (size_t)max_len), nlt,
647 cifs_sb->mnt_cifs_flags &
648 CIFS_MOUNT_MAP_SPECIAL_CHR);
649 name.len -= nls_nullsize(nlt);
650 } else {
651 name.name = de.name;
652 name.len = de.namelen;
653 }
724 654
725 if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_UNIX) 655 switch (file_info->srch_inf.info_level) {
656 case SMB_FIND_FILE_UNIX:
726 cifs_unix_basic_to_fattr(&fattr, 657 cifs_unix_basic_to_fattr(&fattr,
727 &((FILE_UNIX_INFO *) pfindEntry)->basic, 658 &((FILE_UNIX_INFO *)find_entry)->basic,
728 cifs_sb); 659 cifs_sb);
729 else if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_INFO_STANDARD) 660 break;
730 cifs_std_info_to_fattr(&fattr, (FIND_FILE_STANDARD_INFO *) 661 case SMB_FIND_FILE_INFO_STANDARD:
731 pfindEntry, cifs_sb); 662 cifs_std_info_to_fattr(&fattr,
732 else 663 (FIND_FILE_STANDARD_INFO *)find_entry,
733 cifs_dir_info_to_fattr(&fattr, (FILE_DIRECTORY_INFO *) 664 cifs_sb);
734 pfindEntry, cifs_sb); 665 break;
666 default:
667 cifs_dir_info_to_fattr(&fattr,
668 (FILE_DIRECTORY_INFO *)find_entry,
669 cifs_sb);
670 break;
671 }
735 672
736 if (inum && (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) { 673 if (de.ino && (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) {
737 fattr.cf_uniqueid = inum; 674 fattr.cf_uniqueid = de.ino;
738 } else { 675 } else {
739 fattr.cf_uniqueid = iunique(sb, ROOT_I); 676 fattr.cf_uniqueid = iunique(sb, ROOT_I);
740 cifs_autodisable_serverino(cifs_sb); 677 cifs_autodisable_serverino(cifs_sb);
@@ -750,12 +687,12 @@ static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir,
750 fattr.cf_flags |= CIFS_FATTR_NEED_REVAL; 687 fattr.cf_flags |= CIFS_FATTR_NEED_REVAL;
751 688
752 ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid); 689 ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid);
753 tmp_dentry = cifs_readdir_lookup(file->f_dentry, &qstring, &fattr); 690 dentry = cifs_readdir_lookup(file->f_dentry, &name, &fattr);
754 691
755 rc = filldir(direntry, qstring.name, qstring.len, file->f_pos, 692 rc = filldir(dirent, name.name, name.len, file->f_pos, ino,
756 ino, fattr.cf_dtype); 693 fattr.cf_dtype);
757 694
758 dput(tmp_dentry); 695 dput(dentry);
759 return rc; 696 return rc;
760} 697}
761 698
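
The net effect of the readdir rewrite is that three near-identical info-level ladders collapse into one normalized struct cifs_dirent plus a single cifs_fill_dirent() dispatcher, so cifs_entry_is_dot(), cifs_save_resume_key() and cifs_filldir() all decode search entries the same way. Typical use, following the new call sites:

    struct cifs_dirent de;
    int rc;

    rc = cifs_fill_dirent(&de, find_entry, file_info->srch_inf.info_level,
                          file_info->srch_inf.unicode);
    if (rc)
            return rc;          /* unknown findfirst level */

    /* de.name, de.namelen, de.resume_key and de.ino are level-independent */
    if (de.namelen > max_len)
            return -EINVAL;     /* name would run past the end of the SMB */
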
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index 1c5b770c3141..42b9fff48751 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -157,8 +157,14 @@ mdfour(unsigned char *md4_hash, unsigned char *link_str, int link_len)
157 cERROR(1, "%s: Could not init md4 shash\n", __func__); 157 cERROR(1, "%s: Could not init md4 shash\n", __func__);
158 goto mdfour_err; 158 goto mdfour_err;
159 } 159 }
160 crypto_shash_update(&sdescmd4->shash, link_str, link_len); 160 rc = crypto_shash_update(&sdescmd4->shash, link_str, link_len);
161 if (rc) {
162 cERROR(1, "%s: Could not update with link_str\n", __func__);
163 goto mdfour_err;
164 }
161 rc = crypto_shash_final(&sdescmd4->shash, md4_hash); 165 rc = crypto_shash_final(&sdescmd4->shash, md4_hash);
166 if (rc)
167 cERROR(1, "%s: Could not generate md4 hash\n", __func__);
162 168
163mdfour_err: 169mdfour_err:
164 crypto_free_shash(md4); 170 crypto_free_shash(md4);
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 147aa22c3c3a..10ca6b2c26b7 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -266,15 +266,11 @@ static int wait_for_free_request(struct TCP_Server_Info *server,
266 while (1) { 266 while (1) {
267 if (atomic_read(&server->inFlight) >= cifs_max_pending) { 267 if (atomic_read(&server->inFlight) >= cifs_max_pending) {
268 spin_unlock(&GlobalMid_Lock); 268 spin_unlock(&GlobalMid_Lock);
269#ifdef CONFIG_CIFS_STATS2 269 cifs_num_waiters_inc(server);
270 atomic_inc(&server->num_waiters);
271#endif
272 wait_event(server->request_q, 270 wait_event(server->request_q,
273 atomic_read(&server->inFlight) 271 atomic_read(&server->inFlight)
274 < cifs_max_pending); 272 < cifs_max_pending);
275#ifdef CONFIG_CIFS_STATS2 273 cifs_num_waiters_dec(server);
276 atomic_dec(&server->num_waiters);
277#endif
278 spin_lock(&GlobalMid_Lock); 274 spin_lock(&GlobalMid_Lock);
279 } else { 275 } else {
280 if (server->tcpStatus == CifsExiting) { 276 if (server->tcpStatus == CifsExiting) {
@@ -362,6 +358,8 @@ cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov,
362 mid = AllocMidQEntry(hdr, server); 358 mid = AllocMidQEntry(hdr, server);
363 if (mid == NULL) { 359 if (mid == NULL) {
364 mutex_unlock(&server->srv_mutex); 360 mutex_unlock(&server->srv_mutex);
361 atomic_dec(&server->inFlight);
362 wake_up(&server->request_q);
365 return -ENOMEM; 363 return -ENOMEM;
366 } 364 }
367 365
@@ -379,15 +377,13 @@ cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov,
379 mid->callback = callback; 377 mid->callback = callback;
380 mid->callback_data = cbdata; 378 mid->callback_data = cbdata;
381 mid->midState = MID_REQUEST_SUBMITTED; 379 mid->midState = MID_REQUEST_SUBMITTED;
382#ifdef CONFIG_CIFS_STATS2 380
383 atomic_inc(&server->inSend); 381 cifs_in_send_inc(server);
384#endif
385 rc = smb_sendv(server, iov, nvec); 382 rc = smb_sendv(server, iov, nvec);
386#ifdef CONFIG_CIFS_STATS2 383 cifs_in_send_dec(server);
387 atomic_dec(&server->inSend); 384 cifs_save_when_sent(mid);
388 mid->when_sent = jiffies;
389#endif
390 mutex_unlock(&server->srv_mutex); 385 mutex_unlock(&server->srv_mutex);
386
391 if (rc) 387 if (rc)
392 goto out_err; 388 goto out_err;
393 389
@@ -573,14 +569,10 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses,
573 } 569 }
574 570
575 midQ->midState = MID_REQUEST_SUBMITTED; 571 midQ->midState = MID_REQUEST_SUBMITTED;
576#ifdef CONFIG_CIFS_STATS2 572 cifs_in_send_inc(ses->server);
577 atomic_inc(&ses->server->inSend);
578#endif
579 rc = smb_sendv(ses->server, iov, n_vec); 573 rc = smb_sendv(ses->server, iov, n_vec);
580#ifdef CONFIG_CIFS_STATS2 574 cifs_in_send_dec(ses->server);
581 atomic_dec(&ses->server->inSend); 575 cifs_save_when_sent(midQ);
582 midQ->when_sent = jiffies;
583#endif
584 576
585 mutex_unlock(&ses->server->srv_mutex); 577 mutex_unlock(&ses->server->srv_mutex);
586 578
@@ -701,14 +693,11 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
701 } 693 }
702 694
703 midQ->midState = MID_REQUEST_SUBMITTED; 695 midQ->midState = MID_REQUEST_SUBMITTED;
704#ifdef CONFIG_CIFS_STATS2 696
705 atomic_inc(&ses->server->inSend); 697 cifs_in_send_inc(ses->server);
706#endif
707 rc = smb_send(ses->server, in_buf, be32_to_cpu(in_buf->smb_buf_length)); 698 rc = smb_send(ses->server, in_buf, be32_to_cpu(in_buf->smb_buf_length));
708#ifdef CONFIG_CIFS_STATS2 699 cifs_in_send_dec(ses->server);
709 atomic_dec(&ses->server->inSend); 700 cifs_save_when_sent(midQ);
710 midQ->when_sent = jiffies;
711#endif
712 mutex_unlock(&ses->server->srv_mutex); 701 mutex_unlock(&ses->server->srv_mutex);
713 702
714 if (rc < 0) 703 if (rc < 0)
@@ -841,14 +830,10 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
841 } 830 }
842 831
843 midQ->midState = MID_REQUEST_SUBMITTED; 832 midQ->midState = MID_REQUEST_SUBMITTED;
844#ifdef CONFIG_CIFS_STATS2 833 cifs_in_send_inc(ses->server);
845 atomic_inc(&ses->server->inSend);
846#endif
847 rc = smb_send(ses->server, in_buf, be32_to_cpu(in_buf->smb_buf_length)); 834 rc = smb_send(ses->server, in_buf, be32_to_cpu(in_buf->smb_buf_length));
848#ifdef CONFIG_CIFS_STATS2 835 cifs_in_send_dec(ses->server);
849 atomic_dec(&ses->server->inSend); 836 cifs_save_when_sent(midQ);
850 midQ->when_sent = jiffies;
851#endif
852 mutex_unlock(&ses->server->srv_mutex); 837 mutex_unlock(&ses->server->srv_mutex);
853 838
854 if (rc < 0) { 839 if (rc < 0) {
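
Each transport.c hunk swaps a repeated #ifdef CONFIG_CIFS_STATS2 block for the cifs_num_waiters_inc/dec, cifs_in_send_inc/dec and cifs_save_when_sent helpers, keeping the hot paths ifdef-free. The helpers are defined elsewhere in the series; one plausible shape, mirroring the code removed here, is a set of static inlines that compile away when the option is off:

    #ifdef CONFIG_CIFS_STATS2
    static inline void cifs_in_send_inc(struct TCP_Server_Info *server)
    {
            atomic_inc(&server->inSend);
    }
    static inline void cifs_in_send_dec(struct TCP_Server_Info *server)
    {
            atomic_dec(&server->inSend);
    }
    static inline void cifs_save_when_sent(struct mid_q_entry *mid)
    {
            mid->when_sent = jiffies;
    }
    #else
    static inline void cifs_in_send_inc(struct TCP_Server_Info *server) {}
    static inline void cifs_in_send_dec(struct TCP_Server_Info *server) {}
    static inline void cifs_save_when_sent(struct mid_q_entry *mid) {}
    #endif
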
diff --git a/fs/compat.c b/fs/compat.c
index 0b48d018e38a..58b1da459893 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1675,11 +1675,6 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds,
1675} 1675}
1676#endif /* HAVE_SET_RESTORE_SIGMASK */ 1676#endif /* HAVE_SET_RESTORE_SIGMASK */
1677 1677
1678long asmlinkage compat_sys_nfsservctl(int cmd, void *notused, void *notused2)
1679{
1680 return sys_ni_syscall();
1681}
1682
1683#ifdef CONFIG_EPOLL 1678#ifdef CONFIG_EPOLL
1684 1679
1685#ifdef HAVE_SET_RESTORE_SIGMASK 1680#ifdef HAVE_SET_RESTORE_SIGMASK
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 8be086e9abe4..51352de88ef1 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1003,6 +1003,7 @@ COMPATIBLE_IOCTL(PPPIOCCONNECT)
1003COMPATIBLE_IOCTL(PPPIOCDISCONN) 1003COMPATIBLE_IOCTL(PPPIOCDISCONN)
1004COMPATIBLE_IOCTL(PPPIOCATTCHAN) 1004COMPATIBLE_IOCTL(PPPIOCATTCHAN)
1005COMPATIBLE_IOCTL(PPPIOCGCHAN) 1005COMPATIBLE_IOCTL(PPPIOCGCHAN)
1006COMPATIBLE_IOCTL(PPPIOCGL2TPSTATS)
1006/* PPPOX */ 1007/* PPPOX */
1007COMPATIBLE_IOCTL(PPPOEIOCSFWD) 1008COMPATIBLE_IOCTL(PPPOEIOCSFWD)
1008COMPATIBLE_IOCTL(PPPOEIOCDFWD) 1009COMPATIBLE_IOCTL(PPPOEIOCDFWD)
diff --git a/fs/dcache.c b/fs/dcache.c
index be18598c7fd7..a88948b8bd17 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -301,6 +301,27 @@ static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent)
301 return parent; 301 return parent;
302} 302}
303 303
304/*
305 * Unhash a dentry without inserting an RCU walk barrier or checking that
306 * dentry->d_lock is locked. The caller must take care of that, if
307 * appropriate.
308 */
309static void __d_shrink(struct dentry *dentry)
310{
311 if (!d_unhashed(dentry)) {
312 struct hlist_bl_head *b;
313 if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED))
314 b = &dentry->d_sb->s_anon;
315 else
316 b = d_hash(dentry->d_parent, dentry->d_name.hash);
317
318 hlist_bl_lock(b);
319 __hlist_bl_del(&dentry->d_hash);
320 dentry->d_hash.pprev = NULL;
321 hlist_bl_unlock(b);
322 }
323}
324
304/** 325/**
305 * d_drop - drop a dentry 326 * d_drop - drop a dentry
306 * @dentry: dentry to drop 327 * @dentry: dentry to drop
@@ -319,17 +340,7 @@ static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent)
319void __d_drop(struct dentry *dentry) 340void __d_drop(struct dentry *dentry)
320{ 341{
321 if (!d_unhashed(dentry)) { 342 if (!d_unhashed(dentry)) {
322 struct hlist_bl_head *b; 343 __d_shrink(dentry);
323 if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED))
324 b = &dentry->d_sb->s_anon;
325 else
326 b = d_hash(dentry->d_parent, dentry->d_name.hash);
327
328 hlist_bl_lock(b);
329 __hlist_bl_del(&dentry->d_hash);
330 dentry->d_hash.pprev = NULL;
331 hlist_bl_unlock(b);
332
333 dentry_rcuwalk_barrier(dentry); 344 dentry_rcuwalk_barrier(dentry);
334 } 345 }
335} 346}
@@ -784,6 +795,7 @@ relock:
784 795
785/** 796/**
786 * prune_dcache_sb - shrink the dcache 797 * prune_dcache_sb - shrink the dcache
798 * @sb: superblock
787 * @nr_to_scan: number of entries to try to free 799 * @nr_to_scan: number of entries to try to free
788 * 800 *
789 * Attempt to shrink the superblock dcache LRU by @nr_to_scan entries. This is 801 * Attempt to shrink the superblock dcache LRU by @nr_to_scan entries. This is
@@ -828,44 +840,24 @@ EXPORT_SYMBOL(shrink_dcache_sb);
828static void shrink_dcache_for_umount_subtree(struct dentry *dentry) 840static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
829{ 841{
830 struct dentry *parent; 842 struct dentry *parent;
831 unsigned detached = 0;
832 843
833 BUG_ON(!IS_ROOT(dentry)); 844 BUG_ON(!IS_ROOT(dentry));
834 845
835 /* detach this root from the system */
836 spin_lock(&dentry->d_lock);
837 dentry_lru_del(dentry);
838 __d_drop(dentry);
839 spin_unlock(&dentry->d_lock);
840
841 for (;;) { 846 for (;;) {
842 /* descend to the first leaf in the current subtree */ 847 /* descend to the first leaf in the current subtree */
843 while (!list_empty(&dentry->d_subdirs)) { 848 while (!list_empty(&dentry->d_subdirs))
844 struct dentry *loop;
845
846 /* this is a branch with children - detach all of them
847 * from the system in one go */
848 spin_lock(&dentry->d_lock);
849 list_for_each_entry(loop, &dentry->d_subdirs,
850 d_u.d_child) {
851 spin_lock_nested(&loop->d_lock,
852 DENTRY_D_LOCK_NESTED);
853 dentry_lru_del(loop);
854 __d_drop(loop);
855 spin_unlock(&loop->d_lock);
856 }
857 spin_unlock(&dentry->d_lock);
858
859 /* move to the first child */
860 dentry = list_entry(dentry->d_subdirs.next, 849 dentry = list_entry(dentry->d_subdirs.next,
861 struct dentry, d_u.d_child); 850 struct dentry, d_u.d_child);
862 }
863 851
864 /* consume the dentries from this leaf up through its parents 852 /* consume the dentries from this leaf up through its parents
865 * until we find one with children or run out altogether */ 853 * until we find one with children or run out altogether */
866 do { 854 do {
867 struct inode *inode; 855 struct inode *inode;
868 856
857 /* detach from the system */
858 dentry_lru_del(dentry);
859 __d_shrink(dentry);
860
869 if (dentry->d_count != 0) { 861 if (dentry->d_count != 0) {
870 printk(KERN_ERR 862 printk(KERN_ERR
871 "BUG: Dentry %p{i=%lx,n=%s}" 863 "BUG: Dentry %p{i=%lx,n=%s}"
@@ -886,14 +878,10 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
886 list_del(&dentry->d_u.d_child); 878 list_del(&dentry->d_u.d_child);
887 } else { 879 } else {
888 parent = dentry->d_parent; 880 parent = dentry->d_parent;
889 spin_lock(&parent->d_lock);
890 parent->d_count--; 881 parent->d_count--;
891 list_del(&dentry->d_u.d_child); 882 list_del(&dentry->d_u.d_child);
892 spin_unlock(&parent->d_lock);
893 } 883 }
894 884
895 detached++;
896
897 inode = dentry->d_inode; 885 inode = dentry->d_inode;
898 if (inode) { 886 if (inode) {
899 dentry->d_inode = NULL; 887 dentry->d_inode = NULL;
@@ -938,9 +926,7 @@ void shrink_dcache_for_umount(struct super_block *sb)
938 926
939 dentry = sb->s_root; 927 dentry = sb->s_root;
940 sb->s_root = NULL; 928 sb->s_root = NULL;
941 spin_lock(&dentry->d_lock);
942 dentry->d_count--; 929 dentry->d_count--;
943 spin_unlock(&dentry->d_lock);
944 shrink_dcache_for_umount_subtree(dentry); 930 shrink_dcache_for_umount_subtree(dentry);
945 931
946 while (!hlist_bl_empty(&sb->s_anon)) { 932 while (!hlist_bl_empty(&sb->s_anon)) {
@@ -1743,7 +1729,7 @@ seqretry:
1743 */ 1729 */
1744 if (read_seqcount_retry(&dentry->d_seq, *seq)) 1730 if (read_seqcount_retry(&dentry->d_seq, *seq))
1745 goto seqretry; 1731 goto seqretry;
1746 if (parent->d_flags & DCACHE_OP_COMPARE) { 1732 if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) {
1747 if (parent->d_op->d_compare(parent, *inode, 1733 if (parent->d_op->d_compare(parent, *inode,
1748 dentry, i, 1734 dentry, i,
1749 tlen, tname, name)) 1735 tlen, tname, name))
@@ -2138,8 +2124,9 @@ static void dentry_unlock_parents_for_move(struct dentry *dentry,
2138 * @target: new dentry 2124 * @target: new dentry
2139 * 2125 *
2140 * Update the dcache to reflect the move of a file name. Negative 2126 * Update the dcache to reflect the move of a file name. Negative
2141 * dcache entries should not be moved in this way. Caller hold 2127 * dcache entries should not be moved in this way. Caller must hold
2142 * rename_lock. 2128 * rename_lock, the i_mutex of the source and target directories,
2129 * and the sb->s_vfs_rename_mutex if they differ. See lock_rename().
2143 */ 2130 */
2144static void __d_move(struct dentry * dentry, struct dentry * target) 2131static void __d_move(struct dentry * dentry, struct dentry * target)
2145{ 2132{
@@ -2202,7 +2189,8 @@ static void __d_move(struct dentry * dentry, struct dentry * target)
2202 * @target: new dentry 2189 * @target: new dentry
2203 * 2190 *
2204 * Update the dcache to reflect the move of a file name. Negative 2191 * Update the dcache to reflect the move of a file name. Negative
2205 * dcache entries should not be moved in this way. 2192 * dcache entries should not be moved in this way. See the locking
2193 * requirements for __d_move.
2206 */ 2194 */
2207void d_move(struct dentry *dentry, struct dentry *target) 2195void d_move(struct dentry *dentry, struct dentry *target)
2208{ 2196{
@@ -2320,7 +2308,8 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
2320 * @inode: inode to bind to the dentry, to which aliases may be attached 2308 * @inode: inode to bind to the dentry, to which aliases may be attached
2321 * 2309 *
2322 * Introduces a dentry into the tree, substituting an extant disconnected 2310
2323 * root directory alias in its place if there is one 2311 * root directory alias in its place if there is one. Caller must hold the
2312 * i_mutex of the parent directory.
2324 */ 2313 */
2325struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode) 2314struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
2326{ 2315{
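
The expanded kerneldoc makes the implicit locking contract explicit: d_move() callers must hold the i_mutex of both parent directories and, when the parents differ, the per-sb s_vfs_rename_mutex, which is exactly what lock_rename() takes. A sketch of the expected calling convention:

    struct dentry *trap;

    trap = lock_rename(new_dir, old_dir); /* i_mutexes + s_vfs_rename_mutex */
    /* revalidate; bail out if old_dentry or new_dentry equals trap */
    d_move(old_dentry, new_dentry);
    unlock_rename(new_dir, old_dir);
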
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 01d2d9ef609c..44a360ca8046 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -35,7 +35,7 @@
35#include <linux/buffer_head.h> 35#include <linux/buffer_head.h>
36#include <linux/rwsem.h> 36#include <linux/rwsem.h>
37#include <linux/uio.h> 37#include <linux/uio.h>
38#include <asm/atomic.h> 38#include <linux/atomic.h>
39 39
40/* 40/*
41 * How many user pages to map in one call to get_user_pages(). This determines 41 * How many user pages to map in one call to get_user_pages(). This determines
diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig
index 1cd6d9d3e29a..cc16562654de 100644
--- a/fs/ecryptfs/Kconfig
+++ b/fs/ecryptfs/Kconfig
@@ -1,6 +1,6 @@
1config ECRYPT_FS 1config ECRYPT_FS
2 tristate "eCrypt filesystem layer support (EXPERIMENTAL)" 2 tristate "eCrypt filesystem layer support (EXPERIMENTAL)"
3 depends on EXPERIMENTAL && KEYS && CRYPTO 3 depends on EXPERIMENTAL && KEYS && CRYPTO && (ENCRYPTED_KEYS || ENCRYPTED_KEYS=n)
4 select CRYPTO_ECB 4 select CRYPTO_ECB
5 select CRYPTO_CBC 5 select CRYPTO_CBC
6 select CRYPTO_MD5 6 select CRYPTO_MD5
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 43c7c43b06f5..b36c5572b3f3 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -29,6 +29,7 @@
29#define ECRYPTFS_KERNEL_H 29#define ECRYPTFS_KERNEL_H
30 30
31#include <keys/user-type.h> 31#include <keys/user-type.h>
32#include <keys/encrypted-type.h>
32#include <linux/fs.h> 33#include <linux/fs.h>
33#include <linux/fs_stack.h> 34#include <linux/fs_stack.h>
34#include <linux/namei.h> 35#include <linux/namei.h>
@@ -36,125 +37,18 @@
36#include <linux/hash.h> 37#include <linux/hash.h>
37#include <linux/nsproxy.h> 38#include <linux/nsproxy.h>
38#include <linux/backing-dev.h> 39#include <linux/backing-dev.h>
40#include <linux/ecryptfs.h>
39 41
40/* Version verification for shared data structures w/ userspace */
41#define ECRYPTFS_VERSION_MAJOR 0x00
42#define ECRYPTFS_VERSION_MINOR 0x04
43#define ECRYPTFS_SUPPORTED_FILE_VERSION 0x03
44/* These flags indicate which features are supported by the kernel
45 * module; userspace tools such as the mount helper read
46 * ECRYPTFS_VERSIONING_MASK from a sysfs handle in order to determine
47 * how to behave. */
48#define ECRYPTFS_VERSIONING_PASSPHRASE 0x00000001
49#define ECRYPTFS_VERSIONING_PUBKEY 0x00000002
50#define ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH 0x00000004
51#define ECRYPTFS_VERSIONING_POLICY 0x00000008
52#define ECRYPTFS_VERSIONING_XATTR 0x00000010
53#define ECRYPTFS_VERSIONING_MULTKEY 0x00000020
54#define ECRYPTFS_VERSIONING_DEVMISC 0x00000040
55#define ECRYPTFS_VERSIONING_HMAC 0x00000080
56#define ECRYPTFS_VERSIONING_FILENAME_ENCRYPTION 0x00000100
57#define ECRYPTFS_VERSIONING_GCM 0x00000200
58#define ECRYPTFS_VERSIONING_MASK (ECRYPTFS_VERSIONING_PASSPHRASE \
59 | ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH \
60 | ECRYPTFS_VERSIONING_PUBKEY \
61 | ECRYPTFS_VERSIONING_XATTR \
62 | ECRYPTFS_VERSIONING_MULTKEY \
63 | ECRYPTFS_VERSIONING_DEVMISC \
64 | ECRYPTFS_VERSIONING_FILENAME_ENCRYPTION)
65#define ECRYPTFS_MAX_PASSWORD_LENGTH 64
66#define ECRYPTFS_MAX_PASSPHRASE_BYTES ECRYPTFS_MAX_PASSWORD_LENGTH
67#define ECRYPTFS_SALT_SIZE 8
68#define ECRYPTFS_SALT_SIZE_HEX (ECRYPTFS_SALT_SIZE*2)
69/* The original signature size is only for what is stored on disk; all
70 * in-memory representations are expanded hex, so it better adapted to
71 * be passed around or referenced on the command line */
72#define ECRYPTFS_SIG_SIZE 8
73#define ECRYPTFS_SIG_SIZE_HEX (ECRYPTFS_SIG_SIZE*2)
74#define ECRYPTFS_PASSWORD_SIG_SIZE ECRYPTFS_SIG_SIZE_HEX
75#define ECRYPTFS_MAX_KEY_BYTES 64
76#define ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES 512
77#define ECRYPTFS_DEFAULT_IV_BYTES 16 42#define ECRYPTFS_DEFAULT_IV_BYTES 16
78#define ECRYPTFS_FILE_VERSION 0x03
79#define ECRYPTFS_DEFAULT_EXTENT_SIZE 4096 43#define ECRYPTFS_DEFAULT_EXTENT_SIZE 4096
80#define ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE 8192 44#define ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE 8192
81#define ECRYPTFS_DEFAULT_MSG_CTX_ELEMS 32 45#define ECRYPTFS_DEFAULT_MSG_CTX_ELEMS 32
82#define ECRYPTFS_DEFAULT_SEND_TIMEOUT HZ 46#define ECRYPTFS_DEFAULT_SEND_TIMEOUT HZ
83#define ECRYPTFS_MAX_MSG_CTX_TTL (HZ*3) 47#define ECRYPTFS_MAX_MSG_CTX_TTL (HZ*3)
84#define ECRYPTFS_MAX_PKI_NAME_BYTES 16
85#define ECRYPTFS_DEFAULT_NUM_USERS 4 48#define ECRYPTFS_DEFAULT_NUM_USERS 4
86#define ECRYPTFS_MAX_NUM_USERS 32768 49#define ECRYPTFS_MAX_NUM_USERS 32768
87#define ECRYPTFS_XATTR_NAME "user.ecryptfs" 50#define ECRYPTFS_XATTR_NAME "user.ecryptfs"
88 51
89#define RFC2440_CIPHER_DES3_EDE 0x02
90#define RFC2440_CIPHER_CAST_5 0x03
91#define RFC2440_CIPHER_BLOWFISH 0x04
92#define RFC2440_CIPHER_AES_128 0x07
93#define RFC2440_CIPHER_AES_192 0x08
94#define RFC2440_CIPHER_AES_256 0x09
95#define RFC2440_CIPHER_TWOFISH 0x0a
96#define RFC2440_CIPHER_CAST_6 0x0b
97
98#define RFC2440_CIPHER_RSA 0x01
99
100/**
101 * For convenience, we may need to pass around the encrypted session
102 * key between kernel and userspace because the authentication token
103 * may not be extractable. For example, the TPM may not release the
104 * private key, instead requiring the encrypted data and returning the
105 * decrypted data.
106 */
107struct ecryptfs_session_key {
108#define ECRYPTFS_USERSPACE_SHOULD_TRY_TO_DECRYPT 0x00000001
109#define ECRYPTFS_USERSPACE_SHOULD_TRY_TO_ENCRYPT 0x00000002
110#define ECRYPTFS_CONTAINS_DECRYPTED_KEY 0x00000004
111#define ECRYPTFS_CONTAINS_ENCRYPTED_KEY 0x00000008
112 u32 flags;
113 u32 encrypted_key_size;
114 u32 decrypted_key_size;
115 u8 encrypted_key[ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES];
116 u8 decrypted_key[ECRYPTFS_MAX_KEY_BYTES];
117};
118
119struct ecryptfs_password {
120 u32 password_bytes;
121 s32 hash_algo;
122 u32 hash_iterations;
123 u32 session_key_encryption_key_bytes;
124#define ECRYPTFS_PERSISTENT_PASSWORD 0x01
125#define ECRYPTFS_SESSION_KEY_ENCRYPTION_KEY_SET 0x02
126 u32 flags;
127 /* Iterated-hash concatenation of salt and passphrase */
128 u8 session_key_encryption_key[ECRYPTFS_MAX_KEY_BYTES];
129 u8 signature[ECRYPTFS_PASSWORD_SIG_SIZE + 1];
130 /* Always in expanded hex */
131 u8 salt[ECRYPTFS_SALT_SIZE];
132};
133
134enum ecryptfs_token_types {ECRYPTFS_PASSWORD, ECRYPTFS_PRIVATE_KEY};
135
136struct ecryptfs_private_key {
137 u32 key_size;
138 u32 data_len;
139 u8 signature[ECRYPTFS_PASSWORD_SIG_SIZE + 1];
140 char pki_type[ECRYPTFS_MAX_PKI_NAME_BYTES + 1];
141 u8 data[];
142};
143
144/* May be a password or a private key */
145struct ecryptfs_auth_tok {
146 u16 version; /* 8-bit major and 8-bit minor */
147 u16 token_type;
148#define ECRYPTFS_ENCRYPT_ONLY 0x00000001
149 u32 flags;
150 struct ecryptfs_session_key session_key;
151 u8 reserved[32];
152 union {
153 struct ecryptfs_password password;
154 struct ecryptfs_private_key private_key;
155 } token;
156} __attribute__ ((packed));
157
158void ecryptfs_dump_auth_tok(struct ecryptfs_auth_tok *auth_tok); 52void ecryptfs_dump_auth_tok(struct ecryptfs_auth_tok *auth_tok);
159extern void ecryptfs_to_hex(char *dst, char *src, size_t src_size); 53extern void ecryptfs_to_hex(char *dst, char *src, size_t src_size);
160extern void ecryptfs_from_hex(char *dst, char *src, int dst_size); 54extern void ecryptfs_from_hex(char *dst, char *src, int dst_size);
@@ -185,11 +79,47 @@ struct ecryptfs_page_crypt_context {
185 } param; 79 } param;
186}; 80};
187 81
82#if defined(CONFIG_ENCRYPTED_KEYS) || defined(CONFIG_ENCRYPTED_KEYS_MODULE)
83static inline struct ecryptfs_auth_tok *
84ecryptfs_get_encrypted_key_payload_data(struct key *key)
85{
86 if (key->type == &key_type_encrypted)
87 return (struct ecryptfs_auth_tok *)
88 (&((struct encrypted_key_payload *)key->payload.data)->payload_data);
89 else
90 return NULL;
91}
92
93static inline struct key *ecryptfs_get_encrypted_key(char *sig)
94{
95 return request_key(&key_type_encrypted, sig, NULL);
96}
97
98#else
99static inline struct ecryptfs_auth_tok *
100ecryptfs_get_encrypted_key_payload_data(struct key *key)
101{
102 return NULL;
103}
104
105static inline struct key *ecryptfs_get_encrypted_key(char *sig)
106{
107 return ERR_PTR(-ENOKEY);
108}
109
110#endif /* CONFIG_ENCRYPTED_KEYS */
111
188static inline struct ecryptfs_auth_tok * 112static inline struct ecryptfs_auth_tok *
189ecryptfs_get_key_payload_data(struct key *key) 113ecryptfs_get_key_payload_data(struct key *key)
190{ 114{
191 return (struct ecryptfs_auth_tok *) 115 struct ecryptfs_auth_tok *auth_tok;
192 (((struct user_key_payload*)key->payload.data)->data); 116
117 auth_tok = ecryptfs_get_encrypted_key_payload_data(key);
118 if (!auth_tok)
119 return (struct ecryptfs_auth_tok *)
120 (((struct user_key_payload *)key->payload.data)->data);
121 else
122 return auth_tok;
193} 123}
194 124
195#define ECRYPTFS_MAX_KEYSET_SIZE 1024 125#define ECRYPTFS_MAX_KEYSET_SIZE 1024
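
With the fallback above, callers never need to know which key type backed a signature. A sketch of the intended lookup order (a hypothetical helper mirroring ecryptfs_keyring_auth_tok_for_sig() in keystore.c; type verification and error handling trimmed):

#include <linux/err.h>
#include <linux/key.h>
#include <keys/user-type.h>
#include "ecryptfs_kernel.h"

/* Try the "user" key type first, then fall back to an "encrypted"
 * key of the same description.  On success the caller owns the key
 * reference returned in *keyp and must key_put() it when done.
 */
static struct ecryptfs_auth_tok *example_auth_tok_for_sig(char *sig,
							  struct key **keyp)
{
	struct key *key;

	key = request_key(&key_type_user, sig, NULL);
	if (IS_ERR(key))
		key = ecryptfs_get_encrypted_key(sig);
	if (IS_ERR(key))
		return NULL;
	*keyp = key;
	return ecryptfs_get_key_payload_data(key);
}
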
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 340c657a108c..11f8582d7218 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -69,6 +69,7 @@ static int ecryptfs_inode_set(struct inode *inode, void *opaque)
69 inode->i_ino = lower_inode->i_ino; 69 inode->i_ino = lower_inode->i_ino;
70 inode->i_version++; 70 inode->i_version++;
71 inode->i_mapping->a_ops = &ecryptfs_aops; 71 inode->i_mapping->a_ops = &ecryptfs_aops;
72 inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi;
72 73
73 if (S_ISLNK(inode->i_mode)) 74 if (S_ISLNK(inode->i_mode))
74 inode->i_op = &ecryptfs_symlink_iops; 75 inode->i_op = &ecryptfs_symlink_iops;
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index fa8049ecdc64..ac1ad48c2376 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -1635,11 +1635,14 @@ int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key,
1635 1635
1636 (*auth_tok_key) = request_key(&key_type_user, sig, NULL); 1636 (*auth_tok_key) = request_key(&key_type_user, sig, NULL);
1637 if (!(*auth_tok_key) || IS_ERR(*auth_tok_key)) { 1637 if (!(*auth_tok_key) || IS_ERR(*auth_tok_key)) {
1638 printk(KERN_ERR "Could not find key with description: [%s]\n", 1638 (*auth_tok_key) = ecryptfs_get_encrypted_key(sig);
1639 sig); 1639 if (!(*auth_tok_key) || IS_ERR(*auth_tok_key)) {
1640 rc = process_request_key_err(PTR_ERR(*auth_tok_key)); 1640 printk(KERN_ERR "Could not find key with description: [%s]\n",
1641 (*auth_tok_key) = NULL; 1641 sig);
1642 goto out; 1642 rc = process_request_key_err(PTR_ERR(*auth_tok_key));
1643 (*auth_tok_key) = NULL;
1644 goto out;
1645 }
1643 } 1646 }
1644 down_write(&(*auth_tok_key)->sem); 1647 down_write(&(*auth_tok_key)->sem);
1645 rc = ecryptfs_verify_auth_tok_from_key(*auth_tok_key, auth_tok); 1648 rc = ecryptfs_verify_auth_tok_from_key(*auth_tok_key, auth_tok);
@@ -1868,11 +1871,6 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
1868 * just one will be sufficient to decrypt to get the FEK. */ 1871 * just one will be sufficient to decrypt to get the FEK. */
1869find_next_matching_auth_tok: 1872find_next_matching_auth_tok:
1870 found_auth_tok = 0; 1873 found_auth_tok = 0;
1871 if (auth_tok_key) {
1872 up_write(&(auth_tok_key->sem));
1873 key_put(auth_tok_key);
1874 auth_tok_key = NULL;
1875 }
1876 list_for_each_entry(auth_tok_list_item, &auth_tok_list, list) { 1874 list_for_each_entry(auth_tok_list_item, &auth_tok_list, list) {
1877 candidate_auth_tok = &auth_tok_list_item->auth_tok; 1875 candidate_auth_tok = &auth_tok_list_item->auth_tok;
1878 if (unlikely(ecryptfs_verbosity > 0)) { 1876 if (unlikely(ecryptfs_verbosity > 0)) {
@@ -1909,14 +1907,22 @@ found_matching_auth_tok:
1909 memcpy(&(candidate_auth_tok->token.private_key), 1907 memcpy(&(candidate_auth_tok->token.private_key),
1910 &(matching_auth_tok->token.private_key), 1908 &(matching_auth_tok->token.private_key),
1911 sizeof(struct ecryptfs_private_key)); 1909 sizeof(struct ecryptfs_private_key));
1910 up_write(&(auth_tok_key->sem));
1911 key_put(auth_tok_key);
1912 rc = decrypt_pki_encrypted_session_key(candidate_auth_tok, 1912 rc = decrypt_pki_encrypted_session_key(candidate_auth_tok,
1913 crypt_stat); 1913 crypt_stat);
1914 } else if (candidate_auth_tok->token_type == ECRYPTFS_PASSWORD) { 1914 } else if (candidate_auth_tok->token_type == ECRYPTFS_PASSWORD) {
1915 memcpy(&(candidate_auth_tok->token.password), 1915 memcpy(&(candidate_auth_tok->token.password),
1916 &(matching_auth_tok->token.password), 1916 &(matching_auth_tok->token.password),
1917 sizeof(struct ecryptfs_password)); 1917 sizeof(struct ecryptfs_password));
1918 up_write(&(auth_tok_key->sem));
1919 key_put(auth_tok_key);
1918 rc = decrypt_passphrase_encrypted_session_key( 1920 rc = decrypt_passphrase_encrypted_session_key(
1919 candidate_auth_tok, crypt_stat); 1921 candidate_auth_tok, crypt_stat);
1922 } else {
1923 up_write(&(auth_tok_key->sem));
1924 key_put(auth_tok_key);
1925 rc = -EINVAL;
1920 } 1926 }
1921 if (rc) { 1927 if (rc) {
1922 struct ecryptfs_auth_tok_list_item *auth_tok_list_item_tmp; 1928 struct ecryptfs_auth_tok_list_item *auth_tok_list_item_tmp;
@@ -1956,21 +1962,18 @@ found_matching_auth_tok:
1956out_wipe_list: 1962out_wipe_list:
1957 wipe_auth_tok_list(&auth_tok_list); 1963 wipe_auth_tok_list(&auth_tok_list);
1958out: 1964out:
1959 if (auth_tok_key) {
1960 up_write(&(auth_tok_key->sem));
1961 key_put(auth_tok_key);
1962 }
1963 return rc; 1965 return rc;
1964} 1966}
1965 1967
1966static int 1968static int
1967pki_encrypt_session_key(struct ecryptfs_auth_tok *auth_tok, 1969pki_encrypt_session_key(struct key *auth_tok_key,
1970 struct ecryptfs_auth_tok *auth_tok,
1968 struct ecryptfs_crypt_stat *crypt_stat, 1971 struct ecryptfs_crypt_stat *crypt_stat,
1969 struct ecryptfs_key_record *key_rec) 1972 struct ecryptfs_key_record *key_rec)
1970{ 1973{
1971 struct ecryptfs_msg_ctx *msg_ctx = NULL; 1974 struct ecryptfs_msg_ctx *msg_ctx = NULL;
1972 char *payload = NULL; 1975 char *payload = NULL;
1973 size_t payload_len; 1976 size_t payload_len = 0;
1974 struct ecryptfs_message *msg; 1977 struct ecryptfs_message *msg;
1975 int rc; 1978 int rc;
1976 1979
@@ -1979,6 +1982,8 @@ pki_encrypt_session_key(struct ecryptfs_auth_tok *auth_tok,
1979 crypt_stat->cipher, 1982 crypt_stat->cipher,
1980 crypt_stat->key_size), 1983 crypt_stat->key_size),
1981 crypt_stat, &payload, &payload_len); 1984 crypt_stat, &payload, &payload_len);
1985 up_write(&(auth_tok_key->sem));
1986 key_put(auth_tok_key);
1982 if (rc) { 1987 if (rc) {
1983 ecryptfs_printk(KERN_ERR, "Error generating tag 66 packet\n"); 1988 ecryptfs_printk(KERN_ERR, "Error generating tag 66 packet\n");
1984 goto out; 1989 goto out;
@@ -2008,6 +2013,8 @@ out:
2008 * write_tag_1_packet - Write an RFC2440-compatible tag 1 (public key) packet 2013 * write_tag_1_packet - Write an RFC2440-compatible tag 1 (public key) packet
2009 * @dest: Buffer into which to write the packet 2014 * @dest: Buffer into which to write the packet
2010 * @remaining_bytes: Maximum number of bytes that can be written 2015 * @remaining_bytes: Maximum number of bytes that can be written
2016 * @auth_tok_key: The authentication token key to unlock and put when done with
2017 * @auth_tok
2011 * @auth_tok: The authentication token used for generating the tag 1 packet 2018 * @auth_tok: The authentication token used for generating the tag 1 packet
2012 * @crypt_stat: The cryptographic context 2019 * @crypt_stat: The cryptographic context
2013 * @key_rec: The key record struct for the tag 1 packet 2020 * @key_rec: The key record struct for the tag 1 packet
@@ -2018,7 +2025,7 @@ out:
2018 */ 2025 */
2019static int 2026static int
2020write_tag_1_packet(char *dest, size_t *remaining_bytes, 2027write_tag_1_packet(char *dest, size_t *remaining_bytes,
2021 struct ecryptfs_auth_tok *auth_tok, 2028 struct key *auth_tok_key, struct ecryptfs_auth_tok *auth_tok,
2022 struct ecryptfs_crypt_stat *crypt_stat, 2029 struct ecryptfs_crypt_stat *crypt_stat,
2023 struct ecryptfs_key_record *key_rec, size_t *packet_size) 2030 struct ecryptfs_key_record *key_rec, size_t *packet_size)
2024{ 2031{
@@ -2039,12 +2046,15 @@ write_tag_1_packet(char *dest, size_t *remaining_bytes,
2039 memcpy(key_rec->enc_key, 2046 memcpy(key_rec->enc_key,
2040 auth_tok->session_key.encrypted_key, 2047 auth_tok->session_key.encrypted_key,
2041 auth_tok->session_key.encrypted_key_size); 2048 auth_tok->session_key.encrypted_key_size);
2049 up_write(&(auth_tok_key->sem));
2050 key_put(auth_tok_key);
2042 goto encrypted_session_key_set; 2051 goto encrypted_session_key_set;
2043 } 2052 }
2044 if (auth_tok->session_key.encrypted_key_size == 0) 2053 if (auth_tok->session_key.encrypted_key_size == 0)
2045 auth_tok->session_key.encrypted_key_size = 2054 auth_tok->session_key.encrypted_key_size =
2046 auth_tok->token.private_key.key_size; 2055 auth_tok->token.private_key.key_size;
2047 rc = pki_encrypt_session_key(auth_tok, crypt_stat, key_rec); 2056 rc = pki_encrypt_session_key(auth_tok_key, auth_tok, crypt_stat,
2057 key_rec);
2048 if (rc) { 2058 if (rc) {
2049 printk(KERN_ERR "Failed to encrypt session key via a key " 2059 printk(KERN_ERR "Failed to encrypt session key via a key "
2050 "module; rc = [%d]\n", rc); 2060 "module; rc = [%d]\n", rc);
@@ -2421,6 +2431,8 @@ ecryptfs_generate_key_packet_set(char *dest_base,
2421 &max, auth_tok, 2431 &max, auth_tok,
2422 crypt_stat, key_rec, 2432 crypt_stat, key_rec,
2423 &written); 2433 &written);
2434 up_write(&(auth_tok_key->sem));
2435 key_put(auth_tok_key);
2424 if (rc) { 2436 if (rc) {
2425 ecryptfs_printk(KERN_WARNING, "Error " 2437 ecryptfs_printk(KERN_WARNING, "Error "
2426 "writing tag 3 packet\n"); 2438 "writing tag 3 packet\n");
@@ -2438,8 +2450,8 @@ ecryptfs_generate_key_packet_set(char *dest_base,
2438 } 2450 }
2439 (*len) += written; 2451 (*len) += written;
2440 } else if (auth_tok->token_type == ECRYPTFS_PRIVATE_KEY) { 2452 } else if (auth_tok->token_type == ECRYPTFS_PRIVATE_KEY) {
2441 rc = write_tag_1_packet(dest_base + (*len), 2453 rc = write_tag_1_packet(dest_base + (*len), &max,
2442 &max, auth_tok, 2454 auth_tok_key, auth_tok,
2443 crypt_stat, key_rec, &written); 2455 crypt_stat, key_rec, &written);
2444 if (rc) { 2456 if (rc) {
2445 ecryptfs_printk(KERN_WARNING, "Error " 2457 ecryptfs_printk(KERN_WARNING, "Error "
@@ -2448,14 +2460,13 @@ ecryptfs_generate_key_packet_set(char *dest_base,
2448 } 2460 }
2449 (*len) += written; 2461 (*len) += written;
2450 } else { 2462 } else {
2463 up_write(&(auth_tok_key->sem));
2464 key_put(auth_tok_key);
2451 ecryptfs_printk(KERN_WARNING, "Unsupported " 2465 ecryptfs_printk(KERN_WARNING, "Unsupported "
2452 "authentication token type\n"); 2466 "authentication token type\n");
2453 rc = -EINVAL; 2467 rc = -EINVAL;
2454 goto out_free; 2468 goto out_free;
2455 } 2469 }
2456 up_write(&(auth_tok_key->sem));
2457 key_put(auth_tok_key);
2458 auth_tok_key = NULL;
2459 } 2470 }
2460 if (likely(max > 0)) { 2471 if (likely(max > 0)) {
2461 dest_base[(*len)] = 0x00; 2472 dest_base[(*len)] = 0x00;
@@ -2468,11 +2479,6 @@ out_free:
2468out: 2479out:
2469 if (rc) 2480 if (rc)
2470 (*len) = 0; 2481 (*len) = 0;
2471 if (auth_tok_key) {
2472 up_write(&(auth_tok_key->sem));
2473 key_put(auth_tok_key);
2474 }
2475
2476 mutex_unlock(&crypt_stat->keysig_list_mutex); 2482 mutex_unlock(&crypt_stat->keysig_list_mutex);
2477 return rc; 2483 return rc;
2478} 2484}
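
The pattern throughout this file is that a function handed auth_tok_key now drops the write-held semaphore and the key reference itself, before the round trip to the userspace key module; holding key->sem across that upcall could deadlock if ecryptfsd needs the same key. Stated as a sketch (hypothetical helper, naming the rule only):

#include <linux/key.h>

/* The callee, not the caller, releases the lock and the reference
 * once it has copied what it needs out of the key payload.
 */
static void example_release_before_upcall(struct key *auth_tok_key)
{
	up_write(&auth_tok_key->sem);
	key_put(auth_tok_key);
}
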
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 9f1bb747d77d..b4a6befb1216 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -175,6 +175,7 @@ enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig,
175 ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig, 175 ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig,
176 ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes, 176 ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes,
177 ecryptfs_opt_unlink_sigs, ecryptfs_opt_mount_auth_tok_only, 177 ecryptfs_opt_unlink_sigs, ecryptfs_opt_mount_auth_tok_only,
178 ecryptfs_opt_check_dev_ruid,
178 ecryptfs_opt_err }; 179 ecryptfs_opt_err };
179 180
180static const match_table_t tokens = { 181static const match_table_t tokens = {
@@ -191,6 +192,7 @@ static const match_table_t tokens = {
191 {ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"}, 192 {ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"},
192 {ecryptfs_opt_unlink_sigs, "ecryptfs_unlink_sigs"}, 193 {ecryptfs_opt_unlink_sigs, "ecryptfs_unlink_sigs"},
193 {ecryptfs_opt_mount_auth_tok_only, "ecryptfs_mount_auth_tok_only"}, 194 {ecryptfs_opt_mount_auth_tok_only, "ecryptfs_mount_auth_tok_only"},
195 {ecryptfs_opt_check_dev_ruid, "ecryptfs_check_dev_ruid"},
194 {ecryptfs_opt_err, NULL} 196 {ecryptfs_opt_err, NULL}
195}; 197};
196 198
@@ -236,6 +238,7 @@ static void ecryptfs_init_mount_crypt_stat(
236 * ecryptfs_parse_options 238 * ecryptfs_parse_options
237 * @sb: The ecryptfs super block 239 * @sb: The ecryptfs super block
238 * @options: The options passed to the kernel 240 * @options: The options passed to the kernel
241 * @check_ruid: set to 1 if device uid should be checked against the ruid
239 * 242 *
240 * Parse mount options: 243 * Parse mount options:
241 * debug=N - ecryptfs_verbosity level for debug output 244 * debug=N - ecryptfs_verbosity level for debug output
@@ -251,7 +254,8 @@ static void ecryptfs_init_mount_crypt_stat(
251 * 254 *
252 * Returns zero on success; non-zero on error 255 * Returns zero on success; non-zero on error
253 */ 256 */
254static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options) 257static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options,
258 uid_t *check_ruid)
255{ 259{
256 char *p; 260 char *p;
257 int rc = 0; 261 int rc = 0;
@@ -276,6 +280,8 @@ static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options)
276 char *cipher_key_bytes_src; 280 char *cipher_key_bytes_src;
277 char *fn_cipher_key_bytes_src; 281 char *fn_cipher_key_bytes_src;
278 282
283 *check_ruid = 0;
284
279 if (!options) { 285 if (!options) {
280 rc = -EINVAL; 286 rc = -EINVAL;
281 goto out; 287 goto out;
@@ -380,6 +386,9 @@ static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options)
380 mount_crypt_stat->flags |= 386 mount_crypt_stat->flags |=
381 ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY; 387 ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY;
382 break; 388 break;
389 case ecryptfs_opt_check_dev_ruid:
390 *check_ruid = 1;
391 break;
383 case ecryptfs_opt_err: 392 case ecryptfs_opt_err:
384 default: 393 default:
385 printk(KERN_WARNING 394 printk(KERN_WARNING
@@ -475,6 +484,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
475 const char *err = "Getting sb failed"; 484 const char *err = "Getting sb failed";
476 struct inode *inode; 485 struct inode *inode;
477 struct path path; 486 struct path path;
487 uid_t check_ruid;
478 int rc; 488 int rc;
479 489
480 sbi = kmem_cache_zalloc(ecryptfs_sb_info_cache, GFP_KERNEL); 490 sbi = kmem_cache_zalloc(ecryptfs_sb_info_cache, GFP_KERNEL);
@@ -483,7 +493,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
483 goto out; 493 goto out;
484 } 494 }
485 495
486 rc = ecryptfs_parse_options(sbi, raw_data); 496 rc = ecryptfs_parse_options(sbi, raw_data, &check_ruid);
487 if (rc) { 497 if (rc) {
488 err = "Error parsing options"; 498 err = "Error parsing options";
489 goto out; 499 goto out;
@@ -521,6 +531,15 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
521 "known incompatibilities\n"); 531 "known incompatibilities\n");
522 goto out_free; 532 goto out_free;
523 } 533 }
534
535 if (check_ruid && path.dentry->d_inode->i_uid != current_uid()) {
536 rc = -EPERM;
537 printk(KERN_ERR "Mount of device (uid: %d) not owned by "
538 "requested user (uid: %d)\n",
539 path.dentry->d_inode->i_uid, current_uid());
540 goto out_free;
541 }
542
524 ecryptfs_set_superblock_lower(s, path.dentry->d_sb); 543 ecryptfs_set_superblock_lower(s, path.dentry->d_sb);
525 s->s_maxbytes = path.dentry->d_sb->s_maxbytes; 544 s->s_maxbytes = path.dentry->d_sb->s_maxbytes;
526 s->s_blocksize = path.dentry->d_sb->s_blocksize; 545 s->s_blocksize = path.dentry->d_sb->s_blocksize;
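
The new option rides along in the ordinary data string of mount(2). A hedged user-space example (the paths, signature, and cipher values are placeholders, not from this patch):

#include <sys/mount.h>

/* Fails with EPERM unless /lower is owned by the caller's real uid. */
int example_mount(void)
{
	return mount("/lower", "/mnt/secret", "ecryptfs", 0,
		     "ecryptfs_sig=0123456789abcdef,"
		     "ecryptfs_cipher=aes,ecryptfs_key_bytes=16,"
		     "ecryptfs_check_dev_ruid");
}
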
diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index 85d430963116..3745f7c2b9c2 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -39,15 +39,16 @@
39int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data, 39int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data,
40 loff_t offset, size_t size) 40 loff_t offset, size_t size)
41{ 41{
42 struct ecryptfs_inode_info *inode_info; 42 struct file *lower_file;
43 mm_segment_t fs_save; 43 mm_segment_t fs_save;
44 ssize_t rc; 44 ssize_t rc;
45 45
46 inode_info = ecryptfs_inode_to_private(ecryptfs_inode); 46 lower_file = ecryptfs_inode_to_private(ecryptfs_inode)->lower_file;
47 BUG_ON(!inode_info->lower_file); 47 if (!lower_file)
48 return -EIO;
48 fs_save = get_fs(); 49 fs_save = get_fs();
49 set_fs(get_ds()); 50 set_fs(get_ds());
50 rc = vfs_write(inode_info->lower_file, data, size, &offset); 51 rc = vfs_write(lower_file, data, size, &offset);
51 set_fs(fs_save); 52 set_fs(fs_save);
52 mark_inode_dirty_sync(ecryptfs_inode); 53 mark_inode_dirty_sync(ecryptfs_inode);
53 return rc; 54 return rc;
@@ -225,15 +226,16 @@ out:
225int ecryptfs_read_lower(char *data, loff_t offset, size_t size, 226int ecryptfs_read_lower(char *data, loff_t offset, size_t size,
226 struct inode *ecryptfs_inode) 227 struct inode *ecryptfs_inode)
227{ 228{
228 struct ecryptfs_inode_info *inode_info = 229 struct file *lower_file;
229 ecryptfs_inode_to_private(ecryptfs_inode);
230 mm_segment_t fs_save; 230 mm_segment_t fs_save;
231 ssize_t rc; 231 ssize_t rc;
232 232
233 BUG_ON(!inode_info->lower_file); 233 lower_file = ecryptfs_inode_to_private(ecryptfs_inode)->lower_file;
234 if (!lower_file)
235 return -EIO;
234 fs_save = get_fs(); 236 fs_save = get_fs();
235 set_fs(get_ds()); 237 set_fs(get_ds());
236 rc = vfs_read(inode_info->lower_file, data, size, &offset); 238 rc = vfs_read(lower_file, data, size, &offset);
237 set_fs(fs_save); 239 set_fs(fs_save);
238 return rc; 240 return rc;
239} 241}
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 5e480d555049..9026fc91fe3b 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -37,7 +37,7 @@
37#include <asm/system.h> 37#include <asm/system.h>
38#include <asm/io.h> 38#include <asm/io.h>
39#include <asm/mman.h> 39#include <asm/mman.h>
40#include <asm/atomic.h> 40#include <linux/atomic.h>
41 41
42/* 42/*
43 * LOCKING: 43 * LOCKING:
diff --git a/fs/exec.c b/fs/exec.c
index 842d5700c155..25dcbe5fc356 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -181,14 +181,7 @@ static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
181 return; 181 return;
182 182
183 bprm->vma_pages = pages; 183 bprm->vma_pages = pages;
184
185#ifdef SPLIT_RSS_COUNTING
186 add_mm_counter(mm, MM_ANONPAGES, diff);
187#else
188 spin_lock(&mm->page_table_lock);
189 add_mm_counter(mm, MM_ANONPAGES, diff); 184 add_mm_counter(mm, MM_ANONPAGES, diff);
190 spin_unlock(&mm->page_table_lock);
191#endif
192} 185}
193 186
194static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, 187static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
@@ -277,7 +270,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
277 * use STACK_TOP because that can depend on attributes which aren't 270 * use STACK_TOP because that can depend on attributes which aren't
278 * configured yet. 271 * configured yet.
279 */ 272 */
280 BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); 273 BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
281 vma->vm_end = STACK_TOP_MAX; 274 vma->vm_end = STACK_TOP_MAX;
282 vma->vm_start = vma->vm_end - PAGE_SIZE; 275 vma->vm_start = vma->vm_end - PAGE_SIZE;
283 vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; 276 vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
@@ -1430,9 +1423,9 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
1430 } 1423 }
1431 } 1424 }
1432 read_unlock(&binfmt_lock); 1425 read_unlock(&binfmt_lock);
1426#ifdef CONFIG_MODULES
1433 if (retval != -ENOEXEC || bprm->mm == NULL) { 1427 if (retval != -ENOEXEC || bprm->mm == NULL) {
1434 break; 1428 break;
1435#ifdef CONFIG_MODULES
1436 } else { 1429 } else {
1437#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e)) 1430#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e))
1438 if (printable(bprm->buf[0]) && 1431 if (printable(bprm->buf[0]) &&
@@ -1440,9 +1433,13 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
1440 printable(bprm->buf[2]) && 1433 printable(bprm->buf[2]) &&
1441 printable(bprm->buf[3])) 1434 printable(bprm->buf[3]))
1442 break; /* -ENOEXEC */ 1435 break; /* -ENOEXEC */
1436 if (try)
1437 break; /* -ENOEXEC */
1443 request_module("binfmt-%04x", *(unsigned short *)(&bprm->buf[2])); 1438 request_module("binfmt-%04x", *(unsigned short *)(&bprm->buf[2]));
1444#endif
1445 } 1439 }
1440#else
1441 break;
1442#endif
1446 } 1443 }
1447 return retval; 1444 return retval;
1448} 1445}
@@ -1462,6 +1459,23 @@ static int do_execve_common(const char *filename,
1462 struct files_struct *displaced; 1459 struct files_struct *displaced;
1463 bool clear_in_exec; 1460 bool clear_in_exec;
1464 int retval; 1461 int retval;
1462 const struct cred *cred = current_cred();
1463
1464 /*
1465 * We move the actual failure in case of RLIMIT_NPROC excess from
1466 * set*uid() to execve() because too many poorly written programs
1467 * don't check setuid() return code. Here we additionally recheck
1468 * whether NPROC limit is still exceeded.
1469 */
1470 if ((current->flags & PF_NPROC_EXCEEDED) &&
1471 atomic_read(&cred->user->processes) > rlimit(RLIMIT_NPROC)) {
1472 retval = -EAGAIN;
1473 goto out_ret;
1474 }
1475
1476 /* We're below the limit (still or again), so we don't want to make
1477 * further execve() calls fail. */
1478 current->flags &= ~PF_NPROC_EXCEEDED;
1465 1479
1466 retval = unshare_files(&displaced); 1480 retval = unshare_files(&displaced);
1467 if (retval) 1481 if (retval)
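
PF_NPROC_EXCEEDED is raised on the set*uid() side by the companion change to set_user() in kernel/sys.c; roughly as follows (a sketch of that counterpart patch, reproduced from memory, not part of this diff):

static int set_user(struct cred *new)
{
	struct user_struct *new_user;

	new_user = alloc_uid(current_user_ns(), new->uid);
	if (!new_user)
		return -EAGAIN;

	/* Defer the RLIMIT_NPROC failure to execve(): just flag it. */
	if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) &&
	    new_user != INIT_USER)
		current->flags |= PF_NPROC_EXCEEDED;
	else
		current->flags &= ~PF_NPROC_EXCEEDED;

	free_uid(new->user);
	new->user = new_user;
	return 0;
}
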
@@ -1649,15 +1663,26 @@ expand_fail:
1649 return ret; 1663 return ret;
1650} 1664}
1651 1665
1666static void cn_escape(char *str)
1667{
1668 for (; *str; str++)
1669 if (*str == '/')
1670 *str = '!';
1671}
1672
1652static int cn_print_exe_file(struct core_name *cn) 1673static int cn_print_exe_file(struct core_name *cn)
1653{ 1674{
1654 struct file *exe_file; 1675 struct file *exe_file;
1655 char *pathbuf, *path, *p; 1676 char *pathbuf, *path;
1656 int ret; 1677 int ret;
1657 1678
1658 exe_file = get_mm_exe_file(current->mm); 1679 exe_file = get_mm_exe_file(current->mm);
1659 if (!exe_file) 1680 if (!exe_file) {
1660 return cn_printf(cn, "(unknown)"); 1681 char *commstart = cn->corename + cn->used;
1682 ret = cn_printf(cn, "%s (path unknown)", current->comm);
1683 cn_escape(commstart);
1684 return ret;
1685 }
1661 1686
1662 pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY); 1687 pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY);
1663 if (!pathbuf) { 1688 if (!pathbuf) {
@@ -1671,9 +1696,7 @@ static int cn_print_exe_file(struct core_name *cn)
1671 goto free_buf; 1696 goto free_buf;
1672 } 1697 }
1673 1698
1674 for (p = path; *p; p++) 1699 cn_escape(path);
1675 if (*p == '/')
1676 *p = '!';
1677 1700
1678 ret = cn_printf(cn, "%s", path); 1701 ret = cn_printf(cn, "%s", path);
1679 1702
@@ -1745,16 +1768,22 @@ static int format_corename(struct core_name *cn, long signr)
1745 break; 1768 break;
1746 } 1769 }
1747 /* hostname */ 1770 /* hostname */
1748 case 'h': 1771 case 'h': {
1772 char *namestart = cn->corename + cn->used;
1749 down_read(&uts_sem); 1773 down_read(&uts_sem);
1750 err = cn_printf(cn, "%s", 1774 err = cn_printf(cn, "%s",
1751 utsname()->nodename); 1775 utsname()->nodename);
1752 up_read(&uts_sem); 1776 up_read(&uts_sem);
1777 cn_escape(namestart);
1753 break; 1778 break;
1779 }
1754 /* executable */ 1780 /* executable */
1755 case 'e': 1781 case 'e': {
1782 char *commstart = cn->corename + cn->used;
1756 err = cn_printf(cn, "%s", current->comm); 1783 err = cn_printf(cn, "%s", current->comm);
1784 cn_escape(commstart);
1757 break; 1785 break;
1786 }
1758 case 'E': 1787 case 'E':
1759 err = cn_print_exe_file(cn); 1788 err = cn_print_exe_file(cn);
1760 break; 1789 break;
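
cn_escape() keeps every expanded field a single path component, so a '/' in the hostname or comm cannot redirect the core file into a subdirectory. A user-space replica for illustration (values hypothetical):

#include <stdio.h>

static void cn_escape(char *str)
{
	for (; *str; str++)
		if (*str == '/')
			*str = '!';
}

int main(void)
{
	char name[] = "core.build/7.a.out";	/* as if %h expanded to "build/7" */

	cn_escape(name + 5);			/* escape only the expanded part */
	printf("%s\n", name);			/* prints core.build!7.a.out */
	return 0;
}
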
@@ -2118,16 +2147,16 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
2118 2147
2119 ispipe = format_corename(&cn, signr); 2148 ispipe = format_corename(&cn, signr);
2120 2149
2121 if (ispipe == -ENOMEM) {
2122 printk(KERN_WARNING "format_corename failed\n");
2123 printk(KERN_WARNING "Aborting core\n");
2124 goto fail_corename;
2125 }
2126
2127 if (ispipe) { 2150 if (ispipe) {
2128 int dump_count; 2151 int dump_count;
2129 char **helper_argv; 2152 char **helper_argv;
2130 2153
2154 if (ispipe < 0) {
2155 printk(KERN_WARNING "format_corename failed\n");
2156 printk(KERN_WARNING "Aborting core\n");
2157 goto fail_corename;
2158 }
2159
2131 if (cprm.limit == 1) { 2160 if (cprm.limit == 1) {
2132 /* 2161 /*
2133 * Normally core limits are irrelevant to pipes, since 2162 * Normally core limits are irrelevant to pipes, since
diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
index 2d0f757fda3e..c5a5855a6c44 100644
--- a/fs/exofs/Kbuild
+++ b/fs/exofs/Kbuild
@@ -12,5 +12,8 @@
12# Kbuild - Gets included from the Kernels Makefile and build system 12# Kbuild - Gets included from the Kernels Makefile and build system
13# 13#
14 14
15exofs-y := ios.o inode.o file.o symlink.o namei.o dir.o super.o 15# ore module library
16obj-$(CONFIG_ORE) += ore.o
17
18exofs-y := inode.o file.o symlink.o namei.o dir.o super.o
16obj-$(CONFIG_EXOFS_FS) += exofs.o 19obj-$(CONFIG_EXOFS_FS) += exofs.o
diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig
index 86194b2f799d..70bae4149291 100644
--- a/fs/exofs/Kconfig
+++ b/fs/exofs/Kconfig
@@ -1,6 +1,10 @@
1config ORE
2 tristate
3
1config EXOFS_FS 4config EXOFS_FS
2 tristate "exofs: OSD based file system support" 5 tristate "exofs: OSD based file system support"
3 depends on SCSI_OSD_ULD 6 depends on SCSI_OSD_ULD
7 select ORE
4 help 8 help
5 EXOFS is a file system that uses an OSD storage device, 9 EXOFS is a file system that uses an OSD storage device,
6 as its backing storage. 10 as its backing storage.
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index c965806c2821..f4e442ec7445 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -36,12 +36,9 @@
36#include <linux/fs.h> 36#include <linux/fs.h>
37#include <linux/time.h> 37#include <linux/time.h>
38#include <linux/backing-dev.h> 38#include <linux/backing-dev.h>
39#include "common.h" 39#include <scsi/osd_ore.h>
40 40
41/* FIXME: Remove once pnfs hits mainline 41#include "common.h"
42 * #include <linux/exportfs/pnfs_osd_xdr.h>
43 */
44#include "pnfs.h"
45 42
46#define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a) 43#define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a)
47 44
@@ -56,27 +53,11 @@
56/* u64 has problems with printk this will cast it to unsigned long long */ 53/* u64 has problems with printk this will cast it to unsigned long long */
57#define _LLU(x) (unsigned long long)(x) 54#define _LLU(x) (unsigned long long)(x)
58 55
59struct exofs_layout {
60 osd_id s_pid; /* partition ID of file system*/
61
62 /* Our way of looking at the data_map */
63 unsigned stripe_unit;
64 unsigned mirrors_p1;
65
66 unsigned group_width;
67 u64 group_depth;
68 unsigned group_count;
69
70 enum exofs_inode_layout_gen_functions lay_func;
71
72 unsigned s_numdevs; /* Num of devices in array */
73 struct osd_dev *s_ods[0]; /* Variable length */
74};
75
76/* 56/*
77 * our extension to the in-memory superblock 57 * our extension to the in-memory superblock
78 */ 58 */
79struct exofs_sb_info { 59struct exofs_sb_info {
60 struct backing_dev_info bdi; /* register our bdi with VFS */
80 struct exofs_sb_stats s_ess; /* Written often, pre-allocate*/ 61 struct exofs_sb_stats s_ess; /* Written often, pre-allocate*/
81 int s_timeout; /* timeout for OSD operations */ 62 int s_timeout; /* timeout for OSD operations */
82 uint64_t s_nextid; /* highest object ID used */ 63 uint64_t s_nextid; /* highest object ID used */
@@ -84,16 +65,13 @@ struct exofs_sb_info {
84 spinlock_t s_next_gen_lock; /* spinlock for gen # update */ 65 spinlock_t s_next_gen_lock; /* spinlock for gen # update */
85 u32 s_next_generation; /* next gen # to use */ 66 u32 s_next_generation; /* next gen # to use */
86 atomic_t s_curr_pending; /* number of pending commands */ 67 atomic_t s_curr_pending; /* number of pending commands */
87 uint8_t s_cred[OSD_CAP_LEN]; /* credential for the fscb */
88 struct backing_dev_info bdi; /* register our bdi with VFS */
89 68
90 struct pnfs_osd_data_map data_map; /* Default raid to use 69 struct pnfs_osd_data_map data_map; /* Default raid to use
91 * FIXME: Needed ? 70 * FIXME: Needed ?
92 */ 71 */
93/* struct exofs_layout dir_layout;*/ /* Default dir layout */ 72 struct ore_layout layout; /* Default files layout */
94 struct exofs_layout layout; /* Default files layout, 73 struct ore_comp one_comp; /* id & cred of partition id=0*/
95 * contains the variable osd_dev 74 struct ore_components comps; /* comps for the partition */
96 * array. Keep last */
97 struct osd_dev *_min_one_dev[1]; /* Place holder for one dev */ 75 struct osd_dev *_min_one_dev[1]; /* Place holder for one dev */
98}; 76};
99 77
@@ -107,7 +85,8 @@ struct exofs_i_info {
107 uint32_t i_data[EXOFS_IDATA];/*short symlink names and device #s*/ 85 uint32_t i_data[EXOFS_IDATA];/*short symlink names and device #s*/
108 uint32_t i_dir_start_lookup; /* which page to start lookup */ 86 uint32_t i_dir_start_lookup; /* which page to start lookup */
109 uint64_t i_commit_size; /* the object's written length */ 87 uint64_t i_commit_size; /* the object's written length */
110 uint8_t i_cred[OSD_CAP_LEN];/* all-powerful credential */ 88 struct ore_comp one_comp; /* same component for all devices */
89 struct ore_components comps; /* inode view of the device table */
111}; 90};
112 91
113static inline osd_id exofs_oi_objno(struct exofs_i_info *oi) 92static inline osd_id exofs_oi_objno(struct exofs_i_info *oi)
@@ -115,52 +94,6 @@ static inline osd_id exofs_oi_objno(struct exofs_i_info *oi)
115 return oi->vfs_inode.i_ino + EXOFS_OBJ_OFF; 94 return oi->vfs_inode.i_ino + EXOFS_OBJ_OFF;
116} 95}
117 96
118struct exofs_io_state;
119typedef void (*exofs_io_done_fn)(struct exofs_io_state *or, void *private);
120
121struct exofs_io_state {
122 struct kref kref;
123
124 void *private;
125 exofs_io_done_fn done;
126
127 struct exofs_layout *layout;
128 struct osd_obj_id obj;
129 u8 *cred;
130
131 /* Global read/write IO*/
132 loff_t offset;
133 unsigned long length;
134 void *kern_buff;
135
136 struct page **pages;
137 unsigned nr_pages;
138 unsigned pgbase;
139 unsigned pages_consumed;
140
141 /* Attributes */
142 unsigned in_attr_len;
143 struct osd_attr *in_attr;
144 unsigned out_attr_len;
145 struct osd_attr *out_attr;
146
147 /* Variable array of size numdevs */
148 unsigned numdevs;
149 struct exofs_per_dev_state {
150 struct osd_request *or;
151 struct bio *bio;
152 loff_t offset;
153 unsigned length;
154 unsigned dev;
155 } per_dev[];
156};
157
158static inline unsigned exofs_io_state_size(unsigned numdevs)
159{
160 return sizeof(struct exofs_io_state) +
161 sizeof(struct exofs_per_dev_state) * numdevs;
162}
163
164/* 97/*
165 * our inode flags 98 * our inode flags
166 */ 99 */
@@ -205,12 +138,6 @@ static inline struct exofs_i_info *exofs_i(struct inode *inode)
205} 138}
206 139
207/* 140/*
208 * Given a layout, object_number and stripe_index return the associated global
209 * dev_index
210 */
211unsigned exofs_layout_od_id(struct exofs_layout *layout,
212 osd_id obj_no, unsigned layout_index);
213/*
214 * Maximum count of links to a file 141 * Maximum count of links to a file
215 */ 142 */
216#define EXOFS_LINK_MAX 32000 143#define EXOFS_LINK_MAX 32000
@@ -219,44 +146,8 @@ unsigned exofs_layout_od_id(struct exofs_layout *layout,
219 * function declarations * 146 * function declarations *
220 *************************/ 147 *************************/
221 148
222/* ios.c */
223void exofs_make_credential(u8 cred_a[OSD_CAP_LEN],
224 const struct osd_obj_id *obj);
225int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj,
226 u64 offset, void *p, unsigned length);
227
228int exofs_get_io_state(struct exofs_layout *layout,
229 struct exofs_io_state **ios);
230void exofs_put_io_state(struct exofs_io_state *ios);
231
232int exofs_check_io(struct exofs_io_state *ios, u64 *resid);
233
234int exofs_sbi_create(struct exofs_io_state *ios);
235int exofs_sbi_remove(struct exofs_io_state *ios);
236int exofs_sbi_write(struct exofs_io_state *ios);
237int exofs_sbi_read(struct exofs_io_state *ios);
238
239int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr);
240
241int exofs_oi_truncate(struct exofs_i_info *oi, u64 new_len);
242static inline int exofs_oi_write(struct exofs_i_info *oi,
243 struct exofs_io_state *ios)
244{
245 ios->obj.id = exofs_oi_objno(oi);
246 ios->cred = oi->i_cred;
247 return exofs_sbi_write(ios);
248}
249
250static inline int exofs_oi_read(struct exofs_i_info *oi,
251 struct exofs_io_state *ios)
252{
253 ios->obj.id = exofs_oi_objno(oi);
254 ios->cred = oi->i_cred;
255 return exofs_sbi_read(ios);
256}
257
258/* inode.c */ 149/* inode.c */
259unsigned exofs_max_io_pages(struct exofs_layout *layout, 150unsigned exofs_max_io_pages(struct ore_layout *layout,
260 unsigned expected_pages); 151 unsigned expected_pages);
261int exofs_setattr(struct dentry *, struct iattr *); 152int exofs_setattr(struct dentry *, struct iattr *);
262int exofs_write_begin(struct file *file, struct address_space *mapping, 153int exofs_write_begin(struct file *file, struct address_space *mapping,
@@ -281,6 +172,8 @@ int exofs_set_link(struct inode *, struct exofs_dir_entry *, struct page *,
281 struct inode *); 172 struct inode *);
282 173
283/* super.c */ 174/* super.c */
175void exofs_make_credential(u8 cred_a[OSD_CAP_LEN],
176 const struct osd_obj_id *obj);
284int exofs_sbi_write_stats(struct exofs_sb_info *sbi); 177int exofs_sbi_write_stats(struct exofs_sb_info *sbi);
285 178
286/********************* 179/*********************
@@ -295,7 +188,6 @@ extern const struct file_operations exofs_file_operations;
295 188
296/* inode.c */ 189/* inode.c */
297extern const struct address_space_operations exofs_aops; 190extern const struct address_space_operations exofs_aops;
298extern const struct osd_attr g_attr_logical_length;
299 191
300/* namei.c */ 192/* namei.c */
301extern const struct inode_operations exofs_dir_inode_operations; 193extern const struct inode_operations exofs_dir_inode_operations;
@@ -305,4 +197,33 @@ extern const struct inode_operations exofs_special_inode_operations;
305extern const struct inode_operations exofs_symlink_inode_operations; 197extern const struct inode_operations exofs_symlink_inode_operations;
306extern const struct inode_operations exofs_fast_symlink_inode_operations; 198extern const struct inode_operations exofs_fast_symlink_inode_operations;
307 199
200/* exofs_init_comps will initialize an ore_components device array
201 * pointing to a single ore_comp struct, and a round-robin view
202 * of the device table.
203 * The first device of each inode is the [inode->ino % num_devices]
 204 * and the rest of the devices follow sequentially, wrapping
 205 * around after the last device in the table.
206 * It is assumed that the global device array at @sbi is twice
207 * bigger and that the device table repeats twice.
208 * See: exofs_read_lookup_dev_table()
209 */
210static inline void exofs_init_comps(struct ore_components *comps,
211 struct ore_comp *one_comp,
212 struct exofs_sb_info *sbi, osd_id oid)
213{
214 unsigned dev_mod = (unsigned)oid, first_dev;
215
216 one_comp->obj.partition = sbi->one_comp.obj.partition;
217 one_comp->obj.id = oid;
218 exofs_make_credential(one_comp->cred, &one_comp->obj);
219
220 comps->numdevs = sbi->comps.numdevs;
221 comps->single_comp = EC_SINGLE_COMP;
222 comps->comps = one_comp;
223
224 /* Round robin device view of the table */
225 first_dev = (dev_mod * sbi->layout.mirrors_p1) % sbi->comps.numdevs;
226 comps->ods = sbi->comps.ods + first_dev;
227}
228
308#endif 229#endif
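
A worked example of the round-robin start computed by exofs_init_comps() (all numbers hypothetical): with layout.mirrors_p1 = 2 and comps.numdevs = 6, oid 5 gives first_dev = (5 * 2) % 6 = 4, while oid 6 wraps back to first_dev = 0. Because the device table behind sbi->comps.ods is assumed to be twice as big and to repeat, the numdevs entries starting at any such offset are all addressable.
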
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 8472c098445d..f39a38fc2349 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -43,7 +43,7 @@ enum { BIO_MAX_PAGES_KMALLOC =
43 PAGE_SIZE / sizeof(struct page *), 43 PAGE_SIZE / sizeof(struct page *),
44}; 44};
45 45
46unsigned exofs_max_io_pages(struct exofs_layout *layout, 46unsigned exofs_max_io_pages(struct ore_layout *layout,
47 unsigned expected_pages) 47 unsigned expected_pages)
48{ 48{
49 unsigned pages = min_t(unsigned, expected_pages, MAX_PAGES_KMALLOC); 49 unsigned pages = min_t(unsigned, expected_pages, MAX_PAGES_KMALLOC);
@@ -58,7 +58,7 @@ struct page_collect {
58 struct exofs_sb_info *sbi; 58 struct exofs_sb_info *sbi;
59 struct inode *inode; 59 struct inode *inode;
60 unsigned expected_pages; 60 unsigned expected_pages;
61 struct exofs_io_state *ios; 61 struct ore_io_state *ios;
62 62
63 struct page **pages; 63 struct page **pages;
64 unsigned alloc_pages; 64 unsigned alloc_pages;
@@ -110,13 +110,6 @@ static int pcol_try_alloc(struct page_collect *pcol)
110{ 110{
111 unsigned pages; 111 unsigned pages;
112 112
113 if (!pcol->ios) { /* First time allocate io_state */
114 int ret = exofs_get_io_state(&pcol->sbi->layout, &pcol->ios);
115
116 if (ret)
117 return ret;
118 }
119
120 /* TODO: easily support bio chaining */ 113 /* TODO: easily support bio chaining */
121 pages = exofs_max_io_pages(&pcol->sbi->layout, pcol->expected_pages); 114 pages = exofs_max_io_pages(&pcol->sbi->layout, pcol->expected_pages);
122 115
@@ -140,7 +133,7 @@ static void pcol_free(struct page_collect *pcol)
140 pcol->pages = NULL; 133 pcol->pages = NULL;
141 134
142 if (pcol->ios) { 135 if (pcol->ios) {
143 exofs_put_io_state(pcol->ios); 136 ore_put_io_state(pcol->ios);
144 pcol->ios = NULL; 137 pcol->ios = NULL;
145 } 138 }
146} 139}
@@ -200,7 +193,7 @@ static int __readpages_done(struct page_collect *pcol)
200 u64 resid; 193 u64 resid;
201 u64 good_bytes; 194 u64 good_bytes;
202 u64 length = 0; 195 u64 length = 0;
203 int ret = exofs_check_io(pcol->ios, &resid); 196 int ret = ore_check_io(pcol->ios, &resid);
204 197
205 if (likely(!ret)) 198 if (likely(!ret))
206 good_bytes = pcol->length; 199 good_bytes = pcol->length;
@@ -241,7 +234,7 @@ static int __readpages_done(struct page_collect *pcol)
241} 234}
242 235
243/* callback of async reads */ 236/* callback of async reads */
244static void readpages_done(struct exofs_io_state *ios, void *p) 237static void readpages_done(struct ore_io_state *ios, void *p)
245{ 238{
246 struct page_collect *pcol = p; 239 struct page_collect *pcol = p;
247 240
@@ -269,20 +262,28 @@ static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
269static int read_exec(struct page_collect *pcol) 262static int read_exec(struct page_collect *pcol)
270{ 263{
271 struct exofs_i_info *oi = exofs_i(pcol->inode); 264 struct exofs_i_info *oi = exofs_i(pcol->inode);
272 struct exofs_io_state *ios = pcol->ios; 265 struct ore_io_state *ios;
273 struct page_collect *pcol_copy = NULL; 266 struct page_collect *pcol_copy = NULL;
274 int ret; 267 int ret;
275 268
276 if (!pcol->pages) 269 if (!pcol->pages)
277 return 0; 270 return 0;
278 271
272 if (!pcol->ios) {
273 int ret = ore_get_rw_state(&pcol->sbi->layout, &oi->comps, true,
274 pcol->pg_first << PAGE_CACHE_SHIFT,
275 pcol->length, &pcol->ios);
276
277 if (ret)
278 return ret;
279 }
280
281 ios = pcol->ios;
279 ios->pages = pcol->pages; 282 ios->pages = pcol->pages;
280 ios->nr_pages = pcol->nr_pages; 283 ios->nr_pages = pcol->nr_pages;
281 ios->length = pcol->length;
282 ios->offset = pcol->pg_first << PAGE_CACHE_SHIFT;
283 284
284 if (pcol->read_4_write) { 285 if (pcol->read_4_write) {
285 exofs_oi_read(oi, pcol->ios); 286 ore_read(pcol->ios);
286 return __readpages_done(pcol); 287 return __readpages_done(pcol);
287 } 288 }
288 289
@@ -295,14 +296,14 @@ static int read_exec(struct page_collect *pcol)
295 *pcol_copy = *pcol; 296 *pcol_copy = *pcol;
296 ios->done = readpages_done; 297 ios->done = readpages_done;
297 ios->private = pcol_copy; 298 ios->private = pcol_copy;
298 ret = exofs_oi_read(oi, ios); 299 ret = ore_read(ios);
299 if (unlikely(ret)) 300 if (unlikely(ret))
300 goto err; 301 goto err;
301 302
302 atomic_inc(&pcol->sbi->s_curr_pending); 303 atomic_inc(&pcol->sbi->s_curr_pending);
303 304
304 EXOFS_DBGMSG2("read_exec obj=0x%llx start=0x%llx length=0x%lx\n", 305 EXOFS_DBGMSG2("read_exec obj=0x%llx start=0x%llx length=0x%lx\n",
305 ios->obj.id, _LLU(ios->offset), pcol->length); 306 oi->one_comp.obj.id, _LLU(ios->offset), pcol->length);
306 307
307 /* pages ownership was passed to pcol_copy */ 308 /* pages ownership was passed to pcol_copy */
308 _pcol_reset(pcol); 309 _pcol_reset(pcol);
@@ -457,14 +458,14 @@ static int exofs_readpage(struct file *file, struct page *page)
457} 458}
458 459
459/* Callback for osd_write. All writes are asynchronous */ 460/* Callback for osd_write. All writes are asynchronous */
460static void writepages_done(struct exofs_io_state *ios, void *p) 461static void writepages_done(struct ore_io_state *ios, void *p)
461{ 462{
462 struct page_collect *pcol = p; 463 struct page_collect *pcol = p;
463 int i; 464 int i;
464 u64 resid; 465 u64 resid;
465 u64 good_bytes; 466 u64 good_bytes;
466 u64 length = 0; 467 u64 length = 0;
467 int ret = exofs_check_io(ios, &resid); 468 int ret = ore_check_io(ios, &resid);
468 469
469 atomic_dec(&pcol->sbi->s_curr_pending); 470 atomic_dec(&pcol->sbi->s_curr_pending);
470 471
@@ -507,13 +508,21 @@ static void writepages_done(struct exofs_io_state *ios, void *p)
507static int write_exec(struct page_collect *pcol) 508static int write_exec(struct page_collect *pcol)
508{ 509{
509 struct exofs_i_info *oi = exofs_i(pcol->inode); 510 struct exofs_i_info *oi = exofs_i(pcol->inode);
510 struct exofs_io_state *ios = pcol->ios; 511 struct ore_io_state *ios;
511 struct page_collect *pcol_copy = NULL; 512 struct page_collect *pcol_copy = NULL;
512 int ret; 513 int ret;
513 514
514 if (!pcol->pages) 515 if (!pcol->pages)
515 return 0; 516 return 0;
516 517
518 BUG_ON(pcol->ios);
519 ret = ore_get_rw_state(&pcol->sbi->layout, &oi->comps, false,
520 pcol->pg_first << PAGE_CACHE_SHIFT,
521 pcol->length, &pcol->ios);
522
523 if (unlikely(ret))
524 goto err;
525
517 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); 526 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
518 if (!pcol_copy) { 527 if (!pcol_copy) {
519 EXOFS_ERR("write_exec: Failed to kmalloc(pcol)\n"); 528 EXOFS_ERR("write_exec: Failed to kmalloc(pcol)\n");
@@ -523,16 +532,15 @@ static int write_exec(struct page_collect *pcol)
523 532
524 *pcol_copy = *pcol; 533 *pcol_copy = *pcol;
525 534
535 ios = pcol->ios;
526 ios->pages = pcol_copy->pages; 536 ios->pages = pcol_copy->pages;
527 ios->nr_pages = pcol_copy->nr_pages; 537 ios->nr_pages = pcol_copy->nr_pages;
528 ios->offset = pcol_copy->pg_first << PAGE_CACHE_SHIFT;
529 ios->length = pcol_copy->length;
530 ios->done = writepages_done; 538 ios->done = writepages_done;
531 ios->private = pcol_copy; 539 ios->private = pcol_copy;
532 540
533 ret = exofs_oi_write(oi, ios); 541 ret = ore_write(ios);
534 if (unlikely(ret)) { 542 if (unlikely(ret)) {
535 EXOFS_ERR("write_exec: exofs_oi_write() Failed\n"); 543 EXOFS_ERR("write_exec: ore_write() Failed\n");
536 goto err; 544 goto err;
537 } 545 }
538 546
@@ -844,17 +852,15 @@ static inline int exofs_inode_is_fast_symlink(struct inode *inode)
844 return S_ISLNK(inode->i_mode) && (oi->i_data[0] != 0); 852 return S_ISLNK(inode->i_mode) && (oi->i_data[0] != 0);
845} 853}
846 854
847const struct osd_attr g_attr_logical_length = ATTR_DEF(
848 OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8);
849
850static int _do_truncate(struct inode *inode, loff_t newsize) 855static int _do_truncate(struct inode *inode, loff_t newsize)
851{ 856{
852 struct exofs_i_info *oi = exofs_i(inode); 857 struct exofs_i_info *oi = exofs_i(inode);
858 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
853 int ret; 859 int ret;
854 860
855 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 861 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
856 862
857 ret = exofs_oi_truncate(oi, (u64)newsize); 863 ret = ore_truncate(&sbi->layout, &oi->comps, (u64)newsize);
858 if (likely(!ret)) 864 if (likely(!ret))
859 truncate_setsize(inode, newsize); 865 truncate_setsize(inode, newsize);
860 866
@@ -917,30 +923,26 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
917 [1] = g_attr_inode_file_layout, 923 [1] = g_attr_inode_file_layout,
918 [2] = g_attr_inode_dir_layout, 924 [2] = g_attr_inode_dir_layout,
919 }; 925 };
920 struct exofs_io_state *ios; 926 struct ore_io_state *ios;
921 struct exofs_on_disk_inode_layout *layout; 927 struct exofs_on_disk_inode_layout *layout;
922 int ret; 928 int ret;
923 929
924 ret = exofs_get_io_state(&sbi->layout, &ios); 930 ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios);
925 if (unlikely(ret)) { 931 if (unlikely(ret)) {
926 EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); 932 EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
927 return ret; 933 return ret;
928 } 934 }
929 935
930 ios->obj.id = exofs_oi_objno(oi); 936 attrs[1].len = exofs_on_disk_inode_layout_size(sbi->comps.numdevs);
931 exofs_make_credential(oi->i_cred, &ios->obj); 937 attrs[2].len = exofs_on_disk_inode_layout_size(sbi->comps.numdevs);
932 ios->cred = oi->i_cred;
933
934 attrs[1].len = exofs_on_disk_inode_layout_size(sbi->layout.s_numdevs);
935 attrs[2].len = exofs_on_disk_inode_layout_size(sbi->layout.s_numdevs);
936 938
937 ios->in_attr = attrs; 939 ios->in_attr = attrs;
938 ios->in_attr_len = ARRAY_SIZE(attrs); 940 ios->in_attr_len = ARRAY_SIZE(attrs);
939 941
940 ret = exofs_sbi_read(ios); 942 ret = ore_read(ios);
941 if (unlikely(ret)) { 943 if (unlikely(ret)) {
942 EXOFS_ERR("object(0x%llx) corrupted, return empty file=>%d\n", 944 EXOFS_ERR("object(0x%llx) corrupted, return empty file=>%d\n",
943 _LLU(ios->obj.id), ret); 945 _LLU(oi->one_comp.obj.id), ret);
944 memset(inode, 0, sizeof(*inode)); 946 memset(inode, 0, sizeof(*inode));
945 inode->i_mode = 0040000 | (0777 & ~022); 947 inode->i_mode = 0040000 | (0777 & ~022);
946 /* If object is lost on target we might as well enable its 948 /* If object is lost on target we might as well enable its
@@ -990,7 +992,7 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
990 } 992 }
991 993
992out: 994out:
993 exofs_put_io_state(ios); 995 ore_put_io_state(ios);
994 return ret; 996 return ret;
995} 997}
996 998
@@ -1016,6 +1018,8 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
1016 return inode; 1018 return inode;
1017 oi = exofs_i(inode); 1019 oi = exofs_i(inode);
1018 __oi_init(oi); 1020 __oi_init(oi);
1021 exofs_init_comps(&oi->comps, &oi->one_comp, sb->s_fs_info,
1022 exofs_oi_objno(oi));
1019 1023
1020 /* read the inode from the osd */ 1024 /* read the inode from the osd */
1021 ret = exofs_get_inode(sb, oi, &fcb); 1025 ret = exofs_get_inode(sb, oi, &fcb);
@@ -1107,21 +1111,22 @@ int __exofs_wait_obj_created(struct exofs_i_info *oi)
1107 * set the obj_created flag so that other methods know that the object exists on 1111 * set the obj_created flag so that other methods know that the object exists on
1108 * the OSD. 1112 * the OSD.
1109 */ 1113 */
1110static void create_done(struct exofs_io_state *ios, void *p) 1114static void create_done(struct ore_io_state *ios, void *p)
1111{ 1115{
1112 struct inode *inode = p; 1116 struct inode *inode = p;
1113 struct exofs_i_info *oi = exofs_i(inode); 1117 struct exofs_i_info *oi = exofs_i(inode);
1114 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; 1118 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
1115 int ret; 1119 int ret;
1116 1120
1117 ret = exofs_check_io(ios, NULL); 1121 ret = ore_check_io(ios, NULL);
1118 exofs_put_io_state(ios); 1122 ore_put_io_state(ios);
1119 1123
1120 atomic_dec(&sbi->s_curr_pending); 1124 atomic_dec(&sbi->s_curr_pending);
1121 1125
1122 if (unlikely(ret)) { 1126 if (unlikely(ret)) {
1123 EXOFS_ERR("object=0x%llx creation failed in pid=0x%llx", 1127 EXOFS_ERR("object=0x%llx creation failed in pid=0x%llx",
1124 _LLU(exofs_oi_objno(oi)), _LLU(sbi->layout.s_pid)); 1128 _LLU(exofs_oi_objno(oi)),
1129 _LLU(oi->one_comp.obj.partition));
1125 /*TODO: When FS is corrupted creation can fail, object already 1130 /*TODO: When FS is corrupted creation can fail, object already
1126 * exist. Get rid of this asynchronous creation, if exist 1131 * exist. Get rid of this asynchronous creation, if exist
1127 * increment the obj counter and try the next object. Until we 1132 * increment the obj counter and try the next object. Until we
@@ -1140,14 +1145,13 @@ static void create_done(struct exofs_io_state *ios, void *p)
1140 */ 1145 */
1141struct inode *exofs_new_inode(struct inode *dir, int mode) 1146struct inode *exofs_new_inode(struct inode *dir, int mode)
1142{ 1147{
1143 struct super_block *sb; 1148 struct super_block *sb = dir->i_sb;
1149 struct exofs_sb_info *sbi = sb->s_fs_info;
1144 struct inode *inode; 1150 struct inode *inode;
1145 struct exofs_i_info *oi; 1151 struct exofs_i_info *oi;
1146 struct exofs_sb_info *sbi; 1152 struct ore_io_state *ios;
1147 struct exofs_io_state *ios;
1148 int ret; 1153 int ret;
1149 1154
1150 sb = dir->i_sb;
1151 inode = new_inode(sb); 1155 inode = new_inode(sb);
1152 if (!inode) 1156 if (!inode)
1153 return ERR_PTR(-ENOMEM); 1157 return ERR_PTR(-ENOMEM);
@@ -1157,8 +1161,6 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
1157 1161
1158 set_obj_2bcreated(oi); 1162 set_obj_2bcreated(oi);
1159 1163
1160 sbi = sb->s_fs_info;
1161
1162 inode->i_mapping->backing_dev_info = sb->s_bdi; 1164 inode->i_mapping->backing_dev_info = sb->s_bdi;
1163 inode_init_owner(inode, dir, mode); 1165 inode_init_owner(inode, dir, mode);
1164 inode->i_ino = sbi->s_nextid++; 1166 inode->i_ino = sbi->s_nextid++;
@@ -1170,25 +1172,24 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
1170 spin_unlock(&sbi->s_next_gen_lock); 1172 spin_unlock(&sbi->s_next_gen_lock);
1171 insert_inode_hash(inode); 1173 insert_inode_hash(inode);
1172 1174
1175 exofs_init_comps(&oi->comps, &oi->one_comp, sb->s_fs_info,
1176 exofs_oi_objno(oi));
1173 exofs_sbi_write_stats(sbi); /* Make sure new sbi->s_nextid is on disk */ 1177 exofs_sbi_write_stats(sbi); /* Make sure new sbi->s_nextid is on disk */
1174 1178
1175 mark_inode_dirty(inode); 1179 mark_inode_dirty(inode);
1176 1180
1177 ret = exofs_get_io_state(&sbi->layout, &ios); 1181 ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios);
1178 if (unlikely(ret)) { 1182 if (unlikely(ret)) {
1179 EXOFS_ERR("exofs_new_inode: exofs_get_io_state failed\n"); 1183 EXOFS_ERR("exofs_new_inode: ore_get_io_state failed\n");
1180 return ERR_PTR(ret); 1184 return ERR_PTR(ret);
1181 } 1185 }
1182 1186
1183 ios->obj.id = exofs_oi_objno(oi);
1184 exofs_make_credential(oi->i_cred, &ios->obj);
1185
1186 ios->done = create_done; 1187 ios->done = create_done;
1187 ios->private = inode; 1188 ios->private = inode;
1188 ios->cred = oi->i_cred; 1189
1189 ret = exofs_sbi_create(ios); 1190 ret = ore_create(ios);
1190 if (ret) { 1191 if (ret) {
1191 exofs_put_io_state(ios); 1192 ore_put_io_state(ios);
1192 return ERR_PTR(ret); 1193 return ERR_PTR(ret);
1193 } 1194 }
1194 atomic_inc(&sbi->s_curr_pending); 1195 atomic_inc(&sbi->s_curr_pending);
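The hunk above is the heart of the conversion: the object id and credential no longer live on the io_state (ios->obj.id and ios->cred are gone) but travel in oi->comps, filled once by exofs_init_comps(). A minimal sketch of the resulting async-create pattern, using only names from this hunk; error paths are compressed, so treat it as an illustration rather than the committed function:

    /* Sketch: ORE async object creation as now done by exofs_new_inode().
     * oi->comps already carries obj id + credential via exofs_init_comps().
     */
    static int sketch_create_object(struct exofs_sb_info *sbi,
                                    struct exofs_i_info *oi,
                                    struct inode *inode)
    {
            struct ore_io_state *ios;
            int ret;

            ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios);
            if (unlikely(ret))
                    return ret;

            ios->done = create_done;        /* async; create_done frees ios */
            ios->private = inode;

            ret = ore_create(ios);
            if (ret) {
                    ore_put_io_state(ios);  /* sync failure: we still own ios */
                    return ret;
            }
            atomic_inc(&sbi->s_curr_pending); /* create_done() decrements */
            return 0;
    }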
@@ -1207,11 +1208,11 @@ struct updatei_args {
1207/* 1208/*
1208 * Callback function from exofs_update_inode(). 1209 * Callback function from exofs_update_inode().
1209 */ 1210 */
1210static void updatei_done(struct exofs_io_state *ios, void *p) 1211static void updatei_done(struct ore_io_state *ios, void *p)
1211{ 1212{
1212 struct updatei_args *args = p; 1213 struct updatei_args *args = p;
1213 1214
1214 exofs_put_io_state(ios); 1215 ore_put_io_state(ios);
1215 1216
1216 atomic_dec(&args->sbi->s_curr_pending); 1217 atomic_dec(&args->sbi->s_curr_pending);
1217 1218
@@ -1227,7 +1228,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
1227 struct exofs_i_info *oi = exofs_i(inode); 1228 struct exofs_i_info *oi = exofs_i(inode);
1228 struct super_block *sb = inode->i_sb; 1229 struct super_block *sb = inode->i_sb;
1229 struct exofs_sb_info *sbi = sb->s_fs_info; 1230 struct exofs_sb_info *sbi = sb->s_fs_info;
1230 struct exofs_io_state *ios; 1231 struct ore_io_state *ios;
1231 struct osd_attr attr; 1232 struct osd_attr attr;
1232 struct exofs_fcb *fcb; 1233 struct exofs_fcb *fcb;
1233 struct updatei_args *args; 1234 struct updatei_args *args;
@@ -1266,9 +1267,9 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
1266 } else 1267 } else
1267 memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data)); 1268 memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data));
1268 1269
1269 ret = exofs_get_io_state(&sbi->layout, &ios); 1270 ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios);
1270 if (unlikely(ret)) { 1271 if (unlikely(ret)) {
1271 EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); 1272 EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
1272 goto free_args; 1273 goto free_args;
1273 } 1274 }
1274 1275
@@ -1285,13 +1286,13 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
1285 ios->private = args; 1286 ios->private = args;
1286 } 1287 }
1287 1288
1288 ret = exofs_oi_write(oi, ios); 1289 ret = ore_write(ios);
1289 if (!do_sync && !ret) { 1290 if (!do_sync && !ret) {
1290 atomic_inc(&sbi->s_curr_pending); 1291 atomic_inc(&sbi->s_curr_pending);
1291 goto out; /* deallocation in updatei_done */ 1292 goto out; /* deallocation in updatei_done */
1292 } 1293 }
1293 1294
1294 exofs_put_io_state(ios); 1295 ore_put_io_state(ios);
1295free_args: 1296free_args:
1296 kfree(args); 1297 kfree(args);
1297out: 1298out:
@@ -1310,11 +1311,11 @@ int exofs_write_inode(struct inode *inode, struct writeback_control *wbc)
1310 * Callback function from exofs_delete_inode() - don't have much cleaning up to 1311 * Callback function from exofs_delete_inode() - don't have much cleaning up to
1311 * do. 1312 * do.
1312 */ 1313 */
1313static void delete_done(struct exofs_io_state *ios, void *p) 1314static void delete_done(struct ore_io_state *ios, void *p)
1314{ 1315{
1315 struct exofs_sb_info *sbi = p; 1316 struct exofs_sb_info *sbi = p;
1316 1317
1317 exofs_put_io_state(ios); 1318 ore_put_io_state(ios);
1318 1319
1319 atomic_dec(&sbi->s_curr_pending); 1320 atomic_dec(&sbi->s_curr_pending);
1320} 1321}
@@ -1329,7 +1330,7 @@ void exofs_evict_inode(struct inode *inode)
1329 struct exofs_i_info *oi = exofs_i(inode); 1330 struct exofs_i_info *oi = exofs_i(inode);
1330 struct super_block *sb = inode->i_sb; 1331 struct super_block *sb = inode->i_sb;
1331 struct exofs_sb_info *sbi = sb->s_fs_info; 1332 struct exofs_sb_info *sbi = sb->s_fs_info;
1332 struct exofs_io_state *ios; 1333 struct ore_io_state *ios;
1333 int ret; 1334 int ret;
1334 1335
1335 truncate_inode_pages(&inode->i_data, 0); 1336 truncate_inode_pages(&inode->i_data, 0);
@@ -1349,20 +1350,19 @@ void exofs_evict_inode(struct inode *inode)
1349 /* ignore the error, attempt a remove anyway */ 1350 /* ignore the error, attempt a remove anyway */
1350 1351
1351 /* Now Remove the OSD objects */ 1352 /* Now Remove the OSD objects */
1352 ret = exofs_get_io_state(&sbi->layout, &ios); 1353 ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios);
1353 if (unlikely(ret)) { 1354 if (unlikely(ret)) {
1354 EXOFS_ERR("%s: exofs_get_io_state failed\n", __func__); 1355 EXOFS_ERR("%s: ore_get_io_state failed\n", __func__);
1355 return; 1356 return;
1356 } 1357 }
1357 1358
1358 ios->obj.id = exofs_oi_objno(oi);
1359 ios->done = delete_done; 1359 ios->done = delete_done;
1360 ios->private = sbi; 1360 ios->private = sbi;
1361 ios->cred = oi->i_cred; 1361
1362 ret = exofs_sbi_remove(ios); 1362 ret = ore_remove(ios);
1363 if (ret) { 1363 if (ret) {
1364 EXOFS_ERR("%s: exofs_sbi_remove failed\n", __func__); 1364 EXOFS_ERR("%s: ore_remove failed\n", __func__);
1365 exofs_put_io_state(ios); 1365 ore_put_io_state(ios);
1366 return; 1366 return;
1367 } 1367 }
1368 atomic_inc(&sbi->s_curr_pending); 1368 atomic_inc(&sbi->s_curr_pending);
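exofs_evict_inode() follows the same fire-and-forget shape as creation: every async submit pairs an atomic_inc of sbi->s_curr_pending at the call site with an atomic_dec in the ->done callback, which is what lets unmount drain in-flight I/O (see the wait loop in exofs_put_super() further down). The accounting contract, condensed from the hunk above:

    /* Shared by the create/update/delete paths above: */
    ios->done = delete_done;                /* delete_done() drops the count */
    ios->private = sbi;
    ret = ore_remove(ios);
    if (!ret)
            atomic_inc(&sbi->s_curr_pending); /* matched in delete_done() */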
diff --git a/fs/exofs/ios.c b/fs/exofs/ore.c
index f74a2ec027a6..25305af88198 100644
--- a/fs/exofs/ios.c
+++ b/fs/exofs/ore.c
@@ -23,81 +23,87 @@
23 */ 23 */
24 24
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <scsi/scsi_device.h>
27#include <asm/div64.h> 26#include <asm/div64.h>
28 27
29#include "exofs.h" 28#include <scsi/osd_ore.h>
30 29
31#define EXOFS_DBGMSG2(M...) do {} while (0) 30#define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a)
32/* #define EXOFS_DBGMSG2 EXOFS_DBGMSG */
33 31
34void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj) 32#ifdef CONFIG_EXOFS_DEBUG
35{ 33#define ORE_DBGMSG(fmt, a...) \
36 osd_sec_init_nosec_doall_caps(cred_a, obj, false, true); 34 printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a)
37} 35#else
36#define ORE_DBGMSG(fmt, a...) \
37 do { if (0) printk(fmt, ##a); } while (0)
38#endif
38 39
39int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj, 40/* u64 has problems with printk; this will cast it to unsigned long long */
40 u64 offset, void *p, unsigned length) 41#define _LLU(x) (unsigned long long)(x)
41{
42 struct osd_request *or = osd_start_request(od, GFP_KERNEL);
43/* struct osd_sense_info osi = {.key = 0};*/
44 int ret;
45 42
46 if (unlikely(!or)) { 43#define ORE_DBGMSG2(M...) do {} while (0)
47 EXOFS_DBGMSG("%s: osd_start_request failed.\n", __func__); 44/* #define ORE_DBGMSG2 ORE_DBGMSG */
48 return -ENOMEM;
49 }
50 ret = osd_req_read_kern(or, obj, offset, p, length);
51 if (unlikely(ret)) {
52 EXOFS_DBGMSG("%s: osd_req_read_kern failed.\n", __func__);
53 goto out;
54 }
55 45
56 ret = osd_finalize_request(or, 0, cred, NULL); 46MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>");
57 if (unlikely(ret)) { 47MODULE_DESCRIPTION("Objects Raid Engine ore.ko");
58 EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", ret); 48MODULE_LICENSE("GPL");
59 goto out;
60 }
61 49
62 ret = osd_execute_request(or); 50static u8 *_ios_cred(struct ore_io_state *ios, unsigned index)
63 if (unlikely(ret)) 51{
64 EXOFS_DBGMSG("osd_execute_request() => %d\n", ret); 52 return ios->comps->comps[index & ios->comps->single_comp].cred;
65 /* osd_req_decode_sense(or, ret); */ 53}
66 54
67out: 55static struct osd_obj_id *_ios_obj(struct ore_io_state *ios, unsigned index)
68 osd_end_request(or); 56{
69 return ret; 57 return &ios->comps->comps[index & ios->comps->single_comp].obj;
70} 58}
71 59
72int exofs_get_io_state(struct exofs_layout *layout, 60static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index)
73 struct exofs_io_state **pios)
74{ 61{
75 struct exofs_io_state *ios; 62 return ios->comps->ods[index];
63}
64
65int ore_get_rw_state(struct ore_layout *layout, struct ore_components *comps,
66 bool is_reading, u64 offset, u64 length,
67 struct ore_io_state **pios)
68{
69 struct ore_io_state *ios;
76 70
77 /*TODO: Maybe use a kmem_cache per sbi of size 71 /*TODO: Maybe use a kmem_cache per sbi of size
78 * exofs_io_state_size(layout->s_numdevs) 72 * exofs_io_state_size(layout->s_numdevs)
79 */ 73 */
80 ios = kzalloc(exofs_io_state_size(layout->s_numdevs), GFP_KERNEL); 74 ios = kzalloc(ore_io_state_size(comps->numdevs), GFP_KERNEL);
81 if (unlikely(!ios)) { 75 if (unlikely(!ios)) {
82 EXOFS_DBGMSG("Failed kzalloc bytes=%d\n", 76 ORE_DBGMSG("Failed kzalloc bytes=%d\n",
83 exofs_io_state_size(layout->s_numdevs)); 77 ore_io_state_size(comps->numdevs));
84 *pios = NULL; 78 *pios = NULL;
85 return -ENOMEM; 79 return -ENOMEM;
86 } 80 }
87 81
88 ios->layout = layout; 82 ios->layout = layout;
89 ios->obj.partition = layout->s_pid; 83 ios->comps = comps;
84 ios->offset = offset;
85 ios->length = length;
86 ios->reading = is_reading;
87
90 *pios = ios; 88 *pios = ios;
91 return 0; 89 return 0;
92} 90}
91EXPORT_SYMBOL(ore_get_rw_state);
92
93int ore_get_io_state(struct ore_layout *layout, struct ore_components *comps,
94 struct ore_io_state **ios)
95{
96 return ore_get_rw_state(layout, comps, true, 0, 0, ios);
97}
98EXPORT_SYMBOL(ore_get_io_state);
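ore_get_io_state() is now just ore_get_rw_state() with offset and length zeroed, and the per-device helpers above resolve credential, object id and device from the component table. The index & ios->comps->single_comp mask lets one table serve both shapes: EC_SINGLE_COMP (set in the super.c hunk below) presumably reads as 0, collapsing every index onto comps[0], while a full per-device table would use an all-ones mask. A hedged usage sketch; the kern_buff choice is an assumption modeled on the callers in inode.c and super.c:

    /* Sketch: synchronous kernel-buffer read through the new API. */
    static int sketch_sync_read(struct ore_layout *layout,
                                struct ore_components *comps,
                                void *buf, u64 offset, u64 length)
    {
            struct ore_io_state *ios;
            int ret;

            ret = ore_get_rw_state(layout, comps, true /* reading */,
                                   offset, length, &ios);
            if (unlikely(ret))
                    return ret;

            ios->kern_buff = buf;   /* kernel-buffer (non-pages) path */
            ret = ore_read(ios);    /* ios->done == NULL => synchronous */
            ore_put_io_state(ios);
            return ret;
    }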
93 99
94void exofs_put_io_state(struct exofs_io_state *ios) 100void ore_put_io_state(struct ore_io_state *ios)
95{ 101{
96 if (ios) { 102 if (ios) {
97 unsigned i; 103 unsigned i;
98 104
99 for (i = 0; i < ios->numdevs; i++) { 105 for (i = 0; i < ios->numdevs; i++) {
100 struct exofs_per_dev_state *per_dev = &ios->per_dev[i]; 106 struct ore_per_dev_state *per_dev = &ios->per_dev[i];
101 107
102 if (per_dev->or) 108 if (per_dev->or)
103 osd_end_request(per_dev->or); 109 osd_end_request(per_dev->or);
@@ -108,31 +114,9 @@ void exofs_put_io_state(struct exofs_io_state *ios)
108 kfree(ios); 114 kfree(ios);
109 } 115 }
110} 116}
117EXPORT_SYMBOL(ore_put_io_state);
111 118
112unsigned exofs_layout_od_id(struct exofs_layout *layout, 119static void _sync_done(struct ore_io_state *ios, void *p)
113 osd_id obj_no, unsigned layout_index)
114{
115/* switch (layout->lay_func) {
116 case LAYOUT_MOVING_WINDOW:
117 {*/
118 unsigned dev_mod = obj_no;
119
120 return (layout_index + dev_mod * layout->mirrors_p1) %
121 layout->s_numdevs;
122/* }
123 case LAYOUT_FUNC_IMPLICT:
124 return layout->devs[layout_index];
125 }*/
126}
127
128static inline struct osd_dev *exofs_ios_od(struct exofs_io_state *ios,
129 unsigned layout_index)
130{
131 return ios->layout->s_ods[
132 exofs_layout_od_id(ios->layout, ios->obj.id, layout_index)];
133}
134
135static void _sync_done(struct exofs_io_state *ios, void *p)
136{ 120{
137 struct completion *waiting = p; 121 struct completion *waiting = p;
138 122
@@ -141,20 +125,20 @@ static void _sync_done(struct exofs_io_state *ios, void *p)
141 125
142static void _last_io(struct kref *kref) 126static void _last_io(struct kref *kref)
143{ 127{
144 struct exofs_io_state *ios = container_of( 128 struct ore_io_state *ios = container_of(
145 kref, struct exofs_io_state, kref); 129 kref, struct ore_io_state, kref);
146 130
147 ios->done(ios, ios->private); 131 ios->done(ios, ios->private);
148} 132}
149 133
150static void _done_io(struct osd_request *or, void *p) 134static void _done_io(struct osd_request *or, void *p)
151{ 135{
152 struct exofs_io_state *ios = p; 136 struct ore_io_state *ios = p;
153 137
154 kref_put(&ios->kref, _last_io); 138 kref_put(&ios->kref, _last_io);
155} 139}
156 140
157static int exofs_io_execute(struct exofs_io_state *ios) 141static int ore_io_execute(struct ore_io_state *ios)
158{ 142{
159 DECLARE_COMPLETION_ONSTACK(wait); 143 DECLARE_COMPLETION_ONSTACK(wait);
160 bool sync = (ios->done == NULL); 144 bool sync = (ios->done == NULL);
@@ -170,9 +154,9 @@ static int exofs_io_execute(struct exofs_io_state *ios)
170 if (unlikely(!or)) 154 if (unlikely(!or))
171 continue; 155 continue;
172 156
173 ret = osd_finalize_request(or, 0, ios->cred, NULL); 157 ret = osd_finalize_request(or, 0, _ios_cred(ios, i), NULL);
174 if (unlikely(ret)) { 158 if (unlikely(ret)) {
175 EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", 159 ORE_DBGMSG("Failed to osd_finalize_request() => %d\n",
176 ret); 160 ret);
177 return ret; 161 return ret;
178 } 162 }
@@ -194,7 +178,7 @@ static int exofs_io_execute(struct exofs_io_state *ios)
194 178
195 if (sync) { 179 if (sync) {
196 wait_for_completion(&wait); 180 wait_for_completion(&wait);
197 ret = exofs_check_io(ios, NULL); 181 ret = ore_check_io(ios, NULL);
198 } 182 }
199 return ret; 183 return ret;
200} 184}
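ore_io_execute() preserves the old dual personality: with ios->done == NULL the call is synchronous. The elided middle of the function presumably installs _sync_done() with an on-stack completion, takes one kref per submitted request, and _last_io() fires ios->done once the last request drops its reference. Condensed control flow, a sketch rather than the committed body:

    DECLARE_COMPLETION_ONSTACK(wait);
    bool sync = (ios->done == NULL);

    if (sync) {
            ios->done = _sync_done;
            ios->private = &wait;
    }
    /* ... submit every per_dev[i].or, one kref_get(&ios->kref) each ... */
    if (sync) {
            wait_for_completion(&wait);     /* _last_io() -> _sync_done() */
            ret = ore_check_io(ios, NULL);  /* harvest per-device errors */
    }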
@@ -214,7 +198,7 @@ static void _clear_bio(struct bio *bio)
214 } 198 }
215} 199}
216 200
217int exofs_check_io(struct exofs_io_state *ios, u64 *resid) 201int ore_check_io(struct ore_io_state *ios, u64 *resid)
218{ 202{
219 enum osd_err_priority acumulated_osd_err = 0; 203 enum osd_err_priority acumulated_osd_err = 0;
220 int acumulated_lin_err = 0; 204 int acumulated_lin_err = 0;
@@ -235,7 +219,7 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid)
235 if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { 219 if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) {
236 /* start read offset past end of file */ 220 /* start read offset past end of file */
237 _clear_bio(ios->per_dev[i].bio); 221 _clear_bio(ios->per_dev[i].bio);
238 EXOFS_DBGMSG("start read offset past end of file " 222 ORE_DBGMSG("start read offset past end of file "
239 "offset=0x%llx, length=0x%llx\n", 223 "offset=0x%llx, length=0x%llx\n",
240 _LLU(ios->per_dev[i].offset), 224 _LLU(ios->per_dev[i].offset),
241 _LLU(ios->per_dev[i].length)); 225 _LLU(ios->per_dev[i].length));
@@ -259,6 +243,7 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid)
259 243
260 return acumulated_lin_err; 244 return acumulated_lin_err;
261} 245}
246EXPORT_SYMBOL(ore_check_io);
262 247
263/* 248/*
264 * L - logical offset into the file 249 * L - logical offset into the file
@@ -305,20 +290,21 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid)
305struct _striping_info { 290struct _striping_info {
306 u64 obj_offset; 291 u64 obj_offset;
307 u64 group_length; 292 u64 group_length;
293 u64 M; /* for truncate */
308 unsigned dev; 294 unsigned dev;
309 unsigned unit_off; 295 unsigned unit_off;
310}; 296};
311 297
312static void _calc_stripe_info(struct exofs_io_state *ios, u64 file_offset, 298static void _calc_stripe_info(struct ore_layout *layout, u64 file_offset,
313 struct _striping_info *si) 299 struct _striping_info *si)
314{ 300{
315 u32 stripe_unit = ios->layout->stripe_unit; 301 u32 stripe_unit = layout->stripe_unit;
316 u32 group_width = ios->layout->group_width; 302 u32 group_width = layout->group_width;
317 u64 group_depth = ios->layout->group_depth; 303 u64 group_depth = layout->group_depth;
318 304
319 u32 U = stripe_unit * group_width; 305 u32 U = stripe_unit * group_width;
320 u64 T = U * group_depth; 306 u64 T = U * group_depth;
321 u64 S = T * ios->layout->group_count; 307 u64 S = T * layout->group_count;
322 u64 M = div64_u64(file_offset, S); 308 u64 M = div64_u64(file_offset, S);
323 309
324 /* 310 /*
@@ -333,7 +319,7 @@ static void _calc_stripe_info(struct exofs_io_state *ios, u64 file_offset,
333 319
334 /* "H - (N * U)" is just "H % U" so it's bound to u32 */ 320 /* "H - (N * U)" is just "H % U" so it's bound to u32 */
335 si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width; 321 si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width;
336 si->dev *= ios->layout->mirrors_p1; 322 si->dev *= layout->mirrors_p1;
337 323
338 div_u64_rem(file_offset, stripe_unit, &si->unit_off); 324 div_u64_rem(file_offset, stripe_unit, &si->unit_off);
339 325
@@ -341,15 +327,16 @@ static void _calc_stripe_info(struct exofs_io_state *ios, u64 file_offset,
341 (M * group_depth * stripe_unit); 327 (M * group_depth * stripe_unit);
342 328
343 si->group_length = T - H; 329 si->group_length = T - H;
330 si->M = M;
344} 331}
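_calc_stripe_info() now takes the layout directly instead of an io_state, which is what lets ore_truncate() below reuse it without building any I/O. The mapping walks the layout hierarchy: stripe_unit slices make a stripe (U = stripe_unit * group_width), group_depth stripes make a group (T), and group_count groups make one full cycle (S). A worked instance follows; the layout numbers are illustrative only, and G/H/N are computed in lines elided from this hunk, following the formulas documented in the source comment block:

    /* Worked example for _calc_stripe_info(). Assumed layout:
     *   stripe_unit = 64K, group_width = 4, group_depth = 2,
     *   group_count = 2, mirrors_p1 = 1, file_offset = 600K
     *
     *   U = 64K * 4   = 256K      one full stripe
     *   T = 256K * 2  = 512K      one group
     *   S = 512K * 2  = 1M        one cycle over all groups
     *   M = 600K / 1M = 0         cycle number (kept in si->M for truncate)
     *   G = (600K - 0*1M) / 512K = 1     group within the cycle
     *   H = 600K - 1*512K        = 88K   offset within the group
     *   N = 88K / 256K           = 0     stripe within the group
     *
     *   si->dev          = (88K - 0*256K)/64K + 1*4 = 5, then *1 (mirrors_p1)
     *   si->unit_off     = 600K % 64K               = 24K
     *   si->obj_offset   = 24K + 0*64K + 0*2*64K    = 24K
     *   si->group_length = 512K - 88K               = 424K
     */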
345 332
346static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg, 333static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
347 unsigned pgbase, struct exofs_per_dev_state *per_dev, 334 unsigned pgbase, struct ore_per_dev_state *per_dev,
348 int cur_len) 335 int cur_len)
349{ 336{
350 unsigned pg = *cur_pg; 337 unsigned pg = *cur_pg;
351 struct request_queue *q = 338 struct request_queue *q =
352 osd_request_queue(exofs_ios_od(ios, per_dev->dev)); 339 osd_request_queue(_ios_od(ios, per_dev->dev));
353 340
354 per_dev->length += cur_len; 341 per_dev->length += cur_len;
355 342
@@ -361,7 +348,7 @@ static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg,
361 348
362 per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); 349 per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
363 if (unlikely(!per_dev->bio)) { 350 if (unlikely(!per_dev->bio)) {
364 EXOFS_DBGMSG("Failed to allocate BIO size=%u\n", 351 ORE_DBGMSG("Failed to allocate BIO size=%u\n",
365 bio_size); 352 bio_size);
366 return -ENOMEM; 353 return -ENOMEM;
367 } 354 }
@@ -387,7 +374,7 @@ static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg,
387 return 0; 374 return 0;
388} 375}
389 376
390static int _prepare_one_group(struct exofs_io_state *ios, u64 length, 377static int _prepare_one_group(struct ore_io_state *ios, u64 length,
391 struct _striping_info *si) 378 struct _striping_info *si)
392{ 379{
393 unsigned stripe_unit = ios->layout->stripe_unit; 380 unsigned stripe_unit = ios->layout->stripe_unit;
@@ -400,7 +387,7 @@ static int _prepare_one_group(struct exofs_io_state *ios, u64 length,
400 int ret = 0; 387 int ret = 0;
401 388
402 while (length) { 389 while (length) {
403 struct exofs_per_dev_state *per_dev = &ios->per_dev[dev]; 390 struct ore_per_dev_state *per_dev = &ios->per_dev[dev];
404 unsigned cur_len, page_off = 0; 391 unsigned cur_len, page_off = 0;
405 392
406 if (!per_dev->length) { 393 if (!per_dev->length) {
@@ -443,7 +430,7 @@ out:
443 return ret; 430 return ret;
444} 431}
445 432
446static int _prepare_for_striping(struct exofs_io_state *ios) 433static int _prepare_for_striping(struct ore_io_state *ios)
447{ 434{
448 u64 length = ios->length; 435 u64 length = ios->length;
449 u64 offset = ios->offset; 436 u64 offset = ios->offset;
@@ -452,9 +439,9 @@ static int _prepare_for_striping(struct exofs_io_state *ios)
452 439
453 if (!ios->pages) { 440 if (!ios->pages) {
454 if (ios->kern_buff) { 441 if (ios->kern_buff) {
455 struct exofs_per_dev_state *per_dev = &ios->per_dev[0]; 442 struct ore_per_dev_state *per_dev = &ios->per_dev[0];
456 443
457 _calc_stripe_info(ios, ios->offset, &si); 444 _calc_stripe_info(ios->layout, ios->offset, &si);
458 per_dev->offset = si.obj_offset; 445 per_dev->offset = si.obj_offset;
459 per_dev->dev = si.dev; 446 per_dev->dev = si.dev;
460 447
@@ -468,7 +455,7 @@ static int _prepare_for_striping(struct exofs_io_state *ios)
468 } 455 }
469 456
470 while (length) { 457 while (length) {
471 _calc_stripe_info(ios, offset, &si); 458 _calc_stripe_info(ios->layout, offset, &si);
472 459
473 if (length < si.group_length) 460 if (length < si.group_length)
474 si.group_length = length; 461 si.group_length = length;
@@ -485,57 +472,59 @@ out:
485 return ret; 472 return ret;
486} 473}
487 474
488int exofs_sbi_create(struct exofs_io_state *ios) 475int ore_create(struct ore_io_state *ios)
489{ 476{
490 int i, ret; 477 int i, ret;
491 478
492 for (i = 0; i < ios->layout->s_numdevs; i++) { 479 for (i = 0; i < ios->comps->numdevs; i++) {
493 struct osd_request *or; 480 struct osd_request *or;
494 481
495 or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL); 482 or = osd_start_request(_ios_od(ios, i), GFP_KERNEL);
496 if (unlikely(!or)) { 483 if (unlikely(!or)) {
497 EXOFS_ERR("%s: osd_start_request failed\n", __func__); 484 ORE_ERR("%s: osd_start_request failed\n", __func__);
498 ret = -ENOMEM; 485 ret = -ENOMEM;
499 goto out; 486 goto out;
500 } 487 }
501 ios->per_dev[i].or = or; 488 ios->per_dev[i].or = or;
502 ios->numdevs++; 489 ios->numdevs++;
503 490
504 osd_req_create_object(or, &ios->obj); 491 osd_req_create_object(or, _ios_obj(ios, i));
505 } 492 }
506 ret = exofs_io_execute(ios); 493 ret = ore_io_execute(ios);
507 494
508out: 495out:
509 return ret; 496 return ret;
510} 497}
498EXPORT_SYMBOL(ore_create);
511 499
512int exofs_sbi_remove(struct exofs_io_state *ios) 500int ore_remove(struct ore_io_state *ios)
513{ 501{
514 int i, ret; 502 int i, ret;
515 503
516 for (i = 0; i < ios->layout->s_numdevs; i++) { 504 for (i = 0; i < ios->comps->numdevs; i++) {
517 struct osd_request *or; 505 struct osd_request *or;
518 506
519 or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL); 507 or = osd_start_request(_ios_od(ios, i), GFP_KERNEL);
520 if (unlikely(!or)) { 508 if (unlikely(!or)) {
521 EXOFS_ERR("%s: osd_start_request failed\n", __func__); 509 ORE_ERR("%s: osd_start_request failed\n", __func__);
522 ret = -ENOMEM; 510 ret = -ENOMEM;
523 goto out; 511 goto out;
524 } 512 }
525 ios->per_dev[i].or = or; 513 ios->per_dev[i].or = or;
526 ios->numdevs++; 514 ios->numdevs++;
527 515
528 osd_req_remove_object(or, &ios->obj); 516 osd_req_remove_object(or, _ios_obj(ios, i));
529 } 517 }
530 ret = exofs_io_execute(ios); 518 ret = ore_io_execute(ios);
531 519
532out: 520out:
533 return ret; 521 return ret;
534} 522}
523EXPORT_SYMBOL(ore_remove);
535 524
536static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp) 525static int _write_mirror(struct ore_io_state *ios, int cur_comp)
537{ 526{
538 struct exofs_per_dev_state *master_dev = &ios->per_dev[cur_comp]; 527 struct ore_per_dev_state *master_dev = &ios->per_dev[cur_comp];
539 unsigned dev = ios->per_dev[cur_comp].dev; 528 unsigned dev = ios->per_dev[cur_comp].dev;
540 unsigned last_comp = cur_comp + ios->layout->mirrors_p1; 529 unsigned last_comp = cur_comp + ios->layout->mirrors_p1;
541 int ret = 0; 530 int ret = 0;
@@ -544,12 +533,12 @@ static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp)
544 return 0; /* Just an empty slot */ 533 return 0; /* Just an empty slot */
545 534
546 for (; cur_comp < last_comp; ++cur_comp, ++dev) { 535 for (; cur_comp < last_comp; ++cur_comp, ++dev) {
547 struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp]; 536 struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp];
548 struct osd_request *or; 537 struct osd_request *or;
549 538
550 or = osd_start_request(exofs_ios_od(ios, dev), GFP_KERNEL); 539 or = osd_start_request(_ios_od(ios, dev), GFP_KERNEL);
551 if (unlikely(!or)) { 540 if (unlikely(!or)) {
552 EXOFS_ERR("%s: osd_start_request failed\n", __func__); 541 ORE_ERR("%s: osd_start_request failed\n", __func__);
553 ret = -ENOMEM; 542 ret = -ENOMEM;
554 goto out; 543 goto out;
555 } 544 }
@@ -563,7 +552,7 @@ static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp)
563 bio = bio_kmalloc(GFP_KERNEL, 552 bio = bio_kmalloc(GFP_KERNEL,
564 master_dev->bio->bi_max_vecs); 553 master_dev->bio->bi_max_vecs);
565 if (unlikely(!bio)) { 554 if (unlikely(!bio)) {
566 EXOFS_DBGMSG( 555 ORE_DBGMSG(
567 "Failed to allocate BIO size=%u\n", 556 "Failed to allocate BIO size=%u\n",
568 master_dev->bio->bi_max_vecs); 557 master_dev->bio->bi_max_vecs);
569 ret = -ENOMEM; 558 ret = -ENOMEM;
@@ -582,25 +571,29 @@ static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp)
582 bio->bi_rw |= REQ_WRITE; 571 bio->bi_rw |= REQ_WRITE;
583 } 572 }
584 573
585 osd_req_write(or, &ios->obj, per_dev->offset, bio, 574 osd_req_write(or, _ios_obj(ios, dev), per_dev->offset,
586 per_dev->length); 575 bio, per_dev->length);
587 EXOFS_DBGMSG("write(0x%llx) offset=0x%llx " 576 ORE_DBGMSG("write(0x%llx) offset=0x%llx "
588 "length=0x%llx dev=%d\n", 577 "length=0x%llx dev=%d\n",
589 _LLU(ios->obj.id), _LLU(per_dev->offset), 578 _LLU(_ios_obj(ios, dev)->id),
579 _LLU(per_dev->offset),
590 _LLU(per_dev->length), dev); 580 _LLU(per_dev->length), dev);
591 } else if (ios->kern_buff) { 581 } else if (ios->kern_buff) {
592 ret = osd_req_write_kern(or, &ios->obj, per_dev->offset, 582 ret = osd_req_write_kern(or, _ios_obj(ios, dev),
593 ios->kern_buff, ios->length); 583 per_dev->offset,
584 ios->kern_buff, ios->length);
594 if (unlikely(ret)) 585 if (unlikely(ret))
595 goto out; 586 goto out;
596 EXOFS_DBGMSG2("write_kern(0x%llx) offset=0x%llx " 587 ORE_DBGMSG2("write_kern(0x%llx) offset=0x%llx "
597 "length=0x%llx dev=%d\n", 588 "length=0x%llx dev=%d\n",
598 _LLU(ios->obj.id), _LLU(per_dev->offset), 589 _LLU(_ios_obj(ios, dev)->id),
590 _LLU(per_dev->offset),
599 _LLU(ios->length), dev); 591 _LLU(ios->length), dev);
600 } else { 592 } else {
601 osd_req_set_attributes(or, &ios->obj); 593 osd_req_set_attributes(or, _ios_obj(ios, dev));
602 EXOFS_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n", 594 ORE_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n",
603 _LLU(ios->obj.id), ios->out_attr_len, dev); 595 _LLU(_ios_obj(ios, dev)->id),
596 ios->out_attr_len, dev);
604 } 597 }
605 598
606 if (ios->out_attr) 599 if (ios->out_attr)
@@ -616,7 +609,7 @@ out:
616 return ret; 609 return ret;
617} 610}
618 611
619int exofs_sbi_write(struct exofs_io_state *ios) 612int ore_write(struct ore_io_state *ios)
620{ 613{
621 int i; 614 int i;
622 int ret; 615 int ret;
@@ -626,52 +619,55 @@ int exofs_sbi_write(struct exofs_io_state *ios)
626 return ret; 619 return ret;
627 620
628 for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { 621 for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
629 ret = _sbi_write_mirror(ios, i); 622 ret = _write_mirror(ios, i);
630 if (unlikely(ret)) 623 if (unlikely(ret))
631 return ret; 624 return ret;
632 } 625 }
633 626
634 ret = exofs_io_execute(ios); 627 ret = ore_io_execute(ios);
635 return ret; 628 return ret;
636} 629}
630EXPORT_SYMBOL(ore_write);
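ore_write() walks per_dev[] in strides of mirrors_p1 because _prepare_for_striping() places stripe components only at the master slots; _write_mirror() then clones the master bio into each following mirror slot. Schematically, an assumption spelled out from the stride above:

    /* per_dev[] layout under mirroring, e.g. group_width = 2, mirrors_p1 = 2:
     *
     *   per_dev[0]  stripe comp 0, master bio
     *   per_dev[1]    mirror of comp 0 (bio cloned in _write_mirror)
     *   per_dev[2]  stripe comp 1, master bio
     *   per_dev[3]    mirror of comp 1
     *
     * hence:
     *   for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1)
     *           _write_mirror(ios, i);
     */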
637 631
638static int _sbi_read_mirror(struct exofs_io_state *ios, unsigned cur_comp) 632static int _read_mirror(struct ore_io_state *ios, unsigned cur_comp)
639{ 633{
640 struct osd_request *or; 634 struct osd_request *or;
641 struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp]; 635 struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp];
642 unsigned first_dev = (unsigned)ios->obj.id; 636 struct osd_obj_id *obj = _ios_obj(ios, cur_comp);
637 unsigned first_dev = (unsigned)obj->id;
643 638
644 if (ios->pages && !per_dev->length) 639 if (ios->pages && !per_dev->length)
645 return 0; /* Just an empty slot */ 640 return 0; /* Just an empty slot */
646 641
647 first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1; 642 first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1;
648 or = osd_start_request(exofs_ios_od(ios, first_dev), GFP_KERNEL); 643 or = osd_start_request(_ios_od(ios, first_dev), GFP_KERNEL);
649 if (unlikely(!or)) { 644 if (unlikely(!or)) {
650 EXOFS_ERR("%s: osd_start_request failed\n", __func__); 645 ORE_ERR("%s: osd_start_request failed\n", __func__);
651 return -ENOMEM; 646 return -ENOMEM;
652 } 647 }
653 per_dev->or = or; 648 per_dev->or = or;
654 649
655 if (ios->pages) { 650 if (ios->pages) {
656 osd_req_read(or, &ios->obj, per_dev->offset, 651 osd_req_read(or, obj, per_dev->offset,
657 per_dev->bio, per_dev->length); 652 per_dev->bio, per_dev->length);
658 EXOFS_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx" 653 ORE_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx"
659 " dev=%d\n", _LLU(ios->obj.id), 654 " dev=%d\n", _LLU(obj->id),
660 _LLU(per_dev->offset), _LLU(per_dev->length), 655 _LLU(per_dev->offset), _LLU(per_dev->length),
661 first_dev); 656 first_dev);
662 } else if (ios->kern_buff) { 657 } else if (ios->kern_buff) {
663 int ret = osd_req_read_kern(or, &ios->obj, per_dev->offset, 658 int ret = osd_req_read_kern(or, obj, per_dev->offset,
664 ios->kern_buff, ios->length); 659 ios->kern_buff, ios->length);
665 EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx " 660 ORE_DBGMSG2("read_kern(0x%llx) offset=0x%llx "
666 "length=0x%llx dev=%d ret=>%d\n", 661 "length=0x%llx dev=%d ret=>%d\n",
667 _LLU(ios->obj.id), _LLU(per_dev->offset), 662 _LLU(obj->id), _LLU(per_dev->offset),
668 _LLU(ios->length), first_dev, ret); 663 _LLU(ios->length), first_dev, ret);
669 if (unlikely(ret)) 664 if (unlikely(ret))
670 return ret; 665 return ret;
671 } else { 666 } else {
672 osd_req_get_attributes(or, &ios->obj); 667 osd_req_get_attributes(or, obj);
673 EXOFS_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n", 668 ORE_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n",
674 _LLU(ios->obj.id), ios->in_attr_len, first_dev); 669 _LLU(obj->id),
670 ios->in_attr_len, first_dev);
675 } 671 }
676 if (ios->out_attr) 672 if (ios->out_attr)
677 osd_req_add_set_attr_list(or, ios->out_attr, ios->out_attr_len); 673 osd_req_add_set_attr_list(or, ios->out_attr, ios->out_attr_len);
@@ -682,7 +678,7 @@ static int _sbi_read_mirror(struct exofs_io_state *ios, unsigned cur_comp)
682 return 0; 678 return 0;
683} 679}
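Unlike writes, a read touches a single replica per component. _read_mirror() picks it by folding the object id into the mirror set, so reads of different objects spread across the mirrors:

    /* From _read_mirror() above: per_dev->dev is the master slot; obj->id
     * selects which replica services this read.
     * Example with mirrors_p1 = 3 and per_dev->dev = 6:
     *   obj->id % 3 == 0  ->  device 6
     *   obj->id % 3 == 1  ->  device 7
     *   obj->id % 3 == 2  ->  device 8
     */
    first_dev = per_dev->dev + (unsigned)obj->id % ios->layout->mirrors_p1;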
684 680
685int exofs_sbi_read(struct exofs_io_state *ios) 681int ore_read(struct ore_io_state *ios)
686{ 682{
687 int i; 683 int i;
688 int ret; 684 int ret;
@@ -692,16 +688,17 @@ int exofs_sbi_read(struct exofs_io_state *ios)
692 return ret; 688 return ret;
693 689
694 for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { 690 for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
695 ret = _sbi_read_mirror(ios, i); 691 ret = _read_mirror(ios, i);
696 if (unlikely(ret)) 692 if (unlikely(ret))
697 return ret; 693 return ret;
698 } 694 }
699 695
700 ret = exofs_io_execute(ios); 696 ret = ore_io_execute(ios);
701 return ret; 697 return ret;
702} 698}
699EXPORT_SYMBOL(ore_read);
703 700
704int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr) 701int extract_attr_from_ios(struct ore_io_state *ios, struct osd_attr *attr)
705{ 702{
706 struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */ 703 struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */
707 void *iter = NULL; 704 void *iter = NULL;
@@ -721,83 +718,118 @@ int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr)
721 718
722 return -EIO; 719 return -EIO;
723} 720}
721EXPORT_SYMBOL(extract_attr_from_ios);
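extract_attr_from_ios() scans the decoded get-attr list of the first per-device request for a matching page/id and hands back its value pointer. Together with the attributes-only branch of _read_mirror() above (no pages, no kern_buff), this gives a compact attribute-fetch pattern. A hedged sketch; the get_unaligned_be64() decode is an assumption based on the 8-byte ATTR_DEF of g_attr_logical_length at the bottom of this file:

    #include <asm/unaligned.h>

    /* Sketch: read one OSD attribute. ios comes from ore_get_io_state()
     * with no pages/kern_buff set, so ore_read() issues get_attributes.
     */
    static int sketch_read_length(struct ore_io_state *ios, u64 *len)
    {
            struct osd_attr attr = g_attr_logical_length;
            int ret;

            ios->in_attr = &attr;
            ios->in_attr_len = 1;

            ret = ore_read(ios);
            if (unlikely(ret))
                    return ret;

            ret = extract_attr_from_ios(ios, &attr);
            if (unlikely(ret))
                    return ret;

            *len = get_unaligned_be64(attr.val_ptr); /* 8 bytes, big-endian */
            return 0;
    }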
724 722
725static int _truncate_mirrors(struct exofs_io_state *ios, unsigned cur_comp, 723static int _truncate_mirrors(struct ore_io_state *ios, unsigned cur_comp,
726 struct osd_attr *attr) 724 struct osd_attr *attr)
727{ 725{
728 int last_comp = cur_comp + ios->layout->mirrors_p1; 726 int last_comp = cur_comp + ios->layout->mirrors_p1;
729 727
730 for (; cur_comp < last_comp; ++cur_comp) { 728 for (; cur_comp < last_comp; ++cur_comp) {
731 struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp]; 729 struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp];
732 struct osd_request *or; 730 struct osd_request *or;
733 731
734 or = osd_start_request(exofs_ios_od(ios, cur_comp), GFP_KERNEL); 732 or = osd_start_request(_ios_od(ios, cur_comp), GFP_KERNEL);
735 if (unlikely(!or)) { 733 if (unlikely(!or)) {
736 EXOFS_ERR("%s: osd_start_request failed\n", __func__); 734 ORE_ERR("%s: osd_start_request failed\n", __func__);
737 return -ENOMEM; 735 return -ENOMEM;
738 } 736 }
739 per_dev->or = or; 737 per_dev->or = or;
740 738
741 osd_req_set_attributes(or, &ios->obj); 739 osd_req_set_attributes(or, _ios_obj(ios, cur_comp));
742 osd_req_add_set_attr_list(or, attr, 1); 740 osd_req_add_set_attr_list(or, attr, 1);
743 } 741 }
744 742
745 return 0; 743 return 0;
746} 744}
747 745
748int exofs_oi_truncate(struct exofs_i_info *oi, u64 size) 746struct _trunc_info {
747 struct _striping_info si;
748 u64 prev_group_obj_off;
749 u64 next_group_obj_off;
750
751 unsigned first_group_dev;
752 unsigned nex_group_dev;
753 unsigned max_devs;
754};
755
756void _calc_trunk_info(struct ore_layout *layout, u64 file_offset,
757 struct _trunc_info *ti)
758{
759 unsigned stripe_unit = layout->stripe_unit;
760
761 _calc_stripe_info(layout, file_offset, &ti->si);
762
763 ti->prev_group_obj_off = ti->si.M * stripe_unit;
764 ti->next_group_obj_off = ti->si.M ? (ti->si.M - 1) * stripe_unit : 0;
765
766 ti->first_group_dev = ti->si.dev - (ti->si.dev % layout->group_width);
767 ti->nex_group_dev = ti->first_group_dev + layout->group_width;
768 ti->max_devs = layout->group_width * layout->group_count;
769}
770
771int ore_truncate(struct ore_layout *layout, struct ore_components *comps,
772 u64 size)
749{ 773{
750 struct exofs_sb_info *sbi = oi->vfs_inode.i_sb->s_fs_info; 774 struct ore_io_state *ios;
751 struct exofs_io_state *ios;
752 struct exofs_trunc_attr { 775 struct exofs_trunc_attr {
753 struct osd_attr attr; 776 struct osd_attr attr;
754 __be64 newsize; 777 __be64 newsize;
755 } *size_attrs; 778 } *size_attrs;
756 struct _striping_info si; 779 struct _trunc_info ti;
757 int i, ret; 780 int i, ret;
758 781
759 ret = exofs_get_io_state(&sbi->layout, &ios); 782 ret = ore_get_io_state(layout, comps, &ios);
760 if (unlikely(ret)) 783 if (unlikely(ret))
761 return ret; 784 return ret;
762 785
763 size_attrs = kcalloc(ios->layout->group_width, sizeof(*size_attrs), 786 _calc_trunk_info(ios->layout, size, &ti);
787
788 size_attrs = kcalloc(ti.max_devs, sizeof(*size_attrs),
764 GFP_KERNEL); 789 GFP_KERNEL);
765 if (unlikely(!size_attrs)) { 790 if (unlikely(!size_attrs)) {
766 ret = -ENOMEM; 791 ret = -ENOMEM;
767 goto out; 792 goto out;
768 } 793 }
769 794
770 ios->obj.id = exofs_oi_objno(oi); 795 ios->numdevs = ios->comps->numdevs;
771 ios->cred = oi->i_cred;
772 796
773 ios->numdevs = ios->layout->s_numdevs; 797 for (i = 0; i < ti.max_devs; ++i) {
774 _calc_stripe_info(ios, size, &si);
775
776 for (i = 0; i < ios->layout->group_width; ++i) {
777 struct exofs_trunc_attr *size_attr = &size_attrs[i]; 798 struct exofs_trunc_attr *size_attr = &size_attrs[i];
778 u64 obj_size; 799 u64 obj_size;
779 800
780 if (i < si.dev) 801 if (i < ti.first_group_dev)
781 obj_size = si.obj_offset + 802 obj_size = ti.prev_group_obj_off;
782 ios->layout->stripe_unit - si.unit_off; 803 else if (i >= ti.nex_group_dev)
783 else if (i == si.dev) 804 obj_size = ti.next_group_obj_off;
784 obj_size = si.obj_offset; 805 else if (i < ti.si.dev) /* dev within this group */
785 else /* i > si.dev */ 806 obj_size = ti.si.obj_offset +
786 obj_size = si.obj_offset - si.unit_off; 807 ios->layout->stripe_unit - ti.si.unit_off;
808 else if (i == ti.si.dev)
809 obj_size = ti.si.obj_offset;
810 else /* i > ti.dev */
811 obj_size = ti.si.obj_offset - ti.si.unit_off;
787 812
788 size_attr->newsize = cpu_to_be64(obj_size); 813 size_attr->newsize = cpu_to_be64(obj_size);
789 size_attr->attr = g_attr_logical_length; 814 size_attr->attr = g_attr_logical_length;
790 size_attr->attr.val_ptr = &size_attr->newsize; 815 size_attr->attr.val_ptr = &size_attr->newsize;
791 816
817 ORE_DBGMSG("trunc(0x%llx) obj_offset=0x%llx dev=%d\n",
818 _LLU(comps->comps->obj.id), _LLU(obj_size), i);
792 ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1, 819 ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1,
793 &size_attr->attr); 820 &size_attr->attr);
794 if (unlikely(ret)) 821 if (unlikely(ret))
795 goto out; 822 goto out;
796 } 823 }
797 ret = exofs_io_execute(ios); 824 ret = ore_io_execute(ios);
798 825
799out: 826out:
800 kfree(size_attrs); 827 kfree(size_attrs);
801 exofs_put_io_state(ios); 828 ore_put_io_state(ios);
802 return ret; 829 return ret;
803} 830}
831EXPORT_SYMBOL(ore_truncate);
832
833const struct osd_attr g_attr_logical_length = ATTR_DEF(
834 OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8);
835EXPORT_SYMBOL(g_attr_logical_length);
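ore_truncate() is the one path that must touch every device in the layout, not just those an I/O lands on, because each device's object needs its own logical length. The selection above, restated as a table with a worked instance; the layout numbers are the same illustrative ones used after _calc_stripe_info() above:

    /* Per-device size selection in ore_truncate():
     *
     *   i <  ti.first_group_dev        -> ti.prev_group_obj_off
     *   i >= ti.nex_group_dev          -> ti.next_group_obj_off
     *   i <  ti.si.dev (same group)    -> obj_offset + stripe_unit - unit_off
     *   i == ti.si.dev                 -> obj_offset
     *   i >  ti.si.dev                 -> obj_offset - unit_off
     *
     * Worked instance inside the target group (stripe_unit = 64K,
     * group_width = 4, mirrors_p1 = 1, size = 600K, which gives
     * si.dev = 5, obj_offset = 24K, unit_off = 24K):
     *   dev 4 -> 24K + 64K - 24K = 64K   (full stripe unit kept)
     *   dev 5 -> 24K                     (partial unit)
     *   dev 6 -> 24K - 24K = 0
     *   dev 7 -> 0
     * Devices outside the group get the prev/next_group_obj_off values
     * from _calc_trunk_info(); _truncate_mirrors() then applies each size
     * to the device and all of its mirrors via OSD_ATTR_OI_LOGICAL_LENGTH.
     */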
diff --git a/fs/exofs/pnfs.h b/fs/exofs/pnfs.h
deleted file mode 100644
index c52e9888b8ab..000000000000
--- a/fs/exofs/pnfs.h
+++ /dev/null
@@ -1,45 +0,0 @@
1/*
2 * Copyright (C) 2008, 2009
3 * Boaz Harrosh <bharrosh@panasas.com>
4 *
5 * This file is part of exofs.
6 *
7 * exofs is free software; you can redistribute it and/or modify it under the
8 * terms of the GNU General Public License version 2 as published by the Free
9 * Software Foundation.
10 *
11 */
12
13/* FIXME: Remove this file once pnfs hits mainline */
14
15#ifndef __EXOFS_PNFS_H__
16#define __EXOFS_PNFS_H__
17
18#if ! defined(__PNFS_OSD_XDR_H__)
19
20enum pnfs_iomode {
21 IOMODE_READ = 1,
22 IOMODE_RW = 2,
23 IOMODE_ANY = 3,
24};
25
26/* Layout Structure */
27enum pnfs_osd_raid_algorithm4 {
28 PNFS_OSD_RAID_0 = 1,
29 PNFS_OSD_RAID_4 = 2,
30 PNFS_OSD_RAID_5 = 3,
31 PNFS_OSD_RAID_PQ = 4 /* Reed-Solomon P+Q */
32};
33
34struct pnfs_osd_data_map {
35 u32 odm_num_comps;
36 u64 odm_stripe_unit;
37 u32 odm_group_width;
38 u32 odm_group_depth;
39 u32 odm_mirror_cnt;
40 u32 odm_raid_algorithm;
41};
42
43#endif /* ! defined(__PNFS_OSD_XDR_H__) */
44
45#endif /* __EXOFS_PNFS_H__ */
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index c57beddcc217..274894053b02 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -40,6 +40,8 @@
40 40
41#include "exofs.h" 41#include "exofs.h"
42 42
43#define EXOFS_DBGMSG2(M...) do {} while (0)
44
43/****************************************************************************** 45/******************************************************************************
44 * MOUNT OPTIONS 46 * MOUNT OPTIONS
45 *****************************************************************************/ 47 *****************************************************************************/
@@ -208,10 +210,48 @@ static void destroy_inodecache(void)
208} 210}
209 211
210/****************************************************************************** 212/******************************************************************************
211 * SUPERBLOCK FUNCTIONS 213 * Some osd helpers
212 *****************************************************************************/ 214 *****************************************************************************/
213static const struct super_operations exofs_sops; 215void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj)
214static const struct export_operations exofs_export_ops; 216{
217 osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
218}
219
220static int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj,
221 u64 offset, void *p, unsigned length)
222{
223 struct osd_request *or = osd_start_request(od, GFP_KERNEL);
224/* struct osd_sense_info osi = {.key = 0};*/
225 int ret;
226
227 if (unlikely(!or)) {
228 EXOFS_DBGMSG("%s: osd_start_request failed.\n", __func__);
229 return -ENOMEM;
230 }
231 ret = osd_req_read_kern(or, obj, offset, p, length);
232 if (unlikely(ret)) {
233 EXOFS_DBGMSG("%s: osd_req_read_kern failed.\n", __func__);
234 goto out;
235 }
236
237 ret = osd_finalize_request(or, 0, cred, NULL);
238 if (unlikely(ret)) {
239 EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", ret);
240 goto out;
241 }
242
243 ret = osd_execute_request(or);
244 if (unlikely(ret))
245 EXOFS_DBGMSG("osd_execute_request() => %d\n", ret);
246 /* osd_req_decode_sense(or, ret); */
247
248out:
249 osd_end_request(or);
250 EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx "
251 "length=0x%llx dev=%p ret=>%d\n",
252 _LLU(obj->id), _LLU(offset), _LLU(length), od, ret);
253 return ret;
254}
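exofs_read_kern() moves here from the old ios.c essentially unchanged: it is the small synchronous bootstrap helper used before any ORE state exists. Condensed from exofs_fill_super() further down, the call pattern is:

    /* Condensed from exofs_fill_super() below: read the on-disk superblock
     * with a locally built component; no ore_io_state is needed yet.
     */
    struct exofs_fscb fscb;
    struct ore_comp comp;
    int ret;

    comp.obj.partition = sbi->one_comp.obj.partition;
    comp.obj.id = EXOFS_SUPER_ID;
    exofs_make_credential(comp.cred, &comp.obj);

    ret = exofs_read_kern(od, comp.cred, &comp.obj, 0, &fscb, sizeof(fscb));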
215 255
216static const struct osd_attr g_attr_sb_stats = ATTR_DEF( 256static const struct osd_attr g_attr_sb_stats = ATTR_DEF(
217 EXOFS_APAGE_SB_DATA, 257 EXOFS_APAGE_SB_DATA,
@@ -223,21 +263,19 @@ static int __sbi_read_stats(struct exofs_sb_info *sbi)
223 struct osd_attr attrs[] = { 263 struct osd_attr attrs[] = {
224 [0] = g_attr_sb_stats, 264 [0] = g_attr_sb_stats,
225 }; 265 };
226 struct exofs_io_state *ios; 266 struct ore_io_state *ios;
227 int ret; 267 int ret;
228 268
229 ret = exofs_get_io_state(&sbi->layout, &ios); 269 ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios);
230 if (unlikely(ret)) { 270 if (unlikely(ret)) {
231 EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); 271 EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
232 return ret; 272 return ret;
233 } 273 }
234 274
235 ios->cred = sbi->s_cred;
236
237 ios->in_attr = attrs; 275 ios->in_attr = attrs;
238 ios->in_attr_len = ARRAY_SIZE(attrs); 276 ios->in_attr_len = ARRAY_SIZE(attrs);
239 277
240 ret = exofs_sbi_read(ios); 278 ret = ore_read(ios);
241 if (unlikely(ret)) { 279 if (unlikely(ret)) {
242 EXOFS_ERR("Error reading super_block stats => %d\n", ret); 280 EXOFS_ERR("Error reading super_block stats => %d\n", ret);
243 goto out; 281 goto out;
@@ -264,13 +302,13 @@ static int __sbi_read_stats(struct exofs_sb_info *sbi)
264 } 302 }
265 303
266out: 304out:
267 exofs_put_io_state(ios); 305 ore_put_io_state(ios);
268 return ret; 306 return ret;
269} 307}
270 308
271static void stats_done(struct exofs_io_state *ios, void *p) 309static void stats_done(struct ore_io_state *ios, void *p)
272{ 310{
273 exofs_put_io_state(ios); 311 ore_put_io_state(ios);
274 /* Good, thanks. Nothing to do anymore */ 312 /* Good, thanks. Nothing to do anymore */
275} 313}
276 314
@@ -280,12 +318,12 @@ int exofs_sbi_write_stats(struct exofs_sb_info *sbi)
280 struct osd_attr attrs[] = { 318 struct osd_attr attrs[] = {
281 [0] = g_attr_sb_stats, 319 [0] = g_attr_sb_stats,
282 }; 320 };
283 struct exofs_io_state *ios; 321 struct ore_io_state *ios;
284 int ret; 322 int ret;
285 323
286 ret = exofs_get_io_state(&sbi->layout, &ios); 324 ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios);
287 if (unlikely(ret)) { 325 if (unlikely(ret)) {
288 EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); 326 EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
289 return ret; 327 return ret;
290 } 328 }
291 329
@@ -293,21 +331,27 @@ int exofs_sbi_write_stats(struct exofs_sb_info *sbi)
293 sbi->s_ess.s_numfiles = cpu_to_le64(sbi->s_numfiles); 331 sbi->s_ess.s_numfiles = cpu_to_le64(sbi->s_numfiles);
294 attrs[0].val_ptr = &sbi->s_ess; 332 attrs[0].val_ptr = &sbi->s_ess;
295 333
296 ios->cred = sbi->s_cred; 334
297 ios->done = stats_done; 335 ios->done = stats_done;
298 ios->private = sbi; 336 ios->private = sbi;
299 ios->out_attr = attrs; 337 ios->out_attr = attrs;
300 ios->out_attr_len = ARRAY_SIZE(attrs); 338 ios->out_attr_len = ARRAY_SIZE(attrs);
301 339
302 ret = exofs_sbi_write(ios); 340 ret = ore_write(ios);
303 if (unlikely(ret)) { 341 if (unlikely(ret)) {
304 EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__); 342 EXOFS_ERR("%s: ore_write failed.\n", __func__);
305 exofs_put_io_state(ios); 343 ore_put_io_state(ios);
306 } 344 }
307 345
308 return ret; 346 return ret;
309} 347}
310 348
349/******************************************************************************
350 * SUPERBLOCK FUNCTIONS
351 *****************************************************************************/
352static const struct super_operations exofs_sops;
353static const struct export_operations exofs_export_ops;
354
311/* 355/*
312 * Write the superblock to the OSD 356 * Write the superblock to the OSD
313 */ 357 */
@@ -315,7 +359,9 @@ int exofs_sync_fs(struct super_block *sb, int wait)
315{ 359{
316 struct exofs_sb_info *sbi; 360 struct exofs_sb_info *sbi;
317 struct exofs_fscb *fscb; 361 struct exofs_fscb *fscb;
318 struct exofs_io_state *ios; 362 struct ore_comp one_comp;
363 struct ore_components comps;
364 struct ore_io_state *ios;
319 int ret = -ENOMEM; 365 int ret = -ENOMEM;
320 366
321 fscb = kmalloc(sizeof(*fscb), GFP_KERNEL); 367 fscb = kmalloc(sizeof(*fscb), GFP_KERNEL);
@@ -331,7 +377,10 @@ int exofs_sync_fs(struct super_block *sb, int wait)
331 * version). Otherwise the exofs_fscb is read-only from mkfs time. All 377 * version). Otherwise the exofs_fscb is read-only from mkfs time. All
332 * the writeable info is set in exofs_sbi_write_stats() above. 378 * the writeable info is set in exofs_sbi_write_stats() above.
333 */ 379 */
334 ret = exofs_get_io_state(&sbi->layout, &ios); 380
381 exofs_init_comps(&comps, &one_comp, sbi, EXOFS_SUPER_ID);
382
383 ret = ore_get_io_state(&sbi->layout, &comps, &ios);
335 if (unlikely(ret)) 384 if (unlikely(ret))
336 goto out; 385 goto out;
337 386
@@ -345,14 +394,12 @@ int exofs_sync_fs(struct super_block *sb, int wait)
345 fscb->s_newfs = 0; 394 fscb->s_newfs = 0;
346 fscb->s_version = EXOFS_FSCB_VER; 395 fscb->s_version = EXOFS_FSCB_VER;
347 396
348 ios->obj.id = EXOFS_SUPER_ID;
349 ios->offset = 0; 397 ios->offset = 0;
350 ios->kern_buff = fscb; 398 ios->kern_buff = fscb;
351 ios->cred = sbi->s_cred;
352 399
353 ret = exofs_sbi_write(ios); 400 ret = ore_write(ios);
354 if (unlikely(ret)) 401 if (unlikely(ret))
355 EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__); 402 EXOFS_ERR("%s: ore_write failed.\n", __func__);
356 else 403 else
357 sb->s_dirt = 0; 404 sb->s_dirt = 0;
358 405
@@ -360,7 +407,7 @@ int exofs_sync_fs(struct super_block *sb, int wait)
360 unlock_super(sb); 407 unlock_super(sb);
361out: 408out:
362 EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret); 409 EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret);
363 exofs_put_io_state(ios); 410 ore_put_io_state(ios);
364 kfree(fscb); 411 kfree(fscb);
365 return ret; 412 return ret;
366} 413}
@@ -384,15 +431,17 @@ static void _exofs_print_device(const char *msg, const char *dev_path,
384 431
385void exofs_free_sbi(struct exofs_sb_info *sbi) 432void exofs_free_sbi(struct exofs_sb_info *sbi)
386{ 433{
387 while (sbi->layout.s_numdevs) { 434 while (sbi->comps.numdevs) {
388 int i = --sbi->layout.s_numdevs; 435 int i = --sbi->comps.numdevs;
389 struct osd_dev *od = sbi->layout.s_ods[i]; 436 struct osd_dev *od = sbi->comps.ods[i];
390 437
391 if (od) { 438 if (od) {
392 sbi->layout.s_ods[i] = NULL; 439 sbi->comps.ods[i] = NULL;
393 osduld_put_device(od); 440 osduld_put_device(od);
394 } 441 }
395 } 442 }
443 if (sbi->comps.ods != sbi->_min_one_dev)
444 kfree(sbi->comps.ods);
396 kfree(sbi); 445 kfree(sbi);
397} 446}
398 447
@@ -419,8 +468,8 @@ static void exofs_put_super(struct super_block *sb)
419 msecs_to_jiffies(100)); 468 msecs_to_jiffies(100));
420 } 469 }
421 470
422 _exofs_print_device("Unmounting", NULL, sbi->layout.s_ods[0], 471 _exofs_print_device("Unmounting", NULL, sbi->comps.ods[0],
423 sbi->layout.s_pid); 472 sbi->one_comp.obj.partition);
424 473
425 bdi_destroy(&sbi->bdi); 474 bdi_destroy(&sbi->bdi);
426 exofs_free_sbi(sbi); 475 exofs_free_sbi(sbi);
@@ -501,10 +550,19 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
501 return -EINVAL; 550 return -EINVAL;
502 } 551 }
503 552
553 EXOFS_DBGMSG("exofs: layout: "
554 "num_comps=%u stripe_unit=0x%x group_width=%u "
555 "group_depth=0x%llx mirrors_p1=%u raid_algorithm=%u\n",
556 numdevs,
557 sbi->layout.stripe_unit,
558 sbi->layout.group_width,
559 _LLU(sbi->layout.group_depth),
560 sbi->layout.mirrors_p1,
561 sbi->data_map.odm_raid_algorithm);
504 return 0; 562 return 0;
505} 563}
506 564
507static unsigned __ra_pages(struct exofs_layout *layout) 565static unsigned __ra_pages(struct ore_layout *layout)
508{ 566{
509 const unsigned _MIN_RA = 32; /* min 128K read-ahead */ 567 const unsigned _MIN_RA = 32; /* min 128K read-ahead */
510 unsigned ra_pages = layout->group_width * layout->stripe_unit / 568 unsigned ra_pages = layout->group_width * layout->stripe_unit /
@@ -547,13 +605,11 @@ static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev,
547 return !(odi->systemid_len || odi->osdname_len); 605 return !(odi->systemid_len || odi->osdname_len);
548} 606}
549 607
550static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi, 608static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
609 struct osd_dev *fscb_od,
551 unsigned table_count) 610 unsigned table_count)
552{ 611{
553 struct exofs_sb_info *sbi = *psbi; 612 struct ore_comp comp;
554 struct osd_dev *fscb_od;
555 struct osd_obj_id obj = {.partition = sbi->layout.s_pid,
556 .id = EXOFS_DEVTABLE_ID};
557 struct exofs_device_table *dt; 613 struct exofs_device_table *dt;
558 unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) + 614 unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) +
559 sizeof(*dt); 615 sizeof(*dt);
@@ -567,10 +623,14 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
567 return -ENOMEM; 623 return -ENOMEM;
568 } 624 }
569 625
570 fscb_od = sbi->layout.s_ods[0]; 626 sbi->comps.numdevs = 0;
571 sbi->layout.s_ods[0] = NULL; 627
572 sbi->layout.s_numdevs = 0; 628 comp.obj.partition = sbi->one_comp.obj.partition;
573 ret = exofs_read_kern(fscb_od, sbi->s_cred, &obj, 0, dt, table_bytes); 629 comp.obj.id = EXOFS_DEVTABLE_ID;
630 exofs_make_credential(comp.cred, &comp.obj);
631
632 ret = exofs_read_kern(fscb_od, comp.cred, &comp.obj, 0, dt,
633 table_bytes);
574 if (unlikely(ret)) { 634 if (unlikely(ret)) {
575 EXOFS_ERR("ERROR: reading device table\n"); 635 EXOFS_ERR("ERROR: reading device table\n");
576 goto out; 636 goto out;
@@ -588,16 +648,18 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
588 goto out; 648 goto out;
589 649
590 if (likely(numdevs > 1)) { 650 if (likely(numdevs > 1)) {
591 unsigned size = numdevs * sizeof(sbi->layout.s_ods[0]); 651 unsigned size = numdevs * sizeof(sbi->comps.ods[0]);
592 652
593 sbi = krealloc(sbi, sizeof(*sbi) + size, GFP_KERNEL); 653 /* Twice-bigger table: see exofs_init_comps() and the
594 if (unlikely(!sbi)) { 654 * comment below
655 */
656 sbi->comps.ods = kzalloc(size + size - 1, GFP_KERNEL);
657 if (unlikely(!sbi->comps.ods)) {
658 EXOFS_ERR("ERROR: failed allocating Device array[%d]\n",
659 numdevs);
595 ret = -ENOMEM; 660 ret = -ENOMEM;
596 goto out; 661 goto out;
597 } 662 }
598 memset(&sbi->layout.s_ods[1], 0,
599 size - sizeof(sbi->layout.s_ods[0]));
600 *psbi = sbi;
601 } 663 }
602 664
603 for (i = 0; i < numdevs; i++) { 665 for (i = 0; i < numdevs; i++) {
@@ -619,8 +681,8 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
619 * line. We always keep them in device-table order. 681 * line. We always keep them in device-table order.
620 */ 682 */
621 if (fscb_od && osduld_device_same(fscb_od, &odi)) { 683 if (fscb_od && osduld_device_same(fscb_od, &odi)) {
622 sbi->layout.s_ods[i] = fscb_od; 684 sbi->comps.ods[i] = fscb_od;
623 ++sbi->layout.s_numdevs; 685 ++sbi->comps.numdevs;
624 fscb_od = NULL; 686 fscb_od = NULL;
625 continue; 687 continue;
626 } 688 }
@@ -633,13 +695,13 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
633 goto out; 695 goto out;
634 } 696 }
635 697
636 sbi->layout.s_ods[i] = od; 698 sbi->comps.ods[i] = od;
637 ++sbi->layout.s_numdevs; 699 ++sbi->comps.numdevs;
638 700
639 /* Read the fscb of the other devices to make sure the FS 701 /* Read the fscb of the other devices to make sure the FS
640 * partition is there. 702 * partition is there.
641 */ 703 */
642 ret = exofs_read_kern(od, sbi->s_cred, &obj, 0, &fscb, 704 ret = exofs_read_kern(od, comp.cred, &comp.obj, 0, &fscb,
643 sizeof(fscb)); 705 sizeof(fscb));
644 if (unlikely(ret)) { 706 if (unlikely(ret)) {
645 EXOFS_ERR("ERROR: Malformed participating device " 707 EXOFS_ERR("ERROR: Malformed participating device "
@@ -656,13 +718,22 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
656 718
657out: 719out:
658 kfree(dt); 720 kfree(dt);
659 if (unlikely(!ret && fscb_od)) { 721 if (likely(!ret)) {
660 EXOFS_ERR( 722 unsigned numdevs = sbi->comps.numdevs;
661 "ERROR: Bad device-table container device not present\n");
662 osduld_put_device(fscb_od);
663 ret = -EINVAL;
664 }
665 723
724 if (unlikely(fscb_od)) {
725 EXOFS_ERR("ERROR: Bad device-table container device not present\n");
726 osduld_put_device(fscb_od);
727 return -EINVAL;
728 }
729 /* exofs round-robins the device-table view according to inode
730 * number. We hold a twice-bigger table, so an inode can point
731 * to any device and still have a sequential view of the table
732 * starting at that device. See exofs_init_comps()
733 */
734 for (i = 0; i < numdevs - 1; ++i)
735 sbi->comps.ods[i + numdevs] = sbi->comps.ods[i];
736 }
666 return ret; 737 return ret;
667} 738}
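An illustration of the table built above, with numdevs = 4 and devices A B C D:

    /* ods[]: [0]=A [1]=B [2]=C [3]=D [4]=A [5]=B [6]=C   (2*numdevs - 1)
     *
     * An inode whose round-robin start works out to device 2 can use
     * &ods[2] directly as a numdevs-long window (C D A B), so the I/O
     * path needs no modulo arithmetic; see exofs_init_comps(), sketched
     * below at the fill_super hunk.
     */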
668 739
@@ -676,7 +747,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
676 struct exofs_sb_info *sbi; /*extended info */ 747 struct exofs_sb_info *sbi; /*extended info */
677 struct osd_dev *od; /* Master device */ 748 struct osd_dev *od; /* Master device */
678 struct exofs_fscb fscb; /*on-disk superblock info */ 749 struct exofs_fscb fscb; /*on-disk superblock info */
679 struct osd_obj_id obj; 750 struct ore_comp comp;
680 unsigned table_count; 751 unsigned table_count;
681 int ret; 752 int ret;
682 753
@@ -684,10 +755,6 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
684 if (!sbi) 755 if (!sbi)
685 return -ENOMEM; 756 return -ENOMEM;
686 757
687 ret = bdi_setup_and_register(&sbi->bdi, "exofs", BDI_CAP_MAP_COPY);
688 if (ret)
689 goto free_bdi;
690
691 /* use mount options to fill superblock */ 758 /* use mount options to fill superblock */
692 if (opts->is_osdname) { 759 if (opts->is_osdname) {
693 struct osd_dev_info odi = {.systemid_len = 0}; 760 struct osd_dev_info odi = {.systemid_len = 0};
@@ -695,6 +762,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
695 odi.osdname_len = strlen(opts->dev_name); 762 odi.osdname_len = strlen(opts->dev_name);
696 odi.osdname = (u8 *)opts->dev_name; 763 odi.osdname = (u8 *)opts->dev_name;
697 od = osduld_info_lookup(&odi); 764 od = osduld_info_lookup(&odi);
765 kfree(opts->dev_name);
766 opts->dev_name = NULL;
698 } else { 767 } else {
699 od = osduld_path_lookup(opts->dev_name); 768 od = osduld_path_lookup(opts->dev_name);
700 } 769 }
@@ -709,11 +778,16 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
709 sbi->layout.group_width = 1; 778 sbi->layout.group_width = 1;
710 sbi->layout.group_depth = -1; 779 sbi->layout.group_depth = -1;
711 sbi->layout.group_count = 1; 780 sbi->layout.group_count = 1;
712 sbi->layout.s_ods[0] = od;
713 sbi->layout.s_numdevs = 1;
714 sbi->layout.s_pid = opts->pid;
715 sbi->s_timeout = opts->timeout; 781 sbi->s_timeout = opts->timeout;
716 782
783 sbi->one_comp.obj.partition = opts->pid;
784 sbi->one_comp.obj.id = 0;
785 exofs_make_credential(sbi->one_comp.cred, &sbi->one_comp.obj);
786 sbi->comps.numdevs = 1;
787 sbi->comps.single_comp = EC_SINGLE_COMP;
788 sbi->comps.comps = &sbi->one_comp;
789 sbi->comps.ods = sbi->_min_one_dev;
790
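Both inode.c and super.c now call exofs_init_comps(), which lives in exofs.h and is therefore absent from this diff. A plausible reconstruction from its call sites, the removed exofs_layout_od_id() round-robin, and the doubled device table above; treat every detail as an assumption, not the committed code:

    /* Hypothetical reconstruction, NOT part of this diff. */
    static inline void exofs_init_comps(struct ore_components *comps,
                                        struct ore_comp *one_comp,
                                        struct exofs_sb_info *sbi, osd_id oid)
    {
            unsigned dev_mod = (unsigned)oid;

            one_comp->obj.partition = sbi->one_comp.obj.partition;
            one_comp->obj.id = oid;
            exofs_make_credential(one_comp->cred, &one_comp->obj);

            comps->numdevs = sbi->comps.numdevs;
            comps->single_comp = EC_SINGLE_COMP; /* one comp for all devs */
            comps->comps = one_comp;
            /* per-inode device window into the doubled table */
            comps->ods = &sbi->comps.ods[(dev_mod * sbi->layout.mirrors_p1) %
                                                    sbi->comps.numdevs];
    }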
717 /* fill in some other data by hand */ 791 /* fill in some other data by hand */
718 memset(sb->s_id, 0, sizeof(sb->s_id)); 792 memset(sb->s_id, 0, sizeof(sb->s_id));
719 strcpy(sb->s_id, "exofs"); 793 strcpy(sb->s_id, "exofs");
@@ -724,11 +798,11 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
724 sb->s_bdev = NULL; 798 sb->s_bdev = NULL;
725 sb->s_dev = 0; 799 sb->s_dev = 0;
726 800
727 obj.partition = sbi->layout.s_pid; 801 comp.obj.partition = sbi->one_comp.obj.partition;
728 obj.id = EXOFS_SUPER_ID; 802 comp.obj.id = EXOFS_SUPER_ID;
729 exofs_make_credential(sbi->s_cred, &obj); 803 exofs_make_credential(comp.cred, &comp.obj);
730 804
731 ret = exofs_read_kern(od, sbi->s_cred, &obj, 0, &fscb, sizeof(fscb)); 805 ret = exofs_read_kern(od, comp.cred, &comp.obj, 0, &fscb, sizeof(fscb));
732 if (unlikely(ret)) 806 if (unlikely(ret))
733 goto free_sbi; 807 goto free_sbi;
734 808
@@ -757,9 +831,11 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
757 831
758 table_count = le64_to_cpu(fscb.s_dev_table_count); 832 table_count = le64_to_cpu(fscb.s_dev_table_count);
759 if (table_count) { 833 if (table_count) {
760 ret = exofs_read_lookup_dev_table(&sbi, table_count); 834 ret = exofs_read_lookup_dev_table(sbi, od, table_count);
761 if (unlikely(ret)) 835 if (unlikely(ret))
762 goto free_sbi; 836 goto free_sbi;
837 } else {
838 sbi->comps.ods[0] = od;
763 } 839 }
764 840
765 __sbi_read_stats(sbi); 841 __sbi_read_stats(sbi);
@@ -793,20 +869,20 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
793 goto free_sbi; 869 goto free_sbi;
794 } 870 }
795 871
796 _exofs_print_device("Mounting", opts->dev_name, sbi->layout.s_ods[0], 872 ret = bdi_setup_and_register(&sbi->bdi, "exofs", BDI_CAP_MAP_COPY);
797 sbi->layout.s_pid); 873 if (ret) {
798 if (opts->is_osdname) 874 EXOFS_DBGMSG("Failed to bdi_setup_and_register\n");
799 kfree(opts->dev_name); 875 goto free_sbi;
876 }
877
878 _exofs_print_device("Mounting", opts->dev_name, sbi->comps.ods[0],
879 sbi->one_comp.obj.partition);
800 return 0; 880 return 0;
801 881
802free_sbi: 882free_sbi:
803 bdi_destroy(&sbi->bdi);
804free_bdi:
805 EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n", 883 EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n",
806 opts->dev_name, sbi->layout.s_pid, ret); 884 opts->dev_name, sbi->one_comp.obj.partition, ret);
807 exofs_free_sbi(sbi); 885 exofs_free_sbi(sbi);
808 if (opts->is_osdname)
809 kfree(opts->dev_name);
810 return ret; 886 return ret;
811} 887}
812 888
@@ -837,7 +913,7 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
837{ 913{
838 struct super_block *sb = dentry->d_sb; 914 struct super_block *sb = dentry->d_sb;
839 struct exofs_sb_info *sbi = sb->s_fs_info; 915 struct exofs_sb_info *sbi = sb->s_fs_info;
840 struct exofs_io_state *ios; 916 struct ore_io_state *ios;
841 struct osd_attr attrs[] = { 917 struct osd_attr attrs[] = {
842 ATTR_DEF(OSD_APAGE_PARTITION_QUOTAS, 918 ATTR_DEF(OSD_APAGE_PARTITION_QUOTAS,
843 OSD_ATTR_PQ_CAPACITY_QUOTA, sizeof(__be64)), 919 OSD_ATTR_PQ_CAPACITY_QUOTA, sizeof(__be64)),
@@ -846,21 +922,18 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
846 }; 922 };
847 uint64_t capacity = ULLONG_MAX; 923 uint64_t capacity = ULLONG_MAX;
848 uint64_t used = ULLONG_MAX; 924 uint64_t used = ULLONG_MAX;
849 uint8_t cred_a[OSD_CAP_LEN];
850 int ret; 925 int ret;
851 926
852 ret = exofs_get_io_state(&sbi->layout, &ios); 927 ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios);
853 if (ret) { 928 if (ret) {
854 EXOFS_DBGMSG("exofs_get_io_state failed.\n"); 929 EXOFS_DBGMSG("ore_get_io_state failed.\n");
855 return ret; 930 return ret;
856 } 931 }
857 932
858 exofs_make_credential(cred_a, &ios->obj);
859 ios->cred = sbi->s_cred;
860 ios->in_attr = attrs; 933 ios->in_attr = attrs;
861 ios->in_attr_len = ARRAY_SIZE(attrs); 934 ios->in_attr_len = ARRAY_SIZE(attrs);
862 935
863 ret = exofs_sbi_read(ios); 936 ret = ore_read(ios);
864 if (unlikely(ret)) 937 if (unlikely(ret))
865 goto out; 938 goto out;
866 939
@@ -889,7 +962,7 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
889 buf->f_namelen = EXOFS_NAME_LEN; 962 buf->f_namelen = EXOFS_NAME_LEN;
890 963
891out: 964out:
892 exofs_put_io_state(ios); 965 ore_put_io_state(ios);
893 return ret; 966 return ret;
894} 967}
895 968
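
Condensed from the statfs hunk above: the ORE conversion replaces the per-sb credential juggling with an ore_io_state taken against the layout and the component table that exofs_fill_super() now wires up (sbi->one_comp / sbi->comps). A sketch of the read pattern, assuming only the ore_* calls visible in this diff; error handling abbreviated:

    struct ore_io_state *ios;
    int ret;

    ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios);
    if (ret)
            return ret;

    ios->in_attr = attrs;               /* attributes to read back */
    ios->in_attr_len = ARRAY_SIZE(attrs);

    ret = ore_read(ios);                /* synchronous attribute read */

    ore_put_io_state(ios);              /* released on success and error alike */
    return ret;
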
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index 52c053763942..35d6a3cfd9ff 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -194,12 +194,10 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
194 case ACL_TYPE_ACCESS: 194 case ACL_TYPE_ACCESS:
195 name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS; 195 name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS;
196 if (acl) { 196 if (acl) {
197 mode_t mode = inode->i_mode; 197 error = posix_acl_equiv_mode(acl, &inode->i_mode);
198 error = posix_acl_equiv_mode(acl, &mode);
199 if (error < 0) 198 if (error < 0)
200 return error; 199 return error;
201 else { 200 else {
202 inode->i_mode = mode;
203 inode->i_ctime = CURRENT_TIME_SEC; 201 inode->i_ctime = CURRENT_TIME_SEC;
204 mark_inode_dirty(inode); 202 mark_inode_dirty(inode);
205 if (error == 0) 203 if (error == 0)
@@ -253,16 +251,14 @@ ext2_init_acl(struct inode *inode, struct inode *dir)
253 inode->i_mode &= ~current_umask(); 251 inode->i_mode &= ~current_umask();
254 } 252 }
255 if (test_opt(inode->i_sb, POSIX_ACL) && acl) { 253 if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
256 mode_t mode = inode->i_mode;
257 if (S_ISDIR(inode->i_mode)) { 254 if (S_ISDIR(inode->i_mode)) {
258 error = ext2_set_acl(inode, ACL_TYPE_DEFAULT, acl); 255 error = ext2_set_acl(inode, ACL_TYPE_DEFAULT, acl);
259 if (error) 256 if (error)
260 goto cleanup; 257 goto cleanup;
261 } 258 }
262 error = posix_acl_create(&acl, GFP_KERNEL, &mode); 259 error = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode);
263 if (error < 0) 260 if (error < 0)
264 return error; 261 return error;
265 inode->i_mode = mode;
266 if (error > 0) { 262 if (error > 0) {
267 /* This is an extended ACL */ 263 /* This is an extended ACL */
268 error = ext2_set_acl(inode, ACL_TYPE_ACCESS, acl); 264 error = ext2_set_acl(inode, ACL_TYPE_ACCESS, acl);
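
All of the acl.c conversions in this series (ext2 here, ext3 and ext4 below) reduce to the same shape: posix_acl_equiv_mode() and posix_acl_create() now write the computed mode straight into inode->i_mode, so the local mode_t temporary and its copy-back disappear. The ACL_TYPE_ACCESS branch, condensed from the hunk above:

    error = posix_acl_equiv_mode(acl, &inode->i_mode);
    if (error < 0)
            return error;
    inode->i_ctime = CURRENT_TIME_SEC;
    mark_inode_dirty(inode);
    /* error == 0: the mode bits fully represent the ACL */
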
diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h
index 5c0a6a4fb052..503bfb0ed79b 100644
--- a/fs/ext2/acl.h
+++ b/fs/ext2/acl.h
@@ -61,7 +61,6 @@ extern int ext2_init_acl (struct inode *, struct inode *);
61#else 61#else
62#include <linux/sched.h> 62#include <linux/sched.h>
63#define ext2_get_acl NULL 63#define ext2_get_acl NULL
64#define ext2_get_acl NULL
65#define ext2_set_acl NULL 64#define ext2_set_acl NULL
66 65
67static inline int 66static inline int
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 529970617a21..d27b71f1d183 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -161,6 +161,10 @@ ext2_xattr_get(struct inode *inode, int name_index, const char *name,
161 161
162 if (name == NULL) 162 if (name == NULL)
163 return -EINVAL; 163 return -EINVAL;
164 name_len = strlen(name);
165 if (name_len > 255)
166 return -ERANGE;
167
164 down_read(&EXT2_I(inode)->xattr_sem); 168 down_read(&EXT2_I(inode)->xattr_sem);
165 error = -ENODATA; 169 error = -ENODATA;
166 if (!EXT2_I(inode)->i_file_acl) 170 if (!EXT2_I(inode)->i_file_acl)
@@ -181,12 +185,8 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_get",
181 error = -EIO; 185 error = -EIO;
182 goto cleanup; 186 goto cleanup;
183 } 187 }
184 /* find named attribute */
185 name_len = strlen(name);
186 188
187 error = -ERANGE; 189 /* find named attribute */
188 if (name_len > 255)
189 goto cleanup;
190 entry = FIRST_ENTRY(bh); 190 entry = FIRST_ENTRY(bh);
191 while (!IS_LAST_ENTRY(entry)) { 191 while (!IS_LAST_ENTRY(entry)) {
192 struct ext2_xattr_entry *next = 192 struct ext2_xattr_entry *next =
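
The xattr.c hunk above is a fail-fast move: the 255-byte name-length check now runs before xattr_sem is taken, so a bad name returns -ERANGE without any locking or block I/O. The resulting entry sequence, condensed:

    if (name == NULL)
            return -EINVAL;
    name_len = strlen(name);
    if (name_len > 255)
            return -ERANGE;

    down_read(&EXT2_I(inode)->xattr_sem);
    /* ... i_file_acl lookup and entry scan, unchanged ... */
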
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index 6c29bf0df04a..3091f62e55b6 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -199,12 +199,10 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type,
199 case ACL_TYPE_ACCESS: 199 case ACL_TYPE_ACCESS:
200 name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS; 200 name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS;
201 if (acl) { 201 if (acl) {
202 mode_t mode = inode->i_mode; 202 error = posix_acl_equiv_mode(acl, &inode->i_mode);
203 error = posix_acl_equiv_mode(acl, &mode);
204 if (error < 0) 203 if (error < 0)
205 return error; 204 return error;
206 else { 205 else {
207 inode->i_mode = mode;
208 inode->i_ctime = CURRENT_TIME_SEC; 206 inode->i_ctime = CURRENT_TIME_SEC;
209 ext3_mark_inode_dirty(handle, inode); 207 ext3_mark_inode_dirty(handle, inode);
210 if (error == 0) 208 if (error == 0)
@@ -261,19 +259,16 @@ ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
261 inode->i_mode &= ~current_umask(); 259 inode->i_mode &= ~current_umask();
262 } 260 }
263 if (test_opt(inode->i_sb, POSIX_ACL) && acl) { 261 if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
264 mode_t mode = inode->i_mode;
265
266 if (S_ISDIR(inode->i_mode)) { 262 if (S_ISDIR(inode->i_mode)) {
267 error = ext3_set_acl(handle, inode, 263 error = ext3_set_acl(handle, inode,
268 ACL_TYPE_DEFAULT, acl); 264 ACL_TYPE_DEFAULT, acl);
269 if (error) 265 if (error)
270 goto cleanup; 266 goto cleanup;
271 } 267 }
272 error = posix_acl_create(&acl, GFP_NOFS, &mode); 268 error = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode);
273 if (error < 0) 269 if (error < 0)
274 return error; 270 return error;
275 271
276 inode->i_mode = mode;
277 if (error > 0) { 272 if (error > 0) {
278 /* This is an extended ACL */ 273 /* This is an extended ACL */
279 error = ext3_set_acl(handle, inode, ACL_TYPE_ACCESS, acl); 274 error = ext3_set_acl(handle, inode, ACL_TYPE_ACCESS, acl);
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index fe52297e31ad..6386d76f44a7 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -21,6 +21,7 @@
21#include <linux/quotaops.h> 21#include <linux/quotaops.h>
22#include <linux/buffer_head.h> 22#include <linux/buffer_head.h>
23#include <linux/blkdev.h> 23#include <linux/blkdev.h>
24#include <trace/events/ext3.h>
24 25
25/* 26/*
26 * balloc.c contains the blocks allocation and deallocation routines 27 * balloc.c contains the blocks allocation and deallocation routines
@@ -161,6 +162,7 @@ read_block_bitmap(struct super_block *sb, unsigned int block_group)
161 desc = ext3_get_group_desc(sb, block_group, NULL); 162 desc = ext3_get_group_desc(sb, block_group, NULL);
162 if (!desc) 163 if (!desc)
163 return NULL; 164 return NULL;
165 trace_ext3_read_block_bitmap(sb, block_group);
164 bitmap_blk = le32_to_cpu(desc->bg_block_bitmap); 166 bitmap_blk = le32_to_cpu(desc->bg_block_bitmap);
165 bh = sb_getblk(sb, bitmap_blk); 167 bh = sb_getblk(sb, bitmap_blk);
166 if (unlikely(!bh)) { 168 if (unlikely(!bh)) {
@@ -351,6 +353,7 @@ void ext3_rsv_window_add(struct super_block *sb,
351 struct rb_node * parent = NULL; 353 struct rb_node * parent = NULL;
352 struct ext3_reserve_window_node *this; 354 struct ext3_reserve_window_node *this;
353 355
356 trace_ext3_rsv_window_add(sb, rsv);
354 while (*p) 357 while (*p)
355 { 358 {
356 parent = *p; 359 parent = *p;
@@ -476,8 +479,10 @@ void ext3_discard_reservation(struct inode *inode)
476 rsv = &block_i->rsv_window_node; 479 rsv = &block_i->rsv_window_node;
477 if (!rsv_is_empty(&rsv->rsv_window)) { 480 if (!rsv_is_empty(&rsv->rsv_window)) {
478 spin_lock(rsv_lock); 481 spin_lock(rsv_lock);
479 if (!rsv_is_empty(&rsv->rsv_window)) 482 if (!rsv_is_empty(&rsv->rsv_window)) {
483 trace_ext3_discard_reservation(inode, rsv);
480 rsv_window_remove(inode->i_sb, rsv); 484 rsv_window_remove(inode->i_sb, rsv);
485 }
481 spin_unlock(rsv_lock); 486 spin_unlock(rsv_lock);
482 } 487 }
483} 488}
@@ -683,14 +688,10 @@ error_return:
683void ext3_free_blocks(handle_t *handle, struct inode *inode, 688void ext3_free_blocks(handle_t *handle, struct inode *inode,
684 ext3_fsblk_t block, unsigned long count) 689 ext3_fsblk_t block, unsigned long count)
685{ 690{
686 struct super_block * sb; 691 struct super_block *sb = inode->i_sb;
687 unsigned long dquot_freed_blocks; 692 unsigned long dquot_freed_blocks;
688 693
689 sb = inode->i_sb; 694 trace_ext3_free_blocks(inode, block, count);
690 if (!sb) {
691 printk ("ext3_free_blocks: nonexistent device");
692 return;
693 }
694 ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); 695 ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
695 if (dquot_freed_blocks) 696 if (dquot_freed_blocks)
696 dquot_free_block(inode, dquot_freed_blocks); 697 dquot_free_block(inode, dquot_freed_blocks);
@@ -1136,6 +1137,7 @@ static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv,
1136 else 1137 else
1137 start_block = grp_goal + group_first_block; 1138 start_block = grp_goal + group_first_block;
1138 1139
1140 trace_ext3_alloc_new_reservation(sb, start_block);
1139 size = my_rsv->rsv_goal_size; 1141 size = my_rsv->rsv_goal_size;
1140 1142
1141 if (!rsv_is_empty(&my_rsv->rsv_window)) { 1143 if (!rsv_is_empty(&my_rsv->rsv_window)) {
@@ -1230,8 +1232,11 @@ retry:
1230 * check if the first free block is within the 1232 * check if the first free block is within the
1231 * free space we just reserved 1233 * free space we just reserved
1232 */ 1234 */
1233 if (start_block >= my_rsv->rsv_start && start_block <= my_rsv->rsv_end) 1235 if (start_block >= my_rsv->rsv_start &&
1236 start_block <= my_rsv->rsv_end) {
1237 trace_ext3_reserved(sb, start_block, my_rsv);
1234 return 0; /* success */ 1238 return 0; /* success */
1239 }
1235 /* 1240 /*
1236 * if the first free bit we found is out of the reservable space 1241 * if the first free bit we found is out of the reservable space
1237 * continue search for next reservable space, 1242 * continue search for next reservable space,
@@ -1514,10 +1519,6 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode,
1514 1519
1515 *errp = -ENOSPC; 1520 *errp = -ENOSPC;
1516 sb = inode->i_sb; 1521 sb = inode->i_sb;
1517 if (!sb) {
1518 printk("ext3_new_block: nonexistent device");
1519 return 0;
1520 }
1521 1522
1522 /* 1523 /*
1523 * Check quota for allocation of this block. 1524 * Check quota for allocation of this block.
@@ -1528,8 +1529,10 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode,
1528 return 0; 1529 return 0;
1529 } 1530 }
1530 1531
1532 trace_ext3_request_blocks(inode, goal, num);
1533
1531 sbi = EXT3_SB(sb); 1534 sbi = EXT3_SB(sb);
1532 es = EXT3_SB(sb)->s_es; 1535 es = sbi->s_es;
1533 ext3_debug("goal=%lu.\n", goal); 1536 ext3_debug("goal=%lu.\n", goal);
1534 /* 1537 /*
1535 * Allocate a block from reservation only when 1538 * Allocate a block from reservation only when
@@ -1742,6 +1745,10 @@ allocated:
1742 brelse(bitmap_bh); 1745 brelse(bitmap_bh);
1743 dquot_free_block(inode, *count-num); 1746 dquot_free_block(inode, *count-num);
1744 *count = num; 1747 *count = num;
1748
1749 trace_ext3_allocate_blocks(inode, goal, num,
1750 (unsigned long long)ret_block);
1751
1745 return ret_block; 1752 return ret_block;
1746 1753
1747io_error: 1754io_error:
@@ -1996,6 +2003,7 @@ ext3_grpblk_t ext3_trim_all_free(struct super_block *sb, unsigned int group,
1996 if ((next - start) < minblocks) 2003 if ((next - start) < minblocks)
1997 goto free_extent; 2004 goto free_extent;
1998 2005
2006 trace_ext3_discard_blocks(sb, discard_block, next - start);
1999 /* Send the TRIM command down to the device */ 2007 /* Send the TRIM command down to the device */
2000 err = sb_issue_discard(sb, discard_block, next - start, 2008 err = sb_issue_discard(sb, discard_block, next - start,
2001 GFP_NOFS, 0); 2009 GFP_NOFS, 0);
@@ -2100,7 +2108,7 @@ int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range)
2100 if (unlikely(minlen > EXT3_BLOCKS_PER_GROUP(sb))) 2108 if (unlikely(minlen > EXT3_BLOCKS_PER_GROUP(sb)))
2101 return -EINVAL; 2109 return -EINVAL;
2102 if (start >= max_blks) 2110 if (start >= max_blks)
2103 goto out; 2111 return -EINVAL;
2104 if (start + len > max_blks) 2112 if (start + len > max_blks)
2105 len = max_blks - start; 2113 len = max_blks - start;
2106 2114
@@ -2148,8 +2156,6 @@ int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range)
2148 2156
2149 if (ret >= 0) 2157 if (ret >= 0)
2150 ret = 0; 2158 ret = 0;
2151
2152out:
2153 range->len = trimmed * sb->s_blocksize; 2159 range->len = trimmed * sb->s_blocksize;
2154 2160
2155 return ret; 2161 return ret;
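
The trace_ext3_*() calls sprinkled through balloc.c cost next to nothing unless the tracepoint is enabled at run time. Their definitions live in the new include/trace/events/ext3.h, which this listing does not show; the sketch below is a hypothetical layout for one of them using the standard TRACE_EVENT() macro, with illustrative field names only:

    TRACE_EVENT(ext3_free_blocks,
            TP_PROTO(struct inode *inode, unsigned long block,
                     unsigned long count),
            TP_ARGS(inode, block, count),
            TP_STRUCT__entry(
                    __field(dev_t,          dev)
                    __field(ino_t,          ino)
                    __field(unsigned long,  block)
                    __field(unsigned long,  count)
            ),
            TP_fast_assign(
                    __entry->dev    = inode->i_sb->s_dev;
                    __entry->ino    = inode->i_ino;
                    __entry->block  = block;
                    __entry->count  = count;
            ),
            TP_printk("dev %d,%d ino %lu block %lu count %lu",
                      MAJOR(__entry->dev), MINOR(__entry->dev),
                      (unsigned long) __entry->ino,
                      __entry->block, __entry->count)
    );
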
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index 2be5b99097f1..724df69847dc 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -71,7 +71,6 @@ const struct file_operations ext3_file_operations = {
71}; 71};
72 72
73const struct inode_operations ext3_file_inode_operations = { 73const struct inode_operations ext3_file_inode_operations = {
74 .truncate = ext3_truncate,
75 .setattr = ext3_setattr, 74 .setattr = ext3_setattr,
76#ifdef CONFIG_EXT3_FS_XATTR 75#ifdef CONFIG_EXT3_FS_XATTR
77 .setxattr = generic_setxattr, 76 .setxattr = generic_setxattr,
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index 0bcf63adb80a..d494c554c6e6 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -30,6 +30,7 @@
30#include <linux/jbd.h> 30#include <linux/jbd.h>
31#include <linux/ext3_fs.h> 31#include <linux/ext3_fs.h>
32#include <linux/ext3_jbd.h> 32#include <linux/ext3_jbd.h>
33#include <trace/events/ext3.h>
33 34
34/* 35/*
35 * akpm: A new design for ext3_sync_file(). 36 * akpm: A new design for ext3_sync_file().
@@ -51,12 +52,14 @@ int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
51 int ret, needs_barrier = 0; 52 int ret, needs_barrier = 0;
52 tid_t commit_tid; 53 tid_t commit_tid;
53 54
55 trace_ext3_sync_file_enter(file, datasync);
56
54 if (inode->i_sb->s_flags & MS_RDONLY) 57 if (inode->i_sb->s_flags & MS_RDONLY)
55 return 0; 58 return 0;
56 59
57 ret = filemap_write_and_wait_range(inode->i_mapping, start, end); 60 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
58 if (ret) 61 if (ret)
59 return ret; 62 goto out;
60 63
61 /* 64 /*
62 * Taking the mutex here just to keep consistent with how fsync was 65 * Taking the mutex here just to keep consistent with how fsync was
@@ -83,7 +86,8 @@ int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
83 */ 86 */
84 if (ext3_should_journal_data(inode)) { 87 if (ext3_should_journal_data(inode)) {
85 mutex_unlock(&inode->i_mutex); 88 mutex_unlock(&inode->i_mutex);
86 return ext3_force_commit(inode->i_sb); 89 ret = ext3_force_commit(inode->i_sb);
90 goto out;
87 } 91 }
88 92
89 if (datasync) 93 if (datasync)
@@ -104,6 +108,9 @@ int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
104 */ 108 */
105 if (needs_barrier) 109 if (needs_barrier)
106 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); 110 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
111
107 mutex_unlock(&inode->i_mutex); 112 mutex_unlock(&inode->i_mutex);
113out:
114 trace_ext3_sync_file_exit(inode, ret);
108 return ret; 115 return ret;
109} 116}
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index bfc2dc43681d..bf09cbf938cc 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -23,6 +23,7 @@
23#include <linux/buffer_head.h> 23#include <linux/buffer_head.h>
24#include <linux/random.h> 24#include <linux/random.h>
25#include <linux/bitops.h> 25#include <linux/bitops.h>
26#include <trace/events/ext3.h>
26 27
27#include <asm/byteorder.h> 28#include <asm/byteorder.h>
28 29
@@ -118,6 +119,7 @@ void ext3_free_inode (handle_t *handle, struct inode * inode)
118 119
119 ino = inode->i_ino; 120 ino = inode->i_ino;
120 ext3_debug ("freeing inode %lu\n", ino); 121 ext3_debug ("freeing inode %lu\n", ino);
122 trace_ext3_free_inode(inode);
121 123
122 is_directory = S_ISDIR(inode->i_mode); 124 is_directory = S_ISDIR(inode->i_mode);
123 125
@@ -426,6 +428,7 @@ struct inode *ext3_new_inode(handle_t *handle, struct inode * dir,
426 return ERR_PTR(-EPERM); 428 return ERR_PTR(-EPERM);
427 429
428 sb = dir->i_sb; 430 sb = dir->i_sb;
431 trace_ext3_request_inode(dir, mode);
429 inode = new_inode(sb); 432 inode = new_inode(sb);
430 if (!inode) 433 if (!inode)
431 return ERR_PTR(-ENOMEM); 434 return ERR_PTR(-ENOMEM);
@@ -601,6 +604,7 @@ got:
601 } 604 }
602 605
603 ext3_debug("allocating inode %lu\n", inode->i_ino); 606 ext3_debug("allocating inode %lu\n", inode->i_ino);
607 trace_ext3_allocate_inode(inode, dir, mode);
604 goto really_out; 608 goto really_out;
605fail: 609fail:
606 ext3_std_error(sb, err); 610 ext3_std_error(sb, err);
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 2978a2a17a59..04da6acde85d 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -38,10 +38,12 @@
38#include <linux/bio.h> 38#include <linux/bio.h>
39#include <linux/fiemap.h> 39#include <linux/fiemap.h>
40#include <linux/namei.h> 40#include <linux/namei.h>
41#include <trace/events/ext3.h>
41#include "xattr.h" 42#include "xattr.h"
42#include "acl.h" 43#include "acl.h"
43 44
44static int ext3_writepage_trans_blocks(struct inode *inode); 45static int ext3_writepage_trans_blocks(struct inode *inode);
46static int ext3_block_truncate_page(struct inode *inode, loff_t from);
45 47
46/* 48/*
47 * Test whether an inode is a fast symlink. 49 * Test whether an inode is a fast symlink.
@@ -70,6 +72,7 @@ int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode,
70 72
71 might_sleep(); 73 might_sleep();
72 74
75 trace_ext3_forget(inode, is_metadata, blocknr);
73 BUFFER_TRACE(bh, "enter"); 76 BUFFER_TRACE(bh, "enter");
74 77
75 jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " 78 jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
@@ -194,20 +197,47 @@ static int truncate_restart_transaction(handle_t *handle, struct inode *inode)
194 */ 197 */
195void ext3_evict_inode (struct inode *inode) 198void ext3_evict_inode (struct inode *inode)
196{ 199{
200 struct ext3_inode_info *ei = EXT3_I(inode);
197 struct ext3_block_alloc_info *rsv; 201 struct ext3_block_alloc_info *rsv;
198 handle_t *handle; 202 handle_t *handle;
199 int want_delete = 0; 203 int want_delete = 0;
200 204
205 trace_ext3_evict_inode(inode);
201 if (!inode->i_nlink && !is_bad_inode(inode)) { 206 if (!inode->i_nlink && !is_bad_inode(inode)) {
202 dquot_initialize(inode); 207 dquot_initialize(inode);
203 want_delete = 1; 208 want_delete = 1;
204 } 209 }
205 210
211 /*
212 * When journalling data, dirty buffers are tracked only in the journal.
213 * So although mm thinks everything is clean and ready for reaping the
214 * inode might still have some pages to write in the running
215 * transaction or waiting to be checkpointed. Thus calling
216 * journal_invalidatepage() (via truncate_inode_pages()) to discard
217 * these buffers can cause data loss. Also even if we did not discard
218 * these buffers, we would have no way to find them after the inode
219 * is reaped and thus the user could see stale data when trying to read
220 * them before the transaction is checkpointed. So be careful and
221 * force everything to disk here... We use ei->i_datasync_tid to
222 * store the newest transaction containing inode's data.
223 *
224 * Note that directories do not have this problem because they don't
225 * use page cache.
226 */
227 if (inode->i_nlink && ext3_should_journal_data(inode) &&
228 (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) {
229 tid_t commit_tid = atomic_read(&ei->i_datasync_tid);
230 journal_t *journal = EXT3_SB(inode->i_sb)->s_journal;
231
232 log_start_commit(journal, commit_tid);
233 log_wait_commit(journal, commit_tid);
234 filemap_write_and_wait(&inode->i_data);
235 }
206 truncate_inode_pages(&inode->i_data, 0); 236 truncate_inode_pages(&inode->i_data, 0);
207 237
208 ext3_discard_reservation(inode); 238 ext3_discard_reservation(inode);
209 rsv = EXT3_I(inode)->i_block_alloc_info; 239 rsv = ei->i_block_alloc_info;
210 EXT3_I(inode)->i_block_alloc_info = NULL; 240 ei->i_block_alloc_info = NULL;
211 if (unlikely(rsv)) 241 if (unlikely(rsv))
212 kfree(rsv); 242 kfree(rsv);
213 243
@@ -231,15 +261,13 @@ void ext3_evict_inode (struct inode *inode)
231 if (inode->i_blocks) 261 if (inode->i_blocks)
232 ext3_truncate(inode); 262 ext3_truncate(inode);
233 /* 263 /*
234 * Kill off the orphan record which ext3_truncate created. 264 * Kill off the orphan record created when the inode lost the last
235 * AKPM: I think this can be inside the above `if'. 265 * link. Note that ext3_orphan_del() has to be able to cope with the
236 * Note that ext3_orphan_del() has to be able to cope with the 266 * deletion of a non-existent orphan - ext3_truncate() could
237 * deletion of a non-existent orphan - this is because we don't 267 * have removed the record.
238 * know if ext3_truncate() actually created an orphan record.
239 * (Well, we could do this if we need to, but heck - it works)
240 */ 268 */
241 ext3_orphan_del(handle, inode); 269 ext3_orphan_del(handle, inode);
242 EXT3_I(inode)->i_dtime = get_seconds(); 270 ei->i_dtime = get_seconds();
243 271
244 /* 272 /*
245 * One subtle ordering requirement: if anything has gone wrong 273 * One subtle ordering requirement: if anything has gone wrong
@@ -842,6 +870,7 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
842 ext3_fsblk_t first_block = 0; 870 ext3_fsblk_t first_block = 0;
843 871
844 872
873 trace_ext3_get_blocks_enter(inode, iblock, maxblocks, create);
845 J_ASSERT(handle != NULL || create == 0); 874 J_ASSERT(handle != NULL || create == 0);
846 depth = ext3_block_to_path(inode,iblock,offsets,&blocks_to_boundary); 875 depth = ext3_block_to_path(inode,iblock,offsets,&blocks_to_boundary);
847 876
@@ -886,6 +915,9 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
886 if (!create || err == -EIO) 915 if (!create || err == -EIO)
887 goto cleanup; 916 goto cleanup;
888 917
918 /*
919 * Block out ext3_truncate while we alter the tree
920 */
889 mutex_lock(&ei->truncate_mutex); 921 mutex_lock(&ei->truncate_mutex);
890 922
891 /* 923 /*
@@ -934,9 +966,6 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
934 */ 966 */
935 count = ext3_blks_to_allocate(partial, indirect_blks, 967 count = ext3_blks_to_allocate(partial, indirect_blks,
936 maxblocks, blocks_to_boundary); 968 maxblocks, blocks_to_boundary);
937 /*
938 * Block out ext3_truncate while we alter the tree
939 */
940 err = ext3_alloc_branch(handle, inode, indirect_blks, &count, goal, 969 err = ext3_alloc_branch(handle, inode, indirect_blks, &count, goal,
941 offsets + (partial - chain), partial); 970 offsets + (partial - chain), partial);
942 971
@@ -970,6 +999,9 @@ cleanup:
970 } 999 }
971 BUFFER_TRACE(bh_result, "returned"); 1000 BUFFER_TRACE(bh_result, "returned");
972out: 1001out:
1002 trace_ext3_get_blocks_exit(inode, iblock,
1003 depth ? le32_to_cpu(chain[depth-1].key) : 0,
1004 count, err);
973 return err; 1005 return err;
974} 1006}
975 1007
@@ -1202,6 +1234,16 @@ static void ext3_truncate_failed_write(struct inode *inode)
1202 ext3_truncate(inode); 1234 ext3_truncate(inode);
1203} 1235}
1204 1236
1237/*
1238 * Truncate blocks that were not used by direct IO write. We have to zero out
1239 * the last file block as well because direct IO might have written to it.
1240 */
1241static void ext3_truncate_failed_direct_write(struct inode *inode)
1242{
1243 ext3_block_truncate_page(inode, inode->i_size);
1244 ext3_truncate(inode);
1245}
1246
1205static int ext3_write_begin(struct file *file, struct address_space *mapping, 1247static int ext3_write_begin(struct file *file, struct address_space *mapping,
1206 loff_t pos, unsigned len, unsigned flags, 1248 loff_t pos, unsigned len, unsigned flags,
1207 struct page **pagep, void **fsdata) 1249 struct page **pagep, void **fsdata)
@@ -1217,6 +1259,8 @@ static int ext3_write_begin(struct file *file, struct address_space *mapping,
1217 * we allocate blocks but write fails for some reason */ 1259 * we allocate blocks but write fails for some reason */
1218 int needed_blocks = ext3_writepage_trans_blocks(inode) + 1; 1260 int needed_blocks = ext3_writepage_trans_blocks(inode) + 1;
1219 1261
1262 trace_ext3_write_begin(inode, pos, len, flags);
1263
1220 index = pos >> PAGE_CACHE_SHIFT; 1264 index = pos >> PAGE_CACHE_SHIFT;
1221 from = pos & (PAGE_CACHE_SIZE - 1); 1265 from = pos & (PAGE_CACHE_SIZE - 1);
1222 to = from + len; 1266 to = from + len;
@@ -1332,6 +1376,7 @@ static int ext3_ordered_write_end(struct file *file,
1332 unsigned from, to; 1376 unsigned from, to;
1333 int ret = 0, ret2; 1377 int ret = 0, ret2;
1334 1378
1379 trace_ext3_ordered_write_end(inode, pos, len, copied);
1335 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); 1380 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
1336 1381
1337 from = pos & (PAGE_CACHE_SIZE - 1); 1382 from = pos & (PAGE_CACHE_SIZE - 1);
@@ -1367,6 +1412,7 @@ static int ext3_writeback_write_end(struct file *file,
1367 struct inode *inode = file->f_mapping->host; 1412 struct inode *inode = file->f_mapping->host;
1368 int ret; 1413 int ret;
1369 1414
1415 trace_ext3_writeback_write_end(inode, pos, len, copied);
1370 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); 1416 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
1371 update_file_sizes(inode, pos, copied); 1417 update_file_sizes(inode, pos, copied);
1372 /* 1418 /*
@@ -1391,10 +1437,12 @@ static int ext3_journalled_write_end(struct file *file,
1391{ 1437{
1392 handle_t *handle = ext3_journal_current_handle(); 1438 handle_t *handle = ext3_journal_current_handle();
1393 struct inode *inode = mapping->host; 1439 struct inode *inode = mapping->host;
1440 struct ext3_inode_info *ei = EXT3_I(inode);
1394 int ret = 0, ret2; 1441 int ret = 0, ret2;
1395 int partial = 0; 1442 int partial = 0;
1396 unsigned from, to; 1443 unsigned from, to;
1397 1444
1445 trace_ext3_journalled_write_end(inode, pos, len, copied);
1398 from = pos & (PAGE_CACHE_SIZE - 1); 1446 from = pos & (PAGE_CACHE_SIZE - 1);
1399 to = from + len; 1447 to = from + len;
1400 1448
@@ -1419,8 +1467,9 @@ static int ext3_journalled_write_end(struct file *file,
1419 if (pos + len > inode->i_size && ext3_can_truncate(inode)) 1467 if (pos + len > inode->i_size && ext3_can_truncate(inode))
1420 ext3_orphan_add(handle, inode); 1468 ext3_orphan_add(handle, inode);
1421 ext3_set_inode_state(inode, EXT3_STATE_JDATA); 1469 ext3_set_inode_state(inode, EXT3_STATE_JDATA);
1422 if (inode->i_size > EXT3_I(inode)->i_disksize) { 1470 atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid);
1423 EXT3_I(inode)->i_disksize = inode->i_size; 1471 if (inode->i_size > ei->i_disksize) {
1472 ei->i_disksize = inode->i_size;
1424 ret2 = ext3_mark_inode_dirty(handle, inode); 1473 ret2 = ext3_mark_inode_dirty(handle, inode);
1425 if (!ret) 1474 if (!ret)
1426 ret = ret2; 1475 ret = ret2;
@@ -1577,6 +1626,7 @@ static int ext3_ordered_writepage(struct page *page,
1577 if (ext3_journal_current_handle()) 1626 if (ext3_journal_current_handle())
1578 goto out_fail; 1627 goto out_fail;
1579 1628
1629 trace_ext3_ordered_writepage(page);
1580 if (!page_has_buffers(page)) { 1630 if (!page_has_buffers(page)) {
1581 create_empty_buffers(page, inode->i_sb->s_blocksize, 1631 create_empty_buffers(page, inode->i_sb->s_blocksize,
1582 (1 << BH_Dirty)|(1 << BH_Uptodate)); 1632 (1 << BH_Dirty)|(1 << BH_Uptodate));
@@ -1647,6 +1697,7 @@ static int ext3_writeback_writepage(struct page *page,
1647 if (ext3_journal_current_handle()) 1697 if (ext3_journal_current_handle())
1648 goto out_fail; 1698 goto out_fail;
1649 1699
1700 trace_ext3_writeback_writepage(page);
1650 if (page_has_buffers(page)) { 1701 if (page_has_buffers(page)) {
1651 if (!walk_page_buffers(NULL, page_buffers(page), 0, 1702 if (!walk_page_buffers(NULL, page_buffers(page), 0,
1652 PAGE_CACHE_SIZE, NULL, buffer_unmapped)) { 1703 PAGE_CACHE_SIZE, NULL, buffer_unmapped)) {
@@ -1689,6 +1740,7 @@ static int ext3_journalled_writepage(struct page *page,
1689 if (ext3_journal_current_handle()) 1740 if (ext3_journal_current_handle())
1690 goto no_write; 1741 goto no_write;
1691 1742
1743 trace_ext3_journalled_writepage(page);
1692 handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); 1744 handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
1693 if (IS_ERR(handle)) { 1745 if (IS_ERR(handle)) {
1694 ret = PTR_ERR(handle); 1746 ret = PTR_ERR(handle);
@@ -1715,6 +1767,8 @@ static int ext3_journalled_writepage(struct page *page,
1715 if (ret == 0) 1767 if (ret == 0)
1716 ret = err; 1768 ret = err;
1717 ext3_set_inode_state(inode, EXT3_STATE_JDATA); 1769 ext3_set_inode_state(inode, EXT3_STATE_JDATA);
1770 atomic_set(&EXT3_I(inode)->i_datasync_tid,
1771 handle->h_transaction->t_tid);
1718 unlock_page(page); 1772 unlock_page(page);
1719 } else { 1773 } else {
1720 /* 1774 /*
@@ -1739,6 +1793,7 @@ out_unlock:
1739 1793
1740static int ext3_readpage(struct file *file, struct page *page) 1794static int ext3_readpage(struct file *file, struct page *page)
1741{ 1795{
1796 trace_ext3_readpage(page);
1742 return mpage_readpage(page, ext3_get_block); 1797 return mpage_readpage(page, ext3_get_block);
1743} 1798}
1744 1799
@@ -1753,6 +1808,8 @@ static void ext3_invalidatepage(struct page *page, unsigned long offset)
1753{ 1808{
1754 journal_t *journal = EXT3_JOURNAL(page->mapping->host); 1809 journal_t *journal = EXT3_JOURNAL(page->mapping->host);
1755 1810
1811 trace_ext3_invalidatepage(page, offset);
1812
1756 /* 1813 /*
1757 * If it's a full truncate we just forget about the pending dirtying 1814 * If it's a full truncate we just forget about the pending dirtying
1758 */ 1815 */
@@ -1766,6 +1823,7 @@ static int ext3_releasepage(struct page *page, gfp_t wait)
1766{ 1823{
1767 journal_t *journal = EXT3_JOURNAL(page->mapping->host); 1824 journal_t *journal = EXT3_JOURNAL(page->mapping->host);
1768 1825
1826 trace_ext3_releasepage(page);
1769 WARN_ON(PageChecked(page)); 1827 WARN_ON(PageChecked(page));
1770 if (!page_has_buffers(page)) 1828 if (!page_has_buffers(page))
1771 return 0; 1829 return 0;
@@ -1794,6 +1852,8 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
1794 size_t count = iov_length(iov, nr_segs); 1852 size_t count = iov_length(iov, nr_segs);
1795 int retries = 0; 1853 int retries = 0;
1796 1854
1855 trace_ext3_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw);
1856
1797 if (rw == WRITE) { 1857 if (rw == WRITE) {
1798 loff_t final_size = offset + count; 1858 loff_t final_size = offset + count;
1799 1859
@@ -1827,7 +1887,7 @@ retry:
1827 loff_t end = offset + iov_length(iov, nr_segs); 1887 loff_t end = offset + iov_length(iov, nr_segs);
1828 1888
1829 if (end > isize) 1889 if (end > isize)
1830 vmtruncate(inode, isize); 1890 ext3_truncate_failed_direct_write(inode);
1831 } 1891 }
1832 if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) 1892 if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
1833 goto retry; 1893 goto retry;
@@ -1841,7 +1901,7 @@ retry:
1841 /* This is really bad luck. We've written the data 1901 /* This is really bad luck. We've written the data
1842 * but cannot extend i_size. Truncate allocated blocks 1902 * but cannot extend i_size. Truncate allocated blocks
1843 * and pretend the write failed... */ 1903 * and pretend the write failed... */
1844 ext3_truncate(inode); 1904 ext3_truncate_failed_direct_write(inode);
1845 ret = PTR_ERR(handle); 1905 ret = PTR_ERR(handle);
1846 goto out; 1906 goto out;
1847 } 1907 }
@@ -1867,6 +1927,8 @@ retry:
1867 ret = err; 1927 ret = err;
1868 } 1928 }
1869out: 1929out:
1930 trace_ext3_direct_IO_exit(inode, offset,
1931 iov_length(iov, nr_segs), rw, ret);
1870 return ret; 1932 return ret;
1871} 1933}
1872 1934
@@ -1949,17 +2011,24 @@ void ext3_set_aops(struct inode *inode)
1949 * This is required during truncate. We need to physically zero the tail end 2011 * This is required during truncate. We need to physically zero the tail end
1950 * of that block so it doesn't yield old data if the file is later grown. 2012 * of that block so it doesn't yield old data if the file is later grown.
1951 */ 2013 */
1952static int ext3_block_truncate_page(handle_t *handle, struct page *page, 2014static int ext3_block_truncate_page(struct inode *inode, loff_t from)
1953 struct address_space *mapping, loff_t from)
1954{ 2015{
1955 ext3_fsblk_t index = from >> PAGE_CACHE_SHIFT; 2016 ext3_fsblk_t index = from >> PAGE_CACHE_SHIFT;
1956 unsigned offset = from & (PAGE_CACHE_SIZE-1); 2017 unsigned offset = from & (PAGE_CACHE_SIZE - 1);
1957 unsigned blocksize, iblock, length, pos; 2018 unsigned blocksize, iblock, length, pos;
1958 struct inode *inode = mapping->host; 2019 struct page *page;
2020 handle_t *handle = NULL;
1959 struct buffer_head *bh; 2021 struct buffer_head *bh;
1960 int err = 0; 2022 int err = 0;
1961 2023
2024 /* Truncated on block boundary - nothing to do */
1962 blocksize = inode->i_sb->s_blocksize; 2025 blocksize = inode->i_sb->s_blocksize;
2026 if ((from & (blocksize - 1)) == 0)
2027 return 0;
2028
2029 page = grab_cache_page(inode->i_mapping, index);
2030 if (!page)
2031 return -ENOMEM;
1963 length = blocksize - (offset & (blocksize - 1)); 2032 length = blocksize - (offset & (blocksize - 1));
1964 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); 2033 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1965 2034
@@ -2004,11 +2073,23 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page,
2004 goto unlock; 2073 goto unlock;
2005 } 2074 }
2006 2075
2076 /* data=writeback mode doesn't need a transaction to zero out data */
2077 if (!ext3_should_writeback_data(inode)) {
2078 /* We journal at most one block */
2079 handle = ext3_journal_start(inode, 1);
2080 if (IS_ERR(handle)) {
2081 clear_highpage(page);
2082 flush_dcache_page(page);
2083 err = PTR_ERR(handle);
2084 goto unlock;
2085 }
2086 }
2087
2007 if (ext3_should_journal_data(inode)) { 2088 if (ext3_should_journal_data(inode)) {
2008 BUFFER_TRACE(bh, "get write access"); 2089 BUFFER_TRACE(bh, "get write access");
2009 err = ext3_journal_get_write_access(handle, bh); 2090 err = ext3_journal_get_write_access(handle, bh);
2010 if (err) 2091 if (err)
2011 goto unlock; 2092 goto stop;
2012 } 2093 }
2013 2094
2014 zero_user(page, offset, length); 2095 zero_user(page, offset, length);
@@ -2022,6 +2103,9 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page,
2022 err = ext3_journal_dirty_data(handle, bh); 2103 err = ext3_journal_dirty_data(handle, bh);
2023 mark_buffer_dirty(bh); 2104 mark_buffer_dirty(bh);
2024 } 2105 }
2106stop:
2107 if (handle)
2108 ext3_journal_stop(handle);
2025 2109
2026unlock: 2110unlock:
2027 unlock_page(page); 2111 unlock_page(page);
@@ -2390,8 +2474,6 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
2390 2474
2391int ext3_can_truncate(struct inode *inode) 2475int ext3_can_truncate(struct inode *inode)
2392{ 2476{
2393 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
2394 return 0;
2395 if (S_ISREG(inode->i_mode)) 2477 if (S_ISREG(inode->i_mode))
2396 return 1; 2478 return 1;
2397 if (S_ISDIR(inode->i_mode)) 2479 if (S_ISDIR(inode->i_mode))
@@ -2435,7 +2517,6 @@ void ext3_truncate(struct inode *inode)
2435 struct ext3_inode_info *ei = EXT3_I(inode); 2517 struct ext3_inode_info *ei = EXT3_I(inode);
2436 __le32 *i_data = ei->i_data; 2518 __le32 *i_data = ei->i_data;
2437 int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb); 2519 int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
2438 struct address_space *mapping = inode->i_mapping;
2439 int offsets[4]; 2520 int offsets[4];
2440 Indirect chain[4]; 2521 Indirect chain[4];
2441 Indirect *partial; 2522 Indirect *partial;
@@ -2443,7 +2524,8 @@ void ext3_truncate(struct inode *inode)
2443 int n; 2524 int n;
2444 long last_block; 2525 long last_block;
2445 unsigned blocksize = inode->i_sb->s_blocksize; 2526 unsigned blocksize = inode->i_sb->s_blocksize;
2446 struct page *page; 2527
2528 trace_ext3_truncate_enter(inode);
2447 2529
2448 if (!ext3_can_truncate(inode)) 2530 if (!ext3_can_truncate(inode))
2449 goto out_notrans; 2531 goto out_notrans;
@@ -2451,37 +2533,12 @@ void ext3_truncate(struct inode *inode)
2451 if (inode->i_size == 0 && ext3_should_writeback_data(inode)) 2533 if (inode->i_size == 0 && ext3_should_writeback_data(inode))
2452 ext3_set_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE); 2534 ext3_set_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE);
2453 2535
2454 /*
2455 * We have to lock the EOF page here, because lock_page() nests
2456 * outside journal_start().
2457 */
2458 if ((inode->i_size & (blocksize - 1)) == 0) {
2459 /* Block boundary? Nothing to do */
2460 page = NULL;
2461 } else {
2462 page = grab_cache_page(mapping,
2463 inode->i_size >> PAGE_CACHE_SHIFT);
2464 if (!page)
2465 goto out_notrans;
2466 }
2467
2468 handle = start_transaction(inode); 2536 handle = start_transaction(inode);
2469 if (IS_ERR(handle)) { 2537 if (IS_ERR(handle))
2470 if (page) {
2471 clear_highpage(page);
2472 flush_dcache_page(page);
2473 unlock_page(page);
2474 page_cache_release(page);
2475 }
2476 goto out_notrans; 2538 goto out_notrans;
2477 }
2478 2539
2479 last_block = (inode->i_size + blocksize-1) 2540 last_block = (inode->i_size + blocksize-1)
2480 >> EXT3_BLOCK_SIZE_BITS(inode->i_sb); 2541 >> EXT3_BLOCK_SIZE_BITS(inode->i_sb);
2481
2482 if (page)
2483 ext3_block_truncate_page(handle, page, mapping, inode->i_size);
2484
2485 n = ext3_block_to_path(inode, last_block, offsets, NULL); 2542 n = ext3_block_to_path(inode, last_block, offsets, NULL);
2486 if (n == 0) 2543 if (n == 0)
2487 goto out_stop; /* error */ 2544 goto out_stop; /* error */
@@ -2596,6 +2653,7 @@ out_stop:
2596 ext3_orphan_del(handle, inode); 2653 ext3_orphan_del(handle, inode);
2597 2654
2598 ext3_journal_stop(handle); 2655 ext3_journal_stop(handle);
2656 trace_ext3_truncate_exit(inode);
2599 return; 2657 return;
2600out_notrans: 2658out_notrans:
2601 /* 2659 /*
@@ -2604,6 +2662,7 @@ out_notrans:
2604 */ 2662 */
2605 if (inode->i_nlink) 2663 if (inode->i_nlink)
2606 ext3_orphan_del(NULL, inode); 2664 ext3_orphan_del(NULL, inode);
2665 trace_ext3_truncate_exit(inode);
2607} 2666}
2608 2667
2609static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb, 2668static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb,
@@ -2745,6 +2804,7 @@ make_io:
2745 * has in-inode xattrs, or we don't have this inode in memory. 2804 * has in-inode xattrs, or we don't have this inode in memory.
2746 * Read the block from disk. 2805 * Read the block from disk.
2747 */ 2806 */
2807 trace_ext3_load_inode(inode);
2748 get_bh(bh); 2808 get_bh(bh);
2749 bh->b_end_io = end_buffer_read_sync; 2809 bh->b_end_io = end_buffer_read_sync;
2750 submit_bh(READ_META, bh); 2810 submit_bh(READ_META, bh);
@@ -3229,18 +3289,36 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
3229 } 3289 }
3230 3290
3231 error = ext3_orphan_add(handle, inode); 3291 error = ext3_orphan_add(handle, inode);
3292 if (error) {
3293 ext3_journal_stop(handle);
3294 goto err_out;
3295 }
3232 EXT3_I(inode)->i_disksize = attr->ia_size; 3296 EXT3_I(inode)->i_disksize = attr->ia_size;
3233 rc = ext3_mark_inode_dirty(handle, inode); 3297 error = ext3_mark_inode_dirty(handle, inode);
3234 if (!error)
3235 error = rc;
3236 ext3_journal_stop(handle); 3298 ext3_journal_stop(handle);
3299 if (error) {
3300 /* Some hard fs error must have happened. Bail out. */
3301 ext3_orphan_del(NULL, inode);
3302 goto err_out;
3303 }
3304 rc = ext3_block_truncate_page(inode, attr->ia_size);
3305 if (rc) {
3306 /* Cleanup orphan list and exit */
3307 handle = ext3_journal_start(inode, 3);
3308 if (IS_ERR(handle)) {
3309 ext3_orphan_del(NULL, inode);
3310 goto err_out;
3311 }
3312 ext3_orphan_del(handle, inode);
3313 ext3_journal_stop(handle);
3314 goto err_out;
3315 }
3237 } 3316 }
3238 3317
3239 if ((attr->ia_valid & ATTR_SIZE) && 3318 if ((attr->ia_valid & ATTR_SIZE) &&
3240 attr->ia_size != i_size_read(inode)) { 3319 attr->ia_size != i_size_read(inode)) {
3241 rc = vmtruncate(inode, attr->ia_size); 3320 truncate_setsize(inode, attr->ia_size);
3242 if (rc) 3321 ext3_truncate(inode);
3243 goto err_out;
3244 } 3322 }
3245 3323
3246 setattr_copy(inode, attr); 3324 setattr_copy(inode, attr);
@@ -3374,6 +3452,7 @@ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
3374 int err; 3452 int err;
3375 3453
3376 might_sleep(); 3454 might_sleep();
3455 trace_ext3_mark_inode_dirty(inode, _RET_IP_);
3377 err = ext3_reserve_inode_write(handle, inode, &iloc); 3456 err = ext3_reserve_inode_write(handle, inode, &iloc);
3378 if (!err) 3457 if (!err)
3379 err = ext3_mark_iloc_dirty(handle, inode, &iloc); 3458 err = ext3_mark_iloc_dirty(handle, inode, &iloc);
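
The ext3_setattr() hunk above drops vmtruncate() in favour of an explicit sequence; condensed to the success path, the ordering the patch establishes for a size-changing setattr is:

    error = ext3_orphan_add(handle, inode);      /* crash recovery hook first */
    if (error) {
            ext3_journal_stop(handle);
            goto err_out;
    }
    EXT3_I(inode)->i_disksize = attr->ia_size;
    error = ext3_mark_inode_dirty(handle, inode);
    ext3_journal_stop(handle);
    rc = ext3_block_truncate_page(inode, attr->ia_size); /* zero the tail block */
    truncate_setsize(inode, attr->ia_size);      /* i_size and page cache */
    ext3_truncate(inode);                        /* free the data blocks */
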
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index f4090bd2f345..c7f43944f160 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -285,7 +285,7 @@ group_add_out:
285 if (!capable(CAP_SYS_ADMIN)) 285 if (!capable(CAP_SYS_ADMIN))
286 return -EPERM; 286 return -EPERM;
287 287
288 if (copy_from_user(&range, (struct fstrim_range *)arg, 288 if (copy_from_user(&range, (struct fstrim_range __user *)arg,
289 sizeof(range))) 289 sizeof(range)))
290 return -EFAULT; 290 return -EFAULT;
291 291
@@ -293,7 +293,7 @@ group_add_out:
293 if (ret < 0) 293 if (ret < 0)
294 return ret; 294 return ret;
295 295
296 if (copy_to_user((struct fstrim_range *)arg, &range, 296 if (copy_to_user((struct fstrim_range __user *)arg, &range,
297 sizeof(range))) 297 sizeof(range)))
298 return -EFAULT; 298 return -EFAULT;
299 299
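
The only ioctl.c change is annotation: __user marks a pointer as a userspace address so that sparse (make C=1) can warn about direct dereferences, while copy_from_user()/copy_to_user() stay the sanctioned accessors. The FITRIM shape, condensed (the ext3_trim_fs() call between the two copies is implied by the surrounding code, not shown in the hunk):

    struct fstrim_range range;
    struct fstrim_range __user *user_range =
            (struct fstrim_range __user *) arg;

    if (copy_from_user(&range, user_range, sizeof(range)))
            return -EFAULT;
    ret = ext3_trim_fs(sb, &range);
    if (ret < 0)
            return ret;
    if (copy_to_user(user_range, &range, sizeof(range)))
            return -EFAULT;
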
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 3b57230a17bb..5571708b6a58 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -36,6 +36,7 @@
36#include <linux/quotaops.h> 36#include <linux/quotaops.h>
37#include <linux/buffer_head.h> 37#include <linux/buffer_head.h>
38#include <linux/bio.h> 38#include <linux/bio.h>
39#include <trace/events/ext3.h>
39 40
40#include "namei.h" 41#include "namei.h"
41#include "xattr.h" 42#include "xattr.h"
@@ -287,7 +288,7 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext3_dir_ent
287 while (len--) printk("%c", *name++); 288 while (len--) printk("%c", *name++);
288 ext3fs_dirhash(de->name, de->name_len, &h); 289 ext3fs_dirhash(de->name, de->name_len, &h);
289 printk(":%x.%u ", h.hash, 290 printk(":%x.%u ", h.hash,
290 ((char *) de - base)); 291 (unsigned) ((char *) de - base));
291 } 292 }
292 space += EXT3_DIR_REC_LEN(de->name_len); 293 space += EXT3_DIR_REC_LEN(de->name_len);
293 names++; 294 names++;
@@ -1013,7 +1014,7 @@ static struct buffer_head * ext3_dx_find_entry(struct inode *dir,
1013 1014
1014 *err = -ENOENT; 1015 *err = -ENOENT;
1015errout: 1016errout:
1016 dxtrace(printk("%s not found\n", name)); 1017 dxtrace(printk("%s not found\n", entry->name));
1017 dx_release (frames); 1018 dx_release (frames);
1018 return NULL; 1019 return NULL;
1019} 1020}
@@ -2140,6 +2141,7 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry)
2140 struct ext3_dir_entry_2 * de; 2141 struct ext3_dir_entry_2 * de;
2141 handle_t *handle; 2142 handle_t *handle;
2142 2143
2144 trace_ext3_unlink_enter(dir, dentry);
2143 /* Initialize quotas before so that eventual writes go 2145 /* Initialize quotas before so that eventual writes go
2144 * in separate transaction */ 2146 * in separate transaction */
2145 dquot_initialize(dir); 2147 dquot_initialize(dir);
@@ -2185,6 +2187,7 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry)
2185end_unlink: 2187end_unlink:
2186 ext3_journal_stop(handle); 2188 ext3_journal_stop(handle);
2187 brelse (bh); 2189 brelse (bh);
2190 trace_ext3_unlink_exit(dentry, retval);
2188 return retval; 2191 return retval;
2189} 2192}
2190 2193
@@ -2206,9 +2209,11 @@ static int ext3_symlink (struct inode * dir,
2206 /* 2209 /*
2207 * For non-fast symlinks, we just allocate inode and put it on 2210 * For non-fast symlinks, we just allocate inode and put it on
2208 * orphan list in the first transaction => we need bitmap, 2211 * orphan list in the first transaction => we need bitmap,
2209 * group descriptor, sb, inode block, quota blocks. 2212 * group descriptor, sb, inode block, quota blocks, and
2213 * possibly selinux xattr blocks.
2210 */ 2214 */
2211 credits = 4 + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb); 2215 credits = 4 + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
2216 EXT3_XATTR_TRANS_BLOCKS;
2212 } else { 2217 } else {
2213 /* 2218 /*
2214 * Fast symlink. We have to add entry to directory 2219 * Fast symlink. We have to add entry to directory
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index b57ea2f91269..7beb69ae0015 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -44,6 +44,9 @@
44#include "acl.h" 44#include "acl.h"
45#include "namei.h" 45#include "namei.h"
46 46
47#define CREATE_TRACE_POINTS
48#include <trace/events/ext3.h>
49
47#ifdef CONFIG_EXT3_DEFAULTS_TO_ORDERED 50#ifdef CONFIG_EXT3_DEFAULTS_TO_ORDERED
48 #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_ORDERED_DATA 51 #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_ORDERED_DATA
49#else 52#else
@@ -497,6 +500,14 @@ static struct inode *ext3_alloc_inode(struct super_block *sb)
497 return &ei->vfs_inode; 500 return &ei->vfs_inode;
498} 501}
499 502
503static int ext3_drop_inode(struct inode *inode)
504{
505 int drop = generic_drop_inode(inode);
506
507 trace_ext3_drop_inode(inode, drop);
508 return drop;
509}
510
500static void ext3_i_callback(struct rcu_head *head) 511static void ext3_i_callback(struct rcu_head *head)
501{ 512{
502 struct inode *inode = container_of(head, struct inode, i_rcu); 513 struct inode *inode = container_of(head, struct inode, i_rcu);
@@ -788,6 +799,7 @@ static const struct super_operations ext3_sops = {
788 .destroy_inode = ext3_destroy_inode, 799 .destroy_inode = ext3_destroy_inode,
789 .write_inode = ext3_write_inode, 800 .write_inode = ext3_write_inode,
790 .dirty_inode = ext3_dirty_inode, 801 .dirty_inode = ext3_dirty_inode,
802 .drop_inode = ext3_drop_inode,
791 .evict_inode = ext3_evict_inode, 803 .evict_inode = ext3_evict_inode,
792 .put_super = ext3_put_super, 804 .put_super = ext3_put_super,
793 .sync_fs = ext3_sync_fs, 805 .sync_fs = ext3_sync_fs,
@@ -2509,6 +2521,7 @@ static int ext3_sync_fs(struct super_block *sb, int wait)
2509{ 2521{
2510 tid_t target; 2522 tid_t target;
2511 2523
2524 trace_ext3_sync_fs(sb, wait);
2512 if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) { 2525 if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) {
2513 if (wait) 2526 if (wait)
2514 log_wait_commit(EXT3_SB(sb)->s_journal, target); 2527 log_wait_commit(EXT3_SB(sb)->s_journal, target);
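
CREATE_TRACE_POINTS must be defined in exactly one compilation unit before the trace header is included: that one include expands the TRACE_EVENT() descriptions into real tracepoint definitions, while every other file includes the header plainly and gets declarations only. That is why super.c carries the #define and the balloc.c, fsync.c, ialloc.c, inode.c and namei.c hunks above add only the bare #include:

    /* fs/ext3/super.c - the single TU that instantiates the events */
    #define CREATE_TRACE_POINTS
    #include <trace/events/ext3.h>

    /* everywhere else - declarations only */
    #include <trace/events/ext3.h>
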
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index 32e6cc23bd9a..d565759d82ee 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -803,8 +803,16 @@ inserted:
803 /* We need to allocate a new block */ 803 /* We need to allocate a new block */
804 ext3_fsblk_t goal = ext3_group_first_block_no(sb, 804 ext3_fsblk_t goal = ext3_group_first_block_no(sb,
805 EXT3_I(inode)->i_block_group); 805 EXT3_I(inode)->i_block_group);
806 ext3_fsblk_t block = ext3_new_block(handle, inode, 806 ext3_fsblk_t block;
807 goal, &error); 807
808 /*
809 * Protect us against concurrent allocations to the
810 * same inode from ext3_..._writepage(). Reservation
811 * code does not expect racing allocations.
812 */
813 mutex_lock(&EXT3_I(inode)->truncate_mutex);
814 block = ext3_new_block(handle, inode, goal, &error);
815 mutex_unlock(&EXT3_I(inode)->truncate_mutex);
808 if (error) 816 if (error)
809 goto cleanup; 817 goto cleanup;
810 ea_idebug(inode, "creating block %d", block); 818 ea_idebug(inode, "creating block %d", block);
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 04109460ba9e..56fd8f865930 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -7,7 +7,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
7ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ 7ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
8 ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ 8 ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
9 ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ 9 ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
10 mmp.o 10 mmp.o indirect.o
11 11
12ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o 12ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
13ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o 13ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index dca2d1ded931..a5c29bb3b835 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -198,12 +198,10 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
198 case ACL_TYPE_ACCESS: 198 case ACL_TYPE_ACCESS:
199 name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; 199 name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
200 if (acl) { 200 if (acl) {
201 mode_t mode = inode->i_mode; 201 error = posix_acl_equiv_mode(acl, &inode->i_mode);
202 error = posix_acl_equiv_mode(acl, &mode);
203 if (error < 0) 202 if (error < 0)
204 return error; 203 return error;
205 else { 204 else {
206 inode->i_mode = mode;
207 inode->i_ctime = ext4_current_time(inode); 205 inode->i_ctime = ext4_current_time(inode);
208 ext4_mark_inode_dirty(handle, inode); 206 ext4_mark_inode_dirty(handle, inode);
209 if (error == 0) 207 if (error == 0)
@@ -259,19 +257,16 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
259 inode->i_mode &= ~current_umask(); 257 inode->i_mode &= ~current_umask();
260 } 258 }
261 if (test_opt(inode->i_sb, POSIX_ACL) && acl) { 259 if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
262 mode_t mode = inode->i_mode;
263
264 if (S_ISDIR(inode->i_mode)) { 260 if (S_ISDIR(inode->i_mode)) {
265 error = ext4_set_acl(handle, inode, 261 error = ext4_set_acl(handle, inode,
266 ACL_TYPE_DEFAULT, acl); 262 ACL_TYPE_DEFAULT, acl);
267 if (error) 263 if (error)
268 goto cleanup; 264 goto cleanup;
269 } 265 }
270 error = posix_acl_create(&acl, GFP_NOFS, &mode); 266 error = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode);
271 if (error < 0) 267 if (error < 0)
272 return error; 268 return error;
273 269
274 inode->i_mode = mode;
275 if (error > 0) { 270 if (error > 0) {
276 /* This is an extended ACL */ 271 /* This is an extended ACL */
277 error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl); 272 error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl);
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 264f6949511e..f8224adf496e 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -620,3 +620,51 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group)
620 620
621} 621}
622 622
623/**
624 * ext4_inode_to_goal_block - return a hint for block allocation
625 * @inode: inode for block allocation
626 *
627 * Return the ideal location to start allocating blocks for a
628 * newly created inode.
629 */
630ext4_fsblk_t ext4_inode_to_goal_block(struct inode *inode)
631{
632 struct ext4_inode_info *ei = EXT4_I(inode);
633 ext4_group_t block_group;
634 ext4_grpblk_t colour;
635 int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
636 ext4_fsblk_t bg_start;
637 ext4_fsblk_t last_block;
638
639 block_group = ei->i_block_group;
640 if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
641 /*
642 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
643 * block groups per flexgroup, reserve the first block
644 * group for directories and special files. Regular
645 * files will start at the second block group. This
646 * tends to speed up directory access and improves
647 * fsck times.
648 */
649 block_group &= ~(flex_size-1);
650 if (S_ISREG(inode->i_mode))
651 block_group++;
652 }
653 bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
654 last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
655
656 /*
657 * If we are doing delayed allocation, we don't need to take
658 * colour into account.
659 */
660 if (test_opt(inode->i_sb, DELALLOC))
661 return bg_start;
662
663 if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
664 colour = (current->pid % 16) *
665 (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
666 else
667 colour = (current->pid % 16) * ((last_block - bg_start) / 16);
668 return bg_start + colour;
669}
670
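
A worked example of the goal computation in ext4_inode_to_goal_block(), assuming a flexgroup size of 16 (large enough to pass the EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME test); the numbers are illustrative:

    /*
     * Regular file whose inode sits in block group 37:
     *   block_group = 37 & ~(16 - 1) = 32    first group of the flexgroup
     *   block_group++              -> 33     S_ISREG skips group 32, which
     *                                        is kept for directories
     *   goal = first block of group 33
     *        + (current->pid % 16) * (blocks_per_group / 16)
     *   (the colour term is skipped entirely under delalloc)
     */
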
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index fac90f3fba80..8efb2f0a3447 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -246,3 +246,24 @@ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
246 return 1; 246 return 1;
247} 247}
248 248
249int ext4_check_blockref(const char *function, unsigned int line,
250 struct inode *inode, __le32 *p, unsigned int max)
251{
252 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
253 __le32 *bref = p;
254 unsigned int blk;
255
256 while (bref < p+max) {
257 blk = le32_to_cpu(*bref++);
258 if (blk &&
259 unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
260 blk, 1))) {
261 es->s_last_error_block = cpu_to_le64(blk);
262 ext4_error_inode(inode, function, line, blk,
263 "invalid block");
264 return -EIO;
265 }
266 }
267 return 0;
268}
269
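
ext4_check_blockref() walks an array of little-endian block references and, on the first invalid one, records it in the superblock error fields, logs it against the inode, and returns -EIO. A hypothetical caller, checking the direct block pointers cached in the in-core inode (illustrative only; the real callers live in the new indirect.c, outside this listing):

    int err;

    err = ext4_check_blockref(__func__, __LINE__, inode,
                              EXT4_I(inode)->i_data, EXT4_NDIR_BLOCKS);
    if (err)
            return err;     /* -EIO; the bad block was already reported */
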
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index fa44df879711..b7d7bd0f066e 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -175,6 +175,7 @@ struct mpage_da_data {
175 */ 175 */
176#define EXT4_IO_END_UNWRITTEN 0x0001 176#define EXT4_IO_END_UNWRITTEN 0x0001
177#define EXT4_IO_END_ERROR 0x0002 177#define EXT4_IO_END_ERROR 0x0002
178#define EXT4_IO_END_QUEUED 0x0004
178 179
179struct ext4_io_page { 180struct ext4_io_page {
180 struct page *p_page; 181 struct page *p_page;
@@ -526,6 +527,7 @@ struct ext4_new_group_data {
526#define EXT4_FREE_BLOCKS_METADATA 0x0001 527#define EXT4_FREE_BLOCKS_METADATA 0x0001
527#define EXT4_FREE_BLOCKS_FORGET 0x0002 528#define EXT4_FREE_BLOCKS_FORGET 0x0002
528#define EXT4_FREE_BLOCKS_VALIDATED 0x0004 529#define EXT4_FREE_BLOCKS_VALIDATED 0x0004
530#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008
529 531
530/* 532/*
531 * ioctl commands 533 * ioctl commands
@@ -939,6 +941,8 @@ struct ext4_inode_info {
939#define ext4_find_next_zero_bit find_next_zero_bit_le 941#define ext4_find_next_zero_bit find_next_zero_bit_le
940#define ext4_find_next_bit find_next_bit_le 942#define ext4_find_next_bit find_next_bit_le
941 943
944extern void ext4_set_bits(void *bm, int cur, int len);
945
942/* 946/*
943 * Maximal mount counts between two filesystem checks 947 * Maximal mount counts between two filesystem checks
944 */ 948 */
@@ -1126,7 +1130,8 @@ struct ext4_sb_info {
1130 struct journal_s *s_journal;
1131 struct list_head s_orphan;
1132 struct mutex s_orphan_lock;
1129 struct mutex s_resize_lock;
1133 unsigned long s_resize_flags; /* Flags indicating if there
1134 is a resizer */
1135 unsigned long s_commit_interval;
1136 u32 s_max_batch_time;
1137 u32 s_min_batch_time;
@@ -1214,6 +1219,9 @@ struct ext4_sb_info {
1219
1220 /* Kernel thread for multiple mount protection */
1221 struct task_struct *s_mmp_tsk;
1222
1223 /* record the last minlen when FITRIM is called. */
1224 atomic_t s_last_trim_minblks;
1225};
1226
1227static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1743,6 +1751,7 @@ extern unsigned ext4_init_block_bitmap(struct super_block *sb,
1751 struct ext4_group_desc *desc);
1752#define ext4_free_blocks_after_init(sb, group, desc) \
1753 ext4_init_block_bitmap(sb, NULL, group, desc)
1754ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);
1755
1756/* dir.c */
1757extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *,
@@ -1793,7 +1802,7 @@ extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
1802 unsigned long count, int flags);
1803extern int ext4_mb_add_groupinfo(struct super_block *sb,
1804 ext4_group_t i, struct ext4_group_desc *desc);
1796extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
1797 ext4_fsblk_t block, unsigned long count);
1805extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
1806 ext4_fsblk_t block, unsigned long count);
1807extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
1808
@@ -1834,6 +1843,17 @@ extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
1843extern qsize_t *ext4_get_reserved_space(struct inode *inode);
1844extern void ext4_da_update_reserve_space(struct inode *inode,
1845 int used, int quota_claim);
1846
1847/* indirect.c */
1848extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
1849 struct ext4_map_blocks *map, int flags);
1850extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
1851 const struct iovec *iov, loff_t offset,
1852 unsigned long nr_segs);
1853extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
1854extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk);
1855extern void ext4_ind_truncate(struct inode *inode);
1856
1857/* ioctl.c */
1858extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
1859extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
@@ -1855,6 +1875,9 @@ extern int ext4_group_extend(struct super_block *sb,
1875 ext4_fsblk_t n_blocks_count);
1876
1877/* super.c */
1878extern void *ext4_kvmalloc(size_t size, gfp_t flags);
1879extern void *ext4_kvzalloc(size_t size, gfp_t flags);
1880extern void ext4_kvfree(void *ptr);
1881extern void __ext4_error(struct super_block *, const char *, unsigned int,
1882 const char *, ...)
1883 __attribute__ ((format (printf, 4, 5)));
@@ -2067,11 +2090,19 @@ struct ext4_group_info {
2090 * 5 free 8-block regions. */
2091};
2092
2093#define EXT4_GROUP_INFO_NEED_INIT_BIT 0
2094#define EXT4_GROUP_INFO_WAS_TRIMMED_BIT 1
2095
2096#define EXT4_MB_GRP_NEED_INIT(grp) \
2097 (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
2098
2099#define EXT4_MB_GRP_WAS_TRIMMED(grp) \
2100 (test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
2101#define EXT4_MB_GRP_SET_TRIMMED(grp) \
2102 (set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
2103#define EXT4_MB_GRP_CLEAR_TRIMMED(grp) \
2104 (clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
2105
2106#define EXT4_MAX_CONTENTION 8
2107#define EXT4_CONTENTION_THRESHOLD 2
2108
@@ -2123,6 +2154,19 @@ static inline void ext4_mark_super_dirty(struct super_block *sb)
2154}
2155
2156/*
2157 * Block validity checking
2158 */
2159#define ext4_check_indirect_blockref(inode, bh) \
2160 ext4_check_blockref(__func__, __LINE__, inode, \
2161 (__le32 *)(bh)->b_data, \
2162 EXT4_ADDR_PER_BLOCK((inode)->i_sb))
2163
2164#define ext4_ind_check_inode(inode) \
2165 ext4_check_blockref(__func__, __LINE__, inode, \
2166 EXT4_I(inode)->i_data, \
2167 EXT4_NDIR_BLOCKS)
2168
2169/*
2170 * Inodes and files operations
2171 */
2172
@@ -2151,6 +2195,8 @@ extern void ext4_exit_system_zone(void);
2195extern int ext4_data_block_valid(struct ext4_sb_info *sbi,
2196 ext4_fsblk_t start_blk,
2197 unsigned int count);
2198extern int ext4_check_blockref(const char *, unsigned int,
2199 struct inode *, __le32 *, unsigned int);
2200
2201/* extents.c */
2202extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
@@ -2230,6 +2276,10 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
2276extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
2277extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
2278
2279#define EXT4_RESIZING 0
2280extern int ext4_resize_begin(struct super_block *sb);
2281extern void ext4_resize_end(struct super_block *sb);
2282
2283#endif	/* __KERNEL__ */
2284
2285#endif	/* _EXT4_H */
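Taken together, the new EXT4_MB_GRP_*_TRIMMED macros and s_last_trim_minblks let FITRIM skip block groups that were already trimmed and whose free space has not changed since. One way a caller might compose them (assumed caller logic for illustration; the actual mballoc changes are not shown in this hunk):

	/* Sketch: skip a group already trimmed with an equal or smaller
	 * minimum extent length; otherwise trim it and mark it. */
	static int maybe_trim_group(struct ext4_sb_info *sbi,
				    struct ext4_group_info *grp,
				    unsigned long minblks)
	{
		if (EXT4_MB_GRP_WAS_TRIMMED(grp) &&
		    minblks >= atomic_read(&sbi->s_last_trim_minblks))
			return 0;	/* nothing new to discard */

		/* ... issue discards for the group's free extents ... */

		EXT4_MB_GRP_SET_TRIMMED(grp);
		return 1;
	}

EXT4_MB_GRP_CLEAR_TRIMMED() would then be invoked from the block-freeing path, since freeing creates new, untrimmed free extents.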
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index bb85757689b6..5802fa1dab18 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -289,10 +289,10 @@ static inline int ext4_should_order_data(struct inode *inode)
289
290static inline int ext4_should_writeback_data(struct inode *inode)
291{
292 if (!S_ISREG(inode->i_mode))
293 return 0;
292	if (EXT4_JOURNAL(inode) == NULL)
293		return 1;
294 if (!S_ISREG(inode->i_mode))
295 return 0;
296	if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
297		return 0;
298	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
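The reordering above changes behaviour only for non-regular inodes on no-journal filesystems: with EXT4_JOURNAL() == NULL, writeback semantics must be reported for every inode, so that check now runs first. The full post-patch predicate, with the unchanged tail filled in from context (a sketch, assuming the function ends by returning 1 for data=writeback and 0 otherwise, as in mainline):

	static inline int should_writeback_data(struct inode *inode)
	{
		if (EXT4_JOURNAL(inode) == NULL)
			return 1;	/* no journal: always writeback */
		if (!S_ISREG(inode->i_mode))
			return 0;
		if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
			return 0;
		if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
			return 1;
		return 0;
	}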
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index f815cc81e7a2..57cf568a98ab 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -114,12 +114,6 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
114				struct ext4_ext_path *path,
115				ext4_lblk_t block)
116{
117 struct ext4_inode_info *ei = EXT4_I(inode);
118 ext4_fsblk_t bg_start;
119 ext4_fsblk_t last_block;
120 ext4_grpblk_t colour;
121 ext4_group_t block_group;
122 int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
117	int depth;
118
119	if (path) {
@@ -161,36 +155,7 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
155	}
156
157	/* OK. use inode's group */
164	block_group = ei->i_block_group;
165 if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
166 /*
167 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
168 * block groups per flexgroup, reserve the first block
169 * group for directories and special files. Regular
170 * files will start at the second block group. This
171 * tends to speed up directory access and improves
172 * fsck times.
173 */
174 block_group &= ~(flex_size-1);
175 if (S_ISREG(inode->i_mode))
176 block_group++;
177 }
178 bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
179 last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
180
181 /*
182	 * If we are doing delayed allocation, we don't need to take
183 * colour into account.
184 */
185 if (test_opt(inode->i_sb, DELALLOC))
186 return bg_start;
187
188 if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
189 colour = (current->pid % 16) *
190 (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
191 else
192 colour = (current->pid % 16) * ((last_block - bg_start) / 16);
193 return bg_start + colour + block;
158	return ext4_inode_to_goal_block(inode);
159}
160
161/*
@@ -776,6 +741,16 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
776 logical, le32_to_cpu(curp->p_idx->ei_block)); 741 logical, le32_to_cpu(curp->p_idx->ei_block));
777 return -EIO; 742 return -EIO;
778 } 743 }
744
745 if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
746 >= le16_to_cpu(curp->p_hdr->eh_max))) {
747 EXT4_ERROR_INODE(inode,
748 "eh_entries %d >= eh_max %d!",
749 le16_to_cpu(curp->p_hdr->eh_entries),
750 le16_to_cpu(curp->p_hdr->eh_max));
751 return -EIO;
752 }
753
754	len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
755	if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
756		/* insert after */
@@ -805,13 +780,6 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
780	ext4_idx_store_pblock(ix, ptr);
781	le16_add_cpu(&curp->p_hdr->eh_entries, 1);
782
808 if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
809 > le16_to_cpu(curp->p_hdr->eh_max))) {
810 EXT4_ERROR_INODE(inode,
811 "logical %d == ei_block %d!",
812 logical, le32_to_cpu(curp->p_idx->ei_block));
813 return -EIO;
814 }
783	if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
784		EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
785		return -EIO;
@@ -1446,8 +1414,7 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path)
1414 * ext4_ext_next_leaf_block:
1415 * returns first allocated block from next leaf or EXT_MAX_BLOCKS
1416 */
1449static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode,
1450					struct ext4_ext_path *path)
1417static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path)
1418{
1419	int depth;
1420
@@ -1757,7 +1724,6 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1724		goto merge;
1725	}
1726
1760repeat:
1727	depth = ext_depth(inode);
1728	eh = path[depth].p_hdr;
1729	if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
@@ -1765,9 +1731,10 @@ repeat:
1731
1732	/* probably next leaf has space for us? */
1733	fex = EXT_LAST_EXTENT(eh);
1768	next = ext4_ext_next_leaf_block(inode, path);
1769	if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)
1770	    && next != EXT_MAX_BLOCKS) {
1734	next = EXT_MAX_BLOCKS;
1735	if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block))
1736		next = ext4_ext_next_leaf_block(path);
1737	if (next != EXT_MAX_BLOCKS) {
1738		ext_debug("next leaf block - %d\n", next);
1739		BUG_ON(npath != NULL);
1740		npath = ext4_ext_find_extent(inode, next, NULL);
@@ -1779,7 +1746,7 @@ repeat:
1746			ext_debug("next leaf isn't full(%d)\n",
1747				  le16_to_cpu(eh->eh_entries));
1748			path = npath;
1782			goto repeat;
1749			goto has_space;
1750		}
1751		ext_debug("next leaf has no free space(%d,%d)\n",
1752			  le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
@@ -1839,7 +1806,7 @@ has_space:
1806				ext4_ext_pblock(newext),
1807				ext4_ext_is_uninitialized(newext),
1808				ext4_ext_get_actual_len(newext),
1842				nearex, len, nearex + 1, nearex + 2);
1809				nearex, len, nearex, nearex + 1);
1810		memmove(nearex + 1, nearex, len);
1811		path[depth].p_ext = nearex;
1812	}
@@ -2052,7 +2019,7 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
2019}
2020
2021/*
2055 * ext4_ext_in_cache()
2022 * ext4_ext_check_cache()
2023 * Checks to see if the given block is in the cache.
2024 * If it is, the cached extent is stored in the given
2025 * cache extent pointer.  If the cached extent is a hole,
@@ -2134,8 +2101,6 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
2101/*
2102 * ext4_ext_rm_idx:
2103 * removes index from the index block.
2137 * It's used in truncate case only, thus all requests are for
2138 * last index in the block only.
2104 */
2105static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
2106			struct ext4_ext_path *path)
@@ -2153,6 +2118,13 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
2118	err = ext4_ext_get_access(handle, inode, path);
2119	if (err)
2120		return err;
2121
2122 if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) {
2123 int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx;
2124 len *= sizeof(struct ext4_extent_idx);
2125 memmove(path->p_idx, path->p_idx + 1, len);
2126 }
2127
2128	le16_add_cpu(&path->p_hdr->eh_entries, -1);
2129	err = ext4_ext_dirty(handle, inode, path);
2130	if (err)
@@ -2534,8 +2506,7 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path)
2506		return 1;
2507}
2508
2537static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
2538				ext4_lblk_t end)
2509static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2510{
2511	struct super_block *sb = inode->i_sb;
2512	int depth = ext_depth(inode);
@@ -2575,7 +2546,7 @@ again:
2546		if (i == depth) {
2547			/* this is leaf block */
2548			err = ext4_ext_rm_leaf(handle, inode, path,
2578					start, end);
2549					start, EXT_MAX_BLOCKS - 1);
2550			/* root level has p_bh == NULL, brelse() eats this */
2551			brelse(path[i].p_bh);
2552			path[i].p_bh = NULL;
@@ -3107,12 +3078,10 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3078					struct ext4_ext_path *path)
3079{
3080	struct ext4_extent *ex;
3110	struct ext4_extent_header *eh;
3081	int depth;
3082	int err = 0;
3083
3084	depth = ext_depth(inode);
3115	eh = path[depth].p_hdr;
3085	ex = path[depth].p_ext;
3086
3087	ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical"
@@ -3357,8 +3326,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3326	trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
3327
3328	/* check in cache */
3360	if (ext4_ext_in_cache(inode, map->m_lblk, &newex) &&
3361		((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0)) {
3329	if (!(flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) &&
3330	    ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
3331		if (!newex.ee_start_lo && !newex.ee_start_hi) {
3332			if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3333				/*
@@ -3497,8 +3466,27 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3466
3467			ext4_ext_mark_uninitialized(ex);
3468
3500			err = ext4_ext_remove_space(inode, map->m_lblk,
3501				map->m_lblk + punched_out);
3469			ext4_ext_invalidate_cache(inode);
3470
3471 err = ext4_ext_rm_leaf(handle, inode, path,
3472 map->m_lblk, map->m_lblk + punched_out);
3473
3474 if (!err && path->p_hdr->eh_entries == 0) {
3475 /*
3476 * Punch hole freed all of this sub tree,
3477 * so we need to correct eh_depth
3478 */
3479 err = ext4_ext_get_access(handle, inode, path);
3480 if (err == 0) {
3481 ext_inode_hdr(inode)->eh_depth = 0;
3482 ext_inode_hdr(inode)->eh_max =
3483 cpu_to_le16(ext4_ext_space_root(
3484 inode, 0));
3485
3486 err = ext4_ext_dirty(
3487 handle, inode, path);
3488 }
3489 }
3490
3491			goto out2;
3492		}
@@ -3596,17 +3584,18 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3584	}
3585
3586	err = check_eofblocks_fl(handle, inode, map->m_lblk, path, ar.len);
3599	if (err)
3600		goto out2;
3601
3602	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
3587	if (!err)
3588		err = ext4_ext_insert_extent(handle, inode, path,
3589					     &newex, flags);
3590	if (err) {
3591 int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
3592 EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
3593		/* free data blocks we just allocated */
3594		/* not a good idea to call discard here directly,
3595		 * but otherwise we'd need to call it every free() */
3596		ext4_discard_preallocations(inode);
3597		ext4_free_blocks(handle, inode, NULL, ext4_ext_pblock(&newex),
3609				 ext4_ext_get_actual_len(&newex), 0);
3598				 ext4_ext_get_actual_len(&newex), fb_flags);
3599		goto out2;
3600	}
3601
@@ -3699,7 +3688,7 @@ void ext4_ext_truncate(struct inode *inode)
3688
3689	last_block = (inode->i_size + sb->s_blocksize - 1)
3690			>> EXT4_BLOCK_SIZE_BITS(sb);
3702	err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
3691	err = ext4_ext_remove_space(inode, last_block);
3692
3693	/* In a multi-transaction truncate, we only make the final
3694	 * transaction synchronous.
@@ -3835,7 +3824,7 @@ retry:
3824						blkbits) >> blkbits))
3825			new_size = offset + len;
3826		else
3838			new_size = (map.m_lblk + ret) << blkbits;
3827			new_size = ((loff_t) map.m_lblk + ret) << blkbits;
3828
3829		ext4_falloc_update_inode(inode, mode, new_size,
3830					 (map.m_flags & EXT4_MAP_NEW));
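Among the extents.c changes above, the new memmove in ext4_ext_rm_idx() is what lets punch-hole delete an index from the middle of a node: entries to the right of p_idx are shifted left before eh_entries is decremented, where truncate previously only ever removed the last index (hence the deleted comment). The shift in isolation (generic types and a hypothetical struct, for illustration):

	#include <string.h>

	struct idx { unsigned int ei_block; unsigned long long pblk; };

	/* Remove entries[pos] from an n-entry array by shifting the tail
	 * left - the same memmove ext4_ext_rm_idx() now performs when the
	 * victim is not the last index; the caller then drops eh_entries. */
	static void rm_idx(struct idx *entries, int n, int pos)
	{
		if (pos < n - 1)
			memmove(&entries[pos], &entries[pos + 1],
				(n - 1 - pos) * sizeof(struct idx));
	}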
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index da3bed3e0c29..036f78f7a1ef 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -129,15 +129,30 @@ static int ext4_sync_parent(struct inode *inode)
129{
130	struct writeback_control wbc;
131	struct dentry *dentry = NULL;
132	struct inode *next;
133	int ret = 0;
134
134	while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) {
135	if (!ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY))
136 return 0;
137 inode = igrab(inode);
138 while (ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) {
139		ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY);
136		dentry = list_entry(inode->i_dentry.next,
137					struct dentry, d_alias);
138		if (!dentry || !dentry->d_parent || !dentry->d_parent->d_inode)
140		dentry = NULL;
141		spin_lock(&inode->i_lock);
142		if (!list_empty(&inode->i_dentry)) {
143 dentry = list_first_entry(&inode->i_dentry,
144 struct dentry, d_alias);
145 dget(dentry);
146 }
147 spin_unlock(&inode->i_lock);
148 if (!dentry)
149			break;
140		inode = dentry->d_parent->d_inode;
150		next = igrab(dentry->d_parent->d_inode);
151 dput(dentry);
152 if (!next)
153 break;
154 iput(inode);
155 inode = next;
156		ret = sync_mapping_buffers(inode->i_mapping);
157		if (ret)
158			break;
@@ -148,6 +163,7 @@ static int ext4_sync_parent(struct inode *inode)
163		if (ret)
164			break;
165	}
166 iput(inode);
167	return ret;
168}
169
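The rewritten ext4_sync_parent() walks toward the root hand-over-hand: it pins the parent (igrab()) before releasing the child (iput()), and only touches i_dentry under i_lock with the dentry pinned by dget(). The reference pattern, reduced to a self-contained sketch (generic refcounted nodes, not the VFS API):

	struct node {
		struct node *parent;
		int refcount;
	};

	static struct node *node_get(struct node *n) { n->refcount++; return n; }
	static void node_put(struct node *n) { n->refcount--; }

	/* Pin the next node before unpinning the current one, so an
	 * unreferenced pointer is never dereferenced - the shape of the
	 * igrab()/iput() dance in ext4_sync_parent() above. */
	static void walk_to_root(struct node *start)
	{
		struct node *cur = node_get(start);

		while (cur->parent) {
			struct node *next = node_get(cur->parent);

			node_put(cur);
			cur = next;
			/* ... per-node work (sync_mapping_buffers() etc.) ... */
		}
		node_put(cur);
	}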
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 21bb2f61e502..9c63f273b550 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1287,7 +1287,7 @@ extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
1287			       group, used_blks,
1288			       ext4_itable_unused_count(sb, gdp));
1289		ret = 1;
1290		goto out;
1290		goto err_out;
1291	}
1292
1293	blk = ext4_inode_table(sb, gdp) + used_blks;
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
new file mode 100644
index 000000000000..0962642119c0
--- /dev/null
+++ b/fs/ext4/indirect.c
@@ -0,0 +1,1487 @@
1/*
2 * linux/fs/ext4/indirect.c
3 *
4 * from
5 *
6 * linux/fs/ext4/inode.c
7 *
8 * Copyright (C) 1992, 1993, 1994, 1995
9 * Remy Card (card@masi.ibp.fr)
10 * Laboratoire MASI - Institut Blaise Pascal
11 * Universite Pierre et Marie Curie (Paris VI)
12 *
13 * from
14 *
15 * linux/fs/minix/inode.c
16 *
17 * Copyright (C) 1991, 1992 Linus Torvalds
18 *
19 * Goal-directed block allocation by Stephen Tweedie
20 * (sct@redhat.com), 1993, 1998
21 */
22
23#include <linux/module.h>
24#include "ext4_jbd2.h"
25#include "truncate.h"
26
27#include <trace/events/ext4.h>
28
29typedef struct {
30 __le32 *p;
31 __le32 key;
32 struct buffer_head *bh;
33} Indirect;
34
35static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
36{
37 p->key = *(p->p = v);
38 p->bh = bh;
39}
40
41/**
42 * ext4_block_to_path - parse the block number into array of offsets
43 * @inode: inode in question (we are only interested in its superblock)
44 * @i_block: block number to be parsed
45 * @offsets: array to store the offsets in
46 * @boundary: set this non-zero if the referred-to block is likely to be
47 * followed (on disk) by an indirect block.
48 *
49 * To store the locations of a file's data, ext4 uses a data structure common
50 * to UNIX filesystems - a tree of pointers anchored in the inode, with
51 * data blocks at the leaves and indirect blocks in intermediate nodes.
52 * This function translates the block number into a path in that tree -
53 * the return value is the path length and @offsets[n] is the offset of
54 * the pointer to the (n+1)th node in the nth one. If @block is out of range
55 * (negative or too large), a warning is printed and zero is returned.
56 *
57 * Note: function doesn't find node addresses, so no IO is needed. All
58 * we need to know is the capacity of indirect blocks (taken from the
59 * inode->i_sb).
60 */
61
62/*
63 * Portability note: the last comparison (check that we fit into triple
64 * indirect block) is spelled differently, because otherwise on an
65 * architecture with 32-bit longs and 8Kb pages we might get into trouble
66 * if our filesystem had 8Kb blocks. We might use long long, but that would
67 * kill us on x86. Oh, well, at least the sign propagation does not matter -
68 * i_block would have to be negative in the very beginning, so we would not
69 * get there at all.
70 */
71
72static int ext4_block_to_path(struct inode *inode,
73 ext4_lblk_t i_block,
74 ext4_lblk_t offsets[4], int *boundary)
75{
76 int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);
77 int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);
78 const long direct_blocks = EXT4_NDIR_BLOCKS,
79 indirect_blocks = ptrs,
80 double_blocks = (1 << (ptrs_bits * 2));
81 int n = 0;
82 int final = 0;
83
84 if (i_block < direct_blocks) {
85 offsets[n++] = i_block;
86 final = direct_blocks;
87 } else if ((i_block -= direct_blocks) < indirect_blocks) {
88 offsets[n++] = EXT4_IND_BLOCK;
89 offsets[n++] = i_block;
90 final = ptrs;
91 } else if ((i_block -= indirect_blocks) < double_blocks) {
92 offsets[n++] = EXT4_DIND_BLOCK;
93 offsets[n++] = i_block >> ptrs_bits;
94 offsets[n++] = i_block & (ptrs - 1);
95 final = ptrs;
96 } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
97 offsets[n++] = EXT4_TIND_BLOCK;
98 offsets[n++] = i_block >> (ptrs_bits * 2);
99 offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
100 offsets[n++] = i_block & (ptrs - 1);
101 final = ptrs;
102 } else {
103 ext4_warning(inode->i_sb, "block %lu > max in inode %lu",
104 i_block + direct_blocks +
105 indirect_blocks + double_blocks, inode->i_ino);
106 }
107 if (boundary)
108 *boundary = final - 1 - (i_block & (ptrs - 1));
109 return n;
110}
111
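With 4KiB blocks, EXT4_ADDR_PER_BLOCK is 1024 (ptrs_bits = 10), so the function above classifies logical blocks into four bands: 0-11 direct, then 1024 singly-indirect, 1024^2 doubly-indirect and 1024^3 triply-indirect blocks. A worked check of the path computation (user-space re-implementation under those assumptions, first three bands only):

	#include <stdio.h>

	#define NDIR 12		/* EXT4_NDIR_BLOCKS */
	#define PTRS 1024	/* EXT4_ADDR_PER_BLOCK with 4KiB blocks */
	#define PTRS_BITS 10

	/* Mirror of ext4_block_to_path() for the first three bands. */
	static int block_to_path(long i_block, long offsets[4])
	{
		int n = 0;

		if (i_block < NDIR) {
			offsets[n++] = i_block;
		} else if ((i_block -= NDIR) < PTRS) {
			offsets[n++] = 12;		/* EXT4_IND_BLOCK */
			offsets[n++] = i_block;
		} else if ((i_block -= PTRS) < (1L << (PTRS_BITS * 2))) {
			offsets[n++] = 13;		/* EXT4_DIND_BLOCK */
			offsets[n++] = i_block >> PTRS_BITS;
			offsets[n++] = i_block & (PTRS - 1);
		}
		return n;
	}

	int main(void)
	{
		long off[4] = {0};
		int depth = block_to_path(5000, off);

		/* 5000 - 12 - 1024 = 3964 -> slot 3 of the dindirect block,
		 * entry 892 of that indirect block */
		printf("depth %d: %ld %ld %ld\n", depth, off[0], off[1], off[2]);
		return 0;
	}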
112/**
113 * ext4_get_branch - read the chain of indirect blocks leading to data
114 * @inode: inode in question
115 * @depth: depth of the chain (1 - direct pointer, etc.)
116 * @offsets: offsets of pointers in inode/indirect blocks
117 * @chain: place to store the result
118 * @err: here we store the error value
119 *
120 * Function fills the array of triples <key, p, bh> and returns %NULL
121 * if everything went OK or the pointer to the last filled triple
122 * (incomplete one) otherwise. Upon the return chain[i].key contains
123 * the number of (i+1)-th block in the chain (as it is stored in memory,
124 * i.e. little-endian 32-bit), chain[i].p contains the address of that
125 * number (it points into struct inode for i==0 and into the bh->b_data
126 * for i>0) and chain[i].bh points to the buffer_head of i-th indirect
127 * block for i>0 and NULL for i==0. In other words, it holds the block
128 * numbers of the chain, addresses they were taken from (and where we can
129 * verify that chain did not change) and buffer_heads hosting these
130 * numbers.
131 *
132 * Function stops when it stumbles upon zero pointer (absent block)
133 * (pointer to last triple returned, *@err == 0)
134 * or when it gets an IO error reading an indirect block
135 * (ditto, *@err == -EIO)
136 * or when it reads all @depth-1 indirect blocks successfully and finds
137 * the whole chain, all way to the data (returns %NULL, *err == 0).
138 *
139 * Need to be called with
140 * down_read(&EXT4_I(inode)->i_data_sem)
141 */
142static Indirect *ext4_get_branch(struct inode *inode, int depth,
143 ext4_lblk_t *offsets,
144 Indirect chain[4], int *err)
145{
146 struct super_block *sb = inode->i_sb;
147 Indirect *p = chain;
148 struct buffer_head *bh;
149
150 *err = 0;
151 /* i_data is not going away, no lock needed */
152 add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets);
153 if (!p->key)
154 goto no_block;
155 while (--depth) {
156 bh = sb_getblk(sb, le32_to_cpu(p->key));
157 if (unlikely(!bh))
158 goto failure;
159
160 if (!bh_uptodate_or_lock(bh)) {
161 if (bh_submit_read(bh) < 0) {
162 put_bh(bh);
163 goto failure;
164 }
165 /* validate block references */
166 if (ext4_check_indirect_blockref(inode, bh)) {
167 put_bh(bh);
168 goto failure;
169 }
170 }
171
172 add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
173 /* Reader: end */
174 if (!p->key)
175 goto no_block;
176 }
177 return NULL;
178
179failure:
180 *err = -EIO;
181no_block:
182 return p;
183}
184
185/**
186 * ext4_find_near - find a place for allocation with sufficient locality
187 * @inode: owner
188 * @ind: descriptor of indirect block.
189 *
190 * This function returns the preferred place for block allocation.
191 * It is used when heuristic for sequential allocation fails.
192 * Rules are:
193 * + if there is a block to the left of our position - allocate near it.
194 * + if pointer will live in indirect block - allocate near that block.
195 * + if pointer will live in inode - allocate in the same
196 * cylinder group.
197 *
198 * In the latter case we colour the starting block by the callers PID to
199 * prevent it from clashing with concurrent allocations for a different inode
200 * in the same block group. The PID is used here so that functionally related
201 * files will be close-by on-disk.
202 *
203 * Caller must make sure that @ind is valid and will stay that way.
204 */
205static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
206{
207 struct ext4_inode_info *ei = EXT4_I(inode);
208 __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data;
209 __le32 *p;
210
211 /* Try to find previous block */
212 for (p = ind->p - 1; p >= start; p--) {
213 if (*p)
214 return le32_to_cpu(*p);
215 }
216
217 /* No such thing, so let's try location of indirect block */
218 if (ind->bh)
219 return ind->bh->b_blocknr;
220
221 /*
222 * It is going to be referred to from the inode itself? OK, just put it
223 * into the same cylinder group then.
224 */
225 return ext4_inode_to_goal_block(inode);
226}
227
228/**
229 * ext4_find_goal - find a preferred place for allocation.
230 * @inode: owner
231 * @block: block we want
232 * @partial: pointer to the last triple within a chain
233 *
234 * Normally this function finds the preferred place for block allocation
235 * and returns it.
236 * Because this is only used for non-extent files, we limit the block nr
237 * to 32 bits.
238 */
239static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
240 Indirect *partial)
241{
242 ext4_fsblk_t goal;
243
244 /*
245 * XXX need to get goal block from mballoc's data structures
246 */
247
248 goal = ext4_find_near(inode, partial);
249 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
250 return goal;
251}
252
253/**
254 * ext4_blks_to_allocate - Look up the block map and count the number
255 * of direct blocks that need to be allocated for the given branch.
256 *
257 * @branch: chain of indirect blocks
258 * @k: number of blocks needed for indirect blocks
259 * @blks: number of data blocks to be mapped.
260 * @blocks_to_boundary: the offset in the indirect block
261 *
262 * Return the total number of blocks to be allocated, including the
263 * direct and indirect blocks.
264 */
265static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
266 int blocks_to_boundary)
267{
268 unsigned int count = 0;
269
270 /*
271	 * Simple case: the [t,d]Indirect block(s) have not been allocated yet,
272	 * so it is clear that the blocks on that path have not been allocated either.
273 */
274 if (k > 0) {
275 /* right now we don't handle cross boundary allocation */
276 if (blks < blocks_to_boundary + 1)
277 count += blks;
278 else
279 count += blocks_to_boundary + 1;
280 return count;
281 }
282
283 count++;
284 while (count < blks && count <= blocks_to_boundary &&
285 le32_to_cpu(*(branch[0].p + count)) == 0) {
286 count++;
287 }
288 return count;
289}
290
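Two cases fall out of the function above: if any indirect block on the path is missing (k > 0), the whole branch is new, so the count is bounded only by the request and the boundary; with k == 0 it scans the existing pointer array and stops at the first slot that is already mapped. The k == 0 scan on its own (user-space sketch, host-endian instead of __le32):

	/* Count leading holes in an existing pointer array, capped by the
	 * request size and by the indirect-block boundary. */
	static unsigned int blks_to_allocate(const unsigned int *map,
					     unsigned int blks,
					     int blocks_to_boundary)
	{
		unsigned int count = 1;	/* the first direct block is always needed */

		while (count < blks && (int)count <= blocks_to_boundary &&
		       map[count] == 0)
			count++;
		return count;
	}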
291/**
292 * ext4_alloc_blocks: multiple allocate blocks needed for a branch
293 * @handle: handle for this transaction
294 * @inode: inode which needs allocated blocks
295 * @iblock: the logical block to start allocated at
296 * @goal: preferred physical block of allocation
297 *	@indirect_blks: the number of blocks that need to be allocated for
298 *	indirect blocks
299 * @blks: number of desired blocks
300 * @new_blocks: on return it will store the new block numbers for
301 * the indirect blocks(if needed) and the first direct block,
302 * @err: on return it will store the error code
303 *
304 * This function will return the number of blocks allocated as
305 * requested by the passed-in parameters.
306 */
307static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
308 ext4_lblk_t iblock, ext4_fsblk_t goal,
309 int indirect_blks, int blks,
310 ext4_fsblk_t new_blocks[4], int *err)
311{
312 struct ext4_allocation_request ar;
313 int target, i;
314 unsigned long count = 0, blk_allocated = 0;
315 int index = 0;
316 ext4_fsblk_t current_block = 0;
317 int ret = 0;
318
319 /*
320 * Here we try to allocate the requested multiple blocks at once,
321 * on a best-effort basis.
322 * To build a branch, we should allocate blocks for
323	 * the indirect blocks (if not allocated yet) and at least
324	 * the first direct block of this branch. That's the
325	 * minimum number of blocks we need to allocate (required).
326 */
327 /* first we try to allocate the indirect blocks */
328 target = indirect_blks;
329 while (target > 0) {
330 count = target;
331 /* allocating blocks for indirect blocks and direct blocks */
332 current_block = ext4_new_meta_blocks(handle, inode, goal,
333 0, &count, err);
334 if (*err)
335 goto failed_out;
336
337 if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) {
338 EXT4_ERROR_INODE(inode,
339 "current_block %llu + count %lu > %d!",
340 current_block, count,
341 EXT4_MAX_BLOCK_FILE_PHYS);
342 *err = -EIO;
343 goto failed_out;
344 }
345
346 target -= count;
347 /* allocate blocks for indirect blocks */
348 while (index < indirect_blks && count) {
349 new_blocks[index++] = current_block++;
350 count--;
351 }
352 if (count > 0) {
353 /*
354 * save the new block number
355 * for the first direct block
356 */
357 new_blocks[index] = current_block;
358 printk(KERN_INFO "%s returned more blocks than "
359 "requested\n", __func__);
360 WARN_ON(1);
361 break;
362 }
363 }
364
365	target = blks - count;
366 blk_allocated = count;
367 if (!target)
368 goto allocated;
369 /* Now allocate data blocks */
370 memset(&ar, 0, sizeof(ar));
371 ar.inode = inode;
372 ar.goal = goal;
373 ar.len = target;
374 ar.logical = iblock;
375 if (S_ISREG(inode->i_mode))
376 /* enable in-core preallocation only for regular files */
377 ar.flags = EXT4_MB_HINT_DATA;
378
379 current_block = ext4_mb_new_blocks(handle, &ar, err);
380 if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) {
381 EXT4_ERROR_INODE(inode,
382 "current_block %llu + ar.len %d > %d!",
383 current_block, ar.len,
384 EXT4_MAX_BLOCK_FILE_PHYS);
385 *err = -EIO;
386 goto failed_out;
387 }
388
389 if (*err && (target == blks)) {
390 /*
391 * if the allocation failed and we didn't allocate
392 * any blocks before
393 */
394 goto failed_out;
395 }
396 if (!*err) {
397 if (target == blks) {
398 /*
399 * save the new block number
400 * for the first direct block
401 */
402 new_blocks[index] = current_block;
403 }
404 blk_allocated += ar.len;
405 }
406allocated:
407 /* total number of blocks allocated for direct blocks */
408 ret = blk_allocated;
409 *err = 0;
410 return ret;
411failed_out:
412 for (i = 0; i < index; i++)
413 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
414 return ret;
415}
416
417/**
418 * ext4_alloc_branch - allocate and set up a chain of blocks.
419 * @handle: handle for this transaction
420 * @inode: owner
421 * @indirect_blks: number of allocated indirect blocks
422 * @blks: number of allocated direct blocks
423 * @goal: preferred place for allocation
424 * @offsets: offsets (in the blocks) to store the pointers to next.
425 * @branch: place to store the chain in.
426 *
427 * This function allocates blocks, zeroes out all but the last one,
428 * links them into chain and (if we are synchronous) writes them to disk.
429 * In other words, it prepares a branch that can be spliced onto the
430 * inode. It stores the information about that chain in the branch[], in
431 * the same format as ext4_get_branch() would do. We are calling it after
432 * we had read the existing part of chain and partial points to the last
433 * triple of that (one with zero ->key). Upon the exit we have the same
434 * picture as after the successful ext4_get_block(), except that in one
435 * place chain is disconnected - *branch->p is still zero (we did not
436 * set the last link), but branch->key contains the number that should
437 * be placed into *branch->p to fill that gap.
438 *
439 * If allocation fails we free all blocks we've allocated (and forget
440 * their buffer_heads) and return the error value from the failed
441 * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain
442 * as described above and return 0.
443 */
444static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
445 ext4_lblk_t iblock, int indirect_blks,
446 int *blks, ext4_fsblk_t goal,
447 ext4_lblk_t *offsets, Indirect *branch)
448{
449 int blocksize = inode->i_sb->s_blocksize;
450 int i, n = 0;
451 int err = 0;
452 struct buffer_head *bh;
453 int num;
454 ext4_fsblk_t new_blocks[4];
455 ext4_fsblk_t current_block;
456
457 num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks,
458 *blks, new_blocks, &err);
459 if (err)
460 return err;
461
462 branch[0].key = cpu_to_le32(new_blocks[0]);
463 /*
464 * metadata blocks and data blocks are allocated.
465 */
466 for (n = 1; n <= indirect_blks; n++) {
467 /*
468 * Get buffer_head for parent block, zero it out
469 * and set the pointer to new one, then send
470 * parent to disk.
471 */
472 bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
473 if (unlikely(!bh)) {
474 err = -EIO;
475 goto failed;
476 }
477
478 branch[n].bh = bh;
479 lock_buffer(bh);
480 BUFFER_TRACE(bh, "call get_create_access");
481 err = ext4_journal_get_create_access(handle, bh);
482 if (err) {
483 /* Don't brelse(bh) here; it's done in
484 * ext4_journal_forget() below */
485 unlock_buffer(bh);
486 goto failed;
487 }
488
489 memset(bh->b_data, 0, blocksize);
490 branch[n].p = (__le32 *) bh->b_data + offsets[n];
491 branch[n].key = cpu_to_le32(new_blocks[n]);
492 *branch[n].p = branch[n].key;
493 if (n == indirect_blks) {
494 current_block = new_blocks[n];
495 /*
496 * End of chain, update the last new metablock of
497 * the chain to point to the new allocated
498 * data blocks numbers
499 */
500 for (i = 1; i < num; i++)
501 *(branch[n].p + i) = cpu_to_le32(++current_block);
502 }
503 BUFFER_TRACE(bh, "marking uptodate");
504 set_buffer_uptodate(bh);
505 unlock_buffer(bh);
506
507 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
508 err = ext4_handle_dirty_metadata(handle, inode, bh);
509 if (err)
510 goto failed;
511 }
512 *blks = num;
513 return err;
514failed:
515 /* Allocation failed, free what we already allocated */
516 ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0);
517 for (i = 1; i <= n ; i++) {
518 /*
519 * branch[i].bh is newly allocated, so there is no
520 * need to revoke the block, which is why we don't
521 * need to set EXT4_FREE_BLOCKS_METADATA.
522 */
523 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1,
524 EXT4_FREE_BLOCKS_FORGET);
525 }
526 for (i = n+1; i < indirect_blks; i++)
527 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
528
529 ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0);
530
531 return err;
532}
533
534/**
535 * ext4_splice_branch - splice the allocated branch onto inode.
536 * @handle: handle for this transaction
537 * @inode: owner
538 * @block: (logical) number of block we are adding
539 * @chain: chain of indirect blocks (with a missing link - see
540 * ext4_alloc_branch)
541 * @where: location of missing link
542 * @num: number of indirect blocks we are adding
543 * @blks: number of direct blocks we are adding
544 *
545 * This function fills the missing link and does all housekeeping needed in
546 * inode (->i_blocks, etc.). In case of success we end up with the full
547 * chain to new block and return 0.
548 */
549static int ext4_splice_branch(handle_t *handle, struct inode *inode,
550 ext4_lblk_t block, Indirect *where, int num,
551 int blks)
552{
553 int i;
554 int err = 0;
555 ext4_fsblk_t current_block;
556
557 /*
558 * If we're splicing into a [td]indirect block (as opposed to the
559 * inode) then we need to get write access to the [td]indirect block
560 * before the splice.
561 */
562 if (where->bh) {
563 BUFFER_TRACE(where->bh, "get_write_access");
564 err = ext4_journal_get_write_access(handle, where->bh);
565 if (err)
566 goto err_out;
567 }
568 /* That's it */
569
570 *where->p = where->key;
571
572 /*
573	 * Update the host buffer_head or inode to point to the just-allocated
574	 * direct blocks
575 */
576 if (num == 0 && blks > 1) {
577 current_block = le32_to_cpu(where->key) + 1;
578 for (i = 1; i < blks; i++)
579 *(where->p + i) = cpu_to_le32(current_block++);
580 }
581
582 /* We are done with atomic stuff, now do the rest of housekeeping */
583 /* had we spliced it onto indirect block? */
584 if (where->bh) {
585 /*
586 * If we spliced it onto an indirect block, we haven't
587 * altered the inode. Note however that if it is being spliced
588 * onto an indirect block at the very end of the file (the
589 * file is growing) then we *will* alter the inode to reflect
590 * the new i_size. But that is not done here - it is done in
591 * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.
592 */
593 jbd_debug(5, "splicing indirect only\n");
594 BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata");
595 err = ext4_handle_dirty_metadata(handle, inode, where->bh);
596 if (err)
597 goto err_out;
598 } else {
599 /*
600 * OK, we spliced it into the inode itself on a direct block.
601 */
602 ext4_mark_inode_dirty(handle, inode);
603 jbd_debug(5, "splicing direct\n");
604 }
605 return err;
606
607err_out:
608 for (i = 1; i <= num; i++) {
609 /*
610 * branch[i].bh is newly allocated, so there is no
611 * need to revoke the block, which is why we don't
612 * need to set EXT4_FREE_BLOCKS_METADATA.
613 */
614 ext4_free_blocks(handle, inode, where[i].bh, 0, 1,
615 EXT4_FREE_BLOCKS_FORGET);
616 }
617 ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key),
618 blks, 0);
619
620 return err;
621}
622
623/*
624 * The ext4_ind_map_blocks() function handles non-extents inodes
625 * (i.e., using the traditional indirect/double-indirect i_blocks
626 * scheme) for ext4_map_blocks().
627 *
628 * Allocation strategy is simple: if we have to allocate something, we will
629 * have to go the whole way to leaf. So let's do it before attaching anything
630 * to tree, set linkage between the newborn blocks, write them if sync is
631 * required, recheck the path, free and repeat if check fails, otherwise
632 * set the last missing link (that will protect us from any truncate-generated
633 * removals - all blocks on the path are immune now) and possibly force the
634 * write on the parent block.
635 * That has a nice additional property: no special recovery from the failed
636 * allocations is needed - we simply release blocks and do not touch anything
637 * reachable from inode.
638 *
639 * `handle' can be NULL if create == 0.
640 *
641 * return > 0, # of blocks mapped or allocated.
642 * return = 0, if plain lookup failed.
643 * return < 0, error case.
644 *
645 * The ext4_ind_map_blocks() function should be called with
646 * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem
647 * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or
648 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system
649 * blocks.
650 */
651int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
652 struct ext4_map_blocks *map,
653 int flags)
654{
655 int err = -EIO;
656 ext4_lblk_t offsets[4];
657 Indirect chain[4];
658 Indirect *partial;
659 ext4_fsblk_t goal;
660 int indirect_blks;
661 int blocks_to_boundary = 0;
662 int depth;
663 int count = 0;
664 ext4_fsblk_t first_block = 0;
665
666 trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
667 J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
668 J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
669 depth = ext4_block_to_path(inode, map->m_lblk, offsets,
670 &blocks_to_boundary);
671
672 if (depth == 0)
673 goto out;
674
675 partial = ext4_get_branch(inode, depth, offsets, chain, &err);
676
677 /* Simplest case - block found, no allocation needed */
678 if (!partial) {
679 first_block = le32_to_cpu(chain[depth - 1].key);
680 count++;
681 /*map more blocks*/
682 while (count < map->m_len && count <= blocks_to_boundary) {
683 ext4_fsblk_t blk;
684
685 blk = le32_to_cpu(*(chain[depth-1].p + count));
686
687 if (blk == first_block + count)
688 count++;
689 else
690 break;
691 }
692 goto got_it;
693 }
694
695 /* Next simple case - plain lookup or failed read of indirect block */
696 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO)
697 goto cleanup;
698
699 /*
700 * Okay, we need to do block allocation.
701 */
702 goal = ext4_find_goal(inode, map->m_lblk, partial);
703
704 /* the number of blocks need to allocate for [d,t]indirect blocks */
705 indirect_blks = (chain + depth) - partial - 1;
706
707 /*
708	 * Next look up the indirect map to count the total number of
709 * direct blocks to allocate for this branch.
710 */
711 count = ext4_blks_to_allocate(partial, indirect_blks,
712 map->m_len, blocks_to_boundary);
713 /*
714 * Block out ext4_truncate while we alter the tree
715 */
716 err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks,
717 &count, goal,
718 offsets + (partial - chain), partial);
719
720 /*
721 * The ext4_splice_branch call will free and forget any buffers
722 * on the new chain if there is a failure, but that risks using
723 * up transaction credits, especially for bitmaps where the
724 * credits cannot be returned. Can we handle this somehow? We
725 * may need to return -EAGAIN upwards in the worst case. --sct
726 */
727 if (!err)
728 err = ext4_splice_branch(handle, inode, map->m_lblk,
729 partial, indirect_blks, count);
730 if (err)
731 goto cleanup;
732
733 map->m_flags |= EXT4_MAP_NEW;
734
735 ext4_update_inode_fsync_trans(handle, inode, 1);
736got_it:
737 map->m_flags |= EXT4_MAP_MAPPED;
738 map->m_pblk = le32_to_cpu(chain[depth-1].key);
739 map->m_len = count;
740 if (count > blocks_to_boundary)
741 map->m_flags |= EXT4_MAP_BOUNDARY;
742 err = count;
743 /* Clean up and exit */
744 partial = chain + depth - 1; /* the whole chain */
745cleanup:
746 while (partial > chain) {
747 BUFFER_TRACE(partial->bh, "call brelse");
748 brelse(partial->bh);
749 partial--;
750 }
751out:
752 trace_ext4_ind_map_blocks_exit(inode, map->m_lblk,
753 map->m_pblk, map->m_len, err);
754 return err;
755}
756
757/*
758 * O_DIRECT for ext3 (or indirect map) based files
759 *
760 * If the O_DIRECT write will extend the file then add this inode to the
761 * orphan list. So recovery will truncate it back to the original size
762 * if the machine crashes during the write.
763 *
764 * If the O_DIRECT write is instantiating holes inside i_size and the machine
765 * crashes then stale disk data _may_ be exposed inside the file. But current
766 * VFS code falls back into buffered path in that case so we are safe.
767 */
768ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
769 const struct iovec *iov, loff_t offset,
770 unsigned long nr_segs)
771{
772 struct file *file = iocb->ki_filp;
773 struct inode *inode = file->f_mapping->host;
774 struct ext4_inode_info *ei = EXT4_I(inode);
775 handle_t *handle;
776 ssize_t ret;
777 int orphan = 0;
778 size_t count = iov_length(iov, nr_segs);
779 int retries = 0;
780
781 if (rw == WRITE) {
782 loff_t final_size = offset + count;
783
784 if (final_size > inode->i_size) {
785 /* Credits for sb + inode write */
786 handle = ext4_journal_start(inode, 2);
787 if (IS_ERR(handle)) {
788 ret = PTR_ERR(handle);
789 goto out;
790 }
791 ret = ext4_orphan_add(handle, inode);
792 if (ret) {
793 ext4_journal_stop(handle);
794 goto out;
795 }
796 orphan = 1;
797 ei->i_disksize = inode->i_size;
798 ext4_journal_stop(handle);
799 }
800 }
801
802retry:
803 if (rw == READ && ext4_should_dioread_nolock(inode)) {
804 if (unlikely(!list_empty(&ei->i_completed_io_list))) {
805 mutex_lock(&inode->i_mutex);
806 ext4_flush_completed_IO(inode);
807 mutex_unlock(&inode->i_mutex);
808 }
809 ret = __blockdev_direct_IO(rw, iocb, inode,
810 inode->i_sb->s_bdev, iov,
811 offset, nr_segs,
812 ext4_get_block, NULL, NULL, 0);
813 } else {
814 ret = blockdev_direct_IO(rw, iocb, inode, iov,
815 offset, nr_segs, ext4_get_block);
816
817 if (unlikely((rw & WRITE) && ret < 0)) {
818 loff_t isize = i_size_read(inode);
819 loff_t end = offset + iov_length(iov, nr_segs);
820
821 if (end > isize)
822 ext4_truncate_failed_write(inode);
823 }
824 }
825 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
826 goto retry;
827
828 if (orphan) {
829 int err;
830
831 /* Credits for sb + inode write */
832 handle = ext4_journal_start(inode, 2);
833 if (IS_ERR(handle)) {
834 /* This is really bad luck. We've written the data
835 * but cannot extend i_size. Bail out and pretend
836 * the write failed... */
837 ret = PTR_ERR(handle);
838 if (inode->i_nlink)
839 ext4_orphan_del(NULL, inode);
840
841 goto out;
842 }
843 if (inode->i_nlink)
844 ext4_orphan_del(handle, inode);
845 if (ret > 0) {
846 loff_t end = offset + ret;
847 if (end > inode->i_size) {
848 ei->i_disksize = end;
849 i_size_write(inode, end);
850 /*
851 * We're going to return a positive `ret'
852 * here due to non-zero-length I/O, so there's
853 * no way of reporting error returns from
854 * ext4_mark_inode_dirty() to userspace. So
855 * ignore it.
856 */
857 ext4_mark_inode_dirty(handle, inode);
858 }
859 }
860 err = ext4_journal_stop(handle);
861 if (ret == 0)
862 ret = err;
863 }
864out:
865 return ret;
866}
867
868/*
869 * Calculate the number of metadata blocks we need to reserve
870 * to allocate a new block at @lblock for a non-extent-based file
871 */
872int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock)
873{
874 struct ext4_inode_info *ei = EXT4_I(inode);
875 sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1);
876 int blk_bits;
877
878 if (lblock < EXT4_NDIR_BLOCKS)
879 return 0;
880
881 lblock -= EXT4_NDIR_BLOCKS;
882
883 if (ei->i_da_metadata_calc_len &&
884 (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) {
885 ei->i_da_metadata_calc_len++;
886 return 0;
887 }
888 ei->i_da_metadata_calc_last_lblock = lblock & dind_mask;
889 ei->i_da_metadata_calc_len = 1;
890 blk_bits = order_base_2(lblock);
891 return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
892}
893
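The caching above works at indirect-block granularity: dind_mask clears the low ptrs_bits bits, so every lblock that lands in the same indirect block reuses the previous answer and reserves nothing extra. A simplified user-space rendering (1024 pointers per 4KiB block assumed; the i_da_metadata_calc_len bookkeeping is omitted):

	#include <stdio.h>

	#define NDIR 12			/* EXT4_NDIR_BLOCKS */
	#define PTRS_BITS 10		/* 1024 pointers per 4KiB block */

	static long last_chunk = -1;	/* i_da_metadata_calc_last_lblock */

	static int calc_metadata(long lblock)
	{
		int bits = 0;

		if (lblock < NDIR)
			return 0;
		lblock -= NDIR;
		if ((lblock & ~((1L << PTRS_BITS) - 1)) == last_chunk)
			return 0;		/* same indirect block as before */
		last_chunk = lblock & ~((1L << PTRS_BITS) - 1);
		while ((1L << bits) < lblock)	/* order_base_2() */
			bits++;
		return bits / PTRS_BITS + 1;
	}

	int main(void)
	{
		printf("%d\n", calc_metadata(5000));	/* 2: indirect + dindirect */
		printf("%d\n", calc_metadata(5001));	/* 0: same chunk */
		return 0;
	}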
894int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk)
895{
896 int indirects;
897
898 /* if nrblocks are contiguous */
899 if (chunk) {
900 /*
901 * With N contiguous data blocks, we need at most
902 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
903 * 2 dindirect blocks, and 1 tindirect block
904 */
905 return DIV_ROUND_UP(nrblocks,
906 EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
907 }
908 /*
909	 * if nrblocks are not contiguous, worst case, each block touches
910	 * an indirect block, and each indirect block touches a double indirect
911	 * block, plus a triple indirect block
912 */
913 indirects = nrblocks * 2 + 1;
914 return indirects;
915}
916
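Spelled out, the contiguous case above charges one indirect block per EXT4_ADDR_PER_BLOCK data blocks plus a fixed 4 (one extra indirect, two double-indirect, one triple-indirect); the discontiguous case pessimistically charges two blocks per data block plus one. For example, with 1024 pointers per block (hypothetical figures):

	#include <stdio.h>

	#define ADDR_PER_BLOCK 1024	/* 4KiB blocks */

	static int ind_trans_blocks(int nrblocks, int chunk)
	{
		if (chunk)	/* contiguous run */
			return (nrblocks + ADDR_PER_BLOCK - 1) / ADDR_PER_BLOCK + 4;
		return nrblocks * 2 + 1;	/* worst case: all discontiguous */
	}

	int main(void)
	{
		printf("%d\n", ind_trans_blocks(2048, 1));	/* 2 + 4 = 6 */
		printf("%d\n", ind_trans_blocks(8, 0));		/* 17 */
		return 0;
	}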
917/*
918 * Truncate transactions can be complex and absolutely huge. So we need to
919 * be able to restart the transaction at a convenient checkpoint to make
920 * sure we don't overflow the journal.
921 *
922 * start_transaction gets us a new handle for a truncate transaction,
923 * and extend_transaction tries to extend the existing one a bit. If
924 * extend fails, we need to propagate the failure up and restart the
925 * transaction in the top-level truncate loop. --sct
926 */
927static handle_t *start_transaction(struct inode *inode)
928{
929 handle_t *result;
930
931 result = ext4_journal_start(inode, ext4_blocks_for_truncate(inode));
932 if (!IS_ERR(result))
933 return result;
934
935 ext4_std_error(inode->i_sb, PTR_ERR(result));
936 return result;
937}
938
939/*
940 * Try to extend this transaction for the purposes of truncation.
941 *
942 * Returns 0 if we managed to create more room. If we can't create more
943 * room, and the transaction must be restarted we return 1.
944 */
945static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
946{
947 if (!ext4_handle_valid(handle))
948 return 0;
949 if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
950 return 0;
951 if (!ext4_journal_extend(handle, ext4_blocks_for_truncate(inode)))
952 return 0;
953 return 1;
954}
955
956/*
957 * Probably it should be a library function... search for first non-zero word
958 * or memcmp with zero_page, whatever is better for particular architecture.
959 * Linus?
960 */
961static inline int all_zeroes(__le32 *p, __le32 *q)
962{
963 while (p < q)
964 if (*p++)
965 return 0;
966 return 1;
967}
968
969/**
970 * ext4_find_shared - find the indirect blocks for partial truncation.
971 * @inode: inode in question
972 * @depth: depth of the affected branch
973 * @offsets: offsets of pointers in that branch (see ext4_block_to_path)
974 * @chain: place to store the pointers to partial indirect blocks
975 * @top: place to the (detached) top of branch
976 *
977 * This is a helper function used by ext4_truncate().
978 *
979 * When we do truncate() we may have to clean the ends of several
980 * indirect blocks but leave the blocks themselves alive. Block is
981 * partially truncated if some data below the new i_size is referred
982 * from it (and it is on the path to the first completely truncated
983 * data block, indeed). We have to free the top of that path along
984 * with everything to the right of the path. Since no allocation
985 * past the truncation point is possible until ext4_truncate()
986 * finishes, we may safely do the latter, but top of branch may
987 * require special attention - pageout below the truncation point
988 * might try to populate it.
989 *
990 * We atomically detach the top of branch from the tree, store the
991 * block number of its root in *@top, pointers to buffer_heads of
992 * partially truncated blocks - in @chain[].bh and pointers to
993 * their last elements that should not be removed - in
994 * @chain[].p. Return value is the pointer to last filled element
995 * of @chain.
996 *
997 *	The work left to the caller is the actual freeing of the subtrees:
998 * a) free the subtree starting from *@top
999 * b) free the subtrees whose roots are stored in
1000 * (@chain[i].p+1 .. end of @chain[i].bh->b_data)
1001 * c) free the subtrees growing from the inode past the @chain[0].
1002 * (no partially truncated stuff there). */
1003
1004static Indirect *ext4_find_shared(struct inode *inode, int depth,
1005 ext4_lblk_t offsets[4], Indirect chain[4],
1006 __le32 *top)
1007{
1008 Indirect *partial, *p;
1009 int k, err;
1010
1011 *top = 0;
1012 /* Make k index the deepest non-null offset + 1 */
1013 for (k = depth; k > 1 && !offsets[k-1]; k--)
1014 ;
1015 partial = ext4_get_branch(inode, k, offsets, chain, &err);
1016 /* Writer: pointers */
1017 if (!partial)
1018 partial = chain + k-1;
1019 /*
1020 * If the branch acquired continuation since we've looked at it -
1021 * fine, it should all survive and (new) top doesn't belong to us.
1022 */
1023 if (!partial->key && *partial->p)
1024 /* Writer: end */
1025 goto no_top;
1026 for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--)
1027 ;
1028 /*
1029 * OK, we've found the last block that must survive. The rest of our
1030 * branch should be detached before unlocking. However, if that rest
1031 * of branch is all ours and does not grow immediately from the inode
1032 * it's easier to cheat and just decrement partial->p.
1033 */
1034 if (p == chain + k - 1 && p > chain) {
1035 p->p--;
1036 } else {
1037 *top = *p->p;
1038 /* Nope, don't do this in ext4. Must leave the tree intact */
1039#if 0
1040 *p->p = 0;
1041#endif
1042 }
1043 /* Writer: end */
1044
1045 while (partial > p) {
1046 brelse(partial->bh);
1047 partial--;
1048 }
1049no_top:
1050 return partial;
1051}
1052
1053/*
1054 * Zero a number of block pointers in either an inode or an indirect block.
1055 * If we restart the transaction we must again get write access to the
1056 * indirect block for further modification.
1057 *
1058 * We release `count' blocks on disk, but (last - first) may be greater
1059 * than `count' because there can be holes in there.
1060 *
1061 * Return 0 on success, 1 on invalid block range
1062 * and < 0 on fatal error.
1063 */
1064static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
1065 struct buffer_head *bh,
1066 ext4_fsblk_t block_to_free,
1067 unsigned long count, __le32 *first,
1068 __le32 *last)
1069{
1070 __le32 *p;
1071 int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;
1072 int err;
1073
1074 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
1075 flags |= EXT4_FREE_BLOCKS_METADATA;
1076
1077 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
1078 count)) {
1079 EXT4_ERROR_INODE(inode, "attempt to clear invalid "
1080 "blocks %llu len %lu",
1081 (unsigned long long) block_to_free, count);
1082 return 1;
1083 }
1084
1085 if (try_to_extend_transaction(handle, inode)) {
1086 if (bh) {
1087 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1088 err = ext4_handle_dirty_metadata(handle, inode, bh);
1089 if (unlikely(err))
1090 goto out_err;
1091 }
1092 err = ext4_mark_inode_dirty(handle, inode);
1093 if (unlikely(err))
1094 goto out_err;
1095 err = ext4_truncate_restart_trans(handle, inode,
1096 ext4_blocks_for_truncate(inode));
1097 if (unlikely(err))
1098 goto out_err;
1099 if (bh) {
1100 BUFFER_TRACE(bh, "retaking write access");
1101 err = ext4_journal_get_write_access(handle, bh);
1102 if (unlikely(err))
1103 goto out_err;
1104 }
1105 }
1106
1107 for (p = first; p < last; p++)
1108 *p = 0;
1109
1110 ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags);
1111 return 0;
1112out_err:
1113 ext4_std_error(inode->i_sb, err);
1114 return err;
1115}
1116
1117/**
1118 * ext4_free_data - free a list of data blocks
1119 * @handle: handle for this transaction
1120 * @inode: inode we are dealing with
1121 * @this_bh: indirect buffer_head which contains *@first and *@last
1122 * @first: array of block numbers
1123 * @last: points immediately past the end of array
1124 *
1125 * We are freeing all blocks referred from that array (numbers are stored as
1126 * little-endian 32-bit) and updating @inode->i_blocks appropriately.
1127 *
1128 * We accumulate contiguous runs of blocks to free. Conveniently, if these
1129 * blocks are contiguous then releasing them at one time will only affect one
1130 * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
1131 * actually use a lot of journal space.
1132 *
1133 * @this_bh will be %NULL if @first and @last point into the inode's direct
1134 * block pointers.
1135 */
1136static void ext4_free_data(handle_t *handle, struct inode *inode,
1137 struct buffer_head *this_bh,
1138 __le32 *first, __le32 *last)
1139{
1140 ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */
1141 unsigned long count = 0; /* Number of blocks in the run */
1142 __le32 *block_to_free_p = NULL; /* Pointer into inode/ind
1143 corresponding to
1144 block_to_free */
1145 ext4_fsblk_t nr; /* Current block # */
1146 __le32 *p; /* Pointer into inode/ind
1147 for current block */
1148 int err = 0;
1149
1150 if (this_bh) { /* For indirect block */
1151 BUFFER_TRACE(this_bh, "get_write_access");
1152 err = ext4_journal_get_write_access(handle, this_bh);
1153 /* Important: if we can't update the indirect pointers
1154 * to the blocks, we can't free them. */
1155 if (err)
1156 return;
1157 }
1158
1159 for (p = first; p < last; p++) {
1160 nr = le32_to_cpu(*p);
1161 if (nr) {
1162 /* accumulate blocks to free if they're contiguous */
1163 if (count == 0) {
1164 block_to_free = nr;
1165 block_to_free_p = p;
1166 count = 1;
1167 } else if (nr == block_to_free + count) {
1168 count++;
1169 } else {
1170 err = ext4_clear_blocks(handle, inode, this_bh,
1171 block_to_free, count,
1172 block_to_free_p, p);
1173 if (err)
1174 break;
1175 block_to_free = nr;
1176 block_to_free_p = p;
1177 count = 1;
1178 }
1179 }
1180 }
1181
1182 if (!err && count > 0)
1183 err = ext4_clear_blocks(handle, inode, this_bh, block_to_free,
1184 count, block_to_free_p, p);
1185 if (err < 0)
1186 /* fatal error */
1187 return;
1188
1189 if (this_bh) {
1190 BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata");
1191
1192 /*
1193 * The buffer head should have an attached journal head at this
1194 * point. However, if the data is corrupted and an indirect
1195 * block pointed to itself, it would have been detached when
1196 * the block was cleared. Check for this instead of OOPSing.
1197 */
1198 if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
1199 ext4_handle_dirty_metadata(handle, inode, this_bh);
1200 else
1201 EXT4_ERROR_INODE(inode,
1202 "circular indirect block detected at "
1203 "block %llu",
1204 (unsigned long long) this_bh->b_blocknr);
1205 }
1206}
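
The accumulation above is plain run-length coalescing over the pointer array. The same scan in standalone form, with a hypothetical emit() in place of ext4_clear_blocks() and the le32 conversion elided:

#include <stdio.h>
#include <stdint.h>

static void emit(uint32_t start, unsigned long count)
{
        printf("free %lu block(s) starting at %u\n", count, start);
}

/* Coalesce contiguous block numbers into runs; 0 entries are holes. */
static void free_runs(const uint32_t *first, const uint32_t *last)
{
        uint32_t block_to_free = 0;
        unsigned long count = 0;

        for (const uint32_t *p = first; p < last; p++) {
                if (!*p)
                        continue;                       /* a hole */
                if (count && *p == block_to_free + count) {
                        count++;                        /* run continues */
                        continue;
                }
                if (count)
                        emit(block_to_free, count);     /* flush prior run */
                block_to_free = *p;
                count = 1;
        }
        if (count)
                emit(block_to_free, count);
}

int main(void)
{
        uint32_t map[] = { 100, 101, 102, 0, 200, 201, 50 };

        free_runs(map, map + sizeof(map) / sizeof(map[0]));
        return 0;
}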
1207
1208/**
1209 * ext4_free_branches - free an array of branches
1210 * @handle: JBD handle for this transaction
1211 * @inode: inode we are dealing with
1212 * @parent_bh: the buffer_head which contains *@first and *@last
1213 * @first: array of block numbers
1214 * @last: pointer immediately past the end of array
1215 * @depth: depth of the branches to free
1216 *
1217 * We are freeing all blocks referred from these branches (numbers are
1218 * stored as little-endian 32-bit) and updating @inode->i_blocks
1219 * appropriately.
1220 */
1221static void ext4_free_branches(handle_t *handle, struct inode *inode,
1222 struct buffer_head *parent_bh,
1223 __le32 *first, __le32 *last, int depth)
1224{
1225 ext4_fsblk_t nr;
1226 __le32 *p;
1227
1228 if (ext4_handle_is_aborted(handle))
1229 return;
1230
1231 if (depth--) {
1232 struct buffer_head *bh;
1233 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
1234 p = last;
1235 while (--p >= first) {
1236 nr = le32_to_cpu(*p);
1237 if (!nr)
1238 continue; /* A hole */
1239
1240 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
1241 nr, 1)) {
1242 EXT4_ERROR_INODE(inode,
1243 "invalid indirect mapped "
1244 "block %lu (level %d)",
1245 (unsigned long) nr, depth);
1246 break;
1247 }
1248
1249 /* Go read the buffer for the next level down */
1250 bh = sb_bread(inode->i_sb, nr);
1251
1252 /*
1253 * A read failure? Report error and clear slot
1254 * (should be rare).
1255 */
1256 if (!bh) {
1257 EXT4_ERROR_INODE_BLOCK(inode, nr,
1258 "Read failure");
1259 continue;
1260 }
1261
1262 /* This zaps the entire block. Bottom up. */
1263 BUFFER_TRACE(bh, "free child branches");
1264 ext4_free_branches(handle, inode, bh,
1265 (__le32 *) bh->b_data,
1266 (__le32 *) bh->b_data + addr_per_block,
1267 depth);
1268 brelse(bh);
1269
1270 /*
1271 * Everything below this pointer has been
1272 * released. Now let this top-of-subtree go.
1273 *
1274 * We want the freeing of this indirect block to be
1275 * atomic in the journal with the updating of the
1276 * bitmap block which owns it. So make some room in
1277 * the journal.
1278 *
1279 * We zero the parent pointer *after* freeing its
1280 * pointee in the bitmaps, so if extend_transaction()
1281 * for some reason fails to put the bitmap changes and
1282 * the release into the same transaction, recovery
1283 * will merely complain about releasing a free block,
1284 * rather than leaking blocks.
1285 */
1286 if (ext4_handle_is_aborted(handle))
1287 return;
1288 if (try_to_extend_transaction(handle, inode)) {
1289 ext4_mark_inode_dirty(handle, inode);
1290 ext4_truncate_restart_trans(handle, inode,
1291 ext4_blocks_for_truncate(inode));
1292 }
1293
1294 /*
1295 * The forget flag here is critical because if
1296 * we are journaling (and not doing data
1297 * journaling), we have to make sure a revoke
1298 * record is written to prevent the journal
1299 * replay from overwriting the (former)
1300 * indirect block if it gets reallocated as a
1301 * data block. This must happen in the same
1302 * transaction where the data blocks are
1303 * actually freed.
1304 */
1305 ext4_free_blocks(handle, inode, NULL, nr, 1,
1306 EXT4_FREE_BLOCKS_METADATA|
1307 EXT4_FREE_BLOCKS_FORGET);
1308
1309 if (parent_bh) {
1310 /*
1311 * The block which we have just freed is
1312 * pointed to by an indirect block: journal it
1313 */
1314 BUFFER_TRACE(parent_bh, "get_write_access");
1315 if (!ext4_journal_get_write_access(handle,
1316 parent_bh)){
1317 *p = 0;
1318 BUFFER_TRACE(parent_bh,
1319 "call ext4_handle_dirty_metadata");
1320 ext4_handle_dirty_metadata(handle,
1321 inode,
1322 parent_bh);
1323 }
1324 }
1325 }
1326 } else {
1327 /* We have reached the bottom of the tree. */
1328 BUFFER_TRACE(parent_bh, "free data blocks");
1329 ext4_free_data(handle, inode, parent_bh, first, last);
1330 }
1331}
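
The recursion is easier to see on a toy in-memory tree: depth drops by one per level, children are visited bottom-up and right-to-left, and a parent slot is cleared only after its whole subtree is gone. Here free() stands in for ext4_free_blocks(), and there is, of course, no journaling:

#include <stdio.h>
#include <stdlib.h>

#define FANOUT 4  /* stands in for EXT4_ADDR_PER_BLOCK() */

struct node {
        struct node *child[FANOUT];  /* NULL means a hole */
};

static void free_branches(struct node *n, int depth)
{
        if (depth--) {
                /* children are indirect blocks: recurse, then free them */
                for (int i = FANOUT - 1; i >= 0; i--) {
                        if (!n->child[i])
                                continue;
                        free_branches(n->child[i], depth);
                        printf("free indirect block at depth %d\n", depth);
                        free(n->child[i]);
                        n->child[i] = NULL;  /* zero the parent pointer */
                }
        } else {
                /* bottom of the tree: the entries are data blocks */
                printf("free data blocks\n");
        }
}

int main(void)
{
        struct node *root = calloc(1, sizeof(*root));

        root->child[0] = calloc(1, sizeof(*root));
        root->child[2] = calloc(1, sizeof(*root));  /* child[1] is a hole */
        free_branches(root, 1);
        free(root);
        return 0;
}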
1332
1333void ext4_ind_truncate(struct inode *inode)
1334{
1335 handle_t *handle;
1336 struct ext4_inode_info *ei = EXT4_I(inode);
1337 __le32 *i_data = ei->i_data;
1338 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
1339 struct address_space *mapping = inode->i_mapping;
1340 ext4_lblk_t offsets[4];
1341 Indirect chain[4];
1342 Indirect *partial;
1343 __le32 nr = 0;
1344 int n = 0;
1345 ext4_lblk_t last_block, max_block;
1346 unsigned blocksize = inode->i_sb->s_blocksize;
1347
1348 handle = start_transaction(inode);
1349 if (IS_ERR(handle))
1350 return; /* AKPM: return what? */
1351
1352 last_block = (inode->i_size + blocksize-1)
1353 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
1354 max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)
1355 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
1356
1357 if (inode->i_size & (blocksize - 1))
1358 if (ext4_block_truncate_page(handle, mapping, inode->i_size))
1359 goto out_stop;
1360
1361 if (last_block != max_block) {
1362 n = ext4_block_to_path(inode, last_block, offsets, NULL);
1363 if (n == 0)
1364 goto out_stop; /* error */
1365 }
1366
1367 /*
1368 * OK. This truncate is going to happen. We add the inode to the
1369 * orphan list, so that if this truncate spans multiple transactions,
1370 * and we crash, we will resume the truncate when the filesystem
1371 * recovers. It also marks the inode dirty, to catch the new size.
1372 *
1373 * Implication: the file must always be in a sane, consistent
1374 * truncatable state while each transaction commits.
1375 */
1376 if (ext4_orphan_add(handle, inode))
1377 goto out_stop;
1378
1379 /*
1380 * From here we block out all ext4_get_block() callers who want to
1381 * modify the block allocation tree.
1382 */
1383 down_write(&ei->i_data_sem);
1384
1385 ext4_discard_preallocations(inode);
1386
1387 /*
1388 * The orphan list entry will now protect us from any crash which
1389 * occurs before the truncate completes, so it is now safe to propagate
1390 * the new, shorter inode size (held for now in i_size) into the
1391 * on-disk inode. We do this via i_disksize, which is the value which
1392 * ext4 *really* writes onto the disk inode.
1393 */
1394 ei->i_disksize = inode->i_size;
1395
1396 if (last_block == max_block) {
1397 /*
1398 * It is unnecessary to free any data blocks if last_block is
1399 * equal to the indirect block limit.
1400 */
1401 goto out_unlock;
1402 } else if (n == 1) { /* direct blocks */
1403 ext4_free_data(handle, inode, NULL, i_data+offsets[0],
1404 i_data + EXT4_NDIR_BLOCKS);
1405 goto do_indirects;
1406 }
1407
1408 partial = ext4_find_shared(inode, n, offsets, chain, &nr);
1409 /* Kill the top of shared branch (not detached) */
1410 if (nr) {
1411 if (partial == chain) {
1412 /* Shared branch grows from the inode */
1413 ext4_free_branches(handle, inode, NULL,
1414 &nr, &nr+1, (chain+n-1) - partial);
1415 *partial->p = 0;
1416 /*
1417 * We mark the inode dirty prior to restart,
1418 * and prior to stop. No need for it here.
1419 */
1420 } else {
1421 /* Shared branch grows from an indirect block */
1422 BUFFER_TRACE(partial->bh, "get_write_access");
1423 ext4_free_branches(handle, inode, partial->bh,
1424 partial->p,
1425 partial->p+1, (chain+n-1) - partial);
1426 }
1427 }
1428 /* Clear the ends of indirect blocks on the shared branch */
1429 while (partial > chain) {
1430 ext4_free_branches(handle, inode, partial->bh, partial->p + 1,
1431 (__le32*)partial->bh->b_data+addr_per_block,
1432 (chain+n-1) - partial);
1433 BUFFER_TRACE(partial->bh, "call brelse");
1434 brelse(partial->bh);
1435 partial--;
1436 }
1437do_indirects:
1438 /* Kill the remaining (whole) subtrees */
1439 switch (offsets[0]) {
1440 default:
1441 nr = i_data[EXT4_IND_BLOCK];
1442 if (nr) {
1443 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
1444 i_data[EXT4_IND_BLOCK] = 0;
1445 }
1446 case EXT4_IND_BLOCK:
1447 nr = i_data[EXT4_DIND_BLOCK];
1448 if (nr) {
1449 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
1450 i_data[EXT4_DIND_BLOCK] = 0;
1451 }
1452 case EXT4_DIND_BLOCK:
1453 nr = i_data[EXT4_TIND_BLOCK];
1454 if (nr) {
1455 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
1456 i_data[EXT4_TIND_BLOCK] = 0;
1457 }
1458 case EXT4_TIND_BLOCK:
1459 ;
1460 }
1461
1462out_unlock:
1463 up_write(&ei->i_data_sem);
1464 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
1465 ext4_mark_inode_dirty(handle, inode);
1466
1467 /*
1468 * In a multi-transaction truncate, we only make the final transaction
1469 * synchronous
1470 */
1471 if (IS_SYNC(inode))
1472 ext4_handle_sync(handle);
1473out_stop:
1474 /*
1475 * If this was a simple ftruncate(), and the file will remain alive
1476 * then we need to clear up the orphan record which we created above.
1477 * However, if this was a real unlink then we were called by
1478 * ext4_delete_inode(), and we allow that function to clean up the
1479 * orphan info for us.
1480 */
1481 if (inode->i_nlink)
1482 ext4_orphan_del(handle, inode);
1483
1484 ext4_journal_stop(handle);
1485 trace_ext4_truncate_exit(inode);
1486}
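
The do_indirects switch above leans on deliberate case fallthrough: entering at the level where the partial truncation happened frees every whole tree above it, and entering at default (a truncation point among the direct blocks) frees all three. A compact model of that dispatch, with slot numbers matching ext4's i_data layout:

#include <stdio.h>

enum { IND = 12, DIND = 13, TIND = 14 };  /* i_data slots, as in ext4 */

static void kill_whole_tree(int slot, int depth)
{
        printf("free entire tree rooted at slot %d (depth %d)\n",
               slot, depth);
}

/* offset0 is offsets[0]: the i_data slot where truncation begins. */
static void do_indirects(int offset0)
{
        switch (offset0) {
        default:        /* truncation point is in the direct blocks */
                kill_whole_tree(IND, 1);
                /* fall through */
        case IND:       /* single-indirect tree partially truncated */
                kill_whole_tree(DIND, 2);
                /* fall through */
        case DIND:
                kill_whole_tree(TIND, 3);
                /* fall through */
        case TIND:
                ;       /* nothing lives above the triple-indirect tree */
        }
}

int main(void)
{
        do_indirects(IND);  /* frees only the DIND and TIND trees */
        return 0;
}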
1487
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 678cde834f19..18d2558b7624 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -12,10 +12,6 @@
12 * 12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds 13 * Copyright (C) 1991, 1992 Linus Torvalds
14 * 14 *
15 * Goal-directed block allocation by Stephen Tweedie
16 * (sct@redhat.com), 1993, 1998
17 * Big-endian to little-endian byte-swapping/bitmaps by
18 * David S. Miller (davem@caip.rutgers.edu), 1995
19 * 64-bit file support on 64-bit platforms by Jakub Jelinek 15 * 64-bit file support on 64-bit platforms by Jakub Jelinek
20 * (jj@sunsite.ms.mff.cuni.cz) 16 * (jj@sunsite.ms.mff.cuni.cz)
21 * 17 *
@@ -47,6 +43,7 @@
47#include "xattr.h" 43#include "xattr.h"
48#include "acl.h" 44#include "acl.h"
49#include "ext4_extents.h" 45#include "ext4_extents.h"
46#include "truncate.h"
50 47
51#include <trace/events/ext4.h> 48#include <trace/events/ext4.h>
52 49
@@ -89,72 +86,6 @@ static int ext4_inode_is_fast_symlink(struct inode *inode)
89} 86}
90 87
91/* 88/*
92 * Work out how many blocks we need to proceed with the next chunk of a
93 * truncate transaction.
94 */
95static unsigned long blocks_for_truncate(struct inode *inode)
96{
97 ext4_lblk_t needed;
98
99 needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
100
101 /* Give ourselves just enough room to cope with inodes in which
102 * i_blocks is corrupt: we've seen disk corruptions in the past
103 * which resulted in random data in an inode which looked enough
104 * like a regular file for ext4 to try to delete it. Things
105 * will go a bit crazy if that happens, but at least we should
106 * try not to panic the whole kernel. */
107 if (needed < 2)
108 needed = 2;
109
110 /* But we need to bound the transaction so we don't overflow the
111 * journal. */
112 if (needed > EXT4_MAX_TRANS_DATA)
113 needed = EXT4_MAX_TRANS_DATA;
114
115 return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
116}
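
The estimate is plain arithmetic: i_blocks counts 512-byte sectors, so shifting by (blocksize_bits - 9) converts it to filesystem blocks, which are then clamped to a sane range and topped up with the fixed per-transaction overhead. A worked user-space version; the two constants are placeholders, since the real values depend on the filesystem configuration:

#include <stdio.h>

#define MAX_TRANS_DATA    64  /* placeholder for EXT4_MAX_TRANS_DATA */
#define DATA_TRANS_BLOCKS 12  /* placeholder for EXT4_DATA_TRANS_BLOCKS() */

static unsigned long credits_for_truncate(unsigned long long i_blocks,
                                          unsigned int blocksize_bits)
{
        /* i_blocks counts 512-byte sectors; convert to fs blocks */
        unsigned long needed = i_blocks >> (blocksize_bits - 9);

        if (needed < 2)               /* guard against corrupt i_blocks */
                needed = 2;
        if (needed > MAX_TRANS_DATA)  /* bound the transaction size */
                needed = MAX_TRANS_DATA;
        return DATA_TRANS_BLOCKS + needed;
}

int main(void)
{
        /* a 1 MiB file with 4 KiB blocks: i_blocks = 2048 sectors */
        printf("%lu credits\n", credits_for_truncate(2048, 12));
        return 0;
}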
117
118/*
119 * Truncate transactions can be complex and absolutely huge. So we need to
120 * be able to restart the transaction at a convenient checkpoint to make
121 * sure we don't overflow the journal.
122 *
123 * start_transaction gets us a new handle for a truncate transaction,
124 * and extend_transaction tries to extend the existing one a bit. If
125 * extend fails, we need to propagate the failure up and restart the
126 * transaction in the top-level truncate loop. --sct
127 */
128static handle_t *start_transaction(struct inode *inode)
129{
130 handle_t *result;
131
132 result = ext4_journal_start(inode, blocks_for_truncate(inode));
133 if (!IS_ERR(result))
134 return result;
135
136 ext4_std_error(inode->i_sb, PTR_ERR(result));
137 return result;
138}
139
140/*
141 * Try to extend this transaction for the purposes of truncation.
142 *
143 * Returns 0 if we managed to create more room. If we can't create more
144 * room and the transaction must be restarted, we return 1.
145 */
146static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
147{
148 if (!ext4_handle_valid(handle))
149 return 0;
150 if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
151 return 0;
152 if (!ext4_journal_extend(handle, blocks_for_truncate(inode)))
153 return 0;
154 return 1;
155}
156
157/*
158 * Restart the transaction associated with *handle. This does a commit, 89 * Restart the transaction associated with *handle. This does a commit,
159 * so before we call here everything must be consistently dirtied against 90 * so before we call here everything must be consistently dirtied against
160 * this transaction. 91 * this transaction.
@@ -189,7 +120,37 @@ void ext4_evict_inode(struct inode *inode)
189 int err; 120 int err;
190 121
191 trace_ext4_evict_inode(inode); 122 trace_ext4_evict_inode(inode);
123
124 ext4_ioend_wait(inode);
125
192 if (inode->i_nlink) { 126 if (inode->i_nlink) {
127 /*
128 * When journalling data dirty buffers are tracked only in the
129 * journal. So although mm thinks everything is clean and
130 * ready for reaping the inode might still have some pages to
131 * write in the running transaction or waiting to be
132 * checkpointed. Thus calling jbd2_journal_invalidatepage()
133 * (via truncate_inode_pages()) to discard these buffers can
134 * cause data loss. Also even if we did not discard these
135 * buffers, we would have no way to find them after the inode
136 * is reaped and thus user could see stale data if he tries to
137 * read them before the transaction is checkpointed. So be
138 * careful and force everything to disk here... We use
139 * ei->i_datasync_tid to store the newest transaction
140 * containing inode's data.
141 *
142 * Note that directories do not have this problem because they
143 * don't use page cache.
144 */
145 if (ext4_should_journal_data(inode) &&
146 (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) {
147 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
148 tid_t commit_tid = EXT4_I(inode)->i_datasync_tid;
149
150 jbd2_log_start_commit(journal, commit_tid);
151 jbd2_log_wait_commit(journal, commit_tid);
152 filemap_write_and_wait(&inode->i_data);
153 }
193 truncate_inode_pages(&inode->i_data, 0); 154 truncate_inode_pages(&inode->i_data, 0);
194 goto no_delete; 155 goto no_delete;
195 } 156 }
@@ -204,7 +165,7 @@ void ext4_evict_inode(struct inode *inode)
204 if (is_bad_inode(inode)) 165 if (is_bad_inode(inode))
205 goto no_delete; 166 goto no_delete;
206 167
207 handle = ext4_journal_start(inode, blocks_for_truncate(inode)+3); 168 handle = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)+3);
208 if (IS_ERR(handle)) { 169 if (IS_ERR(handle)) {
209 ext4_std_error(inode->i_sb, PTR_ERR(handle)); 170 ext4_std_error(inode->i_sb, PTR_ERR(handle));
210 /* 171 /*
@@ -277,793 +238,6 @@ no_delete:
277 ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ 238 ext4_clear_inode(inode); /* We must guarantee clearing of inode... */
278} 239}
279 240
280typedef struct {
281 __le32 *p;
282 __le32 key;
283 struct buffer_head *bh;
284} Indirect;
285
286static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
287{
288 p->key = *(p->p = v);
289 p->bh = bh;
290}
291
292/**
293 * ext4_block_to_path - parse the block number into array of offsets
294 * @inode: inode in question (we are only interested in its superblock)
295 * @i_block: block number to be parsed
296 * @offsets: array to store the offsets in
297 * @boundary: set this non-zero if the referred-to block is likely to be
298 * followed (on disk) by an indirect block.
299 *
300 * To store the locations of file's data ext4 uses a data structure common
301 * for UNIX filesystems - tree of pointers anchored in the inode, with
302 * data blocks at leaves and indirect blocks in intermediate nodes.
303 * This function translates the block number into path in that tree -
304 * return value is the path length and @offsets[n] is the offset of
305 * pointer to (n+1)th node in the nth one. If @block is out of range
306 * (negative or too large) warning is printed and zero returned.
307 *
308 * Note: function doesn't find node addresses, so no IO is needed. All
309 * we need to know is the capacity of indirect blocks (taken from the
310 * inode->i_sb).
311 */
312
313/*
314 * Portability note: the last comparison (check that we fit into triple
315 * indirect block) is spelled differently, because otherwise on an
316 * architecture with 32-bit longs and 8Kb pages we might get into trouble
317 * if our filesystem had 8Kb blocks. We might use long long, but that would
318 * kill us on x86. Oh, well, at least the sign propagation does not matter -
319 * i_block would have to be negative in the very beginning, so we would not
320 * get there at all.
321 */
322
323static int ext4_block_to_path(struct inode *inode,
324 ext4_lblk_t i_block,
325 ext4_lblk_t offsets[4], int *boundary)
326{
327 int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);
328 int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);
329 const long direct_blocks = EXT4_NDIR_BLOCKS,
330 indirect_blocks = ptrs,
331 double_blocks = (1 << (ptrs_bits * 2));
332 int n = 0;
333 int final = 0;
334
335 if (i_block < direct_blocks) {
336 offsets[n++] = i_block;
337 final = direct_blocks;
338 } else if ((i_block -= direct_blocks) < indirect_blocks) {
339 offsets[n++] = EXT4_IND_BLOCK;
340 offsets[n++] = i_block;
341 final = ptrs;
342 } else if ((i_block -= indirect_blocks) < double_blocks) {
343 offsets[n++] = EXT4_DIND_BLOCK;
344 offsets[n++] = i_block >> ptrs_bits;
345 offsets[n++] = i_block & (ptrs - 1);
346 final = ptrs;
347 } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
348 offsets[n++] = EXT4_TIND_BLOCK;
349 offsets[n++] = i_block >> (ptrs_bits * 2);
350 offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
351 offsets[n++] = i_block & (ptrs - 1);
352 final = ptrs;
353 } else {
354 ext4_warning(inode->i_sb, "block %lu > max in inode %lu",
355 i_block + direct_blocks +
356 indirect_blocks + double_blocks, inode->i_ino);
357 }
358 if (boundary)
359 *boundary = final - 1 - (i_block & (ptrs - 1));
360 return n;
361}
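
The translation is pure arithmetic on the fanout: subtract each region's capacity until the block number fits, then split the remainder into one index per level. The same computation standalone, for 1 KiB blocks (ptrs = 256, ptrs_bits = 8); logical block 300 lands in the double-indirect tree at offsets {13, 0, 32}:

#include <stdio.h>

#define NDIR 12
#define IND  12
#define DIND 13
#define TIND 14

static int block_to_path(long i_block, int ptrs_bits, long offsets[4])
{
        const long ptrs = 1L << ptrs_bits;
        const long dbl = 1L << (ptrs_bits * 2);
        int n = 0;

        if (i_block < NDIR) {
                offsets[n++] = i_block;
        } else if ((i_block -= NDIR) < ptrs) {
                offsets[n++] = IND;
                offsets[n++] = i_block;
        } else if ((i_block -= ptrs) < dbl) {
                offsets[n++] = DIND;
                offsets[n++] = i_block >> ptrs_bits;
                offsets[n++] = i_block & (ptrs - 1);
        } else if (((i_block -= dbl) >> (ptrs_bits * 2)) < ptrs) {
                offsets[n++] = TIND;
                offsets[n++] = i_block >> (ptrs_bits * 2);
                offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
                offsets[n++] = i_block & (ptrs - 1);
        }
        return n;  /* 0 means the block is out of range */
}

int main(void)
{
        long off[4];
        int n = block_to_path(300, 8, off);

        for (int i = 0; i < n; i++)
                printf("level %d: offset %ld\n", i, off[i]);
        return 0;
}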
362
363static int __ext4_check_blockref(const char *function, unsigned int line,
364 struct inode *inode,
365 __le32 *p, unsigned int max)
366{
367 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
368 __le32 *bref = p;
369 unsigned int blk;
370
371 while (bref < p+max) {
372 blk = le32_to_cpu(*bref++);
373 if (blk &&
374 unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
375 blk, 1))) {
376 es->s_last_error_block = cpu_to_le64(blk);
377 ext4_error_inode(inode, function, line, blk,
378 "invalid block");
379 return -EIO;
380 }
381 }
382 return 0;
383}
384
385
386#define ext4_check_indirect_blockref(inode, bh) \
387 __ext4_check_blockref(__func__, __LINE__, inode, \
388 (__le32 *)(bh)->b_data, \
389 EXT4_ADDR_PER_BLOCK((inode)->i_sb))
390
391#define ext4_check_inode_blockref(inode) \
392 __ext4_check_blockref(__func__, __LINE__, inode, \
393 EXT4_I(inode)->i_data, \
394 EXT4_NDIR_BLOCKS)
395
396/**
397 * ext4_get_branch - read the chain of indirect blocks leading to data
398 * @inode: inode in question
399 * @depth: depth of the chain (1 - direct pointer, etc.)
400 * @offsets: offsets of pointers in inode/indirect blocks
401 * @chain: place to store the result
402 * @err: here we store the error value
403 *
404 * Function fills the array of triples <key, p, bh> and returns %NULL
405 * if everything went OK or the pointer to the last filled triple
406 * (incomplete one) otherwise. Upon the return chain[i].key contains
407 * the number of (i+1)-th block in the chain (as it is stored in memory,
408 * i.e. little-endian 32-bit), chain[i].p contains the address of that
409 * number (it points into struct inode for i==0 and into the bh->b_data
410 * for i>0) and chain[i].bh points to the buffer_head of i-th indirect
411 * block for i>0 and NULL for i==0. In other words, it holds the block
412 * numbers of the chain, addresses they were taken from (and where we can
413 * verify that chain did not change) and buffer_heads hosting these
414 * numbers.
415 *
416 * Function stops when it stumbles upon zero pointer (absent block)
417 * (pointer to last triple returned, *@err == 0)
418 * or when it gets an IO error reading an indirect block
419 * (ditto, *@err == -EIO)
420 * or when it reads all @depth-1 indirect blocks successfully and finds
421 * the whole chain, all the way to the data (returns %NULL, *err == 0).
422 *
423 * Need to be called with
424 * down_read(&EXT4_I(inode)->i_data_sem)
425 */
426static Indirect *ext4_get_branch(struct inode *inode, int depth,
427 ext4_lblk_t *offsets,
428 Indirect chain[4], int *err)
429{
430 struct super_block *sb = inode->i_sb;
431 Indirect *p = chain;
432 struct buffer_head *bh;
433
434 *err = 0;
435 /* i_data is not going away, no lock needed */
436 add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets);
437 if (!p->key)
438 goto no_block;
439 while (--depth) {
440 bh = sb_getblk(sb, le32_to_cpu(p->key));
441 if (unlikely(!bh))
442 goto failure;
443
444 if (!bh_uptodate_or_lock(bh)) {
445 if (bh_submit_read(bh) < 0) {
446 put_bh(bh);
447 goto failure;
448 }
449 /* validate block references */
450 if (ext4_check_indirect_blockref(inode, bh)) {
451 put_bh(bh);
452 goto failure;
453 }
454 }
455
456 add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
457 /* Reader: end */
458 if (!p->key)
459 goto no_block;
460 }
461 return NULL;
462
463failure:
464 *err = -EIO;
465no_block:
466 return p;
467}
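
Each level of the walk yields one <address, key, buffer> triple, and the walk stops early at the first zero key (a hole). A toy version over in-memory "blocks"; read_block() is a hypothetical stand-in for sb_getblk() plus validation, and error paths are dropped:

#include <stdio.h>
#include <stdint.h>

struct indirect {
        uint32_t *p;    /* where the key was read from */
        uint32_t key;   /* block number of the next level */
        uint32_t *bh;   /* the "buffer" holding *p (NULL at level 0) */
};

/* Hypothetical block store: block number -> array of 4 pointers. */
static uint32_t blocks[8][4] = {
        [2] = { 3, 0, 0, 0 },  /* block 2, slot 0 -> block 3 */
        [3] = { 0, 7, 0, 0 },  /* block 3, slot 1 -> block 7 */
};

static uint32_t *read_block(uint32_t nr)
{
        return blocks[nr];
}

/* Returns NULL on full success, else the last (incomplete) triple. */
static struct indirect *get_branch(uint32_t *i_data, int depth,
                                   const int *offsets,
                                   struct indirect chain[4])
{
        struct indirect *p = chain;

        p->bh = NULL;
        p->key = *(p->p = i_data + *offsets);
        if (!p->key)
                return p;
        while (--depth) {
                uint32_t *bh = read_block(p->key);

                ++p;
                p->bh = bh;
                p->key = *(p->p = bh + *++offsets);
                if (!p->key)
                        return p;
        }
        return NULL;
}

int main(void)
{
        uint32_t i_data[15] = { [13] = 2 };  /* DIND slot points at block 2 */
        int offsets[3] = { 13, 0, 1 };       /* as block_to_path() computes */
        struct indirect chain[4];

        if (!get_branch(i_data, 3, offsets, chain))
                printf("mapped to physical block %u\n", chain[2].key);
        return 0;
}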
468
469/**
470 * ext4_find_near - find a place for allocation with sufficient locality
471 * @inode: owner
472 * @ind: descriptor of indirect block.
473 *
474 * This function returns the preferred place for block allocation.
475 * It is used when heuristic for sequential allocation fails.
476 * Rules are:
477 * + if there is a block to the left of our position - allocate near it.
478 * + if pointer will live in indirect block - allocate near that block.
479 * + if pointer will live in inode - allocate in the same
480 * cylinder group.
481 *
482 * In the latter case we colour the starting block by the callers PID to
483 * prevent it from clashing with concurrent allocations for a different inode
484 * in the same block group. The PID is used here so that functionally related
485 * files will be close-by on-disk.
486 *
487 * Caller must make sure that @ind is valid and will stay that way.
488 */
489static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
490{
491 struct ext4_inode_info *ei = EXT4_I(inode);
492 __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data;
493 __le32 *p;
494 ext4_fsblk_t bg_start;
495 ext4_fsblk_t last_block;
496 ext4_grpblk_t colour;
497 ext4_group_t block_group;
498 int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
499
500 /* Try to find previous block */
501 for (p = ind->p - 1; p >= start; p--) {
502 if (*p)
503 return le32_to_cpu(*p);
504 }
505
506 /* No such thing, so let's try location of indirect block */
507 if (ind->bh)
508 return ind->bh->b_blocknr;
509
510 /*
511 * It is going to be referred to from the inode itself? OK, just put it
512 * into the same cylinder group then.
513 */
514 block_group = ei->i_block_group;
515 if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
516 block_group &= ~(flex_size-1);
517 if (S_ISREG(inode->i_mode))
518 block_group++;
519 }
520 bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
521 last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
522
523 /*
524 * If we are doing delayed allocation, we don't need to take
525 * colour into account.
526 */
527 if (test_opt(inode->i_sb, DELALLOC))
528 return bg_start;
529
530 if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
531 colour = (current->pid % 16) *
532 (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
533 else
534 colour = (current->pid % 16) * ((last_block - bg_start) / 16);
535 return bg_start + colour;
536}
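
The heuristic has three rungs: the nearest allocated block to the left, then the indirect block that will hold the new pointer, then a PID-coloured slot inside the inode's group. A condensed sketch with illustrative group geometry:

#include <stdio.h>
#include <stdint.h>

#define BLOCKS_PER_GROUP 8192  /* illustrative geometry */

static uint64_t find_near(const uint32_t *start, const uint32_t *pos,
                          uint64_t ind_block, uint64_t bg_start, int pid)
{
        /* 1. nearest allocated block to the left of our position */
        for (const uint32_t *p = pos - 1; p >= start; p--)
                if (*p)
                        return *p;

        /* 2. the indirect block that will hold the new pointer */
        if (ind_block)
                return ind_block;

        /* 3. colour a slot in the inode's group by the caller's PID */
        return bg_start + (pid % 16) * (BLOCKS_PER_GROUP / 16);
}

int main(void)
{
        uint32_t map[4] = { 0, 500, 0, 0 };

        /* position = slot 3: slot 1 holds block 500, so allocate near it */
        printf("goal = %llu\n",
               (unsigned long long)find_near(map, map + 3, 0, 0, 1234));
        return 0;
}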
537
538/**
539 * ext4_find_goal - find a preferred place for allocation.
540 * @inode: owner
541 * @block: block we want
542 * @partial: pointer to the last triple within a chain
543 *
544 * Normally this function finds the preferred place for block allocation
545 * and returns it.
546 * Because this is only used for non-extent files, we limit the block nr
547 * to 32 bits.
548 */
549static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
550 Indirect *partial)
551{
552 ext4_fsblk_t goal;
553
554 /*
555 * XXX need to get goal block from mballoc's data structures
556 */
557
558 goal = ext4_find_near(inode, partial);
559 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
560 return goal;
561}
562
563/**
564 * ext4_blks_to_allocate - Look up the block map and count the number
565 * of direct blocks need to be allocated for the given branch.
566 *
567 * @branch: chain of indirect blocks
568 * @k: number of blocks need for indirect blocks
569 * @blks: number of data blocks to be mapped.
570 * @blocks_to_boundary: the offset in the indirect block
571 *
572 * return the total number of blocks to be allocated, including the
573 * direct and indirect blocks.
574 */
575static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
576 int blocks_to_boundary)
577{
578 unsigned int count = 0;
579
580 /*
581 * Simple case: if the [t,d]indirect block(s) have not been allocated
582 * yet, it's clear the blocks on that path have not been allocated
583 */
584 if (k > 0) {
585 /* right now we don't handle cross boundary allocation */
586 if (blks < blocks_to_boundary + 1)
587 count += blks;
588 else
589 count += blocks_to_boundary + 1;
590 return count;
591 }
592
593 count++;
594 while (count < blks && count <= blocks_to_boundary &&
595 le32_to_cpu(*(branch[0].p + count)) == 0) {
596 count++;
597 }
598 return count;
599}
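
With no indirect blocks missing (k == 0), the count starts at the first direct block and extends only across slots that are still zero and still inside the boundary. The counting in standalone form:

#include <stdio.h>
#include <stdint.h>

static unsigned blks_to_allocate(const uint32_t *direct_p, int k,
                                 unsigned blks, unsigned blocks_to_boundary)
{
        unsigned count = 0;

        if (k > 0) {
                /* indirect blocks missing: never cross the boundary */
                return blks < blocks_to_boundary + 1 ?
                       blks : blocks_to_boundary + 1;
        }
        count++;
        while (count < blks && count <= blocks_to_boundary &&
               direct_p[count] == 0)
                count++;
        return count;
}

int main(void)
{
        /* slot 0 is being allocated; slot 2 is already mapped */
        uint32_t slots[4] = { 0, 0, 777, 0 };

        printf("%u blocks to allocate\n", blks_to_allocate(slots, 0, 4, 3));
        return 0;
}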
600
601/**
602 * ext4_alloc_blocks: multiple allocate blocks needed for a branch
603 * @handle: handle for this transaction
604 * @inode: inode which needs allocated blocks
605 * @iblock: the logical block to start allocated at
606 * @goal: preferred physical block of allocation
607 * @indirect_blks: the number of blocks need to allocate for indirect
608 * blocks
609 * @blks: number of desired blocks
610 * @new_blocks: on return it will store the new block numbers for
611 * the indirect blocks(if needed) and the first direct block,
612 * @err: on return it will store the error code
613 *
614 * This function will return the number of blocks allocated as
615 * requested by the passed-in parameters.
616 */
617static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
618 ext4_lblk_t iblock, ext4_fsblk_t goal,
619 int indirect_blks, int blks,
620 ext4_fsblk_t new_blocks[4], int *err)
621{
622 struct ext4_allocation_request ar;
623 int target, i;
624 unsigned long count = 0, blk_allocated = 0;
625 int index = 0;
626 ext4_fsblk_t current_block = 0;
627 int ret = 0;
628
629 /*
630 * Here we try to allocate the requested multiple blocks at once,
631 * on a best-effort basis.
632 * To build a branch, we should allocate blocks for
633 * the indirect blocks (if not allocated yet), and at least
634 * the first direct block of this branch. That's the
635 * minimum number of blocks we need to allocate (required).
636 */
637 /* first we try to allocate the indirect blocks */
638 target = indirect_blks;
639 while (target > 0) {
640 count = target;
641 /* allocating blocks for indirect blocks and direct blocks */
642 current_block = ext4_new_meta_blocks(handle, inode, goal,
643 0, &count, err);
644 if (*err)
645 goto failed_out;
646
647 if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) {
648 EXT4_ERROR_INODE(inode,
649 "current_block %llu + count %lu > %d!",
650 current_block, count,
651 EXT4_MAX_BLOCK_FILE_PHYS);
652 *err = -EIO;
653 goto failed_out;
654 }
655
656 target -= count;
657 /* allocate blocks for indirect blocks */
658 while (index < indirect_blks && count) {
659 new_blocks[index++] = current_block++;
660 count--;
661 }
662 if (count > 0) {
663 /*
664 * save the new block number
665 * for the first direct block
666 */
667 new_blocks[index] = current_block;
668 printk(KERN_INFO "%s returned more blocks than "
669 "requested\n", __func__);
670 WARN_ON(1);
671 break;
672 }
673 }
674
675 target = blks - count ;
676 blk_allocated = count;
677 if (!target)
678 goto allocated;
679 /* Now allocate data blocks */
680 memset(&ar, 0, sizeof(ar));
681 ar.inode = inode;
682 ar.goal = goal;
683 ar.len = target;
684 ar.logical = iblock;
685 if (S_ISREG(inode->i_mode))
686 /* enable in-core preallocation only for regular files */
687 ar.flags = EXT4_MB_HINT_DATA;
688
689 current_block = ext4_mb_new_blocks(handle, &ar, err);
690 if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) {
691 EXT4_ERROR_INODE(inode,
692 "current_block %llu + ar.len %d > %d!",
693 current_block, ar.len,
694 EXT4_MAX_BLOCK_FILE_PHYS);
695 *err = -EIO;
696 goto failed_out;
697 }
698
699 if (*err && (target == blks)) {
700 /*
701 * if the allocation failed and we didn't allocate
702 * any blocks before
703 */
704 goto failed_out;
705 }
706 if (!*err) {
707 if (target == blks) {
708 /*
709 * save the new block number
710 * for the first direct block
711 */
712 new_blocks[index] = current_block;
713 }
714 blk_allocated += ar.len;
715 }
716allocated:
717 /* total number of blocks allocated for direct blocks */
718 ret = blk_allocated;
719 *err = 0;
720 return ret;
721failed_out:
722 for (i = 0; i < index; i++)
723 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
724 return ret;
725}
726
727/**
728 * ext4_alloc_branch - allocate and set up a chain of blocks.
729 * @handle: handle for this transaction
730 * @inode: owner
731 * @indirect_blks: number of allocated indirect blocks
732 * @blks: number of allocated direct blocks
733 * @goal: preferred place for allocation
734 * @offsets: offsets (in the blocks) to store the pointers to next.
735 * @branch: place to store the chain in.
736 *
737 * This function allocates blocks, zeroes out all but the last one,
738 * links them into chain and (if we are synchronous) writes them to disk.
739 * In other words, it prepares a branch that can be spliced onto the
740 * inode. It stores the information about that chain in the branch[], in
741 * the same format as ext4_get_branch() would do. We are calling it after
742 * we had read the existing part of chain and partial points to the last
743 * triple of that (one with zero ->key). Upon the exit we have the same
744 * picture as after the successful ext4_get_block(), except that in one
745 * place chain is disconnected - *branch->p is still zero (we did not
746 * set the last link), but branch->key contains the number that should
747 * be placed into *branch->p to fill that gap.
748 *
749 * If allocation fails we free all blocks we've allocated (and forget
750 * their buffer_heads) and return the error value from the failed
751 * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain
752 * as described above and return 0.
753 */
754static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
755 ext4_lblk_t iblock, int indirect_blks,
756 int *blks, ext4_fsblk_t goal,
757 ext4_lblk_t *offsets, Indirect *branch)
758{
759 int blocksize = inode->i_sb->s_blocksize;
760 int i, n = 0;
761 int err = 0;
762 struct buffer_head *bh;
763 int num;
764 ext4_fsblk_t new_blocks[4];
765 ext4_fsblk_t current_block;
766
767 num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks,
768 *blks, new_blocks, &err);
769 if (err)
770 return err;
771
772 branch[0].key = cpu_to_le32(new_blocks[0]);
773 /*
774 * metadata blocks and data blocks are allocated.
775 */
776 for (n = 1; n <= indirect_blks; n++) {
777 /*
778 * Get buffer_head for parent block, zero it out
779 * and set the pointer to new one, then send
780 * parent to disk.
781 */
782 bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
783 if (unlikely(!bh)) {
784 err = -EIO;
785 goto failed;
786 }
787
788 branch[n].bh = bh;
789 lock_buffer(bh);
790 BUFFER_TRACE(bh, "call get_create_access");
791 err = ext4_journal_get_create_access(handle, bh);
792 if (err) {
793 /* Don't brelse(bh) here; it's done in
794 * ext4_journal_forget() below */
795 unlock_buffer(bh);
796 goto failed;
797 }
798
799 memset(bh->b_data, 0, blocksize);
800 branch[n].p = (__le32 *) bh->b_data + offsets[n];
801 branch[n].key = cpu_to_le32(new_blocks[n]);
802 *branch[n].p = branch[n].key;
803 if (n == indirect_blks) {
804 current_block = new_blocks[n];
805 /*
806 * End of chain, update the last new metablock of
807 * the chain to point to the new allocated
808 * data blocks numbers
809 */
810 for (i = 1; i < num; i++)
811 *(branch[n].p + i) = cpu_to_le32(++current_block);
812 }
813 BUFFER_TRACE(bh, "marking uptodate");
814 set_buffer_uptodate(bh);
815 unlock_buffer(bh);
816
817 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
818 err = ext4_handle_dirty_metadata(handle, inode, bh);
819 if (err)
820 goto failed;
821 }
822 *blks = num;
823 return err;
824failed:
825 /* Allocation failed, free what we already allocated */
826 ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0);
827 for (i = 1; i <= n ; i++) {
828 /*
829 * branch[i].bh is newly allocated, so there is no
830 * need to revoke the block, which is why we don't
831 * need to set EXT4_FREE_BLOCKS_METADATA.
832 */
833 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1,
834 EXT4_FREE_BLOCKS_FORGET);
835 }
836 for (i = n+1; i < indirect_blks; i++)
837 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
838
839 ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0);
840
841 return err;
842}
843
844/**
845 * ext4_splice_branch - splice the allocated branch onto inode.
846 * @handle: handle for this transaction
847 * @inode: owner
848 * @block: (logical) number of block we are adding
849 * @chain: chain of indirect blocks (with a missing link - see
850 * ext4_alloc_branch)
851 * @where: location of missing link
852 * @num: number of indirect blocks we are adding
853 * @blks: number of direct blocks we are adding
854 *
855 * This function fills the missing link and does all housekeeping needed in
856 * inode (->i_blocks, etc.). In case of success we end up with the full
857 * chain to new block and return 0.
858 */
859static int ext4_splice_branch(handle_t *handle, struct inode *inode,
860 ext4_lblk_t block, Indirect *where, int num,
861 int blks)
862{
863 int i;
864 int err = 0;
865 ext4_fsblk_t current_block;
866
867 /*
868 * If we're splicing into a [td]indirect block (as opposed to the
869 * inode) then we need to get write access to the [td]indirect block
870 * before the splice.
871 */
872 if (where->bh) {
873 BUFFER_TRACE(where->bh, "get_write_access");
874 err = ext4_journal_get_write_access(handle, where->bh);
875 if (err)
876 goto err_out;
877 }
878 /* That's it */
879
880 *where->p = where->key;
881
882 /*
883 * Update the host buffer_head or inode to point to the just-allocated
884 * direct blocks
885 */
886 if (num == 0 && blks > 1) {
887 current_block = le32_to_cpu(where->key) + 1;
888 for (i = 1; i < blks; i++)
889 *(where->p + i) = cpu_to_le32(current_block++);
890 }
891
892 /* We are done with atomic stuff, now do the rest of housekeeping */
893 /* had we spliced it onto indirect block? */
894 if (where->bh) {
895 /*
896 * If we spliced it onto an indirect block, we haven't
897 * altered the inode. Note however that if it is being spliced
898 * onto an indirect block at the very end of the file (the
899 * file is growing) then we *will* alter the inode to reflect
900 * the new i_size. But that is not done here - it is done in
901 * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.
902 */
903 jbd_debug(5, "splicing indirect only\n");
904 BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata");
905 err = ext4_handle_dirty_metadata(handle, inode, where->bh);
906 if (err)
907 goto err_out;
908 } else {
909 /*
910 * OK, we spliced it into the inode itself on a direct block.
911 */
912 ext4_mark_inode_dirty(handle, inode);
913 jbd_debug(5, "splicing direct\n");
914 }
915 return err;
916
917err_out:
918 for (i = 1; i <= num; i++) {
919 /*
920 * branch[i].bh is newly allocated, so there is no
921 * need to revoke the block, which is why we don't
922 * need to set EXT4_FREE_BLOCKS_METADATA.
923 */
924 ext4_free_blocks(handle, inode, where[i].bh, 0, 1,
925 EXT4_FREE_BLOCKS_FORGET);
926 }
927 ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key),
928 blks, 0);
929
930 return err;
931}
932
933/*
934 * The ext4_ind_map_blocks() function handles non-extent inodes
935 * (i.e., using the traditional indirect/double-indirect i_blocks
936 * scheme) for ext4_map_blocks().
937 *
938 * Allocation strategy is simple: if we have to allocate something, we will
939 * have to go the whole way to leaf. So let's do it before attaching anything
940 * to tree, set linkage between the newborn blocks, write them if sync is
941 * required, recheck the path, free and repeat if check fails, otherwise
942 * set the last missing link (that will protect us from any truncate-generated
943 * removals - all blocks on the path are immune now) and possibly force the
944 * write on the parent block.
945 * That has a nice additional property: no special recovery from the failed
946 * allocations is needed - we simply release blocks and do not touch anything
947 * reachable from inode.
948 *
949 * `handle' can be NULL if create == 0.
950 *
951 * return > 0, # of blocks mapped or allocated.
952 * return = 0, if plain lookup failed.
953 * return < 0, error case.
954 *
955 * The ext4_ind_map_blocks() function should be called with
956 * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem
957 * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or
958 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system
959 * blocks.
960 */
961static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
962 struct ext4_map_blocks *map,
963 int flags)
964{
965 int err = -EIO;
966 ext4_lblk_t offsets[4];
967 Indirect chain[4];
968 Indirect *partial;
969 ext4_fsblk_t goal;
970 int indirect_blks;
971 int blocks_to_boundary = 0;
972 int depth;
973 int count = 0;
974 ext4_fsblk_t first_block = 0;
975
976 trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
977 J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
978 J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
979 depth = ext4_block_to_path(inode, map->m_lblk, offsets,
980 &blocks_to_boundary);
981
982 if (depth == 0)
983 goto out;
984
985 partial = ext4_get_branch(inode, depth, offsets, chain, &err);
986
987 /* Simplest case - block found, no allocation needed */
988 if (!partial) {
989 first_block = le32_to_cpu(chain[depth - 1].key);
990 count++;
991 /*map more blocks*/
992 while (count < map->m_len && count <= blocks_to_boundary) {
993 ext4_fsblk_t blk;
994
995 blk = le32_to_cpu(*(chain[depth-1].p + count));
996
997 if (blk == first_block + count)
998 count++;
999 else
1000 break;
1001 }
1002 goto got_it;
1003 }
1004
1005 /* Next simple case - plain lookup or failed read of indirect block */
1006 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO)
1007 goto cleanup;
1008
1009 /*
1010 * Okay, we need to do block allocation.
1011 */
1012 goal = ext4_find_goal(inode, map->m_lblk, partial);
1013
1014 /* the number of blocks need to allocate for [d,t]indirect blocks */
1015 indirect_blks = (chain + depth) - partial - 1;
1016
1017 /*
1018 * Next look up the indirect map to count the total number of
1019 * direct blocks to allocate for this branch.
1020 */
1021 count = ext4_blks_to_allocate(partial, indirect_blks,
1022 map->m_len, blocks_to_boundary);
1023 /*
1024 * Block out ext4_truncate while we alter the tree
1025 */
1026 err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks,
1027 &count, goal,
1028 offsets + (partial - chain), partial);
1029
1030 /*
1031 * The ext4_splice_branch call will free and forget any buffers
1032 * on the new chain if there is a failure, but that risks using
1033 * up transaction credits, especially for bitmaps where the
1034 * credits cannot be returned. Can we handle this somehow? We
1035 * may need to return -EAGAIN upwards in the worst case. --sct
1036 */
1037 if (!err)
1038 err = ext4_splice_branch(handle, inode, map->m_lblk,
1039 partial, indirect_blks, count);
1040 if (err)
1041 goto cleanup;
1042
1043 map->m_flags |= EXT4_MAP_NEW;
1044
1045 ext4_update_inode_fsync_trans(handle, inode, 1);
1046got_it:
1047 map->m_flags |= EXT4_MAP_MAPPED;
1048 map->m_pblk = le32_to_cpu(chain[depth-1].key);
1049 map->m_len = count;
1050 if (count > blocks_to_boundary)
1051 map->m_flags |= EXT4_MAP_BOUNDARY;
1052 err = count;
1053 /* Clean up and exit */
1054 partial = chain + depth - 1; /* the whole chain */
1055cleanup:
1056 while (partial > chain) {
1057 BUFFER_TRACE(partial->bh, "call brelse");
1058 brelse(partial->bh);
1059 partial--;
1060 }
1061out:
1062 trace_ext4_ind_map_blocks_exit(inode, map->m_lblk,
1063 map->m_pblk, map->m_len, err);
1064 return err;
1065}
1066
1067#ifdef CONFIG_QUOTA 241#ifdef CONFIG_QUOTA
1068qsize_t *ext4_get_reserved_space(struct inode *inode) 242qsize_t *ext4_get_reserved_space(struct inode *inode)
1069{ 243{
@@ -1073,33 +247,6 @@ qsize_t *ext4_get_reserved_space(struct inode *inode)
1073 247
1074/* 248/*
1075 * Calculate the number of metadata blocks needed to reserve 249 * Calculate the number of metadata blocks needed to reserve
1076 * to allocate a new block at @lblock for a non-extent-based file
1077 */
1078static int ext4_indirect_calc_metadata_amount(struct inode *inode,
1079 sector_t lblock)
1080{
1081 struct ext4_inode_info *ei = EXT4_I(inode);
1082 sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1);
1083 int blk_bits;
1084
1085 if (lblock < EXT4_NDIR_BLOCKS)
1086 return 0;
1087
1088 lblock -= EXT4_NDIR_BLOCKS;
1089
1090 if (ei->i_da_metadata_calc_len &&
1091 (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) {
1092 ei->i_da_metadata_calc_len++;
1093 return 0;
1094 }
1095 ei->i_da_metadata_calc_last_lblock = lblock & dind_mask;
1096 ei->i_da_metadata_calc_len = 1;
1097 blk_bits = order_base_2(lblock);
1098 return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
1099}
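
The point of the removed helper is the caching: metadata is charged once when delayed allocation first touches a new double-indirect "window" of the file, and i_da_metadata_calc_len amortizes every later block in the same window down to zero. A reduced, single-threaded model with 256-pointer blocks assumed and order_base_2() reimplemented:

#include <stdio.h>

#define NDIR      12
#define PTRS_BITS 8  /* 256 pointers per block */

static long last_lblock = -1;  /* models i_da_metadata_calc_last_lblock */
static int  calc_len;          /* models i_da_metadata_calc_len */

static int order_base_2(unsigned long n)  /* smallest b with 2^b >= n */
{
        int b = 0;

        while ((1UL << b) < n)
                b++;
        return b;
}

static int ind_calc_metadata_amount(long lblock)
{
        long dind_mask = ~((1L << PTRS_BITS) - 1);

        if (lblock < NDIR)
                return 0;  /* direct blocks need no metadata */
        lblock -= NDIR;
        if (calc_len && (lblock & dind_mask) == last_lblock) {
                calc_len++;  /* same window: already charged */
                return 0;
        }
        last_lblock = lblock & dind_mask;
        calc_len = 1;
        return order_base_2(lblock) / PTRS_BITS + 1;
}

int main(void)
{
        for (long b = 10; b < 16; b++)
                printf("lblock %ld -> %d metadata block(s)\n",
                       b, ind_calc_metadata_amount(b));
        return 0;
}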
1100
1101/*
1102 * Calculate the number of metadata blocks needed to reserve
1103 * to allocate a block located at @lblock 250 * to allocate a block located at @lblock
1104 */ 251 */
1105static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) 252static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
@@ -1107,7 +254,7 @@ static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
1107 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 254 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
1108 return ext4_ext_calc_metadata_amount(inode, lblock); 255 return ext4_ext_calc_metadata_amount(inode, lblock);
1109 256
1110 return ext4_indirect_calc_metadata_amount(inode, lblock); 257 return ext4_ind_calc_metadata_amount(inode, lblock);
1111} 258}
1112 259
1113/* 260/*
@@ -1589,16 +736,6 @@ static int do_journal_get_write_access(handle_t *handle,
1589 return ret; 736 return ret;
1590} 737}
1591 738
1592/*
1593 * Truncate blocks that were not used by write. We have to truncate the
1594 * pagecache as well so that corresponding buffers get properly unmapped.
1595 */
1596static void ext4_truncate_failed_write(struct inode *inode)
1597{
1598 truncate_inode_pages(inode->i_mapping, inode->i_size);
1599 ext4_truncate(inode);
1600}
1601
1602static int ext4_get_block_write(struct inode *inode, sector_t iblock, 739static int ext4_get_block_write(struct inode *inode, sector_t iblock,
1603 struct buffer_head *bh_result, int create); 740 struct buffer_head *bh_result, int create);
1604static int ext4_write_begin(struct file *file, struct address_space *mapping, 741static int ext4_write_begin(struct file *file, struct address_space *mapping,
@@ -1849,6 +986,8 @@ static int ext4_journalled_write_end(struct file *file,
1849 from = pos & (PAGE_CACHE_SIZE - 1); 986 from = pos & (PAGE_CACHE_SIZE - 1);
1850 to = from + len; 987 to = from + len;
1851 988
989 BUG_ON(!ext4_handle_valid(handle));
990
1852 if (copied < len) { 991 if (copied < len) {
1853 if (!PageUptodate(page)) 992 if (!PageUptodate(page))
1854 copied = 0; 993 copied = 0;
@@ -1863,6 +1002,7 @@ static int ext4_journalled_write_end(struct file *file,
1863 if (new_i_size > inode->i_size) 1002 if (new_i_size > inode->i_size)
1864 i_size_write(inode, pos+copied); 1003 i_size_write(inode, pos+copied);
1865 ext4_set_inode_state(inode, EXT4_STATE_JDATA); 1004 ext4_set_inode_state(inode, EXT4_STATE_JDATA);
1005 EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
1866 if (new_i_size > EXT4_I(inode)->i_disksize) { 1006 if (new_i_size > EXT4_I(inode)->i_disksize) {
1867 ext4_update_i_disksize(inode, new_i_size); 1007 ext4_update_i_disksize(inode, new_i_size);
1868 ret2 = ext4_mark_inode_dirty(handle, inode); 1008 ret2 = ext4_mark_inode_dirty(handle, inode);
@@ -2148,7 +1288,12 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
2148 else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT)) 1288 else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT))
2149 err = ext4_bio_write_page(&io_submit, page, 1289 err = ext4_bio_write_page(&io_submit, page,
2150 len, mpd->wbc); 1290 len, mpd->wbc);
2151 else 1291 else if (buffer_uninit(page_bufs)) {
1292 ext4_set_bh_endio(page_bufs, inode);
1293 err = block_write_full_page_endio(page,
1294 noalloc_get_block_write,
1295 mpd->wbc, ext4_end_io_buffer_write);
1296 } else
2152 err = block_write_full_page(page, 1297 err = block_write_full_page(page,
2153 noalloc_get_block_write, mpd->wbc); 1298 noalloc_get_block_write, mpd->wbc);
2154 1299
@@ -2564,6 +1709,8 @@ static int __ext4_journalled_writepage(struct page *page,
2564 goto out; 1709 goto out;
2565 } 1710 }
2566 1711
1712 BUG_ON(!ext4_handle_valid(handle));
1713
2567 ret = walk_page_buffers(handle, page_bufs, 0, len, NULL, 1714 ret = walk_page_buffers(handle, page_bufs, 0, len, NULL,
2568 do_journal_get_write_access); 1715 do_journal_get_write_access);
2569 1716
@@ -2571,6 +1718,7 @@ static int __ext4_journalled_writepage(struct page *page,
2571 write_end_fn); 1718 write_end_fn);
2572 if (ret == 0) 1719 if (ret == 0)
2573 ret = err; 1720 ret = err;
1721 EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
2574 err = ext4_journal_stop(handle); 1722 err = ext4_journal_stop(handle);
2575 if (!ret) 1723 if (!ret)
2576 ret = err; 1724 ret = err;
@@ -2741,7 +1889,7 @@ static int write_cache_pages_da(struct address_space *mapping,
2741 index = wbc->range_start >> PAGE_CACHE_SHIFT; 1889 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2742 end = wbc->range_end >> PAGE_CACHE_SHIFT; 1890 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2743 1891
2744 if (wbc->sync_mode == WB_SYNC_ALL) 1892 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
2745 tag = PAGECACHE_TAG_TOWRITE; 1893 tag = PAGECACHE_TAG_TOWRITE;
2746 else 1894 else
2747 tag = PAGECACHE_TAG_DIRTY; 1895 tag = PAGECACHE_TAG_DIRTY;
@@ -2973,7 +2121,7 @@ static int ext4_da_writepages(struct address_space *mapping,
2973 } 2121 }
2974 2122
2975retry: 2123retry:
2976 if (wbc->sync_mode == WB_SYNC_ALL) 2124 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
2977 tag_pages_for_writeback(mapping, index, end); 2125 tag_pages_for_writeback(mapping, index, end);
2978 2126
2979 while (!ret && wbc->nr_to_write > 0) { 2127 while (!ret && wbc->nr_to_write > 0) {
@@ -3450,112 +2598,6 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
3450} 2598}
3451 2599
3452/* 2600/*
3453 * O_DIRECT for ext3 (or indirect map) based files
3454 *
3455 * If the O_DIRECT write will extend the file then add this inode to the
3456 * orphan list. So recovery will truncate it back to the original size
3457 * if the machine crashes during the write.
3458 *
3459 * If the O_DIRECT write is instantiating holes inside i_size and the machine
3460 * crashes then stale disk data _may_ be exposed inside the file. But current
3461 * VFS code falls back into buffered path in that case so we are safe.
3462 */
3463static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
3464 const struct iovec *iov, loff_t offset,
3465 unsigned long nr_segs)
3466{
3467 struct file *file = iocb->ki_filp;
3468 struct inode *inode = file->f_mapping->host;
3469 struct ext4_inode_info *ei = EXT4_I(inode);
3470 handle_t *handle;
3471 ssize_t ret;
3472 int orphan = 0;
3473 size_t count = iov_length(iov, nr_segs);
3474 int retries = 0;
3475
3476 if (rw == WRITE) {
3477 loff_t final_size = offset + count;
3478
3479 if (final_size > inode->i_size) {
3480 /* Credits for sb + inode write */
3481 handle = ext4_journal_start(inode, 2);
3482 if (IS_ERR(handle)) {
3483 ret = PTR_ERR(handle);
3484 goto out;
3485 }
3486 ret = ext4_orphan_add(handle, inode);
3487 if (ret) {
3488 ext4_journal_stop(handle);
3489 goto out;
3490 }
3491 orphan = 1;
3492 ei->i_disksize = inode->i_size;
3493 ext4_journal_stop(handle);
3494 }
3495 }
3496
3497retry:
3498 if (rw == READ && ext4_should_dioread_nolock(inode))
3499 ret = __blockdev_direct_IO(rw, iocb, inode,
3500 inode->i_sb->s_bdev, iov,
3501 offset, nr_segs,
3502 ext4_get_block, NULL, NULL, 0);
3503 else {
3504 ret = blockdev_direct_IO(rw, iocb, inode, iov,
3505 offset, nr_segs, ext4_get_block);
3506
3507 if (unlikely((rw & WRITE) && ret < 0)) {
3508 loff_t isize = i_size_read(inode);
3509 loff_t end = offset + iov_length(iov, nr_segs);
3510
3511 if (end > isize)
3512 ext4_truncate_failed_write(inode);
3513 }
3514 }
3515 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
3516 goto retry;
3517
3518 if (orphan) {
3519 int err;
3520
3521 /* Credits for sb + inode write */
3522 handle = ext4_journal_start(inode, 2);
3523 if (IS_ERR(handle)) {
3524 /* This is really bad luck. We've written the data
3525 * but cannot extend i_size. Bail out and pretend
3526 * the write failed... */
3527 ret = PTR_ERR(handle);
3528 if (inode->i_nlink)
3529 ext4_orphan_del(NULL, inode);
3530
3531 goto out;
3532 }
3533 if (inode->i_nlink)
3534 ext4_orphan_del(handle, inode);
3535 if (ret > 0) {
3536 loff_t end = offset + ret;
3537 if (end > inode->i_size) {
3538 ei->i_disksize = end;
3539 i_size_write(inode, end);
3540 /*
3541 * We're going to return a positive `ret'
3542 * here due to non-zero-length I/O, so there's
3543 * no way of reporting error returns from
3544 * ext4_mark_inode_dirty() to userspace. So
3545 * ignore it.
3546 */
3547 ext4_mark_inode_dirty(handle, inode);
3548 }
3549 }
3550 err = ext4_journal_stop(handle);
3551 if (ret == 0)
3552 ret = err;
3553 }
3554out:
3555 return ret;
3556}
3557
3558/*
3559 * ext4_get_block used when preparing for a DIO write or buffer write. 2601 * ext4_get_block used when preparing for a DIO write or buffer write.
3560 * We allocate an uninitialized extent if blocks haven't been allocated. 2602 * We allocate an uninitialized extent if blocks haven't been allocated.
3561 * The extent will be converted to initialized after the IO is complete. 2603 * The extent will be converted to initialized after the IO is complete.
@@ -3638,8 +2680,15 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
3638 goto out; 2680 goto out;
3639 } 2681 }
3640 2682
3641 io_end->flag = EXT4_IO_END_UNWRITTEN; 2683 /*
2684 * It may be over-defensive to check EXT4_IO_END_UNWRITTEN here,
2685 * but being more careful is always safe for future changes.
2686 */
3642 inode = io_end->inode; 2687 inode = io_end->inode;
2688 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
2689 io_end->flag |= EXT4_IO_END_UNWRITTEN;
2690 atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
2691 }
3643 2692
3644 /* Add the io_end to per-inode completed io list*/ 2693 /* Add the io_end to per-inode completed io list*/
3645 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); 2694 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
@@ -4033,383 +3082,6 @@ unlock:
4033 return err; 3082 return err;
4034} 3083}
4035 3084
4036/*
4037 * Probably it should be a library function... search for first non-zero word
4038 * or memcmp with zero_page, whatever is better for particular architecture.
4039 * Linus?
4040 */
4041static inline int all_zeroes(__le32 *p, __le32 *q)
4042{
4043 while (p < q)
4044 if (*p++)
4045 return 0;
4046 return 1;
4047}
4048
4049/**
4050 * ext4_find_shared - find the indirect blocks for partial truncation.
4051 * @inode: inode in question
4052 * @depth: depth of the affected branch
4053 * @offsets: offsets of pointers in that branch (see ext4_block_to_path)
4054 * @chain: place to store the pointers to partial indirect blocks
4055 * @top: place to the (detached) top of branch
4056 *
4057 * This is a helper function used by ext4_truncate().
4058 *
4059 * When we do truncate() we may have to clean the ends of several
4060 * indirect blocks but leave the blocks themselves alive. Block is
4061 * partially truncated if some data below the new i_size is referred
4062 * from it (and it is on the path to the first completely truncated
4063 * data block, indeed). We have to free the top of that path along
4064 * with everything to the right of the path. Since no allocation
4065 * past the truncation point is possible until ext4_truncate()
4066 * finishes, we may safely do the latter, but top of branch may
4067 * require special attention - pageout below the truncation point
4068 * might try to populate it.
4069 *
4070 * We atomically detach the top of branch from the tree, store the
4071 * block number of its root in *@top, pointers to buffer_heads of
4072 * partially truncated blocks - in @chain[].bh and pointers to
4073 * their last elements that should not be removed - in
4074 * @chain[].p. Return value is the pointer to last filled element
4075 * of @chain.
4076 *
4077 * The work left to caller to do the actual freeing of subtrees:
4078 * a) free the subtree starting from *@top
4079 * b) free the subtrees whose roots are stored in
4080 * (@chain[i].p+1 .. end of @chain[i].bh->b_data)
4081 * c) free the subtrees growing from the inode past the @chain[0].
4082 * (no partially truncated stuff there). */
4083
4084static Indirect *ext4_find_shared(struct inode *inode, int depth,
4085 ext4_lblk_t offsets[4], Indirect chain[4],
4086 __le32 *top)
4087{
4088 Indirect *partial, *p;
4089 int k, err;
4090
4091 *top = 0;
4092 /* Make k index the deepest non-null offset + 1 */
4093 for (k = depth; k > 1 && !offsets[k-1]; k--)
4094 ;
4095 partial = ext4_get_branch(inode, k, offsets, chain, &err);
4096 /* Writer: pointers */
4097 if (!partial)
4098 partial = chain + k-1;
4099 /*
4100 * If the branch acquired continuation since we've looked at it -
4101 * fine, it should all survive and (new) top doesn't belong to us.
4102 */
4103 if (!partial->key && *partial->p)
4104 /* Writer: end */
4105 goto no_top;
4106 for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--)
4107 ;
4108 /*
4109 * OK, we've found the last block that must survive. The rest of our
4110 * branch should be detached before unlocking. However, if that rest
4111 * of branch is all ours and does not grow immediately from the inode
4112 * it's easier to cheat and just decrement partial->p.
4113 */
4114 if (p == chain + k - 1 && p > chain) {
4115 p->p--;
4116 } else {
4117 *top = *p->p;
4118 /* Nope, don't do this in ext4. Must leave the tree intact */
4119#if 0
4120 *p->p = 0;
4121#endif
4122 }
4123 /* Writer: end */
4124
4125 while (partial > p) {
4126 brelse(partial->bh);
4127 partial--;
4128 }
4129no_top:
4130 return partial;
4131}
4132
4133/*
4134 * Zero a number of block pointers in either an inode or an indirect block.
4135 * If we restart the transaction we must again get write access to the
4136 * indirect block for further modification.
4137 *
4138 * We release `count' blocks on disk, but (last - first) may be greater
4139 * than `count' because there can be holes in there.
4140 *
4141 * Return 0 on success, 1 on invalid block range
4142 * and < 0 on fatal error.
4143 */
4144static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4145 struct buffer_head *bh,
4146 ext4_fsblk_t block_to_free,
4147 unsigned long count, __le32 *first,
4148 __le32 *last)
4149{
4150 __le32 *p;
4151 int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;
4152 int err;
4153
4154 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
4155 flags |= EXT4_FREE_BLOCKS_METADATA;
4156
4157 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
4158 count)) {
4159 EXT4_ERROR_INODE(inode, "attempt to clear invalid "
4160 "blocks %llu len %lu",
4161 (unsigned long long) block_to_free, count);
4162 return 1;
4163 }
4164
4165 if (try_to_extend_transaction(handle, inode)) {
4166 if (bh) {
4167 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
4168 err = ext4_handle_dirty_metadata(handle, inode, bh);
4169 if (unlikely(err))
4170 goto out_err;
4171 }
4172 err = ext4_mark_inode_dirty(handle, inode);
4173 if (unlikely(err))
4174 goto out_err;
4175 err = ext4_truncate_restart_trans(handle, inode,
4176 blocks_for_truncate(inode));
4177 if (unlikely(err))
4178 goto out_err;
4179 if (bh) {
4180 BUFFER_TRACE(bh, "retaking write access");
4181 err = ext4_journal_get_write_access(handle, bh);
4182 if (unlikely(err))
4183 goto out_err;
4184 }
4185 }
4186
4187 for (p = first; p < last; p++)
4188 *p = 0;
4189
4190 ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags);
4191 return 0;
4192out_err:
4193 ext4_std_error(inode->i_sb, err);
4194 return err;
4195}
4196
4197/**
4198 * ext4_free_data - free a list of data blocks
4199 * @handle: handle for this transaction
4200 * @inode: inode we are dealing with
4201 * @this_bh: indirect buffer_head which contains *@first and *@last
4202 * @first: array of block numbers
4203 * @last: points immediately past the end of array
4204 *
4205 * We are freeing all blocks referred from that array (numbers are stored as
4206 * little-endian 32-bit) and updating @inode->i_blocks appropriately.
4207 *
4208 * We accumulate contiguous runs of blocks to free. Conveniently, if these
4209 * blocks are contiguous then releasing them at one time will only affect one
4210 * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
4211 * actually use a lot of journal space.
4212 *
4213 * @this_bh will be %NULL if @first and @last point into the inode's direct
4214 * block pointers.
4215 */
4216static void ext4_free_data(handle_t *handle, struct inode *inode,
4217 struct buffer_head *this_bh,
4218 __le32 *first, __le32 *last)
4219{
4220 ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */
4221 unsigned long count = 0; /* Number of blocks in the run */
4222 __le32 *block_to_free_p = NULL; /* Pointer into inode/ind
4223 corresponding to
4224 block_to_free */
4225 ext4_fsblk_t nr; /* Current block # */
4226 __le32 *p; /* Pointer into inode/ind
4227 for current block */
4228 int err = 0;
4229
4230 if (this_bh) { /* For indirect block */
4231 BUFFER_TRACE(this_bh, "get_write_access");
4232 err = ext4_journal_get_write_access(handle, this_bh);
4233 /* Important: if we can't update the indirect pointers
4234 * to the blocks, we can't free them. */
4235 if (err)
4236 return;
4237 }
4238
4239 for (p = first; p < last; p++) {
4240 nr = le32_to_cpu(*p);
4241 if (nr) {
4242 /* accumulate blocks to free if they're contiguous */
4243 if (count == 0) {
4244 block_to_free = nr;
4245 block_to_free_p = p;
4246 count = 1;
4247 } else if (nr == block_to_free + count) {
4248 count++;
4249 } else {
4250 err = ext4_clear_blocks(handle, inode, this_bh,
4251 block_to_free, count,
4252 block_to_free_p, p);
4253 if (err)
4254 break;
4255 block_to_free = nr;
4256 block_to_free_p = p;
4257 count = 1;
4258 }
4259 }
4260 }
4261
4262 if (!err && count > 0)
4263 err = ext4_clear_blocks(handle, inode, this_bh, block_to_free,
4264 count, block_to_free_p, p);
4265 if (err < 0)
4266 /* fatal error */
4267 return;
4268
4269 if (this_bh) {
4270 BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata");
4271
4272 /*
4273 * The buffer head should have an attached journal head at this
4274 * point. However, if the data is corrupted and an indirect
4275 * block pointed to itself, it would have been detached when
4276 * the block was cleared. Check for this instead of OOPSing.
4277 */
4278 if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
4279 ext4_handle_dirty_metadata(handle, inode, this_bh);
4280 else
4281 EXT4_ERROR_INODE(inode,
4282 "circular indirect block detected at "
4283 "block %llu",
4284 (unsigned long long) this_bh->b_blocknr);
4285 }
4286}
4287
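ext4_free_data() above batches pointers into contiguous runs so that one ext4_clear_blocks() call covers each extent, touching at most a bitmap block or two per run. A compilable userspace sketch of the same run-accumulation loop, with flush_run() as a hypothetical stand-in for ext4_clear_blocks():

#include <stdint.h>
#include <stdio.h>

/* Stand-in for ext4_clear_blocks(): release one contiguous run. */
static void flush_run(uint32_t start, unsigned long count)
{
	printf("free %lu block(s) starting at %u\n", count, (unsigned) start);
}

int main(void)
{
	/* zero entries are holes, as in an indirect block */
	uint32_t blocks[] = { 100, 101, 102, 0, 200, 201, 0, 300 };
	uint32_t run_start = 0;
	unsigned long count = 0;
	size_t i;

	for (i = 0; i < sizeof(blocks) / sizeof(blocks[0]); i++) {
		uint32_t nr = blocks[i];

		if (!nr)
			continue;		/* skip the hole */
		if (count == 0) {
			run_start = nr;		/* open a new run */
			count = 1;
		} else if (nr == run_start + count) {
			count++;		/* extend the current run */
		} else {
			flush_run(run_start, count);
			run_start = nr;		/* start over at nr */
			count = 1;
		}
	}
	if (count)
		flush_run(run_start, count);	/* flush the trailing run */
	return 0;
}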
4288/**
4289 * ext4_free_branches - free an array of branches
4290 * @handle: JBD handle for this transaction
4291 * @inode: inode we are dealing with
4292 * @parent_bh: the buffer_head which contains *@first and *@last
4293 * @first: array of block numbers
4294 * @last: pointer immediately past the end of array
4295 * @depth: depth of the branches to free
4296 *
4297 * We are freeing all blocks referred from these branches (numbers are
4298 * stored as little-endian 32-bit) and updating @inode->i_blocks
4299 * appropriately.
4300 */
4301static void ext4_free_branches(handle_t *handle, struct inode *inode,
4302 struct buffer_head *parent_bh,
4303 __le32 *first, __le32 *last, int depth)
4304{
4305 ext4_fsblk_t nr;
4306 __le32 *p;
4307
4308 if (ext4_handle_is_aborted(handle))
4309 return;
4310
4311 if (depth--) {
4312 struct buffer_head *bh;
4313 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
4314 p = last;
4315 while (--p >= first) {
4316 nr = le32_to_cpu(*p);
4317 if (!nr)
4318 continue; /* A hole */
4319
4320 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
4321 nr, 1)) {
4322 EXT4_ERROR_INODE(inode,
4323 "invalid indirect mapped "
4324 "block %lu (level %d)",
4325 (unsigned long) nr, depth);
4326 break;
4327 }
4328
4329 /* Go read the buffer for the next level down */
4330 bh = sb_bread(inode->i_sb, nr);
4331
4332 /*
4333 * A read failure? Report error and clear slot
4334 * (should be rare).
4335 */
4336 if (!bh) {
4337 EXT4_ERROR_INODE_BLOCK(inode, nr,
4338 "Read failure");
4339 continue;
4340 }
4341
4342 /* This zaps the entire block. Bottom up. */
4343 BUFFER_TRACE(bh, "free child branches");
4344 ext4_free_branches(handle, inode, bh,
4345 (__le32 *) bh->b_data,
4346 (__le32 *) bh->b_data + addr_per_block,
4347 depth);
4348 brelse(bh);
4349
4350 /*
4351 * Everything below this pointer has been
4352 * released. Now let this top-of-subtree go.
4353 *
4354 * We want the freeing of this indirect block to be
4355 * atomic in the journal with the updating of the
4356 * bitmap block which owns it. So make some room in
4357 * the journal.
4358 *
4359 * We zero the parent pointer *after* freeing its
4360 * pointee in the bitmaps, so if extend_transaction()
4361 * for some reason fails to put the bitmap changes and
4362 * the release into the same transaction, recovery
4363 * will merely complain about releasing a free block,
4364 * rather than leaking blocks.
4365 */
4366 if (ext4_handle_is_aborted(handle))
4367 return;
4368 if (try_to_extend_transaction(handle, inode)) {
4369 ext4_mark_inode_dirty(handle, inode);
4370 ext4_truncate_restart_trans(handle, inode,
4371 blocks_for_truncate(inode));
4372 }
4373
4374 /*
4375 * The forget flag here is critical because if
4376 * we are journaling (and not doing data
4377 * journaling), we have to make sure a revoke
4378 * record is written to prevent the journal
4379 * replay from overwriting the (former)
4380 * indirect block if it gets reallocated as a
4381 * data block. This must happen in the same
4382 * transaction where the data blocks are
4383 * actually freed.
4384 */
4385 ext4_free_blocks(handle, inode, NULL, nr, 1,
4386 EXT4_FREE_BLOCKS_METADATA|
4387 EXT4_FREE_BLOCKS_FORGET);
4388
4389 if (parent_bh) {
4390 /*
4391 * The block which we have just freed is
4392 * pointed to by an indirect block: journal it
4393 */
4394 BUFFER_TRACE(parent_bh, "get_write_access");
4395 if (!ext4_journal_get_write_access(handle,
4396 parent_bh)){
4397 *p = 0;
4398 BUFFER_TRACE(parent_bh,
4399 "call ext4_handle_dirty_metadata");
4400 ext4_handle_dirty_metadata(handle,
4401 inode,
4402 parent_bh);
4403 }
4404 }
4405 }
4406 } else {
4407 /* We have reached the bottom of the tree. */
4408 BUFFER_TRACE(parent_bh, "free data blocks");
4409 ext4_free_data(handle, inode, parent_bh, first, last);
4410 }
4411}
4412
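ext4_free_branches() frees depth-first and bottom-up: every child subtree is released before the block that points to it, so the on-disk tree stays consistent at each step. A small userspace sketch of that recursion, with a toy two-child node standing in for an indirect block:

#include <stdio.h>

/* A toy "indirect block" with up to two children; depth counts the
 * remaining levels of indirection, as in ext4_free_branches(). */
struct blk {
	struct blk *child[2];
	int nr;			/* pretend block number */
};

static void free_branches(struct blk *b, int depth)
{
	int i;

	if (!b)
		return;		/* a hole */
	if (depth)
		for (i = 0; i < 2; i++)
			free_branches(b->child[i], depth - 1);
	printf("free block %d\n", b->nr);	/* parent goes last */
}

int main(void)
{
	struct blk leaf1 = { { NULL, NULL }, 11 };
	struct blk leaf2 = { { NULL, NULL }, 12 };
	struct blk ind   = { { &leaf1, &leaf2 }, 1 };

	free_branches(&ind, 1);	/* prints 11, 12, then 1 */
	return 0;
}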
4413int ext4_can_truncate(struct inode *inode) 3085int ext4_can_truncate(struct inode *inode)
4414{ 3086{
4415 if (S_ISREG(inode->i_mode)) 3087 if (S_ISREG(inode->i_mode))
@@ -4476,19 +3148,6 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
4476 */ 3148 */
4477void ext4_truncate(struct inode *inode) 3149void ext4_truncate(struct inode *inode)
4478{ 3150{
4479 handle_t *handle;
4480 struct ext4_inode_info *ei = EXT4_I(inode);
4481 __le32 *i_data = ei->i_data;
4482 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
4483 struct address_space *mapping = inode->i_mapping;
4484 ext4_lblk_t offsets[4];
4485 Indirect chain[4];
4486 Indirect *partial;
4487 __le32 nr = 0;
4488 int n = 0;
4489 ext4_lblk_t last_block, max_block;
4490 unsigned blocksize = inode->i_sb->s_blocksize;
4491
4492 trace_ext4_truncate_enter(inode); 3151 trace_ext4_truncate_enter(inode);
4493 3152
4494 if (!ext4_can_truncate(inode)) 3153 if (!ext4_can_truncate(inode))
@@ -4499,149 +3158,11 @@ void ext4_truncate(struct inode *inode)
4499 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) 3158 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
4500 ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); 3159 ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
4501 3160
4502 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 3161 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
4503 ext4_ext_truncate(inode); 3162 ext4_ext_truncate(inode);
4504 trace_ext4_truncate_exit(inode); 3163 else
4505 return; 3164 ext4_ind_truncate(inode);
4506 }
4507
4508 handle = start_transaction(inode);
4509 if (IS_ERR(handle))
4510 return; /* AKPM: return what? */
4511
4512 last_block = (inode->i_size + blocksize-1)
4513 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
4514 max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)
4515 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
4516
4517 if (inode->i_size & (blocksize - 1))
4518 if (ext4_block_truncate_page(handle, mapping, inode->i_size))
4519 goto out_stop;
4520
4521 if (last_block != max_block) {
4522 n = ext4_block_to_path(inode, last_block, offsets, NULL);
4523 if (n == 0)
4524 goto out_stop; /* error */
4525 }
4526
4527 /*
4528 * OK. This truncate is going to happen. We add the inode to the
4529 * orphan list, so that if this truncate spans multiple transactions,
4530 * and we crash, we will resume the truncate when the filesystem
4531 * recovers. It also marks the inode dirty, to catch the new size.
4532 *
4533 * Implication: the file must always be in a sane, consistent
4534 * truncatable state while each transaction commits.
4535 */
4536 if (ext4_orphan_add(handle, inode))
4537 goto out_stop;
4538
4539 /*
4540 * From here we block out all ext4_get_block() callers who want to
4541 * modify the block allocation tree.
4542 */
4543 down_write(&ei->i_data_sem);
4544
4545 ext4_discard_preallocations(inode);
4546
4547 /*
4548 * The orphan list entry will now protect us from any crash which
4549 * occurs before the truncate completes, so it is now safe to propagate
4550 * the new, shorter inode size (held for now in i_size) into the
4551 * on-disk inode. We do this via i_disksize, which is the value which
4552 * ext4 *really* writes onto the disk inode.
4553 */
4554 ei->i_disksize = inode->i_size;
4555
4556 if (last_block == max_block) {
4557 /*
4558 * It is unnecessary to free any data blocks if last_block is
4559 * equal to the indirect block limit.
4560 */
4561 goto out_unlock;
4562 } else if (n == 1) { /* direct blocks */
4563 ext4_free_data(handle, inode, NULL, i_data+offsets[0],
4564 i_data + EXT4_NDIR_BLOCKS);
4565 goto do_indirects;
4566 }
4567
4568 partial = ext4_find_shared(inode, n, offsets, chain, &nr);
4569 /* Kill the top of shared branch (not detached) */
4570 if (nr) {
4571 if (partial == chain) {
4572 /* Shared branch grows from the inode */
4573 ext4_free_branches(handle, inode, NULL,
4574 &nr, &nr+1, (chain+n-1) - partial);
4575 *partial->p = 0;
4576 /*
4577 * We mark the inode dirty prior to restart,
4578 * and prior to stop. No need for it here.
4579 */
4580 } else {
4581 /* Shared branch grows from an indirect block */
4582 BUFFER_TRACE(partial->bh, "get_write_access");
4583 ext4_free_branches(handle, inode, partial->bh,
4584 partial->p,
4585 partial->p+1, (chain+n-1) - partial);
4586 }
4587 }
4588 /* Clear the ends of indirect blocks on the shared branch */
4589 while (partial > chain) {
4590 ext4_free_branches(handle, inode, partial->bh, partial->p + 1,
4591 (__le32*)partial->bh->b_data+addr_per_block,
4592 (chain+n-1) - partial);
4593 BUFFER_TRACE(partial->bh, "call brelse");
4594 brelse(partial->bh);
4595 partial--;
4596 }
4597do_indirects:
4598 /* Kill the remaining (whole) subtrees */
4599 switch (offsets[0]) {
4600 default:
4601 nr = i_data[EXT4_IND_BLOCK];
4602 if (nr) {
4603 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
4604 i_data[EXT4_IND_BLOCK] = 0;
4605 }
4606 case EXT4_IND_BLOCK:
4607 nr = i_data[EXT4_DIND_BLOCK];
4608 if (nr) {
4609 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
4610 i_data[EXT4_DIND_BLOCK] = 0;
4611 }
4612 case EXT4_DIND_BLOCK:
4613 nr = i_data[EXT4_TIND_BLOCK];
4614 if (nr) {
4615 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
4616 i_data[EXT4_TIND_BLOCK] = 0;
4617 }
4618 case EXT4_TIND_BLOCK:
4619 ;
4620 }
4621
4622out_unlock:
4623 up_write(&ei->i_data_sem);
4624 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4625 ext4_mark_inode_dirty(handle, inode);
4626
4627 /*
4628 * In a multi-transaction truncate, we only make the final transaction
4629 * synchronous
4630 */
4631 if (IS_SYNC(inode))
4632 ext4_handle_sync(handle);
4633out_stop:
4634 /*
4635 * If this was a simple ftruncate(), and the file will remain alive
4636 * then we need to clear up the orphan record which we created above.
4637 * However, if this was a real unlink then we were called by
4638 * ext4_delete_inode(), and we allow that function to clean up the
4639 * orphan info for us.
4640 */
4641 if (inode->i_nlink)
4642 ext4_orphan_del(handle, inode);
4643 3165
4644 ext4_journal_stop(handle);
4645 trace_ext4_truncate_exit(inode); 3166 trace_ext4_truncate_exit(inode);
4646} 3167}
4647 3168
@@ -5012,7 +3533,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
5012 (S_ISLNK(inode->i_mode) && 3533 (S_ISLNK(inode->i_mode) &&
5013 !ext4_inode_is_fast_symlink(inode))) { 3534 !ext4_inode_is_fast_symlink(inode))) {
5014 /* Validate block references which are part of inode */ 3535 /* Validate block references which are part of inode */
5015 ret = ext4_check_inode_blockref(inode); 3536 ret = ext4_ind_check_inode(inode);
5016 } 3537 }
5017 if (ret) 3538 if (ret)
5018 goto bad_inode; 3539 goto bad_inode;
@@ -5459,34 +3980,10 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
5459 return 0; 3980 return 0;
5460} 3981}
5461 3982
5462static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
5463 int chunk)
5464{
5465 int indirects;
5466
5467 /* if nrblocks are contiguous */
5468 if (chunk) {
5469 /*
5470 * With N contiguous data blocks, we need at most
5471 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
5472 * 2 dindirect blocks, and 1 tindirect block
5473 */
5474 return DIV_ROUND_UP(nrblocks,
5475 EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
5476 }
5477 /*
5478 * if nrblocks are not contiguous, worst case, each block touches
5479 * an indirect block, and each indirect block touches a double indirect
5480 * block, plus a triple indirect block
5481 */
5482 indirects = nrblocks * 2 + 1;
5483 return indirects;
5484}
5485
5486static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) 3983static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
5487{ 3984{
5488 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 3985 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
5489 return ext4_indirect_trans_blocks(inode, nrblocks, chunk); 3986 return ext4_ind_trans_blocks(inode, nrblocks, chunk);
5490 return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); 3987 return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);
5491} 3988}
5492 3989
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 808c554e773f..f18bfe37aff8 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -202,8 +202,9 @@ setversion_out:
202 struct super_block *sb = inode->i_sb; 202 struct super_block *sb = inode->i_sb;
203 int err, err2=0; 203 int err, err2=0;
204 204
205 if (!capable(CAP_SYS_RESOURCE)) 205 err = ext4_resize_begin(sb);
206 return -EPERM; 206 if (err)
207 return err;
207 208
208 if (get_user(n_blocks_count, (__u32 __user *)arg)) 209 if (get_user(n_blocks_count, (__u32 __user *)arg))
209 return -EFAULT; 210 return -EFAULT;
@@ -221,6 +222,7 @@ setversion_out:
221 if (err == 0) 222 if (err == 0)
222 err = err2; 223 err = err2;
223 mnt_drop_write(filp->f_path.mnt); 224 mnt_drop_write(filp->f_path.mnt);
225 ext4_resize_end(sb);
224 226
225 return err; 227 return err;
226 } 228 }
@@ -271,8 +273,9 @@ mext_out:
271 struct super_block *sb = inode->i_sb; 273 struct super_block *sb = inode->i_sb;
272 int err, err2=0; 274 int err, err2=0;
273 275
274 if (!capable(CAP_SYS_RESOURCE)) 276 err = ext4_resize_begin(sb);
275 return -EPERM; 277 if (err)
278 return err;
276 279
277 if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg, 280 if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg,
278 sizeof(input))) 281 sizeof(input)))
@@ -291,6 +294,7 @@ mext_out:
291 if (err == 0) 294 if (err == 0)
292 err = err2; 295 err = err2;
293 mnt_drop_write(filp->f_path.mnt); 296 mnt_drop_write(filp->f_path.mnt);
297 ext4_resize_end(sb);
294 298
295 return err; 299 return err;
296 } 300 }
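The ioctl changes above replace the bare capable() check with an ext4_resize_begin()/ext4_resize_end() pair, which also serializes concurrent resizers. A hedged sketch of that begin/end guard using a C11 atomic flag (the names are analogues, not the real ext4 helpers):

#include <errno.h>
#include <stdatomic.h>
#include <stdio.h>

/* One flag serializes all resizers; testing and setting it atomically
 * rejects a second resize while the first is still running. */
static atomic_flag resizing = ATOMIC_FLAG_INIT;

static int resize_begin(void)
{
	if (atomic_flag_test_and_set(&resizing))
		return -EBUSY;	/* another resize is already running */
	return 0;
}

static void resize_end(void)
{
	atomic_flag_clear(&resizing);
}

int main(void)
{
	int err = resize_begin();

	if (err) {
		printf("resize busy: %d\n", err);
		return 1;
	}
	/* ... grow the filesystem here ... */
	resize_end();
	return 0;
}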
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 6ed859d56850..17a5a57c415a 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -75,8 +75,8 @@
75 * 75 *
76 * The inode preallocation space is used looking at the _logical_ start 76 * The inode preallocation space is used looking at the _logical_ start
77 * block. If only the logical file block falls within the range of prealloc 77 * block. If only the logical file block falls within the range of prealloc
78 * space we will consume the particular prealloc space. This make sure that 78 * space we will consume the particular prealloc space. This makes sure that
79 * that the we have contiguous physical blocks representing the file blocks 79 * we have contiguous physical blocks representing the file blocks
80 * 80 *
81 * The important thing to be noted in case of inode prealloc space is that 81 * The important thing to be noted in case of inode prealloc space is that
82 * we don't modify the values associated to inode prealloc space except 82 * we don't modify the values associated to inode prealloc space except
@@ -84,7 +84,7 @@
84 * 84 *
85 * If we are not able to find blocks in the inode prealloc space and if we 85 * If we are not able to find blocks in the inode prealloc space and if we
86 * have the group allocation flag set then we look at the locality group 86 * have the group allocation flag set then we look at the locality group
87 * prealloc space. These are per CPU prealloc list repreasented as 87 * prealloc space. These are per CPU prealloc list represented as
88 * 88 *
89 * ext4_sb_info.s_locality_groups[smp_processor_id()] 89 * ext4_sb_info.s_locality_groups[smp_processor_id()]
90 * 90 *
@@ -128,12 +128,13 @@
128 * we are doing a group prealloc we try to normalize the request to 128 * we are doing a group prealloc we try to normalize the request to
129 * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is 129 * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is
130 * 512 blocks. This can be tuned via 130 * 512 blocks. This can be tuned via
131 * /sys/fs/ext4/<partition/mb_group_prealloc. The value is represented in 131 * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in
132 * terms of number of blocks. If we have mounted the file system with -O 132 * terms of number of blocks. If we have mounted the file system with -O
133 * stripe=<value> option the group prealloc request is normalized to the 133 * stripe=<value> option the group prealloc request is normalized to the
134 stripe value (sbi->s_stripe) 134 smallest multiple of the stripe value (sbi->s_stripe) which is
135 * greater than the default mb_group_prealloc.
135 * 136 *
136 * The regular allocator(using the buddy cache) supports few tunables. 137 * The regular allocator (using the buddy cache) supports a few tunables.
137 * 138 *
138 * /sys/fs/ext4/<partition>/mb_min_to_scan 139 * /sys/fs/ext4/<partition>/mb_min_to_scan
139 * /sys/fs/ext4/<partition>/mb_max_to_scan 140 * /sys/fs/ext4/<partition>/mb_max_to_scan
@@ -152,7 +153,7 @@
152 * best extent in the found extents. Searching for the blocks starts with 153 * best extent in the found extents. Searching for the blocks starts with
153 * the group specified as the goal value in allocation context via 154 * the group specified as the goal value in allocation context via
154 * ac_g_ex. Each group is first checked based on the criteria whether it 155 * ac_g_ex. Each group is first checked based on the criteria whether it
155 * can used for allocation. ext4_mb_good_group explains how the groups are 156 * can be used for allocation. ext4_mb_good_group explains how the groups are
156 * checked. 157 * checked.
157 * 158 *
158 * Both the prealloc space are getting populated as above. So for the first 159 * Both the prealloc space are getting populated as above. So for the first
@@ -492,10 +493,11 @@ static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
492 b2 = (unsigned char *) bitmap; 493 b2 = (unsigned char *) bitmap;
493 for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { 494 for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
494 if (b1[i] != b2[i]) { 495 if (b1[i] != b2[i]) {
495 printk(KERN_ERR "corruption in group %u " 496 ext4_msg(e4b->bd_sb, KERN_ERR,
496 "at byte %u(%u): %x in copy != %x " 497 "corruption in group %u "
497 "on disk/prealloc\n", 498 "at byte %u(%u): %x in copy != %x "
498 e4b->bd_group, i, i * 8, b1[i], b2[i]); 499 "on disk/prealloc",
500 e4b->bd_group, i, i * 8, b1[i], b2[i]);
499 BUG(); 501 BUG();
500 } 502 }
501 } 503 }
@@ -1125,7 +1127,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1125 grp = ext4_get_group_info(sb, group); 1127 grp = ext4_get_group_info(sb, group);
1126 1128
1127 e4b->bd_blkbits = sb->s_blocksize_bits; 1129 e4b->bd_blkbits = sb->s_blocksize_bits;
1128 e4b->bd_info = ext4_get_group_info(sb, group); 1130 e4b->bd_info = grp;
1129 e4b->bd_sb = sb; 1131 e4b->bd_sb = sb;
1130 e4b->bd_group = group; 1132 e4b->bd_group = group;
1131 e4b->bd_buddy_page = NULL; 1133 e4b->bd_buddy_page = NULL;
@@ -1281,7 +1283,7 @@ static void mb_clear_bits(void *bm, int cur, int len)
1281 } 1283 }
1282} 1284}
1283 1285
1284static void mb_set_bits(void *bm, int cur, int len) 1286void ext4_set_bits(void *bm, int cur, int len)
1285{ 1287{
1286 __u32 *addr; 1288 __u32 *addr;
1287 1289
@@ -1510,7 +1512,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
1510 } 1512 }
1511 mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); 1513 mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
1512 1514
1513 mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0); 1515 ext4_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
1514 mb_check_buddy(e4b); 1516 mb_check_buddy(e4b);
1515 1517
1516 return ret; 1518 return ret;
@@ -2223,8 +2225,8 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2223 EXT4_DESC_PER_BLOCK_BITS(sb); 2225 EXT4_DESC_PER_BLOCK_BITS(sb);
2224 meta_group_info = kmalloc(metalen, GFP_KERNEL); 2226 meta_group_info = kmalloc(metalen, GFP_KERNEL);
2225 if (meta_group_info == NULL) { 2227 if (meta_group_info == NULL) {
2226 printk(KERN_ERR "EXT4-fs: can't allocate mem for a " 2228 ext4_msg(sb, KERN_ERR, "EXT4-fs: can't allocate mem "
2227 "buddy group\n"); 2229 "for a buddy group");
2228 goto exit_meta_group_info; 2230 goto exit_meta_group_info;
2229 } 2231 }
2230 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = 2232 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] =
@@ -2237,7 +2239,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2237 2239
2238 meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL); 2240 meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL);
2239 if (meta_group_info[i] == NULL) { 2241 if (meta_group_info[i] == NULL) {
2240 printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n"); 2242 ext4_msg(sb, KERN_ERR, "EXT4-fs: can't allocate buddy mem");
2241 goto exit_group_info; 2243 goto exit_group_info;
2242 } 2244 }
2243 memset(meta_group_info[i], 0, kmem_cache_size(cachep)); 2245 memset(meta_group_info[i], 0, kmem_cache_size(cachep));
@@ -2279,8 +2281,10 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2279 2281
2280exit_group_info: 2282exit_group_info:
2281 /* If a meta_group_info table has been allocated, release it now */ 2283 /* If a meta_group_info table has been allocated, release it now */
2282 if (group % EXT4_DESC_PER_BLOCK(sb) == 0) 2284 if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
2283 kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]); 2285 kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]);
2286 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = NULL;
2287 }
2284exit_meta_group_info: 2288exit_meta_group_info:
2285 return -ENOMEM; 2289 return -ENOMEM;
2286} /* ext4_mb_add_groupinfo */ 2290} /* ext4_mb_add_groupinfo */
@@ -2328,23 +2332,26 @@ static int ext4_mb_init_backend(struct super_block *sb)
2328 /* An 8TB filesystem with 64-bit pointers requires a 4096 byte 2332 /* An 8TB filesystem with 64-bit pointers requires a 4096 byte
2329 * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. 2333 * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
2330 * So a two level scheme suffices for now. */ 2334 * So a two level scheme suffices for now. */
2331 sbi->s_group_info = kzalloc(array_size, GFP_KERNEL); 2335 sbi->s_group_info = ext4_kvzalloc(array_size, GFP_KERNEL);
2332 if (sbi->s_group_info == NULL) { 2336 if (sbi->s_group_info == NULL) {
2333 printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n"); 2337 ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group");
2334 return -ENOMEM; 2338 return -ENOMEM;
2335 } 2339 }
2336 sbi->s_buddy_cache = new_inode(sb); 2340 sbi->s_buddy_cache = new_inode(sb);
2337 if (sbi->s_buddy_cache == NULL) { 2341 if (sbi->s_buddy_cache == NULL) {
2338 printk(KERN_ERR "EXT4-fs: can't get new inode\n"); 2342 ext4_msg(sb, KERN_ERR, "can't get new inode");
2339 goto err_freesgi; 2343 goto err_freesgi;
2340 } 2344 }
2341 sbi->s_buddy_cache->i_ino = get_next_ino(); 2345 /* To avoid potentially colliding with a valid on-disk inode number,
2346 * use EXT4_BAD_INO for the buddy cache inode number. This inode is
2347 * not in the inode hash, so it should never be found by iget(), but
2348 * this will avoid confusion if it ever shows up during debugging. */
2349 sbi->s_buddy_cache->i_ino = EXT4_BAD_INO;
2342 EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; 2350 EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
2343 for (i = 0; i < ngroups; i++) { 2351 for (i = 0; i < ngroups; i++) {
2344 desc = ext4_get_group_desc(sb, i, NULL); 2352 desc = ext4_get_group_desc(sb, i, NULL);
2345 if (desc == NULL) { 2353 if (desc == NULL) {
2346 printk(KERN_ERR 2354 ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i);
2347 "EXT4-fs: can't read descriptor %u\n", i);
2348 goto err_freebuddy; 2355 goto err_freebuddy;
2349 } 2356 }
2350 if (ext4_mb_add_groupinfo(sb, i, desc) != 0) 2357 if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
@@ -2362,7 +2369,7 @@ err_freebuddy:
2362 kfree(sbi->s_group_info[i]); 2369 kfree(sbi->s_group_info[i]);
2363 iput(sbi->s_buddy_cache); 2370 iput(sbi->s_buddy_cache);
2364err_freesgi: 2371err_freesgi:
2365 kfree(sbi->s_group_info); 2372 ext4_kvfree(sbi->s_group_info);
2366 return -ENOMEM; 2373 return -ENOMEM;
2367} 2374}
2368 2375
@@ -2404,14 +2411,15 @@ static int ext4_groupinfo_create_slab(size_t size)
2404 slab_size, 0, SLAB_RECLAIM_ACCOUNT, 2411 slab_size, 0, SLAB_RECLAIM_ACCOUNT,
2405 NULL); 2412 NULL);
2406 2413
2414 ext4_groupinfo_caches[cache_index] = cachep;
2415
2407 mutex_unlock(&ext4_grpinfo_slab_create_mutex); 2416 mutex_unlock(&ext4_grpinfo_slab_create_mutex);
2408 if (!cachep) { 2417 if (!cachep) {
2409 printk(KERN_EMERG "EXT4: no memory for groupinfo slab cache\n"); 2418 printk(KERN_EMERG
2419 "EXT4-fs: no memory for groupinfo slab cache\n");
2410 return -ENOMEM; 2420 return -ENOMEM;
2411 } 2421 }
2412 2422
2413 ext4_groupinfo_caches[cache_index] = cachep;
2414
2415 return 0; 2423 return 0;
2416} 2424}
2417 2425
@@ -2457,12 +2465,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2457 i++; 2465 i++;
2458 } while (i <= sb->s_blocksize_bits + 1); 2466 } while (i <= sb->s_blocksize_bits + 1);
2459 2467
2460 /* init file for buddy data */
2461 ret = ext4_mb_init_backend(sb);
2462 if (ret != 0) {
2463 goto out;
2464 }
2465
2466 spin_lock_init(&sbi->s_md_lock); 2468 spin_lock_init(&sbi->s_md_lock);
2467 spin_lock_init(&sbi->s_bal_lock); 2469 spin_lock_init(&sbi->s_bal_lock);
2468 2470
@@ -2472,6 +2474,18 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2472 sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; 2474 sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
2473 sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; 2475 sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
2474 sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; 2476 sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC;
2477 /*
2478 * If there is an s_stripe > 1, then we set s_mb_group_prealloc
2479 * to the lowest multiple of s_stripe which is bigger than
2480 * the s_mb_group_prealloc as determined above. We want
2481 * the preallocation size to be an exact multiple of the
2482 * RAID stripe size so that preallocations don't fragment
2483 * the stripes.
2484 */
2485 if (sbi->s_stripe > 1) {
2486 sbi->s_mb_group_prealloc = roundup(
2487 sbi->s_mb_group_prealloc, sbi->s_stripe);
2488 }
2475 2489
2476 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); 2490 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
2477 if (sbi->s_locality_groups == NULL) { 2491 if (sbi->s_locality_groups == NULL) {
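The stripe normalization above relies on roundup(), which returns the smallest multiple of its second argument that is not below the first. A tiny standalone check of that arithmetic (the stripe width of 48 is a made-up example value):

#include <stdio.h>

/* Matches the kernel's roundup() arithmetic: smallest multiple of y >= x */
#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y))

int main(void)
{
	unsigned int prealloc = 512;	/* default mb_group_prealloc */
	unsigned int stripe = 48;	/* hypothetical -o stripe=48 */

	/* 512 is not a multiple of 48, so this prints 528 (11 * 48) */
	printf("group prealloc: %u -> %u blocks\n",
	       prealloc, roundup(prealloc, stripe));
	return 0;
}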
@@ -2487,6 +2501,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2487 spin_lock_init(&lg->lg_prealloc_lock); 2501 spin_lock_init(&lg->lg_prealloc_lock);
2488 } 2502 }
2489 2503
2504 /* init file for buddy data */
2505 ret = ext4_mb_init_backend(sb);
2506 if (ret != 0) {
2507 goto out;
2508 }
2509
2490 if (sbi->s_proc) 2510 if (sbi->s_proc)
2491 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, 2511 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
2492 &ext4_mb_seq_groups_fops, sb); 2512 &ext4_mb_seq_groups_fops, sb);
@@ -2544,32 +2564,32 @@ int ext4_mb_release(struct super_block *sb)
2544 EXT4_DESC_PER_BLOCK_BITS(sb); 2564 EXT4_DESC_PER_BLOCK_BITS(sb);
2545 for (i = 0; i < num_meta_group_infos; i++) 2565 for (i = 0; i < num_meta_group_infos; i++)
2546 kfree(sbi->s_group_info[i]); 2566 kfree(sbi->s_group_info[i]);
2547 kfree(sbi->s_group_info); 2567 ext4_kvfree(sbi->s_group_info);
2548 } 2568 }
2549 kfree(sbi->s_mb_offsets); 2569 kfree(sbi->s_mb_offsets);
2550 kfree(sbi->s_mb_maxs); 2570 kfree(sbi->s_mb_maxs);
2551 if (sbi->s_buddy_cache) 2571 if (sbi->s_buddy_cache)
2552 iput(sbi->s_buddy_cache); 2572 iput(sbi->s_buddy_cache);
2553 if (sbi->s_mb_stats) { 2573 if (sbi->s_mb_stats) {
2554 printk(KERN_INFO 2574 ext4_msg(sb, KERN_INFO,
2555 "EXT4-fs: mballoc: %u blocks %u reqs (%u success)\n", 2575 "mballoc: %u blocks %u reqs (%u success)",
2556 atomic_read(&sbi->s_bal_allocated), 2576 atomic_read(&sbi->s_bal_allocated),
2557 atomic_read(&sbi->s_bal_reqs), 2577 atomic_read(&sbi->s_bal_reqs),
2558 atomic_read(&sbi->s_bal_success)); 2578 atomic_read(&sbi->s_bal_success));
2559 printk(KERN_INFO 2579 ext4_msg(sb, KERN_INFO,
2560 "EXT4-fs: mballoc: %u extents scanned, %u goal hits, " 2580 "mballoc: %u extents scanned, %u goal hits, "
2561 "%u 2^N hits, %u breaks, %u lost\n", 2581 "%u 2^N hits, %u breaks, %u lost",
2562 atomic_read(&sbi->s_bal_ex_scanned), 2582 atomic_read(&sbi->s_bal_ex_scanned),
2563 atomic_read(&sbi->s_bal_goals), 2583 atomic_read(&sbi->s_bal_goals),
2564 atomic_read(&sbi->s_bal_2orders), 2584 atomic_read(&sbi->s_bal_2orders),
2565 atomic_read(&sbi->s_bal_breaks), 2585 atomic_read(&sbi->s_bal_breaks),
2566 atomic_read(&sbi->s_mb_lost_chunks)); 2586 atomic_read(&sbi->s_mb_lost_chunks));
2567 printk(KERN_INFO 2587 ext4_msg(sb, KERN_INFO,
2568 "EXT4-fs: mballoc: %lu generated and it took %Lu\n", 2588 "mballoc: %lu generated and it took %Lu",
2569 sbi->s_mb_buddies_generated++, 2589 sbi->s_mb_buddies_generated,
2570 sbi->s_mb_generation_time); 2590 sbi->s_mb_generation_time);
2571 printk(KERN_INFO 2591 ext4_msg(sb, KERN_INFO,
2572 "EXT4-fs: mballoc: %u preallocated, %u discarded\n", 2592 "mballoc: %u preallocated, %u discarded",
2573 atomic_read(&sbi->s_mb_preallocated), 2593 atomic_read(&sbi->s_mb_preallocated),
2574 atomic_read(&sbi->s_mb_discarded)); 2594 atomic_read(&sbi->s_mb_discarded));
2575 } 2595 }
@@ -2628,6 +2648,15 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2628 rb_erase(&entry->node, &(db->bb_free_root)); 2648 rb_erase(&entry->node, &(db->bb_free_root));
2629 mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count); 2649 mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count);
2630 2650
2651 /*
2652 * Clear the trimmed flag for the group so that the next
2653 * ext4_trim_fs can trim it.
2654 * If the volume is mounted with -o discard, online discard
2655 * is supported and the free blocks will be trimmed online.
2656 */
2657 if (!test_opt(sb, DISCARD))
2658 EXT4_MB_GRP_CLEAR_TRIMMED(db);
2659
2631 if (!db->bb_free_root.rb_node) { 2660 if (!db->bb_free_root.rb_node) {
2632 /* No more items in the per group rb tree 2661 /* No more items in the per group rb tree
2633 * balance refcounts from ext4_mb_free_metadata() 2662 * balance refcounts from ext4_mb_free_metadata()
@@ -2771,8 +2800,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2771 * We leak some of the blocks here. 2800 * We leak some of the blocks here.
2772 */ 2801 */
2773 ext4_lock_group(sb, ac->ac_b_ex.fe_group); 2802 ext4_lock_group(sb, ac->ac_b_ex.fe_group);
2774 mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, 2803 ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
2775 ac->ac_b_ex.fe_len); 2804 ac->ac_b_ex.fe_len);
2776 ext4_unlock_group(sb, ac->ac_b_ex.fe_group); 2805 ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
2777 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 2806 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
2778 if (!err) 2807 if (!err)
@@ -2790,7 +2819,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2790 } 2819 }
2791 } 2820 }
2792#endif 2821#endif
2793 mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,ac->ac_b_ex.fe_len); 2822 ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
2823 ac->ac_b_ex.fe_len);
2794 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 2824 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
2795 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); 2825 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
2796 ext4_free_blks_set(sb, gdp, 2826 ext4_free_blks_set(sb, gdp,
@@ -2830,8 +2860,9 @@ out_err:
2830 2860
2831/* 2861/*
2832 * here we normalize request for locality group 2862 * here we normalize request for locality group
2833 * Group requests are normalized to s_stripe size if we set the same via mount 2863 * Group requests are normalized to s_mb_group_prealloc, which goes to
2834 * option. If not we set it to s_mb_group_prealloc which can be configured via 2864 * s_stripe if we set the same via mount option.
2865 * s_mb_group_prealloc can be configured via
2835 * /sys/fs/ext4/<partition>/mb_group_prealloc 2866 * /sys/fs/ext4/<partition>/mb_group_prealloc
2836 * 2867 *
2837 * XXX: should we try to preallocate more than the group has now? 2868 * XXX: should we try to preallocate more than the group has now?
@@ -2842,10 +2873,7 @@ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
2842 struct ext4_locality_group *lg = ac->ac_lg; 2873 struct ext4_locality_group *lg = ac->ac_lg;
2843 2874
2844 BUG_ON(lg == NULL); 2875 BUG_ON(lg == NULL);
2845 if (EXT4_SB(sb)->s_stripe) 2876 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
2846 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe;
2847 else
2848 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
2849 mb_debug(1, "#%u: goal %u blocks for locality group\n", 2877 mb_debug(1, "#%u: goal %u blocks for locality group\n",
2850 current->pid, ac->ac_g_ex.fe_len); 2878 current->pid, ac->ac_g_ex.fe_len);
2851} 2879}
@@ -3001,9 +3029,10 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
3001 3029
3002 if (start + size <= ac->ac_o_ex.fe_logical && 3030 if (start + size <= ac->ac_o_ex.fe_logical &&
3003 start > ac->ac_o_ex.fe_logical) { 3031 start > ac->ac_o_ex.fe_logical) {
3004 printk(KERN_ERR "start %lu, size %lu, fe_logical %lu\n", 3032 ext4_msg(ac->ac_sb, KERN_ERR,
3005 (unsigned long) start, (unsigned long) size, 3033 "start %lu, size %lu, fe_logical %lu",
3006 (unsigned long) ac->ac_o_ex.fe_logical); 3034 (unsigned long) start, (unsigned long) size,
3035 (unsigned long) ac->ac_o_ex.fe_logical);
3007 } 3036 }
3008 BUG_ON(start + size <= ac->ac_o_ex.fe_logical && 3037 BUG_ON(start + size <= ac->ac_o_ex.fe_logical &&
3009 start > ac->ac_o_ex.fe_logical); 3038 start > ac->ac_o_ex.fe_logical);
@@ -3262,7 +3291,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
3262 3291
3263 while (n) { 3292 while (n) {
3264 entry = rb_entry(n, struct ext4_free_data, node); 3293 entry = rb_entry(n, struct ext4_free_data, node);
3265 mb_set_bits(bitmap, entry->start_blk, entry->count); 3294 ext4_set_bits(bitmap, entry->start_blk, entry->count);
3266 n = rb_next(n); 3295 n = rb_next(n);
3267 } 3296 }
3268 return; 3297 return;
@@ -3304,7 +3333,7 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
3304 if (unlikely(len == 0)) 3333 if (unlikely(len == 0))
3305 continue; 3334 continue;
3306 BUG_ON(groupnr != group); 3335 BUG_ON(groupnr != group);
3307 mb_set_bits(bitmap, start, len); 3336 ext4_set_bits(bitmap, start, len);
3308 preallocated += len; 3337 preallocated += len;
3309 count++; 3338 count++;
3310 } 3339 }
@@ -3584,10 +3613,11 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3584 bit = next + 1; 3613 bit = next + 1;
3585 } 3614 }
3586 if (free != pa->pa_free) { 3615 if (free != pa->pa_free) {
3587 printk(KERN_CRIT "pa %p: logic %lu, phys. %lu, len %lu\n", 3616 ext4_msg(e4b->bd_sb, KERN_CRIT,
3588 pa, (unsigned long) pa->pa_lstart, 3617 "pa %p: logic %lu, phys. %lu, len %lu",
3589 (unsigned long) pa->pa_pstart, 3618 pa, (unsigned long) pa->pa_lstart,
3590 (unsigned long) pa->pa_len); 3619 (unsigned long) pa->pa_pstart,
3620 (unsigned long) pa->pa_len);
3591 ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u", 3621 ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u",
3592 free, pa->pa_free); 3622 free, pa->pa_free);
3593 /* 3623 /*
@@ -3775,7 +3805,8 @@ repeat:
3775 * use preallocation while we're discarding it */ 3805 * use preallocation while we're discarding it */
3776 spin_unlock(&pa->pa_lock); 3806 spin_unlock(&pa->pa_lock);
3777 spin_unlock(&ei->i_prealloc_lock); 3807 spin_unlock(&ei->i_prealloc_lock);
3778 printk(KERN_ERR "uh-oh! used pa while discarding\n"); 3808 ext4_msg(sb, KERN_ERR,
3809 "uh-oh! used pa while discarding");
3779 WARN_ON(1); 3810 WARN_ON(1);
3780 schedule_timeout_uninterruptible(HZ); 3811 schedule_timeout_uninterruptible(HZ);
3781 goto repeat; 3812 goto repeat;
@@ -3852,12 +3883,13 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
3852 (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) 3883 (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED))
3853 return; 3884 return;
3854 3885
3855 printk(KERN_ERR "EXT4-fs: Can't allocate:" 3886 ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: Can't allocate:"
3856 " Allocation context details:\n"); 3887 " Allocation context details:");
3857 printk(KERN_ERR "EXT4-fs: status %d flags %d\n", 3888 ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: status %d flags %d",
3858 ac->ac_status, ac->ac_flags); 3889 ac->ac_status, ac->ac_flags);
3859 printk(KERN_ERR "EXT4-fs: orig %lu/%lu/%lu@%lu, goal %lu/%lu/%lu@%lu, " 3890 ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: orig %lu/%lu/%lu@%lu, "
3860 "best %lu/%lu/%lu@%lu cr %d\n", 3891 "goal %lu/%lu/%lu@%lu, "
3892 "best %lu/%lu/%lu@%lu cr %d",
3861 (unsigned long)ac->ac_o_ex.fe_group, 3893 (unsigned long)ac->ac_o_ex.fe_group,
3862 (unsigned long)ac->ac_o_ex.fe_start, 3894 (unsigned long)ac->ac_o_ex.fe_start,
3863 (unsigned long)ac->ac_o_ex.fe_len, 3895 (unsigned long)ac->ac_o_ex.fe_len,
@@ -3871,9 +3903,9 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
3871 (unsigned long)ac->ac_b_ex.fe_len, 3903 (unsigned long)ac->ac_b_ex.fe_len,
3872 (unsigned long)ac->ac_b_ex.fe_logical, 3904 (unsigned long)ac->ac_b_ex.fe_logical,
3873 (int)ac->ac_criteria); 3905 (int)ac->ac_criteria);
3874 printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned, 3906 ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: %lu scanned, %d found",
3875 ac->ac_found); 3907 ac->ac_ex_scanned, ac->ac_found);
3876 printk(KERN_ERR "EXT4-fs: groups: \n"); 3908 ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: groups: ");
3877 ngroups = ext4_get_groups_count(sb); 3909 ngroups = ext4_get_groups_count(sb);
3878 for (i = 0; i < ngroups; i++) { 3910 for (i = 0; i < ngroups; i++) {
3879 struct ext4_group_info *grp = ext4_get_group_info(sb, i); 3911 struct ext4_group_info *grp = ext4_get_group_info(sb, i);
@@ -4637,7 +4669,7 @@ do_more:
4637 } 4669 }
4638 ext4_mark_super_dirty(sb); 4670 ext4_mark_super_dirty(sb);
4639error_return: 4671error_return:
4640 if (freed) 4672 if (freed && !(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
4641 dquot_free_block(inode, freed); 4673 dquot_free_block(inode, freed);
4642 brelse(bitmap_bh); 4674 brelse(bitmap_bh);
4643 ext4_std_error(sb, err); 4675 ext4_std_error(sb, err);
@@ -4645,7 +4677,7 @@ error_return:
4645} 4677}
4646 4678
4647/** 4679/**
4648 * ext4_add_groupblocks() -- Add given blocks to an existing group 4680 * ext4_group_add_blocks() -- Add given blocks to an existing group
4649 * @handle: handle to this transaction 4681 * @handle: handle to this transaction
4650 * @sb: super block 4682 * @sb: super block
4651 * @block: start physical block to add to the block group 4683 * @block: start physical block to add to the block group
@@ -4653,7 +4685,7 @@ error_return:
4653 * 4685 *
4654 * This marks the blocks as free in the bitmap and buddy. 4686 * This marks the blocks as free in the bitmap and buddy.
4655 */ 4687 */
4656void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, 4688int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
4657 ext4_fsblk_t block, unsigned long count) 4689 ext4_fsblk_t block, unsigned long count)
4658{ 4690{
4659 struct buffer_head *bitmap_bh = NULL; 4691 struct buffer_head *bitmap_bh = NULL;
@@ -4666,25 +4698,35 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
4666 struct ext4_buddy e4b; 4698 struct ext4_buddy e4b;
4667 int err = 0, ret, blk_free_count; 4699 int err = 0, ret, blk_free_count;
4668 ext4_grpblk_t blocks_freed; 4700 ext4_grpblk_t blocks_freed;
4669 struct ext4_group_info *grp;
4670 4701
4671 ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); 4702 ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
4672 4703
4704 if (count == 0)
4705 return 0;
4706
4673 ext4_get_group_no_and_offset(sb, block, &block_group, &bit); 4707 ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
4674 grp = ext4_get_group_info(sb, block_group);
4675 /* 4708 /*
4676 * Check to see if we are freeing blocks across a group 4709 * Check to see if we are freeing blocks across a group
4677 * boundary. 4710 * boundary.
4678 */ 4711 */
4679 if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) 4712 if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
4713 ext4_warning(sb, "too many blocks added to group %u\n",
4714 block_group);
4715 err = -EINVAL;
4680 goto error_return; 4716 goto error_return;
4717 }
4681 4718
4682 bitmap_bh = ext4_read_block_bitmap(sb, block_group); 4719 bitmap_bh = ext4_read_block_bitmap(sb, block_group);
4683 if (!bitmap_bh) 4720 if (!bitmap_bh) {
4721 err = -EIO;
4684 goto error_return; 4722 goto error_return;
4723 }
4724
4685 desc = ext4_get_group_desc(sb, block_group, &gd_bh); 4725 desc = ext4_get_group_desc(sb, block_group, &gd_bh);
4686 if (!desc) 4726 if (!desc) {
4727 err = -EIO;
4687 goto error_return; 4728 goto error_return;
4729 }
4688 4730
4689 if (in_range(ext4_block_bitmap(sb, desc), block, count) || 4731 if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
4690 in_range(ext4_inode_bitmap(sb, desc), block, count) || 4732 in_range(ext4_inode_bitmap(sb, desc), block, count) ||
@@ -4694,6 +4736,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
4694 ext4_error(sb, "Adding blocks in system zones - " 4736 ext4_error(sb, "Adding blocks in system zones - "
4695 "Block = %llu, count = %lu", 4737 "Block = %llu, count = %lu",
4696 block, count); 4738 block, count);
4739 err = -EINVAL;
4697 goto error_return; 4740 goto error_return;
4698 } 4741 }
4699 4742
@@ -4762,7 +4805,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
4762error_return: 4805error_return:
4763 brelse(bitmap_bh); 4806 brelse(bitmap_bh);
4764 ext4_std_error(sb, err); 4807 ext4_std_error(sb, err);
4765 return; 4808 return err;
4766} 4809}
4767 4810
4768/** 4811/**
@@ -4782,6 +4825,8 @@ static void ext4_trim_extent(struct super_block *sb, int start, int count,
4782{ 4825{
4783 struct ext4_free_extent ex; 4826 struct ext4_free_extent ex;
4784 4827
4828 trace_ext4_trim_extent(sb, group, start, count);
4829
4785 assert_spin_locked(ext4_group_lock_ptr(sb, group)); 4830 assert_spin_locked(ext4_group_lock_ptr(sb, group));
4786 4831
4787 ex.fe_start = start; 4832 ex.fe_start = start;
@@ -4802,7 +4847,7 @@ static void ext4_trim_extent(struct super_block *sb, int start, int count,
4802/** 4847/**
4803 * ext4_trim_all_free -- function to trim all free space in alloc. group 4848 * ext4_trim_all_free -- function to trim all free space in alloc. group
4804 * @sb: super block for file system 4849 * @sb: super block for file system
4805 * @e4b: ext4 buddy 4850 * @group: group to be trimmed
4806 * @start: first group block to examine 4851 * @start: first group block to examine
4807 * @max: last group block to examine 4852 * @max: last group block to examine
4808 * @minblocks: minimum extent block count 4853 * @minblocks: minimum extent block count
@@ -4823,10 +4868,12 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
4823 ext4_grpblk_t minblocks) 4868 ext4_grpblk_t minblocks)
4824{ 4869{
4825 void *bitmap; 4870 void *bitmap;
4826 ext4_grpblk_t next, count = 0; 4871 ext4_grpblk_t next, count = 0, free_count = 0;
4827 struct ext4_buddy e4b; 4872 struct ext4_buddy e4b;
4828 int ret; 4873 int ret;
4829 4874
4875 trace_ext4_trim_all_free(sb, group, start, max);
4876
4830 ret = ext4_mb_load_buddy(sb, group, &e4b); 4877 ret = ext4_mb_load_buddy(sb, group, &e4b);
4831 if (ret) { 4878 if (ret) {
4832 ext4_error(sb, "Error in loading buddy " 4879 ext4_error(sb, "Error in loading buddy "
@@ -4836,6 +4883,10 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
4836 bitmap = e4b.bd_bitmap; 4883 bitmap = e4b.bd_bitmap;
4837 4884
4838 ext4_lock_group(sb, group); 4885 ext4_lock_group(sb, group);
4886 if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) &&
4887 minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks))
4888 goto out;
4889
4839 start = (e4b.bd_info->bb_first_free > start) ? 4890 start = (e4b.bd_info->bb_first_free > start) ?
4840 e4b.bd_info->bb_first_free : start; 4891 e4b.bd_info->bb_first_free : start;
4841 4892
@@ -4850,6 +4901,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
4850 next - start, group, &e4b); 4901 next - start, group, &e4b);
4851 count += next - start; 4902 count += next - start;
4852 } 4903 }
4904 free_count += next - start;
4853 start = next + 1; 4905 start = next + 1;
4854 4906
4855 if (fatal_signal_pending(current)) { 4907 if (fatal_signal_pending(current)) {
@@ -4863,9 +4915,13 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
4863 ext4_lock_group(sb, group); 4915 ext4_lock_group(sb, group);
4864 } 4916 }
4865 4917
4866 if ((e4b.bd_info->bb_free - count) < minblocks) 4918 if ((e4b.bd_info->bb_free - free_count) < minblocks)
4867 break; 4919 break;
4868 } 4920 }
4921
4922 if (!ret)
4923 EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
4924out:
4869 ext4_unlock_group(sb, group); 4925 ext4_unlock_group(sb, group);
4870 ext4_mb_unload_buddy(&e4b); 4926 ext4_mb_unload_buddy(&e4b);
4871 4927
@@ -4904,6 +4960,8 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4904 4960
4905 if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb))) 4961 if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb)))
4906 return -EINVAL; 4962 return -EINVAL;
4963 if (start + len <= first_data_blk)
4964 goto out;
4907 if (start < first_data_blk) { 4965 if (start < first_data_blk) {
4908 len -= first_data_blk - start; 4966 len -= first_data_blk - start;
4909 start = first_data_blk; 4967 start = first_data_blk;
@@ -4952,5 +5010,9 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4952 } 5010 }
4953 range->len = trimmed * sb->s_blocksize; 5011 range->len = trimmed * sb->s_blocksize;
4954 5012
5013 if (!ret)
5014 atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen);
5015
5016out:
4955 return ret; 5017 return ret;
4956} 5018}
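Taken together, the trim changes cache the last successful minlen in s_last_trim_minblks and skip groups whose WAS_TRIMMED flag is still set, clearing the flag again when blocks are freed without online discard. A userspace sketch of that skip logic under simplified assumptions (one group, no locking; the names are analogues):

#include <stdio.h>

struct group {
	int was_trimmed;	/* EXT4_MB_GRP_WAS_TRIMMED analogue */
};

static unsigned long last_trim_minblks;	/* s_last_trim_minblks analogue */

/* Returns 1 if the group was actually trimmed, 0 if it was skipped. */
static int trim_group(struct group *g, unsigned long minblocks)
{
	if (g->was_trimmed && minblocks >= last_trim_minblks)
		return 0;	/* already trimmed at least this finely */
	/* ... discard every free extent of >= minblocks blocks ... */
	g->was_trimmed = 1;
	return 1;
}

int main(void)
{
	struct group g = { 0 };

	printf("first pass trimmed: %d\n", trim_group(&g, 16));
	last_trim_minblks = 16;	/* recorded after a successful run */
	printf("second pass trimmed: %d\n", trim_group(&g, 32));
	g.was_trimmed = 0;	/* blocks were freed without -o discard */
	printf("third pass trimmed: %d\n", trim_group(&g, 32));
	return 0;
}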
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 20b5e7bfebd1..9d4a636b546c 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -187,7 +187,6 @@ struct ext4_allocation_context {
187 __u16 ac_flags; /* allocation hints */ 187 __u16 ac_flags; /* allocation hints */
188 __u8 ac_status; 188 __u8 ac_status;
189 __u8 ac_criteria; 189 __u8 ac_criteria;
190 __u8 ac_repeats;
191 __u8 ac_2order; /* if request is to allocate 2^N blocks and 190 __u8 ac_2order; /* if request is to allocate 2^N blocks and
192 * N > 0, the field stores N, otherwise 0 */ 191 * N > 0, the field stores N, otherwise 0 */
193 __u8 ac_op; /* operation, for history only */ 192 __u8 ac_op; /* operation, for history only */
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 8c9babac43dc..f8068c7bae9f 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -289,7 +289,7 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_ent
289 while (len--) printk("%c", *name++); 289 while (len--) printk("%c", *name++);
290 ext4fs_dirhash(de->name, de->name_len, &h); 290 ext4fs_dirhash(de->name, de->name_len, &h);
291 printk(":%x.%u ", h.hash, 291 printk(":%x.%u ", h.hash,
292 ((char *) de - base)); 292 (unsigned) ((char *) de - base));
293 } 293 }
294 space += EXT4_DIR_REC_LEN(de->name_len); 294 space += EXT4_DIR_REC_LEN(de->name_len);
295 names++; 295 names++;
@@ -1013,7 +1013,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
1013 1013
1014 *err = -ENOENT; 1014 *err = -ENOENT;
1015errout: 1015errout:
1016 dxtrace(printk(KERN_DEBUG "%s not found\n", name)); 1016 dxtrace(printk(KERN_DEBUG "%s not found\n", d_name->name));
1017 dx_release (frames); 1017 dx_release (frames);
1018 return NULL; 1018 return NULL;
1019} 1019}
@@ -1985,18 +1985,11 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
1985 if (!list_empty(&EXT4_I(inode)->i_orphan)) 1985 if (!list_empty(&EXT4_I(inode)->i_orphan))
1986 goto out_unlock; 1986 goto out_unlock;
1987 1987
1988 /* Orphan handling is only valid for files with data blocks 1988 /*
1989 * being truncated, or files being unlinked. */ 1989 * Orphan handling is only valid for files with data blocks
1990 1990 * being truncated, or files being unlinked. Note that we either
1991 /* @@@ FIXME: Observation from aviro: 1991 * hold i_mutex, or the inode can not be referenced from outside,
1992 * I think I can trigger J_ASSERT in ext4_orphan_add(). We block 1992 * so i_nlink should not be bumped due to race
1993 * here (on s_orphan_lock), so race with ext4_link() which might bump
1994 * ->i_nlink. For, say it, character device. Not a regular file,
1995 * not a directory, not a symlink and ->i_nlink > 0.
1996 *
1997 * tytso, 4/25/2009: I'm not sure how that could happen;
1998 * shouldn't the fs core protect us from these sort of
1999 * unlink()/link() races?
2000 */ 1993 */
2001 J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 1994 J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
2002 S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); 1995 S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
@@ -2260,9 +2253,11 @@ static int ext4_symlink(struct inode *dir,
2260 /* 2253 /*
2261 * For non-fast symlinks, we just allocate inode and put it on 2254 * For non-fast symlinks, we just allocate inode and put it on
2262 * orphan list in the first transaction => we need bitmap, 2255 * orphan list in the first transaction => we need bitmap,
2263 * group descriptor, sb, inode block, quota blocks. 2256 * group descriptor, sb, inode block, quota blocks, and
2257 * possibly selinux xattr blocks.
2264 */ 2258 */
2265 credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb); 2259 credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
2260 EXT4_XATTR_TRANS_BLOCKS;
2266 } else { 2261 } else {
2267 /* 2262 /*
2268 * Fast symlink. We have to add entry to directory 2263 * Fast symlink. We have to add entry to directory
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 7bb8f76d470a..92f38ee13f8a 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -142,7 +142,23 @@ static void ext4_end_io_work(struct work_struct *work)
142 unsigned long flags; 142 unsigned long flags;
143 int ret; 143 int ret;
144 144
145 mutex_lock(&inode->i_mutex); 145 if (!mutex_trylock(&inode->i_mutex)) {
146 /*
147 * Requeue the work instead of waiting so that the work
148 * items queued after this can be processed.
149 */
150 queue_work(EXT4_SB(inode->i_sb)->dio_unwritten_wq, &io->work);
151 /*
152 * To prevent the ext4-dio-unwritten thread from keeping
153 * requeueing end_io requests and occupying cpu for too long,
154 * yield the cpu if it sees an end_io request that has already
155 * been requeued.
156 */
157 if (io->flag & EXT4_IO_END_QUEUED)
158 yield();
159 io->flag |= EXT4_IO_END_QUEUED;
160 return;
161 }
146 ret = ext4_end_io_nolock(io); 162 ret = ext4_end_io_nolock(io);
147 if (ret < 0) { 163 if (ret < 0) {
148 mutex_unlock(&inode->i_mutex); 164 mutex_unlock(&inode->i_mutex);
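
The page-io hunk above swaps an unconditional mutex_lock() for a trylock that requeues the end_io work on contention, and yields once a work item has already been requeued so a hot requeue loop cannot pin a CPU. The sketch below models that shape with pthreads; requeue() and complete_io() are stand-in stubs, not ext4 functions.

#include <pthread.h>
#include <sched.h>
#include <stdbool.h>

struct io_work {
	bool requeued;			/* models EXT4_IO_END_QUEUED */
};

/* Stubs; in ext4 these are queue_work() and ext4_end_io_nolock(). */
static void requeue(struct io_work *w) { (void)w; }
static void complete_io(struct io_work *w) { (void)w; }

static pthread_mutex_t inode_lock = PTHREAD_MUTEX_INITIALIZER;

void end_io_work(struct io_work *w)
{
	if (pthread_mutex_trylock(&inode_lock) != 0) {
		/* Contended: requeue so later work items still get a turn. */
		requeue(w);
		/* An item already requeued once yields before retrying. */
		if (w->requeued)
			sched_yield();
		w->requeued = true;
		return;
	}
	complete_io(w);
	pthread_mutex_unlock(&inode_lock);
}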
@@ -285,11 +301,7 @@ static int io_submit_init(struct ext4_io_submit *io,
285 io_end = ext4_init_io_end(inode, GFP_NOFS); 301 io_end = ext4_init_io_end(inode, GFP_NOFS);
286 if (!io_end) 302 if (!io_end)
287 return -ENOMEM; 303 return -ENOMEM;
288 do { 304 bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES));
289 bio = bio_alloc(GFP_NOIO, nvecs);
290 nvecs >>= 1;
291 } while (bio == NULL);
292
293 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); 305 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
294 bio->bi_bdev = bh->b_bdev; 306 bio->bi_bdev = bh->b_bdev;
295 bio->bi_private = io->io_end = io_end; 307 bio->bi_private = io->io_end = io_end;
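
The io_submit_init() change drops the old "halve nvecs and retry until bio_alloc() succeeds" loop in favour of clamping the request to the allocator's hard limit up front; with the size valid, the GFP_NOIO allocation can block instead of failing. The same clamp in isolation, with MAX_VECS as an assumed stand-in for BIO_MAX_PAGES:

#include <stddef.h>

#define MAX_VECS 256	/* assumed stand-in for BIO_MAX_PAGES */

static inline size_t clamp_nvecs(size_t nvecs)
{
	/* One up-front clamp replaces the halve-and-retry loop. */
	return nvecs < MAX_VECS ? nvecs : MAX_VECS;
}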
@@ -338,8 +350,10 @@ submit_and_retry:
338 if ((io_end->num_io_pages >= MAX_IO_PAGES) && 350 if ((io_end->num_io_pages >= MAX_IO_PAGES) &&
339 (io_end->pages[io_end->num_io_pages-1] != io_page)) 351 (io_end->pages[io_end->num_io_pages-1] != io_page))
340 goto submit_and_retry; 352 goto submit_and_retry;
341 if (buffer_uninit(bh)) 353 if (buffer_uninit(bh) && !(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
342 io->io_end->flag |= EXT4_IO_END_UNWRITTEN; 354 io_end->flag |= EXT4_IO_END_UNWRITTEN;
355 atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
356 }
343 io->io_end->size += bh->b_size; 357 io->io_end->size += bh->b_size;
344 io->io_next_block++; 358 io->io_next_block++;
345 ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); 359 ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 80bbc9c60c24..707d3f16f7ce 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -16,6 +16,35 @@
16 16
17#include "ext4_jbd2.h" 17#include "ext4_jbd2.h"
18 18
19int ext4_resize_begin(struct super_block *sb)
20{
21 int ret = 0;
22
23 if (!capable(CAP_SYS_RESOURCE))
24 return -EPERM;
25
26 /*
27 * We are not allowed to do online-resizing on a filesystem mounted
28 * with error, because it can destroy the filesystem easily.
29 */
30 if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
31 ext4_warning(sb, "There are errors in the filesystem, "
32 "so online resizing is not allowed\n");
33 return -EPERM;
34 }
35
36 if (test_and_set_bit_lock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags))
37 ret = -EBUSY;
38
39 return ret;
40}
41
42void ext4_resize_end(struct super_block *sb)
43{
44 clear_bit_unlock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags);
45 smp_mb__after_clear_bit();
46}
47
19#define outside(b, first, last) ((b) < (first) || (b) >= (last)) 48#define outside(b, first, last) ((b) < (first) || (b) >= (last))
20#define inside(b, first, last) ((b) >= (first) && (b) < (last)) 49#define inside(b, first, last) ((b) >= (first) && (b) < (last))
21 50
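
ext4_resize_begin()/ext4_resize_end() above replace the old s_resize_lock mutex with a single EXT4_RESIZING bit that admits one resizer at a time and fails fast for everyone else. A userspace model of the same guard, using C11 atomic_flag in place of test_and_set_bit_lock()/clear_bit_unlock():

#include <errno.h>
#include <stdatomic.h>

static atomic_flag resizing = ATOMIC_FLAG_INIT;

int resize_begin(void)
{
	/* acquire semantics pair with the release in resize_end() */
	if (atomic_flag_test_and_set_explicit(&resizing,
					      memory_order_acquire))
		return -EBUSY;	/* another resizer is already running */
	return 0;
}

void resize_end(void)
{
	atomic_flag_clear_explicit(&resizing, memory_order_release);
}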
@@ -118,10 +147,8 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
118 brelse(bh); 147 brelse(bh);
119 bh = ERR_PTR(err); 148 bh = ERR_PTR(err);
120 } else { 149 } else {
121 lock_buffer(bh);
122 memset(bh->b_data, 0, sb->s_blocksize); 150 memset(bh->b_data, 0, sb->s_blocksize);
123 set_buffer_uptodate(bh); 151 set_buffer_uptodate(bh);
124 unlock_buffer(bh);
125 } 152 }
126 153
127 return bh; 154 return bh;
@@ -132,8 +159,7 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
132 * If that fails, restart the transaction & regain write access for the 159 * If that fails, restart the transaction & regain write access for the
133 * buffer head which is used for block_bitmap modifications. 160 * buffer head which is used for block_bitmap modifications.
134 */ 161 */
135static int extend_or_restart_transaction(handle_t *handle, int thresh, 162static int extend_or_restart_transaction(handle_t *handle, int thresh)
136 struct buffer_head *bh)
137{ 163{
138 int err; 164 int err;
139 165
@@ -144,9 +170,8 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh,
144 if (err < 0) 170 if (err < 0)
145 return err; 171 return err;
146 if (err) { 172 if (err) {
147 if ((err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA))) 173 err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA);
148 return err; 174 if (err)
149 if ((err = ext4_journal_get_write_access(handle, bh)))
150 return err; 175 return err;
151 } 176 }
152 177
@@ -181,21 +206,7 @@ static int setup_new_group_blocks(struct super_block *sb,
181 if (IS_ERR(handle)) 206 if (IS_ERR(handle))
182 return PTR_ERR(handle); 207 return PTR_ERR(handle);
183 208
184 mutex_lock(&sbi->s_resize_lock); 209 BUG_ON(input->group != sbi->s_groups_count);
185 if (input->group != sbi->s_groups_count) {
186 err = -EBUSY;
187 goto exit_journal;
188 }
189
190 if (IS_ERR(bh = bclean(handle, sb, input->block_bitmap))) {
191 err = PTR_ERR(bh);
192 goto exit_journal;
193 }
194
195 if (ext4_bg_has_super(sb, input->group)) {
196 ext4_debug("mark backup superblock %#04llx (+0)\n", start);
197 ext4_set_bit(0, bh->b_data);
198 }
199 210
200 /* Copy all of the GDT blocks into the backup in this group */ 211 /* Copy all of the GDT blocks into the backup in this group */
201 for (i = 0, bit = 1, block = start + 1; 212 for (i = 0, bit = 1, block = start + 1;
@@ -203,29 +214,26 @@ static int setup_new_group_blocks(struct super_block *sb,
203 struct buffer_head *gdb; 214 struct buffer_head *gdb;
204 215
205 ext4_debug("update backup group %#04llx (+%d)\n", block, bit); 216 ext4_debug("update backup group %#04llx (+%d)\n", block, bit);
206 217 err = extend_or_restart_transaction(handle, 1);
207 if ((err = extend_or_restart_transaction(handle, 1, bh))) 218 if (err)
208 goto exit_bh; 219 goto exit_journal;
209 220
210 gdb = sb_getblk(sb, block); 221 gdb = sb_getblk(sb, block);
211 if (!gdb) { 222 if (!gdb) {
212 err = -EIO; 223 err = -EIO;
213 goto exit_bh; 224 goto exit_journal;
214 } 225 }
215 if ((err = ext4_journal_get_write_access(handle, gdb))) { 226 if ((err = ext4_journal_get_write_access(handle, gdb))) {
216 brelse(gdb); 227 brelse(gdb);
217 goto exit_bh; 228 goto exit_journal;
218 } 229 }
219 lock_buffer(gdb);
220 memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); 230 memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size);
221 set_buffer_uptodate(gdb); 231 set_buffer_uptodate(gdb);
222 unlock_buffer(gdb);
223 err = ext4_handle_dirty_metadata(handle, NULL, gdb); 232 err = ext4_handle_dirty_metadata(handle, NULL, gdb);
224 if (unlikely(err)) { 233 if (unlikely(err)) {
225 brelse(gdb); 234 brelse(gdb);
226 goto exit_bh; 235 goto exit_journal;
227 } 236 }
228 ext4_set_bit(bit, bh->b_data);
229 brelse(gdb); 237 brelse(gdb);
230 } 238 }
231 239
@@ -235,9 +243,22 @@ static int setup_new_group_blocks(struct super_block *sb,
235 err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb, 243 err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb,
236 GFP_NOFS); 244 GFP_NOFS);
237 if (err) 245 if (err)
238 goto exit_bh; 246 goto exit_journal;
239 for (i = 0, bit = gdblocks + 1; i < reserved_gdb; i++, bit++) 247
240 ext4_set_bit(bit, bh->b_data); 248 err = extend_or_restart_transaction(handle, 2);
249 if (err)
250 goto exit_journal;
251
252 bh = bclean(handle, sb, input->block_bitmap);
253 if (IS_ERR(bh)) {
254 err = PTR_ERR(bh);
255 goto exit_journal;
256 }
257
258 if (ext4_bg_has_super(sb, input->group)) {
259 ext4_debug("mark backup group tables %#04llx (+0)\n", start);
260 ext4_set_bits(bh->b_data, 0, gdblocks + reserved_gdb + 1);
261 }
241 262
242 ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap, 263 ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap,
243 input->block_bitmap - start); 264 input->block_bitmap - start);
@@ -253,12 +274,9 @@ static int setup_new_group_blocks(struct super_block *sb,
253 err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS); 274 err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS);
254 if (err) 275 if (err)
255 goto exit_bh; 276 goto exit_bh;
256 for (i = 0, bit = input->inode_table - start; 277 ext4_set_bits(bh->b_data, input->inode_table - start,
257 i < sbi->s_itb_per_group; i++, bit++) 278 sbi->s_itb_per_group);
258 ext4_set_bit(bit, bh->b_data);
259 279
260 if ((err = extend_or_restart_transaction(handle, 2, bh)))
261 goto exit_bh;
262 280
263 ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, 281 ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8,
264 bh->b_data); 282 bh->b_data);
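
The conversions in setup_new_group_blocks() replace per-bit ext4_set_bit() loops with one ext4_set_bits() call per contiguous range. The sketch below shows the intended API shape with a deliberately naive per-bit body; the real helper can work a word at a time, which is what makes the one-call-per-range interface worth having.

#include <limits.h>

#define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)

/* Set len consecutive bits starting at start. */
static void set_bits(unsigned long *map, unsigned int start, unsigned int len)
{
	unsigned int b;

	for (b = start; b < start + len; b++)
		map[b / BITS_PER_LONG] |= 1UL << (b % BITS_PER_LONG);
}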
@@ -285,7 +303,6 @@ exit_bh:
285 brelse(bh); 303 brelse(bh);
286 304
287exit_journal: 305exit_journal:
288 mutex_unlock(&sbi->s_resize_lock);
289 if ((err2 = ext4_journal_stop(handle)) && !err) 306 if ((err2 = ext4_journal_stop(handle)) && !err)
290 err = err2; 307 err = err2;
291 308
@@ -377,15 +394,15 @@ static int verify_reserved_gdb(struct super_block *sb,
377 * fail once we start modifying the data on disk, because JBD has no rollback. 394 * fail once we start modifying the data on disk, because JBD has no rollback.
378 */ 395 */
379static int add_new_gdb(handle_t *handle, struct inode *inode, 396static int add_new_gdb(handle_t *handle, struct inode *inode,
380 struct ext4_new_group_data *input, 397 ext4_group_t group)
381 struct buffer_head **primary)
382{ 398{
383 struct super_block *sb = inode->i_sb; 399 struct super_block *sb = inode->i_sb;
384 struct ext4_super_block *es = EXT4_SB(sb)->s_es; 400 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
385 unsigned long gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb); 401 unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
386 ext4_fsblk_t gdblock = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num; 402 ext4_fsblk_t gdblock = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num;
387 struct buffer_head **o_group_desc, **n_group_desc; 403 struct buffer_head **o_group_desc, **n_group_desc;
388 struct buffer_head *dind; 404 struct buffer_head *dind;
405 struct buffer_head *gdb_bh;
389 int gdbackups; 406 int gdbackups;
390 struct ext4_iloc iloc; 407 struct ext4_iloc iloc;
391 __le32 *data; 408 __le32 *data;
@@ -408,11 +425,12 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
408 return -EPERM; 425 return -EPERM;
409 } 426 }
410 427
411 *primary = sb_bread(sb, gdblock); 428 gdb_bh = sb_bread(sb, gdblock);
412 if (!*primary) 429 if (!gdb_bh)
413 return -EIO; 430 return -EIO;
414 431
415 if ((gdbackups = verify_reserved_gdb(sb, *primary)) < 0) { 432 gdbackups = verify_reserved_gdb(sb, gdb_bh);
433 if (gdbackups < 0) {
416 err = gdbackups; 434 err = gdbackups;
417 goto exit_bh; 435 goto exit_bh;
418 } 436 }
@@ -427,7 +445,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
427 data = (__le32 *)dind->b_data; 445 data = (__le32 *)dind->b_data;
428 if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) { 446 if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) {
429 ext4_warning(sb, "new group %u GDT block %llu not reserved", 447 ext4_warning(sb, "new group %u GDT block %llu not reserved",
430 input->group, gdblock); 448 group, gdblock);
431 err = -EINVAL; 449 err = -EINVAL;
432 goto exit_dind; 450 goto exit_dind;
433 } 451 }
@@ -436,7 +454,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
436 if (unlikely(err)) 454 if (unlikely(err))
437 goto exit_dind; 455 goto exit_dind;
438 456
439 err = ext4_journal_get_write_access(handle, *primary); 457 err = ext4_journal_get_write_access(handle, gdb_bh);
440 if (unlikely(err)) 458 if (unlikely(err))
441 goto exit_sbh; 459 goto exit_sbh;
442 460
@@ -449,12 +467,13 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
449 if (unlikely(err)) 467 if (unlikely(err))
450 goto exit_dindj; 468 goto exit_dindj;
451 469
452 n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *), 470 n_group_desc = ext4_kvmalloc((gdb_num + 1) *
453 GFP_NOFS); 471 sizeof(struct buffer_head *),
472 GFP_NOFS);
454 if (!n_group_desc) { 473 if (!n_group_desc) {
455 err = -ENOMEM; 474 err = -ENOMEM;
456 ext4_warning(sb, 475 ext4_warning(sb, "not enough memory for %lu groups",
457 "not enough memory for %lu groups", gdb_num + 1); 476 gdb_num + 1);
458 goto exit_inode; 477 goto exit_inode;
459 } 478 }
460 479
@@ -475,8 +494,8 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
475 } 494 }
476 inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; 495 inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9;
477 ext4_mark_iloc_dirty(handle, inode, &iloc); 496 ext4_mark_iloc_dirty(handle, inode, &iloc);
478 memset((*primary)->b_data, 0, sb->s_blocksize); 497 memset(gdb_bh->b_data, 0, sb->s_blocksize);
479 err = ext4_handle_dirty_metadata(handle, NULL, *primary); 498 err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh);
480 if (unlikely(err)) { 499 if (unlikely(err)) {
481 ext4_std_error(sb, err); 500 ext4_std_error(sb, err);
482 goto exit_inode; 501 goto exit_inode;
@@ -486,10 +505,10 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
486 o_group_desc = EXT4_SB(sb)->s_group_desc; 505 o_group_desc = EXT4_SB(sb)->s_group_desc;
487 memcpy(n_group_desc, o_group_desc, 506 memcpy(n_group_desc, o_group_desc,
488 EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *)); 507 EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *));
489 n_group_desc[gdb_num] = *primary; 508 n_group_desc[gdb_num] = gdb_bh;
490 EXT4_SB(sb)->s_group_desc = n_group_desc; 509 EXT4_SB(sb)->s_group_desc = n_group_desc;
491 EXT4_SB(sb)->s_gdb_count++; 510 EXT4_SB(sb)->s_gdb_count++;
492 kfree(o_group_desc); 511 ext4_kvfree(o_group_desc);
493 512
494 le16_add_cpu(&es->s_reserved_gdt_blocks, -1); 513 le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
495 err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); 514 err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
@@ -499,6 +518,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
499 return err; 518 return err;
500 519
501exit_inode: 520exit_inode:
521 ext4_kvfree(n_group_desc);
502 /* ext4_handle_release_buffer(handle, iloc.bh); */ 522 /* ext4_handle_release_buffer(handle, iloc.bh); */
503 brelse(iloc.bh); 523 brelse(iloc.bh);
504exit_dindj: 524exit_dindj:
@@ -508,7 +528,7 @@ exit_sbh:
508exit_dind: 528exit_dind:
509 brelse(dind); 529 brelse(dind);
510exit_bh: 530exit_bh:
511 brelse(*primary); 531 brelse(gdb_bh);
512 532
513 ext4_debug("leaving with error %d\n", err); 533 ext4_debug("leaving with error %d\n", err);
514 return err; 534 return err;
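
add_new_gdb() grows s_group_desc by allocating a larger array, copying the old pointers, publishing the new array, and only then freeing the old one; the exit_inode hunk further down also frees the new array if a later step fails. A userspace sketch of that grow-and-publish pattern, with illustrative names:

#include <stdlib.h>
#include <string.h>

static int grow_table(void ***table, size_t *count, void *new_entry)
{
	void **old = *table;
	void **new_tbl = malloc((*count + 1) * sizeof(*new_tbl));

	if (!new_tbl)
		return -1;	/* old table stays valid on failure */
	if (*count)
		memcpy(new_tbl, old, *count * sizeof(*new_tbl));
	new_tbl[*count] = new_entry;
	*table = new_tbl;	/* publish the new table */
	(*count)++;
	free(old);		/* safe only after publishing */
	return 0;
}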
@@ -528,7 +548,7 @@ exit_bh:
528 * backup GDT blocks are stored in their reserved primary GDT block. 548 * backup GDT blocks are stored in their reserved primary GDT block.
529 */ 549 */
530static int reserve_backup_gdb(handle_t *handle, struct inode *inode, 550static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
531 struct ext4_new_group_data *input) 551 ext4_group_t group)
532{ 552{
533 struct super_block *sb = inode->i_sb; 553 struct super_block *sb = inode->i_sb;
534 int reserved_gdb =le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks); 554 int reserved_gdb =le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks);
@@ -599,7 +619,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
599 * Finally we can add each of the reserved backup GDT blocks from 619 * Finally we can add each of the reserved backup GDT blocks from
600 * the new group to its reserved primary GDT block. 620 * the new group to its reserved primary GDT block.
601 */ 621 */
602 blk = input->group * EXT4_BLOCKS_PER_GROUP(sb); 622 blk = group * EXT4_BLOCKS_PER_GROUP(sb);
603 for (i = 0; i < reserved_gdb; i++) { 623 for (i = 0; i < reserved_gdb; i++) {
604 int err2; 624 int err2;
605 data = (__le32 *)primary[i]->b_data; 625 data = (__le32 *)primary[i]->b_data;
@@ -799,13 +819,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
799 goto exit_put; 819 goto exit_put;
800 } 820 }
801 821
802 mutex_lock(&sbi->s_resize_lock);
803 if (input->group != sbi->s_groups_count) {
804 ext4_warning(sb, "multiple resizers run on filesystem!");
805 err = -EBUSY;
806 goto exit_journal;
807 }
808
809 if ((err = ext4_journal_get_write_access(handle, sbi->s_sbh))) 822 if ((err = ext4_journal_get_write_access(handle, sbi->s_sbh)))
810 goto exit_journal; 823 goto exit_journal;
811 824
@@ -820,16 +833,25 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
820 if ((err = ext4_journal_get_write_access(handle, primary))) 833 if ((err = ext4_journal_get_write_access(handle, primary)))
821 goto exit_journal; 834 goto exit_journal;
822 835
823 if (reserved_gdb && ext4_bg_num_gdb(sb, input->group) && 836 if (reserved_gdb && ext4_bg_num_gdb(sb, input->group)) {
824 (err = reserve_backup_gdb(handle, inode, input))) 837 err = reserve_backup_gdb(handle, inode, input->group);
838 if (err)
839 goto exit_journal;
840 }
841 } else {
842 /*
843 * Note that we can access new group descriptor block safely
844 * only if add_new_gdb() succeeds.
845 */
846 err = add_new_gdb(handle, inode, input->group);
847 if (err)
825 goto exit_journal; 848 goto exit_journal;
826 } else if ((err = add_new_gdb(handle, inode, input, &primary))) 849 primary = sbi->s_group_desc[gdb_num];
827 goto exit_journal; 850 }
828 851
829 /* 852 /*
830 * OK, now we've set up the new group. Time to make it active. 853 * OK, now we've set up the new group. Time to make it active.
831 * 854 *
832 * We do not lock all allocations via s_resize_lock
833 * so we have to be safe wrt. concurrent accesses the group 855 * so we have to be safe wrt. concurrent accesses the group
834 * data. So we need to be careful to set all of the relevant 856 * data. So we need to be careful to set all of the relevant
835 * group descriptor data etc. *before* we enable the group. 857 * group descriptor data etc. *before* we enable the group.
@@ -886,13 +908,9 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
886 * 908 *
887 * The precise rules we use are: 909 * The precise rules we use are:
888 * 910 *
889 * * Writers of s_groups_count *must* hold s_resize_lock
890 * AND
891 * * Writers must perform a smp_wmb() after updating all dependent 911 * * Writers must perform a smp_wmb() after updating all dependent
892 * data and before modifying the groups count 912 * data and before modifying the groups count
893 * 913 *
894 * * Readers must hold s_resize_lock over the access
895 * OR
896 * * Readers must perform an smp_rmb() after reading the groups count 914 * * Readers must perform an smp_rmb() after reading the groups count
897 * and before reading any dependent data. 915 * and before reading any dependent data.
898 * 916 *
@@ -937,10 +955,9 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
937 ext4_handle_dirty_super(handle, sb); 955 ext4_handle_dirty_super(handle, sb);
938 956
939exit_journal: 957exit_journal:
940 mutex_unlock(&sbi->s_resize_lock);
941 if ((err2 = ext4_journal_stop(handle)) && !err) 958 if ((err2 = ext4_journal_stop(handle)) && !err)
942 err = err2; 959 err = err2;
943 if (!err) { 960 if (!err && primary) {
944 update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, 961 update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es,
945 sizeof(struct ext4_super_block)); 962 sizeof(struct ext4_super_block));
946 update_backups(sb, primary->b_blocknr, primary->b_data, 963 update_backups(sb, primary->b_blocknr, primary->b_data,
@@ -969,16 +986,13 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
969 ext4_grpblk_t add; 986 ext4_grpblk_t add;
970 struct buffer_head *bh; 987 struct buffer_head *bh;
971 handle_t *handle; 988 handle_t *handle;
972 int err; 989 int err, err2;
973 ext4_group_t group; 990 ext4_group_t group;
974 991
975 /* We don't need to worry about locking wrt other resizers just
976 * yet: we're going to revalidate es->s_blocks_count after
977 * taking the s_resize_lock below. */
978 o_blocks_count = ext4_blocks_count(es); 992 o_blocks_count = ext4_blocks_count(es);
979 993
980 if (test_opt(sb, DEBUG)) 994 if (test_opt(sb, DEBUG))
981 printk(KERN_DEBUG "EXT4-fs: extending last group from %llu uto %llu blocks\n", 995 printk(KERN_DEBUG "EXT4-fs: extending last group from %llu to %llu blocks\n",
982 o_blocks_count, n_blocks_count); 996 o_blocks_count, n_blocks_count);
983 997
984 if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) 998 if (n_blocks_count == 0 || n_blocks_count == o_blocks_count)
@@ -995,7 +1009,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
995 1009
996 if (n_blocks_count < o_blocks_count) { 1010 if (n_blocks_count < o_blocks_count) {
997 ext4_warning(sb, "can't shrink FS - resize aborted"); 1011 ext4_warning(sb, "can't shrink FS - resize aborted");
998 return -EBUSY; 1012 return -EINVAL;
999 } 1013 }
1000 1014
1001 /* Handle the remaining blocks in the last group only. */ 1015 /* Handle the remaining blocks in the last group only. */
@@ -1038,32 +1052,25 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1038 goto exit_put; 1052 goto exit_put;
1039 } 1053 }
1040 1054
1041 mutex_lock(&EXT4_SB(sb)->s_resize_lock);
1042 if (o_blocks_count != ext4_blocks_count(es)) {
1043 ext4_warning(sb, "multiple resizers run on filesystem!");
1044 mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
1045 ext4_journal_stop(handle);
1046 err = -EBUSY;
1047 goto exit_put;
1048 }
1049
1050 if ((err = ext4_journal_get_write_access(handle, 1055 if ((err = ext4_journal_get_write_access(handle,
1051 EXT4_SB(sb)->s_sbh))) { 1056 EXT4_SB(sb)->s_sbh))) {
1052 ext4_warning(sb, "error %d on journal write access", err); 1057 ext4_warning(sb, "error %d on journal write access", err);
1053 mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
1054 ext4_journal_stop(handle); 1058 ext4_journal_stop(handle);
1055 goto exit_put; 1059 goto exit_put;
1056 } 1060 }
1057 ext4_blocks_count_set(es, o_blocks_count + add); 1061 ext4_blocks_count_set(es, o_blocks_count + add);
1058 mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
1059 ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, 1062 ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
1060 o_blocks_count + add); 1063 o_blocks_count + add);
1061 /* We add the blocks to the bitmap and set the group need init bit */ 1064 /* We add the blocks to the bitmap and set the group need init bit */
1062 ext4_add_groupblocks(handle, sb, o_blocks_count, add); 1065 err = ext4_group_add_blocks(handle, sb, o_blocks_count, add);
1063 ext4_handle_dirty_super(handle, sb); 1066 ext4_handle_dirty_super(handle, sb);
1064 ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, 1067 ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
1065 o_blocks_count + add); 1068 o_blocks_count + add);
1066 if ((err = ext4_journal_stop(handle))) 1069 err2 = ext4_journal_stop(handle);
1070 if (!err && err2)
1071 err = err2;
1072
1073 if (err)
1067 goto exit_put; 1074 goto exit_put;
1068 1075
1069 if (test_opt(sb, DEBUG)) 1076 if (test_opt(sb, DEBUG))
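
The ext4_group_extend() rewrite at the end of resize.c keeps the first error while still stopping the journal, using the err/err2 idiom instead of returning straight from ext4_journal_stop(). A standalone rendering of that idiom, with placeholder do_op()/do_cleanup() standing in for the block-count update and ext4_journal_stop():

static int do_op(void) { return 0; }		/* may fail */
static int do_cleanup(void) { return 0; }	/* must always run */

int op_with_cleanup(void)
{
	int err = do_op();
	int err2 = do_cleanup();

	if (!err && err2)
		err = err2;	/* surface the cleanup failure only if the
				 * operation itself succeeded */
	return err;
}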
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 9ea71aa864b3..44d0c8db2239 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -110,6 +110,35 @@ static struct file_system_type ext3_fs_type = {
110#define IS_EXT3_SB(sb) (0) 110#define IS_EXT3_SB(sb) (0)
111#endif 111#endif
112 112
113void *ext4_kvmalloc(size_t size, gfp_t flags)
114{
115 void *ret;
116
117 ret = kmalloc(size, flags);
118 if (!ret)
119 ret = __vmalloc(size, flags, PAGE_KERNEL);
120 return ret;
121}
122
123void *ext4_kvzalloc(size_t size, gfp_t flags)
124{
125 void *ret;
126
127 ret = kzalloc(size, flags);
128 if (!ret)
129 ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL);
130 return ret;
131}
132
133void ext4_kvfree(void *ptr)
134{
135 if (is_vmalloc_addr(ptr))
136 vfree(ptr);
137 else
138 kfree(ptr);
139
140}
141
113ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, 142ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
114 struct ext4_group_desc *bg) 143 struct ext4_group_desc *bg)
115{ 144{
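
ext4_kvmalloc()/ext4_kvzalloc()/ext4_kvfree(), added above, try the slab allocator first and fall back to vmalloc for allocations too large to be contiguous, with the free routed by is_vmalloc_addr(). The userspace sketch below mimics the pair: a fixed pool stands in for kmalloc, malloc() for __vmalloc(), and a pool-range test for is_vmalloc_addr(). The bump pool never reclaims, so this is illustration only.

#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

static char pool[4096];		/* fixed pool standing in for the slab */
static size_t pool_used;

static bool from_pool(const void *p)
{
	uintptr_t a = (uintptr_t)p;

	return a >= (uintptr_t)pool && a < (uintptr_t)(pool + sizeof(pool));
}

void *kv_alloc(size_t size)
{
	if (pool_used + size <= sizeof(pool)) {		/* fast path */
		void *p = pool + pool_used;

		pool_used += size;
		return p;
	}
	return malloc(size);	/* fallback allocator */
}

void kv_free(void *p)
{
	if (!from_pool(p))	/* stands in for is_vmalloc_addr() */
		free(p);
	/* pool memory is bump-allocated and never reclaimed here */
}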
@@ -269,6 +298,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
269 journal_t *journal; 298 journal_t *journal;
270 handle_t *handle; 299 handle_t *handle;
271 300
301 trace_ext4_journal_start(sb, nblocks, _RET_IP_);
272 if (sb->s_flags & MS_RDONLY) 302 if (sb->s_flags & MS_RDONLY)
273 return ERR_PTR(-EROFS); 303 return ERR_PTR(-EROFS);
274 304
@@ -789,11 +819,8 @@ static void ext4_put_super(struct super_block *sb)
789 819
790 for (i = 0; i < sbi->s_gdb_count; i++) 820 for (i = 0; i < sbi->s_gdb_count; i++)
791 brelse(sbi->s_group_desc[i]); 821 brelse(sbi->s_group_desc[i]);
792 kfree(sbi->s_group_desc); 822 ext4_kvfree(sbi->s_group_desc);
793 if (is_vmalloc_addr(sbi->s_flex_groups)) 823 ext4_kvfree(sbi->s_flex_groups);
794 vfree(sbi->s_flex_groups);
795 else
796 kfree(sbi->s_flex_groups);
797 percpu_counter_destroy(&sbi->s_freeblocks_counter); 824 percpu_counter_destroy(&sbi->s_freeblocks_counter);
798 percpu_counter_destroy(&sbi->s_freeinodes_counter); 825 percpu_counter_destroy(&sbi->s_freeinodes_counter);
799 percpu_counter_destroy(&sbi->s_dirs_counter); 826 percpu_counter_destroy(&sbi->s_dirs_counter);
@@ -892,7 +919,6 @@ static void ext4_i_callback(struct rcu_head *head)
892 919
893static void ext4_destroy_inode(struct inode *inode) 920static void ext4_destroy_inode(struct inode *inode)
894{ 921{
895 ext4_ioend_wait(inode);
896 if (!list_empty(&(EXT4_I(inode)->i_orphan))) { 922 if (!list_empty(&(EXT4_I(inode)->i_orphan))) {
897 ext4_msg(inode->i_sb, KERN_ERR, 923 ext4_msg(inode->i_sb, KERN_ERR,
898 "Inode %lu (%p): orphan list check failed!", 924 "Inode %lu (%p): orphan list check failed!",
@@ -1976,15 +2002,11 @@ static int ext4_fill_flex_info(struct super_block *sb)
1976 ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) << 2002 ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) <<
1977 EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex; 2003 EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex;
1978 size = flex_group_count * sizeof(struct flex_groups); 2004 size = flex_group_count * sizeof(struct flex_groups);
1979 sbi->s_flex_groups = kzalloc(size, GFP_KERNEL); 2005 sbi->s_flex_groups = ext4_kvzalloc(size, GFP_KERNEL);
1980 if (sbi->s_flex_groups == NULL) { 2006 if (sbi->s_flex_groups == NULL) {
1981 sbi->s_flex_groups = vzalloc(size); 2007 ext4_msg(sb, KERN_ERR, "not enough memory for %u flex groups",
1982 if (sbi->s_flex_groups == NULL) { 2008 flex_group_count);
1983 ext4_msg(sb, KERN_ERR, 2009 goto failed;
1984 "not enough memory for %u flex groups",
1985 flex_group_count);
1986 goto failed;
1987 }
1988 } 2010 }
1989 2011
1990 for (i = 0; i < sbi->s_groups_count; i++) { 2012 for (i = 0; i < sbi->s_groups_count; i++) {
@@ -2383,17 +2405,25 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
2383 unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride); 2405 unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride);
2384 unsigned long stripe_width = 2406 unsigned long stripe_width =
2385 le32_to_cpu(sbi->s_es->s_raid_stripe_width); 2407 le32_to_cpu(sbi->s_es->s_raid_stripe_width);
2408 int ret;
2386 2409
2387 if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group) 2410 if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group)
2388 return sbi->s_stripe; 2411 ret = sbi->s_stripe;
2389 2412 else if (stripe_width <= sbi->s_blocks_per_group)
2390 if (stripe_width <= sbi->s_blocks_per_group) 2413 ret = stripe_width;
2391 return stripe_width; 2414 else if (stride <= sbi->s_blocks_per_group)
2415 ret = stride;
2416 else
2417 ret = 0;
2392 2418
2393 if (stride <= sbi->s_blocks_per_group) 2419 /*
2394 return stride; 2420 * If the stripe width is 1, this makes no sense and
2421 * we set it to 0 to turn off stripe handling code.
2422 */
2423 if (ret <= 1)
2424 ret = 0;
2395 2425
2396 return 0; 2426 return ret;
2397} 2427}
2398 2428
2399/* sysfs support */ 2429/* sysfs support */
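
The ext4_get_stripe_size() rewrite funnels all candidates through a single exit so the final sanity clamp applies uniformly: a stripe of 1 would only add overhead, and 0 disables the stripe code. The same selection, lifted out as a standalone function:

static unsigned long pick_stripe(unsigned long stripe, unsigned long width,
				 unsigned long stride,
				 unsigned long blocks_per_group)
{
	unsigned long ret;

	if (stripe && stripe <= blocks_per_group)
		ret = stripe;
	else if (width <= blocks_per_group)
		ret = width;
	else if (stride <= blocks_per_group)
		ret = stride;
	else
		ret = 0;

	if (ret <= 1)		/* a stripe of 1 makes no sense */
		ret = 0;	/* 0 turns off stripe handling entirely */
	return ret;
}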
@@ -3408,8 +3438,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3408 (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); 3438 (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
3409 db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / 3439 db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
3410 EXT4_DESC_PER_BLOCK(sb); 3440 EXT4_DESC_PER_BLOCK(sb);
3411 sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *), 3441 sbi->s_group_desc = ext4_kvmalloc(db_count *
3412 GFP_KERNEL); 3442 sizeof(struct buffer_head *),
3443 GFP_KERNEL);
3413 if (sbi->s_group_desc == NULL) { 3444 if (sbi->s_group_desc == NULL) {
3414 ext4_msg(sb, KERN_ERR, "not enough memory"); 3445 ext4_msg(sb, KERN_ERR, "not enough memory");
3415 goto failed_mount; 3446 goto failed_mount;
@@ -3491,7 +3522,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3491 3522
3492 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ 3523 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
3493 mutex_init(&sbi->s_orphan_lock); 3524 mutex_init(&sbi->s_orphan_lock);
3494 mutex_init(&sbi->s_resize_lock); 3525 sbi->s_resize_flags = 0;
3495 3526
3496 sb->s_root = NULL; 3527 sb->s_root = NULL;
3497 3528
@@ -3741,12 +3772,8 @@ failed_mount_wq:
3741 } 3772 }
3742failed_mount3: 3773failed_mount3:
3743 del_timer(&sbi->s_err_report); 3774 del_timer(&sbi->s_err_report);
3744 if (sbi->s_flex_groups) { 3775 if (sbi->s_flex_groups)
3745 if (is_vmalloc_addr(sbi->s_flex_groups)) 3776 ext4_kvfree(sbi->s_flex_groups);
3746 vfree(sbi->s_flex_groups);
3747 else
3748 kfree(sbi->s_flex_groups);
3749 }
3750 percpu_counter_destroy(&sbi->s_freeblocks_counter); 3777 percpu_counter_destroy(&sbi->s_freeblocks_counter);
3751 percpu_counter_destroy(&sbi->s_freeinodes_counter); 3778 percpu_counter_destroy(&sbi->s_freeinodes_counter);
3752 percpu_counter_destroy(&sbi->s_dirs_counter); 3779 percpu_counter_destroy(&sbi->s_dirs_counter);
@@ -3756,7 +3783,7 @@ failed_mount3:
3756failed_mount2: 3783failed_mount2:
3757 for (i = 0; i < db_count; i++) 3784 for (i = 0; i < db_count; i++)
3758 brelse(sbi->s_group_desc[i]); 3785 brelse(sbi->s_group_desc[i]);
3759 kfree(sbi->s_group_desc); 3786 ext4_kvfree(sbi->s_group_desc);
3760failed_mount: 3787failed_mount:
3761 if (sbi->s_proc) { 3788 if (sbi->s_proc) {
3762 remove_proc_entry(sb->s_id, ext4_proc_root); 3789 remove_proc_entry(sb->s_id, ext4_proc_root);
diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h
new file mode 100644
index 000000000000..011ba6670d99
--- /dev/null
+++ b/fs/ext4/truncate.h
@@ -0,0 +1,43 @@
1/*
2 * linux/fs/ext4/truncate.h
3 *
4 * Common inline functions needed for truncate support
5 */
6
7/*
8 * Truncate blocks that were not used by write. We have to truncate the
9 * pagecache as well so that corresponding buffers get properly unmapped.
10 */
11static inline void ext4_truncate_failed_write(struct inode *inode)
12{
13 truncate_inode_pages(inode->i_mapping, inode->i_size);
14 ext4_truncate(inode);
15}
16
17/*
18 * Work out how many blocks we need to proceed with the next chunk of a
19 * truncate transaction.
20 */
21static inline unsigned long ext4_blocks_for_truncate(struct inode *inode)
22{
23 ext4_lblk_t needed;
24
25 needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
26
27 /* Give ourselves just enough room to cope with inodes in which
28 * i_blocks is corrupt: we've seen disk corruptions in the past
29 * which resulted in random data in an inode which looked enough
30 * like a regular file for ext4 to try to delete it. Things
31 * will go a bit crazy if that happens, but at least we should
32 * try not to panic the whole kernel. */
33 if (needed < 2)
34 needed = 2;
35
36 /* But we need to bound the transaction so we don't overflow the
37 * journal. */
38 if (needed > EXT4_MAX_TRANS_DATA)
39 needed = EXT4_MAX_TRANS_DATA;
40
41 return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
42}
43
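
The new ext4_blocks_for_truncate() clamps a possibly-corrupt i_blocks-derived estimate into [2, EXT4_MAX_TRANS_DATA] before adding the fixed per-transaction cost, so one bad inode cannot size a transaction past the journal. The clamp in isolation:

/* Guard against a corrupt (tiny or huge) block estimate. */
static unsigned long clamp_credits(unsigned long needed, unsigned long max)
{
	if (needed < 2)		/* enough even for a corrupt-looking inode */
		needed = 2;
	if (needed > max)	/* never overflow the journal */
		needed = max;
	return needed;
}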
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 4ad64732cbce..5efbd5d7701a 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -1231,7 +1231,7 @@ int fat_add_entries(struct inode *dir, void *slots, int nr_slots,
1231 struct super_block *sb = dir->i_sb; 1231 struct super_block *sb = dir->i_sb;
1232 struct msdos_sb_info *sbi = MSDOS_SB(sb); 1232 struct msdos_sb_info *sbi = MSDOS_SB(sb);
1233 struct buffer_head *bh, *prev, *bhs[3]; /* 32*slots (672bytes) */ 1233 struct buffer_head *bh, *prev, *bhs[3]; /* 32*slots (672bytes) */
1234 struct msdos_dir_entry *de; 1234 struct msdos_dir_entry *uninitialized_var(de);
1235 int err, free_slots, i, nr_bhs; 1235 int err, free_slots, i, nr_bhs;
1236 loff_t pos, i_pos; 1236 loff_t pos, i_pos;
1237 1237
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 5942fec22c65..1726d7303047 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -1188,9 +1188,9 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat,
1188out: 1188out:
1189 /* UTF-8 doesn't provide FAT semantics */ 1189 /* UTF-8 doesn't provide FAT semantics */
1190 if (!strcmp(opts->iocharset, "utf8")) { 1190 if (!strcmp(opts->iocharset, "utf8")) {
1191 fat_msg(sb, KERN_ERR, "utf8 is not a recommended IO charset" 1191 fat_msg(sb, KERN_WARNING, "utf8 is not a recommended IO charset"
1192 " for FAT filesystems, filesystem will be " 1192 " for FAT filesystems, filesystem will be "
1193 "case sensitive!\n"); 1193 "case sensitive!");
1194 } 1194 }
1195 1195
1196 /* If user doesn't specify allow_utime, it's initialized from dmask. */ 1196 /* If user doesn't specify allow_utime, it's initialized from dmask. */
@@ -1367,6 +1367,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
1367 sbi->free_clusters = -1; /* Don't know yet */ 1367 sbi->free_clusters = -1; /* Don't know yet */
1368 sbi->free_clus_valid = 0; 1368 sbi->free_clus_valid = 0;
1369 sbi->prev_free = FAT_START_ENT; 1369 sbi->prev_free = FAT_START_ENT;
1370 sb->s_maxbytes = 0xffffffff;
1370 1371
1371 if (!sbi->fat_length && b->fat32_length) { 1372 if (!sbi->fat_length && b->fat32_length) {
1372 struct fat_boot_fsinfo *fsinfo; 1373 struct fat_boot_fsinfo *fsinfo;
@@ -1377,8 +1378,6 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
1377 sbi->fat_length = le32_to_cpu(b->fat32_length); 1378 sbi->fat_length = le32_to_cpu(b->fat32_length);
1378 sbi->root_cluster = le32_to_cpu(b->root_cluster); 1379 sbi->root_cluster = le32_to_cpu(b->root_cluster);
1379 1380
1380 sb->s_maxbytes = 0xffffffff;
1381
1382 /* MC - if info_sector is 0, don't multiply by 0 */ 1381 /* MC - if info_sector is 0, don't multiply by 0 */
1383 sbi->fsinfo_sector = le16_to_cpu(b->info_sector); 1382 sbi->fsinfo_sector = le16_to_cpu(b->info_sector);
1384 if (sbi->fsinfo_sector == 0) 1383 if (sbi->fsinfo_sector == 0)
diff --git a/fs/file_table.c b/fs/file_table.c
index 01e4c1e8e6b6..c322794f7360 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -25,7 +25,7 @@
25#include <linux/percpu.h> 25#include <linux/percpu.h>
26#include <linux/ima.h> 26#include <linux/ima.h>
27 27
28#include <asm/atomic.h> 28#include <linux/atomic.h>
29 29
30#include "internal.h" 30#include "internal.h"
31 31
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index b8c507ca42f7..04cf3b91e501 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -35,7 +35,9 @@
35struct wb_writeback_work { 35struct wb_writeback_work {
36 long nr_pages; 36 long nr_pages;
37 struct super_block *sb; 37 struct super_block *sb;
38 unsigned long *older_than_this;
38 enum writeback_sync_modes sync_mode; 39 enum writeback_sync_modes sync_mode;
40 unsigned int tagged_writepages:1;
39 unsigned int for_kupdate:1; 41 unsigned int for_kupdate:1;
40 unsigned int range_cyclic:1; 42 unsigned int range_cyclic:1;
41 unsigned int for_background:1; 43 unsigned int for_background:1;
@@ -180,12 +182,13 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi)
180 */ 182 */
181void inode_wb_list_del(struct inode *inode) 183void inode_wb_list_del(struct inode *inode)
182{ 184{
183 spin_lock(&inode_wb_list_lock); 185 struct backing_dev_info *bdi = inode_to_bdi(inode);
186
187 spin_lock(&bdi->wb.list_lock);
184 list_del_init(&inode->i_wb_list); 188 list_del_init(&inode->i_wb_list);
185 spin_unlock(&inode_wb_list_lock); 189 spin_unlock(&bdi->wb.list_lock);
186} 190}
187 191
188
189/* 192/*
190 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the 193 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
191 * furthest end of its superblock's dirty-inode list. 194 * furthest end of its superblock's dirty-inode list.
@@ -195,11 +198,9 @@ void inode_wb_list_del(struct inode *inode)
195 * the case then the inode must have been redirtied while it was being written 198 * the case then the inode must have been redirtied while it was being written
196 * out and we don't reset its dirtied_when. 199 * out and we don't reset its dirtied_when.
197 */ 200 */
198static void redirty_tail(struct inode *inode) 201static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
199{ 202{
200 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; 203 assert_spin_locked(&wb->list_lock);
201
202 assert_spin_locked(&inode_wb_list_lock);
203 if (!list_empty(&wb->b_dirty)) { 204 if (!list_empty(&wb->b_dirty)) {
204 struct inode *tail; 205 struct inode *tail;
205 206
@@ -213,11 +214,9 @@ static void redirty_tail(struct inode *inode)
213/* 214/*
214 * requeue inode for re-scanning after bdi->b_io list is exhausted. 215 * requeue inode for re-scanning after bdi->b_io list is exhausted.
215 */ 216 */
216static void requeue_io(struct inode *inode) 217static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
217{ 218{
218 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; 219 assert_spin_locked(&wb->list_lock);
219
220 assert_spin_locked(&inode_wb_list_lock);
221 list_move(&inode->i_wb_list, &wb->b_more_io); 220 list_move(&inode->i_wb_list, &wb->b_more_io);
222} 221}
223 222
@@ -225,7 +224,7 @@ static void inode_sync_complete(struct inode *inode)
225{ 224{
226 /* 225 /*
227 * Prevent speculative execution through 226 * Prevent speculative execution through
228 * spin_unlock(&inode_wb_list_lock); 227 * spin_unlock(&wb->list_lock);
229 */ 228 */
230 229
231 smp_mb(); 230 smp_mb();
@@ -250,15 +249,16 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
250/* 249/*
251 * Move expired dirty inodes from @delaying_queue to @dispatch_queue. 250 * Move expired dirty inodes from @delaying_queue to @dispatch_queue.
252 */ 251 */
253static void move_expired_inodes(struct list_head *delaying_queue, 252static int move_expired_inodes(struct list_head *delaying_queue,
254 struct list_head *dispatch_queue, 253 struct list_head *dispatch_queue,
255 unsigned long *older_than_this) 254 unsigned long *older_than_this)
256{ 255{
257 LIST_HEAD(tmp); 256 LIST_HEAD(tmp);
258 struct list_head *pos, *node; 257 struct list_head *pos, *node;
259 struct super_block *sb = NULL; 258 struct super_block *sb = NULL;
260 struct inode *inode; 259 struct inode *inode;
261 int do_sb_sort = 0; 260 int do_sb_sort = 0;
261 int moved = 0;
262 262
263 while (!list_empty(delaying_queue)) { 263 while (!list_empty(delaying_queue)) {
264 inode = wb_inode(delaying_queue->prev); 264 inode = wb_inode(delaying_queue->prev);
@@ -269,12 +269,13 @@ static void move_expired_inodes(struct list_head *delaying_queue,
269 do_sb_sort = 1; 269 do_sb_sort = 1;
270 sb = inode->i_sb; 270 sb = inode->i_sb;
271 list_move(&inode->i_wb_list, &tmp); 271 list_move(&inode->i_wb_list, &tmp);
272 moved++;
272 } 273 }
273 274
274 /* just one sb in list, splice to dispatch_queue and we're done */ 275 /* just one sb in list, splice to dispatch_queue and we're done */
275 if (!do_sb_sort) { 276 if (!do_sb_sort) {
276 list_splice(&tmp, dispatch_queue); 277 list_splice(&tmp, dispatch_queue);
277 return; 278 goto out;
278 } 279 }
279 280
280 /* Move inodes from one superblock together */ 281 /* Move inodes from one superblock together */
@@ -286,6 +287,8 @@ static void move_expired_inodes(struct list_head *delaying_queue,
286 list_move(&inode->i_wb_list, dispatch_queue); 287 list_move(&inode->i_wb_list, dispatch_queue);
287 } 288 }
288 } 289 }
290out:
291 return moved;
289} 292}
290 293
291/* 294/*
@@ -301,9 +304,11 @@ static void move_expired_inodes(struct list_head *delaying_queue,
301 */ 304 */
302static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) 305static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
303{ 306{
304 assert_spin_locked(&inode_wb_list_lock); 307 int moved;
308 assert_spin_locked(&wb->list_lock);
305 list_splice_init(&wb->b_more_io, &wb->b_io); 309 list_splice_init(&wb->b_more_io, &wb->b_io);
306 move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); 310 moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
311 trace_writeback_queue_io(wb, older_than_this, moved);
307} 312}
308 313
309static int write_inode(struct inode *inode, struct writeback_control *wbc) 314static int write_inode(struct inode *inode, struct writeback_control *wbc)
@@ -316,7 +321,8 @@ static int write_inode(struct inode *inode, struct writeback_control *wbc)
316/* 321/*
317 * Wait for writeback on an inode to complete. 322 * Wait for writeback on an inode to complete.
318 */ 323 */
319static void inode_wait_for_writeback(struct inode *inode) 324static void inode_wait_for_writeback(struct inode *inode,
325 struct bdi_writeback *wb)
320{ 326{
321 DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC); 327 DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
322 wait_queue_head_t *wqh; 328 wait_queue_head_t *wqh;
@@ -324,15 +330,15 @@ static void inode_wait_for_writeback(struct inode *inode)
324 wqh = bit_waitqueue(&inode->i_state, __I_SYNC); 330 wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
325 while (inode->i_state & I_SYNC) { 331 while (inode->i_state & I_SYNC) {
326 spin_unlock(&inode->i_lock); 332 spin_unlock(&inode->i_lock);
327 spin_unlock(&inode_wb_list_lock); 333 spin_unlock(&wb->list_lock);
328 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); 334 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
329 spin_lock(&inode_wb_list_lock); 335 spin_lock(&wb->list_lock);
330 spin_lock(&inode->i_lock); 336 spin_lock(&inode->i_lock);
331 } 337 }
332} 338}
333 339
334/* 340/*
335 * Write out an inode's dirty pages. Called under inode_wb_list_lock and 341 * Write out an inode's dirty pages. Called under wb->list_lock and
336 * inode->i_lock. Either the caller has an active reference on the inode or 342 * inode->i_lock. Either the caller has an active reference on the inode or
337 * the inode has I_WILL_FREE set. 343 * the inode has I_WILL_FREE set.
338 * 344 *
@@ -343,13 +349,15 @@ static void inode_wait_for_writeback(struct inode *inode)
343 * livelocks, etc. 349 * livelocks, etc.
344 */ 350 */
345static int 351static int
346writeback_single_inode(struct inode *inode, struct writeback_control *wbc) 352writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
353 struct writeback_control *wbc)
347{ 354{
348 struct address_space *mapping = inode->i_mapping; 355 struct address_space *mapping = inode->i_mapping;
356 long nr_to_write = wbc->nr_to_write;
349 unsigned dirty; 357 unsigned dirty;
350 int ret; 358 int ret;
351 359
352 assert_spin_locked(&inode_wb_list_lock); 360 assert_spin_locked(&wb->list_lock);
353 assert_spin_locked(&inode->i_lock); 361 assert_spin_locked(&inode->i_lock);
354 362
355 if (!atomic_read(&inode->i_count)) 363 if (!atomic_read(&inode->i_count))
@@ -367,14 +375,16 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
367 * completed a full scan of b_io. 375 * completed a full scan of b_io.
368 */ 376 */
369 if (wbc->sync_mode != WB_SYNC_ALL) { 377 if (wbc->sync_mode != WB_SYNC_ALL) {
370 requeue_io(inode); 378 requeue_io(inode, wb);
379 trace_writeback_single_inode_requeue(inode, wbc,
380 nr_to_write);
371 return 0; 381 return 0;
372 } 382 }
373 383
374 /* 384 /*
375 * It's a data-integrity sync. We must wait. 385 * It's a data-integrity sync. We must wait.
376 */ 386 */
377 inode_wait_for_writeback(inode); 387 inode_wait_for_writeback(inode, wb);
378 } 388 }
379 389
380 BUG_ON(inode->i_state & I_SYNC); 390 BUG_ON(inode->i_state & I_SYNC);
@@ -383,7 +393,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
383 inode->i_state |= I_SYNC; 393 inode->i_state |= I_SYNC;
384 inode->i_state &= ~I_DIRTY_PAGES; 394 inode->i_state &= ~I_DIRTY_PAGES;
385 spin_unlock(&inode->i_lock); 395 spin_unlock(&inode->i_lock);
386 spin_unlock(&inode_wb_list_lock); 396 spin_unlock(&wb->list_lock);
387 397
388 ret = do_writepages(mapping, wbc); 398 ret = do_writepages(mapping, wbc);
389 399
@@ -414,10 +424,19 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
414 ret = err; 424 ret = err;
415 } 425 }
416 426
417 spin_lock(&inode_wb_list_lock); 427 spin_lock(&wb->list_lock);
418 spin_lock(&inode->i_lock); 428 spin_lock(&inode->i_lock);
419 inode->i_state &= ~I_SYNC; 429 inode->i_state &= ~I_SYNC;
420 if (!(inode->i_state & I_FREEING)) { 430 if (!(inode->i_state & I_FREEING)) {
431 /*
432 * Sync livelock prevention. Each inode is tagged and synced in
433 * one shot. If still dirty, it will be redirty_tail()'ed below.
434 * Update the dirty time to prevent enqueue and sync it again.
435 */
436 if ((inode->i_state & I_DIRTY) &&
437 (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
438 inode->dirtied_when = jiffies;
439
421 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { 440 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
422 /* 441 /*
423 * We didn't write back all the pages. nfs_writepages() 442 * We didn't write back all the pages. nfs_writepages()
@@ -428,7 +447,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
428 /* 447 /*
429 * slice used up: queue for next turn 448 * slice used up: queue for next turn
430 */ 449 */
431 requeue_io(inode); 450 requeue_io(inode, wb);
432 } else { 451 } else {
433 /* 452 /*
434 * Writeback blocked by something other than 453 * Writeback blocked by something other than
@@ -437,7 +456,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
437 * retrying writeback of the dirty page/inode 456 * retrying writeback of the dirty page/inode
438 * that cannot be performed immediately. 457 * that cannot be performed immediately.
439 */ 458 */
440 redirty_tail(inode); 459 redirty_tail(inode, wb);
441 } 460 }
442 } else if (inode->i_state & I_DIRTY) { 461 } else if (inode->i_state & I_DIRTY) {
443 /* 462 /*
@@ -446,7 +465,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
446 * submission or metadata updates after data IO 465 * submission or metadata updates after data IO
447 * completion. 466 * completion.
448 */ 467 */
449 redirty_tail(inode); 468 redirty_tail(inode, wb);
450 } else { 469 } else {
451 /* 470 /*
452 * The inode is clean. At this point we either have 471 * The inode is clean. At this point we either have
@@ -457,9 +476,41 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
457 } 476 }
458 } 477 }
459 inode_sync_complete(inode); 478 inode_sync_complete(inode);
479 trace_writeback_single_inode(inode, wbc, nr_to_write);
460 return ret; 480 return ret;
461} 481}
462 482
483static long writeback_chunk_size(struct backing_dev_info *bdi,
484 struct wb_writeback_work *work)
485{
486 long pages;
487
488 /*
489 * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
490 * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
491 * here avoids calling into writeback_inodes_wb() more than once.
492 *
493 * The intended call sequence for WB_SYNC_ALL writeback is:
494 *
495 * wb_writeback()
496 * writeback_sb_inodes() <== called only once
497 * write_cache_pages() <== called once for each inode
498 * (quickly) tag currently dirty pages
499 * (maybe slowly) sync all tagged pages
500 */
501 if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
502 pages = LONG_MAX;
503 else {
504 pages = min(bdi->avg_write_bandwidth / 2,
505 global_dirty_limit / DIRTY_SCOPE);
506 pages = min(pages, work->nr_pages);
507 pages = round_down(pages + MIN_WRITEBACK_PAGES,
508 MIN_WRITEBACK_PAGES);
509 }
510
511 return pages;
512}
513
463/* 514/*
464 * Write a portion of b_io inodes which belong to @sb. 515 * Write a portion of b_io inodes which belong to @sb.
465 * 516 *
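
writeback_chunk_size(), added above, returns LONG_MAX for data-integrity sync (tag once, write everything) and otherwise a bandwidth-scaled chunk rounded onto MIN_WRITEBACK_PAGES boundaries. Equivalent arithmetic in a standalone function; the constants here are illustrative, not the kernel's values.

#include <limits.h>

#define MIN_WB_PAGES	1024L	/* illustrative stand-in */
#define DIRTY_SCOPE	8L	/* likewise */

static long chunk_size(int sync_all, long avg_write_bw, long dirty_limit,
		       long nr_pages)
{
	long pages;

	if (sync_all)
		return LONG_MAX;	/* one big tagged pass, no chunking */

	pages = avg_write_bw / 2;
	if (pages > dirty_limit / DIRTY_SCOPE)
		pages = dirty_limit / DIRTY_SCOPE;
	if (pages > nr_pages)
		pages = nr_pages;
	/* round_down(pages + MIN, MIN): at least MIN, and a multiple of it */
	pages = (pages + MIN_WB_PAGES) / MIN_WB_PAGES * MIN_WB_PAGES;
	return pages;
}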
@@ -467,24 +518,36 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
467 * inodes. Otherwise write only ones which go sequentially 518 * inodes. Otherwise write only ones which go sequentially
468 * in reverse order. 519 * in reverse order.
469 * 520 *
470 * Return 1, if the caller writeback routine should be 521 * Return the number of pages and/or inodes written.
471 * interrupted. Otherwise return 0.
472 */ 522 */
473static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, 523static long writeback_sb_inodes(struct super_block *sb,
474 struct writeback_control *wbc, bool only_this_sb) 524 struct bdi_writeback *wb,
525 struct wb_writeback_work *work)
475{ 526{
527 struct writeback_control wbc = {
528 .sync_mode = work->sync_mode,
529 .tagged_writepages = work->tagged_writepages,
530 .for_kupdate = work->for_kupdate,
531 .for_background = work->for_background,
532 .range_cyclic = work->range_cyclic,
533 .range_start = 0,
534 .range_end = LLONG_MAX,
535 };
536 unsigned long start_time = jiffies;
537 long write_chunk;
538 long wrote = 0; /* count both pages and inodes */
539
476 while (!list_empty(&wb->b_io)) { 540 while (!list_empty(&wb->b_io)) {
477 long pages_skipped;
478 struct inode *inode = wb_inode(wb->b_io.prev); 541 struct inode *inode = wb_inode(wb->b_io.prev);
479 542
480 if (inode->i_sb != sb) { 543 if (inode->i_sb != sb) {
481 if (only_this_sb) { 544 if (work->sb) {
482 /* 545 /*
483 * We only want to write back data for this 546 * We only want to write back data for this
484 * superblock, move all inodes not belonging 547 * superblock, move all inodes not belonging
485 * to it back onto the dirty list. 548 * to it back onto the dirty list.
486 */ 549 */
487 redirty_tail(inode); 550 redirty_tail(inode, wb);
488 continue; 551 continue;
489 } 552 }
490 553
@@ -493,7 +556,7 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
493 * Bounce back to the caller to unpin this and 556 * Bounce back to the caller to unpin this and
494 * pin the next superblock. 557 * pin the next superblock.
495 */ 558 */
496 return 0; 559 break;
497 } 560 }
498 561
499 /* 562 /*
@@ -504,95 +567,96 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
504 spin_lock(&inode->i_lock); 567 spin_lock(&inode->i_lock);
505 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { 568 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
506 spin_unlock(&inode->i_lock); 569 spin_unlock(&inode->i_lock);
507 requeue_io(inode); 570 redirty_tail(inode, wb);
508 continue; 571 continue;
509 } 572 }
510
511 /*
512 * Was this inode dirtied after sync_sb_inodes was called?
513 * This keeps sync from extra jobs and livelock.
514 */
515 if (inode_dirtied_after(inode, wbc->wb_start)) {
516 spin_unlock(&inode->i_lock);
517 return 1;
518 }
519
520 __iget(inode); 573 __iget(inode);
574 write_chunk = writeback_chunk_size(wb->bdi, work);
575 wbc.nr_to_write = write_chunk;
576 wbc.pages_skipped = 0;
577
578 writeback_single_inode(inode, wb, &wbc);
521 579
522 pages_skipped = wbc->pages_skipped; 580 work->nr_pages -= write_chunk - wbc.nr_to_write;
523 writeback_single_inode(inode, wbc); 581 wrote += write_chunk - wbc.nr_to_write;
524 if (wbc->pages_skipped != pages_skipped) { 582 if (!(inode->i_state & I_DIRTY))
583 wrote++;
584 if (wbc.pages_skipped) {
525 /* 585 /*
526 * writeback is not making progress due to locked 586 * writeback is not making progress due to locked
527 * buffers. Skip this inode for now. 587 * buffers. Skip this inode for now.
528 */ 588 */
529 redirty_tail(inode); 589 redirty_tail(inode, wb);
530 } 590 }
531 spin_unlock(&inode->i_lock); 591 spin_unlock(&inode->i_lock);
532 spin_unlock(&inode_wb_list_lock); 592 spin_unlock(&wb->list_lock);
533 iput(inode); 593 iput(inode);
534 cond_resched(); 594 cond_resched();
535 spin_lock(&inode_wb_list_lock); 595 spin_lock(&wb->list_lock);
536 if (wbc->nr_to_write <= 0) { 596 /*
537 wbc->more_io = 1; 597 * bail out to wb_writeback() often enough to check
538 return 1; 598 * background threshold and other termination conditions.
599 */
600 if (wrote) {
601 if (time_is_before_jiffies(start_time + HZ / 10UL))
602 break;
603 if (work->nr_pages <= 0)
604 break;
539 } 605 }
540 if (!list_empty(&wb->b_more_io))
541 wbc->more_io = 1;
542 } 606 }
543 /* b_io is empty */ 607 return wrote;
544 return 1;
545} 608}
546 609
547void writeback_inodes_wb(struct bdi_writeback *wb, 610static long __writeback_inodes_wb(struct bdi_writeback *wb,
548 struct writeback_control *wbc) 611 struct wb_writeback_work *work)
549{ 612{
550 int ret = 0; 613 unsigned long start_time = jiffies;
551 614 long wrote = 0;
552 if (!wbc->wb_start)
553 wbc->wb_start = jiffies; /* livelock avoidance */
554 spin_lock(&inode_wb_list_lock);
555 if (!wbc->for_kupdate || list_empty(&wb->b_io))
556 queue_io(wb, wbc->older_than_this);
557 615
558 while (!list_empty(&wb->b_io)) { 616 while (!list_empty(&wb->b_io)) {
559 struct inode *inode = wb_inode(wb->b_io.prev); 617 struct inode *inode = wb_inode(wb->b_io.prev);
560 struct super_block *sb = inode->i_sb; 618 struct super_block *sb = inode->i_sb;
561 619
562 if (!grab_super_passive(sb)) { 620 if (!grab_super_passive(sb)) {
563 requeue_io(inode); 621 /*
622 * grab_super_passive() may fail consistently due to
623 * s_umount being grabbed by someone else. Don't use
624 * requeue_io() to avoid busy retrying the inode/sb.
625 */
626 redirty_tail(inode, wb);
564 continue; 627 continue;
565 } 628 }
566 ret = writeback_sb_inodes(sb, wb, wbc, false); 629 wrote += writeback_sb_inodes(sb, wb, work);
567 drop_super(sb); 630 drop_super(sb);
568 631
569 if (ret) 632 /* refer to the same tests at the end of writeback_sb_inodes */
570 break; 633 if (wrote) {
634 if (time_is_before_jiffies(start_time + HZ / 10UL))
635 break;
636 if (work->nr_pages <= 0)
637 break;
638 }
571 } 639 }
572 spin_unlock(&inode_wb_list_lock);
573 /* Leave any unwritten inodes on b_io */ 640 /* Leave any unwritten inodes on b_io */
641 return wrote;
574} 642}
575 643
576static void __writeback_inodes_sb(struct super_block *sb, 644long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages)
577 struct bdi_writeback *wb, struct writeback_control *wbc)
578{ 645{
579 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 646 struct wb_writeback_work work = {
647 .nr_pages = nr_pages,
648 .sync_mode = WB_SYNC_NONE,
649 .range_cyclic = 1,
650 };
580 651
581 spin_lock(&inode_wb_list_lock); 652 spin_lock(&wb->list_lock);
582 if (!wbc->for_kupdate || list_empty(&wb->b_io)) 653 if (list_empty(&wb->b_io))
583 queue_io(wb, wbc->older_than_this); 654 queue_io(wb, NULL);
584 writeback_sb_inodes(sb, wb, wbc, true); 655 __writeback_inodes_wb(wb, &work);
585 spin_unlock(&inode_wb_list_lock); 656 spin_unlock(&wb->list_lock);
586}
587 657
588/* 658 return nr_pages - work.nr_pages;
589 * The maximum number of pages to writeout in a single bdi flush/kupdate 659}
590 * operation. We do this so we don't hold I_SYNC against an inode for
591 * enormous amounts of time, which would block a userspace task which has
592 * been forced to throttle against that inode. Also, the code reevaluates
593 * the dirty each time it has written this many pages.
594 */
595#define MAX_WRITEBACK_PAGES 1024
596 660
597static inline bool over_bground_thresh(void) 661static inline bool over_bground_thresh(void)
598{ 662{
@@ -605,6 +669,16 @@ static inline bool over_bground_thresh(void)
605} 669}
606 670
607/* 671/*
672 * Called under wb->list_lock. If there are multiple wb per bdi,
673 * only the flusher working on the first wb should do it.
674 */
675static void wb_update_bandwidth(struct bdi_writeback *wb,
676 unsigned long start_time)
677{
678 __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, start_time);
679}
680
681/*
608 * Explicit flushing or periodic writeback of "old" data. 682 * Explicit flushing or periodic writeback of "old" data.
609 * 683 *
610 * Define "old": the first time one of an inode's pages is dirtied, we mark the 684 * Define "old": the first time one of an inode's pages is dirtied, we mark the
@@ -622,47 +696,16 @@ static inline bool over_bground_thresh(void)
622static long wb_writeback(struct bdi_writeback *wb, 696static long wb_writeback(struct bdi_writeback *wb,
623 struct wb_writeback_work *work) 697 struct wb_writeback_work *work)
624{ 698{
625 struct writeback_control wbc = { 699 unsigned long wb_start = jiffies;
626 .sync_mode = work->sync_mode, 700 long nr_pages = work->nr_pages;
627 .older_than_this = NULL,
628 .for_kupdate = work->for_kupdate,
629 .for_background = work->for_background,
630 .range_cyclic = work->range_cyclic,
631 };
632 unsigned long oldest_jif; 701 unsigned long oldest_jif;
633 long wrote = 0;
634 long write_chunk;
635 struct inode *inode; 702 struct inode *inode;
703 long progress;
636 704
637 if (wbc.for_kupdate) { 705 oldest_jif = jiffies;
638 wbc.older_than_this = &oldest_jif; 706 work->older_than_this = &oldest_jif;
639 oldest_jif = jiffies -
640 msecs_to_jiffies(dirty_expire_interval * 10);
641 }
642 if (!wbc.range_cyclic) {
643 wbc.range_start = 0;
644 wbc.range_end = LLONG_MAX;
645 }
646
647 /*
648 * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
649 * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
650 * here avoids calling into writeback_inodes_wb() more than once.
651 *
652 * The intended call sequence for WB_SYNC_ALL writeback is:
653 *
654 * wb_writeback()
655 * __writeback_inodes_sb() <== called only once
656 * write_cache_pages() <== called once for each inode
657 * (quickly) tag currently dirty pages
658 * (maybe slowly) sync all tagged pages
659 */
660 if (wbc.sync_mode == WB_SYNC_NONE)
661 write_chunk = MAX_WRITEBACK_PAGES;
662 else
663 write_chunk = LONG_MAX;
664 707
665 wbc.wb_start = jiffies; /* livelock avoidance */ 708 spin_lock(&wb->list_lock);
666 for (;;) { 709 for (;;) {
667 /* 710 /*
668 * Stop writeback when nr_pages has been consumed 711 * Stop writeback when nr_pages has been consumed
@@ -687,52 +730,54 @@ static long wb_writeback(struct bdi_writeback *wb,
687 if (work->for_background && !over_bground_thresh()) 730 if (work->for_background && !over_bground_thresh())
688 break; 731 break;
689 732
690 wbc.more_io = 0; 733 if (work->for_kupdate) {
691 wbc.nr_to_write = write_chunk; 734 oldest_jif = jiffies -
692 wbc.pages_skipped = 0; 735 msecs_to_jiffies(dirty_expire_interval * 10);
736 work->older_than_this = &oldest_jif;
737 }
693 738
694 trace_wbc_writeback_start(&wbc, wb->bdi); 739 trace_writeback_start(wb->bdi, work);
740 if (list_empty(&wb->b_io))
741 queue_io(wb, work->older_than_this);
695 if (work->sb) 742 if (work->sb)
696 __writeback_inodes_sb(work->sb, wb, &wbc); 743 progress = writeback_sb_inodes(work->sb, wb, work);
697 else 744 else
698 writeback_inodes_wb(wb, &wbc); 745 progress = __writeback_inodes_wb(wb, work);
699 trace_wbc_writeback_written(&wbc, wb->bdi); 746 trace_writeback_written(wb->bdi, work);
700 747
701 work->nr_pages -= write_chunk - wbc.nr_to_write; 748 wb_update_bandwidth(wb, wb_start);
702 wrote += write_chunk - wbc.nr_to_write;
703 749
704 /* 750 /*
705 * If we consumed everything, see if we have more 751 * Did we write something? Try for more
752 *
753 * Dirty inodes are moved to b_io for writeback in batches.
754 * The completion of the current batch does not necessarily
755 * mean the overall work is done. So we keep looping as long
 756 * as we made some progress on cleaning pages or inodes.
706 */ 757 */
707 if (wbc.nr_to_write <= 0) 758 if (progress)
708 continue; 759 continue;
709 /* 760 /*
710 * Didn't write everything and we don't have more IO, bail 761 * No more inodes for IO, bail
711 */ 762 */
712 if (!wbc.more_io) 763 if (list_empty(&wb->b_more_io))
713 break; 764 break;
714 /* 765 /*
715 * Did we write something? Try for more
716 */
717 if (wbc.nr_to_write < write_chunk)
718 continue;
719 /*
720 * Nothing written. Wait for some inode to 766 * Nothing written. Wait for some inode to
721 * become available for writeback. Otherwise 767 * become available for writeback. Otherwise
722 * we'll just busyloop. 768 * we'll just busyloop.
723 */ 769 */
724 spin_lock(&inode_wb_list_lock);
725 if (!list_empty(&wb->b_more_io)) { 770 if (!list_empty(&wb->b_more_io)) {
771 trace_writeback_wait(wb->bdi, work);
726 inode = wb_inode(wb->b_more_io.prev); 772 inode = wb_inode(wb->b_more_io.prev);
727 trace_wbc_writeback_wait(&wbc, wb->bdi);
728 spin_lock(&inode->i_lock); 773 spin_lock(&inode->i_lock);
729 inode_wait_for_writeback(inode); 774 inode_wait_for_writeback(inode, wb);
730 spin_unlock(&inode->i_lock); 775 spin_unlock(&inode->i_lock);
731 } 776 }
732 spin_unlock(&inode_wb_list_lock);
733 } 777 }
778 spin_unlock(&wb->list_lock);
734 779
735 return wrote; 780 return nr_pages - work->nr_pages;
736} 781}
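Note: with the writeback_control bookkeeping gone, the retry policy of wb_writeback() reduces to three outcomes per pass. A compilable restatement, with the enum and function invented purely for illustration:

    enum wb_next { WB_CONTINUE, WB_DONE, WB_WAIT };

    /* One decision point of the wb_writeback() loop, locking elided. */
    static enum wb_next wb_retry_policy(long progress, int b_more_io_empty)
    {
            if (progress)
                    return WB_CONTINUE;  /* queue_io() again, keep writing */
            if (b_more_io_empty)
                    return WB_DONE;      /* no inode could yield more pages */
            return WB_WAIT;              /* wait on a busy inode, then retry */
    }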
737 782
738/* 783/*
@@ -1063,10 +1108,10 @@ void __mark_inode_dirty(struct inode *inode, int flags)
1063 } 1108 }
1064 1109
1065 spin_unlock(&inode->i_lock); 1110 spin_unlock(&inode->i_lock);
1066 spin_lock(&inode_wb_list_lock); 1111 spin_lock(&bdi->wb.list_lock);
1067 inode->dirtied_when = jiffies; 1112 inode->dirtied_when = jiffies;
1068 list_move(&inode->i_wb_list, &bdi->wb.b_dirty); 1113 list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
1069 spin_unlock(&inode_wb_list_lock); 1114 spin_unlock(&bdi->wb.list_lock);
1070 1115
1071 if (wakeup_bdi) 1116 if (wakeup_bdi)
1072 bdi_wakeup_thread_delayed(bdi); 1117 bdi_wakeup_thread_delayed(bdi);
@@ -1162,10 +1207,11 @@ void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr)
1162{ 1207{
1163 DECLARE_COMPLETION_ONSTACK(done); 1208 DECLARE_COMPLETION_ONSTACK(done);
1164 struct wb_writeback_work work = { 1209 struct wb_writeback_work work = {
1165 .sb = sb, 1210 .sb = sb,
1166 .sync_mode = WB_SYNC_NONE, 1211 .sync_mode = WB_SYNC_NONE,
1167 .done = &done, 1212 .tagged_writepages = 1,
1168 .nr_pages = nr, 1213 .done = &done,
1214 .nr_pages = nr,
1169 }; 1215 };
1170 1216
1171 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 1217 WARN_ON(!rwsem_is_locked(&sb->s_umount));
@@ -1267,6 +1313,7 @@ EXPORT_SYMBOL(sync_inodes_sb);
1267 */ 1313 */
1268int write_inode_now(struct inode *inode, int sync) 1314int write_inode_now(struct inode *inode, int sync)
1269{ 1315{
1316 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
1270 int ret; 1317 int ret;
1271 struct writeback_control wbc = { 1318 struct writeback_control wbc = {
1272 .nr_to_write = LONG_MAX, 1319 .nr_to_write = LONG_MAX,
@@ -1279,11 +1326,11 @@ int write_inode_now(struct inode *inode, int sync)
1279 wbc.nr_to_write = 0; 1326 wbc.nr_to_write = 0;
1280 1327
1281 might_sleep(); 1328 might_sleep();
1282 spin_lock(&inode_wb_list_lock); 1329 spin_lock(&wb->list_lock);
1283 spin_lock(&inode->i_lock); 1330 spin_lock(&inode->i_lock);
1284 ret = writeback_single_inode(inode, &wbc); 1331 ret = writeback_single_inode(inode, wb, &wbc);
1285 spin_unlock(&inode->i_lock); 1332 spin_unlock(&inode->i_lock);
1286 spin_unlock(&inode_wb_list_lock); 1333 spin_unlock(&wb->list_lock);
1287 if (sync) 1334 if (sync)
1288 inode_sync_wait(inode); 1335 inode_sync_wait(inode);
1289 return ret; 1336 return ret;
@@ -1303,13 +1350,14 @@ EXPORT_SYMBOL(write_inode_now);
1303 */ 1350 */
1304int sync_inode(struct inode *inode, struct writeback_control *wbc) 1351int sync_inode(struct inode *inode, struct writeback_control *wbc)
1305{ 1352{
1353 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
1306 int ret; 1354 int ret;
1307 1355
1308 spin_lock(&inode_wb_list_lock); 1356 spin_lock(&wb->list_lock);
1309 spin_lock(&inode->i_lock); 1357 spin_lock(&inode->i_lock);
1310 ret = writeback_single_inode(inode, wbc); 1358 ret = writeback_single_inode(inode, wb, wbc);
1311 spin_unlock(&inode->i_lock); 1359 spin_unlock(&inode->i_lock);
1312 spin_unlock(&inode_wb_list_lock); 1360 spin_unlock(&wb->list_lock);
1313 return ret; 1361 return ret;
1314} 1362}
1315EXPORT_SYMBOL(sync_inode); 1363EXPORT_SYMBOL(sync_inode);
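Note: throughout this file the global inode_wb_list_lock is replaced by the per-bdi wb->list_lock, always acquired before inode->i_lock. A toy pthread model of that ordering (types invented; the real locks are spinlocks, so this is a sketch of the discipline, not the implementation):

    #include <pthread.h>

    struct toy_wb    { pthread_mutex_t list_lock; };
    struct toy_inode { pthread_mutex_t i_lock; };

    static void toy_sync_inode(struct toy_wb *wb, struct toy_inode *inode)
    {
            pthread_mutex_lock(&wb->list_lock);   /* 1st: per-bdi lock */
            pthread_mutex_lock(&inode->i_lock);   /* 2nd: per-inode lock */
            /* writeback_single_inode() runs here in the real code */
            pthread_mutex_unlock(&inode->i_lock);
            pthread_mutex_unlock(&wb->list_lock);
    }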
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 640fc229df10..5cb8614508c3 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -258,10 +258,14 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
258 forget->forget_one.nlookup = nlookup; 258 forget->forget_one.nlookup = nlookup;
259 259
260 spin_lock(&fc->lock); 260 spin_lock(&fc->lock);
261 fc->forget_list_tail->next = forget; 261 if (fc->connected) {
262 fc->forget_list_tail = forget; 262 fc->forget_list_tail->next = forget;
263 wake_up(&fc->waitq); 263 fc->forget_list_tail = forget;
264 kill_fasync(&fc->fasync, SIGIO, POLL_IN); 264 wake_up(&fc->waitq);
265 kill_fasync(&fc->fasync, SIGIO, POLL_IN);
266 } else {
267 kfree(forget);
268 }
265 spin_unlock(&fc->lock); 269 spin_unlock(&fc->lock);
266} 270}
267 271
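Note: the fix above guards against queueing a FORGET to a connection that has already been aborted, in which case no reader would ever drain the list. The generic shape of the pattern, as a hedged sketch with invented types:

    #include <pthread.h>
    #include <stdlib.h>

    struct conn { pthread_mutex_t lock; int connected; /* queue, waitq */ };

    static void queue_or_drop(struct conn *c, void *item)
    {
            pthread_mutex_lock(&c->lock);
            if (c->connected) {
                    /* enqueue and wake readers, as fuse_queue_forget() does */
            } else {
                    free(item);   /* nobody will ever dequeue it: drop now */
            }
            pthread_mutex_unlock(&c->lock);
    }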
@@ -1358,6 +1362,10 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
1358 if (outarg.namelen > FUSE_NAME_MAX) 1362 if (outarg.namelen > FUSE_NAME_MAX)
1359 goto err; 1363 goto err;
1360 1364
1365 err = -EINVAL;
1366 if (size != sizeof(outarg) + outarg.namelen + 1)
1367 goto err;
1368
1361 name.name = buf; 1369 name.name = buf;
1362 name.len = outarg.namelen; 1370 name.len = outarg.namelen;
1363 err = fuse_copy_one(cs, buf, outarg.namelen + 1); 1371 err = fuse_copy_one(cs, buf, outarg.namelen + 1);
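Note: the added -EINVAL check pins the total message size to what the header announces, so extra or missing bytes can no longer desynchronize the device stream. The check in isolation (standalone sketch; the helper name is invented, FUSE_NAME_MAX is the real limit from fs/fuse/fuse_i.h):

    #include <errno.h>
    #include <stddef.h>
    #include <stdint.h>

    #define FUSE_NAME_MAX 1024   /* value from fs/fuse/fuse_i.h */

    static int check_inval_entry_size(size_t size, size_t arg_size,
                                      uint32_t namelen)
    {
            if (namelen > FUSE_NAME_MAX)
                    return -ENAMETOOLONG;        /* pre-existing check */
            if (size != arg_size + namelen + 1)  /* new: +1 for the NUL */
                    return -EINVAL;
            return 0;
    }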
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index d480d9af46c9..594f07a81c28 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -14,6 +14,7 @@
14#include <linux/sched.h> 14#include <linux/sched.h>
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/compat.h> 16#include <linux/compat.h>
17#include <linux/swap.h>
17 18
18static const struct file_operations fuse_direct_io_file_operations; 19static const struct file_operations fuse_direct_io_file_operations;
19 20
@@ -245,6 +246,12 @@ void fuse_release_common(struct file *file, int opcode)
245 req = ff->reserved_req; 246 req = ff->reserved_req;
246 fuse_prepare_release(ff, file->f_flags, opcode); 247 fuse_prepare_release(ff, file->f_flags, opcode);
247 248
249 if (ff->flock) {
250 struct fuse_release_in *inarg = &req->misc.release.in;
251 inarg->release_flags |= FUSE_RELEASE_FLOCK_UNLOCK;
252 inarg->lock_owner = fuse_lock_owner_id(ff->fc,
253 (fl_owner_t) file);
254 }
248 /* Hold vfsmount and dentry until release is finished */ 255 /* Hold vfsmount and dentry until release is finished */
249 path_get(&file->f_path); 256 path_get(&file->f_path);
250 req->misc.release.path = file->f_path; 257 req->misc.release.path = file->f_path;
@@ -755,18 +762,6 @@ static size_t fuse_send_write(struct fuse_req *req, struct file *file,
755 return req->misc.write.out.size; 762 return req->misc.write.out.size;
756} 763}
757 764
758static int fuse_write_begin(struct file *file, struct address_space *mapping,
759 loff_t pos, unsigned len, unsigned flags,
760 struct page **pagep, void **fsdata)
761{
762 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
763
764 *pagep = grab_cache_page_write_begin(mapping, index, flags);
765 if (!*pagep)
766 return -ENOMEM;
767 return 0;
768}
769
770void fuse_write_update_size(struct inode *inode, loff_t pos) 765void fuse_write_update_size(struct inode *inode, loff_t pos)
771{ 766{
772 struct fuse_conn *fc = get_fuse_conn(inode); 767 struct fuse_conn *fc = get_fuse_conn(inode);
@@ -779,62 +774,6 @@ void fuse_write_update_size(struct inode *inode, loff_t pos)
779 spin_unlock(&fc->lock); 774 spin_unlock(&fc->lock);
780} 775}
781 776
782static int fuse_buffered_write(struct file *file, struct inode *inode,
783 loff_t pos, unsigned count, struct page *page)
784{
785 int err;
786 size_t nres;
787 struct fuse_conn *fc = get_fuse_conn(inode);
788 unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
789 struct fuse_req *req;
790
791 if (is_bad_inode(inode))
792 return -EIO;
793
794 /*
795 * Make sure writepages on the same page are not mixed up with
796 * plain writes.
797 */
798 fuse_wait_on_page_writeback(inode, page->index);
799
800 req = fuse_get_req(fc);
801 if (IS_ERR(req))
802 return PTR_ERR(req);
803
804 req->in.argpages = 1;
805 req->num_pages = 1;
806 req->pages[0] = page;
807 req->page_offset = offset;
808 nres = fuse_send_write(req, file, pos, count, NULL);
809 err = req->out.h.error;
810 fuse_put_request(fc, req);
811 if (!err && !nres)
812 err = -EIO;
813 if (!err) {
814 pos += nres;
815 fuse_write_update_size(inode, pos);
816 if (count == PAGE_CACHE_SIZE)
817 SetPageUptodate(page);
818 }
819 fuse_invalidate_attr(inode);
820 return err ? err : nres;
821}
822
823static int fuse_write_end(struct file *file, struct address_space *mapping,
824 loff_t pos, unsigned len, unsigned copied,
825 struct page *page, void *fsdata)
826{
827 struct inode *inode = mapping->host;
828 int res = 0;
829
830 if (copied)
831 res = fuse_buffered_write(file, inode, pos, copied, page);
832
833 unlock_page(page);
834 page_cache_release(page);
835 return res;
836}
837
838static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file, 777static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file,
839 struct inode *inode, loff_t pos, 778 struct inode *inode, loff_t pos,
840 size_t count) 779 size_t count)
@@ -908,6 +847,8 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
908 pagefault_enable(); 847 pagefault_enable();
909 flush_dcache_page(page); 848 flush_dcache_page(page);
910 849
850 mark_page_accessed(page);
851
911 if (!tmp) { 852 if (!tmp) {
912 unlock_page(page); 853 unlock_page(page);
913 page_cache_release(page); 854 page_cache_release(page);
@@ -1559,11 +1500,14 @@ static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl)
1559 struct fuse_conn *fc = get_fuse_conn(inode); 1500 struct fuse_conn *fc = get_fuse_conn(inode);
1560 int err; 1501 int err;
1561 1502
1562 if (fc->no_lock) { 1503 if (fc->no_flock) {
1563 err = flock_lock_file_wait(file, fl); 1504 err = flock_lock_file_wait(file, fl);
1564 } else { 1505 } else {
1506 struct fuse_file *ff = file->private_data;
1507
1565 /* emulate flock with POSIX locks */ 1508 /* emulate flock with POSIX locks */
1566 fl->fl_owner = (fl_owner_t) file; 1509 fl->fl_owner = (fl_owner_t) file;
1510 ff->flock = true;
1567 err = fuse_setlk(file, fl, 1); 1511 err = fuse_setlk(file, fl, 1);
1568 } 1512 }
1569 1513
@@ -2201,8 +2145,6 @@ static const struct address_space_operations fuse_file_aops = {
2201 .readpage = fuse_readpage, 2145 .readpage = fuse_readpage,
2202 .writepage = fuse_writepage, 2146 .writepage = fuse_writepage,
2203 .launder_page = fuse_launder_page, 2147 .launder_page = fuse_launder_page,
2204 .write_begin = fuse_write_begin,
2205 .write_end = fuse_write_end,
2206 .readpages = fuse_readpages, 2148 .readpages = fuse_readpages,
2207 .set_page_dirty = __set_page_dirty_nobuffers, 2149 .set_page_dirty = __set_page_dirty_nobuffers,
2208 .bmap = fuse_bmap, 2150 .bmap = fuse_bmap,
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index c6aa2d4b8517..cf6db0a93219 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -135,6 +135,9 @@ struct fuse_file {
135 135
136 /** Wait queue head for poll */ 136 /** Wait queue head for poll */
137 wait_queue_head_t poll_wait; 137 wait_queue_head_t poll_wait;
138
139 /** Has flock been performed on this file? */
140 bool flock:1;
138}; 141};
139 142
140/** One input argument of a request */ 143/** One input argument of a request */
@@ -448,7 +451,7 @@ struct fuse_conn {
448 /** Is removexattr not implemented by fs? */ 451 /** Is removexattr not implemented by fs? */
449 unsigned no_removexattr:1; 452 unsigned no_removexattr:1;
450 453
451 /** Are file locking primitives not implemented by fs? */ 454 /** Are posix file locking primitives not implemented by fs? */
452 unsigned no_lock:1; 455 unsigned no_lock:1;
453 456
454 /** Is access not implemented by fs? */ 457 /** Is access not implemented by fs? */
@@ -472,6 +475,9 @@ struct fuse_conn {
472 /** Don't apply umask to creation modes */ 475 /** Don't apply umask to creation modes */
473 unsigned dont_mask:1; 476 unsigned dont_mask:1;
474 477
478 /** Are BSD file locking primitives not implemented by fs? */
479 unsigned no_flock:1;
480
475 /** The number of requests waiting for completion */ 481 /** The number of requests waiting for completion */
476 atomic_t num_waiting; 482 atomic_t num_waiting;
477 483
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 38f84cd48b67..add96f6ffda5 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -71,7 +71,7 @@ struct fuse_mount_data {
71 unsigned blksize; 71 unsigned blksize;
72}; 72};
73 73
74struct fuse_forget_link *fuse_alloc_forget() 74struct fuse_forget_link *fuse_alloc_forget(void)
75{ 75{
76 return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL); 76 return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL);
77} 77}
@@ -809,6 +809,13 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
809 fc->async_read = 1; 809 fc->async_read = 1;
810 if (!(arg->flags & FUSE_POSIX_LOCKS)) 810 if (!(arg->flags & FUSE_POSIX_LOCKS))
811 fc->no_lock = 1; 811 fc->no_lock = 1;
812 if (arg->minor >= 17) {
813 if (!(arg->flags & FUSE_FLOCK_LOCKS))
814 fc->no_flock = 1;
815 } else {
816 if (!(arg->flags & FUSE_POSIX_LOCKS))
817 fc->no_flock = 1;
818 }
812 if (arg->flags & FUSE_ATOMIC_O_TRUNC) 819 if (arg->flags & FUSE_ATOMIC_O_TRUNC)
813 fc->atomic_o_trunc = 1; 820 fc->atomic_o_trunc = 1;
814 if (arg->minor >= 9) { 821 if (arg->minor >= 9) {
@@ -823,6 +830,7 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
823 } else { 830 } else {
824 ra_pages = fc->max_read / PAGE_CACHE_SIZE; 831 ra_pages = fc->max_read / PAGE_CACHE_SIZE;
825 fc->no_lock = 1; 832 fc->no_lock = 1;
833 fc->no_flock = 1;
826 } 834 }
827 835
828 fc->bdi.ra_pages = min(fc->bdi.ra_pages, ra_pages); 836 fc->bdi.ra_pages = min(fc->bdi.ra_pages, ra_pages);
@@ -843,7 +851,8 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
843 arg->minor = FUSE_KERNEL_MINOR_VERSION; 851 arg->minor = FUSE_KERNEL_MINOR_VERSION;
844 arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE; 852 arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE;
845 arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | 853 arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC |
846 FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK; 854 FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK |
855 FUSE_FLOCK_LOCKS;
847 req->in.h.opcode = FUSE_INIT; 856 req->in.h.opcode = FUSE_INIT;
848 req->in.numargs = 1; 857 req->in.numargs = 1;
849 req->in.args[0].size = sizeof(*arg); 858 req->in.args[0].size = sizeof(*arg);
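Note: FUSE_FLOCK_LOCKS negotiation is gated on protocol minor 17. Newer servers must set the new flag explicitly; older ones are assumed to cover flock through their POSIX-lock support. Restated as a small predicate (helper name invented; the flag constants are the real ones from include/linux/fuse.h):

    #include <stdbool.h>
    #include <stdint.h>

    #define FUSE_POSIX_LOCKS (1 << 1)
    #define FUSE_FLOCK_LOCKS (1 << 10)

    static bool server_lacks_flock(unsigned minor, uint32_t flags)
    {
            if (minor >= 17)
                    return !(flags & FUSE_FLOCK_LOCKS);
            /* older servers: flock support was implied by POSIX locks */
            return !(flags & FUSE_POSIX_LOCKS);
    }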
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index d5e33a077a67..d0dddaceac59 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -82,18 +82,14 @@ generic_acl_set(struct dentry *dentry, const char *name, const void *value,
82 return PTR_ERR(acl); 82 return PTR_ERR(acl);
83 } 83 }
84 if (acl) { 84 if (acl) {
85 mode_t mode;
86
87 error = posix_acl_valid(acl); 85 error = posix_acl_valid(acl);
88 if (error) 86 if (error)
89 goto failed; 87 goto failed;
90 switch (type) { 88 switch (type) {
91 case ACL_TYPE_ACCESS: 89 case ACL_TYPE_ACCESS:
92 mode = inode->i_mode; 90 error = posix_acl_equiv_mode(acl, &inode->i_mode);
93 error = posix_acl_equiv_mode(acl, &mode);
94 if (error < 0) 91 if (error < 0)
95 goto failed; 92 goto failed;
96 inode->i_mode = mode;
97 inode->i_ctime = CURRENT_TIME; 93 inode->i_ctime = CURRENT_TIME;
98 if (error == 0) { 94 if (error == 0) {
99 posix_acl_release(acl); 95 posix_acl_release(acl);
@@ -125,21 +121,20 @@ int
125generic_acl_init(struct inode *inode, struct inode *dir) 121generic_acl_init(struct inode *inode, struct inode *dir)
126{ 122{
127 struct posix_acl *acl = NULL; 123 struct posix_acl *acl = NULL;
128 mode_t mode = inode->i_mode;
129 int error; 124 int error;
130 125
131 inode->i_mode = mode & ~current_umask();
132 if (!S_ISLNK(inode->i_mode)) 126 if (!S_ISLNK(inode->i_mode))
133 acl = get_cached_acl(dir, ACL_TYPE_DEFAULT); 127 acl = get_cached_acl(dir, ACL_TYPE_DEFAULT);
134 if (acl) { 128 if (acl) {
135 if (S_ISDIR(inode->i_mode)) 129 if (S_ISDIR(inode->i_mode))
136 set_cached_acl(inode, ACL_TYPE_DEFAULT, acl); 130 set_cached_acl(inode, ACL_TYPE_DEFAULT, acl);
137 error = posix_acl_create(&acl, GFP_KERNEL, &mode); 131 error = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode);
138 if (error < 0) 132 if (error < 0)
139 return error; 133 return error;
140 inode->i_mode = mode;
141 if (error > 0) 134 if (error > 0)
142 set_cached_acl(inode, ACL_TYPE_ACCESS, acl); 135 set_cached_acl(inode, ACL_TYPE_ACCESS, acl);
136 } else {
137 inode->i_mode &= ~current_umask();
143 } 138 }
144 error = 0; 139 error = 0;
145 140
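Note: after the cleanup, generic_acl_init() follows a single rule: a default ACL on the parent directory decides the new inode's mode via posix_acl_create(), which now writes into i_mode directly, and the umask applies only when no default ACL exists. A condensed view of the new flow, taken from the hunk above (kernel context assumed, error paths elided):

    if (!S_ISLNK(inode->i_mode))
            acl = get_cached_acl(dir, ACL_TYPE_DEFAULT);
    if (acl) {
            /* posix_acl_create() writes the result into i_mode itself */
            error = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode);
            if (error > 0)
                    set_cached_acl(inode, ACL_TYPE_ACCESS, acl);
    } else {
            inode->i_mode &= ~current_umask();  /* no default ACL: umask */
    }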
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 884c9af0542f..34501b64bc47 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -72,7 +72,7 @@ struct posix_acl *gfs2_get_acl(struct inode *inode, int type)
72 return gfs2_acl_get(GFS2_I(inode), type); 72 return gfs2_acl_get(GFS2_I(inode), type);
73} 73}
74 74
75static int gfs2_set_mode(struct inode *inode, mode_t mode) 75static int gfs2_set_mode(struct inode *inode, umode_t mode)
76{ 76{
77 int error = 0; 77 int error = 0;
78 78
@@ -117,7 +117,7 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode)
117{ 117{
118 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); 118 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
119 struct posix_acl *acl; 119 struct posix_acl *acl;
120 mode_t mode = inode->i_mode; 120 umode_t mode = inode->i_mode;
121 int error = 0; 121 int error = 0;
122 122
123 if (!sdp->sd_args.ar_posix_acl) 123 if (!sdp->sd_args.ar_posix_acl)
@@ -276,7 +276,7 @@ static int gfs2_xattr_system_set(struct dentry *dentry, const char *name,
276 goto out_release; 276 goto out_release;
277 277
278 if (type == ACL_TYPE_ACCESS) { 278 if (type == ACL_TYPE_ACCESS) {
279 mode_t mode = inode->i_mode; 279 umode_t mode = inode->i_mode;
280 error = posix_acl_equiv_mode(acl, &mode); 280 error = posix_acl_equiv_mode(acl, &mode);
281 281
282 if (error <= 0) { 282 if (error <= 0) {
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 29e1ace7953d..8a139ff1919f 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -16,7 +16,7 @@
16#include <linux/gfs2_ondisk.h> 16#include <linux/gfs2_ondisk.h>
17#include <linux/rcupdate.h> 17#include <linux/rcupdate.h>
18#include <linux/rculist_bl.h> 18#include <linux/rculist_bl.h>
19#include <asm/atomic.h> 19#include <linux/atomic.h>
20 20
21#include "gfs2.h" 21#include "gfs2.h"
22#include "incore.h" 22#include "incore.h"
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 516516e0c2a2..3bc073a4cf82 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1018,13 +1018,13 @@ hostdata_error:
1018 fsname++; 1018 fsname++;
1019 if (lm->lm_mount == NULL) { 1019 if (lm->lm_mount == NULL) {
1020 fs_info(sdp, "Now mounting FS...\n"); 1020 fs_info(sdp, "Now mounting FS...\n");
1021 complete(&sdp->sd_locking_init); 1021 complete_all(&sdp->sd_locking_init);
1022 return 0; 1022 return 0;
1023 } 1023 }
1024 ret = lm->lm_mount(sdp, fsname); 1024 ret = lm->lm_mount(sdp, fsname);
1025 if (ret == 0) 1025 if (ret == 0)
1026 fs_info(sdp, "Joined cluster. Now mounting FS...\n"); 1026 fs_info(sdp, "Joined cluster. Now mounting FS...\n");
1027 complete(&sdp->sd_locking_init); 1027 complete_all(&sdp->sd_locking_init);
1028 return ret; 1028 return ret;
1029} 1029}
1030 1030
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index 8635be5ffd97..970ea987b3f6 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -16,6 +16,7 @@
16#include <linux/statfs.h> 16#include <linux/statfs.h>
17#include <linux/types.h> 17#include <linux/types.h>
18#include <linux/pid_namespace.h> 18#include <linux/pid_namespace.h>
19#include <linux/namei.h>
19#include <asm/uaccess.h> 20#include <asm/uaccess.h>
20#include "os.h" 21#include "os.h"
21 22
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 87b6e0421c12..ec889538e5a6 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -491,6 +491,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
491 inode->i_op = &page_symlink_inode_operations; 491 inode->i_op = &page_symlink_inode_operations;
492 break; 492 break;
493 } 493 }
494 lockdep_annotate_inode_mutex_key(inode);
494 } 495 }
495 return inode; 496 return inode;
496} 497}
diff --git a/fs/inode.c b/fs/inode.c
index 96c77b81167c..ec7924696a13 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -37,7 +37,7 @@
37 * inode->i_sb->s_inode_lru, inode->i_lru 37 * inode->i_sb->s_inode_lru, inode->i_lru
38 * inode_sb_list_lock protects: 38 * inode_sb_list_lock protects:
39 * sb->s_inodes, inode->i_sb_list 39 * sb->s_inodes, inode->i_sb_list
40 * inode_wb_list_lock protects: 40 * bdi->wb.list_lock protects:
41 * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list 41 * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list
42 * inode_hash_lock protects: 42 * inode_hash_lock protects:
43 * inode_hashtable, inode->i_hash 43 * inode_hashtable, inode->i_hash
@@ -48,7 +48,7 @@
48 * inode->i_lock 48 * inode->i_lock
49 * inode->i_sb->s_inode_lru_lock 49 * inode->i_sb->s_inode_lru_lock
50 * 50 *
51 * inode_wb_list_lock 51 * bdi->wb.list_lock
52 * inode->i_lock 52 * inode->i_lock
53 * 53 *
54 * inode_hash_lock 54 * inode_hash_lock
@@ -65,7 +65,6 @@ static struct hlist_head *inode_hashtable __read_mostly;
65static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock); 65static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
66 66
67__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock); 67__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock);
68__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock);
69 68
70/* 69/*
71 * Empty aops. Can be used for the cases where the user does not 70 * Empty aops. Can be used for the cases where the user does not
@@ -144,6 +143,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
144 inode->i_op = &empty_iops; 143 inode->i_op = &empty_iops;
145 inode->i_fop = &empty_fops; 144 inode->i_fop = &empty_fops;
146 inode->i_nlink = 1; 145 inode->i_nlink = 1;
146 inode->i_opflags = 0;
147 inode->i_uid = 0; 147 inode->i_uid = 0;
148 inode->i_gid = 0; 148 inode->i_gid = 0;
149 atomic_set(&inode->i_writecount, 0); 149 atomic_set(&inode->i_writecount, 0);
@@ -362,9 +362,11 @@ EXPORT_SYMBOL_GPL(inode_sb_list_add);
362 362
363static inline void inode_sb_list_del(struct inode *inode) 363static inline void inode_sb_list_del(struct inode *inode)
364{ 364{
365 spin_lock(&inode_sb_list_lock); 365 if (!list_empty(&inode->i_sb_list)) {
366 list_del_init(&inode->i_sb_list); 366 spin_lock(&inode_sb_list_lock);
367 spin_unlock(&inode_sb_list_lock); 367 list_del_init(&inode->i_sb_list);
368 spin_unlock(&inode_sb_list_lock);
369 }
368} 370}
369 371
370static unsigned long hash(struct super_block *sb, unsigned long hashval) 372static unsigned long hash(struct super_block *sb, unsigned long hashval)
@@ -398,12 +400,12 @@ void __insert_inode_hash(struct inode *inode, unsigned long hashval)
398EXPORT_SYMBOL(__insert_inode_hash); 400EXPORT_SYMBOL(__insert_inode_hash);
399 401
400/** 402/**
401 * remove_inode_hash - remove an inode from the hash 403 * __remove_inode_hash - remove an inode from the hash
402 * @inode: inode to unhash 404 * @inode: inode to unhash
403 * 405 *
404 * Remove an inode from the superblock. 406 * Remove an inode from the superblock.
405 */ 407 */
406void remove_inode_hash(struct inode *inode) 408void __remove_inode_hash(struct inode *inode)
407{ 409{
408 spin_lock(&inode_hash_lock); 410 spin_lock(&inode_hash_lock);
409 spin_lock(&inode->i_lock); 411 spin_lock(&inode->i_lock);
@@ -411,7 +413,7 @@ void remove_inode_hash(struct inode *inode)
411 spin_unlock(&inode->i_lock); 413 spin_unlock(&inode->i_lock);
412 spin_unlock(&inode_hash_lock); 414 spin_unlock(&inode_hash_lock);
413} 415}
414EXPORT_SYMBOL(remove_inode_hash); 416EXPORT_SYMBOL(__remove_inode_hash);
415 417
416void end_writeback(struct inode *inode) 418void end_writeback(struct inode *inode)
417{ 419{
@@ -453,7 +455,9 @@ static void evict(struct inode *inode)
453 BUG_ON(!(inode->i_state & I_FREEING)); 455 BUG_ON(!(inode->i_state & I_FREEING));
454 BUG_ON(!list_empty(&inode->i_lru)); 456 BUG_ON(!list_empty(&inode->i_lru));
455 457
456 inode_wb_list_del(inode); 458 if (!list_empty(&inode->i_wb_list))
459 inode_wb_list_del(inode);
460
457 inode_sb_list_del(inode); 461 inode_sb_list_del(inode);
458 462
459 if (op->evict_inode) { 463 if (op->evict_inode) {
@@ -797,6 +801,29 @@ unsigned int get_next_ino(void)
797EXPORT_SYMBOL(get_next_ino); 801EXPORT_SYMBOL(get_next_ino);
798 802
799/** 803/**
804 * new_inode_pseudo - obtain an inode
805 * @sb: superblock
806 *
807 * Allocates a new inode for given superblock.
808 * Inode wont be chained in superblock s_inodes list
809 * This means :
810 * - fs can't be unmount
811 * - quotas, fsnotify, writeback can't work
812 */
813struct inode *new_inode_pseudo(struct super_block *sb)
814{
815 struct inode *inode = alloc_inode(sb);
816
817 if (inode) {
818 spin_lock(&inode->i_lock);
819 inode->i_state = 0;
820 spin_unlock(&inode->i_lock);
821 INIT_LIST_HEAD(&inode->i_sb_list);
822 }
823 return inode;
824}
825
826/**
800 * new_inode - obtain an inode 827 * new_inode - obtain an inode
801 * @sb: superblock 828 * @sb: superblock
802 * 829 *
@@ -814,27 +841,16 @@ struct inode *new_inode(struct super_block *sb)
814 841
815 spin_lock_prefetch(&inode_sb_list_lock); 842 spin_lock_prefetch(&inode_sb_list_lock);
816 843
817 inode = alloc_inode(sb); 844 inode = new_inode_pseudo(sb);
818 if (inode) { 845 if (inode)
819 spin_lock(&inode->i_lock);
820 inode->i_state = 0;
821 spin_unlock(&inode->i_lock);
822 inode_sb_list_add(inode); 846 inode_sb_list_add(inode);
823 }
824 return inode; 847 return inode;
825} 848}
826EXPORT_SYMBOL(new_inode); 849EXPORT_SYMBOL(new_inode);
827 850
828/**
829 * unlock_new_inode - clear the I_NEW state and wake up any waiters
830 * @inode: new inode to unlock
831 *
832 * Called when the inode is fully initialised to clear the new state of the
833 * inode and wake up anyone waiting for the inode to finish initialisation.
834 */
835void unlock_new_inode(struct inode *inode)
836{
837#ifdef CONFIG_DEBUG_LOCK_ALLOC 851#ifdef CONFIG_DEBUG_LOCK_ALLOC
852void lockdep_annotate_inode_mutex_key(struct inode *inode)
853{
838 if (S_ISDIR(inode->i_mode)) { 854 if (S_ISDIR(inode->i_mode)) {
839 struct file_system_type *type = inode->i_sb->s_type; 855 struct file_system_type *type = inode->i_sb->s_type;
840 856
@@ -850,7 +866,20 @@ void unlock_new_inode(struct inode *inode)
850 &type->i_mutex_dir_key); 866 &type->i_mutex_dir_key);
851 } 867 }
852 } 868 }
869}
870EXPORT_SYMBOL(lockdep_annotate_inode_mutex_key);
853#endif 871#endif
872
873/**
874 * unlock_new_inode - clear the I_NEW state and wake up any waiters
875 * @inode: new inode to unlock
876 *
877 * Called when the inode is fully initialised to clear the new state of the
878 * inode and wake up anyone waiting for the inode to finish initialisation.
879 */
880void unlock_new_inode(struct inode *inode)
881{
882 lockdep_annotate_inode_mutex_key(inode);
854 spin_lock(&inode->i_lock); 883 spin_lock(&inode->i_lock);
855 WARN_ON(!(inode->i_state & I_NEW)); 884 WARN_ON(!(inode->i_state & I_NEW));
856 inode->i_state &= ~I_NEW; 885 inode->i_state &= ~I_NEW;
@@ -1308,7 +1337,8 @@ static void iput_final(struct inode *inode)
1308 } 1337 }
1309 1338
1310 inode->i_state |= I_FREEING; 1339 inode->i_state |= I_FREEING;
1311 inode_lru_list_del(inode); 1340 if (!list_empty(&inode->i_lru))
1341 inode_lru_list_del(inode);
1312 spin_unlock(&inode->i_lock); 1342 spin_unlock(&inode->i_lock);
1313 1343
1314 evict(inode); 1344 evict(inode);
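Note: new_inode_pseudo() gives pseudo filesystems an inode that is never on the superblock's s_inodes list, which is why evict() above can now skip the list deletion when the list head is empty. The intended call pattern (hypothetical caller, kernel context assumed):

    struct inode *inode = new_inode_pseudo(sb);

    if (!inode)
            return NULL;
    inode->i_ino = get_next_ino();
    /*
     * i_sb_list stays empty, so umount, quota and fsnotify walks never
     * see this inode and inode_sb_list_del() is a no-op at eviction.
     */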
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index e4b87bc1fa56..f94fc48ff3a0 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -22,6 +22,8 @@
22#include <linux/jbd.h> 22#include <linux/jbd.h>
23#include <linux/errno.h> 23#include <linux/errno.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/blkdev.h>
26#include <trace/events/jbd.h>
25 27
26/* 28/*
27 * Unlink a buffer from a transaction checkpoint list. 29 * Unlink a buffer from a transaction checkpoint list.
@@ -95,10 +97,14 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
95 97
96 if (jh->b_jlist == BJ_None && !buffer_locked(bh) && 98 if (jh->b_jlist == BJ_None && !buffer_locked(bh) &&
97 !buffer_dirty(bh) && !buffer_write_io_error(bh)) { 99 !buffer_dirty(bh) && !buffer_write_io_error(bh)) {
100 /*
101 * Get our reference so that bh cannot be freed before
102 * we unlock it
103 */
104 get_bh(bh);
98 JBUFFER_TRACE(jh, "remove from checkpoint list"); 105 JBUFFER_TRACE(jh, "remove from checkpoint list");
99 ret = __journal_remove_checkpoint(jh) + 1; 106 ret = __journal_remove_checkpoint(jh) + 1;
100 jbd_unlock_bh_state(bh); 107 jbd_unlock_bh_state(bh);
101 journal_remove_journal_head(bh);
102 BUFFER_TRACE(bh, "release"); 108 BUFFER_TRACE(bh, "release");
103 __brelse(bh); 109 __brelse(bh);
104 } else { 110 } else {
@@ -220,8 +226,8 @@ restart:
220 spin_lock(&journal->j_list_lock); 226 spin_lock(&journal->j_list_lock);
221 goto restart; 227 goto restart;
222 } 228 }
229 get_bh(bh);
223 if (buffer_locked(bh)) { 230 if (buffer_locked(bh)) {
224 get_bh(bh);
225 spin_unlock(&journal->j_list_lock); 231 spin_unlock(&journal->j_list_lock);
226 jbd_unlock_bh_state(bh); 232 jbd_unlock_bh_state(bh);
227 wait_on_buffer(bh); 233 wait_on_buffer(bh);
@@ -240,7 +246,6 @@ restart:
240 */ 246 */
241 released = __journal_remove_checkpoint(jh); 247 released = __journal_remove_checkpoint(jh);
242 jbd_unlock_bh_state(bh); 248 jbd_unlock_bh_state(bh);
243 journal_remove_journal_head(bh);
244 __brelse(bh); 249 __brelse(bh);
245 } 250 }
246 251
@@ -253,9 +258,12 @@ static void
253__flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) 258__flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
254{ 259{
255 int i; 260 int i;
261 struct blk_plug plug;
256 262
263 blk_start_plug(&plug);
257 for (i = 0; i < *batch_count; i++) 264 for (i = 0; i < *batch_count; i++)
258 write_dirty_buffer(bhs[i], WRITE); 265 write_dirty_buffer(bhs[i], WRITE_SYNC);
266 blk_finish_plug(&plug);
259 267
260 for (i = 0; i < *batch_count; i++) { 268 for (i = 0; i < *batch_count; i++) {
261 struct buffer_head *bh = bhs[i]; 269 struct buffer_head *bh = bhs[i];
@@ -304,12 +312,12 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
304 ret = 1; 312 ret = 1;
305 if (unlikely(buffer_write_io_error(bh))) 313 if (unlikely(buffer_write_io_error(bh)))
306 ret = -EIO; 314 ret = -EIO;
315 get_bh(bh);
307 J_ASSERT_JH(jh, !buffer_jbddirty(bh)); 316 J_ASSERT_JH(jh, !buffer_jbddirty(bh));
308 BUFFER_TRACE(bh, "remove from checkpoint"); 317 BUFFER_TRACE(bh, "remove from checkpoint");
309 __journal_remove_checkpoint(jh); 318 __journal_remove_checkpoint(jh);
310 spin_unlock(&journal->j_list_lock); 319 spin_unlock(&journal->j_list_lock);
311 jbd_unlock_bh_state(bh); 320 jbd_unlock_bh_state(bh);
312 journal_remove_journal_head(bh);
313 __brelse(bh); 321 __brelse(bh);
314 } else { 322 } else {
315 /* 323 /*
@@ -358,6 +366,7 @@ int log_do_checkpoint(journal_t *journal)
358 * journal straight away. 366 * journal straight away.
359 */ 367 */
360 result = cleanup_journal_tail(journal); 368 result = cleanup_journal_tail(journal);
369 trace_jbd_checkpoint(journal, result);
361 jbd_debug(1, "cleanup_journal_tail returned %d\n", result); 370 jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
362 if (result <= 0) 371 if (result <= 0)
363 return result; 372 return result;
@@ -503,6 +512,7 @@ int cleanup_journal_tail(journal_t *journal)
503 if (blocknr < journal->j_tail) 512 if (blocknr < journal->j_tail)
504 freed = freed + journal->j_last - journal->j_first; 513 freed = freed + journal->j_last - journal->j_first;
505 514
515 trace_jbd_cleanup_journal_tail(journal, first_tid, blocknr, freed);
506 jbd_debug(1, 516 jbd_debug(1,
507 "Cleaning journal tail from %d to %d (offset %u), " 517 "Cleaning journal tail from %d to %d (offset %u), "
508 "freeing %u\n", 518 "freeing %u\n",
@@ -523,9 +533,9 @@ int cleanup_journal_tail(journal_t *journal)
523/* 533/*
524 * journal_clean_one_cp_list 534 * journal_clean_one_cp_list
525 * 535 *
526 * Find all the written-back checkpoint buffers in the given list and release them. 536 * Find all the written-back checkpoint buffers in the given list and release
537 * them.
527 * 538 *
528 * Called with the journal locked.
529 * Called with j_list_lock held. 539 * Called with j_list_lock held.
 530 * Returns number of buffers reaped (for debug) 540 * Returns number of buffers reaped (for debug)
531 */ 541 */
@@ -632,8 +642,8 @@ out:
632 * checkpoint lists. 642 * checkpoint lists.
633 * 643 *
634 * The function returns 1 if it frees the transaction, 0 otherwise. 644 * The function returns 1 if it frees the transaction, 0 otherwise.
645 * The function can free jh and bh.
635 * 646 *
636 * This function is called with the journal locked.
637 * This function is called with j_list_lock held. 647 * This function is called with j_list_lock held.
638 * This function is called with jbd_lock_bh_state(jh2bh(jh)) 648 * This function is called with jbd_lock_bh_state(jh2bh(jh))
639 */ 649 */
@@ -652,13 +662,14 @@ int __journal_remove_checkpoint(struct journal_head *jh)
652 } 662 }
653 journal = transaction->t_journal; 663 journal = transaction->t_journal;
654 664
665 JBUFFER_TRACE(jh, "removing from transaction");
655 __buffer_unlink(jh); 666 __buffer_unlink(jh);
656 jh->b_cp_transaction = NULL; 667 jh->b_cp_transaction = NULL;
668 journal_put_journal_head(jh);
657 669
658 if (transaction->t_checkpoint_list != NULL || 670 if (transaction->t_checkpoint_list != NULL ||
659 transaction->t_checkpoint_io_list != NULL) 671 transaction->t_checkpoint_io_list != NULL)
660 goto out; 672 goto out;
661 JBUFFER_TRACE(jh, "transaction has no more buffers");
662 673
663 /* 674 /*
664 * There is one special case to worry about: if we have just pulled the 675 * There is one special case to worry about: if we have just pulled the
@@ -669,10 +680,8 @@ int __journal_remove_checkpoint(struct journal_head *jh)
669 * The locking here around t_state is a bit sleazy. 680 * The locking here around t_state is a bit sleazy.
670 * See the comment at the end of journal_commit_transaction(). 681 * See the comment at the end of journal_commit_transaction().
671 */ 682 */
672 if (transaction->t_state != T_FINISHED) { 683 if (transaction->t_state != T_FINISHED)
673 JBUFFER_TRACE(jh, "belongs to running/committing transaction");
674 goto out; 684 goto out;
675 }
676 685
677 /* OK, that was the last buffer for the transaction: we can now 686 /* OK, that was the last buffer for the transaction: we can now
678 safely remove this transaction from the log */ 687 safely remove this transaction from the log */
@@ -684,7 +693,6 @@ int __journal_remove_checkpoint(struct journal_head *jh)
684 wake_up(&journal->j_wait_logspace); 693 wake_up(&journal->j_wait_logspace);
685 ret = 1; 694 ret = 1;
686out: 695out:
687 JBUFFER_TRACE(jh, "exit");
688 return ret; 696 return ret;
689} 697}
690 698
@@ -703,6 +711,8 @@ void __journal_insert_checkpoint(struct journal_head *jh,
703 J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh))); 711 J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh)));
704 J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); 712 J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
705 713
714 /* Get reference for checkpointing transaction */
715 journal_grab_journal_head(jh2bh(jh));
706 jh->b_cp_transaction = transaction; 716 jh->b_cp_transaction = transaction;
707 717
708 if (!transaction->t_checkpoint_list) { 718 if (!transaction->t_checkpoint_list) {
@@ -752,6 +762,7 @@ void __journal_drop_transaction(journal_t *journal, transaction_t *transaction)
752 J_ASSERT(journal->j_committing_transaction != transaction); 762 J_ASSERT(journal->j_committing_transaction != transaction);
753 J_ASSERT(journal->j_running_transaction != transaction); 763 J_ASSERT(journal->j_running_transaction != transaction);
754 764
765 trace_jbd_drop_transaction(journal, transaction);
755 jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); 766 jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
756 kfree(transaction); 767 kfree(transaction);
757} 768}
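Note: every call site that used journal_remove_journal_head() now follows one convention: pin the buffer_head first, let __journal_remove_checkpoint() drop the checkpoint's journal_head reference, and release the pin last. The pattern, extracted from the hunks above (kernel context assumed):

    get_bh(bh);                        /* keep bh alive across the drop */
    __journal_remove_checkpoint(jh);   /* may free jh: b_jcount can hit 0 */
    jbd_unlock_bh_state(bh);
    __brelse(bh);                      /* release our pin */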
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 72ffa974b0b8..8799207df058 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -21,6 +21,7 @@
21#include <linux/pagemap.h> 21#include <linux/pagemap.h>
22#include <linux/bio.h> 22#include <linux/bio.h>
23#include <linux/blkdev.h> 23#include <linux/blkdev.h>
24#include <trace/events/jbd.h>
24 25
25/* 26/*
26 * Default IO end handler for temporary BJ_IO buffer_heads. 27 * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -204,6 +205,8 @@ write_out_data:
204 if (!trylock_buffer(bh)) { 205 if (!trylock_buffer(bh)) {
205 BUFFER_TRACE(bh, "needs blocking lock"); 206 BUFFER_TRACE(bh, "needs blocking lock");
206 spin_unlock(&journal->j_list_lock); 207 spin_unlock(&journal->j_list_lock);
208 trace_jbd_do_submit_data(journal,
209 commit_transaction);
207 /* Write out all data to prevent deadlocks */ 210 /* Write out all data to prevent deadlocks */
208 journal_do_submit_data(wbuf, bufs, write_op); 211 journal_do_submit_data(wbuf, bufs, write_op);
209 bufs = 0; 212 bufs = 0;
@@ -236,6 +239,8 @@ write_out_data:
236 jbd_unlock_bh_state(bh); 239 jbd_unlock_bh_state(bh);
237 if (bufs == journal->j_wbufsize) { 240 if (bufs == journal->j_wbufsize) {
238 spin_unlock(&journal->j_list_lock); 241 spin_unlock(&journal->j_list_lock);
242 trace_jbd_do_submit_data(journal,
243 commit_transaction);
239 journal_do_submit_data(wbuf, bufs, write_op); 244 journal_do_submit_data(wbuf, bufs, write_op);
240 bufs = 0; 245 bufs = 0;
241 goto write_out_data; 246 goto write_out_data;
@@ -253,10 +258,6 @@ write_out_data:
253 jbd_unlock_bh_state(bh); 258 jbd_unlock_bh_state(bh);
254 if (locked) 259 if (locked)
255 unlock_buffer(bh); 260 unlock_buffer(bh);
256 journal_remove_journal_head(bh);
257 /* One for our safety reference, other for
258 * journal_remove_journal_head() */
259 put_bh(bh);
260 release_data_buffer(bh); 261 release_data_buffer(bh);
261 } 262 }
262 263
@@ -266,6 +267,7 @@ write_out_data:
266 } 267 }
267 } 268 }
268 spin_unlock(&journal->j_list_lock); 269 spin_unlock(&journal->j_list_lock);
270 trace_jbd_do_submit_data(journal, commit_transaction);
269 journal_do_submit_data(wbuf, bufs, write_op); 271 journal_do_submit_data(wbuf, bufs, write_op);
270 272
271 return err; 273 return err;
@@ -316,12 +318,14 @@ void journal_commit_transaction(journal_t *journal)
316 commit_transaction = journal->j_running_transaction; 318 commit_transaction = journal->j_running_transaction;
317 J_ASSERT(commit_transaction->t_state == T_RUNNING); 319 J_ASSERT(commit_transaction->t_state == T_RUNNING);
318 320
321 trace_jbd_start_commit(journal, commit_transaction);
319 jbd_debug(1, "JBD: starting commit of transaction %d\n", 322 jbd_debug(1, "JBD: starting commit of transaction %d\n",
320 commit_transaction->t_tid); 323 commit_transaction->t_tid);
321 324
322 spin_lock(&journal->j_state_lock); 325 spin_lock(&journal->j_state_lock);
323 commit_transaction->t_state = T_LOCKED; 326 commit_transaction->t_state = T_LOCKED;
324 327
328 trace_jbd_commit_locking(journal, commit_transaction);
325 spin_lock(&commit_transaction->t_handle_lock); 329 spin_lock(&commit_transaction->t_handle_lock);
326 while (commit_transaction->t_updates) { 330 while (commit_transaction->t_updates) {
327 DEFINE_WAIT(wait); 331 DEFINE_WAIT(wait);
@@ -392,6 +396,7 @@ void journal_commit_transaction(journal_t *journal)
392 */ 396 */
393 journal_switch_revoke_table(journal); 397 journal_switch_revoke_table(journal);
394 398
399 trace_jbd_commit_flushing(journal, commit_transaction);
395 commit_transaction->t_state = T_FLUSH; 400 commit_transaction->t_state = T_FLUSH;
396 journal->j_committing_transaction = commit_transaction; 401 journal->j_committing_transaction = commit_transaction;
397 journal->j_running_transaction = NULL; 402 journal->j_running_transaction = NULL;
@@ -446,14 +451,9 @@ void journal_commit_transaction(journal_t *journal)
446 } 451 }
447 if (buffer_jbd(bh) && bh2jh(bh) == jh && 452 if (buffer_jbd(bh) && bh2jh(bh) == jh &&
448 jh->b_transaction == commit_transaction && 453 jh->b_transaction == commit_transaction &&
449 jh->b_jlist == BJ_Locked) { 454 jh->b_jlist == BJ_Locked)
450 __journal_unfile_buffer(jh); 455 __journal_unfile_buffer(jh);
451 jbd_unlock_bh_state(bh); 456 jbd_unlock_bh_state(bh);
452 journal_remove_journal_head(bh);
453 put_bh(bh);
454 } else {
455 jbd_unlock_bh_state(bh);
456 }
457 release_data_buffer(bh); 457 release_data_buffer(bh);
458 cond_resched_lock(&journal->j_list_lock); 458 cond_resched_lock(&journal->j_list_lock);
459 } 459 }
@@ -493,6 +493,7 @@ void journal_commit_transaction(journal_t *journal)
493 commit_transaction->t_state = T_COMMIT; 493 commit_transaction->t_state = T_COMMIT;
494 spin_unlock(&journal->j_state_lock); 494 spin_unlock(&journal->j_state_lock);
495 495
496 trace_jbd_commit_logging(journal, commit_transaction);
496 J_ASSERT(commit_transaction->t_nr_buffers <= 497 J_ASSERT(commit_transaction->t_nr_buffers <=
497 commit_transaction->t_outstanding_credits); 498 commit_transaction->t_outstanding_credits);
498 499
@@ -797,10 +798,16 @@ restart_loop:
797 while (commit_transaction->t_forget) { 798 while (commit_transaction->t_forget) {
798 transaction_t *cp_transaction; 799 transaction_t *cp_transaction;
799 struct buffer_head *bh; 800 struct buffer_head *bh;
801 int try_to_free = 0;
800 802
801 jh = commit_transaction->t_forget; 803 jh = commit_transaction->t_forget;
802 spin_unlock(&journal->j_list_lock); 804 spin_unlock(&journal->j_list_lock);
803 bh = jh2bh(jh); 805 bh = jh2bh(jh);
806 /*
807 * Get a reference so that bh cannot be freed before we are
808 * done with it.
809 */
810 get_bh(bh);
804 jbd_lock_bh_state(bh); 811 jbd_lock_bh_state(bh);
805 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction || 812 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
806 jh->b_transaction == journal->j_running_transaction); 813 jh->b_transaction == journal->j_running_transaction);
@@ -858,28 +865,27 @@ restart_loop:
858 __journal_insert_checkpoint(jh, commit_transaction); 865 __journal_insert_checkpoint(jh, commit_transaction);
859 if (is_journal_aborted(journal)) 866 if (is_journal_aborted(journal))
860 clear_buffer_jbddirty(bh); 867 clear_buffer_jbddirty(bh);
861 JBUFFER_TRACE(jh, "refile for checkpoint writeback");
862 __journal_refile_buffer(jh);
863 jbd_unlock_bh_state(bh);
864 } else { 868 } else {
865 J_ASSERT_BH(bh, !buffer_dirty(bh)); 869 J_ASSERT_BH(bh, !buffer_dirty(bh));
866 /* The buffer on BJ_Forget list and not jbddirty means 870 /*
871 * The buffer on BJ_Forget list and not jbddirty means
867 * it has been freed by this transaction and hence it 872 * it has been freed by this transaction and hence it
868 * could not have been reallocated until this 873 * could not have been reallocated until this
869 * transaction has committed. *BUT* it could be 874 * transaction has committed. *BUT* it could be
870 * reallocated once we have written all the data to 875 * reallocated once we have written all the data to
871 * disk and before we process the buffer on BJ_Forget 876 * disk and before we process the buffer on BJ_Forget
872 * list. */ 877 * list.
873 JBUFFER_TRACE(jh, "refile or unfile freed buffer"); 878 */
874 __journal_refile_buffer(jh); 879 if (!jh->b_next_transaction)
875 if (!jh->b_transaction) { 880 try_to_free = 1;
876 jbd_unlock_bh_state(bh);
877 /* needs a brelse */
878 journal_remove_journal_head(bh);
879 release_buffer_page(bh);
880 } else
881 jbd_unlock_bh_state(bh);
882 } 881 }
882 JBUFFER_TRACE(jh, "refile or unfile freed buffer");
883 __journal_refile_buffer(jh);
884 jbd_unlock_bh_state(bh);
885 if (try_to_free)
886 release_buffer_page(bh);
887 else
888 __brelse(bh);
883 cond_resched_lock(&journal->j_list_lock); 889 cond_resched_lock(&journal->j_list_lock);
884 } 890 }
885 spin_unlock(&journal->j_list_lock); 891 spin_unlock(&journal->j_list_lock);
@@ -946,6 +952,7 @@ restart_loop:
946 } 952 }
947 spin_unlock(&journal->j_list_lock); 953 spin_unlock(&journal->j_list_lock);
948 954
955 trace_jbd_end_commit(journal, commit_transaction);
949 jbd_debug(1, "JBD: commit %d complete, head %d\n", 956 jbd_debug(1, "JBD: commit %d complete, head %d\n",
950 journal->j_commit_sequence, journal->j_tail_sequence); 957 journal->j_commit_sequence, journal->j_tail_sequence);
951 958
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index e2d4285fbe90..9fe061fb8779 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -38,6 +38,9 @@
38#include <linux/debugfs.h> 38#include <linux/debugfs.h>
39#include <linux/ratelimit.h> 39#include <linux/ratelimit.h>
40 40
41#define CREATE_TRACE_POINTS
42#include <trace/events/jbd.h>
43
41#include <asm/uaccess.h> 44#include <asm/uaccess.h>
42#include <asm/page.h> 45#include <asm/page.h>
43 46
@@ -1065,6 +1068,7 @@ void journal_update_superblock(journal_t *journal, int wait)
1065 } else 1068 } else
1066 write_dirty_buffer(bh, WRITE); 1069 write_dirty_buffer(bh, WRITE);
1067 1070
1071 trace_jbd_update_superblock_end(journal, wait);
1068out: 1072out:
1069 /* If we have just flushed the log (by marking s_start==0), then 1073 /* If we have just flushed the log (by marking s_start==0), then
1070 * any future commit will have to be careful to update the 1074 * any future commit will have to be careful to update the
@@ -1799,10 +1803,9 @@ static void journal_free_journal_head(struct journal_head *jh)
1799 * When a buffer has its BH_JBD bit set it is immune from being released by 1803 * When a buffer has its BH_JBD bit set it is immune from being released by
1800 * core kernel code, mainly via ->b_count. 1804 * core kernel code, mainly via ->b_count.
1801 * 1805 *
1802 * A journal_head may be detached from its buffer_head when the journal_head's 1806 * A journal_head is detached from its buffer_head when the journal_head's
1803 * b_transaction, b_cp_transaction and b_next_transaction pointers are NULL. 1807 * b_jcount reaches zero. Running transaction (b_transaction) and checkpoint
1804 * Various places in JBD call journal_remove_journal_head() to indicate that the 1808 * transaction (b_cp_transaction) hold their references to b_jcount.
1805 * journal_head can be dropped if needed.
1806 * 1809 *
1807 * Various places in the kernel want to attach a journal_head to a buffer_head 1810 * Various places in the kernel want to attach a journal_head to a buffer_head
1808 * _before_ attaching the journal_head to a transaction. To protect the 1811 * _before_ attaching the journal_head to a transaction. To protect the
@@ -1815,17 +1818,16 @@ static void journal_free_journal_head(struct journal_head *jh)
1815 * (Attach a journal_head if needed. Increments b_jcount) 1818 * (Attach a journal_head if needed. Increments b_jcount)
1816 * struct journal_head *jh = journal_add_journal_head(bh); 1819 * struct journal_head *jh = journal_add_journal_head(bh);
1817 * ... 1820 * ...
1818 * jh->b_transaction = xxx; 1821 * (Get another reference for transaction)
1819 * journal_put_journal_head(jh); 1822 * journal_grab_journal_head(bh);
1820 * 1823 * jh->b_transaction = xxx;
1821 * Now, the journal_head's b_jcount is zero, but it is safe from being released 1824 * (Put original reference)
1822 * because it has a non-zero b_transaction. 1825 * journal_put_journal_head(jh);
1823 */ 1826 */
1824 1827
1825/* 1828/*
1826 * Give a buffer_head a journal_head. 1829 * Give a buffer_head a journal_head.
1827 * 1830 *
1828 * Doesn't need the journal lock.
1829 * May sleep. 1831 * May sleep.
1830 */ 1832 */
1831struct journal_head *journal_add_journal_head(struct buffer_head *bh) 1833struct journal_head *journal_add_journal_head(struct buffer_head *bh)
@@ -1889,61 +1891,29 @@ static void __journal_remove_journal_head(struct buffer_head *bh)
1889 struct journal_head *jh = bh2jh(bh); 1891 struct journal_head *jh = bh2jh(bh);
1890 1892
1891 J_ASSERT_JH(jh, jh->b_jcount >= 0); 1893 J_ASSERT_JH(jh, jh->b_jcount >= 0);
1892 1894 J_ASSERT_JH(jh, jh->b_transaction == NULL);
1893 get_bh(bh); 1895 J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
1894 if (jh->b_jcount == 0) { 1896 J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
1895 if (jh->b_transaction == NULL && 1897 J_ASSERT_JH(jh, jh->b_jlist == BJ_None);
1896 jh->b_next_transaction == NULL && 1898 J_ASSERT_BH(bh, buffer_jbd(bh));
1897 jh->b_cp_transaction == NULL) { 1899 J_ASSERT_BH(bh, jh2bh(jh) == bh);
1898 J_ASSERT_JH(jh, jh->b_jlist == BJ_None); 1900 BUFFER_TRACE(bh, "remove journal_head");
1899 J_ASSERT_BH(bh, buffer_jbd(bh)); 1901 if (jh->b_frozen_data) {
1900 J_ASSERT_BH(bh, jh2bh(jh) == bh); 1902 printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__);
1901 BUFFER_TRACE(bh, "remove journal_head"); 1903 jbd_free(jh->b_frozen_data, bh->b_size);
1902 if (jh->b_frozen_data) {
1903 printk(KERN_WARNING "%s: freeing "
1904 "b_frozen_data\n",
1905 __func__);
1906 jbd_free(jh->b_frozen_data, bh->b_size);
1907 }
1908 if (jh->b_committed_data) {
1909 printk(KERN_WARNING "%s: freeing "
1910 "b_committed_data\n",
1911 __func__);
1912 jbd_free(jh->b_committed_data, bh->b_size);
1913 }
1914 bh->b_private = NULL;
1915 jh->b_bh = NULL; /* debug, really */
1916 clear_buffer_jbd(bh);
1917 __brelse(bh);
1918 journal_free_journal_head(jh);
1919 } else {
1920 BUFFER_TRACE(bh, "journal_head was locked");
1921 }
1922 } 1904 }
1905 if (jh->b_committed_data) {
1906 printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__);
1907 jbd_free(jh->b_committed_data, bh->b_size);
1908 }
1909 bh->b_private = NULL;
1910 jh->b_bh = NULL; /* debug, really */
1911 clear_buffer_jbd(bh);
1912 journal_free_journal_head(jh);
1923} 1913}
1924 1914
1925/* 1915/*
1926 * journal_remove_journal_head(): if the buffer isn't attached to a transaction 1916 * Drop a reference on the passed journal_head. If it fell to zero then
1927 * and has a zero b_jcount then remove and release its journal_head. If we did
1928 * see that the buffer is not used by any transaction we also "logically"
1929 * decrement ->b_count.
1930 *
1931 * We in fact take an additional increment on ->b_count as a convenience,
1932 * because the caller usually wants to do additional things with the bh
1933 * after calling here.
1934 * The caller of journal_remove_journal_head() *must* run __brelse(bh) at some
1935 * time. Once the caller has run __brelse(), the buffer is eligible for
1936 * reaping by try_to_free_buffers().
1937 */
1938void journal_remove_journal_head(struct buffer_head *bh)
1939{
1940 jbd_lock_bh_journal_head(bh);
1941 __journal_remove_journal_head(bh);
1942 jbd_unlock_bh_journal_head(bh);
1943}
1944
1945/*
1946 * Drop a reference on the passed journal_head. If it fell to zero then try to
1947 * release the journal_head from the buffer_head. 1917 * release the journal_head from the buffer_head.
1948 */ 1918 */
1949void journal_put_journal_head(struct journal_head *jh) 1919void journal_put_journal_head(struct journal_head *jh)
@@ -1953,11 +1923,12 @@ void journal_put_journal_head(struct journal_head *jh)
1953 jbd_lock_bh_journal_head(bh); 1923 jbd_lock_bh_journal_head(bh);
1954 J_ASSERT_JH(jh, jh->b_jcount > 0); 1924 J_ASSERT_JH(jh, jh->b_jcount > 0);
1955 --jh->b_jcount; 1925 --jh->b_jcount;
1956 if (!jh->b_jcount && !jh->b_transaction) { 1926 if (!jh->b_jcount) {
1957 __journal_remove_journal_head(bh); 1927 __journal_remove_journal_head(bh);
1928 jbd_unlock_bh_journal_head(bh);
1958 __brelse(bh); 1929 __brelse(bh);
1959 } 1930 } else
1960 jbd_unlock_bh_journal_head(bh); 1931 jbd_unlock_bh_journal_head(bh);
1961} 1932}
1962 1933
1963/* 1934/*
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index f7ee81a065da..7e59c6e66f9b 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -26,6 +26,7 @@
26#include <linux/mm.h> 26#include <linux/mm.h>
27#include <linux/highmem.h> 27#include <linux/highmem.h>
28#include <linux/hrtimer.h> 28#include <linux/hrtimer.h>
29#include <linux/backing-dev.h>
29 30
30static void __journal_temp_unlink_buffer(struct journal_head *jh); 31static void __journal_temp_unlink_buffer(struct journal_head *jh);
31 32
@@ -99,11 +100,10 @@ static int start_this_handle(journal_t *journal, handle_t *handle)
99 100
100alloc_transaction: 101alloc_transaction:
101 if (!journal->j_running_transaction) { 102 if (!journal->j_running_transaction) {
102 new_transaction = kzalloc(sizeof(*new_transaction), 103 new_transaction = kzalloc(sizeof(*new_transaction), GFP_NOFS);
103 GFP_NOFS|__GFP_NOFAIL);
104 if (!new_transaction) { 104 if (!new_transaction) {
105 ret = -ENOMEM; 105 congestion_wait(BLK_RW_ASYNC, HZ/50);
106 goto out; 106 goto alloc_transaction;
107 } 107 }
108 } 108 }
109 109
@@ -696,7 +696,6 @@ repeat:
696 if (!jh->b_transaction) { 696 if (!jh->b_transaction) {
697 JBUFFER_TRACE(jh, "no transaction"); 697 JBUFFER_TRACE(jh, "no transaction");
698 J_ASSERT_JH(jh, !jh->b_next_transaction); 698 J_ASSERT_JH(jh, !jh->b_next_transaction);
699 jh->b_transaction = transaction;
700 JBUFFER_TRACE(jh, "file as BJ_Reserved"); 699 JBUFFER_TRACE(jh, "file as BJ_Reserved");
701 spin_lock(&journal->j_list_lock); 700 spin_lock(&journal->j_list_lock);
702 __journal_file_buffer(jh, transaction, BJ_Reserved); 701 __journal_file_buffer(jh, transaction, BJ_Reserved);
@@ -818,7 +817,6 @@ int journal_get_create_access(handle_t *handle, struct buffer_head *bh)
818 * committed and so it's safe to clear the dirty bit. 817 * committed and so it's safe to clear the dirty bit.
819 */ 818 */
820 clear_buffer_dirty(jh2bh(jh)); 819 clear_buffer_dirty(jh2bh(jh));
821 jh->b_transaction = transaction;
822 820
823 /* first access by this transaction */ 821 /* first access by this transaction */
824 jh->b_modified = 0; 822 jh->b_modified = 0;
@@ -844,8 +842,8 @@ int journal_get_create_access(handle_t *handle, struct buffer_head *bh)
844 */ 842 */
845 JBUFFER_TRACE(jh, "cancelling revoke"); 843 JBUFFER_TRACE(jh, "cancelling revoke");
846 journal_cancel_revoke(handle, jh); 844 journal_cancel_revoke(handle, jh);
847 journal_put_journal_head(jh);
848out: 845out:
846 journal_put_journal_head(jh);
849 return err; 847 return err;
850} 848}
851 849
@@ -1069,8 +1067,9 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh)
1069 ret = -EIO; 1067 ret = -EIO;
1070 goto no_journal; 1068 goto no_journal;
1071 } 1069 }
1072 1070 /* We might have slept, so the buffer could have been refiled by now */
1073 if (jh->b_transaction != NULL) { 1071 if (jh->b_transaction != NULL &&
1072 jh->b_transaction != handle->h_transaction) {
1074 JBUFFER_TRACE(jh, "unfile from commit"); 1073 JBUFFER_TRACE(jh, "unfile from commit");
1075 __journal_temp_unlink_buffer(jh); 1074 __journal_temp_unlink_buffer(jh);
1076 /* It still points to the committing 1075 /* It still points to the committing
@@ -1091,8 +1090,6 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh)
1091 if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) { 1090 if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) {
1092 JBUFFER_TRACE(jh, "not on correct data list: unfile"); 1091 JBUFFER_TRACE(jh, "not on correct data list: unfile");
1093 J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow); 1092 J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
1094 __journal_temp_unlink_buffer(jh);
1095 jh->b_transaction = handle->h_transaction;
1096 JBUFFER_TRACE(jh, "file as data"); 1093 JBUFFER_TRACE(jh, "file as data");
1097 __journal_file_buffer(jh, handle->h_transaction, 1094 __journal_file_buffer(jh, handle->h_transaction,
1098 BJ_SyncData); 1095 BJ_SyncData);
@@ -1300,8 +1297,6 @@ int journal_forget (handle_t *handle, struct buffer_head *bh)
1300 __journal_file_buffer(jh, transaction, BJ_Forget); 1297 __journal_file_buffer(jh, transaction, BJ_Forget);
1301 } else { 1298 } else {
1302 __journal_unfile_buffer(jh); 1299 __journal_unfile_buffer(jh);
1303 journal_remove_journal_head(bh);
1304 __brelse(bh);
1305 if (!buffer_jbd(bh)) { 1300 if (!buffer_jbd(bh)) {
1306 spin_unlock(&journal->j_list_lock); 1301 spin_unlock(&journal->j_list_lock);
1307 jbd_unlock_bh_state(bh); 1302 jbd_unlock_bh_state(bh);
@@ -1622,19 +1617,32 @@ static void __journal_temp_unlink_buffer(struct journal_head *jh)
1622 mark_buffer_dirty(bh); /* Expose it to the VM */ 1617 mark_buffer_dirty(bh); /* Expose it to the VM */
1623} 1618}
1624 1619
1620/*
1621 * Remove buffer from all transactions.
1622 *
1623 * Called with bh_state lock and j_list_lock held
1624 *
1625 * jh and bh may already be freed when this function returns.
1626 */
1625void __journal_unfile_buffer(struct journal_head *jh) 1627void __journal_unfile_buffer(struct journal_head *jh)
1626{ 1628{
1627 __journal_temp_unlink_buffer(jh); 1629 __journal_temp_unlink_buffer(jh);
1628 jh->b_transaction = NULL; 1630 jh->b_transaction = NULL;
1631 journal_put_journal_head(jh);
1629} 1632}
1630 1633
1631void journal_unfile_buffer(journal_t *journal, struct journal_head *jh) 1634void journal_unfile_buffer(journal_t *journal, struct journal_head *jh)
1632{ 1635{
1633 jbd_lock_bh_state(jh2bh(jh)); 1636 struct buffer_head *bh = jh2bh(jh);
1637
1638 /* Get reference so that buffer cannot be freed before we unlock it */
1639 get_bh(bh);
1640 jbd_lock_bh_state(bh);
1634 spin_lock(&journal->j_list_lock); 1641 spin_lock(&journal->j_list_lock);
1635 __journal_unfile_buffer(jh); 1642 __journal_unfile_buffer(jh);
1636 spin_unlock(&journal->j_list_lock); 1643 spin_unlock(&journal->j_list_lock);
1637 jbd_unlock_bh_state(jh2bh(jh)); 1644 jbd_unlock_bh_state(bh);
1645 __brelse(bh);
1638} 1646}
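Since __journal_unfile_buffer() may now put the last journal_head reference,
the buffer_head could be torn down while the wrapper still holds its state
lock; the get_bh()/__brelse() pair pins it across the critical section. The
shape of the guard, annotated (a sketch of the code above):

	get_bh(bh);			/* pin: +1 on b_count */
	jbd_lock_bh_state(bh);
	spin_lock(&journal->j_list_lock);
	__journal_unfile_buffer(jh);	/* may drop the last jh reference */
	spin_unlock(&journal->j_list_lock);
	jbd_unlock_bh_state(bh);	/* still safe: bh is pinned */
	__brelse(bh);			/* unpin; bh may be freed from here on */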
1639 1647
1640/* 1648/*
@@ -1661,16 +1669,12 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
1661 /* A written-back ordered data buffer */ 1669 /* A written-back ordered data buffer */
1662 JBUFFER_TRACE(jh, "release data"); 1670 JBUFFER_TRACE(jh, "release data");
1663 __journal_unfile_buffer(jh); 1671 __journal_unfile_buffer(jh);
1664 journal_remove_journal_head(bh);
1665 __brelse(bh);
1666 } 1672 }
1667 } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { 1673 } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
1668 /* written-back checkpointed metadata buffer */ 1674 /* written-back checkpointed metadata buffer */
1669 if (jh->b_jlist == BJ_None) { 1675 if (jh->b_jlist == BJ_None) {
1670 JBUFFER_TRACE(jh, "remove from checkpoint list"); 1676 JBUFFER_TRACE(jh, "remove from checkpoint list");
1671 __journal_remove_checkpoint(jh); 1677 __journal_remove_checkpoint(jh);
1672 journal_remove_journal_head(bh);
1673 __brelse(bh);
1674 } 1678 }
1675 } 1679 }
1676 spin_unlock(&journal->j_list_lock); 1680 spin_unlock(&journal->j_list_lock);
@@ -1733,7 +1737,7 @@ int journal_try_to_free_buffers(journal_t *journal,
1733 /* 1737 /*
1734 * We take our own ref against the journal_head here to avoid 1738 * We take our own ref against the journal_head here to avoid
1735 * having to add tons of locking around each instance of 1739 * having to add tons of locking around each instance of
1736 * journal_remove_journal_head() and journal_put_journal_head(). 1740 * journal_put_journal_head().
1737 */ 1741 */
1738 jh = journal_grab_journal_head(bh); 1742 jh = journal_grab_journal_head(bh);
1739 if (!jh) 1743 if (!jh)
@@ -1770,10 +1774,9 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
1770 int may_free = 1; 1774 int may_free = 1;
1771 struct buffer_head *bh = jh2bh(jh); 1775 struct buffer_head *bh = jh2bh(jh);
1772 1776
1773 __journal_unfile_buffer(jh);
1774
1775 if (jh->b_cp_transaction) { 1777 if (jh->b_cp_transaction) {
1776 JBUFFER_TRACE(jh, "on running+cp transaction"); 1778 JBUFFER_TRACE(jh, "on running+cp transaction");
1779 __journal_temp_unlink_buffer(jh);
1777 /* 1780 /*
1778 * We don't want to write the buffer anymore, clear the 1781 * We don't want to write the buffer anymore, clear the
1779 * bit so that we don't confuse checks in 1782 * bit so that we don't confuse checks in
@@ -1784,8 +1787,7 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
1784 may_free = 0; 1787 may_free = 0;
1785 } else { 1788 } else {
1786 JBUFFER_TRACE(jh, "on running transaction"); 1789 JBUFFER_TRACE(jh, "on running transaction");
1787 journal_remove_journal_head(bh); 1790 __journal_unfile_buffer(jh);
1788 __brelse(bh);
1789 } 1791 }
1790 return may_free; 1792 return may_free;
1791} 1793}
@@ -2070,6 +2072,8 @@ void __journal_file_buffer(struct journal_head *jh,
2070 2072
2071 if (jh->b_transaction) 2073 if (jh->b_transaction)
2072 __journal_temp_unlink_buffer(jh); 2074 __journal_temp_unlink_buffer(jh);
2075 else
2076 journal_grab_journal_head(bh);
2073 jh->b_transaction = transaction; 2077 jh->b_transaction = transaction;
2074 2078
2075 switch (jlist) { 2079 switch (jlist) {
@@ -2127,9 +2131,10 @@ void journal_file_buffer(struct journal_head *jh,
2127 * already started to be used by a subsequent transaction, refile the 2131 * already started to be used by a subsequent transaction, refile the
2128 * buffer on that transaction's metadata list. 2132 * buffer on that transaction's metadata list.
2129 * 2133 *
2130 * Called under journal->j_list_lock 2134 * Called under j_list_lock
2131 *
2132 * Called under jbd_lock_bh_state(jh2bh(jh)) 2135 * Called under jbd_lock_bh_state(jh2bh(jh))
2136 *
2137 * jh and bh may already be freed when this function returns
2133 */ 2138 */
2134void __journal_refile_buffer(struct journal_head *jh) 2139void __journal_refile_buffer(struct journal_head *jh)
2135{ 2140{
@@ -2153,6 +2158,11 @@ void __journal_refile_buffer(struct journal_head *jh)
2153 2158
2154 was_dirty = test_clear_buffer_jbddirty(bh); 2159 was_dirty = test_clear_buffer_jbddirty(bh);
2155 __journal_temp_unlink_buffer(jh); 2160 __journal_temp_unlink_buffer(jh);
2161 /*
2162 * We set b_transaction here because b_next_transaction will inherit
2163 * our jh reference and thus __journal_file_buffer() must not take a
2164 * new one.
2165 */
2156 jh->b_transaction = jh->b_next_transaction; 2166 jh->b_transaction = jh->b_next_transaction;
2157 jh->b_next_transaction = NULL; 2167 jh->b_next_transaction = NULL;
2158 if (buffer_freed(bh)) 2168 if (buffer_freed(bh))
@@ -2169,30 +2179,21 @@ void __journal_refile_buffer(struct journal_head *jh)
2169} 2179}
2170 2180
2171/* 2181/*
2172 * For the unlocked version of this call, also make sure that any 2182 * __journal_refile_buffer() with necessary locking added. We take our bh
2173 * hanging journal_head is cleaned up if necessary. 2183 * reference so that we can safely unlock bh.
2174 * 2184 *
2175 * __journal_refile_buffer is usually called as part of a single locked 2185 * The jh and bh may be freed by this call.
2176 * operation on a buffer_head, in which the caller is probably going to
2177 * be hooking the journal_head onto other lists. In that case it is up
2178 * to the caller to remove the journal_head if necessary. For the
2179 * unlocked journal_refile_buffer call, the caller isn't going to be
2180 * doing anything else to the buffer so we need to do the cleanup
2181 * ourselves to avoid a jh leak.
2182 *
2183 * *** The journal_head may be freed by this call! ***
2184 */ 2186 */
2185void journal_refile_buffer(journal_t *journal, struct journal_head *jh) 2187void journal_refile_buffer(journal_t *journal, struct journal_head *jh)
2186{ 2188{
2187 struct buffer_head *bh = jh2bh(jh); 2189 struct buffer_head *bh = jh2bh(jh);
2188 2190
2191 /* Get reference so that buffer cannot be freed before we unlock it */
2192 get_bh(bh);
2189 jbd_lock_bh_state(bh); 2193 jbd_lock_bh_state(bh);
2190 spin_lock(&journal->j_list_lock); 2194 spin_lock(&journal->j_list_lock);
2191
2192 __journal_refile_buffer(jh); 2195 __journal_refile_buffer(jh);
2193 jbd_unlock_bh_state(bh); 2196 jbd_unlock_bh_state(bh);
2194 journal_remove_journal_head(bh);
2195
2196 spin_unlock(&journal->j_list_lock); 2197 spin_unlock(&journal->j_list_lock);
2197 __brelse(bh); 2198 __brelse(bh);
2198} 2199}
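The subtlety called out in __journal_refile_buffer() is reference ownership:
the jh reference held on behalf of b_transaction is inherited by the next
transaction, so b_transaction is assigned before __journal_file_buffer()
runs and no second reference is taken. The core move, condensed (a sketch;
jlist selection from buffer_freed() is as in the full function):

	__journal_temp_unlink_buffer(jh);
	jh->b_transaction = jh->b_next_transaction;	/* inherit the reference */
	jh->b_next_transaction = NULL;
	__journal_file_buffer(jh, jh->b_transaction, jlist);
	/* b_transaction was non-NULL above, so no journal_grab_journal_head() */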
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 2c62c5aae82f..16a698bd906d 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -257,9 +257,12 @@ static void
257__flush_batch(journal_t *journal, int *batch_count) 257__flush_batch(journal_t *journal, int *batch_count)
258{ 258{
259 int i; 259 int i;
260 struct blk_plug plug;
260 261
262 blk_start_plug(&plug);
261 for (i = 0; i < *batch_count; i++) 263 for (i = 0; i < *batch_count; i++)
262 write_dirty_buffer(journal->j_chkpt_bhs[i], WRITE); 264 write_dirty_buffer(journal->j_chkpt_bhs[i], WRITE_SYNC);
265 blk_finish_plug(&plug);
263 266
264 for (i = 0; i < *batch_count; i++) { 267 for (i = 0; i < *batch_count; i++) {
265 struct buffer_head *bh = journal->j_chkpt_bhs[i]; 268 struct buffer_head *bh = journal->j_chkpt_bhs[i];
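Two improvements land together in __flush_batch(): the writes are issued
under a block plug, letting the block layer merge adjacent checkpoint
buffers into fewer requests, and they are tagged WRITE_SYNC because a
checkpoint stalls the journal until it completes. The plugging idiom in
isolation (a hedged sketch):

	#include <linux/blkdev.h>
	#include <linux/buffer_head.h>

	/* Batch-submit dirty buffers under one plug; requests are merged
	 * where possible and dispatched at blk_finish_plug(). */
	static void flush_batch(struct buffer_head **bhs, int count)
	{
		struct blk_plug plug;
		int i;

		blk_start_plug(&plug);
		for (i = 0; i < count; i++)
			write_dirty_buffer(bhs[i], WRITE_SYNC);
		blk_finish_plug(&plug);
	}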
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 0dfa5b598e68..f24df13adc4e 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -2390,73 +2390,6 @@ static void __exit journal_exit(void)
2390 jbd2_journal_destroy_caches(); 2390 jbd2_journal_destroy_caches();
2391} 2391}
2392 2392
2393/*
2394 * jbd2_dev_to_name is a utility function used by the jbd2 and ext4
2395 * tracing infrastructure to map a dev_t to a device name.
2396 *
2397 * The caller should use rcu_read_lock() in order to make sure the
2398 * device name stays valid until its done with it. We use
2399 * rcu_read_lock() as well to make sure we're safe in case the caller
2400 * gets sloppy, and because rcu_read_lock() is cheap and can be safely
2401 * nested.
2402 */
2403struct devname_cache {
2404 struct rcu_head rcu;
2405 dev_t device;
2406 char devname[BDEVNAME_SIZE];
2407};
2408#define CACHE_SIZE_BITS 6
2409static struct devname_cache *devcache[1 << CACHE_SIZE_BITS];
2410static DEFINE_SPINLOCK(devname_cache_lock);
2411
2412static void free_devcache(struct rcu_head *rcu)
2413{
2414 kfree(rcu);
2415}
2416
2417const char *jbd2_dev_to_name(dev_t device)
2418{
2419 int i = hash_32(device, CACHE_SIZE_BITS);
2420 char *ret;
2421 struct block_device *bd;
2422 static struct devname_cache *new_dev;
2423
2424 rcu_read_lock();
2425 if (devcache[i] && devcache[i]->device == device) {
2426 ret = devcache[i]->devname;
2427 rcu_read_unlock();
2428 return ret;
2429 }
2430 rcu_read_unlock();
2431
2432 new_dev = kmalloc(sizeof(struct devname_cache), GFP_KERNEL);
2433 if (!new_dev)
2434 return "NODEV-ALLOCFAILURE"; /* Something non-NULL */
2435 bd = bdget(device);
2436 spin_lock(&devname_cache_lock);
2437 if (devcache[i]) {
2438 if (devcache[i]->device == device) {
2439 kfree(new_dev);
2440 bdput(bd);
2441 ret = devcache[i]->devname;
2442 spin_unlock(&devname_cache_lock);
2443 return ret;
2444 }
2445 call_rcu(&devcache[i]->rcu, free_devcache);
2446 }
2447 devcache[i] = new_dev;
2448 devcache[i]->device = device;
2449 if (bd) {
2450 bdevname(bd, devcache[i]->devname);
2451 bdput(bd);
2452 } else
2453 __bdevname(device, devcache[i]->devname);
2454 ret = devcache[i]->devname;
2455 spin_unlock(&devname_cache_lock);
2456 return ret;
2457}
2458EXPORT_SYMBOL(jbd2_dev_to_name);
2459
2460MODULE_LICENSE("GPL"); 2393MODULE_LICENSE("GPL");
2461module_init(journal_init); 2394module_init(journal_init);
2462module_exit(journal_exit); 2395module_exit(journal_exit);
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 27c511a1cf05..926d02068a14 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -227,7 +227,7 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
227 case ACL_TYPE_ACCESS: 227 case ACL_TYPE_ACCESS:
228 xprefix = JFFS2_XPREFIX_ACL_ACCESS; 228 xprefix = JFFS2_XPREFIX_ACL_ACCESS;
229 if (acl) { 229 if (acl) {
230 mode_t mode = inode->i_mode; 230 umode_t mode = inode->i_mode;
231 rc = posix_acl_equiv_mode(acl, &mode); 231 rc = posix_acl_equiv_mode(acl, &mode);
232 if (rc < 0) 232 if (rc < 0)
233 return rc; 233 return rc;
@@ -259,7 +259,7 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
259 return rc; 259 return rc;
260} 260}
261 261
262int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, mode_t *i_mode) 262int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, umode_t *i_mode)
263{ 263{
264 struct posix_acl *acl; 264 struct posix_acl *acl;
265 int rc; 265 int rc;
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
index b3421c78d9f8..9b477246f2a6 100644
--- a/fs/jffs2/acl.h
+++ b/fs/jffs2/acl.h
@@ -28,7 +28,7 @@ struct jffs2_acl_header {
28 28
29struct posix_acl *jffs2_get_acl(struct inode *inode, int type); 29struct posix_acl *jffs2_get_acl(struct inode *inode, int type);
30extern int jffs2_acl_chmod(struct inode *); 30extern int jffs2_acl_chmod(struct inode *);
31extern int jffs2_init_acl_pre(struct inode *, struct inode *, mode_t *); 31extern int jffs2_init_acl_pre(struct inode *, struct inode *, umode_t *);
32extern int jffs2_init_acl_post(struct inode *); 32extern int jffs2_init_acl_post(struct inode *);
33 33
34extern const struct xattr_handler jffs2_acl_access_xattr_handler; 34extern const struct xattr_handler jffs2_acl_access_xattr_handler;
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index eeead33d8ef0..bbcb9755dd2b 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -80,7 +80,7 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
80 ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE); 80 ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
81 if (ret) { 81 if (ret) {
82 jffs2_free_raw_inode(ri); 82 jffs2_free_raw_inode(ri);
83 if (S_ISLNK(inode->i_mode & S_IFMT)) 83 if (S_ISLNK(inode->i_mode))
84 kfree(mdata); 84 kfree(mdata);
85 return ret; 85 return ret;
86 } 86 }
@@ -406,7 +406,7 @@ int jffs2_remount_fs (struct super_block *sb, int *flags, char *data)
406 406
407/* jffs2_new_inode: allocate a new inode and inocache, add it to the hash, 407/* jffs2_new_inode: allocate a new inode and inocache, add it to the hash,
408 fill in the raw_inode while you're at it. */ 408 fill in the raw_inode while you're at it. */
409struct inode *jffs2_new_inode (struct inode *dir_i, mode_t mode, struct jffs2_raw_inode *ri) 409struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_raw_inode *ri)
410{ 410{
411 struct inode *inode; 411 struct inode *inode;
412 struct super_block *sb = dir_i->i_sb; 412 struct super_block *sb = dir_i->i_sb;
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index 526979c607b6..6c1755c59c0f 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -173,7 +173,7 @@ int jffs2_do_setattr (struct inode *, struct iattr *);
173struct inode *jffs2_iget(struct super_block *, unsigned long); 173struct inode *jffs2_iget(struct super_block *, unsigned long);
174void jffs2_evict_inode (struct inode *); 174void jffs2_evict_inode (struct inode *);
175void jffs2_dirty_inode(struct inode *inode, int flags); 175void jffs2_dirty_inode(struct inode *inode, int flags);
176struct inode *jffs2_new_inode (struct inode *dir_i, mode_t mode, 176struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode,
177 struct jffs2_raw_inode *ri); 177 struct jffs2_raw_inode *ri);
178int jffs2_statfs (struct dentry *, struct kstatfs *); 178int jffs2_statfs (struct dentry *, struct kstatfs *);
179int jffs2_remount_fs (struct super_block *, int *, char *); 179int jffs2_remount_fs (struct super_block *, int *, char *);
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index b3a32caf2b45..45559dc3ea2f 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -127,16 +127,14 @@ int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir)
127 return PTR_ERR(acl); 127 return PTR_ERR(acl);
128 128
129 if (acl) { 129 if (acl) {
130 mode_t mode = inode->i_mode;
131 if (S_ISDIR(inode->i_mode)) { 130 if (S_ISDIR(inode->i_mode)) {
132 rc = jfs_set_acl(tid, inode, ACL_TYPE_DEFAULT, acl); 131 rc = jfs_set_acl(tid, inode, ACL_TYPE_DEFAULT, acl);
133 if (rc) 132 if (rc)
134 goto cleanup; 133 goto cleanup;
135 } 134 }
136 rc = posix_acl_create(&acl, GFP_KERNEL, &mode); 135 rc = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode);
137 if (rc < 0) 136 if (rc < 0)
138 goto cleanup; /* posix_acl_release(NULL) is no-op */ 137 goto cleanup; /* posix_acl_release(NULL) is no-op */
139 inode->i_mode = mode;
140 if (rc > 0) 138 if (rc > 0)
141 rc = jfs_set_acl(tid, inode, ACL_TYPE_ACCESS, acl); 139 rc = jfs_set_acl(tid, inode, ACL_TYPE_ACCESS, acl);
142cleanup: 140cleanup:
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 4496872cf4e7..9cbd11a3f804 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -3161,7 +3161,7 @@ static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno,
3161{ 3161{
3162 int rc; 3162 int rc;
3163 int dbitno, word, rembits, nb, nwords, wbitno, agno; 3163 int dbitno, word, rembits, nb, nwords, wbitno, agno;
3164 s8 oldroot, *leaf; 3164 s8 oldroot;
3165 struct dmaptree *tp = (struct dmaptree *) & dp->tree; 3165 struct dmaptree *tp = (struct dmaptree *) & dp->tree;
3166 3166
3167 /* save the current value of the root (i.e. maximum free string) 3167 /* save the current value of the root (i.e. maximum free string)
@@ -3169,9 +3169,6 @@ static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno,
3169 */ 3169 */
3170 oldroot = tp->stree[ROOT]; 3170 oldroot = tp->stree[ROOT];
3171 3171
3172 /* pick up a pointer to the leaves of the dmap tree */
3173 leaf = tp->stree + LEAFIND;
3174
3175 /* determine the bit number and word within the dmap of the 3172 /* determine the bit number and word within the dmap of the
3176 * starting block. 3173 * starting block.
3177 */ 3174 */
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index f6cc0c09ec63..af9606057dde 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -1143,7 +1143,6 @@ int txCommit(tid_t tid, /* transaction identifier */
1143 struct jfs_log *log; 1143 struct jfs_log *log;
1144 struct tblock *tblk; 1144 struct tblock *tblk;
1145 struct lrd *lrd; 1145 struct lrd *lrd;
1146 int lsn;
1147 struct inode *ip; 1146 struct inode *ip;
1148 struct jfs_inode_info *jfs_ip; 1147 struct jfs_inode_info *jfs_ip;
1149 int k, n; 1148 int k, n;
@@ -1310,7 +1309,7 @@ int txCommit(tid_t tid, /* transaction identifier */
1310 */ 1309 */
1311 lrd->type = cpu_to_le16(LOG_COMMIT); 1310 lrd->type = cpu_to_le16(LOG_COMMIT);
1312 lrd->length = 0; 1311 lrd->length = 0;
1313 lsn = lmLog(log, tblk, lrd, NULL); 1312 lmLog(log, tblk, lrd, NULL);
1314 1313
1315 lmGroupCommit(log, tblk); 1314 lmGroupCommit(log, tblk);
1316 1315
@@ -2935,7 +2934,6 @@ int jfs_sync(void *arg)
2935{ 2934{
2936 struct inode *ip; 2935 struct inode *ip;
2937 struct jfs_inode_info *jfs_ip; 2936 struct jfs_inode_info *jfs_ip;
2938 int rc;
2939 tid_t tid; 2937 tid_t tid;
2940 2938
2941 do { 2939 do {
@@ -2961,7 +2959,7 @@ int jfs_sync(void *arg)
2961 */ 2959 */
2962 TXN_UNLOCK(); 2960 TXN_UNLOCK();
2963 tid = txBegin(ip->i_sb, COMMIT_INODE); 2961 tid = txBegin(ip->i_sb, COMMIT_INODE);
2964 rc = txCommit(tid, 1, &ip, 0); 2962 txCommit(tid, 1, &ip, 0);
2965 txEnd(tid); 2963 txEnd(tid);
2966 mutex_unlock(&jfs_ip->commit_mutex); 2964 mutex_unlock(&jfs_ip->commit_mutex);
2967 2965
diff --git a/fs/jfs/jfs_umount.c b/fs/jfs/jfs_umount.c
index adcf92d3b603..7971f37534a3 100644
--- a/fs/jfs/jfs_umount.c
+++ b/fs/jfs/jfs_umount.c
@@ -68,7 +68,7 @@ int jfs_umount(struct super_block *sb)
68 /* 68 /*
69 * Wait for outstanding transactions to be written to log: 69 * Wait for outstanding transactions to be written to log:
70 */ 70 */
71 jfs_flush_journal(log, 1); 71 jfs_flush_journal(log, 2);
72 72
73 /* 73 /*
74 * close fileset inode allocation map (aka fileset inode) 74 * close fileset inode allocation map (aka fileset inode)
@@ -146,7 +146,7 @@ int jfs_umount_rw(struct super_block *sb)
146 * 146 *
147 * remove file system from log active file system list. 147 * remove file system from log active file system list.
148 */ 148 */
149 jfs_flush_journal(log, 1); 149 jfs_flush_journal(log, 2);
150 150
151 /* 151 /*
152 * Make sure all metadata makes it to disk 152 * Make sure all metadata makes it to disk
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 29b1f1a21142..e17545e15664 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -893,7 +893,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
893 unchar *i_fastsymlink; 893 unchar *i_fastsymlink;
894 s64 xlen = 0; 894 s64 xlen = 0;
895 int bmask = 0, xsize; 895 int bmask = 0, xsize;
896 s64 extent = 0, xaddr; 896 s64 xaddr;
897 struct metapage *mp; 897 struct metapage *mp;
898 struct super_block *sb; 898 struct super_block *sb;
899 struct tblock *tblk; 899 struct tblock *tblk;
@@ -993,7 +993,6 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
993 txAbort(tid, 0); 993 txAbort(tid, 0);
994 goto out3; 994 goto out3;
995 } 995 }
996 extent = xaddr;
997 ip->i_size = ssize - 1; 996 ip->i_size = ssize - 1;
998 while (ssize) { 997 while (ssize) {
999 /* This is kind of silly since PATH_MAX == 4K */ 998 /* This is kind of silly since PATH_MAX == 4K */
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 24838f1eeee5..e87fedef23db 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -693,8 +693,7 @@ static int can_set_system_xattr(struct inode *inode, const char *name,
693 return rc; 693 return rc;
694 } 694 }
695 if (acl) { 695 if (acl) {
696 mode_t mode = inode->i_mode; 696 rc = posix_acl_equiv_mode(acl, &inode->i_mode);
697 rc = posix_acl_equiv_mode(acl, &mode);
698 posix_acl_release(acl); 697 posix_acl_release(acl);
699 if (rc < 0) { 698 if (rc < 0) {
700 printk(KERN_ERR 699 printk(KERN_ERR
@@ -702,7 +701,6 @@ static int can_set_system_xattr(struct inode *inode, const char *name,
702 rc); 701 rc);
703 return rc; 702 return rc;
704 } 703 }
705 inode->i_mode = mode;
706 mark_inode_dirty(inode); 704 mark_inode_dirty(inode);
707 } 705 }
708 /* 706 /*
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index e374050a911c..8392cb85bd54 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -302,7 +302,8 @@ nlmclnt_call(struct rpc_cred *cred, struct nlm_rqst *req, u32 proc)
302 /* We appear to be out of the grace period */ 302 /* We appear to be out of the grace period */
303 wake_up_all(&host->h_gracewait); 303 wake_up_all(&host->h_gracewait);
304 } 304 }
305 dprintk("lockd: server returns status %d\n", resp->status); 305 dprintk("lockd: server returns status %d\n",
306 ntohl(resp->status));
306 return 0; /* Okay, call complete */ 307 return 0; /* Okay, call complete */
307 } 308 }
308 309
@@ -690,7 +691,8 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl)
690 goto out; 691 goto out;
691 692
692 if (resp->status != nlm_lck_denied_nolocks) 693 if (resp->status != nlm_lck_denied_nolocks)
693 printk("lockd: unexpected unlock status: %d\n", resp->status); 694 printk("lockd: unexpected unlock status: %d\n",
695 ntohl(resp->status));
694 /* What to do now? I'm out of my depth... */ 696 /* What to do now? I'm out of my depth... */
695 status = -ENOLCK; 697 status = -ENOLCK;
696out: 698out:
@@ -843,6 +845,7 @@ nlm_stat_to_errno(__be32 status)
843 return -ENOLCK; 845 return -ENOLCK;
844#endif 846#endif
845 } 847 }
846 printk(KERN_NOTICE "lockd: unexpected server status %d\n", status); 848 printk(KERN_NOTICE "lockd: unexpected server status %d\n",
849 ntohl(status));
847 return -ENOLCK; 850 return -ENOLCK;
848} 851}
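All three lockd hunks fix the same cosmetic bug: NLM status codes travel as
big-endian __be32, so printing them raw yields byte-swapped numbers on
little-endian machines. The conversion belongs at the print site:

	/* resp->status is wire format (__be32); convert for readable logs. */
	dprintk("lockd: server returns status %d\n", ntohl(resp->status));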
diff --git a/fs/namei.c b/fs/namei.c
index f8c69d373793..f4788365ea22 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -179,19 +179,14 @@ static int check_acl(struct inode *inode, int mask)
179#ifdef CONFIG_FS_POSIX_ACL 179#ifdef CONFIG_FS_POSIX_ACL
180 struct posix_acl *acl; 180 struct posix_acl *acl;
181 181
182 /*
183 * Under RCU walk, we cannot even do a "get_cached_acl()",
184 * because that involves locking and getting a refcount on
185 * a cached ACL.
186 *
187 * So the only case we handle during RCU walking is the
188 * case of a cached "no ACL at all", which needs no locks
189 * or refcounts.
190 */
191 if (mask & MAY_NOT_BLOCK) { 182 if (mask & MAY_NOT_BLOCK) {
192 if (negative_cached_acl(inode, ACL_TYPE_ACCESS)) 183 acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
184 if (!acl)
193 return -EAGAIN; 185 return -EAGAIN;
194 return -ECHILD; 186 /* no ->get_acl() calls in RCU mode... */
187 if (acl == ACL_NOT_CACHED)
188 return -ECHILD;
189 return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK);
195 } 190 }
196 191
197 acl = get_cached_acl(inode, ACL_TYPE_ACCESS); 192 acl = get_cached_acl(inode, ACL_TYPE_ACCESS);
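The rewritten RCU branch relies on get_cached_acl_rcu() distinguishing three
cases, none of which may block during RCU walk. Annotated (the sentinel is
the VFS's ACL_NOT_CACHED value):

	acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
	if (!acl)			/* cached "no ACL": decide from mode bits */
		return -EAGAIN;
	if (acl == ACL_NOT_CACHED)	/* unknown: would need ->get_acl(),
					 * so drop out of RCU walk */
		return -ECHILD;
	/* real ACL, safe to consult locklessly */
	return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK);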
@@ -313,6 +308,26 @@ int generic_permission(struct inode *inode, int mask)
313 return -EACCES; 308 return -EACCES;
314} 309}
315 310
311/*
312 * We _really_ want to just do "generic_permission()" without
313 * even looking at the inode->i_op values. So we keep a cache
314 * flag in inode->i_opflags that says "this has no special
315 * permission function, use the fast case".
316 */
317static inline int do_inode_permission(struct inode *inode, int mask)
318{
319 if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) {
320 if (likely(inode->i_op->permission))
321 return inode->i_op->permission(inode, mask);
322
323 /* This gets set once for the inode lifetime */
324 spin_lock(&inode->i_lock);
325 inode->i_opflags |= IOP_FASTPERM;
326 spin_unlock(&inode->i_lock);
327 }
328 return generic_permission(inode, mask);
329}
330
316/** 331/**
317 * inode_permission - check for access rights to a given inode 332 * inode_permission - check for access rights to a given inode
318 * @inode: inode to check permission on 333 * @inode: inode to check permission on
@@ -327,7 +342,7 @@ int inode_permission(struct inode *inode, int mask)
327{ 342{
328 int retval; 343 int retval;
329 344
330 if (mask & MAY_WRITE) { 345 if (unlikely(mask & MAY_WRITE)) {
331 umode_t mode = inode->i_mode; 346 umode_t mode = inode->i_mode;
332 347
333 /* 348 /*
@@ -344,11 +359,7 @@ int inode_permission(struct inode *inode, int mask)
344 return -EACCES; 359 return -EACCES;
345 } 360 }
346 361
347 if (inode->i_op->permission) 362 retval = do_inode_permission(inode, mask);
348 retval = inode->i_op->permission(inode, mask);
349 else
350 retval = generic_permission(inode, mask);
351
352 if (retval) 363 if (retval)
353 return retval; 364 return retval;
354 365
@@ -716,17 +727,20 @@ static int follow_automount(struct path *path, unsigned flags,
716 if ((flags & LOOKUP_NO_AUTOMOUNT) && !(flags & LOOKUP_PARENT)) 727 if ((flags & LOOKUP_NO_AUTOMOUNT) && !(flags & LOOKUP_PARENT))
717 return -EISDIR; /* we actually want to stop here */ 728 return -EISDIR; /* we actually want to stop here */
718 729
719 /* We want to mount if someone is trying to open/create a file of any 730 /* We don't want to mount if someone's just doing a stat -
720 * type under the mountpoint, wants to traverse through the mountpoint 731 * unless they're stat'ing a directory and appended a '/' to
721 * or wants to open the mounted directory. 732 * the name.
722 * 733 *
723 * We don't want to mount if someone's just doing a stat and they've 734 * We do, however, want to mount if someone wants to open or
724 * set AT_SYMLINK_NOFOLLOW - unless they're stat'ing a directory and 735 * create a file of any type under the mountpoint, wants to
725 * appended a '/' to the name. 736 * traverse through the mountpoint or wants to open the
737 * mounted directory. Also, autofs may mark negative dentries
738 * as being automount points. These will need the attentions
739 * of the daemon to instantiate them before they can be used.
726 */ 740 */
727 if (!(flags & LOOKUP_FOLLOW) && 741 if (!(flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
728 !(flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | 742 LOOKUP_OPEN | LOOKUP_CREATE)) &&
729 LOOKUP_OPEN | LOOKUP_CREATE))) 743 path->dentry->d_inode)
730 return -EISDIR; 744 return -EISDIR;
731 745
732 current->total_link_count++; 746 current->total_link_count++;
@@ -1244,6 +1258,26 @@ static void terminate_walk(struct nameidata *nd)
1244 } 1258 }
1245} 1259}
1246 1260
1261/*
1262 * Do we need to follow links? We _really_ want to be able
1263 * to do this check without having to look at inode->i_op,
1264 * so we keep a cache of "no, this doesn't need follow_link"
1265 * for the common case.
1266 */
1267static inline int should_follow_link(struct inode *inode, int follow)
1268{
1269 if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) {
1270 if (likely(inode->i_op->follow_link))
1271 return follow;
1272
1273 /* This gets set once for the inode lifetime */
1274 spin_lock(&inode->i_lock);
1275 inode->i_opflags |= IOP_NOFOLLOW;
1276 spin_unlock(&inode->i_lock);
1277 }
1278 return 0;
1279}
1280
1247static inline int walk_component(struct nameidata *nd, struct path *path, 1281static inline int walk_component(struct nameidata *nd, struct path *path,
1248 struct qstr *name, int type, int follow) 1282 struct qstr *name, int type, int follow)
1249{ 1283{
@@ -1266,7 +1300,7 @@ static inline int walk_component(struct nameidata *nd, struct path *path,
1266 terminate_walk(nd); 1300 terminate_walk(nd);
1267 return -ENOENT; 1301 return -ENOENT;
1268 } 1302 }
1269 if (unlikely(inode->i_op->follow_link) && follow) { 1303 if (should_follow_link(inode, follow)) {
1270 if (nd->flags & LOOKUP_RCU) { 1304 if (nd->flags & LOOKUP_RCU) {
1271 if (unlikely(unlazy_walk(nd, path->dentry))) { 1305 if (unlikely(unlazy_walk(nd, path->dentry))) {
1272 terminate_walk(nd); 1306 terminate_walk(nd);
@@ -1319,6 +1353,26 @@ static inline int nested_symlink(struct path *path, struct nameidata *nd)
1319} 1353}
1320 1354
1321/* 1355/*
1356 * We really don't want to look at inode->i_op->lookup
1357 * when we don't have to. So we keep a cache bit in
1358 * the inode ->i_opflags field that says "yes, we can
1359 * do lookup on this inode".
1360 */
1361static inline int can_lookup(struct inode *inode)
1362{
1363 if (likely(inode->i_opflags & IOP_LOOKUP))
1364 return 1;
1365 if (likely(!inode->i_op->lookup))
1366 return 0;
1367
1368 /* We do this once for the lifetime of the inode */
1369 spin_lock(&inode->i_lock);
1370 inode->i_opflags |= IOP_LOOKUP;
1371 spin_unlock(&inode->i_lock);
1372 return 1;
1373}
1374
1375/*
1322 * Name resolution. 1376 * Name resolution.
1323 * This is the basic name resolution function, turning a pathname into 1377 * This is the basic name resolution function, turning a pathname into
1324 * the final dentry. We expect 'base' to be positive and a directory. 1378 * the final dentry. We expect 'base' to be positive and a directory.
@@ -1397,10 +1451,10 @@ static int link_path_walk(const char *name, struct nameidata *nd)
1397 if (err) 1451 if (err)
1398 return err; 1452 return err;
1399 } 1453 }
1454 if (can_lookup(nd->inode))
1455 continue;
1400 err = -ENOTDIR; 1456 err = -ENOTDIR;
1401 if (!nd->inode->i_op->lookup) 1457 break;
1402 break;
1403 continue;
1404 /* here ends the main loop */ 1458 /* here ends the main loop */
1405 1459
1406last_component: 1460last_component:
@@ -2562,6 +2616,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
2562 if (!dir->i_op->rmdir) 2616 if (!dir->i_op->rmdir)
2563 return -EPERM; 2617 return -EPERM;
2564 2618
2619 dget(dentry);
2565 mutex_lock(&dentry->d_inode->i_mutex); 2620 mutex_lock(&dentry->d_inode->i_mutex);
2566 2621
2567 error = -EBUSY; 2622 error = -EBUSY;
@@ -2582,6 +2637,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
2582 2637
2583out: 2638out:
2584 mutex_unlock(&dentry->d_inode->i_mutex); 2639 mutex_unlock(&dentry->d_inode->i_mutex);
2640 dput(dentry);
2585 if (!error) 2641 if (!error)
2586 d_delete(dentry); 2642 d_delete(dentry);
2587 return error; 2643 return error;
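vfs_rmdir() (and vfs_rename_dir() below) now pin the dentry before taking
its inode's i_mutex, so a concurrent final dput() cannot free the dentry
while it is locked. The bracketing pattern:

	dget(dentry);				/* pin across the lock */
	mutex_lock(&dentry->d_inode->i_mutex);
	/* ... ->rmdir() and unhashing work ... */
	mutex_unlock(&dentry->d_inode->i_mutex);
	dput(dentry);				/* safe: mutex released */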
@@ -2971,6 +3027,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
2971 if (error) 3027 if (error)
2972 return error; 3028 return error;
2973 3029
3030 dget(new_dentry);
2974 if (target) 3031 if (target)
2975 mutex_lock(&target->i_mutex); 3032 mutex_lock(&target->i_mutex);
2976 3033
@@ -2991,6 +3048,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
2991out: 3048out:
2992 if (target) 3049 if (target)
2993 mutex_unlock(&target->i_mutex); 3050 mutex_unlock(&target->i_mutex);
3051 dput(new_dentry);
2994 if (!error) 3052 if (!error)
2995 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) 3053 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
2996 d_move(old_dentry,new_dentry); 3054 d_move(old_dentry,new_dentry);
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 81515545ba75..dbcd82126aed 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -77,6 +77,7 @@ config NFS_V4
77config NFS_V4_1 77config NFS_V4_1
78 bool "NFS client support for NFSv4.1 (EXPERIMENTAL)" 78 bool "NFS client support for NFSv4.1 (EXPERIMENTAL)"
79 depends on NFS_FS && NFS_V4 && EXPERIMENTAL 79 depends on NFS_FS && NFS_V4 && EXPERIMENTAL
80 select SUNRPC_BACKCHANNEL
80 select PNFS_FILE_LAYOUT 81 select PNFS_FILE_LAYOUT
81 help 82 help
82 This option enables support for minor version 1 of the NFSv4 protocol 83 This option enables support for minor version 1 of the NFSv4 protocol
@@ -87,15 +88,15 @@ config NFS_V4_1
87config PNFS_FILE_LAYOUT 88config PNFS_FILE_LAYOUT
88 tristate 89 tristate
89 90
91config PNFS_BLOCK
92 tristate
93 depends on NFS_FS && NFS_V4_1 && BLK_DEV_DM
94 default m
95
90config PNFS_OBJLAYOUT 96config PNFS_OBJLAYOUT
91 tristate "Provide support for the pNFS Objects Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)" 97 tristate
92 depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD 98 depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD
93 help 99 default m
94 Say M here if you want your pNFS client to support the Objects Layout Driver.
95 Requires the SCSI osd initiator library (SCSI_OSD_INITIATOR) and
96 upper level driver (SCSI_OSD_ULD).
97
98 If unsure, say N.
99 100
100config ROOT_NFS 101config ROOT_NFS
101 bool "Root file system on NFS" 102 bool "Root file system on NFS"
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 6a34f7dd0e6f..b58613d0abb3 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -23,3 +23,4 @@ obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o
23nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o 23nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o
24 24
25obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/ 25obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/
26obj-$(CONFIG_PNFS_BLOCK) += blocklayout/
diff --git a/fs/nfs/blocklayout/Makefile b/fs/nfs/blocklayout/Makefile
new file mode 100644
index 000000000000..d5815505c020
--- /dev/null
+++ b/fs/nfs/blocklayout/Makefile
@@ -0,0 +1,5 @@
1#
2# Makefile for the pNFS block layout driver kernel module
3#
4obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o
5blocklayoutdriver-objs := blocklayout.o extents.o blocklayoutdev.o blocklayoutdm.o
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
new file mode 100644
index 000000000000..9561c8fc8bdb
--- /dev/null
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -0,0 +1,1020 @@
1/*
2 * linux/fs/nfs/blocklayout/blocklayout.c
3 *
4 * Module for the NFSv4.1 pNFS block layout driver.
5 *
6 * Copyright (c) 2006 The Regents of the University of Michigan.
7 * All rights reserved.
8 *
9 * Andy Adamson <andros@citi.umich.edu>
10 * Fred Isaman <iisaman@umich.edu>
11 *
12 * permission is granted to use, copy, create derivative works and
13 * redistribute this software and such derivative works for any purpose,
14 * so long as the name of the university of michigan is not used in
15 * any advertising or publicity pertaining to the use or distribution
16 * of this software without specific, written prior authorization. if
17 * the above copyright notice or any other identification of the
18 * university of michigan is included in any copy of any portion of
19 * this software, then the disclaimer below must also be included.
20 *
21 * this software is provided as is, without representation from the
22 * university of michigan as to its fitness for any purpose, and without
23 * warranty by the university of michigan of any kind, either express
24 * or implied, including without limitation the implied warranties of
25 * merchantability and fitness for a particular purpose. the regents
26 * of the university of michigan shall not be liable for any damages,
27 * including special, indirect, incidental, or consequential damages,
28 * with respect to any claim arising out or in connection with the use
29 * of the software, even if it has been or is hereafter advised of the
30 * possibility of such damages.
31 */
32
33#include <linux/module.h>
34#include <linux/init.h>
35#include <linux/mount.h>
36#include <linux/namei.h>
37#include <linux/bio.h> /* struct bio */
38#include <linux/buffer_head.h> /* various write calls */
39#include <linux/prefetch.h>
40
41#include "blocklayout.h"
42
43#define NFSDBG_FACILITY NFSDBG_PNFS_LD
44
45MODULE_LICENSE("GPL");
46MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>");
47MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver");
48
49struct dentry *bl_device_pipe;
50wait_queue_head_t bl_wq;
51
52static void print_page(struct page *page)
53{
54 dprintk("PRINTPAGE page %p\n", page);
55 dprintk(" PagePrivate %d\n", PagePrivate(page));
56 dprintk(" PageUptodate %d\n", PageUptodate(page));
57 dprintk(" PageError %d\n", PageError(page));
58 dprintk(" PageDirty %d\n", PageDirty(page));
59 dprintk(" PageReferenced %d\n", PageReferenced(page));
60 dprintk(" PageLocked %d\n", PageLocked(page));
61 dprintk(" PageWriteback %d\n", PageWriteback(page));
62 dprintk(" PageMappedToDisk %d\n", PageMappedToDisk(page));
63 dprintk("\n");
64}
65
66/* Given the be associated with isect, determine if page data needs to be
67 * initialized.
68 */
69static int is_hole(struct pnfs_block_extent *be, sector_t isect)
70{
71 if (be->be_state == PNFS_BLOCK_NONE_DATA)
72 return 1;
73 else if (be->be_state != PNFS_BLOCK_INVALID_DATA)
74 return 0;
75 else
76 return !bl_is_sector_init(be->be_inval, isect);
77}
78
79/* Given the be associated with isect, determine if page data can be
80 * written to disk.
81 */
82static int is_writable(struct pnfs_block_extent *be, sector_t isect)
83{
84 return (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
85 be->be_state == PNFS_BLOCK_INVALID_DATA);
86}
87
88/* The data we are handed might be spread across several bios. We need
89 * to track when the last one is finished.
90 */
91struct parallel_io {
92 struct kref refcnt;
93 struct rpc_call_ops call_ops;
94 void (*pnfs_callback) (void *data);
95 void *data;
96};
97
98static inline struct parallel_io *alloc_parallel(void *data)
99{
100 struct parallel_io *rv;
101
102 rv = kmalloc(sizeof(*rv), GFP_NOFS);
103 if (rv) {
104 rv->data = data;
105 kref_init(&rv->refcnt);
106 }
107 return rv;
108}
109
110static inline void get_parallel(struct parallel_io *p)
111{
112 kref_get(&p->refcnt);
113}
114
115static void destroy_parallel(struct kref *kref)
116{
117 struct parallel_io *p = container_of(kref, struct parallel_io, refcnt);
118
119 dprintk("%s enter\n", __func__);
120 p->pnfs_callback(p->data);
121 kfree(p);
122}
123
124static inline void put_parallel(struct parallel_io *p)
125{
126 kref_put(&p->refcnt, destroy_parallel);
127}
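parallel_io is a kref-based completion tracker: every submitted bio holds a
reference, and the pNFS callback runs exactly once, from destroy_parallel(),
when the final reference is put. Typical use by a pagelist routine (a sketch
using the helpers above):

	struct parallel_io *par = alloc_parallel(rdata);

	par->pnfs_callback = bl_end_par_io_read;
	/* bl_submit_bio() does get_parallel(par) per bio before submit_bio();
	 * each bio's end_io does put_parallel(par) on completion */
	put_parallel(par);	/* drop the submitter's reference; the last
				 * put, wherever it happens, fires the callback */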
128
129static struct bio *
130bl_submit_bio(int rw, struct bio *bio)
131{
132 if (bio) {
133 get_parallel(bio->bi_private);
134 dprintk("%s submitting %s bio %u@%llu\n", __func__,
135 rw == READ ? "read" : "write",
136 bio->bi_size, (unsigned long long)bio->bi_sector);
137 submit_bio(rw, bio);
138 }
139 return NULL;
140}
141
142static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
143 struct pnfs_block_extent *be,
144 void (*end_io)(struct bio *, int err),
145 struct parallel_io *par)
146{
147 struct bio *bio;
148
149 bio = bio_alloc(GFP_NOIO, npg);
150 if (!bio)
151 return NULL;
152
153 bio->bi_sector = isect - be->be_f_offset + be->be_v_offset;
154 bio->bi_bdev = be->be_mdev;
155 bio->bi_end_io = end_io;
156 bio->bi_private = par;
157 return bio;
158}
159
160static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw,
161 sector_t isect, struct page *page,
162 struct pnfs_block_extent *be,
163 void (*end_io)(struct bio *, int err),
164 struct parallel_io *par)
165{
166retry:
167 if (!bio) {
168 bio = bl_alloc_init_bio(npg, isect, be, end_io, par);
169 if (!bio)
170 return ERR_PTR(-ENOMEM);
171 }
172 if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
173 bio = bl_submit_bio(rw, bio);
174 goto retry;
175 }
176 return bio;
177}
178
179static void bl_set_lo_fail(struct pnfs_layout_segment *lseg)
180{
181 if (lseg->pls_range.iomode == IOMODE_RW) {
182 dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__);
183 set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
184 } else {
185 dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__);
186 set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
187 }
188}
189
190/* This is basically copied from mpage_end_io_read */
191static void bl_end_io_read(struct bio *bio, int err)
192{
193 struct parallel_io *par = bio->bi_private;
194 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
195 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
196 struct nfs_read_data *rdata = (struct nfs_read_data *)par->data;
197
198 do {
199 struct page *page = bvec->bv_page;
200
201 if (--bvec >= bio->bi_io_vec)
202 prefetchw(&bvec->bv_page->flags);
203 if (uptodate)
204 SetPageUptodate(page);
205 } while (bvec >= bio->bi_io_vec);
206 if (!uptodate) {
207 if (!rdata->pnfs_error)
208 rdata->pnfs_error = -EIO;
209 bl_set_lo_fail(rdata->lseg);
210 }
211 bio_put(bio);
212 put_parallel(par);
213}
214
215static void bl_read_cleanup(struct work_struct *work)
216{
217 struct rpc_task *task;
218 struct nfs_read_data *rdata;
219 dprintk("%s enter\n", __func__);
220 task = container_of(work, struct rpc_task, u.tk_work);
221 rdata = container_of(task, struct nfs_read_data, task);
222 pnfs_ld_read_done(rdata);
223}
224
225static void
226bl_end_par_io_read(void *data)
227{
228 struct nfs_read_data *rdata = data;
229
230 INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup);
231 schedule_work(&rdata->task.u.tk_work);
232}
233
234/* We don't want normal .rpc_call_done callback used, so we replace it
235 * with this stub.
236 */
237static void bl_rpc_do_nothing(struct rpc_task *task, void *calldata)
238{
239 return;
240}
241
242static enum pnfs_try_status
243bl_read_pagelist(struct nfs_read_data *rdata)
244{
245 int i, hole;
246 struct bio *bio = NULL;
247 struct pnfs_block_extent *be = NULL, *cow_read = NULL;
248 sector_t isect, extent_length = 0;
249 struct parallel_io *par;
250 loff_t f_offset = rdata->args.offset;
251 size_t count = rdata->args.count;
252 struct page **pages = rdata->args.pages;
253 int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT;
254
255 dprintk("%s enter nr_pages %u offset %lld count %Zd\n", __func__,
256 rdata->npages, f_offset, count);
257
258 par = alloc_parallel(rdata);
259 if (!par)
260 goto use_mds;
261 par->call_ops = *rdata->mds_ops;
262 par->call_ops.rpc_call_done = bl_rpc_do_nothing;
263 par->pnfs_callback = bl_end_par_io_read;
264 /* At this point, we can no longer jump to use_mds */
265
266 isect = (sector_t) (f_offset >> SECTOR_SHIFT);
267 /* Code assumes extents are page-aligned */
268 for (i = pg_index; i < rdata->npages; i++) {
269 if (!extent_length) {
270 /* We've used up the previous extent */
271 bl_put_extent(be);
272 bl_put_extent(cow_read);
273 bio = bl_submit_bio(READ, bio);
274 /* Get the next one */
275 be = bl_find_get_extent(BLK_LSEG2EXT(rdata->lseg),
276 isect, &cow_read);
277 if (!be) {
278 rdata->pnfs_error = -EIO;
279 goto out;
280 }
281 extent_length = be->be_length -
282 (isect - be->be_f_offset);
283 if (cow_read) {
284 sector_t cow_length = cow_read->be_length -
285 (isect - cow_read->be_f_offset);
286 extent_length = min(extent_length, cow_length);
287 }
288 }
289 hole = is_hole(be, isect);
290 if (hole && !cow_read) {
291 bio = bl_submit_bio(READ, bio);
292 /* Fill hole w/ zeroes w/o accessing device */
293 dprintk("%s Zeroing page for hole\n", __func__);
294 zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
295 print_page(pages[i]);
296 SetPageUptodate(pages[i]);
297 } else {
298 struct pnfs_block_extent *be_read;
299
300 be_read = (hole && cow_read) ? cow_read : be;
301 bio = bl_add_page_to_bio(bio, rdata->npages - i, READ,
302 isect, pages[i], be_read,
303 bl_end_io_read, par);
304 if (IS_ERR(bio)) {
305 rdata->pnfs_error = PTR_ERR(bio);
306 goto out;
307 }
308 }
309 isect += PAGE_CACHE_SECTORS;
310 extent_length -= PAGE_CACHE_SECTORS;
311 }
312 if ((isect << SECTOR_SHIFT) >= rdata->inode->i_size) {
313 rdata->res.eof = 1;
314 rdata->res.count = rdata->inode->i_size - f_offset;
315 } else {
316 rdata->res.count = (isect << SECTOR_SHIFT) - f_offset;
317 }
318out:
319 bl_put_extent(be);
320 bl_put_extent(cow_read);
321 bl_submit_bio(READ, bio);
322 put_parallel(par);
323 return PNFS_ATTEMPTED;
324
325 use_mds:
326 dprintk("Giving up and using normal NFS\n");
327 return PNFS_NOT_ATTEMPTED;
328}
329
330static void mark_extents_written(struct pnfs_block_layout *bl,
331 __u64 offset, __u32 count)
332{
333 sector_t isect, end;
334 struct pnfs_block_extent *be;
335
336 dprintk("%s(%llu, %u)\n", __func__, offset, count);
337 if (count == 0)
338 return;
339 isect = (offset & (long)(PAGE_CACHE_MASK)) >> SECTOR_SHIFT;
340 end = (offset + count + PAGE_CACHE_SIZE - 1) & (long)(PAGE_CACHE_MASK);
341 end >>= SECTOR_SHIFT;
342 while (isect < end) {
343 sector_t len;
344 be = bl_find_get_extent(bl, isect, NULL);
345 BUG_ON(!be); /* FIXME */
346 len = min(end, be->be_f_offset + be->be_length) - isect;
347 if (be->be_state == PNFS_BLOCK_INVALID_DATA)
348 bl_mark_for_commit(be, isect, len); /* What if fails? */
349 isect += len;
350 bl_put_extent(be);
351 }
352}
353
354static void bl_end_io_write_zero(struct bio *bio, int err)
355{
356 struct parallel_io *par = bio->bi_private;
357 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
358 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
359 struct nfs_write_data *wdata = (struct nfs_write_data *)par->data;
360
361 do {
362 struct page *page = bvec->bv_page;
363
364 if (--bvec >= bio->bi_io_vec)
365 prefetchw(&bvec->bv_page->flags);
366 /* This is the zeroing page we added */
367 end_page_writeback(page);
368 page_cache_release(page);
369 } while (bvec >= bio->bi_io_vec);
370 if (!uptodate) {
371 if (!wdata->pnfs_error)
372 wdata->pnfs_error = -EIO;
373 bl_set_lo_fail(wdata->lseg);
374 }
375 bio_put(bio);
376 put_parallel(par);
377}
378
379/* This is basically copied from mpage_end_io_read */
380static void bl_end_io_write(struct bio *bio, int err)
381{
382 struct parallel_io *par = bio->bi_private;
383 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
384 struct nfs_write_data *wdata = (struct nfs_write_data *)par->data;
385
386 if (!uptodate) {
387 if (!wdata->pnfs_error)
388 wdata->pnfs_error = -EIO;
389 bl_set_lo_fail(wdata->lseg);
390 }
391 bio_put(bio);
392 put_parallel(par);
393}
394
395/* Function scheduled for call during bl_end_par_io_write,
396 * it marks sectors as written and extends the commitlist.
397 */
398static void bl_write_cleanup(struct work_struct *work)
399{
400 struct rpc_task *task;
401 struct nfs_write_data *wdata;
402 dprintk("%s enter\n", __func__);
403 task = container_of(work, struct rpc_task, u.tk_work);
404 wdata = container_of(task, struct nfs_write_data, task);
405 if (!wdata->pnfs_error) {
406 /* Marks for LAYOUTCOMMIT */
407 mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
408 wdata->args.offset, wdata->args.count);
409 }
410 pnfs_ld_write_done(wdata);
411}
412
413/* Called when last of bios associated with a bl_write_pagelist call finishes */
414static void bl_end_par_io_write(void *data)
415{
416 struct nfs_write_data *wdata = data;
417
418 wdata->task.tk_status = 0;
419 wdata->verf.committed = NFS_FILE_SYNC;
420 INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup);
421 schedule_work(&wdata->task.u.tk_work);
422}
423
424/* FIXME STUB - mark intersection of layout and page as bad, so is not
425 * used again.
426 */
427static void mark_bad_read(void)
428{
429 return;
430}
431
432/*
433 * map_block: map a requested I/O block (isect) into an offset in the LVM
434 * block_device
435 */
436static void
437map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be)
438{
439 dprintk("%s enter be=%p\n", __func__, be);
440
441 set_buffer_mapped(bh);
442 bh->b_bdev = be->be_mdev;
443 bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >>
444 (be->be_mdev->bd_inode->i_blkbits - SECTOR_SHIFT);
445
446 dprintk("%s isect %llu, bh->b_blocknr %ld, using bsize %Zd\n",
447 __func__, (unsigned long long)isect, (long)bh->b_blocknr,
448 bh->b_size);
449 return;
450}
451
452/* Given an unmapped page, zero it or read in page for COW, page is locked
453 * by caller.
454 */
455static int
456init_page_for_write(struct page *page, struct pnfs_block_extent *cow_read)
457{
458 struct buffer_head *bh = NULL;
459 int ret = 0;
460 sector_t isect;
461
462 dprintk("%s enter, %p\n", __func__, page);
463 BUG_ON(PageUptodate(page));
464 if (!cow_read) {
465 zero_user_segment(page, 0, PAGE_SIZE);
466 SetPageUptodate(page);
467 goto cleanup;
468 }
469
470 bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0);
471 if (!bh) {
472 ret = -ENOMEM;
473 goto cleanup;
474 }
475
476 isect = (sector_t) page->index << PAGE_CACHE_SECTOR_SHIFT;
477 map_block(bh, isect, cow_read);
478 if (!bh_uptodate_or_lock(bh))
479 ret = bh_submit_read(bh);
480 if (ret)
481 goto cleanup;
482 SetPageUptodate(page);
483
484cleanup:
485 bl_put_extent(cow_read);
486 if (bh)
487 free_buffer_head(bh);
488 if (ret) {
489 /* Need to mark layout with bad read...should now
490 * just use nfs4 for reads and writes.
491 */
492 mark_bad_read();
493 }
494 return ret;
495}
496
497static enum pnfs_try_status
498bl_write_pagelist(struct nfs_write_data *wdata, int sync)
499{
500 int i, ret, npg_zero, pg_index, last = 0;
501 struct bio *bio = NULL;
502 struct pnfs_block_extent *be = NULL, *cow_read = NULL;
503 sector_t isect, last_isect = 0, extent_length = 0;
504 struct parallel_io *par;
505 loff_t offset = wdata->args.offset;
506 size_t count = wdata->args.count;
507 struct page **pages = wdata->args.pages;
508 struct page *page;
509 pgoff_t index;
510 u64 temp;
511 int npg_per_block =
512 NFS_SERVER(wdata->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT;
513
514 dprintk("%s enter, %Zu@%lld\n", __func__, count, offset);
515 /* At this point, wdata->pages is a (sequential) list of nfs_pages.
516 * We want to write each, and if there is an error set pnfs_error
517 * to have it redone using nfs.
518 */
519 par = alloc_parallel(wdata);
520 if (!par)
521 return PNFS_NOT_ATTEMPTED;
522 par->call_ops = *wdata->mds_ops;
523 par->call_ops.rpc_call_done = bl_rpc_do_nothing;
524 par->pnfs_callback = bl_end_par_io_write;
525 /* At this point, have to be more careful with error handling */
526
527 isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
528 be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), isect, &cow_read);
529 if (!be || !is_writable(be, isect)) {
530 dprintk("%s no matching extents!\n", __func__);
531 wdata->pnfs_error = -EINVAL;
532 goto out;
533 }
534
535 /* First page inside INVALID extent */
536 if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
537 temp = offset >> PAGE_CACHE_SHIFT;
538 npg_zero = do_div(temp, npg_per_block);
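		/* do_div() divides temp in place and returns the remainder,
		 * so npg_zero is how many pages the write offset sits past
		 * the previous server-block boundary; those leading pages
		 * may need zeroing.
		 */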
539 isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) &
540 (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
541 extent_length = be->be_length - (isect - be->be_f_offset);
542
543fill_invalid_ext:
544 dprintk("%s need to zero %d pages\n", __func__, npg_zero);
545		for (; npg_zero > 0; npg_zero--) {
546 /* page ref released in bl_end_io_write_zero */
547 index = isect >> PAGE_CACHE_SECTOR_SHIFT;
548 dprintk("%s zero %dth page: index %lu isect %llu\n",
549 __func__, npg_zero, index,
550 (unsigned long long)isect);
551 page =
552 find_or_create_page(wdata->inode->i_mapping, index,
553 GFP_NOFS);
554 if (!page) {
555 dprintk("%s oom\n", __func__);
556 wdata->pnfs_error = -ENOMEM;
557 goto out;
558 }
559
560			/* PageDirty: someone else will write this out
561			 * PageWriteback: someone else is writing this out
562			 * PageUptodate: it was read before
563			 * sector_initialized: already written out
564			 */
565 if (PageDirty(page) || PageWriteback(page) ||
566 bl_is_sector_init(be->be_inval, isect)) {
567 print_page(page);
568 unlock_page(page);
569 page_cache_release(page);
570 goto next_page;
571 }
572 if (!PageUptodate(page)) {
573				/* New page: read it in or zero it */
574 init_page_for_write(page, cow_read);
575 }
576 set_page_writeback(page);
577 unlock_page(page);
578
579 ret = bl_mark_sectors_init(be->be_inval, isect,
580 PAGE_CACHE_SECTORS,
581 NULL);
582 if (unlikely(ret)) {
583 dprintk("%s bl_mark_sectors_init fail %d\n",
584 __func__, ret);
585 end_page_writeback(page);
586 page_cache_release(page);
587 wdata->pnfs_error = ret;
588 goto out;
589 }
590 bio = bl_add_page_to_bio(bio, npg_zero, WRITE,
591 isect, page, be,
592 bl_end_io_write_zero, par);
593 if (IS_ERR(bio)) {
594 wdata->pnfs_error = PTR_ERR(bio);
595 goto out;
596 }
597 /* FIXME: This should be done in bi_end_io */
598 mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
599 page->index << PAGE_CACHE_SHIFT,
600 PAGE_CACHE_SIZE);
601next_page:
602 isect += PAGE_CACHE_SECTORS;
603 extent_length -= PAGE_CACHE_SECTORS;
604 }
605 if (last)
606 goto write_done;
607 }
608 bio = bl_submit_bio(WRITE, bio);
609
610 /* Middle pages */
611 pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT;
612 for (i = pg_index; i < wdata->npages; i++) {
613 if (!extent_length) {
614 /* We've used up the previous extent */
615 bl_put_extent(be);
616 bio = bl_submit_bio(WRITE, bio);
617 /* Get the next one */
618 be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg),
619 isect, NULL);
620 if (!be || !is_writable(be, isect)) {
621 wdata->pnfs_error = -EINVAL;
622 goto out;
623 }
624 extent_length = be->be_length -
625 (isect - be->be_f_offset);
626 }
627 if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
628 ret = bl_mark_sectors_init(be->be_inval, isect,
629 PAGE_CACHE_SECTORS,
630 NULL);
631 if (unlikely(ret)) {
632 dprintk("%s bl_mark_sectors_init fail %d\n",
633 __func__, ret);
634 wdata->pnfs_error = ret;
635 goto out;
636 }
637 }
638 bio = bl_add_page_to_bio(bio, wdata->npages - i, WRITE,
639 isect, pages[i], be,
640 bl_end_io_write, par);
641 if (IS_ERR(bio)) {
642 wdata->pnfs_error = PTR_ERR(bio);
643 goto out;
644 }
645 isect += PAGE_CACHE_SECTORS;
646 last_isect = isect;
647 extent_length -= PAGE_CACHE_SECTORS;
648 }
649
650 /* Last page inside INVALID extent */
651 if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
652 bio = bl_submit_bio(WRITE, bio);
653 temp = last_isect >> PAGE_CACHE_SECTOR_SHIFT;
654 npg_zero = npg_per_block - do_div(temp, npg_per_block);
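		/* The do_div() remainder counts pages already covered in this
		 * block, so the difference is the number of trailing pages to
		 * zero; if the write ended exactly on a block boundary the
		 * difference equals npg_per_block and nothing is zeroed.
		 */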
655 if (npg_zero < npg_per_block) {
656 last = 1;
657 goto fill_invalid_ext;
658 }
659 }
660
661write_done:
662 wdata->res.count = (last_isect << SECTOR_SHIFT) - (offset);
663 if (count < wdata->res.count) {
664 wdata->res.count = count;
665 }
666out:
667 bl_put_extent(be);
668 bl_submit_bio(WRITE, bio);
669 put_parallel(par);
670 return PNFS_ATTEMPTED;
671}
672
673/* FIXME - range ignored */
674static void
675release_extents(struct pnfs_block_layout *bl, struct pnfs_layout_range *range)
676{
677 int i;
678 struct pnfs_block_extent *be;
679
680 spin_lock(&bl->bl_ext_lock);
681 for (i = 0; i < EXTENT_LISTS; i++) {
682 while (!list_empty(&bl->bl_extents[i])) {
683 be = list_first_entry(&bl->bl_extents[i],
684 struct pnfs_block_extent,
685 be_node);
686 list_del(&be->be_node);
687 bl_put_extent(be);
688 }
689 }
690 spin_unlock(&bl->bl_ext_lock);
691}
692
693static void
694release_inval_marks(struct pnfs_inval_markings *marks)
695{
696 struct pnfs_inval_tracking *pos, *temp;
697
698 list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) {
699 list_del(&pos->it_link);
700 kfree(pos);
701 }
702 return;
703}
704
705static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo)
706{
707 struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
708
709 dprintk("%s enter\n", __func__);
710 release_extents(bl, NULL);
711 release_inval_marks(&bl->bl_inval);
712 kfree(bl);
713}
714
715static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
716 gfp_t gfp_flags)
717{
718 struct pnfs_block_layout *bl;
719
720 dprintk("%s enter\n", __func__);
721 bl = kzalloc(sizeof(*bl), gfp_flags);
722 if (!bl)
723 return NULL;
724 spin_lock_init(&bl->bl_ext_lock);
725 INIT_LIST_HEAD(&bl->bl_extents[0]);
726 INIT_LIST_HEAD(&bl->bl_extents[1]);
727 INIT_LIST_HEAD(&bl->bl_commit);
728 INIT_LIST_HEAD(&bl->bl_committing);
729 bl->bl_count = 0;
730 bl->bl_blocksize = NFS_SERVER(inode)->pnfs_blksize >> SECTOR_SHIFT;
731 BL_INIT_INVAL_MARKS(&bl->bl_inval, bl->bl_blocksize);
732 return &bl->bl_layout;
733}
734
735static void bl_free_lseg(struct pnfs_layout_segment *lseg)
736{
737 dprintk("%s enter\n", __func__);
738 kfree(lseg);
739}
740
741/* We pretty much ignore lseg, and store all data layout-wide, so we
742 * can merge extents correctly.
743 */
744static struct pnfs_layout_segment *bl_alloc_lseg(struct pnfs_layout_hdr *lo,
745 struct nfs4_layoutget_res *lgr,
746 gfp_t gfp_flags)
747{
748 struct pnfs_layout_segment *lseg;
749 int status;
750
751 dprintk("%s enter\n", __func__);
752 lseg = kzalloc(sizeof(*lseg), gfp_flags);
753 if (!lseg)
754 return ERR_PTR(-ENOMEM);
755 status = nfs4_blk_process_layoutget(lo, lgr, gfp_flags);
756 if (status) {
757 /* We don't want to call the full-blown bl_free_lseg,
758 * since on error extents were not touched.
759 */
760 kfree(lseg);
761 return ERR_PTR(status);
762 }
763 return lseg;
764}
765
766static void
767bl_encode_layoutcommit(struct pnfs_layout_hdr *lo, struct xdr_stream *xdr,
768 const struct nfs4_layoutcommit_args *arg)
769{
770 dprintk("%s enter\n", __func__);
771 encode_pnfs_block_layoutupdate(BLK_LO2EXT(lo), xdr, arg);
772}
773
774static void
775bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata)
776{
777 struct pnfs_layout_hdr *lo = NFS_I(lcdata->args.inode)->layout;
778
779 dprintk("%s enter\n", __func__);
780 clean_pnfs_block_layoutupdate(BLK_LO2EXT(lo), &lcdata->args, lcdata->res.status);
781}
782
783static void free_blk_mountid(struct block_mount_id *mid)
784{
785 if (mid) {
786 struct pnfs_block_dev *dev;
787 spin_lock(&mid->bm_lock);
788 while (!list_empty(&mid->bm_devlist)) {
789 dev = list_first_entry(&mid->bm_devlist,
790 struct pnfs_block_dev,
791 bm_node);
792 list_del(&dev->bm_node);
793 bl_free_block_dev(dev);
794 }
795 spin_unlock(&mid->bm_lock);
796 kfree(mid);
797 }
798}
799
800/* This is mostly copied from the filelayout's get_device_info function.
801 * It seems much of this should be at the generic pnfs level.
802 */
803static struct pnfs_block_dev *
804nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
805 struct nfs4_deviceid *d_id)
806{
807 struct pnfs_device *dev;
808 struct pnfs_block_dev *rv = NULL;
809 u32 max_resp_sz;
810 int max_pages;
811 struct page **pages = NULL;
812 int i, rc;
813
814 /*
815 * Use the session max response size as the basis for setting
816 * GETDEVICEINFO's maxcount
817 */
818 max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
819 max_pages = max_resp_sz >> PAGE_SHIFT;
820 dprintk("%s max_resp_sz %u max_pages %d\n",
821 __func__, max_resp_sz, max_pages);
822
823 dev = kmalloc(sizeof(*dev), GFP_NOFS);
824 if (!dev) {
825 dprintk("%s kmalloc failed\n", __func__);
826 return NULL;
827 }
828
829 pages = kzalloc(max_pages * sizeof(struct page *), GFP_NOFS);
830 if (pages == NULL) {
831 kfree(dev);
832 return NULL;
833 }
834 for (i = 0; i < max_pages; i++) {
835 pages[i] = alloc_page(GFP_NOFS);
836 if (!pages[i])
837 goto out_free;
838 }
839
840 memcpy(&dev->dev_id, d_id, sizeof(*d_id));
841 dev->layout_type = LAYOUT_BLOCK_VOLUME;
842 dev->pages = pages;
843 dev->pgbase = 0;
844 dev->pglen = PAGE_SIZE * max_pages;
845 dev->mincount = 0;
846
847 dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data);
848 rc = nfs4_proc_getdeviceinfo(server, dev);
849 dprintk("%s getdevice info returns %d\n", __func__, rc);
850 if (rc)
851 goto out_free;
852
853 rv = nfs4_blk_decode_device(server, dev);
854 out_free:
855 for (i = 0; i < max_pages; i++)
856 __free_page(pages[i]);
857 kfree(pages);
858 kfree(dev);
859 return rv;
860}
861
862static int
863bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
864{
865 struct block_mount_id *b_mt_id = NULL;
866 struct pnfs_devicelist *dlist = NULL;
867 struct pnfs_block_dev *bdev;
868 LIST_HEAD(block_disklist);
869 int status = 0, i;
870
871 dprintk("%s enter\n", __func__);
872
873 if (server->pnfs_blksize == 0) {
874 dprintk("%s Server did not return blksize\n", __func__);
875 return -EINVAL;
876 }
877 b_mt_id = kzalloc(sizeof(struct block_mount_id), GFP_NOFS);
878 if (!b_mt_id) {
879 status = -ENOMEM;
880 goto out_error;
881 }
882 /* Initialize nfs4 block layout mount id */
883 spin_lock_init(&b_mt_id->bm_lock);
884 INIT_LIST_HEAD(&b_mt_id->bm_devlist);
885
886 dlist = kmalloc(sizeof(struct pnfs_devicelist), GFP_NOFS);
887 if (!dlist) {
888 status = -ENOMEM;
889 goto out_error;
890 }
891 dlist->eof = 0;
892 while (!dlist->eof) {
893 status = nfs4_proc_getdevicelist(server, fh, dlist);
894 if (status)
895 goto out_error;
896 dprintk("%s GETDEVICELIST numdevs=%i, eof=%i\n",
897 __func__, dlist->num_devs, dlist->eof);
898 for (i = 0; i < dlist->num_devs; i++) {
899 bdev = nfs4_blk_get_deviceinfo(server, fh,
900 &dlist->dev_id[i]);
901 if (!bdev) {
902 status = -ENODEV;
903 goto out_error;
904 }
905 spin_lock(&b_mt_id->bm_lock);
906 list_add(&bdev->bm_node, &b_mt_id->bm_devlist);
907 spin_unlock(&b_mt_id->bm_lock);
908 }
909 }
910 dprintk("%s SUCCESS\n", __func__);
911 server->pnfs_ld_data = b_mt_id;
912
913 out_return:
914 kfree(dlist);
915 return status;
916
917 out_error:
918 free_blk_mountid(b_mt_id);
919 goto out_return;
920}
921
922static int
923bl_clear_layoutdriver(struct nfs_server *server)
924{
925 struct block_mount_id *b_mt_id = server->pnfs_ld_data;
926
927 dprintk("%s enter\n", __func__);
928 free_blk_mountid(b_mt_id);
929 dprintk("%s RETURNS\n", __func__);
930 return 0;
931}
932
933static const struct nfs_pageio_ops bl_pg_read_ops = {
934 .pg_init = pnfs_generic_pg_init_read,
935 .pg_test = pnfs_generic_pg_test,
936 .pg_doio = pnfs_generic_pg_readpages,
937};
938
939static const struct nfs_pageio_ops bl_pg_write_ops = {
940 .pg_init = pnfs_generic_pg_init_write,
941 .pg_test = pnfs_generic_pg_test,
942 .pg_doio = pnfs_generic_pg_writepages,
943};
944
945static struct pnfs_layoutdriver_type blocklayout_type = {
946 .id = LAYOUT_BLOCK_VOLUME,
947 .name = "LAYOUT_BLOCK_VOLUME",
948 .read_pagelist = bl_read_pagelist,
949 .write_pagelist = bl_write_pagelist,
950 .alloc_layout_hdr = bl_alloc_layout_hdr,
951 .free_layout_hdr = bl_free_layout_hdr,
952 .alloc_lseg = bl_alloc_lseg,
953 .free_lseg = bl_free_lseg,
954 .encode_layoutcommit = bl_encode_layoutcommit,
955 .cleanup_layoutcommit = bl_cleanup_layoutcommit,
956 .set_layoutdriver = bl_set_layoutdriver,
957 .clear_layoutdriver = bl_clear_layoutdriver,
958 .pg_read_ops = &bl_pg_read_ops,
959 .pg_write_ops = &bl_pg_write_ops,
960};
961
962static const struct rpc_pipe_ops bl_upcall_ops = {
963 .upcall = bl_pipe_upcall,
964 .downcall = bl_pipe_downcall,
965 .destroy_msg = bl_pipe_destroy_msg,
966};
967
968static int __init nfs4blocklayout_init(void)
969{
970 struct vfsmount *mnt;
971 struct path path;
972 int ret;
973
974 dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__);
975
976 ret = pnfs_register_layoutdriver(&blocklayout_type);
977 if (ret)
978 goto out;
979
980 init_waitqueue_head(&bl_wq);
981
982 mnt = rpc_get_mount();
983 if (IS_ERR(mnt)) {
984 ret = PTR_ERR(mnt);
985 goto out_remove;
986 }
987
988 ret = vfs_path_lookup(mnt->mnt_root,
989 mnt,
990 NFS_PIPE_DIRNAME, 0, &path);
991 if (ret)
992 goto out_remove;
993
994 bl_device_pipe = rpc_mkpipe(path.dentry, "blocklayout", NULL,
995 &bl_upcall_ops, 0);
996 if (IS_ERR(bl_device_pipe)) {
997 ret = PTR_ERR(bl_device_pipe);
998 goto out_remove;
999 }
1000out:
1001 return ret;
1002
1003out_remove:
1004 pnfs_unregister_layoutdriver(&blocklayout_type);
1005 return ret;
1006}
1007
1008static void __exit nfs4blocklayout_exit(void)
1009{
1010 dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
1011 __func__);
1012
1013 pnfs_unregister_layoutdriver(&blocklayout_type);
1014 rpc_unlink(bl_device_pipe);
1015}
1016
1017MODULE_ALIAS("nfs-layouttype4-3");
1018
1019module_init(nfs4blocklayout_init);
1020module_exit(nfs4blocklayout_exit);
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
new file mode 100644
index 000000000000..f27d827960a3
--- /dev/null
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -0,0 +1,207 @@
1/*
2 * linux/fs/nfs/blocklayout/blocklayout.h
3 *
4 * Module for the NFSv4.1 pNFS block layout driver.
5 *
6 * Copyright (c) 2006 The Regents of the University of Michigan.
7 * All rights reserved.
8 *
9 * Andy Adamson <andros@citi.umich.edu>
10 * Fred Isaman <iisaman@umich.edu>
11 *
12 * permission is granted to use, copy, create derivative works and
13 * redistribute this software and such derivative works for any purpose,
14 * so long as the name of the university of michigan is not used in
15 * any advertising or publicity pertaining to the use or distribution
16 * of this software without specific, written prior authorization. if
17 * the above copyright notice or any other identification of the
18 * university of michigan is included in any copy of any portion of
19 * this software, then the disclaimer below must also be included.
20 *
21 * this software is provided as is, without representation from the
22 * university of michigan as to its fitness for any purpose, and without
23 * warranty by the university of michigan of any kind, either express
24 * or implied, including without limitation the implied warranties of
25 * merchantability and fitness for a particular purpose. the regents
26 * of the university of michigan shall not be liable for any damages,
27 * including special, indirect, incidental, or consequential damages,
28 * with respect to any claim arising out or in connection with the use
29 * of the software, even if it has been or is hereafter advised of the
30 * possibility of such damages.
31 */
32#ifndef FS_NFS_NFS4BLOCKLAYOUT_H
33#define FS_NFS_NFS4BLOCKLAYOUT_H
34
35#include <linux/device-mapper.h>
36#include <linux/nfs_fs.h>
37#include <linux/sunrpc/rpc_pipe_fs.h>
38
39#include "../pnfs.h"
40
41#define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> SECTOR_SHIFT)
42#define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT)
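/* For example, with 4K pages and 512-byte sectors these work out to
 * PAGE_CACHE_SECTORS == 8 and PAGE_CACHE_SECTOR_SHIFT == 3.
 */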
43
44struct block_mount_id {
45 spinlock_t bm_lock; /* protects list */
46 struct list_head bm_devlist; /* holds pnfs_block_dev */
47};
48
49struct pnfs_block_dev {
50 struct list_head bm_node;
51 struct nfs4_deviceid bm_mdevid; /* associated devid */
52 struct block_device *bm_mdev; /* meta device itself */
53};
54
55enum exstate4 {
56 PNFS_BLOCK_READWRITE_DATA = 0,
57 PNFS_BLOCK_READ_DATA = 1,
58 PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */
59 PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */
60};
61
62#define MY_MAX_TAGS (15) /* tag bitnums used must be less than this */
63
64struct my_tree {
65 sector_t mtt_step_size; /* Internal sector alignment */
66 struct list_head mtt_stub; /* Should be a radix tree */
67};
68
69struct pnfs_inval_markings {
70 spinlock_t im_lock;
71 struct my_tree im_tree; /* Sectors that need LAYOUTCOMMIT */
72 sector_t im_block_size; /* Server blocksize in sectors */
73};
74
75struct pnfs_inval_tracking {
76 struct list_head it_link;
77 int it_sector;
78 int it_tags;
79};
80
81/* sector_t fields are all in 512-byte sectors */
82struct pnfs_block_extent {
83 struct kref be_refcnt;
84 struct list_head be_node; /* link into lseg list */
85 struct nfs4_deviceid be_devid; /* FIXME: could use device cache instead */
86 struct block_device *be_mdev;
87 sector_t be_f_offset; /* the starting offset in the file */
88 sector_t be_length; /* the size of the extent */
89 sector_t be_v_offset; /* the starting offset in the volume */
90 enum exstate4 be_state; /* the state of this extent */
91 struct pnfs_inval_markings *be_inval; /* tracks INVAL->RW transition */
92};
93
94/* Shortened extent used by LAYOUTCOMMIT */
95struct pnfs_block_short_extent {
96 struct list_head bse_node;
97 struct nfs4_deviceid bse_devid;
98 struct block_device *bse_mdev;
99 sector_t bse_f_offset; /* the starting offset in the file */
100 sector_t bse_length; /* the size of the extent */
101};
102
103static inline void
104BL_INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize)
105{
106 spin_lock_init(&marks->im_lock);
107 INIT_LIST_HEAD(&marks->im_tree.mtt_stub);
108 marks->im_block_size = blocksize;
109 marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS,
110 blocksize);
111}
112
113enum extentclass4 {
114	RW_EXTENT	= 0, /* READWRITE and INVAL */
115 RO_EXTENT = 1, /* READ and NONE */
116 EXTENT_LISTS = 2,
117};
118
119static inline int bl_choose_list(enum exstate4 state)
120{
121 if (state == PNFS_BLOCK_READ_DATA || state == PNFS_BLOCK_NONE_DATA)
122 return RO_EXTENT;
123 else
124 return RW_EXTENT;
125}
126
127struct pnfs_block_layout {
128 struct pnfs_layout_hdr bl_layout;
129 struct pnfs_inval_markings bl_inval; /* tracks INVAL->RW transition */
130 spinlock_t bl_ext_lock; /* Protects list manipulation */
131 struct list_head bl_extents[EXTENT_LISTS]; /* R and RW extents */
132 struct list_head bl_commit; /* Needs layout commit */
133 struct list_head bl_committing; /* Layout committing */
134 unsigned int bl_count; /* entries in bl_commit */
135 sector_t bl_blocksize; /* Server blocksize in sectors */
136};
137
138#define BLK_ID(lo) ((struct block_mount_id *)(NFS_SERVER(lo->plh_inode)->pnfs_ld_data))
139
140static inline struct pnfs_block_layout *
141BLK_LO2EXT(struct pnfs_layout_hdr *lo)
142{
143 return container_of(lo, struct pnfs_block_layout, bl_layout);
144}
145
146static inline struct pnfs_block_layout *
147BLK_LSEG2EXT(struct pnfs_layout_segment *lseg)
148{
149 return BLK_LO2EXT(lseg->pls_layout);
150}
151
152struct bl_dev_msg {
153 int status;
154 uint32_t major, minor;
155};
156
157struct bl_msg_hdr {
158 u8 type;
159 u16 totallen; /* length of entire message, including hdr itself */
160};
161
162extern struct dentry *bl_device_pipe;
163extern wait_queue_head_t bl_wq;
164
165#define BL_DEVICE_UMOUNT 0x0 /* Umount--delete devices */
166#define BL_DEVICE_MOUNT               0x1 /* Mount--create devices */
167#define BL_DEVICE_REQUEST_INIT 0x0 /* Start request */
168#define BL_DEVICE_REQUEST_PROC 0x1 /* User level process succeeds */
169#define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */
170
171/* blocklayoutdev.c */
172ssize_t bl_pipe_upcall(struct file *, struct rpc_pipe_msg *,
173 char __user *, size_t);
174ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t);
175void bl_pipe_destroy_msg(struct rpc_pipe_msg *);
176struct block_device *nfs4_blkdev_get(dev_t dev);
177int nfs4_blkdev_put(struct block_device *bdev);
178struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server,
179 struct pnfs_device *dev);
180int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
181 struct nfs4_layoutget_res *lgr, gfp_t gfp_flags);
182
183/* blocklayoutdm.c */
184void bl_free_block_dev(struct pnfs_block_dev *bdev);
185
186/* extents.c */
187struct pnfs_block_extent *
188bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
189 struct pnfs_block_extent **cow_read);
190int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
191 sector_t offset, sector_t length,
192 sector_t **pages);
193void bl_put_extent(struct pnfs_block_extent *be);
194struct pnfs_block_extent *bl_alloc_extent(void);
195int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect);
196int encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
197 struct xdr_stream *xdr,
198 const struct nfs4_layoutcommit_args *arg);
199void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
200 const struct nfs4_layoutcommit_args *arg,
201 int status);
202int bl_add_merge_extent(struct pnfs_block_layout *bl,
203 struct pnfs_block_extent *new);
204int bl_mark_for_commit(struct pnfs_block_extent *be,
205 sector_t offset, sector_t length);
206
207#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c
new file mode 100644
index 000000000000..a83b393fb01c
--- /dev/null
+++ b/fs/nfs/blocklayout/blocklayoutdev.c
@@ -0,0 +1,410 @@
1/*
2 * linux/fs/nfs/blocklayout/blocklayoutdev.c
3 *
4 * Device operations for the pnfs nfs4 block layout driver.
5 *
6 * Copyright (c) 2006 The Regents of the University of Michigan.
7 * All rights reserved.
8 *
9 * Andy Adamson <andros@citi.umich.edu>
10 * Fred Isaman <iisaman@umich.edu>
11 *
12 * permission is granted to use, copy, create derivative works and
13 * redistribute this software and such derivative works for any purpose,
14 * so long as the name of the university of michigan is not used in
15 * any advertising or publicity pertaining to the use or distribution
16 * of this software without specific, written prior authorization. if
17 * the above copyright notice or any other identification of the
18 * university of michigan is included in any copy of any portion of
19 * this software, then the disclaimer below must also be included.
20 *
21 * this software is provided as is, without representation from the
22 * university of michigan as to its fitness for any purpose, and without
23 * warranty by the university of michigan of any kind, either express
24 * or implied, including without limitation the implied warranties of
25 * merchantability and fitness for a particular purpose. the regents
26 * of the university of michigan shall not be liable for any damages,
27 * including special, indirect, incidental, or consequential damages,
28 * with respect to any claim arising out or in connection with the use
29 * of the software, even if it has been or is hereafter advised of the
30 * possibility of such damages.
31 */
32#include <linux/module.h>
33#include <linux/buffer_head.h> /* __bread */
34
35#include <linux/genhd.h>
36#include <linux/blkdev.h>
37#include <linux/hash.h>
38
39#include "blocklayout.h"
40
41#define NFSDBG_FACILITY NFSDBG_PNFS_LD
42
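/* The server expresses offsets and lengths in bytes; we keep them as
 * 512-byte sector counts. The 0x1ff mask tests the low 9 bits, rejecting
 * any value that is not sector-aligned before shifting by SECTOR_SHIFT.
 */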
43static int decode_sector_number(__be32 **rp, sector_t *sp)
44{
45 uint64_t s;
46
47 *rp = xdr_decode_hyper(*rp, &s);
48 if (s & 0x1ff) {
49 printk(KERN_WARNING "%s: sector not aligned\n", __func__);
50 return -1;
51 }
52 *sp = s >> SECTOR_SHIFT;
53 return 0;
54}
55
56/* Open a block_device by device number. */
57struct block_device *nfs4_blkdev_get(dev_t dev)
58{
59 struct block_device *bd;
60
61 dprintk("%s enter\n", __func__);
62 bd = blkdev_get_by_dev(dev, FMODE_READ, NULL);
63 if (IS_ERR(bd))
64 goto fail;
65 return bd;
66fail:
67 dprintk("%s failed to open device : %ld\n",
68 __func__, PTR_ERR(bd));
69 return NULL;
70}
71
72/*
73 * Release the block device
74 */
75int nfs4_blkdev_put(struct block_device *bdev)
76{
77 dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev),
78 MINOR(bdev->bd_dev));
79 return blkdev_put(bdev, FMODE_READ);
80}
81
82/*
83 * Shouldn't there be an rpc_generic_upcall() to do this for us?
84 */
85ssize_t bl_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg,
86 char __user *dst, size_t buflen)
87{
88 char *data = (char *)msg->data + msg->copied;
89 size_t mlen = min(msg->len - msg->copied, buflen);
90 unsigned long left;
91
92 left = copy_to_user(dst, data, mlen);
93 if (left == mlen) {
94 msg->errno = -EFAULT;
95 return -EFAULT;
96 }
97
98 mlen -= left;
99 msg->copied += mlen;
100 msg->errno = 0;
101 return mlen;
102}
103
104static struct bl_dev_msg bl_mount_reply;
105
106ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,
107 size_t mlen)
108{
109 if (mlen != sizeof (struct bl_dev_msg))
110 return -EINVAL;
111
112 if (copy_from_user(&bl_mount_reply, src, mlen) != 0)
113 return -EFAULT;
114
115 wake_up(&bl_wq);
116
117 return mlen;
118}
119
120void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg)
121{
122 if (msg->errno >= 0)
123 return;
124 wake_up(&bl_wq);
125}
126
127/*
128 * Decodes pnfs_block_deviceaddr4 which is XDR encoded in dev->dev_addr_buf.
129 */
130struct pnfs_block_dev *
131nfs4_blk_decode_device(struct nfs_server *server,
132 struct pnfs_device *dev)
133{
134 struct pnfs_block_dev *rv = NULL;
135 struct block_device *bd = NULL;
136 struct rpc_pipe_msg msg;
137 struct bl_msg_hdr bl_msg = {
138 .type = BL_DEVICE_MOUNT,
139 .totallen = dev->mincount,
140 };
141 uint8_t *dataptr;
142 DECLARE_WAITQUEUE(wq, current);
143 struct bl_dev_msg *reply = &bl_mount_reply;
144 int offset, len, i;
145
146 dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
147 dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data,
148 dev->mincount);
149
150 memset(&msg, 0, sizeof(msg));
151 msg.data = kzalloc(sizeof(bl_msg) + dev->mincount, GFP_NOFS);
152 if (!msg.data) {
153 rv = ERR_PTR(-ENOMEM);
154 goto out;
155 }
156
157 memcpy(msg.data, &bl_msg, sizeof(bl_msg));
158 dataptr = (uint8_t *) msg.data;
159 len = dev->mincount;
160 offset = sizeof(bl_msg);
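	/* Flatten the XDR response pages into one contiguous upcall message
	 * after the header; the final page may be partial, hence the
	 * min()-style copy length.
	 */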
161 for (i = 0; len > 0; i++) {
162 memcpy(&dataptr[offset], page_address(dev->pages[i]),
163 len < PAGE_CACHE_SIZE ? len : PAGE_CACHE_SIZE);
164 len -= PAGE_CACHE_SIZE;
165 offset += PAGE_CACHE_SIZE;
166 }
167 msg.len = sizeof(bl_msg) + dev->mincount;
168
169 dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
170 add_wait_queue(&bl_wq, &wq);
171 if (rpc_queue_upcall(bl_device_pipe->d_inode, &msg) < 0) {
172 remove_wait_queue(&bl_wq, &wq);
173 goto out;
174 }
175
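	/* Sleep until bl_pipe_downcall() copies the daemon's reply into the
	 * global bl_mount_reply and wakes bl_wq; this appears to rely on the
	 * mount path serializing concurrent device upcalls.
	 */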
176 set_current_state(TASK_UNINTERRUPTIBLE);
177 schedule();
178 __set_current_state(TASK_RUNNING);
179 remove_wait_queue(&bl_wq, &wq);
180
181 if (reply->status != BL_DEVICE_REQUEST_PROC) {
182 dprintk("%s failed to open device: %d\n",
183 __func__, reply->status);
184 rv = ERR_PTR(-EINVAL);
185 goto out;
186 }
187
188 bd = nfs4_blkdev_get(MKDEV(reply->major, reply->minor));
189 if (IS_ERR(bd)) {
190 dprintk("%s failed to open device : %ld\n",
191 __func__, PTR_ERR(bd));
192 goto out;
193 }
194
195 rv = kzalloc(sizeof(*rv), GFP_NOFS);
196 if (!rv) {
197 rv = ERR_PTR(-ENOMEM);
198 goto out;
199 }
200
201 rv->bm_mdev = bd;
202 memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct nfs4_deviceid));
203 dprintk("%s Created device %s with bd_block_size %u\n",
204 __func__,
205 bd->bd_disk->disk_name,
206 bd->bd_block_size);
207
208out:
209 kfree(msg.data);
210 return rv;
211}
212
213/* Map deviceid returned by the server to constructed block_device */
214static struct block_device *translate_devid(struct pnfs_layout_hdr *lo,
215 struct nfs4_deviceid *id)
216{
217 struct block_device *rv = NULL;
218 struct block_mount_id *mid;
219 struct pnfs_block_dev *dev;
220
221 dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id);
222 mid = BLK_ID(lo);
223 spin_lock(&mid->bm_lock);
224 list_for_each_entry(dev, &mid->bm_devlist, bm_node) {
225 if (memcmp(id->data, dev->bm_mdevid.data,
226 NFS4_DEVICEID4_SIZE) == 0) {
227 rv = dev->bm_mdev;
228 goto out;
229 }
230 }
231 out:
232 spin_unlock(&mid->bm_lock);
233 dprintk("%s returning %p\n", __func__, rv);
234 return rv;
235}
236
237/* Tracks info needed to ensure extents in layout obey constraints of spec */
238struct layout_verification {
239 u32 mode; /* R or RW */
240 u64 start; /* Expected start of next non-COW extent */
241 u64 inval; /* Start of INVAL coverage */
242 u64 cowread; /* End of COW read coverage */
243};
244
245/* Verify the extent meets the layout requirements of the pnfs-block draft,
246 * section 2.3.1.
247 */
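/* Informally: in RW mode the READWRITE/INVALID extents must tile the range
 * in order, while READ extents may only appear as a COW source over an
 * INVALID span, tracked via lv->inval and lv->cowread.
 */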
248static int verify_extent(struct pnfs_block_extent *be,
249 struct layout_verification *lv)
250{
251 if (lv->mode == IOMODE_READ) {
252 if (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
253 be->be_state == PNFS_BLOCK_INVALID_DATA)
254 return -EIO;
255 if (be->be_f_offset != lv->start)
256 return -EIO;
257 lv->start += be->be_length;
258 return 0;
259 }
260 /* lv->mode == IOMODE_RW */
261 if (be->be_state == PNFS_BLOCK_READWRITE_DATA) {
262 if (be->be_f_offset != lv->start)
263 return -EIO;
264 if (lv->cowread > lv->start)
265 return -EIO;
266 lv->start += be->be_length;
267 lv->inval = lv->start;
268 return 0;
269 } else if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
270 if (be->be_f_offset != lv->start)
271 return -EIO;
272 lv->start += be->be_length;
273 return 0;
274 } else if (be->be_state == PNFS_BLOCK_READ_DATA) {
275 if (be->be_f_offset > lv->start)
276 return -EIO;
277 if (be->be_f_offset < lv->inval)
278 return -EIO;
279 if (be->be_f_offset < lv->cowread)
280 return -EIO;
281 /* It looks like you might want to min this with lv->start,
282 * but you really don't.
283 */
284 lv->inval = lv->inval + be->be_length;
285 lv->cowread = be->be_f_offset + be->be_length;
286 return 0;
287 } else
288 return -EIO;
289}
290
291/* XDR decode pnfs_block_layout4 structure */
292int
293nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
294 struct nfs4_layoutget_res *lgr, gfp_t gfp_flags)
295{
296 struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
297 int i, status = -EIO;
298 uint32_t count;
299 struct pnfs_block_extent *be = NULL, *save;
300 struct xdr_stream stream;
301 struct xdr_buf buf;
302 struct page *scratch;
303 __be32 *p;
304 struct layout_verification lv = {
305 .mode = lgr->range.iomode,
306 .start = lgr->range.offset >> SECTOR_SHIFT,
307 .inval = lgr->range.offset >> SECTOR_SHIFT,
308 .cowread = lgr->range.offset >> SECTOR_SHIFT,
309 };
310 LIST_HEAD(extents);
311
312 dprintk("---> %s\n", __func__);
313
314 scratch = alloc_page(gfp_flags);
315 if (!scratch)
316 return -ENOMEM;
317
318 xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len);
319 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
320
321 p = xdr_inline_decode(&stream, 4);
322 if (unlikely(!p))
323 goto out_err;
324
325 count = be32_to_cpup(p++);
326
327 dprintk("%s enter, number of extents %i\n", __func__, count);
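	/* Each extent on the wire is a deviceid followed by three 64-bit
	 * sector values and a 32-bit state: NFS4_DEVICEID4_SIZE + 3 * 8 + 4
	 * = NFS4_DEVICEID4_SIZE + 28 bytes.
	 */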
328 p = xdr_inline_decode(&stream, (28 + NFS4_DEVICEID4_SIZE) * count);
329 if (unlikely(!p))
330 goto out_err;
331
332 /* Decode individual extents, putting them in temporary
333 * staging area until whole layout is decoded to make error
334 * recovery easier.
335 */
336 for (i = 0; i < count; i++) {
337 be = bl_alloc_extent();
338 if (!be) {
339 status = -ENOMEM;
340 goto out_err;
341 }
342 memcpy(&be->be_devid, p, NFS4_DEVICEID4_SIZE);
343 p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
344 be->be_mdev = translate_devid(lo, &be->be_devid);
345 if (!be->be_mdev)
346 goto out_err;
347
348 /* The next three values are read in as bytes,
349 * but stored as 512-byte sector lengths
350 */
351 if (decode_sector_number(&p, &be->be_f_offset) < 0)
352 goto out_err;
353 if (decode_sector_number(&p, &be->be_length) < 0)
354 goto out_err;
355 if (decode_sector_number(&p, &be->be_v_offset) < 0)
356 goto out_err;
357 be->be_state = be32_to_cpup(p++);
358 if (be->be_state == PNFS_BLOCK_INVALID_DATA)
359 be->be_inval = &bl->bl_inval;
360 if (verify_extent(be, &lv)) {
361 dprintk("%s verify failed\n", __func__);
362 goto out_err;
363 }
364 list_add_tail(&be->be_node, &extents);
365 }
366 if (lgr->range.offset + lgr->range.length !=
367 lv.start << SECTOR_SHIFT) {
368 dprintk("%s Final length mismatch\n", __func__);
369 be = NULL;
370 goto out_err;
371 }
372 if (lv.start < lv.cowread) {
373 dprintk("%s Final uncovered COW extent\n", __func__);
374 be = NULL;
375 goto out_err;
376 }
377 /* Extents decoded properly, now try to merge them in to
378 * existing layout extents.
379 */
380 spin_lock(&bl->bl_ext_lock);
381 list_for_each_entry_safe(be, save, &extents, be_node) {
382 list_del(&be->be_node);
383 status = bl_add_merge_extent(bl, be);
384 if (status) {
385 spin_unlock(&bl->bl_ext_lock);
386 /* This is a fairly catastrophic error, as the
387 * entire layout extent lists are now corrupted.
388 * We should have some way to distinguish this.
389 */
390 be = NULL;
391 goto out_err;
392 }
393 }
394 spin_unlock(&bl->bl_ext_lock);
395 status = 0;
396 out:
397 __free_page(scratch);
398 dprintk("%s returns %i\n", __func__, status);
399 return status;
400
401 out_err:
402 bl_put_extent(be);
403 while (!list_empty(&extents)) {
404 be = list_first_entry(&extents, struct pnfs_block_extent,
405 be_node);
406 list_del(&be->be_node);
407 bl_put_extent(be);
408 }
409 goto out;
410}
diff --git a/fs/nfs/blocklayout/blocklayoutdm.c b/fs/nfs/blocklayout/blocklayoutdm.c
new file mode 100644
index 000000000000..d055c7558073
--- /dev/null
+++ b/fs/nfs/blocklayout/blocklayoutdm.c
@@ -0,0 +1,111 @@
1/*
2 * linux/fs/nfs/blocklayout/blocklayoutdm.c
3 *
4 * Module for the NFSv4.1 pNFS block layout driver.
5 *
6 * Copyright (c) 2007 The Regents of the University of Michigan.
7 * All rights reserved.
8 *
9 * Fred Isaman <iisaman@umich.edu>
10 * Andy Adamson <andros@citi.umich.edu>
11 *
12 * permission is granted to use, copy, create derivative works and
13 * redistribute this software and such derivative works for any purpose,
14 * so long as the name of the university of michigan is not used in
15 * any advertising or publicity pertaining to the use or distribution
16 * of this software without specific, written prior authorization. if
17 * the above copyright notice or any other identification of the
18 * university of michigan is included in any copy of any portion of
19 * this software, then the disclaimer below must also be included.
20 *
21 * this software is provided as is, without representation from the
22 * university of michigan as to its fitness for any purpose, and without
23 * warranty by the university of michigan of any kind, either express
24 * or implied, including without limitation the implied warranties of
25 * merchantability and fitness for a particular purpose. the regents
26 * of the university of michigan shall not be liable for any damages,
27 * including special, indirect, incidental, or consequential damages,
28 * with respect to any claim arising out or in connection with the use
29 * of the software, even if it has been or is hereafter advised of the
30 * possibility of such damages.
31 */
32
33#include <linux/genhd.h> /* gendisk - used in a dprintk*/
34#include <linux/sched.h>
35#include <linux/hash.h>
36
37#include "blocklayout.h"
38
39#define NFSDBG_FACILITY NFSDBG_PNFS_LD
40
41static void dev_remove(dev_t dev)
42{
43 struct rpc_pipe_msg msg;
44 struct bl_dev_msg bl_umount_request;
45 struct bl_msg_hdr bl_msg = {
46 .type = BL_DEVICE_UMOUNT,
47 .totallen = sizeof(bl_umount_request),
48 };
49 uint8_t *dataptr;
50 DECLARE_WAITQUEUE(wq, current);
51
52 dprintk("Entering %s\n", __func__);
53
54 memset(&msg, 0, sizeof(msg));
55 msg.data = kzalloc(1 + sizeof(bl_umount_request), GFP_NOFS);
56 if (!msg.data)
57 goto out;
58
59 memset(&bl_umount_request, 0, sizeof(bl_umount_request));
60 bl_umount_request.major = MAJOR(dev);
61 bl_umount_request.minor = MINOR(dev);
62
63 memcpy(msg.data, &bl_msg, sizeof(bl_msg));
64 dataptr = (uint8_t *) msg.data;
65 memcpy(&dataptr[sizeof(bl_msg)], &bl_umount_request, sizeof(bl_umount_request));
66 msg.len = sizeof(bl_msg) + bl_msg.totallen;
67
68 add_wait_queue(&bl_wq, &wq);
69 if (rpc_queue_upcall(bl_device_pipe->d_inode, &msg) < 0) {
70 remove_wait_queue(&bl_wq, &wq);
71 goto out;
72 }
73
74 set_current_state(TASK_UNINTERRUPTIBLE);
75 schedule();
76 __set_current_state(TASK_RUNNING);
77 remove_wait_queue(&bl_wq, &wq);
78
79out:
80 kfree(msg.data);
81}
82
83/*
84 * Release meta device
85 */
86static void nfs4_blk_metadev_release(struct pnfs_block_dev *bdev)
87{
88 int rv;
89
90 dprintk("%s Releasing\n", __func__);
91 rv = nfs4_blkdev_put(bdev->bm_mdev);
92 if (rv)
93 printk(KERN_ERR "%s nfs4_blkdev_put returns %d\n",
94 __func__, rv);
95
96 dev_remove(bdev->bm_mdev->bd_dev);
97}
98
99void bl_free_block_dev(struct pnfs_block_dev *bdev)
100{
101 if (bdev) {
102 if (bdev->bm_mdev) {
103 dprintk("%s Removing DM device: %d:%d\n",
104 __func__,
105 MAJOR(bdev->bm_mdev->bd_dev),
106 MINOR(bdev->bm_mdev->bd_dev));
107 nfs4_blk_metadev_release(bdev);
108 }
109 kfree(bdev);
110 }
111}
diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
new file mode 100644
index 000000000000..19fa7b0b8c00
--- /dev/null
+++ b/fs/nfs/blocklayout/extents.c
@@ -0,0 +1,935 @@
1/*
2 * linux/fs/nfs/blocklayout/extents.c
3 *
4 * Module for the NFSv4.1 pNFS block layout driver.
5 *
6 * Copyright (c) 2006 The Regents of the University of Michigan.
7 * All rights reserved.
8 *
9 * Andy Adamson <andros@citi.umich.edu>
10 * Fred Isaman <iisaman@umich.edu>
11 *
12 * permission is granted to use, copy, create derivative works and
13 * redistribute this software and such derivative works for any purpose,
14 * so long as the name of the university of michigan is not used in
15 * any advertising or publicity pertaining to the use or distribution
16 * of this software without specific, written prior authorization. if
17 * the above copyright notice or any other identification of the
18 * university of michigan is included in any copy of any portion of
19 * this software, then the disclaimer below must also be included.
20 *
21 * this software is provided as is, without representation from the
22 * university of michigan as to its fitness for any purpose, and without
23 * warranty by the university of michigan of any kind, either express
24 * or implied, including without limitation the implied warranties of
25 * merchantability and fitness for a particular purpose. the regents
26 * of the university of michigan shall not be liable for any damages,
27 * including special, indirect, incidental, or consequential damages,
28 * with respect to any claim arising out or in connection with the use
29 * of the software, even if it has been or is hereafter advised of the
30 * possibility of such damages.
31 */
32
33#include "blocklayout.h"
34#define NFSDBG_FACILITY NFSDBG_PNFS_LD
35
36/* Bit numbers */
37#define EXTENT_INITIALIZED 0
38#define EXTENT_WRITTEN 1
39#define EXTENT_IN_COMMIT 2
40#define INTERNAL_EXISTS MY_MAX_TAGS
41#define INTERNAL_MASK ((1 << INTERNAL_EXISTS) - 1)
42
43/* Returns largest t<=s s.t. t%base==0 */
44static inline sector_t normalize(sector_t s, int base)
45{
46 sector_t tmp = s; /* Since do_div modifies its argument */
47 return s - do_div(tmp, base);
48}
49
50static inline sector_t normalize_up(sector_t s, int base)
51{
52 return normalize(s + base - 1, base);
53}
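/* For example, with base 8: normalize(13, 8) == 8 and
 * normalize_up(13, 8) == 16, while normalize_up(16, 8) == 16.
 */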
54
55/* Complete stub using a list while we determine the API wanted */
56
57/* Returns tags, or negative */
58static int32_t _find_entry(struct my_tree *tree, u64 s)
59{
60 struct pnfs_inval_tracking *pos;
61
62 dprintk("%s(%llu) enter\n", __func__, s);
63 list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
64 if (pos->it_sector > s)
65 continue;
66 else if (pos->it_sector == s)
67 return pos->it_tags & INTERNAL_MASK;
68 else
69 break;
70 }
71 return -ENOENT;
72}
73
74static inline
75int _has_tag(struct my_tree *tree, u64 s, int32_t tag)
76{
77 int32_t tags;
78
79 dprintk("%s(%llu, %i) enter\n", __func__, s, tag);
80 s = normalize(s, tree->mtt_step_size);
81 tags = _find_entry(tree, s);
82 if ((tags < 0) || !(tags & (1 << tag)))
83 return 0;
84 else
85 return 1;
86}
87
88/* Creates an entry with the tag or, if the entry already exists, ORs the
89 * tag into it. If storage is not NULL, a newly created entry will use it.
90 * Returns the number of entries added, or negative on error.
91 */
92static int _add_entry(struct my_tree *tree, u64 s, int32_t tag,
93 struct pnfs_inval_tracking *storage)
94{
95 int found = 0;
96 struct pnfs_inval_tracking *pos;
97
98 dprintk("%s(%llu, %i, %p) enter\n", __func__, s, tag, storage);
99 list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
100 if (pos->it_sector > s)
101 continue;
102 else if (pos->it_sector == s) {
103 found = 1;
104 break;
105 } else
106 break;
107 }
108 if (found) {
109 pos->it_tags |= (1 << tag);
110 return 0;
111 } else {
112 struct pnfs_inval_tracking *new;
113 if (storage)
114 new = storage;
115 else {
116 new = kmalloc(sizeof(*new), GFP_NOFS);
117 if (!new)
118 return -ENOMEM;
119 }
120 new->it_sector = s;
121 new->it_tags = (1 << tag);
122 list_add(&new->it_link, &pos->it_link);
123 return 1;
124 }
125}
126
127/* XXXX Really want option to not create */
128/* Over the range, ORs the tag into existing entries, else creates entries with the tag */
129static int _set_range(struct my_tree *tree, int32_t tag, u64 s, u64 length)
130{
131 u64 i;
132
133 dprintk("%s(%i, %llu, %llu) enter\n", __func__, tag, s, length);
134 for (i = normalize(s, tree->mtt_step_size); i < s + length;
135 i += tree->mtt_step_size)
136 if (_add_entry(tree, i, tag, NULL))
137 return -ENOMEM;
138 return 0;
139}
140
141/* Ensure that future operations on given range of tree will not malloc */
142static int _preload_range(struct my_tree *tree, u64 offset, u64 length)
143{
144 u64 start, end, s;
145 int count, i, used = 0, status = -ENOMEM;
146 struct pnfs_inval_tracking **storage;
147
148 dprintk("%s(%llu, %llu) enter\n", __func__, offset, length);
149 start = normalize(offset, tree->mtt_step_size);
150 end = normalize_up(offset + length, tree->mtt_step_size);
151 count = (int)(end - start) / (int)tree->mtt_step_size;
152
153 /* Pre-malloc what memory we might need */
154 storage = kmalloc(sizeof(*storage) * count, GFP_NOFS);
155 if (!storage)
156 return -ENOMEM;
157 for (i = 0; i < count; i++) {
158 storage[i] = kmalloc(sizeof(struct pnfs_inval_tracking),
159 GFP_NOFS);
160 if (!storage[i])
161 goto out_cleanup;
162 }
163
164 /* Now need lock - HOW??? */
165
166 for (s = start; s < end; s += tree->mtt_step_size)
167 used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]);
168
169 /* Unlock - HOW??? */
170 status = 0;
171
172 out_cleanup:
173 for (i = used; i < count; i++) {
174 if (!storage[i])
175 break;
176 kfree(storage[i]);
177 }
178 kfree(storage);
179 return status;
180}
181
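/* 'array' is kept sorted and terminated by ~0. Insert 'offset' in order,
 * shifting the tail (terminator included) up one slot; the caller must
 * have sized the array with room for the insertion.
 */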
182static void set_needs_init(sector_t *array, sector_t offset)
183{
184 sector_t *p = array;
185
186 dprintk("%s enter\n", __func__);
187 if (!p)
188 return;
189 while (*p < offset)
190 p++;
191 if (*p == offset)
192 return;
193 else if (*p == ~0) {
194 *p++ = offset;
195 *p = ~0;
196 return;
197 } else {
198 sector_t *save = p;
199 dprintk("%s Adding %llu\n", __func__, (u64)offset);
200 while (*p != ~0)
201 p++;
202 p++;
203 memmove(save + 1, save, (char *)p - (char *)save);
204 *save = offset;
205 return;
206 }
207}
208
209/* We are relying on page lock to serialize this */
210int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect)
211{
212 int rv;
213
214 spin_lock(&marks->im_lock);
215 rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED);
216 spin_unlock(&marks->im_lock);
217 return rv;
218}
219
220/* Assume start, end already sector aligned */
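/* Walk the stub list from high sectors down: the entry at end - step must
 * carry the tag, and every step-sized slot below it down to start must
 * follow contiguously with the tag set; otherwise the range is not fully
 * tagged.
 */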
221static int
222_range_has_tag(struct my_tree *tree, u64 start, u64 end, int32_t tag)
223{
224 struct pnfs_inval_tracking *pos;
225 u64 expect = 0;
226
227 dprintk("%s(%llu, %llu, %i) enter\n", __func__, start, end, tag);
228 list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
229 if (pos->it_sector >= end)
230 continue;
231 if (!expect) {
232 if ((pos->it_sector == end - tree->mtt_step_size) &&
233 (pos->it_tags & (1 << tag))) {
234 expect = pos->it_sector - tree->mtt_step_size;
235 if (pos->it_sector < tree->mtt_step_size || expect < start)
236 return 1;
237 continue;
238 } else {
239 return 0;
240 }
241 }
242 if (pos->it_sector != expect || !(pos->it_tags & (1 << tag)))
243 return 0;
244 expect -= tree->mtt_step_size;
245 if (expect < start)
246 return 1;
247 }
248 return 0;
249}
250
251static int is_range_written(struct pnfs_inval_markings *marks,
252 sector_t start, sector_t end)
253{
254 int rv;
255
256 spin_lock(&marks->im_lock);
257 rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN);
258 spin_unlock(&marks->im_lock);
259 return rv;
260}
261
262/* Marks sectors in [offset, offset+length) as having been initialized.
263 * All lengths are step-aligned, where step is min(pagesize, blocksize).
264 * Notes where a partial block is initialized, and helps prepare it for
265 * complete initialization later.
266 */
267/* Currently assumes offset is page-aligned */
268int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
269 sector_t offset, sector_t length,
270 sector_t **pages)
271{
272 sector_t s, start, end;
273 sector_t *array = NULL; /* Pages to mark */
274
275 dprintk("%s(offset=%llu,len=%llu) enter\n",
276 __func__, (u64)offset, (u64)length);
277 s = max((sector_t) 3,
278 2 * (marks->im_block_size / (PAGE_CACHE_SECTORS)));
279 dprintk("%s set max=%llu\n", __func__, (u64)s);
280 if (pages) {
281 array = kmalloc(s * sizeof(sector_t), GFP_NOFS);
282 if (!array)
283 goto outerr;
284 array[0] = ~0;
285 }
286
287 start = normalize(offset, marks->im_block_size);
288 end = normalize_up(offset + length, marks->im_block_size);
289 if (_preload_range(&marks->im_tree, start, end - start))
290 goto outerr;
291
292 spin_lock(&marks->im_lock);
293
294 for (s = normalize_up(start, PAGE_CACHE_SECTORS);
295 s < offset; s += PAGE_CACHE_SECTORS) {
296 dprintk("%s pre-area pages\n", __func__);
297 /* Portion of used block is not initialized */
298 if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED))
299 set_needs_init(array, s);
300 }
301 if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length))
302 goto out_unlock;
303 for (s = normalize_up(offset + length, PAGE_CACHE_SECTORS);
304 s < end; s += PAGE_CACHE_SECTORS) {
305 dprintk("%s post-area pages\n", __func__);
306 if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED))
307 set_needs_init(array, s);
308 }
309
310 spin_unlock(&marks->im_lock);
311
312 if (pages) {
313 if (array[0] == ~0) {
314 kfree(array);
315 *pages = NULL;
316 } else
317 *pages = array;
318 }
319 return 0;
320
321 out_unlock:
322 spin_unlock(&marks->im_lock);
323 outerr:
324 if (pages) {
325 kfree(array);
326 *pages = NULL;
327 }
328 return -ENOMEM;
329}
330
331/* Marks sectors in [offset, offset+length) as having been written to disk.
332 * All lengths should be block-aligned.
333 */
334static int mark_written_sectors(struct pnfs_inval_markings *marks,
335 sector_t offset, sector_t length)
336{
337 int status;
338
339 dprintk("%s(offset=%llu,len=%llu) enter\n", __func__,
340 (u64)offset, (u64)length);
341 spin_lock(&marks->im_lock);
342 status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length);
343 spin_unlock(&marks->im_lock);
344 return status;
345}
346
347static void print_short_extent(struct pnfs_block_short_extent *be)
348{
349 dprintk("PRINT SHORT EXTENT extent %p\n", be);
350 if (be) {
351 dprintk(" be_f_offset %llu\n", (u64)be->bse_f_offset);
352 dprintk(" be_length %llu\n", (u64)be->bse_length);
353 }
354}
355
356static void print_clist(struct list_head *list, unsigned int count)
357{
358 struct pnfs_block_short_extent *be;
359 unsigned int i = 0;
360
361 ifdebug(FACILITY) {
362 printk(KERN_DEBUG "****************\n");
363 printk(KERN_DEBUG "Extent list looks like:\n");
364 list_for_each_entry(be, list, bse_node) {
365 i++;
366 print_short_extent(be);
367 }
368 if (i != count)
369 printk(KERN_DEBUG "\n\nExpected %u entries\n\n\n", count);
370 printk(KERN_DEBUG "****************\n");
371 }
372}
373
374/* Note: In theory, we should do more checking that devids match between
375 * old and new, but if they don't, the lists are too corrupt to salvage anyway.
376 */
377/* Note this is very similar to bl_add_merge_extent */
378static void add_to_commitlist(struct pnfs_block_layout *bl,
379 struct pnfs_block_short_extent *new)
380{
381 struct list_head *clist = &bl->bl_commit;
382 struct pnfs_block_short_extent *old, *save;
383 sector_t end = new->bse_f_offset + new->bse_length;
384
385 dprintk("%s enter\n", __func__);
386 print_short_extent(new);
387 print_clist(clist, bl->bl_count);
388 bl->bl_count++;
389 /* Scan for proper place to insert, extending new to the left
390 * as much as possible.
391 */
392 list_for_each_entry_safe(old, save, clist, bse_node) {
393 if (new->bse_f_offset < old->bse_f_offset)
394 break;
395 if (end <= old->bse_f_offset + old->bse_length) {
396 /* Range is already in list */
397 bl->bl_count--;
398 kfree(new);
399 return;
400 } else if (new->bse_f_offset <=
401 old->bse_f_offset + old->bse_length) {
402 /* new overlaps or abuts existing be */
403 if (new->bse_mdev == old->bse_mdev) {
404 /* extend new to fully replace old */
405 new->bse_length += new->bse_f_offset -
406 old->bse_f_offset;
407 new->bse_f_offset = old->bse_f_offset;
408 list_del(&old->bse_node);
409 bl->bl_count--;
410 kfree(old);
411 }
412 }
413 }
414 /* Note that if we never hit the above break, old will not point to a
415 * valid extent. However, in that case &old->bse_node==list.
416 */
417 list_add_tail(&new->bse_node, &old->bse_node);
418 /* Scan forward for overlaps. If we find any, extend new and
419 * remove the overlapped extent.
420 */
421 old = list_prepare_entry(new, clist, bse_node);
422 list_for_each_entry_safe_continue(old, save, clist, bse_node) {
423 if (end < old->bse_f_offset)
424 break;
425 /* new overlaps or abuts old */
426 if (new->bse_mdev == old->bse_mdev) {
427 if (end < old->bse_f_offset + old->bse_length) {
428 /* extend new to fully cover old */
429 end = old->bse_f_offset + old->bse_length;
430 new->bse_length = end - new->bse_f_offset;
431 }
432 list_del(&old->bse_node);
433 bl->bl_count--;
434 kfree(old);
435 }
436 }
437 dprintk("%s: after merging\n", __func__);
438 print_clist(clist, bl->bl_count);
439}
440
441/* Note the range described by offset, length is guaranteed to be contained
442 * within be.
443 */
444int bl_mark_for_commit(struct pnfs_block_extent *be,
445 sector_t offset, sector_t length)
446{
447 sector_t new_end, end = offset + length;
448 struct pnfs_block_short_extent *new;
449 struct pnfs_block_layout *bl = container_of(be->be_inval,
450 struct pnfs_block_layout,
451 bl_inval);
452
453 new = kmalloc(sizeof(*new), GFP_NOFS);
454 if (!new)
455 return -ENOMEM;
456
457 mark_written_sectors(be->be_inval, offset, length);
458 /* We want to add the range to commit list, but it must be
459 * block-normalized, and verified that the normalized range has
460 * been entirely written to disk.
461 */
462 new->bse_f_offset = offset;
463 offset = normalize(offset, bl->bl_blocksize);
464 if (offset < new->bse_f_offset) {
465 if (is_range_written(be->be_inval, offset, new->bse_f_offset))
466 new->bse_f_offset = offset;
467 else
468 new->bse_f_offset = offset + bl->bl_blocksize;
469 }
470 new_end = normalize_up(end, bl->bl_blocksize);
471 if (end < new_end) {
472 if (is_range_written(be->be_inval, end, new_end))
473 end = new_end;
474 else
475 end = new_end - bl->bl_blocksize;
476 }
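	/* A worked example, assuming bl_blocksize == 8 sectors: for a written
	 * range [10, 20), each end moves out to its block boundary ([8, 24))
	 * only if the flanking sectors [8, 10) / [20, 24) were already
	 * written, and is otherwise pulled in to the next boundary; with
	 * neither flank written the range collapses to [16, 16), which the
	 * check below discards.
	 */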
477 if (end <= new->bse_f_offset) {
478 kfree(new);
479 return 0;
480 }
481 new->bse_length = end - new->bse_f_offset;
482 new->bse_devid = be->be_devid;
483 new->bse_mdev = be->be_mdev;
484
485 spin_lock(&bl->bl_ext_lock);
486 /* new will be freed, either by add_to_commitlist if it decides not
487 * to use it, or after LAYOUTCOMMIT uses it in the commitlist.
488 */
489 add_to_commitlist(bl, new);
490 spin_unlock(&bl->bl_ext_lock);
491 return 0;
492}
493
494static void print_bl_extent(struct pnfs_block_extent *be)
495{
496 dprintk("PRINT EXTENT extent %p\n", be);
497 if (be) {
498 dprintk(" be_f_offset %llu\n", (u64)be->be_f_offset);
499 dprintk(" be_length %llu\n", (u64)be->be_length);
500 dprintk(" be_v_offset %llu\n", (u64)be->be_v_offset);
501 dprintk(" be_state %d\n", be->be_state);
502 }
503}
504
505static void
506destroy_extent(struct kref *kref)
507{
508 struct pnfs_block_extent *be;
509
510 be = container_of(kref, struct pnfs_block_extent, be_refcnt);
511 dprintk("%s be=%p\n", __func__, be);
512 kfree(be);
513}
514
515void
516bl_put_extent(struct pnfs_block_extent *be)
517{
518 if (be) {
519 dprintk("%s enter %p (%i)\n", __func__, be,
520 atomic_read(&be->be_refcnt.refcount));
521 kref_put(&be->be_refcnt, destroy_extent);
522 }
523}
524
525struct pnfs_block_extent *bl_alloc_extent(void)
526{
527 struct pnfs_block_extent *be;
528
529 be = kmalloc(sizeof(struct pnfs_block_extent), GFP_NOFS);
530 if (!be)
531 return NULL;
532 INIT_LIST_HEAD(&be->be_node);
533 kref_init(&be->be_refcnt);
534 be->be_inval = NULL;
535 return be;
536}
537
538static void print_elist(struct list_head *list)
539{
540 struct pnfs_block_extent *be;
541 dprintk("****************\n");
542 dprintk("Extent list looks like:\n");
543 list_for_each_entry(be, list, be_node) {
544 print_bl_extent(be);
545 }
546 dprintk("****************\n");
547}
548
549static inline int
550extents_consistent(struct pnfs_block_extent *old, struct pnfs_block_extent *new)
551{
552 /* Note this assumes new->be_f_offset >= old->be_f_offset */
553 return (new->be_state == old->be_state) &&
554 ((new->be_state == PNFS_BLOCK_NONE_DATA) ||
555 ((new->be_v_offset - old->be_v_offset ==
556 new->be_f_offset - old->be_f_offset) &&
557 new->be_mdev == old->be_mdev));
558}
559
/* Adds new to appropriate list in bl, modifying new and removing existing
 * extents as appropriate to deal with overlaps.
 *
 * See bl_find_get_extent for list constraints.
 *
 * Refcount on new is already set. If we end up not using it, or if we
 * error out, we need to put the reference.
 *
 * bl->bl_ext_lock is held by caller.
 */
int
bl_add_merge_extent(struct pnfs_block_layout *bl,
		    struct pnfs_block_extent *new)
{
	struct pnfs_block_extent *be, *tmp;
	sector_t end = new->be_f_offset + new->be_length;
	struct list_head *list;

	dprintk("%s enter with be=%p\n", __func__, new);
	print_bl_extent(new);
	list = &bl->bl_extents[bl_choose_list(new->be_state)];
	print_elist(list);

	/* Scan for proper place to insert, extending new to the left
	 * as much as possible.
	 */
	list_for_each_entry_safe_reverse(be, tmp, list, be_node) {
		if (new->be_f_offset >= be->be_f_offset + be->be_length)
			break;
		if (new->be_f_offset >= be->be_f_offset) {
			if (end <= be->be_f_offset + be->be_length) {
				/* new is a subset of existing be */
				if (extents_consistent(be, new)) {
					dprintk("%s: new is subset, ignoring\n",
						__func__);
					bl_put_extent(new);
					return 0;
				} else {
					goto out_err;
				}
			} else {
				/* |<--    be    -->|
				 *        |<--    new    -->| */
				if (extents_consistent(be, new)) {
					/* extend new to fully replace be */
					new->be_length += new->be_f_offset -
						be->be_f_offset;
					new->be_f_offset = be->be_f_offset;
					new->be_v_offset = be->be_v_offset;
					dprintk("%s: removing %p\n", __func__, be);
					list_del(&be->be_node);
					bl_put_extent(be);
				} else {
					goto out_err;
				}
			}
		} else if (end >= be->be_f_offset + be->be_length) {
			/* new extent overlaps all of existing be */
			if (extents_consistent(be, new)) {
				/* extend new to fully replace be */
				dprintk("%s: removing %p\n", __func__, be);
				list_del(&be->be_node);
				bl_put_extent(be);
			} else {
				goto out_err;
			}
		} else if (end > be->be_f_offset) {
			/*        |<--    be    -->|
			 * |<--    new    -->| */
			if (extents_consistent(new, be)) {
				/* extend new to fully replace be */
				new->be_length += be->be_f_offset + be->be_length -
					new->be_f_offset - new->be_length;
				dprintk("%s: removing %p\n", __func__, be);
				list_del(&be->be_node);
				bl_put_extent(be);
			} else {
				goto out_err;
			}
		}
	}
	/* Note that if we never hit the above break, be will not point to a
	 * valid extent. However, in that case &be->be_node == list.
	 */
	list_add(&new->be_node, &be->be_node);
	dprintk("%s: inserting new\n", __func__);
	print_elist(list);
	/* FIXME - The per-list consistency checks have all been done,
	 * should now check cross-list consistency.
	 */
	return 0;

 out_err:
	bl_put_extent(new);
	return -EIO;
}

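/* Editorial note: the loop above distinguishes four overlap cases per
 * existing extent. This standalone sketch (plain C, illustrative
 * ranges only) classifies a new range [new_f, new_end) against an
 * existing one the same way the kernel loop does.
 */
#include <stdio.h>
#include <stdint.h>

static const char *classify(uint64_t be_f, uint64_t be_end,
			    uint64_t new_f, uint64_t new_end)
{
	if (new_f >= be_end)
		return "disjoint, right of be: stop scanning";
	if (new_f >= be_f && new_end <= be_end)
		return "new is a subset of be: drop new";
	if (new_f >= be_f)
		return "new overhangs right: extend new left over be";
	if (new_end >= be_end)
		return "new covers be: remove be";
	if (new_end > be_f)
		return "new overhangs left: extend new right over be";
	return "disjoint, left of be: keep scanning";
}

int main(void)
{
	/* existing extent [8, 24) vs. a few candidate inserts */
	printf("%s\n", classify(8, 24, 24, 32));	/* disjoint right */
	printf("%s\n", classify(8, 24, 10, 20));	/* subset */
	printf("%s\n", classify(8, 24, 16, 32));	/* overhangs right */
	printf("%s\n", classify(8, 24,  0, 32));	/* covers be */
	printf("%s\n", classify(8, 24,  0, 16));	/* overhangs left */
	return 0;
}
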
/* Returns extent, or NULL. If a second READ extent exists, it is returned
 * in cow_read, if given.
 *
 * The extents are kept in two separate ordered lists, one for READ and NONE,
 * one for READWRITE and INVALID. Within each list, we assume:
 * 1. Extents are ordered by file offset.
 * 2. For any given isect, there is at most one extent that matches.
 */
struct pnfs_block_extent *
bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
		   struct pnfs_block_extent **cow_read)
{
	struct pnfs_block_extent *be, *cow, *ret;
	int i;

	dprintk("%s enter with isect %llu\n", __func__, (u64)isect);
	cow = ret = NULL;
	spin_lock(&bl->bl_ext_lock);
	for (i = 0; i < EXTENT_LISTS; i++) {
		list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) {
			if (isect >= be->be_f_offset + be->be_length)
				break;
			if (isect >= be->be_f_offset) {
				/* We have found an extent */
				dprintk("%s Get %p (%i)\n", __func__, be,
					atomic_read(&be->be_refcnt.refcount));
				kref_get(&be->be_refcnt);
				if (!ret)
					ret = be;
				else if (be->be_state != PNFS_BLOCK_READ_DATA)
					bl_put_extent(be);
				else
					cow = be;
				break;
			}
		}
		if (ret &&
		    (!cow_read || ret->be_state != PNFS_BLOCK_INVALID_DATA))
			break;
	}
	spin_unlock(&bl->bl_ext_lock);
	if (cow_read)
		*cow_read = cow;
	print_bl_extent(ret);
	return ret;
}

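/* Editorial note: because each list is sorted by file offset, a
 * reverse scan can stop as soon as the current extent ends at or
 * before the target sector; nothing earlier in the list can contain
 * it. A standalone sketch of that early break (an array standing in
 * for the kernel list, all values invented):
 */
#include <stdio.h>
#include <stdint.h>

struct sext { uint64_t f_offset, length; };

static const struct sext *find(const struct sext *v, int n, uint64_t isect)
{
	for (int i = n - 1; i >= 0; i--) {	/* reverse, like the kernel */
		if (isect >= v[i].f_offset + v[i].length)
			return NULL;		/* all remaining end earlier */
		if (isect >= v[i].f_offset)
			return &v[i];		/* at most one can match */
	}
	return NULL;
}

int main(void)
{
	const struct sext list[] = { {0, 8}, {8, 16}, {32, 8} };
	const struct sext *hit = find(list, 3, 12);

	if (hit)
		printf("sector 12 in extent [%llu, %llu)\n",
		       (unsigned long long)hit->f_offset,
		       (unsigned long long)(hit->f_offset + hit->length));
	return 0;
}
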
/* Similar to bl_find_get_extent, but called with lock held, and ignores cow */
static struct pnfs_block_extent *
bl_find_get_extent_locked(struct pnfs_block_layout *bl, sector_t isect)
{
	struct pnfs_block_extent *be, *ret = NULL;
	int i;

	dprintk("%s enter with isect %llu\n", __func__, (u64)isect);
	for (i = 0; i < EXTENT_LISTS; i++) {
		if (ret)
			break;
		list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) {
			if (isect >= be->be_f_offset + be->be_length)
				break;
			if (isect >= be->be_f_offset) {
				/* We have found an extent */
				dprintk("%s Get %p (%i)\n", __func__, be,
					atomic_read(&be->be_refcnt.refcount));
				kref_get(&be->be_refcnt);
				ret = be;
				break;
			}
		}
	}
	print_bl_extent(ret);
	return ret;
}

int
encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
			       struct xdr_stream *xdr,
			       const struct nfs4_layoutcommit_args *arg)
{
	struct pnfs_block_short_extent *lce, *save;
	unsigned int count = 0;
	__be32 *p, *xdr_start;

	dprintk("%s enter\n", __func__);
	/* BUG - creation of bl_commit is buggy - need to wait for
	 * entire block to be marked WRITTEN before it can be added.
	 */
	spin_lock(&bl->bl_ext_lock);
	/* Want to adjust for possible truncate */
	/* We now want to adjust argument range */

	/* XDR encode the ranges found */
	xdr_start = xdr_reserve_space(xdr, 8);
	if (!xdr_start)
		goto out;
	list_for_each_entry_safe(lce, save, &bl->bl_commit, bse_node) {
		p = xdr_reserve_space(xdr, 7 * 4 + sizeof(lce->bse_devid.data));
		if (!p)
			break;
		p = xdr_encode_opaque_fixed(p, lce->bse_devid.data, NFS4_DEVICEID4_SIZE);
		p = xdr_encode_hyper(p, lce->bse_f_offset << SECTOR_SHIFT);
		p = xdr_encode_hyper(p, lce->bse_length << SECTOR_SHIFT);
		p = xdr_encode_hyper(p, 0LL);
		*p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
		list_del(&lce->bse_node);
		list_add_tail(&lce->bse_node, &bl->bl_committing);
		bl->bl_count--;
		count++;
	}
	xdr_start[0] = cpu_to_be32((xdr->p - xdr_start - 1) * 4);
	xdr_start[1] = cpu_to_be32(count);
out:
	spin_unlock(&bl->bl_ext_lock);
	dprintk("%s found %i ranges\n", __func__, count);
	return 0;
}

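/* Editorial note: each committed range above is a fixed-size XDR
 * record: the 16-byte deviceid followed by three 64-bit hypers (file
 * offset, length, storage offset) and one 32-bit state word, i.e.
 * exactly the 7 * 4 + sizeof(...devid.data) bytes reserved per entry.
 * A userspace sketch of that wire layout (hypothetical deviceid and
 * state value, not the kernel xdr_stream API):
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

#define DEVICEID4_SIZE 16
#define SECTOR_SHIFT 9

static unsigned char *encode_hyper(unsigned char *p, uint64_t v)
{
	uint32_t hi = htonl((uint32_t)(v >> 32)), lo = htonl((uint32_t)v);
	memcpy(p, &hi, 4);
	memcpy(p + 4, &lo, 4);
	return p + 8;
}

int main(void)
{
	unsigned char buf[DEVICEID4_SIZE + 7 * 4];
	unsigned char id[DEVICEID4_SIZE] = { 0xab };	/* hypothetical deviceid */
	unsigned char *p = buf;
	uint64_t f_offset = 8, length = 16;		/* extent, in sectors */
	uint32_t state = htonl(0);			/* illustrative state word */

	memcpy(p, id, DEVICEID4_SIZE);			/* opaque fixed deviceid */
	p += DEVICEID4_SIZE;
	p = encode_hyper(p, f_offset << SECTOR_SHIFT);	/* file offset, bytes */
	p = encode_hyper(p, length << SECTOR_SHIFT);	/* length, bytes */
	p = encode_hyper(p, 0);				/* storage offset */
	memcpy(p, &state, sizeof(state));		/* extent state */
	printf("one extent entry = %zu bytes\n", sizeof(buf));
	return 0;
}
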
/* Helper function for set_to_rw that initializes a new extent */
static void
_prep_new_extent(struct pnfs_block_extent *new,
		 struct pnfs_block_extent *orig,
		 sector_t offset, sector_t length, int state)
{
	kref_init(&new->be_refcnt);
	/* don't need to INIT_LIST_HEAD(&new->be_node) */
	memcpy(&new->be_devid, &orig->be_devid, sizeof(struct nfs4_deviceid));
	new->be_mdev = orig->be_mdev;
	new->be_f_offset = offset;
	new->be_length = length;
	new->be_v_offset = orig->be_v_offset - orig->be_f_offset + offset;
	new->be_state = state;
	new->be_inval = orig->be_inval;
}

/* Tries to merge be with the extent in front of it in the list.
 * Frees storage if not used.
 */
static struct pnfs_block_extent *
_front_merge(struct pnfs_block_extent *be, struct list_head *head,
	     struct pnfs_block_extent *storage)
{
	struct pnfs_block_extent *prev;

	if (!storage)
		goto no_merge;
	if (&be->be_node == head || be->be_node.prev == head)
		goto no_merge;
	prev = list_entry(be->be_node.prev, struct pnfs_block_extent, be_node);
	if ((prev->be_f_offset + prev->be_length != be->be_f_offset) ||
	    !extents_consistent(prev, be))
		goto no_merge;
	_prep_new_extent(storage, prev, prev->be_f_offset,
			 prev->be_length + be->be_length, prev->be_state);
	list_replace(&prev->be_node, &storage->be_node);
	bl_put_extent(prev);
	list_del(&be->be_node);
	bl_put_extent(be);
	return storage;

 no_merge:
	kfree(storage);
	return be;
}

static u64
set_to_rw(struct pnfs_block_layout *bl, u64 offset, u64 length)
{
	u64 rv = offset + length;
	struct pnfs_block_extent *be, *e1, *e2, *e3, *new, *old;
	struct pnfs_block_extent *children[3];
	struct pnfs_block_extent *merge1 = NULL, *merge2 = NULL;
	int i = 0, j;

	dprintk("%s(%llu, %llu)\n", __func__, offset, length);
	/* Create storage for up to three new extents e1, e2, e3 */
	e1 = kmalloc(sizeof(*e1), GFP_ATOMIC);
	e2 = kmalloc(sizeof(*e2), GFP_ATOMIC);
	e3 = kmalloc(sizeof(*e3), GFP_ATOMIC);
	/* BUG - we are ignoring any failure */
	if (!e1 || !e2 || !e3)
		goto out_nosplit;

	spin_lock(&bl->bl_ext_lock);
	be = bl_find_get_extent_locked(bl, offset);
	rv = be->be_f_offset + be->be_length;
	if (be->be_state != PNFS_BLOCK_INVALID_DATA) {
		spin_unlock(&bl->bl_ext_lock);
		goto out_nosplit;
	}
	/* Add e* to children, bumping e*'s krefs */
	if (be->be_f_offset != offset) {
		_prep_new_extent(e1, be, be->be_f_offset,
				 offset - be->be_f_offset,
				 PNFS_BLOCK_INVALID_DATA);
		children[i++] = e1;
		print_bl_extent(e1);
	} else
		merge1 = e1;
	_prep_new_extent(e2, be, offset,
			 min(length, be->be_f_offset + be->be_length - offset),
			 PNFS_BLOCK_READWRITE_DATA);
	children[i++] = e2;
	print_bl_extent(e2);
	if (offset + length < be->be_f_offset + be->be_length) {
		_prep_new_extent(e3, be, e2->be_f_offset + e2->be_length,
				 be->be_f_offset + be->be_length -
				 offset - length,
				 PNFS_BLOCK_INVALID_DATA);
		children[i++] = e3;
		print_bl_extent(e3);
	} else
		merge2 = e3;

	/* Remove be from list, and insert the e* */
	/* We don't get refs on e*, since this list is the base reference
	 * set when init'ed.
	 */
	if (i < 3)
		children[i] = NULL;
	new = children[0];
	list_replace(&be->be_node, &new->be_node);
	bl_put_extent(be);
	new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge1);
	for (j = 1; j < i; j++) {
		old = new;
		new = children[j];
		list_add(&new->be_node, &old->be_node);
	}
	if (merge2) {
		/* This is a HACK, should just create a _back_merge function */
		new = list_entry(new->be_node.next,
				 struct pnfs_block_extent, be_node);
		new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge2);
	}
	spin_unlock(&bl->bl_ext_lock);

	/* Since we removed the base reference above, be is now scheduled for
	 * destruction.
	 */
	bl_put_extent(be);
	dprintk("%s returns %llu after split\n", __func__, rv);
	return rv;

 out_nosplit:
	kfree(e1);
	kfree(e2);
	kfree(e3);
	dprintk("%s returns %llu without splitting\n", __func__, rv);
	return rv;
}

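/* Editorial note: set_to_rw() carves the committed range out of a
 * surrounding INVALID extent, yielding at most three pieces: an
 * INVALID head (e1), a READWRITE middle (e2), and an INVALID tail
 * (e3); e1/e3 are elided and handed to _front_merge as spare storage
 * when the range touches an extent boundary. A quick standalone
 * sketch of the boundary arithmetic, with invented numbers:
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t ext_start = 0, ext_end = 64;	/* existing INVALID extent */
	uint64_t off = 16, len = 24;		/* range just committed */
	uint64_t mid_end = off + len < ext_end ? off + len : ext_end;

	if (ext_start < off)			/* e1: still INVALID */
		printf("e1 INVALID   [%llu, %llu)\n",
		       (unsigned long long)ext_start, (unsigned long long)off);
	printf("e2 READWRITE [%llu, %llu)\n",	/* e2: now RW */
	       (unsigned long long)off, (unsigned long long)mid_end);
	if (mid_end < ext_end)			/* e3: still INVALID */
		printf("e3 INVALID   [%llu, %llu)\n",
		       (unsigned long long)mid_end, (unsigned long long)ext_end);
	return 0;
}
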
void
clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
			      const struct nfs4_layoutcommit_args *arg,
			      int status)
{
	struct pnfs_block_short_extent *lce, *save;

	dprintk("%s status %d\n", __func__, status);
	list_for_each_entry_safe(lce, save, &bl->bl_committing, bse_node) {
		if (likely(!status)) {
			u64 offset = lce->bse_f_offset;
			u64 end = offset + lce->bse_length;

			do {
				offset = set_to_rw(bl, offset, end - offset);
			} while (offset < end);
			list_del(&lce->bse_node);

			kfree(lce);
		} else {
			list_del(&lce->bse_node);
			spin_lock(&bl->bl_ext_lock);
			add_to_commitlist(bl, lce);
			spin_unlock(&bl->bl_ext_lock);
		}
	}
}
diff --git a/fs/nfs/cache_lib.h b/fs/nfs/cache_lib.h
index 76f856e284e4..7cf6cafcc007 100644
--- a/fs/nfs/cache_lib.h
+++ b/fs/nfs/cache_lib.h
@@ -6,7 +6,7 @@
 
 #include <linux/completion.h>
 #include <linux/sunrpc/cache.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 
 /*
  * Deferred request handling
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index b257383bb565..07df5f1d85e5 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -38,6 +38,7 @@ enum nfs4_callback_opnum {
 struct cb_process_state {
 	__be32 drc_status;
 	struct nfs_client *clp;
+	int slotid;
 };
 
 struct cb_compound_hdr_arg {
@@ -166,7 +167,6 @@ extern unsigned nfs4_callback_layoutrecall(
 	void *dummy, struct cb_process_state *cps);
 
 extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses);
-extern void nfs4_cb_take_slot(struct nfs_client *clp);
 
 struct cb_devicenotifyitem {
 	uint32_t cbd_notify_type;
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index d4d1954e9bb9..43926add945b 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -111,6 +111,7 @@ int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nf
 static u32 initiate_file_draining(struct nfs_client *clp,
 				  struct cb_layoutrecallargs *args)
 {
+	struct nfs_server *server;
 	struct pnfs_layout_hdr *lo;
 	struct inode *ino;
 	bool found = false;
@@ -118,21 +119,28 @@ static u32 initiate_file_draining(struct nfs_client *clp,
 	LIST_HEAD(free_me_list);
 
 	spin_lock(&clp->cl_lock);
-	list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) {
-		if (nfs_compare_fh(&args->cbl_fh,
-				   &NFS_I(lo->plh_inode)->fh))
-			continue;
-		ino = igrab(lo->plh_inode);
-		if (!ino)
-			continue;
-		found = true;
-		/* Without this, layout can be freed as soon
-		 * as we release cl_lock.
-		 */
-		get_layout_hdr(lo);
-		break;
+	rcu_read_lock();
+	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+		list_for_each_entry(lo, &server->layouts, plh_layouts) {
+			if (nfs_compare_fh(&args->cbl_fh,
+					   &NFS_I(lo->plh_inode)->fh))
+				continue;
+			ino = igrab(lo->plh_inode);
+			if (!ino)
+				continue;
+			found = true;
+			/* Without this, layout can be freed as soon
+			 * as we release cl_lock.
+			 */
+			get_layout_hdr(lo);
+			break;
+		}
+		if (found)
+			break;
 	}
+	rcu_read_unlock();
 	spin_unlock(&clp->cl_lock);
+
 	if (!found)
 		return NFS4ERR_NOMATCHING_LAYOUT;
 
@@ -154,6 +162,7 @@ static u32 initiate_file_draining(struct nfs_client *clp,
 static u32 initiate_bulk_draining(struct nfs_client *clp,
 				  struct cb_layoutrecallargs *args)
 {
+	struct nfs_server *server;
 	struct pnfs_layout_hdr *lo;
 	struct inode *ino;
 	u32 rv = NFS4ERR_NOMATCHING_LAYOUT;
@@ -167,18 +176,24 @@ static u32 initiate_bulk_draining(struct nfs_client *clp,
 	};
 
 	spin_lock(&clp->cl_lock);
-	list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) {
+	rcu_read_lock();
+	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
 		if ((args->cbl_recall_type == RETURN_FSID) &&
-		    memcmp(&NFS_SERVER(lo->plh_inode)->fsid,
-			   &args->cbl_fsid, sizeof(struct nfs_fsid)))
-			continue;
-		if (!igrab(lo->plh_inode))
+		    memcmp(&server->fsid, &args->cbl_fsid,
+			   sizeof(struct nfs_fsid)))
 			continue;
-		get_layout_hdr(lo);
-		BUG_ON(!list_empty(&lo->plh_bulk_recall));
-		list_add(&lo->plh_bulk_recall, &recall_list);
+
+		list_for_each_entry(lo, &server->layouts, plh_layouts) {
+			if (!igrab(lo->plh_inode))
+				continue;
+			get_layout_hdr(lo);
+			BUG_ON(!list_empty(&lo->plh_bulk_recall));
+			list_add(&lo->plh_bulk_recall, &recall_list);
+		}
 	}
+	rcu_read_unlock();
 	spin_unlock(&clp->cl_lock);
+
 	list_for_each_entry_safe(lo, tmp,
 				 &recall_list, plh_bulk_recall) {
 		ino = lo->plh_inode;
@@ -333,7 +348,7 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args)
 	/* Normal */
 	if (likely(args->csa_sequenceid == slot->seq_nr + 1)) {
 		slot->seq_nr++;
-		return htonl(NFS4_OK);
+		goto out_ok;
 	}
 
 	/* Replay */
@@ -352,11 +367,14 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args)
 	/* Wraparound */
 	if (args->csa_sequenceid == 1 && (slot->seq_nr + 1) == 0) {
 		slot->seq_nr = 1;
-		return htonl(NFS4_OK);
+		goto out_ok;
 	}
 
 	/* Misordered request */
 	return htonl(NFS4ERR_SEQ_MISORDERED);
+out_ok:
+	tbl->highest_used_slotid = args->csa_slotid;
+	return htonl(NFS4_OK);
 }
 
 /*
@@ -418,26 +436,37 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
 			      struct cb_sequenceres *res,
 			      struct cb_process_state *cps)
 {
+	struct nfs4_slot_table *tbl;
 	struct nfs_client *clp;
 	int i;
 	__be32 status = htonl(NFS4ERR_BADSESSION);
 
-	cps->clp = NULL;
-
 	clp = nfs4_find_client_sessionid(args->csa_addr, &args->csa_sessionid);
 	if (clp == NULL)
 		goto out;
 
+	tbl = &clp->cl_session->bc_slot_table;
+
+	spin_lock(&tbl->slot_tbl_lock);
 	/* state manager is resetting the session */
 	if (test_bit(NFS4_SESSION_DRAINING, &clp->cl_session->session_state)) {
-		status = NFS4ERR_DELAY;
+		spin_unlock(&tbl->slot_tbl_lock);
+		status = htonl(NFS4ERR_DELAY);
+		/* Return NFS4ERR_BADSESSION if we're draining the session
+		 * in order to reset it.
+		 */
+		if (test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state))
+			status = htonl(NFS4ERR_BADSESSION);
 		goto out;
 	}
 
 	status = validate_seqid(&clp->cl_session->bc_slot_table, args);
+	spin_unlock(&tbl->slot_tbl_lock);
 	if (status)
 		goto out;
 
+	cps->slotid = args->csa_slotid;
+
 	/*
 	 * Check for pending referring calls. If a match is found, a
 	 * related callback was received before the response to the original
@@ -454,7 +483,6 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
 	res->csr_slotid = args->csa_slotid;
 	res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
 	res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
-	nfs4_cb_take_slot(clp);
 
 out:
 	cps->clp = clp; /* put in nfs4_callback_compound */
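
The validate_seqid()/nfs4_callback_sequence() changes above move slot accounting under slot_tbl_lock and funnel both success paths through out_ok. The sequencing rule itself is small enough to state standalone; the sketch below is plain C with invented slot state, not the kernel API, and the replay branch is reduced to a distinct return value:

/* Sketch of the backchannel sequence-id rules seen in validate_seqid():
 * seq+1 is the next request, seq is a replay, 1 after a 32-bit wrap is
 * also accepted; anything else is misordered.
 */
#include <stdio.h>
#include <stdint.h>

enum { OK, REPLAY, MISORDERED };

static int check_seqid(uint32_t slot_seq, uint32_t csa_seq)
{
	if (csa_seq == slot_seq + 1)		/* normal progression */
		return OK;
	if (csa_seq == slot_seq)		/* retransmitted request */
		return REPLAY;
	if (csa_seq == 1 && slot_seq + 1 == 0)	/* wraparound of u32 */
		return OK;
	return MISORDERED;
}

int main(void)
{
	printf("%d %d %d %d\n",
	       check_seqid(5, 6),		/* 0: OK */
	       check_seqid(5, 5),		/* 1: REPLAY */
	       check_seqid(0xffffffffu, 1),	/* 0: OK, wrapped */
	       check_seqid(5, 9));		/* 2: MISORDERED */
	return 0;
}
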
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index c6c86a77e043..918ad647afea 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -754,26 +754,15 @@ static void nfs4_callback_free_slot(struct nfs4_session *session)
 	 * Let the state manager know callback processing done.
 	 * A single slot, so highest used slotid is either 0 or -1
 	 */
-	tbl->highest_used_slotid--;
+	tbl->highest_used_slotid = -1;
 	nfs4_check_drain_bc_complete(session);
 	spin_unlock(&tbl->slot_tbl_lock);
 }
 
-static void nfs4_cb_free_slot(struct nfs_client *clp)
+static void nfs4_cb_free_slot(struct cb_process_state *cps)
 {
-	if (clp && clp->cl_session)
-		nfs4_callback_free_slot(clp->cl_session);
-}
-
-/* A single slot, so highest used slotid is either 0 or -1 */
-void nfs4_cb_take_slot(struct nfs_client *clp)
-{
-	struct nfs4_slot_table *tbl = &clp->cl_session->bc_slot_table;
-
-	spin_lock(&tbl->slot_tbl_lock);
-	tbl->highest_used_slotid++;
-	BUG_ON(tbl->highest_used_slotid != 0);
-	spin_unlock(&tbl->slot_tbl_lock);
+	if (cps->slotid != -1)
+		nfs4_callback_free_slot(cps->clp->cl_session);
 }
 
 #else /* CONFIG_NFS_V4_1 */
@@ -784,7 +773,7 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
 	return htonl(NFS4ERR_MINOR_VERS_MISMATCH);
 }
 
-static void nfs4_cb_free_slot(struct nfs_client *clp)
+static void nfs4_cb_free_slot(struct cb_process_state *cps)
 {
 }
 #endif /* CONFIG_NFS_V4_1 */
@@ -866,6 +855,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
 	struct cb_process_state cps = {
 		.drc_status = 0,
 		.clp = NULL,
+		.slotid = -1,
 	};
 	unsigned int nops = 0;
 
@@ -906,7 +896,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
 
 	*hdr_res.status = status;
 	*hdr_res.nops = htonl(nops);
-	nfs4_cb_free_slot(cps.clp);
+	nfs4_cb_free_slot(&cps);
 	nfs_put_client(cps.clp);
 	dprintk("%s: done, status = %u\n", __func__, ntohl(status));
 	return rpc_success;
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index b3dc2b88b65b..5833fbbf59b0 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -105,7 +105,7 @@ struct rpc_program nfs_program = {
 	.nrvers			= ARRAY_SIZE(nfs_version),
 	.version		= nfs_version,
 	.stats			= &nfs_rpcstat,
-	.pipe_dir_name		= "/nfs",
+	.pipe_dir_name		= NFS_PIPE_DIRNAME,
 };
 
 struct rpc_stat nfs_rpcstat = {
@@ -188,9 +188,6 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
 	cred = rpc_lookup_machine_cred();
 	if (!IS_ERR(cred))
 		clp->cl_machine_cred = cred;
-#if defined(CONFIG_NFS_V4_1)
-	INIT_LIST_HEAD(&clp->cl_layouts);
-#endif
 	nfs_fscache_get_client_cookie(clp);
 
 	return clp;
@@ -293,6 +290,7 @@ static void nfs_free_client(struct nfs_client *clp)
 	nfs4_deviceid_purge_client(clp);
 
 	kfree(clp->cl_hostname);
+	kfree(clp->server_scope);
 	kfree(clp);
 
 	dprintk("<-- nfs_free_client()\n");
@@ -906,7 +904,9 @@ error:
 /*
  * Load up the server record from information gained in an fsinfo record
  */
-static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *fsinfo)
+static void nfs_server_set_fsinfo(struct nfs_server *server,
+				  struct nfs_fh *mntfh,
+				  struct nfs_fsinfo *fsinfo)
 {
 	unsigned long max_rpc_payload;
 
@@ -936,7 +936,8 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *
 	if (server->wsize > NFS_MAX_FILE_IO_SIZE)
 		server->wsize = NFS_MAX_FILE_IO_SIZE;
 	server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	set_pnfs_layoutdriver(server, fsinfo->layouttype);
+	server->pnfs_blksize = fsinfo->blksize;
+	set_pnfs_layoutdriver(server, mntfh, fsinfo->layouttype);
 
 	server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL);
 
@@ -982,7 +983,7 @@ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str
 	if (error < 0)
 		goto out_error;
 
-	nfs_server_set_fsinfo(server, &fsinfo);
+	nfs_server_set_fsinfo(server, mntfh, &fsinfo);
 
 	/* Get some general file system info */
 	if (server->namelen == 0) {
@@ -1062,6 +1063,7 @@ static struct nfs_server *nfs_alloc_server(void)
 	INIT_LIST_HEAD(&server->client_link);
 	INIT_LIST_HEAD(&server->master_link);
 	INIT_LIST_HEAD(&server->delegations);
+	INIT_LIST_HEAD(&server->layouts);
 
 	atomic_set(&server->active, 0);
 
@@ -1464,7 +1466,7 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
 	dprintk("<-- %s %p\n", __func__, clp);
 	return clp;
 }
-EXPORT_SYMBOL(nfs4_set_ds_client);
+EXPORT_SYMBOL_GPL(nfs4_set_ds_client);
 
 /*
  * Session has been established, and the client marked ready.
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index dd25c2aec375..321a66bc3846 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -398,12 +398,11 @@ int nfs_inode_return_delegation(struct inode *inode)
 	return err;
 }
 
-static void nfs_mark_return_delegation(struct nfs_delegation *delegation)
+static void nfs_mark_return_delegation(struct nfs_server *server,
+				       struct nfs_delegation *delegation)
 {
-	struct nfs_client *clp = NFS_SERVER(delegation->inode)->nfs_client;
-
 	set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
-	set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
+	set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state);
 }
 
 /**
@@ -441,7 +440,7 @@ static void nfs_mark_return_all_delegation_types(struct nfs_server *server,
 		if ((delegation->type == (FMODE_READ|FMODE_WRITE)) && !(flags & FMODE_WRITE))
 			continue;
 		if (delegation->type & flags)
-			nfs_mark_return_delegation(delegation);
+			nfs_mark_return_delegation(server, delegation);
 	}
 }
 
@@ -508,7 +507,7 @@ static void nfs_mark_return_unreferenced_delegations(struct nfs_server *server)
 	list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
 		if (test_and_clear_bit(NFS_DELEGATION_REFERENCED, &delegation->flags))
 			continue;
-		nfs_mark_return_delegation(delegation);
+		nfs_mark_return_delegation(server, delegation);
 	}
 }
 
@@ -539,7 +538,8 @@ void nfs_expire_unreferenced_delegations(struct nfs_client *clp)
 int nfs_async_inode_return_delegation(struct inode *inode,
 				      const nfs4_stateid *stateid)
 {
-	struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs_client *clp = server->nfs_client;
 	struct nfs_delegation *delegation;
 
 	rcu_read_lock();
@@ -549,7 +549,7 @@ int nfs_async_inode_return_delegation(struct inode *inode,
 		rcu_read_unlock();
 		return -ENOENT;
 	}
-	nfs_mark_return_delegation(delegation);
+	nfs_mark_return_delegation(server, delegation);
 	rcu_read_unlock();
 
 	nfs_delegation_run_state_manager(clp);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 57f578e2560a..b238d95ac48c 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -134,18 +134,19 @@ const struct inode_operations nfs4_dir_inode_operations = {
 
 #endif /* CONFIG_NFS_V4 */
 
-static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct rpc_cred *cred)
+static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir, struct rpc_cred *cred)
 {
 	struct nfs_open_dir_context *ctx;
 	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
 	if (ctx != NULL) {
 		ctx->duped = 0;
+		ctx->attr_gencount = NFS_I(dir)->attr_gencount;
 		ctx->dir_cookie = 0;
 		ctx->dup_cookie = 0;
 		ctx->cred = get_rpccred(cred);
-	} else
-		ctx = ERR_PTR(-ENOMEM);
-	return ctx;
+		return ctx;
+	}
+	return ERR_PTR(-ENOMEM);
 }
 
 static void put_nfs_open_dir_context(struct nfs_open_dir_context *ctx)
@@ -173,7 +174,7 @@ nfs_opendir(struct inode *inode, struct file *filp)
 	cred = rpc_lookup_cred();
 	if (IS_ERR(cred))
 		return PTR_ERR(cred);
-	ctx = alloc_nfs_open_dir_context(cred);
+	ctx = alloc_nfs_open_dir_context(inode, cred);
 	if (IS_ERR(ctx)) {
 		res = PTR_ERR(ctx);
 		goto out;
@@ -323,7 +324,6 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri
 {
 	loff_t diff = desc->file->f_pos - desc->current_index;
 	unsigned int index;
-	struct nfs_open_dir_context *ctx = desc->file->private_data;
 
 	if (diff < 0)
 		goto out_eof;
@@ -336,7 +336,6 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri
 	index = (unsigned int)diff;
 	*desc->dir_cookie = array->array[index].cookie;
 	desc->cache_entry_index = index;
-	ctx->duped = 0;
 	return 0;
 out_eof:
 	desc->eof = 1;
@@ -349,14 +348,34 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
 	int i;
 	loff_t new_pos;
 	int status = -EAGAIN;
-	struct nfs_open_dir_context *ctx = desc->file->private_data;
 
 	for (i = 0; i < array->size; i++) {
 		if (array->array[i].cookie == *desc->dir_cookie) {
+			struct nfs_inode *nfsi = NFS_I(desc->file->f_path.dentry->d_inode);
+			struct nfs_open_dir_context *ctx = desc->file->private_data;
+
 			new_pos = desc->current_index + i;
-			if (new_pos < desc->file->f_pos) {
+			if (ctx->attr_gencount != nfsi->attr_gencount
+			    || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))) {
+				ctx->duped = 0;
+				ctx->attr_gencount = nfsi->attr_gencount;
+			} else if (new_pos < desc->file->f_pos) {
+				if (ctx->duped > 0
+				    && ctx->dup_cookie == *desc->dir_cookie) {
+					if (printk_ratelimit()) {
+						pr_notice("NFS: directory %s/%s contains a readdir loop."
+								"Please contact your server vendor.  "
+								"The file: %s has duplicate cookie %llu\n",
+							desc->file->f_dentry->d_parent->d_name.name,
+							desc->file->f_dentry->d_name.name,
+							array->array[i].string.name,
+							*desc->dir_cookie);
+					}
+					status = -ELOOP;
+					goto out;
+				}
 				ctx->dup_cookie = *desc->dir_cookie;
-				ctx->duped = 1;
+				ctx->duped = -1;
 			}
 			desc->file->f_pos = new_pos;
 			desc->cache_entry_index = i;
@@ -368,6 +387,7 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
 		if (*desc->dir_cookie == array->last_cookie)
 			desc->eof = 1;
 	}
+out:
 	return status;
 }
 
@@ -740,19 +760,6 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
 	struct nfs_cache_array *array = NULL;
 	struct nfs_open_dir_context *ctx = file->private_data;
 
-	if (ctx->duped != 0 && ctx->dup_cookie == *desc->dir_cookie) {
-		if (printk_ratelimit()) {
-			pr_notice("NFS: directory %s/%s contains a readdir loop. "
-				"Please contact your server vendor.  "
-				"Offending cookie: %llu\n",
-				file->f_dentry->d_parent->d_name.name,
-				file->f_dentry->d_name.name,
-				*desc->dir_cookie);
-		}
-		res = -ELOOP;
-		goto out;
-	}
-
 	array = nfs_readdir_get_array(desc->page);
 	if (IS_ERR(array)) {
 		res = PTR_ERR(array);
@@ -774,6 +781,8 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
 			*desc->dir_cookie = array->array[i+1].cookie;
 		else
 			*desc->dir_cookie = array->last_cookie;
+		if (ctx->duped != 0)
+			ctx->duped = 1;
 	}
 	if (array->eof_index >= 0)
 		desc->eof = 1;
@@ -805,6 +814,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
 	struct page *page = NULL;
 	int status;
 	struct inode *inode = desc->file->f_path.dentry->d_inode;
+	struct nfs_open_dir_context *ctx = desc->file->private_data;
 
 	dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n",
 			(unsigned long long)*desc->dir_cookie);
@@ -818,6 +828,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
 	desc->page_index = 0;
 	desc->last_cookie = *desc->dir_cookie;
 	desc->page = page;
+	ctx->duped = 0;
 
 	status = nfs_readdir_xdr_to_array(desc, page, inode);
 	if (status < 0)
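
The dir.c changes above relocate readdir loop detection into the cookie search: a cookie that reappears while f_pos moves backwards, with the directory's attr_gencount unchanged, is reported as a server-side cookie loop (-ELOOP). A much-simplified standalone sketch of that idea (plain C, invented cookie stream, none of the nfs_readdir machinery):

/* Sketch of duplicate-cookie loop detection as in nfs/dir.c: remember
 * the first repeated cookie; seeing it repeat again while moving
 * backwards means the server's cookie sequence loops. Values invented.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t cookies[] = { 3, 7, 11, 7, 11, 7 };	/* looping stream */
	uint64_t dup_cookie = 0;
	int duped = 0;		/* simplified: 0 clean, 1 rewind seen */
	uint64_t last = 0;

	for (int i = 0; i < 6; i++) {
		if (cookies[i] <= last) {		/* moved backwards */
			if (duped && dup_cookie == cookies[i]) {
				printf("readdir loop on cookie %llu: ELOOP\n",
				       (unsigned long long)cookies[i]);
				return 1;
			}
			dup_cookie = cookies[i];
			duped = 1;
		}
		last = cookies[i];
	}
	return 0;
}
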
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index b35d25b98da6..1940f1a56a5f 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -53,7 +53,7 @@
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 
 #include "internal.h"
 #include "iostat.h"
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 2a55347a2daa..ab12913dd473 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -277,6 +277,9 @@ extern void nfs_sb_deactive(struct super_block *sb);
 extern char *nfs_path(char **p, struct dentry *dentry,
		      char *buffer, ssize_t buflen);
 extern struct vfsmount *nfs_d_automount(struct path *path);
+#ifdef CONFIG_NFS_V4
+rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *);
+#endif
 
 /* getroot.c */
 extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *,
@@ -288,12 +291,22 @@ extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *,
 extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh);
 #endif
 
+struct nfs_pageio_descriptor;
 /* read.c */
 extern int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
			     const struct rpc_call_ops *call_ops);
 extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
+extern int nfs_generic_pagein(struct nfs_pageio_descriptor *desc,
+			      struct list_head *head);
+
+extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio);
+extern void nfs_readdata_release(struct nfs_read_data *rdata);
 
 /* write.c */
+extern int nfs_generic_flush(struct nfs_pageio_descriptor *desc,
+			     struct list_head *head);
+extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio);
+extern void nfs_writedata_release(struct nfs_write_data *wdata);
 extern void nfs_commit_free(struct nfs_write_data *p);
 extern int nfs_initiate_write(struct nfs_write_data *data,
			      struct rpc_clnt *clnt,
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 1f063bacd285..8102391bb374 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -119,7 +119,7 @@ Elong:
 }
 
 #ifdef CONFIG_NFS_V4
-static rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors)
+rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors)
 {
 	struct gss_api_mech *mech;
 	struct xdr_netobj oid;
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index e49e73107e62..7ef23979896d 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -415,7 +415,7 @@ fail:
 }
 
 int nfs3_proc_set_default_acl(struct inode *dir, struct inode *inode,
-		mode_t mode)
+		umode_t mode)
 {
 	struct posix_acl *dfacl, *acl;
 	int error = 0;
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 38053d823eb0..85f1690ca08c 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -316,7 +316,7 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
		 int flags, struct nfs_open_context *ctx)
 {
 	struct nfs3_createdata *data;
-	mode_t mode = sattr->ia_mode;
+	umode_t mode = sattr->ia_mode;
 	int status = -ENOMEM;
 
 	dprintk("NFS call  create %s\n", dentry->d_name.name);
@@ -562,7 +562,7 @@ static int
 nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
 {
 	struct nfs3_createdata *data;
-	int mode = sattr->ia_mode;
+	umode_t mode = sattr->ia_mode;
 	int status = -ENOMEM;
 
 	dprintk("NFS call  mkdir %s\n", dentry->d_name.name);
@@ -681,7 +681,7 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
		dev_t rdev)
 {
 	struct nfs3_createdata *data;
-	mode_t mode = sattr->ia_mode;
+	umode_t mode = sattr->ia_mode;
 	int status = -ENOMEM;
 
 	dprintk("NFS call  mknod %s %u:%u\n", dentry->d_name.name,
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index b788f2eb1ba0..1ec1a85fa71c 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -48,6 +48,7 @@ enum nfs4_client_state {
 	NFS4CLNT_SESSION_RESET,
 	NFS4CLNT_RECALL_SLOT,
 	NFS4CLNT_LEASE_CONFIRM,
+	NFS4CLNT_SERVER_SCOPE_MISMATCH,
 };
 
 enum nfs4_session_state {
@@ -66,6 +67,8 @@ struct nfs4_minor_version_ops {
			int cache_reply);
 	int	(*validate_stateid)(struct nfs_delegation *,
			const nfs4_stateid *);
+	int	(*find_root_sec)(struct nfs_server *, struct nfs_fh *,
+			struct nfs_fsinfo *);
 	const struct nfs4_state_recovery_ops *reboot_recovery_ops;
 	const struct nfs4_state_recovery_ops *nograce_recovery_ops;
 	const struct nfs4_state_maintenance_ops *state_renewal_ops;
@@ -315,7 +318,7 @@ extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[];
 extern const u32 nfs4_fattr_bitmap[2];
 extern const u32 nfs4_statfs_bitmap[2];
 extern const u32 nfs4_pathconf_bitmap[2];
-extern const u32 nfs4_fsinfo_bitmap[2];
+extern const u32 nfs4_fsinfo_bitmap[3];
 extern const u32 nfs4_fs_locations_bitmap[2];
 
 /* nfs4renewd.c */
@@ -349,6 +352,8 @@ extern void nfs4_schedule_state_manager(struct nfs_client *);
 extern void nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *);
 extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags);
 extern void nfs41_handle_recall_slot(struct nfs_client *clp);
+extern void nfs41_handle_server_scope(struct nfs_client *,
+				      struct server_scope **);
 extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
 extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
 extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t, pid_t);
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index f9d03abcd04c..e8915d4840ad 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -170,7 +170,7 @@ filelayout_set_layoutcommit(struct nfs_write_data *wdata)
 
 	pnfs_set_layoutcommit(wdata);
 	dprintk("%s ionde %lu pls_end_pos %lu\n", __func__, wdata->inode->i_ino,
-		(unsigned long) wdata->lseg->pls_end_pos);
+		(unsigned long) NFS_I(wdata->inode)->layout->plh_lwb);
 }
 
 /*
@@ -334,6 +334,9 @@ filelayout_read_pagelist(struct nfs_read_data *data)
		__func__, data->inode->i_ino,
		data->args.pgbase, (size_t)data->args.count, offset);
 
+	if (test_bit(NFS_DEVICEID_INVALID, &FILELAYOUT_DEVID_NODE(lseg)->flags))
+		return PNFS_NOT_ATTEMPTED;
+
 	/* Retrieve the correct rpc_client for the byte range */
 	j = nfs4_fl_calc_j_index(lseg, offset);
 	idx = nfs4_fl_calc_ds_index(lseg, j);
@@ -344,8 +347,7 @@ filelayout_read_pagelist(struct nfs_read_data *data)
		set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
		return PNFS_NOT_ATTEMPTED;
	}
-	dprintk("%s USE DS:ip %x %hu\n", __func__,
-		ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
+	dprintk("%s USE DS: %s\n", __func__, ds->ds_remotestr);
 
	/* No multipath support. Use first DS */
	data->ds_clp = ds->ds_clp;
@@ -374,6 +376,9 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync)
	struct nfs_fh *fh;
	int status;
 
+	if (test_bit(NFS_DEVICEID_INVALID, &FILELAYOUT_DEVID_NODE(lseg)->flags))
+		return PNFS_NOT_ATTEMPTED;
+
	/* Retrieve the correct rpc_client for the byte range */
	j = nfs4_fl_calc_j_index(lseg, offset);
	idx = nfs4_fl_calc_ds_index(lseg, j);
@@ -384,9 +389,9 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync)
		set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
		return PNFS_NOT_ATTEMPTED;
	}
-	dprintk("%s ino %lu sync %d req %Zu@%llu DS:%x:%hu\n", __func__,
+	dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s\n", __func__,
		data->inode->i_ino, sync, (size_t) data->args.count, offset,
-		ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
+		ds->ds_remotestr);
 
	data->write_done_cb = filelayout_write_done_cb;
	data->ds_clp = ds->ds_clp;
@@ -428,6 +433,14 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
 
	dprintk("--> %s\n", __func__);
 
+	/* FIXME: remove this check when layout segment support is added */
+	if (lgr->range.offset != 0 ||
+	    lgr->range.length != NFS4_MAX_UINT64) {
+		dprintk("%s Only whole file layouts supported. Use MDS i/o\n",
+			__func__);
+		goto out;
+	}
+
	if (fl->pattern_offset > lgr->range.offset) {
		dprintk("%s pattern_offset %lld too large\n",
			__func__, fl->pattern_offset);
@@ -449,6 +462,10 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
		goto out;
	} else
		dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
+	/* Found deviceid is being reaped */
+	if (test_bit(NFS_DEVICEID_INVALID, &dsaddr->id_node.flags))
+		goto out_put;
+
	fl->dsaddr = dsaddr;
 
	if (fl->first_stripe_index < 0 ||
@@ -659,7 +676,7 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
 * return true  : coalesce page
 * return false : don't coalesce page
 */
-bool
+static bool
 filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
		   struct nfs_page *req)
 {
@@ -670,8 +687,6 @@ filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
	    !nfs_generic_pg_test(pgio, prev, req))
		return false;
 
-	if (!pgio->pg_lseg)
-		return 1;
	p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT;
	r_stripe = (u64)req->wb_index << PAGE_CACHE_SHIFT;
	stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit;
@@ -682,6 +697,52 @@ filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
	return (p_stripe == r_stripe);
 }
 
+void
+filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio,
+			struct nfs_page *req)
+{
+	BUG_ON(pgio->pg_lseg != NULL);
+
+	pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
+					   req->wb_context,
+					   0,
+					   NFS4_MAX_UINT64,
+					   IOMODE_READ,
+					   GFP_KERNEL);
+	/* If no lseg, fall back to read through mds */
+	if (pgio->pg_lseg == NULL)
+		nfs_pageio_reset_read_mds(pgio);
+}
+
+void
+filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio,
+			 struct nfs_page *req)
+{
+	BUG_ON(pgio->pg_lseg != NULL);
+
+	pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
+					   req->wb_context,
+					   0,
+					   NFS4_MAX_UINT64,
+					   IOMODE_RW,
+					   GFP_NOFS);
+	/* If no lseg, fall back to write through mds */
+	if (pgio->pg_lseg == NULL)
+		nfs_pageio_reset_write_mds(pgio);
+}
+
+static const struct nfs_pageio_ops filelayout_pg_read_ops = {
+	.pg_init = filelayout_pg_init_read,
+	.pg_test = filelayout_pg_test,
+	.pg_doio = pnfs_generic_pg_readpages,
+};
+
+static const struct nfs_pageio_ops filelayout_pg_write_ops = {
+	.pg_init = filelayout_pg_init_write,
+	.pg_test = filelayout_pg_test,
+	.pg_doio = pnfs_generic_pg_writepages,
+};
+
 static bool filelayout_mark_pnfs_commit(struct pnfs_layout_segment *lseg)
 {
	return !FILELAYOUT_LSEG(lseg)->commit_through_mds;
@@ -879,7 +940,8 @@ static struct pnfs_layoutdriver_type filelayout_type = {
	.owner			= THIS_MODULE,
	.alloc_lseg		= filelayout_alloc_lseg,
	.free_lseg		= filelayout_free_lseg,
-	.pg_test		= filelayout_pg_test,
+	.pg_read_ops		= &filelayout_pg_read_ops,
+	.pg_write_ops		= &filelayout_pg_write_ops,
	.mark_pnfs_commit	= filelayout_mark_pnfs_commit,
	.choose_commit_list	= filelayout_choose_commit_list,
	.commit_pagelist	= filelayout_commit_pagelist,
@@ -902,5 +964,7 @@ static void __exit nfs4filelayout_exit(void)
	pnfs_unregister_layoutdriver(&filelayout_type);
 }
 
+MODULE_ALIAS("nfs-layouttype4-1");
+
 module_init(nfs4filelayout_init);
 module_exit(nfs4filelayout_exit);
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index cebe01e3795e..2e42284253fa 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -47,10 +47,17 @@ enum stripetype4 {
 };
 
 /* Individual ip address */
+struct nfs4_pnfs_ds_addr {
+	struct sockaddr_storage	da_addr;
+	size_t			da_addrlen;
+	struct list_head	da_node;  /* nfs4_pnfs_dev_hlist dev_dslist */
+	char			*da_remotestr;	/* human readable addr+port */
+};
+
 struct nfs4_pnfs_ds {
 	struct list_head	ds_node;  /* nfs4_pnfs_dev_hlist dev_dslist */
-	u32			ds_ip_addr;
-	u32			ds_port;
+	char			*ds_remotestr;	/* comma sep list of addrs */
+	struct list_head	ds_addrs;
 	struct nfs_client	*ds_clp;
 	atomic_t		ds_count;
 };
@@ -89,6 +96,12 @@ FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg)
			    generic_hdr);
 }
 
+static inline struct nfs4_deviceid_node *
+FILELAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg)
+{
+	return &FILELAYOUT_LSEG(lseg)->dsaddr->id_node;
+}
+
 extern struct nfs_fh *
 nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j);
 
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index 3b7bf1377264..ed388aae9689 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -56,54 +56,139 @@ print_ds(struct nfs4_pnfs_ds *ds)
56 printk("%s NULL device\n", __func__); 56 printk("%s NULL device\n", __func__);
57 return; 57 return;
58 } 58 }
59 printk(" ip_addr %x port %hu\n" 59 printk(" ds %s\n"
60 " ref count %d\n" 60 " ref count %d\n"
61 " client %p\n" 61 " client %p\n"
62 " cl_exchange_flags %x\n", 62 " cl_exchange_flags %x\n",
63 ntohl(ds->ds_ip_addr), ntohs(ds->ds_port), 63 ds->ds_remotestr,
64 atomic_read(&ds->ds_count), ds->ds_clp, 64 atomic_read(&ds->ds_count), ds->ds_clp,
65 ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0); 65 ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0);
66} 66}
67 67
68/* nfs4_ds_cache_lock is held */ 68static bool
69static struct nfs4_pnfs_ds * 69same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2)
70_data_server_lookup_locked(u32 ip_addr, u32 port)
71{ 70{
72 struct nfs4_pnfs_ds *ds; 71 struct sockaddr_in *a, *b;
72 struct sockaddr_in6 *a6, *b6;
73
74 if (addr1->sa_family != addr2->sa_family)
75 return false;
76
77 switch (addr1->sa_family) {
78 case AF_INET:
79 a = (struct sockaddr_in *)addr1;
80 b = (struct sockaddr_in *)addr2;
81
82 if (a->sin_addr.s_addr == b->sin_addr.s_addr &&
83 a->sin_port == b->sin_port)
84 return true;
85 break;
86
87 case AF_INET6:
88 a6 = (struct sockaddr_in6 *)addr1;
89 b6 = (struct sockaddr_in6 *)addr2;
90
91 /* LINKLOCAL addresses must have matching scope_id */
92 if (ipv6_addr_scope(&a6->sin6_addr) ==
93 IPV6_ADDR_SCOPE_LINKLOCAL &&
94 a6->sin6_scope_id != b6->sin6_scope_id)
95 return false;
96
97 if (ipv6_addr_equal(&a6->sin6_addr, &b6->sin6_addr) &&
98 a6->sin6_port == b6->sin6_port)
99 return true;
100 break;
101
102 default:
103 dprintk("%s: unhandled address family: %u\n",
104 __func__, addr1->sa_family);
105 return false;
106 }
73 107
74 dprintk("_data_server_lookup: ip_addr=%x port=%hu\n", 108 return false;
75 ntohl(ip_addr), ntohs(port)); 109}
76 110
77 list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) { 111/*
78 if (ds->ds_ip_addr == ip_addr && 112 * Lookup DS by addresses. The first matching address returns true.
79 ds->ds_port == port) { 113 * nfs4_ds_cache_lock is held
80 return ds; 114 */
115static struct nfs4_pnfs_ds *
116_data_server_lookup_locked(struct list_head *dsaddrs)
117{
118 struct nfs4_pnfs_ds *ds;
119 struct nfs4_pnfs_ds_addr *da1, *da2;
120
121 list_for_each_entry(da1, dsaddrs, da_node) {
122 list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) {
123 list_for_each_entry(da2, &ds->ds_addrs, da_node) {
124 if (same_sockaddr(
125 (struct sockaddr *)&da1->da_addr,
126 (struct sockaddr *)&da2->da_addr))
127 return ds;
128 }
81 } 129 }
82 } 130 }
83 return NULL; 131 return NULL;
84} 132}
85 133
86/* 134/*
135 * Compare two lists of addresses.
136 */
137static bool
138_data_server_match_all_addrs_locked(struct list_head *dsaddrs1,
139 struct list_head *dsaddrs2)
140{
141 struct nfs4_pnfs_ds_addr *da1, *da2;
142 size_t count1 = 0,
143 count2 = 0;
144
145 list_for_each_entry(da1, dsaddrs1, da_node)
146 count1++;
147
148 list_for_each_entry(da2, dsaddrs2, da_node) {
149 bool found = false;
150 count2++;
151 list_for_each_entry(da1, dsaddrs1, da_node) {
152 if (same_sockaddr((struct sockaddr *)&da1->da_addr,
153 (struct sockaddr *)&da2->da_addr)) {
154 found = true;
155 break;
156 }
157 }
158 if (!found)
159 return false;
160 }
161
162 return (count1 == count2);
163}
164
165/*
87 * Create an rpc connection to the nfs4_pnfs_ds data server 166 * Create an rpc connection to the nfs4_pnfs_ds data server
88 * Currently only support IPv4 167 * Currently only supports IPv4 and IPv6 addresses
89 */ 168 */
90static int 169static int
91nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds) 170nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)
92{ 171{
93 struct nfs_client *clp; 172 struct nfs_client *clp = ERR_PTR(-EIO);
94 struct sockaddr_in sin; 173 struct nfs4_pnfs_ds_addr *da;
95 int status = 0; 174 int status = 0;
96 175
97 dprintk("--> %s ip:port %x:%hu au_flavor %d\n", __func__, 176 dprintk("--> %s DS %s au_flavor %d\n", __func__, ds->ds_remotestr,
98 ntohl(ds->ds_ip_addr), ntohs(ds->ds_port),
99 mds_srv->nfs_client->cl_rpcclient->cl_auth->au_flavor); 177 mds_srv->nfs_client->cl_rpcclient->cl_auth->au_flavor);
100 178
101 sin.sin_family = AF_INET; 179 BUG_ON(list_empty(&ds->ds_addrs));
102 sin.sin_addr.s_addr = ds->ds_ip_addr; 180
103 sin.sin_port = ds->ds_port; 181 list_for_each_entry(da, &ds->ds_addrs, da_node) {
182 dprintk("%s: DS %s: trying address %s\n",
183 __func__, ds->ds_remotestr, da->da_remotestr);
184
185 clp = nfs4_set_ds_client(mds_srv->nfs_client,
186 (struct sockaddr *)&da->da_addr,
187 da->da_addrlen, IPPROTO_TCP);
188 if (!IS_ERR(clp))
189 break;
190 }
104 191
105 clp = nfs4_set_ds_client(mds_srv->nfs_client, (struct sockaddr *)&sin,
106 sizeof(sin), IPPROTO_TCP);
107 if (IS_ERR(clp)) { 192 if (IS_ERR(clp)) {
108 status = PTR_ERR(clp); 193 status = PTR_ERR(clp);
109 goto out; 194 goto out;
@@ -115,8 +200,8 @@ nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)
115 goto out_put; 200 goto out_put;
116 } 201 }
117 ds->ds_clp = clp; 202 ds->ds_clp = clp;
118 dprintk("%s [existing] ip=%x, port=%hu\n", __func__, 203 dprintk("%s [existing] server=%s\n", __func__,
119 ntohl(ds->ds_ip_addr), ntohs(ds->ds_port)); 204 ds->ds_remotestr);
120 goto out; 205 goto out;
121 } 206 }
122 207
@@ -135,8 +220,7 @@ nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)
135 goto out_put; 220 goto out_put;
136 221
137 ds->ds_clp = clp; 222 ds->ds_clp = clp;
138 dprintk("%s [new] ip=%x, port=%hu\n", __func__, ntohl(ds->ds_ip_addr), 223 dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr);
139 ntohs(ds->ds_port));
140out: 224out:
141 return status; 225 return status;
142out_put: 226out_put:
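nfs4_ds_connect() now walks the DS's address list and keeps the first address for which nfs4_set_ds_client() succeeds, which is why clp is pre-seeded with ERR_PTR(-EIO): if the loop falls through without a single success, the error path still reports a sensible status. The first-success pattern in a self-contained form (all names below are illustrative stand-ins):

#include <errno.h>
#include <stddef.h>

struct ds_addr { int id; };
struct ds_client { int id; };

/* Stand-in for nfs4_set_ds_client(); returns NULL and sets *err on
 * failure. Illustrative prototype only. */
struct ds_client *try_connect(const struct ds_addr *a, int *err);

int connect_first_success(const struct ds_addr *addrs, size_t n,
                          struct ds_client **out)
{
        int err = -EIO; /* reported if every candidate fails */

        for (size_t i = 0; i < n; i++) {
                struct ds_client *clp = try_connect(&addrs[i], &err);
                if (clp) {              /* first success wins */
                        *out = clp;
                        return 0;
                }
        }
        return err;
}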
@@ -147,12 +231,25 @@ out_put:
147static void 231static void
148destroy_ds(struct nfs4_pnfs_ds *ds) 232destroy_ds(struct nfs4_pnfs_ds *ds)
149{ 233{
234 struct nfs4_pnfs_ds_addr *da;
235
150 dprintk("--> %s\n", __func__); 236 dprintk("--> %s\n", __func__);
151 ifdebug(FACILITY) 237 ifdebug(FACILITY)
152 print_ds(ds); 238 print_ds(ds);
153 239
154 if (ds->ds_clp) 240 if (ds->ds_clp)
155 nfs_put_client(ds->ds_clp); 241 nfs_put_client(ds->ds_clp);
242
243 while (!list_empty(&ds->ds_addrs)) {
244 da = list_first_entry(&ds->ds_addrs,
245 struct nfs4_pnfs_ds_addr,
246 da_node);
247 list_del_init(&da->da_node);
248 kfree(da->da_remotestr);
249 kfree(da);
250 }
251
252 kfree(ds->ds_remotestr);
156 kfree(ds); 253 kfree(ds);
157} 254}
158 255
@@ -179,31 +276,96 @@ nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
179 kfree(dsaddr); 276 kfree(dsaddr);
180} 277}
181 278
279/*
280 * Create a string with a human readable address and port to avoid
 281 * complicated setup around many dprintks.
282 */
283static char *
284nfs4_pnfs_remotestr(struct list_head *dsaddrs, gfp_t gfp_flags)
285{
286 struct nfs4_pnfs_ds_addr *da;
287 char *remotestr;
288 size_t len;
289 char *p;
290
 291 len = 3; /* '{', '}' and terminating NUL */
292 list_for_each_entry(da, dsaddrs, da_node) {
293 len += strlen(da->da_remotestr) + 1; /* string plus comma */
294 }
295
296 remotestr = kzalloc(len, gfp_flags);
297 if (!remotestr)
298 return NULL;
299
300 p = remotestr;
301 *(p++) = '{';
302 len--;
303 list_for_each_entry(da, dsaddrs, da_node) {
304 size_t ll = strlen(da->da_remotestr);
305
306 if (ll > len)
307 goto out_err;
308
309 memcpy(p, da->da_remotestr, ll);
310 p += ll;
311 len -= ll;
312
313 if (len < 1)
314 goto out_err;
315 (*p++) = ',';
316 len--;
317 }
318 if (len < 2)
319 goto out_err;
320 *(p++) = '}';
321 *p = '\0';
322 return remotestr;
323out_err:
324 kfree(remotestr);
325 return NULL;
326}
327
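A worked example of what nfs4_pnfs_remotestr() produces, with hypothetical addresses: for "203.0.113.5:2049" (16 chars) and "[2001:db8::1]:2049" (18 chars), len = 3 + (16 + 1) + (18 + 1) = 39, and the result is "{203.0.113.5:2049,[2001:db8::1]:2049,}"; note that every entry, including the last, is followed by a comma. A compilable userspace rendition:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Userspace rendition of nfs4_pnfs_remotestr(); the addresses are
 * hypothetical examples. */
int main(void)
{
        const char *addrs[] = { "203.0.113.5:2049", "[2001:db8::1]:2049" };
        size_t n = sizeof(addrs) / sizeof(addrs[0]);
        size_t len = 3;                 /* '{', '}' and the trailing NUL */
        char *s, *p;

        for (size_t i = 0; i < n; i++)
                len += strlen(addrs[i]) + 1;    /* entry plus ',' */

        s = p = malloc(len);
        if (!s)
                return 1;
        *p++ = '{';
        for (size_t i = 0; i < n; i++)
                p += sprintf(p, "%s,", addrs[i]);
        *p++ = '}';
        *p = '\0';
        puts(s);        /* {203.0.113.5:2049,[2001:db8::1]:2049,} */
        free(s);
        return 0;
}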
182static struct nfs4_pnfs_ds * 328static struct nfs4_pnfs_ds *
183nfs4_pnfs_ds_add(struct inode *inode, u32 ip_addr, u32 port, gfp_t gfp_flags) 329nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags)
184{ 330{
185 struct nfs4_pnfs_ds *tmp_ds, *ds; 331 struct nfs4_pnfs_ds *tmp_ds, *ds = NULL;
332 char *remotestr;
186 333
187 ds = kzalloc(sizeof(*tmp_ds), gfp_flags); 334 if (list_empty(dsaddrs)) {
335 dprintk("%s: no addresses defined\n", __func__);
336 goto out;
337 }
338
339 ds = kzalloc(sizeof(*ds), gfp_flags);
188 if (!ds) 340 if (!ds)
189 goto out; 341 goto out;
190 342
 343 /* this is only used for debugging, so it's ok if it's NULL */
344 remotestr = nfs4_pnfs_remotestr(dsaddrs, gfp_flags);
345
191 spin_lock(&nfs4_ds_cache_lock); 346 spin_lock(&nfs4_ds_cache_lock);
192 tmp_ds = _data_server_lookup_locked(ip_addr, port); 347 tmp_ds = _data_server_lookup_locked(dsaddrs);
193 if (tmp_ds == NULL) { 348 if (tmp_ds == NULL) {
194 ds->ds_ip_addr = ip_addr; 349 INIT_LIST_HEAD(&ds->ds_addrs);
195 ds->ds_port = port; 350 list_splice_init(dsaddrs, &ds->ds_addrs);
351 ds->ds_remotestr = remotestr;
196 atomic_set(&ds->ds_count, 1); 352 atomic_set(&ds->ds_count, 1);
197 INIT_LIST_HEAD(&ds->ds_node); 353 INIT_LIST_HEAD(&ds->ds_node);
198 ds->ds_clp = NULL; 354 ds->ds_clp = NULL;
199 list_add(&ds->ds_node, &nfs4_data_server_cache); 355 list_add(&ds->ds_node, &nfs4_data_server_cache);
200 dprintk("%s add new data server ip 0x%x\n", __func__, 356 dprintk("%s add new data server %s\n", __func__,
201 ds->ds_ip_addr); 357 ds->ds_remotestr);
202 } else { 358 } else {
359 if (!_data_server_match_all_addrs_locked(&tmp_ds->ds_addrs,
360 dsaddrs)) {
361 dprintk("%s: multipath address mismatch: %s != %s",
362 __func__, tmp_ds->ds_remotestr, remotestr);
363 }
364 kfree(remotestr);
203 kfree(ds); 365 kfree(ds);
204 atomic_inc(&tmp_ds->ds_count); 366 atomic_inc(&tmp_ds->ds_count);
205 dprintk("%s data server found ip 0x%x, inc'ed ds_count to %d\n", 367 dprintk("%s data server %s found, inc'ed ds_count to %d\n",
206 __func__, tmp_ds->ds_ip_addr, 368 __func__, tmp_ds->ds_remotestr,
207 atomic_read(&tmp_ds->ds_count)); 369 atomic_read(&tmp_ds->ds_count));
208 ds = tmp_ds; 370 ds = tmp_ds;
209 } 371 }
@@ -213,18 +375,22 @@ out:
213} 375}
214 376
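nfs4_pnfs_ds_add() follows the usual alloc-outside-the-lock idiom: the new entry is allocated speculatively, and under nfs4_ds_cache_lock it is either published into the cache or discarded in favor of the existing entry, whose refcount is bumped instead. A self-contained sketch of that idiom, with a pthread mutex standing in for the kernel spinlock and all names illustrative:

#include <pthread.h>
#include <stdlib.h>

struct entry {
        struct entry *next;
        int key;
        int refcount;
};

static struct entry *cache;
static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;

/* Insert newe, or reuse a cached entry with the same key. */
struct entry *cache_get_or_add(struct entry *newe)
{
        struct entry *e;

        pthread_mutex_lock(&cache_lock);
        for (e = cache; e; e = e->next)
                if (e->key == newe->key)
                        break;
        if (!e) {
                newe->next = cache;     /* publish the new entry */
                cache = newe;
                e = newe;
        } else {
                e->refcount++;          /* reuse the cached entry */
                free(newe);             /* drop the speculative alloc */
        }
        pthread_mutex_unlock(&cache_lock);
        return e;
}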
215/* 377/*
 216 * Currently only support ipv4, and one multi-path address. 378 * Currently only supports IPv4, IPv6, and one multi-path address.
217 */ 379 */
218static struct nfs4_pnfs_ds * 380static struct nfs4_pnfs_ds_addr *
219decode_and_add_ds(struct xdr_stream *streamp, struct inode *inode, gfp_t gfp_flags) 381decode_ds_addr(struct xdr_stream *streamp, gfp_t gfp_flags)
220{ 382{
221 struct nfs4_pnfs_ds *ds = NULL; 383 struct nfs4_pnfs_ds_addr *da = NULL;
222 char *buf; 384 char *buf, *portstr;
223 const char *ipend, *pstr; 385 u32 port;
224 u32 ip_addr, port; 386 int nlen, rlen;
225 int nlen, rlen, i;
226 int tmp[2]; 387 int tmp[2];
227 __be32 *p; 388 __be32 *p;
389 char *netid, *match_netid;
390 size_t len, match_netid_len;
391 char *startsep = "";
392 char *endsep = "";
393
228 394
229 /* r_netid */ 395 /* r_netid */
230 p = xdr_inline_decode(streamp, 4); 396 p = xdr_inline_decode(streamp, 4);
@@ -236,64 +402,123 @@ decode_and_add_ds(struct xdr_stream *streamp, struct inode *inode, gfp_t gfp_fla
236 if (unlikely(!p)) 402 if (unlikely(!p))
237 goto out_err; 403 goto out_err;
238 404
239 /* Check that netid is "tcp" */ 405 netid = kmalloc(nlen+1, gfp_flags);
240 if (nlen != 3 || memcmp((char *)p, "tcp", 3)) { 406 if (unlikely(!netid))
241 dprintk("%s: ERROR: non ipv4 TCP r_netid\n", __func__);
242 goto out_err; 407 goto out_err;
243 }
244 408
245 /* r_addr */ 409 netid[nlen] = '\0';
410 memcpy(netid, p, nlen);
411
412 /* r_addr: ip/ip6addr with port in dec octets - see RFC 5665 */
246 p = xdr_inline_decode(streamp, 4); 413 p = xdr_inline_decode(streamp, 4);
247 if (unlikely(!p)) 414 if (unlikely(!p))
248 goto out_err; 415 goto out_free_netid;
249 rlen = be32_to_cpup(p); 416 rlen = be32_to_cpup(p);
250 417
251 p = xdr_inline_decode(streamp, rlen); 418 p = xdr_inline_decode(streamp, rlen);
252 if (unlikely(!p)) 419 if (unlikely(!p))
253 goto out_err; 420 goto out_free_netid;
254 421
255 /* ipv6 length plus port is legal */ 422 /* port is ".ABC.DEF", 8 chars max */
256 if (rlen > INET6_ADDRSTRLEN + 8) { 423 if (rlen > INET6_ADDRSTRLEN + IPV6_SCOPE_ID_LEN + 8) {
257 dprintk("%s: Invalid address, length %d\n", __func__, 424 dprintk("%s: Invalid address, length %d\n", __func__,
258 rlen); 425 rlen);
259 goto out_err; 426 goto out_free_netid;
260 } 427 }
261 buf = kmalloc(rlen + 1, gfp_flags); 428 buf = kmalloc(rlen + 1, gfp_flags);
262 if (!buf) { 429 if (!buf) {
263 dprintk("%s: Not enough memory\n", __func__); 430 dprintk("%s: Not enough memory\n", __func__);
264 goto out_err; 431 goto out_free_netid;
265 } 432 }
266 buf[rlen] = '\0'; 433 buf[rlen] = '\0';
267 memcpy(buf, p, rlen); 434 memcpy(buf, p, rlen);
268 435
269 /* replace the port dots with dashes for the in4_pton() delimiter*/ 436 /* replace port '.' with '-' */
270 for (i = 0; i < 2; i++) { 437 portstr = strrchr(buf, '.');
271 char *res = strrchr(buf, '.'); 438 if (!portstr) {
272 if (!res) { 439 dprintk("%s: Failed finding expected dot in port\n",
273 dprintk("%s: Failed finding expected dots in port\n", 440 __func__);
274 __func__); 441 goto out_free_buf;
275 goto out_free; 442 }
276 } 443 *portstr = '-';
277 *res = '-'; 444
445 /* find '.' between address and port */
446 portstr = strrchr(buf, '.');
447 if (!portstr) {
448 dprintk("%s: Failed finding expected dot between address and "
449 "port\n", __func__);
450 goto out_free_buf;
278 } 451 }
452 *portstr = '\0';
279 453
280 /* Currently only support ipv4 address */ 454 da = kzalloc(sizeof(*da), gfp_flags);
281 if (in4_pton(buf, rlen, (u8 *)&ip_addr, '-', &ipend) == 0) { 455 if (unlikely(!da))
282 dprintk("%s: Only ipv4 addresses supported\n", __func__); 456 goto out_free_buf;
283 goto out_free; 457
458 INIT_LIST_HEAD(&da->da_node);
459
460 if (!rpc_pton(buf, portstr-buf, (struct sockaddr *)&da->da_addr,
461 sizeof(da->da_addr))) {
462 dprintk("%s: error parsing address %s\n", __func__, buf);
463 goto out_free_da;
284 } 464 }
285 465
286 /* port */ 466 portstr++;
287 pstr = ipend; 467 sscanf(portstr, "%d-%d", &tmp[0], &tmp[1]);
288 sscanf(pstr, "-%d-%d", &tmp[0], &tmp[1]);
289 port = htons((tmp[0] << 8) | (tmp[1])); 468 port = htons((tmp[0] << 8) | (tmp[1]));
290 469
291 ds = nfs4_pnfs_ds_add(inode, ip_addr, port, gfp_flags); 470 switch (da->da_addr.ss_family) {
292 dprintk("%s: Decoded address and port %s\n", __func__, buf); 471 case AF_INET:
293out_free: 472 ((struct sockaddr_in *)&da->da_addr)->sin_port = port;
473 da->da_addrlen = sizeof(struct sockaddr_in);
474 match_netid = "tcp";
475 match_netid_len = 3;
476 break;
477
478 case AF_INET6:
479 ((struct sockaddr_in6 *)&da->da_addr)->sin6_port = port;
480 da->da_addrlen = sizeof(struct sockaddr_in6);
481 match_netid = "tcp6";
482 match_netid_len = 4;
483 startsep = "[";
484 endsep = "]";
485 break;
486
487 default:
488 dprintk("%s: unsupported address family: %u\n",
489 __func__, da->da_addr.ss_family);
490 goto out_free_da;
491 }
492
493 if (nlen != match_netid_len || strncmp(netid, match_netid, nlen)) {
494 dprintk("%s: ERROR: r_netid \"%s\" != \"%s\"\n",
495 __func__, netid, match_netid);
496 goto out_free_da;
497 }
498
499 /* save human readable address */
500 len = strlen(startsep) + strlen(buf) + strlen(endsep) + 7;
501 da->da_remotestr = kzalloc(len, gfp_flags);
502
503 /* NULL is ok, only used for dprintk */
504 if (da->da_remotestr)
505 snprintf(da->da_remotestr, len, "%s%s%s:%u", startsep,
506 buf, endsep, ntohs(port));
507
508 dprintk("%s: Parsed DS addr %s\n", __func__, da->da_remotestr);
294 kfree(buf); 509 kfree(buf);
510 kfree(netid);
511 return da;
512
513out_free_da:
514 kfree(da);
515out_free_buf:
516 dprintk("%s: Error parsing DS addr: %s\n", __func__, buf);
517 kfree(buf);
518out_free_netid:
519 kfree(netid);
295out_err: 520out_err:
296 return ds; 521 return NULL;
297} 522}
298 523
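decode_ds_addr() parses the RFC 5665 universal address form "address.p1.p2", where the last two dot-separated decimal octets encode the port as p1 * 256 + p2; for example, the hypothetical entry "10.1.2.3.8.1" means 10.1.2.3 port 8 * 256 + 1 = 2049. The dot juggling above, as a runnable sketch:

#include <stdio.h>
#include <string.h>

/* Split the trailing ".p1.p2" port octets off an RFC 5665 universal
 * address, as decode_ds_addr() does. The example value is hypothetical. */
int main(void)
{
        char buf[] = "10.1.2.3.8.1";    /* uaddr for 10.1.2.3:2049 */
        int hi, lo;
        char *dot;

        dot = strrchr(buf, '.');        /* last dot: low port octet */
        *dot = '-';
        dot = strrchr(buf, '.');        /* next dot: address/port split */
        *dot = '\0';
        sscanf(dot + 1, "%d-%d", &hi, &lo);
        printf("addr=%s port=%d\n", buf, (hi << 8) | lo);  /* 2049 */
        return 0;
}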
299/* Decode opaque device data and return the result */ 524/* Decode opaque device data and return the result */
@@ -310,6 +535,8 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
310 struct xdr_stream stream; 535 struct xdr_stream stream;
311 struct xdr_buf buf; 536 struct xdr_buf buf;
312 struct page *scratch; 537 struct page *scratch;
538 struct list_head dsaddrs;
539 struct nfs4_pnfs_ds_addr *da;
313 540
314 /* set up xdr stream */ 541 /* set up xdr stream */
315 scratch = alloc_page(gfp_flags); 542 scratch = alloc_page(gfp_flags);
@@ -386,6 +613,8 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
386 NFS_SERVER(ino)->nfs_client, 613 NFS_SERVER(ino)->nfs_client,
387 &pdev->dev_id); 614 &pdev->dev_id);
388 615
616 INIT_LIST_HEAD(&dsaddrs);
617
389 for (i = 0; i < dsaddr->ds_num; i++) { 618 for (i = 0; i < dsaddr->ds_num; i++) {
390 int j; 619 int j;
391 u32 mp_count; 620 u32 mp_count;
@@ -395,48 +624,43 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
395 goto out_err_free_deviceid; 624 goto out_err_free_deviceid;
396 625
397 mp_count = be32_to_cpup(p); /* multipath count */ 626 mp_count = be32_to_cpup(p); /* multipath count */
398 if (mp_count > 1) {
399 printk(KERN_WARNING
400 "%s: Multipath count %d not supported, "
401 "skipping all greater than 1\n", __func__,
402 mp_count);
403 }
404 for (j = 0; j < mp_count; j++) { 627 for (j = 0; j < mp_count; j++) {
405 if (j == 0) { 628 da = decode_ds_addr(&stream, gfp_flags);
406 dsaddr->ds_list[i] = decode_and_add_ds(&stream, 629 if (da)
407 ino, gfp_flags); 630 list_add_tail(&da->da_node, &dsaddrs);
408 if (dsaddr->ds_list[i] == NULL) 631 }
409 goto out_err_free_deviceid; 632 if (list_empty(&dsaddrs)) {
410 } else { 633 dprintk("%s: no suitable DS addresses found\n",
411 u32 len; 634 __func__);
412 /* skip extra multipath */ 635 goto out_err_free_deviceid;
413 636 }
414 /* read len, skip */ 637
415 p = xdr_inline_decode(&stream, 4); 638 dsaddr->ds_list[i] = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags);
416 if (unlikely(!p)) 639 if (!dsaddr->ds_list[i])
417 goto out_err_free_deviceid; 640 goto out_err_drain_dsaddrs;
418 len = be32_to_cpup(p); 641
419 642 /* If DS was already in cache, free ds addrs */
420 p = xdr_inline_decode(&stream, len); 643 while (!list_empty(&dsaddrs)) {
421 if (unlikely(!p)) 644 da = list_first_entry(&dsaddrs,
422 goto out_err_free_deviceid; 645 struct nfs4_pnfs_ds_addr,
423 646 da_node);
424 /* read len, skip */ 647 list_del_init(&da->da_node);
425 p = xdr_inline_decode(&stream, 4); 648 kfree(da->da_remotestr);
426 if (unlikely(!p)) 649 kfree(da);
427 goto out_err_free_deviceid;
428 len = be32_to_cpup(p);
429
430 p = xdr_inline_decode(&stream, len);
431 if (unlikely(!p))
432 goto out_err_free_deviceid;
433 }
434 } 650 }
435 } 651 }
436 652
437 __free_page(scratch); 653 __free_page(scratch);
438 return dsaddr; 654 return dsaddr;
439 655
656out_err_drain_dsaddrs:
657 while (!list_empty(&dsaddrs)) {
658 da = list_first_entry(&dsaddrs, struct nfs4_pnfs_ds_addr,
659 da_node);
660 list_del_init(&da->da_node);
661 kfree(da->da_remotestr);
662 kfree(da);
663 }
440out_err_free_deviceid: 664out_err_free_deviceid:
441 nfs4_fl_free_deviceid(dsaddr); 665 nfs4_fl_free_deviceid(dsaddr);
 442 /* stripe_indices was part of dsaddr */ 666
@@ -591,13 +815,13 @@ nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j)
591 815
592static void 816static void
593filelayout_mark_devid_negative(struct nfs4_file_layout_dsaddr *dsaddr, 817filelayout_mark_devid_negative(struct nfs4_file_layout_dsaddr *dsaddr,
594 int err, u32 ds_addr) 818 int err, const char *ds_remotestr)
595{ 819{
596 u32 *p = (u32 *)&dsaddr->id_node.deviceid; 820 u32 *p = (u32 *)&dsaddr->id_node.deviceid;
597 821
598 printk(KERN_ERR "NFS: data server %x connection error %d." 822 printk(KERN_ERR "NFS: data server %s connection error %d."
599 " Deviceid [%x%x%x%x] marked out of use.\n", 823 " Deviceid [%x%x%x%x] marked out of use.\n",
600 ds_addr, err, p[0], p[1], p[2], p[3]); 824 ds_remotestr, err, p[0], p[1], p[2], p[3]);
601 825
602 spin_lock(&nfs4_ds_cache_lock); 826 spin_lock(&nfs4_ds_cache_lock);
603 dsaddr->flags |= NFS4_DEVICE_ID_NEG_ENTRY; 827 dsaddr->flags |= NFS4_DEVICE_ID_NEG_ENTRY;
@@ -628,7 +852,7 @@ nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
628 err = nfs4_ds_connect(s, ds); 852 err = nfs4_ds_connect(s, ds);
629 if (err) { 853 if (err) {
630 filelayout_mark_devid_negative(dsaddr, err, 854 filelayout_mark_devid_negative(dsaddr, err,
631 ntohl(ds->ds_ip_addr)); 855 ds->ds_remotestr);
632 return NULL; 856 return NULL;
633 } 857 }
634 } 858 }
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 26bece8f3083..8c77039e7a81 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -80,7 +80,10 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
80static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, 80static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
81 struct nfs_fattr *fattr, struct iattr *sattr, 81 struct nfs_fattr *fattr, struct iattr *sattr,
82 struct nfs4_state *state); 82 struct nfs4_state *state);
83 83#ifdef CONFIG_NFS_V4_1
84static int nfs41_test_stateid(struct nfs_server *, struct nfs4_state *);
85static int nfs41_free_stateid(struct nfs_server *, struct nfs4_state *);
86#endif
84/* Prevent leaks of NFSv4 errors into userland */ 87/* Prevent leaks of NFSv4 errors into userland */
85static int nfs4_map_errors(int err) 88static int nfs4_map_errors(int err)
86{ 89{
@@ -137,12 +140,13 @@ const u32 nfs4_pathconf_bitmap[2] = {
137 0 140 0
138}; 141};
139 142
140const u32 nfs4_fsinfo_bitmap[2] = { FATTR4_WORD0_MAXFILESIZE 143const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE
141 | FATTR4_WORD0_MAXREAD 144 | FATTR4_WORD0_MAXREAD
142 | FATTR4_WORD0_MAXWRITE 145 | FATTR4_WORD0_MAXWRITE
143 | FATTR4_WORD0_LEASE_TIME, 146 | FATTR4_WORD0_LEASE_TIME,
144 FATTR4_WORD1_TIME_DELTA 147 FATTR4_WORD1_TIME_DELTA
145 | FATTR4_WORD1_FS_LAYOUT_TYPES 148 | FATTR4_WORD1_FS_LAYOUT_TYPES,
149 FATTR4_WORD2_LAYOUT_BLKSIZE
146}; 150};
147 151
148const u32 nfs4_fs_locations_bitmap[2] = { 152const u32 nfs4_fs_locations_bitmap[2] = {
@@ -1689,6 +1693,20 @@ static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *sta
1689 return ret; 1693 return ret;
1690} 1694}
1691 1695
1696#if defined(CONFIG_NFS_V4_1)
1697static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state)
1698{
1699 int status;
1700 struct nfs_server *server = NFS_SERVER(state->inode);
1701
1702 status = nfs41_test_stateid(server, state);
1703 if (status == NFS_OK)
1704 return 0;
1705 nfs41_free_stateid(server, state);
1706 return nfs4_open_expired(sp, state);
1707}
1708#endif
1709
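The v4.1 recovery shortcut above asks the server whether the open stateid is still valid before doing anything drastic: a successful TEST_STATEID means no recovery is needed at all, and only on failure is the stateid freed and the full open recovery path run. The control flow as a schematic (all names below are illustrative stand-ins, not the kernel's):

struct server;
struct state;

int test_stateid(struct server *srv, struct state *st);
int free_stateid(struct server *srv, struct state *st);
int full_open_recovery(struct state *st);

/* Probe first, recover only if the server no longer knows the state. */
int recover_open_v41(struct server *srv, struct state *st)
{
        if (test_stateid(srv, st) == 0)   /* stateid still valid: done */
                return 0;
        free_stateid(srv, st);            /* discard the stale stateid */
        return full_open_recovery(st);    /* fall back to full recovery */
}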
1692/* 1710/*
1693 * on an EXCLUSIVE create, the server should send back a bitmask with FATTR4-* 1711 * on an EXCLUSIVE create, the server should send back a bitmask with FATTR4-*
1694 * fields corresponding to attributes that were used to store the verifier. 1712 * fields corresponding to attributes that were used to store the verifier.
@@ -2252,13 +2270,14 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
2252static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, 2270static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
2253 struct nfs_fsinfo *info) 2271 struct nfs_fsinfo *info)
2254{ 2272{
2273 int minor_version = server->nfs_client->cl_minorversion;
2255 int status = nfs4_lookup_root(server, fhandle, info); 2274 int status = nfs4_lookup_root(server, fhandle, info);
2256 if ((status == -NFS4ERR_WRONGSEC) && !(server->flags & NFS_MOUNT_SECFLAVOUR)) 2275 if ((status == -NFS4ERR_WRONGSEC) && !(server->flags & NFS_MOUNT_SECFLAVOUR))
2257 /* 2276 /*
2258 * A status of -NFS4ERR_WRONGSEC will be mapped to -EPERM 2277 * A status of -NFS4ERR_WRONGSEC will be mapped to -EPERM
2259 * by nfs4_map_errors() as this function exits. 2278 * by nfs4_map_errors() as this function exits.
2260 */ 2279 */
2261 status = nfs4_find_root_sec(server, fhandle, info); 2280 status = nfs_v4_minor_ops[minor_version]->find_root_sec(server, fhandle, info);
2262 if (status == 0) 2281 if (status == 0)
2263 status = nfs4_server_capabilities(server, fhandle); 2282 status = nfs4_server_capabilities(server, fhandle);
2264 if (status == 0) 2283 if (status == 0)
@@ -4441,6 +4460,20 @@ out:
4441 return err; 4460 return err;
4442} 4461}
4443 4462
4463#if defined(CONFIG_NFS_V4_1)
4464static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *request)
4465{
4466 int status;
4467 struct nfs_server *server = NFS_SERVER(state->inode);
4468
4469 status = nfs41_test_stateid(server, state);
4470 if (status == NFS_OK)
4471 return 0;
4472 nfs41_free_stateid(server, state);
4473 return nfs4_lock_expired(state, request);
4474}
4475#endif
4476
4444static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) 4477static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
4445{ 4478{
4446 struct nfs_inode *nfsi = NFS_I(state->inode); 4479 struct nfs_inode *nfsi = NFS_I(state->inode);
@@ -4779,6 +4812,16 @@ out_inval:
4779 return -NFS4ERR_INVAL; 4812 return -NFS4ERR_INVAL;
4780} 4813}
4781 4814
4815static bool
4816nfs41_same_server_scope(struct server_scope *a, struct server_scope *b)
4817{
4818 if (a->server_scope_sz == b->server_scope_sz &&
4819 memcmp(a->server_scope, b->server_scope, a->server_scope_sz) == 0)
4820 return true;
4821
4822 return false;
4823}
4824
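Two server scopes compare equal only when both the length and the bytes match; per RFC 5661, a changed eir_server_scope after a reboot means the server cannot be treated as the same instance for state purposes, which is why a mismatch flags the client for nograce recovery (see the nfs4state.c hunk below). A self-contained version of the comparison (the fixed buffer bound is an assumption for the sketch):

#include <stdbool.h>
#include <string.h>

struct server_scope {
        size_t server_scope_sz;
        char server_scope[64];  /* illustrative fixed bound */
};

/* Length first, then bytewise compare, as above. */
static bool scope_equal(const struct server_scope *a,
                        const struct server_scope *b)
{
        return a->server_scope_sz == b->server_scope_sz &&
               memcmp(a->server_scope, b->server_scope,
                      a->server_scope_sz) == 0;
}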
4782/* 4825/*
4783 * nfs4_proc_exchange_id() 4826 * nfs4_proc_exchange_id()
4784 * 4827 *
@@ -4821,9 +4864,31 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
4821 init_utsname()->domainname, 4864 init_utsname()->domainname,
4822 clp->cl_rpcclient->cl_auth->au_flavor); 4865 clp->cl_rpcclient->cl_auth->au_flavor);
4823 4866
4867 res.server_scope = kzalloc(sizeof(struct server_scope), GFP_KERNEL);
4868 if (unlikely(!res.server_scope))
4869 return -ENOMEM;
4870
4824 status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); 4871 status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
4825 if (!status) 4872 if (!status)
4826 status = nfs4_check_cl_exchange_flags(clp->cl_exchange_flags); 4873 status = nfs4_check_cl_exchange_flags(clp->cl_exchange_flags);
4874
4875 if (!status) {
4876 if (clp->server_scope &&
4877 !nfs41_same_server_scope(clp->server_scope,
4878 res.server_scope)) {
4879 dprintk("%s: server_scope mismatch detected\n",
4880 __func__);
4881 set_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, &clp->cl_state);
4882 kfree(clp->server_scope);
4883 clp->server_scope = NULL;
4884 }
4885
4886 if (!clp->server_scope)
4887 clp->server_scope = res.server_scope;
4888 else
4889 kfree(res.server_scope);
4890 }
4891
4827 dprintk("<-- %s status= %d\n", __func__, status); 4892 dprintk("<-- %s status= %d\n", __func__, status);
4828 return status; 4893 return status;
4829} 4894}
@@ -5704,7 +5769,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
5704{ 5769{
5705 struct nfs4_layoutreturn *lrp = calldata; 5770 struct nfs4_layoutreturn *lrp = calldata;
5706 struct nfs_server *server; 5771 struct nfs_server *server;
5707 struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout; 5772 struct pnfs_layout_hdr *lo = lrp->args.layout;
5708 5773
5709 dprintk("--> %s\n", __func__); 5774 dprintk("--> %s\n", __func__);
5710 5775
@@ -5733,7 +5798,7 @@ static void nfs4_layoutreturn_release(void *calldata)
5733 struct nfs4_layoutreturn *lrp = calldata; 5798 struct nfs4_layoutreturn *lrp = calldata;
5734 5799
5735 dprintk("--> %s\n", __func__); 5800 dprintk("--> %s\n", __func__);
5736 put_layout_hdr(NFS_I(lrp->args.inode)->layout); 5801 put_layout_hdr(lrp->args.layout);
5737 kfree(calldata); 5802 kfree(calldata);
5738 dprintk("<-- %s\n", __func__); 5803 dprintk("<-- %s\n", __func__);
5739} 5804}
@@ -5770,6 +5835,54 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)
5770 return status; 5835 return status;
5771} 5836}
5772 5837
5838/*
5839 * Retrieve the list of Data Server devices from the MDS.
5840 */
5841static int _nfs4_getdevicelist(struct nfs_server *server,
5842 const struct nfs_fh *fh,
5843 struct pnfs_devicelist *devlist)
5844{
5845 struct nfs4_getdevicelist_args args = {
5846 .fh = fh,
5847 .layoutclass = server->pnfs_curr_ld->id,
5848 };
5849 struct nfs4_getdevicelist_res res = {
5850 .devlist = devlist,
5851 };
5852 struct rpc_message msg = {
5853 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICELIST],
5854 .rpc_argp = &args,
5855 .rpc_resp = &res,
5856 };
5857 int status;
5858
5859 dprintk("--> %s\n", __func__);
5860 status = nfs4_call_sync(server->client, server, &msg, &args.seq_args,
5861 &res.seq_res, 0);
5862 dprintk("<-- %s status=%d\n", __func__, status);
5863 return status;
5864}
5865
5866int nfs4_proc_getdevicelist(struct nfs_server *server,
5867 const struct nfs_fh *fh,
5868 struct pnfs_devicelist *devlist)
5869{
5870 struct nfs4_exception exception = { };
5871 int err;
5872
5873 do {
5874 err = nfs4_handle_exception(server,
5875 _nfs4_getdevicelist(server, fh, devlist),
5876 &exception);
5877 } while (exception.retry);
5878
5879 dprintk("%s: err=%d, num_devs=%u\n", __func__,
5880 err, devlist->num_devs);
5881
5882 return err;
5883}
5884EXPORT_SYMBOL_GPL(nfs4_proc_getdevicelist);
5885
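nfs4_proc_getdevicelist() uses the file's standard retry idiom: the one-shot worker runs inside nfs4_handle_exception(), which absorbs transient errors such as NFS4ERR_DELAY and keeps exception.retry set until the failure is permanent. The skeleton of that loop, as a self-contained sketch with illustrative names:

struct server;
struct exception { int retry; };

int do_op_once(struct server *srv);
int handle_exception(struct server *srv, int err, struct exception *exc);

/* Retry while the exception handler marks the failure transient. */
int op_with_retry(struct server *srv)
{
        struct exception exc = { 0 };
        int err;

        do {
                err = handle_exception(srv, do_op_once(srv), &exc);
        } while (exc.retry);
        return err;
}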
5773static int 5886static int
5774_nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) 5887_nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
5775{ 5888{
@@ -5848,9 +5961,16 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata)
5848static void nfs4_layoutcommit_release(void *calldata) 5961static void nfs4_layoutcommit_release(void *calldata)
5849{ 5962{
5850 struct nfs4_layoutcommit_data *data = calldata; 5963 struct nfs4_layoutcommit_data *data = calldata;
5964 struct pnfs_layout_segment *lseg, *tmp;
5851 5965
5966 pnfs_cleanup_layoutcommit(data);
5852 /* Matched by references in pnfs_set_layoutcommit */ 5967 /* Matched by references in pnfs_set_layoutcommit */
5853 put_lseg(data->lseg); 5968 list_for_each_entry_safe(lseg, tmp, &data->lseg_list, pls_lc_list) {
5969 list_del_init(&lseg->pls_lc_list);
5970 if (test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT,
5971 &lseg->pls_flags))
5972 put_lseg(lseg);
5973 }
5854 put_rpccred(data->cred); 5974 put_rpccred(data->cred);
5855 kfree(data); 5975 kfree(data);
5856} 5976}
@@ -5901,6 +6021,143 @@ out:
5901 rpc_put_task(task); 6021 rpc_put_task(task);
5902 return status; 6022 return status;
5903} 6023}
6024
6025static int
6026_nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle,
6027 struct nfs_fsinfo *info, struct nfs4_secinfo_flavors *flavors)
6028{
6029 struct nfs41_secinfo_no_name_args args = {
6030 .style = SECINFO_STYLE_CURRENT_FH,
6031 };
6032 struct nfs4_secinfo_res res = {
6033 .flavors = flavors,
6034 };
6035 struct rpc_message msg = {
6036 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SECINFO_NO_NAME],
6037 .rpc_argp = &args,
6038 .rpc_resp = &res,
6039 };
6040 return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
6041}
6042
6043static int
6044nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle,
6045 struct nfs_fsinfo *info, struct nfs4_secinfo_flavors *flavors)
6046{
6047 struct nfs4_exception exception = { };
6048 int err;
6049 do {
6050 err = _nfs41_proc_secinfo_no_name(server, fhandle, info, flavors);
6051 switch (err) {
6052 case 0:
6053 case -NFS4ERR_WRONGSEC:
6054 case -NFS4ERR_NOTSUPP:
6055 break;
6056 default:
6057 err = nfs4_handle_exception(server, err, &exception);
6058 }
6059 } while (exception.retry);
6060 return err;
6061}
6062
6063static int
6064nfs41_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
6065 struct nfs_fsinfo *info)
6066{
6067 int err;
6068 struct page *page;
6069 rpc_authflavor_t flavor;
6070 struct nfs4_secinfo_flavors *flavors;
6071
6072 page = alloc_page(GFP_KERNEL);
6073 if (!page) {
6074 err = -ENOMEM;
6075 goto out;
6076 }
6077
6078 flavors = page_address(page);
6079 err = nfs41_proc_secinfo_no_name(server, fhandle, info, flavors);
6080
6081 /*
6082 * Fall back on "guess and check" method if
6083 * the server doesn't support SECINFO_NO_NAME
6084 */
6085 if (err == -NFS4ERR_WRONGSEC || err == -NFS4ERR_NOTSUPP) {
6086 err = nfs4_find_root_sec(server, fhandle, info);
6087 goto out_freepage;
6088 }
6089 if (err)
6090 goto out_freepage;
6091
6092 flavor = nfs_find_best_sec(flavors);
6093 if (err == 0)
6094 err = nfs4_lookup_root_sec(server, fhandle, info, flavor);
6095
6096out_freepage:
6097 put_page(page);
6098 if (err == -EACCES)
6099 return -EPERM;
6100out:
6101 return err;
6102}
6103static int _nfs41_test_stateid(struct nfs_server *server, struct nfs4_state *state)
6104{
6105 int status;
6106 struct nfs41_test_stateid_args args = {
6107 .stateid = &state->stateid,
6108 };
6109 struct nfs41_test_stateid_res res;
6110 struct rpc_message msg = {
6111 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_TEST_STATEID],
6112 .rpc_argp = &args,
6113 .rpc_resp = &res,
6114 };
6115 args.seq_args.sa_session = res.seq_res.sr_session = NULL;
6116 status = nfs4_call_sync_sequence(server->client, server, &msg, &args.seq_args, &res.seq_res, 0, 1);
6117 return status;
6118}
6119
6120static int nfs41_test_stateid(struct nfs_server *server, struct nfs4_state *state)
6121{
6122 struct nfs4_exception exception = { };
6123 int err;
6124 do {
6125 err = nfs4_handle_exception(server,
6126 _nfs41_test_stateid(server, state),
6127 &exception);
6128 } while (exception.retry);
6129 return err;
6130}
6131
6132static int _nfs4_free_stateid(struct nfs_server *server, struct nfs4_state *state)
6133{
6134 int status;
6135 struct nfs41_free_stateid_args args = {
6136 .stateid = &state->stateid,
6137 };
6138 struct nfs41_free_stateid_res res;
6139 struct rpc_message msg = {
6140 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FREE_STATEID],
6141 .rpc_argp = &args,
6142 .rpc_resp = &res,
6143 };
6144
6145 args.seq_args.sa_session = res.seq_res.sr_session = NULL;
6146 status = nfs4_call_sync_sequence(server->client, server, &msg, &args.seq_args, &res.seq_res, 0, 1);
6147 return status;
6148}
6149
6150static int nfs41_free_stateid(struct nfs_server *server, struct nfs4_state *state)
6151{
6152 struct nfs4_exception exception = { };
6153 int err;
6154 do {
6155 err = nfs4_handle_exception(server,
6156 _nfs4_free_stateid(server, state),
6157 &exception);
6158 } while (exception.retry);
6159 return err;
6160}
5904#endif /* CONFIG_NFS_V4_1 */ 6161#endif /* CONFIG_NFS_V4_1 */
5905 6162
5906struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = { 6163struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = {
@@ -5937,8 +6194,8 @@ struct nfs4_state_recovery_ops nfs40_nograce_recovery_ops = {
5937struct nfs4_state_recovery_ops nfs41_nograce_recovery_ops = { 6194struct nfs4_state_recovery_ops nfs41_nograce_recovery_ops = {
5938 .owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE, 6195 .owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE,
5939 .state_flag_bit = NFS_STATE_RECLAIM_NOGRACE, 6196 .state_flag_bit = NFS_STATE_RECLAIM_NOGRACE,
5940 .recover_open = nfs4_open_expired, 6197 .recover_open = nfs41_open_expired,
5941 .recover_lock = nfs4_lock_expired, 6198 .recover_lock = nfs41_lock_expired,
5942 .establish_clid = nfs41_init_clientid, 6199 .establish_clid = nfs41_init_clientid,
5943 .get_clid_cred = nfs4_get_exchange_id_cred, 6200 .get_clid_cred = nfs4_get_exchange_id_cred,
5944}; 6201};
@@ -5962,6 +6219,7 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
5962 .minor_version = 0, 6219 .minor_version = 0,
5963 .call_sync = _nfs4_call_sync, 6220 .call_sync = _nfs4_call_sync,
5964 .validate_stateid = nfs4_validate_delegation_stateid, 6221 .validate_stateid = nfs4_validate_delegation_stateid,
6222 .find_root_sec = nfs4_find_root_sec,
5965 .reboot_recovery_ops = &nfs40_reboot_recovery_ops, 6223 .reboot_recovery_ops = &nfs40_reboot_recovery_ops,
5966 .nograce_recovery_ops = &nfs40_nograce_recovery_ops, 6224 .nograce_recovery_ops = &nfs40_nograce_recovery_ops,
5967 .state_renewal_ops = &nfs40_state_renewal_ops, 6225 .state_renewal_ops = &nfs40_state_renewal_ops,
@@ -5972,6 +6230,7 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
5972 .minor_version = 1, 6230 .minor_version = 1,
5973 .call_sync = _nfs4_call_sync_session, 6231 .call_sync = _nfs4_call_sync_session,
5974 .validate_stateid = nfs41_validate_delegation_stateid, 6232 .validate_stateid = nfs41_validate_delegation_stateid,
6233 .find_root_sec = nfs41_find_root_sec,
5975 .reboot_recovery_ops = &nfs41_reboot_recovery_ops, 6234 .reboot_recovery_ops = &nfs41_reboot_recovery_ops,
5976 .nograce_recovery_ops = &nfs41_nograce_recovery_ops, 6235 .nograce_recovery_ops = &nfs41_nograce_recovery_ops,
5977 .state_renewal_ops = &nfs41_state_renewal_ops, 6236 .state_renewal_ops = &nfs41_state_renewal_ops,
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 7acfe8843626..72ab97ef3d61 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1643,7 +1643,14 @@ static void nfs4_state_manager(struct nfs_client *clp)
1643 goto out_error; 1643 goto out_error;
1644 } 1644 }
1645 clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); 1645 clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
1646 set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state); 1646
1647 if (test_and_clear_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH,
1648 &clp->cl_state))
1649 nfs4_state_start_reclaim_nograce(clp);
1650 else
1651 set_bit(NFS4CLNT_RECLAIM_REBOOT,
1652 &clp->cl_state);
1653
1647 pnfs_destroy_all_layouts(clp); 1654 pnfs_destroy_all_layouts(clp);
1648 } 1655 }
1649 1656
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index e6e8f3b9a1de..1dce12f41a4f 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -113,7 +113,11 @@ static int nfs4_stat_to_errno(int);
113#define encode_restorefh_maxsz (op_encode_hdr_maxsz) 113#define encode_restorefh_maxsz (op_encode_hdr_maxsz)
114#define decode_restorefh_maxsz (op_decode_hdr_maxsz) 114#define decode_restorefh_maxsz (op_decode_hdr_maxsz)
115#define encode_fsinfo_maxsz (encode_getattr_maxsz) 115#define encode_fsinfo_maxsz (encode_getattr_maxsz)
116#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + 15) 116/* The 5 accounts for the PNFS attributes, and assumes that at most three
117 * layout types will be returned.
118 */
119#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + \
120 nfs4_fattr_bitmap_maxsz + 4 + 8 + 5)
117#define encode_renew_maxsz (op_encode_hdr_maxsz + 3) 121#define encode_renew_maxsz (op_encode_hdr_maxsz + 3)
118#define decode_renew_maxsz (op_decode_hdr_maxsz) 122#define decode_renew_maxsz (op_decode_hdr_maxsz)
119#define encode_setclientid_maxsz \ 123#define encode_setclientid_maxsz \
@@ -314,6 +318,17 @@ static int nfs4_stat_to_errno(int);
314 XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) 318 XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5)
315#define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4) 319#define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4)
316#define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4) 320#define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4)
321#define encode_getdevicelist_maxsz (op_encode_hdr_maxsz + 4 + \
322 encode_verifier_maxsz)
323#define decode_getdevicelist_maxsz (op_decode_hdr_maxsz + \
324 2 /* nfs_cookie4 gdlr_cookie */ + \
325 decode_verifier_maxsz \
326 /* verifier4 gdlr_verifier */ + \
327 1 /* gdlr_deviceid_list count */ + \
328 XDR_QUADLEN(NFS4_PNFS_GETDEVLIST_MAXNUM * \
329 NFS4_DEVICEID4_SIZE) \
330 /* gdlr_deviceid_list */ + \
331 1 /* bool gdlr_eof */)
317#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \ 332#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \
318 XDR_QUADLEN(NFS4_DEVICEID4_SIZE)) 333 XDR_QUADLEN(NFS4_DEVICEID4_SIZE))
319#define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \ 334#define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \
@@ -343,6 +358,14 @@ static int nfs4_stat_to_errno(int);
343 1 /* FIXME: opaque lrf_body always empty at the moment */) 358 1 /* FIXME: opaque lrf_body always empty at the moment */)
344#define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \ 359#define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \
345 1 + decode_stateid_maxsz) 360 1 + decode_stateid_maxsz)
361#define encode_secinfo_no_name_maxsz (op_encode_hdr_maxsz + 1)
362#define decode_secinfo_no_name_maxsz decode_secinfo_maxsz
363#define encode_test_stateid_maxsz (op_encode_hdr_maxsz + 2 + \
364 XDR_QUADLEN(NFS4_STATEID_SIZE))
365#define decode_test_stateid_maxsz (op_decode_hdr_maxsz + 2 + 1)
366#define encode_free_stateid_maxsz (op_encode_hdr_maxsz + 1 + \
367 XDR_QUADLEN(NFS4_STATEID_SIZE))
368#define decode_free_stateid_maxsz (op_decode_hdr_maxsz + 1)
346#else /* CONFIG_NFS_V4_1 */ 369#else /* CONFIG_NFS_V4_1 */
347#define encode_sequence_maxsz 0 370#define encode_sequence_maxsz 0
348#define decode_sequence_maxsz 0 371#define decode_sequence_maxsz 0
@@ -740,6 +763,14 @@ static int nfs4_stat_to_errno(int);
740#define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \ 763#define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \
741 decode_sequence_maxsz + \ 764 decode_sequence_maxsz + \
742 decode_reclaim_complete_maxsz) 765 decode_reclaim_complete_maxsz)
766#define NFS4_enc_getdevicelist_sz (compound_encode_hdr_maxsz + \
767 encode_sequence_maxsz + \
768 encode_putfh_maxsz + \
769 encode_getdevicelist_maxsz)
770#define NFS4_dec_getdevicelist_sz (compound_decode_hdr_maxsz + \
771 decode_sequence_maxsz + \
772 decode_putfh_maxsz + \
773 decode_getdevicelist_maxsz)
743#define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \ 774#define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \
744 encode_sequence_maxsz +\ 775 encode_sequence_maxsz +\
745 encode_getdeviceinfo_maxsz) 776 encode_getdeviceinfo_maxsz)
@@ -772,6 +803,26 @@ static int nfs4_stat_to_errno(int);
772 decode_sequence_maxsz + \ 803 decode_sequence_maxsz + \
773 decode_putfh_maxsz + \ 804 decode_putfh_maxsz + \
774 decode_layoutreturn_maxsz) 805 decode_layoutreturn_maxsz)
806#define NFS4_enc_secinfo_no_name_sz (compound_encode_hdr_maxsz + \
807 encode_sequence_maxsz + \
808 encode_putrootfh_maxsz +\
809 encode_secinfo_no_name_maxsz)
810#define NFS4_dec_secinfo_no_name_sz (compound_decode_hdr_maxsz + \
811 decode_sequence_maxsz + \
812 decode_putrootfh_maxsz + \
813 decode_secinfo_no_name_maxsz)
814#define NFS4_enc_test_stateid_sz (compound_encode_hdr_maxsz + \
815 encode_sequence_maxsz + \
816 encode_test_stateid_maxsz)
817#define NFS4_dec_test_stateid_sz (compound_decode_hdr_maxsz + \
818 decode_sequence_maxsz + \
819 decode_test_stateid_maxsz)
820#define NFS4_enc_free_stateid_sz (compound_encode_hdr_maxsz + \
821 encode_sequence_maxsz + \
822 encode_free_stateid_maxsz)
823#define NFS4_dec_free_stateid_sz (compound_decode_hdr_maxsz + \
824 decode_sequence_maxsz + \
825 decode_free_stateid_maxsz)
775 826
776const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH + 827const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
777 compound_encode_hdr_maxsz + 828 compound_encode_hdr_maxsz +
@@ -1076,6 +1127,35 @@ static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm
1076 hdr->replen += decode_getattr_maxsz; 1127 hdr->replen += decode_getattr_maxsz;
1077} 1128}
1078 1129
1130static void
1131encode_getattr_three(struct xdr_stream *xdr,
1132 uint32_t bm0, uint32_t bm1, uint32_t bm2,
1133 struct compound_hdr *hdr)
1134{
1135 __be32 *p;
1136
1137 p = reserve_space(xdr, 4);
1138 *p = cpu_to_be32(OP_GETATTR);
1139 if (bm2) {
1140 p = reserve_space(xdr, 16);
1141 *p++ = cpu_to_be32(3);
1142 *p++ = cpu_to_be32(bm0);
1143 *p++ = cpu_to_be32(bm1);
1144 *p = cpu_to_be32(bm2);
1145 } else if (bm1) {
1146 p = reserve_space(xdr, 12);
1147 *p++ = cpu_to_be32(2);
1148 *p++ = cpu_to_be32(bm0);
1149 *p = cpu_to_be32(bm1);
1150 } else {
1151 p = reserve_space(xdr, 8);
1152 *p++ = cpu_to_be32(1);
1153 *p = cpu_to_be32(bm0);
1154 }
1155 hdr->nops++;
1156 hdr->replen += decode_getattr_maxsz;
1157}
1158
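encode_getattr_three() emits the shortest bitmap4 that still covers the highest populated word: a length word (1, 2, or 3) followed by that many 32-bit masks, so requesting a word-2 attribute such as LAYOUT_BLKSIZE forces a length-3 bitmap even when words 0 and 1 are also set. The on-the-wire shape, sketched in userspace with illustrative mask values:

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

/* bitmap4 = length word + that many mask words (values illustrative). */
int main(void)
{
        uint32_t bm0 = 0x00100000;  /* e.g. a word-0 attribute */
        uint32_t bm1 = 0x00000800;  /* e.g. a word-1 attribute */
        uint32_t bm2 = 0x00000002;  /* e.g. a word-2 attribute */
        uint32_t wire[4];
        int n = bm2 ? 3 : (bm1 ? 2 : 1);

        wire[0] = htonl(n);         /* bitmap length in words */
        wire[1] = htonl(bm0);
        wire[2] = htonl(bm1);
        wire[3] = htonl(bm2);
        printf("bitmap4 length=%d\n", (int)ntohl(wire[0]));
        return 0;
}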
1079static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) 1159static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
1080{ 1160{
1081 encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0], 1161 encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0],
@@ -1084,8 +1164,11 @@ static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct c
1084 1164
1085static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) 1165static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
1086{ 1166{
1087 encode_getattr_two(xdr, bitmask[0] & nfs4_fsinfo_bitmap[0], 1167 encode_getattr_three(xdr,
1088 bitmask[1] & nfs4_fsinfo_bitmap[1], hdr); 1168 bitmask[0] & nfs4_fsinfo_bitmap[0],
1169 bitmask[1] & nfs4_fsinfo_bitmap[1],
1170 bitmask[2] & nfs4_fsinfo_bitmap[2],
1171 hdr);
1089} 1172}
1090 1173
1091static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) 1174static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
@@ -1827,6 +1910,26 @@ static void encode_sequence(struct xdr_stream *xdr,
1827 1910
1828#ifdef CONFIG_NFS_V4_1 1911#ifdef CONFIG_NFS_V4_1
1829static void 1912static void
1913encode_getdevicelist(struct xdr_stream *xdr,
1914 const struct nfs4_getdevicelist_args *args,
1915 struct compound_hdr *hdr)
1916{
1917 __be32 *p;
1918 nfs4_verifier dummy = {
1919 .data = "dummmmmy",
1920 };
1921
1922 p = reserve_space(xdr, 20);
1923 *p++ = cpu_to_be32(OP_GETDEVICELIST);
1924 *p++ = cpu_to_be32(args->layoutclass);
1925 *p++ = cpu_to_be32(NFS4_PNFS_GETDEVLIST_MAXNUM);
1926 xdr_encode_hyper(p, 0ULL); /* cookie */
1927 encode_nfs4_verifier(xdr, &dummy);
1928 hdr->nops++;
1929 hdr->replen += decode_getdevicelist_maxsz;
1930}
1931
1932static void
1830encode_getdeviceinfo(struct xdr_stream *xdr, 1933encode_getdeviceinfo(struct xdr_stream *xdr,
1831 const struct nfs4_getdeviceinfo_args *args, 1934 const struct nfs4_getdeviceinfo_args *args,
1832 struct compound_hdr *hdr) 1935 struct compound_hdr *hdr)
@@ -1888,7 +1991,7 @@ encode_layoutcommit(struct xdr_stream *xdr,
1888 *p++ = cpu_to_be32(OP_LAYOUTCOMMIT); 1991 *p++ = cpu_to_be32(OP_LAYOUTCOMMIT);
1889 /* Only whole file layouts */ 1992 /* Only whole file layouts */
1890 p = xdr_encode_hyper(p, 0); /* offset */ 1993 p = xdr_encode_hyper(p, 0); /* offset */
1891 p = xdr_encode_hyper(p, NFS4_MAX_UINT64); /* length */ 1994 p = xdr_encode_hyper(p, args->lastbytewritten + 1); /* length */
1892 *p++ = cpu_to_be32(0); /* reclaim */ 1995 *p++ = cpu_to_be32(0); /* reclaim */
1893 p = xdr_encode_opaque_fixed(p, args->stateid.data, NFS4_STATEID_SIZE); 1996 p = xdr_encode_opaque_fixed(p, args->stateid.data, NFS4_STATEID_SIZE);
1894 *p++ = cpu_to_be32(1); /* newoffset = TRUE */ 1997 *p++ = cpu_to_be32(1); /* newoffset = TRUE */
@@ -1938,6 +2041,46 @@ encode_layoutreturn(struct xdr_stream *xdr,
1938 hdr->nops++; 2041 hdr->nops++;
1939 hdr->replen += decode_layoutreturn_maxsz; 2042 hdr->replen += decode_layoutreturn_maxsz;
1940} 2043}
2044
2045static int
2046encode_secinfo_no_name(struct xdr_stream *xdr,
2047 const struct nfs41_secinfo_no_name_args *args,
2048 struct compound_hdr *hdr)
2049{
2050 __be32 *p;
2051 p = reserve_space(xdr, 8);
2052 *p++ = cpu_to_be32(OP_SECINFO_NO_NAME);
2053 *p++ = cpu_to_be32(args->style);
2054 hdr->nops++;
2055 hdr->replen += decode_secinfo_no_name_maxsz;
2056 return 0;
2057}
2058
2059static void encode_test_stateid(struct xdr_stream *xdr,
2060 struct nfs41_test_stateid_args *args,
2061 struct compound_hdr *hdr)
2062{
2063 __be32 *p;
2064
2065 p = reserve_space(xdr, 8 + NFS4_STATEID_SIZE);
2066 *p++ = cpu_to_be32(OP_TEST_STATEID);
2067 *p++ = cpu_to_be32(1);
2068 xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE);
2069 hdr->nops++;
2070 hdr->replen += decode_test_stateid_maxsz;
2071}
2072
2073static void encode_free_stateid(struct xdr_stream *xdr,
2074 struct nfs41_free_stateid_args *args,
2075 struct compound_hdr *hdr)
2076{
2077 __be32 *p;
2078 p = reserve_space(xdr, 4 + NFS4_STATEID_SIZE);
2079 *p++ = cpu_to_be32(OP_FREE_STATEID);
2080 xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE);
2081 hdr->nops++;
2082 hdr->replen += decode_free_stateid_maxsz;
2083}
1941#endif /* CONFIG_NFS_V4_1 */ 2084#endif /* CONFIG_NFS_V4_1 */
1942 2085
1943/* 2086/*
@@ -2536,7 +2679,7 @@ static void nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req,
2536 struct compound_hdr hdr = { 2679 struct compound_hdr hdr = {
2537 .nops = 0, 2680 .nops = 0,
2538 }; 2681 };
2539 const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; 2682 const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME };
2540 2683
2541 encode_compound_hdr(xdr, req, &hdr); 2684 encode_compound_hdr(xdr, req, &hdr);
2542 encode_setclientid_confirm(xdr, arg, &hdr); 2685 encode_setclientid_confirm(xdr, arg, &hdr);
@@ -2680,7 +2823,7 @@ static void nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req,
2680 struct compound_hdr hdr = { 2823 struct compound_hdr hdr = {
2681 .minorversion = nfs4_xdr_minorversion(&args->la_seq_args), 2824 .minorversion = nfs4_xdr_minorversion(&args->la_seq_args),
2682 }; 2825 };
2683 const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; 2826 const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME };
2684 2827
2685 encode_compound_hdr(xdr, req, &hdr); 2828 encode_compound_hdr(xdr, req, &hdr);
2686 encode_sequence(xdr, &args->la_seq_args, &hdr); 2829 encode_sequence(xdr, &args->la_seq_args, &hdr);
@@ -2707,6 +2850,24 @@ static void nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req,
2707} 2850}
2708 2851
2709/* 2852/*
2853 * Encode GETDEVICELIST request
2854 */
2855static void nfs4_xdr_enc_getdevicelist(struct rpc_rqst *req,
2856 struct xdr_stream *xdr,
2857 struct nfs4_getdevicelist_args *args)
2858{
2859 struct compound_hdr hdr = {
2860 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2861 };
2862
2863 encode_compound_hdr(xdr, req, &hdr);
2864 encode_sequence(xdr, &args->seq_args, &hdr);
2865 encode_putfh(xdr, args->fh, &hdr);
2866 encode_getdevicelist(xdr, args, &hdr);
2867 encode_nops(&hdr);
2868}
2869
2870/*
2710 * Encode GETDEVICEINFO request 2871 * Encode GETDEVICEINFO request
2711 */ 2872 */
2712static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, 2873static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req,
@@ -2790,6 +2951,59 @@ static void nfs4_xdr_enc_layoutreturn(struct rpc_rqst *req,
2790 encode_layoutreturn(xdr, args, &hdr); 2951 encode_layoutreturn(xdr, args, &hdr);
2791 encode_nops(&hdr); 2952 encode_nops(&hdr);
2792} 2953}
2954
2955/*
2956 * Encode SECINFO_NO_NAME request
2957 */
2958static int nfs4_xdr_enc_secinfo_no_name(struct rpc_rqst *req,
2959 struct xdr_stream *xdr,
2960 struct nfs41_secinfo_no_name_args *args)
2961{
2962 struct compound_hdr hdr = {
2963 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2964 };
2965
2966 encode_compound_hdr(xdr, req, &hdr);
2967 encode_sequence(xdr, &args->seq_args, &hdr);
2968 encode_putrootfh(xdr, &hdr);
2969 encode_secinfo_no_name(xdr, args, &hdr);
2970 encode_nops(&hdr);
2971 return 0;
2972}
2973
2974/*
2975 * Encode TEST_STATEID request
2976 */
2977static void nfs4_xdr_enc_test_stateid(struct rpc_rqst *req,
2978 struct xdr_stream *xdr,
2979 struct nfs41_test_stateid_args *args)
2980{
2981 struct compound_hdr hdr = {
2982 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2983 };
2984
2985 encode_compound_hdr(xdr, req, &hdr);
2986 encode_sequence(xdr, &args->seq_args, &hdr);
2987 encode_test_stateid(xdr, args, &hdr);
2988 encode_nops(&hdr);
2989}
2990
2991/*
2992 * Encode FREE_STATEID request
2993 */
2994static void nfs4_xdr_enc_free_stateid(struct rpc_rqst *req,
2995 struct xdr_stream *xdr,
2996 struct nfs41_free_stateid_args *args)
2997{
2998 struct compound_hdr hdr = {
2999 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
3000 };
3001
3002 encode_compound_hdr(xdr, req, &hdr);
3003 encode_sequence(xdr, &args->seq_args, &hdr);
3004 encode_free_stateid(xdr, args, &hdr);
3005 encode_nops(&hdr);
3006}
2793#endif /* CONFIG_NFS_V4_1 */ 3007#endif /* CONFIG_NFS_V4_1 */
2794 3008
2795static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) 3009static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
@@ -2890,14 +3104,17 @@ static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
2890 goto out_overflow; 3104 goto out_overflow;
2891 bmlen = be32_to_cpup(p); 3105 bmlen = be32_to_cpup(p);
2892 3106
2893 bitmap[0] = bitmap[1] = 0; 3107 bitmap[0] = bitmap[1] = bitmap[2] = 0;
2894 p = xdr_inline_decode(xdr, (bmlen << 2)); 3108 p = xdr_inline_decode(xdr, (bmlen << 2));
2895 if (unlikely(!p)) 3109 if (unlikely(!p))
2896 goto out_overflow; 3110 goto out_overflow;
2897 if (bmlen > 0) { 3111 if (bmlen > 0) {
2898 bitmap[0] = be32_to_cpup(p++); 3112 bitmap[0] = be32_to_cpup(p++);
2899 if (bmlen > 1) 3113 if (bmlen > 1) {
2900 bitmap[1] = be32_to_cpup(p); 3114 bitmap[1] = be32_to_cpup(p++);
3115 if (bmlen > 2)
3116 bitmap[2] = be32_to_cpup(p);
3117 }
2901 } 3118 }
2902 return 0; 3119 return 0;
2903out_overflow: 3120out_overflow:
@@ -2929,8 +3146,9 @@ static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint3
2929 return ret; 3146 return ret;
2930 bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS; 3147 bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS;
2931 } else 3148 } else
2932 bitmask[0] = bitmask[1] = 0; 3149 bitmask[0] = bitmask[1] = bitmask[2] = 0;
2933 dprintk("%s: bitmask=%08x:%08x\n", __func__, bitmask[0], bitmask[1]); 3150 dprintk("%s: bitmask=%08x:%08x:%08x\n", __func__,
3151 bitmask[0], bitmask[1], bitmask[2]);
2934 return 0; 3152 return 0;
2935} 3153}
2936 3154
@@ -3984,7 +4202,7 @@ out_overflow:
3984static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res) 4202static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res)
3985{ 4203{
3986 __be32 *savep; 4204 __be32 *savep;
3987 uint32_t attrlen, bitmap[2] = {0}; 4205 uint32_t attrlen, bitmap[3] = {0};
3988 int status; 4206 int status;
3989 4207
3990 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) 4208 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
@@ -4010,7 +4228,7 @@ xdr_error:
4010static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat) 4228static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat)
4011{ 4229{
4012 __be32 *savep; 4230 __be32 *savep;
4013 uint32_t attrlen, bitmap[2] = {0}; 4231 uint32_t attrlen, bitmap[3] = {0};
4014 int status; 4232 int status;
4015 4233
4016 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) 4234 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
@@ -4042,7 +4260,7 @@ xdr_error:
4042static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf) 4260static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf)
4043{ 4261{
4044 __be32 *savep; 4262 __be32 *savep;
4045 uint32_t attrlen, bitmap[2] = {0}; 4263 uint32_t attrlen, bitmap[3] = {0};
4046 int status; 4264 int status;
4047 4265
4048 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) 4266 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
@@ -4182,7 +4400,7 @@ static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fat
4182{ 4400{
4183 __be32 *savep; 4401 __be32 *savep;
4184 uint32_t attrlen, 4402 uint32_t attrlen,
4185 bitmap[2] = {0}; 4403 bitmap[3] = {0};
4186 int status; 4404 int status;
4187 4405
4188 status = decode_op_hdr(xdr, OP_GETATTR); 4406 status = decode_op_hdr(xdr, OP_GETATTR);
@@ -4268,10 +4486,32 @@ static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap,
4268 return status; 4486 return status;
4269} 4487}
4270 4488
4489/*
 4490 * The preferred block size for layout-directed I/O
4491 */
4492static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap,
4493 uint32_t *res)
4494{
4495 __be32 *p;
4496
4497 dprintk("%s: bitmap is %x\n", __func__, bitmap[2]);
4498 *res = 0;
4499 if (bitmap[2] & FATTR4_WORD2_LAYOUT_BLKSIZE) {
4500 p = xdr_inline_decode(xdr, 4);
4501 if (unlikely(!p)) {
4502 print_overflow_msg(__func__, xdr);
4503 return -EIO;
4504 }
4505 *res = be32_to_cpup(p);
4506 bitmap[2] &= ~FATTR4_WORD2_LAYOUT_BLKSIZE;
4507 }
4508 return 0;
4509}
4510
4271static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) 4511static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
4272{ 4512{
4273 __be32 *savep; 4513 __be32 *savep;
4274 uint32_t attrlen, bitmap[2]; 4514 uint32_t attrlen, bitmap[3];
4275 int status; 4515 int status;
4276 4516
4277 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) 4517 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
@@ -4299,6 +4539,9 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
4299 status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype); 4539 status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype);
4300 if (status != 0) 4540 if (status != 0)
4301 goto xdr_error; 4541 goto xdr_error;
4542 status = decode_attr_layout_blksize(xdr, bitmap, &fsinfo->blksize);
4543 if (status)
4544 goto xdr_error;
4302 4545
4303 status = verify_attr_len(xdr, savep, attrlen); 4546 status = verify_attr_len(xdr, savep, attrlen);
4304xdr_error: 4547xdr_error:
@@ -4718,7 +4961,7 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
4718{ 4961{
4719 __be32 *savep; 4962 __be32 *savep;
4720 uint32_t attrlen, 4963 uint32_t attrlen,
4721 bitmap[2] = {0}; 4964 bitmap[3] = {0};
4722 struct kvec *iov = req->rq_rcv_buf.head; 4965 struct kvec *iov = req->rq_rcv_buf.head;
4723 int status; 4966 int status;
4724 4967
@@ -4977,11 +5220,17 @@ static int decode_exchange_id(struct xdr_stream *xdr,
4977 if (unlikely(status)) 5220 if (unlikely(status))
4978 return status; 5221 return status;
4979 5222
4980 /* Throw away server_scope */ 5223 /* Save server_scope */
4981 status = decode_opaque_inline(xdr, &dummy, &dummy_str); 5224 status = decode_opaque_inline(xdr, &dummy, &dummy_str);
4982 if (unlikely(status)) 5225 if (unlikely(status))
4983 return status; 5226 return status;
4984 5227
5228 if (unlikely(dummy > NFS4_OPAQUE_LIMIT))
5229 return -EIO;
5230
5231 memcpy(res->server_scope->server_scope, dummy_str, dummy);
5232 res->server_scope->server_scope_sz = dummy;
5233
4985 /* Throw away Implementation id array */ 5234 /* Throw away Implementation id array */
4986 status = decode_opaque_inline(xdr, &dummy, &dummy_str); 5235 status = decode_opaque_inline(xdr, &dummy, &dummy_str);
4987 if (unlikely(status)) 5236 if (unlikely(status))
@@ -5141,6 +5390,53 @@ out_overflow:
5141} 5390}
5142 5391
5143#if defined(CONFIG_NFS_V4_1) 5392#if defined(CONFIG_NFS_V4_1)
5393/*
5394 * TODO: Need to handle case when EOF != true;
5395 */
5396static int decode_getdevicelist(struct xdr_stream *xdr,
5397 struct pnfs_devicelist *res)
5398{
5399 __be32 *p;
5400 int status, i;
5401 struct nfs_writeverf verftemp;
5402
5403 status = decode_op_hdr(xdr, OP_GETDEVICELIST);
5404 if (status)
5405 return status;
5406
5407 p = xdr_inline_decode(xdr, 8 + 8 + 4);
5408 if (unlikely(!p))
5409 goto out_overflow;
5410
5411 /* TODO: Skip cookie for now */
5412 p += 2;
5413
5414 /* Read verifier */
5415 p = xdr_decode_opaque_fixed(p, verftemp.verifier, 8);
5416
5417 res->num_devs = be32_to_cpup(p);
5418
5419 dprintk("%s: num_dev %d\n", __func__, res->num_devs);
5420
5421 if (res->num_devs > NFS4_PNFS_GETDEVLIST_MAXNUM) {
 5422 printk(KERN_ERR "%s: too many devices returned: %u\n",
5423 __func__, res->num_devs);
5424 return -EIO;
5425 }
5426
5427 p = xdr_inline_decode(xdr,
5428 res->num_devs * NFS4_DEVICEID4_SIZE + 4);
5429 if (unlikely(!p))
5430 goto out_overflow;
5431 for (i = 0; i < res->num_devs; i++)
5432 p = xdr_decode_opaque_fixed(p, res->dev_id[i].data,
5433 NFS4_DEVICEID4_SIZE);
5434 res->eof = be32_to_cpup(p);
5435 return 0;
5436out_overflow:
5437 print_overflow_msg(__func__, xdr);
5438 return -EIO;
5439}
5144 5440
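decode_getdevicelist() bounds the server-supplied count against NFS4_PNFS_GETDEVLIST_MAXNUM before sizing the bulk xdr_inline_decode(), so a misbehaving server cannot make the client read past the end of the reply. The clamp-then-bulk-read shape as a standalone sketch (the MAXNUM bound is an illustrative stand-in; a deviceid4 is 16 bytes):

#include <errno.h>
#include <stdint.h>
#include <string.h>

#define MAXNUM 32   /* stand-in for NFS4_PNFS_GETDEVLIST_MAXNUM */
#define ID_SZ  16   /* deviceid4 size, cf. NFS4_DEVICEID4_SIZE */

/* Validate an untrusted count before the bulk copy. */
int read_device_ids(const uint8_t *buf, size_t buflen, uint32_t num,
                    uint8_t ids[][ID_SZ])
{
        if (num > MAXNUM)
                return -EIO;    /* reject an absurd count */
        if (buflen < (size_t)num * ID_SZ)
                return -EIO;    /* reply shorter than it claims */
        memcpy(ids, buf, (size_t)num * ID_SZ);
        return 0;
}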
5145static int decode_getdeviceinfo(struct xdr_stream *xdr, 5441static int decode_getdeviceinfo(struct xdr_stream *xdr,
5146 struct pnfs_device *pdev) 5442 struct pnfs_device *pdev)
@@ -5303,6 +5599,7 @@ static int decode_layoutcommit(struct xdr_stream *xdr,
5303 int status; 5599 int status;
5304 5600
5305 status = decode_op_hdr(xdr, OP_LAYOUTCOMMIT); 5601 status = decode_op_hdr(xdr, OP_LAYOUTCOMMIT);
5602 res->status = status;
5306 if (status) 5603 if (status)
5307 return status; 5604 return status;
5308 5605
@@ -5322,6 +5619,55 @@ out_overflow:
5322 print_overflow_msg(__func__, xdr); 5619 print_overflow_msg(__func__, xdr);
5323 return -EIO; 5620 return -EIO;
5324} 5621}
5622
5623static int decode_test_stateid(struct xdr_stream *xdr,
5624 struct nfs41_test_stateid_res *res)
5625{
5626 __be32 *p;
5627 int status;
5628 int num_res;
5629
5630 status = decode_op_hdr(xdr, OP_TEST_STATEID);
5631 if (status)
5632 return status;
5633
5634 p = xdr_inline_decode(xdr, 4);
5635 if (unlikely(!p))
5636 goto out_overflow;
5637 num_res = be32_to_cpup(p++);
5638 if (num_res != 1)
5639 goto out;
5640
5641 p = xdr_inline_decode(xdr, 4);
5642 if (unlikely(!p))
5643 goto out_overflow;
5644 res->status = be32_to_cpup(p++);
5645 return res->status;
5646out_overflow:
5647 print_overflow_msg(__func__, xdr);
5648out:
5649 return -EIO;
5650}
5651
5652static int decode_free_stateid(struct xdr_stream *xdr,
5653 struct nfs41_free_stateid_res *res)
5654{
5655 __be32 *p;
5656 int status;
5657
5658 status = decode_op_hdr(xdr, OP_FREE_STATEID);
5659 if (status)
5660 return status;
5661
5662 p = xdr_inline_decode(xdr, 4);
5663 if (unlikely(!p))
5664 goto out_overflow;
5665 res->status = be32_to_cpup(p++);
5666 return res->status;
5667out_overflow:
5668 print_overflow_msg(__func__, xdr);
5669 return -EIO;
5670}
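
decode_free_stateid() is exactly this shape: an operation header followed by one status word that is stored in the result and also propagated as the return value. decode_test_stateid() merely adds a one-element result count in front. If more stateid operations arrive, the common tail could be factored out; the helper below is a hypothetical sketch, not part of this patch:

/* Hypothetical refactoring sketch, not part of this patch. */
static int decode_stateid_op_status(struct xdr_stream *xdr,
				    enum nfs_opnum4 op, u32 *op_status)
{
	__be32 *p;
	int status;

	status = decode_op_hdr(xdr, op);
	if (status)
		return status;
	p = xdr_inline_decode(xdr, 4);
	if (unlikely(!p)) {
		print_overflow_msg(__func__, xdr);
		return -EIO;
	}
	*op_status = be32_to_cpup(p);
	return *op_status;
}
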
5325#endif /* CONFIG_NFS_V4_1 */ 5671#endif /* CONFIG_NFS_V4_1 */
5326 5672
5327/* 5673/*
@@ -6366,6 +6712,32 @@ static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp,
6366} 6712}
6367 6713
6368/* 6714/*
6715 * Decode GETDEVICELIST response
6716 */
6717static int nfs4_xdr_dec_getdevicelist(struct rpc_rqst *rqstp,
6718 struct xdr_stream *xdr,
6719 struct nfs4_getdevicelist_res *res)
6720{
6721 struct compound_hdr hdr;
6722 int status;
6723
6724 dprintk("decoding getdevicelist!\n");
6725
6726 status = decode_compound_hdr(xdr, &hdr);
6727 if (status != 0)
6728 goto out;
6729 status = decode_sequence(xdr, &res->seq_res, rqstp);
6730 if (status != 0)
6731 goto out;
6732 status = decode_putfh(xdr);
6733 if (status != 0)
6734 goto out;
6735 status = decode_getdevicelist(xdr, res->devlist);
6736out:
6737 return status;
6738}
6739
6740/*
6369 * Decode GETDEVINFO response 6741 * Decode GETDEVINFO response
6370 */ 6742 */
6371static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, 6743static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp,
@@ -6461,6 +6833,72 @@ static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp,
6461out: 6833out:
6462 return status; 6834 return status;
6463} 6835}
6836
6837/*
6838 * Decode SECINFO_NO_NAME response
6839 */
6840static int nfs4_xdr_dec_secinfo_no_name(struct rpc_rqst *rqstp,
6841 struct xdr_stream *xdr,
6842 struct nfs4_secinfo_res *res)
6843{
6844 struct compound_hdr hdr;
6845 int status;
6846
6847 status = decode_compound_hdr(xdr, &hdr);
6848 if (status)
6849 goto out;
6850 status = decode_sequence(xdr, &res->seq_res, rqstp);
6851 if (status)
6852 goto out;
6853 status = decode_putrootfh(xdr);
6854 if (status)
6855 goto out;
6856 status = decode_secinfo(xdr, res);
6857out:
6858 return status;
6859}
6860
6861/*
6862 * Decode TEST_STATEID response
6863 */
6864static int nfs4_xdr_dec_test_stateid(struct rpc_rqst *rqstp,
6865 struct xdr_stream *xdr,
6866 struct nfs41_test_stateid_res *res)
6867{
6868 struct compound_hdr hdr;
6869 int status;
6870
6871 status = decode_compound_hdr(xdr, &hdr);
6872 if (status)
6873 goto out;
6874 status = decode_sequence(xdr, &res->seq_res, rqstp);
6875 if (status)
6876 goto out;
6877 status = decode_test_stateid(xdr, res);
6878out:
6879 return status;
6880}
6881
6882/*
6883 * Decode FREE_STATEID response
6884 */
6885static int nfs4_xdr_dec_free_stateid(struct rpc_rqst *rqstp,
6886 struct xdr_stream *xdr,
6887 struct nfs41_free_stateid_res *res)
6888{
6889 struct compound_hdr hdr;
6890 int status;
6891
6892 status = decode_compound_hdr(xdr, &hdr);
6893 if (status)
6894 goto out;
6895 status = decode_sequence(xdr, &res->seq_res, rqstp);
6896 if (status)
6897 goto out;
6898 status = decode_free_stateid(xdr, res);
6899out:
6900 return status;
6901}
6464#endif /* CONFIG_NFS_V4_1 */ 6902#endif /* CONFIG_NFS_V4_1 */
6465 6903
6466/** 6904/**
@@ -6480,7 +6918,7 @@ out:
6480int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, 6918int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
6481 int plus) 6919 int plus)
6482{ 6920{
6483 uint32_t bitmap[2] = {0}; 6921 uint32_t bitmap[3] = {0};
6484 uint32_t len; 6922 uint32_t len;
6485 __be32 *p = xdr_inline_decode(xdr, 4); 6923 __be32 *p = xdr_inline_decode(xdr, 4);
6486 if (unlikely(!p)) 6924 if (unlikely(!p))
@@ -6663,6 +7101,10 @@ struct rpc_procinfo nfs4_procedures[] = {
6663 PROC(LAYOUTGET, enc_layoutget, dec_layoutget), 7101 PROC(LAYOUTGET, enc_layoutget, dec_layoutget),
6664 PROC(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit), 7102 PROC(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit),
6665 PROC(LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn), 7103 PROC(LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn),
7104 PROC(SECINFO_NO_NAME, enc_secinfo_no_name, dec_secinfo_no_name),
7105 PROC(TEST_STATEID, enc_test_stateid, dec_test_stateid),
7106 PROC(FREE_STATEID, enc_free_stateid, dec_free_stateid),
7107 PROC(GETDEVICELIST, enc_getdevicelist, dec_getdevicelist),
6666#endif /* CONFIG_NFS_V4_1 */ 7108#endif /* CONFIG_NFS_V4_1 */
6667}; 7109};
6668 7110
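
Taken together with the pnfs_devicelist definition this patch adds to fs/nfs/pnfs.h, the GETDEVICELIST path returns a bounded result: at most NFS4_PNFS_GETDEVLIST_MAXNUM deviceids plus an eof flag per call. A hedged usage sketch against those declared prototypes (function name illustrative, error handling trimmed):

/* Illustrative only: walk one GETDEVICELIST reply. */
static int example_walk_devicelist(struct nfs_server *server,
				   const struct nfs_fh *fh)
{
	struct pnfs_devicelist devlist = { .eof = 0, .num_devs = 0 };
	unsigned int i;
	int err;

	err = nfs4_proc_getdevicelist(server, fh, &devlist);
	if (err)
		return err;
	for (i = 0; i < devlist.num_devs; i++)
		nfs4_print_deviceid(&devlist.dev_id[i]);
	/* Per the decoder's TODO, replies with eof != true (and the
	 * returned cookie) are not handled yet, so enumeration cannot
	 * be resumed from here. */
	return 0;
}
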
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 8ff2ea3f10ef..d0cda12fddc3 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -479,7 +479,6 @@ static int _io_check(struct objio_state *ios, bool is_write)
479 for (i = 0; i < ios->numdevs; i++) { 479 for (i = 0; i < ios->numdevs; i++) {
480 struct osd_sense_info osi; 480 struct osd_sense_info osi;
481 struct osd_request *or = ios->per_dev[i].or; 481 struct osd_request *or = ios->per_dev[i].or;
482 unsigned dev;
483 int ret; 482 int ret;
484 483
485 if (!or) 484 if (!or)
@@ -500,9 +499,8 @@ static int _io_check(struct objio_state *ios, bool is_write)
500 499
501 continue; /* we recovered */ 500 continue; /* we recovered */
502 } 501 }
503 dev = ios->per_dev[i].dev; 502 objlayout_io_set_result(&ios->ol_state, i,
504 objlayout_io_set_result(&ios->ol_state, dev, 503 &ios->layout->comps[i].oc_object_id,
505 &ios->layout->comps[dev].oc_object_id,
506 osd_pri_2_pnfs_err(osi.osd_err_pri), 504 osd_pri_2_pnfs_err(osi.osd_err_pri),
507 ios->per_dev[i].offset, 505 ios->per_dev[i].offset,
508 ios->per_dev[i].length, 506 ios->per_dev[i].length,
@@ -589,22 +587,19 @@ static void _calc_stripe_info(struct objio_state *ios, u64 file_offset,
589} 587}
590 588
591static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg, 589static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg,
592 unsigned pgbase, struct _objio_per_comp *per_dev, int cur_len, 590 unsigned pgbase, struct _objio_per_comp *per_dev, int len,
593 gfp_t gfp_flags) 591 gfp_t gfp_flags)
594{ 592{
595 unsigned pg = *cur_pg; 593 unsigned pg = *cur_pg;
594 int cur_len = len;
596 struct request_queue *q = 595 struct request_queue *q =
597 osd_request_queue(_io_od(ios, per_dev->dev)); 596 osd_request_queue(_io_od(ios, per_dev->dev));
598 597
599 per_dev->length += cur_len;
600
601 if (per_dev->bio == NULL) { 598 if (per_dev->bio == NULL) {
602 unsigned stripes = ios->layout->num_comps / 599 unsigned pages_in_stripe = ios->layout->group_width *
603 ios->layout->mirrors_p1;
604 unsigned pages_in_stripe = stripes *
605 (ios->layout->stripe_unit / PAGE_SIZE); 600 (ios->layout->stripe_unit / PAGE_SIZE);
606 unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) / 601 unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) /
607 stripes; 602 ios->layout->group_width;
608 603
609 if (BIO_MAX_PAGES_KMALLOC < bio_size) 604 if (BIO_MAX_PAGES_KMALLOC < bio_size)
610 bio_size = BIO_MAX_PAGES_KMALLOC; 605 bio_size = BIO_MAX_PAGES_KMALLOC;
@@ -632,6 +627,7 @@ static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg,
632 } 627 }
633 BUG_ON(cur_len); 628 BUG_ON(cur_len);
634 629
630 per_dev->length += len;
635 *cur_pg = pg; 631 *cur_pg = pg;
636 return 0; 632 return 0;
637} 633}
@@ -650,7 +646,7 @@ static int _prepare_one_group(struct objio_state *ios, u64 length,
650 int ret = 0; 646 int ret = 0;
651 647
652 while (length) { 648 while (length) {
653 struct _objio_per_comp *per_dev = &ios->per_dev[dev]; 649 struct _objio_per_comp *per_dev = &ios->per_dev[dev - first_dev];
654 unsigned cur_len, page_off = 0; 650 unsigned cur_len, page_off = 0;
655 651
656 if (!per_dev->length) { 652 if (!per_dev->length) {
@@ -670,8 +666,8 @@ static int _prepare_one_group(struct objio_state *ios, u64 length,
670 cur_len = stripe_unit; 666 cur_len = stripe_unit;
671 } 667 }
672 668
673 if (max_comp < dev) 669 if (max_comp < dev - first_dev)
674 max_comp = dev; 670 max_comp = dev - first_dev;
675 } else { 671 } else {
676 cur_len = stripe_unit; 672 cur_len = stripe_unit;
677 } 673 }
@@ -806,7 +802,7 @@ static int _read_mirrors(struct objio_state *ios, unsigned cur_comp)
806 struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; 802 struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
807 unsigned dev = per_dev->dev; 803 unsigned dev = per_dev->dev;
808 struct pnfs_osd_object_cred *cred = 804 struct pnfs_osd_object_cred *cred =
809 &ios->layout->comps[dev]; 805 &ios->layout->comps[cur_comp];
810 struct osd_obj_id obj = { 806 struct osd_obj_id obj = {
811 .partition = cred->oc_object_id.oid_partition_id, 807 .partition = cred->oc_object_id.oid_partition_id,
812 .id = cred->oc_object_id.oid_object_id, 808 .id = cred->oc_object_id.oid_object_id,
@@ -904,7 +900,7 @@ static int _write_mirrors(struct objio_state *ios, unsigned cur_comp)
904 for (; cur_comp < last_comp; ++cur_comp, ++dev) { 900 for (; cur_comp < last_comp; ++cur_comp, ++dev) {
905 struct osd_request *or = NULL; 901 struct osd_request *or = NULL;
906 struct pnfs_osd_object_cred *cred = 902 struct pnfs_osd_object_cred *cred =
907 &ios->layout->comps[dev]; 903 &ios->layout->comps[cur_comp];
908 struct osd_obj_id obj = { 904 struct osd_obj_id obj = {
909 .partition = cred->oc_object_id.oid_partition_id, 905 .partition = cred->oc_object_id.oid_partition_id,
910 .id = cred->oc_object_id.oid_object_id, 906 .id = cred->oc_object_id.oid_object_id,
@@ -1000,13 +996,22 @@ static bool objio_pg_test(struct nfs_pageio_descriptor *pgio,
1000 if (!pnfs_generic_pg_test(pgio, prev, req)) 996 if (!pnfs_generic_pg_test(pgio, prev, req))
1001 return false; 997 return false;
1002 998
1003 if (pgio->pg_lseg == NULL)
1004 return true;
1005
1006 return pgio->pg_count + req->wb_bytes <= 999 return pgio->pg_count + req->wb_bytes <=
1007 OBJIO_LSEG(pgio->pg_lseg)->max_io_size; 1000 OBJIO_LSEG(pgio->pg_lseg)->max_io_size;
1008} 1001}
1009 1002
1003static const struct nfs_pageio_ops objio_pg_read_ops = {
1004 .pg_init = pnfs_generic_pg_init_read,
1005 .pg_test = objio_pg_test,
1006 .pg_doio = pnfs_generic_pg_readpages,
1007};
1008
1009static const struct nfs_pageio_ops objio_pg_write_ops = {
1010 .pg_init = pnfs_generic_pg_init_write,
1011 .pg_test = objio_pg_test,
1012 .pg_doio = pnfs_generic_pg_writepages,
1013};
1014
1010static struct pnfs_layoutdriver_type objlayout_type = { 1015static struct pnfs_layoutdriver_type objlayout_type = {
1011 .id = LAYOUT_OSD2_OBJECTS, 1016 .id = LAYOUT_OSD2_OBJECTS,
1012 .name = "LAYOUT_OSD2_OBJECTS", 1017 .name = "LAYOUT_OSD2_OBJECTS",
@@ -1020,7 +1025,8 @@ static struct pnfs_layoutdriver_type objlayout_type = {
1020 1025
1021 .read_pagelist = objlayout_read_pagelist, 1026 .read_pagelist = objlayout_read_pagelist,
1022 .write_pagelist = objlayout_write_pagelist, 1027 .write_pagelist = objlayout_write_pagelist,
1023 .pg_test = objio_pg_test, 1028 .pg_read_ops = &objio_pg_read_ops,
1029 .pg_write_ops = &objio_pg_write_ops,
1024 1030
1025 .free_deviceid_node = objio_free_deviceid_node, 1031 .free_deviceid_node = objio_free_deviceid_node,
1026 1032
@@ -1055,5 +1061,7 @@ objlayout_exit(void)
1055 __func__); 1061 __func__);
1056} 1062}
1057 1063
1064MODULE_ALIAS("nfs-layouttype4-2");
1065
1058module_init(objlayout_init); 1066module_init(objlayout_init);
1059module_exit(objlayout_exit); 1067module_exit(objlayout_exit);
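
The new MODULE_ALIAS ties the module to pNFS layout type 2 (LAYOUT_OSD2_OBJECTS), which lets the client core demand-load it when a server advertises that layout type. A hedged sketch of the loading side; the real call site lives in the pnfs core, and the function here is illustrative:

#include <linux/kmod.h>

/* Illustrative only: demand-load a layout driver by layout type id. */
static int example_load_layoutdriver(u32 layout_id)
{
	/* layout_id == 2 yields "nfs-layouttype4-2", which matches the
	 * MODULE_ALIAS added above. */
	return request_module("nfs-layouttype4-%u", layout_id);
}
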
diff --git a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
index 16fc758e9123..b3918f7ac34d 100644
--- a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
+++ b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
@@ -170,6 +170,9 @@ int pnfs_osd_xdr_decode_layout_map(struct pnfs_osd_layout *layout,
170 p = _osd_xdr_decode_data_map(p, &layout->olo_map); 170 p = _osd_xdr_decode_data_map(p, &layout->olo_map);
171 layout->olo_comps_index = be32_to_cpup(p++); 171 layout->olo_comps_index = be32_to_cpup(p++);
172 layout->olo_num_comps = be32_to_cpup(p++); 172 layout->olo_num_comps = be32_to_cpup(p++);
173 dprintk("%s: olo_comps_index=%d olo_num_comps=%d\n", __func__,
174 layout->olo_comps_index, layout->olo_num_comps);
175
173 iter->total_comps = layout->olo_num_comps; 176 iter->total_comps = layout->olo_num_comps;
174 return 0; 177 return 0;
175} 178}
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 18449f43c568..b60970cc7f1f 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -230,7 +230,7 @@ EXPORT_SYMBOL_GPL(nfs_generic_pg_test);
230 */ 230 */
231void nfs_pageio_init(struct nfs_pageio_descriptor *desc, 231void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
232 struct inode *inode, 232 struct inode *inode,
233 int (*doio)(struct nfs_pageio_descriptor *), 233 const struct nfs_pageio_ops *pg_ops,
234 size_t bsize, 234 size_t bsize,
235 int io_flags) 235 int io_flags)
236{ 236{
@@ -240,13 +240,12 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
240 desc->pg_bsize = bsize; 240 desc->pg_bsize = bsize;
241 desc->pg_base = 0; 241 desc->pg_base = 0;
242 desc->pg_moreio = 0; 242 desc->pg_moreio = 0;
243 desc->pg_recoalesce = 0;
243 desc->pg_inode = inode; 244 desc->pg_inode = inode;
244 desc->pg_doio = doio; 245 desc->pg_ops = pg_ops;
245 desc->pg_ioflags = io_flags; 246 desc->pg_ioflags = io_flags;
246 desc->pg_error = 0; 247 desc->pg_error = 0;
247 desc->pg_lseg = NULL; 248 desc->pg_lseg = NULL;
248 desc->pg_test = nfs_generic_pg_test;
249 pnfs_pageio_init(desc, inode);
250} 249}
251 250
252/** 251/**
@@ -276,7 +275,7 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev,
276 return false; 275 return false;
277 if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE) 276 if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE)
278 return false; 277 return false;
279 return pgio->pg_test(pgio, prev, req); 278 return pgio->pg_ops->pg_test(pgio, prev, req);
280} 279}
281 280
282/** 281/**
@@ -297,6 +296,8 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
297 if (!nfs_can_coalesce_requests(prev, req, desc)) 296 if (!nfs_can_coalesce_requests(prev, req, desc))
298 return 0; 297 return 0;
299 } else { 298 } else {
299 if (desc->pg_ops->pg_init)
300 desc->pg_ops->pg_init(desc, req);
300 desc->pg_base = req->wb_pgbase; 301 desc->pg_base = req->wb_pgbase;
301 } 302 }
302 nfs_list_remove_request(req); 303 nfs_list_remove_request(req);
@@ -311,7 +312,7 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
311static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc) 312static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
312{ 313{
313 if (!list_empty(&desc->pg_list)) { 314 if (!list_empty(&desc->pg_list)) {
314 int error = desc->pg_doio(desc); 315 int error = desc->pg_ops->pg_doio(desc);
315 if (error < 0) 316 if (error < 0)
316 desc->pg_error = error; 317 desc->pg_error = error;
317 else 318 else
@@ -331,7 +332,7 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
331 * Returns true if the request 'req' was successfully coalesced into the 332 * Returns true if the request 'req' was successfully coalesced into the
332 * existing list of pages 'desc'. 333 * existing list of pages 'desc'.
333 */ 334 */
334int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, 335static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
335 struct nfs_page *req) 336 struct nfs_page *req)
336{ 337{
337 while (!nfs_pageio_do_add_request(desc, req)) { 338 while (!nfs_pageio_do_add_request(desc, req)) {
@@ -340,17 +341,67 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
340 if (desc->pg_error < 0) 341 if (desc->pg_error < 0)
341 return 0; 342 return 0;
342 desc->pg_moreio = 0; 343 desc->pg_moreio = 0;
344 if (desc->pg_recoalesce)
345 return 0;
343 } 346 }
344 return 1; 347 return 1;
345} 348}
346 349
350static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc)
351{
352 LIST_HEAD(head);
353
354 do {
355 list_splice_init(&desc->pg_list, &head);
356 desc->pg_bytes_written -= desc->pg_count;
357 desc->pg_count = 0;
358 desc->pg_base = 0;
359 desc->pg_recoalesce = 0;
360
361 while (!list_empty(&head)) {
362 struct nfs_page *req;
363
364 req = list_first_entry(&head, struct nfs_page, wb_list);
365 nfs_list_remove_request(req);
366 if (__nfs_pageio_add_request(desc, req))
367 continue;
368 if (desc->pg_error < 0)
369 return 0;
370 break;
371 }
372 } while (desc->pg_recoalesce);
373 return 1;
374}
375
376int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
377 struct nfs_page *req)
378{
379 int ret;
380
381 do {
382 ret = __nfs_pageio_add_request(desc, req);
383 if (ret)
384 break;
385 if (desc->pg_error < 0)
386 break;
387 ret = nfs_do_recoalesce(desc);
388 } while (ret);
389 return ret;
390}
391
347/** 392/**
348 * nfs_pageio_complete - Complete I/O on an nfs_pageio_descriptor 393 * nfs_pageio_complete - Complete I/O on an nfs_pageio_descriptor
349 * @desc: pointer to io descriptor 394 * @desc: pointer to io descriptor
350 */ 395 */
351void nfs_pageio_complete(struct nfs_pageio_descriptor *desc) 396void nfs_pageio_complete(struct nfs_pageio_descriptor *desc)
352{ 397{
353 nfs_pageio_doio(desc); 398 for (;;) {
399 nfs_pageio_doio(desc);
400 if (!desc->pg_recoalesce)
401 break;
402 if (!nfs_do_recoalesce(desc))
403 break;
404 }
354} 405}
355 406
356/** 407/**
@@ -369,7 +420,7 @@ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index)
369 if (!list_empty(&desc->pg_list)) { 420 if (!list_empty(&desc->pg_list)) {
370 struct nfs_page *prev = nfs_list_entry(desc->pg_list.prev); 421 struct nfs_page *prev = nfs_list_entry(desc->pg_list.prev);
371 if (index != prev->wb_index + 1) 422 if (index != prev->wb_index + 1)
372 nfs_pageio_doio(desc); 423 nfs_pageio_complete(desc);
373 } 424 }
374} 425}
375 426
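
The recoalesce machinery stays behind the existing interface: callers still add requests and flush the descriptor, while nfs_pageio_add_request() and nfs_pageio_complete() internally drain and replay pg_list whenever pg_recoalesce is raised (for example after a pNFS fallback to the MDS). A hedged caller-side sketch, kernel context assumed and the function name illustrative:

/* Illustrative only: push a list of requests through a descriptor. */
static void example_submit(struct nfs_pageio_descriptor *desc,
			   struct list_head *pages)
{
	while (!list_empty(pages)) {
		struct nfs_page *req =
			list_first_entry(pages, struct nfs_page, wb_list);

		nfs_list_remove_request(req);
		if (!nfs_pageio_add_request(desc, req))
			break;		/* desc->pg_error holds the error */
	}
	nfs_pageio_complete(desc);	/* flush; re-runs any recoalesce */
}
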
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 29c0ca7fc347..e550e8836c37 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -28,6 +28,7 @@
28 */ 28 */
29 29
30#include <linux/nfs_fs.h> 30#include <linux/nfs_fs.h>
31#include <linux/nfs_page.h>
31#include "internal.h" 32#include "internal.h"
32#include "pnfs.h" 33#include "pnfs.h"
33#include "iostat.h" 34#include "iostat.h"
@@ -75,8 +76,11 @@ find_pnfs_driver(u32 id)
75void 76void
76unset_pnfs_layoutdriver(struct nfs_server *nfss) 77unset_pnfs_layoutdriver(struct nfs_server *nfss)
77{ 78{
78 if (nfss->pnfs_curr_ld) 79 if (nfss->pnfs_curr_ld) {
80 if (nfss->pnfs_curr_ld->clear_layoutdriver)
81 nfss->pnfs_curr_ld->clear_layoutdriver(nfss);
79 module_put(nfss->pnfs_curr_ld->owner); 82 module_put(nfss->pnfs_curr_ld->owner);
83 }
80 nfss->pnfs_curr_ld = NULL; 84 nfss->pnfs_curr_ld = NULL;
81} 85}
82 86
@@ -87,7 +91,8 @@ unset_pnfs_layoutdriver(struct nfs_server *nfss)
87 * @id layout type. Zero (illegal layout type) indicates pNFS not in use. 91 * @id layout type. Zero (illegal layout type) indicates pNFS not in use.
88 */ 92 */
89void 93void
90set_pnfs_layoutdriver(struct nfs_server *server, u32 id) 94set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh,
95 u32 id)
91{ 96{
92 struct pnfs_layoutdriver_type *ld_type = NULL; 97 struct pnfs_layoutdriver_type *ld_type = NULL;
93 98
@@ -114,6 +119,13 @@ set_pnfs_layoutdriver(struct nfs_server *server, u32 id)
114 goto out_no_driver; 119 goto out_no_driver;
115 } 120 }
116 server->pnfs_curr_ld = ld_type; 121 server->pnfs_curr_ld = ld_type;
122 if (ld_type->set_layoutdriver
123 && ld_type->set_layoutdriver(server, mntfh)) {
124 printk(KERN_ERR "%s: Error initializing pNFS layout driver %u.\n",
125 __func__, id);
126 module_put(ld_type->owner);
127 goto out_no_driver;
128 }
117 129
118 dprintk("%s: pNFS module for %u set\n", __func__, id); 130 dprintk("%s: pNFS module for %u set\n", __func__, id);
119 return; 131 return;
@@ -189,6 +201,7 @@ static void
189pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo) 201pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo)
190{ 202{
191 struct pnfs_layoutdriver_type *ld = NFS_SERVER(lo->plh_inode)->pnfs_curr_ld; 203 struct pnfs_layoutdriver_type *ld = NFS_SERVER(lo->plh_inode)->pnfs_curr_ld;
204 put_rpccred(lo->plh_lc_cred);
192 return ld->alloc_layout_hdr ? ld->free_layout_hdr(lo) : kfree(lo); 205 return ld->alloc_layout_hdr ? ld->free_layout_hdr(lo) : kfree(lo);
193} 206}
194 207
@@ -223,6 +236,7 @@ static void
223init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg) 236init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
224{ 237{
225 INIT_LIST_HEAD(&lseg->pls_list); 238 INIT_LIST_HEAD(&lseg->pls_list);
239 INIT_LIST_HEAD(&lseg->pls_lc_list);
226 atomic_set(&lseg->pls_refcount, 1); 240 atomic_set(&lseg->pls_refcount, 1);
227 smp_mb(); 241 smp_mb();
228 set_bit(NFS_LSEG_VALID, &lseg->pls_flags); 242 set_bit(NFS_LSEG_VALID, &lseg->pls_flags);
@@ -448,11 +462,20 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
448void 462void
449pnfs_destroy_all_layouts(struct nfs_client *clp) 463pnfs_destroy_all_layouts(struct nfs_client *clp)
450{ 464{
465 struct nfs_server *server;
451 struct pnfs_layout_hdr *lo; 466 struct pnfs_layout_hdr *lo;
452 LIST_HEAD(tmp_list); 467 LIST_HEAD(tmp_list);
453 468
469 nfs4_deviceid_mark_client_invalid(clp);
470 nfs4_deviceid_purge_client(clp);
471
454 spin_lock(&clp->cl_lock); 472 spin_lock(&clp->cl_lock);
455 list_splice_init(&clp->cl_layouts, &tmp_list); 473 rcu_read_lock();
474 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
475 if (!list_empty(&server->layouts))
476 list_splice_init(&server->layouts, &tmp_list);
477 }
478 rcu_read_unlock();
456 spin_unlock(&clp->cl_lock); 479 spin_unlock(&clp->cl_lock);
457 480
458 while (!list_empty(&tmp_list)) { 481 while (!list_empty(&tmp_list)) {
@@ -661,6 +684,7 @@ _pnfs_return_layout(struct inode *ino)
661 lrp->args.stateid = stateid; 684 lrp->args.stateid = stateid;
662 lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id; 685 lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id;
663 lrp->args.inode = ino; 686 lrp->args.inode = ino;
687 lrp->args.layout = lo;
664 lrp->clp = NFS_SERVER(ino)->nfs_client; 688 lrp->clp = NFS_SERVER(ino)->nfs_client;
665 689
666 status = nfs4_proc_layoutreturn(lrp); 690 status = nfs4_proc_layoutreturn(lrp);
@@ -805,7 +829,9 @@ out:
805} 829}
806 830
807static struct pnfs_layout_hdr * 831static struct pnfs_layout_hdr *
808alloc_init_layout_hdr(struct inode *ino, gfp_t gfp_flags) 832alloc_init_layout_hdr(struct inode *ino,
833 struct nfs_open_context *ctx,
834 gfp_t gfp_flags)
809{ 835{
810 struct pnfs_layout_hdr *lo; 836 struct pnfs_layout_hdr *lo;
811 837
@@ -817,11 +843,14 @@ alloc_init_layout_hdr(struct inode *ino, gfp_t gfp_flags)
817 INIT_LIST_HEAD(&lo->plh_segs); 843 INIT_LIST_HEAD(&lo->plh_segs);
818 INIT_LIST_HEAD(&lo->plh_bulk_recall); 844 INIT_LIST_HEAD(&lo->plh_bulk_recall);
819 lo->plh_inode = ino; 845 lo->plh_inode = ino;
846 lo->plh_lc_cred = get_rpccred(ctx->state->owner->so_cred);
820 return lo; 847 return lo;
821} 848}
822 849
823static struct pnfs_layout_hdr * 850static struct pnfs_layout_hdr *
824pnfs_find_alloc_layout(struct inode *ino, gfp_t gfp_flags) 851pnfs_find_alloc_layout(struct inode *ino,
852 struct nfs_open_context *ctx,
853 gfp_t gfp_flags)
825{ 854{
826 struct nfs_inode *nfsi = NFS_I(ino); 855 struct nfs_inode *nfsi = NFS_I(ino);
827 struct pnfs_layout_hdr *new = NULL; 856 struct pnfs_layout_hdr *new = NULL;
@@ -836,7 +865,7 @@ pnfs_find_alloc_layout(struct inode *ino, gfp_t gfp_flags)
836 return nfsi->layout; 865 return nfsi->layout;
837 } 866 }
838 spin_unlock(&ino->i_lock); 867 spin_unlock(&ino->i_lock);
839 new = alloc_init_layout_hdr(ino, gfp_flags); 868 new = alloc_init_layout_hdr(ino, ctx, gfp_flags);
840 spin_lock(&ino->i_lock); 869 spin_lock(&ino->i_lock);
841 870
842 if (likely(nfsi->layout == NULL)) /* Won the race? */ 871 if (likely(nfsi->layout == NULL)) /* Won the race? */
@@ -920,7 +949,8 @@ pnfs_update_layout(struct inode *ino,
920 }; 949 };
921 unsigned pg_offset; 950 unsigned pg_offset;
922 struct nfs_inode *nfsi = NFS_I(ino); 951 struct nfs_inode *nfsi = NFS_I(ino);
923 struct nfs_client *clp = NFS_SERVER(ino)->nfs_client; 952 struct nfs_server *server = NFS_SERVER(ino);
953 struct nfs_client *clp = server->nfs_client;
924 struct pnfs_layout_hdr *lo; 954 struct pnfs_layout_hdr *lo;
925 struct pnfs_layout_segment *lseg = NULL; 955 struct pnfs_layout_segment *lseg = NULL;
926 bool first = false; 956 bool first = false;
@@ -928,7 +958,7 @@ pnfs_update_layout(struct inode *ino,
928 if (!pnfs_enabled_sb(NFS_SERVER(ino))) 958 if (!pnfs_enabled_sb(NFS_SERVER(ino)))
929 return NULL; 959 return NULL;
930 spin_lock(&ino->i_lock); 960 spin_lock(&ino->i_lock);
931 lo = pnfs_find_alloc_layout(ino, gfp_flags); 961 lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
932 if (lo == NULL) { 962 if (lo == NULL) {
933 dprintk("%s ERROR: can't get pnfs_layout_hdr\n", __func__); 963 dprintk("%s ERROR: can't get pnfs_layout_hdr\n", __func__);
934 goto out_unlock; 964 goto out_unlock;
@@ -964,7 +994,7 @@ pnfs_update_layout(struct inode *ino,
964 */ 994 */
965 spin_lock(&clp->cl_lock); 995 spin_lock(&clp->cl_lock);
966 BUG_ON(!list_empty(&lo->plh_layouts)); 996 BUG_ON(!list_empty(&lo->plh_layouts));
967 list_add_tail(&lo->plh_layouts, &clp->cl_layouts); 997 list_add_tail(&lo->plh_layouts, &server->layouts);
968 spin_unlock(&clp->cl_lock); 998 spin_unlock(&clp->cl_lock);
969 } 999 }
970 1000
@@ -973,7 +1003,8 @@ pnfs_update_layout(struct inode *ino,
973 arg.offset -= pg_offset; 1003 arg.offset -= pg_offset;
974 arg.length += pg_offset; 1004 arg.length += pg_offset;
975 } 1005 }
976 arg.length = PAGE_CACHE_ALIGN(arg.length); 1006 if (arg.length != NFS4_MAX_UINT64)
1007 arg.length = PAGE_CACHE_ALIGN(arg.length);
977 1008
978 lseg = send_layoutget(lo, ctx, &arg, gfp_flags); 1009 lseg = send_layoutget(lo, ctx, &arg, gfp_flags);
979 if (!lseg && first) { 1010 if (!lseg && first) {
@@ -991,6 +1022,7 @@ out_unlock:
991 spin_unlock(&ino->i_lock); 1022 spin_unlock(&ino->i_lock);
992 goto out; 1023 goto out;
993} 1024}
1025EXPORT_SYMBOL_GPL(pnfs_update_layout);
994 1026
995int 1027int
996pnfs_layout_process(struct nfs4_layoutget *lgp) 1028pnfs_layout_process(struct nfs4_layoutget *lgp)
@@ -1048,35 +1080,71 @@ out_forget_reply:
1048 goto out; 1080 goto out;
1049} 1081}
1050 1082
1083void
1084pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
1085{
1086 BUG_ON(pgio->pg_lseg != NULL);
1087
1088 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
1089 req->wb_context,
1090 req_offset(req),
1091 req->wb_bytes,
1092 IOMODE_READ,
1093 GFP_KERNEL);
1094 /* If no lseg, fall back to read through mds */
1095 if (pgio->pg_lseg == NULL)
1096 nfs_pageio_reset_read_mds(pgio);
1097
1098}
1099EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_read);
1100
1101void
1102pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
1103{
1104 BUG_ON(pgio->pg_lseg != NULL);
1105
1106 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
1107 req->wb_context,
1108 req_offset(req),
1109 req->wb_bytes,
1110 IOMODE_RW,
1111 GFP_NOFS);
1112 /* If no lseg, fall back to write through mds */
1113 if (pgio->pg_lseg == NULL)
1114 nfs_pageio_reset_write_mds(pgio);
1115}
1116EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write);
1117
1051bool 1118bool
1052pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, 1119pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode)
1053 struct nfs_page *req)
1054{ 1120{
1055 enum pnfs_iomode access_type; 1121 struct nfs_server *server = NFS_SERVER(inode);
1056 gfp_t gfp_flags; 1122 struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld;
1057 1123
1058 /* We assume that pg_ioflags == 0 iff we're reading a page */ 1124 if (ld == NULL)
1059 if (pgio->pg_ioflags == 0) { 1125 return false;
1060 access_type = IOMODE_READ; 1126 nfs_pageio_init(pgio, inode, ld->pg_read_ops, server->rsize, 0);
1061 gfp_flags = GFP_KERNEL; 1127 return true;
1062 } else { 1128}
1063 access_type = IOMODE_RW;
1064 gfp_flags = GFP_NOFS;
1065 }
1066 1129
1067 if (pgio->pg_lseg == NULL) { 1130bool
1068 if (pgio->pg_count != prev->wb_bytes) 1131pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, int ioflags)
1069 return true; 1132{
1070 /* This is first coelesce call for a series of nfs_pages */ 1133 struct nfs_server *server = NFS_SERVER(inode);
1071 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, 1134 struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld;
1072 prev->wb_context, 1135
1073 req_offset(prev), 1136 if (ld == NULL)
1074 pgio->pg_count, 1137 return false;
1075 access_type, 1138 nfs_pageio_init(pgio, inode, ld->pg_write_ops, server->wsize, ioflags);
1076 gfp_flags); 1139 return true;
1077 if (pgio->pg_lseg == NULL) 1140}
1078 return true; 1141
1079 } 1142bool
1143pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
1144 struct nfs_page *req)
1145{
1146 if (pgio->pg_lseg == NULL)
1147 return nfs_generic_pg_test(pgio, prev, req);
1080 1148
1081 /* 1149 /*
1082 * Test if a nfs_page is fully contained in the pnfs_layout_range. 1150 * Test if a nfs_page is fully contained in the pnfs_layout_range.
@@ -1120,15 +1188,30 @@ pnfs_ld_write_done(struct nfs_write_data *data)
1120} 1188}
1121EXPORT_SYMBOL_GPL(pnfs_ld_write_done); 1189EXPORT_SYMBOL_GPL(pnfs_ld_write_done);
1122 1190
1123enum pnfs_try_status 1191static void
1192pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,
1193 struct nfs_write_data *data)
1194{
1195 list_splice_tail_init(&data->pages, &desc->pg_list);
1196 if (data->req && list_empty(&data->req->wb_list))
1197 nfs_list_add_request(data->req, &desc->pg_list);
1198 nfs_pageio_reset_write_mds(desc);
1199 desc->pg_recoalesce = 1;
1200 nfs_writedata_release(data);
1201}
1202
1203static enum pnfs_try_status
1124pnfs_try_to_write_data(struct nfs_write_data *wdata, 1204pnfs_try_to_write_data(struct nfs_write_data *wdata,
1125 const struct rpc_call_ops *call_ops, int how) 1205 const struct rpc_call_ops *call_ops,
1206 struct pnfs_layout_segment *lseg,
1207 int how)
1126{ 1208{
1127 struct inode *inode = wdata->inode; 1209 struct inode *inode = wdata->inode;
1128 enum pnfs_try_status trypnfs; 1210 enum pnfs_try_status trypnfs;
1129 struct nfs_server *nfss = NFS_SERVER(inode); 1211 struct nfs_server *nfss = NFS_SERVER(inode);
1130 1212
1131 wdata->mds_ops = call_ops; 1213 wdata->mds_ops = call_ops;
1214 wdata->lseg = get_lseg(lseg);
1132 1215
1133 dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__, 1216 dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__,
1134 inode->i_ino, wdata->args.count, wdata->args.offset, how); 1217 inode->i_ino, wdata->args.count, wdata->args.offset, how);
@@ -1144,6 +1227,44 @@ pnfs_try_to_write_data(struct nfs_write_data *wdata,
1144 return trypnfs; 1227 return trypnfs;
1145} 1228}
1146 1229
1230static void
1231pnfs_do_multiple_writes(struct nfs_pageio_descriptor *desc, struct list_head *head, int how)
1232{
1233 struct nfs_write_data *data;
1234 const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
1235 struct pnfs_layout_segment *lseg = desc->pg_lseg;
1236
1237 desc->pg_lseg = NULL;
1238 while (!list_empty(head)) {
1239 enum pnfs_try_status trypnfs;
1240
1241 data = list_entry(head->next, struct nfs_write_data, list);
1242 list_del_init(&data->list);
1243
1244 trypnfs = pnfs_try_to_write_data(data, call_ops, lseg, how);
1245 if (trypnfs == PNFS_NOT_ATTEMPTED)
1246 pnfs_write_through_mds(desc, data);
1247 }
1248 put_lseg(lseg);
1249}
1250
1251int
1252pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
1253{
1254 LIST_HEAD(head);
1255 int ret;
1256
1257 ret = nfs_generic_flush(desc, &head);
1258 if (ret != 0) {
1259 put_lseg(desc->pg_lseg);
1260 desc->pg_lseg = NULL;
1261 return ret;
1262 }
1263 pnfs_do_multiple_writes(desc, &head, desc->pg_ioflags);
1264 return 0;
1265}
1266EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);
1267
1147/* 1268/*
1148 * Called by non rpc-based layout drivers 1269 * Called by non rpc-based layout drivers
1149 */ 1270 */
@@ -1167,18 +1288,32 @@ pnfs_ld_read_done(struct nfs_read_data *data)
1167} 1288}
1168EXPORT_SYMBOL_GPL(pnfs_ld_read_done); 1289EXPORT_SYMBOL_GPL(pnfs_ld_read_done);
1169 1290
1291static void
1292pnfs_read_through_mds(struct nfs_pageio_descriptor *desc,
1293 struct nfs_read_data *data)
1294{
1295 list_splice_tail_init(&data->pages, &desc->pg_list);
1296 if (data->req && list_empty(&data->req->wb_list))
1297 nfs_list_add_request(data->req, &desc->pg_list);
1298 nfs_pageio_reset_read_mds(desc);
1299 desc->pg_recoalesce = 1;
1300 nfs_readdata_release(data);
1301}
1302
1170/* 1303/*
1171 * Call the appropriate parallel I/O subsystem read function. 1304 * Call the appropriate parallel I/O subsystem read function.
1172 */ 1305 */
1173enum pnfs_try_status 1306static enum pnfs_try_status
1174pnfs_try_to_read_data(struct nfs_read_data *rdata, 1307pnfs_try_to_read_data(struct nfs_read_data *rdata,
1175 const struct rpc_call_ops *call_ops) 1308 const struct rpc_call_ops *call_ops,
1309 struct pnfs_layout_segment *lseg)
1176{ 1310{
1177 struct inode *inode = rdata->inode; 1311 struct inode *inode = rdata->inode;
1178 struct nfs_server *nfss = NFS_SERVER(inode); 1312 struct nfs_server *nfss = NFS_SERVER(inode);
1179 enum pnfs_try_status trypnfs; 1313 enum pnfs_try_status trypnfs;
1180 1314
1181 rdata->mds_ops = call_ops; 1315 rdata->mds_ops = call_ops;
1316 rdata->lseg = get_lseg(lseg);
1182 1317
1183 dprintk("%s: Reading ino:%lu %u@%llu\n", 1318 dprintk("%s: Reading ino:%lu %u@%llu\n",
1184 __func__, inode->i_ino, rdata->args.count, rdata->args.offset); 1319 __func__, inode->i_ino, rdata->args.count, rdata->args.offset);
@@ -1194,17 +1329,56 @@ pnfs_try_to_read_data(struct nfs_read_data *rdata,
1194 return trypnfs; 1329 return trypnfs;
1195} 1330}
1196 1331
1332static void
1333pnfs_do_multiple_reads(struct nfs_pageio_descriptor *desc, struct list_head *head)
1334{
1335 struct nfs_read_data *data;
1336 const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
1337 struct pnfs_layout_segment *lseg = desc->pg_lseg;
1338
1339 desc->pg_lseg = NULL;
1340 while (!list_empty(head)) {
1341 enum pnfs_try_status trypnfs;
1342
1343 data = list_entry(head->next, struct nfs_read_data, list);
1344 list_del_init(&data->list);
1345
1346 trypnfs = pnfs_try_to_read_data(data, call_ops, lseg);
1347 if (trypnfs == PNFS_NOT_ATTEMPTED)
1348 pnfs_read_through_mds(desc, data);
1349 }
1350 put_lseg(lseg);
1351}
1352
1353int
1354pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
1355{
1356 LIST_HEAD(head);
1357 int ret;
1358
1359 ret = nfs_generic_pagein(desc, &head);
1360 if (ret != 0) {
1361 put_lseg(desc->pg_lseg);
1362 desc->pg_lseg = NULL;
1363 return ret;
1364 }
1365 pnfs_do_multiple_reads(desc, &head);
1366 return 0;
1367}
1368EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages);
1369
1197/* 1370/*
1198 * Currently there is only one (whole file) write lseg. 1371 * There can be multiple RW segments.
1199 */ 1372 */
1200static struct pnfs_layout_segment *pnfs_list_write_lseg(struct inode *inode) 1373static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp)
1201{ 1374{
1202 struct pnfs_layout_segment *lseg, *rv = NULL; 1375 struct pnfs_layout_segment *lseg;
1203 1376
1204 list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) 1377 list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) {
1205 if (lseg->pls_range.iomode == IOMODE_RW) 1378 if (lseg->pls_range.iomode == IOMODE_RW &&
1206 rv = lseg; 1379 test_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags))
1207 return rv; 1380 list_add(&lseg->pls_lc_list, listp);
1381 }
1208} 1382}
1209 1383
1210void 1384void
@@ -1216,17 +1390,19 @@ pnfs_set_layoutcommit(struct nfs_write_data *wdata)
1216 1390
1217 spin_lock(&nfsi->vfs_inode.i_lock); 1391 spin_lock(&nfsi->vfs_inode.i_lock);
1218 if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { 1392 if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
1219 /* references matched in nfs4_layoutcommit_release */
1220 get_lseg(wdata->lseg);
1221 wdata->lseg->pls_lc_cred =
1222 get_rpccred(wdata->args.context->state->owner->so_cred);
1223 mark_as_dirty = true; 1393 mark_as_dirty = true;
1224 dprintk("%s: Set layoutcommit for inode %lu ", 1394 dprintk("%s: Set layoutcommit for inode %lu ",
1225 __func__, wdata->inode->i_ino); 1395 __func__, wdata->inode->i_ino);
1226 } 1396 }
1227 if (end_pos > wdata->lseg->pls_end_pos) 1397 if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &wdata->lseg->pls_flags)) {
1228 wdata->lseg->pls_end_pos = end_pos; 1398 /* references matched in nfs4_layoutcommit_release */
1399 get_lseg(wdata->lseg);
1400 }
1401 if (end_pos > nfsi->layout->plh_lwb)
1402 nfsi->layout->plh_lwb = end_pos;
1229 spin_unlock(&nfsi->vfs_inode.i_lock); 1403 spin_unlock(&nfsi->vfs_inode.i_lock);
1404 dprintk("%s: lseg %p end_pos %llu\n",
1405 __func__, wdata->lseg, nfsi->layout->plh_lwb);
1230 1406
1231 /* if pnfs_layoutcommit_inode() runs between inode locks, the next one 1407 /* if pnfs_layoutcommit_inode() runs between inode locks, the next one
1232 * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */ 1408 * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */
@@ -1235,6 +1411,14 @@ pnfs_set_layoutcommit(struct nfs_write_data *wdata)
1235} 1411}
1236EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit); 1412EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit);
1237 1413
1414void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data)
1415{
1416 struct nfs_server *nfss = NFS_SERVER(data->args.inode);
1417
1418 if (nfss->pnfs_curr_ld->cleanup_layoutcommit)
1419 nfss->pnfs_curr_ld->cleanup_layoutcommit(data);
1420}
1421
1238/* 1422/*
1239 * For the LAYOUT4_NFSV4_1_FILES layout type, NFS_DATA_SYNC WRITEs and 1423 * For the LAYOUT4_NFSV4_1_FILES layout type, NFS_DATA_SYNC WRITEs and
1240 * NFS_UNSTABLE WRITEs with a COMMIT to data servers must store enough 1424 * NFS_UNSTABLE WRITEs with a COMMIT to data servers must store enough
@@ -1248,8 +1432,6 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
1248{ 1432{
1249 struct nfs4_layoutcommit_data *data; 1433 struct nfs4_layoutcommit_data *data;
1250 struct nfs_inode *nfsi = NFS_I(inode); 1434 struct nfs_inode *nfsi = NFS_I(inode);
1251 struct pnfs_layout_segment *lseg;
1252 struct rpc_cred *cred;
1253 loff_t end_pos; 1435 loff_t end_pos;
1254 int status = 0; 1436 int status = 0;
1255 1437
@@ -1266,30 +1448,25 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
1266 goto out; 1448 goto out;
1267 } 1449 }
1268 1450
1451 INIT_LIST_HEAD(&data->lseg_list);
1269 spin_lock(&inode->i_lock); 1452 spin_lock(&inode->i_lock);
1270 if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { 1453 if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
1271 spin_unlock(&inode->i_lock); 1454 spin_unlock(&inode->i_lock);
1272 kfree(data); 1455 kfree(data);
1273 goto out; 1456 goto out;
1274 } 1457 }
1275 /*
1276 * Currently only one (whole file) write lseg which is referenced
1277 * in pnfs_set_layoutcommit and will be found.
1278 */
1279 lseg = pnfs_list_write_lseg(inode);
1280 1458
1281 end_pos = lseg->pls_end_pos; 1459 pnfs_list_write_lseg(inode, &data->lseg_list);
1282 cred = lseg->pls_lc_cred; 1460
1283 lseg->pls_end_pos = 0; 1461 end_pos = nfsi->layout->plh_lwb;
1284 lseg->pls_lc_cred = NULL; 1462 nfsi->layout->plh_lwb = 0;
1285 1463
1286 memcpy(&data->args.stateid.data, nfsi->layout->plh_stateid.data, 1464 memcpy(&data->args.stateid.data, nfsi->layout->plh_stateid.data,
1287 sizeof(nfsi->layout->plh_stateid.data)); 1465 sizeof(nfsi->layout->plh_stateid.data));
1288 spin_unlock(&inode->i_lock); 1466 spin_unlock(&inode->i_lock);
1289 1467
1290 data->args.inode = inode; 1468 data->args.inode = inode;
1291 data->lseg = lseg; 1469 data->cred = get_rpccred(nfsi->layout->plh_lc_cred);
1292 data->cred = cred;
1293 nfs_fattr_init(&data->fattr); 1470 nfs_fattr_init(&data->fattr);
1294 data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask; 1471 data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask;
1295 data->res.fattr = &data->fattr; 1472 data->res.fattr = &data->fattr;
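
With the per-lseg pls_lc_cred and pls_end_pos fields gone, LAYOUTCOMMIT bookkeeping is layout-wide: every successful pNFS write flags its lseg with NFS_LSEG_LAYOUTCOMMIT and advances plh_lwb, and pnfs_layoutcommit_inode() later gathers all flagged RW segments onto lseg_list. For a layout driver the contract reduces to a single call; a hedged sketch:

/* Illustrative only: a layout driver's write-completion hook. */
static void example_write_done(struct nfs_write_data *wdata)
{
	/* Sets NFS_INO_LAYOUTCOMMIT on the inode, flags wdata->lseg with
	 * NFS_LSEG_LAYOUTCOMMIT, and advances plh_lwb past this write. */
	pnfs_set_layoutcommit(wdata);
}
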
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 96bf4e6f45be..01cbfd54f3cb 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -36,16 +36,16 @@
36enum { 36enum {
37 NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */ 37 NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */
38 NFS_LSEG_ROC, /* roc bit received from server */ 38 NFS_LSEG_ROC, /* roc bit received from server */
39 NFS_LSEG_LAYOUTCOMMIT, /* layoutcommit bit set for layoutcommit */
39}; 40};
40 41
41struct pnfs_layout_segment { 42struct pnfs_layout_segment {
42 struct list_head pls_list; 43 struct list_head pls_list;
44 struct list_head pls_lc_list;
43 struct pnfs_layout_range pls_range; 45 struct pnfs_layout_range pls_range;
44 atomic_t pls_refcount; 46 atomic_t pls_refcount;
45 unsigned long pls_flags; 47 unsigned long pls_flags;
46 struct pnfs_layout_hdr *pls_layout; 48 struct pnfs_layout_hdr *pls_layout;
47 struct rpc_cred *pls_lc_cred; /* LAYOUTCOMMIT credential */
48 loff_t pls_end_pos; /* LAYOUTCOMMIT write end */
49}; 49};
50 50
51enum pnfs_try_status { 51enum pnfs_try_status {
@@ -80,6 +80,9 @@ struct pnfs_layoutdriver_type {
80 struct module *owner; 80 struct module *owner;
81 unsigned flags; 81 unsigned flags;
82 82
83 int (*set_layoutdriver) (struct nfs_server *, const struct nfs_fh *);
84 int (*clear_layoutdriver) (struct nfs_server *);
85
83 struct pnfs_layout_hdr * (*alloc_layout_hdr) (struct inode *inode, gfp_t gfp_flags); 86 struct pnfs_layout_hdr * (*alloc_layout_hdr) (struct inode *inode, gfp_t gfp_flags);
84 void (*free_layout_hdr) (struct pnfs_layout_hdr *); 87 void (*free_layout_hdr) (struct pnfs_layout_hdr *);
85 88
@@ -87,7 +90,8 @@ struct pnfs_layoutdriver_type {
87 void (*free_lseg) (struct pnfs_layout_segment *lseg); 90 void (*free_lseg) (struct pnfs_layout_segment *lseg);
88 91
89 /* test for nfs page cache coalescing */ 92 /* test for nfs page cache coalescing */
90 bool (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *); 93 const struct nfs_pageio_ops *pg_read_ops;
94 const struct nfs_pageio_ops *pg_write_ops;
91 95
92 /* Returns true if layoutdriver wants to divert this request to 96 /* Returns true if layoutdriver wants to divert this request to
93 * driver's commit routine. 97 * driver's commit routine.
@@ -109,6 +113,8 @@ struct pnfs_layoutdriver_type {
109 struct xdr_stream *xdr, 113 struct xdr_stream *xdr,
110 const struct nfs4_layoutreturn_args *args); 114 const struct nfs4_layoutreturn_args *args);
111 115
116 void (*cleanup_layoutcommit) (struct nfs4_layoutcommit_data *data);
117
112 void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid, 118 void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid,
113 struct xdr_stream *xdr, 119 struct xdr_stream *xdr,
114 const struct nfs4_layoutcommit_args *args); 120 const struct nfs4_layoutcommit_args *args);
@@ -124,6 +130,8 @@ struct pnfs_layout_hdr {
124 unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */ 130 unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */
125 u32 plh_barrier; /* ignore lower seqids */ 131 u32 plh_barrier; /* ignore lower seqids */
126 unsigned long plh_flags; 132 unsigned long plh_flags;
133 loff_t plh_lwb; /* last write byte for layoutcommit */
134 struct rpc_cred *plh_lc_cred; /* layoutcommit cred */
127 struct inode *plh_inode; 135 struct inode *plh_inode;
128}; 136};
129 137
@@ -136,10 +144,21 @@ struct pnfs_device {
136 unsigned int pglen; 144 unsigned int pglen;
137}; 145};
138 146
147#define NFS4_PNFS_GETDEVLIST_MAXNUM 16
148
149struct pnfs_devicelist {
150 unsigned int eof;
151 unsigned int num_devs;
152 struct nfs4_deviceid dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM];
153};
154
139extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *); 155extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
140extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); 156extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
141 157
142/* nfs4proc.c */ 158/* nfs4proc.c */
159extern int nfs4_proc_getdevicelist(struct nfs_server *server,
160 const struct nfs_fh *fh,
161 struct pnfs_devicelist *devlist);
143extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, 162extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
144 struct pnfs_device *dev); 163 struct pnfs_device *dev);
145extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp); 164extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
@@ -148,16 +167,16 @@ extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp);
148/* pnfs.c */ 167/* pnfs.c */
149void get_layout_hdr(struct pnfs_layout_hdr *lo); 168void get_layout_hdr(struct pnfs_layout_hdr *lo);
150void put_lseg(struct pnfs_layout_segment *lseg); 169void put_lseg(struct pnfs_layout_segment *lseg);
151struct pnfs_layout_segment * 170
152pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, 171bool pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *);
153 loff_t pos, u64 count, enum pnfs_iomode access_type, 172bool pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *, int);
154 gfp_t gfp_flags); 173
155void set_pnfs_layoutdriver(struct nfs_server *, u32 id); 174void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32);
156void unset_pnfs_layoutdriver(struct nfs_server *); 175void unset_pnfs_layoutdriver(struct nfs_server *);
157enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *, 176void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *);
158 const struct rpc_call_ops *, int); 177int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc);
159enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *, 178void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *, struct nfs_page *);
160 const struct rpc_call_ops *); 179int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc);
161bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req); 180bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req);
162int pnfs_layout_process(struct nfs4_layoutget *lgp); 181int pnfs_layout_process(struct nfs4_layoutget *lgp);
163void pnfs_free_lseg_list(struct list_head *tmp_list); 182void pnfs_free_lseg_list(struct list_head *tmp_list);
@@ -178,10 +197,24 @@ void pnfs_roc_release(struct inode *ino);
178void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); 197void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
179bool pnfs_roc_drain(struct inode *ino, u32 *barrier); 198bool pnfs_roc_drain(struct inode *ino, u32 *barrier);
180void pnfs_set_layoutcommit(struct nfs_write_data *wdata); 199void pnfs_set_layoutcommit(struct nfs_write_data *wdata);
200void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);
181int pnfs_layoutcommit_inode(struct inode *inode, bool sync); 201int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
182int _pnfs_return_layout(struct inode *); 202int _pnfs_return_layout(struct inode *);
183int pnfs_ld_write_done(struct nfs_write_data *); 203int pnfs_ld_write_done(struct nfs_write_data *);
184int pnfs_ld_read_done(struct nfs_read_data *); 204int pnfs_ld_read_done(struct nfs_read_data *);
205struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
206 struct nfs_open_context *ctx,
207 loff_t pos,
208 u64 count,
209 enum pnfs_iomode iomode,
210 gfp_t gfp_flags);
211
212void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp);
213
214/* nfs4_deviceid_flags */
215enum {
216 NFS_DEVICEID_INVALID = 0, /* set when MDS clientid recalled */
217};
185 218
186/* pnfs_dev.c */ 219/* pnfs_dev.c */
187struct nfs4_deviceid_node { 220struct nfs4_deviceid_node {
@@ -189,13 +222,13 @@ struct nfs4_deviceid_node {
189 struct hlist_node tmpnode; 222 struct hlist_node tmpnode;
190 const struct pnfs_layoutdriver_type *ld; 223 const struct pnfs_layoutdriver_type *ld;
191 const struct nfs_client *nfs_client; 224 const struct nfs_client *nfs_client;
225 unsigned long flags;
192 struct nfs4_deviceid deviceid; 226 struct nfs4_deviceid deviceid;
193 atomic_t ref; 227 atomic_t ref;
194}; 228};
195 229
196void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id); 230void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id);
197struct nfs4_deviceid_node *nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *); 231struct nfs4_deviceid_node *nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
198struct nfs4_deviceid_node *nfs4_unhash_put_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
199void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *); 232void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
200void nfs4_init_deviceid_node(struct nfs4_deviceid_node *, 233void nfs4_init_deviceid_node(struct nfs4_deviceid_node *,
201 const struct pnfs_layoutdriver_type *, 234 const struct pnfs_layoutdriver_type *,
@@ -293,15 +326,6 @@ static inline int pnfs_return_layout(struct inode *ino)
293 return 0; 326 return 0;
294} 327}
295 328
296static inline void pnfs_pageio_init(struct nfs_pageio_descriptor *pgio,
297 struct inode *inode)
298{
299 struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
300
301 if (ld)
302 pgio->pg_test = ld->pg_test;
303}
304
305#else /* CONFIG_NFS_V4_1 */ 329#else /* CONFIG_NFS_V4_1 */
306 330
307static inline void pnfs_destroy_all_layouts(struct nfs_client *clp) 331static inline void pnfs_destroy_all_layouts(struct nfs_client *clp)
@@ -322,28 +346,6 @@ static inline void put_lseg(struct pnfs_layout_segment *lseg)
322{ 346{
323} 347}
324 348
325static inline struct pnfs_layout_segment *
326pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
327 loff_t pos, u64 count, enum pnfs_iomode access_type,
328 gfp_t gfp_flags)
329{
330 return NULL;
331}
332
333static inline enum pnfs_try_status
334pnfs_try_to_read_data(struct nfs_read_data *data,
335 const struct rpc_call_ops *call_ops)
336{
337 return PNFS_NOT_ATTEMPTED;
338}
339
340static inline enum pnfs_try_status
341pnfs_try_to_write_data(struct nfs_write_data *data,
342 const struct rpc_call_ops *call_ops, int how)
343{
344 return PNFS_NOT_ATTEMPTED;
345}
346
347static inline int pnfs_return_layout(struct inode *ino) 349static inline int pnfs_return_layout(struct inode *ino)
348{ 350{
349 return 0; 351 return 0;
@@ -377,7 +379,8 @@ pnfs_roc_drain(struct inode *ino, u32 *barrier)
377 return false; 379 return false;
378} 380}
379 381
380static inline void set_pnfs_layoutdriver(struct nfs_server *s, u32 id) 382static inline void set_pnfs_layoutdriver(struct nfs_server *s,
383 const struct nfs_fh *mntfh, u32 id)
381{ 384{
382} 385}
383 386
@@ -385,9 +388,14 @@ static inline void unset_pnfs_layoutdriver(struct nfs_server *s)
385{ 388{
386} 389}
387 390
388static inline void pnfs_pageio_init(struct nfs_pageio_descriptor *pgio, 391static inline bool pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode)
389 struct inode *inode) 392{
393 return false;
394}
395
396static inline bool pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, int ioflags)
390{ 397{
398 return false;
391} 399}
392 400
393static inline void 401static inline void
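
For a layout driver, migrating to this interface means publishing complete nfs_pageio_ops tables instead of a single pg_test hook. A hedged template assembled purely from the generic helpers this header exports; the table names are illustrative:

/* Illustrative template for a layout driver. */
static const struct nfs_pageio_ops example_pg_read_ops = {
	.pg_init = pnfs_generic_pg_init_read,	/* attach a read lseg */
	.pg_test = pnfs_generic_pg_test,	/* coalesce within the lseg */
	.pg_doio = pnfs_generic_pg_readpages,	/* issue through the layout */
};

static const struct nfs_pageio_ops example_pg_write_ops = {
	.pg_init = pnfs_generic_pg_init_write,
	.pg_test = pnfs_generic_pg_test,
	.pg_doio = pnfs_generic_pg_writepages,
};
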
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
index f0f8e1e22f6c..6fda5228ef56 100644
--- a/fs/nfs/pnfs_dev.c
+++ b/fs/nfs/pnfs_dev.c
@@ -100,8 +100,8 @@ _find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
100 100
101 rcu_read_lock(); 101 rcu_read_lock();
102 d = _lookup_deviceid(ld, clp, id, hash); 102 d = _lookup_deviceid(ld, clp, id, hash);
103 if (d && !atomic_inc_not_zero(&d->ref)) 103 if (d != NULL)
104 d = NULL; 104 atomic_inc(&d->ref);
105 rcu_read_unlock(); 105 rcu_read_unlock();
106 return d; 106 return d;
107} 107}
@@ -115,15 +115,15 @@ nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
115EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid); 115EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid);
116 116
117/* 117/*
118 * Unhash and put deviceid 118 * Remove a deviceid from cache
119 * 119 *
120 * @clp nfs_client associated with deviceid 120 * @clp nfs_client associated with deviceid
121 * @id the deviceid to unhash 121 * @id the deviceid to unhash
122 * 122 *
123 * @ret the unhashed node, if found and dereferenced to zero, NULL otherwise. 123 * The node is unhashed; it is freed once its refcount drops to zero.
124 */ 124 */
125struct nfs4_deviceid_node * 125void
126nfs4_unhash_put_deviceid(const struct pnfs_layoutdriver_type *ld, 126nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld,
127 const struct nfs_client *clp, const struct nfs4_deviceid *id) 127 const struct nfs_client *clp, const struct nfs4_deviceid *id)
128{ 128{
129 struct nfs4_deviceid_node *d; 129 struct nfs4_deviceid_node *d;
@@ -134,7 +134,7 @@ nfs4_unhash_put_deviceid(const struct pnfs_layoutdriver_type *ld,
134 rcu_read_unlock(); 134 rcu_read_unlock();
135 if (!d) { 135 if (!d) {
136 spin_unlock(&nfs4_deviceid_lock); 136 spin_unlock(&nfs4_deviceid_lock);
137 return NULL; 137 return;
138 } 138 }
139 hlist_del_init_rcu(&d->node); 139 hlist_del_init_rcu(&d->node);
140 spin_unlock(&nfs4_deviceid_lock); 140 spin_unlock(&nfs4_deviceid_lock);
@@ -142,28 +142,7 @@ nfs4_unhash_put_deviceid(const struct pnfs_layoutdriver_type *ld,
142 142
143 /* balance the initial ref set in pnfs_insert_deviceid */ 143 /* balance the initial ref set in pnfs_insert_deviceid */
144 if (atomic_dec_and_test(&d->ref)) 144 if (atomic_dec_and_test(&d->ref))
145 return d; 145 d->ld->free_deviceid_node(d);
146
147 return NULL;
148}
149EXPORT_SYMBOL_GPL(nfs4_unhash_put_deviceid);
150
151/*
152 * Delete a deviceid from cache
153 *
154 * @clp struct nfs_client qualifying the deviceid
155 * @id deviceid to delete
156 */
157void
158nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld,
159 const struct nfs_client *clp, const struct nfs4_deviceid *id)
160{
161 struct nfs4_deviceid_node *d;
162
163 d = nfs4_unhash_put_deviceid(ld, clp, id);
164 if (!d)
165 return;
166 d->ld->free_deviceid_node(d);
167} 146}
168EXPORT_SYMBOL_GPL(nfs4_delete_deviceid); 147EXPORT_SYMBOL_GPL(nfs4_delete_deviceid);
169 148
@@ -177,6 +156,7 @@ nfs4_init_deviceid_node(struct nfs4_deviceid_node *d,
177 INIT_HLIST_NODE(&d->tmpnode); 156 INIT_HLIST_NODE(&d->tmpnode);
178 d->ld = ld; 157 d->ld = ld;
179 d->nfs_client = nfs_client; 158 d->nfs_client = nfs_client;
159 d->flags = 0;
180 d->deviceid = *id; 160 d->deviceid = *id;
181 atomic_set(&d->ref, 1); 161 atomic_set(&d->ref, 1);
182} 162}
@@ -221,16 +201,15 @@ EXPORT_SYMBOL_GPL(nfs4_insert_deviceid_node);
221 * 201 *
222 * @d deviceid node to put 202 * @d deviceid node to put
223 * 203 *
224 * @ret true iff the node was deleted 204 * return true iff the node was deleted
205 * Note that the test for d->ref == 0 is sufficient to establish
206 * that the node is no longer hashed in the global device id cache.
225 */ 207 */
226bool 208bool
227nfs4_put_deviceid_node(struct nfs4_deviceid_node *d) 209nfs4_put_deviceid_node(struct nfs4_deviceid_node *d)
228{ 210{
229 if (!atomic_dec_and_lock(&d->ref, &nfs4_deviceid_lock)) 211 if (!atomic_dec_and_test(&d->ref))
230 return false; 212 return false;
231 hlist_del_init_rcu(&d->node);
232 spin_unlock(&nfs4_deviceid_lock);
233 synchronize_rcu();
234 d->ld->free_deviceid_node(d); 213 d->ld->free_deviceid_node(d);
235 return true; 214 return true;
236} 215}
@@ -275,3 +254,22 @@ nfs4_deviceid_purge_client(const struct nfs_client *clp)
275 for (h = 0; h < NFS4_DEVICE_ID_HASH_SIZE; h++) 254 for (h = 0; h < NFS4_DEVICE_ID_HASH_SIZE; h++)
276 _deviceid_purge_client(clp, h); 255 _deviceid_purge_client(clp, h);
277} 256}
257
258/*
259 * Stop use of all deviceids associated with an nfs_client
260 */
261void
262nfs4_deviceid_mark_client_invalid(struct nfs_client *clp)
263{
264 struct nfs4_deviceid_node *d;
265 struct hlist_node *n;
266 int i;
267
268 rcu_read_lock();
269 for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i++) {
270 hlist_for_each_entry_rcu(d, n, &nfs4_deviceid_cache[i], node)
271 if (d->nfs_client == clp)
272 set_bit(NFS_DEVICEID_INVALID, &d->flags);
273 }
274 rcu_read_unlock();
275}
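
nfs4_deviceid_mark_client_invalid() only sets a flag; nothing is unhashed or freed on that path, so deviceid consumers are expected to test the bit before trusting a cached entry. A hedged sketch of that check; the function name is illustrative:

/* Illustrative only: skip deviceids invalidated by a clientid recall. */
static bool example_deviceid_usable(const struct nfs4_deviceid_node *d)
{
	return d && !test_bit(NFS_DEVICEID_INVALID, &d->flags);
}
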
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index a68679f538fc..2171c043ab08 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -30,8 +30,7 @@
30 30
31#define NFSDBG_FACILITY NFSDBG_PAGECACHE 31#define NFSDBG_FACILITY NFSDBG_PAGECACHE
32 32
33static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc); 33static const struct nfs_pageio_ops nfs_pageio_read_ops;
34static int nfs_pagein_one(struct nfs_pageio_descriptor *desc);
35static const struct rpc_call_ops nfs_read_partial_ops; 34static const struct rpc_call_ops nfs_read_partial_ops;
36static const struct rpc_call_ops nfs_read_full_ops; 35static const struct rpc_call_ops nfs_read_full_ops;
37 36
@@ -68,7 +67,7 @@ void nfs_readdata_free(struct nfs_read_data *p)
68 mempool_free(p, nfs_rdata_mempool); 67 mempool_free(p, nfs_rdata_mempool);
69} 68}
70 69
71static void nfs_readdata_release(struct nfs_read_data *rdata) 70void nfs_readdata_release(struct nfs_read_data *rdata)
72{ 71{
73 put_lseg(rdata->lseg); 72 put_lseg(rdata->lseg);
74 put_nfs_open_context(rdata->args.context); 73 put_nfs_open_context(rdata->args.context);
@@ -113,6 +112,27 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data)
113 } 112 }
114} 113}
115 114
115static void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio,
116 struct inode *inode)
117{
118 nfs_pageio_init(pgio, inode, &nfs_pageio_read_ops,
119 NFS_SERVER(inode)->rsize, 0);
120}
121
122void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio)
123{
124 pgio->pg_ops = &nfs_pageio_read_ops;
125 pgio->pg_bsize = NFS_SERVER(pgio->pg_inode)->rsize;
126}
127EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds);
128
129static void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
130 struct inode *inode)
131{
132 if (!pnfs_pageio_init_read(pgio, inode))
133 nfs_pageio_init_read_mds(pgio, inode);
134}
135
116int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, 136int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
117 struct page *page) 137 struct page *page)
118{ 138{
@@ -131,14 +151,9 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
131 if (len < PAGE_CACHE_SIZE) 151 if (len < PAGE_CACHE_SIZE)
132 zero_user_segment(page, len, PAGE_CACHE_SIZE); 152 zero_user_segment(page, len, PAGE_CACHE_SIZE);
133 153
134 nfs_pageio_init(&pgio, inode, NULL, 0, 0); 154 nfs_pageio_init_read(&pgio, inode);
135 nfs_list_add_request(new, &pgio.pg_list); 155 nfs_pageio_add_request(&pgio, new);
136 pgio.pg_count = len; 156 nfs_pageio_complete(&pgio);
137
138 if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE)
139 nfs_pagein_multi(&pgio);
140 else
141 nfs_pagein_one(&pgio);
142 return 0; 157 return 0;
143} 158}
144 159
@@ -202,17 +217,14 @@ EXPORT_SYMBOL_GPL(nfs_initiate_read);
202/* 217/*
203 * Set up the NFS read request struct 218 * Set up the NFS read request struct
204 */ 219 */
205static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, 220static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
206 const struct rpc_call_ops *call_ops, 221 unsigned int count, unsigned int offset)
207 unsigned int count, unsigned int offset,
208 struct pnfs_layout_segment *lseg)
209{ 222{
210 struct inode *inode = req->wb_context->dentry->d_inode; 223 struct inode *inode = req->wb_context->dentry->d_inode;
211 224
212 data->req = req; 225 data->req = req;
213 data->inode = inode; 226 data->inode = inode;
214 data->cred = req->wb_context->cred; 227 data->cred = req->wb_context->cred;
215 data->lseg = get_lseg(lseg);
216 228
217 data->args.fh = NFS_FH(inode); 229 data->args.fh = NFS_FH(inode);
218 data->args.offset = req_offset(req) + offset; 230 data->args.offset = req_offset(req) + offset;
@@ -226,14 +238,36 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
226 data->res.count = count; 238 data->res.count = count;
227 data->res.eof = 0; 239 data->res.eof = 0;
228 nfs_fattr_init(&data->fattr); 240 nfs_fattr_init(&data->fattr);
241}
229 242
230 if (data->lseg && 243static int nfs_do_read(struct nfs_read_data *data,
231 (pnfs_try_to_read_data(data, call_ops) == PNFS_ATTEMPTED)) 244 const struct rpc_call_ops *call_ops)
232 return 0; 245{
246 struct inode *inode = data->args.context->dentry->d_inode;
233 247
234 return nfs_initiate_read(data, NFS_CLIENT(inode), call_ops); 248 return nfs_initiate_read(data, NFS_CLIENT(inode), call_ops);
235} 249}
236 250
251static int
252nfs_do_multiple_reads(struct list_head *head,
253 const struct rpc_call_ops *call_ops)
254{
255 struct nfs_read_data *data;
256 int ret = 0;
257
258 while (!list_empty(head)) {
259 int ret2;
260
261 data = list_entry(head->next, struct nfs_read_data, list);
262 list_del_init(&data->list);
263
264 ret2 = nfs_do_read(data, call_ops);
265 if (ret == 0)
266 ret = ret2;
267 }
268 return ret;
269}
270
237static void 271static void
238nfs_async_read_error(struct list_head *head) 272nfs_async_read_error(struct list_head *head)
239{ 273{
@@ -260,20 +294,19 @@ nfs_async_read_error(struct list_head *head)
260 * won't see the new data until our attribute cache is updated. This is more 294 * won't see the new data until our attribute cache is updated. This is more
261 * or less conventional NFS client behavior. 295 * or less conventional NFS client behavior.
262 */ 296 */
263static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc) 297static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc, struct list_head *res)
264{ 298{
265 struct nfs_page *req = nfs_list_entry(desc->pg_list.next); 299 struct nfs_page *req = nfs_list_entry(desc->pg_list.next);
266 struct page *page = req->wb_page; 300 struct page *page = req->wb_page;
267 struct nfs_read_data *data; 301 struct nfs_read_data *data;
268 size_t rsize = NFS_SERVER(desc->pg_inode)->rsize, nbytes; 302 size_t rsize = desc->pg_bsize, nbytes;
269 unsigned int offset; 303 unsigned int offset;
270 int requests = 0; 304 int requests = 0;
271 int ret = 0; 305 int ret = 0;
272 struct pnfs_layout_segment *lseg;
273 LIST_HEAD(list);
274 306
275 nfs_list_remove_request(req); 307 nfs_list_remove_request(req);
276 308
309 offset = 0;
277 nbytes = desc->pg_count; 310 nbytes = desc->pg_count;
278 do { 311 do {
279 size_t len = min(nbytes,rsize); 312 size_t len = min(nbytes,rsize);
@@ -281,45 +314,21 @@ static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc)
281 data = nfs_readdata_alloc(1); 314 data = nfs_readdata_alloc(1);
282 if (!data) 315 if (!data)
283 goto out_bad; 316 goto out_bad;
284 list_add(&data->pages, &list); 317 data->pagevec[0] = page;
318 nfs_read_rpcsetup(req, data, len, offset);
319 list_add(&data->list, res);
285 requests++; 320 requests++;
286 nbytes -= len; 321 nbytes -= len;
322 offset += len;
287 } while(nbytes != 0); 323 } while(nbytes != 0);
288 atomic_set(&req->wb_complete, requests); 324 atomic_set(&req->wb_complete, requests);
289
290 BUG_ON(desc->pg_lseg != NULL);
291 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
292 req_offset(req), desc->pg_count,
293 IOMODE_READ, GFP_KERNEL);
294 ClearPageError(page); 325 ClearPageError(page);
295 offset = 0; 326 desc->pg_rpc_callops = &nfs_read_partial_ops;
296 nbytes = desc->pg_count;
297 do {
298 int ret2;
299
300 data = list_entry(list.next, struct nfs_read_data, pages);
301 list_del_init(&data->pages);
302
303 data->pagevec[0] = page;
304
305 if (nbytes < rsize)
306 rsize = nbytes;
307 ret2 = nfs_read_rpcsetup(req, data, &nfs_read_partial_ops,
308 rsize, offset, lseg);
309 if (ret == 0)
310 ret = ret2;
311 offset += rsize;
312 nbytes -= rsize;
313 } while (nbytes != 0);
314 put_lseg(lseg);
315 desc->pg_lseg = NULL;
316
317 return ret; 327 return ret;
318
319out_bad: 328out_bad:
320 while (!list_empty(&list)) { 329 while (!list_empty(res)) {
321 data = list_entry(list.next, struct nfs_read_data, pages); 330 data = list_entry(res->next, struct nfs_read_data, list);
322 list_del(&data->pages); 331 list_del(&data->list);
323 nfs_readdata_free(data); 332 nfs_readdata_free(data);
324 } 333 }
325 SetPageError(page); 334 SetPageError(page);
@@ -327,19 +336,19 @@ out_bad:
327 return -ENOMEM; 336 return -ENOMEM;
328} 337}
329 338
330static int nfs_pagein_one(struct nfs_pageio_descriptor *desc) 339static int nfs_pagein_one(struct nfs_pageio_descriptor *desc, struct list_head *res)
331{ 340{
332 struct nfs_page *req; 341 struct nfs_page *req;
333 struct page **pages; 342 struct page **pages;
334 struct nfs_read_data *data; 343 struct nfs_read_data *data;
335 struct list_head *head = &desc->pg_list; 344 struct list_head *head = &desc->pg_list;
336 struct pnfs_layout_segment *lseg = desc->pg_lseg; 345 int ret = 0;
337 int ret = -ENOMEM;
338 346
339 data = nfs_readdata_alloc(nfs_page_array_len(desc->pg_base, 347 data = nfs_readdata_alloc(nfs_page_array_len(desc->pg_base,
340 desc->pg_count)); 348 desc->pg_count));
341 if (!data) { 349 if (!data) {
342 nfs_async_read_error(head); 350 nfs_async_read_error(head);
351 ret = -ENOMEM;
343 goto out; 352 goto out;
344 } 353 }
345 354
@@ -352,19 +361,37 @@ static int nfs_pagein_one(struct nfs_pageio_descriptor *desc)
352 *pages++ = req->wb_page; 361 *pages++ = req->wb_page;
353 } 362 }
354 req = nfs_list_entry(data->pages.next); 363 req = nfs_list_entry(data->pages.next);
355 if ((!lseg) && list_is_singular(&data->pages))
356 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
357 req_offset(req), desc->pg_count,
358 IOMODE_READ, GFP_KERNEL);
359 364
360 ret = nfs_read_rpcsetup(req, data, &nfs_read_full_ops, desc->pg_count, 365 nfs_read_rpcsetup(req, data, desc->pg_count, 0);
361 0, lseg); 366 list_add(&data->list, res);
367 desc->pg_rpc_callops = &nfs_read_full_ops;
362out: 368out:
363 put_lseg(lseg);
364 desc->pg_lseg = NULL;
365 return ret; 369 return ret;
366} 370}
367 371
372int nfs_generic_pagein(struct nfs_pageio_descriptor *desc, struct list_head *head)
373{
374 if (desc->pg_bsize < PAGE_CACHE_SIZE)
375 return nfs_pagein_multi(desc, head);
376 return nfs_pagein_one(desc, head);
377}
378
379static int nfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
380{
381 LIST_HEAD(head);
382 int ret;
383
384 ret = nfs_generic_pagein(desc, &head);
385 if (ret == 0)
386 ret = nfs_do_multiple_reads(&head, desc->pg_rpc_callops);
387 return ret;
388}
389
390static const struct nfs_pageio_ops nfs_pageio_read_ops = {
391 .pg_test = nfs_generic_pg_test,
392 .pg_doio = nfs_generic_pg_readpages,
393};
394
368/* 395/*
369 * This is the callback from RPC telling us whether a reply was 396 * This is the callback from RPC telling us whether a reply was
370 * received or some error occurred (timeout or socket shutdown). 397 * received or some error occurred (timeout or socket shutdown).
@@ -635,8 +662,6 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
635 .pgio = &pgio, 662 .pgio = &pgio,
636 }; 663 };
637 struct inode *inode = mapping->host; 664 struct inode *inode = mapping->host;
638 struct nfs_server *server = NFS_SERVER(inode);
639 size_t rsize = server->rsize;
640 unsigned long npages; 665 unsigned long npages;
641 int ret = -ESTALE; 666 int ret = -ESTALE;
642 667
@@ -664,10 +689,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
664 if (ret == 0) 689 if (ret == 0)
665 goto read_complete; /* all pages were read */ 690 goto read_complete; /* all pages were read */
666 691
667 if (rsize < PAGE_CACHE_SIZE) 692 nfs_pageio_init_read(&pgio, inode);
668 nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
669 else
670 nfs_pageio_init(&pgio, inode, nfs_pagein_one, rsize, 0);
671 693
672 ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc); 694 ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc);
673 695
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 8d6864c2a5fa..b2fbbde58e44 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -147,7 +147,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
147 147
148 alias = d_lookup(parent, &data->args.name); 148 alias = d_lookup(parent, &data->args.name);
149 if (alias != NULL) { 149 if (alias != NULL) {
150 int ret = 0; 150 int ret;
151 void *devname_garbage = NULL; 151 void *devname_garbage = NULL;
152 152
153 /* 153 /*
@@ -155,14 +155,16 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
155 * the sillyrename information to the aliased dentry. 155 * the sillyrename information to the aliased dentry.
156 */ 156 */
157 nfs_free_dname(data); 157 nfs_free_dname(data);
158 ret = nfs_copy_dname(alias, data);
158 spin_lock(&alias->d_lock); 159 spin_lock(&alias->d_lock);
159 if (alias->d_inode != NULL && 160 if (ret == 0 && alias->d_inode != NULL &&
160 !(alias->d_flags & DCACHE_NFSFS_RENAMED)) { 161 !(alias->d_flags & DCACHE_NFSFS_RENAMED)) {
161 devname_garbage = alias->d_fsdata; 162 devname_garbage = alias->d_fsdata;
162 alias->d_fsdata = data; 163 alias->d_fsdata = data;
163 alias->d_flags |= DCACHE_NFSFS_RENAMED; 164 alias->d_flags |= DCACHE_NFSFS_RENAMED;
164 ret = 1; 165 ret = 1;
165 } 166 } else
167 ret = 0;
166 spin_unlock(&alias->d_lock); 168 spin_unlock(&alias->d_lock);
167 nfs_dec_sillycount(dir); 169 nfs_dec_sillycount(dir);
168 dput(alias); 170 dput(alias);
@@ -171,8 +173,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
171 * point dentry is definitely not a root, so we won't need 173 * point dentry is definitely not a root, so we won't need
172 * that anymore. 174 * that anymore.
173 */ 175 */
174 if (devname_garbage) 176 kfree(devname_garbage);
175 kfree(devname_garbage);
176 return ret; 177 return ret;
177 } 178 }
178 data->dir = igrab(dir); 179 data->dir = igrab(dir);
@@ -204,8 +205,6 @@ static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data)
204 if (parent == NULL) 205 if (parent == NULL)
205 goto out_free; 206 goto out_free;
206 dir = parent->d_inode; 207 dir = parent->d_inode;
207 if (nfs_copy_dname(dentry, data) != 0)
208 goto out_dput;
209 /* Non-exclusive lock protects against concurrent lookup() calls */ 208 /* Non-exclusive lock protects against concurrent lookup() calls */
210 spin_lock(&dir->i_lock); 209 spin_lock(&dir->i_lock);
211 if (atomic_inc_not_zero(&NFS_I(dir)->silly_count) == 0) { 210 if (atomic_inc_not_zero(&NFS_I(dir)->silly_count) == 0) {
@@ -366,6 +365,8 @@ static void nfs_async_rename_done(struct rpc_task *task, void *calldata)
366 struct nfs_renamedata *data = calldata; 365 struct nfs_renamedata *data = calldata;
367 struct inode *old_dir = data->old_dir; 366 struct inode *old_dir = data->old_dir;
368 struct inode *new_dir = data->new_dir; 367 struct inode *new_dir = data->new_dir;
368 struct dentry *old_dentry = data->old_dentry;
369 struct dentry *new_dentry = data->new_dentry;
369 370
370 if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) { 371 if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) {
371 nfs_restart_rpc(task, NFS_SERVER(old_dir)->nfs_client); 372 nfs_restart_rpc(task, NFS_SERVER(old_dir)->nfs_client);
@@ -373,12 +374,12 @@ static void nfs_async_rename_done(struct rpc_task *task, void *calldata)
373 } 374 }
374 375
375 if (task->tk_status != 0) { 376 if (task->tk_status != 0) {
376 nfs_cancel_async_unlink(data->old_dentry); 377 nfs_cancel_async_unlink(old_dentry);
377 return; 378 return;
378 } 379 }
379 380
380 nfs_set_verifier(data->old_dentry, nfs_save_change_attribute(old_dir)); 381 d_drop(old_dentry);
381 d_move(data->old_dentry, data->new_dentry); 382 d_drop(new_dentry);
382} 383}
383 384
384/** 385/**
@@ -501,6 +502,14 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
501 * and only performs the unlink once the last reference to it is put. 502 * and only performs the unlink once the last reference to it is put.
502 * 503 *
503 * The final cleanup is done during dentry_iput. 504 * The final cleanup is done during dentry_iput.
505 *
506 * (Note: NFSv4 is stateful, and has opens, so in theory an NFSv4 server
507 * could take responsibility for keeping open files referenced. The server
508 * would also need to ensure that opened-but-deleted files were kept over
509 * reboots. However, we may not assume a server does so. (RFC 5661
510 * does provide an OPEN4_RESULT_PRESERVE_UNLINKED flag that a server can
511 * use to advertise that it does this; some day we may take advantage of
512 * it.))
504 */ 513 */
505int 514int
506nfs_sillyrename(struct inode *dir, struct dentry *dentry) 515nfs_sillyrename(struct inode *dir, struct dentry *dentry)
@@ -560,6 +569,14 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)
560 if (error) 569 if (error)
561 goto out_dput; 570 goto out_dput;
562 571
572 /* populate unlinkdata with the right dname */
573 error = nfs_copy_dname(sdentry,
574 (struct nfs_unlinkdata *)dentry->d_fsdata);
575 if (error) {
576 nfs_cancel_async_unlink(dentry);
577 goto out_dput;
578 }
579
563 /* run the rename task, undo unlink if it fails */ 580 /* run the rename task, undo unlink if it fails */
564 task = nfs_async_rename(dir, dir, dentry, sdentry); 581 task = nfs_async_rename(dir, dir, dentry, sdentry);
565 if (IS_ERR(task)) { 582 if (IS_ERR(task)) {
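
Net effect of the unlink.c changes on ordering: the silly name is copied into the
unlinkdata before the rename task is started (and the queued unlink is cancelled if the
copy fails) instead of inside nfs_call_unlink(), and a failed async rename now just
unhashes both dentries rather than d_move()ing them. Condensed, the nfs_sillyrename()
flow becomes roughly the following (a sketch; do_rename_and_wait() stands in for the
task setup and wait code):

	static int sillyrename_flow_sketch(struct inode *dir,
					   struct dentry *dentry,
					   struct dentry *sdentry)
	{
		int error;

		error = nfs_async_unlink(dir, dentry);	/* allocate unlinkdata */
		if (error)
			return error;

		/* attach the right dname up front; undo the queued
		 * unlink if that fails */
		error = nfs_copy_dname(sdentry, dentry->d_fsdata);
		if (error) {
			nfs_cancel_async_unlink(dentry);
			return error;
		}

		return do_rename_and_wait(dir, dentry, sdentry); /* hypothetical */
	}
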
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 08579312c57b..b39b37f80913 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -97,7 +97,7 @@ void nfs_writedata_free(struct nfs_write_data *p)
97 mempool_free(p, nfs_wdata_mempool); 97 mempool_free(p, nfs_wdata_mempool);
98} 98}
99 99
100static void nfs_writedata_release(struct nfs_write_data *wdata) 100void nfs_writedata_release(struct nfs_write_data *wdata)
101{ 101{
102 put_lseg(wdata->lseg); 102 put_lseg(wdata->lseg);
103 put_nfs_open_context(wdata->args.context); 103 put_nfs_open_context(wdata->args.context);
@@ -845,11 +845,9 @@ EXPORT_SYMBOL_GPL(nfs_initiate_write);
845/* 845/*
846 * Set up the argument/result storage required for the RPC call. 846 * Set up the argument/result storage required for the RPC call.
847 */ 847 */
848static int nfs_write_rpcsetup(struct nfs_page *req, 848static void nfs_write_rpcsetup(struct nfs_page *req,
849 struct nfs_write_data *data, 849 struct nfs_write_data *data,
850 const struct rpc_call_ops *call_ops,
851 unsigned int count, unsigned int offset, 850 unsigned int count, unsigned int offset,
852 struct pnfs_layout_segment *lseg,
853 int how) 851 int how)
854{ 852{
855 struct inode *inode = req->wb_context->dentry->d_inode; 853 struct inode *inode = req->wb_context->dentry->d_inode;
@@ -860,7 +858,6 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
860 data->req = req; 858 data->req = req;
861 data->inode = inode = req->wb_context->dentry->d_inode; 859 data->inode = inode = req->wb_context->dentry->d_inode;
862 data->cred = req->wb_context->cred; 860 data->cred = req->wb_context->cred;
863 data->lseg = get_lseg(lseg);
864 861
865 data->args.fh = NFS_FH(inode); 862 data->args.fh = NFS_FH(inode);
866 data->args.offset = req_offset(req) + offset; 863 data->args.offset = req_offset(req) + offset;
@@ -872,24 +869,51 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
872 data->args.context = get_nfs_open_context(req->wb_context); 869 data->args.context = get_nfs_open_context(req->wb_context);
873 data->args.lock_context = req->wb_lock_context; 870 data->args.lock_context = req->wb_lock_context;
874 data->args.stable = NFS_UNSTABLE; 871 data->args.stable = NFS_UNSTABLE;
875 if (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) { 872 switch (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) {
876 data->args.stable = NFS_DATA_SYNC; 873 case 0:
877 if (!nfs_need_commit(NFS_I(inode))) 874 break;
878 data->args.stable = NFS_FILE_SYNC; 875 case FLUSH_COND_STABLE:
876 if (nfs_need_commit(NFS_I(inode)))
877 break;
878 default:
879 data->args.stable = NFS_FILE_SYNC;
879 } 880 }
880 881
881 data->res.fattr = &data->fattr; 882 data->res.fattr = &data->fattr;
882 data->res.count = count; 883 data->res.count = count;
883 data->res.verf = &data->verf; 884 data->res.verf = &data->verf;
884 nfs_fattr_init(&data->fattr); 885 nfs_fattr_init(&data->fattr);
886}
885 887
886 if (data->lseg && 888static int nfs_do_write(struct nfs_write_data *data,
887 (pnfs_try_to_write_data(data, call_ops, how) == PNFS_ATTEMPTED)) 889 const struct rpc_call_ops *call_ops,
888 return 0; 890 int how)
891{
892 struct inode *inode = data->args.context->dentry->d_inode;
889 893
890 return nfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how); 894 return nfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how);
891} 895}
892 896
897static int nfs_do_multiple_writes(struct list_head *head,
898 const struct rpc_call_ops *call_ops,
899 int how)
900{
901 struct nfs_write_data *data;
902 int ret = 0;
903
904 while (!list_empty(head)) {
905 int ret2;
906
907 data = list_entry(head->next, struct nfs_write_data, list);
908 list_del_init(&data->list);
909
910 ret2 = nfs_do_write(data, call_ops, how);
911 if (ret == 0)
912 ret = ret2;
913 }
914 return ret;
915}
916
893/* If a nfs_flush_* function fails, it should remove reqs from @head and 917/* If a nfs_flush_* function fails, it should remove reqs from @head and
894 * call this on each, which will prepare them to be retried on next 918 * call this on each, which will prepare them to be retried on next
895 * writeback using standard nfs. 919 * writeback using standard nfs.
@@ -907,17 +931,15 @@ static void nfs_redirty_request(struct nfs_page *req)
907 * Generate multiple small requests to write out a single 931 * Generate multiple small requests to write out a single
908 * contiguous dirty area on one page. 932 * contiguous dirty area on one page.
909 */ 933 */
910static int nfs_flush_multi(struct nfs_pageio_descriptor *desc) 934static int nfs_flush_multi(struct nfs_pageio_descriptor *desc, struct list_head *res)
911{ 935{
912 struct nfs_page *req = nfs_list_entry(desc->pg_list.next); 936 struct nfs_page *req = nfs_list_entry(desc->pg_list.next);
913 struct page *page = req->wb_page; 937 struct page *page = req->wb_page;
914 struct nfs_write_data *data; 938 struct nfs_write_data *data;
915 size_t wsize = NFS_SERVER(desc->pg_inode)->wsize, nbytes; 939 size_t wsize = desc->pg_bsize, nbytes;
916 unsigned int offset; 940 unsigned int offset;
917 int requests = 0; 941 int requests = 0;
918 int ret = 0; 942 int ret = 0;
919 struct pnfs_layout_segment *lseg;
920 LIST_HEAD(list);
921 943
922 nfs_list_remove_request(req); 944 nfs_list_remove_request(req);
923 945
@@ -927,6 +949,7 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc)
927 desc->pg_ioflags &= ~FLUSH_COND_STABLE; 949 desc->pg_ioflags &= ~FLUSH_COND_STABLE;
928 950
929 951
952 offset = 0;
930 nbytes = desc->pg_count; 953 nbytes = desc->pg_count;
931 do { 954 do {
932 size_t len = min(nbytes, wsize); 955 size_t len = min(nbytes, wsize);
@@ -934,45 +957,21 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc)
934 data = nfs_writedata_alloc(1); 957 data = nfs_writedata_alloc(1);
935 if (!data) 958 if (!data)
936 goto out_bad; 959 goto out_bad;
937 list_add(&data->pages, &list); 960 data->pagevec[0] = page;
 961 nfs_write_rpcsetup(req, data, len, offset, desc->pg_ioflags);
962 list_add(&data->list, res);
938 requests++; 963 requests++;
939 nbytes -= len; 964 nbytes -= len;
965 offset += len;
940 } while (nbytes != 0); 966 } while (nbytes != 0);
941 atomic_set(&req->wb_complete, requests); 967 atomic_set(&req->wb_complete, requests);
942 968 desc->pg_rpc_callops = &nfs_write_partial_ops;
943 BUG_ON(desc->pg_lseg);
944 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
945 req_offset(req), desc->pg_count,
946 IOMODE_RW, GFP_NOFS);
947 ClearPageError(page);
948 offset = 0;
949 nbytes = desc->pg_count;
950 do {
951 int ret2;
952
953 data = list_entry(list.next, struct nfs_write_data, pages);
954 list_del_init(&data->pages);
955
956 data->pagevec[0] = page;
957
958 if (nbytes < wsize)
959 wsize = nbytes;
960 ret2 = nfs_write_rpcsetup(req, data, &nfs_write_partial_ops,
961 wsize, offset, lseg, desc->pg_ioflags);
962 if (ret == 0)
963 ret = ret2;
964 offset += wsize;
965 nbytes -= wsize;
966 } while (nbytes != 0);
967
968 put_lseg(lseg);
969 desc->pg_lseg = NULL;
970 return ret; 969 return ret;
971 970
972out_bad: 971out_bad:
973 while (!list_empty(&list)) { 972 while (!list_empty(res)) {
974 data = list_entry(list.next, struct nfs_write_data, pages); 973 data = list_entry(res->next, struct nfs_write_data, list);
975 list_del(&data->pages); 974 list_del(&data->list);
976 nfs_writedata_free(data); 975 nfs_writedata_free(data);
977 } 976 }
978 nfs_redirty_request(req); 977 nfs_redirty_request(req);
@@ -987,14 +986,13 @@ out_bad:
987 * This is the case if nfs_updatepage detects a conflicting request 986 * This is the case if nfs_updatepage detects a conflicting request
988 * that has been written but not committed. 987 * that has been written but not committed.
989 */ 988 */
990static int nfs_flush_one(struct nfs_pageio_descriptor *desc) 989static int nfs_flush_one(struct nfs_pageio_descriptor *desc, struct list_head *res)
991{ 990{
992 struct nfs_page *req; 991 struct nfs_page *req;
993 struct page **pages; 992 struct page **pages;
994 struct nfs_write_data *data; 993 struct nfs_write_data *data;
995 struct list_head *head = &desc->pg_list; 994 struct list_head *head = &desc->pg_list;
996 struct pnfs_layout_segment *lseg = desc->pg_lseg; 995 int ret = 0;
997 int ret;
998 996
999 data = nfs_writedata_alloc(nfs_page_array_len(desc->pg_base, 997 data = nfs_writedata_alloc(nfs_page_array_len(desc->pg_base,
1000 desc->pg_count)); 998 desc->pg_count));
@@ -1016,32 +1014,62 @@ static int nfs_flush_one(struct nfs_pageio_descriptor *desc)
1016 *pages++ = req->wb_page; 1014 *pages++ = req->wb_page;
1017 } 1015 }
1018 req = nfs_list_entry(data->pages.next); 1016 req = nfs_list_entry(data->pages.next);
1019 if ((!lseg) && list_is_singular(&data->pages))
1020 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
1021 req_offset(req), desc->pg_count,
1022 IOMODE_RW, GFP_NOFS);
1023 1017
1024 if ((desc->pg_ioflags & FLUSH_COND_STABLE) && 1018 if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
1025 (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit)) 1019 (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit))
1026 desc->pg_ioflags &= ~FLUSH_COND_STABLE; 1020 desc->pg_ioflags &= ~FLUSH_COND_STABLE;
1027 1021
1028 /* Set up the argument struct */ 1022 /* Set up the argument struct */
1029 ret = nfs_write_rpcsetup(req, data, &nfs_write_full_ops, desc->pg_count, 0, lseg, desc->pg_ioflags); 1023 nfs_write_rpcsetup(req, data, desc->pg_count, 0, desc->pg_ioflags);
1024 list_add(&data->list, res);
1025 desc->pg_rpc_callops = &nfs_write_full_ops;
1030out: 1026out:
1031 put_lseg(lseg); /* Cleans any gotten in ->pg_test */
1032 desc->pg_lseg = NULL;
1033 return ret; 1027 return ret;
1034} 1028}
1035 1029
1036static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, 1030int nfs_generic_flush(struct nfs_pageio_descriptor *desc, struct list_head *head)
1031{
1032 if (desc->pg_bsize < PAGE_CACHE_SIZE)
1033 return nfs_flush_multi(desc, head);
1034 return nfs_flush_one(desc, head);
1035}
1036
1037static int nfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
1038{
1039 LIST_HEAD(head);
1040 int ret;
1041
1042 ret = nfs_generic_flush(desc, &head);
1043 if (ret == 0)
1044 ret = nfs_do_multiple_writes(&head, desc->pg_rpc_callops,
1045 desc->pg_ioflags);
1046 return ret;
1047}
1048
1049static const struct nfs_pageio_ops nfs_pageio_write_ops = {
1050 .pg_test = nfs_generic_pg_test,
1051 .pg_doio = nfs_generic_pg_writepages,
1052};
1053
1054static void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio,
1037 struct inode *inode, int ioflags) 1055 struct inode *inode, int ioflags)
1038{ 1056{
1039 size_t wsize = NFS_SERVER(inode)->wsize; 1057 nfs_pageio_init(pgio, inode, &nfs_pageio_write_ops,
1058 NFS_SERVER(inode)->wsize, ioflags);
1059}
1060
1061void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio)
1062{
1063 pgio->pg_ops = &nfs_pageio_write_ops;
1064 pgio->pg_bsize = NFS_SERVER(pgio->pg_inode)->wsize;
1065}
1066EXPORT_SYMBOL_GPL(nfs_pageio_reset_write_mds);
1040 1067
1041 if (wsize < PAGE_CACHE_SIZE) 1068static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
1042 nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags); 1069 struct inode *inode, int ioflags)
1043 else 1070{
1044 nfs_pageio_init(pgio, inode, nfs_flush_one, wsize, ioflags); 1071 if (!pnfs_pageio_init_write(pgio, inode, ioflags))
1072 nfs_pageio_init_write_mds(pgio, inode, ioflags);
1045} 1073}
1046 1074
1047/* 1075/*
@@ -1566,8 +1594,7 @@ int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
1566 int status; 1594 int status;
1567 bool sync = true; 1595 bool sync = true;
1568 1596
1569 if (wbc->sync_mode == WB_SYNC_NONE || wbc->nonblocking || 1597 if (wbc->sync_mode == WB_SYNC_NONE)
1570 wbc->for_background)
1571 sync = false; 1598 sync = false;
1572 1599
1573 status = pnfs_layoutcommit_inode(inode, sync); 1600 status = pnfs_layoutcommit_inode(inode, sync);
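
The if/else that chose the stable-write level is now a switch with an intentional
fall-through: FLUSH_COND_STABLE stays NFS_UNSTABLE only while a commit is pending, and
every other stable request falls through to NFS_FILE_SYNC (note the old code used
NFS_DATA_SYNC in the pending-commit case). As a decision table, using the kernel's
enum nfs3_stable_how values (sketch):

	/* Equivalent of the new switch, pulled out for clarity. */
	static enum nfs3_stable_how pick_stable_sketch(int how, bool need_commit)
	{
		switch (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) {
		case 0:				/* plain async write */
			return NFS_UNSTABLE;
		case FLUSH_COND_STABLE:		/* stable only if no commit queued */
			if (need_commit)
				return NFS_UNSTABLE;
			/* fall through */
		default:			/* FLUSH_STABLE */
			return NFS_FILE_SYNC;
		}
	}
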
diff --git a/fs/notify/group.c b/fs/notify/group.c
index d309f38449cb..63fc294a4692 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -26,7 +26,7 @@
26#include <linux/fsnotify_backend.h> 26#include <linux/fsnotify_backend.h>
27#include "fsnotify.h" 27#include "fsnotify.h"
28 28
29#include <asm/atomic.h> 29#include <linux/atomic.h>
30 30
31/* 31/*
32 * Final freeing of a group 32 * Final freeing of a group
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 07ea8d3e6ea2..b13c00ac48eb 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -23,7 +23,7 @@
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/spinlock.h> 24#include <linux/spinlock.h>
25 25
26#include <asm/atomic.h> 26#include <linux/atomic.h>
27 27
28#include <linux/fsnotify_backend.h> 28#include <linux/fsnotify_backend.h>
29#include "fsnotify.h" 29#include "fsnotify.h"
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 252ab1f6452b..e14587d55689 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -92,7 +92,7 @@
92#include <linux/spinlock.h> 92#include <linux/spinlock.h>
93#include <linux/srcu.h> 93#include <linux/srcu.h>
94 94
95#include <asm/atomic.h> 95#include <linux/atomic.h>
96 96
97#include <linux/fsnotify_backend.h> 97#include <linux/fsnotify_backend.h>
98#include "fsnotify.h" 98#include "fsnotify.h"
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index f39260f8f865..ee188158a224 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -43,7 +43,7 @@
43#include <linux/slab.h> 43#include <linux/slab.h>
44#include <linux/spinlock.h> 44#include <linux/spinlock.h>
45 45
46#include <asm/atomic.h> 46#include <linux/atomic.h>
47 47
48#include <linux/fsnotify_backend.h> 48#include <linux/fsnotify_backend.h>
49#include "fsnotify.h" 49#include "fsnotify.h"
diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c
index e86577d6c5c3..778fe6cae3b0 100644
--- a/fs/notify/vfsmount_mark.c
+++ b/fs/notify/vfsmount_mark.c
@@ -24,7 +24,7 @@
24#include <linux/mutex.h> 24#include <linux/mutex.h>
25#include <linux/spinlock.h> 25#include <linux/spinlock.h>
26 26
27#include <asm/atomic.h> 27#include <linux/atomic.h>
28 28
29#include <linux/fsnotify_backend.h> 29#include <linux/fsnotify_backend.h>
30#include "fsnotify.h" 30#include "fsnotify.h"
diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h
index 2dabf813456c..fe8e7e928889 100644
--- a/fs/ntfs/inode.h
+++ b/fs/ntfs/inode.h
@@ -24,7 +24,7 @@
24#ifndef _LINUX_NTFS_INODE_H 24#ifndef _LINUX_NTFS_INODE_H
25#define _LINUX_NTFS_INODE_H 25#define _LINUX_NTFS_INODE_H
26 26
27#include <asm/atomic.h> 27#include <linux/atomic.h>
28 28
29#include <linux/fs.h> 29#include <linux/fs.h>
30#include <linux/list.h> 30#include <linux/list.h>
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 783c58d9daf1..a7219075b4de 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -247,7 +247,7 @@ static int ocfs2_set_acl(handle_t *handle,
247 case ACL_TYPE_ACCESS: 247 case ACL_TYPE_ACCESS:
248 name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS; 248 name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS;
249 if (acl) { 249 if (acl) {
250 mode_t mode = inode->i_mode; 250 umode_t mode = inode->i_mode;
251 ret = posix_acl_equiv_mode(acl, &mode); 251 ret = posix_acl_equiv_mode(acl, &mode);
252 if (ret < 0) 252 if (ret < 0)
253 return ret; 253 return ret;
@@ -351,7 +351,7 @@ int ocfs2_init_acl(handle_t *handle,
351 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 351 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
352 struct posix_acl *acl = NULL; 352 struct posix_acl *acl = NULL;
353 int ret = 0, ret2; 353 int ret = 0, ret2;
354 mode_t mode; 354 umode_t mode;
355 355
356 if (!S_ISLNK(inode->i_mode)) { 356 if (!S_ISLNK(inode->i_mode)) {
357 if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) { 357 if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index 3b8d3979e03b..98e544274390 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -93,7 +93,7 @@ int omfs_make_empty(struct inode *inode, struct super_block *sb)
93 93
94 memset(bh->b_data, 0, sizeof(struct omfs_inode)); 94 memset(bh->b_data, 0, sizeof(struct omfs_inode));
95 95
96 if (inode->i_mode & S_IFDIR) { 96 if (S_ISDIR(inode->i_mode)) {
97 memset(&bh->b_data[OMFS_DIR_START], 0xff, 97 memset(&bh->b_data[OMFS_DIR_START], 0xff,
98 sbi->s_sys_blocksize - OMFS_DIR_START); 98 sbi->s_sys_blocksize - OMFS_DIR_START);
99 } else 99 } else
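
The omfs change fixes a classic mode-test bug: the file-type bits in i_mode are a
field, not independent flags, so masking with S_IFDIR also matches block devices
(S_IFBLK = 0060000 contains the S_IFDIR bit 0040000), while S_ISDIR() compares the
whole S_IFMT field. A runnable userspace demonstration:

	#include <stdio.h>
	#include <sys/stat.h>

	int main(void)
	{
		mode_t blkdev = S_IFBLK | 0644;	/* 0060644: a block device */

		printf("mask test: %d\n", (blkdev & S_IFDIR) != 0); /* 1 -- wrong */
		printf("S_ISDIR(): %d\n", S_ISDIR(blkdev) != 0);    /* 0 -- right */
		return 0;
	}
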
diff --git a/fs/open.c b/fs/open.c
index 739b751aa73e..f71192109457 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -446,74 +446,52 @@ out:
446 return error; 446 return error;
447} 447}
448 448
449SYSCALL_DEFINE2(fchmod, unsigned int, fd, mode_t, mode) 449static int chmod_common(struct path *path, umode_t mode)
450{ 450{
451 struct inode * inode; 451 struct inode *inode = path->dentry->d_inode;
452 struct dentry * dentry;
453 struct file * file;
454 int err = -EBADF;
455 struct iattr newattrs; 452 struct iattr newattrs;
453 int error;
456 454
457 file = fget(fd); 455 error = mnt_want_write(path->mnt);
458 if (!file) 456 if (error)
459 goto out; 457 return error;
460
461 dentry = file->f_path.dentry;
462 inode = dentry->d_inode;
463
464 audit_inode(NULL, dentry);
465
466 err = mnt_want_write_file(file);
467 if (err)
468 goto out_putf;
469 mutex_lock(&inode->i_mutex); 458 mutex_lock(&inode->i_mutex);
470 err = security_path_chmod(dentry, file->f_vfsmnt, mode); 459 error = security_path_chmod(path->dentry, path->mnt, mode);
471 if (err) 460 if (error)
472 goto out_unlock; 461 goto out_unlock;
473 if (mode == (mode_t) -1)
474 mode = inode->i_mode;
475 newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); 462 newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
476 newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; 463 newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
477 err = notify_change(dentry, &newattrs); 464 error = notify_change(path->dentry, &newattrs);
478out_unlock: 465out_unlock:
479 mutex_unlock(&inode->i_mutex); 466 mutex_unlock(&inode->i_mutex);
480 mnt_drop_write(file->f_path.mnt); 467 mnt_drop_write(path->mnt);
481out_putf: 468 return error;
482 fput(file); 469}
483out: 470
471SYSCALL_DEFINE2(fchmod, unsigned int, fd, mode_t, mode)
472{
473 struct file * file;
474 int err = -EBADF;
475
476 file = fget(fd);
477 if (file) {
478 audit_inode(NULL, file->f_path.dentry);
479 err = chmod_common(&file->f_path, mode);
480 fput(file);
481 }
484 return err; 482 return err;
485} 483}
486 484
487SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, mode_t, mode) 485SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, mode_t, mode)
488{ 486{
489 struct path path; 487 struct path path;
490 struct inode *inode;
491 int error; 488 int error;
492 struct iattr newattrs;
493 489
494 error = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path); 490 error = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path);
495 if (error) 491 if (!error) {
496 goto out; 492 error = chmod_common(&path, mode);
497 inode = path.dentry->d_inode; 493 path_put(&path);
498 494 }
499 error = mnt_want_write(path.mnt);
500 if (error)
501 goto dput_and_out;
502 mutex_lock(&inode->i_mutex);
503 error = security_path_chmod(path.dentry, path.mnt, mode);
504 if (error)
505 goto out_unlock;
506 if (mode == (mode_t) -1)
507 mode = inode->i_mode;
508 newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
509 newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
510 error = notify_change(path.dentry, &newattrs);
511out_unlock:
512 mutex_unlock(&inode->i_mutex);
513 mnt_drop_write(path.mnt);
514dput_and_out:
515 path_put(&path);
516out:
517 return error; 495 return error;
518} 496}
519 497
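
After the open.c refactor both syscalls are thin wrappers: fchmod() resolves the fd and
audits it, fchmodat() resolves the path, and chmod_common() owns the whole
mnt_want_write/i_mutex/notify_change sequence. Note the old mode == (mode_t)-1 special
case is dropped along the way. The call flow, condensed as a sketch:

	/*
	 * sys_fchmod(fd, mode)
	 *     fget(fd) -> audit_inode() -> chmod_common(&file->f_path, mode)
	 * sys_fchmodat(dfd, name, mode)
	 *     user_path_at() -> chmod_common(&path, mode) -> path_put()
	 *
	 * chmod_common() (condensed from the hunk above):
	 *     mnt_want_write(path->mnt)
	 *     mutex_lock(&inode->i_mutex)
	 *     security_path_chmod() -> notify_change(path->dentry, &newattrs)
	 *     mutex_unlock(&inode->i_mutex)
	 *     mnt_drop_write(path->mnt)
	 */
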
diff --git a/fs/pipe.c b/fs/pipe.c
index 1b7f9af67ccf..0e0be1dc0f8e 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -948,7 +948,7 @@ static const struct dentry_operations pipefs_dentry_operations = {
948 948
949static struct inode * get_pipe_inode(void) 949static struct inode * get_pipe_inode(void)
950{ 950{
951 struct inode *inode = new_inode(pipe_mnt->mnt_sb); 951 struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb);
952 struct pipe_inode_info *pipe; 952 struct pipe_inode_info *pipe;
953 953
954 if (!inode) 954 if (!inode)
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index a6227d219e93..10027b42b7e2 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -14,7 +14,7 @@
14 14
15#include <linux/kernel.h> 15#include <linux/kernel.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <asm/atomic.h> 17#include <linux/atomic.h>
18#include <linux/fs.h> 18#include <linux/fs.h>
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/posix_acl.h> 20#include <linux/posix_acl.h>
@@ -149,10 +149,10 @@ posix_acl_valid(const struct posix_acl *acl)
149 * file mode permission bits, or else 1. Returns -E... on error. 149 * file mode permission bits, or else 1. Returns -E... on error.
150 */ 150 */
151int 151int
152posix_acl_equiv_mode(const struct posix_acl *acl, mode_t *mode_p) 152posix_acl_equiv_mode(const struct posix_acl *acl, umode_t *mode_p)
153{ 153{
154 const struct posix_acl_entry *pa, *pe; 154 const struct posix_acl_entry *pa, *pe;
155 mode_t mode = 0; 155 umode_t mode = 0;
156 int not_equiv = 0; 156 int not_equiv = 0;
157 157
158 FOREACH_ACL_ENTRY(pa, acl, pe) { 158 FOREACH_ACL_ENTRY(pa, acl, pe) {
@@ -188,7 +188,7 @@ posix_acl_equiv_mode(const struct posix_acl *acl, mode_t *mode_p)
188 * Create an ACL representing the file mode permission bits of an inode. 188 * Create an ACL representing the file mode permission bits of an inode.
189 */ 189 */
190struct posix_acl * 190struct posix_acl *
191posix_acl_from_mode(mode_t mode, gfp_t flags) 191posix_acl_from_mode(umode_t mode, gfp_t flags)
192{ 192{
193 struct posix_acl *acl = posix_acl_alloc(3, flags); 193 struct posix_acl *acl = posix_acl_alloc(3, flags);
194 if (!acl) 194 if (!acl)
@@ -279,11 +279,11 @@ check_perm:
279 * system calls. All permissions that are not granted by the acl are removed. 279 * system calls. All permissions that are not granted by the acl are removed.
280 * The permissions in the acl are changed to reflect the mode_p parameter. 280 * The permissions in the acl are changed to reflect the mode_p parameter.
281 */ 281 */
282static int posix_acl_create_masq(struct posix_acl *acl, mode_t *mode_p) 282static int posix_acl_create_masq(struct posix_acl *acl, umode_t *mode_p)
283{ 283{
284 struct posix_acl_entry *pa, *pe; 284 struct posix_acl_entry *pa, *pe;
285 struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL; 285 struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL;
286 mode_t mode = *mode_p; 286 umode_t mode = *mode_p;
287 int not_equiv = 0; 287 int not_equiv = 0;
288 288
289 /* assert(atomic_read(acl->a_refcount) == 1); */ 289 /* assert(atomic_read(acl->a_refcount) == 1); */
@@ -336,7 +336,7 @@ static int posix_acl_create_masq(struct posix_acl *acl, mode_t *mode_p)
336/* 336/*
337 * Modify the ACL for the chmod syscall. 337 * Modify the ACL for the chmod syscall.
338 */ 338 */
339static int posix_acl_chmod_masq(struct posix_acl *acl, mode_t mode) 339static int posix_acl_chmod_masq(struct posix_acl *acl, umode_t mode)
340{ 340{
341 struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL; 341 struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL;
342 struct posix_acl_entry *pa, *pe; 342 struct posix_acl_entry *pa, *pe;
@@ -382,7 +382,7 @@ static int posix_acl_chmod_masq(struct posix_acl *acl, mode_t mode)
382} 382}
383 383
384int 384int
385posix_acl_create(struct posix_acl **acl, gfp_t gfp, mode_t *mode_p) 385posix_acl_create(struct posix_acl **acl, gfp_t gfp, umode_t *mode_p)
386{ 386{
387 struct posix_acl *clone = posix_acl_clone(*acl, gfp); 387 struct posix_acl *clone = posix_acl_clone(*acl, gfp);
388 int err = -ENOMEM; 388 int err = -ENOMEM;
@@ -400,7 +400,7 @@ posix_acl_create(struct posix_acl **acl, gfp_t gfp, mode_t *mode_p)
400EXPORT_SYMBOL(posix_acl_create); 400EXPORT_SYMBOL(posix_acl_create);
401 401
402int 402int
403posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, mode_t mode) 403posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, umode_t mode)
404{ 404{
405 struct posix_acl *clone = posix_acl_clone(*acl, gfp); 405 struct posix_acl *clone = posix_acl_clone(*acl, gfp);
406 int err = -ENOMEM; 406 int err = -ENOMEM;
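
The practical payoff of switching these helpers from mode_t to umode_t is on the caller
side: inode->i_mode is umode_t, so callers can now pass &inode->i_mode directly instead
of round-tripping through a local mode_t (the reiserfs hunk further down does exactly
this conversion). Sketched:

	/* Caller-side effect of the umode_t conversion (sketch). */
	static int set_access_acl_sketch(struct inode *inode,
					 struct posix_acl *acl)
	{
		int error;

		/* before: a temporary was needed, since the API took mode_t *
		 *
		 *	mode_t mode = inode->i_mode;
		 *	error = posix_acl_equiv_mode(acl, &mode);
		 *	if (error >= 0)
		 *		inode->i_mode = mode;
		 *
		 * after: the pointer types line up */
		error = posix_acl_equiv_mode(acl, &inode->i_mode);
		return error < 0 ? error : 0;
	}
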
diff --git a/fs/proc/base.c b/fs/proc/base.c
index c9e3f650f23c..5eb02069e1b8 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1118,7 +1118,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
1118 * Warn that /proc/pid/oom_adj is deprecated, see 1118 * Warn that /proc/pid/oom_adj is deprecated, see
1119 * Documentation/feature-removal-schedule.txt. 1119 * Documentation/feature-removal-schedule.txt.
1120 */ 1120 */
1121 WARN_ONCE(1, "%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n", 1121 printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
1122 current->comm, task_pid_nr(current), task_pid_nr(task), 1122 current->comm, task_pid_nr(current), task_pid_nr(task),
1123 task_pid_nr(task)); 1123 task_pid_nr(task));
1124 task->signal->oom_adj = oom_adjust; 1124 task->signal->oom_adj = oom_adjust;
@@ -1919,6 +1919,14 @@ static int proc_fd_info(struct inode *inode, struct path *path, char *info)
1919 spin_lock(&files->file_lock); 1919 spin_lock(&files->file_lock);
1920 file = fcheck_files(files, fd); 1920 file = fcheck_files(files, fd);
1921 if (file) { 1921 if (file) {
1922 unsigned int f_flags;
1923 struct fdtable *fdt;
1924
1925 fdt = files_fdtable(files);
1926 f_flags = file->f_flags & ~O_CLOEXEC;
1927 if (FD_ISSET(fd, fdt->close_on_exec))
1928 f_flags |= O_CLOEXEC;
1929
1922 if (path) { 1930 if (path) {
1923 *path = file->f_path; 1931 *path = file->f_path;
1924 path_get(&file->f_path); 1932 path_get(&file->f_path);
@@ -1928,7 +1936,7 @@ static int proc_fd_info(struct inode *inode, struct path *path, char *info)
1928 "pos:\t%lli\n" 1936 "pos:\t%lli\n"
1929 "flags:\t0%o\n", 1937 "flags:\t0%o\n",
1930 (long long) file->f_pos, 1938 (long long) file->f_pos,
1931 file->f_flags); 1939 f_flags);
1932 spin_unlock(&files->file_lock); 1940 spin_unlock(&files->file_lock);
1933 put_files_struct(files); 1941 put_files_struct(files);
1934 return 0; 1942 return 0;
@@ -2706,9 +2714,16 @@ static int do_io_accounting(struct task_struct *task, char *buffer, int whole)
2706{ 2714{
2707 struct task_io_accounting acct = task->ioac; 2715 struct task_io_accounting acct = task->ioac;
2708 unsigned long flags; 2716 unsigned long flags;
2717 int result;
2709 2718
2710 if (!ptrace_may_access(task, PTRACE_MODE_READ)) 2719 result = mutex_lock_killable(&task->signal->cred_guard_mutex);
2711 return -EACCES; 2720 if (result)
2721 return result;
2722
2723 if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
2724 result = -EACCES;
2725 goto out_unlock;
2726 }
2712 2727
2713 if (whole && lock_task_sighand(task, &flags)) { 2728 if (whole && lock_task_sighand(task, &flags)) {
2714 struct task_struct *t = task; 2729 struct task_struct *t = task;
@@ -2719,7 +2734,7 @@ static int do_io_accounting(struct task_struct *task, char *buffer, int whole)
2719 2734
2720 unlock_task_sighand(task, &flags); 2735 unlock_task_sighand(task, &flags);
2721 } 2736 }
2722 return sprintf(buffer, 2737 result = sprintf(buffer,
2723 "rchar: %llu\n" 2738 "rchar: %llu\n"
2724 "wchar: %llu\n" 2739 "wchar: %llu\n"
2725 "syscr: %llu\n" 2740 "syscr: %llu\n"
@@ -2734,6 +2749,9 @@ static int do_io_accounting(struct task_struct *task, char *buffer, int whole)
2734 (unsigned long long)acct.read_bytes, 2749 (unsigned long long)acct.read_bytes,
2735 (unsigned long long)acct.write_bytes, 2750 (unsigned long long)acct.write_bytes,
2736 (unsigned long long)acct.cancelled_write_bytes); 2751 (unsigned long long)acct.cancelled_write_bytes);
2752out_unlock:
2753 mutex_unlock(&task->signal->cred_guard_mutex);
2754 return result;
2737} 2755}
2738 2756
2739static int proc_tid_io_accounting(struct task_struct *task, char *buffer) 2757static int proc_tid_io_accounting(struct task_struct *task, char *buffer)
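
The do_io_accounting() change is a lock/check/unlock conversion: ptrace_may_access() is
only meaningful while the target's credentials cannot change underneath it, so both the
check and the read now happen under cred_guard_mutex (taken killably, because exec
holds that mutex for a while). The general shape (sketch; fill_buffer() is
illustrative):

	static int guarded_read_sketch(struct task_struct *task, char *buffer)
	{
		int result;

		result = mutex_lock_killable(&task->signal->cred_guard_mutex);
		if (result)
			return result;		/* fatal signal while waiting */

		if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
			result = -EACCES;	/* checked under the mutex, so
						 * an exec cannot race it */
			goto out_unlock;
		}

		result = fill_buffer(task, buffer);	/* hypothetical */
	out_unlock:
		mutex_unlock(&task->signal->cred_guard_mutex);
		return result;
	}
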
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index f1637f17c37c..9d99131d0d65 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -620,8 +620,7 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
620 if (!ent) goto out; 620 if (!ent) goto out;
621 621
622 memset(ent, 0, sizeof(struct proc_dir_entry)); 622 memset(ent, 0, sizeof(struct proc_dir_entry));
623 memcpy(((char *) ent) + sizeof(struct proc_dir_entry), fn, len + 1); 623 memcpy(ent->name, fn, len + 1);
624 ent->name = ((char *) ent) + sizeof(*ent);
625 ent->namelen = len; 624 ent->namelen = len;
626 ent->mode = mode; 625 ent->mode = mode;
627 ent->nlink = nlink; 626 ent->nlink = nlink;
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 74b48cfa1bb2..7ed72d6c1c6f 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -319,7 +319,7 @@ static int proc_reg_open(struct inode *inode, struct file *file)
319 if (!pde->proc_fops) { 319 if (!pde->proc_fops) {
320 spin_unlock(&pde->pde_unload_lock); 320 spin_unlock(&pde->pde_unload_lock);
321 kfree(pdeo); 321 kfree(pdeo);
322 return -EINVAL; 322 return -ENOENT;
323 } 323 }
324 pde->pde_users++; 324 pde->pde_users++;
325 open = pde->proc_fops->open; 325 open = pde->proc_fops->open;
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index ed257d141568..586174168e2a 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -10,7 +10,7 @@
10#include <linux/seq_file.h> 10#include <linux/seq_file.h>
11#include <linux/swap.h> 11#include <linux/swap.h>
12#include <linux/vmstat.h> 12#include <linux/vmstat.h>
13#include <asm/atomic.h> 13#include <linux/atomic.h>
14#include <asm/page.h> 14#include <asm/page.h>
15#include <asm/pgtable.h> 15#include <asm/pgtable.h>
16#include "internal.h" 16#include "internal.h"
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 9020ac15baaa..f738024ccc8e 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -197,15 +197,15 @@ static __net_init int proc_net_ns_init(struct net *net)
197 int err; 197 int err;
198 198
199 err = -ENOMEM; 199 err = -ENOMEM;
200 netd = kzalloc(sizeof(*netd), GFP_KERNEL); 200 netd = kzalloc(sizeof(*netd) + 4, GFP_KERNEL);
201 if (!netd) 201 if (!netd)
202 goto out; 202 goto out;
203 203
204 netd->data = net; 204 netd->data = net;
205 netd->nlink = 2; 205 netd->nlink = 2;
206 netd->name = "net";
207 netd->namelen = 3; 206 netd->namelen = 3;
208 netd->parent = &proc_root; 207 netd->parent = &proc_root;
208 memcpy(netd->name, "net", 4);
209 209
210 err = -EEXIST; 210 err = -EEXIST;
211 net_statd = proc_net_mkdir(net, "stat", netd); 211 net_statd = proc_net_mkdir(net, "stat", netd);
diff --git a/fs/proc/root.c b/fs/proc/root.c
index d6c3b416529b..9a8a2b77b874 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -186,13 +186,13 @@ static const struct inode_operations proc_root_inode_operations = {
186struct proc_dir_entry proc_root = { 186struct proc_dir_entry proc_root = {
187 .low_ino = PROC_ROOT_INO, 187 .low_ino = PROC_ROOT_INO,
188 .namelen = 5, 188 .namelen = 5,
189 .name = "/proc",
190 .mode = S_IFDIR | S_IRUGO | S_IXUGO, 189 .mode = S_IFDIR | S_IRUGO | S_IXUGO,
191 .nlink = 2, 190 .nlink = 2,
192 .count = ATOMIC_INIT(1), 191 .count = ATOMIC_INIT(1),
193 .proc_iops = &proc_root_inode_operations, 192 .proc_iops = &proc_root_inode_operations,
194 .proc_fops = &proc_root_operations, 193 .proc_fops = &proc_root_operations,
195 .parent = &proc_root, 194 .parent = &proc_root,
195 .name = "/proc",
196}; 196};
197 197
198int pid_ns_prepare_proc(struct pid_namespace *ns) 198int pid_ns_prepare_proc(struct pid_namespace *ns)
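
The generic.c, proc_net.c and root.c hunks all follow from one struct change (assumed,
not shown in this diff): proc_dir_entry's name becoming a flexible array member,
char name[], instead of a char * into trailing storage. That is why the allocation must
grow by the name length, memcpy() now targets ent->name, proc_net_ns_init() adds 4
bytes for "net" plus its NUL, and the static proc_root initializer is reordered so
.name comes last. The allocation pattern, sketched:

	/* Flexible-array-member allocation pattern implied by these hunks
	 * (sketch; the real definition lives in include/linux/proc_fs.h). */
	static struct proc_dir_entry *alloc_pde_sketch(const char *fn, int len)
	{
		struct proc_dir_entry *ent;

		/* one allocation covers the struct and the NUL-terminated name */
		ent = kzalloc(sizeof(*ent) + len + 1, GFP_KERNEL);
		if (!ent)
			return NULL;
		memcpy(ent->name, fn, len + 1);
		ent->namelen = len;
		return ent;
	}
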
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 977ed2723845..893b961dcfd8 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -39,8 +39,9 @@
39#define PSTORE_NAMELEN 64 39#define PSTORE_NAMELEN 64
40 40
41struct pstore_private { 41struct pstore_private {
42 struct pstore_info *psi;
43 enum pstore_type_id type;
42 u64 id; 44 u64 id;
43 int (*erase)(u64);
44 ssize_t size; 45 ssize_t size;
45 char data[]; 46 char data[];
46}; 47};
@@ -73,7 +74,7 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry)
73{ 74{
74 struct pstore_private *p = dentry->d_inode->i_private; 75 struct pstore_private *p = dentry->d_inode->i_private;
75 76
76 p->erase(p->id); 77 p->psi->erase(p->type, p->id, p->psi);
77 78
78 return simple_unlink(dir, dentry); 79 return simple_unlink(dir, dentry);
79} 80}
@@ -175,8 +176,8 @@ int pstore_is_mounted(void)
175 * Set the mtime & ctime to the date that this record was originally stored. 176 * Set the mtime & ctime to the date that this record was originally stored.
176 */ 177 */
177int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, 178int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id,
178 char *data, size_t size, 179 char *data, size_t size, struct timespec time,
179 struct timespec time, int (*erase)(u64)) 180 struct pstore_info *psi)
180{ 181{
181 struct dentry *root = pstore_sb->s_root; 182 struct dentry *root = pstore_sb->s_root;
182 struct dentry *dentry; 183 struct dentry *dentry;
@@ -192,8 +193,9 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id,
192 private = kmalloc(sizeof *private + size, GFP_KERNEL); 193 private = kmalloc(sizeof *private + size, GFP_KERNEL);
193 if (!private) 194 if (!private)
194 goto fail_alloc; 195 goto fail_alloc;
196 private->type = type;
195 private->id = id; 197 private->id = id;
196 private->erase = erase; 198 private->psi = psi;
197 199
198 switch (type) { 200 switch (type) {
199 case PSTORE_TYPE_DMESG: 201 case PSTORE_TYPE_DMESG:
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
index 8c9f23eb1645..611c1b3c46fa 100644
--- a/fs/pstore/internal.h
+++ b/fs/pstore/internal.h
@@ -2,5 +2,5 @@ extern void pstore_set_kmsg_bytes(int);
2extern void pstore_get_records(void); 2extern void pstore_get_records(void);
3extern int pstore_mkfile(enum pstore_type_id, char *psname, u64 id, 3extern int pstore_mkfile(enum pstore_type_id, char *psname, u64 id,
4 char *data, size_t size, 4 char *data, size_t size,
5 struct timespec time, int (*erase)(u64)); 5 struct timespec time, struct pstore_info *psi);
6extern int pstore_is_mounted(void); 6extern int pstore_is_mounted(void);
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index f2c3ff20ea68..c5300ec31696 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -37,6 +37,8 @@
37static DEFINE_SPINLOCK(pstore_lock); 37static DEFINE_SPINLOCK(pstore_lock);
38static struct pstore_info *psinfo; 38static struct pstore_info *psinfo;
39 39
40static char *backend;
41
40/* How much of the console log to snapshot */ 42/* How much of the console log to snapshot */
41static unsigned long kmsg_bytes = 10240; 43static unsigned long kmsg_bytes = 10240;
42 44
@@ -67,7 +69,8 @@ static void pstore_dump(struct kmsg_dumper *dumper,
67 unsigned long size, total = 0; 69 unsigned long size, total = 0;
68 char *dst, *why; 70 char *dst, *why;
69 u64 id; 71 u64 id;
70 int hsize, part = 1; 72 int hsize;
73 unsigned int part = 1;
71 74
72 if (reason < ARRAY_SIZE(reason_str)) 75 if (reason < ARRAY_SIZE(reason_str))
73 why = reason_str[reason]; 76 why = reason_str[reason];
@@ -78,7 +81,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
78 oopscount++; 81 oopscount++;
79 while (total < kmsg_bytes) { 82 while (total < kmsg_bytes) {
80 dst = psinfo->buf; 83 dst = psinfo->buf;
81 hsize = sprintf(dst, "%s#%d Part%d\n", why, oopscount, part++); 84 hsize = sprintf(dst, "%s#%d Part%d\n", why, oopscount, part);
82 size = psinfo->bufsize - hsize; 85 size = psinfo->bufsize - hsize;
83 dst += hsize; 86 dst += hsize;
84 87
@@ -94,14 +97,16 @@ static void pstore_dump(struct kmsg_dumper *dumper,
94 memcpy(dst, s1 + s1_start, l1_cpy); 97 memcpy(dst, s1 + s1_start, l1_cpy);
95 memcpy(dst + l1_cpy, s2 + s2_start, l2_cpy); 98 memcpy(dst + l1_cpy, s2 + s2_start, l2_cpy);
96 99
97 id = psinfo->write(PSTORE_TYPE_DMESG, hsize + l1_cpy + l2_cpy); 100 id = psinfo->write(PSTORE_TYPE_DMESG, part,
101 hsize + l1_cpy + l2_cpy, psinfo);
98 if (reason == KMSG_DUMP_OOPS && pstore_is_mounted()) 102 if (reason == KMSG_DUMP_OOPS && pstore_is_mounted())
99 pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id, 103 pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id,
100 psinfo->buf, hsize + l1_cpy + l2_cpy, 104 psinfo->buf, hsize + l1_cpy + l2_cpy,
101 CURRENT_TIME, psinfo->erase); 105 CURRENT_TIME, psinfo);
102 l1 -= l1_cpy; 106 l1 -= l1_cpy;
103 l2 -= l2_cpy; 107 l2 -= l2_cpy;
104 total += l1_cpy + l2_cpy; 108 total += l1_cpy + l2_cpy;
109 part++;
105 } 110 }
106 mutex_unlock(&psinfo->buf_mutex); 111 mutex_unlock(&psinfo->buf_mutex);
107} 112}
@@ -128,6 +133,12 @@ int pstore_register(struct pstore_info *psi)
128 spin_unlock(&pstore_lock); 133 spin_unlock(&pstore_lock);
129 return -EBUSY; 134 return -EBUSY;
130 } 135 }
136
137 if (backend && strcmp(backend, psi->name)) {
138 spin_unlock(&pstore_lock);
139 return -EINVAL;
140 }
141
131 psinfo = psi; 142 psinfo = psi;
132 spin_unlock(&pstore_lock); 143 spin_unlock(&pstore_lock);
133 144
@@ -166,9 +177,9 @@ void pstore_get_records(void)
166 if (rc) 177 if (rc)
167 goto out; 178 goto out;
168 179
169 while ((size = psi->read(&id, &type, &time)) > 0) { 180 while ((size = psi->read(&id, &type, &time, psi)) > 0) {
170 if (pstore_mkfile(type, psi->name, id, psi->buf, (size_t)size, 181 if (pstore_mkfile(type, psi->name, id, psi->buf, (size_t)size,
171 time, psi->erase)) 182 time, psi))
172 failed++; 183 failed++;
173 } 184 }
174 psi->close(psi); 185 psi->close(psi);
@@ -196,12 +207,15 @@ int pstore_write(enum pstore_type_id type, char *buf, size_t size)
196 207
197 mutex_lock(&psinfo->buf_mutex); 208 mutex_lock(&psinfo->buf_mutex);
198 memcpy(psinfo->buf, buf, size); 209 memcpy(psinfo->buf, buf, size);
199 id = psinfo->write(type, size); 210 id = psinfo->write(type, 0, size, psinfo);
200 if (pstore_is_mounted()) 211 if (pstore_is_mounted())
201 pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id, psinfo->buf, 212 pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id, psinfo->buf,
202 size, CURRENT_TIME, psinfo->erase); 213 size, CURRENT_TIME, psinfo);
203 mutex_unlock(&psinfo->buf_mutex); 214 mutex_unlock(&psinfo->buf_mutex);
204 215
205 return 0; 216 return 0;
206} 217}
207EXPORT_SYMBOL_GPL(pstore_write); 218EXPORT_SYMBOL_GPL(pstore_write);
219
220module_param(backend, charp, 0444);
221MODULE_PARM_DESC(backend, "Pstore backend to use");
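
Pulling the pstore signature changes together: read/write/erase now carry the
struct pstore_info back-pointer (so a backend callback can reach its own state without
globals), and write additionally takes the part number that platform.c used to fold
into the header text only. Reconstructed from the call sites in this diff, not quoted
from include/linux/pstore.h, the callback block looks roughly like:

	/* Reconstructed from the call sites above -- a sketch. */
	struct pstore_info_sketch {
		char		*name;
		char		*buf;
		size_t		bufsize;
		int		(*open)(struct pstore_info_sketch *psi);
		int		(*close)(struct pstore_info_sketch *psi);
		ssize_t		(*read)(u64 *id, enum pstore_type_id *type,
					struct timespec *time,
					struct pstore_info_sketch *psi);
		u64		(*write)(enum pstore_type_id type,
					 unsigned int part, size_t size,
					 struct pstore_info_sketch *psi);
		int		(*erase)(enum pstore_type_id type, u64 id,
					 struct pstore_info_sketch *psi);
	};
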
diff --git a/fs/read_write.c b/fs/read_write.c
index 5907b49e4d7e..179f1c33ea57 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -166,8 +166,10 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
166 * long as offset isn't at the end of the file then the 166 * long as offset isn't at the end of the file then the
167 * offset is data. 167 * offset is data.
168 */ 168 */
169 if (offset >= inode->i_size) 169 if (offset >= inode->i_size) {
170 return -ENXIO; 170 retval = -ENXIO;
171 goto out;
172 }
171 break; 173 break;
172 case SEEK_HOLE: 174 case SEEK_HOLE:
173 /* 175 /*
@@ -175,8 +177,10 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
175 * as long as offset isn't i_size or larger, return 177 * as long as offset isn't i_size or larger, return
176 * i_size. 178 * i_size.
177 */ 179 */
178 if (offset >= inode->i_size) 180 if (offset >= inode->i_size) {
179 return -ENXIO; 181 retval = -ENXIO;
182 goto out;
183 }
180 offset = inode->i_size; 184 offset = inode->i_size;
181 break; 185 break;
182 } 186 }
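
default_llseek() takes i_mutex and releases it at the out: label; the SEEK_DATA and
SEEK_HOLE branches previously returned -ENXIO directly and so leaked the mutex. The fix
routes the error through the common exit. In context (condensed sketch):

	/* Condensed shape of default_llseek() around the fix (sketch). */
	static loff_t default_llseek_sketch(struct file *file, loff_t offset,
					    int origin)
	{
		struct inode *inode = file->f_path.dentry->d_inode;
		loff_t retval;

		mutex_lock(&inode->i_mutex);
		switch (origin) {
		case SEEK_DATA:
			if (offset >= inode->i_size) {
				retval = -ENXIO;
				goto out;	/* was "return -ENXIO":
						 * left i_mutex held */
			}
			break;
		/* SEEK_SET/SEEK_CUR/SEEK_END/SEEK_HOLE elided */
		}
		retval = offset;		/* illustrative tail */
	out:
		mutex_unlock(&inode->i_mutex);
		return retval;
	}
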
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 7362cf4c946a..6da0396e5052 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -272,12 +272,10 @@ reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
272 case ACL_TYPE_ACCESS: 272 case ACL_TYPE_ACCESS:
273 name = POSIX_ACL_XATTR_ACCESS; 273 name = POSIX_ACL_XATTR_ACCESS;
274 if (acl) { 274 if (acl) {
275 mode_t mode = inode->i_mode; 275 error = posix_acl_equiv_mode(acl, &inode->i_mode);
276 error = posix_acl_equiv_mode(acl, &mode);
277 if (error < 0) 276 if (error < 0)
278 return error; 277 return error;
279 else { 278 else {
280 inode->i_mode = mode;
281 if (error == 0) 279 if (error == 0)
282 acl = NULL; 280 acl = NULL;
283 } 281 }
@@ -354,8 +352,6 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
354 return PTR_ERR(acl); 352 return PTR_ERR(acl);
355 353
356 if (acl) { 354 if (acl) {
357 mode_t mode = inode->i_mode;
358
359 /* Copy the default ACL to the default ACL of a new directory */ 355 /* Copy the default ACL to the default ACL of a new directory */
360 if (S_ISDIR(inode->i_mode)) { 356 if (S_ISDIR(inode->i_mode)) {
361 err = reiserfs_set_acl(th, inode, ACL_TYPE_DEFAULT, 357 err = reiserfs_set_acl(th, inode, ACL_TYPE_DEFAULT,
@@ -366,12 +362,10 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
366 362
367 /* Now we reconcile the new ACL and the mode, 363 /* Now we reconcile the new ACL and the mode,
368 potentially modifying both */ 364 potentially modifying both */
369 err = posix_acl_create(&acl, GFP_NOFS, &mode); 365 err = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode);
370 if (err < 0) 366 if (err < 0)
371 return err; 367 return err;
372 368
373 inode->i_mode = mode;
374
375 /* If we need an ACL.. */ 369 /* If we need an ACL.. */
376 if (err > 0) 370 if (err > 0)
377 err = reiserfs_set_acl(th, inode, ACL_TYPE_ACCESS, acl); 371 err = reiserfs_set_acl(th, inode, ACL_TYPE_ACCESS, acl);
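Both reiserfs hunks drop the mode_t temporary: posix_acl_equiv_mode() and posix_acl_create() now take a pointer to the caller's mode and update it in place, so the (now umode_t) i_mode can be passed directly. The resulting calling convention, condensed from the hunks above with the return-value semantics they show:

/* Condensed from the hunks above. */
error = posix_acl_equiv_mode(acl, &inode->i_mode);
if (error < 0)
	return error;		/* hard failure */
if (error == 0)
	acl = NULL;		/* ACL fully expressible in i_mode */

err = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode);
if (err < 0)
	return err;		/* hard failure */
if (err > 0)			/* an access ACL is still required */
	err = reiserfs_set_acl(th, inode, ACL_TYPE_ACCESS, acl);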
diff --git a/fs/stack.c b/fs/stack.c
index 4a6f7f440658..b4f2ab48a61f 100644
--- a/fs/stack.c
+++ b/fs/stack.c
@@ -29,10 +29,7 @@ void fsstack_copy_inode_size(struct inode *dst, struct inode *src)
29 * 29 *
30 * We don't actually know what locking is used at the lower level; 30 * We don't actually know what locking is used at the lower level;
31 * but if it's a filesystem that supports quotas, it will be using 31 * but if it's a filesystem that supports quotas, it will be using
32 * i_lock as in inode_add_bytes(). tmpfs uses other locking, and 32 * i_lock as in inode_add_bytes().
33 * its 32-bit is (just) able to exceed 2TB i_size with the aid of
34 * holes; but its i_blocks cannot carry into the upper long without
35 * almost 2TB swap - let's ignore that case.
36 */ 33 */
37 if (sizeof(i_blocks) > sizeof(long)) 34 if (sizeof(i_blocks) > sizeof(long))
38 spin_lock(&src->i_lock); 35 spin_lock(&src->i_lock);
diff --git a/fs/stat.c b/fs/stat.c
index 961039121cb8..ba5316ffac61 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -27,12 +27,12 @@ void generic_fillattr(struct inode *inode, struct kstat *stat)
27 stat->uid = inode->i_uid; 27 stat->uid = inode->i_uid;
28 stat->gid = inode->i_gid; 28 stat->gid = inode->i_gid;
29 stat->rdev = inode->i_rdev; 29 stat->rdev = inode->i_rdev;
30 stat->size = i_size_read(inode);
30 stat->atime = inode->i_atime; 31 stat->atime = inode->i_atime;
31 stat->mtime = inode->i_mtime; 32 stat->mtime = inode->i_mtime;
32 stat->ctime = inode->i_ctime; 33 stat->ctime = inode->i_ctime;
33 stat->size = i_size_read(inode);
34 stat->blocks = inode->i_blocks;
35 stat->blksize = (1 << inode->i_blkbits); 34 stat->blksize = (1 << inode->i_blkbits);
35 stat->blocks = inode->i_blocks;
36} 36}
37 37
38EXPORT_SYMBOL(generic_fillattr); 38EXPORT_SYMBOL(generic_fillattr);
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index 45174b534377..feb361e252ac 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -335,9 +335,9 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c);
335#define DBGKEY(key) ((char *)(key)) 335#define DBGKEY(key) ((char *)(key))
336#define DBGKEY1(key) ((char *)(key)) 336#define DBGKEY1(key) ((char *)(key))
337 337
338#define ubifs_dbg_msg(fmt, ...) do { \ 338#define ubifs_dbg_msg(fmt, ...) do { \
339 if (0) \ 339 if (0) \
340 pr_debug(fmt "\n", ##__VA_ARGS__); \ 340 printk(KERN_DEBUG fmt "\n", ##__VA_ARGS__); \
341} while (0) 341} while (0)
342 342
343#define dbg_dump_stack() 343#define dbg_dump_stack()
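With debugging disabled, ubifs_dbg_msg() above compiles down to nothing, but the "if (0)" wrapper keeps the printk()/pr_debug() call visible to the compiler so the format string and arguments are still type-checked. A small standalone illustration of the idiom:

#include <stdio.h>

/* The "if (0)" idiom: the call is dead code and is optimized away,
 * yet the compiler still checks the format string against the args. */
#define dbg_msg(fmt, ...) do {				\
	if (0)						\
		printf(fmt "\n", ##__VA_ARGS__);	\
} while (0)

int main(void)
{
	dbg_msg("value=%d", 42);	/* compiles to nothing */
	/* dbg_msg("value=%d", "oops"); would warn at compile time */
	return 0;
}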
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 75bb316529dd..427a4e82a588 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -16,44 +16,53 @@
16# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17# 17#
18 18
19ccflags-y := -I$(src) -I$(src)/linux-2.6 19ccflags-y += -I$(src) # needed for trace events
20ccflags-$(CONFIG_XFS_DEBUG) += -g
21 20
22XFS_LINUX := linux-2.6 21ccflags-$(CONFIG_XFS_DEBUG) += -g
23 22
24obj-$(CONFIG_XFS_FS) += xfs.o 23obj-$(CONFIG_XFS_FS) += xfs.o
25 24
26xfs-y += linux-2.6/xfs_trace.o 25# this one should be compiled first, as the tracing macros can easily blow up
27 26xfs-y += xfs_trace.o
28xfs-$(CONFIG_XFS_QUOTA) += $(addprefix quota/, \
29 xfs_dquot.o \
30 xfs_dquot_item.o \
31 xfs_trans_dquot.o \
32 xfs_qm_syscalls.o \
33 xfs_qm_bhv.o \
34 xfs_qm.o)
35xfs-$(CONFIG_XFS_QUOTA) += linux-2.6/xfs_quotaops.o
36
37ifeq ($(CONFIG_XFS_QUOTA),y)
38xfs-$(CONFIG_PROC_FS) += quota/xfs_qm_stats.o
39endif
40
41xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o
42xfs-$(CONFIG_XFS_POSIX_ACL) += $(XFS_LINUX)/xfs_acl.o
43xfs-$(CONFIG_PROC_FS) += $(XFS_LINUX)/xfs_stats.o
44xfs-$(CONFIG_SYSCTL) += $(XFS_LINUX)/xfs_sysctl.o
45xfs-$(CONFIG_COMPAT) += $(XFS_LINUX)/xfs_ioctl32.o
46 27
28# highlevel code
29xfs-y += xfs_aops.o \
30 xfs_bit.o \
31 xfs_buf.o \
32 xfs_dfrag.o \
33 xfs_discard.o \
34 xfs_error.o \
35 xfs_export.o \
36 xfs_file.o \
37 xfs_filestream.o \
38 xfs_fsops.o \
39 xfs_fs_subr.o \
40 xfs_globals.o \
41 xfs_iget.o \
42 xfs_ioctl.o \
43 xfs_iomap.o \
44 xfs_iops.o \
45 xfs_itable.o \
46 xfs_message.o \
47 xfs_mru_cache.o \
48 xfs_super.o \
49 xfs_sync.o \
50 xfs_xattr.o \
51 xfs_rename.o \
52 xfs_rw.o \
53 xfs_utils.o \
54 xfs_vnodeops.o \
55 kmem.o \
56 uuid.o
47 57
58# code shared with libxfs
48xfs-y += xfs_alloc.o \ 59xfs-y += xfs_alloc.o \
49 xfs_alloc_btree.o \ 60 xfs_alloc_btree.o \
50 xfs_attr.o \ 61 xfs_attr.o \
51 xfs_attr_leaf.o \ 62 xfs_attr_leaf.o \
52 xfs_bit.o \
53 xfs_bmap.o \ 63 xfs_bmap.o \
54 xfs_bmap_btree.o \ 64 xfs_bmap_btree.o \
55 xfs_btree.o \ 65 xfs_btree.o \
56 xfs_buf_item.o \
57 xfs_da_btree.o \ 66 xfs_da_btree.o \
58 xfs_dir2.o \ 67 xfs_dir2.o \
59 xfs_dir2_block.o \ 68 xfs_dir2_block.o \
@@ -61,49 +70,37 @@ xfs-y += xfs_alloc.o \
61 xfs_dir2_leaf.o \ 70 xfs_dir2_leaf.o \
62 xfs_dir2_node.o \ 71 xfs_dir2_node.o \
63 xfs_dir2_sf.o \ 72 xfs_dir2_sf.o \
64 xfs_error.o \
65 xfs_extfree_item.o \
66 xfs_filestream.o \
67 xfs_fsops.o \
68 xfs_ialloc.o \ 73 xfs_ialloc.o \
69 xfs_ialloc_btree.o \ 74 xfs_ialloc_btree.o \
70 xfs_iget.o \
71 xfs_inode.o \ 75 xfs_inode.o \
72 xfs_inode_item.o \
73 xfs_iomap.o \
74 xfs_itable.o \
75 xfs_dfrag.o \
76 xfs_log.o \
77 xfs_log_cil.o \
78 xfs_log_recover.o \ 76 xfs_log_recover.o \
79 xfs_mount.o \ 77 xfs_mount.o \
80 xfs_mru_cache.o \ 78 xfs_trans.o
81 xfs_rename.o \ 79
82 xfs_trans.o \ 80# low-level transaction/log code
81xfs-y += xfs_log.o \
82 xfs_log_cil.o \
83 xfs_buf_item.o \
84 xfs_extfree_item.o \
85 xfs_inode_item.o \
83 xfs_trans_ail.o \ 86 xfs_trans_ail.o \
84 xfs_trans_buf.o \ 87 xfs_trans_buf.o \
85 xfs_trans_extfree.o \ 88 xfs_trans_extfree.o \
86 xfs_trans_inode.o \ 89 xfs_trans_inode.o \
87 xfs_utils.o \
88 xfs_vnodeops.o \
89 xfs_rw.o
90
91# Objects in linux/
92xfs-y += $(addprefix $(XFS_LINUX)/, \
93 kmem.o \
94 xfs_aops.o \
95 xfs_buf.o \
96 xfs_discard.o \
97 xfs_export.o \
98 xfs_file.o \
99 xfs_fs_subr.o \
100 xfs_globals.o \
101 xfs_ioctl.o \
102 xfs_iops.o \
103 xfs_message.o \
104 xfs_super.o \
105 xfs_sync.o \
106 xfs_xattr.o)
107 90
108# Objects in support/ 91# optional features
109xfs-y += support/uuid.o 92xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \
93 xfs_dquot_item.o \
94 xfs_trans_dquot.o \
95 xfs_qm_syscalls.o \
96 xfs_qm_bhv.o \
97 xfs_qm.o \
98 xfs_quotaops.o
99ifeq ($(CONFIG_XFS_QUOTA),y)
100xfs-$(CONFIG_PROC_FS) += xfs_qm_stats.o
101endif
102xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o
103xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o
104xfs-$(CONFIG_PROC_FS) += xfs_stats.o
105xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o
106xfs-$(CONFIG_COMPAT) += xfs_ioctl32.o
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/kmem.c
index a907de565db3..a907de565db3 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/kmem.c
diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/kmem.h
index 292eff198030..292eff198030 100644
--- a/fs/xfs/linux-2.6/kmem.h
+++ b/fs/xfs/kmem.h
diff --git a/fs/xfs/linux-2.6/mrlock.h b/fs/xfs/mrlock.h
index ff6a19873e5c..ff6a19873e5c 100644
--- a/fs/xfs/linux-2.6/mrlock.h
+++ b/fs/xfs/mrlock.h
diff --git a/fs/xfs/linux-2.6/time.h b/fs/xfs/time.h
index 387e695a184c..387e695a184c 100644
--- a/fs/xfs/linux-2.6/time.h
+++ b/fs/xfs/time.h
diff --git a/fs/xfs/support/uuid.c b/fs/xfs/uuid.c
index b83f76b6d410..b83f76b6d410 100644
--- a/fs/xfs/support/uuid.c
+++ b/fs/xfs/uuid.c
diff --git a/fs/xfs/support/uuid.h b/fs/xfs/uuid.h
index 4732d71262cc..4732d71262cc 100644
--- a/fs/xfs/support/uuid.h
+++ b/fs/xfs/uuid.h
diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h
index 53ec3ea9a625..d8b11b7f94aa 100644
--- a/fs/xfs/xfs.h
+++ b/fs/xfs/xfs.h
@@ -24,5 +24,6 @@
24#define XFS_BUF_LOCK_TRACKING 1 24#define XFS_BUF_LOCK_TRACKING 1
25#endif 25#endif
26 26
27#include <linux-2.6/xfs_linux.h> 27#include "xfs_linux.h"
28
28#endif /* __XFS_H__ */ 29#endif /* __XFS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/xfs_acl.c
index 44ce51656804..b6c4b3795c4a 100644
--- a/fs/xfs/linux-2.6/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -221,7 +221,7 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
221} 221}
222 222
223static int 223static int
224xfs_set_mode(struct inode *inode, mode_t mode) 224xfs_set_mode(struct inode *inode, umode_t mode)
225{ 225{
226 int error = 0; 226 int error = 0;
227 227
@@ -267,7 +267,7 @@ posix_acl_default_exists(struct inode *inode)
267int 267int
268xfs_inherit_acl(struct inode *inode, struct posix_acl *acl) 268xfs_inherit_acl(struct inode *inode, struct posix_acl *acl)
269{ 269{
270 mode_t mode = inode->i_mode; 270 umode_t mode = inode->i_mode;
271 int error = 0, inherit = 0; 271 int error = 0, inherit = 0;
272 272
273 if (S_ISDIR(inode->i_mode)) { 273 if (S_ISDIR(inode->i_mode)) {
@@ -381,7 +381,7 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name,
381 goto out_release; 381 goto out_release;
382 382
383 if (type == ACL_TYPE_ACCESS) { 383 if (type == ACL_TYPE_ACCESS) {
384 mode_t mode = inode->i_mode; 384 umode_t mode = inode->i_mode;
385 error = posix_acl_equiv_mode(acl, &mode); 385 error = posix_acl_equiv_mode(acl, &mode);
386 386
387 if (error <= 0) { 387 if (error <= 0) {
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 2c656ef49473..39632d941354 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -51,7 +51,10 @@ extern int posix_acl_default_exists(struct inode *inode);
51extern const struct xattr_handler xfs_xattr_acl_access_handler; 51extern const struct xattr_handler xfs_xattr_acl_access_handler;
52extern const struct xattr_handler xfs_xattr_acl_default_handler; 52extern const struct xattr_handler xfs_xattr_acl_default_handler;
53#else 53#else
54# define xfs_get_acl(inode, type) NULL 54static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type)
55{
56 return NULL;
57}
55# define xfs_inherit_acl(inode, default_acl) 0 58# define xfs_inherit_acl(inode, default_acl) 0
56# define xfs_acl_chmod(inode) 0 59# define xfs_acl_chmod(inode) 0
57# define posix_acl_access_exists(inode) 0 60# define posix_acl_access_exists(inode) 0
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 6530769a999b..4805f009f923 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -103,7 +103,7 @@ typedef struct xfs_agf {
103/* disk block (xfs_daddr_t) in the AG */ 103/* disk block (xfs_daddr_t) in the AG */
104#define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log)) 104#define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log))
105#define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp)) 105#define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp))
106#define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)XFS_BUF_PTR(bp)) 106#define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)((bp)->b_addr))
107 107
108extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, 108extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
109 xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); 109 xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
@@ -156,7 +156,7 @@ typedef struct xfs_agi {
156/* disk block (xfs_daddr_t) in the AG */ 156/* disk block (xfs_daddr_t) in the AG */
157#define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log)) 157#define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log))
158#define XFS_AGI_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp)) 158#define XFS_AGI_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp))
159#define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)XFS_BUF_PTR(bp)) 159#define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)((bp)->b_addr))
160 160
161extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp, 161extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp,
162 xfs_agnumber_t agno, struct xfs_buf **bpp); 162 xfs_agnumber_t agno, struct xfs_buf **bpp);
@@ -168,7 +168,7 @@ extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp,
168#define XFS_AGFL_DADDR(mp) ((xfs_daddr_t)(3 << (mp)->m_sectbb_log)) 168#define XFS_AGFL_DADDR(mp) ((xfs_daddr_t)(3 << (mp)->m_sectbb_log))
169#define XFS_AGFL_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp)) 169#define XFS_AGFL_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp))
170#define XFS_AGFL_SIZE(mp) ((mp)->m_sb.sb_sectsize / sizeof(xfs_agblock_t)) 170#define XFS_AGFL_SIZE(mp) ((mp)->m_sb.sb_sectsize / sizeof(xfs_agblock_t))
171#define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)XFS_BUF_PTR(bp)) 171#define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)((bp)->b_addr))
172 172
173typedef struct xfs_agfl { 173typedef struct xfs_agfl {
174 __be32 agfl_bno[1]; /* actually XFS_AGFL_SIZE(mp) */ 174 __be32 agfl_bno[1]; /* actually XFS_AGFL_SIZE(mp) */
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 1e00b3ef6274..bdd9cb54d63b 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -451,8 +451,7 @@ xfs_alloc_read_agfl(
451 XFS_FSS_TO_BB(mp, 1), 0, &bp); 451 XFS_FSS_TO_BB(mp, 1), 0, &bp);
452 if (error) 452 if (error)
453 return error; 453 return error;
454 ASSERT(bp); 454 ASSERT(!xfs_buf_geterror(bp));
455 ASSERT(!XFS_BUF_GETERROR(bp));
456 XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGFL, XFS_AGFL_REF); 455 XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGFL, XFS_AGFL_REF);
457 *bpp = bp; 456 *bpp = bp;
458 return 0; 457 return 0;
@@ -2116,7 +2115,7 @@ xfs_read_agf(
2116 if (!*bpp) 2115 if (!*bpp)
2117 return 0; 2116 return 0;
2118 2117
2119 ASSERT(!XFS_BUF_GETERROR(*bpp)); 2118 ASSERT(!(*bpp)->b_error);
2120 agf = XFS_BUF_TO_AGF(*bpp); 2119 agf = XFS_BUF_TO_AGF(*bpp);
2121 2120
2122 /* 2121 /*
@@ -2168,7 +2167,7 @@ xfs_alloc_read_agf(
2168 return error; 2167 return error;
2169 if (!*bpp) 2168 if (!*bpp)
2170 return 0; 2169 return 0;
2171 ASSERT(!XFS_BUF_GETERROR(*bpp)); 2170 ASSERT(!(*bpp)->b_error);
2172 2171
2173 agf = XFS_BUF_TO_AGF(*bpp); 2172 agf = XFS_BUF_TO_AGF(*bpp);
2174 pag = xfs_perag_get(mp, agno); 2173 pag = xfs_perag_get(mp, agno);
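Each ASSERT(bp); ASSERT(!XFS_BUF_GETERROR(bp)); pair in these xfs hunks collapses into a single ASSERT(!xfs_buf_geterror(bp)), which only works if the new helper tolerates a NULL buffer. A sketch of such a NULL-safe accessor; treating a missing buffer as ENOMEM is an assumption, chosen so the combined assertion still trips on NULL:

/*
 * NULL-safe error accessor (sketch). A missing buffer reads as a
 * nonzero error, so one assertion covers both "bp was returned"
 * and "bp carries no I/O error".
 */
static inline int xfs_buf_geterror(struct xfs_buf *bp)
{
	return bp ? bp->b_error : ENOMEM;
}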
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/xfs_aops.c
index 63e971e2b837..8c37dde4c521 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1300,6 +1300,7 @@ xfs_end_io_direct_write(
1300 bool is_async) 1300 bool is_async)
1301{ 1301{
1302 struct xfs_ioend *ioend = iocb->private; 1302 struct xfs_ioend *ioend = iocb->private;
1303 struct inode *inode = ioend->io_inode;
1303 1304
1304 /* 1305 /*
1305 * blockdev_direct_IO can return an error even after the I/O 1306 * blockdev_direct_IO can return an error even after the I/O
@@ -1331,7 +1332,7 @@ xfs_end_io_direct_write(
1331 } 1332 }
1332 1333
1333 /* XXX: probably should move into the real I/O completion handler */ 1334 /* XXX: probably should move into the real I/O completion handler */
1334 inode_dio_done(ioend->io_inode); 1335 inode_dio_done(inode);
1335} 1336}
1336 1337
1337STATIC ssize_t 1338STATIC ssize_t
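The xfs_aops.c hunk copies ioend->io_inode into a local before the ioend completes, because completion can free the ioend; the old inode_dio_done(ioend->io_inode) at the end of the function would then dereference freed memory. The general shape of the fix, with a hypothetical stand-in for the completion call elided from this hunk:

struct inode *inode = ioend->io_inode;	/* cache before completion */

complete_ioend(ioend);		/* hypothetical stand-in; may free ioend */
inode_dio_done(inode);		/* safe: uses only the cached pointer */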
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/xfs_aops.h
index 71f721e1a71f..71f721e1a71f 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index cbae424fe1ba..160bcdc34a6e 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -2121,8 +2121,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
2121 2121
2122 bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt, 2122 bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt,
2123 XBF_LOCK | XBF_DONT_BLOCK); 2123 XBF_LOCK | XBF_DONT_BLOCK);
2124 ASSERT(bp); 2124 ASSERT(!xfs_buf_geterror(bp));
2125 ASSERT(!XFS_BUF_GETERROR(bp));
2126 2125
2127 tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen : 2126 tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen :
2128 XFS_BUF_SIZE(bp); 2127 XFS_BUF_SIZE(bp);
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index c51a3f903633..452a291383ab 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -414,7 +414,7 @@ xfs_bmap_add_attrfork_local(
414 414
415 if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip)) 415 if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip))
416 return 0; 416 return 0;
417 if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { 417 if (S_ISDIR(ip->i_d.di_mode)) {
418 mp = ip->i_mount; 418 mp = ip->i_mount;
419 memset(&dargs, 0, sizeof(dargs)); 419 memset(&dargs, 0, sizeof(dargs));
420 dargs.dp = ip; 420 dargs.dp = ip;
@@ -3344,8 +3344,7 @@ xfs_bmap_local_to_extents(
3344 * We don't want to deal with the case of keeping inode data inline yet. 3344 * We don't want to deal with the case of keeping inode data inline yet.
3345 * So sending the data fork of a regular inode is invalid. 3345 * So sending the data fork of a regular inode is invalid.
3346 */ 3346 */
3347 ASSERT(!((ip->i_d.di_mode & S_IFMT) == S_IFREG && 3347 ASSERT(!(S_ISREG(ip->i_d.di_mode) && whichfork == XFS_DATA_FORK));
3348 whichfork == XFS_DATA_FORK));
3349 ifp = XFS_IFORK_PTR(ip, whichfork); 3348 ifp = XFS_IFORK_PTR(ip, whichfork);
3350 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); 3349 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
3351 flags = 0; 3350 flags = 0;
@@ -3384,8 +3383,7 @@ xfs_bmap_local_to_extents(
3384 ASSERT(args.len == 1); 3383 ASSERT(args.len == 1);
3385 *firstblock = args.fsbno; 3384 *firstblock = args.fsbno;
3386 bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0); 3385 bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
3387 memcpy((char *)XFS_BUF_PTR(bp), ifp->if_u1.if_data, 3386 memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
3388 ifp->if_bytes);
3389 xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); 3387 xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
3390 xfs_bmap_forkoff_reset(args.mp, ip, whichfork); 3388 xfs_bmap_forkoff_reset(args.mp, ip, whichfork);
3391 xfs_idata_realloc(ip, -ifp->if_bytes, whichfork); 3389 xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
@@ -4052,7 +4050,7 @@ xfs_bmap_one_block(
4052 4050
4053#ifndef DEBUG 4051#ifndef DEBUG
4054 if (whichfork == XFS_DATA_FORK) { 4052 if (whichfork == XFS_DATA_FORK) {
4055 return ((ip->i_d.di_mode & S_IFMT) == S_IFREG) ? 4053 return S_ISREG(ip->i_d.di_mode) ?
4056 (ip->i_size == ip->i_mount->m_sb.sb_blocksize) : 4054 (ip->i_size == ip->i_mount->m_sb.sb_blocksize) :
4057 (ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize); 4055 (ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize);
4058 } 4056 }
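This xfs_bmap.c hunk begins a conversion that repeats through the rest of the xfs diffs: open-coded mode tests become the standard S_ISREG()/S_ISDIR() predicates. The two spellings are exactly equivalent given the classic definitions, and the predicates also correct sloppier bitwise tests:

/* Classic definitions, per the kernel's stat headers: */
#define S_ISREG(m)	(((m) & S_IFMT) == S_IFREG)
#define S_ISDIR(m)	(((m) & S_IFMT) == S_IFDIR)

/*
 * So (mode & S_IFMT) == S_IFDIR rewrites 1:1 to S_ISDIR(mode).
 * Bitwise tests such as (mode & S_IFDIR), seen in the xfs_filestream.c
 * hunks below, were subtly wrong: S_IFBLK (0060000) shares bits with
 * S_IFDIR (0040000), so a block device would have passed that test too.
 */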
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index cabf4b5604aa..2b9fd385e27d 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -275,8 +275,7 @@ xfs_btree_dup_cursor(
275 return error; 275 return error;
276 } 276 }
277 new->bc_bufs[i] = bp; 277 new->bc_bufs[i] = bp;
278 ASSERT(bp); 278 ASSERT(!xfs_buf_geterror(bp));
279 ASSERT(!XFS_BUF_GETERROR(bp));
280 } else 279 } else
281 new->bc_bufs[i] = NULL; 280 new->bc_bufs[i] = NULL;
282 } 281 }
@@ -467,8 +466,7 @@ xfs_btree_get_bufl(
467 ASSERT(fsbno != NULLFSBLOCK); 466 ASSERT(fsbno != NULLFSBLOCK);
468 d = XFS_FSB_TO_DADDR(mp, fsbno); 467 d = XFS_FSB_TO_DADDR(mp, fsbno);
469 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock); 468 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
470 ASSERT(bp); 469 ASSERT(!xfs_buf_geterror(bp));
471 ASSERT(!XFS_BUF_GETERROR(bp));
472 return bp; 470 return bp;
473} 471}
474 472
@@ -491,8 +489,7 @@ xfs_btree_get_bufs(
491 ASSERT(agbno != NULLAGBLOCK); 489 ASSERT(agbno != NULLAGBLOCK);
492 d = XFS_AGB_TO_DADDR(mp, agno, agbno); 490 d = XFS_AGB_TO_DADDR(mp, agno, agbno);
493 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock); 491 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
494 ASSERT(bp); 492 ASSERT(!xfs_buf_geterror(bp));
495 ASSERT(!XFS_BUF_GETERROR(bp));
496 return bp; 493 return bp;
497} 494}
498 495
@@ -632,7 +629,7 @@ xfs_btree_read_bufl(
632 mp->m_bsize, lock, &bp))) { 629 mp->m_bsize, lock, &bp))) {
633 return error; 630 return error;
634 } 631 }
635 ASSERT(!bp || !XFS_BUF_GETERROR(bp)); 632 ASSERT(!xfs_buf_geterror(bp));
636 if (bp) 633 if (bp)
637 XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval); 634 XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval);
638 *bpp = bp; 635 *bpp = bp;
@@ -973,8 +970,7 @@ xfs_btree_get_buf_block(
973 *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, 970 *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d,
974 mp->m_bsize, flags); 971 mp->m_bsize, flags);
975 972
976 ASSERT(*bpp); 973 ASSERT(!xfs_buf_geterror(*bpp));
977 ASSERT(!XFS_BUF_GETERROR(*bpp));
978 974
979 *block = XFS_BUF_TO_BLOCK(*bpp); 975 *block = XFS_BUF_TO_BLOCK(*bpp);
980 return 0; 976 return 0;
@@ -1006,8 +1002,7 @@ xfs_btree_read_buf_block(
1006 if (error) 1002 if (error)
1007 return error; 1003 return error;
1008 1004
1009 ASSERT(*bpp != NULL); 1005 ASSERT(!xfs_buf_geterror(*bpp));
1010 ASSERT(!XFS_BUF_GETERROR(*bpp));
1011 1006
1012 xfs_btree_set_refs(cur, *bpp); 1007 xfs_btree_set_refs(cur, *bpp);
1013 *block = XFS_BUF_TO_BLOCK(*bpp); 1008 *block = XFS_BUF_TO_BLOCK(*bpp);
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 8d05a6a46ce3..5b240de104c0 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -262,7 +262,7 @@ typedef struct xfs_btree_cur
262/* 262/*
263 * Convert from buffer to btree block header. 263 * Convert from buffer to btree block header.
264 */ 264 */
265#define XFS_BUF_TO_BLOCK(bp) ((struct xfs_btree_block *)XFS_BUF_PTR(bp)) 265#define XFS_BUF_TO_BLOCK(bp) ((struct xfs_btree_block *)((bp)->b_addr))
266 266
267 267
268/* 268/*
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/xfs_buf.c
index b2b411985591..c57836dc778f 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -596,7 +596,7 @@ _xfs_buf_read(
596 bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD); 596 bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);
597 597
598 status = xfs_buf_iorequest(bp); 598 status = xfs_buf_iorequest(bp);
599 if (status || XFS_BUF_ISERROR(bp) || (flags & XBF_ASYNC)) 599 if (status || bp->b_error || (flags & XBF_ASYNC))
600 return status; 600 return status;
601 return xfs_buf_iowait(bp); 601 return xfs_buf_iowait(bp);
602} 602}
@@ -679,7 +679,6 @@ xfs_buf_read_uncached(
679 /* set up the buffer for a read IO */ 679 /* set up the buffer for a read IO */
680 XFS_BUF_SET_ADDR(bp, daddr); 680 XFS_BUF_SET_ADDR(bp, daddr);
681 XFS_BUF_READ(bp); 681 XFS_BUF_READ(bp);
682 XFS_BUF_BUSY(bp);
683 682
684 xfsbdstrat(mp, bp); 683 xfsbdstrat(mp, bp);
685 error = xfs_buf_iowait(bp); 684 error = xfs_buf_iowait(bp);
@@ -1069,7 +1068,7 @@ xfs_bioerror(
1069 /* 1068 /*
1070 * No need to wait until the buffer is unpinned, we aren't flushing it. 1069 * No need to wait until the buffer is unpinned, we aren't flushing it.
1071 */ 1070 */
1072 XFS_BUF_ERROR(bp, EIO); 1071 xfs_buf_ioerror(bp, EIO);
1073 1072
1074 /* 1073 /*
1075 * We're calling xfs_buf_ioend, so delete XBF_DONE flag. 1074 * We're calling xfs_buf_ioend, so delete XBF_DONE flag.
@@ -1094,7 +1093,7 @@ STATIC int
1094xfs_bioerror_relse( 1093xfs_bioerror_relse(
1095 struct xfs_buf *bp) 1094 struct xfs_buf *bp)
1096{ 1095{
1097 int64_t fl = XFS_BUF_BFLAGS(bp); 1096 int64_t fl = bp->b_flags;
1098 /* 1097 /*
1099 * No need to wait until the buffer is unpinned. 1098 * No need to wait until the buffer is unpinned.
1100 * We aren't flushing it. 1099 * We aren't flushing it.
@@ -1115,7 +1114,7 @@ xfs_bioerror_relse(
1115 * There's no reason to mark error for 1114 * There's no reason to mark error for
1116 * ASYNC buffers. 1115 * ASYNC buffers.
1117 */ 1116 */
1118 XFS_BUF_ERROR(bp, EIO); 1117 xfs_buf_ioerror(bp, EIO);
1119 XFS_BUF_FINISH_IOWAIT(bp); 1118 XFS_BUF_FINISH_IOWAIT(bp);
1120 } else { 1119 } else {
1121 xfs_buf_relse(bp); 1120 xfs_buf_relse(bp);
@@ -1224,6 +1223,9 @@ _xfs_buf_ioapply(
1224 rw = READ; 1223 rw = READ;
1225 } 1224 }
1226 1225
1226 /* we only use the buffer cache for meta-data */
1227 rw |= REQ_META;
1228
1227next_chunk: 1229next_chunk:
1228 atomic_inc(&bp->b_io_remaining); 1230 atomic_inc(&bp->b_io_remaining);
1229 nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT); 1231 nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT);
@@ -1321,7 +1323,7 @@ xfs_buf_offset(
1321 struct page *page; 1323 struct page *page;
1322 1324
1323 if (bp->b_flags & XBF_MAPPED) 1325 if (bp->b_flags & XBF_MAPPED)
1324 return XFS_BUF_PTR(bp) + offset; 1326 return bp->b_addr + offset;
1325 1327
1326 offset += bp->b_offset; 1328 offset += bp->b_offset;
1327 page = bp->b_pages[offset >> PAGE_SHIFT]; 1329 page = bp->b_pages[offset >> PAGE_SHIFT];
@@ -1481,7 +1483,7 @@ xfs_setsize_buftarg_flags(
1481 if (set_blocksize(btp->bt_bdev, sectorsize)) { 1483 if (set_blocksize(btp->bt_bdev, sectorsize)) {
1482 xfs_warn(btp->bt_mount, 1484 xfs_warn(btp->bt_mount,
1483 "Cannot set_blocksize to %u on device %s\n", 1485 "Cannot set_blocksize to %u on device %s\n",
1484 sectorsize, XFS_BUFTARG_NAME(btp)); 1486 sectorsize, xfs_buf_target_name(btp));
1485 return EINVAL; 1487 return EINVAL;
1486 } 1488 }
1487 1489
@@ -1678,7 +1680,7 @@ xfs_buf_delwri_split(
1678 list_for_each_entry_safe(bp, n, dwq, b_list) { 1680 list_for_each_entry_safe(bp, n, dwq, b_list) {
1679 ASSERT(bp->b_flags & XBF_DELWRI); 1681 ASSERT(bp->b_flags & XBF_DELWRI);
1680 1682
1681 if (!XFS_BUF_ISPINNED(bp) && xfs_buf_trylock(bp)) { 1683 if (!xfs_buf_ispinned(bp) && xfs_buf_trylock(bp)) {
1682 if (!force && 1684 if (!force &&
1683 time_before(jiffies, bp->b_queuetime + age)) { 1685 time_before(jiffies, bp->b_queuetime + age)) {
1684 xfs_buf_unlock(bp); 1686 xfs_buf_unlock(bp);
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/xfs_buf.h
index 6a83b46b4bcf..620972b8094d 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -228,11 +228,15 @@ extern void xfs_buf_delwri_promote(xfs_buf_t *);
228extern int xfs_buf_init(void); 228extern int xfs_buf_init(void);
229extern void xfs_buf_terminate(void); 229extern void xfs_buf_terminate(void);
230 230
231#define xfs_buf_target_name(target) \ 231static inline const char *
232 ({ char __b[BDEVNAME_SIZE]; bdevname((target)->bt_bdev, __b); __b; }) 232xfs_buf_target_name(struct xfs_buftarg *target)
233{
234 static char __b[BDEVNAME_SIZE];
235
236 return bdevname(target->bt_bdev, __b);
237}
233 238
234 239
235#define XFS_BUF_BFLAGS(bp) ((bp)->b_flags)
236#define XFS_BUF_ZEROFLAGS(bp) \ 240#define XFS_BUF_ZEROFLAGS(bp) \
237 ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI| \ 241 ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI| \
238 XBF_SYNCIO|XBF_FUA|XBF_FLUSH)) 242 XBF_SYNCIO|XBF_FUA|XBF_FLUSH))
@@ -251,23 +255,14 @@ void xfs_buf_stale(struct xfs_buf *bp);
251#define XFS_BUF_UNDELAYWRITE(bp) xfs_buf_delwri_dequeue(bp) 255#define XFS_BUF_UNDELAYWRITE(bp) xfs_buf_delwri_dequeue(bp)
252#define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI) 256#define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI)
253 257
254#define XFS_BUF_ERROR(bp,no) xfs_buf_ioerror(bp,no)
255#define XFS_BUF_GETERROR(bp) xfs_buf_geterror(bp)
256#define XFS_BUF_ISERROR(bp) (xfs_buf_geterror(bp) ? 1 : 0)
257
258#define XFS_BUF_DONE(bp) ((bp)->b_flags |= XBF_DONE) 258#define XFS_BUF_DONE(bp) ((bp)->b_flags |= XBF_DONE)
259#define XFS_BUF_UNDONE(bp) ((bp)->b_flags &= ~XBF_DONE) 259#define XFS_BUF_UNDONE(bp) ((bp)->b_flags &= ~XBF_DONE)
260#define XFS_BUF_ISDONE(bp) ((bp)->b_flags & XBF_DONE) 260#define XFS_BUF_ISDONE(bp) ((bp)->b_flags & XBF_DONE)
261 261
262#define XFS_BUF_BUSY(bp) do { } while (0)
263#define XFS_BUF_UNBUSY(bp) do { } while (0)
264#define XFS_BUF_ISBUSY(bp) (1)
265
266#define XFS_BUF_ASYNC(bp) ((bp)->b_flags |= XBF_ASYNC) 262#define XFS_BUF_ASYNC(bp) ((bp)->b_flags |= XBF_ASYNC)
267#define XFS_BUF_UNASYNC(bp) ((bp)->b_flags &= ~XBF_ASYNC) 263#define XFS_BUF_UNASYNC(bp) ((bp)->b_flags &= ~XBF_ASYNC)
268#define XFS_BUF_ISASYNC(bp) ((bp)->b_flags & XBF_ASYNC) 264#define XFS_BUF_ISASYNC(bp) ((bp)->b_flags & XBF_ASYNC)
269 265
270#define XFS_BUF_HOLD(bp) xfs_buf_hold(bp)
271#define XFS_BUF_READ(bp) ((bp)->b_flags |= XBF_READ) 266#define XFS_BUF_READ(bp) ((bp)->b_flags |= XBF_READ)
272#define XFS_BUF_UNREAD(bp) ((bp)->b_flags &= ~XBF_READ) 267#define XFS_BUF_UNREAD(bp) ((bp)->b_flags &= ~XBF_READ)
273#define XFS_BUF_ISREAD(bp) ((bp)->b_flags & XBF_READ) 268#define XFS_BUF_ISREAD(bp) ((bp)->b_flags & XBF_READ)
@@ -276,10 +271,6 @@ void xfs_buf_stale(struct xfs_buf *bp);
276#define XFS_BUF_UNWRITE(bp) ((bp)->b_flags &= ~XBF_WRITE) 271#define XFS_BUF_UNWRITE(bp) ((bp)->b_flags &= ~XBF_WRITE)
277#define XFS_BUF_ISWRITE(bp) ((bp)->b_flags & XBF_WRITE) 272#define XFS_BUF_ISWRITE(bp) ((bp)->b_flags & XBF_WRITE)
278 273
279#define XFS_BUF_SET_START(bp) do { } while (0)
280
281#define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->b_addr)
282#define XFS_BUF_SET_PTR(bp, val, cnt) xfs_buf_associate_memory(bp, val, cnt)
283#define XFS_BUF_ADDR(bp) ((bp)->b_bn) 274#define XFS_BUF_ADDR(bp) ((bp)->b_bn)
284#define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_bn = (xfs_daddr_t)(bno)) 275#define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_bn = (xfs_daddr_t)(bno))
285#define XFS_BUF_OFFSET(bp) ((bp)->b_file_offset) 276#define XFS_BUF_OFFSET(bp) ((bp)->b_file_offset)
@@ -299,14 +290,13 @@ xfs_buf_set_ref(
299#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) xfs_buf_set_ref(bp, ref) 290#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) xfs_buf_set_ref(bp, ref)
300#define XFS_BUF_SET_VTYPE(bp, type) do { } while (0) 291#define XFS_BUF_SET_VTYPE(bp, type) do { } while (0)
301 292
302#define XFS_BUF_ISPINNED(bp) atomic_read(&((bp)->b_pin_count)) 293static inline int xfs_buf_ispinned(struct xfs_buf *bp)
294{
295 return atomic_read(&bp->b_pin_count);
296}
303 297
304#define XFS_BUF_FINISH_IOWAIT(bp) complete(&bp->b_iowait); 298#define XFS_BUF_FINISH_IOWAIT(bp) complete(&bp->b_iowait);
305 299
306#define XFS_BUF_SET_TARGET(bp, target) ((bp)->b_target = (target))
307#define XFS_BUF_TARGET(bp) ((bp)->b_target)
308#define XFS_BUFTARG_NAME(target) xfs_buf_target_name(target)
309
310static inline void xfs_buf_relse(xfs_buf_t *bp) 300static inline void xfs_buf_relse(xfs_buf_t *bp)
311{ 301{
312 xfs_buf_unlock(bp); 302 xfs_buf_unlock(bp);
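Several xfs_buf.h accessors turn from macros into static inline functions here (xfs_buf_target_name, xfs_buf_ispinned): inlines get real parameter types, evaluate their argument exactly once, and cost nothing at runtime. The pinned-count accessor makes a compact before/after:

/* Macro form: no type checking; bp is substituted textually. */
#define XFS_BUF_ISPINNED(bp)	atomic_read(&((bp)->b_pin_count))

/* Inline form, as in the hunk above: bp must really be a
 * struct xfs_buf *, and it is evaluated exactly once. */
static inline int xfs_buf_ispinned(struct xfs_buf *bp)
{
	return atomic_read(&bp->b_pin_count);
}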
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 88492916c3dc..cac2ecfa6746 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -124,9 +124,9 @@ xfs_buf_item_log_check(
124 124
125 bp = bip->bli_buf; 125 bp = bip->bli_buf;
126 ASSERT(XFS_BUF_COUNT(bp) > 0); 126 ASSERT(XFS_BUF_COUNT(bp) > 0);
127 ASSERT(XFS_BUF_PTR(bp) != NULL); 127 ASSERT(bp->b_addr != NULL);
128 orig = bip->bli_orig; 128 orig = bip->bli_orig;
129 buffer = XFS_BUF_PTR(bp); 129 buffer = bp->b_addr;
130 for (x = 0; x < XFS_BUF_COUNT(bp); x++) { 130 for (x = 0; x < XFS_BUF_COUNT(bp); x++) {
131 if (orig[x] != buffer[x] && !btst(bip->bli_logged, x)) { 131 if (orig[x] != buffer[x] && !btst(bip->bli_logged, x)) {
132 xfs_emerg(bp->b_mount, 132 xfs_emerg(bp->b_mount,
@@ -371,7 +371,6 @@ xfs_buf_item_pin(
371{ 371{
372 struct xfs_buf_log_item *bip = BUF_ITEM(lip); 372 struct xfs_buf_log_item *bip = BUF_ITEM(lip);
373 373
374 ASSERT(XFS_BUF_ISBUSY(bip->bli_buf));
375 ASSERT(atomic_read(&bip->bli_refcount) > 0); 374 ASSERT(atomic_read(&bip->bli_refcount) > 0);
376 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || 375 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
377 (bip->bli_flags & XFS_BLI_STALE)); 376 (bip->bli_flags & XFS_BLI_STALE));
@@ -479,13 +478,13 @@ xfs_buf_item_trylock(
479 struct xfs_buf_log_item *bip = BUF_ITEM(lip); 478 struct xfs_buf_log_item *bip = BUF_ITEM(lip);
480 struct xfs_buf *bp = bip->bli_buf; 479 struct xfs_buf *bp = bip->bli_buf;
481 480
482 if (XFS_BUF_ISPINNED(bp)) 481 if (xfs_buf_ispinned(bp))
483 return XFS_ITEM_PINNED; 482 return XFS_ITEM_PINNED;
484 if (!xfs_buf_trylock(bp)) 483 if (!xfs_buf_trylock(bp))
485 return XFS_ITEM_LOCKED; 484 return XFS_ITEM_LOCKED;
486 485
487 /* take a reference to the buffer. */ 486 /* take a reference to the buffer. */
488 XFS_BUF_HOLD(bp); 487 xfs_buf_hold(bp);
489 488
490 ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); 489 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
491 trace_xfs_buf_item_trylock(bip); 490 trace_xfs_buf_item_trylock(bip);
@@ -726,7 +725,7 @@ xfs_buf_item_init(
726 * to have logged. 725 * to have logged.
727 */ 726 */
728 bip->bli_orig = (char *)kmem_alloc(XFS_BUF_COUNT(bp), KM_SLEEP); 727 bip->bli_orig = (char *)kmem_alloc(XFS_BUF_COUNT(bp), KM_SLEEP);
729 memcpy(bip->bli_orig, XFS_BUF_PTR(bp), XFS_BUF_COUNT(bp)); 728 memcpy(bip->bli_orig, bp->b_addr, XFS_BUF_COUNT(bp));
730 bip->bli_logged = (char *)kmem_zalloc(XFS_BUF_COUNT(bp) / NBBY, KM_SLEEP); 729 bip->bli_logged = (char *)kmem_zalloc(XFS_BUF_COUNT(bp) / NBBY, KM_SLEEP);
731#endif 730#endif
732 731
@@ -895,7 +894,6 @@ xfs_buf_attach_iodone(
895{ 894{
896 xfs_log_item_t *head_lip; 895 xfs_log_item_t *head_lip;
897 896
898 ASSERT(XFS_BUF_ISBUSY(bp));
899 ASSERT(xfs_buf_islocked(bp)); 897 ASSERT(xfs_buf_islocked(bp));
900 898
901 lip->li_cb = cb; 899 lip->li_cb = cb;
@@ -960,7 +958,7 @@ xfs_buf_iodone_callbacks(
960 static ulong lasttime; 958 static ulong lasttime;
961 static xfs_buftarg_t *lasttarg; 959 static xfs_buftarg_t *lasttarg;
962 960
963 if (likely(!XFS_BUF_GETERROR(bp))) 961 if (likely(!xfs_buf_geterror(bp)))
964 goto do_callbacks; 962 goto do_callbacks;
965 963
966 /* 964 /*
@@ -973,14 +971,14 @@ xfs_buf_iodone_callbacks(
973 goto do_callbacks; 971 goto do_callbacks;
974 } 972 }
975 973
976 if (XFS_BUF_TARGET(bp) != lasttarg || 974 if (bp->b_target != lasttarg ||
977 time_after(jiffies, (lasttime + 5*HZ))) { 975 time_after(jiffies, (lasttime + 5*HZ))) {
978 lasttime = jiffies; 976 lasttime = jiffies;
979 xfs_alert(mp, "Device %s: metadata write error block 0x%llx", 977 xfs_alert(mp, "Device %s: metadata write error block 0x%llx",
980 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), 978 xfs_buf_target_name(bp->b_target),
981 (__uint64_t)XFS_BUF_ADDR(bp)); 979 (__uint64_t)XFS_BUF_ADDR(bp));
982 } 980 }
983 lasttarg = XFS_BUF_TARGET(bp); 981 lasttarg = bp->b_target;
984 982
985 /* 983 /*
986 * If the write was asynchronous then no one will be looking for the 984 * If the write was asynchronous then no one will be looking for the
@@ -991,12 +989,11 @@ xfs_buf_iodone_callbacks(
991 * around. 989 * around.
992 */ 990 */
993 if (XFS_BUF_ISASYNC(bp)) { 991 if (XFS_BUF_ISASYNC(bp)) {
994 XFS_BUF_ERROR(bp, 0); /* errno of 0 unsets the flag */ 992 xfs_buf_ioerror(bp, 0); /* errno of 0 unsets the flag */
995 993
996 if (!XFS_BUF_ISSTALE(bp)) { 994 if (!XFS_BUF_ISSTALE(bp)) {
997 XFS_BUF_DELAYWRITE(bp); 995 XFS_BUF_DELAYWRITE(bp);
998 XFS_BUF_DONE(bp); 996 XFS_BUF_DONE(bp);
999 XFS_BUF_SET_START(bp);
1000 } 997 }
1001 ASSERT(bp->b_iodone != NULL); 998 ASSERT(bp->b_iodone != NULL);
1002 trace_xfs_buf_item_iodone_async(bp, _RET_IP_); 999 trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
@@ -1013,7 +1010,6 @@ xfs_buf_iodone_callbacks(
1013 XFS_BUF_UNDELAYWRITE(bp); 1010 XFS_BUF_UNDELAYWRITE(bp);
1014 1011
1015 trace_xfs_buf_error_relse(bp, _RET_IP_); 1012 trace_xfs_buf_error_relse(bp, _RET_IP_);
1016 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1017 1013
1018do_callbacks: 1014do_callbacks:
1019 xfs_buf_do_callbacks(bp); 1015 xfs_buf_do_callbacks(bp);
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 2925726529f8..ee9d5427fcd4 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -692,6 +692,24 @@ xfs_da_join(xfs_da_state_t *state)
692 return(error); 692 return(error);
693} 693}
694 694
695#ifdef DEBUG
696static void
697xfs_da_blkinfo_onlychild_validate(struct xfs_da_blkinfo *blkinfo, __u16 level)
698{
699 __be16 magic = blkinfo->magic;
700
701 if (level == 1) {
702 ASSERT(magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
703 magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
704 } else
705 ASSERT(magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
706 ASSERT(!blkinfo->forw);
707 ASSERT(!blkinfo->back);
708}
709#else /* !DEBUG */
710#define xfs_da_blkinfo_onlychild_validate(blkinfo, level)
711#endif /* !DEBUG */
712
695/* 713/*
696 * We have only one entry in the root. Copy the only remaining child of 714 * We have only one entry in the root. Copy the only remaining child of
697 * the old root to block 0 as the new root node. 715 * the old root to block 0 as the new root node.
@@ -700,8 +718,6 @@ STATIC int
700xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk) 718xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk)
701{ 719{
702 xfs_da_intnode_t *oldroot; 720 xfs_da_intnode_t *oldroot;
703 /* REFERENCED */
704 xfs_da_blkinfo_t *blkinfo;
705 xfs_da_args_t *args; 721 xfs_da_args_t *args;
706 xfs_dablk_t child; 722 xfs_dablk_t child;
707 xfs_dabuf_t *bp; 723 xfs_dabuf_t *bp;
@@ -732,15 +748,9 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk)
732 if (error) 748 if (error)
733 return(error); 749 return(error);
734 ASSERT(bp != NULL); 750 ASSERT(bp != NULL);
735 blkinfo = bp->data; 751 xfs_da_blkinfo_onlychild_validate(bp->data,
736 if (be16_to_cpu(oldroot->hdr.level) == 1) { 752 be16_to_cpu(oldroot->hdr.level));
737 ASSERT(blkinfo->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || 753
738 blkinfo->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
739 } else {
740 ASSERT(blkinfo->magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
741 }
742 ASSERT(!blkinfo->forw);
743 ASSERT(!blkinfo->back);
744 memcpy(root_blk->bp->data, bp->data, state->blocksize); 754 memcpy(root_blk->bp->data, bp->data, state->blocksize);
745 xfs_da_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1); 755 xfs_da_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1);
746 error = xfs_da_shrink_inode(args, child, bp); 756 error = xfs_da_shrink_inode(args, child, bp);
@@ -2040,7 +2050,7 @@ xfs_da_do_buf(
2040 case 0: 2050 case 0:
2041 bp = xfs_trans_get_buf(trans, mp->m_ddev_targp, 2051 bp = xfs_trans_get_buf(trans, mp->m_ddev_targp,
2042 mappedbno, nmapped, 0); 2052 mappedbno, nmapped, 0);
2043 error = bp ? XFS_BUF_GETERROR(bp) : XFS_ERROR(EIO); 2053 error = bp ? bp->b_error : XFS_ERROR(EIO);
2044 break; 2054 break;
2045 case 1: 2055 case 1:
2046 case 2: 2056 case 2:
@@ -2258,7 +2268,7 @@ xfs_da_buf_make(int nbuf, xfs_buf_t **bps)
2258 dabuf->nbuf = 1; 2268 dabuf->nbuf = 1;
2259 bp = bps[0]; 2269 bp = bps[0];
2260 dabuf->bbcount = (short)BTOBB(XFS_BUF_COUNT(bp)); 2270 dabuf->bbcount = (short)BTOBB(XFS_BUF_COUNT(bp));
2261 dabuf->data = XFS_BUF_PTR(bp); 2271 dabuf->data = bp->b_addr;
2262 dabuf->bps[0] = bp; 2272 dabuf->bps[0] = bp;
2263 } else { 2273 } else {
2264 dabuf->nbuf = nbuf; 2274 dabuf->nbuf = nbuf;
@@ -2269,7 +2279,7 @@ xfs_da_buf_make(int nbuf, xfs_buf_t **bps)
2269 dabuf->data = kmem_alloc(BBTOB(dabuf->bbcount), KM_SLEEP); 2279 dabuf->data = kmem_alloc(BBTOB(dabuf->bbcount), KM_SLEEP);
2270 for (i = off = 0; i < nbuf; i++, off += XFS_BUF_COUNT(bp)) { 2280 for (i = off = 0; i < nbuf; i++, off += XFS_BUF_COUNT(bp)) {
2271 bp = bps[i]; 2281 bp = bps[i];
2272 memcpy((char *)dabuf->data + off, XFS_BUF_PTR(bp), 2282 memcpy((char *)dabuf->data + off, bp->b_addr,
2273 XFS_BUF_COUNT(bp)); 2283 XFS_BUF_COUNT(bp));
2274 } 2284 }
2275 } 2285 }
@@ -2292,8 +2302,8 @@ xfs_da_buf_clean(xfs_dabuf_t *dabuf)
2292 for (i = off = 0; i < dabuf->nbuf; 2302 for (i = off = 0; i < dabuf->nbuf;
2293 i++, off += XFS_BUF_COUNT(bp)) { 2303 i++, off += XFS_BUF_COUNT(bp)) {
2294 bp = dabuf->bps[i]; 2304 bp = dabuf->bps[i];
2295 memcpy(XFS_BUF_PTR(bp), (char *)dabuf->data + off, 2305 memcpy(bp->b_addr, dabuf->data + off,
2296 XFS_BUF_COUNT(bp)); 2306 XFS_BUF_COUNT(bp));
2297 } 2307 }
2298 } 2308 }
2299} 2309}
@@ -2330,7 +2340,7 @@ xfs_da_log_buf(xfs_trans_t *tp, xfs_dabuf_t *dabuf, uint first, uint last)
2330 2340
2331 ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]); 2341 ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]);
2332 if (dabuf->nbuf == 1) { 2342 if (dabuf->nbuf == 1) {
2333 ASSERT(dabuf->data == (void *)XFS_BUF_PTR(dabuf->bps[0])); 2343 ASSERT(dabuf->data == dabuf->bps[0]->b_addr);
2334 xfs_trans_log_buf(tp, dabuf->bps[0], first, last); 2344 xfs_trans_log_buf(tp, dabuf->bps[0], first, last);
2335 return; 2345 return;
2336 } 2346 }
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h
index dffba9ba0db6..a3721633abc8 100644
--- a/fs/xfs/xfs_dinode.h
+++ b/fs/xfs/xfs_dinode.h
@@ -148,7 +148,7 @@ typedef enum xfs_dinode_fmt {
148 be32_to_cpu((dip)->di_nextents) : \ 148 be32_to_cpu((dip)->di_nextents) : \
149 be16_to_cpu((dip)->di_anextents)) 149 be16_to_cpu((dip)->di_anextents))
150 150
151#define XFS_BUF_TO_DINODE(bp) ((xfs_dinode_t *)XFS_BUF_PTR(bp)) 151#define XFS_BUF_TO_DINODE(bp) ((xfs_dinode_t *)((bp)->b_addr))
152 152
153/* 153/*
154 * For block and character special files the 32bit dev_t is stored at the 154 * For block and character special files the 32bit dev_t is stored at the
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index 4580ce00aeb4..a2e27010c7fb 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -121,7 +121,7 @@ xfs_dir_isempty(
121{ 121{
122 xfs_dir2_sf_hdr_t *sfp; 122 xfs_dir2_sf_hdr_t *sfp;
123 123
124 ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); 124 ASSERT(S_ISDIR(dp->i_d.di_mode));
125 if (dp->i_d.di_size == 0) /* might happen during shutdown. */ 125 if (dp->i_d.di_size == 0) /* might happen during shutdown. */
126 return 1; 126 return 1;
127 if (dp->i_d.di_size > XFS_IFORK_DSIZE(dp)) 127 if (dp->i_d.di_size > XFS_IFORK_DSIZE(dp))
@@ -179,7 +179,7 @@ xfs_dir_init(
179 memset((char *)&args, 0, sizeof(args)); 179 memset((char *)&args, 0, sizeof(args));
180 args.dp = dp; 180 args.dp = dp;
181 args.trans = tp; 181 args.trans = tp;
182 ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); 182 ASSERT(S_ISDIR(dp->i_d.di_mode));
183 if ((error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino))) 183 if ((error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino)))
184 return error; 184 return error;
185 return xfs_dir2_sf_create(&args, pdp->i_ino); 185 return xfs_dir2_sf_create(&args, pdp->i_ino);
@@ -202,7 +202,7 @@ xfs_dir_createname(
202 int rval; 202 int rval;
203 int v; /* type-checking value */ 203 int v; /* type-checking value */
204 204
205 ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); 205 ASSERT(S_ISDIR(dp->i_d.di_mode));
206 if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) 206 if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum)))
207 return rval; 207 return rval;
208 XFS_STATS_INC(xs_dir_create); 208 XFS_STATS_INC(xs_dir_create);
@@ -278,7 +278,7 @@ xfs_dir_lookup(
278 int rval; 278 int rval;
279 int v; /* type-checking value */ 279 int v; /* type-checking value */
280 280
281 ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); 281 ASSERT(S_ISDIR(dp->i_d.di_mode));
282 XFS_STATS_INC(xs_dir_lookup); 282 XFS_STATS_INC(xs_dir_lookup);
283 283
284 memset(&args, 0, sizeof(xfs_da_args_t)); 284 memset(&args, 0, sizeof(xfs_da_args_t));
@@ -333,7 +333,7 @@ xfs_dir_removename(
333 int rval; 333 int rval;
334 int v; /* type-checking value */ 334 int v; /* type-checking value */
335 335
336 ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); 336 ASSERT(S_ISDIR(dp->i_d.di_mode));
337 XFS_STATS_INC(xs_dir_remove); 337 XFS_STATS_INC(xs_dir_remove);
338 338
339 memset(&args, 0, sizeof(xfs_da_args_t)); 339 memset(&args, 0, sizeof(xfs_da_args_t));
@@ -382,7 +382,7 @@ xfs_readdir(
382 if (XFS_FORCED_SHUTDOWN(dp->i_mount)) 382 if (XFS_FORCED_SHUTDOWN(dp->i_mount))
383 return XFS_ERROR(EIO); 383 return XFS_ERROR(EIO);
384 384
385 ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); 385 ASSERT(S_ISDIR(dp->i_d.di_mode));
386 XFS_STATS_INC(xs_dir_getdents); 386 XFS_STATS_INC(xs_dir_getdents);
387 387
388 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) 388 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
@@ -414,7 +414,7 @@ xfs_dir_replace(
414 int rval; 414 int rval;
415 int v; /* type-checking value */ 415 int v; /* type-checking value */
416 416
417 ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); 417 ASSERT(S_ISDIR(dp->i_d.di_mode));
418 418
419 if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) 419 if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum)))
420 return rval; 420 return rval;
@@ -464,7 +464,7 @@ xfs_dir_canenter(
464 if (resblks) 464 if (resblks)
465 return 0; 465 return 0;
466 466
467 ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); 467 ASSERT(S_ISDIR(dp->i_d.di_mode));
468 468
469 memset(&args, 0, sizeof(xfs_da_args_t)); 469 memset(&args, 0, sizeof(xfs_da_args_t));
470 args.name = name->name; 470 args.name = name->name;
diff --git a/fs/xfs/linux-2.6/xfs_discard.c b/fs/xfs/xfs_discard.c
index 244e797dae32..244e797dae32 100644
--- a/fs/xfs/linux-2.6/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
diff --git a/fs/xfs/linux-2.6/xfs_discard.h b/fs/xfs/xfs_discard.h
index 344879aea646..344879aea646 100644
--- a/fs/xfs/linux-2.6/xfs_discard.h
+++ b/fs/xfs/xfs_discard.h
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 837f31158d43..db62959bed13 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -318,10 +318,9 @@ xfs_qm_init_dquot_blk(
318 int curid, i; 318 int curid, i;
319 319
320 ASSERT(tp); 320 ASSERT(tp);
321 ASSERT(XFS_BUF_ISBUSY(bp));
322 ASSERT(xfs_buf_islocked(bp)); 321 ASSERT(xfs_buf_islocked(bp));
323 322
324 d = (xfs_dqblk_t *)XFS_BUF_PTR(bp); 323 d = bp->b_addr;
325 324
326 /* 325 /*
327 * ID of the first dquot in the block - id's are zero based. 326 * ID of the first dquot in the block - id's are zero based.
@@ -403,7 +402,7 @@ xfs_qm_dqalloc(
403 dqp->q_blkno, 402 dqp->q_blkno,
404 mp->m_quotainfo->qi_dqchunklen, 403 mp->m_quotainfo->qi_dqchunklen,
405 0); 404 0);
406 if (!bp || (error = XFS_BUF_GETERROR(bp))) 405 if (!bp || (error = xfs_buf_geterror(bp)))
407 goto error1; 406 goto error1;
408 /* 407 /*
409 * Make a chunk of dquots out of this buffer and log 408 * Make a chunk of dquots out of this buffer and log
@@ -534,13 +533,12 @@ xfs_qm_dqtobp(
534 return XFS_ERROR(error); 533 return XFS_ERROR(error);
535 } 534 }
536 535
537 ASSERT(XFS_BUF_ISBUSY(bp));
538 ASSERT(xfs_buf_islocked(bp)); 536 ASSERT(xfs_buf_islocked(bp));
539 537
540 /* 538 /*
541 * calculate the location of the dquot inside the buffer. 539 * calculate the location of the dquot inside the buffer.
542 */ 540 */
543 ddq = (struct xfs_disk_dquot *)(XFS_BUF_PTR(bp) + dqp->q_bufoffset); 541 ddq = bp->b_addr + dqp->q_bufoffset;
544 542
545 /* 543 /*
546 * A simple sanity check in case we got a corrupted dquot... 544 * A simple sanity check in case we got a corrupted dquot...
@@ -553,7 +551,6 @@ xfs_qm_dqtobp(
553 xfs_trans_brelse(tp, bp); 551 xfs_trans_brelse(tp, bp);
554 return XFS_ERROR(EIO); 552 return XFS_ERROR(EIO);
555 } 553 }
556 XFS_BUF_BUSY(bp); /* We dirtied this */
557 } 554 }
558 555
559 *O_bpp = bp; 556 *O_bpp = bp;
@@ -622,7 +619,6 @@ xfs_qm_dqread(
622 * this particular dquot was repaired. We still aren't afraid to 619 * this particular dquot was repaired. We still aren't afraid to
623 * brelse it because we have the changes incore. 620 * brelse it because we have the changes incore.
624 */ 621 */
625 ASSERT(XFS_BUF_ISBUSY(bp));
626 ASSERT(xfs_buf_islocked(bp)); 622 ASSERT(xfs_buf_islocked(bp));
627 xfs_trans_brelse(tp, bp); 623 xfs_trans_brelse(tp, bp);
628 624
@@ -1204,7 +1200,7 @@ xfs_qm_dqflush(
1204 /* 1200 /*
1205 * Calculate the location of the dquot inside the buffer. 1201 * Calculate the location of the dquot inside the buffer.
1206 */ 1202 */
1207 ddqp = (struct xfs_disk_dquot *)(XFS_BUF_PTR(bp) + dqp->q_bufoffset); 1203 ddqp = bp->b_addr + dqp->q_bufoffset;
1208 1204
1209 /* 1205 /*
1210 * A simple sanity check in case we got a corrupted dquot.. 1206 * A simple sanity check in case we got a corrupted dquot..
@@ -1240,7 +1236,7 @@ xfs_qm_dqflush(
1240 * If the buffer is pinned then push on the log so we won't 1236 * If the buffer is pinned then push on the log so we won't
1241 * get stuck waiting in the write for too long. 1237 * get stuck waiting in the write for too long.
1242 */ 1238 */
1243 if (XFS_BUF_ISPINNED(bp)) { 1239 if (xfs_buf_ispinned(bp)) {
1244 trace_xfs_dqflush_force(dqp); 1240 trace_xfs_dqflush_force(dqp);
1245 xfs_log_force(mp, 0); 1241 xfs_log_force(mp, 0);
1246 } 1242 }
@@ -1447,7 +1443,7 @@ xfs_qm_dqflock_pushbuf_wait(
1447 goto out_lock; 1443 goto out_lock;
1448 1444
1449 if (XFS_BUF_ISDELAYWRITE(bp)) { 1445 if (XFS_BUF_ISDELAYWRITE(bp)) {
1450 if (XFS_BUF_ISPINNED(bp)) 1446 if (xfs_buf_ispinned(bp))
1451 xfs_log_force(mp, 0); 1447 xfs_log_force(mp, 0);
1452 xfs_buf_delwri_promote(bp); 1448 xfs_buf_delwri_promote(bp);
1453 wake_up_process(bp->b_target->bt_task); 1449 wake_up_process(bp->b_target->bt_task);
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 34b7e945dbfa..34b7e945dbfa 100644
--- a/fs/xfs/quota/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index 9e0e2fa3f2c8..9e0e2fa3f2c8 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
diff --git a/fs/xfs/quota/xfs_dquot_item.h b/fs/xfs/xfs_dquot_item.h
index 5acae2ada70b..5acae2ada70b 100644
--- a/fs/xfs/quota/xfs_dquot_item.h
+++ b/fs/xfs/xfs_dquot_item.h
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/xfs_export.c
index 75e5d322e48f..75e5d322e48f 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/xfs_export.c
diff --git a/fs/xfs/linux-2.6/xfs_export.h b/fs/xfs/xfs_export.h
index 3272b6ae7a35..3272b6ae7a35 100644
--- a/fs/xfs/linux-2.6/xfs_export.h
+++ b/fs/xfs/xfs_export.h
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/xfs_file.c
index 825390e1c138..7f7b42469ea7 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -149,7 +149,9 @@ xfs_file_fsync(
149 149
150 xfs_iflags_clear(ip, XFS_ITRUNCATED); 150 xfs_iflags_clear(ip, XFS_ITRUNCATED);
151 151
152 xfs_ilock(ip, XFS_IOLOCK_SHARED);
152 xfs_ioend_wait(ip); 153 xfs_ioend_wait(ip);
154 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
153 155
154 if (mp->m_flags & XFS_MOUNT_BARRIER) { 156 if (mp->m_flags & XFS_MOUNT_BARRIER) {
155 /* 157 /*
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 9124425b7f2f..3ff3d9e23ded 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -344,9 +344,9 @@ _xfs_filestream_update_ag(
344 * Either ip is a regular file and pip is a directory, or ip is a 344 * Either ip is a regular file and pip is a directory, or ip is a
345 * directory and pip is NULL. 345 * directory and pip is NULL.
346 */ 346 */
347 ASSERT(ip && (((ip->i_d.di_mode & S_IFREG) && pip && 347 ASSERT(ip && ((S_ISREG(ip->i_d.di_mode) && pip &&
348 (pip->i_d.di_mode & S_IFDIR)) || 348 S_ISDIR(pip->i_d.di_mode)) ||
349 ((ip->i_d.di_mode & S_IFDIR) && !pip))); 349 (S_ISDIR(ip->i_d.di_mode) && !pip)));
350 350
351 mp = ip->i_mount; 351 mp = ip->i_mount;
352 cache = mp->m_filestream; 352 cache = mp->m_filestream;
@@ -537,7 +537,7 @@ xfs_filestream_lookup_ag(
537 xfs_agnumber_t ag; 537 xfs_agnumber_t ag;
538 int ref; 538 int ref;
539 539
540 if (!(ip->i_d.di_mode & (S_IFREG | S_IFDIR))) { 540 if (!S_ISREG(ip->i_d.di_mode) && !S_ISDIR(ip->i_d.di_mode)) {
541 ASSERT(0); 541 ASSERT(0);
542 return NULLAGNUMBER; 542 return NULLAGNUMBER;
543 } 543 }
@@ -579,9 +579,9 @@ xfs_filestream_associate(
579 xfs_agnumber_t ag, rotorstep, startag; 579 xfs_agnumber_t ag, rotorstep, startag;
580 int err = 0; 580 int err = 0;
581 581
582 ASSERT(pip->i_d.di_mode & S_IFDIR); 582 ASSERT(S_ISDIR(pip->i_d.di_mode));
583 ASSERT(ip->i_d.di_mode & S_IFREG); 583 ASSERT(S_ISREG(ip->i_d.di_mode));
584 if (!(pip->i_d.di_mode & S_IFDIR) || !(ip->i_d.di_mode & S_IFREG)) 584 if (!S_ISDIR(pip->i_d.di_mode) || !S_ISREG(ip->i_d.di_mode))
585 return -EINVAL; 585 return -EINVAL;
586 586
587 mp = pip->i_mount; 587 mp = pip->i_mount;
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/xfs_fs_subr.c
index ed88ed16811c..ed88ed16811c 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/xfs_fs_subr.c
diff --git a/fs/xfs/linux-2.6/xfs_globals.c b/fs/xfs/xfs_globals.c
index 76e81cff70b9..76e81cff70b9 100644
--- a/fs/xfs/linux-2.6/xfs_globals.c
+++ b/fs/xfs/xfs_globals.c
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index dd5628bd8d0b..9f24ec28283b 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -202,8 +202,7 @@ xfs_ialloc_inode_init(
202 fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, 202 fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
203 mp->m_bsize * blks_per_cluster, 203 mp->m_bsize * blks_per_cluster,
204 XBF_LOCK); 204 XBF_LOCK);
205 ASSERT(fbuf); 205 ASSERT(!xfs_buf_geterror(fbuf));
206 ASSERT(!XFS_BUF_GETERROR(fbuf));
207 206
208 /* 207 /*
209 * Initialize all inodes in this buffer and then log them. 208 * Initialize all inodes in this buffer and then log them.
@@ -1486,7 +1485,7 @@ xfs_read_agi(
1486 if (error) 1485 if (error)
1487 return error; 1486 return error;
1488 1487
1489 ASSERT(*bpp && !XFS_BUF_GETERROR(*bpp)); 1488 ASSERT(!xfs_buf_geterror(*bpp));
1490 agi = XFS_BUF_TO_AGI(*bpp); 1489 agi = XFS_BUF_TO_AGI(*bpp);
1491 1490
1492 /* 1491 /*
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 3cc21ddf9f7e..0239a7c7c886 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -368,7 +368,7 @@ xfs_iformat(
368 /* 368 /*
369 * no local regular files yet 369 * no local regular files yet
370 */ 370 */
371 if (unlikely((be16_to_cpu(dip->di_mode) & S_IFMT) == S_IFREG)) { 371 if (unlikely(S_ISREG(be16_to_cpu(dip->di_mode)))) {
372 xfs_warn(ip->i_mount, 372 xfs_warn(ip->i_mount,
373 "corrupt inode %Lu (local format for regular file).", 373 "corrupt inode %Lu (local format for regular file).",
374 (unsigned long long) ip->i_ino); 374 (unsigned long long) ip->i_ino);
@@ -1040,7 +1040,7 @@ xfs_ialloc(
1040 1040
1041 if (pip && XFS_INHERIT_GID(pip)) { 1041 if (pip && XFS_INHERIT_GID(pip)) {
1042 ip->i_d.di_gid = pip->i_d.di_gid; 1042 ip->i_d.di_gid = pip->i_d.di_gid;
1043 if ((pip->i_d.di_mode & S_ISGID) && (mode & S_IFMT) == S_IFDIR) { 1043 if ((pip->i_d.di_mode & S_ISGID) && S_ISDIR(mode)) {
1044 ip->i_d.di_mode |= S_ISGID; 1044 ip->i_d.di_mode |= S_ISGID;
1045 } 1045 }
1046 } 1046 }
@@ -1097,14 +1097,14 @@ xfs_ialloc(
1097 if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) { 1097 if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
1098 uint di_flags = 0; 1098 uint di_flags = 0;
1099 1099
1100 if ((mode & S_IFMT) == S_IFDIR) { 1100 if (S_ISDIR(mode)) {
1101 if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) 1101 if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
1102 di_flags |= XFS_DIFLAG_RTINHERIT; 1102 di_flags |= XFS_DIFLAG_RTINHERIT;
1103 if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) { 1103 if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
1104 di_flags |= XFS_DIFLAG_EXTSZINHERIT; 1104 di_flags |= XFS_DIFLAG_EXTSZINHERIT;
1105 ip->i_d.di_extsize = pip->i_d.di_extsize; 1105 ip->i_d.di_extsize = pip->i_d.di_extsize;
1106 } 1106 }
1107 } else if ((mode & S_IFMT) == S_IFREG) { 1107 } else if (S_ISREG(mode)) {
1108 if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) 1108 if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
1109 di_flags |= XFS_DIFLAG_REALTIME; 1109 di_flags |= XFS_DIFLAG_REALTIME;
1110 if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) { 1110 if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
@@ -1188,7 +1188,7 @@ xfs_isize_check(
1188 int nimaps; 1188 int nimaps;
1189 xfs_bmbt_irec_t imaps[2]; 1189 xfs_bmbt_irec_t imaps[2];
1190 1190
1191 if ((ip->i_d.di_mode & S_IFMT) != S_IFREG) 1191 if (!S_ISREG(ip->i_d.di_mode))
1192 return; 1192 return;
1193 1193
1194 if (XFS_IS_REALTIME_INODE(ip)) 1194 if (XFS_IS_REALTIME_INODE(ip))
@@ -1828,7 +1828,7 @@ xfs_ifree(
1828 ASSERT(ip->i_d.di_nextents == 0); 1828 ASSERT(ip->i_d.di_nextents == 0);
1829 ASSERT(ip->i_d.di_anextents == 0); 1829 ASSERT(ip->i_d.di_anextents == 0);
1830 ASSERT((ip->i_d.di_size == 0 && ip->i_size == 0) || 1830 ASSERT((ip->i_d.di_size == 0 && ip->i_size == 0) ||
1831 ((ip->i_d.di_mode & S_IFMT) != S_IFREG)); 1831 (!S_ISREG(ip->i_d.di_mode)));
1832 ASSERT(ip->i_d.di_nblocks == 0); 1832 ASSERT(ip->i_d.di_nblocks == 0);
1833 1833
1834 /* 1834 /*
@@ -2473,7 +2473,7 @@ cluster_corrupt_out:
2473 if (bp->b_iodone) { 2473 if (bp->b_iodone) {
2474 XFS_BUF_UNDONE(bp); 2474 XFS_BUF_UNDONE(bp);
2475 XFS_BUF_STALE(bp); 2475 XFS_BUF_STALE(bp);
2476 XFS_BUF_ERROR(bp,EIO); 2476 xfs_buf_ioerror(bp, EIO);
2477 xfs_buf_ioend(bp, 0); 2477 xfs_buf_ioend(bp, 0);
2478 } else { 2478 } else {
2479 XFS_BUF_STALE(bp); 2479 XFS_BUF_STALE(bp);
@@ -2585,7 +2585,7 @@ xfs_iflush(
 	 * If the buffer is pinned then push on the log now so we won't
 	 * get stuck waiting in the write for too long.
 	 */
-	if (XFS_BUF_ISPINNED(bp))
+	if (xfs_buf_ispinned(bp))
 		xfs_log_force(mp, 0);
 
 	/*
@@ -2671,7 +2671,7 @@ xfs_iflush_int(
 			__func__, ip->i_ino, ip, ip->i_d.di_magic);
 		goto corrupt_out;
 	}
-	if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) {
+	if (S_ISREG(ip->i_d.di_mode)) {
 		if (XFS_TEST_ERROR(
 		    (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
 		    (ip->i_d.di_format != XFS_DINODE_FMT_BTREE),
@@ -2681,7 +2681,7 @@ xfs_iflush_int(
 				__func__, ip->i_ino, ip);
 			goto corrupt_out;
 		}
-	} else if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
+	} else if (S_ISDIR(ip->i_d.di_mode)) {
 		if (XFS_TEST_ERROR(
 		    (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
 		    (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
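All of the xfs_inode.c hunks above are the same mechanical substitution: open-coded (mode & S_IFMT) == S_IFREG style tests become the standard S_ISREG()/S_ISDIR()/S_ISLNK() predicates, and the rest of this diff repeats the pattern across the tree. The predicates are defined in the kernel's stat headers in terms of the very same S_IFMT mask, so the conversion cannot change behaviour; a minimal sketch of the equivalence (mode_checks_match is a hypothetical helper, not part of the patch):

#include <linux/stat.h>
#include <linux/types.h>

/* S_ISREG(m) expands to (((m) & S_IFMT) == S_IFREG), and likewise for
 * S_ISDIR()/S_ISLNK(), so both spellings always agree. */
static int mode_checks_match(umode_t mode)
{
	return S_ISREG(mode) == (((mode) & S_IFMT) == S_IFREG) &&
	       S_ISDIR(mode) == (((mode) & S_IFMT) == S_IFDIR) &&
	       S_ISLNK(mode) == (((mode) & S_IFMT) == S_IFLNK);
}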
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index a97644ab945a..2380a4bcbece 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -263,7 +263,7 @@ typedef struct xfs_inode {
 	struct inode		i_vnode;	/* embedded VFS inode */
 } xfs_inode_t;
 
-#define XFS_ISIZE(ip)	(((ip)->i_d.di_mode & S_IFMT) == S_IFREG) ? \
+#define XFS_ISIZE(ip)	S_ISREG((ip)->i_d.di_mode) ? \
 				(ip)->i_size : (ip)->i_d.di_size;
 
 /* Convert from vfs inode to xfs inode */
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index acca2c5ca3fa..f7ce7debe14c 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -265,7 +265,7 @@ xfs_open_by_handle(
 		return PTR_ERR(filp);
 	}
 
-	if (inode->i_mode & S_IFREG) {
+	if (S_ISREG(inode->i_mode)) {
 		filp->f_flags |= O_NOATIME;
 		filp->f_mode |= FMODE_NOCMTIME;
 	}
@@ -850,14 +850,14 @@ xfs_set_diflags(
 		di_flags |= XFS_DIFLAG_NODEFRAG;
 	if (xflags & XFS_XFLAG_FILESTREAM)
 		di_flags |= XFS_DIFLAG_FILESTREAM;
-	if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
+	if (S_ISDIR(ip->i_d.di_mode)) {
 		if (xflags & XFS_XFLAG_RTINHERIT)
 			di_flags |= XFS_DIFLAG_RTINHERIT;
 		if (xflags & XFS_XFLAG_NOSYMLINKS)
 			di_flags |= XFS_DIFLAG_NOSYMLINKS;
 		if (xflags & XFS_XFLAG_EXTSZINHERIT)
 			di_flags |= XFS_DIFLAG_EXTSZINHERIT;
-	} else if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) {
+	} else if (S_ISREG(ip->i_d.di_mode)) {
 		if (xflags & XFS_XFLAG_REALTIME)
 			di_flags |= XFS_DIFLAG_REALTIME;
 		if (xflags & XFS_XFLAG_EXTSIZE)
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h
index d56173b34a2a..d56173b34a2a 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.h
+++ b/fs/xfs/xfs_ioctl.h
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index 54e623bfbb85..54e623bfbb85 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.h b/fs/xfs/xfs_ioctl32.h
index 80f4060e8970..80f4060e8970 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.h
+++ b/fs/xfs/xfs_ioctl32.h
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/xfs_iops.c
index 6544c3236bc8..673704fab748 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -70,9 +70,8 @@ xfs_synchronize_times(
 }
 
 /*
- * If the linux inode is valid, mark it dirty.
- * Used when committing a dirty inode into a transaction so that
- * the inode will get written back by the linux code
+ * If the linux inode is valid, mark it dirty, else mark the dirty state
+ * in the XFS inode to make sure we pick it up when reclaiming the inode.
  */
 void
 xfs_mark_inode_dirty_sync(
@@ -82,6 +81,10 @@ xfs_mark_inode_dirty_sync(
 
 	if (!(inode->i_state & (I_WILL_FREE|I_FREEING)))
 		mark_inode_dirty_sync(inode);
+	else {
+		barrier();
+		ip->i_update_core = 1;
+	}
 }
 
 void
@@ -92,6 +95,11 @@ xfs_mark_inode_dirty(
 
 	if (!(inode->i_state & (I_WILL_FREE|I_FREEING)))
 		mark_inode_dirty(inode);
+	else {
+		barrier();
+		ip->i_update_core = 1;
+	}
+
 }
 
 /*
97/* 105/*
@@ -1194,9 +1202,14 @@ xfs_setup_inode(
 		break;
 	}
 
-	/* if there is no attribute fork no ACL can exist on this inode */
-	if (!XFS_IFORK_Q(ip))
+	/*
+	 * If there is no attribute fork no ACL can exist on this inode,
+	 * and it can't have any file capabilities attached to it either.
+	 */
+	if (!XFS_IFORK_Q(ip)) {
+		inode_has_no_xattr(inode);
 		cache_no_acl(inode);
+	}
 
 	xfs_iflags_clear(ip, XFS_INEW);
 	barrier();
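The new else branches above cover inodes that are already in I_WILL_FREE/I_FREEING state, for which mark_inode_dirty*() would be a no-op: the dirty state is instead recorded in the XFS-private i_update_core flag so the inode core still gets written back when the inode is reclaimed. Condensed, the pattern both functions now share looks like this (xfs_dirty_inode_sketch is a hypothetical name for illustration only):

static void xfs_dirty_inode_sketch(struct inode *inode, struct xfs_inode *ip)
{
	if (!(inode->i_state & (I_WILL_FREE | I_FREEING)))
		mark_inode_dirty(inode);	/* normal VFS writeback path */
	else {
		barrier();			/* compiler barrier, as in the patch */
		ip->i_update_core = 1;		/* picked up at inode reclaim */
	}
}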
diff --git a/fs/xfs/linux-2.6/xfs_iops.h b/fs/xfs/xfs_iops.h
index ef41c92ce66e..ef41c92ce66e 100644
--- a/fs/xfs/linux-2.6/xfs_iops.h
+++ b/fs/xfs/xfs_iops.h
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/xfs_linux.h
index d42f814e4d35..1e8a45e74c3e 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -32,13 +32,12 @@
 # define XFS_BIG_INUMS	0
 #endif
 
-#include <xfs_types.h>
+#include "xfs_types.h"
 
-#include <kmem.h>
-#include <mrlock.h>
-#include <time.h>
-
-#include <support/uuid.h>
+#include "kmem.h"
+#include "mrlock.h"
+#include "time.h"
+#include "uuid.h"
 
 #include <linux/semaphore.h>
 #include <linux/mm.h>
@@ -78,14 +77,14 @@
 #include <asm/byteorder.h>
 #include <asm/unaligned.h>
 
-#include <xfs_vnode.h>
-#include <xfs_stats.h>
-#include <xfs_sysctl.h>
-#include <xfs_iops.h>
-#include <xfs_aops.h>
-#include <xfs_super.h>
-#include <xfs_buf.h>
-#include <xfs_message.h>
+#include "xfs_vnode.h"
+#include "xfs_stats.h"
+#include "xfs_sysctl.h"
+#include "xfs_iops.h"
+#include "xfs_aops.h"
+#include "xfs_super.h"
+#include "xfs_buf.h"
+#include "xfs_message.h"
 
 #ifdef __BIG_ENDIAN
 #define XFS_NATIVE_HOST 1
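The angle-bracket to quoted-include conversion goes hand in hand with the directory flattening visible throughout this diff (the fs/xfs/linux-2.6/ and fs/xfs/quota/ files moving into fs/xfs/): quoted includes are searched relative to the including file first, so once everything lives in one directory no extra -I search paths are needed from the build. Illustratively (not part of the patch):

#include "xfs_types.h"	/* quoted form: found next to the including file */
/* #include <xfs_types.h>   bracket form: relies on -I paths from the build */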
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 06ff8437ed8e..3a8d4f66d702 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -878,7 +878,7 @@ xlog_iodone(xfs_buf_t *bp)
 	/*
 	 * Race to shutdown the filesystem if we see an error.
 	 */
-	if (XFS_TEST_ERROR((XFS_BUF_GETERROR(bp)), l->l_mp,
+	if (XFS_TEST_ERROR((xfs_buf_geterror(bp)), l->l_mp,
 			XFS_ERRTAG_IODONE_IOERR, XFS_RANDOM_IODONE_IOERR)) {
 		xfs_ioerror_alert("xlog_iodone", l->l_mp, bp, XFS_BUF_ADDR(bp));
 		XFS_BUF_STALE(bp);
@@ -1051,7 +1051,6 @@ xlog_alloc_log(xfs_mount_t *mp,
 	if (!bp)
 		goto out_free_log;
 	bp->b_iodone = xlog_iodone;
-	ASSERT(XFS_BUF_ISBUSY(bp));
 	ASSERT(xfs_buf_islocked(bp));
 	log->l_xbuf = bp;
 
@@ -1108,7 +1107,6 @@ xlog_alloc_log(xfs_mount_t *mp,
 		iclog->ic_callback_tail = &(iclog->ic_callback);
 		iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize;
 
-		ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp));
 		ASSERT(xfs_buf_islocked(iclog->ic_bp));
 		init_waitqueue_head(&iclog->ic_force_wait);
 		init_waitqueue_head(&iclog->ic_write_wait);
@@ -1248,7 +1246,7 @@ xlog_bdstrat(
 	struct xlog_in_core	*iclog = bp->b_fspriv;
 
 	if (iclog->ic_state & XLOG_STATE_IOERROR) {
-		XFS_BUF_ERROR(bp, EIO);
+		xfs_buf_ioerror(bp, EIO);
 		XFS_BUF_STALE(bp);
 		xfs_buf_ioend(bp, 0);
 		/*
@@ -1355,7 +1353,6 @@ xlog_sync(xlog_t *log,
 	XFS_BUF_SET_COUNT(bp, count);
 	bp->b_fspriv = iclog;
 	XFS_BUF_ZEROFLAGS(bp);
-	XFS_BUF_BUSY(bp);
 	XFS_BUF_ASYNC(bp);
 	bp->b_flags |= XBF_SYNCIO;
 
@@ -1398,16 +1395,15 @@ xlog_sync(xlog_t *log,
 	if (split) {
 		bp = iclog->ic_log->l_xbuf;
 		XFS_BUF_SET_ADDR(bp, 0);	     /* logical 0 */
-		XFS_BUF_SET_PTR(bp, (xfs_caddr_t)((__psint_t)&(iclog->ic_header)+
-					(__psint_t)count), split);
+		xfs_buf_associate_memory(bp,
+				(char *)&iclog->ic_header + count, split);
 		bp->b_fspriv = iclog;
 		XFS_BUF_ZEROFLAGS(bp);
-		XFS_BUF_BUSY(bp);
 		XFS_BUF_ASYNC(bp);
 		bp->b_flags |= XBF_SYNCIO;
 		if (log->l_mp->m_flags & XFS_MOUNT_BARRIER)
 			bp->b_flags |= XBF_FUA;
-		dptr = XFS_BUF_PTR(bp);
+		dptr = bp->b_addr;
 		/*
 		 * Bump the cycle numbers at the start of each block
 		 * since this part of the buffer is at the start of
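Several hunks in this file swap the old uppercase buffer macros for typed helpers or direct field access: XFS_BUF_GETERROR(bp) becomes xfs_buf_geterror(bp) or a plain read of bp->b_error, XFS_BUF_ERROR(bp, EIO) becomes xfs_buf_ioerror(bp, EIO), XFS_BUF_ISPINNED(bp) becomes xfs_buf_ispinned(bp), and XFS_BUF_PTR(bp) becomes bp->b_addr. A minimal sketch of the post-conversion idiom, using only helpers that appear in these hunks (fail_log_buffer itself is hypothetical):

/* Sketch: fail an in-flight log buffer the way xlog_bdstrat() now does. */
static void fail_log_buffer(struct xfs_buf *bp)
{
	xfs_buf_ioerror(bp, EIO);	/* record the error in bp->b_error */
	XFS_BUF_STALE(bp);		/* don't let the contents be reused */
	xfs_buf_ioend(bp, 0);		/* run I/O completion immediately */
}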
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 8fe4206de057..a199dbcee7d8 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -147,7 +147,7 @@ xlog_align(
 	xfs_daddr_t	offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1);
 
 	ASSERT(BBTOB(offset + nbblks) <= XFS_BUF_SIZE(bp));
-	return XFS_BUF_PTR(bp) + BBTOB(offset);
+	return bp->b_addr + BBTOB(offset);
 }
 
 
@@ -178,9 +178,7 @@ xlog_bread_noalign(
 
 	XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
 	XFS_BUF_READ(bp);
-	XFS_BUF_BUSY(bp);
 	XFS_BUF_SET_COUNT(bp, BBTOB(nbblks));
-	XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp);
 
 	xfsbdstrat(log->l_mp, bp);
 	error = xfs_buf_iowait(bp);
@@ -220,18 +218,18 @@ xlog_bread_offset(
 	xfs_buf_t	*bp,
 	xfs_caddr_t	offset)
 {
-	xfs_caddr_t	orig_offset = XFS_BUF_PTR(bp);
+	xfs_caddr_t	orig_offset = bp->b_addr;
 	int		orig_len = bp->b_buffer_length;
 	int		error, error2;
 
-	error = XFS_BUF_SET_PTR(bp, offset, BBTOB(nbblks));
+	error = xfs_buf_associate_memory(bp, offset, BBTOB(nbblks));
 	if (error)
 		return error;
 
 	error = xlog_bread_noalign(log, blk_no, nbblks, bp);
 
 	/* must reset buffer pointer even on error */
-	error2 = XFS_BUF_SET_PTR(bp, orig_offset, orig_len);
+	error2 = xfs_buf_associate_memory(bp, orig_offset, orig_len);
 	if (error)
 		return error;
 	return error2;
@@ -266,11 +264,9 @@ xlog_bwrite(
 
 	XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
 	XFS_BUF_ZEROFLAGS(bp);
-	XFS_BUF_BUSY(bp);
-	XFS_BUF_HOLD(bp);
+	xfs_buf_hold(bp);
 	xfs_buf_lock(bp);
 	XFS_BUF_SET_COUNT(bp, BBTOB(nbblks));
-	XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp);
 
 	if ((error = xfs_bwrite(log->l_mp, bp)))
 		xfs_ioerror_alert("xlog_bwrite", log->l_mp,
@@ -360,7 +356,7 @@ STATIC void
 xlog_recover_iodone(
 	struct xfs_buf	*bp)
 {
-	if (XFS_BUF_GETERROR(bp)) {
+	if (bp->b_error) {
 		/*
 		 * We're not going to bother about retrying
 		 * this during recovery. One strike!
@@ -1262,7 +1258,7 @@ xlog_write_log_records(
 	 */
 	ealign = round_down(end_block, sectbb);
 	if (j == 0 && (start_block + endcount > ealign)) {
-		offset = XFS_BUF_PTR(bp) + BBTOB(ealign - start_block);
+		offset = bp->b_addr + BBTOB(ealign - start_block);
 		error = xlog_bread_offset(log, ealign, sectbb,
 					bp, offset);
 		if (error)
@@ -2135,15 +2131,16 @@ xlog_recover_buffer_pass2(
 
 	bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
 			  buf_flags);
-	if (XFS_BUF_ISERROR(bp)) {
+	if (!bp)
+		return XFS_ERROR(ENOMEM);
+	error = bp->b_error;
+	if (error) {
 		xfs_ioerror_alert("xlog_recover_do..(read#1)", mp,
 				  bp, buf_f->blf_blkno);
-		error = XFS_BUF_GETERROR(bp);
 		xfs_buf_relse(bp);
 		return error;
 	}
 
-	error = 0;
 	if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
 		error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
 	} else if (buf_f->blf_flags &
@@ -2227,14 +2224,17 @@ xlog_recover_inode_pass2(
 
 	bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len,
 			  XBF_LOCK);
-	if (XFS_BUF_ISERROR(bp)) {
+	if (!bp) {
+		error = ENOMEM;
+		goto error;
+	}
+	error = bp->b_error;
+	if (error) {
 		xfs_ioerror_alert("xlog_recover_do..(read#2)", mp,
 				  bp, in_f->ilf_blkno);
-		error = XFS_BUF_GETERROR(bp);
 		xfs_buf_relse(bp);
 		goto error;
 	}
-	error = 0;
 	ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
 	dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset);
 
@@ -2283,7 +2283,7 @@ xlog_recover_inode_pass2(
 	/* Take the opportunity to reset the flush iteration count */
 	dicp->di_flushiter = 0;
 
-	if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) {
+	if (unlikely(S_ISREG(dicp->di_mode))) {
 		if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
 		    (dicp->di_format != XFS_DINODE_FMT_BTREE)) {
 			XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
@@ -2296,7 +2296,7 @@ xlog_recover_inode_pass2(
 			error = EFSCORRUPTED;
 			goto error;
 		}
-	} else if (unlikely((dicp->di_mode & S_IFMT) == S_IFDIR)) {
+	} else if (unlikely(S_ISDIR(dicp->di_mode))) {
 		if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
 		    (dicp->di_format != XFS_DINODE_FMT_BTREE) &&
 		    (dicp->di_format != XFS_DINODE_FMT_LOCAL)) {
@@ -3437,7 +3437,7 @@ xlog_do_recovery_pass(
 			/*
 			 * Check for header wrapping around physical end-of-log
 			 */
-			offset = XFS_BUF_PTR(hbp);
+			offset = hbp->b_addr;
 			split_hblks = 0;
 			wrapped_hblks = 0;
 			if (blk_no + hblks <= log->l_logBBsize) {
@@ -3497,7 +3497,7 @@ xlog_do_recovery_pass(
 			} else {
 				/* This log record is split across the
 				 * physical end of log */
-				offset = XFS_BUF_PTR(dbp);
+				offset = dbp->b_addr;
 				split_bblks = 0;
 				if (blk_no != log->l_logBBsize) {
 					/* some data is before the physical
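The two pass2 hunks above also tighten error handling around xfs_buf_read(): a NULL return now maps to ENOMEM before b_error is consulted, where the old code would have passed a NULL buffer to XFS_BUF_ISERROR() on allocation failure. The resulting idiom, wrapped in a hypothetical helper for illustration (xlog_read_and_check is not part of the patch):

static int
xlog_read_and_check(struct xfs_mount *mp, xfs_daddr_t blkno, int len,
		    xfs_buf_flags_t flags, struct xfs_buf **bpp)
{
	struct xfs_buf	*bp;
	int		error;

	bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, flags);
	if (!bp)
		return XFS_ERROR(ENOMEM);	/* allocation failed, no buffer */
	error = bp->b_error;			/* recorded at I/O completion */
	if (error) {
		xfs_ioerror_alert("xlog_read_and_check", mp, bp, blkno);
		xfs_buf_relse(bp);		/* drop our reference first */
		return error;
	}
	*bpp = bp;
	return 0;
}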
diff --git a/fs/xfs/linux-2.6/xfs_message.c b/fs/xfs/xfs_message.c
index bd672def95ac..bd672def95ac 100644
--- a/fs/xfs/linux-2.6/xfs_message.c
+++ b/fs/xfs/xfs_message.c
diff --git a/fs/xfs/linux-2.6/xfs_message.h b/fs/xfs/xfs_message.h
index 7fb7ea007672..7fb7ea007672 100644
--- a/fs/xfs/linux-2.6/xfs_message.h
+++ b/fs/xfs/xfs_message.h
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 7f25245da289..0081657ad985 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1331,7 +1331,7 @@ xfs_mountfs(
 
 	ASSERT(rip != NULL);
 
-	if (unlikely((rip->i_d.di_mode & S_IFMT) != S_IFDIR)) {
+	if (unlikely(!S_ISDIR(rip->i_d.di_mode))) {
 		xfs_warn(mp, "corrupted root inode %llu: not a directory",
 			(unsigned long long)rip->i_ino);
 		xfs_iunlock(rip, XFS_ILOCK_EXCL);
@@ -1615,7 +1615,7 @@ xfs_unmountfs_writesb(xfs_mount_t *mp)
 		XFS_BUF_UNDELAYWRITE(sbp);
 		XFS_BUF_WRITE(sbp);
 		XFS_BUF_UNASYNC(sbp);
-		ASSERT(XFS_BUF_TARGET(sbp) == mp->m_ddev_targp);
+		ASSERT(sbp->b_target == mp->m_ddev_targp);
 		xfsbdstrat(mp, sbp);
 		error = xfs_buf_iowait(sbp);
 		if (error)
@@ -1938,7 +1938,7 @@ xfs_getsb(
 		xfs_buf_lock(bp);
 	}
 
-	XFS_BUF_HOLD(bp);
+	xfs_buf_hold(bp);
 	ASSERT(XFS_BUF_ISDONE(bp));
 	return bp;
 }
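The xfs_buf_hold() call above takes an extra reference so that xfs_getsb() can hand its caller a locked, referenced superblock buffer; callers pair that with xfs_buf_relse() when done. A hedged sketch of the calling convention (use_sb is hypothetical):

static void use_sb(struct xfs_mount *mp)
{
	struct xfs_buf	*bp = xfs_getsb(mp, 0);	/* returned locked and held */

	/* ... inspect or log the superblock via bp->b_addr ... */
	xfs_buf_relse(bp);	/* unlock and drop the hold taken above */
}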
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/xfs_qm.c
index 46e54ad9a2dc..9a0aa76facdf 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -1240,7 +1240,7 @@ xfs_qm_reset_dqcounts(
 		do_div(j, sizeof(xfs_dqblk_t));
 		ASSERT(mp->m_quotainfo->qi_dqperchunk == j);
 #endif
-	ddq = (xfs_disk_dquot_t *)XFS_BUF_PTR(bp);
+	ddq = bp->b_addr;
 	for (j = 0; j < mp->m_quotainfo->qi_dqperchunk; j++) {
 		/*
 		 * Do a sanity check, and if needed, repair the dqblk. Don't
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/xfs_qm.h
index 43b9abe1052c..43b9abe1052c 100644
--- a/fs/xfs/quota/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index a0a829addca9..a0a829addca9 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/xfs_qm_stats.c
index 8671a0b32644..8671a0b32644 100644
--- a/fs/xfs/quota/xfs_qm_stats.c
+++ b/fs/xfs/xfs_qm_stats.c
diff --git a/fs/xfs/quota/xfs_qm_stats.h b/fs/xfs/xfs_qm_stats.h
index 5b964fc0dc09..5b964fc0dc09 100644
--- a/fs/xfs/quota/xfs_qm_stats.h
+++ b/fs/xfs/xfs_qm_stats.h
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 609246f42e6c..609246f42e6c 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
diff --git a/fs/xfs/quota/xfs_quota_priv.h b/fs/xfs/xfs_quota_priv.h
index 94a3d927d716..94a3d927d716 100644
--- a/fs/xfs/quota/xfs_quota_priv.h
+++ b/fs/xfs/xfs_quota_priv.h
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index 29b9d642e93d..7e76f537abb7 100644
--- a/fs/xfs/linux-2.6/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -25,7 +25,7 @@
 #include "xfs_trans.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_inode.h"
-#include "quota/xfs_qm.h"
+#include "xfs_qm.h"
 #include <linux/quota.h>
 
 
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index 77a59891734e..df78c297d1a1 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -116,7 +116,7 @@ xfs_rename(
 	trace_xfs_rename(src_dp, target_dp, src_name, target_name);
 
 	new_parent = (src_dp != target_dp);
-	src_is_directory = ((src_ip->i_d.di_mode & S_IFMT) == S_IFDIR);
+	src_is_directory = S_ISDIR(src_ip->i_d.di_mode);
 
 	if (src_is_directory) {
 		/*
@@ -226,7 +226,7 @@ xfs_rename(
 		 * target and source are directories and that target can be
 		 * destroyed, or that neither is a directory.
 		 */
-		if ((target_ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
+		if (S_ISDIR(target_ip->i_d.di_mode)) {
 			/*
 			 * Make sure target dir is empty.
 			 */
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 8f76fdff4f46..35561a511b57 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -168,7 +168,7 @@ error_cancel:
 			xfs_trans_cancel(tp, cancelflags);
 			goto error;
 		}
-		memset(XFS_BUF_PTR(bp), 0, mp->m_sb.sb_blocksize);
+		memset(bp->b_addr, 0, mp->m_sb.sb_blocksize);
 		xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1);
 		/*
 		 * Commit the transaction.
@@ -883,7 +883,7 @@ xfs_rtbuf_get(
 	if (error) {
 		return error;
 	}
-	ASSERT(bp && !XFS_BUF_GETERROR(bp));
+	ASSERT(!xfs_buf_geterror(bp));
 	*bpp = bp;
 	return 0;
 }
@@ -943,7 +943,7 @@ xfs_rtcheck_range(
 	if (error) {
 		return error;
 	}
-	bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+	bufp = bp->b_addr;
 	/*
 	 * Compute the starting word's address, and starting bit.
 	 */
@@ -994,7 +994,7 @@ xfs_rtcheck_range(
 		if (error) {
 			return error;
 		}
-		b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+		b = bufp = bp->b_addr;
 		word = 0;
 	} else {
 		/*
@@ -1040,7 +1040,7 @@ xfs_rtcheck_range(
 		if (error) {
 			return error;
 		}
-		b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+		b = bufp = bp->b_addr;
 		word = 0;
 	} else {
 		/*
@@ -1158,7 +1158,7 @@ xfs_rtfind_back(
 	if (error) {
 		return error;
 	}
-	bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+	bufp = bp->b_addr;
 	/*
 	 * Get the first word's index & point to it.
 	 */
@@ -1210,7 +1210,7 @@ xfs_rtfind_back(
 		if (error) {
 			return error;
 		}
-		bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+		bufp = bp->b_addr;
 		word = XFS_BLOCKWMASK(mp);
 		b = &bufp[word];
 	} else {
@@ -1256,7 +1256,7 @@ xfs_rtfind_back(
 		if (error) {
 			return error;
 		}
-		bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+		bufp = bp->b_addr;
 		word = XFS_BLOCKWMASK(mp);
 		b = &bufp[word];
 	} else {
@@ -1333,7 +1333,7 @@ xfs_rtfind_forw(
 	if (error) {
 		return error;
 	}
-	bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+	bufp = bp->b_addr;
 	/*
 	 * Get the first word's index & point to it.
 	 */
@@ -1384,7 +1384,7 @@ xfs_rtfind_forw(
 		if (error) {
 			return error;
 		}
-		b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+		b = bufp = bp->b_addr;
 		word = 0;
 	} else {
 		/*
@@ -1429,7 +1429,7 @@ xfs_rtfind_forw(
 		if (error) {
 			return error;
 		}
-		b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+		b = bufp = bp->b_addr;
 		word = 0;
 	} else {
 		/*
@@ -1649,7 +1649,7 @@ xfs_rtmodify_range(
 	if (error) {
 		return error;
 	}
-	bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+	bufp = bp->b_addr;
 	/*
 	 * Compute the starting word's address, and starting bit.
 	 */
@@ -1694,7 +1694,7 @@ xfs_rtmodify_range(
 		if (error) {
 			return error;
 		}
-		first = b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+		first = b = bufp = bp->b_addr;
 		word = 0;
 	} else {
 		/*
@@ -1734,7 +1734,7 @@ xfs_rtmodify_range(
 		if (error) {
 			return error;
 		}
-		first = b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+		first = b = bufp = bp->b_addr;
 		word = 0;
 	} else {
 		/*
@@ -1832,8 +1832,8 @@ xfs_rtmodify_summary(
 	 */
 	sp = XFS_SUMPTR(mp, bp, so);
 	*sp += delta;
-	xfs_trans_log_buf(tp, bp, (uint)((char *)sp - (char *)XFS_BUF_PTR(bp)),
-		(uint)((char *)sp - (char *)XFS_BUF_PTR(bp) + sizeof(*sp) - 1));
+	xfs_trans_log_buf(tp, bp, (uint)((char *)sp - (char *)bp->b_addr),
+		(uint)((char *)sp - (char *)bp->b_addr + sizeof(*sp) - 1));
 	return 0;
 }
 
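All of the xfs_rtalloc.c hunks above are the same substitution: XFS_BUF_PTR(bp) with an explicit cast becomes a plain read of bp->b_addr. Given the castless assignments in the new code, b_addr is evidently a void pointer, so no cast is needed at the assignment in C; a one-line sketch (rtbuf_words is hypothetical):

static xfs_rtword_t *rtbuf_words(struct xfs_buf *bp)
{
	return bp->b_addr;	/* implicit conversion from void *, no cast */
}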
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index 09e1f4f35e97..f7f3a359c1c5 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -47,7 +47,7 @@ struct xfs_trans;
 #define	XFS_SUMOFFSTOBLOCK(mp,s)	\
 	(((s) * (uint)sizeof(xfs_suminfo_t)) >> (mp)->m_sb.sb_blocklog)
 #define	XFS_SUMPTR(mp,bp,so)	\
-	((xfs_suminfo_t *)((char *)XFS_BUF_PTR(bp) + \
+	((xfs_suminfo_t *)((bp)->b_addr + \
 	 (((so) * (uint)sizeof(xfs_suminfo_t)) & XFS_BLOCKMASK(mp))))
 
 #define	XFS_BITTOBLOCK(mp,bi)	((bi) >> (mp)->m_blkbit_log)
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index d6d6fdfe9422..c96a8a05ac03 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -104,9 +104,9 @@ xfs_ioerror_alert(
 	xfs_alert(mp,
 		 "I/O error occurred: meta-data dev %s block 0x%llx"
 		 " (\"%s\") error %d buf count %zd",
-		XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)),
+		xfs_buf_target_name(bp->b_target),
 		(__uint64_t)blkno, func,
-		XFS_BUF_GETERROR(bp), XFS_BUF_COUNT(bp));
+		bp->b_error, XFS_BUF_COUNT(bp));
 }
 
 /*
@@ -137,8 +137,8 @@ xfs_read_buf(
 	bp = xfs_buf_read(target, blkno, len, flags);
 	if (!bp)
 		return XFS_ERROR(EIO);
-	error = XFS_BUF_GETERROR(bp);
-	if (bp && !error && !XFS_FORCED_SHUTDOWN(mp)) {
+	error = bp->b_error;
+	if (!error && !XFS_FORCED_SHUTDOWN(mp)) {
 		*bpp = bp;
 	} else {
 		*bpp = NULL;
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index 1eb2ba586814..cb6ae715814a 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -509,7 +509,7 @@ static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp)
 
 #define XFS_SB_DADDR		((xfs_daddr_t)0) /* daddr in filesystem/ag */
 #define	XFS_SB_BLOCK(mp)	XFS_HDR_BLOCK(mp, XFS_SB_DADDR)
-#define XFS_BUF_TO_SBP(bp)	((xfs_dsb_t *)XFS_BUF_PTR(bp))
+#define XFS_BUF_TO_SBP(bp)	((xfs_dsb_t *)((bp)->b_addr))
 
 #define	XFS_HDR_BLOCK(mp,d)	((xfs_agblock_t)XFS_BB_TO_FSBT(mp,d))
 #define	XFS_DADDR_TO_FSB(mp,d)	XFS_AGB_TO_FSB(mp, \
diff --git a/fs/xfs/linux-2.6/xfs_stats.c b/fs/xfs/xfs_stats.c
index 76fdc5861932..76fdc5861932 100644
--- a/fs/xfs/linux-2.6/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
diff --git a/fs/xfs/linux-2.6/xfs_stats.h b/fs/xfs/xfs_stats.h
index 736854b1ca1a..736854b1ca1a 100644
--- a/fs/xfs/linux-2.6/xfs_stats.h
+++ b/fs/xfs/xfs_stats.h
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/xfs_super.c
index 9a72dda58bd0..2366c54cc4fa 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -356,6 +356,8 @@ xfs_parseargs(
 			mp->m_flags |= XFS_MOUNT_DELAYLOG;
 		} else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
 			mp->m_flags &= ~XFS_MOUNT_DELAYLOG;
+			xfs_warn(mp,
+	"nodelaylog is deprecated and will be removed in Linux 3.3");
 		} else if (!strcmp(this_char, MNTOPT_DISCARD)) {
 			mp->m_flags |= XFS_MOUNT_DISCARD;
 		} else if (!strcmp(this_char, MNTOPT_NODISCARD)) {
@@ -877,33 +879,17 @@ xfs_log_inode(
 	struct xfs_trans	*tp;
 	int			error;
 
-	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 	tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
 	error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
-
 	if (error) {
 		xfs_trans_cancel(tp, 0);
-		/* we need to return with the lock hold shared */
-		xfs_ilock(ip, XFS_ILOCK_SHARED);
 		return error;
 	}
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
-
-	/*
-	 * Note - it's possible that we might have pushed ourselves out of the
-	 * way during trans_reserve which would flush the inode. But there's
-	 * no guarantee that the inode buffer has actually gone out yet (it's
-	 * delwri). Plus the buffer could be pinned anyway if it's part of
-	 * an inode in another recent transaction. So we play it safe and
-	 * fire off the transaction anyway.
-	 */
-	xfs_trans_ijoin(tp, ip);
+	xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-	error = xfs_trans_commit(tp, 0);
-	xfs_ilock_demote(ip, XFS_ILOCK_EXCL);
-
-	return error;
+	return xfs_trans_commit(tp, 0);
 }
 
 STATIC int
@@ -918,7 +904,9 @@ xfs_fs_write_inode(
 	trace_xfs_write_inode(ip);
 
 	if (XFS_FORCED_SHUTDOWN(mp))
-		return XFS_ERROR(EIO);
+		return -XFS_ERROR(EIO);
+	if (!ip->i_update_core)
+		return 0;
 
 	if (wbc->sync_mode == WB_SYNC_ALL) {
 		/*
@@ -929,12 +917,10 @@ xfs_fs_write_inode(
 		 * of synchronous log forces dramatically.
 		 */
 		xfs_ioend_wait(ip);
-		xfs_ilock(ip, XFS_ILOCK_SHARED);
-		if (ip->i_update_core) {
-			error = xfs_log_inode(ip);
-			if (error)
-				goto out_unlock;
-		}
+		error = xfs_log_inode(ip);
+		if (error)
+			goto out;
+		return 0;
 	} else {
 		/*
 		 * We make this non-blocking if the inode is contended, return
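The rewritten xfs_log_inode() above reduces to the canonical XFS transaction sequence: allocate, reserve log space (cancelling on failure), lock and join the inode, log the core, commit. The old shared-lock juggling disappears because, as the xfs_fs_write_inode() hunks above show, the caller no longer takes the inode lock before the call, and joining with xfs_trans_ijoin_ref() lets the commit release XFS_ILOCK_EXCL automatically. A labelled sketch of that lifecycle (xfs_log_inode_sketch is a restatement for illustration, not new code):

static int xfs_log_inode_sketch(struct xfs_mount *mp, struct xfs_inode *ip)
{
	struct xfs_trans	*tp;
	int			error;

	tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);		/* 1: allocate */
	error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
	if (error) {
		xfs_trans_cancel(tp, 0);			/* 2: back out */
		return error;
	}
	xfs_ilock(ip, XFS_ILOCK_EXCL);				/* 3: lock */
	xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);		/* 4: join; commit unlocks */
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);		/* 5: log the inode core */
	return xfs_trans_commit(tp, 0);				/* 6: commit */
}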
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/xfs_super.h
index 50a3266c999e..50a3266c999e 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/xfs_super.h
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/xfs_sync.c
index e4c938afb910..4604f90f86a3 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -332,7 +332,7 @@ xfs_sync_fsdata(
 	 * between there and here.
 	 */
 	bp = xfs_getsb(mp, 0);
-	if (XFS_BUF_ISPINNED(bp))
+	if (xfs_buf_ispinned(bp))
 		xfs_log_force(mp, 0);
 
 	return xfs_bwrite(mp, bp);
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/xfs_sync.h
index 941202e7ac6e..941202e7ac6e 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/xfs_sync.h
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index ee2d2adaa438..ee2d2adaa438 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
index b9937d450f8e..b9937d450f8e 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.h
+++ b/fs/xfs/xfs_sysctl.h
diff --git a/fs/xfs/linux-2.6/xfs_trace.c b/fs/xfs/xfs_trace.c
index 88d25d4aa56e..9010ce885e6a 100644
--- a/fs/xfs/linux-2.6/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -43,8 +43,8 @@
 #include "xfs_quota.h"
 #include "xfs_iomap.h"
 #include "xfs_aops.h"
-#include "quota/xfs_dquot_item.h"
-#include "quota/xfs_dquot.h"
+#include "xfs_dquot_item.h"
+#include "xfs_dquot.h"
 #include "xfs_log_recover.h"
 #include "xfs_inode_item.h"
 
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/xfs_trace.h
index 690fc7a7bd72..690fc7a7bd72 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 43233e92f0f6..c15aa29fa169 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -299,7 +299,7 @@ xfs_trans_ail_cursor_last(
  * Splice the log item list into the AIL at the given LSN. We splice to the
  * tail of the given LSN to maintain insert order for push traversals. The
  * cursor is optional, allowing repeated updates to the same LSN to avoid
- * repeated traversals.
+ * repeated traversals.  This should not be called with an empty list.
  */
 static void
 xfs_ail_splice(
@@ -308,50 +308,39 @@ xfs_ail_splice(
 	struct list_head	*list,
 	xfs_lsn_t		lsn)
 {
-	struct xfs_log_item	*lip = cur ? cur->item : NULL;
-	struct xfs_log_item	*next_lip;
+	struct xfs_log_item	*lip;
+
+	ASSERT(!list_empty(list));
 
 	/*
-	 * Get a new cursor if we don't have a placeholder or the existing one
-	 * has been invalidated.
+	 * Use the cursor to determine the insertion point if one is
+	 * provided.  If not, or if the one we got is not valid,
+	 * find the place in the AIL where the items belong.
 	 */
-	if (!lip || (__psint_t)lip & 1) {
+	lip = cur ? cur->item : NULL;
+	if (!lip || (__psint_t) lip & 1)
 		lip = __xfs_trans_ail_cursor_last(ailp, lsn);
 
-		if (!lip) {
-			/* The list is empty, so just splice and return. */
-			if (cur)
-				cur->item = NULL;
-			list_splice(list, &ailp->xa_ail);
-			return;
-		}
-	}
+	/*
+	 * If a cursor is provided, we know we're processing the AIL
+	 * in lsn order, and future items to be spliced in will
+	 * follow the last one being inserted now.  Update the
+	 * cursor to point to that last item, now while we have a
+	 * reliable pointer to it.
+	 */
+	if (cur)
+		cur->item = list_entry(list->prev, struct xfs_log_item, li_ail);
 
 	/*
-	 * Our cursor points to the item we want to insert _after_, so we have
-	 * to update the cursor to point to the end of the list we are splicing
-	 * in so that it points to the correct location for the next splice.
-	 * i.e. before the splice
-	 *
-	 *  lsn -> lsn -> lsn + x -> lsn + x ...
-	 *          ^
-	 *          | cursor points here
-	 *
-	 * After the splice we have:
-	 *
-	 *  lsn -> lsn -> lsn -> lsn -> .... -> lsn -> lsn + x -> lsn + x ...
-	 *          ^                            ^
-	 *          | cursor points here         | needs to move here
-	 *
-	 * So we set the cursor to the last item in the list to be spliced
-	 * before we execute the splice, resulting in the cursor pointing to
-	 * the correct item after the splice occurs.
+	 * Finally perform the splice.  Unless the AIL was empty,
+	 * lip points to the item in the AIL _after_ which the new
+	 * items should go.  If lip is null the AIL was empty, so
+	 * the new items go at the head of the AIL.
 	 */
-	if (cur) {
-		next_lip = list_entry(list->prev, struct xfs_log_item, li_ail);
-		cur->item = next_lip;
-	}
-	list_splice(list, &lip->li_ail);
+	if (lip)
+		list_splice(list, &lip->li_ail);
+	else
+		list_splice(list, &ailp->xa_ail);
 }
 
 /*
@@ -682,6 +671,7 @@ xfs_trans_ail_update_bulk(
 	int			i;
 	LIST_HEAD(tmp);
 
+	ASSERT(nr_items > 0);		/* Not required, but true. */
 	mlip = xfs_ail_min(ailp);
 
 	for (i = 0; i < nr_items; i++) {
@@ -701,7 +691,8 @@ xfs_trans_ail_update_bulk(
 		list_add(&lip->li_ail, &tmp);
 	}
 
-	xfs_ail_splice(ailp, cur, &tmp, lsn);
+	if (!list_empty(&tmp))
+		xfs_ail_splice(ailp, cur, &tmp, lsn);
 
 	if (!mlip_changed) {
 		spin_unlock(&ailp->xa_lock);
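The xfs_ail_splice() rewrite above centralizes the cursor update and drops the early-return special case: the caller now guarantees a non-empty item list (hence the ASSERT, with xfs_trans_ail_update_bulk() skipping the call when tmp is empty). The key primitive is list_splice(list, head), which inserts every entry of list immediately after head; a minimal self-contained illustration with plain integers, mirroring the if (lip) ... else ... tail of the new function (struct num and splice_after are hypothetical):

#include <linux/list.h>

struct num { int val; struct list_head entry; };

/* Splice all of 'pending' into 'ail' right after node 'after', or at
 * the head of 'ail' when no insertion point exists (empty AIL). */
static void splice_after(struct list_head *pending, struct list_head *ail,
			 struct list_head *after)
{
	if (after)
		list_splice(pending, after);	/* insert after 'after' */
	else
		list_splice(pending, ail);	/* empty AIL: insert at head */
}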
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 15584fc3ed7d..137e2b9e2948 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -54,7 +54,7 @@ xfs_trans_buf_item_match(
 	list_for_each_entry(lidp, &tp->t_items, lid_trans) {
 		blip = (struct xfs_buf_log_item *)lidp->lid_item;
 		if (blip->bli_item.li_type == XFS_LI_BUF &&
-		    XFS_BUF_TARGET(blip->bli_buf) == target &&
+		    blip->bli_buf->b_target == target &&
 		    XFS_BUF_ADDR(blip->bli_buf) == blkno &&
 		    XFS_BUF_COUNT(blip->bli_buf) == len)
 			return blip->bli_buf;
@@ -80,7 +80,6 @@ _xfs_trans_bjoin(
 {
 	struct xfs_buf_log_item	*bip;
 
-	ASSERT(XFS_BUF_ISBUSY(bp));
 	ASSERT(bp->b_transp == NULL);
 
 	/*
@@ -194,7 +193,7 @@ xfs_trans_get_buf(xfs_trans_t *tp,
 		return NULL;
 	}
 
-	ASSERT(!XFS_BUF_GETERROR(bp));
+	ASSERT(!bp->b_error);
 
 	_xfs_trans_bjoin(tp, bp, 1);
 	trace_xfs_trans_get_buf(bp->b_fspriv);
@@ -293,10 +292,10 @@ xfs_trans_read_buf(
 		return (flags & XBF_TRYLOCK) ?
 			EAGAIN : XFS_ERROR(ENOMEM);
 
-	if (XFS_BUF_GETERROR(bp) != 0) {
+	if (bp->b_error) {
+		error = bp->b_error;
 		xfs_ioerror_alert("xfs_trans_read_buf", mp,
 				  bp, blkno);
-		error = XFS_BUF_GETERROR(bp);
 		xfs_buf_relse(bp);
 		return error;
 	}
@@ -330,7 +329,7 @@ xfs_trans_read_buf(
 		ASSERT(xfs_buf_islocked(bp));
 		ASSERT(bp->b_transp == tp);
 		ASSERT(bp->b_fspriv != NULL);
-		ASSERT((XFS_BUF_ISERROR(bp)) == 0);
+		ASSERT(!bp->b_error);
 		if (!(XFS_BUF_ISDONE(bp))) {
 			trace_xfs_trans_read_buf_io(bp, _RET_IP_);
 			ASSERT(!XFS_BUF_ISASYNC(bp));
@@ -386,10 +385,9 @@ xfs_trans_read_buf(
 		return (flags & XBF_TRYLOCK) ?
 			0 : XFS_ERROR(ENOMEM);
 	}
-	if (XFS_BUF_GETERROR(bp) != 0) {
-		XFS_BUF_SUPER_STALE(bp);
-		error = XFS_BUF_GETERROR(bp);
-
+	if (bp->b_error) {
+		error = bp->b_error;
+		XFS_BUF_SUPER_STALE(bp);
 		xfs_ioerror_alert("xfs_trans_read_buf", mp,
 				  bp, blkno);
 		if (tp->t_flags & XFS_TRANS_DIRTY)
@@ -430,7 +428,7 @@ shutdown_abort:
 	if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp))
 		xfs_notice(mp, "about to pop assert, bp == 0x%p", bp);
 #endif
-	ASSERT((XFS_BUF_BFLAGS(bp) & (XBF_STALE|XBF_DELWRI)) !=
+	ASSERT((bp->b_flags & (XBF_STALE|XBF_DELWRI)) !=
 				     (XBF_STALE|XBF_DELWRI));
 
 	trace_xfs_trans_read_buf_shut(bp, _RET_IP_);
@@ -581,7 +579,6 @@ xfs_trans_bhold(xfs_trans_t *tp,
 {
 	xfs_buf_log_item_t	*bip = bp->b_fspriv;
 
-	ASSERT(XFS_BUF_ISBUSY(bp));
 	ASSERT(bp->b_transp == tp);
 	ASSERT(bip != NULL);
 	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
@@ -602,7 +599,6 @@ xfs_trans_bhold_release(xfs_trans_t *tp,
 {
 	xfs_buf_log_item_t	*bip = bp->b_fspriv;
 
-	ASSERT(XFS_BUF_ISBUSY(bp));
 	ASSERT(bp->b_transp == tp);
 	ASSERT(bip != NULL);
 	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
@@ -631,7 +627,6 @@ xfs_trans_log_buf(xfs_trans_t *tp,
 {
 	xfs_buf_log_item_t	*bip = bp->b_fspriv;
 
-	ASSERT(XFS_BUF_ISBUSY(bp));
 	ASSERT(bp->b_transp == tp);
 	ASSERT(bip != NULL);
 	ASSERT((first <= last) && (last < XFS_BUF_COUNT(bp)));
@@ -702,7 +697,6 @@ xfs_trans_binval(
 {
 	xfs_buf_log_item_t	*bip = bp->b_fspriv;
 
-	ASSERT(XFS_BUF_ISBUSY(bp));
 	ASSERT(bp->b_transp == tp);
 	ASSERT(bip != NULL);
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
@@ -774,7 +768,6 @@ xfs_trans_inode_buf(
 {
 	xfs_buf_log_item_t	*bip = bp->b_fspriv;
 
-	ASSERT(XFS_BUF_ISBUSY(bp));
 	ASSERT(bp->b_transp == tp);
 	ASSERT(bip != NULL);
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
@@ -798,7 +791,6 @@ xfs_trans_stale_inode_buf(
 {
 	xfs_buf_log_item_t	*bip = bp->b_fspriv;
 
-	ASSERT(XFS_BUF_ISBUSY(bp));
 	ASSERT(bp->b_transp == tp);
 	ASSERT(bip != NULL);
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
@@ -823,7 +815,6 @@ xfs_trans_inode_alloc_buf(
 {
 	xfs_buf_log_item_t	*bip = bp->b_fspriv;
 
-	ASSERT(XFS_BUF_ISBUSY(bp));
 	ASSERT(bp->b_transp == tp);
 	ASSERT(bip != NULL);
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
@@ -851,7 +842,6 @@ xfs_trans_dquot_buf(
 {
 	xfs_buf_log_item_t	*bip = bp->b_fspriv;
 
-	ASSERT(XFS_BUF_ISBUSY(bp));
 	ASSERT(bp->b_transp == tp);
 	ASSERT(bip != NULL);
 	ASSERT(type == XFS_BLF_UDQUOT_BUF ||
diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 4d00ee67792d..4d00ee67792d 100644
--- a/fs/xfs/quota/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/xfs_vnode.h
index 7c220b4227bc..7c220b4227bc 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/xfs_vnode.h
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 88d121486c52..51fc429527bc 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -83,7 +83,9 @@ xfs_readlink_bmap(
 
 		bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt),
 				  XBF_LOCK | XBF_MAPPED | XBF_DONT_BLOCK);
-		error = XFS_BUF_GETERROR(bp);
+		if (!bp)
+			return XFS_ERROR(ENOMEM);
+		error = bp->b_error;
 		if (error) {
 			xfs_ioerror_alert("xfs_readlink",
 					  ip->i_mount, bp, XFS_BUF_ADDR(bp));
@@ -94,7 +96,7 @@ xfs_readlink_bmap(
 			byte_cnt = pathlen;
 		pathlen -= byte_cnt;
 
-		memcpy(link, XFS_BUF_PTR(bp), byte_cnt);
+		memcpy(link, bp->b_addr, byte_cnt);
 		xfs_buf_relse(bp);
 	}
 
@@ -121,7 +123,7 @@ xfs_readlink(
 
 	xfs_ilock(ip, XFS_ILOCK_SHARED);
 
-	ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFLNK);
+	ASSERT(S_ISLNK(ip->i_d.di_mode));
 	ASSERT(ip->i_d.di_size <= MAXPATHLEN);
 
 	pathlen = ip->i_d.di_size;
@@ -529,7 +531,7 @@ xfs_release(
 	if (ip->i_d.di_nlink == 0)
 		return 0;
 
-	if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
+	if ((S_ISREG(ip->i_d.di_mode) &&
 	     ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
 	       ip->i_delayed_blks > 0)) &&
 	     (ip->i_df.if_flags & XFS_IFEXTENTS))  &&
@@ -610,7 +612,7 @@ xfs_inactive(
 	truncate = ((ip->i_d.di_nlink == 0) &&
 	    ((ip->i_d.di_size != 0) || (ip->i_size != 0) ||
 	     (ip->i_d.di_nextents > 0) || (ip->i_delayed_blks > 0)) &&
-	    ((ip->i_d.di_mode & S_IFMT) == S_IFREG));
+	    S_ISREG(ip->i_d.di_mode));
 
 	mp = ip->i_mount;
 
@@ -621,7 +623,7 @@ xfs_inactive(
 		goto out;
 
 	if (ip->i_d.di_nlink != 0) {
-		if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
+		if ((S_ISREG(ip->i_d.di_mode) &&
 		    ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
 		      ip->i_delayed_blks > 0)) &&
 		      (ip->i_df.if_flags & XFS_IFEXTENTS) &&
@@ -669,7 +671,7 @@ xfs_inactive(
 			xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
 			return VN_INACTIVE_CACHE;
 		}
-	} else if ((ip->i_d.di_mode & S_IFMT) == S_IFLNK) {
+	} else if (S_ISLNK(ip->i_d.di_mode)) {
 
 		/*
 		 * If we get an error while cleaning up a
@@ -1648,13 +1650,13 @@ xfs_symlink(
 			byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
 			bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
 					       BTOBB(byte_cnt), 0);
-			ASSERT(bp && !XFS_BUF_GETERROR(bp));
+			ASSERT(!xfs_buf_geterror(bp));
 			if (pathlen < byte_cnt) {
 				byte_cnt = pathlen;
 			}
 			pathlen -= byte_cnt;
 
-			memcpy(XFS_BUF_PTR(bp), cur_chunk, byte_cnt);
+			memcpy(bp->b_addr, cur_chunk, byte_cnt);
 			cur_chunk += byte_cnt;
 
 			xfs_trans_log_buf(tp, bp, 0, byte_cnt - 1);
@@ -1999,7 +2001,7 @@ xfs_zero_remaining_bytes(
 				mp, bp, XFS_BUF_ADDR(bp));
 			break;
 		}
-		memset(XFS_BUF_PTR(bp) +
+		memset(bp->b_addr +
 		      (offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
 		      0, lastoffset - offset + 1);
 		XFS_BUF_UNDONE(bp);
diff --git a/fs/xfs/linux-2.6/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 87d3e03878c8..87d3e03878c8 100644
--- a/fs/xfs/linux-2.6/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c