aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
author Thomas Gleixner <tglx@linutronix.de> 2013-07-12 06:34:42 -0400
committer Thomas Gleixner <tglx@linutronix.de> 2013-07-12 06:34:42 -0400
commit f2006e27396f55276f24434f56e208d86e7f9908 (patch)
tree 71896db916d33888b4286f80117d3cac0da40e6d /fs
parent e399eb56a6110e13f97e644658648602e2b08de7 (diff)
parent 9903883f1dd6e86f286b7bfa6e4b423f98c1cd9e (diff)
Merge branch 'linus' into timers/urgent
Get upstream changes so we can apply fixes against them

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/Kconfig13
-rw-r--r--fs/9p/Makefile4
-rw-r--r--fs/9p/vfs_inode.c2
-rw-r--r--fs/9p/xattr.c4
-rw-r--r--fs/9p/xattr.h2
-rw-r--r--fs/9p/xattr_security.c80
-rw-r--r--fs/9p/xattr_trusted.c80
-rw-r--r--fs/adfs/dir.c6
-rw-r--r--fs/affs/namei.c26
-rw-r--r--fs/afs/flock.c7
-rw-r--r--fs/aio.c4
-rw-r--r--fs/autofs4/expire.c8
-rw-r--r--fs/autofs4/root.c2
-rw-r--r--fs/binfmt_aout.c2
-rw-r--r--fs/binfmt_elf.c2
-rw-r--r--fs/block_dev.c33
-rw-r--r--fs/btrfs/backref.c72
-rw-r--r--fs/btrfs/backref.h2
-rw-r--r--fs/btrfs/ctree.c120
-rw-r--r--fs/btrfs/ctree.h105
-rw-r--r--fs/btrfs/delayed-inode.c14
-rw-r--r--fs/btrfs/dev-replace.c6
-rw-r--r--fs/btrfs/disk-io.c483
-rw-r--r--fs/btrfs/disk-io.h32
-rw-r--r--fs/btrfs/export.c5
-rw-r--r--fs/btrfs/extent-tree.c315
-rw-r--r--fs/btrfs/extent_io.c41
-rw-r--r--fs/btrfs/extent_io.h1
-rw-r--r--fs/btrfs/file-item.c144
-rw-r--r--fs/btrfs/file.c165
-rw-r--r--fs/btrfs/free-space-cache.c103
-rw-r--r--fs/btrfs/free-space-cache.h2
-rw-r--r--fs/btrfs/inode.c501
-rw-r--r--fs/btrfs/ioctl.c82
-rw-r--r--fs/btrfs/lzo.c4
-rw-r--r--fs/btrfs/ordered-data.c128
-rw-r--r--fs/btrfs/ordered-data.h27
-rw-r--r--fs/btrfs/qgroup.c283
-rw-r--r--fs/btrfs/relocation.c102
-rw-r--r--fs/btrfs/root-tree.c201
-rw-r--r--fs/btrfs/scrub.c90
-rw-r--r--fs/btrfs/send.c235
-rw-r--r--fs/btrfs/super.c25
-rw-r--r--fs/btrfs/transaction.c322
-rw-r--r--fs/btrfs/transaction.h50
-rw-r--r--fs/btrfs/tree-log.c41
-rw-r--r--fs/btrfs/ulist.c15
-rw-r--r--fs/btrfs/version.h4
-rw-r--r--fs/btrfs/volumes.c351
-rw-r--r--fs/btrfs/volumes.h7
-rw-r--r--fs/buffer.c34
-rw-r--r--fs/cachefiles/rdwr.c30
-rw-r--r--fs/ceph/addr.c88
-rw-r--r--fs/ceph/caps.c102
-rw-r--r--fs/ceph/file.c15
-rw-r--r--fs/ceph/inode.c18
-rw-r--r--fs/ceph/locks.c4
-rw-r--r--fs/ceph/mds_client.c16
-rw-r--r--fs/ceph/mdsmap.c42
-rw-r--r--fs/ceph/super.c2
-rw-r--r--fs/ceph/super.h4
-rw-r--r--fs/ceph/xattr.c9
-rw-r--r--fs/cifs/Kconfig1
-rw-r--r--fs/cifs/cifs_debug.c52
-rw-r--r--fs/cifs/cifs_unicode.h8
-rw-r--r--fs/cifs/cifsencrypt.c40
-rw-r--r--fs/cifs/cifsfs.c13
-rw-r--r--fs/cifs/cifsfs.h2
-rw-r--r--fs/cifs/cifsglob.h48
-rw-r--r--fs/cifs/cifspdu.h17
-rw-r--r--fs/cifs/cifsproto.h4
-rw-r--r--fs/cifs/cifssmb.c425
-rw-r--r--fs/cifs/connect.c159
-rw-r--r--fs/cifs/dir.c9
-rw-r--r--fs/cifs/file.c15
-rw-r--r--fs/cifs/misc.c3
-rw-r--r--fs/cifs/readdir.c29
-rw-r--r--fs/cifs/sess.c95
-rw-r--r--fs/cifs/smb1ops.c23
-rw-r--r--fs/cifs/smb2glob.h2
-rw-r--r--fs/cifs/smb2misc.c4
-rw-r--r--fs/cifs/smb2ops.c48
-rw-r--r--fs/cifs/smb2pdu.c282
-rw-r--r--fs/cifs/smb2pdu.h100
-rw-r--r--fs/cifs/smb2proto.h4
-rw-r--r--fs/cifs/smb2transport.c151
-rw-r--r--fs/cifs/smbfsctl.h27
-rw-r--r--fs/cifs/transport.c6
-rw-r--r--fs/coda/dir.c10
-rw-r--r--fs/configfs/dir.c2
-rw-r--r--fs/configfs/file.c2
-rw-r--r--fs/coredump.c121
-rw-r--r--fs/dcache.c66
-rw-r--r--fs/ecryptfs/crypto.c342
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h2
-rw-r--r--fs/ecryptfs/file.c16
-rw-r--r--fs/ecryptfs/inode.c4
-rw-r--r--fs/ecryptfs/main.c7
-rw-r--r--fs/ecryptfs/messaging.c3
-rw-r--r--fs/efivarfs/super.c9
-rw-r--r--fs/eventpoll.c16
-rw-r--r--fs/exec.c17
-rw-r--r--fs/ext2/namei.c24
-rw-r--r--fs/ext3/fsync.c8
-rw-r--r--fs/ext3/inode.c1
-rw-r--r--fs/ext3/namei.c47
-rw-r--r--fs/ext3/super.c13
-rw-r--r--fs/ext4/file.c24
-rw-r--r--fs/ext4/namei.c47
-rw-r--r--fs/f2fs/dir.c20
-rw-r--r--fs/fat/fat.h1
-rw-r--r--fs/fat/file.c8
-rw-r--r--fs/fat/inode.c12
-rw-r--r--fs/fat/misc.c5
-rw-r--r--fs/fat/namei_msdos.c6
-rw-r--r--fs/fat/namei_vfat.c12
-rw-r--r--fs/file_table.c2
-rw-r--r--fs/fs-writeback.c10
-rw-r--r--fs/fuse/file.c3
-rw-r--r--fs/fuse/inode.c2
-rw-r--r--fs/gfs2/dentry.c3
-rw-r--r--fs/gfs2/file.c2
-rw-r--r--fs/hfs/hfs_fs.h7
-rw-r--r--fs/hfs/string.c6
-rw-r--r--fs/hfsplus/hfsplus_fs.h7
-rw-r--r--fs/hfsplus/unicode.c7
-rw-r--r--fs/hpfs/buffer.c33
-rw-r--r--fs/hpfs/dentry.c7
-rw-r--r--fs/hpfs/file.c40
-rw-r--r--fs/hpfs/hpfs_fn.h7
-rw-r--r--fs/hpfs/map.c22
-rw-r--r--fs/hpfs/super.c17
-rw-r--r--fs/hppfs/hppfs.c11
-rw-r--r--fs/inode.c4
-rw-r--r--fs/internal.h6
-rw-r--r--fs/isofs/inode.c48
-rw-r--r--fs/isofs/namei.c3
-rw-r--r--fs/jfs/jfs_dmap.c70
-rw-r--r--fs/jfs/jfs_dtree.c37
-rw-r--r--fs/jfs/jfs_extent.c2
-rw-r--r--fs/jfs/jfs_imap.c69
-rw-r--r--fs/jfs/jfs_metapage.c5
-rw-r--r--fs/jfs/jfs_superblock.h1
-rw-r--r--fs/jfs/jfs_txnmgr.c2
-rw-r--r--fs/jfs/jfs_xtree.c62
-rw-r--r--fs/jfs/namei.c9
-rw-r--r--fs/jfs/resize.c2
-rw-r--r--fs/jfs/super.c22
-rw-r--r--fs/jfs/xattr.c8
-rw-r--r--fs/lockd/svc.c2
-rw-r--r--fs/lockd/svclock.c14
-rw-r--r--fs/lockd/svcsubs.c12
-rw-r--r--fs/locks.c328
-rw-r--r--fs/minix/dir.c2
-rw-r--r--fs/minix/namei.c13
-rw-r--r--fs/namei.c113
-rw-r--r--fs/ncpfs/dir.c45
-rw-r--r--fs/ncpfs/inode.c16
-rw-r--r--fs/ncpfs/mmap.c2
-rw-r--r--fs/nfs/Kconfig14
-rw-r--r--fs/nfs/Makefile6
-rw-r--r--fs/nfs/blocklayout/blocklayout.c3
-rw-r--r--fs/nfs/callback.c6
-rw-r--r--fs/nfs/callback.h3
-rw-r--r--fs/nfs/callback_proc.c3
-rw-r--r--fs/nfs/callback_xdr.c52
-rw-r--r--fs/nfs/client.c4
-rw-r--r--fs/nfs/delegation.c10
-rw-r--r--fs/nfs/dir.c94
-rw-r--r--fs/nfs/dns_resolve.c32
-rw-r--r--fs/nfs/file.c30
-rw-r--r--fs/nfs/getroot.c2
-rw-r--r--fs/nfs/idmap.c56
-rw-r--r--fs/nfs/inode.c140
-rw-r--r--fs/nfs/internal.h3
-rw-r--r--fs/nfs/mount_clnt.c14
-rw-r--r--fs/nfs/namespace.c2
-rw-r--r--fs/nfs/nfs3proc.c9
-rw-r--r--fs/nfs/nfs4_fs.h8
-rw-r--r--fs/nfs/nfs4client.c15
-rw-r--r--fs/nfs/nfs4file.c1
-rw-r--r--fs/nfs/nfs4filelayout.c3
-rw-r--r--fs/nfs/nfs4filelayout.h3
-rw-r--r--fs/nfs/nfs4filelayoutdev.c8
-rw-r--r--fs/nfs/nfs4proc.c691
-rw-r--r--fs/nfs/nfs4session.c40
-rw-r--r--fs/nfs/nfs4session.h7
-rw-r--r--fs/nfs/nfs4state.c46
-rw-r--r--fs/nfs/nfs4super.c14
-rw-r--r--fs/nfs/nfs4xdr.c182
-rw-r--r--fs/nfs/objlayout/objlayout.c4
-rw-r--r--fs/nfs/pnfs.c42
-rw-r--r--fs/nfs/pnfs.h6
-rw-r--r--fs/nfs/proc.c13
-rw-r--r--fs/nfs/super.c199
-rw-r--r--fs/nfs/unlink.c2
-rw-r--r--fs/nfs/write.c31
-rw-r--r--fs/nfsd/Kconfig16
-rw-r--r--fs/nfsd/nfs4proc.c44
-rw-r--r--fs/nfsd/nfs4state.c233
-rw-r--r--fs/nfsd/nfs4xdr.c169
-rw-r--r--fs/nfsd/nfsd.h26
-rw-r--r--fs/nfsd/nfssvc.c2
-rw-r--r--fs/nfsd/state.h1
-rw-r--r--fs/nfsd/vfs.c28
-rw-r--r--fs/nfsd/vfs.h7
-rw-r--r--fs/nfsd/xdr4.h4
-rw-r--r--fs/nilfs2/alloc.c63
-rw-r--r--fs/nilfs2/alloc.h2
-rw-r--r--fs/nilfs2/ifile.c22
-rw-r--r--fs/nilfs2/ifile.h2
-rw-r--r--fs/nilfs2/inode.c8
-rw-r--r--fs/nilfs2/segment.c4
-rw-r--r--fs/nilfs2/super.c33
-rw-r--r--fs/nilfs2/the_nilfs.c4
-rw-r--r--fs/nilfs2/the_nilfs.h4
-rw-r--r--fs/notify/dnotify/dnotify.c25
-rw-r--r--fs/notify/fanotify/fanotify_user.c92
-rw-r--r--fs/notify/inotify/inotify_user.c13
-rw-r--r--fs/notify/mark.c50
-rw-r--r--fs/ocfs2/alloc.c8
-rw-r--r--fs/ocfs2/cluster/heartbeat.c19
-rw-r--r--fs/ocfs2/cluster/quorum.c2
-rw-r--r--fs/ocfs2/cluster/tcp.c29
-rw-r--r--fs/ocfs2/dlm/dlmlock.c1
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c7
-rw-r--r--fs/ocfs2/file.c12
-rw-r--r--fs/ocfs2/journal.h3
-rw-r--r--fs/ocfs2/namei.c70
-rw-r--r--fs/ocfs2/ocfs2.h1
-rw-r--r--fs/ocfs2/suballoc.c37
-rw-r--r--fs/ocfs2/super.c6
-rw-r--r--fs/ocfs2/xattr.c18
-rw-r--r--fs/open.c63
-rw-r--r--fs/proc/base.c105
-rw-r--r--fs/proc/fd.c18
-rw-r--r--fs/proc/internal.h2
-rw-r--r--fs/proc/kcore.c2
-rw-r--r--fs/proc/namespaces.c13
-rw-r--r--fs/proc/proc_sysctl.c7
-rw-r--r--fs/proc/task_mmu.c145
-rw-r--r--fs/proc/uptime.c3
-rw-r--r--fs/proc/vmcore.c694
-rw-r--r--fs/pstore/ftrace.c2
-rw-r--r--fs/pstore/inode.c11
-rw-r--r--fs/pstore/platform.c21
-rw-r--r--fs/pstore/ram.c5
-rw-r--r--fs/pstore/ram_core.c54
-rw-r--r--fs/quota/dquot.c6
-rw-r--r--fs/read_write.c65
-rw-r--r--fs/select.c66
-rw-r--r--fs/seq_file.c54
-rw-r--r--fs/splice.c38
-rw-r--r--fs/sysv/namei.c3
-rw-r--r--fs/timerfd.c131
-rw-r--r--fs/ubifs/super.c2
-rw-r--r--fs/udf/namei.c24
-rw-r--r--fs/xfs/Makefile1
-rw-r--r--fs/xfs/xfs_alloc.c24
-rw-r--r--fs/xfs/xfs_bmap_btree.h2
-rw-r--r--fs/xfs/xfs_buf_item.c87
-rw-r--r--fs/xfs/xfs_buf_item.h4
-rw-r--r--fs/xfs/xfs_dfrag.c8
-rw-r--r--fs/xfs/xfs_dir2_leaf.c3
-rw-r--r--fs/xfs/xfs_dquot.c16
-rw-r--r--fs/xfs/xfs_dquot.h4
-rw-r--r--fs/xfs/xfs_file.c6
-rw-r--r--fs/xfs/xfs_fsops.c2
-rw-r--r--fs/xfs/xfs_ialloc.c74
-rw-r--r--fs/xfs/xfs_ialloc.h8
-rw-r--r--fs/xfs/xfs_icache.c3
-rw-r--r--fs/xfs/xfs_icache.h1
-rw-r--r--fs/xfs/xfs_icreate_item.c195
-rw-r--r--fs/xfs/xfs_icreate_item.h52
-rw-r--r--fs/xfs/xfs_inode.c68
-rw-r--r--fs/xfs/xfs_iomap.c13
-rw-r--r--fs/xfs/xfs_iops.c3
-rw-r--r--fs/xfs/xfs_itable.c5
-rw-r--r--fs/xfs/xfs_log.c22
-rw-r--r--fs/xfs/xfs_log.h5
-rw-r--r--fs/xfs/xfs_log_cil.c75
-rw-r--r--fs/xfs/xfs_log_recover.c114
-rw-r--r--fs/xfs/xfs_mount.c92
-rw-r--r--fs/xfs/xfs_mount.h4
-rw-r--r--fs/xfs/xfs_qm.c175
-rw-r--r--fs/xfs/xfs_qm.h83
-rw-r--r--fs/xfs/xfs_qm_syscalls.c51
-rw-r--r--fs/xfs/xfs_quota.h47
-rw-r--r--fs/xfs/xfs_quotaops.c6
-rw-r--r--fs/xfs/xfs_sb.h6
-rw-r--r--fs/xfs/xfs_super.c39
-rw-r--r--fs/xfs/xfs_symlink.c51
-rw-r--r--fs/xfs/xfs_symlink.h2
-rw-r--r--fs/xfs/xfs_sysctl.c26
-rw-r--r--fs/xfs/xfs_trace.h5
-rw-r--r--fs/xfs/xfs_trans.c118
-rw-r--r--fs/xfs/xfs_trans.h16
-rw-r--r--fs/xfs/xfs_trans_buf.c34
-rw-r--r--fs/xfs/xfs_trans_dquot.c87
-rw-r--r--fs/xfs/xfs_trans_inode.c11
-rw-r--r--fs/xfs/xfs_vnodeops.c15
301 files changed, 9187 insertions, 5085 deletions
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig
index 55abfd62654a..6489e1fc1afd 100644
--- a/fs/9p/Kconfig
+++ b/fs/9p/Kconfig
@@ -31,3 +31,16 @@ config 9P_FS_POSIX_ACL
31 If you don't know what Access Control Lists are, say N 31 If you don't know what Access Control Lists are, say N
32 32
33endif 33endif
34
35
36config 9P_FS_SECURITY
37 bool "9P Security Labels"
38 depends on 9P_FS
39 help
40 Security labels support alternative access control models
41 implemented by security modules like SELinux. This option
42 enables an extended attribute handler for file security
43 labels in the 9P filesystem.
44
45 If you are not using a security module that requires using
46 extended attributes for file security labels, say N.
diff --git a/fs/9p/Makefile b/fs/9p/Makefile
index ab8c12780634..ff7be98f84f2 100644
--- a/fs/9p/Makefile
+++ b/fs/9p/Makefile
@@ -11,7 +11,9 @@ obj-$(CONFIG_9P_FS) := 9p.o
11 v9fs.o \ 11 v9fs.o \
12 fid.o \ 12 fid.o \
13 xattr.o \ 13 xattr.o \
14 xattr_user.o 14 xattr_user.o \
15 xattr_trusted.o
15 16
169p-$(CONFIG_9P_FSCACHE) += cache.o 179p-$(CONFIG_9P_FSCACHE) += cache.o
179p-$(CONFIG_9P_FS_POSIX_ACL) += acl.o 189p-$(CONFIG_9P_FS_POSIX_ACL) += acl.o
199p-$(CONFIG_9P_FS_SECURITY) += xattr_security.o
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index d86edc8d3fd0..25b018efb8ab 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -1054,13 +1054,11 @@ static int
1054v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, 1054v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
1055 struct kstat *stat) 1055 struct kstat *stat)
1056{ 1056{
1057 int err;
1058 struct v9fs_session_info *v9ses; 1057 struct v9fs_session_info *v9ses;
1059 struct p9_fid *fid; 1058 struct p9_fid *fid;
1060 struct p9_wstat *st; 1059 struct p9_wstat *st;
1061 1060
1062 p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry); 1061 p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry);
1063 err = -EPERM;
1064 v9ses = v9fs_dentry2v9ses(dentry); 1062 v9ses = v9fs_dentry2v9ses(dentry);
1065 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { 1063 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
1066 generic_fillattr(dentry->d_inode, stat); 1064 generic_fillattr(dentry->d_inode, stat);
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index c45e016b190f..3c28cdfb8c47 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -167,9 +167,13 @@ ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
167 167
168const struct xattr_handler *v9fs_xattr_handlers[] = { 168const struct xattr_handler *v9fs_xattr_handlers[] = {
169 &v9fs_xattr_user_handler, 169 &v9fs_xattr_user_handler,
170 &v9fs_xattr_trusted_handler,
170#ifdef CONFIG_9P_FS_POSIX_ACL 171#ifdef CONFIG_9P_FS_POSIX_ACL
171 &v9fs_xattr_acl_access_handler, 172 &v9fs_xattr_acl_access_handler,
172 &v9fs_xattr_acl_default_handler, 173 &v9fs_xattr_acl_default_handler,
173#endif 174#endif
175#ifdef CONFIG_9P_FS_SECURITY
176 &v9fs_xattr_security_handler,
177#endif
174 NULL 178 NULL
175}; 179};
diff --git a/fs/9p/xattr.h b/fs/9p/xattr.h
index eec348a3df71..d3e2ea3840be 100644
--- a/fs/9p/xattr.h
+++ b/fs/9p/xattr.h
@@ -20,6 +20,8 @@
20 20
21extern const struct xattr_handler *v9fs_xattr_handlers[]; 21extern const struct xattr_handler *v9fs_xattr_handlers[];
22extern struct xattr_handler v9fs_xattr_user_handler; 22extern struct xattr_handler v9fs_xattr_user_handler;
23extern struct xattr_handler v9fs_xattr_trusted_handler;
24extern struct xattr_handler v9fs_xattr_security_handler;
23extern const struct xattr_handler v9fs_xattr_acl_access_handler; 25extern const struct xattr_handler v9fs_xattr_acl_access_handler;
24extern const struct xattr_handler v9fs_xattr_acl_default_handler; 26extern const struct xattr_handler v9fs_xattr_acl_default_handler;
25 27
diff --git a/fs/9p/xattr_security.c b/fs/9p/xattr_security.c
new file mode 100644
index 000000000000..cb247a142a6e
--- /dev/null
+++ b/fs/9p/xattr_security.c
@@ -0,0 +1,80 @@
1/*
2 * Copyright IBM Corporation, 2010
3 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of version 2.1 of the GNU Lesser General Public License
7 * as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 *
13 */
14
15
16#include <linux/module.h>
17#include <linux/string.h>
18#include <linux/fs.h>
19#include <linux/slab.h>
20#include "xattr.h"
21
22static int v9fs_xattr_security_get(struct dentry *dentry, const char *name,
23 void *buffer, size_t size, int type)
24{
25 int retval;
26 char *full_name;
27 size_t name_len;
28 size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
29
30 if (name == NULL)
31 return -EINVAL;
32
33 if (strcmp(name, "") == 0)
34 return -EINVAL;
35
36 name_len = strlen(name);
37 full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
38 if (!full_name)
39 return -ENOMEM;
40 memcpy(full_name, XATTR_SECURITY_PREFIX, prefix_len);
41 memcpy(full_name+prefix_len, name, name_len);
42 full_name[prefix_len + name_len] = '\0';
43
44 retval = v9fs_xattr_get(dentry, full_name, buffer, size);
45 kfree(full_name);
46 return retval;
47}
48
49static int v9fs_xattr_security_set(struct dentry *dentry, const char *name,
50 const void *value, size_t size, int flags, int type)
51{
52 int retval;
53 char *full_name;
54 size_t name_len;
55 size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
56
57 if (name == NULL)
58 return -EINVAL;
59
60 if (strcmp(name, "") == 0)
61 return -EINVAL;
62
63 name_len = strlen(name);
64 full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
65 if (!full_name)
66 return -ENOMEM;
67 memcpy(full_name, XATTR_SECURITY_PREFIX, prefix_len);
68 memcpy(full_name + prefix_len, name, name_len);
69 full_name[prefix_len + name_len] = '\0';
70
71 retval = v9fs_xattr_set(dentry, full_name, value, size, flags);
72 kfree(full_name);
73 return retval;
74}
75
76struct xattr_handler v9fs_xattr_security_handler = {
77 .prefix = XATTR_SECURITY_PREFIX,
78 .get = v9fs_xattr_security_get,
79 .set = v9fs_xattr_security_set,
80};
diff --git a/fs/9p/xattr_trusted.c b/fs/9p/xattr_trusted.c
new file mode 100644
index 000000000000..e30d33b8a3fb
--- /dev/null
+++ b/fs/9p/xattr_trusted.c
@@ -0,0 +1,80 @@
1/*
2 * Copyright IBM Corporation, 2010
3 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of version 2.1 of the GNU Lesser General Public License
7 * as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 *
13 */
14
15
16#include <linux/module.h>
17#include <linux/string.h>
18#include <linux/fs.h>
19#include <linux/slab.h>
20#include "xattr.h"
21
22static int v9fs_xattr_trusted_get(struct dentry *dentry, const char *name,
23 void *buffer, size_t size, int type)
24{
25 int retval;
26 char *full_name;
27 size_t name_len;
28 size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
29
30 if (name == NULL)
31 return -EINVAL;
32
33 if (strcmp(name, "") == 0)
34 return -EINVAL;
35
36 name_len = strlen(name);
37 full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
38 if (!full_name)
39 return -ENOMEM;
40 memcpy(full_name, XATTR_TRUSTED_PREFIX, prefix_len);
41 memcpy(full_name+prefix_len, name, name_len);
42 full_name[prefix_len + name_len] = '\0';
43
44 retval = v9fs_xattr_get(dentry, full_name, buffer, size);
45 kfree(full_name);
46 return retval;
47}
48
49static int v9fs_xattr_trusted_set(struct dentry *dentry, const char *name,
50 const void *value, size_t size, int flags, int type)
51{
52 int retval;
53 char *full_name;
54 size_t name_len;
55 size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
56
57 if (name == NULL)
58 return -EINVAL;
59
60 if (strcmp(name, "") == 0)
61 return -EINVAL;
62
63 name_len = strlen(name);
64 full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
65 if (!full_name)
66 return -ENOMEM;
67 memcpy(full_name, XATTR_TRUSTED_PREFIX, prefix_len);
68 memcpy(full_name + prefix_len, name, name_len);
69 full_name[prefix_len + name_len] = '\0';
70
71 retval = v9fs_xattr_set(dentry, full_name, value, size, flags);
72 kfree(full_name);
73 return retval;
74}
75
76struct xattr_handler v9fs_xattr_trusted_handler = {
77 .prefix = XATTR_TRUSTED_PREFIX,
78 .get = v9fs_xattr_trusted_get,
79 .set = v9fs_xattr_trusted_set,
80};
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index ade28bb058e3..0d138c0de293 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -191,8 +191,7 @@ const struct file_operations adfs_dir_operations = {
191}; 191};
192 192
193static int 193static int
194adfs_hash(const struct dentry *parent, const struct inode *inode, 194adfs_hash(const struct dentry *parent, struct qstr *qstr)
195 struct qstr *qstr)
196{ 195{
197 const unsigned int name_len = ADFS_SB(parent->d_sb)->s_namelen; 196 const unsigned int name_len = ADFS_SB(parent->d_sb)->s_namelen;
198 const unsigned char *name; 197 const unsigned char *name;
@@ -228,8 +227,7 @@ adfs_hash(const struct dentry *parent, const struct inode *inode,
228 * requirements of the underlying filesystem. 227 * requirements of the underlying filesystem.
229 */ 228 */
230static int 229static int
231adfs_compare(const struct dentry *parent, const struct inode *pinode, 230adfs_compare(const struct dentry *parent, const struct dentry *dentry,
232 const struct dentry *dentry, const struct inode *inode,
233 unsigned int len, const char *str, const struct qstr *name) 231 unsigned int len, const char *str, const struct qstr *name)
234{ 232{
235 int i; 233 int i;
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index ff65884a7839..c36cbb4537a2 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -13,18 +13,12 @@
13typedef int (*toupper_t)(int); 13typedef int (*toupper_t)(int);
14 14
15static int affs_toupper(int ch); 15static int affs_toupper(int ch);
16static int affs_hash_dentry(const struct dentry *, 16static int affs_hash_dentry(const struct dentry *, struct qstr *);
17 const struct inode *, struct qstr *); 17static int affs_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
18static int affs_compare_dentry(const struct dentry *parent,
19 const struct inode *pinode,
20 const struct dentry *dentry, const struct inode *inode,
21 unsigned int len, const char *str, const struct qstr *name); 18 unsigned int len, const char *str, const struct qstr *name);
22static int affs_intl_toupper(int ch); 19static int affs_intl_toupper(int ch);
23static int affs_intl_hash_dentry(const struct dentry *, 20static int affs_intl_hash_dentry(const struct dentry *, struct qstr *);
24 const struct inode *, struct qstr *); 21static int affs_intl_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
25static int affs_intl_compare_dentry(const struct dentry *parent,
26 const struct inode *pinode,
27 const struct dentry *dentry, const struct inode *inode,
28 unsigned int len, const char *str, const struct qstr *name); 22 unsigned int len, const char *str, const struct qstr *name);
29 23
30const struct dentry_operations affs_dentry_operations = { 24const struct dentry_operations affs_dentry_operations = {
@@ -86,14 +80,12 @@ __affs_hash_dentry(struct qstr *qstr, toupper_t toupper)
86} 80}
87 81
88static int 82static int
89affs_hash_dentry(const struct dentry *dentry, const struct inode *inode, 83affs_hash_dentry(const struct dentry *dentry, struct qstr *qstr)
90 struct qstr *qstr)
91{ 84{
92 return __affs_hash_dentry(qstr, affs_toupper); 85 return __affs_hash_dentry(qstr, affs_toupper);
93} 86}
94static int 87static int
95affs_intl_hash_dentry(const struct dentry *dentry, const struct inode *inode, 88affs_intl_hash_dentry(const struct dentry *dentry, struct qstr *qstr)
96 struct qstr *qstr)
97{ 89{
98 return __affs_hash_dentry(qstr, affs_intl_toupper); 90 return __affs_hash_dentry(qstr, affs_intl_toupper);
99} 91}
@@ -131,15 +123,13 @@ static inline int __affs_compare_dentry(unsigned int len,
131} 123}
132 124
133static int 125static int
134affs_compare_dentry(const struct dentry *parent, const struct inode *pinode, 126affs_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
135 const struct dentry *dentry, const struct inode *inode,
136 unsigned int len, const char *str, const struct qstr *name) 127 unsigned int len, const char *str, const struct qstr *name)
137{ 128{
138 return __affs_compare_dentry(len, str, name, affs_toupper); 129 return __affs_compare_dentry(len, str, name, affs_toupper);
139} 130}
140static int 131static int
141affs_intl_compare_dentry(const struct dentry *parent,const struct inode *pinode, 132affs_intl_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
142 const struct dentry *dentry, const struct inode *inode,
143 unsigned int len, const char *str, const struct qstr *name) 133 unsigned int len, const char *str, const struct qstr *name)
144{ 134{
145 return __affs_compare_dentry(len, str, name, affs_intl_toupper); 135 return __affs_compare_dentry(len, str, name, affs_intl_toupper);
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index 2497bf306c70..a8cf2cff836c 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -252,7 +252,8 @@ static void afs_defer_unlock(struct afs_vnode *vnode, struct key *key)
252 */ 252 */
253static int afs_do_setlk(struct file *file, struct file_lock *fl) 253static int afs_do_setlk(struct file *file, struct file_lock *fl)
254{ 254{
255 struct afs_vnode *vnode = AFS_FS_I(file->f_mapping->host); 255 struct inode *inode = file_inode(file);
256 struct afs_vnode *vnode = AFS_FS_I(inode);
256 afs_lock_type_t type; 257 afs_lock_type_t type;
257 struct key *key = file->private_data; 258 struct key *key = file->private_data;
258 int ret; 259 int ret;
@@ -273,7 +274,7 @@ static int afs_do_setlk(struct file *file, struct file_lock *fl)
273 274
274 type = (fl->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE; 275 type = (fl->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE;
275 276
276 lock_flocks(); 277 spin_lock(&inode->i_lock);
277 278
278 /* make sure we've got a callback on this file and that our view of the 279 /* make sure we've got a callback on this file and that our view of the
279 * data version is up to date */ 280 * data version is up to date */
@@ -420,7 +421,7 @@ given_lock:
420 afs_vnode_fetch_status(vnode, NULL, key); 421 afs_vnode_fetch_status(vnode, NULL, key);
421 422
422error: 423error:
423 unlock_flocks(); 424 spin_unlock(&inode->i_lock);
424 _leave(" = %d", ret); 425 _leave(" = %d", ret);
425 return ret; 426 return ret;
426 427
diff --git a/fs/aio.c b/fs/aio.c
index 2bbcacf74d0c..9b5ca1137419 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -39,6 +39,8 @@
39#include <asm/kmap_types.h> 39#include <asm/kmap_types.h>
40#include <asm/uaccess.h> 40#include <asm/uaccess.h>
41 41
42#include "internal.h"
43
42#define AIO_RING_MAGIC 0xa10a10a1 44#define AIO_RING_MAGIC 0xa10a10a1
43#define AIO_RING_COMPAT_FEATURES 1 45#define AIO_RING_COMPAT_FEATURES 1
44#define AIO_RING_INCOMPAT_FEATURES 0 46#define AIO_RING_INCOMPAT_FEATURES 0
@@ -623,7 +625,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
623 625
624 /* 626 /*
625 * Add a completion event to the ring buffer. Must be done holding 627 * Add a completion event to the ring buffer. Must be done holding
626 * ctx->ctx_lock to prevent other code from messing with the tail 628 * ctx->completion_lock to prevent other code from messing with the tail
627 * pointer since we might be called from irq context. 629 * pointer since we might be called from irq context.
628 */ 630 */
629 spin_lock_irqsave(&ctx->completion_lock, flags); 631 spin_lock_irqsave(&ctx->completion_lock, flags);
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 13ddec92341c..3d9d3f5d5dda 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -109,7 +109,7 @@ cont:
109 109
110 spin_lock_nested(&q->d_lock, DENTRY_D_LOCK_NESTED); 110 spin_lock_nested(&q->d_lock, DENTRY_D_LOCK_NESTED);
111 /* Already gone or negative dentry (under construction) - try next */ 111 /* Already gone or negative dentry (under construction) - try next */
112 if (q->d_count == 0 || !simple_positive(q)) { 112 if (!d_count(q) || !simple_positive(q)) {
113 spin_unlock(&q->d_lock); 113 spin_unlock(&q->d_lock);
114 next = q->d_u.d_child.next; 114 next = q->d_u.d_child.next;
115 goto cont; 115 goto cont;
@@ -267,7 +267,7 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
267 else 267 else
268 ino_count++; 268 ino_count++;
269 269
270 if (p->d_count > ino_count) { 270 if (d_count(p) > ino_count) {
271 top_ino->last_used = jiffies; 271 top_ino->last_used = jiffies;
272 dput(p); 272 dput(p);
273 return 1; 273 return 1;
@@ -409,7 +409,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
409 if (!exp_leaves) { 409 if (!exp_leaves) {
410 /* Path walk currently on this dentry? */ 410 /* Path walk currently on this dentry? */
411 ino_count = atomic_read(&ino->count) + 1; 411 ino_count = atomic_read(&ino->count) + 1;
412 if (dentry->d_count > ino_count) 412 if (d_count(dentry) > ino_count)
413 goto next; 413 goto next;
414 414
415 if (!autofs4_tree_busy(mnt, dentry, timeout, do_now)) { 415 if (!autofs4_tree_busy(mnt, dentry, timeout, do_now)) {
@@ -423,7 +423,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
423 } else { 423 } else {
424 /* Path walk currently on this dentry? */ 424 /* Path walk currently on this dentry? */
425 ino_count = atomic_read(&ino->count) + 1; 425 ino_count = atomic_read(&ino->count) + 1;
426 if (dentry->d_count > ino_count) 426 if (d_count(dentry) > ino_count)
427 goto next; 427 goto next;
428 428
429 expired = autofs4_check_leaves(mnt, dentry, timeout, do_now); 429 expired = autofs4_check_leaves(mnt, dentry, timeout, do_now);
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index ca8e55548d98..92ef341ba0cf 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -179,7 +179,7 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
179 spin_lock(&active->d_lock); 179 spin_lock(&active->d_lock);
180 180
181 /* Already gone? */ 181 /* Already gone? */
182 if (active->d_count == 0) 182 if (!d_count(active))
183 goto next; 183 goto next;
184 184
185 qstr = &active->d_name; 185 qstr = &active->d_name;
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index bce87694f7b0..89dec7f789a4 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -255,8 +255,6 @@ static int load_aout_binary(struct linux_binprm * bprm)
255 (current->mm->start_data = N_DATADDR(ex)); 255 (current->mm->start_data = N_DATADDR(ex));
256 current->mm->brk = ex.a_bss + 256 current->mm->brk = ex.a_bss +
257 (current->mm->start_brk = N_BSSADDR(ex)); 257 (current->mm->start_brk = N_BSSADDR(ex));
258 current->mm->free_area_cache = current->mm->mmap_base;
259 current->mm->cached_hole_size = 0;
260 258
261 retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT); 259 retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT);
262 if (retval < 0) { 260 if (retval < 0) {
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index f8a0b0efda44..100edcc5e312 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -738,8 +738,6 @@ static int load_elf_binary(struct linux_binprm *bprm)
738 738
739 /* Do this so that we can load the interpreter, if need be. We will 739 /* Do this so that we can load the interpreter, if need be. We will
740 change some of these later */ 740 change some of these later */
741 current->mm->free_area_cache = current->mm->mmap_base;
742 current->mm->cached_hole_size = 0;
743 retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP), 741 retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP),
744 executable_stack); 742 executable_stack);
745 if (retval < 0) { 743 if (retval < 0) {
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 2091db8cdd78..c7bda5cd3da7 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -58,17 +58,24 @@ static void bdev_inode_switch_bdi(struct inode *inode,
58 struct backing_dev_info *dst) 58 struct backing_dev_info *dst)
59{ 59{
60 struct backing_dev_info *old = inode->i_data.backing_dev_info; 60 struct backing_dev_info *old = inode->i_data.backing_dev_info;
61 bool wakeup_bdi = false;
61 62
62 if (unlikely(dst == old)) /* deadlock avoidance */ 63 if (unlikely(dst == old)) /* deadlock avoidance */
63 return; 64 return;
64 bdi_lock_two(&old->wb, &dst->wb); 65 bdi_lock_two(&old->wb, &dst->wb);
65 spin_lock(&inode->i_lock); 66 spin_lock(&inode->i_lock);
66 inode->i_data.backing_dev_info = dst; 67 inode->i_data.backing_dev_info = dst;
67 if (inode->i_state & I_DIRTY) 68 if (inode->i_state & I_DIRTY) {
69 if (bdi_cap_writeback_dirty(dst) && !wb_has_dirty_io(&dst->wb))
70 wakeup_bdi = true;
68 list_move(&inode->i_wb_list, &dst->wb.b_dirty); 71 list_move(&inode->i_wb_list, &dst->wb.b_dirty);
72 }
69 spin_unlock(&inode->i_lock); 73 spin_unlock(&inode->i_lock);
70 spin_unlock(&old->wb.list_lock); 74 spin_unlock(&old->wb.list_lock);
71 spin_unlock(&dst->wb.list_lock); 75 spin_unlock(&dst->wb.list_lock);
76
77 if (wakeup_bdi)
78 bdi_wakeup_thread_delayed(dst);
72} 79}
73 80
74/* Kill _all_ buffers and pagecache , dirty or not.. */ 81/* Kill _all_ buffers and pagecache , dirty or not.. */
@@ -325,31 +332,10 @@ static int blkdev_write_end(struct file *file, struct address_space *mapping,
325static loff_t block_llseek(struct file *file, loff_t offset, int whence) 332static loff_t block_llseek(struct file *file, loff_t offset, int whence)
326{ 333{
327 struct inode *bd_inode = file->f_mapping->host; 334 struct inode *bd_inode = file->f_mapping->host;
328 loff_t size;
329 loff_t retval; 335 loff_t retval;
330 336
331 mutex_lock(&bd_inode->i_mutex); 337 mutex_lock(&bd_inode->i_mutex);
332 size = i_size_read(bd_inode); 338 retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode));
333
334 retval = -EINVAL;
335 switch (whence) {
336 case SEEK_END:
337 offset += size;
338 break;
339 case SEEK_CUR:
340 offset += file->f_pos;
341 case SEEK_SET:
342 break;
343 default:
344 goto out;
345 }
346 if (offset >= 0 && offset <= size) {
347 if (offset != file->f_pos) {
348 file->f_pos = offset;
349 }
350 retval = offset;
351 }
352out:
353 mutex_unlock(&bd_inode->i_mutex); 339 mutex_unlock(&bd_inode->i_mutex);
354 return retval; 340 return retval;
355} 341}
@@ -1583,6 +1569,7 @@ static const struct address_space_operations def_blk_aops = {
1583 .writepages = generic_writepages, 1569 .writepages = generic_writepages,
1584 .releasepage = blkdev_releasepage, 1570 .releasepage = blkdev_releasepage,
1585 .direct_IO = blkdev_direct_IO, 1571 .direct_IO = blkdev_direct_IO,
1572 .is_dirty_writeback = buffer_check_dirty_writeback,
1586}; 1573};
1587 1574
1588const struct file_operations def_blk_fops = { 1575const struct file_operations def_blk_fops = {
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 290e347b6db3..eaf133384a8f 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -255,13 +255,11 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
255 * to a logical address 255 * to a logical address
256 */ 256 */
257static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, 257static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
258 int search_commit_root, 258 struct btrfs_path *path, u64 time_seq,
259 u64 time_seq, 259 struct __prelim_ref *ref,
260 struct __prelim_ref *ref, 260 struct ulist *parents,
261 struct ulist *parents, 261 const u64 *extent_item_pos)
262 const u64 *extent_item_pos)
263{ 262{
264 struct btrfs_path *path;
265 struct btrfs_root *root; 263 struct btrfs_root *root;
266 struct btrfs_key root_key; 264 struct btrfs_key root_key;
267 struct extent_buffer *eb; 265 struct extent_buffer *eb;
@@ -269,11 +267,6 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
269 int root_level; 267 int root_level;
270 int level = ref->level; 268 int level = ref->level;
271 269
272 path = btrfs_alloc_path();
273 if (!path)
274 return -ENOMEM;
275 path->search_commit_root = !!search_commit_root;
276
277 root_key.objectid = ref->root_id; 270 root_key.objectid = ref->root_id;
278 root_key.type = BTRFS_ROOT_ITEM_KEY; 271 root_key.type = BTRFS_ROOT_ITEM_KEY;
279 root_key.offset = (u64)-1; 272 root_key.offset = (u64)-1;
@@ -314,7 +307,8 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
314 time_seq, ref->wanted_disk_byte, 307 time_seq, ref->wanted_disk_byte,
315 extent_item_pos); 308 extent_item_pos);
316out: 309out:
317 btrfs_free_path(path); 310 path->lowest_level = 0;
311 btrfs_release_path(path);
318 return ret; 312 return ret;
319} 313}
320 314
@@ -322,7 +316,7 @@ out:
322 * resolve all indirect backrefs from the list 316 * resolve all indirect backrefs from the list
323 */ 317 */
324static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, 318static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
325 int search_commit_root, u64 time_seq, 319 struct btrfs_path *path, u64 time_seq,
326 struct list_head *head, 320 struct list_head *head,
327 const u64 *extent_item_pos) 321 const u64 *extent_item_pos)
328{ 322{
@@ -349,9 +343,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
349 continue; 343 continue;
350 if (ref->count == 0) 344 if (ref->count == 0)
351 continue; 345 continue;
352 err = __resolve_indirect_ref(fs_info, search_commit_root, 346 err = __resolve_indirect_ref(fs_info, path, time_seq, ref,
353 time_seq, ref, parents, 347 parents, extent_item_pos);
354 extent_item_pos);
355 if (err == -ENOMEM) 348 if (err == -ENOMEM)
356 goto out; 349 goto out;
357 if (err) 350 if (err)
@@ -604,6 +597,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
604 int slot; 597 int slot;
605 struct extent_buffer *leaf; 598 struct extent_buffer *leaf;
606 struct btrfs_key key; 599 struct btrfs_key key;
600 struct btrfs_key found_key;
607 unsigned long ptr; 601 unsigned long ptr;
608 unsigned long end; 602 unsigned long end;
609 struct btrfs_extent_item *ei; 603 struct btrfs_extent_item *ei;
@@ -621,17 +615,21 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
621 615
622 ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); 616 ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
623 flags = btrfs_extent_flags(leaf, ei); 617 flags = btrfs_extent_flags(leaf, ei);
618 btrfs_item_key_to_cpu(leaf, &found_key, slot);
624 619
625 ptr = (unsigned long)(ei + 1); 620 ptr = (unsigned long)(ei + 1);
626 end = (unsigned long)ei + item_size; 621 end = (unsigned long)ei + item_size;
627 622
628 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 623 if (found_key.type == BTRFS_EXTENT_ITEM_KEY &&
624 flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
629 struct btrfs_tree_block_info *info; 625 struct btrfs_tree_block_info *info;
630 626
631 info = (struct btrfs_tree_block_info *)ptr; 627 info = (struct btrfs_tree_block_info *)ptr;
632 *info_level = btrfs_tree_block_level(leaf, info); 628 *info_level = btrfs_tree_block_level(leaf, info);
633 ptr += sizeof(struct btrfs_tree_block_info); 629 ptr += sizeof(struct btrfs_tree_block_info);
634 BUG_ON(ptr > end); 630 BUG_ON(ptr > end);
631 } else if (found_key.type == BTRFS_METADATA_ITEM_KEY) {
632 *info_level = found_key.offset;
635 } else { 633 } else {
636 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA)); 634 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
637 } 635 }
@@ -795,7 +793,6 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
795 struct btrfs_delayed_ref_head *head; 793 struct btrfs_delayed_ref_head *head;
796 int info_level = 0; 794 int info_level = 0;
797 int ret; 795 int ret;
798 int search_commit_root = (trans == BTRFS_BACKREF_SEARCH_COMMIT_ROOT);
799 struct list_head prefs_delayed; 796 struct list_head prefs_delayed;
800 struct list_head prefs; 797 struct list_head prefs;
801 struct __prelim_ref *ref; 798 struct __prelim_ref *ref;
@@ -804,13 +801,17 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
804 INIT_LIST_HEAD(&prefs_delayed); 801 INIT_LIST_HEAD(&prefs_delayed);
805 802
806 key.objectid = bytenr; 803 key.objectid = bytenr;
807 key.type = BTRFS_EXTENT_ITEM_KEY;
808 key.offset = (u64)-1; 804 key.offset = (u64)-1;
805 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
806 key.type = BTRFS_METADATA_ITEM_KEY;
807 else
808 key.type = BTRFS_EXTENT_ITEM_KEY;
809 809
810 path = btrfs_alloc_path(); 810 path = btrfs_alloc_path();
811 if (!path) 811 if (!path)
812 return -ENOMEM; 812 return -ENOMEM;
813 path->search_commit_root = !!search_commit_root; 813 if (!trans)
814 path->search_commit_root = 1;
814 815
815 /* 816 /*
816 * grab both a lock on the path and a lock on the delayed ref head. 817 * grab both a lock on the path and a lock on the delayed ref head.
@@ -825,7 +826,7 @@ again:
825 goto out; 826 goto out;
826 BUG_ON(ret == 0); 827 BUG_ON(ret == 0);
827 828
828 if (trans != BTRFS_BACKREF_SEARCH_COMMIT_ROOT) { 829 if (trans) {
829 /* 830 /*
830 * look if there are updates for this ref queued and lock the 831 * look if there are updates for this ref queued and lock the
831 * head 832 * head
@@ -869,7 +870,8 @@ again:
869 slot = path->slots[0]; 870 slot = path->slots[0];
870 btrfs_item_key_to_cpu(leaf, &key, slot); 871 btrfs_item_key_to_cpu(leaf, &key, slot);
871 if (key.objectid == bytenr && 872 if (key.objectid == bytenr &&
872 key.type == BTRFS_EXTENT_ITEM_KEY) { 873 (key.type == BTRFS_EXTENT_ITEM_KEY ||
874 key.type == BTRFS_METADATA_ITEM_KEY)) {
873 ret = __add_inline_refs(fs_info, path, bytenr, 875 ret = __add_inline_refs(fs_info, path, bytenr,
874 &info_level, &prefs); 876 &info_level, &prefs);
875 if (ret) 877 if (ret)
@@ -890,8 +892,8 @@ again:
890 892
891 __merge_refs(&prefs, 1); 893 __merge_refs(&prefs, 1);
892 894
893 ret = __resolve_indirect_refs(fs_info, search_commit_root, time_seq, 895 ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs,
894 &prefs, extent_item_pos); 896 extent_item_pos);
895 if (ret) 897 if (ret)
896 goto out; 898 goto out;
897 899
@@ -1283,12 +1285,16 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
1283{ 1285{
1284 int ret; 1286 int ret;
1285 u64 flags; 1287 u64 flags;
1288 u64 size = 0;
1286 u32 item_size; 1289 u32 item_size;
1287 struct extent_buffer *eb; 1290 struct extent_buffer *eb;
1288 struct btrfs_extent_item *ei; 1291 struct btrfs_extent_item *ei;
1289 struct btrfs_key key; 1292 struct btrfs_key key;
1290 1293
1291 key.type = BTRFS_EXTENT_ITEM_KEY; 1294 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
1295 key.type = BTRFS_METADATA_ITEM_KEY;
1296 else
1297 key.type = BTRFS_EXTENT_ITEM_KEY;
1292 key.objectid = logical; 1298 key.objectid = logical;
1293 key.offset = (u64)-1; 1299 key.offset = (u64)-1;
1294 1300
@@ -1301,9 +1307,15 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
1301 return ret; 1307 return ret;
1302 1308
1303 btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]); 1309 btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
1304 if (found_key->type != BTRFS_EXTENT_ITEM_KEY || 1310 if (found_key->type == BTRFS_METADATA_ITEM_KEY)
1311 size = fs_info->extent_root->leafsize;
1312 else if (found_key->type == BTRFS_EXTENT_ITEM_KEY)
1313 size = found_key->offset;
1314
1315 if ((found_key->type != BTRFS_EXTENT_ITEM_KEY &&
1316 found_key->type != BTRFS_METADATA_ITEM_KEY) ||
1305 found_key->objectid > logical || 1317 found_key->objectid > logical ||
1306 found_key->objectid + found_key->offset <= logical) { 1318 found_key->objectid + size <= logical) {
1307 pr_debug("logical %llu is not within any extent\n", 1319 pr_debug("logical %llu is not within any extent\n",
1308 (unsigned long long)logical); 1320 (unsigned long long)logical);
1309 return -ENOENT; 1321 return -ENOENT;
@@ -1459,7 +1471,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
1459 iterate_extent_inodes_t *iterate, void *ctx) 1471 iterate_extent_inodes_t *iterate, void *ctx)
1460{ 1472{
1461 int ret; 1473 int ret;
1462 struct btrfs_trans_handle *trans; 1474 struct btrfs_trans_handle *trans = NULL;
1463 struct ulist *refs = NULL; 1475 struct ulist *refs = NULL;
1464 struct ulist *roots = NULL; 1476 struct ulist *roots = NULL;
1465 struct ulist_node *ref_node = NULL; 1477 struct ulist_node *ref_node = NULL;
@@ -1471,9 +1483,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
1471 pr_debug("resolving all inodes for extent %llu\n", 1483 pr_debug("resolving all inodes for extent %llu\n",
1472 extent_item_objectid); 1484 extent_item_objectid);
1473 1485
1474 if (search_commit_root) { 1486 if (!search_commit_root) {
1475 trans = BTRFS_BACKREF_SEARCH_COMMIT_ROOT;
1476 } else {
1477 trans = btrfs_join_transaction(fs_info->extent_root); 1487 trans = btrfs_join_transaction(fs_info->extent_root);
1478 if (IS_ERR(trans)) 1488 if (IS_ERR(trans))
1479 return PTR_ERR(trans); 1489 return PTR_ERR(trans);
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 0f446d7ca2c0..8f2e76702932 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -23,8 +23,6 @@
23#include "ulist.h" 23#include "ulist.h"
24#include "extent_io.h" 24#include "extent_io.h"
25 25
26#define BTRFS_BACKREF_SEARCH_COMMIT_ROOT ((struct btrfs_trans_handle *)0)
27
28struct inode_fs_paths { 26struct inode_fs_paths {
29 struct btrfs_path *btrfs_path; 27 struct btrfs_path *btrfs_path;
30 struct btrfs_root *fs_root; 28 struct btrfs_root *fs_root;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 02fae7f7e42c..5bf4c39e2ad6 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1089,7 +1089,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
1089 btrfs_set_node_ptr_generation(parent, parent_slot, 1089 btrfs_set_node_ptr_generation(parent, parent_slot,
1090 trans->transid); 1090 trans->transid);
1091 btrfs_mark_buffer_dirty(parent); 1091 btrfs_mark_buffer_dirty(parent);
1092 tree_mod_log_free_eb(root->fs_info, buf); 1092 if (last_ref)
1093 tree_mod_log_free_eb(root->fs_info, buf);
1093 btrfs_free_tree_block(trans, root, buf, parent_start, 1094 btrfs_free_tree_block(trans, root, buf, parent_start,
1094 last_ref); 1095 last_ref);
1095 } 1096 }
@@ -1161,8 +1162,8 @@ __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
1161 * time_seq). 1162 * time_seq).
1162 */ 1163 */
1163static void 1164static void
1164__tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq, 1165__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
1165 struct tree_mod_elem *first_tm) 1166 u64 time_seq, struct tree_mod_elem *first_tm)
1166{ 1167{
1167 u32 n; 1168 u32 n;
1168 struct rb_node *next; 1169 struct rb_node *next;
@@ -1172,6 +1173,7 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
1172 unsigned long p_size = sizeof(struct btrfs_key_ptr); 1173 unsigned long p_size = sizeof(struct btrfs_key_ptr);
1173 1174
1174 n = btrfs_header_nritems(eb); 1175 n = btrfs_header_nritems(eb);
1176 tree_mod_log_read_lock(fs_info);
1175 while (tm && tm->seq >= time_seq) { 1177 while (tm && tm->seq >= time_seq) {
1176 /* 1178 /*
1177 * all the operations are recorded with the operator used for 1179 * all the operations are recorded with the operator used for
@@ -1226,6 +1228,7 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
1226 if (tm->index != first_tm->index) 1228 if (tm->index != first_tm->index)
1227 break; 1229 break;
1228 } 1230 }
1231 tree_mod_log_read_unlock(fs_info);
1229 btrfs_set_header_nritems(eb, n); 1232 btrfs_set_header_nritems(eb, n);
1230} 1233}
1231 1234
@@ -1274,7 +1277,7 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
1274 1277
1275 extent_buffer_get(eb_rewin); 1278 extent_buffer_get(eb_rewin);
1276 btrfs_tree_read_lock(eb_rewin); 1279 btrfs_tree_read_lock(eb_rewin);
1277 __tree_mod_log_rewind(eb_rewin, time_seq, tm); 1280 __tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
1278 WARN_ON(btrfs_header_nritems(eb_rewin) > 1281 WARN_ON(btrfs_header_nritems(eb_rewin) >
1279 BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root)); 1282 BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root));
1280 1283
@@ -1350,7 +1353,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
1350 btrfs_set_header_generation(eb, old_generation); 1353 btrfs_set_header_generation(eb, old_generation);
1351 } 1354 }
1352 if (tm) 1355 if (tm)
1353 __tree_mod_log_rewind(eb, time_seq, tm); 1356 __tree_mod_log_rewind(root->fs_info, eb, time_seq, tm);
1354 else 1357 else
1355 WARN_ON(btrfs_header_level(eb) != 0); 1358 WARN_ON(btrfs_header_level(eb) != 0);
1356 WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root)); 1359 WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root));
@@ -2178,12 +2181,8 @@ static void reada_for_search(struct btrfs_root *root,
2178 } 2181 }
2179} 2182}
2180 2183
2181/* 2184static noinline void reada_for_balance(struct btrfs_root *root,
2182 * returns -EAGAIN if it had to drop the path, or zero if everything was in 2185 struct btrfs_path *path, int level)
2183 * cache
2184 */
2185static noinline int reada_for_balance(struct btrfs_root *root,
2186 struct btrfs_path *path, int level)
2187{ 2186{
2188 int slot; 2187 int slot;
2189 int nritems; 2188 int nritems;
@@ -2192,12 +2191,11 @@ static noinline int reada_for_balance(struct btrfs_root *root,
2192 u64 gen; 2191 u64 gen;
2193 u64 block1 = 0; 2192 u64 block1 = 0;
2194 u64 block2 = 0; 2193 u64 block2 = 0;
2195 int ret = 0;
2196 int blocksize; 2194 int blocksize;
2197 2195
2198 parent = path->nodes[level + 1]; 2196 parent = path->nodes[level + 1];
2199 if (!parent) 2197 if (!parent)
2200 return 0; 2198 return;
2201 2199
2202 nritems = btrfs_header_nritems(parent); 2200 nritems = btrfs_header_nritems(parent);
2203 slot = path->slots[level + 1]; 2201 slot = path->slots[level + 1];
@@ -2224,28 +2222,11 @@ static noinline int reada_for_balance(struct btrfs_root *root,
2224 block2 = 0; 2222 block2 = 0;
2225 free_extent_buffer(eb); 2223 free_extent_buffer(eb);
2226 } 2224 }
2227 if (block1 || block2) {
2228 ret = -EAGAIN;
2229
2230 /* release the whole path */
2231 btrfs_release_path(path);
2232
2233 /* read the blocks */
2234 if (block1)
2235 readahead_tree_block(root, block1, blocksize, 0);
2236 if (block2)
2237 readahead_tree_block(root, block2, blocksize, 0);
2238 2225
2239 if (block1) { 2226 if (block1)
2240 eb = read_tree_block(root, block1, blocksize, 0); 2227 readahead_tree_block(root, block1, blocksize, 0);
2241 free_extent_buffer(eb); 2228 if (block2)
2242 } 2229 readahead_tree_block(root, block2, blocksize, 0);
2243 if (block2) {
2244 eb = read_tree_block(root, block2, blocksize, 0);
2245 free_extent_buffer(eb);
2246 }
2247 }
2248 return ret;
2249} 2230}
2250 2231
2251 2232
@@ -2359,35 +2340,28 @@ read_block_for_search(struct btrfs_trans_handle *trans,
2359 tmp = btrfs_find_tree_block(root, blocknr, blocksize); 2340 tmp = btrfs_find_tree_block(root, blocknr, blocksize);
2360 if (tmp) { 2341 if (tmp) {
2361 /* first we do an atomic uptodate check */ 2342 /* first we do an atomic uptodate check */
2362 if (btrfs_buffer_uptodate(tmp, 0, 1) > 0) { 2343 if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
2363 if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) { 2344 *eb_ret = tmp;
2364 /* 2345 return 0;
2365 * we found an up to date block without 2346 }
2366 * sleeping, return
2367 * right away
2368 */
2369 *eb_ret = tmp;
2370 return 0;
2371 }
2372 /* the pages were up to date, but we failed
2373 * the generation number check. Do a full
2374 * read for the generation number that is correct.
2375 * We must do this without dropping locks so
2376 * we can trust our generation number
2377 */
2378 free_extent_buffer(tmp);
2379 btrfs_set_path_blocking(p);
2380 2347
2381 /* now we're allowed to do a blocking uptodate check */ 2348 /* the pages were up to date, but we failed
2382 tmp = read_tree_block(root, blocknr, blocksize, gen); 2349 * the generation number check. Do a full
2383 if (tmp && btrfs_buffer_uptodate(tmp, gen, 0) > 0) { 2350 * read for the generation number that is correct.
2384 *eb_ret = tmp; 2351 * We must do this without dropping locks so
2385 return 0; 2352 * we can trust our generation number
2386 } 2353 */
2387 free_extent_buffer(tmp); 2354 btrfs_set_path_blocking(p);
2388 btrfs_release_path(p); 2355
2389 return -EIO; 2356 /* now we're allowed to do a blocking uptodate check */
2357 ret = btrfs_read_buffer(tmp, gen);
2358 if (!ret) {
2359 *eb_ret = tmp;
2360 return 0;
2390 } 2361 }
2362 free_extent_buffer(tmp);
2363 btrfs_release_path(p);
2364 return -EIO;
2391 } 2365 }
2392 2366
2393 /* 2367 /*
@@ -2448,11 +2422,8 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
2448 goto again; 2422 goto again;
2449 } 2423 }
2450 2424
2451 sret = reada_for_balance(root, p, level);
2452 if (sret)
2453 goto again;
2454
2455 btrfs_set_path_blocking(p); 2425 btrfs_set_path_blocking(p);
2426 reada_for_balance(root, p, level);
2456 sret = split_node(trans, root, p, level); 2427 sret = split_node(trans, root, p, level);
2457 btrfs_clear_path_blocking(p, NULL, 0); 2428 btrfs_clear_path_blocking(p, NULL, 0);
2458 2429
@@ -2472,11 +2443,8 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
2472 goto again; 2443 goto again;
2473 } 2444 }
2474 2445
2475 sret = reada_for_balance(root, p, level);
2476 if (sret)
2477 goto again;
2478
2479 btrfs_set_path_blocking(p); 2446 btrfs_set_path_blocking(p);
2447 reada_for_balance(root, p, level);
2480 sret = balance_level(trans, root, p, level); 2448 sret = balance_level(trans, root, p, level);
2481 btrfs_clear_path_blocking(p, NULL, 0); 2449 btrfs_clear_path_blocking(p, NULL, 0);
2482 2450
@@ -3143,7 +3111,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
3143 */ 3111 */
3144static noinline int insert_new_root(struct btrfs_trans_handle *trans, 3112static noinline int insert_new_root(struct btrfs_trans_handle *trans,
3145 struct btrfs_root *root, 3113 struct btrfs_root *root,
3146 struct btrfs_path *path, int level, int log_removal) 3114 struct btrfs_path *path, int level)
3147{ 3115{
3148 u64 lower_gen; 3116 u64 lower_gen;
3149 struct extent_buffer *lower; 3117 struct extent_buffer *lower;
@@ -3194,7 +3162,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
3194 btrfs_mark_buffer_dirty(c); 3162 btrfs_mark_buffer_dirty(c);
3195 3163
3196 old = root->node; 3164 old = root->node;
3197 tree_mod_log_set_root_pointer(root, c, log_removal); 3165 tree_mod_log_set_root_pointer(root, c, 0);
3198 rcu_assign_pointer(root->node, c); 3166 rcu_assign_pointer(root->node, c);
3199 3167
3200 /* the super has an extra ref to root->node */ 3168 /* the super has an extra ref to root->node */
@@ -3278,14 +3246,14 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
3278 /* 3246 /*
3279 * trying to split the root, lets make a new one 3247 * trying to split the root, lets make a new one
3280 * 3248 *
3281 * tree mod log: We pass 0 as log_removal parameter to 3249 * tree mod log: We don't log_removal old root in
3282 * insert_new_root, because that root buffer will be kept as a 3250 * insert_new_root, because that root buffer will be kept as a
3283 * normal node. We are going to log removal of half of the 3251 * normal node. We are going to log removal of half of the
3284 * elements below with tree_mod_log_eb_copy. We're holding a 3252 * elements below with tree_mod_log_eb_copy. We're holding a
3285 * tree lock on the buffer, which is why we cannot race with 3253 * tree lock on the buffer, which is why we cannot race with
3286 * other tree_mod_log users. 3254 * other tree_mod_log users.
3287 */ 3255 */
3288 ret = insert_new_root(trans, root, path, level + 1, 0); 3256 ret = insert_new_root(trans, root, path, level + 1);
3289 if (ret) 3257 if (ret)
3290 return ret; 3258 return ret;
3291 } else { 3259 } else {
@@ -3986,7 +3954,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
3986 return -EOVERFLOW; 3954 return -EOVERFLOW;
3987 3955
3988 /* first try to make some room by pushing left and right */ 3956 /* first try to make some room by pushing left and right */
3989 if (data_size) { 3957 if (data_size && path->nodes[1]) {
3990 wret = push_leaf_right(trans, root, path, data_size, 3958 wret = push_leaf_right(trans, root, path, data_size,
3991 data_size, 0, 0); 3959 data_size, 0, 0);
3992 if (wret < 0) 3960 if (wret < 0)
@@ -4005,7 +3973,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
4005 } 3973 }
4006 3974
4007 if (!path->nodes[1]) { 3975 if (!path->nodes[1]) {
4008 ret = insert_new_root(trans, root, path, 1, 1); 3976 ret = insert_new_root(trans, root, path, 1);
4009 if (ret) 3977 if (ret)
4010 return ret; 3978 return ret;
4011 } 3979 }
@@ -4430,7 +4398,7 @@ void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
4430} 4398}
4431 4399
4432/* 4400/*
4433 * make the item pointed to by the path bigger, data_size is the new size. 4401 * make the item pointed to by the path bigger, data_size is the added size.
4434 */ 4402 */
4435void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path, 4403void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
4436 u32 data_size) 4404 u32 data_size)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index d6dd49b51ba8..e795bf135e80 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -961,8 +961,8 @@ struct btrfs_dev_replace_item {
961#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4) 961#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4)
962#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5) 962#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5)
963#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6) 963#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6)
964#define BTRFS_BLOCK_GROUP_RAID5 (1 << 7) 964#define BTRFS_BLOCK_GROUP_RAID5 (1ULL << 7)
965#define BTRFS_BLOCK_GROUP_RAID6 (1 << 8) 965#define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8)
966#define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE 966#define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE
967 967
968enum btrfs_raid_types { 968enum btrfs_raid_types {
@@ -1102,6 +1102,18 @@ struct btrfs_space_info {
1102 account */ 1102 account */
1103 1103
1104 /* 1104 /*
1105 * bytes_pinned is kept in line with what is actually pinned, as in
1106 * we've called update_block_group and dropped the bytes_used counter
1107 * and increased the bytes_pinned counter. However this means that
1108 * bytes_pinned does not reflect the bytes that will be pinned once the
1109 * delayed refs are flushed, so this counter is inc'ed everytime we call
1110 * btrfs_free_extent so it is a realtime count of what will be freed
1111 * once the transaction is committed. It will be zero'ed everytime the
1112 * transaction commits.
1113 */
1114 struct percpu_counter total_bytes_pinned;
1115
1116 /*
1105 * we bump reservation progress every time we decrement 1117 * we bump reservation progress every time we decrement
1106 * bytes_reserved. This way people waiting for reservations 1118 * bytes_reserved. This way people waiting for reservations
1107 * know something good has happened and they can check 1119 * know something good has happened and they can check
@@ -1437,25 +1449,22 @@ struct btrfs_fs_info {
1437 atomic_t open_ioctl_trans; 1449 atomic_t open_ioctl_trans;
1438 1450
1439 /* 1451 /*
1440 * this is used by the balancing code to wait for all the pending 1452 * this is used to protect the following list -- ordered_roots.
1441 * ordered extents
1442 */ 1453 */
1443 spinlock_t ordered_extent_lock; 1454 spinlock_t ordered_root_lock;
1444 1455
1445 /* 1456 /*
1446 * all of the data=ordered extents pending writeback 1457 * all fs/file tree roots in which there are data=ordered extents
1458 * pending writeback are added into this list.
1459 *
1447 * these can span multiple transactions and basically include 1460 * these can span multiple transactions and basically include
1448 * every dirty data page that isn't from nodatacow 1461 * every dirty data page that isn't from nodatacow
1449 */ 1462 */
1450 struct list_head ordered_extents; 1463 struct list_head ordered_roots;
1451 1464
1452 spinlock_t delalloc_lock; 1465 spinlock_t delalloc_root_lock;
1453 /* 1466 /* all fs/file tree roots that have delalloc inodes. */
1454 * all of the inodes that have delalloc bytes. It is possible for 1467 struct list_head delalloc_roots;
1455 * this list to be empty even when there is still dirty data=ordered
1456 * extents waiting to finish IO.
1457 */
1458 struct list_head delalloc_inodes;
1459 1468
1460 /* 1469 /*
1461 * there is a pool of worker threads for checksumming during writes 1470 * there is a pool of worker threads for checksumming during writes
@@ -1498,8 +1507,6 @@ struct btrfs_fs_info {
1498 int do_barriers; 1507 int do_barriers;
1499 int closing; 1508 int closing;
1500 int log_root_recovering; 1509 int log_root_recovering;
1501 int enospc_unlink;
1502 int trans_no_join;
1503 1510
1504 u64 total_pinned; 1511 u64 total_pinned;
1505 1512
@@ -1594,6 +1601,12 @@ struct btrfs_fs_info {
1594 struct rb_root qgroup_tree; 1601 struct rb_root qgroup_tree;
1595 spinlock_t qgroup_lock; 1602 spinlock_t qgroup_lock;
1596 1603
1604 /*
1605 * used to avoid frequently calling ulist_alloc()/ulist_free()
1606 * when doing qgroup accounting, it must be protected by qgroup_lock.
1607 */
1608 struct ulist *qgroup_ulist;
1609
1597 /* protect user change for quota operations */ 1610 /* protect user change for quota operations */
1598 struct mutex qgroup_ioctl_lock; 1611 struct mutex qgroup_ioctl_lock;
1599 1612
@@ -1607,6 +1620,8 @@ struct btrfs_fs_info {
1607 struct mutex qgroup_rescan_lock; /* protects the progress item */ 1620 struct mutex qgroup_rescan_lock; /* protects the progress item */
1608 struct btrfs_key qgroup_rescan_progress; 1621 struct btrfs_key qgroup_rescan_progress;
1609 struct btrfs_workers qgroup_rescan_workers; 1622 struct btrfs_workers qgroup_rescan_workers;
1623 struct completion qgroup_rescan_completion;
1624 struct btrfs_work qgroup_rescan_work;
1610 1625
1611 /* filesystem state */ 1626 /* filesystem state */
1612 unsigned long fs_state; 1627 unsigned long fs_state;
@@ -1739,6 +1754,31 @@ struct btrfs_root {
1739 int force_cow; 1754 int force_cow;
1740 1755
1741 spinlock_t root_item_lock; 1756 spinlock_t root_item_lock;
1757 atomic_t refs;
1758
1759 spinlock_t delalloc_lock;
1760 /*
1761 * all of the inodes that have delalloc bytes. It is possible for
1762 * this list to be empty even when there is still dirty data=ordered
1763 * extents waiting to finish IO.
1764 */
1765 struct list_head delalloc_inodes;
1766 struct list_head delalloc_root;
1767 u64 nr_delalloc_inodes;
1768 /*
1769 * this is used by the balancing code to wait for all the pending
1770 * ordered extents
1771 */
1772 spinlock_t ordered_extent_lock;
1773
1774 /*
1775 * all of the data=ordered extents pending writeback
1776 * these can span multiple transactions and basically include
1777 * every dirty data page that isn't from nodatacow
1778 */
1779 struct list_head ordered_extents;
1780 struct list_head ordered_root;
1781 u64 nr_ordered_extents;
1742}; 1782};
1743 1783
1744struct btrfs_ioctl_defrag_range_args { 1784struct btrfs_ioctl_defrag_range_args {
@@ -3028,6 +3068,8 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root,
3028 num_items; 3068 num_items;
3029} 3069}
3030 3070
3071int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
3072 struct btrfs_root *root);
3031void btrfs_put_block_group(struct btrfs_block_group_cache *cache); 3073void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
3032int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 3074int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
3033 struct btrfs_root *root, unsigned long count); 3075 struct btrfs_root *root, unsigned long count);
@@ -3039,6 +3081,8 @@ int btrfs_pin_extent(struct btrfs_root *root,
3039 u64 bytenr, u64 num, int reserved); 3081 u64 bytenr, u64 num, int reserved);
3040int btrfs_pin_extent_for_log_replay(struct btrfs_root *root, 3082int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
3041 u64 bytenr, u64 num_bytes); 3083 u64 bytenr, u64 num_bytes);
3084int btrfs_exclude_logged_extents(struct btrfs_root *root,
3085 struct extent_buffer *eb);
3042int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, 3086int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
3043 struct btrfs_root *root, 3087 struct btrfs_root *root,
3044 u64 objectid, u64 offset, u64 bytenr); 3088 u64 objectid, u64 offset, u64 bytenr);
@@ -3155,6 +3199,9 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
3155int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 3199int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
3156 struct btrfs_block_rsv *dst_rsv, 3200 struct btrfs_block_rsv *dst_rsv,
3157 u64 num_bytes); 3201 u64 num_bytes);
3202int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
3203 struct btrfs_block_rsv *dest, u64 num_bytes,
3204 int min_factor);
3158void btrfs_block_rsv_release(struct btrfs_root *root, 3205void btrfs_block_rsv_release(struct btrfs_root *root,
3159 struct btrfs_block_rsv *block_rsv, 3206 struct btrfs_block_rsv *block_rsv,
3160 u64 num_bytes); 3207 u64 num_bytes);
@@ -3311,6 +3358,18 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
3311 smp_mb(); 3358 smp_mb();
3312 return fs_info->closing; 3359 return fs_info->closing;
3313} 3360}
3361
3362/*
3363 * If we remount the fs to be R/O or umount the fs, the cleaner needn't do
3364 * anything except sleeping. This function is used to check the status of
3365 * the fs.
3366 */
3367static inline int btrfs_need_cleaner_sleep(struct btrfs_root *root)
3368{
3369 return (root->fs_info->sb->s_flags & MS_RDONLY ||
3370 btrfs_fs_closing(root->fs_info));
3371}
3372
3314static inline void free_fs_info(struct btrfs_fs_info *fs_info) 3373static inline void free_fs_info(struct btrfs_fs_info *fs_info)
3315{ 3374{
3316 kfree(fs_info->balance_ctl); 3375 kfree(fs_info->balance_ctl);
@@ -3357,9 +3416,9 @@ int __must_check btrfs_update_root(struct btrfs_trans_handle *trans,
3357 struct btrfs_root_item *item); 3416 struct btrfs_root_item *item);
3358void btrfs_read_root_item(struct extent_buffer *eb, int slot, 3417void btrfs_read_root_item(struct extent_buffer *eb, int slot,
3359 struct btrfs_root_item *item); 3418 struct btrfs_root_item *item);
3360int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct 3419int btrfs_find_root(struct btrfs_root *root, struct btrfs_key *search_key,
3361 btrfs_root_item *item, struct btrfs_key *key); 3420 struct btrfs_path *path, struct btrfs_root_item *root_item,
3362int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid); 3421 struct btrfs_key *root_key);
3363int btrfs_find_orphan_roots(struct btrfs_root *tree_root); 3422int btrfs_find_orphan_roots(struct btrfs_root *tree_root);
3364void btrfs_set_root_node(struct btrfs_root_item *item, 3423void btrfs_set_root_node(struct btrfs_root_item *item,
3365 struct extent_buffer *node); 3424 struct extent_buffer *node);
@@ -3493,6 +3552,10 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work);
3493struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page, 3552struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
3494 size_t pg_offset, u64 start, u64 len, 3553 size_t pg_offset, u64 start, u64 len,
3495 int create); 3554 int create);
3555noinline int can_nocow_extent(struct btrfs_trans_handle *trans,
3556 struct inode *inode, u64 offset, u64 *len,
3557 u64 *orig_start, u64 *orig_block_len,
3558 u64 *ram_bytes);
3496 3559
3497/* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */ 3560/* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */
3498#if defined(ClearPageFsMisc) && !defined(ClearPageChecked) 3561#if defined(ClearPageFsMisc) && !defined(ClearPageChecked)
@@ -3530,6 +3593,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
3530 u32 min_type); 3593 u32 min_type);
3531 3594
3532int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); 3595int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
3596int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info,
3597 int delay_iput);
3533int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, 3598int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
3534 struct extent_state **cached_state); 3599 struct extent_state **cached_state);
3535int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, 3600int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
@@ -3814,6 +3879,8 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans,
3814int btrfs_quota_disable(struct btrfs_trans_handle *trans, 3879int btrfs_quota_disable(struct btrfs_trans_handle *trans,
3815 struct btrfs_fs_info *fs_info); 3880 struct btrfs_fs_info *fs_info);
3816int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info); 3881int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info);
3882void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info);
3883int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info);
3817int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, 3884int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
3818 struct btrfs_fs_info *fs_info, u64 src, u64 dst); 3885 struct btrfs_fs_info *fs_info, u64 src, u64 dst);
3819int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, 3886int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index eb34438ddedb..375510913fe7 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -535,20 +535,6 @@ static struct btrfs_delayed_item *__btrfs_next_delayed_item(
535 return next; 535 return next;
536} 536}
537 537
538static inline struct btrfs_root *btrfs_get_fs_root(struct btrfs_root *root,
539 u64 root_id)
540{
541 struct btrfs_key root_key;
542
543 if (root->objectid == root_id)
544 return root;
545
546 root_key.objectid = root_id;
547 root_key.type = BTRFS_ROOT_ITEM_KEY;
548 root_key.offset = (u64)-1;
549 return btrfs_read_fs_root_no_name(root->fs_info, &root_key);
550}
551
552static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, 538static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
553 struct btrfs_root *root, 539 struct btrfs_root *root,
554 struct btrfs_delayed_item *item) 540 struct btrfs_delayed_item *item)
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 65241f32d3f8..4253ad580e39 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -400,7 +400,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
400 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; 400 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
401 btrfs_dev_replace_unlock(dev_replace); 401 btrfs_dev_replace_unlock(dev_replace);
402 402
403 btrfs_wait_ordered_extents(root, 0); 403 btrfs_wait_all_ordered_extents(root->fs_info, 0);
404 404
405 /* force writing the updated state information to disk */ 405 /* force writing the updated state information to disk */
406 trans = btrfs_start_transaction(root, 0); 406 trans = btrfs_start_transaction(root, 0);
@@ -470,12 +470,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
470 * flush all outstanding I/O and inode extent mappings before the 470 * flush all outstanding I/O and inode extent mappings before the
471 * copy operation is declared as being finished 471 * copy operation is declared as being finished
472 */ 472 */
473 ret = btrfs_start_delalloc_inodes(root, 0); 473 ret = btrfs_start_all_delalloc_inodes(root->fs_info, 0);
474 if (ret) { 474 if (ret) {
475 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 475 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
476 return ret; 476 return ret;
477 } 477 }
478 btrfs_wait_ordered_extents(root, 0); 478 btrfs_wait_all_ordered_extents(root->fs_info, 0);
479 479
480 trans = btrfs_start_transaction(root, 0); 480 trans = btrfs_start_transaction(root, 0);
481 if (IS_ERR(trans)) { 481 if (IS_ERR(trans)) {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b0292b3ead54..6b092a1c4e37 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1192,6 +1192,8 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1192 root->objectid = objectid; 1192 root->objectid = objectid;
1193 root->last_trans = 0; 1193 root->last_trans = 0;
1194 root->highest_objectid = 0; 1194 root->highest_objectid = 0;
1195 root->nr_delalloc_inodes = 0;
1196 root->nr_ordered_extents = 0;
1195 root->name = NULL; 1197 root->name = NULL;
1196 root->inode_tree = RB_ROOT; 1198 root->inode_tree = RB_ROOT;
1197 INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC); 1199 INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
@@ -1200,10 +1202,16 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1200 1202
1201 INIT_LIST_HEAD(&root->dirty_list); 1203 INIT_LIST_HEAD(&root->dirty_list);
1202 INIT_LIST_HEAD(&root->root_list); 1204 INIT_LIST_HEAD(&root->root_list);
1205 INIT_LIST_HEAD(&root->delalloc_inodes);
1206 INIT_LIST_HEAD(&root->delalloc_root);
1207 INIT_LIST_HEAD(&root->ordered_extents);
1208 INIT_LIST_HEAD(&root->ordered_root);
1203 INIT_LIST_HEAD(&root->logged_list[0]); 1209 INIT_LIST_HEAD(&root->logged_list[0]);
1204 INIT_LIST_HEAD(&root->logged_list[1]); 1210 INIT_LIST_HEAD(&root->logged_list[1]);
1205 spin_lock_init(&root->orphan_lock); 1211 spin_lock_init(&root->orphan_lock);
1206 spin_lock_init(&root->inode_lock); 1212 spin_lock_init(&root->inode_lock);
1213 spin_lock_init(&root->delalloc_lock);
1214 spin_lock_init(&root->ordered_extent_lock);
1207 spin_lock_init(&root->accounting_lock); 1215 spin_lock_init(&root->accounting_lock);
1208 spin_lock_init(&root->log_extents_lock[0]); 1216 spin_lock_init(&root->log_extents_lock[0]);
1209 spin_lock_init(&root->log_extents_lock[1]); 1217 spin_lock_init(&root->log_extents_lock[1]);
@@ -1217,6 +1225,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1217 atomic_set(&root->log_writers, 0); 1225 atomic_set(&root->log_writers, 0);
1218 atomic_set(&root->log_batch, 0); 1226 atomic_set(&root->log_batch, 0);
1219 atomic_set(&root->orphan_inodes, 0); 1227 atomic_set(&root->orphan_inodes, 0);
1228 atomic_set(&root->refs, 1);
1220 root->log_transid = 0; 1229 root->log_transid = 0;
1221 root->last_log_commit = 0; 1230 root->last_log_commit = 0;
1222 extent_io_tree_init(&root->dirty_log_pages, 1231 extent_io_tree_init(&root->dirty_log_pages,
@@ -1235,39 +1244,6 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1235 spin_lock_init(&root->root_item_lock); 1244 spin_lock_init(&root->root_item_lock);
1236} 1245}
1237 1246
1238static int __must_check find_and_setup_root(struct btrfs_root *tree_root,
1239 struct btrfs_fs_info *fs_info,
1240 u64 objectid,
1241 struct btrfs_root *root)
1242{
1243 int ret;
1244 u32 blocksize;
1245 u64 generation;
1246
1247 __setup_root(tree_root->nodesize, tree_root->leafsize,
1248 tree_root->sectorsize, tree_root->stripesize,
1249 root, fs_info, objectid);
1250 ret = btrfs_find_last_root(tree_root, objectid,
1251 &root->root_item, &root->root_key);
1252 if (ret > 0)
1253 return -ENOENT;
1254 else if (ret < 0)
1255 return ret;
1256
1257 generation = btrfs_root_generation(&root->root_item);
1258 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
1259 root->commit_root = NULL;
1260 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
1261 blocksize, generation);
1262 if (!root->node || !btrfs_buffer_uptodate(root->node, generation, 0)) {
1263 free_extent_buffer(root->node);
1264 root->node = NULL;
1265 return -EIO;
1266 }
1267 root->commit_root = btrfs_root_node(root);
1268 return 0;
1269}
1270
1271static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info) 1247static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info)
1272{ 1248{
1273 struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS); 1249 struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS);
@@ -1452,70 +1428,73 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
1452 return 0; 1428 return 0;
1453} 1429}
1454 1430
1455struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, 1431struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
1456 struct btrfs_key *location) 1432 struct btrfs_key *key)
1457{ 1433{
1458 struct btrfs_root *root; 1434 struct btrfs_root *root;
1459 struct btrfs_fs_info *fs_info = tree_root->fs_info; 1435 struct btrfs_fs_info *fs_info = tree_root->fs_info;
1460 struct btrfs_path *path; 1436 struct btrfs_path *path;
1461 struct extent_buffer *l;
1462 u64 generation; 1437 u64 generation;
1463 u32 blocksize; 1438 u32 blocksize;
1464 int ret = 0; 1439 int ret;
1465 int slot;
1466 1440
1467 root = btrfs_alloc_root(fs_info); 1441 path = btrfs_alloc_path();
1468 if (!root) 1442 if (!path)
1469 return ERR_PTR(-ENOMEM); 1443 return ERR_PTR(-ENOMEM);
1470 if (location->offset == (u64)-1) { 1444
1471 ret = find_and_setup_root(tree_root, fs_info, 1445 root = btrfs_alloc_root(fs_info);
1472 location->objectid, root); 1446 if (!root) {
1473 if (ret) { 1447 ret = -ENOMEM;
1474 kfree(root); 1448 goto alloc_fail;
1475 return ERR_PTR(ret);
1476 }
1477 goto out;
1478 } 1449 }
1479 1450
1480 __setup_root(tree_root->nodesize, tree_root->leafsize, 1451 __setup_root(tree_root->nodesize, tree_root->leafsize,
1481 tree_root->sectorsize, tree_root->stripesize, 1452 tree_root->sectorsize, tree_root->stripesize,
1482 root, fs_info, location->objectid); 1453 root, fs_info, key->objectid);
1483 1454
1484 path = btrfs_alloc_path(); 1455 ret = btrfs_find_root(tree_root, key, path,
1485 if (!path) { 1456 &root->root_item, &root->root_key);
1486 kfree(root);
1487 return ERR_PTR(-ENOMEM);
1488 }
1489 ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
1490 if (ret == 0) {
1491 l = path->nodes[0];
1492 slot = path->slots[0];
1493 btrfs_read_root_item(l, slot, &root->root_item);
1494 memcpy(&root->root_key, location, sizeof(*location));
1495 }
1496 btrfs_free_path(path);
1497 if (ret) { 1457 if (ret) {
1498 kfree(root);
1499 if (ret > 0) 1458 if (ret > 0)
1500 ret = -ENOENT; 1459 ret = -ENOENT;
1501 return ERR_PTR(ret); 1460 goto find_fail;
1502 } 1461 }
1503 1462
1504 generation = btrfs_root_generation(&root->root_item); 1463 generation = btrfs_root_generation(&root->root_item);
1505 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); 1464 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
1506 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), 1465 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
1507 blocksize, generation); 1466 blocksize, generation);
1508 if (!root->node || !extent_buffer_uptodate(root->node)) { 1467 if (!root->node) {
1509 ret = (!root->node) ? -ENOMEM : -EIO; 1468 ret = -ENOMEM;
1510 1469 goto find_fail;
1511 free_extent_buffer(root->node); 1470 } else if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
1512 kfree(root); 1471 ret = -EIO;
1513 return ERR_PTR(ret); 1472 goto read_fail;
1514 } 1473 }
1515
1516 root->commit_root = btrfs_root_node(root); 1474 root->commit_root = btrfs_root_node(root);
1517out: 1475out:
1518 if (location->objectid != BTRFS_TREE_LOG_OBJECTID) { 1476 btrfs_free_path(path);
1477 return root;
1478
1479read_fail:
1480 free_extent_buffer(root->node);
1481find_fail:
1482 kfree(root);
1483alloc_fail:
1484 root = ERR_PTR(ret);
1485 goto out;
1486}
1487
1488struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
1489 struct btrfs_key *location)
1490{
1491 struct btrfs_root *root;
1492
1493 root = btrfs_read_tree_root(tree_root, location);
1494 if (IS_ERR(root))
1495 return root;
1496
1497 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
1519 root->ref_cows = 1; 1498 root->ref_cows = 1;
1520 btrfs_check_and_init_root_item(&root->root_item); 1499 btrfs_check_and_init_root_item(&root->root_item);
1521 } 1500 }
@@ -1523,6 +1502,66 @@ out:
1523 return root; 1502 return root;
1524} 1503}
1525 1504
1505int btrfs_init_fs_root(struct btrfs_root *root)
1506{
1507 int ret;
1508
1509 root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
1510 root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
1511 GFP_NOFS);
1512 if (!root->free_ino_pinned || !root->free_ino_ctl) {
1513 ret = -ENOMEM;
1514 goto fail;
1515 }
1516
1517 btrfs_init_free_ino_ctl(root);
1518 mutex_init(&root->fs_commit_mutex);
1519 spin_lock_init(&root->cache_lock);
1520 init_waitqueue_head(&root->cache_wait);
1521
1522 ret = get_anon_bdev(&root->anon_dev);
1523 if (ret)
1524 goto fail;
1525 return 0;
1526fail:
1527 kfree(root->free_ino_ctl);
1528 kfree(root->free_ino_pinned);
1529 return ret;
1530}
1531
1532struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
1533 u64 root_id)
1534{
1535 struct btrfs_root *root;
1536
1537 spin_lock(&fs_info->fs_roots_radix_lock);
1538 root = radix_tree_lookup(&fs_info->fs_roots_radix,
1539 (unsigned long)root_id);
1540 spin_unlock(&fs_info->fs_roots_radix_lock);
1541 return root;
1542}
1543
1544int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
1545 struct btrfs_root *root)
1546{
1547 int ret;
1548
1549 ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
1550 if (ret)
1551 return ret;
1552
1553 spin_lock(&fs_info->fs_roots_radix_lock);
1554 ret = radix_tree_insert(&fs_info->fs_roots_radix,
1555 (unsigned long)root->root_key.objectid,
1556 root);
1557 if (ret == 0)
1558 root->in_radix = 1;
1559 spin_unlock(&fs_info->fs_roots_radix_lock);
1560 radix_tree_preload_end();
1561
1562 return ret;
1563}
1564
1526struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, 1565struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
1527 struct btrfs_key *location) 1566 struct btrfs_key *location)
1528{ 1567{
@@ -1543,58 +1582,30 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
1543 return fs_info->quota_root ? fs_info->quota_root : 1582 return fs_info->quota_root ? fs_info->quota_root :
1544 ERR_PTR(-ENOENT); 1583 ERR_PTR(-ENOENT);
1545again: 1584again:
1546 spin_lock(&fs_info->fs_roots_radix_lock); 1585 root = btrfs_lookup_fs_root(fs_info, location->objectid);
1547 root = radix_tree_lookup(&fs_info->fs_roots_radix,
1548 (unsigned long)location->objectid);
1549 spin_unlock(&fs_info->fs_roots_radix_lock);
1550 if (root) 1586 if (root)
1551 return root; 1587 return root;
1552 1588
1553 root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location); 1589 root = btrfs_read_fs_root(fs_info->tree_root, location);
1554 if (IS_ERR(root)) 1590 if (IS_ERR(root))
1555 return root; 1591 return root;
1556 1592
1557 root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS); 1593 if (btrfs_root_refs(&root->root_item) == 0) {
1558 root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned), 1594 ret = -ENOENT;
1559 GFP_NOFS);
1560 if (!root->free_ino_pinned || !root->free_ino_ctl) {
1561 ret = -ENOMEM;
1562 goto fail; 1595 goto fail;
1563 } 1596 }
1564 1597
1565 btrfs_init_free_ino_ctl(root); 1598 ret = btrfs_init_fs_root(root);
1566 mutex_init(&root->fs_commit_mutex);
1567 spin_lock_init(&root->cache_lock);
1568 init_waitqueue_head(&root->cache_wait);
1569
1570 ret = get_anon_bdev(&root->anon_dev);
1571 if (ret) 1599 if (ret)
1572 goto fail; 1600 goto fail;
1573 1601
1574 if (btrfs_root_refs(&root->root_item) == 0) {
1575 ret = -ENOENT;
1576 goto fail;
1577 }
1578
1579 ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid); 1602 ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
1580 if (ret < 0) 1603 if (ret < 0)
1581 goto fail; 1604 goto fail;
1582 if (ret == 0) 1605 if (ret == 0)
1583 root->orphan_item_inserted = 1; 1606 root->orphan_item_inserted = 1;
1584 1607
1585 ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); 1608 ret = btrfs_insert_fs_root(fs_info, root);
1586 if (ret)
1587 goto fail;
1588
1589 spin_lock(&fs_info->fs_roots_radix_lock);
1590 ret = radix_tree_insert(&fs_info->fs_roots_radix,
1591 (unsigned long)root->root_key.objectid,
1592 root);
1593 if (ret == 0)
1594 root->in_radix = 1;
1595
1596 spin_unlock(&fs_info->fs_roots_radix_lock);
1597 radix_tree_preload_end();
1598 if (ret) { 1609 if (ret) {
1599 if (ret == -EEXIST) { 1610 if (ret == -EEXIST) {
1600 free_fs_root(root); 1611 free_fs_root(root);
@@ -1602,10 +1613,6 @@ again:
1602 } 1613 }
1603 goto fail; 1614 goto fail;
1604 } 1615 }
1605
1606 ret = btrfs_find_dead_roots(fs_info->tree_root,
1607 root->root_key.objectid);
1608 WARN_ON(ret);
1609 return root; 1616 return root;
1610fail: 1617fail:
1611 free_fs_root(root); 1618 free_fs_root(root);
@@ -1677,21 +1684,37 @@ static void end_workqueue_fn(struct btrfs_work *work)
1677static int cleaner_kthread(void *arg) 1684static int cleaner_kthread(void *arg)
1678{ 1685{
1679 struct btrfs_root *root = arg; 1686 struct btrfs_root *root = arg;
1687 int again;
1680 1688
1681 do { 1689 do {
1682 int again = 0; 1690 again = 0;
1683 1691
1684 if (!(root->fs_info->sb->s_flags & MS_RDONLY) && 1692 /* Make the cleaner go to sleep early. */
1685 down_read_trylock(&root->fs_info->sb->s_umount)) { 1693 if (btrfs_need_cleaner_sleep(root))
1686 if (mutex_trylock(&root->fs_info->cleaner_mutex)) { 1694 goto sleep;
1687 btrfs_run_delayed_iputs(root); 1695
1688 again = btrfs_clean_one_deleted_snapshot(root); 1696 if (!mutex_trylock(&root->fs_info->cleaner_mutex))
1689 mutex_unlock(&root->fs_info->cleaner_mutex); 1697 goto sleep;
1690 } 1698
1691 btrfs_run_defrag_inodes(root->fs_info); 1699 /*
1692 up_read(&root->fs_info->sb->s_umount); 1700 * Avoid the problem that we change the status of the fs
1701 * during the above check and trylock.
1702 */
1703 if (btrfs_need_cleaner_sleep(root)) {
1704 mutex_unlock(&root->fs_info->cleaner_mutex);
1705 goto sleep;
1693 } 1706 }
1694 1707
1708 btrfs_run_delayed_iputs(root);
1709 again = btrfs_clean_one_deleted_snapshot(root);
1710 mutex_unlock(&root->fs_info->cleaner_mutex);
1711
1712 /*
1713 * The defragger has dealt with the R/O remount and umount,
1714 * needn't do anything special here.
1715 */
1716 btrfs_run_defrag_inodes(root->fs_info);
1717sleep:
1695 if (!try_to_freeze() && !again) { 1718 if (!try_to_freeze() && !again) {
1696 set_current_state(TASK_INTERRUPTIBLE); 1719 set_current_state(TASK_INTERRUPTIBLE);
1697 if (!kthread_should_stop()) 1720 if (!kthread_should_stop())
@@ -1725,7 +1748,7 @@ static int transaction_kthread(void *arg)
1725 } 1748 }
1726 1749
1727 now = get_seconds(); 1750 now = get_seconds();
1728 if (!cur->blocked && 1751 if (cur->state < TRANS_STATE_BLOCKED &&
1729 (now < cur->start_time || now - cur->start_time < 30)) { 1752 (now < cur->start_time || now - cur->start_time < 30)) {
1730 spin_unlock(&root->fs_info->trans_lock); 1753 spin_unlock(&root->fs_info->trans_lock);
1731 delay = HZ * 5; 1754 delay = HZ * 5;
@@ -2035,11 +2058,11 @@ static void del_fs_roots(struct btrfs_fs_info *fs_info)
2035 list_del(&gang[0]->root_list); 2058 list_del(&gang[0]->root_list);
2036 2059
2037 if (gang[0]->in_radix) { 2060 if (gang[0]->in_radix) {
2038 btrfs_free_fs_root(fs_info, gang[0]); 2061 btrfs_drop_and_free_fs_root(fs_info, gang[0]);
2039 } else { 2062 } else {
2040 free_extent_buffer(gang[0]->node); 2063 free_extent_buffer(gang[0]->node);
2041 free_extent_buffer(gang[0]->commit_root); 2064 free_extent_buffer(gang[0]->commit_root);
2042 kfree(gang[0]); 2065 btrfs_put_fs_root(gang[0]);
2043 } 2066 }
2044 } 2067 }
2045 2068
@@ -2050,7 +2073,7 @@ static void del_fs_roots(struct btrfs_fs_info *fs_info)
2050 if (!ret) 2073 if (!ret)
2051 break; 2074 break;
2052 for (i = 0; i < ret; i++) 2075 for (i = 0; i < ret; i++)
2053 btrfs_free_fs_root(fs_info, gang[i]); 2076 btrfs_drop_and_free_fs_root(fs_info, gang[i]);
2054 } 2077 }
2055} 2078}
2056 2079
@@ -2082,14 +2105,8 @@ int open_ctree(struct super_block *sb,
2082 int backup_index = 0; 2105 int backup_index = 0;
2083 2106
2084 tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info); 2107 tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info);
2085 extent_root = fs_info->extent_root = btrfs_alloc_root(fs_info);
2086 csum_root = fs_info->csum_root = btrfs_alloc_root(fs_info);
2087 chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info); 2108 chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info);
2088 dev_root = fs_info->dev_root = btrfs_alloc_root(fs_info); 2109 if (!tree_root || !chunk_root) {
2089 quota_root = fs_info->quota_root = btrfs_alloc_root(fs_info);
2090
2091 if (!tree_root || !extent_root || !csum_root ||
2092 !chunk_root || !dev_root || !quota_root) {
2093 err = -ENOMEM; 2110 err = -ENOMEM;
2094 goto fail; 2111 goto fail;
2095 } 2112 }
@@ -2132,9 +2149,9 @@ int open_ctree(struct super_block *sb,
2132 INIT_LIST_HEAD(&fs_info->trans_list); 2149 INIT_LIST_HEAD(&fs_info->trans_list);
2133 INIT_LIST_HEAD(&fs_info->dead_roots); 2150 INIT_LIST_HEAD(&fs_info->dead_roots);
2134 INIT_LIST_HEAD(&fs_info->delayed_iputs); 2151 INIT_LIST_HEAD(&fs_info->delayed_iputs);
2135 INIT_LIST_HEAD(&fs_info->delalloc_inodes); 2152 INIT_LIST_HEAD(&fs_info->delalloc_roots);
2136 INIT_LIST_HEAD(&fs_info->caching_block_groups); 2153 INIT_LIST_HEAD(&fs_info->caching_block_groups);
2137 spin_lock_init(&fs_info->delalloc_lock); 2154 spin_lock_init(&fs_info->delalloc_root_lock);
2138 spin_lock_init(&fs_info->trans_lock); 2155 spin_lock_init(&fs_info->trans_lock);
2139 spin_lock_init(&fs_info->fs_roots_radix_lock); 2156 spin_lock_init(&fs_info->fs_roots_radix_lock);
2140 spin_lock_init(&fs_info->delayed_iput_lock); 2157 spin_lock_init(&fs_info->delayed_iput_lock);
@@ -2170,7 +2187,6 @@ int open_ctree(struct super_block *sb,
2170 fs_info->max_inline = 8192 * 1024; 2187 fs_info->max_inline = 8192 * 1024;
2171 fs_info->metadata_ratio = 0; 2188 fs_info->metadata_ratio = 0;
2172 fs_info->defrag_inodes = RB_ROOT; 2189 fs_info->defrag_inodes = RB_ROOT;
2173 fs_info->trans_no_join = 0;
2174 fs_info->free_chunk_space = 0; 2190 fs_info->free_chunk_space = 0;
2175 fs_info->tree_mod_log = RB_ROOT; 2191 fs_info->tree_mod_log = RB_ROOT;
2176 2192
@@ -2181,8 +2197,8 @@ int open_ctree(struct super_block *sb,
2181 fs_info->thread_pool_size = min_t(unsigned long, 2197 fs_info->thread_pool_size = min_t(unsigned long,
2182 num_online_cpus() + 2, 8); 2198 num_online_cpus() + 2, 8);
2183 2199
2184 INIT_LIST_HEAD(&fs_info->ordered_extents); 2200 INIT_LIST_HEAD(&fs_info->ordered_roots);
2185 spin_lock_init(&fs_info->ordered_extent_lock); 2201 spin_lock_init(&fs_info->ordered_root_lock);
2186 fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root), 2202 fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
2187 GFP_NOFS); 2203 GFP_NOFS);
2188 if (!fs_info->delayed_root) { 2204 if (!fs_info->delayed_root) {
@@ -2275,6 +2291,7 @@ int open_ctree(struct super_block *sb,
2275 fs_info->qgroup_seq = 1; 2291 fs_info->qgroup_seq = 1;
2276 fs_info->quota_enabled = 0; 2292 fs_info->quota_enabled = 0;
2277 fs_info->pending_quota_state = 0; 2293 fs_info->pending_quota_state = 0;
2294 fs_info->qgroup_ulist = NULL;
2278 mutex_init(&fs_info->qgroup_rescan_lock); 2295 mutex_init(&fs_info->qgroup_rescan_lock);
2279 2296
2280 btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); 2297 btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
@@ -2639,33 +2656,44 @@ retry_root_backup:
2639 btrfs_set_root_node(&tree_root->root_item, tree_root->node); 2656 btrfs_set_root_node(&tree_root->root_item, tree_root->node);
2640 tree_root->commit_root = btrfs_root_node(tree_root); 2657 tree_root->commit_root = btrfs_root_node(tree_root);
2641 2658
2642 ret = find_and_setup_root(tree_root, fs_info, 2659 location.objectid = BTRFS_EXTENT_TREE_OBJECTID;
2643 BTRFS_EXTENT_TREE_OBJECTID, extent_root); 2660 location.type = BTRFS_ROOT_ITEM_KEY;
2644 if (ret) 2661 location.offset = 0;
2662
2663 extent_root = btrfs_read_tree_root(tree_root, &location);
2664 if (IS_ERR(extent_root)) {
2665 ret = PTR_ERR(extent_root);
2645 goto recovery_tree_root; 2666 goto recovery_tree_root;
2667 }
2646 extent_root->track_dirty = 1; 2668 extent_root->track_dirty = 1;
2669 fs_info->extent_root = extent_root;
2647 2670
2648 ret = find_and_setup_root(tree_root, fs_info, 2671 location.objectid = BTRFS_DEV_TREE_OBJECTID;
2649 BTRFS_DEV_TREE_OBJECTID, dev_root); 2672 dev_root = btrfs_read_tree_root(tree_root, &location);
2650 if (ret) 2673 if (IS_ERR(dev_root)) {
2674 ret = PTR_ERR(dev_root);
2651 goto recovery_tree_root; 2675 goto recovery_tree_root;
2676 }
2652 dev_root->track_dirty = 1; 2677 dev_root->track_dirty = 1;
2678 fs_info->dev_root = dev_root;
2679 btrfs_init_devices_late(fs_info);
2653 2680
2654 ret = find_and_setup_root(tree_root, fs_info, 2681 location.objectid = BTRFS_CSUM_TREE_OBJECTID;
2655 BTRFS_CSUM_TREE_OBJECTID, csum_root); 2682 csum_root = btrfs_read_tree_root(tree_root, &location);
2656 if (ret) 2683 if (IS_ERR(csum_root)) {
2684 ret = PTR_ERR(csum_root);
2657 goto recovery_tree_root; 2685 goto recovery_tree_root;
2686 }
2658 csum_root->track_dirty = 1; 2687 csum_root->track_dirty = 1;
2688 fs_info->csum_root = csum_root;
2659 2689
2660 ret = find_and_setup_root(tree_root, fs_info, 2690 location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
2661 BTRFS_QUOTA_TREE_OBJECTID, quota_root); 2691 quota_root = btrfs_read_tree_root(tree_root, &location);
2662 if (ret) { 2692 if (!IS_ERR(quota_root)) {
2663 kfree(quota_root);
2664 quota_root = fs_info->quota_root = NULL;
2665 } else {
2666 quota_root->track_dirty = 1; 2693 quota_root->track_dirty = 1;
2667 fs_info->quota_enabled = 1; 2694 fs_info->quota_enabled = 1;
2668 fs_info->pending_quota_state = 1; 2695 fs_info->pending_quota_state = 1;
2696 fs_info->quota_root = quota_root;
2669 } 2697 }
2670 2698
2671 fs_info->generation = generation; 2699 fs_info->generation = generation;
@@ -2818,11 +2846,9 @@ retry_root_backup:
2818 2846
2819 location.objectid = BTRFS_FS_TREE_OBJECTID; 2847 location.objectid = BTRFS_FS_TREE_OBJECTID;
2820 location.type = BTRFS_ROOT_ITEM_KEY; 2848 location.type = BTRFS_ROOT_ITEM_KEY;
2821 location.offset = (u64)-1; 2849 location.offset = 0;
2822 2850
2823 fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location); 2851 fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
2824 if (!fs_info->fs_root)
2825 goto fail_qgroup;
2826 if (IS_ERR(fs_info->fs_root)) { 2852 if (IS_ERR(fs_info->fs_root)) {
2827 err = PTR_ERR(fs_info->fs_root); 2853 err = PTR_ERR(fs_info->fs_root);
2828 goto fail_qgroup; 2854 goto fail_qgroup;
@@ -2854,6 +2880,8 @@ retry_root_backup:
2854 return ret; 2880 return ret;
2855 } 2881 }
2856 2882
2883 btrfs_qgroup_rescan_resume(fs_info);
2884
2857 return 0; 2885 return 0;
2858 2886
2859fail_qgroup: 2887fail_qgroup:
@@ -3259,7 +3287,7 @@ int btrfs_calc_num_tolerated_disk_barrier_failures(
3259 BTRFS_BLOCK_GROUP_RAID10)) { 3287 BTRFS_BLOCK_GROUP_RAID10)) {
3260 num_tolerated_disk_barrier_failures = 1; 3288 num_tolerated_disk_barrier_failures = 1;
3261 } else if (flags & 3289 } else if (flags &
3262 BTRFS_BLOCK_GROUP_RAID5) { 3290 BTRFS_BLOCK_GROUP_RAID6) {
3263 num_tolerated_disk_barrier_failures = 2; 3291 num_tolerated_disk_barrier_failures = 2;
3264 } 3292 }
3265 } 3293 }
@@ -3367,7 +3395,9 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
3367 return ret; 3395 return ret;
3368} 3396}
3369 3397
3370void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) 3398/* Drop a fs root from the radix tree and free it. */
3399void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
3400 struct btrfs_root *root)
3371{ 3401{
3372 spin_lock(&fs_info->fs_roots_radix_lock); 3402 spin_lock(&fs_info->fs_roots_radix_lock);
3373 radix_tree_delete(&fs_info->fs_roots_radix, 3403 radix_tree_delete(&fs_info->fs_roots_radix,
@@ -3398,7 +3428,12 @@ static void free_fs_root(struct btrfs_root *root)
3398 kfree(root->free_ino_ctl); 3428 kfree(root->free_ino_ctl);
3399 kfree(root->free_ino_pinned); 3429 kfree(root->free_ino_pinned);
3400 kfree(root->name); 3430 kfree(root->name);
3401 kfree(root); 3431 btrfs_put_fs_root(root);
3432}
3433
3434void btrfs_free_fs_root(struct btrfs_root *root)
3435{
3436 free_fs_root(root);
3402} 3437}
3403 3438
3404int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) 3439int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
@@ -3654,7 +3689,7 @@ static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
3654 INIT_LIST_HEAD(&splice); 3689 INIT_LIST_HEAD(&splice);
3655 3690
3656 mutex_lock(&root->fs_info->ordered_operations_mutex); 3691 mutex_lock(&root->fs_info->ordered_operations_mutex);
3657 spin_lock(&root->fs_info->ordered_extent_lock); 3692 spin_lock(&root->fs_info->ordered_root_lock);
3658 3693
3659 list_splice_init(&t->ordered_operations, &splice); 3694 list_splice_init(&t->ordered_operations, &splice);
3660 while (!list_empty(&splice)) { 3695 while (!list_empty(&splice)) {
@@ -3662,14 +3697,14 @@ static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
3662 ordered_operations); 3697 ordered_operations);
3663 3698
3664 list_del_init(&btrfs_inode->ordered_operations); 3699 list_del_init(&btrfs_inode->ordered_operations);
3665 spin_unlock(&root->fs_info->ordered_extent_lock); 3700 spin_unlock(&root->fs_info->ordered_root_lock);
3666 3701
3667 btrfs_invalidate_inodes(btrfs_inode->root); 3702 btrfs_invalidate_inodes(btrfs_inode->root);
3668 3703
3669 spin_lock(&root->fs_info->ordered_extent_lock); 3704 spin_lock(&root->fs_info->ordered_root_lock);
3670 } 3705 }
3671 3706
3672 spin_unlock(&root->fs_info->ordered_extent_lock); 3707 spin_unlock(&root->fs_info->ordered_root_lock);
3673 mutex_unlock(&root->fs_info->ordered_operations_mutex); 3708 mutex_unlock(&root->fs_info->ordered_operations_mutex);
3674} 3709}
3675 3710
@@ -3677,15 +3712,36 @@ static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
3677{ 3712{
3678 struct btrfs_ordered_extent *ordered; 3713 struct btrfs_ordered_extent *ordered;
3679 3714
3680 spin_lock(&root->fs_info->ordered_extent_lock); 3715 spin_lock(&root->ordered_extent_lock);
3681 /* 3716 /*
3682 * This will just short circuit the ordered completion stuff which will 3717 * This will just short circuit the ordered completion stuff which will
3683 * make sure the ordered extent gets properly cleaned up. 3718 * make sure the ordered extent gets properly cleaned up.
3684 */ 3719 */
3685 list_for_each_entry(ordered, &root->fs_info->ordered_extents, 3720 list_for_each_entry(ordered, &root->ordered_extents,
3686 root_extent_list) 3721 root_extent_list)
3687 set_bit(BTRFS_ORDERED_IOERR, &ordered->flags); 3722 set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
3688 spin_unlock(&root->fs_info->ordered_extent_lock); 3723 spin_unlock(&root->ordered_extent_lock);
3724}
3725
3726static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
3727{
3728 struct btrfs_root *root;
3729 struct list_head splice;
3730
3731 INIT_LIST_HEAD(&splice);
3732
3733 spin_lock(&fs_info->ordered_root_lock);
3734 list_splice_init(&fs_info->ordered_roots, &splice);
3735 while (!list_empty(&splice)) {
3736 root = list_first_entry(&splice, struct btrfs_root,
3737 ordered_root);
3738 list_del_init(&root->ordered_root);
3739
3740 btrfs_destroy_ordered_extents(root);
3741
3742 cond_resched_lock(&fs_info->ordered_root_lock);
3743 }
3744 spin_unlock(&fs_info->ordered_root_lock);
3689} 3745}
3690 3746
3691int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, 3747int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
@@ -3707,6 +3763,7 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
3707 3763
3708 while ((node = rb_first(&delayed_refs->root)) != NULL) { 3764 while ((node = rb_first(&delayed_refs->root)) != NULL) {
3709 struct btrfs_delayed_ref_head *head = NULL; 3765 struct btrfs_delayed_ref_head *head = NULL;
3766 bool pin_bytes = false;
3710 3767
3711 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); 3768 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
3712 atomic_set(&ref->refs, 1); 3769 atomic_set(&ref->refs, 1);
@@ -3727,8 +3784,7 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
3727 } 3784 }
3728 3785
3729 if (head->must_insert_reserved) 3786 if (head->must_insert_reserved)
3730 btrfs_pin_extent(root, ref->bytenr, 3787 pin_bytes = true;
3731 ref->num_bytes, 1);
3732 btrfs_free_delayed_extent_op(head->extent_op); 3788 btrfs_free_delayed_extent_op(head->extent_op);
3733 delayed_refs->num_heads--; 3789 delayed_refs->num_heads--;
3734 if (list_empty(&head->cluster)) 3790 if (list_empty(&head->cluster))
@@ -3739,9 +3795,13 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
3739 ref->in_tree = 0; 3795 ref->in_tree = 0;
3740 rb_erase(&ref->rb_node, &delayed_refs->root); 3796 rb_erase(&ref->rb_node, &delayed_refs->root);
3741 delayed_refs->num_entries--; 3797 delayed_refs->num_entries--;
3742 if (head)
3743 mutex_unlock(&head->mutex);
3744 spin_unlock(&delayed_refs->lock); 3798 spin_unlock(&delayed_refs->lock);
3799 if (head) {
3800 if (pin_bytes)
3801 btrfs_pin_extent(root, ref->bytenr,
3802 ref->num_bytes, 1);
3803 mutex_unlock(&head->mutex);
3804 }
3745 btrfs_put_delayed_ref(ref); 3805 btrfs_put_delayed_ref(ref);
3746 3806
3747 cond_resched(); 3807 cond_resched();
@@ -3778,24 +3838,49 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
3778 3838
3779 INIT_LIST_HEAD(&splice); 3839 INIT_LIST_HEAD(&splice);
3780 3840
3781 spin_lock(&root->fs_info->delalloc_lock); 3841 spin_lock(&root->delalloc_lock);
3782 list_splice_init(&root->fs_info->delalloc_inodes, &splice); 3842 list_splice_init(&root->delalloc_inodes, &splice);
3783 3843
3784 while (!list_empty(&splice)) { 3844 while (!list_empty(&splice)) {
3785 btrfs_inode = list_entry(splice.next, struct btrfs_inode, 3845 btrfs_inode = list_first_entry(&splice, struct btrfs_inode,
3786 delalloc_inodes); 3846 delalloc_inodes);
3787 3847
3788 list_del_init(&btrfs_inode->delalloc_inodes); 3848 list_del_init(&btrfs_inode->delalloc_inodes);
3789 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, 3849 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
3790 &btrfs_inode->runtime_flags); 3850 &btrfs_inode->runtime_flags);
3791 spin_unlock(&root->fs_info->delalloc_lock); 3851 spin_unlock(&root->delalloc_lock);
3792 3852
3793 btrfs_invalidate_inodes(btrfs_inode->root); 3853 btrfs_invalidate_inodes(btrfs_inode->root);
3794 3854
3795 spin_lock(&root->fs_info->delalloc_lock); 3855 spin_lock(&root->delalloc_lock);
3796 } 3856 }
3797 3857
3798 spin_unlock(&root->fs_info->delalloc_lock); 3858 spin_unlock(&root->delalloc_lock);
3859}
3860
3861static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info)
3862{
3863 struct btrfs_root *root;
3864 struct list_head splice;
3865
3866 INIT_LIST_HEAD(&splice);
3867
3868 spin_lock(&fs_info->delalloc_root_lock);
3869 list_splice_init(&fs_info->delalloc_roots, &splice);
3870 while (!list_empty(&splice)) {
3871 root = list_first_entry(&splice, struct btrfs_root,
3872 delalloc_root);
3873 list_del_init(&root->delalloc_root);
3874 root = btrfs_grab_fs_root(root);
3875 BUG_ON(!root);
3876 spin_unlock(&fs_info->delalloc_root_lock);
3877
3878 btrfs_destroy_delalloc_inodes(root);
3879 btrfs_put_fs_root(root);
3880
3881 spin_lock(&fs_info->delalloc_root_lock);
3882 }
3883 spin_unlock(&fs_info->delalloc_root_lock);
3799} 3884}
3800 3885
3801static int btrfs_destroy_marked_extents(struct btrfs_root *root, 3886static int btrfs_destroy_marked_extents(struct btrfs_root *root,
@@ -3879,19 +3964,14 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
3879 btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv, 3964 btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv,
3880 cur_trans->dirty_pages.dirty_bytes); 3965 cur_trans->dirty_pages.dirty_bytes);
3881 3966
3882 /* FIXME: cleanup wait for commit */ 3967 cur_trans->state = TRANS_STATE_COMMIT_START;
3883 cur_trans->in_commit = 1;
3884 cur_trans->blocked = 1;
3885 wake_up(&root->fs_info->transaction_blocked_wait); 3968 wake_up(&root->fs_info->transaction_blocked_wait);
3886 3969
3887 btrfs_evict_pending_snapshots(cur_trans); 3970 btrfs_evict_pending_snapshots(cur_trans);
3888 3971
3889 cur_trans->blocked = 0; 3972 cur_trans->state = TRANS_STATE_UNBLOCKED;
3890 wake_up(&root->fs_info->transaction_wait); 3973 wake_up(&root->fs_info->transaction_wait);
3891 3974
3892 cur_trans->commit_done = 1;
3893 wake_up(&cur_trans->commit_wait);
3894
3895 btrfs_destroy_delayed_inodes(root); 3975 btrfs_destroy_delayed_inodes(root);
3896 btrfs_assert_delayed_root_empty(root); 3976 btrfs_assert_delayed_root_empty(root);
3897 3977
@@ -3900,6 +3980,9 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
3900 btrfs_destroy_pinned_extent(root, 3980 btrfs_destroy_pinned_extent(root,
3901 root->fs_info->pinned_extents); 3981 root->fs_info->pinned_extents);
3902 3982
3983 cur_trans->state =TRANS_STATE_COMPLETED;
3984 wake_up(&cur_trans->commit_wait);
3985
3903 /* 3986 /*
3904 memset(cur_trans, 0, sizeof(*cur_trans)); 3987 memset(cur_trans, 0, sizeof(*cur_trans));
3905 kmem_cache_free(btrfs_transaction_cachep, cur_trans); 3988 kmem_cache_free(btrfs_transaction_cachep, cur_trans);
@@ -3915,7 +3998,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
3915 3998
3916 spin_lock(&root->fs_info->trans_lock); 3999 spin_lock(&root->fs_info->trans_lock);
3917 list_splice_init(&root->fs_info->trans_list, &list); 4000 list_splice_init(&root->fs_info->trans_list, &list);
3918 root->fs_info->trans_no_join = 1; 4001 root->fs_info->running_transaction = NULL;
3919 spin_unlock(&root->fs_info->trans_lock); 4002 spin_unlock(&root->fs_info->trans_lock);
3920 4003
3921 while (!list_empty(&list)) { 4004 while (!list_empty(&list)) {
@@ -3923,37 +4006,31 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
3923 4006
3924 btrfs_destroy_ordered_operations(t, root); 4007 btrfs_destroy_ordered_operations(t, root);
3925 4008
3926 btrfs_destroy_ordered_extents(root); 4009 btrfs_destroy_all_ordered_extents(root->fs_info);
3927 4010
3928 btrfs_destroy_delayed_refs(t, root); 4011 btrfs_destroy_delayed_refs(t, root);
3929 4012
3930 /* FIXME: cleanup wait for commit */ 4013 /*
3931 t->in_commit = 1; 4014 * FIXME: cleanup wait for commit
3932 t->blocked = 1; 4015 * We needn't acquire the lock here, because we are during
4016 * the umount, there is no other task which will change it.
4017 */
4018 t->state = TRANS_STATE_COMMIT_START;
3933 smp_mb(); 4019 smp_mb();
3934 if (waitqueue_active(&root->fs_info->transaction_blocked_wait)) 4020 if (waitqueue_active(&root->fs_info->transaction_blocked_wait))
3935 wake_up(&root->fs_info->transaction_blocked_wait); 4021 wake_up(&root->fs_info->transaction_blocked_wait);
3936 4022
3937 btrfs_evict_pending_snapshots(t); 4023 btrfs_evict_pending_snapshots(t);
3938 4024
3939 t->blocked = 0; 4025 t->state = TRANS_STATE_UNBLOCKED;
3940 smp_mb(); 4026 smp_mb();
3941 if (waitqueue_active(&root->fs_info->transaction_wait)) 4027 if (waitqueue_active(&root->fs_info->transaction_wait))
3942 wake_up(&root->fs_info->transaction_wait); 4028 wake_up(&root->fs_info->transaction_wait);
3943 4029
3944 t->commit_done = 1;
3945 smp_mb();
3946 if (waitqueue_active(&t->commit_wait))
3947 wake_up(&t->commit_wait);
3948
3949 btrfs_destroy_delayed_inodes(root); 4030 btrfs_destroy_delayed_inodes(root);
3950 btrfs_assert_delayed_root_empty(root); 4031 btrfs_assert_delayed_root_empty(root);
3951 4032
3952 btrfs_destroy_delalloc_inodes(root); 4033 btrfs_destroy_all_delalloc_inodes(root->fs_info);
3953
3954 spin_lock(&root->fs_info->trans_lock);
3955 root->fs_info->running_transaction = NULL;
3956 spin_unlock(&root->fs_info->trans_lock);
3957 4034
3958 btrfs_destroy_marked_extents(root, &t->dirty_pages, 4035 btrfs_destroy_marked_extents(root, &t->dirty_pages,
3959 EXTENT_DIRTY); 4036 EXTENT_DIRTY);
@@ -3961,15 +4038,17 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
3961 btrfs_destroy_pinned_extent(root, 4038 btrfs_destroy_pinned_extent(root,
3962 root->fs_info->pinned_extents); 4039 root->fs_info->pinned_extents);
3963 4040
4041 t->state = TRANS_STATE_COMPLETED;
4042 smp_mb();
4043 if (waitqueue_active(&t->commit_wait))
4044 wake_up(&t->commit_wait);
4045
3964 atomic_set(&t->use_count, 0); 4046 atomic_set(&t->use_count, 0);
3965 list_del_init(&t->list); 4047 list_del_init(&t->list);
3966 memset(t, 0, sizeof(*t)); 4048 memset(t, 0, sizeof(*t));
3967 kmem_cache_free(btrfs_transaction_cachep, t); 4049 kmem_cache_free(btrfs_transaction_cachep, t);
3968 } 4050 }
3969 4051
3970 spin_lock(&root->fs_info->trans_lock);
3971 root->fs_info->trans_no_join = 0;
3972 spin_unlock(&root->fs_info->trans_lock);
3973 mutex_unlock(&root->fs_info->transaction_kthread_mutex); 4052 mutex_unlock(&root->fs_info->transaction_kthread_mutex);
3974 4053
3975 return 0; 4054 return 0;
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index be69ce1b07a2..b71acd6e1e5b 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -63,14 +63,40 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
63int btrfs_commit_super(struct btrfs_root *root); 63int btrfs_commit_super(struct btrfs_root *root);
64struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, 64struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
65 u64 bytenr, u32 blocksize); 65 u64 bytenr, u32 blocksize);
66struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, 66struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
67 struct btrfs_key *location); 67 struct btrfs_key *location);
68int btrfs_init_fs_root(struct btrfs_root *root);
69int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
70 struct btrfs_root *root);
68struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, 71struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
69 struct btrfs_key *location); 72 struct btrfs_key *location);
70int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); 73int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
71void btrfs_btree_balance_dirty(struct btrfs_root *root); 74void btrfs_btree_balance_dirty(struct btrfs_root *root);
72void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root); 75void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root);
73void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); 76void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
77 struct btrfs_root *root);
78void btrfs_free_fs_root(struct btrfs_root *root);
79
80/*
81 * This function is used to grab the root, and avoid it is freed when we
82 * access it. But it doesn't ensure that the tree is not dropped.
83 *
84 * If you want to ensure the whole tree is safe, you should use
85 * fs_info->subvol_srcu
86 */
87static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root *root)
88{
89 if (atomic_inc_not_zero(&root->refs))
90 return root;
91 return NULL;
92}
93
94static inline void btrfs_put_fs_root(struct btrfs_root *root)
95{
96 if (atomic_dec_and_test(&root->refs))
97 kfree(root);
98}
99
74void btrfs_mark_buffer_dirty(struct extent_buffer *buf); 100void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
75int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, 101int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
76 int atomic); 102 int atomic);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 81ee29eeb7ca..4b8691607373 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -82,11 +82,6 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
82 goto fail; 82 goto fail;
83 } 83 }
84 84
85 if (btrfs_root_refs(&root->root_item) == 0) {
86 err = -ENOENT;
87 goto fail;
88 }
89
90 key.objectid = objectid; 85 key.objectid = objectid;
91 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 86 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
92 key.offset = 0; 87 key.offset = 0;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index df472ab1b5ac..0236de711989 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -24,6 +24,7 @@
24#include <linux/kthread.h> 24#include <linux/kthread.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/ratelimit.h> 26#include <linux/ratelimit.h>
27#include <linux/percpu_counter.h>
27#include "compat.h" 28#include "compat.h"
28#include "hash.h" 29#include "hash.h"
29#include "ctree.h" 30#include "ctree.h"
@@ -2526,6 +2527,51 @@ static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq,
2526 return 0; 2527 return 0;
2527} 2528}
2528 2529
2530static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
2531{
2532 u64 num_bytes;
2533
2534 num_bytes = heads * (sizeof(struct btrfs_extent_item) +
2535 sizeof(struct btrfs_extent_inline_ref));
2536 if (!btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
2537 num_bytes += heads * sizeof(struct btrfs_tree_block_info);
2538
2539 /*
2540 * We don't ever fill up leaves all the way so multiply by 2 just to be
2541 * closer to what we're really going to want to ouse.
2542 */
2543 return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
2544}
2545
2546int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
2547 struct btrfs_root *root)
2548{
2549 struct btrfs_block_rsv *global_rsv;
2550 u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
2551 u64 num_bytes;
2552 int ret = 0;
2553
2554 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
2555 num_heads = heads_to_leaves(root, num_heads);
2556 if (num_heads > 1)
2557 num_bytes += (num_heads - 1) * root->leafsize;
2558 num_bytes <<= 1;
2559 global_rsv = &root->fs_info->global_block_rsv;
2560
2561 /*
2562 * If we can't allocate any more chunks lets make sure we have _lots_ of
2563 * wiggle room since running delayed refs can create more delayed refs.
2564 */
2565 if (global_rsv->space_info->full)
2566 num_bytes <<= 1;
2567
2568 spin_lock(&global_rsv->lock);
2569 if (global_rsv->reserved <= num_bytes)
2570 ret = 1;
2571 spin_unlock(&global_rsv->lock);
2572 return ret;
2573}
2574
2529/* 2575/*
2530 * this starts processing the delayed reference count updates and 2576 * this starts processing the delayed reference count updates and
2531 * extent insertions we have queued up so far. count can be 2577 * extent insertions we have queued up so far. count can be
@@ -2573,7 +2619,8 @@ progress:
2573 old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1); 2619 old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
2574 if (old) { 2620 if (old) {
2575 DEFINE_WAIT(__wait); 2621 DEFINE_WAIT(__wait);
2576 if (delayed_refs->num_entries < 16348) 2622 if (delayed_refs->flushing ||
2623 !btrfs_should_throttle_delayed_refs(trans, root))
2577 return 0; 2624 return 0;
2578 2625
2579 prepare_to_wait(&delayed_refs->wait, &__wait, 2626 prepare_to_wait(&delayed_refs->wait, &__wait,
@@ -2608,7 +2655,7 @@ again:
2608 2655
2609 while (1) { 2656 while (1) {
2610 if (!(run_all || run_most) && 2657 if (!(run_all || run_most) &&
2611 delayed_refs->num_heads_ready < 64) 2658 !btrfs_should_throttle_delayed_refs(trans, root))
2612 break; 2659 break;
2613 2660
2614 /* 2661 /*
@@ -2629,6 +2676,7 @@ again:
2629 spin_unlock(&delayed_refs->lock); 2676 spin_unlock(&delayed_refs->lock);
2630 btrfs_abort_transaction(trans, root, ret); 2677 btrfs_abort_transaction(trans, root, ret);
2631 atomic_dec(&delayed_refs->procs_running_refs); 2678 atomic_dec(&delayed_refs->procs_running_refs);
2679 wake_up(&delayed_refs->wait);
2632 return ret; 2680 return ret;
2633 } 2681 }
2634 2682
@@ -3310,6 +3358,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3310 struct btrfs_space_info *found; 3358 struct btrfs_space_info *found;
3311 int i; 3359 int i;
3312 int factor; 3360 int factor;
3361 int ret;
3313 3362
3314 if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 3363 if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
3315 BTRFS_BLOCK_GROUP_RAID10)) 3364 BTRFS_BLOCK_GROUP_RAID10))
@@ -3333,6 +3382,12 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3333 if (!found) 3382 if (!found)
3334 return -ENOMEM; 3383 return -ENOMEM;
3335 3384
3385 ret = percpu_counter_init(&found->total_bytes_pinned, 0);
3386 if (ret) {
3387 kfree(found);
3388 return ret;
3389 }
3390
3336 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) 3391 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
3337 INIT_LIST_HEAD(&found->block_groups[i]); 3392 INIT_LIST_HEAD(&found->block_groups[i]);
3338 init_rwsem(&found->groups_sem); 3393 init_rwsem(&found->groups_sem);
@@ -3565,10 +3620,11 @@ alloc:
3565 } 3620 }
3566 3621
3567 /* 3622 /*
3568 * If we have less pinned bytes than we want to allocate then 3623 * If we don't have enough pinned space to deal with this
3569 * don't bother committing the transaction, it won't help us. 3624 * allocation don't bother committing the transaction.
3570 */ 3625 */
3571 if (data_sinfo->bytes_pinned < bytes) 3626 if (percpu_counter_compare(&data_sinfo->total_bytes_pinned,
3627 bytes) < 0)
3572 committed = 1; 3628 committed = 1;
3573 spin_unlock(&data_sinfo->lock); 3629 spin_unlock(&data_sinfo->lock);
3574 3630
@@ -3577,6 +3633,7 @@ commit_trans:
3577 if (!committed && 3633 if (!committed &&
3578 !atomic_read(&root->fs_info->open_ioctl_trans)) { 3634 !atomic_read(&root->fs_info->open_ioctl_trans)) {
3579 committed = 1; 3635 committed = 1;
3636
3580 trans = btrfs_join_transaction(root); 3637 trans = btrfs_join_transaction(root);
3581 if (IS_ERR(trans)) 3638 if (IS_ERR(trans))
3582 return PTR_ERR(trans); 3639 return PTR_ERR(trans);
@@ -3609,6 +3666,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3609 3666
3610 data_sinfo = root->fs_info->data_sinfo; 3667 data_sinfo = root->fs_info->data_sinfo;
3611 spin_lock(&data_sinfo->lock); 3668 spin_lock(&data_sinfo->lock);
3669 WARN_ON(data_sinfo->bytes_may_use < bytes);
3612 data_sinfo->bytes_may_use -= bytes; 3670 data_sinfo->bytes_may_use -= bytes;
3613 trace_btrfs_space_reservation(root->fs_info, "space_info", 3671 trace_btrfs_space_reservation(root->fs_info, "space_info",
3614 data_sinfo->flags, bytes, 0); 3672 data_sinfo->flags, bytes, 0);
@@ -3886,12 +3944,11 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
3886 unsigned long nr_pages) 3944 unsigned long nr_pages)
3887{ 3945{
3888 struct super_block *sb = root->fs_info->sb; 3946 struct super_block *sb = root->fs_info->sb;
3889 int started;
3890 3947
3891 /* If we can not start writeback, just sync all the delalloc file. */ 3948 if (down_read_trylock(&sb->s_umount)) {
3892 started = try_to_writeback_inodes_sb_nr(sb, nr_pages, 3949 writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
3893 WB_REASON_FS_FREE_SPACE); 3950 up_read(&sb->s_umount);
3894 if (!started) { 3951 } else {
3895 /* 3952 /*
3896 * We needn't worry the filesystem going from r/w to r/o though 3953 * We needn't worry the filesystem going from r/w to r/o though
3897 * we don't acquire ->s_umount mutex, because the filesystem 3954 * we don't acquire ->s_umount mutex, because the filesystem
@@ -3899,9 +3956,9 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
3899 * the filesystem is readonly(all dirty pages are written to 3956 * the filesystem is readonly(all dirty pages are written to
3900 * the disk). 3957 * the disk).
3901 */ 3958 */
3902 btrfs_start_delalloc_inodes(root, 0); 3959 btrfs_start_all_delalloc_inodes(root->fs_info, 0);
3903 if (!current->journal_info) 3960 if (!current->journal_info)
3904 btrfs_wait_ordered_extents(root, 0); 3961 btrfs_wait_all_ordered_extents(root->fs_info, 0);
3905 } 3962 }
3906} 3963}
3907 3964
@@ -3931,7 +3988,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3931 if (delalloc_bytes == 0) { 3988 if (delalloc_bytes == 0) {
3932 if (trans) 3989 if (trans)
3933 return; 3990 return;
3934 btrfs_wait_ordered_extents(root, 0); 3991 btrfs_wait_all_ordered_extents(root->fs_info, 0);
3935 return; 3992 return;
3936 } 3993 }
3937 3994
@@ -3959,7 +4016,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3959 4016
3960 loops++; 4017 loops++;
3961 if (wait_ordered && !trans) { 4018 if (wait_ordered && !trans) {
3962 btrfs_wait_ordered_extents(root, 0); 4019 btrfs_wait_all_ordered_extents(root->fs_info, 0);
3963 } else { 4020 } else {
3964 time_left = schedule_timeout_killable(1); 4021 time_left = schedule_timeout_killable(1);
3965 if (time_left) 4022 if (time_left)
@@ -3997,7 +4054,8 @@ static int may_commit_transaction(struct btrfs_root *root,
3997 4054
3998 /* See if there is enough pinned space to make this reservation */ 4055 /* See if there is enough pinned space to make this reservation */
3999 spin_lock(&space_info->lock); 4056 spin_lock(&space_info->lock);
4000 if (space_info->bytes_pinned >= bytes) { 4057 if (percpu_counter_compare(&space_info->total_bytes_pinned,
4058 bytes) >= 0) {
4001 spin_unlock(&space_info->lock); 4059 spin_unlock(&space_info->lock);
4002 goto commit; 4060 goto commit;
4003 } 4061 }
@@ -4012,7 +4070,8 @@ static int may_commit_transaction(struct btrfs_root *root,
4012 4070
4013 spin_lock(&space_info->lock); 4071 spin_lock(&space_info->lock);
4014 spin_lock(&delayed_rsv->lock); 4072 spin_lock(&delayed_rsv->lock);
4015 if (space_info->bytes_pinned + delayed_rsv->size < bytes) { 4073 if (percpu_counter_compare(&space_info->total_bytes_pinned,
4074 bytes - delayed_rsv->size) >= 0) {
4016 spin_unlock(&delayed_rsv->lock); 4075 spin_unlock(&delayed_rsv->lock);
4017 spin_unlock(&space_info->lock); 4076 spin_unlock(&space_info->lock);
4018 return -ENOSPC; 4077 return -ENOSPC;
@@ -4297,6 +4356,31 @@ static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
4297 spin_unlock(&block_rsv->lock); 4356 spin_unlock(&block_rsv->lock);
4298} 4357}
4299 4358
4359int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
4360 struct btrfs_block_rsv *dest, u64 num_bytes,
4361 int min_factor)
4362{
4363 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
4364 u64 min_bytes;
4365
4366 if (global_rsv->space_info != dest->space_info)
4367 return -ENOSPC;
4368
4369 spin_lock(&global_rsv->lock);
4370 min_bytes = div_factor(global_rsv->size, min_factor);
4371 if (global_rsv->reserved < min_bytes + num_bytes) {
4372 spin_unlock(&global_rsv->lock);
4373 return -ENOSPC;
4374 }
4375 global_rsv->reserved -= num_bytes;
4376 if (global_rsv->reserved < global_rsv->size)
4377 global_rsv->full = 0;
4378 spin_unlock(&global_rsv->lock);
4379
4380 block_rsv_add_bytes(dest, num_bytes, 1);
4381 return 0;
4382}
4383
4300static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, 4384static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
4301 struct btrfs_block_rsv *block_rsv, 4385 struct btrfs_block_rsv *block_rsv,
4302 struct btrfs_block_rsv *dest, u64 num_bytes) 4386 struct btrfs_block_rsv *dest, u64 num_bytes)
@@ -5030,14 +5114,14 @@ static int update_block_group(struct btrfs_root *root,
5030 int factor; 5114 int factor;
5031 5115
5032 /* block accounting for super block */ 5116 /* block accounting for super block */
5033 spin_lock(&info->delalloc_lock); 5117 spin_lock(&info->delalloc_root_lock);
5034 old_val = btrfs_super_bytes_used(info->super_copy); 5118 old_val = btrfs_super_bytes_used(info->super_copy);
5035 if (alloc) 5119 if (alloc)
5036 old_val += num_bytes; 5120 old_val += num_bytes;
5037 else 5121 else
5038 old_val -= num_bytes; 5122 old_val -= num_bytes;
5039 btrfs_set_super_bytes_used(info->super_copy, old_val); 5123 btrfs_set_super_bytes_used(info->super_copy, old_val);
5040 spin_unlock(&info->delalloc_lock); 5124 spin_unlock(&info->delalloc_root_lock);
5041 5125
5042 while (total) { 5126 while (total) {
5043 cache = btrfs_lookup_block_group(info, bytenr); 5127 cache = btrfs_lookup_block_group(info, bytenr);
@@ -5189,6 +5273,80 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
5189 return ret; 5273 return ret;
5190} 5274}
5191 5275
5276static int __exclude_logged_extent(struct btrfs_root *root, u64 start, u64 num_bytes)
5277{
5278 int ret;
5279 struct btrfs_block_group_cache *block_group;
5280 struct btrfs_caching_control *caching_ctl;
5281
5282 block_group = btrfs_lookup_block_group(root->fs_info, start);
5283 if (!block_group)
5284 return -EINVAL;
5285
5286 cache_block_group(block_group, 0);
5287 caching_ctl = get_caching_control(block_group);
5288
5289 if (!caching_ctl) {
5290 /* Logic error */
5291 BUG_ON(!block_group_cache_done(block_group));
5292 ret = btrfs_remove_free_space(block_group, start, num_bytes);
5293 } else {
5294 mutex_lock(&caching_ctl->mutex);
5295
5296 if (start >= caching_ctl->progress) {
5297 ret = add_excluded_extent(root, start, num_bytes);
5298 } else if (start + num_bytes <= caching_ctl->progress) {
5299 ret = btrfs_remove_free_space(block_group,
5300 start, num_bytes);
5301 } else {
5302 num_bytes = caching_ctl->progress - start;
5303 ret = btrfs_remove_free_space(block_group,
5304 start, num_bytes);
5305 if (ret)
5306 goto out_lock;
5307
5308 num_bytes = (start + num_bytes) -
5309 caching_ctl->progress;
5310 start = caching_ctl->progress;
5311 ret = add_excluded_extent(root, start, num_bytes);
5312 }
5313out_lock:
5314 mutex_unlock(&caching_ctl->mutex);
5315 put_caching_control(caching_ctl);
5316 }
5317 btrfs_put_block_group(block_group);
5318 return ret;
5319}
5320
5321int btrfs_exclude_logged_extents(struct btrfs_root *log,
5322 struct extent_buffer *eb)
5323{
5324 struct btrfs_file_extent_item *item;
5325 struct btrfs_key key;
5326 int found_type;
5327 int i;
5328
5329 if (!btrfs_fs_incompat(log->fs_info, MIXED_GROUPS))
5330 return 0;
5331
5332 for (i = 0; i < btrfs_header_nritems(eb); i++) {
5333 btrfs_item_key_to_cpu(eb, &key, i);
5334 if (key.type != BTRFS_EXTENT_DATA_KEY)
5335 continue;
5336 item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
5337 found_type = btrfs_file_extent_type(eb, item);
5338 if (found_type == BTRFS_FILE_EXTENT_INLINE)
5339 continue;
5340 if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
5341 continue;
5342 key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
5343 key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
5344 __exclude_logged_extent(log, key.objectid, key.offset);
5345 }
5346
5347 return 0;
5348}
5349
5192/** 5350/**
5193 * btrfs_update_reserved_bytes - update the block_group and space info counters 5351 * btrfs_update_reserved_bytes - update the block_group and space info counters
5194 * @cache: The cache we are manipulating 5352 * @cache: The cache we are manipulating
@@ -5251,6 +5409,7 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
5251 struct btrfs_caching_control *next; 5409 struct btrfs_caching_control *next;
5252 struct btrfs_caching_control *caching_ctl; 5410 struct btrfs_caching_control *caching_ctl;
5253 struct btrfs_block_group_cache *cache; 5411 struct btrfs_block_group_cache *cache;
5412 struct btrfs_space_info *space_info;
5254 5413
5255 down_write(&fs_info->extent_commit_sem); 5414 down_write(&fs_info->extent_commit_sem);
5256 5415
@@ -5273,6 +5432,9 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
5273 5432
5274 up_write(&fs_info->extent_commit_sem); 5433 up_write(&fs_info->extent_commit_sem);
5275 5434
5435 list_for_each_entry_rcu(space_info, &fs_info->space_info, list)
5436 percpu_counter_set(&space_info->total_bytes_pinned, 0);
5437
5276 update_global_block_rsv(fs_info); 5438 update_global_block_rsv(fs_info);
5277} 5439}
5278 5440
@@ -5370,6 +5532,27 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
5370 return 0; 5532 return 0;
5371} 5533}
5372 5534
5535static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes,
5536 u64 owner, u64 root_objectid)
5537{
5538 struct btrfs_space_info *space_info;
5539 u64 flags;
5540
5541 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
5542 if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
5543 flags = BTRFS_BLOCK_GROUP_SYSTEM;
5544 else
5545 flags = BTRFS_BLOCK_GROUP_METADATA;
5546 } else {
5547 flags = BTRFS_BLOCK_GROUP_DATA;
5548 }
5549
5550 space_info = __find_space_info(fs_info, flags);
5551 BUG_ON(!space_info); /* Logic bug */
5552 percpu_counter_add(&space_info->total_bytes_pinned, num_bytes);
5553}
5554
5555
5373static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 5556static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5374 struct btrfs_root *root, 5557 struct btrfs_root *root,
5375 u64 bytenr, u64 num_bytes, u64 parent, 5558 u64 bytenr, u64 num_bytes, u64 parent,
@@ -5590,6 +5773,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5590 goto out; 5773 goto out;
5591 } 5774 }
5592 } 5775 }
5776 add_pinned_bytes(root->fs_info, -num_bytes, owner_objectid,
5777 root_objectid);
5593 } else { 5778 } else {
5594 if (found_extent) { 5779 if (found_extent) {
5595 BUG_ON(is_data && refs_to_drop != 5780 BUG_ON(is_data && refs_to_drop !=
@@ -5713,6 +5898,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
5713 u64 parent, int last_ref) 5898 u64 parent, int last_ref)
5714{ 5899{
5715 struct btrfs_block_group_cache *cache = NULL; 5900 struct btrfs_block_group_cache *cache = NULL;
5901 int pin = 1;
5716 int ret; 5902 int ret;
5717 5903
5718 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 5904 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
@@ -5745,8 +5931,14 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
5745 5931
5746 btrfs_add_free_space(cache, buf->start, buf->len); 5932 btrfs_add_free_space(cache, buf->start, buf->len);
5747 btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE); 5933 btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
5934 pin = 0;
5748 } 5935 }
5749out: 5936out:
5937 if (pin)
5938 add_pinned_bytes(root->fs_info, buf->len,
5939 btrfs_header_level(buf),
5940 root->root_key.objectid);
5941
5750 /* 5942 /*
5751 * Deleting the buffer, clear the corrupt flag since it doesn't matter 5943 * Deleting the buffer, clear the corrupt flag since it doesn't matter
5752 * anymore. 5944 * anymore.
@@ -5763,6 +5955,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
5763 int ret; 5955 int ret;
5764 struct btrfs_fs_info *fs_info = root->fs_info; 5956 struct btrfs_fs_info *fs_info = root->fs_info;
5765 5957
5958 add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid);
5959
5766 /* 5960 /*
5767 * tree log blocks never actually go into the extent allocation 5961 * tree log blocks never actually go into the extent allocation
5768 * tree, just update pinning info and exit early. 5962 * tree, just update pinning info and exit early.
@@ -6560,52 +6754,26 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
6560{ 6754{
6561 int ret; 6755 int ret;
6562 struct btrfs_block_group_cache *block_group; 6756 struct btrfs_block_group_cache *block_group;
6563 struct btrfs_caching_control *caching_ctl;
6564 u64 start = ins->objectid;
6565 u64 num_bytes = ins->offset;
6566
6567 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
6568 cache_block_group(block_group, 0);
6569 caching_ctl = get_caching_control(block_group);
6570
6571 if (!caching_ctl) {
6572 BUG_ON(!block_group_cache_done(block_group));
6573 ret = btrfs_remove_free_space(block_group, start, num_bytes);
6574 if (ret)
6575 goto out;
6576 } else {
6577 mutex_lock(&caching_ctl->mutex);
6578 6757
6579 if (start >= caching_ctl->progress) { 6758 /*
6580 ret = add_excluded_extent(root, start, num_bytes); 6759 * Mixed block groups will exclude before processing the log so we only
6581 } else if (start + num_bytes <= caching_ctl->progress) { 6760 * need to do the exlude dance if this fs isn't mixed.
6582 ret = btrfs_remove_free_space(block_group, 6761 */
6583 start, num_bytes); 6762 if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) {
6584 } else { 6763 ret = __exclude_logged_extent(root, ins->objectid, ins->offset);
6585 num_bytes = caching_ctl->progress - start;
6586 ret = btrfs_remove_free_space(block_group,
6587 start, num_bytes);
6588 if (ret)
6589 goto out_lock;
6590
6591 start = caching_ctl->progress;
6592 num_bytes = ins->objectid + ins->offset -
6593 caching_ctl->progress;
6594 ret = add_excluded_extent(root, start, num_bytes);
6595 }
6596out_lock:
6597 mutex_unlock(&caching_ctl->mutex);
6598 put_caching_control(caching_ctl);
6599 if (ret) 6764 if (ret)
6600 goto out; 6765 return ret;
6601 } 6766 }
6602 6767
6768 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
6769 if (!block_group)
6770 return -EINVAL;
6771
6603 ret = btrfs_update_reserved_bytes(block_group, ins->offset, 6772 ret = btrfs_update_reserved_bytes(block_group, ins->offset,
6604 RESERVE_ALLOC_NO_ACCOUNT); 6773 RESERVE_ALLOC_NO_ACCOUNT);
6605 BUG_ON(ret); /* logic error */ 6774 BUG_ON(ret); /* logic error */
6606 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 6775 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
6607 0, owner, offset, ins, 1); 6776 0, owner, offset, ins, 1);
6608out:
6609 btrfs_put_block_group(block_group); 6777 btrfs_put_block_group(block_group);
6610 return ret; 6778 return ret;
6611} 6779}
@@ -7384,7 +7552,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
7384 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); 7552 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
7385 7553
7386 while (1) { 7554 while (1) {
7387 if (!for_reloc && btrfs_fs_closing(root->fs_info)) { 7555 if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
7388 pr_debug("btrfs: drop snapshot early exit\n"); 7556 pr_debug("btrfs: drop snapshot early exit\n");
7389 err = -EAGAIN; 7557 err = -EAGAIN;
7390 goto out_end_trans; 7558 goto out_end_trans;
@@ -7447,8 +7615,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
7447 } 7615 }
7448 7616
7449 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { 7617 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
7450 ret = btrfs_find_last_root(tree_root, root->root_key.objectid, 7618 ret = btrfs_find_root(tree_root, &root->root_key, path,
7451 NULL, NULL); 7619 NULL, NULL);
7452 if (ret < 0) { 7620 if (ret < 0) {
7453 btrfs_abort_transaction(trans, tree_root, ret); 7621 btrfs_abort_transaction(trans, tree_root, ret);
7454 err = ret; 7622 err = ret;
@@ -7465,11 +7633,11 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
7465 } 7633 }
7466 7634
7467 if (root->in_radix) { 7635 if (root->in_radix) {
7468 btrfs_free_fs_root(tree_root->fs_info, root); 7636 btrfs_drop_and_free_fs_root(tree_root->fs_info, root);
7469 } else { 7637 } else {
7470 free_extent_buffer(root->node); 7638 free_extent_buffer(root->node);
7471 free_extent_buffer(root->commit_root); 7639 free_extent_buffer(root->commit_root);
7472 kfree(root); 7640 btrfs_put_fs_root(root);
7473 } 7641 }
7474out_end_trans: 7642out_end_trans:
7475 btrfs_end_transaction_throttle(trans, tree_root); 7643 btrfs_end_transaction_throttle(trans, tree_root);
@@ -7782,6 +7950,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7782 struct btrfs_space_info *space_info; 7950 struct btrfs_space_info *space_info;
7783 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 7951 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
7784 struct btrfs_device *device; 7952 struct btrfs_device *device;
7953 struct btrfs_trans_handle *trans;
7785 u64 min_free; 7954 u64 min_free;
7786 u64 dev_min = 1; 7955 u64 dev_min = 1;
7787 u64 dev_nr = 0; 7956 u64 dev_nr = 0;
@@ -7868,6 +8037,13 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7868 do_div(min_free, dev_min); 8037 do_div(min_free, dev_min);
7869 } 8038 }
7870 8039
8040 /* We need to do this so that we can look at pending chunks */
8041 trans = btrfs_join_transaction(root);
8042 if (IS_ERR(trans)) {
8043 ret = PTR_ERR(trans);
8044 goto out;
8045 }
8046
7871 mutex_lock(&root->fs_info->chunk_mutex); 8047 mutex_lock(&root->fs_info->chunk_mutex);
7872 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 8048 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
7873 u64 dev_offset; 8049 u64 dev_offset;
@@ -7878,7 +8054,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7878 */ 8054 */
7879 if (device->total_bytes > device->bytes_used + min_free && 8055 if (device->total_bytes > device->bytes_used + min_free &&
7880 !device->is_tgtdev_for_dev_replace) { 8056 !device->is_tgtdev_for_dev_replace) {
7881 ret = find_free_dev_extent(device, min_free, 8057 ret = find_free_dev_extent(trans, device, min_free,
7882 &dev_offset, NULL); 8058 &dev_offset, NULL);
7883 if (!ret) 8059 if (!ret)
7884 dev_nr++; 8060 dev_nr++;
@@ -7890,6 +8066,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7890 } 8066 }
7891 } 8067 }
7892 mutex_unlock(&root->fs_info->chunk_mutex); 8068 mutex_unlock(&root->fs_info->chunk_mutex);
8069 btrfs_end_transaction(trans, root);
7893out: 8070out:
7894 btrfs_put_block_group(block_group); 8071 btrfs_put_block_group(block_group);
7895 return ret; 8072 return ret;
@@ -8032,6 +8209,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
8032 dump_space_info(space_info, 0, 0); 8209 dump_space_info(space_info, 0, 0);
8033 } 8210 }
8034 } 8211 }
8212 percpu_counter_destroy(&space_info->total_bytes_pinned);
8035 list_del(&space_info->list); 8213 list_del(&space_info->list);
8036 kfree(space_info); 8214 kfree(space_info);
8037 } 8215 }
@@ -8254,6 +8432,10 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
8254 sizeof(item)); 8432 sizeof(item));
8255 if (ret) 8433 if (ret)
8256 btrfs_abort_transaction(trans, extent_root, ret); 8434 btrfs_abort_transaction(trans, extent_root, ret);
8435 ret = btrfs_finish_chunk_alloc(trans, extent_root,
8436 key.objectid, key.offset);
8437 if (ret)
8438 btrfs_abort_transaction(trans, extent_root, ret);
8257 } 8439 }
8258} 8440}
8259 8441
@@ -8591,8 +8773,15 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
8591 if (end - start >= range->minlen) { 8773 if (end - start >= range->minlen) {
8592 if (!block_group_cache_done(cache)) { 8774 if (!block_group_cache_done(cache)) {
8593 ret = cache_block_group(cache, 0); 8775 ret = cache_block_group(cache, 0);
8594 if (!ret) 8776 if (ret) {
8595 wait_block_group_cache_done(cache); 8777 btrfs_put_block_group(cache);
8778 break;
8779 }
8780 ret = wait_block_group_cache_done(cache);
8781 if (ret) {
8782 btrfs_put_block_group(cache);
8783 break;
8784 }
8596 } 8785 }
8597 ret = btrfs_trim_block_group(cache, 8786 ret = btrfs_trim_block_group(cache,
8598 &group_trimmed, 8787 &group_trimmed,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 6bca9472f313..583d98bd065e 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -77,10 +77,29 @@ void btrfs_leak_debug_check(void)
77 kmem_cache_free(extent_buffer_cache, eb); 77 kmem_cache_free(extent_buffer_cache, eb);
78 } 78 }
79} 79}
80
81#define btrfs_debug_check_extent_io_range(inode, start, end) \
82 __btrfs_debug_check_extent_io_range(__func__, (inode), (start), (end))
83static inline void __btrfs_debug_check_extent_io_range(const char *caller,
84 struct inode *inode, u64 start, u64 end)
85{
86 u64 isize = i_size_read(inode);
87
88 if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
89 printk_ratelimited(KERN_DEBUG
90 "btrfs: %s: ino %llu isize %llu odd range [%llu,%llu]\n",
91 caller,
92 (unsigned long long)btrfs_ino(inode),
93 (unsigned long long)isize,
94 (unsigned long long)start,
95 (unsigned long long)end);
96 }
97}
80#else 98#else
81#define btrfs_leak_debug_add(new, head) do {} while (0) 99#define btrfs_leak_debug_add(new, head) do {} while (0)
82#define btrfs_leak_debug_del(entry) do {} while (0) 100#define btrfs_leak_debug_del(entry) do {} while (0)
83#define btrfs_leak_debug_check() do {} while (0) 101#define btrfs_leak_debug_check() do {} while (0)
102#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0)
84#endif 103#endif
85 104
86#define BUFFER_LRU_MAX 64 105#define BUFFER_LRU_MAX 64
@@ -522,6 +541,11 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
522 int err; 541 int err;
523 int clear = 0; 542 int clear = 0;
524 543
544 btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
545
546 if (bits & EXTENT_DELALLOC)
547 bits |= EXTENT_NORESERVE;
548
525 if (delete) 549 if (delete)
526 bits |= ~EXTENT_CTLBITS; 550 bits |= ~EXTENT_CTLBITS;
527 bits |= EXTENT_FIRST_DELALLOC; 551 bits |= EXTENT_FIRST_DELALLOC;
@@ -677,6 +701,8 @@ static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
677 struct extent_state *state; 701 struct extent_state *state;
678 struct rb_node *node; 702 struct rb_node *node;
679 703
704 btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
705
680 spin_lock(&tree->lock); 706 spin_lock(&tree->lock);
681again: 707again:
682 while (1) { 708 while (1) {
@@ -769,6 +795,8 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
769 u64 last_start; 795 u64 last_start;
770 u64 last_end; 796 u64 last_end;
771 797
798 btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
799
772 bits |= EXTENT_FIRST_DELALLOC; 800 bits |= EXTENT_FIRST_DELALLOC;
773again: 801again:
774 if (!prealloc && (mask & __GFP_WAIT)) { 802 if (!prealloc && (mask & __GFP_WAIT)) {
@@ -989,6 +1017,8 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
989 u64 last_start; 1017 u64 last_start;
990 u64 last_end; 1018 u64 last_end;
991 1019
1020 btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
1021
992again: 1022again:
993 if (!prealloc && (mask & __GFP_WAIT)) { 1023 if (!prealloc && (mask & __GFP_WAIT)) {
994 prealloc = alloc_extent_state(mask); 1024 prealloc = alloc_extent_state(mask);
@@ -2450,11 +2480,12 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
2450 struct extent_state *cached = NULL; 2480 struct extent_state *cached = NULL;
2451 struct extent_state *state; 2481 struct extent_state *state;
2452 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); 2482 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
2483 struct inode *inode = page->mapping->host;
2453 2484
2454 pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, " 2485 pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
2455 "mirror=%lu\n", (u64)bio->bi_sector, err, 2486 "mirror=%lu\n", (u64)bio->bi_sector, err,
2456 io_bio->mirror_num); 2487 io_bio->mirror_num);
2457 tree = &BTRFS_I(page->mapping->host)->io_tree; 2488 tree = &BTRFS_I(inode)->io_tree;
2458 2489
2459 /* We always issue full-page reads, but if some block 2490 /* We always issue full-page reads, but if some block
2460 * in a page fails to read, blk_update_request() will 2491 * in a page fails to read, blk_update_request() will
@@ -2528,6 +2559,14 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
2528 unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC); 2559 unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
2529 2560
2530 if (uptodate) { 2561 if (uptodate) {
2562 loff_t i_size = i_size_read(inode);
2563 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2564 unsigned offset;
2565
2566 /* Zero out the end if this page straddles i_size */
2567 offset = i_size & (PAGE_CACHE_SIZE-1);
2568 if (page->index == end_index && offset)
2569 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2531 SetPageUptodate(page); 2570 SetPageUptodate(page);
2532 } else { 2571 } else {
2533 ClearPageUptodate(page); 2572 ClearPageUptodate(page);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 41fb81e7ec53..3b8c4e26e1da 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -19,6 +19,7 @@
19#define EXTENT_FIRST_DELALLOC (1 << 12) 19#define EXTENT_FIRST_DELALLOC (1 << 12)
20#define EXTENT_NEED_WAIT (1 << 13) 20#define EXTENT_NEED_WAIT (1 << 13)
21#define EXTENT_DAMAGED (1 << 14) 21#define EXTENT_DAMAGED (1 << 14)
22#define EXTENT_NORESERVE (1 << 15)
22#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) 23#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
23#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) 24#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
24 25
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index b193bf324a41..a7bfc9541803 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -34,8 +34,7 @@
34 34
35#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \ 35#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \
36 sizeof(struct btrfs_ordered_sum)) / \ 36 sizeof(struct btrfs_ordered_sum)) / \
37 sizeof(struct btrfs_sector_sum) * \ 37 sizeof(u32) * (r)->sectorsize)
38 (r)->sectorsize - (r)->sectorsize)
39 38
40int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, 39int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
41 struct btrfs_root *root, 40 struct btrfs_root *root,
@@ -297,7 +296,6 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
297 struct btrfs_path *path; 296 struct btrfs_path *path;
298 struct extent_buffer *leaf; 297 struct extent_buffer *leaf;
299 struct btrfs_ordered_sum *sums; 298 struct btrfs_ordered_sum *sums;
300 struct btrfs_sector_sum *sector_sum;
301 struct btrfs_csum_item *item; 299 struct btrfs_csum_item *item;
302 LIST_HEAD(tmplist); 300 LIST_HEAD(tmplist);
303 unsigned long offset; 301 unsigned long offset;
@@ -368,34 +366,28 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
368 struct btrfs_csum_item); 366 struct btrfs_csum_item);
369 while (start < csum_end) { 367 while (start < csum_end) {
370 size = min_t(size_t, csum_end - start, 368 size = min_t(size_t, csum_end - start,
371 MAX_ORDERED_SUM_BYTES(root)); 369 MAX_ORDERED_SUM_BYTES(root));
372 sums = kzalloc(btrfs_ordered_sum_size(root, size), 370 sums = kzalloc(btrfs_ordered_sum_size(root, size),
373 GFP_NOFS); 371 GFP_NOFS);
374 if (!sums) { 372 if (!sums) {
375 ret = -ENOMEM; 373 ret = -ENOMEM;
376 goto fail; 374 goto fail;
377 } 375 }
378 376
379 sector_sum = sums->sums;
380 sums->bytenr = start; 377 sums->bytenr = start;
381 sums->len = size; 378 sums->len = (int)size;
382 379
383 offset = (start - key.offset) >> 380 offset = (start - key.offset) >>
384 root->fs_info->sb->s_blocksize_bits; 381 root->fs_info->sb->s_blocksize_bits;
385 offset *= csum_size; 382 offset *= csum_size;
383 size >>= root->fs_info->sb->s_blocksize_bits;
386 384
387 while (size > 0) { 385 read_extent_buffer(path->nodes[0],
388 read_extent_buffer(path->nodes[0], 386 sums->sums,
389 &sector_sum->sum, 387 ((unsigned long)item) + offset,
390 ((unsigned long)item) + 388 csum_size * size);
391 offset, csum_size); 389
392 sector_sum->bytenr = start; 390 start += root->sectorsize * size;
393
394 size -= root->sectorsize;
395 start += root->sectorsize;
396 offset += csum_size;
397 sector_sum++;
398 }
399 list_add_tail(&sums->list, &tmplist); 391 list_add_tail(&sums->list, &tmplist);
400 } 392 }
401 path->slots[0]++; 393 path->slots[0]++;
@@ -417,23 +409,20 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
417 struct bio *bio, u64 file_start, int contig) 409 struct bio *bio, u64 file_start, int contig)
418{ 410{
419 struct btrfs_ordered_sum *sums; 411 struct btrfs_ordered_sum *sums;
420 struct btrfs_sector_sum *sector_sum;
421 struct btrfs_ordered_extent *ordered; 412 struct btrfs_ordered_extent *ordered;
422 char *data; 413 char *data;
423 struct bio_vec *bvec = bio->bi_io_vec; 414 struct bio_vec *bvec = bio->bi_io_vec;
424 int bio_index = 0; 415 int bio_index = 0;
416 int index;
425 unsigned long total_bytes = 0; 417 unsigned long total_bytes = 0;
426 unsigned long this_sum_bytes = 0; 418 unsigned long this_sum_bytes = 0;
427 u64 offset; 419 u64 offset;
428 u64 disk_bytenr;
429 420
430 WARN_ON(bio->bi_vcnt <= 0); 421 WARN_ON(bio->bi_vcnt <= 0);
431 sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS); 422 sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS);
432 if (!sums) 423 if (!sums)
433 return -ENOMEM; 424 return -ENOMEM;
434 425
435 sector_sum = sums->sums;
436 disk_bytenr = (u64)bio->bi_sector << 9;
437 sums->len = bio->bi_size; 426 sums->len = bio->bi_size;
438 INIT_LIST_HEAD(&sums->list); 427 INIT_LIST_HEAD(&sums->list);
439 428
@@ -444,7 +433,8 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
444 433
445 ordered = btrfs_lookup_ordered_extent(inode, offset); 434 ordered = btrfs_lookup_ordered_extent(inode, offset);
446 BUG_ON(!ordered); /* Logic error */ 435 BUG_ON(!ordered); /* Logic error */
447 sums->bytenr = ordered->start; 436 sums->bytenr = (u64)bio->bi_sector << 9;
437 index = 0;
448 438
449 while (bio_index < bio->bi_vcnt) { 439 while (bio_index < bio->bi_vcnt) {
450 if (!contig) 440 if (!contig)
@@ -463,28 +453,27 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
463 sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left), 453 sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left),
464 GFP_NOFS); 454 GFP_NOFS);
465 BUG_ON(!sums); /* -ENOMEM */ 455 BUG_ON(!sums); /* -ENOMEM */
466 sector_sum = sums->sums;
467 sums->len = bytes_left; 456 sums->len = bytes_left;
468 ordered = btrfs_lookup_ordered_extent(inode, offset); 457 ordered = btrfs_lookup_ordered_extent(inode, offset);
469 BUG_ON(!ordered); /* Logic error */ 458 BUG_ON(!ordered); /* Logic error */
470 sums->bytenr = ordered->start; 459 sums->bytenr = ((u64)bio->bi_sector << 9) +
460 total_bytes;
461 index = 0;
471 } 462 }
472 463
473 data = kmap_atomic(bvec->bv_page); 464 data = kmap_atomic(bvec->bv_page);
474 sector_sum->sum = ~(u32)0; 465 sums->sums[index] = ~(u32)0;
475 sector_sum->sum = btrfs_csum_data(data + bvec->bv_offset, 466 sums->sums[index] = btrfs_csum_data(data + bvec->bv_offset,
476 sector_sum->sum, 467 sums->sums[index],
477 bvec->bv_len); 468 bvec->bv_len);
478 kunmap_atomic(data); 469 kunmap_atomic(data);
479 btrfs_csum_final(sector_sum->sum, 470 btrfs_csum_final(sums->sums[index],
480 (char *)&sector_sum->sum); 471 (char *)(sums->sums + index));
481 sector_sum->bytenr = disk_bytenr;
482 472
483 sector_sum++;
484 bio_index++; 473 bio_index++;
474 index++;
485 total_bytes += bvec->bv_len; 475 total_bytes += bvec->bv_len;
486 this_sum_bytes += bvec->bv_len; 476 this_sum_bytes += bvec->bv_len;
487 disk_bytenr += bvec->bv_len;
488 offset += bvec->bv_len; 477 offset += bvec->bv_len;
489 bvec++; 478 bvec++;
490 } 479 }
@@ -672,62 +661,46 @@ out:
672 return ret; 661 return ret;
673} 662}
674 663
675static u64 btrfs_sector_sum_left(struct btrfs_ordered_sum *sums,
676 struct btrfs_sector_sum *sector_sum,
677 u64 total_bytes, u64 sectorsize)
678{
679 u64 tmp = sectorsize;
680 u64 next_sector = sector_sum->bytenr;
681 struct btrfs_sector_sum *next = sector_sum + 1;
682
683 while ((tmp + total_bytes) < sums->len) {
684 if (next_sector + sectorsize != next->bytenr)
685 break;
686 tmp += sectorsize;
687 next_sector = next->bytenr;
688 next++;
689 }
690 return tmp;
691}
692
693int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, 664int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
694 struct btrfs_root *root, 665 struct btrfs_root *root,
695 struct btrfs_ordered_sum *sums) 666 struct btrfs_ordered_sum *sums)
696{ 667{
697 u64 bytenr;
698 int ret;
699 struct btrfs_key file_key; 668 struct btrfs_key file_key;
700 struct btrfs_key found_key; 669 struct btrfs_key found_key;
701 u64 next_offset;
702 u64 total_bytes = 0;
703 int found_next;
704 struct btrfs_path *path; 670 struct btrfs_path *path;
705 struct btrfs_csum_item *item; 671 struct btrfs_csum_item *item;
706 struct btrfs_csum_item *item_end; 672 struct btrfs_csum_item *item_end;
707 struct extent_buffer *leaf = NULL; 673 struct extent_buffer *leaf = NULL;
674 u64 next_offset;
675 u64 total_bytes = 0;
708 u64 csum_offset; 676 u64 csum_offset;
709 struct btrfs_sector_sum *sector_sum; 677 u64 bytenr;
710 u32 nritems; 678 u32 nritems;
711 u32 ins_size; 679 u32 ins_size;
680 int index = 0;
681 int found_next;
682 int ret;
712 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); 683 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
713 684
714 path = btrfs_alloc_path(); 685 path = btrfs_alloc_path();
715 if (!path) 686 if (!path)
716 return -ENOMEM; 687 return -ENOMEM;
717
718 sector_sum = sums->sums;
719again: 688again:
720 next_offset = (u64)-1; 689 next_offset = (u64)-1;
721 found_next = 0; 690 found_next = 0;
691 bytenr = sums->bytenr + total_bytes;
722 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; 692 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
723 file_key.offset = sector_sum->bytenr; 693 file_key.offset = bytenr;
724 bytenr = sector_sum->bytenr;
725 btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY); 694 btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY);
726 695
727 item = btrfs_lookup_csum(trans, root, path, sector_sum->bytenr, 1); 696 item = btrfs_lookup_csum(trans, root, path, bytenr, 1);
728 if (!IS_ERR(item)) { 697 if (!IS_ERR(item)) {
729 leaf = path->nodes[0];
730 ret = 0; 698 ret = 0;
699 leaf = path->nodes[0];
700 item_end = btrfs_item_ptr(leaf, path->slots[0],
701 struct btrfs_csum_item);
702 item_end = (struct btrfs_csum_item *)((char *)item_end +
703 btrfs_item_size_nr(leaf, path->slots[0]));
731 goto found; 704 goto found;
732 } 705 }
733 ret = PTR_ERR(item); 706 ret = PTR_ERR(item);
@@ -807,8 +780,7 @@ again:
807 780
808 free_space = btrfs_leaf_free_space(root, leaf) - 781 free_space = btrfs_leaf_free_space(root, leaf) -
809 sizeof(struct btrfs_item) - csum_size; 782 sizeof(struct btrfs_item) - csum_size;
810 tmp = btrfs_sector_sum_left(sums, sector_sum, total_bytes, 783 tmp = sums->len - total_bytes;
811 root->sectorsize);
812 tmp >>= root->fs_info->sb->s_blocksize_bits; 784 tmp >>= root->fs_info->sb->s_blocksize_bits;
813 WARN_ON(tmp < 1); 785 WARN_ON(tmp < 1);
814 786
@@ -822,6 +794,7 @@ again:
822 diff *= csum_size; 794 diff *= csum_size;
823 795
824 btrfs_extend_item(root, path, diff); 796 btrfs_extend_item(root, path, diff);
797 ret = 0;
825 goto csum; 798 goto csum;
826 } 799 }
827 800
@@ -831,8 +804,7 @@ insert:
831 if (found_next) { 804 if (found_next) {
832 u64 tmp; 805 u64 tmp;
833 806
834 tmp = btrfs_sector_sum_left(sums, sector_sum, total_bytes, 807 tmp = sums->len - total_bytes;
835 root->sectorsize);
836 tmp >>= root->fs_info->sb->s_blocksize_bits; 808 tmp >>= root->fs_info->sb->s_blocksize_bits;
837 tmp = min(tmp, (next_offset - file_key.offset) >> 809 tmp = min(tmp, (next_offset - file_key.offset) >>
838 root->fs_info->sb->s_blocksize_bits); 810 root->fs_info->sb->s_blocksize_bits);
@@ -853,31 +825,25 @@ insert:
853 WARN_ON(1); 825 WARN_ON(1);
854 goto fail_unlock; 826 goto fail_unlock;
855 } 827 }
856csum:
857 leaf = path->nodes[0]; 828 leaf = path->nodes[0];
829csum:
858 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); 830 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
859 ret = 0; 831 item_end = (struct btrfs_csum_item *)((unsigned char *)item +
832 btrfs_item_size_nr(leaf, path->slots[0]));
860 item = (struct btrfs_csum_item *)((unsigned char *)item + 833 item = (struct btrfs_csum_item *)((unsigned char *)item +
861 csum_offset * csum_size); 834 csum_offset * csum_size);
862found: 835found:
863 item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); 836 ins_size = (u32)(sums->len - total_bytes) >>
864 item_end = (struct btrfs_csum_item *)((unsigned char *)item_end + 837 root->fs_info->sb->s_blocksize_bits;
865 btrfs_item_size_nr(leaf, path->slots[0])); 838 ins_size *= csum_size;
866next_sector: 839 ins_size = min_t(u32, (unsigned long)item_end - (unsigned long)item,
867 840 ins_size);
868 write_extent_buffer(leaf, &sector_sum->sum, (unsigned long)item, csum_size); 841 write_extent_buffer(leaf, sums->sums + index, (unsigned long)item,
869 842 ins_size);
870 total_bytes += root->sectorsize; 843
871 sector_sum++; 844 ins_size /= csum_size;
872 if (total_bytes < sums->len) { 845 total_bytes += ins_size * root->sectorsize;
873 item = (struct btrfs_csum_item *)((char *)item + 846 index += ins_size;
874 csum_size);
875 if (item < item_end && bytenr + PAGE_CACHE_SIZE ==
876 sector_sum->bytenr) {
877 bytenr = sector_sum->bytenr;
878 goto next_sector;
879 }
880 }
881 847
882 btrfs_mark_buffer_dirty(path->nodes[0]); 848 btrfs_mark_buffer_dirty(path->nodes[0]);
883 if (total_bytes < sums->len) { 849 if (total_bytes < sums->len) {
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 4205ba752d40..a005fe2c072a 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -309,10 +309,6 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
309 ret = PTR_ERR(inode_root); 309 ret = PTR_ERR(inode_root);
310 goto cleanup; 310 goto cleanup;
311 } 311 }
312 if (btrfs_root_refs(&inode_root->root_item) == 0) {
313 ret = -ENOENT;
314 goto cleanup;
315 }
316 312
317 key.objectid = defrag->ino; 313 key.objectid = defrag->ino;
318 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 314 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
@@ -1317,6 +1313,56 @@ fail:
1317 1313
1318} 1314}
1319 1315
1316static noinline int check_can_nocow(struct inode *inode, loff_t pos,
1317 size_t *write_bytes)
1318{
1319 struct btrfs_trans_handle *trans;
1320 struct btrfs_root *root = BTRFS_I(inode)->root;
1321 struct btrfs_ordered_extent *ordered;
1322 u64 lockstart, lockend;
1323 u64 num_bytes;
1324 int ret;
1325
1326 lockstart = round_down(pos, root->sectorsize);
1327 lockend = lockstart + round_up(*write_bytes, root->sectorsize) - 1;
1328
1329 while (1) {
1330 lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
1331 ordered = btrfs_lookup_ordered_range(inode, lockstart,
1332 lockend - lockstart + 1);
1333 if (!ordered) {
1334 break;
1335 }
1336 unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
1337 btrfs_start_ordered_extent(inode, ordered, 1);
1338 btrfs_put_ordered_extent(ordered);
1339 }
1340
1341 trans = btrfs_join_transaction(root);
1342 if (IS_ERR(trans)) {
1343 unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
1344 return PTR_ERR(trans);
1345 }
1346
1347 num_bytes = lockend - lockstart + 1;
1348 ret = can_nocow_extent(trans, inode, lockstart, &num_bytes, NULL, NULL,
1349 NULL);
1350 btrfs_end_transaction(trans, root);
1351 if (ret <= 0) {
1352 ret = 0;
1353 } else {
1354 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
1355 EXTENT_DIRTY | EXTENT_DELALLOC |
1356 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
1357 NULL, GFP_NOFS);
1358 *write_bytes = min_t(size_t, *write_bytes, num_bytes);
1359 }
1360
1361 unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
1362
1363 return ret;
1364}
1365
1320static noinline ssize_t __btrfs_buffered_write(struct file *file, 1366static noinline ssize_t __btrfs_buffered_write(struct file *file,
1321 struct iov_iter *i, 1367 struct iov_iter *i,
1322 loff_t pos) 1368 loff_t pos)
@@ -1324,10 +1370,12 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1324 struct inode *inode = file_inode(file); 1370 struct inode *inode = file_inode(file);
1325 struct btrfs_root *root = BTRFS_I(inode)->root; 1371 struct btrfs_root *root = BTRFS_I(inode)->root;
1326 struct page **pages = NULL; 1372 struct page **pages = NULL;
1373 u64 release_bytes = 0;
1327 unsigned long first_index; 1374 unsigned long first_index;
1328 size_t num_written = 0; 1375 size_t num_written = 0;
1329 int nrptrs; 1376 int nrptrs;
1330 int ret = 0; 1377 int ret = 0;
1378 bool only_release_metadata = false;
1331 bool force_page_uptodate = false; 1379 bool force_page_uptodate = false;
1332 1380
1333 nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) / 1381 nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
@@ -1348,6 +1396,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1348 offset); 1396 offset);
1349 size_t num_pages = (write_bytes + offset + 1397 size_t num_pages = (write_bytes + offset +
1350 PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1398 PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1399 size_t reserve_bytes;
1351 size_t dirty_pages; 1400 size_t dirty_pages;
1352 size_t copied; 1401 size_t copied;
1353 1402
@@ -1362,11 +1411,41 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1362 break; 1411 break;
1363 } 1412 }
1364 1413
1365 ret = btrfs_delalloc_reserve_space(inode, 1414 reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
1366 num_pages << PAGE_CACHE_SHIFT); 1415 ret = btrfs_check_data_free_space(inode, reserve_bytes);
1416 if (ret == -ENOSPC &&
1417 (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
1418 BTRFS_INODE_PREALLOC))) {
1419 ret = check_can_nocow(inode, pos, &write_bytes);
1420 if (ret > 0) {
1421 only_release_metadata = true;
1422 /*
1423 * our prealloc extent may be smaller than
1424 * write_bytes, so scale down.
1425 */
1426 num_pages = (write_bytes + offset +
1427 PAGE_CACHE_SIZE - 1) >>
1428 PAGE_CACHE_SHIFT;
1429 reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
1430 ret = 0;
1431 } else {
1432 ret = -ENOSPC;
1433 }
1434 }
1435
1367 if (ret) 1436 if (ret)
1368 break; 1437 break;
1369 1438
1439 ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes);
1440 if (ret) {
1441 if (!only_release_metadata)
1442 btrfs_free_reserved_data_space(inode,
1443 reserve_bytes);
1444 break;
1445 }
1446
1447 release_bytes = reserve_bytes;
1448
1370 /* 1449 /*
1371 * This is going to setup the pages array with the number of 1450 * This is going to setup the pages array with the number of
1372 * pages we want, so we don't really need to worry about the 1451 * pages we want, so we don't really need to worry about the
@@ -1375,11 +1454,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1375 ret = prepare_pages(root, file, pages, num_pages, 1454 ret = prepare_pages(root, file, pages, num_pages,
1376 pos, first_index, write_bytes, 1455 pos, first_index, write_bytes,
1377 force_page_uptodate); 1456 force_page_uptodate);
1378 if (ret) { 1457 if (ret)
1379 btrfs_delalloc_release_space(inode,
1380 num_pages << PAGE_CACHE_SHIFT);
1381 break; 1458 break;
1382 }
1383 1459
1384 copied = btrfs_copy_from_user(pos, num_pages, 1460 copied = btrfs_copy_from_user(pos, num_pages,
1385 write_bytes, pages, i); 1461 write_bytes, pages, i);
@@ -1409,30 +1485,46 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1409 * managed to copy. 1485 * managed to copy.
1410 */ 1486 */
1411 if (num_pages > dirty_pages) { 1487 if (num_pages > dirty_pages) {
1488 release_bytes = (num_pages - dirty_pages) <<
1489 PAGE_CACHE_SHIFT;
1412 if (copied > 0) { 1490 if (copied > 0) {
1413 spin_lock(&BTRFS_I(inode)->lock); 1491 spin_lock(&BTRFS_I(inode)->lock);
1414 BTRFS_I(inode)->outstanding_extents++; 1492 BTRFS_I(inode)->outstanding_extents++;
1415 spin_unlock(&BTRFS_I(inode)->lock); 1493 spin_unlock(&BTRFS_I(inode)->lock);
1416 } 1494 }
1417 btrfs_delalloc_release_space(inode, 1495 if (only_release_metadata)
1418 (num_pages - dirty_pages) << 1496 btrfs_delalloc_release_metadata(inode,
1419 PAGE_CACHE_SHIFT); 1497 release_bytes);
1498 else
1499 btrfs_delalloc_release_space(inode,
1500 release_bytes);
1420 } 1501 }
1421 1502
1503 release_bytes = dirty_pages << PAGE_CACHE_SHIFT;
1422 if (copied > 0) { 1504 if (copied > 0) {
1423 ret = btrfs_dirty_pages(root, inode, pages, 1505 ret = btrfs_dirty_pages(root, inode, pages,
1424 dirty_pages, pos, copied, 1506 dirty_pages, pos, copied,
1425 NULL); 1507 NULL);
1426 if (ret) { 1508 if (ret) {
1427 btrfs_delalloc_release_space(inode,
1428 dirty_pages << PAGE_CACHE_SHIFT);
1429 btrfs_drop_pages(pages, num_pages); 1509 btrfs_drop_pages(pages, num_pages);
1430 break; 1510 break;
1431 } 1511 }
1432 } 1512 }
1433 1513
1514 release_bytes = 0;
1434 btrfs_drop_pages(pages, num_pages); 1515 btrfs_drop_pages(pages, num_pages);
1435 1516
1517 if (only_release_metadata && copied > 0) {
1518 u64 lockstart = round_down(pos, root->sectorsize);
1519 u64 lockend = lockstart +
1520 (dirty_pages << PAGE_CACHE_SHIFT) - 1;
1521
1522 set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
1523 lockend, EXTENT_NORESERVE, NULL,
1524 NULL, GFP_NOFS);
1525 only_release_metadata = false;
1526 }
1527
1436 cond_resched(); 1528 cond_resched();
1437 1529
1438 balance_dirty_pages_ratelimited(inode->i_mapping); 1530 balance_dirty_pages_ratelimited(inode->i_mapping);
@@ -1445,6 +1537,13 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1445 1537
1446 kfree(pages); 1538 kfree(pages);
1447 1539
1540 if (release_bytes) {
1541 if (only_release_metadata)
1542 btrfs_delalloc_release_metadata(inode, release_bytes);
1543 else
1544 btrfs_delalloc_release_space(inode, release_bytes);
1545 }
1546
1448 return num_written ? num_written : ret; 1547 return num_written ? num_written : ret;
1449} 1548}
1450 1549
@@ -2175,12 +2274,6 @@ static long btrfs_fallocate(struct file *file, int mode,
2175 goto out_reserve_fail; 2274 goto out_reserve_fail;
2176 } 2275 }
2177 2276
2178 /*
2179 * wait for ordered IO before we have any locks. We'll loop again
2180 * below with the locks held.
2181 */
2182 btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
2183
2184 mutex_lock(&inode->i_mutex); 2277 mutex_lock(&inode->i_mutex);
2185 ret = inode_newsize_ok(inode, alloc_end); 2278 ret = inode_newsize_ok(inode, alloc_end);
2186 if (ret) 2279 if (ret)
@@ -2191,8 +2284,23 @@ static long btrfs_fallocate(struct file *file, int mode,
2191 alloc_start); 2284 alloc_start);
2192 if (ret) 2285 if (ret)
2193 goto out; 2286 goto out;
2287 } else {
2288 /*
2289 * If we are fallocating from the end of the file onward we
2290 * need to zero out the end of the page if i_size lands in the
2291 * middle of a page.
2292 */
2293 ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);
2294 if (ret)
2295 goto out;
2194 } 2296 }
2195 2297
2298 /*
2299 * wait for ordered IO before we have any locks. We'll loop again
2300 * below with the locks held.
2301 */
2302 btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
2303
2196 locked_end = alloc_end - 1; 2304 locked_end = alloc_end - 1;
2197 while (1) { 2305 while (1) {
2198 struct btrfs_ordered_extent *ordered; 2306 struct btrfs_ordered_extent *ordered;
@@ -2425,20 +2533,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
2425 } 2533 }
2426 } 2534 }
2427 2535
2428 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) { 2536 offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
2429 offset = -EINVAL;
2430 goto out;
2431 }
2432 if (offset > inode->i_sb->s_maxbytes) {
2433 offset = -EINVAL;
2434 goto out;
2435 }
2436
2437 /* Special lock needed here? */
2438 if (offset != file->f_pos) {
2439 file->f_pos = offset;
2440 file->f_version = 0;
2441 }
2442out: 2537out:
2443 mutex_unlock(&inode->i_mutex); 2538 mutex_unlock(&inode->i_mutex);
2444 return offset; 2539 return offset;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index e53009657f0e..b21a3cd667d8 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -213,7 +213,7 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_root *root,
213 else 213 else
214 ret = 0; 214 ret = 0;
215 spin_unlock(&rsv->lock); 215 spin_unlock(&rsv->lock);
216 return 0; 216 return ret;
217} 217}
218 218
219int btrfs_truncate_free_space_cache(struct btrfs_root *root, 219int btrfs_truncate_free_space_cache(struct btrfs_root *root,
@@ -3150,6 +3150,8 @@ again:
3150 return 0; 3150 return 0;
3151} 3151}
3152 3152
3153#define test_msg(fmt, ...) printk(KERN_INFO "btrfs: selftest: " fmt, ##__VA_ARGS__)
3154
3153/* 3155/*
3154 * This test just does basic sanity checking, making sure we can add an exten 3156 * This test just does basic sanity checking, making sure we can add an exten
3155 * entry and remove space from either end and the middle, and make sure we can 3157 * entry and remove space from either end and the middle, and make sure we can
@@ -3159,63 +3161,63 @@ static int test_extents(struct btrfs_block_group_cache *cache)
3159{ 3161{
3160 int ret = 0; 3162 int ret = 0;
3161 3163
3162 printk(KERN_ERR "Running extent only tests\n"); 3164 test_msg("Running extent only tests\n");
3163 3165
3164 /* First just make sure we can remove an entire entry */ 3166 /* First just make sure we can remove an entire entry */
3165 ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024); 3167 ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024);
3166 if (ret) { 3168 if (ret) {
3167 printk(KERN_ERR "Error adding initial extents %d\n", ret); 3169 test_msg("Error adding initial extents %d\n", ret);
3168 return ret; 3170 return ret;
3169 } 3171 }
3170 3172
3171 ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024); 3173 ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024);
3172 if (ret) { 3174 if (ret) {
3173 printk(KERN_ERR "Error removing extent %d\n", ret); 3175 test_msg("Error removing extent %d\n", ret);
3174 return ret; 3176 return ret;
3175 } 3177 }
3176 3178
3177 if (check_exists(cache, 0, 4 * 1024 * 1024)) { 3179 if (check_exists(cache, 0, 4 * 1024 * 1024)) {
3178 printk(KERN_ERR "Full remove left some lingering space\n"); 3180 test_msg("Full remove left some lingering space\n");
3179 return -1; 3181 return -1;
3180 } 3182 }
3181 3183
3182 /* Ok edge and middle cases now */ 3184 /* Ok edge and middle cases now */
3183 ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024); 3185 ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024);
3184 if (ret) { 3186 if (ret) {
3185 printk(KERN_ERR "Error adding half extent %d\n", ret); 3187 test_msg("Error adding half extent %d\n", ret);
3186 return ret; 3188 return ret;
3187 } 3189 }
3188 3190
3189 ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 1 * 1024 * 1024); 3191 ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 1 * 1024 * 1024);
3190 if (ret) { 3192 if (ret) {
3191 printk(KERN_ERR "Error removing tail end %d\n", ret); 3193 test_msg("Error removing tail end %d\n", ret);
3192 return ret; 3194 return ret;
3193 } 3195 }
3194 3196
3195 ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024); 3197 ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024);
3196 if (ret) { 3198 if (ret) {
3197 printk(KERN_ERR "Error removing front end %d\n", ret); 3199 test_msg("Error removing front end %d\n", ret);
3198 return ret; 3200 return ret;
3199 } 3201 }
3200 3202
3201 ret = btrfs_remove_free_space(cache, 2 * 1024 * 1024, 4096); 3203 ret = btrfs_remove_free_space(cache, 2 * 1024 * 1024, 4096);
3202 if (ret) { 3204 if (ret) {
3203 printk(KERN_ERR "Error removing middle peice %d\n", ret); 3205 test_msg("Error removing middle piece %d\n", ret);
3204 return ret; 3206 return ret;
3205 } 3207 }
3206 3208
3207 if (check_exists(cache, 0, 1 * 1024 * 1024)) { 3209 if (check_exists(cache, 0, 1 * 1024 * 1024)) {
3208 printk(KERN_ERR "Still have space at the front\n"); 3210 test_msg("Still have space at the front\n");
3209 return -1; 3211 return -1;
3210 } 3212 }
3211 3213
3212 if (check_exists(cache, 2 * 1024 * 1024, 4096)) { 3214 if (check_exists(cache, 2 * 1024 * 1024, 4096)) {
3213 printk(KERN_ERR "Still have space in the middle\n"); 3215 test_msg("Still have space in the middle\n");
3214 return -1; 3216 return -1;
3215 } 3217 }
3216 3218
3217 if (check_exists(cache, 3 * 1024 * 1024, 1 * 1024 * 1024)) { 3219 if (check_exists(cache, 3 * 1024 * 1024, 1 * 1024 * 1024)) {
3218 printk(KERN_ERR "Still have space at the end\n"); 3220 test_msg("Still have space at the end\n");
3219 return -1; 3221 return -1;
3220 } 3222 }
3221 3223
@@ -3230,34 +3232,34 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache)
3230 u64 next_bitmap_offset; 3232 u64 next_bitmap_offset;
3231 int ret; 3233 int ret;
3232 3234
3233 printk(KERN_ERR "Running bitmap only tests\n"); 3235 test_msg("Running bitmap only tests\n");
3234 3236
3235 ret = add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1); 3237 ret = add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1);
3236 if (ret) { 3238 if (ret) {
3237 printk(KERN_ERR "Couldn't create a bitmap entry %d\n", ret); 3239 test_msg("Couldn't create a bitmap entry %d\n", ret);
3238 return ret; 3240 return ret;
3239 } 3241 }
3240 3242
3241 ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024); 3243 ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024);
3242 if (ret) { 3244 if (ret) {
3243 printk(KERN_ERR "Error removing bitmap full range %d\n", ret); 3245 test_msg("Error removing bitmap full range %d\n", ret);
3244 return ret; 3246 return ret;
3245 } 3247 }
3246 3248
3247 if (check_exists(cache, 0, 4 * 1024 * 1024)) { 3249 if (check_exists(cache, 0, 4 * 1024 * 1024)) {
3248 printk(KERN_ERR "Left some space in bitmap\n"); 3250 test_msg("Left some space in bitmap\n");
3249 return -1; 3251 return -1;
3250 } 3252 }
3251 3253
3252 ret = add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1); 3254 ret = add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1);
3253 if (ret) { 3255 if (ret) {
3254 printk(KERN_ERR "Couldn't add to our bitmap entry %d\n", ret); 3256 test_msg("Couldn't add to our bitmap entry %d\n", ret);
3255 return ret; 3257 return ret;
3256 } 3258 }
3257 3259
3258 ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 2 * 1024 * 1024); 3260 ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 2 * 1024 * 1024);
3259 if (ret) { 3261 if (ret) {
3260 printk(KERN_ERR "Couldn't remove middle chunk %d\n", ret); 3262 test_msg("Couldn't remove middle chunk %d\n", ret);
3261 return ret; 3263 return ret;
3262 } 3264 }
3263 3265
@@ -3271,21 +3273,21 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache)
3271 ret = add_free_space_entry(cache, next_bitmap_offset - 3273 ret = add_free_space_entry(cache, next_bitmap_offset -
3272 (2 * 1024 * 1024), 4 * 1024 * 1024, 1); 3274 (2 * 1024 * 1024), 4 * 1024 * 1024, 1);
3273 if (ret) { 3275 if (ret) {
3274 printk(KERN_ERR "Couldn't add space that straddles two bitmaps" 3276 test_msg("Couldn't add space that straddles two bitmaps %d\n",
3275 " %d\n", ret); 3277 ret);
3276 return ret; 3278 return ret;
3277 } 3279 }
3278 3280
3279 ret = btrfs_remove_free_space(cache, next_bitmap_offset - 3281 ret = btrfs_remove_free_space(cache, next_bitmap_offset -
3280 (1 * 1024 * 1024), 2 * 1024 * 1024); 3282 (1 * 1024 * 1024), 2 * 1024 * 1024);
3281 if (ret) { 3283 if (ret) {
3282 printk(KERN_ERR "Couldn't remove overlapping space %d\n", ret); 3284 test_msg("Couldn't remove overlapping space %d\n", ret);
3283 return ret; 3285 return ret;
3284 } 3286 }
3285 3287
3286 if (check_exists(cache, next_bitmap_offset - (1 * 1024 * 1024), 3288 if (check_exists(cache, next_bitmap_offset - (1 * 1024 * 1024),
3287 2 * 1024 * 1024)) { 3289 2 * 1024 * 1024)) {
3288 printk(KERN_ERR "Left some space when removing overlapping\n"); 3290 test_msg("Left some space when removing overlapping\n");
3289 return -1; 3291 return -1;
3290 } 3292 }
3291 3293
@@ -3300,7 +3302,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
3300 u64 bitmap_offset = (u64)(BITS_PER_BITMAP * 4096); 3302 u64 bitmap_offset = (u64)(BITS_PER_BITMAP * 4096);
3301 int ret; 3303 int ret;
3302 3304
3303 printk(KERN_ERR "Running bitmap and extent tests\n"); 3305 test_msg("Running bitmap and extent tests\n");
3304 3306
3305 /* 3307 /*
3306 * First let's do something simple, an extent at the same offset as the 3308 * First let's do something simple, an extent at the same offset as the
@@ -3309,42 +3311,42 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
3309 */ 3311 */
3310 ret = add_free_space_entry(cache, 4 * 1024 * 1024, 1 * 1024 * 1024, 1); 3312 ret = add_free_space_entry(cache, 4 * 1024 * 1024, 1 * 1024 * 1024, 1);
3311 if (ret) { 3313 if (ret) {
3312 printk(KERN_ERR "Couldn't create bitmap entry %d\n", ret); 3314 test_msg("Couldn't create bitmap entry %d\n", ret);
3313 return ret; 3315 return ret;
3314 } 3316 }
3315 3317
3316 ret = add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0); 3318 ret = add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0);
3317 if (ret) { 3319 if (ret) {
3318 printk(KERN_ERR "Couldn't add extent entry %d\n", ret); 3320 test_msg("Couldn't add extent entry %d\n", ret);
3319 return ret; 3321 return ret;
3320 } 3322 }
3321 3323
3322 ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024); 3324 ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024);
3323 if (ret) { 3325 if (ret) {
3324 printk(KERN_ERR "Couldn't remove extent entry %d\n", ret); 3326 test_msg("Couldn't remove extent entry %d\n", ret);
3325 return ret; 3327 return ret;
3326 } 3328 }
3327 3329
3328 if (check_exists(cache, 0, 1 * 1024 * 1024)) { 3330 if (check_exists(cache, 0, 1 * 1024 * 1024)) {
3329 printk(KERN_ERR "Left remnants after our remove\n"); 3331 test_msg("Left remnants after our remove\n");
3330 return -1; 3332 return -1;
3331 } 3333 }
3332 3334
3333 /* Now to add back the extent entry and remove from the bitmap */ 3335 /* Now to add back the extent entry and remove from the bitmap */
3334 ret = add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0); 3336 ret = add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0);
3335 if (ret) { 3337 if (ret) {
3336 printk(KERN_ERR "Couldn't re-add extent entry %d\n", ret); 3338 test_msg("Couldn't re-add extent entry %d\n", ret);
3337 return ret; 3339 return ret;
3338 } 3340 }
3339 3341
3340 ret = btrfs_remove_free_space(cache, 4 * 1024 * 1024, 1 * 1024 * 1024); 3342 ret = btrfs_remove_free_space(cache, 4 * 1024 * 1024, 1 * 1024 * 1024);
3341 if (ret) { 3343 if (ret) {
3342 printk(KERN_ERR "Couldn't remove from bitmap %d\n", ret); 3344 test_msg("Couldn't remove from bitmap %d\n", ret);
3343 return ret; 3345 return ret;
3344 } 3346 }
3345 3347
3346 if (check_exists(cache, 4 * 1024 * 1024, 1 * 1024 * 1024)) { 3348 if (check_exists(cache, 4 * 1024 * 1024, 1 * 1024 * 1024)) {
3347 printk(KERN_ERR "Left remnants in the bitmap\n"); 3349 test_msg("Left remnants in the bitmap\n");
3348 return -1; 3350 return -1;
3349 } 3351 }
3350 3352
@@ -3354,19 +3356,18 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
3354 */ 3356 */
3355 ret = add_free_space_entry(cache, 1 * 1024 * 1024, 4 * 1024 * 1024, 1); 3357 ret = add_free_space_entry(cache, 1 * 1024 * 1024, 4 * 1024 * 1024, 1);
3356 if (ret) { 3358 if (ret) {
3357 printk(KERN_ERR "Couldn't add to a bitmap %d\n", ret); 3359 test_msg("Couldn't add to a bitmap %d\n", ret);
3358 return ret; 3360 return ret;
3359 } 3361 }
3360 3362
3361 ret = btrfs_remove_free_space(cache, 512 * 1024, 3 * 1024 * 1024); 3363 ret = btrfs_remove_free_space(cache, 512 * 1024, 3 * 1024 * 1024);
3362 if (ret) { 3364 if (ret) {
3363 printk(KERN_ERR "Couldn't remove overlapping space %d\n", ret); 3365 test_msg("Couldn't remove overlapping space %d\n", ret);
3364 return ret; 3366 return ret;
3365 } 3367 }
3366 3368
3367 if (check_exists(cache, 512 * 1024, 3 * 1024 * 1024)) { 3369 if (check_exists(cache, 512 * 1024, 3 * 1024 * 1024)) {
3368 printk(KERN_ERR "Left over peices after removing " 3370 test_msg("Left over peices after removing overlapping\n");
3369 "overlapping\n");
3370 return -1; 3371 return -1;
3371 } 3372 }
3372 3373
@@ -3375,24 +3376,24 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
3375 /* Now with the extent entry offset into the bitmap */ 3376 /* Now with the extent entry offset into the bitmap */
3376 ret = add_free_space_entry(cache, 4 * 1024 * 1024, 4 * 1024 * 1024, 1); 3377 ret = add_free_space_entry(cache, 4 * 1024 * 1024, 4 * 1024 * 1024, 1);
3377 if (ret) { 3378 if (ret) {
3378 printk(KERN_ERR "Couldn't add space to the bitmap %d\n", ret); 3379 test_msg("Couldn't add space to the bitmap %d\n", ret);
3379 return ret; 3380 return ret;
3380 } 3381 }
3381 3382
3382 ret = add_free_space_entry(cache, 2 * 1024 * 1024, 2 * 1024 * 1024, 0); 3383 ret = add_free_space_entry(cache, 2 * 1024 * 1024, 2 * 1024 * 1024, 0);
3383 if (ret) { 3384 if (ret) {
3384 printk(KERN_ERR "Couldn't add extent to the cache %d\n", ret); 3385 test_msg("Couldn't add extent to the cache %d\n", ret);
3385 return ret; 3386 return ret;
3386 } 3387 }
3387 3388
3388 ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 4 * 1024 * 1024); 3389 ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 4 * 1024 * 1024);
3389 if (ret) { 3390 if (ret) {
3390 printk(KERN_ERR "Problem removing overlapping space %d\n", ret); 3391 test_msg("Problem removing overlapping space %d\n", ret);
3391 return ret; 3392 return ret;
3392 } 3393 }
3393 3394
3394 if (check_exists(cache, 3 * 1024 * 1024, 4 * 1024 * 1024)) { 3395 if (check_exists(cache, 3 * 1024 * 1024, 4 * 1024 * 1024)) {
3395 printk(KERN_ERR "Left something behind when removing space"); 3396 test_msg("Left something behind when removing space");
3396 return -1; 3397 return -1;
3397 } 3398 }
3398 3399
@@ -3410,27 +3411,27 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
3410 ret = add_free_space_entry(cache, bitmap_offset + 4 * 1024 * 1024, 3411 ret = add_free_space_entry(cache, bitmap_offset + 4 * 1024 * 1024,
3411 4 * 1024 * 1024, 1); 3412 4 * 1024 * 1024, 1);
3412 if (ret) { 3413 if (ret) {
3413 printk(KERN_ERR "Couldn't add bitmap %d\n", ret); 3414 test_msg("Couldn't add bitmap %d\n", ret);
3414 return ret; 3415 return ret;
3415 } 3416 }
3416 3417
3417 ret = add_free_space_entry(cache, bitmap_offset - 1 * 1024 * 1024, 3418 ret = add_free_space_entry(cache, bitmap_offset - 1 * 1024 * 1024,
3418 5 * 1024 * 1024, 0); 3419 5 * 1024 * 1024, 0);
3419 if (ret) { 3420 if (ret) {
3420 printk(KERN_ERR "Couldn't add extent entry %d\n", ret); 3421 test_msg("Couldn't add extent entry %d\n", ret);
3421 return ret; 3422 return ret;
3422 } 3423 }
3423 3424
3424 ret = btrfs_remove_free_space(cache, bitmap_offset + 1 * 1024 * 1024, 3425 ret = btrfs_remove_free_space(cache, bitmap_offset + 1 * 1024 * 1024,
3425 5 * 1024 * 1024); 3426 5 * 1024 * 1024);
3426 if (ret) { 3427 if (ret) {
3427 printk(KERN_ERR "Failed to free our space %d\n", ret); 3428 test_msg("Failed to free our space %d\n", ret);
3428 return ret; 3429 return ret;
3429 } 3430 }
3430 3431
3431 if (check_exists(cache, bitmap_offset + 1 * 1024 * 1024, 3432 if (check_exists(cache, bitmap_offset + 1 * 1024 * 1024,
3432 5 * 1024 * 1024)) { 3433 5 * 1024 * 1024)) {
3433 printk(KERN_ERR "Left stuff over\n"); 3434 test_msg("Left stuff over\n");
3434 return -1; 3435 return -1;
3435 } 3436 }
3436 3437
@@ -3444,20 +3445,19 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
3444 */ 3445 */
3445 ret = add_free_space_entry(cache, 1 * 1024 * 1024, 2 * 1024 * 1024, 1); 3446 ret = add_free_space_entry(cache, 1 * 1024 * 1024, 2 * 1024 * 1024, 1);
3446 if (ret) { 3447 if (ret) {
3447 printk(KERN_ERR "Couldn't add bitmap entry %d\n", ret); 3448 test_msg("Couldn't add bitmap entry %d\n", ret);
3448 return ret; 3449 return ret;
3449 } 3450 }
3450 3451
3451 ret = add_free_space_entry(cache, 3 * 1024 * 1024, 1 * 1024 * 1024, 0); 3452 ret = add_free_space_entry(cache, 3 * 1024 * 1024, 1 * 1024 * 1024, 0);
3452 if (ret) { 3453 if (ret) {
3453 printk(KERN_ERR "Couldn't add extent entry %d\n", ret); 3454 test_msg("Couldn't add extent entry %d\n", ret);
3454 return ret; 3455 return ret;
3455 } 3456 }
3456 3457
3457 ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 3 * 1024 * 1024); 3458 ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 3 * 1024 * 1024);
3458 if (ret) { 3459 if (ret) {
3459 printk(KERN_ERR "Error removing bitmap and extent " 3460 test_msg("Error removing bitmap and extent overlapping %d\n", ret);
3460 "overlapping %d\n", ret);
3461 return ret; 3461 return ret;
3462 } 3462 }
3463 3463
@@ -3469,11 +3469,11 @@ void btrfs_test_free_space_cache(void)
3469{ 3469{
3470 struct btrfs_block_group_cache *cache; 3470 struct btrfs_block_group_cache *cache;
3471 3471
3472 printk(KERN_ERR "Running btrfs free space cache tests\n"); 3472 test_msg("Running btrfs free space cache tests\n");
3473 3473
3474 cache = init_test_block_group(); 3474 cache = init_test_block_group();
3475 if (!cache) { 3475 if (!cache) {
3476 printk(KERN_ERR "Couldn't run the tests\n"); 3476 test_msg("Couldn't run the tests\n");
3477 return; 3477 return;
3478 } 3478 }
3479 3479
@@ -3487,6 +3487,9 @@ out:
3487 __btrfs_remove_free_space_cache(cache->free_space_ctl); 3487 __btrfs_remove_free_space_cache(cache->free_space_ctl);
3488 kfree(cache->free_space_ctl); 3488 kfree(cache->free_space_ctl);
3489 kfree(cache); 3489 kfree(cache);
3490 printk(KERN_ERR "Free space cache tests finished\n"); 3490 test_msg("Free space cache tests finished\n");
3491} 3491}
3492#endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */ 3492#undef test_msg
3493#else /* !CONFIG_BTRFS_FS_RUN_SANITY_TESTS */
3494void btrfs_test_free_space_cache(void) {}
3495#endif /* !CONFIG_BTRFS_FS_RUN_SANITY_TESTS */
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 8b7f19f44961..894116b71304 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -113,8 +113,6 @@ int btrfs_return_cluster_to_free_space(
113int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, 113int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
114 u64 *trimmed, u64 start, u64 end, u64 minlen); 114 u64 *trimmed, u64 start, u64 end, u64 minlen);
115 115
116#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
117void btrfs_test_free_space_cache(void); 116void btrfs_test_free_space_cache(void);
118#endif
119 117
120#endif 118#endif
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 4f9d16b70d3d..6d1b93c8aafb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -42,6 +42,7 @@
42#include <linux/mount.h> 42#include <linux/mount.h>
43#include <linux/btrfs.h> 43#include <linux/btrfs.h>
44#include <linux/blkdev.h> 44#include <linux/blkdev.h>
45#include <linux/posix_acl_xattr.h>
45#include "compat.h" 46#include "compat.h"
46#include "ctree.h" 47#include "ctree.h"
47#include "disk-io.h" 48#include "disk-io.h"
@@ -57,6 +58,7 @@
57#include "free-space-cache.h" 58#include "free-space-cache.h"
58#include "inode-map.h" 59#include "inode-map.h"
59#include "backref.h" 60#include "backref.h"
61#include "hash.h"
60 62
61struct btrfs_iget_args { 63struct btrfs_iget_args {
62 u64 ino; 64 u64 ino;
@@ -701,8 +703,12 @@ retry:
701 async_extent->nr_pages = 0; 703 async_extent->nr_pages = 0;
702 async_extent->pages = NULL; 704 async_extent->pages = NULL;
703 705
704 if (ret == -ENOSPC) 706 if (ret == -ENOSPC) {
707 unlock_extent(io_tree, async_extent->start,
708 async_extent->start +
709 async_extent->ram_size - 1);
705 goto retry; 710 goto retry;
711 }
706 goto out_free; 712 goto out_free;
707 } 713 }
708 714
@@ -1529,6 +1535,46 @@ static void btrfs_merge_extent_hook(struct inode *inode,
1529 spin_unlock(&BTRFS_I(inode)->lock); 1535 spin_unlock(&BTRFS_I(inode)->lock);
1530} 1536}
1531 1537
1538static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
1539 struct inode *inode)
1540{
1541 spin_lock(&root->delalloc_lock);
1542 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1543 list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1544 &root->delalloc_inodes);
1545 set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1546 &BTRFS_I(inode)->runtime_flags);
1547 root->nr_delalloc_inodes++;
1548 if (root->nr_delalloc_inodes == 1) {
1549 spin_lock(&root->fs_info->delalloc_root_lock);
1550 BUG_ON(!list_empty(&root->delalloc_root));
1551 list_add_tail(&root->delalloc_root,
1552 &root->fs_info->delalloc_roots);
1553 spin_unlock(&root->fs_info->delalloc_root_lock);
1554 }
1555 }
1556 spin_unlock(&root->delalloc_lock);
1557}
1558
1559static void btrfs_del_delalloc_inode(struct btrfs_root *root,
1560 struct inode *inode)
1561{
1562 spin_lock(&root->delalloc_lock);
1563 if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1564 list_del_init(&BTRFS_I(inode)->delalloc_inodes);
1565 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1566 &BTRFS_I(inode)->runtime_flags);
1567 root->nr_delalloc_inodes--;
1568 if (!root->nr_delalloc_inodes) {
1569 spin_lock(&root->fs_info->delalloc_root_lock);
1570 BUG_ON(list_empty(&root->delalloc_root));
1571 list_del_init(&root->delalloc_root);
1572 spin_unlock(&root->fs_info->delalloc_root_lock);
1573 }
1574 }
1575 spin_unlock(&root->delalloc_lock);
1576}
1577
1532/* 1578/*
1533 * extent_io.c set_bit_hook, used to track delayed allocation 1579 * extent_io.c set_bit_hook, used to track delayed allocation
1534 * bytes in this file, and to maintain the list of inodes that 1580 * bytes in this file, and to maintain the list of inodes that
@@ -1561,16 +1607,8 @@ static void btrfs_set_bit_hook(struct inode *inode,
1561 spin_lock(&BTRFS_I(inode)->lock); 1607 spin_lock(&BTRFS_I(inode)->lock);
1562 BTRFS_I(inode)->delalloc_bytes += len; 1608 BTRFS_I(inode)->delalloc_bytes += len;
1563 if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, 1609 if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1564 &BTRFS_I(inode)->runtime_flags)) { 1610 &BTRFS_I(inode)->runtime_flags))
1565 spin_lock(&root->fs_info->delalloc_lock); 1611 btrfs_add_delalloc_inodes(root, inode);
1566 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1567 list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1568 &root->fs_info->delalloc_inodes);
1569 set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1570 &BTRFS_I(inode)->runtime_flags);
1571 }
1572 spin_unlock(&root->fs_info->delalloc_lock);
1573 }
1574 spin_unlock(&BTRFS_I(inode)->lock); 1612 spin_unlock(&BTRFS_I(inode)->lock);
1575 } 1613 }
1576} 1614}
@@ -1604,7 +1642,7 @@ static void btrfs_clear_bit_hook(struct inode *inode,
1604 btrfs_delalloc_release_metadata(inode, len); 1642 btrfs_delalloc_release_metadata(inode, len);
1605 1643
1606 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID 1644 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
1607 && do_list) 1645 && do_list && !(state->state & EXTENT_NORESERVE))
1608 btrfs_free_reserved_data_space(inode, len); 1646 btrfs_free_reserved_data_space(inode, len);
1609 1647
1610 __percpu_counter_add(&root->fs_info->delalloc_bytes, -len, 1648 __percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
@@ -1613,15 +1651,8 @@ static void btrfs_clear_bit_hook(struct inode *inode,
1613 BTRFS_I(inode)->delalloc_bytes -= len; 1651 BTRFS_I(inode)->delalloc_bytes -= len;
1614 if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 && 1652 if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
1615 test_bit(BTRFS_INODE_IN_DELALLOC_LIST, 1653 test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1616 &BTRFS_I(inode)->runtime_flags)) { 1654 &BTRFS_I(inode)->runtime_flags))
1617 spin_lock(&root->fs_info->delalloc_lock); 1655 btrfs_del_delalloc_inode(root, inode);
1618 if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1619 list_del_init(&BTRFS_I(inode)->delalloc_inodes);
1620 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1621 &BTRFS_I(inode)->runtime_flags);
1622 }
1623 spin_unlock(&root->fs_info->delalloc_lock);
1624 }
1625 spin_unlock(&BTRFS_I(inode)->lock); 1656 spin_unlock(&BTRFS_I(inode)->lock);
1626 } 1657 }
1627} 1658}
@@ -2263,11 +2294,6 @@ static noinline int relink_extent_backref(struct btrfs_path *path,
2263 return 0; 2294 return 0;
2264 return PTR_ERR(root); 2295 return PTR_ERR(root);
2265 } 2296 }
2266 if (btrfs_root_refs(&root->root_item) == 0) {
2267 srcu_read_unlock(&fs_info->subvol_srcu, index);
2268 /* parse ENOENT to 0 */
2269 return 0;
2270 }
2271 2297
2272 /* step 2: get inode */ 2298 /* step 2: get inode */
2273 key.objectid = backref->inum; 2299 key.objectid = backref->inum;
@@ -3215,13 +3241,16 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
3215 /* 1 for the orphan item deletion. */ 3241 /* 1 for the orphan item deletion. */
3216 trans = btrfs_start_transaction(root, 1); 3242 trans = btrfs_start_transaction(root, 1);
3217 if (IS_ERR(trans)) { 3243 if (IS_ERR(trans)) {
3244 iput(inode);
3218 ret = PTR_ERR(trans); 3245 ret = PTR_ERR(trans);
3219 goto out; 3246 goto out;
3220 } 3247 }
3221 ret = btrfs_orphan_add(trans, inode); 3248 ret = btrfs_orphan_add(trans, inode);
3222 btrfs_end_transaction(trans, root); 3249 btrfs_end_transaction(trans, root);
3223 if (ret) 3250 if (ret) {
3251 iput(inode);
3224 goto out; 3252 goto out;
3253 }
3225 3254
3226 ret = btrfs_truncate(inode); 3255 ret = btrfs_truncate(inode);
3227 if (ret) 3256 if (ret)
@@ -3274,8 +3303,17 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
3274{ 3303{
3275 u32 nritems = btrfs_header_nritems(leaf); 3304 u32 nritems = btrfs_header_nritems(leaf);
3276 struct btrfs_key found_key; 3305 struct btrfs_key found_key;
3306 static u64 xattr_access = 0;
3307 static u64 xattr_default = 0;
3277 int scanned = 0; 3308 int scanned = 0;
3278 3309
3310 if (!xattr_access) {
3311 xattr_access = btrfs_name_hash(POSIX_ACL_XATTR_ACCESS,
3312 strlen(POSIX_ACL_XATTR_ACCESS));
3313 xattr_default = btrfs_name_hash(POSIX_ACL_XATTR_DEFAULT,
3314 strlen(POSIX_ACL_XATTR_DEFAULT));
3315 }
3316
3279 slot++; 3317 slot++;
3280 while (slot < nritems) { 3318 while (slot < nritems) {
3281 btrfs_item_key_to_cpu(leaf, &found_key, slot); 3319 btrfs_item_key_to_cpu(leaf, &found_key, slot);
@@ -3285,8 +3323,11 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
3285 return 0; 3323 return 0;
3286 3324
3287 /* we found an xattr, assume we've got an acl */ 3325 /* we found an xattr, assume we've got an acl */
3288 if (found_key.type == BTRFS_XATTR_ITEM_KEY) 3326 if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
3289 return 1; 3327 if (found_key.offset == xattr_access ||
3328 found_key.offset == xattr_default)
3329 return 1;
3330 }
3290 3331
3291 /* 3332 /*
3292 * we found a key greater than an xattr key, there can't 3333 * we found a key greater than an xattr key, there can't
@@ -3660,53 +3701,20 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
3660 } 3701 }
3661 return ret; 3702 return ret;
3662} 3703}
3663
3664
3665/* helper to check if there is any shared block in the path */
3666static int check_path_shared(struct btrfs_root *root,
3667 struct btrfs_path *path)
3668{
3669 struct extent_buffer *eb;
3670 int level;
3671 u64 refs = 1;
3672
3673 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
3674 int ret;
3675
3676 if (!path->nodes[level])
3677 break;
3678 eb = path->nodes[level];
3679 if (!btrfs_block_can_be_shared(root, eb))
3680 continue;
3681 ret = btrfs_lookup_extent_info(NULL, root, eb->start, level, 1,
3682 &refs, NULL);
3683 if (refs > 1)
3684 return 1;
3685 }
3686 return 0;
3687}
3688 3704
3689/* 3705/*
3690 * helper to start transaction for unlink and rmdir. 3706 * helper to start transaction for unlink and rmdir.
3691 * 3707 *
3692 * unlink and rmdir are special in btrfs, they do not always free space. 3708 * unlink and rmdir are special in btrfs, they do not always free space, so
3693 * so in enospc case, we should make sure they will free space before 3709 * if we cannot make our reservations the normal way try and see if there is
3694 * allowing them to use the global metadata reservation. 3710 * plenty of slack room in the global reserve to migrate, otherwise we cannot
3711 * allow the unlink to occur.
3695 */ 3712 */
3696static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, 3713static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
3697 struct dentry *dentry)
3698{ 3714{
3699 struct btrfs_trans_handle *trans; 3715 struct btrfs_trans_handle *trans;
3700 struct btrfs_root *root = BTRFS_I(dir)->root; 3716 struct btrfs_root *root = BTRFS_I(dir)->root;
3701 struct btrfs_path *path;
3702 struct btrfs_dir_item *di;
3703 struct inode *inode = dentry->d_inode;
3704 u64 index;
3705 int check_link = 1;
3706 int err = -ENOSPC;
3707 int ret; 3717 int ret;
3708 u64 ino = btrfs_ino(inode);
3709 u64 dir_ino = btrfs_ino(dir);
3710 3718
3711 /* 3719 /*
3712 * 1 for the possible orphan item 3720 * 1 for the possible orphan item
@@ -3719,158 +3727,23 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
3719 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) 3727 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
3720 return trans; 3728 return trans;
3721 3729
3722 if (ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) 3730 if (PTR_ERR(trans) == -ENOSPC) {
3723 return ERR_PTR(-ENOSPC); 3731 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5);
3724
3725 /* check if there is someone else holds reference */
3726 if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1)
3727 return ERR_PTR(-ENOSPC);
3728
3729 if (atomic_read(&inode->i_count) > 2)
3730 return ERR_PTR(-ENOSPC);
3731
3732 if (xchg(&root->fs_info->enospc_unlink, 1))
3733 return ERR_PTR(-ENOSPC);
3734
3735 path = btrfs_alloc_path();
3736 if (!path) {
3737 root->fs_info->enospc_unlink = 0;
3738 return ERR_PTR(-ENOMEM);
3739 }
3740 3732
3741 /* 1 for the orphan item */ 3733 trans = btrfs_start_transaction(root, 0);
3742 trans = btrfs_start_transaction(root, 1); 3734 if (IS_ERR(trans))
3743 if (IS_ERR(trans)) { 3735 return trans;
3744 btrfs_free_path(path); 3736 ret = btrfs_cond_migrate_bytes(root->fs_info,
3745 root->fs_info->enospc_unlink = 0; 3737 &root->fs_info->trans_block_rsv,
3746 return trans; 3738 num_bytes, 5);
3747 } 3739 if (ret) {
3748 3740 btrfs_end_transaction(trans, root);
3749 path->skip_locking = 1; 3741 return ERR_PTR(ret);
3750 path->search_commit_root = 1;
3751
3752 ret = btrfs_lookup_inode(trans, root, path,
3753 &BTRFS_I(dir)->location, 0);
3754 if (ret < 0) {
3755 err = ret;
3756 goto out;
3757 }
3758 if (ret == 0) {
3759 if (check_path_shared(root, path))
3760 goto out;
3761 } else {
3762 check_link = 0;
3763 }
3764 btrfs_release_path(path);
3765
3766 ret = btrfs_lookup_inode(trans, root, path,
3767 &BTRFS_I(inode)->location, 0);
3768 if (ret < 0) {
3769 err = ret;
3770 goto out;
3771 }
3772 if (ret == 0) {
3773 if (check_path_shared(root, path))
3774 goto out;
3775 } else {
3776 check_link = 0;
3777 }
3778 btrfs_release_path(path);
3779
3780 if (ret == 0 && S_ISREG(inode->i_mode)) {
3781 ret = btrfs_lookup_file_extent(trans, root, path,
3782 ino, (u64)-1, 0);
3783 if (ret < 0) {
3784 err = ret;
3785 goto out;
3786 } 3742 }
3787 BUG_ON(ret == 0); /* Corruption */
3788 if (check_path_shared(root, path))
3789 goto out;
3790 btrfs_release_path(path);
3791 }
3792
3793 if (!check_link) {
3794 err = 0;
3795 goto out;
3796 }
3797
3798 di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
3799 dentry->d_name.name, dentry->d_name.len, 0);
3800 if (IS_ERR(di)) {
3801 err = PTR_ERR(di);
3802 goto out;
3803 }
3804 if (di) {
3805 if (check_path_shared(root, path))
3806 goto out;
3807 } else {
3808 err = 0;
3809 goto out;
3810 }
3811 btrfs_release_path(path);
3812
3813 ret = btrfs_get_inode_ref_index(trans, root, path, dentry->d_name.name,
3814 dentry->d_name.len, ino, dir_ino, 0,
3815 &index);
3816 if (ret) {
3817 err = ret;
3818 goto out;
3819 }
3820
3821 if (check_path_shared(root, path))
3822 goto out;
3823
3824 btrfs_release_path(path);
3825
3826 /*
3827 * This is a commit root search, if we can lookup inode item and other
3828 * relative items in the commit root, it means the transaction of
3829 * dir/file creation has been committed, and the dir index item that we
3830 * delay to insert has also been inserted into the commit root. So
3831 * we needn't worry about the delayed insertion of the dir index item
3832 * here.
3833 */
3834 di = btrfs_lookup_dir_index_item(trans, root, path, dir_ino, index,
3835 dentry->d_name.name, dentry->d_name.len, 0);
3836 if (IS_ERR(di)) {
3837 err = PTR_ERR(di);
3838 goto out;
3839 }
3840 BUG_ON(ret == -ENOENT);
3841 if (check_path_shared(root, path))
3842 goto out;
3843
3844 err = 0;
3845out:
3846 btrfs_free_path(path);
3847 /* Migrate the orphan reservation over */
3848 if (!err)
3849 err = btrfs_block_rsv_migrate(trans->block_rsv,
3850 &root->fs_info->global_block_rsv,
3851 trans->bytes_reserved);
3852
3853 if (err) {
3854 btrfs_end_transaction(trans, root);
3855 root->fs_info->enospc_unlink = 0;
3856 return ERR_PTR(err);
3857 }
3858
3859 trans->block_rsv = &root->fs_info->global_block_rsv;
3860 return trans;
3861}
3862
3863static void __unlink_end_trans(struct btrfs_trans_handle *trans,
3864 struct btrfs_root *root)
3865{
3866 if (trans->block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL) {
3867 btrfs_block_rsv_release(root, trans->block_rsv,
3868 trans->bytes_reserved);
3869 trans->block_rsv = &root->fs_info->trans_block_rsv; 3743 trans->block_rsv = &root->fs_info->trans_block_rsv;
3870 BUG_ON(!root->fs_info->enospc_unlink); 3744 trans->bytes_reserved = num_bytes;
3871 root->fs_info->enospc_unlink = 0;
3872 } 3745 }
3873 btrfs_end_transaction(trans, root); 3746 return trans;
3874} 3747}
3875 3748
3876static int btrfs_unlink(struct inode *dir, struct dentry *dentry) 3749static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
@@ -3880,7 +3753,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
3880 struct inode *inode = dentry->d_inode; 3753 struct inode *inode = dentry->d_inode;
3881 int ret; 3754 int ret;
3882 3755
3883 trans = __unlink_start_trans(dir, dentry); 3756 trans = __unlink_start_trans(dir);
3884 if (IS_ERR(trans)) 3757 if (IS_ERR(trans))
3885 return PTR_ERR(trans); 3758 return PTR_ERR(trans);
3886 3759
@@ -3898,7 +3771,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
3898 } 3771 }
3899 3772
3900out: 3773out:
3901 __unlink_end_trans(trans, root); 3774 btrfs_end_transaction(trans, root);
3902 btrfs_btree_balance_dirty(root); 3775 btrfs_btree_balance_dirty(root);
3903 return ret; 3776 return ret;
3904} 3777}
@@ -3995,7 +3868,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
3995 if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) 3868 if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
3996 return -EPERM; 3869 return -EPERM;
3997 3870
3998 trans = __unlink_start_trans(dir, dentry); 3871 trans = __unlink_start_trans(dir);
3999 if (IS_ERR(trans)) 3872 if (IS_ERR(trans))
4000 return PTR_ERR(trans); 3873 return PTR_ERR(trans);
4001 3874
@@ -4017,7 +3890,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
4017 if (!err) 3890 if (!err)
4018 btrfs_i_size_write(inode, 0); 3891 btrfs_i_size_write(inode, 0);
4019out: 3892out:
4020 __unlink_end_trans(trans, root); 3893 btrfs_end_transaction(trans, root);
4021 btrfs_btree_balance_dirty(root); 3894 btrfs_btree_balance_dirty(root);
4022 3895
4023 return err; 3896 return err;
@@ -4395,6 +4268,15 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
4395 u64 hole_size; 4268 u64 hole_size;
4396 int err = 0; 4269 int err = 0;
4397 4270
4271 /*
4272 * If our size started in the middle of a page we need to zero out the
4273 * rest of the page before we expand the i_size, otherwise we could
4274 * expose stale data.
4275 */
4276 err = btrfs_truncate_page(inode, oldsize, 0, 0);
4277 if (err)
4278 return err;
4279
4398 if (size <= hole_start) 4280 if (size <= hole_start)
4399 return 0; 4281 return 0;
4400 4282
@@ -4822,11 +4704,6 @@ static int fixup_tree_root_location(struct btrfs_root *root,
4822 goto out; 4704 goto out;
4823 } 4705 }
4824 4706
4825 if (btrfs_root_refs(&new_root->root_item) == 0) {
4826 err = -ENOENT;
4827 goto out;
4828 }
4829
4830 *sub_root = new_root; 4707 *sub_root = new_root;
4831 location->objectid = btrfs_root_dirid(&new_root->root_item); 4708 location->objectid = btrfs_root_dirid(&new_root->root_item);
4832 location->type = BTRFS_INODE_ITEM_KEY; 4709 location->type = BTRFS_INODE_ITEM_KEY;
@@ -5092,8 +4969,10 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
5092 if (!(inode->i_sb->s_flags & MS_RDONLY)) 4969 if (!(inode->i_sb->s_flags & MS_RDONLY))
5093 ret = btrfs_orphan_cleanup(sub_root); 4970 ret = btrfs_orphan_cleanup(sub_root);
5094 up_read(&root->fs_info->cleanup_work_sem); 4971 up_read(&root->fs_info->cleanup_work_sem);
5095 if (ret) 4972 if (ret) {
4973 iput(inode);
5096 inode = ERR_PTR(ret); 4974 inode = ERR_PTR(ret);
4975 }
5097 } 4976 }
5098 4977
5099 return inode; 4978 return inode;
@@ -6501,10 +6380,10 @@ out:
6501 * returns 1 when the nocow is safe, < 1 on error, 0 if the 6380 * returns 1 when the nocow is safe, < 1 on error, 0 if the
6502 * block must be cow'd 6381 * block must be cow'd
6503 */ 6382 */
6504static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans, 6383noinline int can_nocow_extent(struct btrfs_trans_handle *trans,
6505 struct inode *inode, u64 offset, u64 *len, 6384 struct inode *inode, u64 offset, u64 *len,
6506 u64 *orig_start, u64 *orig_block_len, 6385 u64 *orig_start, u64 *orig_block_len,
6507 u64 *ram_bytes) 6386 u64 *ram_bytes)
6508{ 6387{
6509 struct btrfs_path *path; 6388 struct btrfs_path *path;
6510 int ret; 6389 int ret;
@@ -6518,7 +6397,7 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
6518 u64 num_bytes; 6397 u64 num_bytes;
6519 int slot; 6398 int slot;
6520 int found_type; 6399 int found_type;
6521 6400 bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW);
6522 path = btrfs_alloc_path(); 6401 path = btrfs_alloc_path();
6523 if (!path) 6402 if (!path)
6524 return -ENOMEM; 6403 return -ENOMEM;
@@ -6558,18 +6437,28 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
6558 /* not a regular extent, must cow */ 6437 /* not a regular extent, must cow */
6559 goto out; 6438 goto out;
6560 } 6439 }
6440
6441 if (!nocow && found_type == BTRFS_FILE_EXTENT_REG)
6442 goto out;
6443
6561 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 6444 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
6445 if (disk_bytenr == 0)
6446 goto out;
6447
6448 if (btrfs_file_extent_compression(leaf, fi) ||
6449 btrfs_file_extent_encryption(leaf, fi) ||
6450 btrfs_file_extent_other_encoding(leaf, fi))
6451 goto out;
6452
6562 backref_offset = btrfs_file_extent_offset(leaf, fi); 6453 backref_offset = btrfs_file_extent_offset(leaf, fi);
6563 6454
6564 *orig_start = key.offset - backref_offset; 6455 if (orig_start) {
6565 *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi); 6456 *orig_start = key.offset - backref_offset;
6566 *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); 6457 *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
6458 *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
6459 }
6567 6460
6568 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); 6461 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
6569 if (extent_end < offset + *len) {
6570 /* extent doesn't include our full range, must cow */
6571 goto out;
6572 }
6573 6462
6574 if (btrfs_extent_readonly(root, disk_bytenr)) 6463 if (btrfs_extent_readonly(root, disk_bytenr))
6575 goto out; 6464 goto out;
@@ -6813,8 +6702,8 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
6813 if (IS_ERR(trans)) 6702 if (IS_ERR(trans))
6814 goto must_cow; 6703 goto must_cow;
6815 6704
6816 if (can_nocow_odirect(trans, inode, start, &len, &orig_start, 6705 if (can_nocow_extent(trans, inode, start, &len, &orig_start,
6817 &orig_block_len, &ram_bytes) == 1) { 6706 &orig_block_len, &ram_bytes) == 1) {
6818 if (type == BTRFS_ORDERED_PREALLOC) { 6707 if (type == BTRFS_ORDERED_PREALLOC) {
6819 free_extent_map(em); 6708 free_extent_map(em);
6820 em = create_pinned_em(inode, start, len, 6709 em = create_pinned_em(inode, start, len,
@@ -7243,7 +7132,6 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
7243{ 7132{
7244 struct btrfs_root *root = BTRFS_I(inode)->root; 7133 struct btrfs_root *root = BTRFS_I(inode)->root;
7245 struct btrfs_dio_private *dip; 7134 struct btrfs_dio_private *dip;
7246 struct bio_vec *bvec = dio_bio->bi_io_vec;
7247 struct bio *io_bio; 7135 struct bio *io_bio;
7248 int skip_sum; 7136 int skip_sum;
7249 int write = rw & REQ_WRITE; 7137 int write = rw & REQ_WRITE;
@@ -7265,16 +7153,9 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
7265 } 7153 }
7266 7154
7267 dip->private = dio_bio->bi_private; 7155 dip->private = dio_bio->bi_private;
7268 io_bio->bi_private = dio_bio->bi_private;
7269 dip->inode = inode; 7156 dip->inode = inode;
7270 dip->logical_offset = file_offset; 7157 dip->logical_offset = file_offset;
7271 7158 dip->bytes = dio_bio->bi_size;
7272 dip->bytes = 0;
7273 do {
7274 dip->bytes += bvec->bv_len;
7275 bvec++;
7276 } while (bvec <= (dio_bio->bi_io_vec + dio_bio->bi_vcnt - 1));
7277
7278 dip->disk_bytenr = (u64)dio_bio->bi_sector << 9; 7159 dip->disk_bytenr = (u64)dio_bio->bi_sector << 9;
7279 io_bio->bi_private = dip; 7160 io_bio->bi_private = dip;
7280 dip->errors = 0; 7161 dip->errors = 0;
@@ -7373,8 +7254,16 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
7373 atomic_inc(&inode->i_dio_count); 7254 atomic_inc(&inode->i_dio_count);
7374 smp_mb__after_atomic_inc(); 7255 smp_mb__after_atomic_inc();
7375 7256
7257 /*
7258 * The generic stuff only does filemap_write_and_wait_range, which isn't
7259 * enough if we've written compressed pages to this area, so we need to
7260 * call btrfs_wait_ordered_range to make absolutely sure that any
7261 * outstanding dirty pages are on disk.
7262 */
7263 count = iov_length(iov, nr_segs);
7264 btrfs_wait_ordered_range(inode, offset, count);
7265
7376 if (rw & WRITE) { 7266 if (rw & WRITE) {
7377 count = iov_length(iov, nr_segs);
7378 /* 7267 /*
7379 * If the write DIO is beyond the EOF, we need update 7268 * If the write DIO is beyond the EOF, we need update
7380 * the isize, but it is protected by i_mutex. So we can 7269 * the isize, but it is protected by i_mutex. So we can
@@ -7694,16 +7583,12 @@ static int btrfs_truncate(struct inode *inode)
7694{ 7583{
7695 struct btrfs_root *root = BTRFS_I(inode)->root; 7584 struct btrfs_root *root = BTRFS_I(inode)->root;
7696 struct btrfs_block_rsv *rsv; 7585 struct btrfs_block_rsv *rsv;
7697 int ret; 7586 int ret = 0;
7698 int err = 0; 7587 int err = 0;
7699 struct btrfs_trans_handle *trans; 7588 struct btrfs_trans_handle *trans;
7700 u64 mask = root->sectorsize - 1; 7589 u64 mask = root->sectorsize - 1;
7701 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); 7590 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
7702 7591
7703 ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);
7704 if (ret)
7705 return ret;
7706
7707 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); 7592 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
7708 btrfs_ordered_update_i_size(inode, inode->i_size, NULL); 7593 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
7709 7594
@@ -7961,9 +7846,9 @@ void btrfs_destroy_inode(struct inode *inode)
7961 */ 7846 */
7962 smp_mb(); 7847 smp_mb();
7963 if (!list_empty(&BTRFS_I(inode)->ordered_operations)) { 7848 if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
7964 spin_lock(&root->fs_info->ordered_extent_lock); 7849 spin_lock(&root->fs_info->ordered_root_lock);
7965 list_del_init(&BTRFS_I(inode)->ordered_operations); 7850 list_del_init(&BTRFS_I(inode)->ordered_operations);
7966 spin_unlock(&root->fs_info->ordered_extent_lock); 7851 spin_unlock(&root->fs_info->ordered_root_lock);
7967 } 7852 }
7968 7853
7969 if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 7854 if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
@@ -8333,7 +8218,7 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
8333 * some fairly slow code that needs optimization. This walks the list 8218 * some fairly slow code that needs optimization. This walks the list
8334 * of all the inodes with pending delalloc and forces them to disk. 8219 * of all the inodes with pending delalloc and forces them to disk.
8335 */ 8220 */
8336int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) 8221static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8337{ 8222{
8338 struct btrfs_inode *binode; 8223 struct btrfs_inode *binode;
8339 struct inode *inode; 8224 struct inode *inode;
@@ -8342,30 +8227,23 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8342 struct list_head splice; 8227 struct list_head splice;
8343 int ret = 0; 8228 int ret = 0;
8344 8229
8345 if (root->fs_info->sb->s_flags & MS_RDONLY)
8346 return -EROFS;
8347
8348 INIT_LIST_HEAD(&works); 8230 INIT_LIST_HEAD(&works);
8349 INIT_LIST_HEAD(&splice); 8231 INIT_LIST_HEAD(&splice);
8350 8232
8351 spin_lock(&root->fs_info->delalloc_lock); 8233 spin_lock(&root->delalloc_lock);
8352 list_splice_init(&root->fs_info->delalloc_inodes, &splice); 8234 list_splice_init(&root->delalloc_inodes, &splice);
8353 while (!list_empty(&splice)) { 8235 while (!list_empty(&splice)) {
8354 binode = list_entry(splice.next, struct btrfs_inode, 8236 binode = list_entry(splice.next, struct btrfs_inode,
8355 delalloc_inodes); 8237 delalloc_inodes);
8356 8238
8357 list_del_init(&binode->delalloc_inodes); 8239 list_move_tail(&binode->delalloc_inodes,
8358 8240 &root->delalloc_inodes);
8359 inode = igrab(&binode->vfs_inode); 8241 inode = igrab(&binode->vfs_inode);
8360 if (!inode) { 8242 if (!inode) {
8361 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, 8243 cond_resched_lock(&root->delalloc_lock);
8362 &binode->runtime_flags);
8363 continue; 8244 continue;
8364 } 8245 }
8365 8246 spin_unlock(&root->delalloc_lock);
8366 list_add_tail(&binode->delalloc_inodes,
8367 &root->fs_info->delalloc_inodes);
8368 spin_unlock(&root->fs_info->delalloc_lock);
8369 8247
8370 work = btrfs_alloc_delalloc_work(inode, 0, delay_iput); 8248 work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
8371 if (unlikely(!work)) { 8249 if (unlikely(!work)) {
@@ -8377,16 +8255,39 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8377 &work->work); 8255 &work->work);
8378 8256
8379 cond_resched(); 8257 cond_resched();
8380 spin_lock(&root->fs_info->delalloc_lock); 8258 spin_lock(&root->delalloc_lock);
8381 } 8259 }
8382 spin_unlock(&root->fs_info->delalloc_lock); 8260 spin_unlock(&root->delalloc_lock);
8383 8261
8384 list_for_each_entry_safe(work, next, &works, list) { 8262 list_for_each_entry_safe(work, next, &works, list) {
8385 list_del_init(&work->list); 8263 list_del_init(&work->list);
8386 btrfs_wait_and_free_delalloc_work(work); 8264 btrfs_wait_and_free_delalloc_work(work);
8387 } 8265 }
8266 return 0;
8267out:
8268 list_for_each_entry_safe(work, next, &works, list) {
8269 list_del_init(&work->list);
8270 btrfs_wait_and_free_delalloc_work(work);
8271 }
8272
8273 if (!list_empty_careful(&splice)) {
8274 spin_lock(&root->delalloc_lock);
8275 list_splice_tail(&splice, &root->delalloc_inodes);
8276 spin_unlock(&root->delalloc_lock);
8277 }
8278 return ret;
8279}
8280
8281int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8282{
8283 int ret;
8388 8284
8389 /* the filemap_flush will queue IO into the worker threads, but 8285 if (root->fs_info->sb->s_flags & MS_RDONLY)
8286 return -EROFS;
8287
8288 ret = __start_delalloc_inodes(root, delay_iput);
8289 /*
8290 * the filemap_flush will queue IO into the worker threads, but
8390 * we have to make sure the IO is actually started and that 8291 * we have to make sure the IO is actually started and that
8391 * ordered extents get created before we return 8292 * ordered extents get created before we return
8392 */ 8293 */
@@ -8398,17 +8299,55 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8398 atomic_read(&root->fs_info->async_delalloc_pages) == 0)); 8299 atomic_read(&root->fs_info->async_delalloc_pages) == 0));
8399 } 8300 }
8400 atomic_dec(&root->fs_info->async_submit_draining); 8301 atomic_dec(&root->fs_info->async_submit_draining);
8401 return 0; 8302 return ret;
8402out: 8303}
8403 list_for_each_entry_safe(work, next, &works, list) { 8304
8404 list_del_init(&work->list); 8305int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info,
8405 btrfs_wait_and_free_delalloc_work(work); 8306 int delay_iput)
8307{
8308 struct btrfs_root *root;
8309 struct list_head splice;
8310 int ret;
8311
8312 if (fs_info->sb->s_flags & MS_RDONLY)
8313 return -EROFS;
8314
8315 INIT_LIST_HEAD(&splice);
8316
8317 spin_lock(&fs_info->delalloc_root_lock);
8318 list_splice_init(&fs_info->delalloc_roots, &splice);
8319 while (!list_empty(&splice)) {
8320 root = list_first_entry(&splice, struct btrfs_root,
8321 delalloc_root);
8322 root = btrfs_grab_fs_root(root);
8323 BUG_ON(!root);
8324 list_move_tail(&root->delalloc_root,
8325 &fs_info->delalloc_roots);
8326 spin_unlock(&fs_info->delalloc_root_lock);
8327
8328 ret = __start_delalloc_inodes(root, delay_iput);
8329 btrfs_put_fs_root(root);
8330 if (ret)
8331 goto out;
8332
8333 spin_lock(&fs_info->delalloc_root_lock);
8406 } 8334 }
8335 spin_unlock(&fs_info->delalloc_root_lock);
8407 8336
8337 atomic_inc(&fs_info->async_submit_draining);
8338 while (atomic_read(&fs_info->nr_async_submits) ||
8339 atomic_read(&fs_info->async_delalloc_pages)) {
8340 wait_event(fs_info->async_submit_wait,
8341 (atomic_read(&fs_info->nr_async_submits) == 0 &&
8342 atomic_read(&fs_info->async_delalloc_pages) == 0));
8343 }
8344 atomic_dec(&fs_info->async_submit_draining);
8345 return 0;
8346out:
8408 if (!list_empty_careful(&splice)) { 8347 if (!list_empty_careful(&splice)) {
8409 spin_lock(&root->fs_info->delalloc_lock); 8348 spin_lock(&fs_info->delalloc_root_lock);
8410 list_splice_tail(&splice, &root->fs_info->delalloc_inodes); 8349 list_splice_tail(&splice, &fs_info->delalloc_roots);
8411 spin_unlock(&root->fs_info->delalloc_lock); 8350 spin_unlock(&fs_info->delalloc_root_lock);
8412 } 8351 }
8413 return ret; 8352 return ret;
8414} 8353}
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 0f81d67cdc8d..238a05545ee2 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -555,6 +555,12 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
555 if (!root->ref_cows) 555 if (!root->ref_cows)
556 return -EINVAL; 556 return -EINVAL;
557 557
558 ret = btrfs_start_delalloc_inodes(root, 0);
559 if (ret)
560 return ret;
561
562 btrfs_wait_ordered_extents(root, 0);
563
558 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); 564 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
559 if (!pending_snapshot) 565 if (!pending_snapshot)
560 return -ENOMEM; 566 return -ENOMEM;
@@ -2354,14 +2360,6 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
2354 if (ret) 2360 if (ret)
2355 return ret; 2361 return ret;
2356 2362
2357 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
2358 1)) {
2359 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
2360 mnt_drop_write_file(file);
2361 return -EINVAL;
2362 }
2363
2364 mutex_lock(&root->fs_info->volume_mutex);
2365 vol_args = memdup_user(arg, sizeof(*vol_args)); 2363 vol_args = memdup_user(arg, sizeof(*vol_args));
2366 if (IS_ERR(vol_args)) { 2364 if (IS_ERR(vol_args)) {
2367 ret = PTR_ERR(vol_args); 2365 ret = PTR_ERR(vol_args);
@@ -2369,12 +2367,20 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
2369 } 2367 }
2370 2368
2371 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 2369 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
2372 ret = btrfs_rm_device(root, vol_args->name);
2373 2370
2374 kfree(vol_args); 2371 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
2375out: 2372 1)) {
2373 ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
2374 goto out;
2375 }
2376
2377 mutex_lock(&root->fs_info->volume_mutex);
2378 ret = btrfs_rm_device(root, vol_args->name);
2376 mutex_unlock(&root->fs_info->volume_mutex); 2379 mutex_unlock(&root->fs_info->volume_mutex);
2377 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); 2380 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
2381
2382out:
2383 kfree(vol_args);
2378 mnt_drop_write_file(file); 2384 mnt_drop_write_file(file);
2379 return ret; 2385 return ret;
2380} 2386}
@@ -2480,6 +2486,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2480 int ret; 2486 int ret;
2481 u64 len = olen; 2487 u64 len = olen;
2482 u64 bs = root->fs_info->sb->s_blocksize; 2488 u64 bs = root->fs_info->sb->s_blocksize;
2489 int same_inode = 0;
2483 2490
2484 /* 2491 /*
2485 * TODO: 2492 * TODO:
@@ -2516,7 +2523,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2516 2523
2517 ret = -EINVAL; 2524 ret = -EINVAL;
2518 if (src == inode) 2525 if (src == inode)
2519 goto out_fput; 2526 same_inode = 1;
2520 2527
2521 /* the src must be open for reading */ 2528 /* the src must be open for reading */
2522 if (!(src_file.file->f_mode & FMODE_READ)) 2529 if (!(src_file.file->f_mode & FMODE_READ))
@@ -2547,12 +2554,16 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2547 } 2554 }
2548 path->reada = 2; 2555 path->reada = 2;
2549 2556
2550 if (inode < src) { 2557 if (!same_inode) {
2551 mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); 2558 if (inode < src) {
2552 mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD); 2559 mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
2560 mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD);
2561 } else {
2562 mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT);
2563 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
2564 }
2553 } else { 2565 } else {
2554 mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT); 2566 mutex_lock(&src->i_mutex);
2555 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
2556 } 2567 }
2557 2568
2558 /* determine range to clone */ 2569 /* determine range to clone */
@@ -2570,6 +2581,12 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2570 !IS_ALIGNED(destoff, bs)) 2581 !IS_ALIGNED(destoff, bs))
2571 goto out_unlock; 2582 goto out_unlock;
2572 2583
2584 /* verify if ranges are overlapped within the same file */
2585 if (same_inode) {
2586 if (destoff + len > off && destoff < off + len)
2587 goto out_unlock;
2588 }
2589
2573 if (destoff > inode->i_size) { 2590 if (destoff > inode->i_size) {
2574 ret = btrfs_cont_expand(inode, inode->i_size, destoff); 2591 ret = btrfs_cont_expand(inode, inode->i_size, destoff);
2575 if (ret) 2592 if (ret)
@@ -2846,7 +2863,8 @@ out:
2846 unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1); 2863 unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
2847out_unlock: 2864out_unlock:
2848 mutex_unlock(&src->i_mutex); 2865 mutex_unlock(&src->i_mutex);
2849 mutex_unlock(&inode->i_mutex); 2866 if (!same_inode)
2867 mutex_unlock(&inode->i_mutex);
2850 vfree(buf); 2868 vfree(buf);
2851 btrfs_free_path(path); 2869 btrfs_free_path(path);
2852out_fput: 2870out_fput:
@@ -2951,11 +2969,6 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2951 goto out; 2969 goto out;
2952 } 2970 }
2953 2971
2954 if (btrfs_root_refs(&new_root->root_item) == 0) {
2955 ret = -ENOENT;
2956 goto out;
2957 }
2958
2959 path = btrfs_alloc_path(); 2972 path = btrfs_alloc_path();
2960 if (!path) { 2973 if (!path) {
2961 ret = -ENOMEM; 2974 ret = -ENOMEM;
@@ -3719,9 +3732,6 @@ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
3719 break; 3732 break;
3720 } 3733 }
3721 3734
3722 if (copy_to_user(arg, sa, sizeof(*sa)))
3723 ret = -EFAULT;
3724
3725 err = btrfs_commit_transaction(trans, root->fs_info->tree_root); 3735 err = btrfs_commit_transaction(trans, root->fs_info->tree_root);
3726 if (err && !ret) 3736 if (err && !ret)
3727 ret = err; 3737 ret = err;
@@ -3881,7 +3891,7 @@ drop_write:
3881 3891
3882static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg) 3892static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg)
3883{ 3893{
3884 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; 3894 struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
3885 struct btrfs_ioctl_quota_rescan_args *qsa; 3895 struct btrfs_ioctl_quota_rescan_args *qsa;
3886 int ret; 3896 int ret;
3887 3897
@@ -3914,7 +3924,7 @@ drop_write:
3914 3924
3915static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg) 3925static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg)
3916{ 3926{
3917 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; 3927 struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
3918 struct btrfs_ioctl_quota_rescan_args *qsa; 3928 struct btrfs_ioctl_quota_rescan_args *qsa;
3919 int ret = 0; 3929 int ret = 0;
3920 3930
@@ -3937,6 +3947,16 @@ static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg)
3937 return ret; 3947 return ret;
3938} 3948}
3939 3949
3950static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg)
3951{
3952 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3953
3954 if (!capable(CAP_SYS_ADMIN))
3955 return -EPERM;
3956
3957 return btrfs_qgroup_wait_for_completion(root->fs_info);
3958}
3959
3940static long btrfs_ioctl_set_received_subvol(struct file *file, 3960static long btrfs_ioctl_set_received_subvol(struct file *file,
3941 void __user *arg) 3961 void __user *arg)
3942{ 3962{
@@ -4020,7 +4040,7 @@ out:
4020 4040
4021static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg) 4041static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg)
4022{ 4042{
4023 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; 4043 struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
4024 const char *label = root->fs_info->super_copy->label; 4044 const char *label = root->fs_info->super_copy->label;
4025 size_t len = strnlen(label, BTRFS_LABEL_SIZE); 4045 size_t len = strnlen(label, BTRFS_LABEL_SIZE);
4026 int ret; 4046 int ret;
@@ -4039,7 +4059,7 @@ static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg)
4039 4059
4040static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg) 4060static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg)
4041{ 4061{
4042 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; 4062 struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
4043 struct btrfs_super_block *super_block = root->fs_info->super_copy; 4063 struct btrfs_super_block *super_block = root->fs_info->super_copy;
4044 struct btrfs_trans_handle *trans; 4064 struct btrfs_trans_handle *trans;
4045 char label[BTRFS_LABEL_SIZE]; 4065 char label[BTRFS_LABEL_SIZE];
@@ -4179,6 +4199,8 @@ long btrfs_ioctl(struct file *file, unsigned int
4179 return btrfs_ioctl_quota_rescan(file, argp); 4199 return btrfs_ioctl_quota_rescan(file, argp);
4180 case BTRFS_IOC_QUOTA_RESCAN_STATUS: 4200 case BTRFS_IOC_QUOTA_RESCAN_STATUS:
4181 return btrfs_ioctl_quota_rescan_status(file, argp); 4201 return btrfs_ioctl_quota_rescan_status(file, argp);
4202 case BTRFS_IOC_QUOTA_RESCAN_WAIT:
4203 return btrfs_ioctl_quota_rescan_wait(file, argp);
4182 case BTRFS_IOC_DEV_REPLACE: 4204 case BTRFS_IOC_DEV_REPLACE:
4183 return btrfs_ioctl_dev_replace(root, argp); 4205 return btrfs_ioctl_dev_replace(root, argp);
4184 case BTRFS_IOC_GET_FSLABEL: 4206 case BTRFS_IOC_GET_FSLABEL:
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 743b86fa4fcb..f93151a98886 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -31,8 +31,8 @@
31 31
32struct workspace { 32struct workspace {
33 void *mem; 33 void *mem;
34 void *buf; /* where compressed data goes */ 34 void *buf; /* where decompressed data goes */
35 void *cbuf; /* where decompressed data goes */ 35 void *cbuf; /* where compressed data goes */
36 struct list_head list; 36 struct list_head list;
37}; 37};
38 38
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 1ddd728541ee..81369827e514 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -24,6 +24,7 @@
24#include "transaction.h" 24#include "transaction.h"
25#include "btrfs_inode.h" 25#include "btrfs_inode.h"
26#include "extent_io.h" 26#include "extent_io.h"
27#include "disk-io.h"
27 28
28static struct kmem_cache *btrfs_ordered_extent_cache; 29static struct kmem_cache *btrfs_ordered_extent_cache;
29 30
@@ -184,6 +185,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
184 u64 start, u64 len, u64 disk_len, 185 u64 start, u64 len, u64 disk_len,
185 int type, int dio, int compress_type) 186 int type, int dio, int compress_type)
186{ 187{
188 struct btrfs_root *root = BTRFS_I(inode)->root;
187 struct btrfs_ordered_inode_tree *tree; 189 struct btrfs_ordered_inode_tree *tree;
188 struct rb_node *node; 190 struct rb_node *node;
189 struct btrfs_ordered_extent *entry; 191 struct btrfs_ordered_extent *entry;
@@ -227,10 +229,18 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
227 ordered_data_tree_panic(inode, -EEXIST, file_offset); 229 ordered_data_tree_panic(inode, -EEXIST, file_offset);
228 spin_unlock_irq(&tree->lock); 230 spin_unlock_irq(&tree->lock);
229 231
230 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 232 spin_lock(&root->ordered_extent_lock);
231 list_add_tail(&entry->root_extent_list, 233 list_add_tail(&entry->root_extent_list,
232 &BTRFS_I(inode)->root->fs_info->ordered_extents); 234 &root->ordered_extents);
233 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 235 root->nr_ordered_extents++;
236 if (root->nr_ordered_extents == 1) {
237 spin_lock(&root->fs_info->ordered_root_lock);
238 BUG_ON(!list_empty(&root->ordered_root));
239 list_add_tail(&root->ordered_root,
240 &root->fs_info->ordered_roots);
241 spin_unlock(&root->fs_info->ordered_root_lock);
242 }
243 spin_unlock(&root->ordered_extent_lock);
234 244
235 return 0; 245 return 0;
236} 246}
@@ -516,8 +526,9 @@ void btrfs_remove_ordered_extent(struct inode *inode,
516 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); 526 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
517 spin_unlock_irq(&tree->lock); 527 spin_unlock_irq(&tree->lock);
518 528
519 spin_lock(&root->fs_info->ordered_extent_lock); 529 spin_lock(&root->ordered_extent_lock);
520 list_del_init(&entry->root_extent_list); 530 list_del_init(&entry->root_extent_list);
531 root->nr_ordered_extents--;
521 532
522 trace_btrfs_ordered_extent_remove(inode, entry); 533 trace_btrfs_ordered_extent_remove(inode, entry);
523 534
@@ -530,7 +541,14 @@ void btrfs_remove_ordered_extent(struct inode *inode,
530 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) { 541 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
531 list_del_init(&BTRFS_I(inode)->ordered_operations); 542 list_del_init(&BTRFS_I(inode)->ordered_operations);
532 } 543 }
533 spin_unlock(&root->fs_info->ordered_extent_lock); 544
545 if (!root->nr_ordered_extents) {
546 spin_lock(&root->fs_info->ordered_root_lock);
547 BUG_ON(list_empty(&root->ordered_root));
548 list_del_init(&root->ordered_root);
549 spin_unlock(&root->fs_info->ordered_root_lock);
550 }
551 spin_unlock(&root->ordered_extent_lock);
534 wake_up(&entry->wait); 552 wake_up(&entry->wait);
535} 553}
536 554
@@ -550,7 +568,6 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
550void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) 568void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
551{ 569{
552 struct list_head splice, works; 570 struct list_head splice, works;
553 struct list_head *cur;
554 struct btrfs_ordered_extent *ordered, *next; 571 struct btrfs_ordered_extent *ordered, *next;
555 struct inode *inode; 572 struct inode *inode;
556 573
@@ -558,35 +575,34 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
558 INIT_LIST_HEAD(&works); 575 INIT_LIST_HEAD(&works);
559 576
560 mutex_lock(&root->fs_info->ordered_operations_mutex); 577 mutex_lock(&root->fs_info->ordered_operations_mutex);
561 spin_lock(&root->fs_info->ordered_extent_lock); 578 spin_lock(&root->ordered_extent_lock);
562 list_splice_init(&root->fs_info->ordered_extents, &splice); 579 list_splice_init(&root->ordered_extents, &splice);
563 while (!list_empty(&splice)) { 580 while (!list_empty(&splice)) {
564 cur = splice.next; 581 ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
565 ordered = list_entry(cur, struct btrfs_ordered_extent, 582 root_extent_list);
566 root_extent_list); 583 list_move_tail(&ordered->root_extent_list,
567 list_del_init(&ordered->root_extent_list); 584 &root->ordered_extents);
568 atomic_inc(&ordered->refs);
569
570 /* 585 /*
571 * the inode may be getting freed (in sys_unlink path). 586 * the inode may be getting freed (in sys_unlink path).
572 */ 587 */
573 inode = igrab(ordered->inode); 588 inode = igrab(ordered->inode);
589 if (!inode) {
590 cond_resched_lock(&root->ordered_extent_lock);
591 continue;
592 }
574 593
575 spin_unlock(&root->fs_info->ordered_extent_lock); 594 atomic_inc(&ordered->refs);
595 spin_unlock(&root->ordered_extent_lock);
576 596
577 if (inode) { 597 ordered->flush_work.func = btrfs_run_ordered_extent_work;
578 ordered->flush_work.func = btrfs_run_ordered_extent_work; 598 list_add_tail(&ordered->work_list, &works);
579 list_add_tail(&ordered->work_list, &works); 599 btrfs_queue_worker(&root->fs_info->flush_workers,
580 btrfs_queue_worker(&root->fs_info->flush_workers, 600 &ordered->flush_work);
581 &ordered->flush_work);
582 } else {
583 btrfs_put_ordered_extent(ordered);
584 }
585 601
586 cond_resched(); 602 cond_resched();
587 spin_lock(&root->fs_info->ordered_extent_lock); 603 spin_lock(&root->ordered_extent_lock);
588 } 604 }
589 spin_unlock(&root->fs_info->ordered_extent_lock); 605 spin_unlock(&root->ordered_extent_lock);
590 606
591 list_for_each_entry_safe(ordered, next, &works, work_list) { 607 list_for_each_entry_safe(ordered, next, &works, work_list) {
592 list_del_init(&ordered->work_list); 608 list_del_init(&ordered->work_list);
@@ -604,6 +620,33 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
604 mutex_unlock(&root->fs_info->ordered_operations_mutex); 620 mutex_unlock(&root->fs_info->ordered_operations_mutex);
605} 621}
606 622
623void btrfs_wait_all_ordered_extents(struct btrfs_fs_info *fs_info,
624 int delay_iput)
625{
626 struct btrfs_root *root;
627 struct list_head splice;
628
629 INIT_LIST_HEAD(&splice);
630
631 spin_lock(&fs_info->ordered_root_lock);
632 list_splice_init(&fs_info->ordered_roots, &splice);
633 while (!list_empty(&splice)) {
634 root = list_first_entry(&splice, struct btrfs_root,
635 ordered_root);
636 root = btrfs_grab_fs_root(root);
637 BUG_ON(!root);
638 list_move_tail(&root->ordered_root,
639 &fs_info->ordered_roots);
640 spin_unlock(&fs_info->ordered_root_lock);
641
642 btrfs_wait_ordered_extents(root, delay_iput);
643 btrfs_put_fs_root(root);
644
645 spin_lock(&fs_info->ordered_root_lock);
646 }
647 spin_unlock(&fs_info->ordered_root_lock);
648}
649
607/* 650/*
608 * this is used during transaction commit to write all the inodes 651 * this is used during transaction commit to write all the inodes
609 * added to the ordered operation list. These files must be fully on 652 * added to the ordered operation list. These files must be fully on
@@ -629,7 +672,7 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
629 INIT_LIST_HEAD(&works); 672 INIT_LIST_HEAD(&works);
630 673
631 mutex_lock(&root->fs_info->ordered_operations_mutex); 674 mutex_lock(&root->fs_info->ordered_operations_mutex);
632 spin_lock(&root->fs_info->ordered_extent_lock); 675 spin_lock(&root->fs_info->ordered_root_lock);
633 list_splice_init(&cur_trans->ordered_operations, &splice); 676 list_splice_init(&cur_trans->ordered_operations, &splice);
634 while (!list_empty(&splice)) { 677 while (!list_empty(&splice)) {
635 btrfs_inode = list_entry(splice.next, struct btrfs_inode, 678 btrfs_inode = list_entry(splice.next, struct btrfs_inode,
@@ -648,17 +691,17 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
648 if (!wait) 691 if (!wait)
649 list_add_tail(&BTRFS_I(inode)->ordered_operations, 692 list_add_tail(&BTRFS_I(inode)->ordered_operations,
650 &cur_trans->ordered_operations); 693 &cur_trans->ordered_operations);
651 spin_unlock(&root->fs_info->ordered_extent_lock); 694 spin_unlock(&root->fs_info->ordered_root_lock);
652 695
653 work = btrfs_alloc_delalloc_work(inode, wait, 1); 696 work = btrfs_alloc_delalloc_work(inode, wait, 1);
654 if (!work) { 697 if (!work) {
655 spin_lock(&root->fs_info->ordered_extent_lock); 698 spin_lock(&root->fs_info->ordered_root_lock);
656 if (list_empty(&BTRFS_I(inode)->ordered_operations)) 699 if (list_empty(&BTRFS_I(inode)->ordered_operations))
657 list_add_tail(&btrfs_inode->ordered_operations, 700 list_add_tail(&btrfs_inode->ordered_operations,
658 &splice); 701 &splice);
659 list_splice_tail(&splice, 702 list_splice_tail(&splice,
660 &cur_trans->ordered_operations); 703 &cur_trans->ordered_operations);
661 spin_unlock(&root->fs_info->ordered_extent_lock); 704 spin_unlock(&root->fs_info->ordered_root_lock);
662 ret = -ENOMEM; 705 ret = -ENOMEM;
663 goto out; 706 goto out;
664 } 707 }
@@ -667,9 +710,9 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
667 &work->work); 710 &work->work);
668 711
669 cond_resched(); 712 cond_resched();
670 spin_lock(&root->fs_info->ordered_extent_lock); 713 spin_lock(&root->fs_info->ordered_root_lock);
671 } 714 }
672 spin_unlock(&root->fs_info->ordered_extent_lock); 715 spin_unlock(&root->fs_info->ordered_root_lock);
673out: 716out:
674 list_for_each_entry_safe(work, next, &works, list) { 717 list_for_each_entry_safe(work, next, &works, list) {
675 list_del_init(&work->list); 718 list_del_init(&work->list);
@@ -989,7 +1032,6 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
989 u32 *sum, int len) 1032 u32 *sum, int len)
990{ 1033{
991 struct btrfs_ordered_sum *ordered_sum; 1034 struct btrfs_ordered_sum *ordered_sum;
992 struct btrfs_sector_sum *sector_sums;
993 struct btrfs_ordered_extent *ordered; 1035 struct btrfs_ordered_extent *ordered;
994 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; 1036 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
995 unsigned long num_sectors; 1037 unsigned long num_sectors;
@@ -1007,18 +1049,16 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
1007 disk_bytenr < ordered_sum->bytenr + ordered_sum->len) { 1049 disk_bytenr < ordered_sum->bytenr + ordered_sum->len) {
1008 i = (disk_bytenr - ordered_sum->bytenr) >> 1050 i = (disk_bytenr - ordered_sum->bytenr) >>
1009 inode->i_sb->s_blocksize_bits; 1051 inode->i_sb->s_blocksize_bits;
1010 sector_sums = ordered_sum->sums + i;
1011 num_sectors = ordered_sum->len >> 1052 num_sectors = ordered_sum->len >>
1012 inode->i_sb->s_blocksize_bits; 1053 inode->i_sb->s_blocksize_bits;
1013 for (; i < num_sectors; i++) { 1054 num_sectors = min_t(int, len - index, num_sectors - i);
1014 if (sector_sums[i].bytenr == disk_bytenr) { 1055 memcpy(sum + index, ordered_sum->sums + i,
1015 sum[index] = sector_sums[i].sum; 1056 num_sectors);
1016 index++; 1057
1017 if (index == len) 1058 index += (int)num_sectors;
1018 goto out; 1059 if (index == len)
1019 disk_bytenr += sectorsize; 1060 goto out;
1020 } 1061 disk_bytenr += num_sectors * sectorsize;
1021 }
1022 } 1062 }
1023 } 1063 }
1024out: 1064out:
@@ -1055,12 +1095,12 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
1055 if (last_mod < root->fs_info->last_trans_committed) 1095 if (last_mod < root->fs_info->last_trans_committed)
1056 return; 1096 return;
1057 1097
1058 spin_lock(&root->fs_info->ordered_extent_lock); 1098 spin_lock(&root->fs_info->ordered_root_lock);
1059 if (list_empty(&BTRFS_I(inode)->ordered_operations)) { 1099 if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
1060 list_add_tail(&BTRFS_I(inode)->ordered_operations, 1100 list_add_tail(&BTRFS_I(inode)->ordered_operations,
1061 &cur_trans->ordered_operations); 1101 &cur_trans->ordered_operations);
1062 } 1102 }
1063 spin_unlock(&root->fs_info->ordered_extent_lock); 1103 spin_unlock(&root->fs_info->ordered_root_lock);
1064} 1104}
1065 1105
1066int __init ordered_data_init(void) 1106int __init ordered_data_init(void)
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 58b0e3b0ebad..68844d59ee6f 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -26,18 +26,6 @@ struct btrfs_ordered_inode_tree {
26 struct rb_node *last; 26 struct rb_node *last;
27}; 27};
28 28
29/*
30 * these are used to collect checksums done just before bios submission.
31 * They are attached via a list into the ordered extent, and
32 * checksum items are inserted into the tree after all the blocks in
33 * the ordered extent are on disk
34 */
35struct btrfs_sector_sum {
36 /* bytenr on disk */
37 u64 bytenr;
38 u32 sum;
39};
40
41struct btrfs_ordered_sum { 29struct btrfs_ordered_sum {
42 /* bytenr is the start of this extent on disk */ 30 /* bytenr is the start of this extent on disk */
43 u64 bytenr; 31 u64 bytenr;
@@ -45,10 +33,10 @@ struct btrfs_ordered_sum {
45 /* 33 /*
46 * this is the length in bytes covered by the sums array below. 34 * this is the length in bytes covered by the sums array below.
47 */ 35 */
48 unsigned long len; 36 int len;
49 struct list_head list; 37 struct list_head list;
50 /* last field is a variable length array of btrfs_sector_sums */ 38 /* last field is a variable length array of csums */
51 struct btrfs_sector_sum sums[]; 39 u32 sums[];
52}; 40};
53 41
54/* 42/*
@@ -149,11 +137,8 @@ struct btrfs_ordered_extent {
149static inline int btrfs_ordered_sum_size(struct btrfs_root *root, 137static inline int btrfs_ordered_sum_size(struct btrfs_root *root,
150 unsigned long bytes) 138 unsigned long bytes)
151{ 139{
152 unsigned long num_sectors = (bytes + root->sectorsize - 1) / 140 int num_sectors = (int)DIV_ROUND_UP(bytes, root->sectorsize);
153 root->sectorsize; 141 return sizeof(struct btrfs_ordered_sum) + num_sectors * sizeof(u32);
154 num_sectors++;
155 return sizeof(struct btrfs_ordered_sum) +
156 num_sectors * sizeof(struct btrfs_sector_sum);
157} 142}
158 143
159static inline void 144static inline void
@@ -204,6 +189,8 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
204 struct btrfs_root *root, 189 struct btrfs_root *root,
205 struct inode *inode); 190 struct inode *inode);
206void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput); 191void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput);
192void btrfs_wait_all_ordered_extents(struct btrfs_fs_info *fs_info,
193 int delay_iput);
207void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode); 194void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode);
208void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid); 195void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid);
209void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid); 196void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 9d49c586995a..1280eff8af56 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -98,13 +98,10 @@ struct btrfs_qgroup_list {
98 struct btrfs_qgroup *member; 98 struct btrfs_qgroup *member;
99}; 99};
100 100
101struct qgroup_rescan { 101static int
102 struct btrfs_work work; 102qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
103 struct btrfs_fs_info *fs_info; 103 int init_flags);
104}; 104static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info);
105
106static void qgroup_rescan_start(struct btrfs_fs_info *fs_info,
107 struct qgroup_rescan *qscan);
108 105
109/* must be called with qgroup_ioctl_lock held */ 106/* must be called with qgroup_ioctl_lock held */
110static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info, 107static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
@@ -255,10 +252,17 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
255 int slot; 252 int slot;
256 int ret = 0; 253 int ret = 0;
257 u64 flags = 0; 254 u64 flags = 0;
255 u64 rescan_progress = 0;
258 256
259 if (!fs_info->quota_enabled) 257 if (!fs_info->quota_enabled)
260 return 0; 258 return 0;
261 259
260 fs_info->qgroup_ulist = ulist_alloc(GFP_NOFS);
261 if (!fs_info->qgroup_ulist) {
262 ret = -ENOMEM;
263 goto out;
264 }
265
262 path = btrfs_alloc_path(); 266 path = btrfs_alloc_path();
263 if (!path) { 267 if (!path) {
264 ret = -ENOMEM; 268 ret = -ENOMEM;
@@ -306,20 +310,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
306 } 310 }
307 fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, 311 fs_info->qgroup_flags = btrfs_qgroup_status_flags(l,
308 ptr); 312 ptr);
309 fs_info->qgroup_rescan_progress.objectid = 313 rescan_progress = btrfs_qgroup_status_rescan(l, ptr);
310 btrfs_qgroup_status_rescan(l, ptr);
311 if (fs_info->qgroup_flags &
312 BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
313 struct qgroup_rescan *qscan =
314 kmalloc(sizeof(*qscan), GFP_NOFS);
315 if (!qscan) {
316 ret = -ENOMEM;
317 goto out;
318 }
319 fs_info->qgroup_rescan_progress.type = 0;
320 fs_info->qgroup_rescan_progress.offset = 0;
321 qgroup_rescan_start(fs_info, qscan);
322 }
323 goto next1; 314 goto next1;
324 } 315 }
325 316
@@ -421,9 +412,18 @@ out:
421 if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) { 412 if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) {
422 fs_info->quota_enabled = 0; 413 fs_info->quota_enabled = 0;
423 fs_info->pending_quota_state = 0; 414 fs_info->pending_quota_state = 0;
415 } else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN &&
416 ret >= 0) {
417 ret = qgroup_rescan_init(fs_info, rescan_progress, 0);
424 } 418 }
425 btrfs_free_path(path); 419 btrfs_free_path(path);
426 420
421 if (ret < 0) {
422 ulist_free(fs_info->qgroup_ulist);
423 fs_info->qgroup_ulist = NULL;
424 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
425 }
426
427 return ret < 0 ? ret : 0; 427 return ret < 0 ? ret : 0;
428} 428}
429 429
@@ -460,6 +460,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
460 } 460 }
461 kfree(qgroup); 461 kfree(qgroup);
462 } 462 }
463 ulist_free(fs_info->qgroup_ulist);
463} 464}
464 465
465static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, 466static int add_qgroup_relation_item(struct btrfs_trans_handle *trans,
@@ -819,6 +820,12 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans,
819 goto out; 820 goto out;
820 } 821 }
821 822
823 fs_info->qgroup_ulist = ulist_alloc(GFP_NOFS);
824 if (!fs_info->qgroup_ulist) {
825 ret = -ENOMEM;
826 goto out;
827 }
828
822 /* 829 /*
823 * initially create the quota tree 830 * initially create the quota tree
824 */ 831 */
@@ -916,6 +923,10 @@ out_free_root:
916 kfree(quota_root); 923 kfree(quota_root);
917 } 924 }
918out: 925out:
926 if (ret) {
927 ulist_free(fs_info->qgroup_ulist);
928 fs_info->qgroup_ulist = NULL;
929 }
919 mutex_unlock(&fs_info->qgroup_ioctl_lock); 930 mutex_unlock(&fs_info->qgroup_ioctl_lock);
920 return ret; 931 return ret;
921} 932}
@@ -1355,7 +1366,6 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1355 u64 ref_root; 1366 u64 ref_root;
1356 struct btrfs_qgroup *qgroup; 1367 struct btrfs_qgroup *qgroup;
1357 struct ulist *roots = NULL; 1368 struct ulist *roots = NULL;
1358 struct ulist *tmp = NULL;
1359 u64 seq; 1369 u64 seq;
1360 int ret = 0; 1370 int ret = 0;
1361 int sgn; 1371 int sgn;
@@ -1428,14 +1438,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1428 if (ret < 0) 1438 if (ret < 0)
1429 return ret; 1439 return ret;
1430 1440
1431 mutex_lock(&fs_info->qgroup_rescan_lock);
1432 spin_lock(&fs_info->qgroup_lock); 1441 spin_lock(&fs_info->qgroup_lock);
1433 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
1434 if (fs_info->qgroup_rescan_progress.objectid <= node->bytenr) {
1435 ret = 0;
1436 goto unlock;
1437 }
1438 }
1439 1442
1440 quota_root = fs_info->quota_root; 1443 quota_root = fs_info->quota_root;
1441 if (!quota_root) 1444 if (!quota_root)
@@ -1448,39 +1451,34 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1448 /* 1451 /*
1449 * step 1: for each old ref, visit all nodes once and inc refcnt 1452 * step 1: for each old ref, visit all nodes once and inc refcnt
1450 */ 1453 */
1451 tmp = ulist_alloc(GFP_ATOMIC); 1454 ulist_reinit(fs_info->qgroup_ulist);
1452 if (!tmp) {
1453 ret = -ENOMEM;
1454 goto unlock;
1455 }
1456 seq = fs_info->qgroup_seq; 1455 seq = fs_info->qgroup_seq;
1457 fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */ 1456 fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */
1458 1457
1459 ret = qgroup_account_ref_step1(fs_info, roots, tmp, seq); 1458 ret = qgroup_account_ref_step1(fs_info, roots, fs_info->qgroup_ulist,
1459 seq);
1460 if (ret) 1460 if (ret)
1461 goto unlock; 1461 goto unlock;
1462 1462
1463 /* 1463 /*
1464 * step 2: walk from the new root 1464 * step 2: walk from the new root
1465 */ 1465 */
1466 ret = qgroup_account_ref_step2(fs_info, roots, tmp, seq, sgn, 1466 ret = qgroup_account_ref_step2(fs_info, roots, fs_info->qgroup_ulist,
1467 node->num_bytes, qgroup); 1467 seq, sgn, node->num_bytes, qgroup);
1468 if (ret) 1468 if (ret)
1469 goto unlock; 1469 goto unlock;
1470 1470
1471 /* 1471 /*
1472 * step 3: walk again from old refs 1472 * step 3: walk again from old refs
1473 */ 1473 */
1474 ret = qgroup_account_ref_step3(fs_info, roots, tmp, seq, sgn, 1474 ret = qgroup_account_ref_step3(fs_info, roots, fs_info->qgroup_ulist,
1475 node->num_bytes); 1475 seq, sgn, node->num_bytes);
1476 if (ret) 1476 if (ret)
1477 goto unlock; 1477 goto unlock;
1478 1478
1479unlock: 1479unlock:
1480 spin_unlock(&fs_info->qgroup_lock); 1480 spin_unlock(&fs_info->qgroup_lock);
1481 mutex_unlock(&fs_info->qgroup_rescan_lock);
1482 ulist_free(roots); 1481 ulist_free(roots);
1483 ulist_free(tmp);
1484 1482
1485 return ret; 1483 return ret;
1486} 1484}
@@ -1527,9 +1525,12 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
1527 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 1525 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
1528 1526
1529 if (!ret && start_rescan_worker) { 1527 if (!ret && start_rescan_worker) {
1530 ret = btrfs_qgroup_rescan(fs_info); 1528 ret = qgroup_rescan_init(fs_info, 0, 1);
1531 if (ret) 1529 if (!ret) {
1532 pr_err("btrfs: start rescan quota failed: %d\n", ret); 1530 qgroup_rescan_zero_tracking(fs_info);
1531 btrfs_queue_worker(&fs_info->qgroup_rescan_workers,
1532 &fs_info->qgroup_rescan_work);
1533 }
1533 ret = 0; 1534 ret = 0;
1534 } 1535 }
1535 1536
@@ -1720,7 +1721,6 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
1720 struct btrfs_fs_info *fs_info = root->fs_info; 1721 struct btrfs_fs_info *fs_info = root->fs_info;
1721 u64 ref_root = root->root_key.objectid; 1722 u64 ref_root = root->root_key.objectid;
1722 int ret = 0; 1723 int ret = 0;
1723 struct ulist *ulist = NULL;
1724 struct ulist_node *unode; 1724 struct ulist_node *unode;
1725 struct ulist_iterator uiter; 1725 struct ulist_iterator uiter;
1726 1726
@@ -1743,17 +1743,13 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
1743 * in a first step, we check all affected qgroups if any limits would 1743 * in a first step, we check all affected qgroups if any limits would
1744 * be exceeded 1744 * be exceeded
1745 */ 1745 */
1746 ulist = ulist_alloc(GFP_ATOMIC); 1746 ulist_reinit(fs_info->qgroup_ulist);
1747 if (!ulist) { 1747 ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
1748 ret = -ENOMEM;
1749 goto out;
1750 }
1751 ret = ulist_add(ulist, qgroup->qgroupid,
1752 (uintptr_t)qgroup, GFP_ATOMIC); 1748 (uintptr_t)qgroup, GFP_ATOMIC);
1753 if (ret < 0) 1749 if (ret < 0)
1754 goto out; 1750 goto out;
1755 ULIST_ITER_INIT(&uiter); 1751 ULIST_ITER_INIT(&uiter);
1756 while ((unode = ulist_next(ulist, &uiter))) { 1752 while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
1757 struct btrfs_qgroup *qg; 1753 struct btrfs_qgroup *qg;
1758 struct btrfs_qgroup_list *glist; 1754 struct btrfs_qgroup_list *glist;
1759 1755
@@ -1774,7 +1770,8 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
1774 } 1770 }
1775 1771
1776 list_for_each_entry(glist, &qg->groups, next_group) { 1772 list_for_each_entry(glist, &qg->groups, next_group) {
1777 ret = ulist_add(ulist, glist->group->qgroupid, 1773 ret = ulist_add(fs_info->qgroup_ulist,
1774 glist->group->qgroupid,
1778 (uintptr_t)glist->group, GFP_ATOMIC); 1775 (uintptr_t)glist->group, GFP_ATOMIC);
1779 if (ret < 0) 1776 if (ret < 0)
1780 goto out; 1777 goto out;
@@ -1785,7 +1782,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
1785 * no limits exceeded, now record the reservation into all qgroups 1782 * no limits exceeded, now record the reservation into all qgroups
1786 */ 1783 */
1787 ULIST_ITER_INIT(&uiter); 1784 ULIST_ITER_INIT(&uiter);
1788 while ((unode = ulist_next(ulist, &uiter))) { 1785 while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
1789 struct btrfs_qgroup *qg; 1786 struct btrfs_qgroup *qg;
1790 1787
1791 qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; 1788 qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
@@ -1795,8 +1792,6 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
1795 1792
1796out: 1793out:
1797 spin_unlock(&fs_info->qgroup_lock); 1794 spin_unlock(&fs_info->qgroup_lock);
1798 ulist_free(ulist);
1799
1800 return ret; 1795 return ret;
1801} 1796}
1802 1797
@@ -1805,7 +1800,6 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
1805 struct btrfs_root *quota_root; 1800 struct btrfs_root *quota_root;
1806 struct btrfs_qgroup *qgroup; 1801 struct btrfs_qgroup *qgroup;
1807 struct btrfs_fs_info *fs_info = root->fs_info; 1802 struct btrfs_fs_info *fs_info = root->fs_info;
1808 struct ulist *ulist = NULL;
1809 struct ulist_node *unode; 1803 struct ulist_node *unode;
1810 struct ulist_iterator uiter; 1804 struct ulist_iterator uiter;
1811 u64 ref_root = root->root_key.objectid; 1805 u64 ref_root = root->root_key.objectid;
@@ -1827,17 +1821,13 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
1827 if (!qgroup) 1821 if (!qgroup)
1828 goto out; 1822 goto out;
1829 1823
1830 ulist = ulist_alloc(GFP_ATOMIC); 1824 ulist_reinit(fs_info->qgroup_ulist);
1831 if (!ulist) { 1825 ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
1832 btrfs_std_error(fs_info, -ENOMEM);
1833 goto out;
1834 }
1835 ret = ulist_add(ulist, qgroup->qgroupid,
1836 (uintptr_t)qgroup, GFP_ATOMIC); 1826 (uintptr_t)qgroup, GFP_ATOMIC);
1837 if (ret < 0) 1827 if (ret < 0)
1838 goto out; 1828 goto out;
1839 ULIST_ITER_INIT(&uiter); 1829 ULIST_ITER_INIT(&uiter);
1840 while ((unode = ulist_next(ulist, &uiter))) { 1830 while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
1841 struct btrfs_qgroup *qg; 1831 struct btrfs_qgroup *qg;
1842 struct btrfs_qgroup_list *glist; 1832 struct btrfs_qgroup_list *glist;
1843 1833
@@ -1846,7 +1836,8 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
1846 qg->reserved -= num_bytes; 1836 qg->reserved -= num_bytes;
1847 1837
1848 list_for_each_entry(glist, &qg->groups, next_group) { 1838 list_for_each_entry(glist, &qg->groups, next_group) {
1849 ret = ulist_add(ulist, glist->group->qgroupid, 1839 ret = ulist_add(fs_info->qgroup_ulist,
1840 glist->group->qgroupid,
1850 (uintptr_t)glist->group, GFP_ATOMIC); 1841 (uintptr_t)glist->group, GFP_ATOMIC);
1851 if (ret < 0) 1842 if (ret < 0)
1852 goto out; 1843 goto out;
@@ -1855,7 +1846,6 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
1855 1846
1856out: 1847out:
1857 spin_unlock(&fs_info->qgroup_lock); 1848 spin_unlock(&fs_info->qgroup_lock);
1858 ulist_free(ulist);
1859} 1849}
1860 1850
1861void assert_qgroups_uptodate(struct btrfs_trans_handle *trans) 1851void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
@@ -1874,12 +1864,11 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
1874 * returns 1 when done, 2 when done and FLAG_INCONSISTENT was cleared. 1864 * returns 1 when done, 2 when done and FLAG_INCONSISTENT was cleared.
1875 */ 1865 */
1876static int 1866static int
1877qgroup_rescan_leaf(struct qgroup_rescan *qscan, struct btrfs_path *path, 1867qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
1878 struct btrfs_trans_handle *trans, struct ulist *tmp, 1868 struct btrfs_trans_handle *trans, struct ulist *tmp,
1879 struct extent_buffer *scratch_leaf) 1869 struct extent_buffer *scratch_leaf)
1880{ 1870{
1881 struct btrfs_key found; 1871 struct btrfs_key found;
1882 struct btrfs_fs_info *fs_info = qscan->fs_info;
1883 struct ulist *roots = NULL; 1872 struct ulist *roots = NULL;
1884 struct ulist_node *unode; 1873 struct ulist_node *unode;
1885 struct ulist_iterator uiter; 1874 struct ulist_iterator uiter;
@@ -2007,11 +1996,10 @@ out:
2007 1996
2008static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) 1997static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
2009{ 1998{
2010 struct qgroup_rescan *qscan = container_of(work, struct qgroup_rescan, 1999 struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info,
2011 work); 2000 qgroup_rescan_work);
2012 struct btrfs_path *path; 2001 struct btrfs_path *path;
2013 struct btrfs_trans_handle *trans = NULL; 2002 struct btrfs_trans_handle *trans = NULL;
2014 struct btrfs_fs_info *fs_info = qscan->fs_info;
2015 struct ulist *tmp = NULL; 2003 struct ulist *tmp = NULL;
2016 struct extent_buffer *scratch_leaf = NULL; 2004 struct extent_buffer *scratch_leaf = NULL;
2017 int err = -ENOMEM; 2005 int err = -ENOMEM;
@@ -2036,7 +2024,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
2036 if (!fs_info->quota_enabled) { 2024 if (!fs_info->quota_enabled) {
2037 err = -EINTR; 2025 err = -EINTR;
2038 } else { 2026 } else {
2039 err = qgroup_rescan_leaf(qscan, path, trans, 2027 err = qgroup_rescan_leaf(fs_info, path, trans,
2040 tmp, scratch_leaf); 2028 tmp, scratch_leaf);
2041 } 2029 }
2042 if (err > 0) 2030 if (err > 0)
@@ -2049,7 +2037,6 @@ out:
2049 kfree(scratch_leaf); 2037 kfree(scratch_leaf);
2050 ulist_free(tmp); 2038 ulist_free(tmp);
2051 btrfs_free_path(path); 2039 btrfs_free_path(path);
2052 kfree(qscan);
2053 2040
2054 mutex_lock(&fs_info->qgroup_rescan_lock); 2041 mutex_lock(&fs_info->qgroup_rescan_lock);
2055 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; 2042 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
@@ -2068,47 +2055,74 @@ out:
2068 } else { 2055 } else {
2069 pr_err("btrfs: qgroup scan failed with %d\n", err); 2056 pr_err("btrfs: qgroup scan failed with %d\n", err);
2070 } 2057 }
2071}
2072 2058
2073static void 2059 complete_all(&fs_info->qgroup_rescan_completion);
2074qgroup_rescan_start(struct btrfs_fs_info *fs_info, struct qgroup_rescan *qscan)
2075{
2076 memset(&qscan->work, 0, sizeof(qscan->work));
2077 qscan->work.func = btrfs_qgroup_rescan_worker;
2078 qscan->fs_info = fs_info;
2079
2080 pr_info("btrfs: qgroup scan started\n");
2081 btrfs_queue_worker(&fs_info->qgroup_rescan_workers, &qscan->work);
2082} 2060}
2083 2061
2084int 2062/*
2085btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info) 2063 * Checks that (a) no rescan is running and (b) quota is enabled. Allocates all
2064 * memory required for the rescan context.
2065 */
2066static int
2067qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
2068 int init_flags)
2086{ 2069{
2087 int ret = 0; 2070 int ret = 0;
2088 struct rb_node *n;
2089 struct btrfs_qgroup *qgroup;
2090 struct qgroup_rescan *qscan = kmalloc(sizeof(*qscan), GFP_NOFS);
2091 2071
2092 if (!qscan) 2072 if (!init_flags &&
2093 return -ENOMEM; 2073 (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) ||
2074 !(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))) {
2075 ret = -EINVAL;
2076 goto err;
2077 }
2094 2078
2095 mutex_lock(&fs_info->qgroup_rescan_lock); 2079 mutex_lock(&fs_info->qgroup_rescan_lock);
2096 spin_lock(&fs_info->qgroup_lock); 2080 spin_lock(&fs_info->qgroup_lock);
2097 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) 2081
2098 ret = -EINPROGRESS; 2082 if (init_flags) {
2099 else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) 2083 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
2100 ret = -EINVAL; 2084 ret = -EINPROGRESS;
2101 if (ret) { 2085 else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
2102 spin_unlock(&fs_info->qgroup_lock); 2086 ret = -EINVAL;
2103 mutex_unlock(&fs_info->qgroup_rescan_lock); 2087
2104 kfree(qscan); 2088 if (ret) {
2105 return ret; 2089 spin_unlock(&fs_info->qgroup_lock);
2090 mutex_unlock(&fs_info->qgroup_rescan_lock);
2091 goto err;
2092 }
2093
2094 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN;
2106 } 2095 }
2107 2096
2108 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN;
2109 memset(&fs_info->qgroup_rescan_progress, 0, 2097 memset(&fs_info->qgroup_rescan_progress, 0,
2110 sizeof(fs_info->qgroup_rescan_progress)); 2098 sizeof(fs_info->qgroup_rescan_progress));
2099 fs_info->qgroup_rescan_progress.objectid = progress_objectid;
2100
2101 spin_unlock(&fs_info->qgroup_lock);
2102 mutex_unlock(&fs_info->qgroup_rescan_lock);
2103
2104 init_completion(&fs_info->qgroup_rescan_completion);
2105
2106 memset(&fs_info->qgroup_rescan_work, 0,
2107 sizeof(fs_info->qgroup_rescan_work));
2108 fs_info->qgroup_rescan_work.func = btrfs_qgroup_rescan_worker;
2109
2110 if (ret) {
2111err:
2112 pr_info("btrfs: qgroup_rescan_init failed with %d\n", ret);
2113 return ret;
2114 }
2115
2116 return 0;
2117}
2118
2119static void
2120qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info)
2121{
2122 struct rb_node *n;
2123 struct btrfs_qgroup *qgroup;
2111 2124
2125 spin_lock(&fs_info->qgroup_lock);
2112 /* clear all current qgroup tracking information */ 2126 /* clear all current qgroup tracking information */
2113 for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) { 2127 for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) {
2114 qgroup = rb_entry(n, struct btrfs_qgroup, node); 2128 qgroup = rb_entry(n, struct btrfs_qgroup, node);
@@ -2118,9 +2132,74 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
2118 qgroup->excl_cmpr = 0; 2132 qgroup->excl_cmpr = 0;
2119 } 2133 }
2120 spin_unlock(&fs_info->qgroup_lock); 2134 spin_unlock(&fs_info->qgroup_lock);
2121 mutex_unlock(&fs_info->qgroup_rescan_lock); 2135}
2136
2137int
2138btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
2139{
2140 int ret = 0;
2141 struct btrfs_trans_handle *trans;
2122 2142
2123 qgroup_rescan_start(fs_info, qscan); 2143 ret = qgroup_rescan_init(fs_info, 0, 1);
2144 if (ret)
2145 return ret;
2146
2147 /*
2148 * We have set the rescan_progress to 0, which means no more
2149 * delayed refs will be accounted by btrfs_qgroup_account_ref.
2150 * However, btrfs_qgroup_account_ref may be right after its call
2151 * to btrfs_find_all_roots, in which case it would still do the
2152 * accounting.
2153 * To solve this, we're committing the transaction, which will
2154 * ensure we run all delayed refs and only after that, we are
2155 * going to clear all tracking information for a clean start.
2156 */
2157
2158 trans = btrfs_join_transaction(fs_info->fs_root);
2159 if (IS_ERR(trans)) {
2160 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
2161 return PTR_ERR(trans);
2162 }
2163 ret = btrfs_commit_transaction(trans, fs_info->fs_root);
2164 if (ret) {
2165 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
2166 return ret;
2167 }
2168
2169 qgroup_rescan_zero_tracking(fs_info);
2170
2171 btrfs_queue_worker(&fs_info->qgroup_rescan_workers,
2172 &fs_info->qgroup_rescan_work);
2124 2173
2125 return 0; 2174 return 0;
2126} 2175}
2176
2177int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info)
2178{
2179 int running;
2180 int ret = 0;
2181
2182 mutex_lock(&fs_info->qgroup_rescan_lock);
2183 spin_lock(&fs_info->qgroup_lock);
2184 running = fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN;
2185 spin_unlock(&fs_info->qgroup_lock);
2186 mutex_unlock(&fs_info->qgroup_rescan_lock);
2187
2188 if (running)
2189 ret = wait_for_completion_interruptible(
2190 &fs_info->qgroup_rescan_completion);
2191
2192 return ret;
2193}
2194
2195/*
2196 * this is only called from open_ctree where we're still single threaded, thus
2197 * locking is omitted here.
2198 */
2199void
2200btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
2201{
2202 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
2203 btrfs_queue_worker(&fs_info->qgroup_rescan_workers,
2204 &fs_info->qgroup_rescan_work);
2205}
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 4febca4fc2de..12096496cc99 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1305,6 +1305,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
1305 struct extent_buffer *eb; 1305 struct extent_buffer *eb;
1306 struct btrfs_root_item *root_item; 1306 struct btrfs_root_item *root_item;
1307 struct btrfs_key root_key; 1307 struct btrfs_key root_key;
1308 u64 last_snap = 0;
1308 int ret; 1309 int ret;
1309 1310
1310 root_item = kmalloc(sizeof(*root_item), GFP_NOFS); 1311 root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
@@ -1320,6 +1321,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
1320 BTRFS_TREE_RELOC_OBJECTID); 1321 BTRFS_TREE_RELOC_OBJECTID);
1321 BUG_ON(ret); 1322 BUG_ON(ret);
1322 1323
1324 last_snap = btrfs_root_last_snapshot(&root->root_item);
1323 btrfs_set_root_last_snapshot(&root->root_item, 1325 btrfs_set_root_last_snapshot(&root->root_item,
1324 trans->transid - 1); 1326 trans->transid - 1);
1325 } else { 1327 } else {
@@ -1345,6 +1347,12 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
1345 memset(&root_item->drop_progress, 0, 1347 memset(&root_item->drop_progress, 0,
1346 sizeof(struct btrfs_disk_key)); 1348 sizeof(struct btrfs_disk_key));
1347 root_item->drop_level = 0; 1349 root_item->drop_level = 0;
1350 /*
1351 * abuse rtransid, it is safe because it is impossible to
1352 * receive data into a relocation tree.
1353 */
1354 btrfs_set_root_rtransid(root_item, last_snap);
1355 btrfs_set_root_otransid(root_item, trans->transid);
1348 } 1356 }
1349 1357
1350 btrfs_tree_unlock(eb); 1358 btrfs_tree_unlock(eb);
@@ -1355,8 +1363,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
1355 BUG_ON(ret); 1363 BUG_ON(ret);
1356 kfree(root_item); 1364 kfree(root_item);
1357 1365
1358 reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root, 1366 reloc_root = btrfs_read_fs_root(root->fs_info->tree_root, &root_key);
1359 &root_key);
1360 BUG_ON(IS_ERR(reloc_root)); 1367 BUG_ON(IS_ERR(reloc_root));
1361 reloc_root->last_trans = trans->transid; 1368 reloc_root->last_trans = trans->transid;
1362 return reloc_root; 1369 return reloc_root;
@@ -2273,8 +2280,12 @@ void free_reloc_roots(struct list_head *list)
2273static noinline_for_stack 2280static noinline_for_stack
2274int merge_reloc_roots(struct reloc_control *rc) 2281int merge_reloc_roots(struct reloc_control *rc)
2275{ 2282{
2283 struct btrfs_trans_handle *trans;
2276 struct btrfs_root *root; 2284 struct btrfs_root *root;
2277 struct btrfs_root *reloc_root; 2285 struct btrfs_root *reloc_root;
2286 u64 last_snap;
2287 u64 otransid;
2288 u64 objectid;
2278 LIST_HEAD(reloc_roots); 2289 LIST_HEAD(reloc_roots);
2279 int found = 0; 2290 int found = 0;
2280 int ret = 0; 2291 int ret = 0;
@@ -2308,12 +2319,44 @@ again:
2308 } else { 2319 } else {
2309 list_del_init(&reloc_root->root_list); 2320 list_del_init(&reloc_root->root_list);
2310 } 2321 }
2322
2323 /*
2324 * we keep the old last snapshod transid in rtranid when we
2325 * created the relocation tree.
2326 */
2327 last_snap = btrfs_root_rtransid(&reloc_root->root_item);
2328 otransid = btrfs_root_otransid(&reloc_root->root_item);
2329 objectid = reloc_root->root_key.offset;
2330
2311 ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1); 2331 ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1);
2312 if (ret < 0) { 2332 if (ret < 0) {
2313 if (list_empty(&reloc_root->root_list)) 2333 if (list_empty(&reloc_root->root_list))
2314 list_add_tail(&reloc_root->root_list, 2334 list_add_tail(&reloc_root->root_list,
2315 &reloc_roots); 2335 &reloc_roots);
2316 goto out; 2336 goto out;
2337 } else if (!ret) {
2338 /*
2339 * recover the last snapshot tranid to avoid
2340 * the space balance break NOCOW.
2341 */
2342 root = read_fs_root(rc->extent_root->fs_info,
2343 objectid);
2344 if (IS_ERR(root))
2345 continue;
2346
2347 if (btrfs_root_refs(&root->root_item) == 0)
2348 continue;
2349
2350 trans = btrfs_join_transaction(root);
2351 BUG_ON(IS_ERR(trans));
2352
2353 /* Check if the fs/file tree was snapshoted or not. */
2354 if (btrfs_root_last_snapshot(&root->root_item) ==
2355 otransid - 1)
2356 btrfs_set_root_last_snapshot(&root->root_item,
2357 last_snap);
2358
2359 btrfs_end_transaction(trans, root);
2317 } 2360 }
2318 } 2361 }
2319 2362
@@ -3266,6 +3309,8 @@ static int __add_tree_block(struct reloc_control *rc,
3266 struct btrfs_path *path; 3309 struct btrfs_path *path;
3267 struct btrfs_key key; 3310 struct btrfs_key key;
3268 int ret; 3311 int ret;
3312 bool skinny = btrfs_fs_incompat(rc->extent_root->fs_info,
3313 SKINNY_METADATA);
3269 3314
3270 if (tree_block_processed(bytenr, blocksize, rc)) 3315 if (tree_block_processed(bytenr, blocksize, rc))
3271 return 0; 3316 return 0;
@@ -3276,10 +3321,15 @@ static int __add_tree_block(struct reloc_control *rc,
3276 path = btrfs_alloc_path(); 3321 path = btrfs_alloc_path();
3277 if (!path) 3322 if (!path)
3278 return -ENOMEM; 3323 return -ENOMEM;
3279 3324again:
3280 key.objectid = bytenr; 3325 key.objectid = bytenr;
3281 key.type = BTRFS_EXTENT_ITEM_KEY; 3326 if (skinny) {
3282 key.offset = blocksize; 3327 key.type = BTRFS_METADATA_ITEM_KEY;
3328 key.offset = (u64)-1;
3329 } else {
3330 key.type = BTRFS_EXTENT_ITEM_KEY;
3331 key.offset = blocksize;
3332 }
3283 3333
3284 path->search_commit_root = 1; 3334 path->search_commit_root = 1;
3285 path->skip_locking = 1; 3335 path->skip_locking = 1;
@@ -3287,11 +3337,23 @@ static int __add_tree_block(struct reloc_control *rc,
3287 if (ret < 0) 3337 if (ret < 0)
3288 goto out; 3338 goto out;
3289 3339
3290 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 3340 if (ret > 0 && skinny) {
3291 if (ret > 0) { 3341 if (path->slots[0]) {
3292 if (key.objectid == bytenr && 3342 path->slots[0]--;
3293 key.type == BTRFS_METADATA_ITEM_KEY) 3343 btrfs_item_key_to_cpu(path->nodes[0], &key,
3294 ret = 0; 3344 path->slots[0]);
3345 if (key.objectid == bytenr &&
3346 (key.type == BTRFS_METADATA_ITEM_KEY ||
3347 (key.type == BTRFS_EXTENT_ITEM_KEY &&
3348 key.offset == blocksize)))
3349 ret = 0;
3350 }
3351
3352 if (ret) {
3353 skinny = false;
3354 btrfs_release_path(path);
3355 goto again;
3356 }
3295 } 3357 }
3296 BUG_ON(ret); 3358 BUG_ON(ret);
3297 3359
@@ -4160,12 +4222,12 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
4160 (unsigned long long)rc->block_group->key.objectid, 4222 (unsigned long long)rc->block_group->key.objectid,
4161 (unsigned long long)rc->block_group->flags); 4223 (unsigned long long)rc->block_group->flags);
4162 4224
4163 ret = btrfs_start_delalloc_inodes(fs_info->tree_root, 0); 4225 ret = btrfs_start_all_delalloc_inodes(fs_info, 0);
4164 if (ret < 0) { 4226 if (ret < 0) {
4165 err = ret; 4227 err = ret;
4166 goto out; 4228 goto out;
4167 } 4229 }
4168 btrfs_wait_ordered_extents(fs_info->tree_root, 0); 4230 btrfs_wait_all_ordered_extents(fs_info, 0);
4169 4231
4170 while (1) { 4232 while (1) {
4171 mutex_lock(&fs_info->cleaner_mutex); 4233 mutex_lock(&fs_info->cleaner_mutex);
@@ -4277,7 +4339,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
4277 key.type != BTRFS_ROOT_ITEM_KEY) 4339 key.type != BTRFS_ROOT_ITEM_KEY)
4278 break; 4340 break;
4279 4341
4280 reloc_root = btrfs_read_fs_root_no_radix(root, &key); 4342 reloc_root = btrfs_read_fs_root(root, &key);
4281 if (IS_ERR(reloc_root)) { 4343 if (IS_ERR(reloc_root)) {
4282 err = PTR_ERR(reloc_root); 4344 err = PTR_ERR(reloc_root);
4283 goto out; 4345 goto out;
@@ -4396,10 +4458,8 @@ out:
4396int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) 4458int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
4397{ 4459{
4398 struct btrfs_ordered_sum *sums; 4460 struct btrfs_ordered_sum *sums;
4399 struct btrfs_sector_sum *sector_sum;
4400 struct btrfs_ordered_extent *ordered; 4461 struct btrfs_ordered_extent *ordered;
4401 struct btrfs_root *root = BTRFS_I(inode)->root; 4462 struct btrfs_root *root = BTRFS_I(inode)->root;
4402 size_t offset;
4403 int ret; 4463 int ret;
4404 u64 disk_bytenr; 4464 u64 disk_bytenr;
4405 LIST_HEAD(list); 4465 LIST_HEAD(list);
@@ -4413,19 +4473,13 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
4413 if (ret) 4473 if (ret)
4414 goto out; 4474 goto out;
4415 4475
4476 disk_bytenr = ordered->start;
4416 while (!list_empty(&list)) { 4477 while (!list_empty(&list)) {
4417 sums = list_entry(list.next, struct btrfs_ordered_sum, list); 4478 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
4418 list_del_init(&sums->list); 4479 list_del_init(&sums->list);
4419 4480
4420 sector_sum = sums->sums; 4481 sums->bytenr = disk_bytenr;
4421 sums->bytenr = ordered->start; 4482 disk_bytenr += sums->len;
4422
4423 offset = 0;
4424 while (offset < sums->len) {
4425 sector_sum->bytenr += ordered->start - disk_bytenr;
4426 sector_sum++;
4427 offset += root->sectorsize;
4428 }
4429 4483
4430 btrfs_add_ordered_sum(inode, ordered, sums); 4484 btrfs_add_ordered_sum(inode, ordered, sums);
4431 } 4485 }
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 5bf1ed57f178..ffb1036ef10d 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -64,52 +64,59 @@ void btrfs_read_root_item(struct extent_buffer *eb, int slot,
64} 64}
65 65
66/* 66/*
67 * lookup the root with the highest offset for a given objectid. The key we do 67 * btrfs_find_root - lookup the root by the key.
68 * find is copied into 'key'. If we find something return 0, otherwise 1, < 0 68 * root: the root of the root tree
69 * on error. 69 * search_key: the key to search
70 * path: the path we search
71 * root_item: the root item of the tree we look for
72 * root_key: the reak key of the tree we look for
73 *
74 * If ->offset of 'seach_key' is -1ULL, it means we are not sure the offset
75 * of the search key, just lookup the root with the highest offset for a
76 * given objectid.
77 *
78 * If we find something return 0, otherwise > 0, < 0 on error.
70 */ 79 */
71int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, 80int btrfs_find_root(struct btrfs_root *root, struct btrfs_key *search_key,
72 struct btrfs_root_item *item, struct btrfs_key *key) 81 struct btrfs_path *path, struct btrfs_root_item *root_item,
82 struct btrfs_key *root_key)
73{ 83{
74 struct btrfs_path *path;
75 struct btrfs_key search_key;
76 struct btrfs_key found_key; 84 struct btrfs_key found_key;
77 struct extent_buffer *l; 85 struct extent_buffer *l;
78 int ret; 86 int ret;
79 int slot; 87 int slot;
80 88
81 search_key.objectid = objectid; 89 ret = btrfs_search_slot(NULL, root, search_key, path, 0, 0);
82 search_key.type = BTRFS_ROOT_ITEM_KEY;
83 search_key.offset = (u64)-1;
84
85 path = btrfs_alloc_path();
86 if (!path)
87 return -ENOMEM;
88 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
89 if (ret < 0) 90 if (ret < 0)
90 goto out; 91 return ret;
91 92
92 BUG_ON(ret == 0); 93 if (search_key->offset != -1ULL) { /* the search key is exact */
93 if (path->slots[0] == 0) { 94 if (ret > 0)
94 ret = 1; 95 goto out;
95 goto out; 96 } else {
97 BUG_ON(ret == 0); /* Logical error */
98 if (path->slots[0] == 0)
99 goto out;
100 path->slots[0]--;
101 ret = 0;
96 } 102 }
103
97 l = path->nodes[0]; 104 l = path->nodes[0];
98 slot = path->slots[0] - 1; 105 slot = path->slots[0];
106
99 btrfs_item_key_to_cpu(l, &found_key, slot); 107 btrfs_item_key_to_cpu(l, &found_key, slot);
100 if (found_key.objectid != objectid || 108 if (found_key.objectid != search_key->objectid ||
101 found_key.type != BTRFS_ROOT_ITEM_KEY) { 109 found_key.type != BTRFS_ROOT_ITEM_KEY) {
102 ret = 1; 110 ret = 1;
103 goto out; 111 goto out;
104 } 112 }
105 if (item)
106 btrfs_read_root_item(l, slot, item);
107 if (key)
108 memcpy(key, &found_key, sizeof(found_key));
109 113
110 ret = 0; 114 if (root_item)
115 btrfs_read_root_item(l, slot, root_item);
116 if (root_key)
117 memcpy(root_key, &found_key, sizeof(found_key));
111out: 118out:
112 btrfs_free_path(path); 119 btrfs_release_path(path);
113 return ret; 120 return ret;
114} 121}
115 122
@@ -212,86 +219,6 @@ int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
212 return btrfs_insert_item(trans, root, key, item, sizeof(*item)); 219 return btrfs_insert_item(trans, root, key, item, sizeof(*item));
213} 220}
214 221
215/*
216 * at mount time we want to find all the old transaction snapshots that were in
217 * the process of being deleted if we crashed. This is any root item with an
218 * offset lower than the latest root. They need to be queued for deletion to
219 * finish what was happening when we crashed.
220 */
221int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid)
222{
223 struct btrfs_root *dead_root;
224 struct btrfs_root_item *ri;
225 struct btrfs_key key;
226 struct btrfs_key found_key;
227 struct btrfs_path *path;
228 int ret;
229 u32 nritems;
230 struct extent_buffer *leaf;
231 int slot;
232
233 key.objectid = objectid;
234 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
235 key.offset = 0;
236 path = btrfs_alloc_path();
237 if (!path)
238 return -ENOMEM;
239
240again:
241 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
242 if (ret < 0)
243 goto err;
244 while (1) {
245 leaf = path->nodes[0];
246 nritems = btrfs_header_nritems(leaf);
247 slot = path->slots[0];
248 if (slot >= nritems) {
249 ret = btrfs_next_leaf(root, path);
250 if (ret)
251 break;
252 leaf = path->nodes[0];
253 nritems = btrfs_header_nritems(leaf);
254 slot = path->slots[0];
255 }
256 btrfs_item_key_to_cpu(leaf, &key, slot);
257 if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY)
258 goto next;
259
260 if (key.objectid < objectid)
261 goto next;
262
263 if (key.objectid > objectid)
264 break;
265
266 ri = btrfs_item_ptr(leaf, slot, struct btrfs_root_item);
267 if (btrfs_disk_root_refs(leaf, ri) != 0)
268 goto next;
269
270 memcpy(&found_key, &key, sizeof(key));
271 key.offset++;
272 btrfs_release_path(path);
273 dead_root =
274 btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
275 &found_key);
276 if (IS_ERR(dead_root)) {
277 ret = PTR_ERR(dead_root);
278 goto err;
279 }
280
281 ret = btrfs_add_dead_root(dead_root);
282 if (ret)
283 goto err;
284 goto again;
285next:
286 slot++;
287 path->slots[0]++;
288 }
289 ret = 0;
290err:
291 btrfs_free_path(path);
292 return ret;
293}
294
295int btrfs_find_orphan_roots(struct btrfs_root *tree_root) 222int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
296{ 223{
297 struct extent_buffer *leaf; 224 struct extent_buffer *leaf;
@@ -301,6 +228,10 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
301 struct btrfs_root *root; 228 struct btrfs_root *root;
302 int err = 0; 229 int err = 0;
303 int ret; 230 int ret;
231 bool can_recover = true;
232
233 if (tree_root->fs_info->sb->s_flags & MS_RDONLY)
234 can_recover = false;
304 235
305 path = btrfs_alloc_path(); 236 path = btrfs_alloc_path();
306 if (!path) 237 if (!path)
@@ -340,20 +271,52 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
340 root_key.objectid = key.offset; 271 root_key.objectid = key.offset;
341 key.offset++; 272 key.offset++;
342 273
343 root = btrfs_read_fs_root_no_name(tree_root->fs_info, 274 root = btrfs_read_fs_root(tree_root, &root_key);
344 &root_key); 275 err = PTR_RET(root);
345 if (!IS_ERR(root)) 276 if (err && err != -ENOENT) {
277 break;
278 } else if (err == -ENOENT) {
279 struct btrfs_trans_handle *trans;
280
281 btrfs_release_path(path);
282
283 trans = btrfs_join_transaction(tree_root);
284 if (IS_ERR(trans)) {
285 err = PTR_ERR(trans);
286 btrfs_error(tree_root->fs_info, err,
287 "Failed to start trans to delete "
288 "orphan item");
289 break;
290 }
291 err = btrfs_del_orphan_item(trans, tree_root,
292 root_key.objectid);
293 btrfs_end_transaction(trans, tree_root);
294 if (err) {
295 btrfs_error(tree_root->fs_info, err,
296 "Failed to delete root orphan "
297 "item");
298 break;
299 }
346 continue; 300 continue;
301 }
347 302
348 ret = PTR_ERR(root); 303 if (btrfs_root_refs(&root->root_item) == 0) {
349 if (ret != -ENOENT) { 304 btrfs_add_dead_root(root);
350 err = ret; 305 continue;
306 }
307
308 err = btrfs_init_fs_root(root);
309 if (err) {
310 btrfs_free_fs_root(root);
351 break; 311 break;
352 } 312 }
353 313
354 ret = btrfs_find_dead_roots(tree_root, root_key.objectid); 314 root->orphan_item_inserted = 1;
355 if (ret) { 315
356 err = ret; 316 err = btrfs_insert_fs_root(root->fs_info, root);
317 if (err) {
318 BUG_ON(err == -EEXIST);
319 btrfs_free_fs_root(root);
357 break; 320 break;
358 } 321 }
359 } 322 }
@@ -368,8 +331,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
368{ 331{
369 struct btrfs_path *path; 332 struct btrfs_path *path;
370 int ret; 333 int ret;
371 struct btrfs_root_item *ri;
372 struct extent_buffer *leaf;
373 334
374 path = btrfs_alloc_path(); 335 path = btrfs_alloc_path();
375 if (!path) 336 if (!path)
@@ -379,8 +340,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
379 goto out; 340 goto out;
380 341
381 BUG_ON(ret != 0); 342 BUG_ON(ret != 0);
382 leaf = path->nodes[0];
383 ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item);
384 343
385 ret = btrfs_del_item(trans, root, path); 344 ret = btrfs_del_item(trans, root, path);
386out: 345out:
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 79bd479317cb..4ba2a69a60ad 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2126,8 +2126,7 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
2126 u8 *csum) 2126 u8 *csum)
2127{ 2127{
2128 struct btrfs_ordered_sum *sum = NULL; 2128 struct btrfs_ordered_sum *sum = NULL;
2129 int ret = 0; 2129 unsigned long index;
2130 unsigned long i;
2131 unsigned long num_sectors; 2130 unsigned long num_sectors;
2132 2131
2133 while (!list_empty(&sctx->csum_list)) { 2132 while (!list_empty(&sctx->csum_list)) {
@@ -2146,19 +2145,14 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
2146 if (!sum) 2145 if (!sum)
2147 return 0; 2146 return 0;
2148 2147
2148 index = ((u32)(logical - sum->bytenr)) / sctx->sectorsize;
2149 num_sectors = sum->len / sctx->sectorsize; 2149 num_sectors = sum->len / sctx->sectorsize;
2150 for (i = 0; i < num_sectors; ++i) { 2150 memcpy(csum, sum->sums + index, sctx->csum_size);
2151 if (sum->sums[i].bytenr == logical) { 2151 if (index == num_sectors - 1) {
2152 memcpy(csum, &sum->sums[i].sum, sctx->csum_size);
2153 ret = 1;
2154 break;
2155 }
2156 }
2157 if (ret && i == num_sectors - 1) {
2158 list_del(&sum->list); 2152 list_del(&sum->list);
2159 kfree(sum); 2153 kfree(sum);
2160 } 2154 }
2161 return ret; 2155 return 1;
2162} 2156}
2163 2157
2164/* scrub extent tries to collect up to 64 kB for each bio */ 2158/* scrub extent tries to collect up to 64 kB for each bio */
@@ -2505,6 +2499,7 @@ again:
2505 if (ret) 2499 if (ret)
2506 goto out; 2500 goto out;
2507 2501
2502 scrub_free_csums(sctx);
2508 if (extent_logical + extent_len < 2503 if (extent_logical + extent_len <
2509 key.objectid + bytes) { 2504 key.objectid + bytes) {
2510 logical += increment; 2505 logical += increment;
@@ -3204,16 +3199,18 @@ out:
3204 3199
3205static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx) 3200static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
3206{ 3201{
3207 unsigned long index;
3208 struct scrub_copy_nocow_ctx *nocow_ctx = ctx; 3202 struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
3209 int ret = 0; 3203 struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
3210 struct btrfs_key key; 3204 struct btrfs_key key;
3211 struct inode *inode = NULL; 3205 struct inode *inode;
3206 struct page *page;
3212 struct btrfs_root *local_root; 3207 struct btrfs_root *local_root;
3213 u64 physical_for_dev_replace; 3208 u64 physical_for_dev_replace;
3214 u64 len; 3209 u64 len;
3215 struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info; 3210 unsigned long index;
3216 int srcu_index; 3211 int srcu_index;
3212 int ret;
3213 int err;
3217 3214
3218 key.objectid = root; 3215 key.objectid = root;
3219 key.type = BTRFS_ROOT_ITEM_KEY; 3216 key.type = BTRFS_ROOT_ITEM_KEY;
@@ -3227,6 +3224,11 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
3227 return PTR_ERR(local_root); 3224 return PTR_ERR(local_root);
3228 } 3225 }
3229 3226
3227 if (btrfs_root_refs(&local_root->root_item) == 0) {
3228 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
3229 return -ENOENT;
3230 }
3231
3230 key.type = BTRFS_INODE_ITEM_KEY; 3232 key.type = BTRFS_INODE_ITEM_KEY;
3231 key.objectid = inum; 3233 key.objectid = inum;
3232 key.offset = 0; 3234 key.offset = 0;
@@ -3235,19 +3237,21 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
3235 if (IS_ERR(inode)) 3237 if (IS_ERR(inode))
3236 return PTR_ERR(inode); 3238 return PTR_ERR(inode);
3237 3239
3240 /* Avoid truncate/dio/punch hole.. */
3241 mutex_lock(&inode->i_mutex);
3242 inode_dio_wait(inode);
3243
3244 ret = 0;
3238 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; 3245 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
3239 len = nocow_ctx->len; 3246 len = nocow_ctx->len;
3240 while (len >= PAGE_CACHE_SIZE) { 3247 while (len >= PAGE_CACHE_SIZE) {
3241 struct page *page = NULL;
3242 int ret_sub;
3243
3244 index = offset >> PAGE_CACHE_SHIFT; 3248 index = offset >> PAGE_CACHE_SHIFT;
3245 3249again:
3246 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); 3250 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
3247 if (!page) { 3251 if (!page) {
3248 pr_err("find_or_create_page() failed\n"); 3252 pr_err("find_or_create_page() failed\n");
3249 ret = -ENOMEM; 3253 ret = -ENOMEM;
3250 goto next_page; 3254 goto out;
3251 } 3255 }
3252 3256
3253 if (PageUptodate(page)) { 3257 if (PageUptodate(page)) {
@@ -3255,39 +3259,49 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
3255 goto next_page; 3259 goto next_page;
3256 } else { 3260 } else {
3257 ClearPageError(page); 3261 ClearPageError(page);
3258 ret_sub = extent_read_full_page(&BTRFS_I(inode)-> 3262 err = extent_read_full_page(&BTRFS_I(inode)->
3259 io_tree, 3263 io_tree,
3260 page, btrfs_get_extent, 3264 page, btrfs_get_extent,
3261 nocow_ctx->mirror_num); 3265 nocow_ctx->mirror_num);
3262 if (ret_sub) { 3266 if (err) {
3263 ret = ret_sub; 3267 ret = err;
3264 goto next_page; 3268 goto next_page;
3265 } 3269 }
3266 wait_on_page_locked(page); 3270
3271 lock_page(page);
3272 /*
3273 * If the page has been remove from the page cache,
3274 * the data on it is meaningless, because it may be
3275 * old one, the new data may be written into the new
3276 * page in the page cache.
3277 */
3278 if (page->mapping != inode->i_mapping) {
3279 page_cache_release(page);
3280 goto again;
3281 }
3267 if (!PageUptodate(page)) { 3282 if (!PageUptodate(page)) {
3268 ret = -EIO; 3283 ret = -EIO;
3269 goto next_page; 3284 goto next_page;
3270 } 3285 }
3271 } 3286 }
3272 ret_sub = write_page_nocow(nocow_ctx->sctx, 3287 err = write_page_nocow(nocow_ctx->sctx,
3273 physical_for_dev_replace, page); 3288 physical_for_dev_replace, page);
3274 if (ret_sub) { 3289 if (err)
3275 ret = ret_sub; 3290 ret = err;
3276 goto next_page;
3277 }
3278
3279next_page: 3291next_page:
3280 if (page) { 3292 unlock_page(page);
3281 unlock_page(page); 3293 page_cache_release(page);
3282 put_page(page); 3294
3283 } 3295 if (ret)
3296 break;
3297
3284 offset += PAGE_CACHE_SIZE; 3298 offset += PAGE_CACHE_SIZE;
3285 physical_for_dev_replace += PAGE_CACHE_SIZE; 3299 physical_for_dev_replace += PAGE_CACHE_SIZE;
3286 len -= PAGE_CACHE_SIZE; 3300 len -= PAGE_CACHE_SIZE;
3287 } 3301 }
3288 3302out:
3289 if (inode) 3303 mutex_unlock(&inode->i_mutex);
3290 iput(inode); 3304 iput(inode);
3291 return ret; 3305 return ret;
3292} 3306}
3293 3307
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index ff40f1c00ce3..d3f3b43cae0b 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -158,7 +158,7 @@ static void fs_path_reset(struct fs_path *p)
158 } 158 }
159} 159}
160 160
161static struct fs_path *fs_path_alloc(struct send_ctx *sctx) 161static struct fs_path *fs_path_alloc(void)
162{ 162{
163 struct fs_path *p; 163 struct fs_path *p;
164 164
@@ -173,11 +173,11 @@ static struct fs_path *fs_path_alloc(struct send_ctx *sctx)
173 return p; 173 return p;
174} 174}
175 175
176static struct fs_path *fs_path_alloc_reversed(struct send_ctx *sctx) 176static struct fs_path *fs_path_alloc_reversed(void)
177{ 177{
178 struct fs_path *p; 178 struct fs_path *p;
179 179
180 p = fs_path_alloc(sctx); 180 p = fs_path_alloc();
181 if (!p) 181 if (!p)
182 return NULL; 182 return NULL;
183 p->reversed = 1; 183 p->reversed = 1;
@@ -185,7 +185,7 @@ static struct fs_path *fs_path_alloc_reversed(struct send_ctx *sctx)
185 return p; 185 return p;
186} 186}
187 187
188static void fs_path_free(struct send_ctx *sctx, struct fs_path *p) 188static void fs_path_free(struct fs_path *p)
189{ 189{
190 if (!p) 190 if (!p)
191 return; 191 return;
@@ -753,8 +753,7 @@ typedef int (*iterate_inode_ref_t)(int num, u64 dir, int index,
753 * 753 *
754 * path must point to the INODE_REF or INODE_EXTREF when called. 754 * path must point to the INODE_REF or INODE_EXTREF when called.
755 */ 755 */
756static int iterate_inode_ref(struct send_ctx *sctx, 756static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
757 struct btrfs_root *root, struct btrfs_path *path,
758 struct btrfs_key *found_key, int resolve, 757 struct btrfs_key *found_key, int resolve,
759 iterate_inode_ref_t iterate, void *ctx) 758 iterate_inode_ref_t iterate, void *ctx)
760{ 759{
@@ -777,13 +776,13 @@ static int iterate_inode_ref(struct send_ctx *sctx,
777 unsigned long elem_size; 776 unsigned long elem_size;
778 unsigned long ptr; 777 unsigned long ptr;
779 778
780 p = fs_path_alloc_reversed(sctx); 779 p = fs_path_alloc_reversed();
781 if (!p) 780 if (!p)
782 return -ENOMEM; 781 return -ENOMEM;
783 782
784 tmp_path = alloc_path_for_send(); 783 tmp_path = alloc_path_for_send();
785 if (!tmp_path) { 784 if (!tmp_path) {
786 fs_path_free(sctx, p); 785 fs_path_free(p);
787 return -ENOMEM; 786 return -ENOMEM;
788 } 787 }
789 788
@@ -858,7 +857,7 @@ static int iterate_inode_ref(struct send_ctx *sctx,
858 857
859out: 858out:
860 btrfs_free_path(tmp_path); 859 btrfs_free_path(tmp_path);
861 fs_path_free(sctx, p); 860 fs_path_free(p);
862 return ret; 861 return ret;
863} 862}
864 863
@@ -874,8 +873,7 @@ typedef int (*iterate_dir_item_t)(int num, struct btrfs_key *di_key,
874 * 873 *
875 * path must point to the dir item when called. 874 * path must point to the dir item when called.
876 */ 875 */
877static int iterate_dir_item(struct send_ctx *sctx, 876static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
878 struct btrfs_root *root, struct btrfs_path *path,
879 struct btrfs_key *found_key, 877 struct btrfs_key *found_key,
880 iterate_dir_item_t iterate, void *ctx) 878 iterate_dir_item_t iterate, void *ctx)
881{ 879{
@@ -990,7 +988,7 @@ static int __copy_first_ref(int num, u64 dir, int index,
990 * Retrieve the first path of an inode. If an inode has more then one 988 * Retrieve the first path of an inode. If an inode has more then one
991 * ref/hardlink, this is ignored. 989 * ref/hardlink, this is ignored.
992 */ 990 */
993static int get_inode_path(struct send_ctx *sctx, struct btrfs_root *root, 991static int get_inode_path(struct btrfs_root *root,
994 u64 ino, struct fs_path *path) 992 u64 ino, struct fs_path *path)
995{ 993{
996 int ret; 994 int ret;
@@ -1022,8 +1020,8 @@ static int get_inode_path(struct send_ctx *sctx, struct btrfs_root *root,
1022 goto out; 1020 goto out;
1023 } 1021 }
1024 1022
1025 ret = iterate_inode_ref(sctx, root, p, &found_key, 1, 1023 ret = iterate_inode_ref(root, p, &found_key, 1,
1026 __copy_first_ref, path); 1024 __copy_first_ref, path);
1027 if (ret < 0) 1025 if (ret < 0)
1028 goto out; 1026 goto out;
1029 ret = 0; 1027 ret = 0;
@@ -1314,8 +1312,7 @@ out:
1314 return ret; 1312 return ret;
1315} 1313}
1316 1314
1317static int read_symlink(struct send_ctx *sctx, 1315static int read_symlink(struct btrfs_root *root,
1318 struct btrfs_root *root,
1319 u64 ino, 1316 u64 ino,
1320 struct fs_path *dest) 1317 struct fs_path *dest)
1321{ 1318{
@@ -1562,8 +1559,7 @@ out:
1562 * Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir, 1559 * Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir,
1563 * generation of the parent dir and the name of the dir entry. 1560 * generation of the parent dir and the name of the dir entry.
1564 */ 1561 */
1565static int get_first_ref(struct send_ctx *sctx, 1562static int get_first_ref(struct btrfs_root *root, u64 ino,
1566 struct btrfs_root *root, u64 ino,
1567 u64 *dir, u64 *dir_gen, struct fs_path *name) 1563 u64 *dir, u64 *dir_gen, struct fs_path *name)
1568{ 1564{
1569 int ret; 1565 int ret;
@@ -1628,8 +1624,7 @@ out:
1628 return ret; 1624 return ret;
1629} 1625}
1630 1626
1631static int is_first_ref(struct send_ctx *sctx, 1627static int is_first_ref(struct btrfs_root *root,
1632 struct btrfs_root *root,
1633 u64 ino, u64 dir, 1628 u64 ino, u64 dir,
1634 const char *name, int name_len) 1629 const char *name, int name_len)
1635{ 1630{
@@ -1638,11 +1633,11 @@ static int is_first_ref(struct send_ctx *sctx,
1638 u64 tmp_dir; 1633 u64 tmp_dir;
1639 u64 tmp_dir_gen; 1634 u64 tmp_dir_gen;
1640 1635
1641 tmp_name = fs_path_alloc(sctx); 1636 tmp_name = fs_path_alloc();
1642 if (!tmp_name) 1637 if (!tmp_name)
1643 return -ENOMEM; 1638 return -ENOMEM;
1644 1639
1645 ret = get_first_ref(sctx, root, ino, &tmp_dir, &tmp_dir_gen, tmp_name); 1640 ret = get_first_ref(root, ino, &tmp_dir, &tmp_dir_gen, tmp_name);
1646 if (ret < 0) 1641 if (ret < 0)
1647 goto out; 1642 goto out;
1648 1643
@@ -1654,7 +1649,7 @@ static int is_first_ref(struct send_ctx *sctx,
1654 ret = !memcmp(tmp_name->start, name, name_len); 1649 ret = !memcmp(tmp_name->start, name, name_len);
1655 1650
1656out: 1651out:
1657 fs_path_free(sctx, tmp_name); 1652 fs_path_free(tmp_name);
1658 return ret; 1653 return ret;
1659} 1654}
1660 1655
@@ -1783,11 +1778,11 @@ static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
1783 if (!sctx->parent_root) 1778 if (!sctx->parent_root)
1784 goto out; 1779 goto out;
1785 1780
1786 name = fs_path_alloc(sctx); 1781 name = fs_path_alloc();
1787 if (!name) 1782 if (!name)
1788 return -ENOMEM; 1783 return -ENOMEM;
1789 1784
1790 ret = get_first_ref(sctx, sctx->parent_root, ino, &dir, &dir_gen, name); 1785 ret = get_first_ref(sctx->parent_root, ino, &dir, &dir_gen, name);
1791 if (ret < 0) 1786 if (ret < 0)
1792 goto out; 1787 goto out;
1793 1788
@@ -1795,7 +1790,7 @@ static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
1795 name->start, fs_path_len(name)); 1790 name->start, fs_path_len(name));
1796 1791
1797out: 1792out:
1798 fs_path_free(sctx, name); 1793 fs_path_free(name);
1799 return ret; 1794 return ret;
1800} 1795}
1801 1796
@@ -1979,11 +1974,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
1979 * send_root or parent_root for ref lookup. 1974 * send_root or parent_root for ref lookup.
1980 */ 1975 */
1981 if (ino < sctx->send_progress) 1976 if (ino < sctx->send_progress)
1982 ret = get_first_ref(sctx, sctx->send_root, ino, 1977 ret = get_first_ref(sctx->send_root, ino,
1983 parent_ino, parent_gen, dest); 1978 parent_ino, parent_gen, dest);
1984 else 1979 else
1985 ret = get_first_ref(sctx, sctx->parent_root, ino, 1980 ret = get_first_ref(sctx->parent_root, ino,
1986 parent_ino, parent_gen, dest); 1981 parent_ino, parent_gen, dest);
1987 if (ret < 0) 1982 if (ret < 0)
1988 goto out; 1983 goto out;
1989 1984
@@ -2070,7 +2065,7 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
2070 u64 parent_gen = 0; 2065 u64 parent_gen = 0;
2071 int stop = 0; 2066 int stop = 0;
2072 2067
2073 name = fs_path_alloc(sctx); 2068 name = fs_path_alloc();
2074 if (!name) { 2069 if (!name) {
2075 ret = -ENOMEM; 2070 ret = -ENOMEM;
2076 goto out; 2071 goto out;
@@ -2098,7 +2093,7 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
2098 } 2093 }
2099 2094
2100out: 2095out:
2101 fs_path_free(sctx, name); 2096 fs_path_free(name);
2102 if (!ret) 2097 if (!ret)
2103 fs_path_unreverse(dest); 2098 fs_path_unreverse(dest);
2104 return ret; 2099 return ret;
@@ -2263,7 +2258,7 @@ static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size)
2263 2258
2264verbose_printk("btrfs: send_truncate %llu size=%llu\n", ino, size); 2259verbose_printk("btrfs: send_truncate %llu size=%llu\n", ino, size);
2265 2260
2266 p = fs_path_alloc(sctx); 2261 p = fs_path_alloc();
2267 if (!p) 2262 if (!p)
2268 return -ENOMEM; 2263 return -ENOMEM;
2269 2264
@@ -2281,7 +2276,7 @@ verbose_printk("btrfs: send_truncate %llu size=%llu\n", ino, size);
2281 2276
2282tlv_put_failure: 2277tlv_put_failure:
2283out: 2278out:
2284 fs_path_free(sctx, p); 2279 fs_path_free(p);
2285 return ret; 2280 return ret;
2286} 2281}
2287 2282
@@ -2292,7 +2287,7 @@ static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode)
2292 2287
2293verbose_printk("btrfs: send_chmod %llu mode=%llu\n", ino, mode); 2288verbose_printk("btrfs: send_chmod %llu mode=%llu\n", ino, mode);
2294 2289
2295 p = fs_path_alloc(sctx); 2290 p = fs_path_alloc();
2296 if (!p) 2291 if (!p)
2297 return -ENOMEM; 2292 return -ENOMEM;
2298 2293
@@ -2310,7 +2305,7 @@ verbose_printk("btrfs: send_chmod %llu mode=%llu\n", ino, mode);
2310 2305
2311tlv_put_failure: 2306tlv_put_failure:
2312out: 2307out:
2313 fs_path_free(sctx, p); 2308 fs_path_free(p);
2314 return ret; 2309 return ret;
2315} 2310}
2316 2311
@@ -2321,7 +2316,7 @@ static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid)
2321 2316
2322verbose_printk("btrfs: send_chown %llu uid=%llu, gid=%llu\n", ino, uid, gid); 2317verbose_printk("btrfs: send_chown %llu uid=%llu, gid=%llu\n", ino, uid, gid);
2323 2318
2324 p = fs_path_alloc(sctx); 2319 p = fs_path_alloc();
2325 if (!p) 2320 if (!p)
2326 return -ENOMEM; 2321 return -ENOMEM;
2327 2322
@@ -2340,7 +2335,7 @@ verbose_printk("btrfs: send_chown %llu uid=%llu, gid=%llu\n", ino, uid, gid);
2340 2335
2341tlv_put_failure: 2336tlv_put_failure:
2342out: 2337out:
2343 fs_path_free(sctx, p); 2338 fs_path_free(p);
2344 return ret; 2339 return ret;
2345} 2340}
2346 2341
@@ -2356,7 +2351,7 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
2356 2351
2357verbose_printk("btrfs: send_utimes %llu\n", ino); 2352verbose_printk("btrfs: send_utimes %llu\n", ino);
2358 2353
2359 p = fs_path_alloc(sctx); 2354 p = fs_path_alloc();
2360 if (!p) 2355 if (!p)
2361 return -ENOMEM; 2356 return -ENOMEM;
2362 2357
@@ -2397,7 +2392,7 @@ verbose_printk("btrfs: send_utimes %llu\n", ino);
2397 2392
2398tlv_put_failure: 2393tlv_put_failure:
2399out: 2394out:
2400 fs_path_free(sctx, p); 2395 fs_path_free(p);
2401 btrfs_free_path(path); 2396 btrfs_free_path(path);
2402 return ret; 2397 return ret;
2403} 2398}
@@ -2418,7 +2413,7 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino)
2418 2413
2419verbose_printk("btrfs: send_create_inode %llu\n", ino); 2414verbose_printk("btrfs: send_create_inode %llu\n", ino);
2420 2415
2421 p = fs_path_alloc(sctx); 2416 p = fs_path_alloc();
2422 if (!p) 2417 if (!p)
2423 return -ENOMEM; 2418 return -ENOMEM;
2424 2419
@@ -2459,7 +2454,7 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino);
2459 2454
2460 if (S_ISLNK(mode)) { 2455 if (S_ISLNK(mode)) {
2461 fs_path_reset(p); 2456 fs_path_reset(p);
2462 ret = read_symlink(sctx, sctx->send_root, ino, p); 2457 ret = read_symlink(sctx->send_root, ino, p);
2463 if (ret < 0) 2458 if (ret < 0)
2464 goto out; 2459 goto out;
2465 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p); 2460 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p);
@@ -2476,7 +2471,7 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino);
2476 2471
2477tlv_put_failure: 2472tlv_put_failure:
2478out: 2473out:
2479 fs_path_free(sctx, p); 2474 fs_path_free(p);
2480 return ret; 2475 return ret;
2481} 2476}
2482 2477
@@ -2615,13 +2610,13 @@ static int record_ref(struct list_head *head, u64 dir,
2615 return 0; 2610 return 0;
2616} 2611}
2617 2612
2618static void __free_recorded_refs(struct send_ctx *sctx, struct list_head *head) 2613static void __free_recorded_refs(struct list_head *head)
2619{ 2614{
2620 struct recorded_ref *cur; 2615 struct recorded_ref *cur;
2621 2616
2622 while (!list_empty(head)) { 2617 while (!list_empty(head)) {
2623 cur = list_entry(head->next, struct recorded_ref, list); 2618 cur = list_entry(head->next, struct recorded_ref, list);
2624 fs_path_free(sctx, cur->full_path); 2619 fs_path_free(cur->full_path);
2625 list_del(&cur->list); 2620 list_del(&cur->list);
2626 kfree(cur); 2621 kfree(cur);
2627 } 2622 }
@@ -2629,8 +2624,8 @@ static void __free_recorded_refs(struct send_ctx *sctx, struct list_head *head)
2629 2624
2630static void free_recorded_refs(struct send_ctx *sctx) 2625static void free_recorded_refs(struct send_ctx *sctx)
2631{ 2626{
2632 __free_recorded_refs(sctx, &sctx->new_refs); 2627 __free_recorded_refs(&sctx->new_refs);
2633 __free_recorded_refs(sctx, &sctx->deleted_refs); 2628 __free_recorded_refs(&sctx->deleted_refs);
2634} 2629}
2635 2630
2636/* 2631/*
@@ -2644,7 +2639,7 @@ static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen,
2644 int ret; 2639 int ret;
2645 struct fs_path *orphan; 2640 struct fs_path *orphan;
2646 2641
2647 orphan = fs_path_alloc(sctx); 2642 orphan = fs_path_alloc();
2648 if (!orphan) 2643 if (!orphan)
2649 return -ENOMEM; 2644 return -ENOMEM;
2650 2645
@@ -2655,7 +2650,7 @@ static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen,
2655 ret = send_rename(sctx, path, orphan); 2650 ret = send_rename(sctx, path, orphan);
2656 2651
2657out: 2652out:
2658 fs_path_free(sctx, orphan); 2653 fs_path_free(orphan);
2659 return ret; 2654 return ret;
2660} 2655}
2661 2656
@@ -2746,7 +2741,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
2746 */ 2741 */
2747 BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID); 2742 BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID);
2748 2743
2749 valid_path = fs_path_alloc(sctx); 2744 valid_path = fs_path_alloc();
2750 if (!valid_path) { 2745 if (!valid_path) {
2751 ret = -ENOMEM; 2746 ret = -ENOMEM;
2752 goto out; 2747 goto out;
@@ -2843,9 +2838,9 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
2843 if (ret < 0) 2838 if (ret < 0)
2844 goto out; 2839 goto out;
2845 if (ret) { 2840 if (ret) {
2846 ret = is_first_ref(sctx, sctx->parent_root, 2841 ret = is_first_ref(sctx->parent_root,
2847 ow_inode, cur->dir, cur->name, 2842 ow_inode, cur->dir, cur->name,
2848 cur->name_len); 2843 cur->name_len);
2849 if (ret < 0) 2844 if (ret < 0)
2850 goto out; 2845 goto out;
2851 if (ret) { 2846 if (ret) {
@@ -3024,7 +3019,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
3024out: 3019out:
3025 free_recorded_refs(sctx); 3020 free_recorded_refs(sctx);
3026 ulist_free(check_dirs); 3021 ulist_free(check_dirs);
3027 fs_path_free(sctx, valid_path); 3022 fs_path_free(valid_path);
3028 return ret; 3023 return ret;
3029} 3024}
3030 3025
@@ -3037,7 +3032,7 @@ static int __record_new_ref(int num, u64 dir, int index,
3037 struct fs_path *p; 3032 struct fs_path *p;
3038 u64 gen; 3033 u64 gen;
3039 3034
3040 p = fs_path_alloc(sctx); 3035 p = fs_path_alloc();
3041 if (!p) 3036 if (!p)
3042 return -ENOMEM; 3037 return -ENOMEM;
3043 3038
@@ -3057,7 +3052,7 @@ static int __record_new_ref(int num, u64 dir, int index,
3057 3052
3058out: 3053out:
3059 if (ret) 3054 if (ret)
3060 fs_path_free(sctx, p); 3055 fs_path_free(p);
3061 return ret; 3056 return ret;
3062} 3057}
3063 3058
@@ -3070,7 +3065,7 @@ static int __record_deleted_ref(int num, u64 dir, int index,
3070 struct fs_path *p; 3065 struct fs_path *p;
3071 u64 gen; 3066 u64 gen;
3072 3067
3073 p = fs_path_alloc(sctx); 3068 p = fs_path_alloc();
3074 if (!p) 3069 if (!p)
3075 return -ENOMEM; 3070 return -ENOMEM;
3076 3071
@@ -3090,7 +3085,7 @@ static int __record_deleted_ref(int num, u64 dir, int index,
3090 3085
3091out: 3086out:
3092 if (ret) 3087 if (ret)
3093 fs_path_free(sctx, p); 3088 fs_path_free(p);
3094 return ret; 3089 return ret;
3095} 3090}
3096 3091
@@ -3098,8 +3093,8 @@ static int record_new_ref(struct send_ctx *sctx)
3098{ 3093{
3099 int ret; 3094 int ret;
3100 3095
3101 ret = iterate_inode_ref(sctx, sctx->send_root, sctx->left_path, 3096 ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
3102 sctx->cmp_key, 0, __record_new_ref, sctx); 3097 sctx->cmp_key, 0, __record_new_ref, sctx);
3103 if (ret < 0) 3098 if (ret < 0)
3104 goto out; 3099 goto out;
3105 ret = 0; 3100 ret = 0;
@@ -3112,8 +3107,8 @@ static int record_deleted_ref(struct send_ctx *sctx)
3112{ 3107{
3113 int ret; 3108 int ret;
3114 3109
3115 ret = iterate_inode_ref(sctx, sctx->parent_root, sctx->right_path, 3110 ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
3116 sctx->cmp_key, 0, __record_deleted_ref, sctx); 3111 sctx->cmp_key, 0, __record_deleted_ref, sctx);
3117 if (ret < 0) 3112 if (ret < 0)
3118 goto out; 3113 goto out;
3119 ret = 0; 3114 ret = 0;
@@ -3142,8 +3137,7 @@ static int __find_iref(int num, u64 dir, int index,
3142 return 0; 3137 return 0;
3143} 3138}
3144 3139
3145static int find_iref(struct send_ctx *sctx, 3140static int find_iref(struct btrfs_root *root,
3146 struct btrfs_root *root,
3147 struct btrfs_path *path, 3141 struct btrfs_path *path,
3148 struct btrfs_key *key, 3142 struct btrfs_key *key,
3149 u64 dir, struct fs_path *name) 3143 u64 dir, struct fs_path *name)
@@ -3155,7 +3149,7 @@ static int find_iref(struct send_ctx *sctx,
3155 ctx.name = name; 3149 ctx.name = name;
3156 ctx.found_idx = -1; 3150 ctx.found_idx = -1;
3157 3151
3158 ret = iterate_inode_ref(sctx, root, path, key, 0, __find_iref, &ctx); 3152 ret = iterate_inode_ref(root, path, key, 0, __find_iref, &ctx);
3159 if (ret < 0) 3153 if (ret < 0)
3160 return ret; 3154 return ret;
3161 3155
@@ -3172,7 +3166,7 @@ static int __record_changed_new_ref(int num, u64 dir, int index,
3172 int ret; 3166 int ret;
3173 struct send_ctx *sctx = ctx; 3167 struct send_ctx *sctx = ctx;
3174 3168
3175 ret = find_iref(sctx, sctx->parent_root, sctx->right_path, 3169 ret = find_iref(sctx->parent_root, sctx->right_path,
3176 sctx->cmp_key, dir, name); 3170 sctx->cmp_key, dir, name);
3177 if (ret == -ENOENT) 3171 if (ret == -ENOENT)
3178 ret = __record_new_ref(num, dir, index, name, sctx); 3172 ret = __record_new_ref(num, dir, index, name, sctx);
@@ -3189,7 +3183,7 @@ static int __record_changed_deleted_ref(int num, u64 dir, int index,
3189 int ret; 3183 int ret;
3190 struct send_ctx *sctx = ctx; 3184 struct send_ctx *sctx = ctx;
3191 3185
3192 ret = find_iref(sctx, sctx->send_root, sctx->left_path, sctx->cmp_key, 3186 ret = find_iref(sctx->send_root, sctx->left_path, sctx->cmp_key,
3193 dir, name); 3187 dir, name);
3194 if (ret == -ENOENT) 3188 if (ret == -ENOENT)
3195 ret = __record_deleted_ref(num, dir, index, name, sctx); 3189 ret = __record_deleted_ref(num, dir, index, name, sctx);
@@ -3203,11 +3197,11 @@ static int record_changed_ref(struct send_ctx *sctx)
3203{ 3197{
3204 int ret = 0; 3198 int ret = 0;
3205 3199
3206 ret = iterate_inode_ref(sctx, sctx->send_root, sctx->left_path, 3200 ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
3207 sctx->cmp_key, 0, __record_changed_new_ref, sctx); 3201 sctx->cmp_key, 0, __record_changed_new_ref, sctx);
3208 if (ret < 0) 3202 if (ret < 0)
3209 goto out; 3203 goto out;
3210 ret = iterate_inode_ref(sctx, sctx->parent_root, sctx->right_path, 3204 ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
3211 sctx->cmp_key, 0, __record_changed_deleted_ref, sctx); 3205 sctx->cmp_key, 0, __record_changed_deleted_ref, sctx);
3212 if (ret < 0) 3206 if (ret < 0)
3213 goto out; 3207 goto out;
@@ -3266,8 +3260,7 @@ static int process_all_refs(struct send_ctx *sctx,
3266 found_key.type != BTRFS_INODE_EXTREF_KEY)) 3260 found_key.type != BTRFS_INODE_EXTREF_KEY))
3267 break; 3261 break;
3268 3262
3269 ret = iterate_inode_ref(sctx, root, path, &found_key, 0, cb, 3263 ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx);
3270 sctx);
3271 btrfs_release_path(path); 3264 btrfs_release_path(path);
3272 if (ret < 0) 3265 if (ret < 0)
3273 goto out; 3266 goto out;
@@ -3335,7 +3328,7 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key,
3335 struct fs_path *p; 3328 struct fs_path *p;
3336 posix_acl_xattr_header dummy_acl; 3329 posix_acl_xattr_header dummy_acl;
3337 3330
3338 p = fs_path_alloc(sctx); 3331 p = fs_path_alloc();
3339 if (!p) 3332 if (!p)
3340 return -ENOMEM; 3333 return -ENOMEM;
3341 3334
@@ -3362,7 +3355,7 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key,
3362 ret = send_set_xattr(sctx, p, name, name_len, data, data_len); 3355 ret = send_set_xattr(sctx, p, name, name_len, data, data_len);
3363 3356
3364out: 3357out:
3365 fs_path_free(sctx, p); 3358 fs_path_free(p);
3366 return ret; 3359 return ret;
3367} 3360}
3368 3361
@@ -3375,7 +3368,7 @@ static int __process_deleted_xattr(int num, struct btrfs_key *di_key,
3375 struct send_ctx *sctx = ctx; 3368 struct send_ctx *sctx = ctx;
3376 struct fs_path *p; 3369 struct fs_path *p;
3377 3370
3378 p = fs_path_alloc(sctx); 3371 p = fs_path_alloc();
3379 if (!p) 3372 if (!p)
3380 return -ENOMEM; 3373 return -ENOMEM;
3381 3374
@@ -3386,7 +3379,7 @@ static int __process_deleted_xattr(int num, struct btrfs_key *di_key,
3386 ret = send_remove_xattr(sctx, p, name, name_len); 3379 ret = send_remove_xattr(sctx, p, name, name_len);
3387 3380
3388out: 3381out:
3389 fs_path_free(sctx, p); 3382 fs_path_free(p);
3390 return ret; 3383 return ret;
3391} 3384}
3392 3385
@@ -3394,8 +3387,8 @@ static int process_new_xattr(struct send_ctx *sctx)
3394{ 3387{
3395 int ret = 0; 3388 int ret = 0;
3396 3389
3397 ret = iterate_dir_item(sctx, sctx->send_root, sctx->left_path, 3390 ret = iterate_dir_item(sctx->send_root, sctx->left_path,
3398 sctx->cmp_key, __process_new_xattr, sctx); 3391 sctx->cmp_key, __process_new_xattr, sctx);
3399 3392
3400 return ret; 3393 return ret;
3401} 3394}
@@ -3404,8 +3397,8 @@ static int process_deleted_xattr(struct send_ctx *sctx)
3404{ 3397{
3405 int ret; 3398 int ret;
3406 3399
3407 ret = iterate_dir_item(sctx, sctx->parent_root, sctx->right_path, 3400 ret = iterate_dir_item(sctx->parent_root, sctx->right_path,
3408 sctx->cmp_key, __process_deleted_xattr, sctx); 3401 sctx->cmp_key, __process_deleted_xattr, sctx);
3409 3402
3410 return ret; 3403 return ret;
3411} 3404}
@@ -3429,17 +3422,15 @@ static int __find_xattr(int num, struct btrfs_key *di_key,
3429 strncmp(name, ctx->name, name_len) == 0) { 3422 strncmp(name, ctx->name, name_len) == 0) {
3430 ctx->found_idx = num; 3423 ctx->found_idx = num;
3431 ctx->found_data_len = data_len; 3424 ctx->found_data_len = data_len;
3432 ctx->found_data = kmalloc(data_len, GFP_NOFS); 3425 ctx->found_data = kmemdup(data, data_len, GFP_NOFS);
3433 if (!ctx->found_data) 3426 if (!ctx->found_data)
3434 return -ENOMEM; 3427 return -ENOMEM;
3435 memcpy(ctx->found_data, data, data_len);
3436 return 1; 3428 return 1;
3437 } 3429 }
3438 return 0; 3430 return 0;
3439} 3431}
3440 3432
3441static int find_xattr(struct send_ctx *sctx, 3433static int find_xattr(struct btrfs_root *root,
3442 struct btrfs_root *root,
3443 struct btrfs_path *path, 3434 struct btrfs_path *path,
3444 struct btrfs_key *key, 3435 struct btrfs_key *key,
3445 const char *name, int name_len, 3436 const char *name, int name_len,
@@ -3454,7 +3445,7 @@ static int find_xattr(struct send_ctx *sctx,
3454 ctx.found_data = NULL; 3445 ctx.found_data = NULL;
3455 ctx.found_data_len = 0; 3446 ctx.found_data_len = 0;
3456 3447
3457 ret = iterate_dir_item(sctx, root, path, key, __find_xattr, &ctx); 3448 ret = iterate_dir_item(root, path, key, __find_xattr, &ctx);
3458 if (ret < 0) 3449 if (ret < 0)
3459 return ret; 3450 return ret;
3460 3451
@@ -3480,9 +3471,9 @@ static int __process_changed_new_xattr(int num, struct btrfs_key *di_key,
3480 char *found_data = NULL; 3471 char *found_data = NULL;
3481 int found_data_len = 0; 3472 int found_data_len = 0;
3482 3473
3483 ret = find_xattr(sctx, sctx->parent_root, sctx->right_path, 3474 ret = find_xattr(sctx->parent_root, sctx->right_path,
3484 sctx->cmp_key, name, name_len, &found_data, 3475 sctx->cmp_key, name, name_len, &found_data,
3485 &found_data_len); 3476 &found_data_len);
3486 if (ret == -ENOENT) { 3477 if (ret == -ENOENT) {
3487 ret = __process_new_xattr(num, di_key, name, name_len, data, 3478 ret = __process_new_xattr(num, di_key, name, name_len, data,
3488 data_len, type, ctx); 3479 data_len, type, ctx);
@@ -3508,8 +3499,8 @@ static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key,
3508 int ret; 3499 int ret;
3509 struct send_ctx *sctx = ctx; 3500 struct send_ctx *sctx = ctx;
3510 3501
3511 ret = find_xattr(sctx, sctx->send_root, sctx->left_path, sctx->cmp_key, 3502 ret = find_xattr(sctx->send_root, sctx->left_path, sctx->cmp_key,
3512 name, name_len, NULL, NULL); 3503 name, name_len, NULL, NULL);
3513 if (ret == -ENOENT) 3504 if (ret == -ENOENT)
3514 ret = __process_deleted_xattr(num, di_key, name, name_len, data, 3505 ret = __process_deleted_xattr(num, di_key, name, name_len, data,
3515 data_len, type, ctx); 3506 data_len, type, ctx);
@@ -3523,11 +3514,11 @@ static int process_changed_xattr(struct send_ctx *sctx)
3523{ 3514{
3524 int ret = 0; 3515 int ret = 0;
3525 3516
3526 ret = iterate_dir_item(sctx, sctx->send_root, sctx->left_path, 3517 ret = iterate_dir_item(sctx->send_root, sctx->left_path,
3527 sctx->cmp_key, __process_changed_new_xattr, sctx); 3518 sctx->cmp_key, __process_changed_new_xattr, sctx);
3528 if (ret < 0) 3519 if (ret < 0)
3529 goto out; 3520 goto out;
3530 ret = iterate_dir_item(sctx, sctx->parent_root, sctx->right_path, 3521 ret = iterate_dir_item(sctx->parent_root, sctx->right_path,
3531 sctx->cmp_key, __process_changed_deleted_xattr, sctx); 3522 sctx->cmp_key, __process_changed_deleted_xattr, sctx);
3532 3523
3533out: 3524out:
@@ -3572,8 +3563,8 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
3572 goto out; 3563 goto out;
3573 } 3564 }
3574 3565
3575 ret = iterate_dir_item(sctx, root, path, &found_key, 3566 ret = iterate_dir_item(root, path, &found_key,
3576 __process_new_xattr, sctx); 3567 __process_new_xattr, sctx);
3577 if (ret < 0) 3568 if (ret < 0)
3578 goto out; 3569 goto out;
3579 3570
@@ -3598,7 +3589,7 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
3598 int num_read = 0; 3589 int num_read = 0;
3599 mm_segment_t old_fs; 3590 mm_segment_t old_fs;
3600 3591
3601 p = fs_path_alloc(sctx); 3592 p = fs_path_alloc();
3602 if (!p) 3593 if (!p)
3603 return -ENOMEM; 3594 return -ENOMEM;
3604 3595
@@ -3640,7 +3631,7 @@ verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len);
3640 3631
3641tlv_put_failure: 3632tlv_put_failure:
3642out: 3633out:
3643 fs_path_free(sctx, p); 3634 fs_path_free(p);
3644 set_fs(old_fs); 3635 set_fs(old_fs);
3645 if (ret < 0) 3636 if (ret < 0)
3646 return ret; 3637 return ret;
@@ -3663,7 +3654,7 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
3663 clone_root->root->objectid, clone_root->ino, 3654 clone_root->root->objectid, clone_root->ino,
3664 clone_root->offset); 3655 clone_root->offset);
3665 3656
3666 p = fs_path_alloc(sctx); 3657 p = fs_path_alloc();
3667 if (!p) 3658 if (!p)
3668 return -ENOMEM; 3659 return -ENOMEM;
3669 3660
@@ -3686,8 +3677,7 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
3686 goto out; 3677 goto out;
3687 ret = get_cur_path(sctx, clone_root->ino, gen, p); 3678 ret = get_cur_path(sctx, clone_root->ino, gen, p);
3688 } else { 3679 } else {
3689 ret = get_inode_path(sctx, clone_root->root, 3680 ret = get_inode_path(clone_root->root, clone_root->ino, p);
3690 clone_root->ino, p);
3691 } 3681 }
3692 if (ret < 0) 3682 if (ret < 0)
3693 goto out; 3683 goto out;
@@ -3704,7 +3694,7 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
3704 3694
3705tlv_put_failure: 3695tlv_put_failure:
3706out: 3696out:
3707 fs_path_free(sctx, p); 3697 fs_path_free(p);
3708 return ret; 3698 return ret;
3709} 3699}
3710 3700
@@ -3717,7 +3707,7 @@ static int send_update_extent(struct send_ctx *sctx,
3717 int ret = 0; 3707 int ret = 0;
3718 struct fs_path *p; 3708 struct fs_path *p;
3719 3709
3720 p = fs_path_alloc(sctx); 3710 p = fs_path_alloc();
3721 if (!p) 3711 if (!p)
3722 return -ENOMEM; 3712 return -ENOMEM;
3723 3713
@@ -3737,7 +3727,7 @@ static int send_update_extent(struct send_ctx *sctx,
3737 3727
3738tlv_put_failure: 3728tlv_put_failure:
3739out: 3729out:
3740 fs_path_free(sctx, p); 3730 fs_path_free(p);
3741 return ret; 3731 return ret;
3742} 3732}
3743 3733
@@ -4579,6 +4569,41 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
4579 send_root = BTRFS_I(file_inode(mnt_file))->root; 4569 send_root = BTRFS_I(file_inode(mnt_file))->root;
4580 fs_info = send_root->fs_info; 4570 fs_info = send_root->fs_info;
4581 4571
4572 /*
4573 * This is done when we lookup the root, it should already be complete
4574 * by the time we get here.
4575 */
4576 WARN_ON(send_root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE);
4577
4578 /*
4579 * If we just created this root we need to make sure that the orphan
4580 * cleanup has been done and committed since we search the commit root,
4581 * so check its commit root transid with our otransid and if they match
4582 * commit the transaction to make sure everything is updated.
4583 */
4584 down_read(&send_root->fs_info->extent_commit_sem);
4585 if (btrfs_header_generation(send_root->commit_root) ==
4586 btrfs_root_otransid(&send_root->root_item)) {
4587 struct btrfs_trans_handle *trans;
4588
4589 up_read(&send_root->fs_info->extent_commit_sem);
4590
4591 trans = btrfs_attach_transaction_barrier(send_root);
4592 if (IS_ERR(trans)) {
4593 if (PTR_ERR(trans) != -ENOENT) {
4594 ret = PTR_ERR(trans);
4595 goto out;
4596 }
4597 /* ENOENT means theres no transaction */
4598 } else {
4599 ret = btrfs_commit_transaction(trans, send_root);
4600 if (ret)
4601 goto out;
4602 }
4603 } else {
4604 up_read(&send_root->fs_info->extent_commit_sem);
4605 }
4606
4582 arg = memdup_user(arg_, sizeof(*arg)); 4607 arg = memdup_user(arg_, sizeof(*arg));
4583 if (IS_ERR(arg)) { 4608 if (IS_ERR(arg)) {
4584 ret = PTR_ERR(arg); 4609 ret = PTR_ERR(arg);
@@ -4663,10 +4688,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
4663 key.type = BTRFS_ROOT_ITEM_KEY; 4688 key.type = BTRFS_ROOT_ITEM_KEY;
4664 key.offset = (u64)-1; 4689 key.offset = (u64)-1;
4665 clone_root = btrfs_read_fs_root_no_name(fs_info, &key); 4690 clone_root = btrfs_read_fs_root_no_name(fs_info, &key);
4666 if (!clone_root) {
4667 ret = -EINVAL;
4668 goto out;
4669 }
4670 if (IS_ERR(clone_root)) { 4691 if (IS_ERR(clone_root)) {
4671 ret = PTR_ERR(clone_root); 4692 ret = PTR_ERR(clone_root);
4672 goto out; 4693 goto out;
@@ -4682,8 +4703,8 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
4682 key.type = BTRFS_ROOT_ITEM_KEY; 4703 key.type = BTRFS_ROOT_ITEM_KEY;
4683 key.offset = (u64)-1; 4704 key.offset = (u64)-1;
4684 sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key); 4705 sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key);
4685 if (!sctx->parent_root) { 4706 if (IS_ERR(sctx->parent_root)) {
4686 ret = -EINVAL; 4707 ret = PTR_ERR(sctx->parent_root);
4687 goto out; 4708 goto out;
4688 } 4709 }
4689 } 4710 }
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f0857e092a3c..8eb6191d86da 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -51,7 +51,6 @@
51#include "print-tree.h" 51#include "print-tree.h"
52#include "xattr.h" 52#include "xattr.h"
53#include "volumes.h" 53#include "volumes.h"
54#include "version.h"
55#include "export.h" 54#include "export.h"
56#include "compression.h" 55#include "compression.h"
57#include "rcu-string.h" 56#include "rcu-string.h"
@@ -266,6 +265,9 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
266 return; 265 return;
267 } 266 }
268 ACCESS_ONCE(trans->transaction->aborted) = errno; 267 ACCESS_ONCE(trans->transaction->aborted) = errno;
268 /* Wake up anybody who may be waiting on this transaction */
269 wake_up(&root->fs_info->transaction_wait);
270 wake_up(&root->fs_info->transaction_blocked_wait);
269 __btrfs_std_error(root->fs_info, function, line, errno, NULL); 271 __btrfs_std_error(root->fs_info, function, line, errno, NULL);
270} 272}
271/* 273/*
@@ -776,9 +778,6 @@ find_root:
776 if (IS_ERR(new_root)) 778 if (IS_ERR(new_root))
777 return ERR_CAST(new_root); 779 return ERR_CAST(new_root);
778 780
779 if (btrfs_root_refs(&new_root->root_item) == 0)
780 return ERR_PTR(-ENOENT);
781
782 dir_id = btrfs_root_dirid(&new_root->root_item); 781 dir_id = btrfs_root_dirid(&new_root->root_item);
783setup_root: 782setup_root:
784 location.objectid = dir_id; 783 location.objectid = dir_id;
@@ -866,7 +865,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
866 return 0; 865 return 0;
867 } 866 }
868 867
869 btrfs_wait_ordered_extents(root, 1); 868 btrfs_wait_all_ordered_extents(fs_info, 1);
870 869
871 trans = btrfs_attach_transaction_barrier(root); 870 trans = btrfs_attach_transaction_barrier(root);
872 if (IS_ERR(trans)) { 871 if (IS_ERR(trans)) {
@@ -1685,6 +1684,18 @@ static void btrfs_interface_exit(void)
1685 printk(KERN_INFO "btrfs: misc_deregister failed for control device\n"); 1684 printk(KERN_INFO "btrfs: misc_deregister failed for control device\n");
1686} 1685}
1687 1686
1687static void btrfs_print_info(void)
1688{
1689 printk(KERN_INFO "Btrfs loaded"
1690#ifdef CONFIG_BTRFS_DEBUG
1691 ", debug=on"
1692#endif
1693#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
1694 ", integrity-checker=on"
1695#endif
1696 "\n");
1697}
1698
1688static int __init init_btrfs_fs(void) 1699static int __init init_btrfs_fs(void)
1689{ 1700{
1690 int err; 1701 int err;
@@ -1733,11 +1744,9 @@ static int __init init_btrfs_fs(void)
1733 1744
1734 btrfs_init_lockdep(); 1745 btrfs_init_lockdep();
1735 1746
1736#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 1747 btrfs_print_info();
1737 btrfs_test_free_space_cache(); 1748 btrfs_test_free_space_cache();
1738#endif
1739 1749
1740 printk(KERN_INFO "%s loaded\n", BTRFS_BUILD_VERSION);
1741 return 0; 1750 return 0;
1742 1751
1743unregister_ioctl: 1752unregister_ioctl:
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 0544587d74f4..d58cce77fc6c 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -34,12 +34,43 @@
34 34
35#define BTRFS_ROOT_TRANS_TAG 0 35#define BTRFS_ROOT_TRANS_TAG 0
36 36
37static unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
38 [TRANS_STATE_RUNNING] = 0U,
39 [TRANS_STATE_BLOCKED] = (__TRANS_USERSPACE |
40 __TRANS_START),
41 [TRANS_STATE_COMMIT_START] = (__TRANS_USERSPACE |
42 __TRANS_START |
43 __TRANS_ATTACH),
44 [TRANS_STATE_COMMIT_DOING] = (__TRANS_USERSPACE |
45 __TRANS_START |
46 __TRANS_ATTACH |
47 __TRANS_JOIN),
48 [TRANS_STATE_UNBLOCKED] = (__TRANS_USERSPACE |
49 __TRANS_START |
50 __TRANS_ATTACH |
51 __TRANS_JOIN |
52 __TRANS_JOIN_NOLOCK),
53 [TRANS_STATE_COMPLETED] = (__TRANS_USERSPACE |
54 __TRANS_START |
55 __TRANS_ATTACH |
56 __TRANS_JOIN |
57 __TRANS_JOIN_NOLOCK),
58};
59
37static void put_transaction(struct btrfs_transaction *transaction) 60static void put_transaction(struct btrfs_transaction *transaction)
38{ 61{
39 WARN_ON(atomic_read(&transaction->use_count) == 0); 62 WARN_ON(atomic_read(&transaction->use_count) == 0);
40 if (atomic_dec_and_test(&transaction->use_count)) { 63 if (atomic_dec_and_test(&transaction->use_count)) {
41 BUG_ON(!list_empty(&transaction->list)); 64 BUG_ON(!list_empty(&transaction->list));
42 WARN_ON(transaction->delayed_refs.root.rb_node); 65 WARN_ON(transaction->delayed_refs.root.rb_node);
66 while (!list_empty(&transaction->pending_chunks)) {
67 struct extent_map *em;
68
69 em = list_first_entry(&transaction->pending_chunks,
70 struct extent_map, list);
71 list_del_init(&em->list);
72 free_extent_map(em);
73 }
43 kmem_cache_free(btrfs_transaction_cachep, transaction); 74 kmem_cache_free(btrfs_transaction_cachep, transaction);
44 } 75 }
45} 76}
@@ -50,18 +81,35 @@ static noinline void switch_commit_root(struct btrfs_root *root)
50 root->commit_root = btrfs_root_node(root); 81 root->commit_root = btrfs_root_node(root);
51} 82}
52 83
53static inline int can_join_transaction(struct btrfs_transaction *trans, 84static inline void extwriter_counter_inc(struct btrfs_transaction *trans,
54 int type) 85 unsigned int type)
86{
87 if (type & TRANS_EXTWRITERS)
88 atomic_inc(&trans->num_extwriters);
89}
90
91static inline void extwriter_counter_dec(struct btrfs_transaction *trans,
92 unsigned int type)
93{
94 if (type & TRANS_EXTWRITERS)
95 atomic_dec(&trans->num_extwriters);
96}
97
98static inline void extwriter_counter_init(struct btrfs_transaction *trans,
99 unsigned int type)
100{
101 atomic_set(&trans->num_extwriters, ((type & TRANS_EXTWRITERS) ? 1 : 0));
102}
103
104static inline int extwriter_counter_read(struct btrfs_transaction *trans)
55{ 105{
56 return !(trans->in_commit && 106 return atomic_read(&trans->num_extwriters);
57 type != TRANS_JOIN &&
58 type != TRANS_JOIN_NOLOCK);
59} 107}
60 108
61/* 109/*
62 * either allocate a new transaction or hop into the existing one 110 * either allocate a new transaction or hop into the existing one
63 */ 111 */
64static noinline int join_transaction(struct btrfs_root *root, int type) 112static noinline int join_transaction(struct btrfs_root *root, unsigned int type)
65{ 113{
66 struct btrfs_transaction *cur_trans; 114 struct btrfs_transaction *cur_trans;
67 struct btrfs_fs_info *fs_info = root->fs_info; 115 struct btrfs_fs_info *fs_info = root->fs_info;
@@ -74,32 +122,19 @@ loop:
74 return -EROFS; 122 return -EROFS;
75 } 123 }
76 124
77 if (fs_info->trans_no_join) {
78 /*
79 * If we are JOIN_NOLOCK we're already committing a current
80 * transaction, we just need a handle to deal with something
81 * when committing the transaction, such as inode cache and
82 * space cache. It is a special case.
83 */
84 if (type != TRANS_JOIN_NOLOCK) {
85 spin_unlock(&fs_info->trans_lock);
86 return -EBUSY;
87 }
88 }
89
90 cur_trans = fs_info->running_transaction; 125 cur_trans = fs_info->running_transaction;
91 if (cur_trans) { 126 if (cur_trans) {
92 if (cur_trans->aborted) { 127 if (cur_trans->aborted) {
93 spin_unlock(&fs_info->trans_lock); 128 spin_unlock(&fs_info->trans_lock);
94 return cur_trans->aborted; 129 return cur_trans->aborted;
95 } 130 }
96 if (!can_join_transaction(cur_trans, type)) { 131 if (btrfs_blocked_trans_types[cur_trans->state] & type) {
97 spin_unlock(&fs_info->trans_lock); 132 spin_unlock(&fs_info->trans_lock);
98 return -EBUSY; 133 return -EBUSY;
99 } 134 }
100 atomic_inc(&cur_trans->use_count); 135 atomic_inc(&cur_trans->use_count);
101 atomic_inc(&cur_trans->num_writers); 136 atomic_inc(&cur_trans->num_writers);
102 cur_trans->num_joined++; 137 extwriter_counter_inc(cur_trans, type);
103 spin_unlock(&fs_info->trans_lock); 138 spin_unlock(&fs_info->trans_lock);
104 return 0; 139 return 0;
105 } 140 }
@@ -112,6 +147,12 @@ loop:
112 if (type == TRANS_ATTACH) 147 if (type == TRANS_ATTACH)
113 return -ENOENT; 148 return -ENOENT;
114 149
150 /*
151 * JOIN_NOLOCK only happens during the transaction commit, so
152 * it is impossible that ->running_transaction is NULL
153 */
154 BUG_ON(type == TRANS_JOIN_NOLOCK);
155
115 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); 156 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
116 if (!cur_trans) 157 if (!cur_trans)
117 return -ENOMEM; 158 return -ENOMEM;
@@ -120,7 +161,7 @@ loop:
120 if (fs_info->running_transaction) { 161 if (fs_info->running_transaction) {
121 /* 162 /*
122 * someone started a transaction after we unlocked. Make sure 163 * someone started a transaction after we unlocked. Make sure
123 * to redo the trans_no_join checks above 164 * to redo the checks above
124 */ 165 */
125 kmem_cache_free(btrfs_transaction_cachep, cur_trans); 166 kmem_cache_free(btrfs_transaction_cachep, cur_trans);
126 goto loop; 167 goto loop;
@@ -131,17 +172,15 @@ loop:
131 } 172 }
132 173
133 atomic_set(&cur_trans->num_writers, 1); 174 atomic_set(&cur_trans->num_writers, 1);
134 cur_trans->num_joined = 0; 175 extwriter_counter_init(cur_trans, type);
135 init_waitqueue_head(&cur_trans->writer_wait); 176 init_waitqueue_head(&cur_trans->writer_wait);
136 init_waitqueue_head(&cur_trans->commit_wait); 177 init_waitqueue_head(&cur_trans->commit_wait);
137 cur_trans->in_commit = 0; 178 cur_trans->state = TRANS_STATE_RUNNING;
138 cur_trans->blocked = 0;
139 /* 179 /*
140 * One for this trans handle, one so it will live on until we 180 * One for this trans handle, one so it will live on until we
141 * commit the transaction. 181 * commit the transaction.
142 */ 182 */
143 atomic_set(&cur_trans->use_count, 2); 183 atomic_set(&cur_trans->use_count, 2);
144 cur_trans->commit_done = 0;
145 cur_trans->start_time = get_seconds(); 184 cur_trans->start_time = get_seconds();
146 185
147 cur_trans->delayed_refs.root = RB_ROOT; 186 cur_trans->delayed_refs.root = RB_ROOT;
@@ -164,7 +203,6 @@ loop:
164 "creating a fresh transaction\n"); 203 "creating a fresh transaction\n");
165 atomic64_set(&fs_info->tree_mod_seq, 0); 204 atomic64_set(&fs_info->tree_mod_seq, 0);
166 205
167 spin_lock_init(&cur_trans->commit_lock);
168 spin_lock_init(&cur_trans->delayed_refs.lock); 206 spin_lock_init(&cur_trans->delayed_refs.lock);
169 atomic_set(&cur_trans->delayed_refs.procs_running_refs, 0); 207 atomic_set(&cur_trans->delayed_refs.procs_running_refs, 0);
170 atomic_set(&cur_trans->delayed_refs.ref_seq, 0); 208 atomic_set(&cur_trans->delayed_refs.ref_seq, 0);
@@ -172,6 +210,7 @@ loop:
172 210
173 INIT_LIST_HEAD(&cur_trans->pending_snapshots); 211 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
174 INIT_LIST_HEAD(&cur_trans->ordered_operations); 212 INIT_LIST_HEAD(&cur_trans->ordered_operations);
213 INIT_LIST_HEAD(&cur_trans->pending_chunks);
175 list_add_tail(&cur_trans->list, &fs_info->trans_list); 214 list_add_tail(&cur_trans->list, &fs_info->trans_list);
176 extent_io_tree_init(&cur_trans->dirty_pages, 215 extent_io_tree_init(&cur_trans->dirty_pages,
177 fs_info->btree_inode->i_mapping); 216 fs_info->btree_inode->i_mapping);
@@ -269,6 +308,13 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
269 return 0; 308 return 0;
270} 309}
271 310
311static inline int is_transaction_blocked(struct btrfs_transaction *trans)
312{
313 return (trans->state >= TRANS_STATE_BLOCKED &&
314 trans->state < TRANS_STATE_UNBLOCKED &&
315 !trans->aborted);
316}
317
272/* wait for commit against the current transaction to become unblocked 318/* wait for commit against the current transaction to become unblocked
273 * when this is done, it is safe to start a new transaction, but the current 319 * when this is done, it is safe to start a new transaction, but the current
274 * transaction might not be fully on disk. 320 * transaction might not be fully on disk.
@@ -279,12 +325,13 @@ static void wait_current_trans(struct btrfs_root *root)
279 325
280 spin_lock(&root->fs_info->trans_lock); 326 spin_lock(&root->fs_info->trans_lock);
281 cur_trans = root->fs_info->running_transaction; 327 cur_trans = root->fs_info->running_transaction;
282 if (cur_trans && cur_trans->blocked) { 328 if (cur_trans && is_transaction_blocked(cur_trans)) {
283 atomic_inc(&cur_trans->use_count); 329 atomic_inc(&cur_trans->use_count);
284 spin_unlock(&root->fs_info->trans_lock); 330 spin_unlock(&root->fs_info->trans_lock);
285 331
286 wait_event(root->fs_info->transaction_wait, 332 wait_event(root->fs_info->transaction_wait,
287 !cur_trans->blocked); 333 cur_trans->state >= TRANS_STATE_UNBLOCKED ||
334 cur_trans->aborted);
288 put_transaction(cur_trans); 335 put_transaction(cur_trans);
289 } else { 336 } else {
290 spin_unlock(&root->fs_info->trans_lock); 337 spin_unlock(&root->fs_info->trans_lock);
@@ -307,7 +354,7 @@ static int may_wait_transaction(struct btrfs_root *root, int type)
307} 354}
308 355
309static struct btrfs_trans_handle * 356static struct btrfs_trans_handle *
310start_transaction(struct btrfs_root *root, u64 num_items, int type, 357start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
311 enum btrfs_reserve_flush_enum flush) 358 enum btrfs_reserve_flush_enum flush)
312{ 359{
313 struct btrfs_trans_handle *h; 360 struct btrfs_trans_handle *h;
@@ -320,7 +367,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, int type,
320 return ERR_PTR(-EROFS); 367 return ERR_PTR(-EROFS);
321 368
322 if (current->journal_info) { 369 if (current->journal_info) {
323 WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK); 370 WARN_ON(type & TRANS_EXTWRITERS);
324 h = current->journal_info; 371 h = current->journal_info;
325 h->use_count++; 372 h->use_count++;
326 WARN_ON(h->use_count > 2); 373 WARN_ON(h->use_count > 2);
@@ -366,7 +413,7 @@ again:
366 * If we are ATTACH, it means we just want to catch the current 413 * If we are ATTACH, it means we just want to catch the current
367 * transaction and commit it, so we needn't do sb_start_intwrite(). 414 * transaction and commit it, so we needn't do sb_start_intwrite().
368 */ 415 */
369 if (type < TRANS_JOIN_NOLOCK) 416 if (type & __TRANS_FREEZABLE)
370 sb_start_intwrite(root->fs_info->sb); 417 sb_start_intwrite(root->fs_info->sb);
371 418
372 if (may_wait_transaction(root, type)) 419 if (may_wait_transaction(root, type))
@@ -408,7 +455,8 @@ again:
408 INIT_LIST_HEAD(&h->new_bgs); 455 INIT_LIST_HEAD(&h->new_bgs);
409 456
410 smp_mb(); 457 smp_mb();
411 if (cur_trans->blocked && may_wait_transaction(root, type)) { 458 if (cur_trans->state >= TRANS_STATE_BLOCKED &&
459 may_wait_transaction(root, type)) {
412 btrfs_commit_transaction(h, root); 460 btrfs_commit_transaction(h, root);
413 goto again; 461 goto again;
414 } 462 }
@@ -429,7 +477,7 @@ got_it:
429 return h; 477 return h;
430 478
431join_fail: 479join_fail:
432 if (type < TRANS_JOIN_NOLOCK) 480 if (type & __TRANS_FREEZABLE)
433 sb_end_intwrite(root->fs_info->sb); 481 sb_end_intwrite(root->fs_info->sb);
434 kmem_cache_free(btrfs_trans_handle_cachep, h); 482 kmem_cache_free(btrfs_trans_handle_cachep, h);
435alloc_fail: 483alloc_fail:
@@ -490,7 +538,7 @@ struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
490} 538}
491 539
492/* 540/*
493 * btrfs_attach_transaction() - catch the running transaction 541 * btrfs_attach_transaction_barrier() - catch the running transaction
494 * 542 *
495 * It is similar to the above function, the differentia is this one 543 * It is similar to the above function, the differentia is this one
496 * will wait for all the inactive transactions until they fully 544 * will wait for all the inactive transactions until they fully
@@ -512,7 +560,7 @@ btrfs_attach_transaction_barrier(struct btrfs_root *root)
512static noinline void wait_for_commit(struct btrfs_root *root, 560static noinline void wait_for_commit(struct btrfs_root *root,
513 struct btrfs_transaction *commit) 561 struct btrfs_transaction *commit)
514{ 562{
515 wait_event(commit->commit_wait, commit->commit_done); 563 wait_event(commit->commit_wait, commit->state == TRANS_STATE_COMPLETED);
516} 564}
517 565
518int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) 566int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
@@ -548,8 +596,8 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
548 spin_lock(&root->fs_info->trans_lock); 596 spin_lock(&root->fs_info->trans_lock);
549 list_for_each_entry_reverse(t, &root->fs_info->trans_list, 597 list_for_each_entry_reverse(t, &root->fs_info->trans_list,
550 list) { 598 list) {
551 if (t->in_commit) { 599 if (t->state >= TRANS_STATE_COMMIT_START) {
552 if (t->commit_done) 600 if (t->state == TRANS_STATE_COMPLETED)
553 break; 601 break;
554 cur_trans = t; 602 cur_trans = t;
555 atomic_inc(&cur_trans->use_count); 603 atomic_inc(&cur_trans->use_count);
@@ -576,10 +624,11 @@ void btrfs_throttle(struct btrfs_root *root)
576static int should_end_transaction(struct btrfs_trans_handle *trans, 624static int should_end_transaction(struct btrfs_trans_handle *trans,
577 struct btrfs_root *root) 625 struct btrfs_root *root)
578{ 626{
579 int ret; 627 if (root->fs_info->global_block_rsv.space_info->full &&
628 btrfs_should_throttle_delayed_refs(trans, root))
629 return 1;
580 630
581 ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5); 631 return !!btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);
582 return ret ? 1 : 0;
583} 632}
584 633
585int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, 634int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
@@ -590,7 +639,8 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
590 int err; 639 int err;
591 640
592 smp_mb(); 641 smp_mb();
593 if (cur_trans->blocked || cur_trans->delayed_refs.flushing) 642 if (cur_trans->state >= TRANS_STATE_BLOCKED ||
643 cur_trans->delayed_refs.flushing)
594 return 1; 644 return 1;
595 645
596 updates = trans->delayed_ref_updates; 646 updates = trans->delayed_ref_updates;
@@ -609,7 +659,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
609{ 659{
610 struct btrfs_transaction *cur_trans = trans->transaction; 660 struct btrfs_transaction *cur_trans = trans->transaction;
611 struct btrfs_fs_info *info = root->fs_info; 661 struct btrfs_fs_info *info = root->fs_info;
612 int count = 0; 662 unsigned long cur = trans->delayed_ref_updates;
613 int lock = (trans->type != TRANS_JOIN_NOLOCK); 663 int lock = (trans->type != TRANS_JOIN_NOLOCK);
614 int err = 0; 664 int err = 0;
615 665
@@ -638,17 +688,11 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
638 if (!list_empty(&trans->new_bgs)) 688 if (!list_empty(&trans->new_bgs))
639 btrfs_create_pending_block_groups(trans, root); 689 btrfs_create_pending_block_groups(trans, root);
640 690
641 while (count < 1) { 691 trans->delayed_ref_updates = 0;
642 unsigned long cur = trans->delayed_ref_updates; 692 if (btrfs_should_throttle_delayed_refs(trans, root)) {
693 cur = max_t(unsigned long, cur, 1);
643 trans->delayed_ref_updates = 0; 694 trans->delayed_ref_updates = 0;
644 if (cur && 695 btrfs_run_delayed_refs(trans, root, cur);
645 trans->transaction->delayed_refs.num_heads_ready > 64) {
646 trans->delayed_ref_updates = 0;
647 btrfs_run_delayed_refs(trans, root, cur);
648 } else {
649 break;
650 }
651 count++;
652 } 696 }
653 697
654 btrfs_trans_release_metadata(trans, root); 698 btrfs_trans_release_metadata(trans, root);
@@ -658,12 +702,15 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
658 btrfs_create_pending_block_groups(trans, root); 702 btrfs_create_pending_block_groups(trans, root);
659 703
660 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && 704 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
661 should_end_transaction(trans, root)) { 705 should_end_transaction(trans, root) &&
662 trans->transaction->blocked = 1; 706 ACCESS_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) {
663 smp_wmb(); 707 spin_lock(&info->trans_lock);
708 if (cur_trans->state == TRANS_STATE_RUNNING)
709 cur_trans->state = TRANS_STATE_BLOCKED;
710 spin_unlock(&info->trans_lock);
664 } 711 }
665 712
666 if (lock && cur_trans->blocked && !cur_trans->in_commit) { 713 if (lock && ACCESS_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) {
667 if (throttle) { 714 if (throttle) {
668 /* 715 /*
669 * We may race with somebody else here so end up having 716 * We may race with somebody else here so end up having
@@ -677,12 +724,13 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
677 } 724 }
678 } 725 }
679 726
680 if (trans->type < TRANS_JOIN_NOLOCK) 727 if (trans->type & __TRANS_FREEZABLE)
681 sb_end_intwrite(root->fs_info->sb); 728 sb_end_intwrite(root->fs_info->sb);
682 729
683 WARN_ON(cur_trans != info->running_transaction); 730 WARN_ON(cur_trans != info->running_transaction);
684 WARN_ON(atomic_read(&cur_trans->num_writers) < 1); 731 WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
685 atomic_dec(&cur_trans->num_writers); 732 atomic_dec(&cur_trans->num_writers);
733 extwriter_counter_dec(cur_trans, trans->type);
686 734
687 smp_mb(); 735 smp_mb();
688 if (waitqueue_active(&cur_trans->writer_wait)) 736 if (waitqueue_active(&cur_trans->writer_wait))
@@ -736,9 +784,7 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
736 struct extent_state *cached_state = NULL; 784 struct extent_state *cached_state = NULL;
737 u64 start = 0; 785 u64 start = 0;
738 u64 end; 786 u64 end;
739 struct blk_plug plug;
740 787
741 blk_start_plug(&plug);
742 while (!find_first_extent_bit(dirty_pages, start, &start, &end, 788 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
743 mark, &cached_state)) { 789 mark, &cached_state)) {
744 convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, 790 convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
@@ -752,7 +798,6 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
752 } 798 }
753 if (err) 799 if (err)
754 werr = err; 800 werr = err;
755 blk_finish_plug(&plug);
756 return werr; 801 return werr;
757} 802}
758 803
@@ -797,8 +842,11 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
797{ 842{
798 int ret; 843 int ret;
799 int ret2; 844 int ret2;
845 struct blk_plug plug;
800 846
847 blk_start_plug(&plug);
801 ret = btrfs_write_marked_extents(root, dirty_pages, mark); 848 ret = btrfs_write_marked_extents(root, dirty_pages, mark);
849 blk_finish_plug(&plug);
802 ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark); 850 ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark);
803 851
804 if (ret) 852 if (ret)
@@ -1318,20 +1366,26 @@ static void update_super_roots(struct btrfs_root *root)
1318 1366
1319int btrfs_transaction_in_commit(struct btrfs_fs_info *info) 1367int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
1320{ 1368{
1369 struct btrfs_transaction *trans;
1321 int ret = 0; 1370 int ret = 0;
1371
1322 spin_lock(&info->trans_lock); 1372 spin_lock(&info->trans_lock);
1323 if (info->running_transaction) 1373 trans = info->running_transaction;
1324 ret = info->running_transaction->in_commit; 1374 if (trans)
1375 ret = (trans->state >= TRANS_STATE_COMMIT_START);
1325 spin_unlock(&info->trans_lock); 1376 spin_unlock(&info->trans_lock);
1326 return ret; 1377 return ret;
1327} 1378}
1328 1379
1329int btrfs_transaction_blocked(struct btrfs_fs_info *info) 1380int btrfs_transaction_blocked(struct btrfs_fs_info *info)
1330{ 1381{
1382 struct btrfs_transaction *trans;
1331 int ret = 0; 1383 int ret = 0;
1384
1332 spin_lock(&info->trans_lock); 1385 spin_lock(&info->trans_lock);
1333 if (info->running_transaction) 1386 trans = info->running_transaction;
1334 ret = info->running_transaction->blocked; 1387 if (trans)
1388 ret = is_transaction_blocked(trans);
1335 spin_unlock(&info->trans_lock); 1389 spin_unlock(&info->trans_lock);
1336 return ret; 1390 return ret;
1337} 1391}
@@ -1343,7 +1397,9 @@ int btrfs_transaction_blocked(struct btrfs_fs_info *info)
1343static void wait_current_trans_commit_start(struct btrfs_root *root, 1397static void wait_current_trans_commit_start(struct btrfs_root *root,
1344 struct btrfs_transaction *trans) 1398 struct btrfs_transaction *trans)
1345{ 1399{
1346 wait_event(root->fs_info->transaction_blocked_wait, trans->in_commit); 1400 wait_event(root->fs_info->transaction_blocked_wait,
1401 trans->state >= TRANS_STATE_COMMIT_START ||
1402 trans->aborted);
1347} 1403}
1348 1404
1349/* 1405/*
@@ -1354,7 +1410,8 @@ static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
1354 struct btrfs_transaction *trans) 1410 struct btrfs_transaction *trans)
1355{ 1411{
1356 wait_event(root->fs_info->transaction_wait, 1412 wait_event(root->fs_info->transaction_wait,
1357 trans->commit_done || (trans->in_commit && !trans->blocked)); 1413 trans->state >= TRANS_STATE_UNBLOCKED ||
1414 trans->aborted);
1358} 1415}
1359 1416
1360/* 1417/*
@@ -1450,26 +1507,31 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
1450 1507
1451 spin_lock(&root->fs_info->trans_lock); 1508 spin_lock(&root->fs_info->trans_lock);
1452 1509
1453 if (list_empty(&cur_trans->list)) { 1510 /*
1454 spin_unlock(&root->fs_info->trans_lock); 1511 * If the transaction is removed from the list, it means this
1455 btrfs_end_transaction(trans, root); 1512 * transaction has been committed successfully, so it is impossible
1456 return; 1513 * to call the cleanup function.
1457 } 1514 */
1515 BUG_ON(list_empty(&cur_trans->list));
1458 1516
1459 list_del_init(&cur_trans->list); 1517 list_del_init(&cur_trans->list);
1460 if (cur_trans == root->fs_info->running_transaction) { 1518 if (cur_trans == root->fs_info->running_transaction) {
1461 root->fs_info->trans_no_join = 1; 1519 cur_trans->state = TRANS_STATE_COMMIT_DOING;
1462 spin_unlock(&root->fs_info->trans_lock); 1520 spin_unlock(&root->fs_info->trans_lock);
1463 wait_event(cur_trans->writer_wait, 1521 wait_event(cur_trans->writer_wait,
1464 atomic_read(&cur_trans->num_writers) == 1); 1522 atomic_read(&cur_trans->num_writers) == 1);
1465 1523
1466 spin_lock(&root->fs_info->trans_lock); 1524 spin_lock(&root->fs_info->trans_lock);
1467 root->fs_info->running_transaction = NULL;
1468 } 1525 }
1469 spin_unlock(&root->fs_info->trans_lock); 1526 spin_unlock(&root->fs_info->trans_lock);
1470 1527
1471 btrfs_cleanup_one_transaction(trans->transaction, root); 1528 btrfs_cleanup_one_transaction(trans->transaction, root);
1472 1529
1530 spin_lock(&root->fs_info->trans_lock);
1531 if (cur_trans == root->fs_info->running_transaction)
1532 root->fs_info->running_transaction = NULL;
1533 spin_unlock(&root->fs_info->trans_lock);
1534
1473 put_transaction(cur_trans); 1535 put_transaction(cur_trans);
1474 put_transaction(cur_trans); 1536 put_transaction(cur_trans);
1475 1537
@@ -1481,33 +1543,13 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
1481 current->journal_info = NULL; 1543 current->journal_info = NULL;
1482 1544
1483 kmem_cache_free(btrfs_trans_handle_cachep, trans); 1545 kmem_cache_free(btrfs_trans_handle_cachep, trans);
1484
1485 spin_lock(&root->fs_info->trans_lock);
1486 root->fs_info->trans_no_join = 0;
1487 spin_unlock(&root->fs_info->trans_lock);
1488} 1546}
1489 1547
1490static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, 1548static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
1491 struct btrfs_root *root) 1549 struct btrfs_root *root)
1492{ 1550{
1493 int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
1494 int snap_pending = 0;
1495 int ret; 1551 int ret;
1496 1552
1497 if (!flush_on_commit) {
1498 spin_lock(&root->fs_info->trans_lock);
1499 if (!list_empty(&trans->transaction->pending_snapshots))
1500 snap_pending = 1;
1501 spin_unlock(&root->fs_info->trans_lock);
1502 }
1503
1504 if (flush_on_commit || snap_pending) {
1505 ret = btrfs_start_delalloc_inodes(root, 1);
1506 if (ret)
1507 return ret;
1508 btrfs_wait_ordered_extents(root, 1);
1509 }
1510
1511 ret = btrfs_run_delayed_items(trans, root); 1553 ret = btrfs_run_delayed_items(trans, root);
1512 if (ret) 1554 if (ret)
1513 return ret; 1555 return ret;
@@ -1531,23 +1573,25 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
1531 return ret; 1573 return ret;
1532} 1574}
1533 1575
1534/* 1576static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
1535 * btrfs_transaction state sequence: 1577{
1536 * in_commit = 0, blocked = 0 (initial) 1578 if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
1537 * in_commit = 1, blocked = 1 1579 return btrfs_start_all_delalloc_inodes(fs_info, 1);
1538 * blocked = 0 1580 return 0;
1539 * commit_done = 1 1581}
1540 */ 1582
1583static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
1584{
1585 if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
1586 btrfs_wait_all_ordered_extents(fs_info, 1);
1587}
1588
1541int btrfs_commit_transaction(struct btrfs_trans_handle *trans, 1589int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1542 struct btrfs_root *root) 1590 struct btrfs_root *root)
1543{ 1591{
1544 unsigned long joined = 0;
1545 struct btrfs_transaction *cur_trans = trans->transaction; 1592 struct btrfs_transaction *cur_trans = trans->transaction;
1546 struct btrfs_transaction *prev_trans = NULL; 1593 struct btrfs_transaction *prev_trans = NULL;
1547 DEFINE_WAIT(wait);
1548 int ret; 1594 int ret;
1549 int should_grow = 0;
1550 unsigned long now = get_seconds();
1551 1595
1552 ret = btrfs_run_ordered_operations(trans, root, 0); 1596 ret = btrfs_run_ordered_operations(trans, root, 0);
1553 if (ret) { 1597 if (ret) {
@@ -1586,6 +1630,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1586 * start sending their work down. 1630 * start sending their work down.
1587 */ 1631 */
1588 cur_trans->delayed_refs.flushing = 1; 1632 cur_trans->delayed_refs.flushing = 1;
1633 smp_wmb();
1589 1634
1590 if (!list_empty(&trans->new_bgs)) 1635 if (!list_empty(&trans->new_bgs))
1591 btrfs_create_pending_block_groups(trans, root); 1636 btrfs_create_pending_block_groups(trans, root);
@@ -1596,9 +1641,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1596 return ret; 1641 return ret;
1597 } 1642 }
1598 1643
1599 spin_lock(&cur_trans->commit_lock); 1644 spin_lock(&root->fs_info->trans_lock);
1600 if (cur_trans->in_commit) { 1645 if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
1601 spin_unlock(&cur_trans->commit_lock); 1646 spin_unlock(&root->fs_info->trans_lock);
1602 atomic_inc(&cur_trans->use_count); 1647 atomic_inc(&cur_trans->use_count);
1603 ret = btrfs_end_transaction(trans, root); 1648 ret = btrfs_end_transaction(trans, root);
1604 1649
@@ -1609,16 +1654,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1609 return ret; 1654 return ret;
1610 } 1655 }
1611 1656
1612 trans->transaction->in_commit = 1; 1657 cur_trans->state = TRANS_STATE_COMMIT_START;
1613 trans->transaction->blocked = 1;
1614 spin_unlock(&cur_trans->commit_lock);
1615 wake_up(&root->fs_info->transaction_blocked_wait); 1658 wake_up(&root->fs_info->transaction_blocked_wait);
1616 1659
1617 spin_lock(&root->fs_info->trans_lock);
1618 if (cur_trans->list.prev != &root->fs_info->trans_list) { 1660 if (cur_trans->list.prev != &root->fs_info->trans_list) {
1619 prev_trans = list_entry(cur_trans->list.prev, 1661 prev_trans = list_entry(cur_trans->list.prev,
1620 struct btrfs_transaction, list); 1662 struct btrfs_transaction, list);
1621 if (!prev_trans->commit_done) { 1663 if (prev_trans->state != TRANS_STATE_COMPLETED) {
1622 atomic_inc(&prev_trans->use_count); 1664 atomic_inc(&prev_trans->use_count);
1623 spin_unlock(&root->fs_info->trans_lock); 1665 spin_unlock(&root->fs_info->trans_lock);
1624 1666
@@ -1632,42 +1674,32 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1632 spin_unlock(&root->fs_info->trans_lock); 1674 spin_unlock(&root->fs_info->trans_lock);
1633 } 1675 }
1634 1676
1635 if (!btrfs_test_opt(root, SSD) && 1677 extwriter_counter_dec(cur_trans, trans->type);
1636 (now < cur_trans->start_time || now - cur_trans->start_time < 1))
1637 should_grow = 1;
1638
1639 do {
1640 joined = cur_trans->num_joined;
1641
1642 WARN_ON(cur_trans != trans->transaction);
1643
1644 ret = btrfs_flush_all_pending_stuffs(trans, root);
1645 if (ret)
1646 goto cleanup_transaction;
1647 1678
1648 prepare_to_wait(&cur_trans->writer_wait, &wait, 1679 ret = btrfs_start_delalloc_flush(root->fs_info);
1649 TASK_UNINTERRUPTIBLE); 1680 if (ret)
1681 goto cleanup_transaction;
1650 1682
1651 if (atomic_read(&cur_trans->num_writers) > 1) 1683 ret = btrfs_flush_all_pending_stuffs(trans, root);
1652 schedule_timeout(MAX_SCHEDULE_TIMEOUT); 1684 if (ret)
1653 else if (should_grow) 1685 goto cleanup_transaction;
1654 schedule_timeout(1);
1655 1686
1656 finish_wait(&cur_trans->writer_wait, &wait); 1687 wait_event(cur_trans->writer_wait,
1657 } while (atomic_read(&cur_trans->num_writers) > 1 || 1688 extwriter_counter_read(cur_trans) == 0);
1658 (should_grow && cur_trans->num_joined != joined));
1659 1689
1690 /* some pending stuffs might be added after the previous flush. */
1660 ret = btrfs_flush_all_pending_stuffs(trans, root); 1691 ret = btrfs_flush_all_pending_stuffs(trans, root);
1661 if (ret) 1692 if (ret)
1662 goto cleanup_transaction; 1693 goto cleanup_transaction;
1663 1694
1695 btrfs_wait_delalloc_flush(root->fs_info);
1664 /* 1696 /*
1665 * Ok now we need to make sure to block out any other joins while we 1697 * Ok now we need to make sure to block out any other joins while we
1666 * commit the transaction. We could have started a join before setting 1698 * commit the transaction. We could have started a join before setting
1667 * no_join so make sure to wait for num_writers to == 1 again. 1699 * COMMIT_DOING so make sure to wait for num_writers to == 1 again.
1668 */ 1700 */
1669 spin_lock(&root->fs_info->trans_lock); 1701 spin_lock(&root->fs_info->trans_lock);
1670 root->fs_info->trans_no_join = 1; 1702 cur_trans->state = TRANS_STATE_COMMIT_DOING;
1671 spin_unlock(&root->fs_info->trans_lock); 1703 spin_unlock(&root->fs_info->trans_lock);
1672 wait_event(cur_trans->writer_wait, 1704 wait_event(cur_trans->writer_wait,
1673 atomic_read(&cur_trans->num_writers) == 1); 1705 atomic_read(&cur_trans->num_writers) == 1);
@@ -1794,10 +1826,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1794 memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy, 1826 memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy,
1795 sizeof(*root->fs_info->super_copy)); 1827 sizeof(*root->fs_info->super_copy));
1796 1828
1797 trans->transaction->blocked = 0;
1798 spin_lock(&root->fs_info->trans_lock); 1829 spin_lock(&root->fs_info->trans_lock);
1830 cur_trans->state = TRANS_STATE_UNBLOCKED;
1799 root->fs_info->running_transaction = NULL; 1831 root->fs_info->running_transaction = NULL;
1800 root->fs_info->trans_no_join = 0;
1801 spin_unlock(&root->fs_info->trans_lock); 1832 spin_unlock(&root->fs_info->trans_lock);
1802 mutex_unlock(&root->fs_info->reloc_mutex); 1833 mutex_unlock(&root->fs_info->reloc_mutex);
1803 1834
@@ -1825,10 +1856,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1825 1856
1826 btrfs_finish_extent_commit(trans, root); 1857 btrfs_finish_extent_commit(trans, root);
1827 1858
1828 cur_trans->commit_done = 1;
1829
1830 root->fs_info->last_trans_committed = cur_trans->transid; 1859 root->fs_info->last_trans_committed = cur_trans->transid;
1831 1860 /*
1861 * We needn't acquire the lock here because there is no other task
1862 * which can change it.
1863 */
1864 cur_trans->state = TRANS_STATE_COMPLETED;
1832 wake_up(&cur_trans->commit_wait); 1865 wake_up(&cur_trans->commit_wait);
1833 1866
1834 spin_lock(&root->fs_info->trans_lock); 1867 spin_lock(&root->fs_info->trans_lock);
@@ -1838,7 +1871,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1838 put_transaction(cur_trans); 1871 put_transaction(cur_trans);
1839 put_transaction(cur_trans); 1872 put_transaction(cur_trans);
1840 1873
1841 if (trans->type < TRANS_JOIN_NOLOCK) 1874 if (trans->type & __TRANS_FREEZABLE)
1842 sb_end_intwrite(root->fs_info->sb); 1875 sb_end_intwrite(root->fs_info->sb);
1843 1876
1844 trace_btrfs_transaction_commit(root); 1877 trace_btrfs_transaction_commit(root);
@@ -1885,11 +1918,6 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
1885 int ret; 1918 int ret;
1886 struct btrfs_fs_info *fs_info = root->fs_info; 1919 struct btrfs_fs_info *fs_info = root->fs_info;
1887 1920
1888 if (fs_info->sb->s_flags & MS_RDONLY) {
1889 pr_debug("btrfs: cleaner called for RO fs!\n");
1890 return 0;
1891 }
1892
1893 spin_lock(&fs_info->trans_lock); 1921 spin_lock(&fs_info->trans_lock);
1894 if (list_empty(&fs_info->dead_roots)) { 1922 if (list_empty(&fs_info->dead_roots)) {
1895 spin_unlock(&fs_info->trans_lock); 1923 spin_unlock(&fs_info->trans_lock);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 24c97335a59f..005b0375d18c 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -22,21 +22,33 @@
22#include "delayed-ref.h" 22#include "delayed-ref.h"
23#include "ctree.h" 23#include "ctree.h"
24 24
25enum btrfs_trans_state {
26 TRANS_STATE_RUNNING = 0,
27 TRANS_STATE_BLOCKED = 1,
28 TRANS_STATE_COMMIT_START = 2,
29 TRANS_STATE_COMMIT_DOING = 3,
30 TRANS_STATE_UNBLOCKED = 4,
31 TRANS_STATE_COMPLETED = 5,
32 TRANS_STATE_MAX = 6,
33};
34
25struct btrfs_transaction { 35struct btrfs_transaction {
26 u64 transid; 36 u64 transid;
27 /* 37 /*
38 * total external writers(USERSPACE/START/ATTACH) in this
39 * transaction, it must be zero before the transaction is
40 * being committed
41 */
42 atomic_t num_extwriters;
43 /*
28 * total writers in this transaction, it must be zero before the 44 * total writers in this transaction, it must be zero before the
29 * transaction can end 45 * transaction can end
30 */ 46 */
31 atomic_t num_writers; 47 atomic_t num_writers;
32 atomic_t use_count; 48 atomic_t use_count;
33 49
34 unsigned long num_joined; 50 /* Be protected by fs_info->trans_lock when we want to change it. */
35 51 enum btrfs_trans_state state;
36 spinlock_t commit_lock;
37 int in_commit;
38 int commit_done;
39 int blocked;
40 struct list_head list; 52 struct list_head list;
41 struct extent_io_tree dirty_pages; 53 struct extent_io_tree dirty_pages;
42 unsigned long start_time; 54 unsigned long start_time;
@@ -44,17 +56,27 @@ struct btrfs_transaction {
44 wait_queue_head_t commit_wait; 56 wait_queue_head_t commit_wait;
45 struct list_head pending_snapshots; 57 struct list_head pending_snapshots;
46 struct list_head ordered_operations; 58 struct list_head ordered_operations;
59 struct list_head pending_chunks;
47 struct btrfs_delayed_ref_root delayed_refs; 60 struct btrfs_delayed_ref_root delayed_refs;
48 int aborted; 61 int aborted;
49}; 62};
50 63
51enum btrfs_trans_type { 64#define __TRANS_FREEZABLE (1U << 0)
52 TRANS_START, 65
53 TRANS_JOIN, 66#define __TRANS_USERSPACE (1U << 8)
54 TRANS_USERSPACE, 67#define __TRANS_START (1U << 9)
55 TRANS_JOIN_NOLOCK, 68#define __TRANS_ATTACH (1U << 10)
56 TRANS_ATTACH, 69#define __TRANS_JOIN (1U << 11)
57}; 70#define __TRANS_JOIN_NOLOCK (1U << 12)
71
72#define TRANS_USERSPACE (__TRANS_USERSPACE | __TRANS_FREEZABLE)
73#define TRANS_START (__TRANS_START | __TRANS_FREEZABLE)
74#define TRANS_ATTACH (__TRANS_ATTACH)
75#define TRANS_JOIN (__TRANS_JOIN | __TRANS_FREEZABLE)
76#define TRANS_JOIN_NOLOCK (__TRANS_JOIN_NOLOCK)
77
78#define TRANS_EXTWRITERS (__TRANS_USERSPACE | __TRANS_START | \
79 __TRANS_ATTACH)
58 80
59struct btrfs_trans_handle { 81struct btrfs_trans_handle {
60 u64 transid; 82 u64 transid;
@@ -70,7 +92,7 @@ struct btrfs_trans_handle {
70 short aborted; 92 short aborted;
71 short adding_csums; 93 short adding_csums;
72 bool allocating_chunk; 94 bool allocating_chunk;
73 enum btrfs_trans_type type; 95 unsigned int type;
74 /* 96 /*
75 * this root is only needed to validate that the root passed to 97 * this root is only needed to validate that the root passed to
76 * start_transaction is the same as the one passed to end_transaction. 98 * start_transaction is the same as the one passed to end_transaction.
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index c276ac9a0ec3..2c6791493637 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -18,6 +18,7 @@
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h> 20#include <linux/slab.h>
21#include <linux/blkdev.h>
21#include <linux/list_sort.h> 22#include <linux/list_sort.h>
22#include "ctree.h" 23#include "ctree.h"
23#include "transaction.h" 24#include "transaction.h"
@@ -279,11 +280,23 @@ static int process_one_buffer(struct btrfs_root *log,
279{ 280{
280 int ret = 0; 281 int ret = 0;
281 282
283 /*
284 * If this fs is mixed then we need to be able to process the leaves to
285 * pin down any logged extents, so we have to read the block.
286 */
287 if (btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) {
288 ret = btrfs_read_buffer(eb, gen);
289 if (ret)
290 return ret;
291 }
292
282 if (wc->pin) 293 if (wc->pin)
283 ret = btrfs_pin_extent_for_log_replay(log->fs_info->extent_root, 294 ret = btrfs_pin_extent_for_log_replay(log->fs_info->extent_root,
284 eb->start, eb->len); 295 eb->start, eb->len);
285 296
286 if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) { 297 if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
298 if (wc->pin && btrfs_header_level(eb) == 0)
299 ret = btrfs_exclude_logged_extents(log, eb);
287 if (wc->write) 300 if (wc->write)
288 btrfs_write_tree_block(eb); 301 btrfs_write_tree_block(eb);
289 if (wc->wait) 302 if (wc->wait)
@@ -2016,13 +2029,8 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
2016 eb, i, &key); 2029 eb, i, &key);
2017 if (ret) 2030 if (ret)
2018 break; 2031 break;
2019 } else if (key.type == BTRFS_INODE_REF_KEY) { 2032 } else if (key.type == BTRFS_INODE_REF_KEY ||
2020 ret = add_inode_ref(wc->trans, root, log, path, 2033 key.type == BTRFS_INODE_EXTREF_KEY) {
2021 eb, i, &key);
2022 if (ret && ret != -ENOENT)
2023 break;
2024 ret = 0;
2025 } else if (key.type == BTRFS_INODE_EXTREF_KEY) {
2026 ret = add_inode_ref(wc->trans, root, log, path, 2034 ret = add_inode_ref(wc->trans, root, log, path,
2027 eb, i, &key); 2035 eb, i, &key);
2028 if (ret && ret != -ENOENT) 2036 if (ret && ret != -ENOENT)
@@ -2358,6 +2366,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2358 struct btrfs_root *log = root->log_root; 2366 struct btrfs_root *log = root->log_root;
2359 struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; 2367 struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
2360 unsigned long log_transid = 0; 2368 unsigned long log_transid = 0;
2369 struct blk_plug plug;
2361 2370
2362 mutex_lock(&root->log_mutex); 2371 mutex_lock(&root->log_mutex);
2363 log_transid = root->log_transid; 2372 log_transid = root->log_transid;
@@ -2401,8 +2410,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2401 /* we start IO on all the marked extents here, but we don't actually 2410 /* we start IO on all the marked extents here, but we don't actually
2402 * wait for them until later. 2411 * wait for them until later.
2403 */ 2412 */
2413 blk_start_plug(&plug);
2404 ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark); 2414 ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark);
2405 if (ret) { 2415 if (ret) {
2416 blk_finish_plug(&plug);
2406 btrfs_abort_transaction(trans, root, ret); 2417 btrfs_abort_transaction(trans, root, ret);
2407 btrfs_free_logged_extents(log, log_transid); 2418 btrfs_free_logged_extents(log, log_transid);
2408 mutex_unlock(&root->log_mutex); 2419 mutex_unlock(&root->log_mutex);
@@ -2437,6 +2448,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2437 } 2448 }
2438 2449
2439 if (ret) { 2450 if (ret) {
2451 blk_finish_plug(&plug);
2440 if (ret != -ENOSPC) { 2452 if (ret != -ENOSPC) {
2441 btrfs_abort_transaction(trans, root, ret); 2453 btrfs_abort_transaction(trans, root, ret);
2442 mutex_unlock(&log_root_tree->log_mutex); 2454 mutex_unlock(&log_root_tree->log_mutex);
@@ -2452,6 +2464,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2452 2464
2453 index2 = log_root_tree->log_transid % 2; 2465 index2 = log_root_tree->log_transid % 2;
2454 if (atomic_read(&log_root_tree->log_commit[index2])) { 2466 if (atomic_read(&log_root_tree->log_commit[index2])) {
2467 blk_finish_plug(&plug);
2455 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2468 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2456 wait_log_commit(trans, log_root_tree, 2469 wait_log_commit(trans, log_root_tree,
2457 log_root_tree->log_transid); 2470 log_root_tree->log_transid);
@@ -2474,6 +2487,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2474 * check the full commit flag again 2487 * check the full commit flag again
2475 */ 2488 */
2476 if (root->fs_info->last_trans_log_full_commit == trans->transid) { 2489 if (root->fs_info->last_trans_log_full_commit == trans->transid) {
2490 blk_finish_plug(&plug);
2477 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2491 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2478 btrfs_free_logged_extents(log, log_transid); 2492 btrfs_free_logged_extents(log, log_transid);
2479 mutex_unlock(&log_root_tree->log_mutex); 2493 mutex_unlock(&log_root_tree->log_mutex);
@@ -2481,9 +2495,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2481 goto out_wake_log_root; 2495 goto out_wake_log_root;
2482 } 2496 }
2483 2497
2484 ret = btrfs_write_and_wait_marked_extents(log_root_tree, 2498 ret = btrfs_write_marked_extents(log_root_tree,
2485 &log_root_tree->dirty_log_pages, 2499 &log_root_tree->dirty_log_pages,
2486 EXTENT_DIRTY | EXTENT_NEW); 2500 EXTENT_DIRTY | EXTENT_NEW);
2501 blk_finish_plug(&plug);
2487 if (ret) { 2502 if (ret) {
2488 btrfs_abort_transaction(trans, root, ret); 2503 btrfs_abort_transaction(trans, root, ret);
2489 btrfs_free_logged_extents(log, log_transid); 2504 btrfs_free_logged_extents(log, log_transid);
@@ -2491,6 +2506,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2491 goto out_wake_log_root; 2506 goto out_wake_log_root;
2492 } 2507 }
2493 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2508 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2509 btrfs_wait_marked_extents(log_root_tree,
2510 &log_root_tree->dirty_log_pages,
2511 EXTENT_NEW | EXTENT_DIRTY);
2494 btrfs_wait_logged_extents(log, log_transid); 2512 btrfs_wait_logged_extents(log, log_transid);
2495 2513
2496 btrfs_set_super_log_root(root->fs_info->super_for_commit, 2514 btrfs_set_super_log_root(root->fs_info->super_for_commit,
@@ -4016,8 +4034,7 @@ again:
4016 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) 4034 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
4017 break; 4035 break;
4018 4036
4019 log = btrfs_read_fs_root_no_radix(log_root_tree, 4037 log = btrfs_read_fs_root(log_root_tree, &found_key);
4020 &found_key);
4021 if (IS_ERR(log)) { 4038 if (IS_ERR(log)) {
4022 ret = PTR_ERR(log); 4039 ret = PTR_ERR(log);
4023 btrfs_error(fs_info, ret, 4040 btrfs_error(fs_info, ret,
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index 7b417e20efe2..b0a523b2c60e 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -205,6 +205,10 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
205 u64 new_alloced = ulist->nodes_alloced + 128; 205 u64 new_alloced = ulist->nodes_alloced + 128;
206 struct ulist_node *new_nodes; 206 struct ulist_node *new_nodes;
207 void *old = NULL; 207 void *old = NULL;
208 int i;
209
210 for (i = 0; i < ulist->nnodes; i++)
211 rb_erase(&ulist->nodes[i].rb_node, &ulist->root);
208 212
209 /* 213 /*
210 * if nodes_alloced == ULIST_SIZE no memory has been allocated 214 * if nodes_alloced == ULIST_SIZE no memory has been allocated
@@ -224,6 +228,17 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
224 228
225 ulist->nodes = new_nodes; 229 ulist->nodes = new_nodes;
226 ulist->nodes_alloced = new_alloced; 230 ulist->nodes_alloced = new_alloced;
231
232 /*
233 * krealloc actually uses memcpy, which does not copy rb_node
234 * pointers, so we have to do it ourselves. Otherwise we may
235 * be bitten by crashes.
236 */
237 for (i = 0; i < ulist->nnodes; i++) {
238 ret = ulist_rbtree_insert(ulist, &ulist->nodes[i]);
239 if (ret < 0)
240 return ret;
241 }
227 } 242 }
228 ulist->nodes[ulist->nnodes].val = val; 243 ulist->nodes[ulist->nnodes].val = val;
229 ulist->nodes[ulist->nnodes].aux = aux; 244 ulist->nodes[ulist->nnodes].aux = aux;
diff --git a/fs/btrfs/version.h b/fs/btrfs/version.h
deleted file mode 100644
index 9bf3946d5ef2..000000000000
--- a/fs/btrfs/version.h
+++ /dev/null
@@ -1,4 +0,0 @@
1#ifndef __BTRFS_VERSION_H
2#define __BTRFS_VERSION_H
3#define BTRFS_BUILD_VERSION "Btrfs"
4#endif
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 8bffb9174afb..78b871753cb6 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -982,6 +982,35 @@ out:
982 return ret; 982 return ret;
983} 983}
984 984
985static int contains_pending_extent(struct btrfs_trans_handle *trans,
986 struct btrfs_device *device,
987 u64 *start, u64 len)
988{
989 struct extent_map *em;
990 int ret = 0;
991
992 list_for_each_entry(em, &trans->transaction->pending_chunks, list) {
993 struct map_lookup *map;
994 int i;
995
996 map = (struct map_lookup *)em->bdev;
997 for (i = 0; i < map->num_stripes; i++) {
998 if (map->stripes[i].dev != device)
999 continue;
1000 if (map->stripes[i].physical >= *start + len ||
1001 map->stripes[i].physical + em->orig_block_len <=
1002 *start)
1003 continue;
1004 *start = map->stripes[i].physical +
1005 em->orig_block_len;
1006 ret = 1;
1007 }
1008 }
1009
1010 return ret;
1011}
1012
1013
985/* 1014/*
986 * find_free_dev_extent - find free space in the specified device 1015 * find_free_dev_extent - find free space in the specified device
987 * @device: the device which we search the free space in 1016 * @device: the device which we search the free space in
@@ -1002,7 +1031,8 @@ out:
1002 * But if we don't find suitable free space, it is used to store the size of 1031 * But if we don't find suitable free space, it is used to store the size of
1003 * the max free space. 1032 * the max free space.
1004 */ 1033 */
1005int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, 1034int find_free_dev_extent(struct btrfs_trans_handle *trans,
1035 struct btrfs_device *device, u64 num_bytes,
1006 u64 *start, u64 *len) 1036 u64 *start, u64 *len)
1007{ 1037{
1008 struct btrfs_key key; 1038 struct btrfs_key key;
@@ -1026,21 +1056,22 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
1026 */ 1056 */
1027 search_start = max(root->fs_info->alloc_start, 1024ull * 1024); 1057 search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
1028 1058
1059 path = btrfs_alloc_path();
1060 if (!path)
1061 return -ENOMEM;
1062again:
1029 max_hole_start = search_start; 1063 max_hole_start = search_start;
1030 max_hole_size = 0; 1064 max_hole_size = 0;
1031 hole_size = 0; 1065 hole_size = 0;
1032 1066
1033 if (search_start >= search_end || device->is_tgtdev_for_dev_replace) { 1067 if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
1034 ret = -ENOSPC; 1068 ret = -ENOSPC;
1035 goto error; 1069 goto out;
1036 } 1070 }
1037 1071
1038 path = btrfs_alloc_path();
1039 if (!path) {
1040 ret = -ENOMEM;
1041 goto error;
1042 }
1043 path->reada = 2; 1072 path->reada = 2;
1073 path->search_commit_root = 1;
1074 path->skip_locking = 1;
1044 1075
1045 key.objectid = device->devid; 1076 key.objectid = device->devid;
1046 key.offset = search_start; 1077 key.offset = search_start;
@@ -1081,6 +1112,15 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
1081 if (key.offset > search_start) { 1112 if (key.offset > search_start) {
1082 hole_size = key.offset - search_start; 1113 hole_size = key.offset - search_start;
1083 1114
1115 /*
1116 * Have to check before we set max_hole_start, otherwise
1117 * we could end up sending back this offset anyway.
1118 */
1119 if (contains_pending_extent(trans, device,
1120 &search_start,
1121 hole_size))
1122 hole_size = 0;
1123
1084 if (hole_size > max_hole_size) { 1124 if (hole_size > max_hole_size) {
1085 max_hole_start = search_start; 1125 max_hole_start = search_start;
1086 max_hole_size = hole_size; 1126 max_hole_size = hole_size;
@@ -1124,6 +1164,11 @@ next:
1124 max_hole_size = hole_size; 1164 max_hole_size = hole_size;
1125 } 1165 }
1126 1166
1167 if (contains_pending_extent(trans, device, &search_start, hole_size)) {
1168 btrfs_release_path(path);
1169 goto again;
1170 }
1171
1127 /* See above. */ 1172 /* See above. */
1128 if (hole_size < num_bytes) 1173 if (hole_size < num_bytes)
1129 ret = -ENOSPC; 1174 ret = -ENOSPC;
@@ -1132,7 +1177,6 @@ next:
1132 1177
1133out: 1178out:
1134 btrfs_free_path(path); 1179 btrfs_free_path(path);
1135error:
1136 *start = max_hole_start; 1180 *start = max_hole_start;
1137 if (len) 1181 if (len)
1138 *len = max_hole_size; 1182 *len = max_hole_size;
@@ -1244,47 +1288,22 @@ out:
1244 return ret; 1288 return ret;
1245} 1289}
1246 1290
1247static noinline int find_next_chunk(struct btrfs_root *root, 1291static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
1248 u64 objectid, u64 *offset)
1249{ 1292{
1250 struct btrfs_path *path; 1293 struct extent_map_tree *em_tree;
1251 int ret; 1294 struct extent_map *em;
1252 struct btrfs_key key; 1295 struct rb_node *n;
1253 struct btrfs_chunk *chunk; 1296 u64 ret = 0;
1254 struct btrfs_key found_key;
1255
1256 path = btrfs_alloc_path();
1257 if (!path)
1258 return -ENOMEM;
1259
1260 key.objectid = objectid;
1261 key.offset = (u64)-1;
1262 key.type = BTRFS_CHUNK_ITEM_KEY;
1263
1264 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1265 if (ret < 0)
1266 goto error;
1267
1268 BUG_ON(ret == 0); /* Corruption */
1269 1297
1270 ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY); 1298 em_tree = &fs_info->mapping_tree.map_tree;
1271 if (ret) { 1299 read_lock(&em_tree->lock);
1272 *offset = 0; 1300 n = rb_last(&em_tree->map);
1273 } else { 1301 if (n) {
1274 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 1302 em = rb_entry(n, struct extent_map, rb_node);
1275 path->slots[0]); 1303 ret = em->start + em->len;
1276 if (found_key.objectid != objectid)
1277 *offset = 0;
1278 else {
1279 chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
1280 struct btrfs_chunk);
1281 *offset = found_key.offset +
1282 btrfs_chunk_length(path->nodes[0], chunk);
1283 }
1284 } 1304 }
1285 ret = 0; 1305 read_unlock(&em_tree->lock);
1286error: 1306
1287 btrfs_free_path(path);
1288 return ret; 1307 return ret;
1289} 1308}
1290 1309
@@ -1462,31 +1481,23 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1462 btrfs_dev_replace_unlock(&root->fs_info->dev_replace); 1481 btrfs_dev_replace_unlock(&root->fs_info->dev_replace);
1463 1482
1464 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) { 1483 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
1465 printk(KERN_ERR "btrfs: unable to go below four devices " 1484 ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET;
1466 "on raid10\n");
1467 ret = -EINVAL;
1468 goto out; 1485 goto out;
1469 } 1486 }
1470 1487
1471 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) { 1488 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) {
1472 printk(KERN_ERR "btrfs: unable to go below two " 1489 ret = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET;
1473 "devices on raid1\n");
1474 ret = -EINVAL;
1475 goto out; 1490 goto out;
1476 } 1491 }
1477 1492
1478 if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) && 1493 if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) &&
1479 root->fs_info->fs_devices->rw_devices <= 2) { 1494 root->fs_info->fs_devices->rw_devices <= 2) {
1480 printk(KERN_ERR "btrfs: unable to go below two " 1495 ret = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET;
1481 "devices on raid5\n");
1482 ret = -EINVAL;
1483 goto out; 1496 goto out;
1484 } 1497 }
1485 if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) && 1498 if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) &&
1486 root->fs_info->fs_devices->rw_devices <= 3) { 1499 root->fs_info->fs_devices->rw_devices <= 3) {
1487 printk(KERN_ERR "btrfs: unable to go below three " 1500 ret = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET;
1488 "devices on raid6\n");
1489 ret = -EINVAL;
1490 goto out; 1501 goto out;
1491 } 1502 }
1492 1503
@@ -1512,8 +1523,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1512 bh = NULL; 1523 bh = NULL;
1513 disk_super = NULL; 1524 disk_super = NULL;
1514 if (!device) { 1525 if (!device) {
1515 printk(KERN_ERR "btrfs: no missing devices found to " 1526 ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
1516 "remove\n");
1517 goto out; 1527 goto out;
1518 } 1528 }
1519 } else { 1529 } else {
@@ -1535,15 +1545,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1535 } 1545 }
1536 1546
1537 if (device->is_tgtdev_for_dev_replace) { 1547 if (device->is_tgtdev_for_dev_replace) {
1538 pr_err("btrfs: unable to remove the dev_replace target dev\n"); 1548 ret = BTRFS_ERROR_DEV_TGT_REPLACE;
1539 ret = -EINVAL;
1540 goto error_brelse; 1549 goto error_brelse;
1541 } 1550 }
1542 1551
1543 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { 1552 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
1544 printk(KERN_ERR "btrfs: unable to remove the only writeable " 1553 ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
1545 "device\n");
1546 ret = -EINVAL;
1547 goto error_brelse; 1554 goto error_brelse;
1548 } 1555 }
1549 1556
@@ -3295,10 +3302,7 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
3295 } 3302 }
3296 3303
3297 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); 3304 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
3298 if (IS_ERR(tsk)) 3305 return PTR_RET(tsk);
3299 return PTR_ERR(tsk);
3300
3301 return 0;
3302} 3306}
3303 3307
3304int btrfs_recover_balance(struct btrfs_fs_info *fs_info) 3308int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
@@ -3681,10 +3685,8 @@ static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
3681} 3685}
3682 3686
3683static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 3687static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3684 struct btrfs_root *extent_root, 3688 struct btrfs_root *extent_root, u64 start,
3685 struct map_lookup **map_ret, 3689 u64 type)
3686 u64 *num_bytes_out, u64 *stripe_size_out,
3687 u64 start, u64 type)
3688{ 3690{
3689 struct btrfs_fs_info *info = extent_root->fs_info; 3691 struct btrfs_fs_info *info = extent_root->fs_info;
3690 struct btrfs_fs_devices *fs_devices = info->fs_devices; 3692 struct btrfs_fs_devices *fs_devices = info->fs_devices;
@@ -3791,7 +3793,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3791 if (total_avail == 0) 3793 if (total_avail == 0)
3792 continue; 3794 continue;
3793 3795
3794 ret = find_free_dev_extent(device, 3796 ret = find_free_dev_extent(trans, device,
3795 max_stripe_size * dev_stripes, 3797 max_stripe_size * dev_stripes,
3796 &dev_offset, &max_avail); 3798 &dev_offset, &max_avail);
3797 if (ret && ret != -ENOSPC) 3799 if (ret && ret != -ENOSPC)
@@ -3903,12 +3905,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3903 map->type = type; 3905 map->type = type;
3904 map->sub_stripes = sub_stripes; 3906 map->sub_stripes = sub_stripes;
3905 3907
3906 *map_ret = map;
3907 num_bytes = stripe_size * data_stripes; 3908 num_bytes = stripe_size * data_stripes;
3908 3909
3909 *stripe_size_out = stripe_size;
3910 *num_bytes_out = num_bytes;
3911
3912 trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes); 3910 trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes);
3913 3911
3914 em = alloc_extent_map(); 3912 em = alloc_extent_map();
@@ -3921,38 +3919,26 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3921 em->len = num_bytes; 3919 em->len = num_bytes;
3922 em->block_start = 0; 3920 em->block_start = 0;
3923 em->block_len = em->len; 3921 em->block_len = em->len;
3922 em->orig_block_len = stripe_size;
3924 3923
3925 em_tree = &extent_root->fs_info->mapping_tree.map_tree; 3924 em_tree = &extent_root->fs_info->mapping_tree.map_tree;
3926 write_lock(&em_tree->lock); 3925 write_lock(&em_tree->lock);
3927 ret = add_extent_mapping(em_tree, em, 0); 3926 ret = add_extent_mapping(em_tree, em, 0);
3927 if (!ret) {
3928 list_add_tail(&em->list, &trans->transaction->pending_chunks);
3929 atomic_inc(&em->refs);
3930 }
3928 write_unlock(&em_tree->lock); 3931 write_unlock(&em_tree->lock);
3929 if (ret) { 3932 if (ret) {
3930 free_extent_map(em); 3933 free_extent_map(em);
3931 goto error; 3934 goto error;
3932 } 3935 }
3933 3936
3934 for (i = 0; i < map->num_stripes; ++i) {
3935 struct btrfs_device *device;
3936 u64 dev_offset;
3937
3938 device = map->stripes[i].dev;
3939 dev_offset = map->stripes[i].physical;
3940
3941 ret = btrfs_alloc_dev_extent(trans, device,
3942 info->chunk_root->root_key.objectid,
3943 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
3944 start, dev_offset, stripe_size);
3945 if (ret)
3946 goto error_dev_extent;
3947 }
3948
3949 ret = btrfs_make_block_group(trans, extent_root, 0, type, 3937 ret = btrfs_make_block_group(trans, extent_root, 0, type,
3950 BTRFS_FIRST_CHUNK_TREE_OBJECTID, 3938 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
3951 start, num_bytes); 3939 start, num_bytes);
3952 if (ret) { 3940 if (ret)
3953 i = map->num_stripes - 1; 3941 goto error_del_extent;
3954 goto error_dev_extent;
3955 }
3956 3942
3957 free_extent_map(em); 3943 free_extent_map(em);
3958 check_raid56_incompat_flag(extent_root->fs_info, type); 3944 check_raid56_incompat_flag(extent_root->fs_info, type);
@@ -3960,18 +3946,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3960 kfree(devices_info); 3946 kfree(devices_info);
3961 return 0; 3947 return 0;
3962 3948
3963error_dev_extent: 3949error_del_extent:
3964 for (; i >= 0; i--) {
3965 struct btrfs_device *device;
3966 int err;
3967
3968 device = map->stripes[i].dev;
3969 err = btrfs_free_dev_extent(trans, device, start);
3970 if (err) {
3971 btrfs_abort_transaction(trans, extent_root, err);
3972 break;
3973 }
3974 }
3975 write_lock(&em_tree->lock); 3950 write_lock(&em_tree->lock);
3976 remove_extent_mapping(em_tree, em); 3951 remove_extent_mapping(em_tree, em);
3977 write_unlock(&em_tree->lock); 3952 write_unlock(&em_tree->lock);
@@ -3986,33 +3961,68 @@ error:
3986 return ret; 3961 return ret;
3987} 3962}
3988 3963
3989static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, 3964int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
3990 struct btrfs_root *extent_root, 3965 struct btrfs_root *extent_root,
3991 struct map_lookup *map, u64 chunk_offset, 3966 u64 chunk_offset, u64 chunk_size)
3992 u64 chunk_size, u64 stripe_size)
3993{ 3967{
3994 u64 dev_offset;
3995 struct btrfs_key key; 3968 struct btrfs_key key;
3996 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; 3969 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
3997 struct btrfs_device *device; 3970 struct btrfs_device *device;
3998 struct btrfs_chunk *chunk; 3971 struct btrfs_chunk *chunk;
3999 struct btrfs_stripe *stripe; 3972 struct btrfs_stripe *stripe;
4000 size_t item_size = btrfs_chunk_item_size(map->num_stripes); 3973 struct extent_map_tree *em_tree;
4001 int index = 0; 3974 struct extent_map *em;
3975 struct map_lookup *map;
3976 size_t item_size;
3977 u64 dev_offset;
3978 u64 stripe_size;
3979 int i = 0;
4002 int ret; 3980 int ret;
4003 3981
3982 em_tree = &extent_root->fs_info->mapping_tree.map_tree;
3983 read_lock(&em_tree->lock);
3984 em = lookup_extent_mapping(em_tree, chunk_offset, chunk_size);
3985 read_unlock(&em_tree->lock);
3986
3987 if (!em) {
3988 btrfs_crit(extent_root->fs_info, "unable to find logical "
3989 "%Lu len %Lu", chunk_offset, chunk_size);
3990 return -EINVAL;
3991 }
3992
3993 if (em->start != chunk_offset || em->len != chunk_size) {
3994 btrfs_crit(extent_root->fs_info, "found a bad mapping, wanted"
3995 " %Lu-%Lu, found %Lu-%Lu\n", chunk_offset,
3996 chunk_size, em->start, em->len);
3997 free_extent_map(em);
3998 return -EINVAL;
3999 }
4000
4001 map = (struct map_lookup *)em->bdev;
4002 item_size = btrfs_chunk_item_size(map->num_stripes);
4003 stripe_size = em->orig_block_len;
4004
4004 chunk = kzalloc(item_size, GFP_NOFS); 4005 chunk = kzalloc(item_size, GFP_NOFS);
4005 if (!chunk) 4006 if (!chunk) {
4006 return -ENOMEM; 4007 ret = -ENOMEM;
4008 goto out;
4009 }
4010
4011 for (i = 0; i < map->num_stripes; i++) {
4012 device = map->stripes[i].dev;
4013 dev_offset = map->stripes[i].physical;
4007 4014
4008 index = 0;
4009 while (index < map->num_stripes) {
4010 device = map->stripes[index].dev;
4011 device->bytes_used += stripe_size; 4015 device->bytes_used += stripe_size;
4012 ret = btrfs_update_device(trans, device); 4016 ret = btrfs_update_device(trans, device);
4013 if (ret) 4017 if (ret)
4014 goto out_free; 4018 goto out;
4015 index++; 4019 ret = btrfs_alloc_dev_extent(trans, device,
4020 chunk_root->root_key.objectid,
4021 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
4022 chunk_offset, dev_offset,
4023 stripe_size);
4024 if (ret)
4025 goto out;
4016 } 4026 }
4017 4027
4018 spin_lock(&extent_root->fs_info->free_chunk_lock); 4028 spin_lock(&extent_root->fs_info->free_chunk_lock);
@@ -4020,17 +4030,15 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
4020 map->num_stripes); 4030 map->num_stripes);
4021 spin_unlock(&extent_root->fs_info->free_chunk_lock); 4031 spin_unlock(&extent_root->fs_info->free_chunk_lock);
4022 4032
4023 index = 0;
4024 stripe = &chunk->stripe; 4033 stripe = &chunk->stripe;
4025 while (index < map->num_stripes) { 4034 for (i = 0; i < map->num_stripes; i++) {
4026 device = map->stripes[index].dev; 4035 device = map->stripes[i].dev;
4027 dev_offset = map->stripes[index].physical; 4036 dev_offset = map->stripes[i].physical;
4028 4037
4029 btrfs_set_stack_stripe_devid(stripe, device->devid); 4038 btrfs_set_stack_stripe_devid(stripe, device->devid);
4030 btrfs_set_stack_stripe_offset(stripe, dev_offset); 4039 btrfs_set_stack_stripe_offset(stripe, dev_offset);
4031 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); 4040 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
4032 stripe++; 4041 stripe++;
4033 index++;
4034 } 4042 }
4035 4043
4036 btrfs_set_stack_chunk_length(chunk, chunk_size); 4044 btrfs_set_stack_chunk_length(chunk, chunk_size);
@@ -4048,7 +4056,6 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
4048 key.offset = chunk_offset; 4056 key.offset = chunk_offset;
4049 4057
4050 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); 4058 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
4051
4052 if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 4059 if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
4053 /* 4060 /*
4054 * TODO: Cleanup of inserted chunk root in case of 4061 * TODO: Cleanup of inserted chunk root in case of
@@ -4058,8 +4065,9 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
4058 item_size); 4065 item_size);
4059 } 4066 }
4060 4067
4061out_free: 4068out:
4062 kfree(chunk); 4069 kfree(chunk);
4070 free_extent_map(em);
4063 return ret; 4071 return ret;
4064} 4072}
4065 4073
@@ -4074,27 +4082,9 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
4074 struct btrfs_root *extent_root, u64 type) 4082 struct btrfs_root *extent_root, u64 type)
4075{ 4083{
4076 u64 chunk_offset; 4084 u64 chunk_offset;
4077 u64 chunk_size;
4078 u64 stripe_size;
4079 struct map_lookup *map;
4080 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
4081 int ret;
4082
4083 ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
4084 &chunk_offset);
4085 if (ret)
4086 return ret;
4087 4085
4088 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, 4086 chunk_offset = find_next_chunk(extent_root->fs_info);
4089 &stripe_size, chunk_offset, type); 4087 return __btrfs_alloc_chunk(trans, extent_root, chunk_offset, type);
4090 if (ret)
4091 return ret;
4092
4093 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
4094 chunk_size, stripe_size);
4095 if (ret)
4096 return ret;
4097 return 0;
4098} 4088}
4099 4089
4100static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, 4090static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
@@ -4103,66 +4093,31 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
4103{ 4093{
4104 u64 chunk_offset; 4094 u64 chunk_offset;
4105 u64 sys_chunk_offset; 4095 u64 sys_chunk_offset;
4106 u64 chunk_size;
4107 u64 sys_chunk_size;
4108 u64 stripe_size;
4109 u64 sys_stripe_size;
4110 u64 alloc_profile; 4096 u64 alloc_profile;
4111 struct map_lookup *map;
4112 struct map_lookup *sys_map;
4113 struct btrfs_fs_info *fs_info = root->fs_info; 4097 struct btrfs_fs_info *fs_info = root->fs_info;
4114 struct btrfs_root *extent_root = fs_info->extent_root; 4098 struct btrfs_root *extent_root = fs_info->extent_root;
4115 int ret; 4099 int ret;
4116 4100
4117 ret = find_next_chunk(fs_info->chunk_root, 4101 chunk_offset = find_next_chunk(fs_info);
4118 BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
4119 if (ret)
4120 return ret;
4121
4122 alloc_profile = btrfs_get_alloc_profile(extent_root, 0); 4102 alloc_profile = btrfs_get_alloc_profile(extent_root, 0);
4123 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, 4103 ret = __btrfs_alloc_chunk(trans, extent_root, chunk_offset,
4124 &stripe_size, chunk_offset, alloc_profile); 4104 alloc_profile);
4125 if (ret) 4105 if (ret)
4126 return ret; 4106 return ret;
4127 4107
4128 sys_chunk_offset = chunk_offset + chunk_size; 4108 sys_chunk_offset = find_next_chunk(root->fs_info);
4129
4130 alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0); 4109 alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0);
4131 ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, 4110 ret = __btrfs_alloc_chunk(trans, extent_root, sys_chunk_offset,
4132 &sys_chunk_size, &sys_stripe_size, 4111 alloc_profile);
4133 sys_chunk_offset, alloc_profile);
4134 if (ret) { 4112 if (ret) {
4135 btrfs_abort_transaction(trans, root, ret); 4113 btrfs_abort_transaction(trans, root, ret);
4136 goto out; 4114 goto out;
4137 } 4115 }
4138 4116
4139 ret = btrfs_add_device(trans, fs_info->chunk_root, device); 4117 ret = btrfs_add_device(trans, fs_info->chunk_root, device);
4140 if (ret) {
4141 btrfs_abort_transaction(trans, root, ret);
4142 goto out;
4143 }
4144
4145 /*
4146 * Modifying chunk tree needs allocating new blocks from both
4147 * system block group and metadata block group. So we only can
4148 * do operations require modifying the chunk tree after both
4149 * block groups were created.
4150 */
4151 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
4152 chunk_size, stripe_size);
4153 if (ret) {
4154 btrfs_abort_transaction(trans, root, ret);
4155 goto out;
4156 }
4157
4158 ret = __finish_chunk_alloc(trans, extent_root, sys_map,
4159 sys_chunk_offset, sys_chunk_size,
4160 sys_stripe_size);
4161 if (ret) 4118 if (ret)
4162 btrfs_abort_transaction(trans, root, ret); 4119 btrfs_abort_transaction(trans, root, ret);
4163
4164out: 4120out:
4165
4166 return ret; 4121 return ret;
4167} 4122}
4168 4123
@@ -4435,9 +4390,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4435 map = (struct map_lookup *)em->bdev; 4390 map = (struct map_lookup *)em->bdev;
4436 offset = logical - em->start; 4391 offset = logical - em->start;
4437 4392
4438 if (mirror_num > map->num_stripes)
4439 mirror_num = 0;
4440
4441 stripe_len = map->stripe_len; 4393 stripe_len = map->stripe_len;
4442 stripe_nr = offset; 4394 stripe_nr = offset;
4443 /* 4395 /*
@@ -5367,7 +5319,6 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
5367 return NULL; 5319 return NULL;
5368 list_add(&device->dev_list, 5320 list_add(&device->dev_list,
5369 &fs_devices->devices); 5321 &fs_devices->devices);
5370 device->dev_root = root->fs_info->dev_root;
5371 device->devid = devid; 5322 device->devid = devid;
5372 device->work.func = pending_bios_fn; 5323 device->work.func = pending_bios_fn;
5373 device->fs_devices = fs_devices; 5324 device->fs_devices = fs_devices;
@@ -5593,7 +5544,6 @@ static int read_one_dev(struct btrfs_root *root,
5593 } 5544 }
5594 5545
5595 fill_device_from_item(leaf, dev_item, device); 5546 fill_device_from_item(leaf, dev_item, device);
5596 device->dev_root = root->fs_info->dev_root;
5597 device->in_fs_metadata = 1; 5547 device->in_fs_metadata = 1;
5598 if (device->writeable && !device->is_tgtdev_for_dev_replace) { 5548 if (device->writeable && !device->is_tgtdev_for_dev_replace) {
5599 device->fs_devices->total_rw_bytes += device->total_bytes; 5549 device->fs_devices->total_rw_bytes += device->total_bytes;
@@ -5751,6 +5701,17 @@ error:
5751 return ret; 5701 return ret;
5752} 5702}
5753 5703
5704void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
5705{
5706 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
5707 struct btrfs_device *device;
5708
5709 mutex_lock(&fs_devices->device_list_mutex);
5710 list_for_each_entry(device, &fs_devices->devices, dev_list)
5711 device->dev_root = fs_info->dev_root;
5712 mutex_unlock(&fs_devices->device_list_mutex);
5713}
5714
5754static void __btrfs_reset_dev_stats(struct btrfs_device *dev) 5715static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
5755{ 5716{
5756 int i; 5717 int i;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index f6247e2a47f7..86705583480d 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -316,11 +316,13 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info);
316int btrfs_pause_balance(struct btrfs_fs_info *fs_info); 316int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
317int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); 317int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
318int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); 318int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
319int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, 319int find_free_dev_extent(struct btrfs_trans_handle *trans,
320 struct btrfs_device *device, u64 num_bytes,
320 u64 *start, u64 *max_avail); 321 u64 *start, u64 *max_avail);
321void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index); 322void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index);
322int btrfs_get_dev_stats(struct btrfs_root *root, 323int btrfs_get_dev_stats(struct btrfs_root *root,
323 struct btrfs_ioctl_get_dev_stats *stats); 324 struct btrfs_ioctl_get_dev_stats *stats);
325void btrfs_init_devices_late(struct btrfs_fs_info *fs_info);
324int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); 326int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
325int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, 327int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
326 struct btrfs_fs_info *fs_info); 328 struct btrfs_fs_info *fs_info);
@@ -336,6 +338,9 @@ int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
336unsigned long btrfs_full_stripe_len(struct btrfs_root *root, 338unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
337 struct btrfs_mapping_tree *map_tree, 339 struct btrfs_mapping_tree *map_tree,
338 u64 logical); 340 u64 logical);
341int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
342 struct btrfs_root *extent_root,
343 u64 chunk_offset, u64 chunk_size);
339static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, 344static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
340 int index) 345 int index)
341{ 346{
diff --git a/fs/buffer.c b/fs/buffer.c
index f93392e2df12..4d7433534f5c 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -83,6 +83,40 @@ void unlock_buffer(struct buffer_head *bh)
83EXPORT_SYMBOL(unlock_buffer); 83EXPORT_SYMBOL(unlock_buffer);
84 84
85/* 85/*
86 * Returns if the page has dirty or writeback buffers. If all the buffers
87 * are unlocked and clean then the PageDirty information is stale. If
88 * any of the buffers are locked, it is assumed they are locked for IO.
89 */
90void buffer_check_dirty_writeback(struct page *page,
91 bool *dirty, bool *writeback)
92{
93 struct buffer_head *head, *bh;
94 *dirty = false;
95 *writeback = false;
96
97 BUG_ON(!PageLocked(page));
98
99 if (!page_has_buffers(page))
100 return;
101
102 if (PageWriteback(page))
103 *writeback = true;
104
105 head = page_buffers(page);
106 bh = head;
107 do {
108 if (buffer_locked(bh))
109 *writeback = true;
110
111 if (buffer_dirty(bh))
112 *dirty = true;
113
114 bh = bh->b_this_page;
115 } while (bh != head);
116}
117EXPORT_SYMBOL(buffer_check_dirty_writeback);
118
119/*
86 * Block until a buffer comes unlocked. This doesn't stop it 120 * Block until a buffer comes unlocked. This doesn't stop it
87 * from becoming locked again - you have to lock it yourself 121 * from becoming locked again - you have to lock it yourself
88 * if you want to preserve its state. 122 * if you want to preserve its state.
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 317f9ee9c991..ebaff368120d 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -12,6 +12,7 @@
12#include <linux/mount.h> 12#include <linux/mount.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/file.h> 14#include <linux/file.h>
15#include <linux/swap.h>
15#include "internal.h" 16#include "internal.h"
16 17
17/* 18/*
@@ -227,8 +228,7 @@ static void cachefiles_read_copier(struct fscache_operation *_op)
227 */ 228 */
228static int cachefiles_read_backing_file_one(struct cachefiles_object *object, 229static int cachefiles_read_backing_file_one(struct cachefiles_object *object,
229 struct fscache_retrieval *op, 230 struct fscache_retrieval *op,
230 struct page *netpage, 231 struct page *netpage)
231 struct pagevec *pagevec)
232{ 232{
233 struct cachefiles_one_read *monitor; 233 struct cachefiles_one_read *monitor;
234 struct address_space *bmapping; 234 struct address_space *bmapping;
@@ -237,8 +237,6 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object,
237 237
238 _enter(""); 238 _enter("");
239 239
240 pagevec_reinit(pagevec);
241
242 _debug("read back %p{%lu,%d}", 240 _debug("read back %p{%lu,%d}",
243 netpage, netpage->index, page_count(netpage)); 241 netpage, netpage->index, page_count(netpage));
244 242
@@ -283,9 +281,7 @@ installed_new_backing_page:
283 backpage = newpage; 281 backpage = newpage;
284 newpage = NULL; 282 newpage = NULL;
285 283
286 page_cache_get(backpage); 284 lru_cache_add_file(backpage);
287 pagevec_add(pagevec, backpage);
288 __pagevec_lru_add_file(pagevec);
289 285
290read_backing_page: 286read_backing_page:
291 ret = bmapping->a_ops->readpage(NULL, backpage); 287 ret = bmapping->a_ops->readpage(NULL, backpage);
@@ -452,8 +448,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
452 if (block) { 448 if (block) {
453 /* submit the apparently valid page to the backing fs to be 449 /* submit the apparently valid page to the backing fs to be
454 * read from disk */ 450 * read from disk */
455 ret = cachefiles_read_backing_file_one(object, op, page, 451 ret = cachefiles_read_backing_file_one(object, op, page);
456 &pagevec);
457 } else if (cachefiles_has_space(cache, 0, 1) == 0) { 452 } else if (cachefiles_has_space(cache, 0, 1) == 0) {
458 /* there's space in the cache we can use */ 453 /* there's space in the cache we can use */
459 fscache_mark_page_cached(op, page); 454 fscache_mark_page_cached(op, page);
@@ -482,14 +477,11 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
482{ 477{
483 struct cachefiles_one_read *monitor = NULL; 478 struct cachefiles_one_read *monitor = NULL;
484 struct address_space *bmapping = object->backer->d_inode->i_mapping; 479 struct address_space *bmapping = object->backer->d_inode->i_mapping;
485 struct pagevec lru_pvec;
486 struct page *newpage = NULL, *netpage, *_n, *backpage = NULL; 480 struct page *newpage = NULL, *netpage, *_n, *backpage = NULL;
487 int ret = 0; 481 int ret = 0;
488 482
489 _enter(""); 483 _enter("");
490 484
491 pagevec_init(&lru_pvec, 0);
492
493 list_for_each_entry_safe(netpage, _n, list, lru) { 485 list_for_each_entry_safe(netpage, _n, list, lru) {
494 list_del(&netpage->lru); 486 list_del(&netpage->lru);
495 487
@@ -534,9 +526,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
534 backpage = newpage; 526 backpage = newpage;
535 newpage = NULL; 527 newpage = NULL;
536 528
537 page_cache_get(backpage); 529 lru_cache_add_file(backpage);
538 if (!pagevec_add(&lru_pvec, backpage))
539 __pagevec_lru_add_file(&lru_pvec);
540 530
541 reread_backing_page: 531 reread_backing_page:
542 ret = bmapping->a_ops->readpage(NULL, backpage); 532 ret = bmapping->a_ops->readpage(NULL, backpage);
@@ -559,9 +549,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
559 goto nomem; 549 goto nomem;
560 } 550 }
561 551
562 page_cache_get(netpage); 552 lru_cache_add_file(netpage);
563 if (!pagevec_add(&lru_pvec, netpage))
564 __pagevec_lru_add_file(&lru_pvec);
565 553
566 /* install a monitor */ 554 /* install a monitor */
567 page_cache_get(netpage); 555 page_cache_get(netpage);
@@ -643,9 +631,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
643 631
644 fscache_mark_page_cached(op, netpage); 632 fscache_mark_page_cached(op, netpage);
645 633
646 page_cache_get(netpage); 634 lru_cache_add_file(netpage);
647 if (!pagevec_add(&lru_pvec, netpage))
648 __pagevec_lru_add_file(&lru_pvec);
649 635
650 /* the netpage is unlocked and marked up to date here */ 636 /* the netpage is unlocked and marked up to date here */
651 fscache_end_io(op, netpage, 0); 637 fscache_end_io(op, netpage, 0);
@@ -661,8 +647,6 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
661 647
662out: 648out:
663 /* tidy up */ 649 /* tidy up */
664 pagevec_lru_add_file(&lru_pvec);
665
666 if (newpage) 650 if (newpage)
667 page_cache_release(newpage); 651 page_cache_release(newpage);
668 if (netpage) 652 if (netpage)
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 38b5c1bc6776..5318a3b704f6 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -439,13 +439,12 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
439 struct ceph_inode_info *ci; 439 struct ceph_inode_info *ci;
440 struct ceph_fs_client *fsc; 440 struct ceph_fs_client *fsc;
441 struct ceph_osd_client *osdc; 441 struct ceph_osd_client *osdc;
442 loff_t page_off = page_offset(page);
443 int len = PAGE_CACHE_SIZE;
444 loff_t i_size;
445 int err = 0;
446 struct ceph_snap_context *snapc, *oldest; 442 struct ceph_snap_context *snapc, *oldest;
447 u64 snap_size = 0; 443 loff_t page_off = page_offset(page);
448 long writeback_stat; 444 long writeback_stat;
445 u64 truncate_size, snap_size = 0;
446 u32 truncate_seq;
447 int err = 0, len = PAGE_CACHE_SIZE;
449 448
450 dout("writepage %p idx %lu\n", page, page->index); 449 dout("writepage %p idx %lu\n", page, page->index);
451 450
@@ -475,13 +474,20 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
475 } 474 }
476 ceph_put_snap_context(oldest); 475 ceph_put_snap_context(oldest);
477 476
477 spin_lock(&ci->i_ceph_lock);
478 truncate_seq = ci->i_truncate_seq;
479 truncate_size = ci->i_truncate_size;
480 if (!snap_size)
481 snap_size = i_size_read(inode);
482 spin_unlock(&ci->i_ceph_lock);
483
478 /* is this a partial page at end of file? */ 484 /* is this a partial page at end of file? */
479 if (snap_size) 485 if (page_off >= snap_size) {
480 i_size = snap_size; 486 dout("%p page eof %llu\n", page, snap_size);
481 else 487 goto out;
482 i_size = i_size_read(inode); 488 }
483 if (i_size < page_off + len) 489 if (snap_size < page_off + len)
484 len = i_size - page_off; 490 len = snap_size - page_off;
485 491
486 dout("writepage %p page %p index %lu on %llu~%u snapc %p\n", 492 dout("writepage %p page %p index %lu on %llu~%u snapc %p\n",
487 inode, page, page->index, page_off, len, snapc); 493 inode, page, page->index, page_off, len, snapc);
@@ -495,7 +501,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
495 err = ceph_osdc_writepages(osdc, ceph_vino(inode), 501 err = ceph_osdc_writepages(osdc, ceph_vino(inode),
496 &ci->i_layout, snapc, 502 &ci->i_layout, snapc,
497 page_off, len, 503 page_off, len,
498 ci->i_truncate_seq, ci->i_truncate_size, 504 truncate_seq, truncate_size,
499 &inode->i_mtime, &page, 1); 505 &inode->i_mtime, &page, 1);
500 if (err < 0) { 506 if (err < 0) {
501 dout("writepage setting page/mapping error %d %p\n", err, page); 507 dout("writepage setting page/mapping error %d %p\n", err, page);
@@ -632,25 +638,6 @@ static void writepages_finish(struct ceph_osd_request *req,
632 ceph_osdc_put_request(req); 638 ceph_osdc_put_request(req);
633} 639}
634 640
635static struct ceph_osd_request *
636ceph_writepages_osd_request(struct inode *inode, u64 offset, u64 *len,
637 struct ceph_snap_context *snapc, int num_ops)
638{
639 struct ceph_fs_client *fsc;
640 struct ceph_inode_info *ci;
641 struct ceph_vino vino;
642
643 fsc = ceph_inode_to_client(inode);
644 ci = ceph_inode(inode);
645 vino = ceph_vino(inode);
646 /* BUG_ON(vino.snap != CEPH_NOSNAP); */
647
648 return ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
649 vino, offset, len, num_ops, CEPH_OSD_OP_WRITE,
650 CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK,
651 snapc, ci->i_truncate_seq, ci->i_truncate_size, true);
652}
653
654/* 641/*
655 * initiate async writeback 642 * initiate async writeback
656 */ 643 */
@@ -659,7 +646,8 @@ static int ceph_writepages_start(struct address_space *mapping,
659{ 646{
660 struct inode *inode = mapping->host; 647 struct inode *inode = mapping->host;
661 struct ceph_inode_info *ci = ceph_inode(inode); 648 struct ceph_inode_info *ci = ceph_inode(inode);
662 struct ceph_fs_client *fsc; 649 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
650 struct ceph_vino vino = ceph_vino(inode);
663 pgoff_t index, start, end; 651 pgoff_t index, start, end;
664 int range_whole = 0; 652 int range_whole = 0;
665 int should_loop = 1; 653 int should_loop = 1;
@@ -671,22 +659,22 @@ static int ceph_writepages_start(struct address_space *mapping,
671 unsigned wsize = 1 << inode->i_blkbits; 659 unsigned wsize = 1 << inode->i_blkbits;
672 struct ceph_osd_request *req = NULL; 660 struct ceph_osd_request *req = NULL;
673 int do_sync; 661 int do_sync;
674 u64 snap_size; 662 u64 truncate_size, snap_size;
663 u32 truncate_seq;
675 664
676 /* 665 /*
677 * Include a 'sync' in the OSD request if this is a data 666 * Include a 'sync' in the OSD request if this is a data
678 * integrity write (e.g., O_SYNC write or fsync()), or if our 667 * integrity write (e.g., O_SYNC write or fsync()), or if our
679 * cap is being revoked. 668 * cap is being revoked.
680 */ 669 */
681 do_sync = wbc->sync_mode == WB_SYNC_ALL; 670 if ((wbc->sync_mode == WB_SYNC_ALL) ||
682 if (ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER)) 671 ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER))
683 do_sync = 1; 672 do_sync = 1;
684 dout("writepages_start %p dosync=%d (mode=%s)\n", 673 dout("writepages_start %p dosync=%d (mode=%s)\n",
685 inode, do_sync, 674 inode, do_sync,
686 wbc->sync_mode == WB_SYNC_NONE ? "NONE" : 675 wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
687 (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); 676 (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
688 677
689 fsc = ceph_inode_to_client(inode);
690 if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) { 678 if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {
691 pr_warning("writepage_start %p on forced umount\n", inode); 679 pr_warning("writepage_start %p on forced umount\n", inode);
692 return -EIO; /* we're in a forced umount, don't write! */ 680 return -EIO; /* we're in a forced umount, don't write! */
@@ -729,6 +717,14 @@ retry:
729 snap_size = i_size_read(inode); 717 snap_size = i_size_read(inode);
730 dout(" oldest snapc is %p seq %lld (%d snaps)\n", 718 dout(" oldest snapc is %p seq %lld (%d snaps)\n",
731 snapc, snapc->seq, snapc->num_snaps); 719 snapc, snapc->seq, snapc->num_snaps);
720
721 spin_lock(&ci->i_ceph_lock);
722 truncate_seq = ci->i_truncate_seq;
723 truncate_size = ci->i_truncate_size;
724 if (!snap_size)
725 snap_size = i_size_read(inode);
726 spin_unlock(&ci->i_ceph_lock);
727
732 if (last_snapc && snapc != last_snapc) { 728 if (last_snapc && snapc != last_snapc) {
733 /* if we switched to a newer snapc, restart our scan at the 729 /* if we switched to a newer snapc, restart our scan at the
734 * start of the original file range. */ 730 * start of the original file range. */
@@ -740,7 +736,6 @@ retry:
740 736
741 while (!done && index <= end) { 737 while (!done && index <= end) {
742 int num_ops = do_sync ? 2 : 1; 738 int num_ops = do_sync ? 2 : 1;
743 struct ceph_vino vino;
744 unsigned i; 739 unsigned i;
745 int first; 740 int first;
746 pgoff_t next; 741 pgoff_t next;
@@ -834,17 +829,18 @@ get_more_pages:
834 * that it will use. 829 * that it will use.
835 */ 830 */
836 if (locked_pages == 0) { 831 if (locked_pages == 0) {
837 size_t size;
838
839 BUG_ON(pages); 832 BUG_ON(pages);
840
841 /* prepare async write request */ 833 /* prepare async write request */
842 offset = (u64)page_offset(page); 834 offset = (u64)page_offset(page);
843 len = wsize; 835 len = wsize;
844 req = ceph_writepages_osd_request(inode, 836 req = ceph_osdc_new_request(&fsc->client->osdc,
845 offset, &len, snapc, 837 &ci->i_layout, vino,
846 num_ops); 838 offset, &len, num_ops,
847 839 CEPH_OSD_OP_WRITE,
840 CEPH_OSD_FLAG_WRITE |
841 CEPH_OSD_FLAG_ONDISK,
842 snapc, truncate_seq,
843 truncate_size, true);
848 if (IS_ERR(req)) { 844 if (IS_ERR(req)) {
849 rc = PTR_ERR(req); 845 rc = PTR_ERR(req);
850 unlock_page(page); 846 unlock_page(page);
@@ -855,8 +851,8 @@ get_more_pages:
855 req->r_inode = inode; 851 req->r_inode = inode;
856 852
857 max_pages = calc_pages_for(0, (u64)len); 853 max_pages = calc_pages_for(0, (u64)len);
858 size = max_pages * sizeof (*pages); 854 pages = kmalloc(max_pages * sizeof (*pages),
859 pages = kmalloc(size, GFP_NOFS); 855 GFP_NOFS);
860 if (!pages) { 856 if (!pages) {
861 pool = fsc->wb_pagevec_pool; 857 pool = fsc->wb_pagevec_pool;
862 pages = mempool_alloc(pool, GFP_NOFS); 858 pages = mempool_alloc(pool, GFP_NOFS);
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index da0f9b8a3bcb..25442b40c25a 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -147,7 +147,7 @@ void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta)
147 spin_unlock(&mdsc->caps_list_lock); 147 spin_unlock(&mdsc->caps_list_lock);
148} 148}
149 149
150int ceph_reserve_caps(struct ceph_mds_client *mdsc, 150void ceph_reserve_caps(struct ceph_mds_client *mdsc,
151 struct ceph_cap_reservation *ctx, int need) 151 struct ceph_cap_reservation *ctx, int need)
152{ 152{
153 int i; 153 int i;
@@ -155,7 +155,6 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,
155 int have; 155 int have;
156 int alloc = 0; 156 int alloc = 0;
157 LIST_HEAD(newcaps); 157 LIST_HEAD(newcaps);
158 int ret = 0;
159 158
160 dout("reserve caps ctx=%p need=%d\n", ctx, need); 159 dout("reserve caps ctx=%p need=%d\n", ctx, need);
161 160
@@ -174,14 +173,15 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,
174 173
175 for (i = have; i < need; i++) { 174 for (i = have; i < need; i++) {
176 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); 175 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
177 if (!cap) { 176 if (!cap)
178 ret = -ENOMEM; 177 break;
179 goto out_alloc_count;
180 }
181 list_add(&cap->caps_item, &newcaps); 178 list_add(&cap->caps_item, &newcaps);
182 alloc++; 179 alloc++;
183 } 180 }
184 BUG_ON(have + alloc != need); 181 /* we didn't manage to reserve as much as we needed */
182 if (have + alloc != need)
183 pr_warn("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
184 ctx, need, have + alloc);
185 185
186 spin_lock(&mdsc->caps_list_lock); 186 spin_lock(&mdsc->caps_list_lock);
187 mdsc->caps_total_count += alloc; 187 mdsc->caps_total_count += alloc;
@@ -197,13 +197,6 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,
197 dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n", 197 dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
198 ctx, mdsc->caps_total_count, mdsc->caps_use_count, 198 ctx, mdsc->caps_total_count, mdsc->caps_use_count,
199 mdsc->caps_reserve_count, mdsc->caps_avail_count); 199 mdsc->caps_reserve_count, mdsc->caps_avail_count);
200 return 0;
201
202out_alloc_count:
203 /* we didn't manage to reserve as much as we needed */
204 pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
205 ctx, need, have);
206 return ret;
207} 200}
208 201
209int ceph_unreserve_caps(struct ceph_mds_client *mdsc, 202int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
@@ -612,9 +605,11 @@ retry:
612 __cap_delay_requeue(mdsc, ci); 605 __cap_delay_requeue(mdsc, ci);
613 } 606 }
614 607
615 if (flags & CEPH_CAP_FLAG_AUTH) 608 if (flags & CEPH_CAP_FLAG_AUTH) {
616 ci->i_auth_cap = cap; 609 if (ci->i_auth_cap == NULL ||
617 else if (ci->i_auth_cap == cap) { 610 ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0)
611 ci->i_auth_cap = cap;
612 } else if (ci->i_auth_cap == cap) {
618 ci->i_auth_cap = NULL; 613 ci->i_auth_cap = NULL;
619 spin_lock(&mdsc->cap_dirty_lock); 614 spin_lock(&mdsc->cap_dirty_lock);
620 if (!list_empty(&ci->i_dirty_item)) { 615 if (!list_empty(&ci->i_dirty_item)) {
@@ -695,6 +690,15 @@ int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
695 if (implemented) 690 if (implemented)
696 *implemented |= cap->implemented; 691 *implemented |= cap->implemented;
697 } 692 }
693 /*
694 * exclude caps issued by non-auth MDS, but are being revoked
695 * by the auth MDS. The non-auth MDS should be revoking/exporting
696 * these caps, but the message is delayed.
697 */
698 if (ci->i_auth_cap) {
699 cap = ci->i_auth_cap;
700 have &= ~cap->implemented | cap->issued;
701 }
698 return have; 702 return have;
699} 703}
700 704
@@ -802,22 +806,28 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
802/* 806/*
803 * Return true if mask caps are currently being revoked by an MDS. 807 * Return true if mask caps are currently being revoked by an MDS.
804 */ 808 */
805int ceph_caps_revoking(struct ceph_inode_info *ci, int mask) 809int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
810 struct ceph_cap *ocap, int mask)
806{ 811{
807 struct inode *inode = &ci->vfs_inode;
808 struct ceph_cap *cap; 812 struct ceph_cap *cap;
809 struct rb_node *p; 813 struct rb_node *p;
810 int ret = 0;
811 814
812 spin_lock(&ci->i_ceph_lock);
813 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { 815 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
814 cap = rb_entry(p, struct ceph_cap, ci_node); 816 cap = rb_entry(p, struct ceph_cap, ci_node);
815 if (__cap_is_valid(cap) && 817 if (cap != ocap && __cap_is_valid(cap) &&
816 (cap->implemented & ~cap->issued & mask)) { 818 (cap->implemented & ~cap->issued & mask))
817 ret = 1; 819 return 1;
818 break;
819 }
820 } 820 }
821 return 0;
822}
823
824int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
825{
826 struct inode *inode = &ci->vfs_inode;
827 int ret;
828
829 spin_lock(&ci->i_ceph_lock);
830 ret = __ceph_caps_revoking_other(ci, NULL, mask);
821 spin_unlock(&ci->i_ceph_lock); 831 spin_unlock(&ci->i_ceph_lock);
822 dout("ceph_caps_revoking %p %s = %d\n", inode, 832 dout("ceph_caps_revoking %p %s = %d\n", inode,
823 ceph_cap_string(mask), ret); 833 ceph_cap_string(mask), ret);
@@ -1980,8 +1990,15 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
1980 cap = ci->i_auth_cap; 1990 cap = ci->i_auth_cap;
1981 dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode, 1991 dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode,
1982 ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq); 1992 ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq);
1993
1983 __ceph_flush_snaps(ci, &session, 1); 1994 __ceph_flush_snaps(ci, &session, 1);
1995
1984 if (ci->i_flushing_caps) { 1996 if (ci->i_flushing_caps) {
1997 spin_lock(&mdsc->cap_dirty_lock);
1998 list_move_tail(&ci->i_flushing_item,
1999 &cap->session->s_cap_flushing);
2000 spin_unlock(&mdsc->cap_dirty_lock);
2001
1985 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, 2002 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
1986 __ceph_caps_used(ci), 2003 __ceph_caps_used(ci),
1987 __ceph_caps_wanted(ci), 2004 __ceph_caps_wanted(ci),
@@ -2055,7 +2072,11 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
2055 /* finish pending truncate */ 2072 /* finish pending truncate */
2056 while (ci->i_truncate_pending) { 2073 while (ci->i_truncate_pending) {
2057 spin_unlock(&ci->i_ceph_lock); 2074 spin_unlock(&ci->i_ceph_lock);
2058 __ceph_do_pending_vmtruncate(inode, !(need & CEPH_CAP_FILE_WR)); 2075 if (!(need & CEPH_CAP_FILE_WR))
2076 mutex_lock(&inode->i_mutex);
2077 __ceph_do_pending_vmtruncate(inode);
2078 if (!(need & CEPH_CAP_FILE_WR))
2079 mutex_unlock(&inode->i_mutex);
2059 spin_lock(&ci->i_ceph_lock); 2080 spin_lock(&ci->i_ceph_lock);
2060 } 2081 }
2061 2082
@@ -2473,6 +2494,11 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2473 } else { 2494 } else {
2474 dout("grant: %s -> %s\n", ceph_cap_string(cap->issued), 2495 dout("grant: %s -> %s\n", ceph_cap_string(cap->issued),
2475 ceph_cap_string(newcaps)); 2496 ceph_cap_string(newcaps));
2497 /* non-auth MDS is revoking the newly granted caps? */
2498 if (cap == ci->i_auth_cap &&
2499 __ceph_caps_revoking_other(ci, cap, newcaps))
2500 check_caps = 2;
2501
2476 cap->issued = newcaps; 2502 cap->issued = newcaps;
2477 cap->implemented |= newcaps; /* add bits only, to 2503 cap->implemented |= newcaps; /* add bits only, to
2478 * avoid stepping on a 2504 * avoid stepping on a
@@ -3042,21 +3068,19 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
3042 (cap->issued & unless) == 0)) { 3068 (cap->issued & unless) == 0)) {
3043 if ((cap->issued & drop) && 3069 if ((cap->issued & drop) &&
3044 (cap->issued & unless) == 0) { 3070 (cap->issued & unless) == 0) {
3045 dout("encode_inode_release %p cap %p %s -> " 3071 int wanted = __ceph_caps_wanted(ci);
3046 "%s\n", inode, cap, 3072 if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0)
3073 wanted |= cap->mds_wanted;
3074 dout("encode_inode_release %p cap %p "
3075 "%s -> %s, wanted %s -> %s\n", inode, cap,
3047 ceph_cap_string(cap->issued), 3076 ceph_cap_string(cap->issued),
3048 ceph_cap_string(cap->issued & ~drop)); 3077 ceph_cap_string(cap->issued & ~drop),
3078 ceph_cap_string(cap->mds_wanted),
3079 ceph_cap_string(wanted));
3080
3049 cap->issued &= ~drop; 3081 cap->issued &= ~drop;
3050 cap->implemented &= ~drop; 3082 cap->implemented &= ~drop;
3051 if (ci->i_ceph_flags & CEPH_I_NODELAY) { 3083 cap->mds_wanted = wanted;
3052 int wanted = __ceph_caps_wanted(ci);
3053 dout(" wanted %s -> %s (act %s)\n",
3054 ceph_cap_string(cap->mds_wanted),
3055 ceph_cap_string(cap->mds_wanted &
3056 ~wanted),
3057 ceph_cap_string(wanted));
3058 cap->mds_wanted &= wanted;
3059 }
3060 } else { 3084 } else {
3061 dout("encode_inode_release %p cap %p %s" 3085 dout("encode_inode_release %p cap %p %s"
3062 " (force)\n", inode, cap, 3086 " (force)\n", inode, cap,
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 656e16907430..2ddf061c1c4a 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -716,7 +716,6 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
716 if (ceph_snap(inode) != CEPH_NOSNAP) 716 if (ceph_snap(inode) != CEPH_NOSNAP)
717 return -EROFS; 717 return -EROFS;
718 718
719 sb_start_write(inode->i_sb);
720 mutex_lock(&inode->i_mutex); 719 mutex_lock(&inode->i_mutex);
721 hold_mutex = true; 720 hold_mutex = true;
722 721
@@ -809,7 +808,6 @@ retry_snap:
809out: 808out:
810 if (hold_mutex) 809 if (hold_mutex)
811 mutex_unlock(&inode->i_mutex); 810 mutex_unlock(&inode->i_mutex);
812 sb_end_write(inode->i_sb);
813 current->backing_dev_info = NULL; 811 current->backing_dev_info = NULL;
814 812
815 return written ? written : err; 813 return written ? written : err;
@@ -824,7 +822,7 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
824 int ret; 822 int ret;
825 823
826 mutex_lock(&inode->i_mutex); 824 mutex_lock(&inode->i_mutex);
827 __ceph_do_pending_vmtruncate(inode, false); 825 __ceph_do_pending_vmtruncate(inode);
828 826
829 if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) { 827 if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) {
830 ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); 828 ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
@@ -866,16 +864,7 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
866 break; 864 break;
867 } 865 }
868 866
869 if (offset < 0 || offset > inode->i_sb->s_maxbytes) { 867 offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
870 offset = -EINVAL;
871 goto out;
872 }
873
874 /* Special lock needed here? */
875 if (offset != file->f_pos) {
876 file->f_pos = offset;
877 file->f_version = 0;
878 }
879 868
880out: 869out:
881 mutex_unlock(&inode->i_mutex); 870 mutex_unlock(&inode->i_mutex);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index be0f7e20d62e..f3a2abf28a77 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -903,8 +903,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
903 } else if (realdn) { 903 } else if (realdn) {
904 dout("dn %p (%d) spliced with %p (%d) " 904 dout("dn %p (%d) spliced with %p (%d) "
905 "inode %p ino %llx.%llx\n", 905 "inode %p ino %llx.%llx\n",
906 dn, dn->d_count, 906 dn, d_count(dn),
907 realdn, realdn->d_count, 907 realdn, d_count(realdn),
908 realdn->d_inode, ceph_vinop(realdn->d_inode)); 908 realdn->d_inode, ceph_vinop(realdn->d_inode));
909 dput(dn); 909 dput(dn);
910 dn = realdn; 910 dn = realdn;
@@ -1465,7 +1465,9 @@ static void ceph_vmtruncate_work(struct work_struct *work)
1465 struct inode *inode = &ci->vfs_inode; 1465 struct inode *inode = &ci->vfs_inode;
1466 1466
1467 dout("vmtruncate_work %p\n", inode); 1467 dout("vmtruncate_work %p\n", inode);
1468 __ceph_do_pending_vmtruncate(inode, true); 1468 mutex_lock(&inode->i_mutex);
1469 __ceph_do_pending_vmtruncate(inode);
1470 mutex_unlock(&inode->i_mutex);
1469 iput(inode); 1471 iput(inode);
1470} 1472}
1471 1473
@@ -1492,7 +1494,7 @@ void ceph_queue_vmtruncate(struct inode *inode)
1492 * Make sure any pending truncation is applied before doing anything 1494 * Make sure any pending truncation is applied before doing anything
1493 * that may depend on it. 1495 * that may depend on it.
1494 */ 1496 */
1495void __ceph_do_pending_vmtruncate(struct inode *inode, bool needlock) 1497void __ceph_do_pending_vmtruncate(struct inode *inode)
1496{ 1498{
1497 struct ceph_inode_info *ci = ceph_inode(inode); 1499 struct ceph_inode_info *ci = ceph_inode(inode);
1498 u64 to; 1500 u64 to;
@@ -1525,11 +1527,7 @@ retry:
1525 ci->i_truncate_pending, to); 1527 ci->i_truncate_pending, to);
1526 spin_unlock(&ci->i_ceph_lock); 1528 spin_unlock(&ci->i_ceph_lock);
1527 1529
1528 if (needlock)
1529 mutex_lock(&inode->i_mutex);
1530 truncate_inode_pages(inode->i_mapping, to); 1530 truncate_inode_pages(inode->i_mapping, to);
1531 if (needlock)
1532 mutex_unlock(&inode->i_mutex);
1533 1531
1534 spin_lock(&ci->i_ceph_lock); 1532 spin_lock(&ci->i_ceph_lock);
1535 if (to == ci->i_truncate_size) { 1533 if (to == ci->i_truncate_size) {
@@ -1588,7 +1586,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
1588 if (ceph_snap(inode) != CEPH_NOSNAP) 1586 if (ceph_snap(inode) != CEPH_NOSNAP)
1589 return -EROFS; 1587 return -EROFS;
1590 1588
1591 __ceph_do_pending_vmtruncate(inode, false); 1589 __ceph_do_pending_vmtruncate(inode);
1592 1590
1593 err = inode_change_ok(inode, attr); 1591 err = inode_change_ok(inode, attr);
1594 if (err != 0) 1592 if (err != 0)
@@ -1770,7 +1768,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
1770 ceph_cap_string(dirtied), mask); 1768 ceph_cap_string(dirtied), mask);
1771 1769
1772 ceph_mdsc_put_request(req); 1770 ceph_mdsc_put_request(req);
1773 __ceph_do_pending_vmtruncate(inode, false); 1771 __ceph_do_pending_vmtruncate(inode);
1774 return err; 1772 return err;
1775out: 1773out:
1776 spin_unlock(&ci->i_ceph_lock); 1774 spin_unlock(&ci->i_ceph_lock);
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index ebbf680378e2..ae6d14e82b0f 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -169,7 +169,7 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
169} 169}
170 170
171/** 171/**
172 * Must be called with BKL already held. Fills in the passed 172 * Must be called with lock_flocks() already held. Fills in the passed
173 * counter variables, so you can prepare pagelist metadata before calling 173 * counter variables, so you can prepare pagelist metadata before calling
174 * ceph_encode_locks. 174 * ceph_encode_locks.
175 */ 175 */
@@ -192,7 +192,7 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
192 192
193/** 193/**
194 * Encode the flock and fcntl locks for the given inode into the ceph_filelock 194 * Encode the flock and fcntl locks for the given inode into the ceph_filelock
195 * array. Must be called with lock_flocks() already held. 195 * array. Must be called with inode->i_lock already held.
196 * If we encounter more of a specific lock type than expected, return -ENOSPC. 196 * If we encounter more of a specific lock type than expected, return -ENOSPC.
197 */ 197 */
198int ceph_encode_locks_to_buffer(struct inode *inode, 198int ceph_encode_locks_to_buffer(struct inode *inode,
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 4d2920304be8..187bf214444d 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1391,6 +1391,7 @@ static void discard_cap_releases(struct ceph_mds_client *mdsc,
1391 num = le32_to_cpu(head->num); 1391 num = le32_to_cpu(head->num);
1392 dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num); 1392 dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num);
1393 head->num = cpu_to_le32(0); 1393 head->num = cpu_to_le32(0);
1394 msg->front.iov_len = sizeof(*head);
1394 session->s_num_cap_releases += num; 1395 session->s_num_cap_releases += num;
1395 1396
1396 /* requeue completed messages */ 1397 /* requeue completed messages */
@@ -1553,7 +1554,7 @@ retry:
1553 *base = ceph_ino(temp->d_inode); 1554 *base = ceph_ino(temp->d_inode);
1554 *plen = len; 1555 *plen = len;
1555 dout("build_path on %p %d built %llx '%.*s'\n", 1556 dout("build_path on %p %d built %llx '%.*s'\n",
1556 dentry, dentry->d_count, *base, len, path); 1557 dentry, d_count(dentry), *base, len, path);
1557 return path; 1558 return path;
1558} 1559}
1559 1560
@@ -2454,6 +2455,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
2454 spin_lock(&ci->i_ceph_lock); 2455 spin_lock(&ci->i_ceph_lock);
2455 cap->seq = 0; /* reset cap seq */ 2456 cap->seq = 0; /* reset cap seq */
2456 cap->issue_seq = 0; /* and issue_seq */ 2457 cap->issue_seq = 0; /* and issue_seq */
2458 cap->mseq = 0; /* and migrate_seq */
2457 2459
2458 if (recon_state->flock) { 2460 if (recon_state->flock) {
2459 rec.v2.cap_id = cpu_to_le64(cap->cap_id); 2461 rec.v2.cap_id = cpu_to_le64(cap->cap_id);
@@ -2481,20 +2483,20 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
2481 struct ceph_filelock *flocks; 2483 struct ceph_filelock *flocks;
2482 2484
2483encode_again: 2485encode_again:
2484 lock_flocks(); 2486 spin_lock(&inode->i_lock);
2485 ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks); 2487 ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
2486 unlock_flocks(); 2488 spin_unlock(&inode->i_lock);
2487 flocks = kmalloc((num_fcntl_locks+num_flock_locks) * 2489 flocks = kmalloc((num_fcntl_locks+num_flock_locks) *
2488 sizeof(struct ceph_filelock), GFP_NOFS); 2490 sizeof(struct ceph_filelock), GFP_NOFS);
2489 if (!flocks) { 2491 if (!flocks) {
2490 err = -ENOMEM; 2492 err = -ENOMEM;
2491 goto out_free; 2493 goto out_free;
2492 } 2494 }
2493 lock_flocks(); 2495 spin_lock(&inode->i_lock);
2494 err = ceph_encode_locks_to_buffer(inode, flocks, 2496 err = ceph_encode_locks_to_buffer(inode, flocks,
2495 num_fcntl_locks, 2497 num_fcntl_locks,
2496 num_flock_locks); 2498 num_flock_locks);
2497 unlock_flocks(); 2499 spin_unlock(&inode->i_lock);
2498 if (err) { 2500 if (err) {
2499 kfree(flocks); 2501 kfree(flocks);
2500 if (err == -ENOSPC) 2502 if (err == -ENOSPC)
@@ -3040,8 +3042,10 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
3040 fsc->mdsc = mdsc; 3042 fsc->mdsc = mdsc;
3041 mutex_init(&mdsc->mutex); 3043 mutex_init(&mdsc->mutex);
3042 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); 3044 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
3043 if (mdsc->mdsmap == NULL) 3045 if (mdsc->mdsmap == NULL) {
3046 kfree(mdsc);
3044 return -ENOMEM; 3047 return -ENOMEM;
3048 }
3045 3049
3046 init_completion(&mdsc->safe_umount_waiters); 3050 init_completion(&mdsc->safe_umount_waiters);
3047 init_waitqueue_head(&mdsc->session_close_wq); 3051 init_waitqueue_head(&mdsc->session_close_wq);
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 9278dec9e940..132b64eeecd4 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -92,6 +92,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
92 u32 num_export_targets; 92 u32 num_export_targets;
93 void *pexport_targets = NULL; 93 void *pexport_targets = NULL;
94 struct ceph_timespec laggy_since; 94 struct ceph_timespec laggy_since;
95 struct ceph_mds_info *info;
95 96
96 ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad); 97 ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad);
97 global_id = ceph_decode_64(p); 98 global_id = ceph_decode_64(p);
@@ -126,24 +127,27 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
126 i+1, n, global_id, mds, inc, 127 i+1, n, global_id, mds, inc,
127 ceph_pr_addr(&addr.in_addr), 128 ceph_pr_addr(&addr.in_addr),
128 ceph_mds_state_name(state)); 129 ceph_mds_state_name(state));
129 if (mds >= 0 && mds < m->m_max_mds && state > 0) { 130
130 m->m_info[mds].global_id = global_id; 131 if (mds < 0 || mds >= m->m_max_mds || state <= 0)
131 m->m_info[mds].state = state; 132 continue;
132 m->m_info[mds].addr = addr; 133
133 m->m_info[mds].laggy = 134 info = &m->m_info[mds];
134 (laggy_since.tv_sec != 0 || 135 info->global_id = global_id;
135 laggy_since.tv_nsec != 0); 136 info->state = state;
136 m->m_info[mds].num_export_targets = num_export_targets; 137 info->addr = addr;
137 if (num_export_targets) { 138 info->laggy = (laggy_since.tv_sec != 0 ||
138 m->m_info[mds].export_targets = 139 laggy_since.tv_nsec != 0);
139 kcalloc(num_export_targets, sizeof(u32), 140 info->num_export_targets = num_export_targets;
140 GFP_NOFS); 141 if (num_export_targets) {
141 for (j = 0; j < num_export_targets; j++) 142 info->export_targets = kcalloc(num_export_targets,
142 m->m_info[mds].export_targets[j] = 143 sizeof(u32), GFP_NOFS);
143 ceph_decode_32(&pexport_targets); 144 if (info->export_targets == NULL)
144 } else { 145 goto badmem;
145 m->m_info[mds].export_targets = NULL; 146 for (j = 0; j < num_export_targets; j++)
146 } 147 info->export_targets[j] =
148 ceph_decode_32(&pexport_targets);
149 } else {
150 info->export_targets = NULL;
147 } 151 }
148 } 152 }
149 153
@@ -170,7 +174,7 @@ bad:
170 DUMP_PREFIX_OFFSET, 16, 1, 174 DUMP_PREFIX_OFFSET, 16, 1,
171 start, end - start, true); 175 start, end - start, true);
172 ceph_mdsmap_destroy(m); 176 ceph_mdsmap_destroy(m);
173 return ERR_PTR(-EINVAL); 177 return ERR_PTR(err);
174} 178}
175 179
176void ceph_mdsmap_destroy(struct ceph_mdsmap *m) 180void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 7d377c9a5e35..6627b26a800c 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -357,7 +357,7 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
357 } 357 }
358 err = -EINVAL; 358 err = -EINVAL;
359 dev_name_end--; /* back up to ':' separator */ 359 dev_name_end--; /* back up to ':' separator */
360 if (*dev_name_end != ':') { 360 if (dev_name_end < dev_name || *dev_name_end != ':') {
361 pr_err("device name is missing path (no : separator in %s)\n", 361 pr_err("device name is missing path (no : separator in %s)\n",
362 dev_name); 362 dev_name);
363 goto out; 363 goto out;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 7ccfdb4aea2e..cbded572345e 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -534,7 +534,7 @@ extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci);
534extern void ceph_caps_init(struct ceph_mds_client *mdsc); 534extern void ceph_caps_init(struct ceph_mds_client *mdsc);
535extern void ceph_caps_finalize(struct ceph_mds_client *mdsc); 535extern void ceph_caps_finalize(struct ceph_mds_client *mdsc);
536extern void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta); 536extern void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta);
537extern int ceph_reserve_caps(struct ceph_mds_client *mdsc, 537extern void ceph_reserve_caps(struct ceph_mds_client *mdsc,
538 struct ceph_cap_reservation *ctx, int need); 538 struct ceph_cap_reservation *ctx, int need);
539extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc, 539extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
540 struct ceph_cap_reservation *ctx); 540 struct ceph_cap_reservation *ctx);
@@ -692,7 +692,7 @@ extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
692extern int ceph_inode_holds_cap(struct inode *inode, int mask); 692extern int ceph_inode_holds_cap(struct inode *inode, int mask);
693 693
694extern int ceph_inode_set_size(struct inode *inode, loff_t size); 694extern int ceph_inode_set_size(struct inode *inode, loff_t size);
695extern void __ceph_do_pending_vmtruncate(struct inode *inode, bool needlock); 695extern void __ceph_do_pending_vmtruncate(struct inode *inode);
696extern void ceph_queue_vmtruncate(struct inode *inode); 696extern void ceph_queue_vmtruncate(struct inode *inode);
697 697
698extern void ceph_queue_invalidate(struct inode *inode); 698extern void ceph_queue_invalidate(struct inode *inode);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 9b6b2b6dd164..be661d8f532a 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -675,17 +675,18 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
675 if (!ceph_is_valid_xattr(name)) 675 if (!ceph_is_valid_xattr(name))
676 return -ENODATA; 676 return -ENODATA;
677 677
678 spin_lock(&ci->i_ceph_lock);
679 dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
680 ci->i_xattrs.version, ci->i_xattrs.index_version);
681 678
682 /* let's see if a virtual xattr was requested */ 679 /* let's see if a virtual xattr was requested */
683 vxattr = ceph_match_vxattr(inode, name); 680 vxattr = ceph_match_vxattr(inode, name);
684 if (vxattr && !(vxattr->exists_cb && !vxattr->exists_cb(ci))) { 681 if (vxattr && !(vxattr->exists_cb && !vxattr->exists_cb(ci))) {
685 err = vxattr->getxattr_cb(ci, value, size); 682 err = vxattr->getxattr_cb(ci, value, size);
686 goto out; 683 return err;
687 } 684 }
688 685
686 spin_lock(&ci->i_ceph_lock);
687 dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
688 ci->i_xattrs.version, ci->i_xattrs.index_version);
689
689 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) && 690 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
690 (ci->i_xattrs.index_version >= ci->i_xattrs.version)) { 691 (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
691 goto get_xattr; 692 goto get_xattr;
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 2906ee276408..603f18a65c12 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -10,6 +10,7 @@ config CIFS
10 select CRYPTO_ECB 10 select CRYPTO_ECB
11 select CRYPTO_DES 11 select CRYPTO_DES
12 select CRYPTO_SHA256 12 select CRYPTO_SHA256
13 select CRYPTO_CMAC
13 help 14 help
14 This is the client VFS module for the Common Internet File System 15 This is the client VFS module for the Common Internet File System
15 (CIFS) protocol which is the successor to the Server Message Block 16 (CIFS) protocol which is the successor to the Server Message Block
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index d59748346020..f3ac4154cbb6 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -213,7 +213,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
213 tcon->nativeFileSystem); 213 tcon->nativeFileSystem);
214 } 214 }
215 seq_printf(m, "DevInfo: 0x%x Attributes: 0x%x" 215 seq_printf(m, "DevInfo: 0x%x Attributes: 0x%x"
216 "\nPathComponentMax: %d Status: 0x%d", 216 "\n\tPathComponentMax: %d Status: 0x%d",
217 le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics), 217 le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics),
218 le32_to_cpu(tcon->fsAttrInfo.Attributes), 218 le32_to_cpu(tcon->fsAttrInfo.Attributes),
219 le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength), 219 le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength),
@@ -224,6 +224,8 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
224 seq_puts(m, " type: CDROM "); 224 seq_puts(m, " type: CDROM ");
225 else 225 else
226 seq_printf(m, " type: %d ", dev_type); 226 seq_printf(m, " type: %d ", dev_type);
227 if (server->ops->dump_share_caps)
228 server->ops->dump_share_caps(m, tcon);
227 229
228 if (tcon->need_reconnect) 230 if (tcon->need_reconnect)
229 seq_puts(m, "\tDISCONNECTED "); 231 seq_puts(m, "\tDISCONNECTED ");
@@ -595,9 +597,36 @@ static int cifs_security_flags_proc_open(struct inode *inode, struct file *file)
595 return single_open(file, cifs_security_flags_proc_show, NULL); 597 return single_open(file, cifs_security_flags_proc_show, NULL);
596} 598}
597 599
600/*
601 * Ensure that if someone sets a MUST flag, that we disable all other MAY
602 * flags except for the ones corresponding to the given MUST flag. If there are
603 * multiple MUST flags, then try to prefer more secure ones.
604 */
605static void
606cifs_security_flags_handle_must_flags(unsigned int *flags)
607{
608 unsigned int signflags = *flags & CIFSSEC_MUST_SIGN;
609
610 if ((*flags & CIFSSEC_MUST_KRB5) == CIFSSEC_MUST_KRB5)
611 *flags = CIFSSEC_MUST_KRB5;
612 else if ((*flags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP)
613 *flags = CIFSSEC_MUST_NTLMSSP;
614 else if ((*flags & CIFSSEC_MUST_NTLMV2) == CIFSSEC_MUST_NTLMV2)
615 *flags = CIFSSEC_MUST_NTLMV2;
616 else if ((*flags & CIFSSEC_MUST_NTLM) == CIFSSEC_MUST_NTLM)
617 *flags = CIFSSEC_MUST_NTLM;
618 else if ((*flags & CIFSSEC_MUST_LANMAN) == CIFSSEC_MUST_LANMAN)
619 *flags = CIFSSEC_MUST_LANMAN;
620 else if ((*flags & CIFSSEC_MUST_PLNTXT) == CIFSSEC_MUST_PLNTXT)
621 *flags = CIFSSEC_MUST_PLNTXT;
622
623 *flags |= signflags;
624}
625
598static ssize_t cifs_security_flags_proc_write(struct file *file, 626static ssize_t cifs_security_flags_proc_write(struct file *file,
599 const char __user *buffer, size_t count, loff_t *ppos) 627 const char __user *buffer, size_t count, loff_t *ppos)
600{ 628{
629 int rc;
601 unsigned int flags; 630 unsigned int flags;
602 char flags_string[12]; 631 char flags_string[12];
603 char c; 632 char c;
@@ -620,26 +649,35 @@ static ssize_t cifs_security_flags_proc_write(struct file *file,
620 global_secflags = CIFSSEC_MAX; 649 global_secflags = CIFSSEC_MAX;
621 return count; 650 return count;
622 } else if (!isdigit(c)) { 651 } else if (!isdigit(c)) {
623 cifs_dbg(VFS, "invalid flag %c\n", c); 652 cifs_dbg(VFS, "Invalid SecurityFlags: %s\n",
653 flags_string);
624 return -EINVAL; 654 return -EINVAL;
625 } 655 }
626 } 656 }
627 /* else we have a number */
628 657
629 flags = simple_strtoul(flags_string, NULL, 0); 658 /* else we have a number */
659 rc = kstrtouint(flags_string, 0, &flags);
660 if (rc) {
661 cifs_dbg(VFS, "Invalid SecurityFlags: %s\n",
662 flags_string);
663 return rc;
664 }
630 665
631 cifs_dbg(FYI, "sec flags 0x%x\n", flags); 666 cifs_dbg(FYI, "sec flags 0x%x\n", flags);
632 667
633 if (flags <= 0) { 668 if (flags == 0) {
634 cifs_dbg(VFS, "invalid security flags %s\n", flags_string); 669 cifs_dbg(VFS, "Invalid SecurityFlags: %s\n", flags_string);
635 return -EINVAL; 670 return -EINVAL;
636 } 671 }
637 672
638 if (flags & ~CIFSSEC_MASK) { 673 if (flags & ~CIFSSEC_MASK) {
639 cifs_dbg(VFS, "attempt to set unsupported security flags 0x%x\n", 674 cifs_dbg(VFS, "Unsupported security flags: 0x%x\n",
640 flags & ~CIFSSEC_MASK); 675 flags & ~CIFSSEC_MASK);
641 return -EINVAL; 676 return -EINVAL;
642 } 677 }
678
679 cifs_security_flags_handle_must_flags(&flags);
680
643 /* flags look ok - update the global security flags for cifs module */ 681 /* flags look ok - update the global security flags for cifs module */
644 global_secflags = flags; 682 global_secflags = flags;
645 if (global_secflags & CIFSSEC_MUST_SIGN) { 683 if (global_secflags & CIFSSEC_MUST_SIGN) {
diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h
index 4fb097468e21..fe8d6276410a 100644
--- a/fs/cifs/cifs_unicode.h
+++ b/fs/cifs/cifs_unicode.h
@@ -327,14 +327,14 @@ UniToupper(register wchar_t uc)
327/* 327/*
328 * UniStrupr: Upper case a unicode string 328 * UniStrupr: Upper case a unicode string
329 */ 329 */
330static inline wchar_t * 330static inline __le16 *
331UniStrupr(register wchar_t *upin) 331UniStrupr(register __le16 *upin)
332{ 332{
333 register wchar_t *up; 333 register __le16 *up;
334 334
335 up = upin; 335 up = upin;
336 while (*up) { /* For all characters */ 336 while (*up) { /* For all characters */
337 *up = UniToupper(*up); 337 *up = cpu_to_le16(UniToupper(le16_to_cpu(*up)));
338 up++; 338 up++;
339 } 339 }
340 return upin; /* Return input pointer */ 340 return upin; /* Return input pointer */
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 71436d1fca13..3d8bf941d126 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -276,7 +276,6 @@ int calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
276 strncpy(password_with_pad, password, CIFS_ENCPWD_SIZE); 276 strncpy(password_with_pad, password, CIFS_ENCPWD_SIZE);
277 277
278 if (!encrypt && global_secflags & CIFSSEC_MAY_PLNTXT) { 278 if (!encrypt && global_secflags & CIFSSEC_MAY_PLNTXT) {
279 memset(lnm_session_key, 0, CIFS_SESS_KEY_SIZE);
280 memcpy(lnm_session_key, password_with_pad, 279 memcpy(lnm_session_key, password_with_pad,
281 CIFS_ENCPWD_SIZE); 280 CIFS_ENCPWD_SIZE);
282 return 0; 281 return 0;
@@ -414,7 +413,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
414 int rc = 0; 413 int rc = 0;
415 int len; 414 int len;
416 char nt_hash[CIFS_NTHASH_SIZE]; 415 char nt_hash[CIFS_NTHASH_SIZE];
417 wchar_t *user; 416 __le16 *user;
418 wchar_t *domain; 417 wchar_t *domain;
419 wchar_t *server; 418 wchar_t *server;
420 419
@@ -439,7 +438,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
439 return rc; 438 return rc;
440 } 439 }
441 440
442 /* convert ses->user_name to unicode and uppercase */ 441 /* convert ses->user_name to unicode */
443 len = ses->user_name ? strlen(ses->user_name) : 0; 442 len = ses->user_name ? strlen(ses->user_name) : 0;
444 user = kmalloc(2 + (len * 2), GFP_KERNEL); 443 user = kmalloc(2 + (len * 2), GFP_KERNEL);
445 if (user == NULL) { 444 if (user == NULL) {
@@ -448,7 +447,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
448 } 447 }
449 448
450 if (len) { 449 if (len) {
451 len = cifs_strtoUTF16((__le16 *)user, ses->user_name, len, nls_cp); 450 len = cifs_strtoUTF16(user, ses->user_name, len, nls_cp);
452 UniStrupr(user); 451 UniStrupr(user);
453 } else { 452 } else {
454 memset(user, '\0', 2); 453 memset(user, '\0', 2);
@@ -536,7 +535,7 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash)
536 return rc; 535 return rc;
537 } 536 }
538 537
539 if (ses->server->secType == RawNTLMSSP) 538 if (ses->server->negflavor == CIFS_NEGFLAVOR_EXTENDED)
540 memcpy(ses->auth_key.response + offset, 539 memcpy(ses->auth_key.response + offset,
541 ses->ntlmssp->cryptkey, CIFS_SERVER_CHALLENGE_SIZE); 540 ses->ntlmssp->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
542 else 541 else
@@ -568,7 +567,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
568 char ntlmv2_hash[16]; 567 char ntlmv2_hash[16];
569 unsigned char *tiblob = NULL; /* target info blob */ 568 unsigned char *tiblob = NULL; /* target info blob */
570 569
571 if (ses->server->secType == RawNTLMSSP) { 570 if (ses->server->negflavor == CIFS_NEGFLAVOR_EXTENDED) {
572 if (!ses->domainName) { 571 if (!ses->domainName) {
573 rc = find_domain_name(ses, nls_cp); 572 rc = find_domain_name(ses, nls_cp);
574 if (rc) { 573 if (rc) {
@@ -706,6 +705,9 @@ calc_seckey(struct cifs_ses *ses)
706void 705void
707cifs_crypto_shash_release(struct TCP_Server_Info *server) 706cifs_crypto_shash_release(struct TCP_Server_Info *server)
708{ 707{
708 if (server->secmech.cmacaes)
709 crypto_free_shash(server->secmech.cmacaes);
710
709 if (server->secmech.hmacsha256) 711 if (server->secmech.hmacsha256)
710 crypto_free_shash(server->secmech.hmacsha256); 712 crypto_free_shash(server->secmech.hmacsha256);
711 713
@@ -715,6 +717,8 @@ cifs_crypto_shash_release(struct TCP_Server_Info *server)
715 if (server->secmech.hmacmd5) 717 if (server->secmech.hmacmd5)
716 crypto_free_shash(server->secmech.hmacmd5); 718 crypto_free_shash(server->secmech.hmacmd5);
717 719
720 kfree(server->secmech.sdesccmacaes);
721
718 kfree(server->secmech.sdeschmacsha256); 722 kfree(server->secmech.sdeschmacsha256);
719 723
720 kfree(server->secmech.sdeschmacmd5); 724 kfree(server->secmech.sdeschmacmd5);
@@ -748,6 +752,13 @@ cifs_crypto_shash_allocate(struct TCP_Server_Info *server)
748 goto crypto_allocate_hmacsha256_fail; 752 goto crypto_allocate_hmacsha256_fail;
749 } 753 }
750 754
755 server->secmech.cmacaes = crypto_alloc_shash("cmac(aes)", 0, 0);
756 if (IS_ERR(server->secmech.cmacaes)) {
757 cifs_dbg(VFS, "could not allocate crypto cmac-aes");
758 rc = PTR_ERR(server->secmech.cmacaes);
759 goto crypto_allocate_cmacaes_fail;
760 }
761
751 size = sizeof(struct shash_desc) + 762 size = sizeof(struct shash_desc) +
752 crypto_shash_descsize(server->secmech.hmacmd5); 763 crypto_shash_descsize(server->secmech.hmacmd5);
753 server->secmech.sdeschmacmd5 = kmalloc(size, GFP_KERNEL); 764 server->secmech.sdeschmacmd5 = kmalloc(size, GFP_KERNEL);
@@ -778,8 +789,22 @@ cifs_crypto_shash_allocate(struct TCP_Server_Info *server)
778 server->secmech.sdeschmacsha256->shash.tfm = server->secmech.hmacsha256; 789 server->secmech.sdeschmacsha256->shash.tfm = server->secmech.hmacsha256;
779 server->secmech.sdeschmacsha256->shash.flags = 0x0; 790 server->secmech.sdeschmacsha256->shash.flags = 0x0;
780 791
792 size = sizeof(struct shash_desc) +
793 crypto_shash_descsize(server->secmech.cmacaes);
794 server->secmech.sdesccmacaes = kmalloc(size, GFP_KERNEL);
795 if (!server->secmech.sdesccmacaes) {
796 cifs_dbg(VFS, "%s: Can't alloc cmacaes\n", __func__);
797 rc = -ENOMEM;
798 goto crypto_allocate_cmacaes_sdesc_fail;
799 }
800 server->secmech.sdesccmacaes->shash.tfm = server->secmech.cmacaes;
801 server->secmech.sdesccmacaes->shash.flags = 0x0;
802
781 return 0; 803 return 0;
782 804
805crypto_allocate_cmacaes_sdesc_fail:
806 kfree(server->secmech.sdeschmacsha256);
807
783crypto_allocate_hmacsha256_sdesc_fail: 808crypto_allocate_hmacsha256_sdesc_fail:
784 kfree(server->secmech.sdescmd5); 809 kfree(server->secmech.sdescmd5);
785 810
@@ -787,6 +812,9 @@ crypto_allocate_md5_sdesc_fail:
787 kfree(server->secmech.sdeschmacmd5); 812 kfree(server->secmech.sdeschmacmd5);
788 813
789crypto_allocate_hmacmd5_sdesc_fail: 814crypto_allocate_hmacmd5_sdesc_fail:
815 crypto_free_shash(server->secmech.cmacaes);
816
817crypto_allocate_cmacaes_fail:
790 crypto_free_shash(server->secmech.hmacsha256); 818 crypto_free_shash(server->secmech.hmacsha256);
791 819
792crypto_allocate_hmacsha256_fail: 820crypto_allocate_hmacsha256_fail:
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 540c1ccfcdb2..4bdd547dbf6f 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -312,11 +312,14 @@ cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server)
312} 312}
313 313
314static void 314static void
315cifs_show_security(struct seq_file *s, struct TCP_Server_Info *server) 315cifs_show_security(struct seq_file *s, struct cifs_ses *ses)
316{ 316{
317 if (ses->sectype == Unspecified)
318 return;
319
317 seq_printf(s, ",sec="); 320 seq_printf(s, ",sec=");
318 321
319 switch (server->secType) { 322 switch (ses->sectype) {
320 case LANMAN: 323 case LANMAN:
321 seq_printf(s, "lanman"); 324 seq_printf(s, "lanman");
322 break; 325 break;
@@ -338,7 +341,7 @@ cifs_show_security(struct seq_file *s, struct TCP_Server_Info *server)
338 break; 341 break;
339 } 342 }
340 343
341 if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) 344 if (ses->sign)
342 seq_printf(s, "i"); 345 seq_printf(s, "i");
343} 346}
344 347
@@ -369,7 +372,7 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
369 srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr; 372 srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr;
370 373
371 seq_printf(s, ",vers=%s", tcon->ses->server->vals->version_string); 374 seq_printf(s, ",vers=%s", tcon->ses->server->vals->version_string);
372 cifs_show_security(s, tcon->ses->server); 375 cifs_show_security(s, tcon->ses);
373 cifs_show_cache_flavor(s, cifs_sb); 376 cifs_show_cache_flavor(s, cifs_sb);
374 377
375 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) 378 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)
@@ -765,7 +768,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int whence)
765 768
766static int cifs_setlease(struct file *file, long arg, struct file_lock **lease) 769static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
767{ 770{
768 /* note that this is called by vfs setlease with lock_flocks held 771 /* note that this is called by vfs setlease with i_lock held
769 to protect *lease from going away */ 772 to protect *lease from going away */
770 struct inode *inode = file_inode(file); 773 struct inode *inode = file_inode(file);
771 struct cifsFileInfo *cfile = file->private_data; 774 struct cifsFileInfo *cfile = file->private_data;
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index d05b3028e3b9..ea723a5e8226 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -132,5 +132,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
132extern const struct export_operations cifs_export_ops; 132extern const struct export_operations cifs_export_ops;
133#endif /* CONFIG_CIFS_NFSD_EXPORT */ 133#endif /* CONFIG_CIFS_NFSD_EXPORT */
134 134
135#define CIFS_VERSION "2.0" 135#define CIFS_VERSION "2.01"
136#endif /* _CIFSFS_H */ 136#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 4f07f6fbe494..e66b08882548 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -101,20 +101,14 @@ enum statusEnum {
101}; 101};
102 102
103enum securityEnum { 103enum securityEnum {
104 LANMAN = 0, /* Legacy LANMAN auth */ 104 Unspecified = 0, /* not specified */
105 LANMAN, /* Legacy LANMAN auth */
105 NTLM, /* Legacy NTLM012 auth with NTLM hash */ 106 NTLM, /* Legacy NTLM012 auth with NTLM hash */
106 NTLMv2, /* Legacy NTLM auth with NTLMv2 hash */ 107 NTLMv2, /* Legacy NTLM auth with NTLMv2 hash */
107 RawNTLMSSP, /* NTLMSSP without SPNEGO, NTLMv2 hash */ 108 RawNTLMSSP, /* NTLMSSP without SPNEGO, NTLMv2 hash */
108/* NTLMSSP, */ /* can use rawNTLMSSP instead of NTLMSSP via SPNEGO */
109 Kerberos, /* Kerberos via SPNEGO */ 109 Kerberos, /* Kerberos via SPNEGO */
110}; 110};
111 111
112enum protocolEnum {
113 TCP = 0,
114 SCTP
115 /* Netbios frames protocol not supported at this time */
116};
117
118struct session_key { 112struct session_key {
119 unsigned int len; 113 unsigned int len;
120 char *response; 114 char *response;
@@ -131,9 +125,11 @@ struct cifs_secmech {
131 struct crypto_shash *hmacmd5; /* hmac-md5 hash function */ 125 struct crypto_shash *hmacmd5; /* hmac-md5 hash function */
132 struct crypto_shash *md5; /* md5 hash function */ 126 struct crypto_shash *md5; /* md5 hash function */
133 struct crypto_shash *hmacsha256; /* hmac-sha256 hash function */ 127 struct crypto_shash *hmacsha256; /* hmac-sha256 hash function */
128 struct crypto_shash *cmacaes; /* block-cipher based MAC function */
134 struct sdesc *sdeschmacmd5; /* ctxt to generate ntlmv2 hash, CR1 */ 129 struct sdesc *sdeschmacmd5; /* ctxt to generate ntlmv2 hash, CR1 */
135 struct sdesc *sdescmd5; /* ctxt to generate cifs/smb signature */ 130 struct sdesc *sdescmd5; /* ctxt to generate cifs/smb signature */
136 struct sdesc *sdeschmacsha256; /* ctxt to generate smb2 signature */ 131 struct sdesc *sdeschmacsha256; /* ctxt to generate smb2 signature */
132 struct sdesc *sdesccmacaes; /* ctxt to generate smb3 signature */
137}; 133};
138 134
139/* per smb session structure/fields */ 135/* per smb session structure/fields */
@@ -181,6 +177,7 @@ enum smb_version {
181 Smb_20, 177 Smb_20,
182 Smb_21, 178 Smb_21,
183 Smb_30, 179 Smb_30,
180 Smb_302,
184}; 181};
185 182
186struct mid_q_entry; 183struct mid_q_entry;
@@ -228,6 +225,7 @@ struct smb_version_operations {
228 void (*dump_detail)(void *); 225 void (*dump_detail)(void *);
229 void (*clear_stats)(struct cifs_tcon *); 226 void (*clear_stats)(struct cifs_tcon *);
230 void (*print_stats)(struct seq_file *m, struct cifs_tcon *); 227 void (*print_stats)(struct seq_file *m, struct cifs_tcon *);
228 void (*dump_share_caps)(struct seq_file *, struct cifs_tcon *);
231 /* verify the message */ 229 /* verify the message */
232 int (*check_message)(char *, unsigned int); 230 int (*check_message)(char *, unsigned int);
233 bool (*is_oplock_break)(char *, struct TCP_Server_Info *); 231 bool (*is_oplock_break)(char *, struct TCP_Server_Info *);
@@ -367,6 +365,8 @@ struct smb_version_operations {
367 void (*set_lease_key)(struct inode *, struct cifs_fid *fid); 365 void (*set_lease_key)(struct inode *, struct cifs_fid *fid);
368 /* generate new lease key */ 366 /* generate new lease key */
369 void (*new_lease_key)(struct cifs_fid *fid); 367 void (*new_lease_key)(struct cifs_fid *fid);
368 /* The next two functions will need to be changed to per smb session */
369 void (*generate_signingkey)(struct TCP_Server_Info *server);
370 int (*calc_signature)(struct smb_rqst *rqst, 370 int (*calc_signature)(struct smb_rqst *rqst,
371 struct TCP_Server_Info *server); 371 struct TCP_Server_Info *server);
372}; 372};
@@ -387,6 +387,8 @@ struct smb_version_values {
387 unsigned int cap_nt_find; 387 unsigned int cap_nt_find;
388 unsigned int cap_large_files; 388 unsigned int cap_large_files;
389 unsigned int oplock_read; 389 unsigned int oplock_read;
390 __u16 signing_enabled;
391 __u16 signing_required;
390}; 392};
391 393
392#define HEADER_SIZE(server) (server->vals->header_size) 394#define HEADER_SIZE(server) (server->vals->header_size)
@@ -407,7 +409,8 @@ struct smb_vol {
407 kgid_t backupgid; 409 kgid_t backupgid;
408 umode_t file_mode; 410 umode_t file_mode;
409 umode_t dir_mode; 411 umode_t dir_mode;
410 unsigned secFlg; 412 enum securityEnum sectype; /* sectype requested via mnt opts */
413 bool sign; /* was signing requested via mnt opts? */
411 bool retry:1; 414 bool retry:1;
412 bool intr:1; 415 bool intr:1;
413 bool setuids:1; 416 bool setuids:1;
@@ -441,6 +444,7 @@ struct smb_vol {
441 bool mfsymlinks:1; /* use Minshall+French Symlinks */ 444 bool mfsymlinks:1; /* use Minshall+French Symlinks */
442 bool multiuser:1; 445 bool multiuser:1;
443 bool rwpidforward:1; /* pid forward for read/write operations */ 446 bool rwpidforward:1; /* pid forward for read/write operations */
447 bool nosharesock;
444 unsigned int rsize; 448 unsigned int rsize;
445 unsigned int wsize; 449 unsigned int wsize;
446 bool sockopt_tcp_nodelay:1; 450 bool sockopt_tcp_nodelay:1;
@@ -514,6 +518,7 @@ struct TCP_Server_Info {
514 struct task_struct *tsk; 518 struct task_struct *tsk;
515 char server_GUID[16]; 519 char server_GUID[16];
516 __u16 sec_mode; 520 __u16 sec_mode;
521 bool sign; /* is signing enabled on this connection? */
517 bool session_estab; /* mark when very first sess is established */ 522 bool session_estab; /* mark when very first sess is established */
518#ifdef CONFIG_CIFS_SMB2 523#ifdef CONFIG_CIFS_SMB2
519 int echo_credits; /* echo reserved slots */ 524 int echo_credits; /* echo reserved slots */
@@ -521,7 +526,6 @@ struct TCP_Server_Info {
521 bool echoes:1; /* enable echoes */ 526 bool echoes:1; /* enable echoes */
522#endif 527#endif
523 u16 dialect; /* dialect index that server chose */ 528 u16 dialect; /* dialect index that server chose */
524 enum securityEnum secType;
525 bool oplocks:1; /* enable oplocks */ 529 bool oplocks:1; /* enable oplocks */
526 unsigned int maxReq; /* Clients should submit no more */ 530 unsigned int maxReq; /* Clients should submit no more */
527 /* than maxReq distinct unanswered SMBs to the server when using */ 531 /* than maxReq distinct unanswered SMBs to the server when using */
@@ -540,12 +544,17 @@ struct TCP_Server_Info {
540 int timeAdj; /* Adjust for difference in server time zone in sec */ 544 int timeAdj; /* Adjust for difference in server time zone in sec */
541 __u64 CurrentMid; /* multiplex id - rotating counter */ 545 __u64 CurrentMid; /* multiplex id - rotating counter */
542 char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlm, ntlmv2 etc */ 546 char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlm, ntlmv2 etc */
547 char smb3signingkey[SMB3_SIGN_KEY_SIZE]; /* for signing smb3 packets */
543 /* 16th byte of RFC1001 workstation name is always null */ 548 /* 16th byte of RFC1001 workstation name is always null */
544 char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL]; 549 char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
545 __u32 sequence_number; /* for signing, protected by srv_mutex */ 550 __u32 sequence_number; /* for signing, protected by srv_mutex */
546 struct session_key session_key; 551 struct session_key session_key;
547 unsigned long lstrp; /* when we got last response from this server */ 552 unsigned long lstrp; /* when we got last response from this server */
548 struct cifs_secmech secmech; /* crypto sec mech functs, descriptors */ 553 struct cifs_secmech secmech; /* crypto sec mech functs, descriptors */
554#define CIFS_NEGFLAVOR_LANMAN 0 /* wct == 13, LANMAN */
555#define CIFS_NEGFLAVOR_UNENCAP 1 /* wct == 17, but no ext_sec */
556#define CIFS_NEGFLAVOR_EXTENDED 2 /* wct == 17, ext_sec bit set */
557 char negflavor; /* NEGOTIATE response flavor */
549 /* extended security flavors that server supports */ 558 /* extended security flavors that server supports */
550 bool sec_ntlmssp; /* supports NTLMSSP */ 559 bool sec_ntlmssp; /* supports NTLMSSP */
551 bool sec_kerberosu2u; /* supports U2U Kerberos */ 560 bool sec_kerberosu2u; /* supports U2U Kerberos */
@@ -697,7 +706,6 @@ struct cifs_ses {
697 enum statusEnum status; 706 enum statusEnum status;
698 unsigned overrideSecFlg; /* if non-zero override global sec flags */ 707 unsigned overrideSecFlg; /* if non-zero override global sec flags */
699 __u16 ipc_tid; /* special tid for connection to IPC share */ 708 __u16 ipc_tid; /* special tid for connection to IPC share */
700 __u16 flags;
701 __u16 vcnum; 709 __u16 vcnum;
702 char *serverOS; /* name of operating system underlying server */ 710 char *serverOS; /* name of operating system underlying server */
703 char *serverNOS; /* name of network operating system of server */ 711 char *serverNOS; /* name of network operating system of server */
@@ -714,21 +722,14 @@ struct cifs_ses {
714 char *password; 722 char *password;
715 struct session_key auth_key; 723 struct session_key auth_key;
716 struct ntlmssp_auth *ntlmssp; /* ciphertext, flags, server challenge */ 724 struct ntlmssp_auth *ntlmssp; /* ciphertext, flags, server challenge */
725 enum securityEnum sectype; /* what security flavor was specified? */
726 bool sign; /* is signing required? */
717 bool need_reconnect:1; /* connection reset, uid now invalid */ 727 bool need_reconnect:1; /* connection reset, uid now invalid */
718#ifdef CONFIG_CIFS_SMB2 728#ifdef CONFIG_CIFS_SMB2
719 __u16 session_flags; 729 __u16 session_flags;
720#endif /* CONFIG_CIFS_SMB2 */ 730#endif /* CONFIG_CIFS_SMB2 */
721}; 731};
722 732
723/* no more than one of the following three session flags may be set */
724#define CIFS_SES_NT4 1
725#define CIFS_SES_OS2 2
726#define CIFS_SES_W9X 4
727/* following flag is set for old servers such as OS2 (and Win95?)
728 which do not negotiate NTLM or POSIX dialects, but instead
729 negotiate one of the older LANMAN dialects */
730#define CIFS_SES_LANMAN 8
731
732static inline bool 733static inline bool
733cap_unix(struct cifs_ses *ses) 734cap_unix(struct cifs_ses *ses)
734{ 735{
@@ -816,7 +817,7 @@ struct cifs_tcon {
816#ifdef CONFIG_CIFS_SMB2 817#ifdef CONFIG_CIFS_SMB2
817 bool print:1; /* set if connection to printer share */ 818 bool print:1; /* set if connection to printer share */
818 bool bad_network_name:1; /* set if ret status STATUS_BAD_NETWORK_NAME */ 819 bool bad_network_name:1; /* set if ret status STATUS_BAD_NETWORK_NAME */
819 __u32 capabilities; 820 __le32 capabilities;
820 __u32 share_flags; 821 __u32 share_flags;
821 __u32 maximal_access; 822 __u32 maximal_access;
822 __u32 vol_serial_number; 823 __u32 vol_serial_number;
@@ -1348,7 +1349,7 @@ require use of the stronger protocol */
1348#define CIFSSEC_MUST_SEAL 0x40040 /* not supported yet */ 1349#define CIFSSEC_MUST_SEAL 0x40040 /* not supported yet */
1349#define CIFSSEC_MUST_NTLMSSP 0x80080 /* raw ntlmssp with ntlmv2 */ 1350#define CIFSSEC_MUST_NTLMSSP 0x80080 /* raw ntlmssp with ntlmv2 */
1350 1351
1351#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLMSSP) 1352#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_NTLMSSP)
1352#define CIFSSEC_MAX (CIFSSEC_MUST_SIGN | CIFSSEC_MUST_NTLMV2) 1353#define CIFSSEC_MAX (CIFSSEC_MUST_SIGN | CIFSSEC_MUST_NTLMV2)
1353#define CIFSSEC_AUTH_MASK (CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_LANMAN | CIFSSEC_MAY_PLNTXT | CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP) 1354#define CIFSSEC_AUTH_MASK (CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_LANMAN | CIFSSEC_MAY_PLNTXT | CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP)
1354/* 1355/*
@@ -1494,4 +1495,7 @@ extern struct smb_version_values smb21_values;
1494#define SMB30_VERSION_STRING "3.0" 1495#define SMB30_VERSION_STRING "3.0"
1495extern struct smb_version_operations smb30_operations; 1496extern struct smb_version_operations smb30_operations;
1496extern struct smb_version_values smb30_values; 1497extern struct smb_version_values smb30_values;
1498#define SMB302_VERSION_STRING "3.02"
1499/*extern struct smb_version_operations smb302_operations;*/ /* not needed yet */
1500extern struct smb_version_values smb302_values;
1497#endif /* _CIFS_GLOB_H */ 1501#endif /* _CIFS_GLOB_H */
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index e996ff6b26d1..11ca24a8e054 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -142,6 +142,11 @@
142 */ 142 */
143#define CIFS_SESS_KEY_SIZE (16) 143#define CIFS_SESS_KEY_SIZE (16)
144 144
145/*
146 * Size of the smb3 signing key
147 */
148#define SMB3_SIGN_KEY_SIZE (16)
149
145#define CIFS_CLIENT_CHALLENGE_SIZE (8) 150#define CIFS_CLIENT_CHALLENGE_SIZE (8)
146#define CIFS_SERVER_CHALLENGE_SIZE (8) 151#define CIFS_SERVER_CHALLENGE_SIZE (8)
147#define CIFS_HMAC_MD5_HASH_SIZE (16) 152#define CIFS_HMAC_MD5_HASH_SIZE (16)
@@ -531,7 +536,7 @@ typedef struct lanman_neg_rsp {
531#define READ_RAW_ENABLE 1 536#define READ_RAW_ENABLE 1
532#define WRITE_RAW_ENABLE 2 537#define WRITE_RAW_ENABLE 2
533#define RAW_ENABLE (READ_RAW_ENABLE | WRITE_RAW_ENABLE) 538#define RAW_ENABLE (READ_RAW_ENABLE | WRITE_RAW_ENABLE)
534 539#define SMB1_CLIENT_GUID_SIZE (16)
535typedef struct negotiate_rsp { 540typedef struct negotiate_rsp {
536 struct smb_hdr hdr; /* wct = 17 */ 541 struct smb_hdr hdr; /* wct = 17 */
537 __le16 DialectIndex; /* 0xFFFF = no dialect acceptable */ 542 __le16 DialectIndex; /* 0xFFFF = no dialect acceptable */
@@ -553,7 +558,7 @@ typedef struct negotiate_rsp {
553 /* followed by 16 bytes of server GUID */ 558 /* followed by 16 bytes of server GUID */
554 /* then security blob if cap_extended_security negotiated */ 559 /* then security blob if cap_extended_security negotiated */
555 struct { 560 struct {
556 unsigned char GUID[16]; 561 unsigned char GUID[SMB1_CLIENT_GUID_SIZE];
557 unsigned char SecurityBlob[1]; 562 unsigned char SecurityBlob[1];
558 } __attribute__((packed)) extended_response; 563 } __attribute__((packed)) extended_response;
559 } __attribute__((packed)) u; 564 } __attribute__((packed)) u;
@@ -1315,6 +1320,14 @@ typedef struct smb_com_ntransact_rsp {
1315 /* parms and data follow */ 1320 /* parms and data follow */
1316} __attribute__((packed)) NTRANSACT_RSP; 1321} __attribute__((packed)) NTRANSACT_RSP;
1317 1322
1323/* See MS-SMB 2.2.7.2.1.1 */
1324struct srv_copychunk {
1325 __le64 SourceOffset;
1326 __le64 DestinationOffset;
1327 __le32 CopyLength;
1328 __u32 Reserved;
1329} __packed;
1330
1318typedef struct smb_com_transaction_ioctl_req { 1331typedef struct smb_com_transaction_ioctl_req {
1319 struct smb_hdr hdr; /* wct = 23 */ 1332 struct smb_hdr hdr; /* wct = 23 */
1320 __u8 MaxSetupCount; 1333 __u8 MaxSetupCount;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index dda188a94332..c8ff018fae68 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -118,6 +118,8 @@ extern void header_assemble(struct smb_hdr *, char /* command */ ,
118extern int small_smb_init_no_tc(const int smb_cmd, const int wct, 118extern int small_smb_init_no_tc(const int smb_cmd, const int wct,
119 struct cifs_ses *ses, 119 struct cifs_ses *ses,
120 void **request_buf); 120 void **request_buf);
121extern enum securityEnum select_sectype(struct TCP_Server_Info *server,
122 enum securityEnum requested);
121extern int CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses, 123extern int CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses,
122 const struct nls_table *nls_cp); 124 const struct nls_table *nls_cp);
123extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601); 125extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601);
@@ -212,6 +214,7 @@ extern int cifs_negotiate_protocol(const unsigned int xid,
212 struct cifs_ses *ses); 214 struct cifs_ses *ses);
213extern int cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, 215extern int cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
214 struct nls_table *nls_info); 216 struct nls_table *nls_info);
217extern int cifs_enable_signing(struct TCP_Server_Info *server, bool mnt_sign_required);
215extern int CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses); 218extern int CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses);
216 219
217extern int CIFSTCon(const unsigned int xid, struct cifs_ses *ses, 220extern int CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
@@ -433,6 +436,7 @@ extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *);
433extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *); 436extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *);
434extern void cifs_crypto_shash_release(struct TCP_Server_Info *); 437extern void cifs_crypto_shash_release(struct TCP_Server_Info *);
435extern int calc_seckey(struct cifs_ses *); 438extern int calc_seckey(struct cifs_ses *);
439extern void generate_smb3signingkey(struct TCP_Server_Info *);
436 440
437#ifdef CONFIG_CIFS_WEAK_PW_HASH 441#ifdef CONFIG_CIFS_WEAK_PW_HASH
438extern int calc_lanman_hash(const char *password, const char *cryptkey, 442extern int calc_lanman_hash(const char *password, const char *cryptkey,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index a58dc77cc443..a89c4cb4e6cf 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -367,6 +367,185 @@ vt2_err:
367 return -EINVAL; 367 return -EINVAL;
368} 368}
369 369
370static int
371decode_ext_sec_blob(struct cifs_ses *ses, NEGOTIATE_RSP *pSMBr)
372{
373 int rc = 0;
374 u16 count;
375 char *guid = pSMBr->u.extended_response.GUID;
376 struct TCP_Server_Info *server = ses->server;
377
378 count = get_bcc(&pSMBr->hdr);
379 if (count < SMB1_CLIENT_GUID_SIZE)
380 return -EIO;
381
382 spin_lock(&cifs_tcp_ses_lock);
383 if (server->srv_count > 1) {
384 spin_unlock(&cifs_tcp_ses_lock);
385 if (memcmp(server->server_GUID, guid, SMB1_CLIENT_GUID_SIZE) != 0) {
386 cifs_dbg(FYI, "server UID changed\n");
387 memcpy(server->server_GUID, guid, SMB1_CLIENT_GUID_SIZE);
388 }
389 } else {
390 spin_unlock(&cifs_tcp_ses_lock);
391 memcpy(server->server_GUID, guid, SMB1_CLIENT_GUID_SIZE);
392 }
393
394 if (count == SMB1_CLIENT_GUID_SIZE) {
395 server->sec_ntlmssp = true;
396 } else {
397 count -= SMB1_CLIENT_GUID_SIZE;
398 rc = decode_negTokenInit(
399 pSMBr->u.extended_response.SecurityBlob, count, server);
400 if (rc != 1)
401 return -EINVAL;
402 }
403
404 return 0;
405}
406
407int
408cifs_enable_signing(struct TCP_Server_Info *server, bool mnt_sign_required)
409{
410 bool srv_sign_required = server->sec_mode & server->vals->signing_required;
411 bool srv_sign_enabled = server->sec_mode & server->vals->signing_enabled;
412 bool mnt_sign_enabled = global_secflags & CIFSSEC_MAY_SIGN;
413
414 /*
415 * Is signing required by mnt options? If not then check
416 * global_secflags to see if it is there.
417 */
418 if (!mnt_sign_required)
419 mnt_sign_required = ((global_secflags & CIFSSEC_MUST_SIGN) ==
420 CIFSSEC_MUST_SIGN);
421
422 /*
423 * If signing is required then it's automatically enabled too,
424 * otherwise, check to see if the secflags allow it.
425 */
426 mnt_sign_enabled = mnt_sign_required ? mnt_sign_required :
427 (global_secflags & CIFSSEC_MAY_SIGN);
428
429 /* If server requires signing, does client allow it? */
430 if (srv_sign_required) {
431 if (!mnt_sign_enabled) {
432 cifs_dbg(VFS, "Server requires signing, but it's disabled in SecurityFlags!");
433 return -ENOTSUPP;
434 }
435 server->sign = true;
436 }
437
438 /* If client requires signing, does server allow it? */
439 if (mnt_sign_required) {
440 if (!srv_sign_enabled) {
441 cifs_dbg(VFS, "Server does not support signing!");
442 return -ENOTSUPP;
443 }
444 server->sign = true;
445 }
446
447 return 0;
448}
449
450#ifdef CONFIG_CIFS_WEAK_PW_HASH
451static int
452decode_lanman_negprot_rsp(struct TCP_Server_Info *server, NEGOTIATE_RSP *pSMBr)
453{
454 __s16 tmp;
455 struct lanman_neg_rsp *rsp = (struct lanman_neg_rsp *)pSMBr;
456
457 if (server->dialect != LANMAN_PROT && server->dialect != LANMAN2_PROT)
458 return -EOPNOTSUPP;
459
460 server->sec_mode = le16_to_cpu(rsp->SecurityMode);
461 server->maxReq = min_t(unsigned int,
462 le16_to_cpu(rsp->MaxMpxCount),
463 cifs_max_pending);
464 set_credits(server, server->maxReq);
465 server->maxBuf = le16_to_cpu(rsp->MaxBufSize);
466 server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs);
467 /* even though we do not use raw we might as well set this
468 accurately, in case we ever find a need for it */
469 if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) {
470 server->max_rw = 0xFF00;
471 server->capabilities = CAP_MPX_MODE | CAP_RAW_MODE;
472 } else {
473 server->max_rw = 0;/* do not need to use raw anyway */
474 server->capabilities = CAP_MPX_MODE;
475 }
476 tmp = (__s16)le16_to_cpu(rsp->ServerTimeZone);
477 if (tmp == -1) {
478 /* OS/2 often does not set timezone therefore
479 * we must use server time to calc time zone.
480 * Could deviate slightly from the right zone.
481 * Smallest defined timezone difference is 15 minutes
482 * (i.e. Nepal). Rounding up/down is done to match
483 * this requirement.
484 */
485 int val, seconds, remain, result;
486 struct timespec ts, utc;
487 utc = CURRENT_TIME;
488 ts = cnvrtDosUnixTm(rsp->SrvTime.Date,
489 rsp->SrvTime.Time, 0);
490 cifs_dbg(FYI, "SrvTime %d sec since 1970 (utc: %d) diff: %d\n",
491 (int)ts.tv_sec, (int)utc.tv_sec,
492 (int)(utc.tv_sec - ts.tv_sec));
493 val = (int)(utc.tv_sec - ts.tv_sec);
494 seconds = abs(val);
495 result = (seconds / MIN_TZ_ADJ) * MIN_TZ_ADJ;
496 remain = seconds % MIN_TZ_ADJ;
497 if (remain >= (MIN_TZ_ADJ / 2))
498 result += MIN_TZ_ADJ;
499 if (val < 0)
500 result = -result;
501 server->timeAdj = result;
502 } else {
503 server->timeAdj = (int)tmp;
504 server->timeAdj *= 60; /* also in seconds */
505 }
506 cifs_dbg(FYI, "server->timeAdj: %d seconds\n", server->timeAdj);
507
508
509 /* BB get server time for time conversions and add
510 code to use it and timezone since this is not UTC */
511
512 if (rsp->EncryptionKeyLength ==
513 cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) {
514 memcpy(server->cryptkey, rsp->EncryptionKey,
515 CIFS_CRYPTO_KEY_SIZE);
516 } else if (server->sec_mode & SECMODE_PW_ENCRYPT) {
517 return -EIO; /* need cryptkey unless plain text */
518 }
519
520 cifs_dbg(FYI, "LANMAN negotiated\n");
521 return 0;
522}
523#else
524static inline int
525decode_lanman_negprot_rsp(struct TCP_Server_Info *server, NEGOTIATE_RSP *pSMBr)
526{
527 cifs_dbg(VFS, "mount failed, cifs module not built with CIFS_WEAK_PW_HASH support\n");
528 return -EOPNOTSUPP;
529}
530#endif
531
532static bool
533should_set_ext_sec_flag(enum securityEnum sectype)
534{
535 switch (sectype) {
536 case RawNTLMSSP:
537 case Kerberos:
538 return true;
539 case Unspecified:
540 if (global_secflags &
541 (CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP))
542 return true;
543 /* Fallthrough */
544 default:
545 return false;
546 }
547}
548
370int 549int
371CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses) 550CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses)
372{ 551{
@@ -375,41 +554,24 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses)
375 int rc = 0; 554 int rc = 0;
376 int bytes_returned; 555 int bytes_returned;
377 int i; 556 int i;
378 struct TCP_Server_Info *server; 557 struct TCP_Server_Info *server = ses->server;
379 u16 count; 558 u16 count;
380 unsigned int secFlags;
381 559
382 if (ses->server) 560 if (!server) {
383 server = ses->server; 561 WARN(1, "%s: server is NULL!\n", __func__);
384 else { 562 return -EIO;
385 rc = -EIO;
386 return rc;
387 } 563 }
564
388 rc = smb_init(SMB_COM_NEGOTIATE, 0, NULL /* no tcon yet */ , 565 rc = smb_init(SMB_COM_NEGOTIATE, 0, NULL /* no tcon yet */ ,
389 (void **) &pSMB, (void **) &pSMBr); 566 (void **) &pSMB, (void **) &pSMBr);
390 if (rc) 567 if (rc)
391 return rc; 568 return rc;
392 569
393 /* if any of auth flags (ie not sign or seal) are overriden use them */
394 if (ses->overrideSecFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL)))
395 secFlags = ses->overrideSecFlg; /* BB FIXME fix sign flags? */
396 else /* if override flags set only sign/seal OR them with global auth */
397 secFlags = global_secflags | ses->overrideSecFlg;
398
399 cifs_dbg(FYI, "secFlags 0x%x\n", secFlags);
400
401 pSMB->hdr.Mid = get_next_mid(server); 570 pSMB->hdr.Mid = get_next_mid(server);
402 pSMB->hdr.Flags2 |= (SMBFLG2_UNICODE | SMBFLG2_ERR_STATUS); 571 pSMB->hdr.Flags2 |= (SMBFLG2_UNICODE | SMBFLG2_ERR_STATUS);
403 572
404 if ((secFlags & CIFSSEC_MUST_KRB5) == CIFSSEC_MUST_KRB5) 573 if (should_set_ext_sec_flag(ses->sectype)) {
405 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; 574 cifs_dbg(FYI, "Requesting extended security.");
406 else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_KRB5) {
407 cifs_dbg(FYI, "Kerberos only mechanism, enable extended security\n");
408 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
409 } else if ((secFlags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP)
410 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
411 else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_NTLMSSP) {
412 cifs_dbg(FYI, "NTLMSSP only mechanism, enable extended security\n");
413 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; 575 pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
414 } 576 }
415 577
@@ -436,127 +598,21 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses)
436 could not negotiate a common dialect */ 598 could not negotiate a common dialect */
437 rc = -EOPNOTSUPP; 599 rc = -EOPNOTSUPP;
438 goto neg_err_exit; 600 goto neg_err_exit;
439#ifdef CONFIG_CIFS_WEAK_PW_HASH
440 } else if ((pSMBr->hdr.WordCount == 13)
441 && ((server->dialect == LANMAN_PROT)
442 || (server->dialect == LANMAN2_PROT))) {
443 __s16 tmp;
444 struct lanman_neg_rsp *rsp = (struct lanman_neg_rsp *)pSMBr;
445
446 if ((secFlags & CIFSSEC_MAY_LANMAN) ||
447 (secFlags & CIFSSEC_MAY_PLNTXT))
448 server->secType = LANMAN;
449 else {
450 cifs_dbg(VFS, "mount failed weak security disabled in /proc/fs/cifs/SecurityFlags\n");
451 rc = -EOPNOTSUPP;
452 goto neg_err_exit;
453 }
454 server->sec_mode = le16_to_cpu(rsp->SecurityMode);
455 server->maxReq = min_t(unsigned int,
456 le16_to_cpu(rsp->MaxMpxCount),
457 cifs_max_pending);
458 set_credits(server, server->maxReq);
459 server->maxBuf = le16_to_cpu(rsp->MaxBufSize);
460 server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs);
461 /* even though we do not use raw we might as well set this
462 accurately, in case we ever find a need for it */
463 if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) {
464 server->max_rw = 0xFF00;
465 server->capabilities = CAP_MPX_MODE | CAP_RAW_MODE;
466 } else {
467 server->max_rw = 0;/* do not need to use raw anyway */
468 server->capabilities = CAP_MPX_MODE;
469 }
470 tmp = (__s16)le16_to_cpu(rsp->ServerTimeZone);
471 if (tmp == -1) {
472 /* OS/2 often does not set timezone therefore
473 * we must use server time to calc time zone.
474 * Could deviate slightly from the right zone.
475 * Smallest defined timezone difference is 15 minutes
476 * (i.e. Nepal). Rounding up/down is done to match
477 * this requirement.
478 */
479 int val, seconds, remain, result;
480 struct timespec ts, utc;
481 utc = CURRENT_TIME;
482 ts = cnvrtDosUnixTm(rsp->SrvTime.Date,
483 rsp->SrvTime.Time, 0);
484 cifs_dbg(FYI, "SrvTime %d sec since 1970 (utc: %d) diff: %d\n",
485 (int)ts.tv_sec, (int)utc.tv_sec,
486 (int)(utc.tv_sec - ts.tv_sec));
487 val = (int)(utc.tv_sec - ts.tv_sec);
488 seconds = abs(val);
489 result = (seconds / MIN_TZ_ADJ) * MIN_TZ_ADJ;
490 remain = seconds % MIN_TZ_ADJ;
491 if (remain >= (MIN_TZ_ADJ / 2))
492 result += MIN_TZ_ADJ;
493 if (val < 0)
494 result = -result;
495 server->timeAdj = result;
496 } else {
497 server->timeAdj = (int)tmp;
498 server->timeAdj *= 60; /* also in seconds */
499 }
500 cifs_dbg(FYI, "server->timeAdj: %d seconds\n", server->timeAdj);
501
502
503 /* BB get server time for time conversions and add
504 code to use it and timezone since this is not UTC */
505
506 if (rsp->EncryptionKeyLength ==
507 cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) {
508 memcpy(ses->server->cryptkey, rsp->EncryptionKey,
509 CIFS_CRYPTO_KEY_SIZE);
510 } else if (server->sec_mode & SECMODE_PW_ENCRYPT) {
511 rc = -EIO; /* need cryptkey unless plain text */
512 goto neg_err_exit;
513 }
514
515 cifs_dbg(FYI, "LANMAN negotiated\n");
516 /* we will not end up setting signing flags - as no signing
517 was in LANMAN and server did not return the flags on */
518 goto signing_check;
519#else /* weak security disabled */
520 } else if (pSMBr->hdr.WordCount == 13) { 601 } else if (pSMBr->hdr.WordCount == 13) {
521 cifs_dbg(VFS, "mount failed, cifs module not built with CIFS_WEAK_PW_HASH support\n"); 602 server->negflavor = CIFS_NEGFLAVOR_LANMAN;
522 rc = -EOPNOTSUPP; 603 rc = decode_lanman_negprot_rsp(server, pSMBr);
523#endif /* WEAK_PW_HASH */ 604 goto signing_check;
524 goto neg_err_exit;
525 } else if (pSMBr->hdr.WordCount != 17) { 605 } else if (pSMBr->hdr.WordCount != 17) {
526 /* unknown wct */ 606 /* unknown wct */
527 rc = -EOPNOTSUPP; 607 rc = -EOPNOTSUPP;
528 goto neg_err_exit; 608 goto neg_err_exit;
529 } 609 }
530 /* else wct == 17 NTLM */ 610 /* else wct == 17, NTLM or better */
611
531 server->sec_mode = pSMBr->SecurityMode; 612 server->sec_mode = pSMBr->SecurityMode;
532 if ((server->sec_mode & SECMODE_USER) == 0) 613 if ((server->sec_mode & SECMODE_USER) == 0)
533 cifs_dbg(FYI, "share mode security\n"); 614 cifs_dbg(FYI, "share mode security\n");
534 615
535 if ((server->sec_mode & SECMODE_PW_ENCRYPT) == 0)
536#ifdef CONFIG_CIFS_WEAK_PW_HASH
537 if ((secFlags & CIFSSEC_MAY_PLNTXT) == 0)
538#endif /* CIFS_WEAK_PW_HASH */
539 cifs_dbg(VFS, "Server requests plain text password but client support disabled\n");
540
541 if ((secFlags & CIFSSEC_MUST_NTLMV2) == CIFSSEC_MUST_NTLMV2)
542 server->secType = NTLMv2;
543 else if (secFlags & CIFSSEC_MAY_NTLM)
544 server->secType = NTLM;
545 else if (secFlags & CIFSSEC_MAY_NTLMV2)
546 server->secType = NTLMv2;
547 else if (secFlags & CIFSSEC_MAY_KRB5)
548 server->secType = Kerberos;
549 else if (secFlags & CIFSSEC_MAY_NTLMSSP)
550 server->secType = RawNTLMSSP;
551 else if (secFlags & CIFSSEC_MAY_LANMAN)
552 server->secType = LANMAN;
553 else {
554 rc = -EOPNOTSUPP;
555 cifs_dbg(VFS, "Invalid security type\n");
556 goto neg_err_exit;
557 }
558 /* else ... any others ...? */
559
560 /* one byte, so no need to convert this or EncryptionKeyLen from 616 /* one byte, so no need to convert this or EncryptionKeyLen from
561 little endian */ 617 little endian */
562 server->maxReq = min_t(unsigned int, le16_to_cpu(pSMBr->MaxMpxCount), 618 server->maxReq = min_t(unsigned int, le16_to_cpu(pSMBr->MaxMpxCount),
@@ -569,90 +625,26 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses)
569 server->capabilities = le32_to_cpu(pSMBr->Capabilities); 625 server->capabilities = le32_to_cpu(pSMBr->Capabilities);
570 server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone); 626 server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone);
571 server->timeAdj *= 60; 627 server->timeAdj *= 60;
628
572 if (pSMBr->EncryptionKeyLength == CIFS_CRYPTO_KEY_SIZE) { 629 if (pSMBr->EncryptionKeyLength == CIFS_CRYPTO_KEY_SIZE) {
630 server->negflavor = CIFS_NEGFLAVOR_UNENCAP;
573 memcpy(ses->server->cryptkey, pSMBr->u.EncryptionKey, 631 memcpy(ses->server->cryptkey, pSMBr->u.EncryptionKey,
574 CIFS_CRYPTO_KEY_SIZE); 632 CIFS_CRYPTO_KEY_SIZE);
575 } else if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC || 633 } else if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC ||
576 server->capabilities & CAP_EXTENDED_SECURITY) && 634 server->capabilities & CAP_EXTENDED_SECURITY) &&
577 (pSMBr->EncryptionKeyLength == 0)) { 635 (pSMBr->EncryptionKeyLength == 0)) {
578 /* decode security blob */ 636 server->negflavor = CIFS_NEGFLAVOR_EXTENDED;
579 count = get_bcc(&pSMBr->hdr); 637 rc = decode_ext_sec_blob(ses, pSMBr);
580 if (count < 16) {
581 rc = -EIO;
582 goto neg_err_exit;
583 }
584 spin_lock(&cifs_tcp_ses_lock);
585 if (server->srv_count > 1) {
586 spin_unlock(&cifs_tcp_ses_lock);
587 if (memcmp(server->server_GUID,
588 pSMBr->u.extended_response.
589 GUID, 16) != 0) {
590 cifs_dbg(FYI, "server UID changed\n");
591 memcpy(server->server_GUID,
592 pSMBr->u.extended_response.GUID,
593 16);
594 }
595 } else {
596 spin_unlock(&cifs_tcp_ses_lock);
597 memcpy(server->server_GUID,
598 pSMBr->u.extended_response.GUID, 16);
599 }
600
601 if (count == 16) {
602 server->secType = RawNTLMSSP;
603 } else {
604 rc = decode_negTokenInit(pSMBr->u.extended_response.
605 SecurityBlob, count - 16,
606 server);
607 if (rc == 1)
608 rc = 0;
609 else
610 rc = -EINVAL;
611 if (server->secType == Kerberos) {
612 if (!server->sec_kerberos &&
613 !server->sec_mskerberos)
614 rc = -EOPNOTSUPP;
615 } else if (server->secType == RawNTLMSSP) {
616 if (!server->sec_ntlmssp)
617 rc = -EOPNOTSUPP;
618 } else
619 rc = -EOPNOTSUPP;
620 }
621 } else if (server->sec_mode & SECMODE_PW_ENCRYPT) { 638 } else if (server->sec_mode & SECMODE_PW_ENCRYPT) {
622 rc = -EIO; /* no crypt key only if plain text pwd */ 639 rc = -EIO; /* no crypt key only if plain text pwd */
623 goto neg_err_exit;
624 } else
625 server->capabilities &= ~CAP_EXTENDED_SECURITY;
626
627#ifdef CONFIG_CIFS_WEAK_PW_HASH
628signing_check:
629#endif
630 if ((secFlags & CIFSSEC_MAY_SIGN) == 0) {
631 /* MUST_SIGN already includes the MAY_SIGN FLAG
632 so if this is zero it means that signing is disabled */
633 cifs_dbg(FYI, "Signing disabled\n");
634 if (server->sec_mode & SECMODE_SIGN_REQUIRED) {
635 cifs_dbg(VFS, "Server requires packet signing to be enabled in /proc/fs/cifs/SecurityFlags\n");
636 rc = -EOPNOTSUPP;
637 }
638 server->sec_mode &=
639 ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED);
640 } else if ((secFlags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) {
641 /* signing required */
642 cifs_dbg(FYI, "Must sign - secFlags 0x%x\n", secFlags);
643 if ((server->sec_mode &
644 (SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED)) == 0) {
645 cifs_dbg(VFS, "signing required but server lacks support\n");
646 rc = -EOPNOTSUPP;
647 } else
648 server->sec_mode |= SECMODE_SIGN_REQUIRED;
649 } else { 640 } else {
650 /* signing optional ie CIFSSEC_MAY_SIGN */ 641 server->negflavor = CIFS_NEGFLAVOR_UNENCAP;
651 if ((server->sec_mode & SECMODE_SIGN_REQUIRED) == 0) 642 server->capabilities &= ~CAP_EXTENDED_SECURITY;
652 server->sec_mode &=
653 ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED);
654 } 643 }
655 644
645signing_check:
646 if (!rc)
647 rc = cifs_enable_signing(server, ses->sign);
656neg_err_exit: 648neg_err_exit:
657 cifs_buf_release(pSMB); 649 cifs_buf_release(pSMB);
658 650
@@ -777,9 +769,8 @@ CIFSSMBLogoff(const unsigned int xid, struct cifs_ses *ses)
777 769
778 pSMB->hdr.Mid = get_next_mid(ses->server); 770 pSMB->hdr.Mid = get_next_mid(ses->server);
779 771
780 if (ses->server->sec_mode & 772 if (ses->server->sign)
781 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) 773 pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
782 pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
783 774
784 pSMB->hdr.Uid = ses->Suid; 775 pSMB->hdr.Uid = ses->Suid;
785 776
@@ -1540,8 +1531,7 @@ cifs_readv_callback(struct mid_q_entry *mid)
1540 switch (mid->mid_state) { 1531 switch (mid->mid_state) {
1541 case MID_RESPONSE_RECEIVED: 1532 case MID_RESPONSE_RECEIVED:
1542 /* result already set, check signature */ 1533 /* result already set, check signature */
1543 if (server->sec_mode & 1534 if (server->sign) {
1544 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
1545 int rc = 0; 1535 int rc = 0;
1546 1536
1547 rc = cifs_verify_signature(&rqst, server, 1537 rc = cifs_verify_signature(&rqst, server,
@@ -3940,6 +3930,7 @@ QFileInfoRetry:
3940 pSMB->Pad = 0; 3930 pSMB->Pad = 0;
3941 pSMB->Fid = netfid; 3931 pSMB->Fid = netfid;
3942 inc_rfc1001_len(pSMB, byte_count); 3932 inc_rfc1001_len(pSMB, byte_count);
3933 pSMB->t2.ByteCount = cpu_to_le16(byte_count);
3943 3934
3944 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 3935 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
3945 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 3936 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
@@ -4108,6 +4099,7 @@ UnixQFileInfoRetry:
4108 pSMB->Pad = 0; 4099 pSMB->Pad = 0;
4109 pSMB->Fid = netfid; 4100 pSMB->Fid = netfid;
4110 inc_rfc1001_len(pSMB, byte_count); 4101 inc_rfc1001_len(pSMB, byte_count);
4102 pSMB->t2.ByteCount = cpu_to_le16(byte_count);
4111 4103
4112 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 4104 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
4113 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 4105 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
@@ -4794,11 +4786,8 @@ getDFSRetry:
4794 strncpy(pSMB->RequestFileName, search_name, name_len); 4786 strncpy(pSMB->RequestFileName, search_name, name_len);
4795 } 4787 }
4796 4788
4797 if (ses->server) { 4789 if (ses->server && ses->server->sign)
4798 if (ses->server->sec_mode & 4790 pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
4799 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
4800 pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
4801 }
4802 4791
4803 pSMB->hdr.Uid = ses->Suid; 4792 pSMB->hdr.Uid = ses->Suid;
4804 4793
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index e3bc39bb9d12..afcb8a1a33b7 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -85,7 +85,7 @@ enum {
85 Opt_acl, Opt_noacl, Opt_locallease, 85 Opt_acl, Opt_noacl, Opt_locallease,
86 Opt_sign, Opt_seal, Opt_noac, 86 Opt_sign, Opt_seal, Opt_noac,
87 Opt_fsc, Opt_mfsymlinks, 87 Opt_fsc, Opt_mfsymlinks,
88 Opt_multiuser, Opt_sloppy, 88 Opt_multiuser, Opt_sloppy, Opt_nosharesock,
89 89
90 /* Mount options which take numeric value */ 90 /* Mount options which take numeric value */
91 Opt_backupuid, Opt_backupgid, Opt_uid, 91 Opt_backupuid, Opt_backupgid, Opt_uid,
@@ -165,6 +165,7 @@ static const match_table_t cifs_mount_option_tokens = {
165 { Opt_mfsymlinks, "mfsymlinks" }, 165 { Opt_mfsymlinks, "mfsymlinks" },
166 { Opt_multiuser, "multiuser" }, 166 { Opt_multiuser, "multiuser" },
167 { Opt_sloppy, "sloppy" }, 167 { Opt_sloppy, "sloppy" },
168 { Opt_nosharesock, "nosharesock" },
168 169
169 { Opt_backupuid, "backupuid=%s" }, 170 { Opt_backupuid, "backupuid=%s" },
170 { Opt_backupgid, "backupgid=%s" }, 171 { Opt_backupgid, "backupgid=%s" },
@@ -275,6 +276,7 @@ static const match_table_t cifs_smb_version_tokens = {
275 { Smb_20, SMB20_VERSION_STRING}, 276 { Smb_20, SMB20_VERSION_STRING},
276 { Smb_21, SMB21_VERSION_STRING }, 277 { Smb_21, SMB21_VERSION_STRING },
277 { Smb_30, SMB30_VERSION_STRING }, 278 { Smb_30, SMB30_VERSION_STRING },
279 { Smb_302, SMB302_VERSION_STRING },
278}; 280};
279 281
280static int ip_connect(struct TCP_Server_Info *server); 282static int ip_connect(struct TCP_Server_Info *server);
@@ -1024,44 +1026,48 @@ static int cifs_parse_security_flavors(char *value,
1024 1026
1025 substring_t args[MAX_OPT_ARGS]; 1027 substring_t args[MAX_OPT_ARGS];
1026 1028
1029 /*
1030 * With mount options, the last one should win. Reset any existing
1031 * settings back to default.
1032 */
1033 vol->sectype = Unspecified;
1034 vol->sign = false;
1035
1027 switch (match_token(value, cifs_secflavor_tokens, args)) { 1036 switch (match_token(value, cifs_secflavor_tokens, args)) {
1028 case Opt_sec_krb5:
1029 vol->secFlg |= CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_SIGN;
1030 break;
1031 case Opt_sec_krb5i:
1032 vol->secFlg |= CIFSSEC_MAY_KRB5 | CIFSSEC_MUST_SIGN;
1033 break;
1034 case Opt_sec_krb5p: 1037 case Opt_sec_krb5p:
1035 /* vol->secFlg |= CIFSSEC_MUST_SEAL | CIFSSEC_MAY_KRB5; */ 1038 cifs_dbg(VFS, "sec=krb5p is not supported!\n");
1036 cifs_dbg(VFS, "Krb5 cifs privacy not supported\n"); 1039 return 1;
1037 break; 1040 case Opt_sec_krb5i:
1038 case Opt_sec_ntlmssp: 1041 vol->sign = true;
1039 vol->secFlg |= CIFSSEC_MAY_NTLMSSP; 1042 /* Fallthrough */
1043 case Opt_sec_krb5:
1044 vol->sectype = Kerberos;
1040 break; 1045 break;
1041 case Opt_sec_ntlmsspi: 1046 case Opt_sec_ntlmsspi:
1042 vol->secFlg |= CIFSSEC_MAY_NTLMSSP | CIFSSEC_MUST_SIGN; 1047 vol->sign = true;
1043 break; 1048 /* Fallthrough */
1044 case Opt_ntlm: 1049 case Opt_sec_ntlmssp:
1045 /* ntlm is default so can be turned off too */ 1050 vol->sectype = RawNTLMSSP;
1046 vol->secFlg |= CIFSSEC_MAY_NTLM;
1047 break; 1051 break;
1048 case Opt_sec_ntlmi: 1052 case Opt_sec_ntlmi:
1049 vol->secFlg |= CIFSSEC_MAY_NTLM | CIFSSEC_MUST_SIGN; 1053 vol->sign = true;
1050 break; 1054 /* Fallthrough */
1051 case Opt_sec_ntlmv2: 1055 case Opt_ntlm:
1052 vol->secFlg |= CIFSSEC_MAY_NTLMV2; 1056 vol->sectype = NTLM;
1053 break; 1057 break;
1054 case Opt_sec_ntlmv2i: 1058 case Opt_sec_ntlmv2i:
1055 vol->secFlg |= CIFSSEC_MAY_NTLMV2 | CIFSSEC_MUST_SIGN; 1059 vol->sign = true;
1060 /* Fallthrough */
1061 case Opt_sec_ntlmv2:
1062 vol->sectype = NTLMv2;
1056 break; 1063 break;
1057#ifdef CONFIG_CIFS_WEAK_PW_HASH 1064#ifdef CONFIG_CIFS_WEAK_PW_HASH
1058 case Opt_sec_lanman: 1065 case Opt_sec_lanman:
1059 vol->secFlg |= CIFSSEC_MAY_LANMAN; 1066 vol->sectype = LANMAN;
1060 break; 1067 break;
1061#endif 1068#endif
1062 case Opt_sec_none: 1069 case Opt_sec_none:
1063 vol->nullauth = 1; 1070 vol->nullauth = 1;
1064 vol->secFlg |= CIFSSEC_MAY_NTLM;
1065 break; 1071 break;
1066 default: 1072 default:
1067 cifs_dbg(VFS, "bad security option: %s\n", value); 1073 cifs_dbg(VFS, "bad security option: %s\n", value);
@@ -1119,6 +1125,10 @@ cifs_parse_smb_version(char *value, struct smb_vol *vol)
1119 vol->ops = &smb30_operations; 1125 vol->ops = &smb30_operations;
1120 vol->vals = &smb30_values; 1126 vol->vals = &smb30_values;
1121 break; 1127 break;
1128 case Smb_302:
1129 vol->ops = &smb30_operations; /* currently identical with 3.0 */
1130 vol->vals = &smb302_values;
1131 break;
1122#endif 1132#endif
1123 default: 1133 default:
1124 cifs_dbg(VFS, "Unknown vers= option specified: %s\n", value); 1134 cifs_dbg(VFS, "Unknown vers= option specified: %s\n", value);
@@ -1424,7 +1434,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1424 vol->local_lease = 1; 1434 vol->local_lease = 1;
1425 break; 1435 break;
1426 case Opt_sign: 1436 case Opt_sign:
1427 vol->secFlg |= CIFSSEC_MUST_SIGN; 1437 vol->sign = true;
1428 break; 1438 break;
1429 case Opt_seal: 1439 case Opt_seal:
1430 /* we do not do the following in secFlags because seal 1440 /* we do not do the following in secFlags because seal
@@ -1455,6 +1465,9 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1455 case Opt_sloppy: 1465 case Opt_sloppy:
1456 sloppy = true; 1466 sloppy = true;
1457 break; 1467 break;
1468 case Opt_nosharesock:
1469 vol->nosharesock = true;
1470 break;
1458 1471
1459 /* Numeric Values */ 1472 /* Numeric Values */
1460 case Opt_backupuid: 1473 case Opt_backupuid:
@@ -1978,47 +1991,21 @@ match_address(struct TCP_Server_Info *server, struct sockaddr *addr,
1978static bool 1991static bool
1979match_security(struct TCP_Server_Info *server, struct smb_vol *vol) 1992match_security(struct TCP_Server_Info *server, struct smb_vol *vol)
1980{ 1993{
1981 unsigned int secFlags; 1994 /*
1982 1995 * The select_sectype function should either return the vol->sectype
1983 if (vol->secFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL))) 1996 * that was specified, or "Unspecified" if that sectype was not
1984 secFlags = vol->secFlg; 1997 * compatible with the given NEGOTIATE request.
1985 else 1998 */
1986 secFlags = global_secflags | vol->secFlg; 1999 if (select_sectype(server, vol->sectype) == Unspecified)
1987
1988 switch (server->secType) {
1989 case LANMAN:
1990 if (!(secFlags & (CIFSSEC_MAY_LANMAN|CIFSSEC_MAY_PLNTXT)))
1991 return false;
1992 break;
1993 case NTLMv2:
1994 if (!(secFlags & CIFSSEC_MAY_NTLMV2))
1995 return false;
1996 break;
1997 case NTLM:
1998 if (!(secFlags & CIFSSEC_MAY_NTLM))
1999 return false;
2000 break;
2001 case Kerberos:
2002 if (!(secFlags & CIFSSEC_MAY_KRB5))
2003 return false;
2004 break;
2005 case RawNTLMSSP:
2006 if (!(secFlags & CIFSSEC_MAY_NTLMSSP))
2007 return false;
2008 break;
2009 default:
2010 /* shouldn't happen */
2011 return false; 2000 return false;
2012 }
2013 2001
2014 /* now check if signing mode is acceptable */ 2002 /*
2015 if ((secFlags & CIFSSEC_MAY_SIGN) == 0 && 2003 * Now check if signing mode is acceptable. No need to check
2016 (server->sec_mode & SECMODE_SIGN_REQUIRED)) 2004 * global_secflags at this point since if MUST_SIGN is set then
2017 return false; 2005 * the server->sign had better be too.
2018 else if (((secFlags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) && 2006 */
2019 (server->sec_mode & 2007 if (vol->sign && !server->sign)
2020 (SECMODE_SIGN_ENABLED|SECMODE_SIGN_REQUIRED)) == 0) 2008 return false;
2021 return false;
2022 2009
2023 return true; 2010 return true;
2024} 2011}
@@ -2027,6 +2014,9 @@ static int match_server(struct TCP_Server_Info *server, struct smb_vol *vol)
2027{ 2014{
2028 struct sockaddr *addr = (struct sockaddr *)&vol->dstaddr; 2015 struct sockaddr *addr = (struct sockaddr *)&vol->dstaddr;
2029 2016
2017 if (vol->nosharesock)
2018 return 0;
2019
2030 if ((server->vals != vol->vals) || (server->ops != vol->ops)) 2020 if ((server->vals != vol->vals) || (server->ops != vol->ops))
2031 return 0; 2021 return 0;
2032 2022
@@ -2216,7 +2206,11 @@ out_err:
2216 2206
2217static int match_session(struct cifs_ses *ses, struct smb_vol *vol) 2207static int match_session(struct cifs_ses *ses, struct smb_vol *vol)
2218{ 2208{
2219 switch (ses->server->secType) { 2209 if (vol->sectype != Unspecified &&
2210 vol->sectype != ses->sectype)
2211 return 0;
2212
2213 switch (ses->sectype) {
2220 case Kerberos: 2214 case Kerberos:
2221 if (!uid_eq(vol->cred_uid, ses->cred_uid)) 2215 if (!uid_eq(vol->cred_uid, ses->cred_uid))
2222 return 0; 2216 return 0;
@@ -2493,7 +2487,8 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
2493 ses->cred_uid = volume_info->cred_uid; 2487 ses->cred_uid = volume_info->cred_uid;
2494 ses->linux_uid = volume_info->linux_uid; 2488 ses->linux_uid = volume_info->linux_uid;
2495 2489
2496 ses->overrideSecFlg = volume_info->secFlg; 2490 ses->sectype = volume_info->sectype;
2491 ses->sign = volume_info->sign;
2497 2492
2498 mutex_lock(&ses->session_mutex); 2493 mutex_lock(&ses->session_mutex);
2499 rc = cifs_negotiate_protocol(xid, ses); 2494 rc = cifs_negotiate_protocol(xid, ses);
@@ -3656,7 +3651,7 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
3656 NTLMv2 password here) */ 3651 NTLMv2 password here) */
3657#ifdef CONFIG_CIFS_WEAK_PW_HASH 3652#ifdef CONFIG_CIFS_WEAK_PW_HASH
3658 if ((global_secflags & CIFSSEC_MAY_LANMAN) && 3653 if ((global_secflags & CIFSSEC_MAY_LANMAN) &&
3659 (ses->server->secType == LANMAN)) 3654 (ses->sectype == LANMAN))
3660 calc_lanman_hash(tcon->password, ses->server->cryptkey, 3655 calc_lanman_hash(tcon->password, ses->server->cryptkey,
3661 ses->server->sec_mode & 3656 ses->server->sec_mode &
3662 SECMODE_PW_ENCRYPT ? true : false, 3657 SECMODE_PW_ENCRYPT ? true : false,
@@ -3674,8 +3669,7 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
3674 } 3669 }
3675 } 3670 }
3676 3671
3677 if (ses->server->sec_mode & 3672 if (ses->server->sign)
3678 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
3679 smb_buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE; 3673 smb_buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
3680 3674
3681 if (ses->capabilities & CAP_STATUS32) { 3675 if (ses->capabilities & CAP_STATUS32) {
@@ -3738,7 +3732,7 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
3738 } 3732 }
3739 bcc_ptr += length + 1; 3733 bcc_ptr += length + 1;
3740 bytes_left -= (length + 1); 3734 bytes_left -= (length + 1);
3741 strncpy(tcon->treeName, tree, MAX_TREE_SIZE); 3735 strlcpy(tcon->treeName, tree, sizeof(tcon->treeName));
3742 3736
3743 /* mostly informational -- no need to fail on error here */ 3737 /* mostly informational -- no need to fail on error here */
3744 kfree(tcon->nativeFileSystem); 3738 kfree(tcon->nativeFileSystem);
@@ -3827,7 +3821,6 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
3827 int rc = -ENOSYS; 3821 int rc = -ENOSYS;
3828 struct TCP_Server_Info *server = ses->server; 3822 struct TCP_Server_Info *server = ses->server;
3829 3823
3830 ses->flags = 0;
3831 ses->capabilities = server->capabilities; 3824 ses->capabilities = server->capabilities;
3832 if (linuxExtEnabled == 0) 3825 if (linuxExtEnabled == 0)
3833 ses->capabilities &= (~server->vals->cap_unix); 3826 ses->capabilities &= (~server->vals->cap_unix);
@@ -3848,6 +3841,8 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
3848 server->sequence_number = 0x2; 3841 server->sequence_number = 0x2;
3849 server->session_estab = true; 3842 server->session_estab = true;
3850 ses->auth_key.response = NULL; 3843 ses->auth_key.response = NULL;
3844 if (server->ops->generate_signingkey)
3845 server->ops->generate_signingkey(server);
3851 } 3846 }
3852 mutex_unlock(&server->srv_mutex); 3847 mutex_unlock(&server->srv_mutex);
3853 3848
@@ -3870,23 +3865,11 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
3870static int 3865static int
3871cifs_set_vol_auth(struct smb_vol *vol, struct cifs_ses *ses) 3866cifs_set_vol_auth(struct smb_vol *vol, struct cifs_ses *ses)
3872{ 3867{
3873 switch (ses->server->secType) { 3868 vol->sectype = ses->sectype;
3874 case Kerberos: 3869
3875 vol->secFlg = CIFSSEC_MUST_KRB5; 3870 /* krb5 is special, since we don't need username or pw */
3871 if (vol->sectype == Kerberos)
3876 return 0; 3872 return 0;
3877 case NTLMv2:
3878 vol->secFlg = CIFSSEC_MUST_NTLMV2;
3879 break;
3880 case NTLM:
3881 vol->secFlg = CIFSSEC_MUST_NTLM;
3882 break;
3883 case RawNTLMSSP:
3884 vol->secFlg = CIFSSEC_MUST_NTLMSSP;
3885 break;
3886 case LANMAN:
3887 vol->secFlg = CIFSSEC_MUST_LANMAN;
3888 break;
3889 }
3890 3873
3891 return cifs_set_cifscreds(vol, ses); 3874 return cifs_set_cifscreds(vol, ses);
3892} 3875}
@@ -3912,6 +3895,8 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid)
3912 vol_info->nocase = master_tcon->nocase; 3895 vol_info->nocase = master_tcon->nocase;
3913 vol_info->local_lease = master_tcon->local_lease; 3896 vol_info->local_lease = master_tcon->local_lease;
3914 vol_info->no_linux_ext = !master_tcon->unix_ext; 3897 vol_info->no_linux_ext = !master_tcon->unix_ext;
3898 vol_info->sectype = master_tcon->ses->sectype;
3899 vol_info->sign = master_tcon->ses->sign;
3915 3900
3916 rc = cifs_set_vol_auth(vol_info, master_tcon->ses); 3901 rc = cifs_set_vol_auth(vol_info, master_tcon->ses);
3917 if (rc) { 3902 if (rc) {
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 5699b5036ed8..5175aebf6737 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -822,8 +822,7 @@ const struct dentry_operations cifs_dentry_ops = {
822/* d_delete: cifs_d_delete, */ /* not needed except for debugging */ 822/* d_delete: cifs_d_delete, */ /* not needed except for debugging */
823}; 823};
824 824
825static int cifs_ci_hash(const struct dentry *dentry, const struct inode *inode, 825static int cifs_ci_hash(const struct dentry *dentry, struct qstr *q)
826 struct qstr *q)
827{ 826{
828 struct nls_table *codepage = CIFS_SB(dentry->d_sb)->local_nls; 827 struct nls_table *codepage = CIFS_SB(dentry->d_sb)->local_nls;
829 unsigned long hash; 828 unsigned long hash;
@@ -838,12 +837,10 @@ static int cifs_ci_hash(const struct dentry *dentry, const struct inode *inode,
838 return 0; 837 return 0;
839} 838}
840 839
841static int cifs_ci_compare(const struct dentry *parent, 840static int cifs_ci_compare(const struct dentry *parent, const struct dentry *dentry,
842 const struct inode *pinode,
843 const struct dentry *dentry, const struct inode *inode,
844 unsigned int len, const char *str, const struct qstr *name) 841 unsigned int len, const char *str, const struct qstr *name)
845{ 842{
846 struct nls_table *codepage = CIFS_SB(pinode->i_sb)->local_nls; 843 struct nls_table *codepage = CIFS_SB(parent->d_sb)->local_nls;
847 844
848 if ((name->len == len) && 845 if ((name->len == len) &&
849 (nls_strnicmp(codepage, name->name, str, len) == 0)) 846 (nls_strnicmp(codepage, name->name, str, len) == 0))
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 4d8ba8d491e5..91d8629e69a2 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -999,7 +999,7 @@ try_again:
999 rc = wait_event_interruptible(flock->fl_wait, !flock->fl_next); 999 rc = wait_event_interruptible(flock->fl_wait, !flock->fl_next);
1000 if (!rc) 1000 if (!rc)
1001 goto try_again; 1001 goto try_again;
1002 locks_delete_block(flock); 1002 posix_unblock_lock(flock);
1003 } 1003 }
1004 return rc; 1004 return rc;
1005} 1005}
@@ -1092,6 +1092,7 @@ struct lock_to_push {
1092static int 1092static int
1093cifs_push_posix_locks(struct cifsFileInfo *cfile) 1093cifs_push_posix_locks(struct cifsFileInfo *cfile)
1094{ 1094{
1095 struct inode *inode = cfile->dentry->d_inode;
1095 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); 1096 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
1096 struct file_lock *flock, **before; 1097 struct file_lock *flock, **before;
1097 unsigned int count = 0, i = 0; 1098 unsigned int count = 0, i = 0;
@@ -1102,12 +1103,12 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
1102 1103
1103 xid = get_xid(); 1104 xid = get_xid();
1104 1105
1105 lock_flocks(); 1106 spin_lock(&inode->i_lock);
1106 cifs_for_each_lock(cfile->dentry->d_inode, before) { 1107 cifs_for_each_lock(inode, before) {
1107 if ((*before)->fl_flags & FL_POSIX) 1108 if ((*before)->fl_flags & FL_POSIX)
1108 count++; 1109 count++;
1109 } 1110 }
1110 unlock_flocks(); 1111 spin_unlock(&inode->i_lock);
1111 1112
1112 INIT_LIST_HEAD(&locks_to_send); 1113 INIT_LIST_HEAD(&locks_to_send);
1113 1114
@@ -1126,8 +1127,8 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
1126 } 1127 }
1127 1128
1128 el = locks_to_send.next; 1129 el = locks_to_send.next;
1129 lock_flocks(); 1130 spin_lock(&inode->i_lock);
1130 cifs_for_each_lock(cfile->dentry->d_inode, before) { 1131 cifs_for_each_lock(inode, before) {
1131 flock = *before; 1132 flock = *before;
1132 if ((flock->fl_flags & FL_POSIX) == 0) 1133 if ((flock->fl_flags & FL_POSIX) == 0)
1133 continue; 1134 continue;
@@ -1152,7 +1153,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
1152 lck->offset = flock->fl_start; 1153 lck->offset = flock->fl_start;
1153 el = el->next; 1154 el = el->next;
1154 } 1155 }
1155 unlock_flocks(); 1156 spin_unlock(&inode->i_lock);
1156 1157
1157 list_for_each_entry_safe(lck, tmp, &locks_to_send, llist) { 1158 list_for_each_entry_safe(lck, tmp, &locks_to_send, llist) {
1158 int stored_rc; 1159 int stored_rc;
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 1bec014779fd..f7d4b2285efe 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -267,8 +267,7 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
267 if (treeCon->nocase) 267 if (treeCon->nocase)
268 buffer->Flags |= SMBFLG_CASELESS; 268 buffer->Flags |= SMBFLG_CASELESS;
269 if ((treeCon->ses) && (treeCon->ses->server)) 269 if ((treeCon->ses) && (treeCon->ses->server))
270 if (treeCon->ses->server->sec_mode & 270 if (treeCon->ses->server->sign)
271 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
272 buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE; 271 buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
273 } 272 }
274 273
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index f1213799de1a..ab8778469394 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -126,6 +126,22 @@ out:
126 dput(dentry); 126 dput(dentry);
127} 127}
128 128
129/*
130 * Is it possible that this directory might turn out to be a DFS referral
131 * once we go to try and use it?
132 */
133static bool
134cifs_dfs_is_possible(struct cifs_sb_info *cifs_sb)
135{
136#ifdef CONFIG_CIFS_DFS_UPCALL
137 struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
138
139 if (tcon->Flags & SMB_SHARE_IS_IN_DFS)
140 return true;
141#endif
142 return false;
143}
144
129static void 145static void
130cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb) 146cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb)
131{ 147{
@@ -135,6 +151,19 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb)
135 if (fattr->cf_cifsattrs & ATTR_DIRECTORY) { 151 if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
136 fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode; 152 fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode;
137 fattr->cf_dtype = DT_DIR; 153 fattr->cf_dtype = DT_DIR;
154 /*
155 * Windows CIFS servers generally make DFS referrals look
156 * like directories in FIND_* responses with the reparse
157 * attribute flag also set (since DFS junctions are
158 * reparse points). We must revalidate at least these
159 * directory inodes before trying to use them (if
160 * they are DFS we will get PATH_NOT_COVERED back
161 * when queried directly and can then try to connect
162 * to the DFS target)
163 */
164 if (cifs_dfs_is_possible(cifs_sb) &&
165 (fattr->cf_cifsattrs & ATTR_REPARSE))
166 fattr->cf_flags |= CIFS_FATTR_NEED_REVAL;
138 } else { 167 } else {
139 fattr->cf_mode = S_IFREG | cifs_sb->mnt_file_mode; 168 fattr->cf_mode = S_IFREG | cifs_sb->mnt_file_mode;
140 fattr->cf_dtype = DT_REG; 169 fattr->cf_dtype = DT_REG;
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index f230571a7ab3..79358e341fd2 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -138,8 +138,7 @@ static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, SESSION_SETUP_ANDX *pSMB)
138 capabilities = CAP_LARGE_FILES | CAP_NT_SMBS | CAP_LEVEL_II_OPLOCKS | 138 capabilities = CAP_LARGE_FILES | CAP_NT_SMBS | CAP_LEVEL_II_OPLOCKS |
139 CAP_LARGE_WRITE_X | CAP_LARGE_READ_X; 139 CAP_LARGE_WRITE_X | CAP_LARGE_READ_X;
140 140
141 if (ses->server->sec_mode & 141 if (ses->server->sign)
142 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
143 pSMB->req.hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE; 142 pSMB->req.hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
144 143
145 if (ses->capabilities & CAP_UNICODE) { 144 if (ses->capabilities & CAP_UNICODE) {
@@ -310,11 +309,10 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifs_ses *ses,
310 return; 309 return;
311} 310}
312 311
313static int decode_ascii_ssetup(char **pbcc_area, __u16 bleft, 312static void decode_ascii_ssetup(char **pbcc_area, __u16 bleft,
314 struct cifs_ses *ses, 313 struct cifs_ses *ses,
315 const struct nls_table *nls_cp) 314 const struct nls_table *nls_cp)
316{ 315{
317 int rc = 0;
318 int len; 316 int len;
319 char *bcc_ptr = *pbcc_area; 317 char *bcc_ptr = *pbcc_area;
320 318
@@ -322,24 +320,22 @@ static int decode_ascii_ssetup(char **pbcc_area, __u16 bleft,
322 320
323 len = strnlen(bcc_ptr, bleft); 321 len = strnlen(bcc_ptr, bleft);
324 if (len >= bleft) 322 if (len >= bleft)
325 return rc; 323 return;
326 324
327 kfree(ses->serverOS); 325 kfree(ses->serverOS);
328 326
329 ses->serverOS = kzalloc(len + 1, GFP_KERNEL); 327 ses->serverOS = kzalloc(len + 1, GFP_KERNEL);
330 if (ses->serverOS) 328 if (ses->serverOS)
331 strncpy(ses->serverOS, bcc_ptr, len); 329 strncpy(ses->serverOS, bcc_ptr, len);
332 if (strncmp(ses->serverOS, "OS/2", 4) == 0) { 330 if (strncmp(ses->serverOS, "OS/2", 4) == 0)
333 cifs_dbg(FYI, "OS/2 server\n"); 331 cifs_dbg(FYI, "OS/2 server\n");
334 ses->flags |= CIFS_SES_OS2;
335 }
336 332
337 bcc_ptr += len + 1; 333 bcc_ptr += len + 1;
338 bleft -= len + 1; 334 bleft -= len + 1;
339 335
340 len = strnlen(bcc_ptr, bleft); 336 len = strnlen(bcc_ptr, bleft);
341 if (len >= bleft) 337 if (len >= bleft)
342 return rc; 338 return;
343 339
344 kfree(ses->serverNOS); 340 kfree(ses->serverNOS);
345 341
@@ -352,7 +348,7 @@ static int decode_ascii_ssetup(char **pbcc_area, __u16 bleft,
352 348
353 len = strnlen(bcc_ptr, bleft); 349 len = strnlen(bcc_ptr, bleft);
354 if (len > bleft) 350 if (len > bleft)
355 return rc; 351 return;
356 352
357 /* No domain field in LANMAN case. Domain is 353 /* No domain field in LANMAN case. Domain is
358 returned by old servers in the SMB negprot response */ 354 returned by old servers in the SMB negprot response */
@@ -360,8 +356,6 @@ static int decode_ascii_ssetup(char **pbcc_area, __u16 bleft,
360 but thus do return domain here we could add parsing 356 but thus do return domain here we could add parsing
361 for it later, but it is not very important */ 357 for it later, but it is not very important */
362 cifs_dbg(FYI, "ascii: bytes left %d\n", bleft); 358 cifs_dbg(FYI, "ascii: bytes left %d\n", bleft);
363
364 return rc;
365} 359}
366 360
367int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len, 361int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
@@ -432,8 +426,7 @@ void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
432 flags = NTLMSSP_NEGOTIATE_56 | NTLMSSP_REQUEST_TARGET | 426 flags = NTLMSSP_NEGOTIATE_56 | NTLMSSP_REQUEST_TARGET |
433 NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE | 427 NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
434 NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC; 428 NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC;
435 if (ses->server->sec_mode & 429 if (ses->server->sign) {
436 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
437 flags |= NTLMSSP_NEGOTIATE_SIGN; 430 flags |= NTLMSSP_NEGOTIATE_SIGN;
438 if (!ses->server->session_estab) 431 if (!ses->server->session_estab)
439 flags |= NTLMSSP_NEGOTIATE_KEY_XCH; 432 flags |= NTLMSSP_NEGOTIATE_KEY_XCH;
@@ -471,8 +464,7 @@ int build_ntlmssp_auth_blob(unsigned char *pbuffer,
471 NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_TARGET_INFO | 464 NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_TARGET_INFO |
472 NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE | 465 NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
473 NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC; 466 NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC;
474 if (ses->server->sec_mode & 467 if (ses->server->sign) {
475 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
476 flags |= NTLMSSP_NEGOTIATE_SIGN; 468 flags |= NTLMSSP_NEGOTIATE_SIGN;
477 if (!ses->server->session_estab) 469 if (!ses->server->session_estab)
478 flags |= NTLMSSP_NEGOTIATE_KEY_XCH; 470 flags |= NTLMSSP_NEGOTIATE_KEY_XCH;
@@ -558,6 +550,56 @@ setup_ntlmv2_ret:
558 return rc; 550 return rc;
559} 551}
560 552
553enum securityEnum
554select_sectype(struct TCP_Server_Info *server, enum securityEnum requested)
555{
556 switch (server->negflavor) {
557 case CIFS_NEGFLAVOR_EXTENDED:
558 switch (requested) {
559 case Kerberos:
560 case RawNTLMSSP:
561 return requested;
562 case Unspecified:
563 if (server->sec_ntlmssp &&
564 (global_secflags & CIFSSEC_MAY_NTLMSSP))
565 return RawNTLMSSP;
566 if ((server->sec_kerberos || server->sec_mskerberos) &&
567 (global_secflags & CIFSSEC_MAY_KRB5))
568 return Kerberos;
569 /* Fallthrough */
570 default:
571 return Unspecified;
572 }
573 case CIFS_NEGFLAVOR_UNENCAP:
574 switch (requested) {
575 case NTLM:
576 case NTLMv2:
577 return requested;
578 case Unspecified:
579 if (global_secflags & CIFSSEC_MAY_NTLMV2)
580 return NTLMv2;
581 if (global_secflags & CIFSSEC_MAY_NTLM)
582 return NTLM;
583 /* Fallthrough */
584 default:
585 return Unspecified;
586 }
587 case CIFS_NEGFLAVOR_LANMAN:
588 switch (requested) {
589 case LANMAN:
590 return requested;
591 case Unspecified:
592 if (global_secflags & CIFSSEC_MAY_LANMAN)
593 return LANMAN;
594 /* Fallthrough */
595 default:
596 return Unspecified;
597 }
598 default:
599 return Unspecified;
600 }
601}
602
561int 603int
562CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses, 604CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses,
563 const struct nls_table *nls_cp) 605 const struct nls_table *nls_cp)
@@ -579,11 +621,18 @@ CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses,
579 u16 blob_len; 621 u16 blob_len;
580 char *ntlmsspblob = NULL; 622 char *ntlmsspblob = NULL;
581 623
582 if (ses == NULL) 624 if (ses == NULL) {
625 WARN(1, "%s: ses == NULL!", __func__);
583 return -EINVAL; 626 return -EINVAL;
627 }
584 628
585 type = ses->server->secType; 629 type = select_sectype(ses->server, ses->sectype);
586 cifs_dbg(FYI, "sess setup type %d\n", type); 630 cifs_dbg(FYI, "sess setup type %d\n", type);
631 if (type == Unspecified) {
632 cifs_dbg(VFS, "Unable to select appropriate authentication method!");
633 return -EINVAL;
634 }
635
587 if (type == RawNTLMSSP) { 636 if (type == RawNTLMSSP) {
588 /* if memory allocation is successful, caller of this function 637 /* if memory allocation is successful, caller of this function
589 * frees it. 638 * frees it.
@@ -643,8 +692,6 @@ ssetup_ntlmssp_authenticate:
643 } 692 }
644 bcc_ptr = str_area; 693 bcc_ptr = str_area;
645 694
646 ses->flags &= ~CIFS_SES_LANMAN;
647
648 iov[1].iov_base = NULL; 695 iov[1].iov_base = NULL;
649 iov[1].iov_len = 0; 696 iov[1].iov_len = 0;
650 697
@@ -668,7 +715,6 @@ ssetup_ntlmssp_authenticate:
668 ses->server->sec_mode & SECMODE_PW_ENCRYPT ? 715 ses->server->sec_mode & SECMODE_PW_ENCRYPT ?
669 true : false, lnm_session_key); 716 true : false, lnm_session_key);
670 717
671 ses->flags |= CIFS_SES_LANMAN;
672 memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE); 718 memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE);
673 bcc_ptr += CIFS_AUTH_RESP_SIZE; 719 bcc_ptr += CIFS_AUTH_RESP_SIZE;
674 720
@@ -938,8 +984,7 @@ ssetup_ntlmssp_authenticate:
938 } 984 }
939 decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, nls_cp); 985 decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, nls_cp);
940 } else { 986 } else {
941 rc = decode_ascii_ssetup(&bcc_ptr, bytes_remaining, 987 decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses, nls_cp);
942 ses, nls_cp);
943 } 988 }
944 989
945ssetup_exit: 990ssetup_exit:
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index 3efdb9d5c0b8..e813f04511d8 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -449,8 +449,7 @@ cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
449 * WRITEX header, not including the 4 byte RFC1001 length. 449 * WRITEX header, not including the 4 byte RFC1001 length.
450 */ 450 */
451 if (!(server->capabilities & CAP_LARGE_WRITE_X) || 451 if (!(server->capabilities & CAP_LARGE_WRITE_X) ||
452 (!(server->capabilities & CAP_UNIX) && 452 (!(server->capabilities & CAP_UNIX) && server->sign))
453 (server->sec_mode & (SECMODE_SIGN_ENABLED|SECMODE_SIGN_REQUIRED))))
454 wsize = min_t(unsigned int, wsize, 453 wsize = min_t(unsigned int, wsize,
455 server->maxBuf - sizeof(WRITE_REQ) + 4); 454 server->maxBuf - sizeof(WRITE_REQ) + 4);
456 455
@@ -765,20 +764,14 @@ smb_set_file_info(struct inode *inode, const char *full_path,
765 } 764 }
766 tcon = tlink_tcon(tlink); 765 tcon = tlink_tcon(tlink);
767 766
768 /* 767 rc = CIFSSMBSetPathInfo(xid, tcon, full_path, buf, cifs_sb->local_nls,
769 * NT4 apparently returns success on this call, but it doesn't really
770 * work.
771 */
772 if (!(tcon->ses->flags & CIFS_SES_NT4)) {
773 rc = CIFSSMBSetPathInfo(xid, tcon, full_path, buf,
774 cifs_sb->local_nls,
775 cifs_sb->mnt_cifs_flags & 768 cifs_sb->mnt_cifs_flags &
776 CIFS_MOUNT_MAP_SPECIAL_CHR); 769 CIFS_MOUNT_MAP_SPECIAL_CHR);
777 if (rc == 0) { 770 if (rc == 0) {
778 cinode->cifsAttrs = le32_to_cpu(buf->Attributes); 771 cinode->cifsAttrs = le32_to_cpu(buf->Attributes);
779 goto out; 772 goto out;
780 } else if (rc != -EOPNOTSUPP && rc != -EINVAL) 773 } else if (rc != -EOPNOTSUPP && rc != -EINVAL) {
781 goto out; 774 goto out;
782 } 775 }
783 776
784 cifs_dbg(FYI, "calling SetFileInfo since SetPathInfo for times not supported by this server\n"); 777 cifs_dbg(FYI, "calling SetFileInfo since SetPathInfo for times not supported by this server\n");
@@ -964,4 +957,6 @@ struct smb_version_values smb1_values = {
964 .cap_nt_find = CAP_NT_SMBS | CAP_NT_FIND, 957 .cap_nt_find = CAP_NT_SMBS | CAP_NT_FIND,
965 .cap_large_files = CAP_LARGE_FILES, 958 .cap_large_files = CAP_LARGE_FILES,
966 .oplock_read = OPLOCK_READ, 959 .oplock_read = OPLOCK_READ,
960 .signing_enabled = SECMODE_SIGN_ENABLED,
961 .signing_required = SECMODE_SIGN_REQUIRED,
967}; 962};
diff --git a/fs/cifs/smb2glob.h b/fs/cifs/smb2glob.h
index 7c0e2143e775..c38350851b08 100644
--- a/fs/cifs/smb2glob.h
+++ b/fs/cifs/smb2glob.h
@@ -54,5 +54,7 @@
54#define SMB2_SIGNATURE_SIZE (16) 54#define SMB2_SIGNATURE_SIZE (16)
55#define SMB2_NTLMV2_SESSKEY_SIZE (16) 55#define SMB2_NTLMV2_SESSKEY_SIZE (16)
56#define SMB2_HMACSHA256_SIZE (32) 56#define SMB2_HMACSHA256_SIZE (32)
57#define SMB2_CMACAES_SIZE (16)
58#define SMB3_SIGNKEY_SIZE (16)
57 59
58#endif /* _SMB2_GLOB_H */ 60#endif /* _SMB2_GLOB_H */
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index 10383d8c015b..b0c43345cd98 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -266,6 +266,10 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr)
266 ((struct smb2_query_directory_rsp *)hdr)->OutputBufferLength); 266 ((struct smb2_query_directory_rsp *)hdr)->OutputBufferLength);
267 break; 267 break;
268 case SMB2_IOCTL: 268 case SMB2_IOCTL:
269 *off = le32_to_cpu(
270 ((struct smb2_ioctl_rsp *)hdr)->OutputOffset);
271 *len = le32_to_cpu(((struct smb2_ioctl_rsp *)hdr)->OutputCount);
272 break;
269 case SMB2_CHANGE_NOTIFY: 273 case SMB2_CHANGE_NOTIFY:
270 default: 274 default:
271 /* BB FIXME for unimplemented cases above */ 275 /* BB FIXME for unimplemented cases above */
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index f2e76f3b0c61..6d15cab95b99 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -281,6 +281,25 @@ smb2_clear_stats(struct cifs_tcon *tcon)
281} 281}
282 282
283static void 283static void
284smb2_dump_share_caps(struct seq_file *m, struct cifs_tcon *tcon)
285{
286 seq_puts(m, "\n\tShare Capabilities:");
287 if (tcon->capabilities & SMB2_SHARE_CAP_DFS)
288 seq_puts(m, " DFS,");
289 if (tcon->capabilities & SMB2_SHARE_CAP_CONTINUOUS_AVAILABILITY)
290 seq_puts(m, " CONTINUOUS AVAILABILITY,");
291 if (tcon->capabilities & SMB2_SHARE_CAP_SCALEOUT)
292 seq_puts(m, " SCALEOUT,");
293 if (tcon->capabilities & SMB2_SHARE_CAP_CLUSTER)
294 seq_puts(m, " CLUSTER,");
295 if (tcon->capabilities & SMB2_SHARE_CAP_ASYMMETRIC)
296 seq_puts(m, " ASYMMETRIC,");
297 if (tcon->capabilities == 0)
298 seq_puts(m, " None");
299 seq_printf(m, "\tShare Flags: 0x%x", tcon->share_flags);
300}
301
302static void
284smb2_print_stats(struct seq_file *m, struct cifs_tcon *tcon) 303smb2_print_stats(struct seq_file *m, struct cifs_tcon *tcon)
285{ 304{
286#ifdef CONFIG_CIFS_STATS 305#ifdef CONFIG_CIFS_STATS
@@ -292,7 +311,6 @@ smb2_print_stats(struct seq_file *m, struct cifs_tcon *tcon)
292 seq_printf(m, "\nSessionSetups: %d sent %d failed", 311 seq_printf(m, "\nSessionSetups: %d sent %d failed",
293 atomic_read(&sent[SMB2_SESSION_SETUP_HE]), 312 atomic_read(&sent[SMB2_SESSION_SETUP_HE]),
294 atomic_read(&failed[SMB2_SESSION_SETUP_HE])); 313 atomic_read(&failed[SMB2_SESSION_SETUP_HE]));
295#define SMB2LOGOFF 0x0002 /* trivial request/resp */
296 seq_printf(m, "\nLogoffs: %d sent %d failed", 314 seq_printf(m, "\nLogoffs: %d sent %d failed",
297 atomic_read(&sent[SMB2_LOGOFF_HE]), 315 atomic_read(&sent[SMB2_LOGOFF_HE]),
298 atomic_read(&failed[SMB2_LOGOFF_HE])); 316 atomic_read(&failed[SMB2_LOGOFF_HE]));
@@ -645,6 +663,7 @@ struct smb_version_operations smb30_operations = {
645 .dump_detail = smb2_dump_detail, 663 .dump_detail = smb2_dump_detail,
646 .clear_stats = smb2_clear_stats, 664 .clear_stats = smb2_clear_stats,
647 .print_stats = smb2_print_stats, 665 .print_stats = smb2_print_stats,
666 .dump_share_caps = smb2_dump_share_caps,
648 .is_oplock_break = smb2_is_valid_oplock_break, 667 .is_oplock_break = smb2_is_valid_oplock_break,
649 .need_neg = smb2_need_neg, 668 .need_neg = smb2_need_neg,
650 .negotiate = smb2_negotiate, 669 .negotiate = smb2_negotiate,
@@ -690,6 +709,7 @@ struct smb_version_operations smb30_operations = {
690 .get_lease_key = smb2_get_lease_key, 709 .get_lease_key = smb2_get_lease_key,
691 .set_lease_key = smb2_set_lease_key, 710 .set_lease_key = smb2_set_lease_key,
692 .new_lease_key = smb2_new_lease_key, 711 .new_lease_key = smb2_new_lease_key,
712 .generate_signingkey = generate_smb3signingkey,
693 .calc_signature = smb3_calc_signature, 713 .calc_signature = smb3_calc_signature,
694}; 714};
695 715
@@ -709,6 +729,8 @@ struct smb_version_values smb20_values = {
709 .cap_nt_find = SMB2_NT_FIND, 729 .cap_nt_find = SMB2_NT_FIND,
710 .cap_large_files = SMB2_LARGE_FILES, 730 .cap_large_files = SMB2_LARGE_FILES,
711 .oplock_read = SMB2_OPLOCK_LEVEL_II, 731 .oplock_read = SMB2_OPLOCK_LEVEL_II,
732 .signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED,
733 .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED,
712}; 734};
713 735
714struct smb_version_values smb21_values = { 736struct smb_version_values smb21_values = {
@@ -727,6 +749,8 @@ struct smb_version_values smb21_values = {
727 .cap_nt_find = SMB2_NT_FIND, 749 .cap_nt_find = SMB2_NT_FIND,
728 .cap_large_files = SMB2_LARGE_FILES, 750 .cap_large_files = SMB2_LARGE_FILES,
729 .oplock_read = SMB2_OPLOCK_LEVEL_II, 751 .oplock_read = SMB2_OPLOCK_LEVEL_II,
752 .signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED,
753 .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED,
730}; 754};
731 755
732struct smb_version_values smb30_values = { 756struct smb_version_values smb30_values = {
@@ -745,4 +769,26 @@ struct smb_version_values smb30_values = {
745 .cap_nt_find = SMB2_NT_FIND, 769 .cap_nt_find = SMB2_NT_FIND,
746 .cap_large_files = SMB2_LARGE_FILES, 770 .cap_large_files = SMB2_LARGE_FILES,
747 .oplock_read = SMB2_OPLOCK_LEVEL_II, 771 .oplock_read = SMB2_OPLOCK_LEVEL_II,
772 .signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED,
773 .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED,
774};
775
776struct smb_version_values smb302_values = {
777 .version_string = SMB302_VERSION_STRING,
778 .protocol_id = SMB302_PROT_ID,
779 .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU,
780 .large_lock_type = 0,
781 .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
782 .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
783 .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
784 .header_size = sizeof(struct smb2_hdr),
785 .max_header_size = MAX_SMB2_HDR_SIZE,
786 .read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
787 .lock_cmd = SMB2_LOCK,
788 .cap_unix = 0,
789 .cap_nt_find = SMB2_NT_FIND,
790 .cap_large_files = SMB2_LARGE_FILES,
791 .oplock_read = SMB2_OPLOCK_LEVEL_II,
792 .signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED,
793 .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED,
748}; 794};
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 2b95ce2b54e8..2b312e4eeaa6 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * fs/cifs/smb2pdu.c 2 * fs/cifs/smb2pdu.c
3 * 3 *
4 * Copyright (C) International Business Machines Corp., 2009, 2012 4 * Copyright (C) International Business Machines Corp., 2009, 2013
5 * Etersoft, 2012 5 * Etersoft, 2012
6 * Author(s): Steve French (sfrench@us.ibm.com) 6 * Author(s): Steve French (sfrench@us.ibm.com)
7 * Pavel Shilovsky (pshilovsky@samba.org) 2012 7 * Pavel Shilovsky (pshilovsky@samba.org) 2012
@@ -108,19 +108,33 @@ smb2_hdr_assemble(struct smb2_hdr *hdr, __le16 smb2_cmd /* command */ ,
108 if (!tcon) 108 if (!tcon)
109 goto out; 109 goto out;
110 110
111 /* BB FIXME when we do write > 64K add +1 for every 64K in req or rsp */
112 /* GLOBAL_CAP_LARGE_MTU will only be set if dialect > SMB2.02 */
113 /* See sections 2.2.4 and 3.2.4.1.5 of MS-SMB2 */
114 if ((tcon->ses) &&
115 (tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU))
116 hdr->CreditCharge = cpu_to_le16(1);
117 /* else CreditCharge MBZ */
118
111 hdr->TreeId = tcon->tid; 119 hdr->TreeId = tcon->tid;
112 /* Uid is not converted */ 120 /* Uid is not converted */
113 if (tcon->ses) 121 if (tcon->ses)
114 hdr->SessionId = tcon->ses->Suid; 122 hdr->SessionId = tcon->ses->Suid;
115 /* BB check following DFS flags BB */ 123
116 /* BB do we have to add check for SHI1005_FLAGS_DFS_ROOT too? */ 124 /*
117 if (tcon->share_flags & SHI1005_FLAGS_DFS) 125 * If we would set SMB2_FLAGS_DFS_OPERATIONS on open we also would have
118 hdr->Flags |= SMB2_FLAGS_DFS_OPERATIONS; 126 * to pass the path on the Open SMB prefixed by \\server\share.
119 /* BB how does SMB2 do case sensitive? */ 127 * Not sure when we would need to do the augmented path (if ever) and
120 /* if (tcon->nocase) 128 * setting this flag breaks the SMB2 open operation since it is
121 hdr->Flags |= SMBFLG_CASELESS; */ 129 * illegal to send an empty path name (without \\server\share prefix)
122 if (tcon->ses && tcon->ses->server && 130 * when the DFS flag is set in the SMB open header. We could
123 (tcon->ses->server->sec_mode & SECMODE_SIGN_REQUIRED)) 131 * consider setting the flag on all operations other than open
132 * but it is safer to net set it for now.
133 */
134/* if (tcon->share_flags & SHI1005_FLAGS_DFS)
135 hdr->Flags |= SMB2_FLAGS_DFS_OPERATIONS; */
136
137 if (tcon->ses && tcon->ses->server && tcon->ses->server->sign)
124 hdr->Flags |= SMB2_FLAGS_SIGNED; 138 hdr->Flags |= SMB2_FLAGS_SIGNED;
125out: 139out:
126 pdu->StructureSize2 = cpu_to_le16(parmsize); 140 pdu->StructureSize2 = cpu_to_le16(parmsize);
@@ -328,34 +342,22 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
328 struct kvec iov[1]; 342 struct kvec iov[1];
329 int rc = 0; 343 int rc = 0;
330 int resp_buftype; 344 int resp_buftype;
331 struct TCP_Server_Info *server; 345 struct TCP_Server_Info *server = ses->server;
332 unsigned int sec_flags;
333 u16 temp = 0;
334 int blob_offset, blob_length; 346 int blob_offset, blob_length;
335 char *security_blob; 347 char *security_blob;
336 int flags = CIFS_NEG_OP; 348 int flags = CIFS_NEG_OP;
337 349
338 cifs_dbg(FYI, "Negotiate protocol\n"); 350 cifs_dbg(FYI, "Negotiate protocol\n");
339 351
340 if (ses->server) 352 if (!server) {
341 server = ses->server; 353 WARN(1, "%s: server is NULL!\n", __func__);
342 else { 354 return -EIO;
343 rc = -EIO;
344 return rc;
345 } 355 }
346 356
347 rc = small_smb2_init(SMB2_NEGOTIATE, NULL, (void **) &req); 357 rc = small_smb2_init(SMB2_NEGOTIATE, NULL, (void **) &req);
348 if (rc) 358 if (rc)
349 return rc; 359 return rc;
350 360
351 /* if any of auth flags (ie not sign or seal) are overriden use them */
352 if (ses->overrideSecFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL)))
353 sec_flags = ses->overrideSecFlg; /* BB FIXME fix sign flags?*/
354 else /* if override flags set only sign/seal OR them with global auth */
355 sec_flags = global_secflags | ses->overrideSecFlg;
356
357 cifs_dbg(FYI, "sec_flags 0x%x\n", sec_flags);
358
359 req->hdr.SessionId = 0; 361 req->hdr.SessionId = 0;
360 362
361 req->Dialects[0] = cpu_to_le16(ses->server->vals->protocol_id); 363 req->Dialects[0] = cpu_to_le16(ses->server->vals->protocol_id);
@@ -364,12 +366,12 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
364 inc_rfc1001_len(req, 2); 366 inc_rfc1001_len(req, 2);
365 367
366 /* only one of SMB2 signing flags may be set in SMB2 request */ 368 /* only one of SMB2 signing flags may be set in SMB2 request */
367 if ((sec_flags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) 369 if (ses->sign)
368 temp = SMB2_NEGOTIATE_SIGNING_REQUIRED; 370 req->SecurityMode = cpu_to_le16(SMB2_NEGOTIATE_SIGNING_REQUIRED);
369 else if (sec_flags & CIFSSEC_MAY_SIGN) /* MAY_SIGN is a single flag */ 371 else if (global_secflags & CIFSSEC_MAY_SIGN)
370 temp = SMB2_NEGOTIATE_SIGNING_ENABLED; 372 req->SecurityMode = cpu_to_le16(SMB2_NEGOTIATE_SIGNING_ENABLED);
371 373 else
372 req->SecurityMode = cpu_to_le16(temp); 374 req->SecurityMode = 0;
373 375
374 req->Capabilities = cpu_to_le32(ses->server->vals->req_capabilities); 376 req->Capabilities = cpu_to_le32(ses->server->vals->req_capabilities);
375 377
@@ -399,6 +401,8 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
399 cifs_dbg(FYI, "negotiated smb2.1 dialect\n"); 401 cifs_dbg(FYI, "negotiated smb2.1 dialect\n");
400 else if (rsp->DialectRevision == cpu_to_le16(SMB30_PROT_ID)) 402 else if (rsp->DialectRevision == cpu_to_le16(SMB30_PROT_ID))
401 cifs_dbg(FYI, "negotiated smb3.0 dialect\n"); 403 cifs_dbg(FYI, "negotiated smb3.0 dialect\n");
404 else if (rsp->DialectRevision == cpu_to_le16(SMB302_PROT_ID))
405 cifs_dbg(FYI, "negotiated smb3.02 dialect\n");
402 else { 406 else {
403 cifs_dbg(VFS, "Illegal dialect returned by server %d\n", 407 cifs_dbg(VFS, "Illegal dialect returned by server %d\n",
404 le16_to_cpu(rsp->DialectRevision)); 408 le16_to_cpu(rsp->DialectRevision));
@@ -407,6 +411,8 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
407 } 411 }
408 server->dialect = le16_to_cpu(rsp->DialectRevision); 412 server->dialect = le16_to_cpu(rsp->DialectRevision);
409 413
414 /* SMB2 only has an extended negflavor */
415 server->negflavor = CIFS_NEGFLAVOR_EXTENDED;
410 server->maxBuf = le32_to_cpu(rsp->MaxTransactSize); 416 server->maxBuf = le32_to_cpu(rsp->MaxTransactSize);
411 server->max_read = le32_to_cpu(rsp->MaxReadSize); 417 server->max_read = le32_to_cpu(rsp->MaxReadSize);
412 server->max_write = le32_to_cpu(rsp->MaxWriteSize); 418 server->max_write = le32_to_cpu(rsp->MaxWriteSize);
@@ -418,44 +424,22 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
418 424
419 security_blob = smb2_get_data_area_len(&blob_offset, &blob_length, 425 security_blob = smb2_get_data_area_len(&blob_offset, &blob_length,
420 &rsp->hdr); 426 &rsp->hdr);
421 if (blob_length == 0) { 427 /*
422 cifs_dbg(VFS, "missing security blob on negprot\n"); 428 * See MS-SMB2 section 2.2.4: if no blob, client picks default which
423 rc = -EIO; 429 * for us will be
424 goto neg_exit; 430 * ses->sectype = RawNTLMSSP;
425 } 431 * but for time being this is our only auth choice so doesn't matter.
426 432 * We just found a server which sets blob length to zero expecting raw.
427 cifs_dbg(FYI, "sec_flags 0x%x\n", sec_flags); 433 */
428 if ((sec_flags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) { 434 if (blob_length == 0)
429 cifs_dbg(FYI, "Signing required\n"); 435 cifs_dbg(FYI, "missing security blob on negprot\n");
430 if (!(server->sec_mode & (SMB2_NEGOTIATE_SIGNING_REQUIRED |
431 SMB2_NEGOTIATE_SIGNING_ENABLED))) {
432 cifs_dbg(VFS, "signing required but server lacks support\n");
433 rc = -EOPNOTSUPP;
434 goto neg_exit;
435 }
436 server->sec_mode |= SECMODE_SIGN_REQUIRED;
437 } else if (sec_flags & CIFSSEC_MAY_SIGN) {
438 cifs_dbg(FYI, "Signing optional\n");
439 if (server->sec_mode & SMB2_NEGOTIATE_SIGNING_REQUIRED) {
440 cifs_dbg(FYI, "Server requires signing\n");
441 server->sec_mode |= SECMODE_SIGN_REQUIRED;
442 } else {
443 server->sec_mode &=
444 ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED);
445 }
446 } else {
447 cifs_dbg(FYI, "Signing disabled\n");
448 if (server->sec_mode & SMB2_NEGOTIATE_SIGNING_REQUIRED) {
449 cifs_dbg(VFS, "Server requires packet signing to be enabled in /proc/fs/cifs/SecurityFlags\n");
450 rc = -EOPNOTSUPP;
451 goto neg_exit;
452 }
453 server->sec_mode &=
454 ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED);
455 }
456 436
437 rc = cifs_enable_signing(server, ses->sign);
457#ifdef CONFIG_SMB2_ASN1 /* BB REMOVEME when updated asn1.c ready */ 438#ifdef CONFIG_SMB2_ASN1 /* BB REMOVEME when updated asn1.c ready */
458 rc = decode_neg_token_init(security_blob, blob_length, 439 if (rc)
440 goto neg_exit;
441 if (blob_length)
442 rc = decode_neg_token_init(security_blob, blob_length,
459 &server->sec_type); 443 &server->sec_type);
460 if (rc == 1) 444 if (rc == 1)
461 rc = 0; 445 rc = 0;
@@ -480,9 +464,7 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses,
480 int rc = 0; 464 int rc = 0;
481 int resp_buftype; 465 int resp_buftype;
482 __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */ 466 __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */
483 struct TCP_Server_Info *server; 467 struct TCP_Server_Info *server = ses->server;
484 unsigned int sec_flags;
485 u8 temp = 0;
486 u16 blob_length = 0; 468 u16 blob_length = 0;
487 char *security_blob; 469 char *security_blob;
488 char *ntlmssp_blob = NULL; 470 char *ntlmssp_blob = NULL;
@@ -490,11 +472,9 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses,
490 472
491 cifs_dbg(FYI, "Session Setup\n"); 473 cifs_dbg(FYI, "Session Setup\n");
492 474
493 if (ses->server) 475 if (!server) {
494 server = ses->server; 476 WARN(1, "%s: server is NULL!\n", __func__);
495 else { 477 return -EIO;
496 rc = -EIO;
497 return rc;
498 } 478 }
499 479
500 /* 480 /*
@@ -505,7 +485,8 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses,
505 if (!ses->ntlmssp) 485 if (!ses->ntlmssp)
506 return -ENOMEM; 486 return -ENOMEM;
507 487
508 ses->server->secType = RawNTLMSSP; 488 /* FIXME: allow for other auth types besides NTLMSSP (e.g. krb5) */
489 ses->sectype = RawNTLMSSP;
509 490
510ssetup_ntlmssp_authenticate: 491ssetup_ntlmssp_authenticate:
511 if (phase == NtLmChallenge) 492 if (phase == NtLmChallenge)
@@ -515,28 +496,19 @@ ssetup_ntlmssp_authenticate:
515 if (rc) 496 if (rc)
516 return rc; 497 return rc;
517 498
518 /* if any of auth flags (ie not sign or seal) are overriden use them */
519 if (ses->overrideSecFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL)))
520 sec_flags = ses->overrideSecFlg; /* BB FIXME fix sign flags?*/
521 else /* if override flags set only sign/seal OR them with global auth */
522 sec_flags = global_secflags | ses->overrideSecFlg;
523
524 cifs_dbg(FYI, "sec_flags 0x%x\n", sec_flags);
525
526 req->hdr.SessionId = 0; /* First session, not a reauthenticate */ 499 req->hdr.SessionId = 0; /* First session, not a reauthenticate */
527 req->VcNumber = 0; /* MBZ */ 500 req->VcNumber = 0; /* MBZ */
528 /* to enable echos and oplocks */ 501 /* to enable echos and oplocks */
529 req->hdr.CreditRequest = cpu_to_le16(3); 502 req->hdr.CreditRequest = cpu_to_le16(3);
530 503
531 /* only one of SMB2 signing flags may be set in SMB2 request */ 504 /* only one of SMB2 signing flags may be set in SMB2 request */
532 if ((sec_flags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) 505 if (server->sign)
533 temp = SMB2_NEGOTIATE_SIGNING_REQUIRED; 506 req->SecurityMode = SMB2_NEGOTIATE_SIGNING_REQUIRED;
534 else if (ses->server->sec_mode & SMB2_NEGOTIATE_SIGNING_REQUIRED) 507 else if (global_secflags & CIFSSEC_MAY_SIGN) /* one flag unlike MUST_ */
535 temp = SMB2_NEGOTIATE_SIGNING_REQUIRED; 508 req->SecurityMode = SMB2_NEGOTIATE_SIGNING_ENABLED;
536 else if (sec_flags & CIFSSEC_MAY_SIGN) /* MAY_SIGN is a single flag */ 509 else
537 temp = SMB2_NEGOTIATE_SIGNING_ENABLED; 510 req->SecurityMode = 0;
538 511
539 req->SecurityMode = temp;
540 req->Capabilities = 0; 512 req->Capabilities = 0;
541 req->Channel = 0; /* MBZ */ 513 req->Channel = 0; /* MBZ */
542 514
@@ -679,7 +651,7 @@ SMB2_logoff(const unsigned int xid, struct cifs_ses *ses)
679 651
680 /* since no tcon, smb2_init can not do this, so do here */ 652 /* since no tcon, smb2_init can not do this, so do here */
681 req->hdr.SessionId = ses->Suid; 653 req->hdr.SessionId = ses->Suid;
682 if (server->sec_mode & SECMODE_SIGN_REQUIRED) 654 if (server->sign)
683 req->hdr.Flags |= SMB2_FLAGS_SIGNED; 655 req->hdr.Flags |= SMB2_FLAGS_SIGNED;
684 656
685 rc = SendReceiveNoRsp(xid, ses, (char *) &req->hdr, 0); 657 rc = SendReceiveNoRsp(xid, ses, (char *) &req->hdr, 0);
@@ -788,11 +760,12 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
788 } 760 }
789 761
790 tcon->share_flags = le32_to_cpu(rsp->ShareFlags); 762 tcon->share_flags = le32_to_cpu(rsp->ShareFlags);
763 tcon->capabilities = rsp->Capabilities; /* we keep caps little endian */
791 tcon->maximal_access = le32_to_cpu(rsp->MaximalAccess); 764 tcon->maximal_access = le32_to_cpu(rsp->MaximalAccess);
792 tcon->tidStatus = CifsGood; 765 tcon->tidStatus = CifsGood;
793 tcon->need_reconnect = false; 766 tcon->need_reconnect = false;
794 tcon->tid = rsp->hdr.TreeId; 767 tcon->tid = rsp->hdr.TreeId;
795 strncpy(tcon->treeName, tree, MAX_TREE_SIZE); 768 strlcpy(tcon->treeName, tree, sizeof(tcon->treeName));
796 769
797 if ((rsp->Capabilities & SMB2_SHARE_CAP_DFS) && 770 if ((rsp->Capabilities & SMB2_SHARE_CAP_DFS) &&
798 ((tcon->share_flags & SHI1005_FLAGS_DFS) == 0)) 771 ((tcon->share_flags & SHI1005_FLAGS_DFS) == 0))
@@ -1036,6 +1009,122 @@ creat_exit:
1036 return rc; 1009 return rc;
1037} 1010}
1038 1011
1012/*
1013 * SMB2 IOCTL is used for both IOCTLs and FSCTLs
1014 */
1015int
1016SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
1017 u64 volatile_fid, u32 opcode, bool is_fsctl, char *in_data,
1018 u32 indatalen, char **out_data, u32 *plen /* returned data len */)
1019{
1020 struct smb2_ioctl_req *req;
1021 struct smb2_ioctl_rsp *rsp;
1022 struct TCP_Server_Info *server;
1023 struct cifs_ses *ses = tcon->ses;
1024 struct kvec iov[2];
1025 int resp_buftype;
1026 int num_iovecs;
1027 int rc = 0;
1028
1029 cifs_dbg(FYI, "SMB2 IOCTL\n");
1030
1031 /* zero out returned data len, in case of error */
1032 if (plen)
1033 *plen = 0;
1034
1035 if (ses && (ses->server))
1036 server = ses->server;
1037 else
1038 return -EIO;
1039
1040 rc = small_smb2_init(SMB2_IOCTL, tcon, (void **) &req);
1041 if (rc)
1042 return rc;
1043
1044 req->CtlCode = cpu_to_le32(opcode);
1045 req->PersistentFileId = persistent_fid;
1046 req->VolatileFileId = volatile_fid;
1047
1048 if (indatalen) {
1049 req->InputCount = cpu_to_le32(indatalen);
1050 /* do not set InputOffset if no input data */
1051 req->InputOffset =
1052 cpu_to_le32(offsetof(struct smb2_ioctl_req, Buffer) - 4);
1053 iov[1].iov_base = in_data;
1054 iov[1].iov_len = indatalen;
1055 num_iovecs = 2;
1056 } else
1057 num_iovecs = 1;
1058
1059 req->OutputOffset = 0;
1060 req->OutputCount = 0; /* MBZ */
1061
1062 /*
1063 * Could increase MaxOutputResponse, but that would require more
1064 * than one credit. Windows typically sets this smaller, but for some
1065 * ioctls it may be useful to allow server to send more. No point
1066 * limiting what the server can send as long as fits in one credit
1067 */
1068 req->MaxOutputResponse = cpu_to_le32(0xFF00); /* < 64K uses 1 credit */
1069
1070 if (is_fsctl)
1071 req->Flags = cpu_to_le32(SMB2_0_IOCTL_IS_FSCTL);
1072 else
1073 req->Flags = 0;
1074
1075 iov[0].iov_base = (char *)req;
1076 /* 4 for rfc1002 length field */
1077 iov[0].iov_len = get_rfc1002_length(req) + 4;
1078
1079 if (indatalen)
1080 inc_rfc1001_len(req, indatalen);
1081
1082 rc = SendReceive2(xid, ses, iov, num_iovecs, &resp_buftype, 0);
1083 rsp = (struct smb2_ioctl_rsp *)iov[0].iov_base;
1084
1085 if (rc != 0) {
1086 if (tcon)
1087 cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE);
1088 goto ioctl_exit;
1089 }
1090
1091 /* check if caller wants to look at return data or just return rc */
1092 if ((plen == NULL) || (out_data == NULL))
1093 goto ioctl_exit;
1094
1095 *plen = le32_to_cpu(rsp->OutputCount);
1096
1097 /* We check for obvious errors in the output buffer length and offset */
1098 if (*plen == 0)
1099 goto ioctl_exit; /* server returned no data */
1100 else if (*plen > 0xFF00) {
1101 cifs_dbg(VFS, "srv returned invalid ioctl length: %d\n", *plen);
1102 *plen = 0;
1103 rc = -EIO;
1104 goto ioctl_exit;
1105 }
1106
1107 if (get_rfc1002_length(rsp) < le32_to_cpu(rsp->OutputOffset) + *plen) {
1108 cifs_dbg(VFS, "Malformed ioctl resp: len %d offset %d\n", *plen,
1109 le32_to_cpu(rsp->OutputOffset));
1110 *plen = 0;
1111 rc = -EIO;
1112 goto ioctl_exit;
1113 }
1114
1115 *out_data = kmalloc(*plen, GFP_KERNEL);
1116 if (*out_data == NULL) {
1117 rc = -ENOMEM;
1118 goto ioctl_exit;
1119 }
1120
1121 memcpy(*out_data, rsp->hdr.ProtocolId + le32_to_cpu(rsp->OutputOffset),
1122 *plen);
1123ioctl_exit:
1124 free_rsp_buf(resp_buftype, rsp);
1125 return rc;
1126}
1127
1039int 1128int
1040SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, 1129SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
1041 u64 persistent_fid, u64 volatile_fid) 1130 u64 persistent_fid, u64 volatile_fid)
@@ -1384,8 +1473,7 @@ smb2_readv_callback(struct mid_q_entry *mid)
1384 case MID_RESPONSE_RECEIVED: 1473 case MID_RESPONSE_RECEIVED:
1385 credits_received = le16_to_cpu(buf->CreditRequest); 1474 credits_received = le16_to_cpu(buf->CreditRequest);
1386 /* result already set, check signature */ 1475 /* result already set, check signature */
1387 if (server->sec_mode & 1476 if (server->sign) {
1388 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
1389 int rc; 1477 int rc;
1390 1478
1391 rc = smb2_verify_signature(&rqst, server); 1479 rc = smb2_verify_signature(&rqst, server);
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index 4cb4ced258cb..f31043b26bd3 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -1,7 +1,7 @@
1/* 1/*
2 * fs/cifs/smb2pdu.h 2 * fs/cifs/smb2pdu.h
3 * 3 *
4 * Copyright (c) International Business Machines Corp., 2009, 2010 4 * Copyright (c) International Business Machines Corp., 2009, 2013
5 * Etersoft, 2012 5 * Etersoft, 2012
6 * Author(s): Steve French (sfrench@us.ibm.com) 6 * Author(s): Steve French (sfrench@us.ibm.com)
7 * Pavel Shilovsky (pshilovsky@samba.org) 2012 7 * Pavel Shilovsky (pshilovsky@samba.org) 2012
@@ -170,6 +170,7 @@ struct smb2_negotiate_req {
170#define SMB20_PROT_ID 0x0202 170#define SMB20_PROT_ID 0x0202
171#define SMB21_PROT_ID 0x0210 171#define SMB21_PROT_ID 0x0210
172#define SMB30_PROT_ID 0x0300 172#define SMB30_PROT_ID 0x0300
173#define SMB302_PROT_ID 0x0302
173#define BAD_PROT_ID 0xFFFF 174#define BAD_PROT_ID 0xFFFF
174 175
175/* SecurityMode flags */ 176/* SecurityMode flags */
@@ -283,10 +284,17 @@ struct smb2_tree_connect_rsp {
283#define SHI1005_FLAGS_ALLOW_NAMESPACE_CACHING 0x00000400 284#define SHI1005_FLAGS_ALLOW_NAMESPACE_CACHING 0x00000400
284#define SHI1005_FLAGS_ACCESS_BASED_DIRECTORY_ENUM 0x00000800 285#define SHI1005_FLAGS_ACCESS_BASED_DIRECTORY_ENUM 0x00000800
285#define SHI1005_FLAGS_FORCE_LEVELII_OPLOCK 0x00001000 286#define SHI1005_FLAGS_FORCE_LEVELII_OPLOCK 0x00001000
286#define SHI1005_FLAGS_ENABLE_HASH 0x00002000 287#define SHI1005_FLAGS_ENABLE_HASH_V1 0x00002000
288#define SHI1005_FLAGS_ENABLE_HASH_V2 0x00004000
289#define SHI1005_FLAGS_ENCRYPT_DATA 0x00008000
290#define SHI1005_FLAGS_ALL 0x0000FF33
287 291
288/* Possible share capabilities */ 292/* Possible share capabilities */
289#define SMB2_SHARE_CAP_DFS cpu_to_le32(0x00000008) 293#define SMB2_SHARE_CAP_DFS cpu_to_le32(0x00000008) /* all dialects */
294#define SMB2_SHARE_CAP_CONTINUOUS_AVAILABILITY cpu_to_le32(0x00000010) /* 3.0 */
295#define SMB2_SHARE_CAP_SCALEOUT cpu_to_le32(0x00000020) /* 3.0 */
296#define SMB2_SHARE_CAP_CLUSTER cpu_to_le32(0x00000040) /* 3.0 */
297#define SMB2_SHARE_CAP_ASYMMETRIC cpu_to_le32(0x00000080) /* 3.02 */
290 298
291struct smb2_tree_disconnect_req { 299struct smb2_tree_disconnect_req {
292 struct smb2_hdr hdr; 300 struct smb2_hdr hdr;
@@ -477,6 +485,75 @@ struct create_lease {
477 struct lease_context lcontext; 485 struct lease_context lcontext;
478} __packed; 486} __packed;
479 487
/*
 * Packed, fixed-size input payload for a copychunk ioctl. Only one chunk
 * descriptor (SourceOffset/TargetOffset/Length/Reserved2) is laid out after
 * the 32-byte prefix, so ChunkCount is expected to be 1.
 */
488/* this goes in the ioctl buffer when doing a copychunk request */
489struct copychunk_ioctl {
490 char SourceKey[24]; /* NOTE(review): presumably the key returned by FSCTL_SRV_REQUEST_RESUME_KEY - confirm */
491 __le32 ChunkCount; /* we are only sending 1 */
492 __le32 Reserved;
493 /* array will only be one chunk long for us */
494 __le64 SourceOffset;
495 __le64 TargetOffset;
496 __le32 Length; /* how many bytes to copy */
497 __u32 Reserved2;
498} __packed;
499
/*
 * Packed payload for the validate-negotiate ioctl.
 *
 * NOTE(review): MS-SMB2 describes the VALIDATE_NEGOTIATE_INFO *response* as
 * Capabilities, Guid, SecurityMode and a single Dialect, with no DialectCount
 * field. Verify against the spec before using this layout to parse a
 * response.
 */
500/* Response and Request are the same format */
501struct validate_negotiate_info {
502 __le32 Capabilities;
503 __u8 Guid[SMB2_CLIENT_GUID_SIZE];
504 __le16 SecurityMode;
505 __le16 DialectCount; /* presumably the number of entries in Dialect[] - confirm */
506 __le16 Dialect[1]; /* one-element array; sizeof() includes one dialect */
507} __packed;
508
/* Bit values for the Capability field of network_interface_info_ioctl_rsp */
509#define RSS_CAPABLE 0x00000001
510#define RDMA_CAPABLE 0x00000002
511
/*
 * One packed entry describing a network interface. Entries are chained
 * through Next, which is zero in the last entry.
 * NOTE(review): presumably the output of FSCTL_QUERY_NETWORK_INTERFACE_INFO;
 * confirm, and confirm what Next is measured from.
 */
512struct network_interface_info_ioctl_rsp {
513 __le32 Next; /* next interface. zero if this is last one */
514 __le32 IfIndex;
515 __le32 Capability; /* RSS or RDMA Capable */
516 __le32 Reserved;
517 __le64 LinkSpeed;
518 char SockAddr_Storage[128]; /* raw 128-byte socket address storage */
519} __packed;
520
/* File id value (all bits set) for ioctls aimed at the server, not a file */
521#define NO_FILE_ID 0xFFFFFFFFFFFFFFFFULL /* general ioctls to srv not to file */
522
/*
 * Packed SMB2 ioctl request. The fields after hdr occupy 56 bytes; the
 * StructureSize of 57 is that plus one byte of Buffer.
 */
523struct smb2_ioctl_req {
524 struct smb2_hdr hdr;
525 __le16 StructureSize; /* Must be 57 */
526 __u16 Reserved;
527 __le32 CtlCode;
528 __u64 PersistentFileId; /* opaque endianness */
529 __u64 VolatileFileId; /* opaque endianness */
530 __le32 InputOffset;
531 __le32 InputCount;
532 __le32 MaxInputResponse;
533 __le32 OutputOffset;
534 __le32 OutputCount;
535 __le32 MaxOutputResponse;
536 __le32 Flags; /* SMB2_0_IOCTL_IS_FSCTL marks an FSCTL rather than an ioctl */
537 __u32 Reserved2;
538 char Buffer[0]; /* zero-length array marking the end of the fixed part */
539} __packed;
540
/*
 * Packed SMB2 ioctl response. The fields after hdr occupy 48 bytes; the
 * data buffer that follows is not declared as a member.
 */
541struct smb2_ioctl_rsp {
542 struct smb2_hdr hdr;
543 __le16 StructureSize; /* Must be 49 */
544 __u16 Reserved;
545 __le32 CtlCode;
546 __u64 PersistentFileId; /* opaque endianness */
547 __u64 VolatileFileId; /* opaque endianness */
548 __le32 InputOffset;
549 __le32 InputCount;
550 __le32 OutputOffset;
551 __le32 OutputCount;
552 __le32 Flags;
553 __u32 Reserved2;
554 /* char * buffer[] */
555} __packed;
556
480/* Currently defined values for close flags */ 557/* Currently defined values for close flags */
481#define SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB cpu_to_le16(0x0001) 558#define SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB cpu_to_le16(0x0001)
482struct smb2_close_req { 559struct smb2_close_req {
@@ -517,17 +594,25 @@ struct smb2_flush_rsp {
517 __le16 Reserved; 594 __le16 Reserved;
518} __packed; 595} __packed;
519 596
/* For read request Flags field below, following flag is defined for SMB3.02 */
#define SMB2_READFLAG_READ_UNBUFFERED	0x01

/*
 * Channel field for read and write: exactly one of the following values can
 * be set, so each value must be distinct. RDMA_V1_INVALIDATE is 0x00000002;
 * it must not alias RDMA_V1.
 */
#define SMB2_CHANNEL_NONE		0x00000000
#define SMB2_CHANNEL_RDMA_V1		0x00000001 /* SMB3 or later */
#define SMB2_CHANNEL_RDMA_V1_INVALIDATE	0x00000002 /* SMB3.02 or later */
604
520struct smb2_read_req { 605struct smb2_read_req {
521 struct smb2_hdr hdr; 606 struct smb2_hdr hdr;
522 __le16 StructureSize; /* Must be 49 */ 607 __le16 StructureSize; /* Must be 49 */
523 __u8 Padding; /* offset from start of SMB2 header to place read */ 608 __u8 Padding; /* offset from start of SMB2 header to place read */
524 __u8 Reserved; 609 __u8 Flags; /* MBZ unless SMB3.02 or later */
525 __le32 Length; 610 __le32 Length;
526 __le64 Offset; 611 __le64 Offset;
527 __u64 PersistentFileId; /* opaque endianness */ 612 __u64 PersistentFileId; /* opaque endianness */
528 __u64 VolatileFileId; /* opaque endianness */ 613 __u64 VolatileFileId; /* opaque endianness */
529 __le32 MinimumCount; 614 __le32 MinimumCount;
530 __le32 Channel; /* Reserved MBZ */ 615 __le32 Channel; /* MBZ except for SMB3 or later */
531 __le32 RemainingBytes; 616 __le32 RemainingBytes;
532 __le16 ReadChannelInfoOffset; /* Reserved MBZ */ 617 __le16 ReadChannelInfoOffset; /* Reserved MBZ */
533 __le16 ReadChannelInfoLength; /* Reserved MBZ */ 618 __le16 ReadChannelInfoLength; /* Reserved MBZ */
@@ -545,8 +630,9 @@ struct smb2_read_rsp {
545 __u8 Buffer[1]; 630 __u8 Buffer[1];
546} __packed; 631} __packed;
547 632
548/* For write request Flags field below the following flag is defined: */ 633/* For write request Flags field below the following flags are defined: */
549#define SMB2_WRITEFLAG_WRITE_THROUGH 0x00000001 634#define SMB2_WRITEFLAG_WRITE_THROUGH 0x00000001 /* SMB2.1 or later */
635#define SMB2_WRITEFLAG_WRITE_UNBUFFERED 0x00000002 /* SMB3.02 or later */
550 636
551struct smb2_write_req { 637struct smb2_write_req {
552 struct smb2_hdr hdr; 638 struct smb2_hdr hdr;
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 2aa3535e38ce..d4e1eb807457 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -111,6 +111,10 @@ extern int SMB2_open(const unsigned int xid, struct cifs_tcon *tcon,
111 __u32 desired_access, __u32 create_disposition, 111 __u32 desired_access, __u32 create_disposition,
112 __u32 file_attributes, __u32 create_options, 112 __u32 file_attributes, __u32 create_options,
113 __u8 *oplock, struct smb2_file_all_info *buf); 113 __u8 *oplock, struct smb2_file_all_info *buf);
114extern int SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon,
115 u64 persistent_fid, u64 volatile_fid, u32 opcode,
116 bool is_fsctl, char *in_data, u32 indatalen,
117 char **out_data, u32 *plen /* returned data len */);
114extern int SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, 118extern int SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
115 u64 persistent_file_id, u64 volatile_file_id); 119 u64 persistent_file_id, u64 volatile_file_id);
116extern int SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, 120extern int SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon,
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 01f0ac800780..09b4fbaadeb6 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -116,11 +116,155 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
116 return rc; 116 return rc;
117} 117}
118 118
119void
120generate_smb3signingkey(struct TCP_Server_Info *server)
121{
122 unsigned char zero = 0x0;
123 __u8 i[4] = {0, 0, 0, 1};
124 __u8 L[4] = {0, 0, 0, 128};
125 int rc = 0;
126 unsigned char prfhash[SMB2_HMACSHA256_SIZE];
127 unsigned char *hashptr = prfhash;
128
129 memset(prfhash, 0x0, SMB2_HMACSHA256_SIZE);
130 memset(server->smb3signingkey, 0x0, SMB3_SIGNKEY_SIZE);
131
132 rc = crypto_shash_setkey(server->secmech.hmacsha256,
133 server->session_key.response, SMB2_NTLMV2_SESSKEY_SIZE);
134 if (rc) {
135 cifs_dbg(VFS, "%s: Could not set with session key\n", __func__);
136 goto smb3signkey_ret;
137 }
138
139 rc = crypto_shash_init(&server->secmech.sdeschmacsha256->shash);
140 if (rc) {
141 cifs_dbg(VFS, "%s: Could not init sign hmac\n", __func__);
142 goto smb3signkey_ret;
143 }
144
145 rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
146 i, 4);
147 if (rc) {
148 cifs_dbg(VFS, "%s: Could not update with n\n", __func__);
149 goto smb3signkey_ret;
150 }
151
152 rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
153 "SMB2AESCMAC", 12);
154 if (rc) {
155 cifs_dbg(VFS, "%s: Could not update with label\n", __func__);
156 goto smb3signkey_ret;
157 }
158
159 rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
160 &zero, 1);
161 if (rc) {
162 cifs_dbg(VFS, "%s: Could not update with zero\n", __func__);
163 goto smb3signkey_ret;
164 }
165
166 rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
167 "SmbSign", 8);
168 if (rc) {
169 cifs_dbg(VFS, "%s: Could not update with context\n", __func__);
170 goto smb3signkey_ret;
171 }
172
173 rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
174 L, 4);
175 if (rc) {
176 cifs_dbg(VFS, "%s: Could not update with L\n", __func__);
177 goto smb3signkey_ret;
178 }
179
180 rc = crypto_shash_final(&server->secmech.sdeschmacsha256->shash,
181 hashptr);
182 if (rc) {
183 cifs_dbg(VFS, "%s: Could not generate sha256 hash\n", __func__);
184 goto smb3signkey_ret;
185 }
186
187 memcpy(server->smb3signingkey, hashptr, SMB3_SIGNKEY_SIZE);
188
189smb3signkey_ret:
190 return;
191}
192
119int 193int
120smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) 194smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
121{ 195{
122 cifs_dbg(FYI, "smb3 signatures not supported yet\n"); 196 int i, rc;
123 return -EOPNOTSUPP; 197 unsigned char smb3_signature[SMB2_CMACAES_SIZE];
198 unsigned char *sigptr = smb3_signature;
199 struct kvec *iov = rqst->rq_iov;
200 int n_vec = rqst->rq_nvec;
201 struct smb2_hdr *smb2_pdu = (struct smb2_hdr *)iov[0].iov_base;
202
203 memset(smb3_signature, 0x0, SMB2_CMACAES_SIZE);
204 memset(smb2_pdu->Signature, 0x0, SMB2_SIGNATURE_SIZE);
205
206 rc = crypto_shash_setkey(server->secmech.cmacaes,
207 server->smb3signingkey, SMB2_CMACAES_SIZE);
208 if (rc) {
209 cifs_dbg(VFS, "%s: Could not set key for cmac aes\n", __func__);
210 return rc;
211 }
212
213 rc = crypto_shash_init(&server->secmech.sdesccmacaes->shash);
214 if (rc) {
215 cifs_dbg(VFS, "%s: Could not init cmac aes\n", __func__);
216 return rc;
217 }
218
219 for (i = 0; i < n_vec; i++) {
220 if (iov[i].iov_len == 0)
221 continue;
222 if (iov[i].iov_base == NULL) {
223 cifs_dbg(VFS, "null iovec entry");
224 return -EIO;
225 }
226 /*
227 * The first entry includes a length field (which does not get
228 * signed that occupies the first 4 bytes before the header).
229 */
230 if (i == 0) {
231 if (iov[0].iov_len <= 8) /* cmd field at offset 9 */
232 break; /* nothing to sign or corrupt header */
233 rc =
234 crypto_shash_update(
235 &server->secmech.sdesccmacaes->shash,
236 iov[i].iov_base + 4, iov[i].iov_len - 4);
237 } else {
238 rc =
239 crypto_shash_update(
240 &server->secmech.sdesccmacaes->shash,
241 iov[i].iov_base, iov[i].iov_len);
242 }
243 if (rc) {
244 cifs_dbg(VFS, "%s: Couldn't update cmac aes with payload\n",
245 __func__);
246 return rc;
247 }
248 }
249
250 /* now hash over the rq_pages array */
251 for (i = 0; i < rqst->rq_npages; i++) {
252 struct kvec p_iov;
253
254 cifs_rqst_page_to_kvec(rqst, i, &p_iov);
255 crypto_shash_update(&server->secmech.sdesccmacaes->shash,
256 p_iov.iov_base, p_iov.iov_len);
257 kunmap(rqst->rq_pages[i]);
258 }
259
260 rc = crypto_shash_final(&server->secmech.sdesccmacaes->shash,
261 sigptr);
262 if (rc)
263 cifs_dbg(VFS, "%s: Could not generate cmac aes\n", __func__);
264
265 memcpy(smb2_pdu->Signature, sigptr, SMB2_SIGNATURE_SIZE);
266
267 return rc;
124} 268}
125 269
126/* must be called with server->srv_mutex held */ 270/* must be called with server->srv_mutex held */
@@ -275,8 +419,7 @@ smb2_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server,
275 419
276 dump_smb(mid->resp_buf, min_t(u32, 80, len)); 420 dump_smb(mid->resp_buf, min_t(u32, 80, len));
277 /* convert the length into a more usable form */ 421 /* convert the length into a more usable form */
278 if ((len > 24) && 422 if (len > 24 && server->sign) {
279 (server->sec_mode & (SECMODE_SIGN_REQUIRED|SECMODE_SIGN_ENABLED))) {
280 int rc; 423 int rc;
281 424
282 rc = smb2_verify_signature(&rqst, server); 425 rc = smb2_verify_signature(&rqst, server);
diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h
index 7056b891e087..d952ee48f4dc 100644
--- a/fs/cifs/smbfsctl.h
+++ b/fs/cifs/smbfsctl.h
@@ -1,7 +1,7 @@
1/* 1/*
2 * fs/cifs/smbfsctl.h: SMB, CIFS, SMB2 FSCTL definitions 2 * fs/cifs/smbfsctl.h: SMB, CIFS, SMB2 FSCTL definitions
3 * 3 *
4 * Copyright (c) International Business Machines Corp., 2002,2009 4 * Copyright (c) International Business Machines Corp., 2002,2013
5 * Author(s): Steve French (sfrench@us.ibm.com) 5 * Author(s): Steve French (sfrench@us.ibm.com)
6 * 6 *
7 * This library is free software; you can redistribute it and/or modify 7 * This library is free software; you can redistribute it and/or modify
@@ -22,7 +22,7 @@
22/* IOCTL information */ 22/* IOCTL information */
23/* 23/*
24 * List of ioctl/fsctl function codes that are or could be useful in the 24 * List of ioctl/fsctl function codes that are or could be useful in the
25 * future to remote clients like cifs or SMB2 client. There is probably 25 * future to remote clients like cifs or SMB2/SMB3 client. This is probably
26 * a slightly larger set of fsctls that NTFS local filesystem could handle, 26 * a slightly larger set of fsctls that NTFS local filesystem could handle,
27 * including the seven below that we do not have struct definitions for. 27 * including the seven below that we do not have struct definitions for.
28 * Even with protocol definitions for most of these now available, we still 28 * Even with protocol definitions for most of these now available, we still
@@ -30,7 +30,13 @@
30 * remotely. Some of the following, such as the encryption/compression ones 30 * remotely. Some of the following, such as the encryption/compression ones
31 * could be invoked from tools via a specialized hook into the VFS rather 31 * could be invoked from tools via a specialized hook into the VFS rather
32 * than via the standard vfs entry points 32 * than via the standard vfs entry points
33 *
34 * See MS-SMB2 Section 2.2.31 (last checked June 2013, all of that list are
35 * below). Additional detail on less common ones can be found in MS-FSCC
36 * section 2.3.
33 */ 37 */
38#define FSCTL_DFS_GET_REFERRALS 0x00060194
39#define FSCTL_DFS_GET_REFERRALS_EX 0x000601B0
34#define FSCTL_REQUEST_OPLOCK_LEVEL_1 0x00090000 40#define FSCTL_REQUEST_OPLOCK_LEVEL_1 0x00090000
35#define FSCTL_REQUEST_OPLOCK_LEVEL_2 0x00090004 41#define FSCTL_REQUEST_OPLOCK_LEVEL_2 0x00090004
36#define FSCTL_REQUEST_BATCH_OPLOCK 0x00090008 42#define FSCTL_REQUEST_BATCH_OPLOCK 0x00090008
@@ -71,14 +77,31 @@
71#define FSCTL_SET_SHORT_NAME_BEHAVIOR 0x000901B4 /* BB add struct */ 77#define FSCTL_SET_SHORT_NAME_BEHAVIOR 0x000901B4 /* BB add struct */
72#define FSCTL_QUERY_ALLOCATED_RANGES 0x000940CF /* BB add struct */ 78#define FSCTL_QUERY_ALLOCATED_RANGES 0x000940CF /* BB add struct */
73#define FSCTL_SET_DEFECT_MANAGEMENT 0x00098134 /* BB add struct */ 79#define FSCTL_SET_DEFECT_MANAGEMENT 0x00098134 /* BB add struct */
80#define FSCTL_FILE_LEVEL_TRIM 0x00098208 /* BB add struct */
74#define FSCTL_SIS_LINK_FILES 0x0009C104 81#define FSCTL_SIS_LINK_FILES 0x0009C104
75#define FSCTL_PIPE_PEEK 0x0011400C /* BB add struct */ 82#define FSCTL_PIPE_PEEK 0x0011400C /* BB add struct */
76#define FSCTL_PIPE_TRANSCEIVE 0x0011C017 /* BB add struct */ 83#define FSCTL_PIPE_TRANSCEIVE 0x0011C017 /* BB add struct */
77/* strange that the number for this op is not sequential with previous op */ 84/* strange that the number for this op is not sequential with previous op */
78#define FSCTL_PIPE_WAIT 0x00110018 /* BB add struct */ 85#define FSCTL_PIPE_WAIT 0x00110018 /* BB add struct */
86/* Enumerate previous versions of a file */
87#define FSCTL_SRV_ENUMERATE_SNAPSHOTS 0x00144064
88/* Retrieve an opaque file reference for server-side data movement ie copy */
89#define FSCTL_SRV_REQUEST_RESUME_KEY 0x00140078
90#define FSCTL_LMR_REQUEST_RESILIENCY 0x001401D4 /* BB add struct */
79#define FSCTL_LMR_GET_LINK_TRACK_INF 0x001400E8 /* BB add struct */ 91#define FSCTL_LMR_GET_LINK_TRACK_INF 0x001400E8 /* BB add struct */
80#define FSCTL_LMR_SET_LINK_TRACK_INF 0x001400EC /* BB add struct */ 92#define FSCTL_LMR_SET_LINK_TRACK_INF 0x001400EC /* BB add struct */
93#define FSCTL_VALIDATE_NEGOTIATE_INFO 0x00140204 /* BB add struct */
94/* Perform server-side data movement */
95#define FSCTL_SRV_COPYCHUNK 0x001440F2
96#define FSCTL_SRV_COPYCHUNK_WRITE 0x001480F2
97#define FSCTL_QUERY_NETWORK_INTERFACE_INFO 0x001401FC /* BB add struct */
98#define FSCTL_SRV_READ_HASH 0x001441BB /* BB add struct */
81 99
82#define IO_REPARSE_TAG_MOUNT_POINT 0xA0000003 100#define IO_REPARSE_TAG_MOUNT_POINT 0xA0000003
83#define IO_REPARSE_TAG_HSM 0xC0000004 101#define IO_REPARSE_TAG_HSM 0xC0000004
84#define IO_REPARSE_TAG_SIS 0x80000007 102#define IO_REPARSE_TAG_SIS 0x80000007
103
104/* fsctl flags */
105/* If Flags is set to this value, the request is an FSCTL not ioctl request */
106#define SMB2_0_IOCTL_IS_FSCTL 0x00000001
107
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index bfbf4700d160..6fdcb1b4a106 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -447,7 +447,7 @@ wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ)
447{ 447{
448 int error; 448 int error;
449 449
450 error = wait_event_freezekillable(server->response_q, 450 error = wait_event_freezekillable_unsafe(server->response_q,
451 midQ->mid_state != MID_REQUEST_SUBMITTED); 451 midQ->mid_state != MID_REQUEST_SUBMITTED);
452 if (error < 0) 452 if (error < 0)
453 return -ERESTARTSYS; 453 return -ERESTARTSYS;
@@ -463,7 +463,7 @@ cifs_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst)
463 struct mid_q_entry *mid; 463 struct mid_q_entry *mid;
464 464
465 /* enable signing if server requires it */ 465 /* enable signing if server requires it */
466 if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) 466 if (server->sign)
467 hdr->Flags2 |= SMBFLG2_SECURITY_SIGNATURE; 467 hdr->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
468 468
469 mid = AllocMidQEntry(hdr, server); 469 mid = AllocMidQEntry(hdr, server);
@@ -612,7 +612,7 @@ cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server,
612 dump_smb(mid->resp_buf, min_t(u32, 92, len)); 612 dump_smb(mid->resp_buf, min_t(u32, 92, len));
613 613
614 /* convert the length into a more usable form */ 614 /* convert the length into a more usable form */
615 if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { 615 if (server->sign) {
616 struct kvec iov; 616 struct kvec iov;
617 int rc = 0; 617 int rc = 0;
618 struct smb_rqst rqst = { .rq_iov = &iov, 618 struct smb_rqst rqst = { .rq_iov = &iov,
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 87e0ee9f4465..190effc6a6fa 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -487,13 +487,7 @@ static int coda_venus_readdir(struct file *coda_file, struct dir_context *ctx)
487 487
488 /* skip null entries */ 488 /* skip null entries */
489 if (vdir->d_fileno && name.len) { 489 if (vdir->d_fileno && name.len) {
490 /* try to look up this entry in the dcache, that way 490 ino = vdir->d_fileno;
491 * userspace doesn't have to worry about breaking
492 * getcwd by having mismatched inode numbers for
493 * internal volume mountpoints. */
494 ino = find_inode_number(de, &name);
495 if (!ino) ino = vdir->d_fileno;
496
497 type = CDT2DT(vdir->d_type); 491 type = CDT2DT(vdir->d_type);
498 if (!dir_emit(ctx, name.name, name.len, ino, type)) 492 if (!dir_emit(ctx, name.name, name.len, ino, type))
499 break; 493 break;
@@ -532,7 +526,7 @@ static int coda_dentry_revalidate(struct dentry *de, unsigned int flags)
532 if (cii->c_flags & C_FLUSH) 526 if (cii->c_flags & C_FLUSH)
533 coda_flag_inode_children(inode, C_FLUSH); 527 coda_flag_inode_children(inode, C_FLUSH);
534 528
535 if (de->d_count > 1) 529 if (d_count(de) > 1)
536 /* pretend it's valid, but don't change the flags */ 530 /* pretend it's valid, but don't change the flags */
537 goto out; 531 goto out;
538 532
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 64e5323cbbb0..5e7c60c1cb63 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -387,7 +387,7 @@ static void remove_dir(struct dentry * d)
387 if (d->d_inode) 387 if (d->d_inode)
388 simple_rmdir(parent->d_inode,d); 388 simple_rmdir(parent->d_inode,d);
389 389
390 pr_debug(" o %s removing done (%d)\n",d->d_name.name, d->d_count); 390 pr_debug(" o %s removing done (%d)\n",d->d_name.name, d_count(d));
391 391
392 dput(parent); 392 dput(parent);
393} 393}
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index 2b6cb23dd14e..1d1c41f1014d 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -203,7 +203,7 @@ configfs_write_file(struct file *file, const char __user *buf, size_t count, lof
203 mutex_lock(&buffer->mutex); 203 mutex_lock(&buffer->mutex);
204 len = fill_write_buffer(buffer, buf, count); 204 len = fill_write_buffer(buffer, buf, count);
205 if (len > 0) 205 if (len > 0)
206 len = flush_write_buffer(file->f_path.dentry, buffer, count); 206 len = flush_write_buffer(file->f_path.dentry, buffer, len);
207 if (len > 0) 207 if (len > 0)
208 *ppos += len; 208 *ppos += len;
209 mutex_unlock(&buffer->mutex); 209 mutex_unlock(&buffer->mutex);
diff --git a/fs/coredump.c b/fs/coredump.c
index dafafbafa731..72f816d6cad9 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -45,69 +45,79 @@
45#include <trace/events/sched.h> 45#include <trace/events/sched.h>
46 46
47int core_uses_pid; 47int core_uses_pid;
48char core_pattern[CORENAME_MAX_SIZE] = "core";
49unsigned int core_pipe_limit; 48unsigned int core_pipe_limit;
49char core_pattern[CORENAME_MAX_SIZE] = "core";
50static int core_name_size = CORENAME_MAX_SIZE;
50 51
51struct core_name { 52struct core_name {
52 char *corename; 53 char *corename;
53 int used, size; 54 int used, size;
54}; 55};
55static atomic_t call_count = ATOMIC_INIT(1);
56 56
57/* The maximal length of core_pattern is also specified in sysctl.c */ 57/* The maximal length of core_pattern is also specified in sysctl.c */
58 58
59static int expand_corename(struct core_name *cn) 59static int expand_corename(struct core_name *cn, int size)
60{ 60{
61 char *old_corename = cn->corename; 61 char *corename = krealloc(cn->corename, size, GFP_KERNEL);
62
63 cn->size = CORENAME_MAX_SIZE * atomic_inc_return(&call_count);
64 cn->corename = krealloc(old_corename, cn->size, GFP_KERNEL);
65 62
66 if (!cn->corename) { 63 if (!corename)
67 kfree(old_corename);
68 return -ENOMEM; 64 return -ENOMEM;
69 }
70 65
66 if (size > core_name_size) /* racy but harmless */
67 core_name_size = size;
68
69 cn->size = ksize(corename);
70 cn->corename = corename;
71 return 0; 71 return 0;
72} 72}
73 73
74static int cn_vprintf(struct core_name *cn, const char *fmt, va_list arg)
75{
76 int free, need;
77
78again:
79 free = cn->size - cn->used;
80 need = vsnprintf(cn->corename + cn->used, free, fmt, arg);
81 if (need < free) {
82 cn->used += need;
83 return 0;
84 }
85
86 if (!expand_corename(cn, cn->size + need - free + 1))
87 goto again;
88
89 return -ENOMEM;
90}
91
74static int cn_printf(struct core_name *cn, const char *fmt, ...) 92static int cn_printf(struct core_name *cn, const char *fmt, ...)
75{ 93{
76 char *cur;
77 int need;
78 int ret;
79 va_list arg; 94 va_list arg;
95 int ret;
80 96
81 va_start(arg, fmt); 97 va_start(arg, fmt);
82 need = vsnprintf(NULL, 0, fmt, arg); 98 ret = cn_vprintf(cn, fmt, arg);
83 va_end(arg); 99 va_end(arg);
84 100
85 if (likely(need < cn->size - cn->used - 1)) 101 return ret;
86 goto out_printf; 102}
87 103
88 ret = expand_corename(cn); 104static int cn_esc_printf(struct core_name *cn, const char *fmt, ...)
89 if (ret) 105{
90 goto expand_fail; 106 int cur = cn->used;
107 va_list arg;
108 int ret;
91 109
92out_printf:
93 cur = cn->corename + cn->used;
94 va_start(arg, fmt); 110 va_start(arg, fmt);
95 vsnprintf(cur, need + 1, fmt, arg); 111 ret = cn_vprintf(cn, fmt, arg);
96 va_end(arg); 112 va_end(arg);
97 cn->used += need;
98 return 0;
99 113
100expand_fail: 114 for (; cur < cn->used; ++cur) {
115 if (cn->corename[cur] == '/')
116 cn->corename[cur] = '!';
117 }
101 return ret; 118 return ret;
102} 119}
103 120
104static void cn_escape(char *str)
105{
106 for (; *str; str++)
107 if (*str == '/')
108 *str = '!';
109}
110
111static int cn_print_exe_file(struct core_name *cn) 121static int cn_print_exe_file(struct core_name *cn)
112{ 122{
113 struct file *exe_file; 123 struct file *exe_file;
@@ -115,12 +125,8 @@ static int cn_print_exe_file(struct core_name *cn)
115 int ret; 125 int ret;
116 126
117 exe_file = get_mm_exe_file(current->mm); 127 exe_file = get_mm_exe_file(current->mm);
118 if (!exe_file) { 128 if (!exe_file)
119 char *commstart = cn->corename + cn->used; 129 return cn_esc_printf(cn, "%s (path unknown)", current->comm);
120 ret = cn_printf(cn, "%s (path unknown)", current->comm);
121 cn_escape(commstart);
122 return ret;
123 }
124 130
125 pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY); 131 pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY);
126 if (!pathbuf) { 132 if (!pathbuf) {
@@ -134,9 +140,7 @@ static int cn_print_exe_file(struct core_name *cn)
134 goto free_buf; 140 goto free_buf;
135 } 141 }
136 142
137 cn_escape(path); 143 ret = cn_esc_printf(cn, "%s", path);
138
139 ret = cn_printf(cn, "%s", path);
140 144
141free_buf: 145free_buf:
142 kfree(pathbuf); 146 kfree(pathbuf);
@@ -157,19 +161,19 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm)
157 int pid_in_pattern = 0; 161 int pid_in_pattern = 0;
158 int err = 0; 162 int err = 0;
159 163
160 cn->size = CORENAME_MAX_SIZE * atomic_read(&call_count);
161 cn->corename = kmalloc(cn->size, GFP_KERNEL);
162 cn->used = 0; 164 cn->used = 0;
163 165 cn->corename = NULL;
164 if (!cn->corename) 166 if (expand_corename(cn, core_name_size))
165 return -ENOMEM; 167 return -ENOMEM;
168 cn->corename[0] = '\0';
169
170 if (ispipe)
171 ++pat_ptr;
166 172
167 /* Repeat as long as we have more pattern to process and more output 173 /* Repeat as long as we have more pattern to process and more output
168 space */ 174 space */
169 while (*pat_ptr) { 175 while (*pat_ptr) {
170 if (*pat_ptr != '%') { 176 if (*pat_ptr != '%') {
171 if (*pat_ptr == 0)
172 goto out;
173 err = cn_printf(cn, "%c", *pat_ptr++); 177 err = cn_printf(cn, "%c", *pat_ptr++);
174 } else { 178 } else {
175 switch (*++pat_ptr) { 179 switch (*++pat_ptr) {
@@ -210,22 +214,16 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm)
210 break; 214 break;
211 } 215 }
212 /* hostname */ 216 /* hostname */
213 case 'h': { 217 case 'h':
214 char *namestart = cn->corename + cn->used;
215 down_read(&uts_sem); 218 down_read(&uts_sem);
216 err = cn_printf(cn, "%s", 219 err = cn_esc_printf(cn, "%s",
217 utsname()->nodename); 220 utsname()->nodename);
218 up_read(&uts_sem); 221 up_read(&uts_sem);
219 cn_escape(namestart);
220 break; 222 break;
221 }
222 /* executable */ 223 /* executable */
223 case 'e': { 224 case 'e':
224 char *commstart = cn->corename + cn->used; 225 err = cn_esc_printf(cn, "%s", current->comm);
225 err = cn_printf(cn, "%s", current->comm);
226 cn_escape(commstart);
227 break; 226 break;
228 }
229 case 'E': 227 case 'E':
230 err = cn_print_exe_file(cn); 228 err = cn_print_exe_file(cn);
231 break; 229 break;
@@ -244,6 +242,7 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm)
244 return err; 242 return err;
245 } 243 }
246 244
245out:
247 /* Backward compatibility with core_uses_pid: 246 /* Backward compatibility with core_uses_pid:
248 * 247 *
249 * If core_pattern does not include a %p (as is the default) 248 * If core_pattern does not include a %p (as is the default)
@@ -254,7 +253,6 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm)
254 if (err) 253 if (err)
255 return err; 254 return err;
256 } 255 }
257out:
258 return ispipe; 256 return ispipe;
259} 257}
260 258
@@ -549,7 +547,7 @@ void do_coredump(siginfo_t *siginfo)
549 if (ispipe < 0) { 547 if (ispipe < 0) {
550 printk(KERN_WARNING "format_corename failed\n"); 548 printk(KERN_WARNING "format_corename failed\n");
551 printk(KERN_WARNING "Aborting core\n"); 549 printk(KERN_WARNING "Aborting core\n");
552 goto fail_corename; 550 goto fail_unlock;
553 } 551 }
554 552
555 if (cprm.limit == 1) { 553 if (cprm.limit == 1) {
@@ -584,7 +582,7 @@ void do_coredump(siginfo_t *siginfo)
584 goto fail_dropcount; 582 goto fail_dropcount;
585 } 583 }
586 584
587 helper_argv = argv_split(GFP_KERNEL, cn.corename+1, NULL); 585 helper_argv = argv_split(GFP_KERNEL, cn.corename, NULL);
588 if (!helper_argv) { 586 if (!helper_argv) {
589 printk(KERN_WARNING "%s failed to allocate memory\n", 587 printk(KERN_WARNING "%s failed to allocate memory\n",
590 __func__); 588 __func__);
@@ -601,7 +599,7 @@ void do_coredump(siginfo_t *siginfo)
601 599
602 argv_free(helper_argv); 600 argv_free(helper_argv);
603 if (retval) { 601 if (retval) {
604 printk(KERN_INFO "Core dump to %s pipe failed\n", 602 printk(KERN_INFO "Core dump to |%s pipe failed\n",
605 cn.corename); 603 cn.corename);
606 goto close_fail; 604 goto close_fail;
607 } 605 }
@@ -669,7 +667,6 @@ fail_dropcount:
669 atomic_dec(&core_dump_count); 667 atomic_dec(&core_dump_count);
670fail_unlock: 668fail_unlock:
671 kfree(cn.corename); 669 kfree(cn.corename);
672fail_corename:
673 coredump_finish(mm, core_dumped); 670 coredump_finish(mm, core_dumped);
674 revert_creds(old_cred); 671 revert_creds(old_cred);
675fail_creds: 672fail_creds:
diff --git a/fs/dcache.c b/fs/dcache.c
index 5a23073138df..87bdb5329c3c 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1730,7 +1730,7 @@ EXPORT_SYMBOL(d_add_ci);
1730 * Do the slow-case of the dentry name compare. 1730 * Do the slow-case of the dentry name compare.
1731 * 1731 *
1732 * Unlike the dentry_cmp() function, we need to atomically 1732 * Unlike the dentry_cmp() function, we need to atomically
1733 * load the name, length and inode information, so that the 1733 * load the name and length information, so that the
1734 * filesystem can rely on them, and can use the 'name' and 1734 * filesystem can rely on them, and can use the 'name' and
1735 * 'len' information without worrying about walking off the 1735 * 'len' information without worrying about walking off the
1736 * end of memory etc. 1736 * end of memory etc.
@@ -1748,22 +1748,18 @@ enum slow_d_compare {
1748 1748
1749static noinline enum slow_d_compare slow_dentry_cmp( 1749static noinline enum slow_d_compare slow_dentry_cmp(
1750 const struct dentry *parent, 1750 const struct dentry *parent,
1751 struct inode *inode,
1752 struct dentry *dentry, 1751 struct dentry *dentry,
1753 unsigned int seq, 1752 unsigned int seq,
1754 const struct qstr *name) 1753 const struct qstr *name)
1755{ 1754{
1756 int tlen = dentry->d_name.len; 1755 int tlen = dentry->d_name.len;
1757 const char *tname = dentry->d_name.name; 1756 const char *tname = dentry->d_name.name;
1758 struct inode *i = dentry->d_inode;
1759 1757
1760 if (read_seqcount_retry(&dentry->d_seq, seq)) { 1758 if (read_seqcount_retry(&dentry->d_seq, seq)) {
1761 cpu_relax(); 1759 cpu_relax();
1762 return D_COMP_SEQRETRY; 1760 return D_COMP_SEQRETRY;
1763 } 1761 }
1764 if (parent->d_op->d_compare(parent, inode, 1762 if (parent->d_op->d_compare(parent, dentry, tlen, tname, name))
1765 dentry, i,
1766 tlen, tname, name))
1767 return D_COMP_NOMATCH; 1763 return D_COMP_NOMATCH;
1768 return D_COMP_OK; 1764 return D_COMP_OK;
1769} 1765}
@@ -1773,7 +1769,6 @@ static noinline enum slow_d_compare slow_dentry_cmp(
1773 * @parent: parent dentry 1769 * @parent: parent dentry
1774 * @name: qstr of name we wish to find 1770 * @name: qstr of name we wish to find
1775 * @seqp: returns d_seq value at the point where the dentry was found 1771 * @seqp: returns d_seq value at the point where the dentry was found
1776 * @inode: returns dentry->d_inode when the inode was found valid.
1777 * Returns: dentry, or NULL 1772 * Returns: dentry, or NULL
1778 * 1773 *
1779 * __d_lookup_rcu is the dcache lookup function for rcu-walk name 1774 * __d_lookup_rcu is the dcache lookup function for rcu-walk name
@@ -1800,7 +1795,7 @@ static noinline enum slow_d_compare slow_dentry_cmp(
1800 */ 1795 */
1801struct dentry *__d_lookup_rcu(const struct dentry *parent, 1796struct dentry *__d_lookup_rcu(const struct dentry *parent,
1802 const struct qstr *name, 1797 const struct qstr *name,
1803 unsigned *seqp, struct inode *inode) 1798 unsigned *seqp)
1804{ 1799{
1805 u64 hashlen = name->hash_len; 1800 u64 hashlen = name->hash_len;
1806 const unsigned char *str = name->name; 1801 const unsigned char *str = name->name;
@@ -1834,11 +1829,10 @@ struct dentry *__d_lookup_rcu(const struct dentry *parent,
1834seqretry: 1829seqretry:
1835 /* 1830 /*
1836 * The dentry sequence count protects us from concurrent 1831 * The dentry sequence count protects us from concurrent
1837 * renames, and thus protects inode, parent and name fields. 1832 * renames, and thus protects parent and name fields.
1838 * 1833 *
1839 * The caller must perform a seqcount check in order 1834 * The caller must perform a seqcount check in order
1840 * to do anything useful with the returned dentry, 1835 * to do anything useful with the returned dentry.
1841 * including using the 'd_inode' pointer.
1842 * 1836 *
1843 * NOTE! We do a "raw" seqcount_begin here. That means that 1837 * NOTE! We do a "raw" seqcount_begin here. That means that
1844 * we don't wait for the sequence count to stabilize if it 1838 * we don't wait for the sequence count to stabilize if it
@@ -1852,12 +1846,12 @@ seqretry:
1852 continue; 1846 continue;
1853 if (d_unhashed(dentry)) 1847 if (d_unhashed(dentry))
1854 continue; 1848 continue;
1855 *seqp = seq;
1856 1849
1857 if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) { 1850 if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) {
1858 if (dentry->d_name.hash != hashlen_hash(hashlen)) 1851 if (dentry->d_name.hash != hashlen_hash(hashlen))
1859 continue; 1852 continue;
1860 switch (slow_dentry_cmp(parent, inode, dentry, seq, name)) { 1853 *seqp = seq;
1854 switch (slow_dentry_cmp(parent, dentry, seq, name)) {
1861 case D_COMP_OK: 1855 case D_COMP_OK:
1862 return dentry; 1856 return dentry;
1863 case D_COMP_NOMATCH: 1857 case D_COMP_NOMATCH:
@@ -1869,6 +1863,7 @@ seqretry:
1869 1863
1870 if (dentry->d_name.hash_len != hashlen) 1864 if (dentry->d_name.hash_len != hashlen)
1871 continue; 1865 continue;
1866 *seqp = seq;
1872 if (!dentry_cmp(dentry, str, hashlen_len(hashlen))) 1867 if (!dentry_cmp(dentry, str, hashlen_len(hashlen)))
1873 return dentry; 1868 return dentry;
1874 } 1869 }
@@ -1966,9 +1961,7 @@ struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name)
1966 if (parent->d_flags & DCACHE_OP_COMPARE) { 1961 if (parent->d_flags & DCACHE_OP_COMPARE) {
1967 int tlen = dentry->d_name.len; 1962 int tlen = dentry->d_name.len;
1968 const char *tname = dentry->d_name.name; 1963 const char *tname = dentry->d_name.name;
1969 if (parent->d_op->d_compare(parent, parent->d_inode, 1964 if (parent->d_op->d_compare(parent, dentry, tlen, tname, name))
1970 dentry, dentry->d_inode,
1971 tlen, tname, name))
1972 goto next; 1965 goto next;
1973 } else { 1966 } else {
1974 if (dentry->d_name.len != len) 1967 if (dentry->d_name.len != len)
@@ -2005,7 +1998,7 @@ struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name)
2005 */ 1998 */
2006 name->hash = full_name_hash(name->name, name->len); 1999 name->hash = full_name_hash(name->name, name->len);
2007 if (dir->d_flags & DCACHE_OP_HASH) { 2000 if (dir->d_flags & DCACHE_OP_HASH) {
2008 int err = dir->d_op->d_hash(dir, dir->d_inode, name); 2001 int err = dir->d_op->d_hash(dir, name);
2009 if (unlikely(err < 0)) 2002 if (unlikely(err < 0))
2010 return ERR_PTR(err); 2003 return ERR_PTR(err);
2011 } 2004 }
@@ -2975,34 +2968,21 @@ rename_retry:
2975 goto again; 2968 goto again;
2976} 2969}
2977 2970
2978/** 2971void d_tmpfile(struct dentry *dentry, struct inode *inode)
2979 * find_inode_number - check for dentry with name
2980 * @dir: directory to check
2981 * @name: Name to find.
2982 *
2983 * Check whether a dentry already exists for the given name,
2984 * and return the inode number if it has an inode. Otherwise
2985 * 0 is returned.
2986 *
2987 * This routine is used to post-process directory listings for
2988 * filesystems using synthetic inode numbers, and is necessary
2989 * to keep getcwd() working.
2990 */
2991
2992ino_t find_inode_number(struct dentry *dir, struct qstr *name)
2993{ 2972{
2994 struct dentry * dentry; 2973 inode_dec_link_count(inode);
2995 ino_t ino = 0; 2974 BUG_ON(dentry->d_name.name != dentry->d_iname ||
2996 2975 !hlist_unhashed(&dentry->d_alias) ||
2997 dentry = d_hash_and_lookup(dir, name); 2976 !d_unlinked(dentry));
2998 if (!IS_ERR_OR_NULL(dentry)) { 2977 spin_lock(&dentry->d_parent->d_lock);
2999 if (dentry->d_inode) 2978 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
3000 ino = dentry->d_inode->i_ino; 2979 dentry->d_name.len = sprintf(dentry->d_iname, "#%llu",
3001 dput(dentry); 2980 (unsigned long long)inode->i_ino);
3002 } 2981 spin_unlock(&dentry->d_lock);
3003 return ino; 2982 spin_unlock(&dentry->d_parent->d_lock);
2983 d_instantiate(dentry, inode);
3004} 2984}
3005EXPORT_SYMBOL(find_inode_number); 2985EXPORT_SYMBOL(d_tmpfile);
3006 2986
3007static __initdata unsigned long dhash_entries; 2987static __initdata unsigned long dhash_entries;
3008static int __init set_dhash_entries(char *str) 2988static int __init set_dhash_entries(char *str)
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index f71ec125290d..d10757635b9c 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -37,16 +37,8 @@
37#include <asm/unaligned.h> 37#include <asm/unaligned.h>
38#include "ecryptfs_kernel.h" 38#include "ecryptfs_kernel.h"
39 39
40static int 40#define DECRYPT 0
41ecryptfs_decrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat, 41#define ENCRYPT 1
42 struct page *dst_page, int dst_offset,
43 struct page *src_page, int src_offset, int size,
44 unsigned char *iv);
45static int
46ecryptfs_encrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat,
47 struct page *dst_page, int dst_offset,
48 struct page *src_page, int src_offset, int size,
49 unsigned char *iv);
50 42
51/** 43/**
52 * ecryptfs_to_hex 44 * ecryptfs_to_hex
@@ -336,19 +328,20 @@ static void extent_crypt_complete(struct crypto_async_request *req, int rc)
336} 328}
337 329
338/** 330/**
339 * encrypt_scatterlist 331 * crypt_scatterlist
340 * @crypt_stat: Pointer to the crypt_stat struct to initialize. 332 * @crypt_stat: Pointer to the crypt_stat struct to initialize.
341 * @dest_sg: Destination of encrypted data 333 * @dst_sg: Destination of the data after performing the crypto operation
342 * @src_sg: Data to be encrypted 334 * @src_sg: Data to be encrypted or decrypted
343 * @size: Length of data to be encrypted 335 * @size: Length of data
344 * @iv: iv to use during encryption 336 * @iv: IV to use
337 * @op: ENCRYPT or DECRYPT to indicate the desired operation
345 * 338 *
346 * Returns the number of bytes encrypted; negative value on error 339 * Returns the number of bytes encrypted or decrypted; negative value on error
347 */ 340 */
348static int encrypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat, 341static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
349 struct scatterlist *dest_sg, 342 struct scatterlist *dst_sg,
350 struct scatterlist *src_sg, int size, 343 struct scatterlist *src_sg, int size,
351 unsigned char *iv) 344 unsigned char *iv, int op)
352{ 345{
353 struct ablkcipher_request *req = NULL; 346 struct ablkcipher_request *req = NULL;
354 struct extent_crypt_result ecr; 347 struct extent_crypt_result ecr;
@@ -391,9 +384,9 @@ static int encrypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
391 crypt_stat->flags |= ECRYPTFS_KEY_SET; 384 crypt_stat->flags |= ECRYPTFS_KEY_SET;
392 } 385 }
393 mutex_unlock(&crypt_stat->cs_tfm_mutex); 386 mutex_unlock(&crypt_stat->cs_tfm_mutex);
394 ecryptfs_printk(KERN_DEBUG, "Encrypting [%d] bytes.\n", size); 387 ablkcipher_request_set_crypt(req, src_sg, dst_sg, size, iv);
395 ablkcipher_request_set_crypt(req, src_sg, dest_sg, size, iv); 388 rc = op == ENCRYPT ? crypto_ablkcipher_encrypt(req) :
396 rc = crypto_ablkcipher_encrypt(req); 389 crypto_ablkcipher_decrypt(req);
397 if (rc == -EINPROGRESS || rc == -EBUSY) { 390 if (rc == -EINPROGRESS || rc == -EBUSY) {
398 struct extent_crypt_result *ecr = req->base.data; 391 struct extent_crypt_result *ecr = req->base.data;
399 392
@@ -407,41 +400,43 @@ out:
407} 400}
408 401
409/** 402/**
410 * ecryptfs_lower_offset_for_extent 403 * lower_offset_for_page
411 * 404 *
412 * Convert an eCryptfs page index into a lower byte offset 405 * Convert an eCryptfs page index into a lower byte offset
413 */ 406 */
414static void ecryptfs_lower_offset_for_extent(loff_t *offset, loff_t extent_num, 407static loff_t lower_offset_for_page(struct ecryptfs_crypt_stat *crypt_stat,
415 struct ecryptfs_crypt_stat *crypt_stat) 408 struct page *page)
416{ 409{
417 (*offset) = ecryptfs_lower_header_size(crypt_stat) 410 return ecryptfs_lower_header_size(crypt_stat) +
418 + (crypt_stat->extent_size * extent_num); 411 (page->index << PAGE_CACHE_SHIFT);
419} 412}
420 413
421/** 414/**
422 * ecryptfs_encrypt_extent 415 * crypt_extent
423 * @enc_extent_page: Allocated page into which to encrypt the data in
424 * @page
425 * @crypt_stat: crypt_stat containing cryptographic context for the 416 * @crypt_stat: crypt_stat containing cryptographic context for the
426 * encryption operation 417 * encryption operation
427 * @page: Page containing plaintext data extent to encrypt 418 * @dst_page: The page to write the result into
419 * @src_page: The page to read from
428 * @extent_offset: Page extent offset for use in generating IV 420 * @extent_offset: Page extent offset for use in generating IV
421 * @op: ENCRYPT or DECRYPT to indicate the desired operation
429 * 422 *
430 * Encrypts one extent of data. 423 * Encrypts or decrypts one extent of data.
431 * 424 *
432 * Return zero on success; non-zero otherwise 425 * Return zero on success; non-zero otherwise
433 */ 426 */
434static int ecryptfs_encrypt_extent(struct page *enc_extent_page, 427static int crypt_extent(struct ecryptfs_crypt_stat *crypt_stat,
435 struct ecryptfs_crypt_stat *crypt_stat, 428 struct page *dst_page,
436 struct page *page, 429 struct page *src_page,
437 unsigned long extent_offset) 430 unsigned long extent_offset, int op)
438{ 431{
432 pgoff_t page_index = op == ENCRYPT ? src_page->index : dst_page->index;
439 loff_t extent_base; 433 loff_t extent_base;
440 char extent_iv[ECRYPTFS_MAX_IV_BYTES]; 434 char extent_iv[ECRYPTFS_MAX_IV_BYTES];
435 struct scatterlist src_sg, dst_sg;
436 size_t extent_size = crypt_stat->extent_size;
441 int rc; 437 int rc;
442 438
443 extent_base = (((loff_t)page->index) 439 extent_base = (((loff_t)page_index) * (PAGE_CACHE_SIZE / extent_size));
444 * (PAGE_CACHE_SIZE / crypt_stat->extent_size));
445 rc = ecryptfs_derive_iv(extent_iv, crypt_stat, 440 rc = ecryptfs_derive_iv(extent_iv, crypt_stat,
446 (extent_base + extent_offset)); 441 (extent_base + extent_offset));
447 if (rc) { 442 if (rc) {
@@ -450,15 +445,21 @@ static int ecryptfs_encrypt_extent(struct page *enc_extent_page,
450 (unsigned long long)(extent_base + extent_offset), rc); 445 (unsigned long long)(extent_base + extent_offset), rc);
451 goto out; 446 goto out;
452 } 447 }
453 rc = ecryptfs_encrypt_page_offset(crypt_stat, enc_extent_page, 0, 448
454 page, (extent_offset 449 sg_init_table(&src_sg, 1);
455 * crypt_stat->extent_size), 450 sg_init_table(&dst_sg, 1);
456 crypt_stat->extent_size, extent_iv); 451
452 sg_set_page(&src_sg, src_page, extent_size,
453 extent_offset * extent_size);
454 sg_set_page(&dst_sg, dst_page, extent_size,
455 extent_offset * extent_size);
456
457 rc = crypt_scatterlist(crypt_stat, &dst_sg, &src_sg, extent_size,
458 extent_iv, op);
457 if (rc < 0) { 459 if (rc < 0) {
458 printk(KERN_ERR "%s: Error attempting to encrypt page with " 460 printk(KERN_ERR "%s: Error attempting to crypt page with "
459 "page->index = [%ld], extent_offset = [%ld]; " 461 "page_index = [%ld], extent_offset = [%ld]; "
460 "rc = [%d]\n", __func__, page->index, extent_offset, 462 "rc = [%d]\n", __func__, page_index, extent_offset, rc);
461 rc);
462 goto out; 463 goto out;
463 } 464 }
464 rc = 0; 465 rc = 0;
@@ -489,6 +490,7 @@ int ecryptfs_encrypt_page(struct page *page)
489 char *enc_extent_virt; 490 char *enc_extent_virt;
490 struct page *enc_extent_page = NULL; 491 struct page *enc_extent_page = NULL;
491 loff_t extent_offset; 492 loff_t extent_offset;
493 loff_t lower_offset;
492 int rc = 0; 494 int rc = 0;
493 495
494 ecryptfs_inode = page->mapping->host; 496 ecryptfs_inode = page->mapping->host;
@@ -502,75 +504,35 @@ int ecryptfs_encrypt_page(struct page *page)
502 "encrypted extent\n"); 504 "encrypted extent\n");
503 goto out; 505 goto out;
504 } 506 }
505 enc_extent_virt = kmap(enc_extent_page); 507
506 for (extent_offset = 0; 508 for (extent_offset = 0;
507 extent_offset < (PAGE_CACHE_SIZE / crypt_stat->extent_size); 509 extent_offset < (PAGE_CACHE_SIZE / crypt_stat->extent_size);
508 extent_offset++) { 510 extent_offset++) {
509 loff_t offset; 511 rc = crypt_extent(crypt_stat, enc_extent_page, page,
510 512 extent_offset, ENCRYPT);
511 rc = ecryptfs_encrypt_extent(enc_extent_page, crypt_stat, page,
512 extent_offset);
513 if (rc) { 513 if (rc) {
514 printk(KERN_ERR "%s: Error encrypting extent; " 514 printk(KERN_ERR "%s: Error encrypting extent; "
515 "rc = [%d]\n", __func__, rc); 515 "rc = [%d]\n", __func__, rc);
516 goto out; 516 goto out;
517 } 517 }
518 ecryptfs_lower_offset_for_extent(
519 &offset, ((((loff_t)page->index)
520 * (PAGE_CACHE_SIZE
521 / crypt_stat->extent_size))
522 + extent_offset), crypt_stat);
523 rc = ecryptfs_write_lower(ecryptfs_inode, enc_extent_virt,
524 offset, crypt_stat->extent_size);
525 if (rc < 0) {
526 ecryptfs_printk(KERN_ERR, "Error attempting "
527 "to write lower page; rc = [%d]"
528 "\n", rc);
529 goto out;
530 }
531 }
532 rc = 0;
533out:
534 if (enc_extent_page) {
535 kunmap(enc_extent_page);
536 __free_page(enc_extent_page);
537 } 518 }
538 return rc;
539}
540 519
541static int ecryptfs_decrypt_extent(struct page *page, 520 lower_offset = lower_offset_for_page(crypt_stat, page);
542 struct ecryptfs_crypt_stat *crypt_stat, 521 enc_extent_virt = kmap(enc_extent_page);
543 struct page *enc_extent_page, 522 rc = ecryptfs_write_lower(ecryptfs_inode, enc_extent_virt, lower_offset,
544 unsigned long extent_offset) 523 PAGE_CACHE_SIZE);
545{ 524 kunmap(enc_extent_page);
546 loff_t extent_base;
547 char extent_iv[ECRYPTFS_MAX_IV_BYTES];
548 int rc;
549
550 extent_base = (((loff_t)page->index)
551 * (PAGE_CACHE_SIZE / crypt_stat->extent_size));
552 rc = ecryptfs_derive_iv(extent_iv, crypt_stat,
553 (extent_base + extent_offset));
554 if (rc) {
555 ecryptfs_printk(KERN_ERR, "Error attempting to derive IV for "
556 "extent [0x%.16llx]; rc = [%d]\n",
557 (unsigned long long)(extent_base + extent_offset), rc);
558 goto out;
559 }
560 rc = ecryptfs_decrypt_page_offset(crypt_stat, page,
561 (extent_offset
562 * crypt_stat->extent_size),
563 enc_extent_page, 0,
564 crypt_stat->extent_size, extent_iv);
565 if (rc < 0) { 525 if (rc < 0) {
566 printk(KERN_ERR "%s: Error attempting to decrypt to page with " 526 ecryptfs_printk(KERN_ERR,
567 "page->index = [%ld], extent_offset = [%ld]; " 527 "Error attempting to write lower page; rc = [%d]\n",
568 "rc = [%d]\n", __func__, page->index, extent_offset, 528 rc);
569 rc);
570 goto out; 529 goto out;
571 } 530 }
572 rc = 0; 531 rc = 0;
573out: 532out:
533 if (enc_extent_page) {
534 __free_page(enc_extent_page);
535 }
574 return rc; 536 return rc;
575} 537}
576 538
@@ -594,43 +556,33 @@ int ecryptfs_decrypt_page(struct page *page)
594{ 556{
595 struct inode *ecryptfs_inode; 557 struct inode *ecryptfs_inode;
596 struct ecryptfs_crypt_stat *crypt_stat; 558 struct ecryptfs_crypt_stat *crypt_stat;
597 char *enc_extent_virt; 559 char *page_virt;
598 struct page *enc_extent_page = NULL;
599 unsigned long extent_offset; 560 unsigned long extent_offset;
561 loff_t lower_offset;
600 int rc = 0; 562 int rc = 0;
601 563
602 ecryptfs_inode = page->mapping->host; 564 ecryptfs_inode = page->mapping->host;
603 crypt_stat = 565 crypt_stat =
604 &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat); 566 &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat);
605 BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)); 567 BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED));
606 enc_extent_page = alloc_page(GFP_USER); 568
607 if (!enc_extent_page) { 569 lower_offset = lower_offset_for_page(crypt_stat, page);
608 rc = -ENOMEM; 570 page_virt = kmap(page);
609 ecryptfs_printk(KERN_ERR, "Error allocating memory for " 571 rc = ecryptfs_read_lower(page_virt, lower_offset, PAGE_CACHE_SIZE,
610 "encrypted extent\n"); 572 ecryptfs_inode);
573 kunmap(page);
574 if (rc < 0) {
575 ecryptfs_printk(KERN_ERR,
576 "Error attempting to read lower page; rc = [%d]\n",
577 rc);
611 goto out; 578 goto out;
612 } 579 }
613 enc_extent_virt = kmap(enc_extent_page); 580
614 for (extent_offset = 0; 581 for (extent_offset = 0;
615 extent_offset < (PAGE_CACHE_SIZE / crypt_stat->extent_size); 582 extent_offset < (PAGE_CACHE_SIZE / crypt_stat->extent_size);
616 extent_offset++) { 583 extent_offset++) {
617 loff_t offset; 584 rc = crypt_extent(crypt_stat, page, page,
618 585 extent_offset, DECRYPT);
619 ecryptfs_lower_offset_for_extent(
620 &offset, ((page->index * (PAGE_CACHE_SIZE
621 / crypt_stat->extent_size))
622 + extent_offset), crypt_stat);
623 rc = ecryptfs_read_lower(enc_extent_virt, offset,
624 crypt_stat->extent_size,
625 ecryptfs_inode);
626 if (rc < 0) {
627 ecryptfs_printk(KERN_ERR, "Error attempting "
628 "to read lower page; rc = [%d]"
629 "\n", rc);
630 goto out;
631 }
632 rc = ecryptfs_decrypt_extent(page, crypt_stat, enc_extent_page,
633 extent_offset);
634 if (rc) { 586 if (rc) {
635 printk(KERN_ERR "%s: Error encrypting extent; " 587 printk(KERN_ERR "%s: Error encrypting extent; "
636 "rc = [%d]\n", __func__, rc); 588 "rc = [%d]\n", __func__, rc);
@@ -638,142 +590,9 @@ int ecryptfs_decrypt_page(struct page *page)
638 } 590 }
639 } 591 }
640out: 592out:
641 if (enc_extent_page) {
642 kunmap(enc_extent_page);
643 __free_page(enc_extent_page);
644 }
645 return rc; 593 return rc;
646} 594}
647 595
648/**
649 * decrypt_scatterlist
650 * @crypt_stat: Cryptographic context
651 * @dest_sg: The destination scatterlist to decrypt into
652 * @src_sg: The source scatterlist to decrypt from
653 * @size: The number of bytes to decrypt
654 * @iv: The initialization vector to use for the decryption
655 *
656 * Returns the number of bytes decrypted; negative value on error
657 */
658static int decrypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
659 struct scatterlist *dest_sg,
660 struct scatterlist *src_sg, int size,
661 unsigned char *iv)
662{
663 struct ablkcipher_request *req = NULL;
664 struct extent_crypt_result ecr;
665 int rc = 0;
666
667 BUG_ON(!crypt_stat || !crypt_stat->tfm
668 || !(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED));
669 if (unlikely(ecryptfs_verbosity > 0)) {
670 ecryptfs_printk(KERN_DEBUG, "Key size [%zd]; key:\n",
671 crypt_stat->key_size);
672 ecryptfs_dump_hex(crypt_stat->key,
673 crypt_stat->key_size);
674 }
675
676 init_completion(&ecr.completion);
677
678 mutex_lock(&crypt_stat->cs_tfm_mutex);
679 req = ablkcipher_request_alloc(crypt_stat->tfm, GFP_NOFS);
680 if (!req) {
681 mutex_unlock(&crypt_stat->cs_tfm_mutex);
682 rc = -ENOMEM;
683 goto out;
684 }
685
686 ablkcipher_request_set_callback(req,
687 CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
688 extent_crypt_complete, &ecr);
689 /* Consider doing this once, when the file is opened */
690 if (!(crypt_stat->flags & ECRYPTFS_KEY_SET)) {
691 rc = crypto_ablkcipher_setkey(crypt_stat->tfm, crypt_stat->key,
692 crypt_stat->key_size);
693 if (rc) {
694 ecryptfs_printk(KERN_ERR,
695 "Error setting key; rc = [%d]\n",
696 rc);
697 mutex_unlock(&crypt_stat->cs_tfm_mutex);
698 rc = -EINVAL;
699 goto out;
700 }
701 crypt_stat->flags |= ECRYPTFS_KEY_SET;
702 }
703 mutex_unlock(&crypt_stat->cs_tfm_mutex);
704 ecryptfs_printk(KERN_DEBUG, "Decrypting [%d] bytes.\n", size);
705 ablkcipher_request_set_crypt(req, src_sg, dest_sg, size, iv);
706 rc = crypto_ablkcipher_decrypt(req);
707 if (rc == -EINPROGRESS || rc == -EBUSY) {
708 struct extent_crypt_result *ecr = req->base.data;
709
710 wait_for_completion(&ecr->completion);
711 rc = ecr->rc;
712 INIT_COMPLETION(ecr->completion);
713 }
714out:
715 ablkcipher_request_free(req);
716 return rc;
717
718}
719
720/**
721 * ecryptfs_encrypt_page_offset
722 * @crypt_stat: The cryptographic context
723 * @dst_page: The page to encrypt into
724 * @dst_offset: The offset in the page to encrypt into
725 * @src_page: The page to encrypt from
726 * @src_offset: The offset in the page to encrypt from
727 * @size: The number of bytes to encrypt
728 * @iv: The initialization vector to use for the encryption
729 *
730 * Returns the number of bytes encrypted
731 */
732static int
733ecryptfs_encrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat,
734 struct page *dst_page, int dst_offset,
735 struct page *src_page, int src_offset, int size,
736 unsigned char *iv)
737{
738 struct scatterlist src_sg, dst_sg;
739
740 sg_init_table(&src_sg, 1);
741 sg_init_table(&dst_sg, 1);
742
743 sg_set_page(&src_sg, src_page, size, src_offset);
744 sg_set_page(&dst_sg, dst_page, size, dst_offset);
745 return encrypt_scatterlist(crypt_stat, &dst_sg, &src_sg, size, iv);
746}
747
748/**
749 * ecryptfs_decrypt_page_offset
750 * @crypt_stat: The cryptographic context
751 * @dst_page: The page to decrypt into
752 * @dst_offset: The offset in the page to decrypt into
753 * @src_page: The page to decrypt from
754 * @src_offset: The offset in the page to decrypt from
755 * @size: The number of bytes to decrypt
756 * @iv: The initialization vector to use for the decryption
757 *
758 * Returns the number of bytes decrypted
759 */
760static int
761ecryptfs_decrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat,
762 struct page *dst_page, int dst_offset,
763 struct page *src_page, int src_offset, int size,
764 unsigned char *iv)
765{
766 struct scatterlist src_sg, dst_sg;
767
768 sg_init_table(&src_sg, 1);
769 sg_set_page(&src_sg, src_page, size, src_offset);
770
771 sg_init_table(&dst_sg, 1);
772 sg_set_page(&dst_sg, dst_page, size, dst_offset);
773
774 return decrypt_scatterlist(crypt_stat, &dst_sg, &src_sg, size, iv);
775}
776
777#define ECRYPTFS_MAX_SCATTERLIST_LEN 4 596#define ECRYPTFS_MAX_SCATTERLIST_LEN 4
778 597
779/** 598/**
@@ -2243,12 +2062,11 @@ out:
2243 */ 2062 */
2244int ecryptfs_decode_and_decrypt_filename(char **plaintext_name, 2063int ecryptfs_decode_and_decrypt_filename(char **plaintext_name,
2245 size_t *plaintext_name_size, 2064 size_t *plaintext_name_size,
2246 struct dentry *ecryptfs_dir_dentry, 2065 struct super_block *sb,
2247 const char *name, size_t name_size) 2066 const char *name, size_t name_size)
2248{ 2067{
2249 struct ecryptfs_mount_crypt_stat *mount_crypt_stat = 2068 struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
2250 &ecryptfs_superblock_to_private( 2069 &ecryptfs_superblock_to_private(sb)->mount_crypt_stat;
2251 ecryptfs_dir_dentry->d_sb)->mount_crypt_stat;
2252 char *decoded_name; 2070 char *decoded_name;
2253 size_t decoded_name_size; 2071 size_t decoded_name_size;
2254 size_t packet_size; 2072 size_t packet_size;
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index f622a733f7ad..df19d34a033b 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -575,7 +575,7 @@ int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry,
575 struct inode *ecryptfs_inode); 575 struct inode *ecryptfs_inode);
576int ecryptfs_decode_and_decrypt_filename(char **decrypted_name, 576int ecryptfs_decode_and_decrypt_filename(char **decrypted_name,
577 size_t *decrypted_name_size, 577 size_t *decrypted_name_size,
578 struct dentry *ecryptfs_dentry, 578 struct super_block *sb,
579 const char *name, size_t name_size); 579 const char *name, size_t name_size);
580int ecryptfs_fill_zeros(struct file *file, loff_t new_length); 580int ecryptfs_fill_zeros(struct file *file, loff_t new_length);
581int ecryptfs_encrypt_and_encode_filename( 581int ecryptfs_encrypt_and_encode_filename(
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 9aa05e08060b..992cf95830b5 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -49,7 +49,7 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb,
49 unsigned long nr_segs, loff_t pos) 49 unsigned long nr_segs, loff_t pos)
50{ 50{
51 ssize_t rc; 51 ssize_t rc;
52 struct path lower; 52 struct path *path;
53 struct file *file = iocb->ki_filp; 53 struct file *file = iocb->ki_filp;
54 54
55 rc = generic_file_aio_read(iocb, iov, nr_segs, pos); 55 rc = generic_file_aio_read(iocb, iov, nr_segs, pos);
@@ -60,9 +60,8 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb,
60 if (-EIOCBQUEUED == rc) 60 if (-EIOCBQUEUED == rc)
61 rc = wait_on_sync_kiocb(iocb); 61 rc = wait_on_sync_kiocb(iocb);
62 if (rc >= 0) { 62 if (rc >= 0) {
63 lower.dentry = ecryptfs_dentry_to_lower(file->f_path.dentry); 63 path = ecryptfs_dentry_to_lower_path(file->f_path.dentry);
64 lower.mnt = ecryptfs_dentry_to_lower_mnt(file->f_path.dentry); 64 touch_atime(path);
65 touch_atime(&lower);
66 } 65 }
67 return rc; 66 return rc;
68} 67}
@@ -70,7 +69,7 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb,
70struct ecryptfs_getdents_callback { 69struct ecryptfs_getdents_callback {
71 struct dir_context ctx; 70 struct dir_context ctx;
72 struct dir_context *caller; 71 struct dir_context *caller;
73 struct dentry *dentry; 72 struct super_block *sb;
74 int filldir_called; 73 int filldir_called;
75 int entries_written; 74 int entries_written;
76}; 75};
@@ -88,7 +87,7 @@ ecryptfs_filldir(void *dirent, const char *lower_name, int lower_namelen,
88 87
89 buf->filldir_called++; 88 buf->filldir_called++;
90 rc = ecryptfs_decode_and_decrypt_filename(&name, &name_size, 89 rc = ecryptfs_decode_and_decrypt_filename(&name, &name_size,
91 buf->dentry, lower_name, 90 buf->sb, lower_name,
92 lower_namelen); 91 lower_namelen);
93 if (rc) { 92 if (rc) {
94 printk(KERN_ERR "%s: Error attempting to decode and decrypt " 93 printk(KERN_ERR "%s: Error attempting to decode and decrypt "
@@ -114,15 +113,14 @@ static int ecryptfs_readdir(struct file *file, struct dir_context *ctx)
114{ 113{
115 int rc; 114 int rc;
116 struct file *lower_file; 115 struct file *lower_file;
117 struct inode *inode; 116 struct inode *inode = file_inode(file);
118 struct ecryptfs_getdents_callback buf = { 117 struct ecryptfs_getdents_callback buf = {
119 .ctx.actor = ecryptfs_filldir, 118 .ctx.actor = ecryptfs_filldir,
120 .caller = ctx, 119 .caller = ctx,
121 .dentry = file->f_path.dentry 120 .sb = inode->i_sb,
122 }; 121 };
123 lower_file = ecryptfs_file_to_lower(file); 122 lower_file = ecryptfs_file_to_lower(file);
124 lower_file->f_pos = ctx->pos; 123 lower_file->f_pos = ctx->pos;
125 inode = file_inode(file);
126 rc = iterate_dir(lower_file, &buf.ctx); 124 rc = iterate_dir(lower_file, &buf.ctx);
127 ctx->pos = buf.ctx.pos; 125 ctx->pos = buf.ctx.pos;
128 if (rc < 0) 126 if (rc < 0)
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 5eab400e2590..67e9b6339691 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -358,7 +358,7 @@ static int ecryptfs_lookup_interpose(struct dentry *dentry,
358 358
359 lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent)); 359 lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent));
360 fsstack_copy_attr_atime(dir_inode, lower_dentry->d_parent->d_inode); 360 fsstack_copy_attr_atime(dir_inode, lower_dentry->d_parent->d_inode);
361 BUG_ON(!lower_dentry->d_count); 361 BUG_ON(!d_count(lower_dentry));
362 362
363 ecryptfs_set_dentry_private(dentry, dentry_info); 363 ecryptfs_set_dentry_private(dentry, dentry_info);
364 ecryptfs_set_dentry_lower(dentry, lower_dentry); 364 ecryptfs_set_dentry_lower(dentry, lower_dentry);
@@ -679,7 +679,7 @@ static int ecryptfs_readlink_lower(struct dentry *dentry, char **buf,
679 set_fs(old_fs); 679 set_fs(old_fs);
680 if (rc < 0) 680 if (rc < 0)
681 goto out; 681 goto out;
682 rc = ecryptfs_decode_and_decrypt_filename(buf, bufsiz, dentry, 682 rc = ecryptfs_decode_and_decrypt_filename(buf, bufsiz, dentry->d_sb,
683 lower_buf, rc); 683 lower_buf, rc);
684out: 684out:
685 kfree(lower_buf); 685 kfree(lower_buf);
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index e924cf45aad9..eb1c5979ecaf 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -120,16 +120,15 @@ static int ecryptfs_init_lower_file(struct dentry *dentry,
120 struct file **lower_file) 120 struct file **lower_file)
121{ 121{
122 const struct cred *cred = current_cred(); 122 const struct cred *cred = current_cred();
123 struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); 123 struct path *path = ecryptfs_dentry_to_lower_path(dentry);
124 struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry);
125 int rc; 124 int rc;
126 125
127 rc = ecryptfs_privileged_open(lower_file, lower_dentry, lower_mnt, 126 rc = ecryptfs_privileged_open(lower_file, path->dentry, path->mnt,
128 cred); 127 cred);
129 if (rc) { 128 if (rc) {
130 printk(KERN_ERR "Error opening lower file " 129 printk(KERN_ERR "Error opening lower file "
131 "for lower_dentry [0x%p] and lower_mnt [0x%p]; " 130 "for lower_dentry [0x%p] and lower_mnt [0x%p]; "
132 "rc = [%d]\n", lower_dentry, lower_mnt, rc); 131 "rc = [%d]\n", path->dentry, path->mnt, rc);
133 (*lower_file) = NULL; 132 (*lower_file) = NULL;
134 } 133 }
135 return rc; 134 return rc;
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index 49ff8ea08f1c..e57380e5f6bd 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -247,14 +247,13 @@ int ecryptfs_process_response(struct ecryptfs_daemon *daemon,
247 goto unlock; 247 goto unlock;
248 } 248 }
249 msg_size = (sizeof(*msg) + msg->data_len); 249 msg_size = (sizeof(*msg) + msg->data_len);
250 msg_ctx->msg = kmalloc(msg_size, GFP_KERNEL); 250 msg_ctx->msg = kmemdup(msg, msg_size, GFP_KERNEL);
251 if (!msg_ctx->msg) { 251 if (!msg_ctx->msg) {
252 rc = -ENOMEM; 252 rc = -ENOMEM;
253 printk(KERN_ERR "%s: Failed to allocate [%zd] bytes of " 253 printk(KERN_ERR "%s: Failed to allocate [%zd] bytes of "
254 "GFP_KERNEL memory\n", __func__, msg_size); 254 "GFP_KERNEL memory\n", __func__, msg_size);
255 goto unlock; 255 goto unlock;
256 } 256 }
257 memcpy(msg_ctx->msg, msg, msg_size);
258 msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_DONE; 257 msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_DONE;
259 wake_up_process(msg_ctx->task); 258 wake_up_process(msg_ctx->task);
260 rc = 0; 259 rc = 0;
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index 141aee31884f..a8766b880c07 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -45,8 +45,8 @@ static struct super_block *efivarfs_sb;
45 * So we need to perform a case-sensitive match on part 1 and a 45 * So we need to perform a case-sensitive match on part 1 and a
46 * case-insensitive match on part 2. 46 * case-insensitive match on part 2.
47 */ 47 */
48static int efivarfs_d_compare(const struct dentry *parent, const struct inode *pinode, 48static int efivarfs_d_compare(const struct dentry *parent,
49 const struct dentry *dentry, const struct inode *inode, 49 const struct dentry *dentry,
50 unsigned int len, const char *str, 50 unsigned int len, const char *str,
51 const struct qstr *name) 51 const struct qstr *name)
52{ 52{
@@ -63,8 +63,7 @@ static int efivarfs_d_compare(const struct dentry *parent, const struct inode *p
63 return strncasecmp(name->name + guid, str + guid, EFI_VARIABLE_GUID_LEN); 63 return strncasecmp(name->name + guid, str + guid, EFI_VARIABLE_GUID_LEN);
64} 64}
65 65
66static int efivarfs_d_hash(const struct dentry *dentry, 66static int efivarfs_d_hash(const struct dentry *dentry, struct qstr *qstr)
67 const struct inode *inode, struct qstr *qstr)
68{ 67{
69 unsigned long hash = init_name_hash(); 68 unsigned long hash = init_name_hash();
70 const unsigned char *s = qstr->name; 69 const unsigned char *s = qstr->name;
@@ -108,7 +107,7 @@ static struct dentry *efivarfs_alloc_dentry(struct dentry *parent, char *name)
108 q.name = name; 107 q.name = name;
109 q.len = strlen(name); 108 q.len = strlen(name);
110 109
111 err = efivarfs_d_hash(NULL, NULL, &q); 110 err = efivarfs_d_hash(NULL, &q);
112 if (err) 111 if (err)
113 return ERR_PTR(err); 112 return ERR_PTR(err);
114 113
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index deecc7294a67..9ad17b15b454 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -34,6 +34,7 @@
34#include <linux/mutex.h> 34#include <linux/mutex.h>
35#include <linux/anon_inodes.h> 35#include <linux/anon_inodes.h>
36#include <linux/device.h> 36#include <linux/device.h>
37#include <linux/freezer.h>
37#include <asm/uaccess.h> 38#include <asm/uaccess.h>
38#include <asm/io.h> 39#include <asm/io.h>
39#include <asm/mman.h> 40#include <asm/mman.h>
@@ -1602,7 +1603,8 @@ fetch_events:
1602 } 1603 }
1603 1604
1604 spin_unlock_irqrestore(&ep->lock, flags); 1605 spin_unlock_irqrestore(&ep->lock, flags);
1605 if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) 1606 if (!freezable_schedule_hrtimeout_range(to, slack,
1607 HRTIMER_MODE_ABS))
1606 timed_out = 1; 1608 timed_out = 1;
1607 1609
1608 spin_lock_irqsave(&ep->lock, flags); 1610 spin_lock_irqsave(&ep->lock, flags);
@@ -1975,8 +1977,8 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
1975 return -EINVAL; 1977 return -EINVAL;
1976 if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask))) 1978 if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
1977 return -EFAULT; 1979 return -EFAULT;
1978 sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP)); 1980 sigsaved = current->blocked;
1979 sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); 1981 set_current_blocked(&ksigmask);
1980 } 1982 }
1981 1983
1982 error = sys_epoll_wait(epfd, events, maxevents, timeout); 1984 error = sys_epoll_wait(epfd, events, maxevents, timeout);
@@ -1993,7 +1995,7 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
1993 sizeof(sigsaved)); 1995 sizeof(sigsaved));
1994 set_restore_sigmask(); 1996 set_restore_sigmask();
1995 } else 1997 } else
1996 sigprocmask(SIG_SETMASK, &sigsaved, NULL); 1998 set_current_blocked(&sigsaved);
1997 } 1999 }
1998 2000
1999 return error; 2001 return error;
@@ -2020,8 +2022,8 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
2020 if (copy_from_user(&csigmask, sigmask, sizeof(csigmask))) 2022 if (copy_from_user(&csigmask, sigmask, sizeof(csigmask)))
2021 return -EFAULT; 2023 return -EFAULT;
2022 sigset_from_compat(&ksigmask, &csigmask); 2024 sigset_from_compat(&ksigmask, &csigmask);
2023 sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP)); 2025 sigsaved = current->blocked;
2024 sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); 2026 set_current_blocked(&ksigmask);
2025 } 2027 }
2026 2028
2027 err = sys_epoll_wait(epfd, events, maxevents, timeout); 2029 err = sys_epoll_wait(epfd, events, maxevents, timeout);
@@ -2038,7 +2040,7 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
2038 sizeof(sigsaved)); 2040 sizeof(sigsaved));
2039 set_restore_sigmask(); 2041 set_restore_sigmask();
2040 } else 2042 } else
2041 sigprocmask(SIG_SETMASK, &sigsaved, NULL); 2043 set_current_blocked(&sigsaved);
2042 } 2044 }
2043 2045
2044 return err; 2046 return err;
diff --git a/fs/exec.c b/fs/exec.c
index ffd7a813ad3d..9c73def87642 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -110,13 +110,14 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
110 static const struct open_flags uselib_flags = { 110 static const struct open_flags uselib_flags = {
111 .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, 111 .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
112 .acc_mode = MAY_READ | MAY_EXEC | MAY_OPEN, 112 .acc_mode = MAY_READ | MAY_EXEC | MAY_OPEN,
113 .intent = LOOKUP_OPEN 113 .intent = LOOKUP_OPEN,
114 .lookup_flags = LOOKUP_FOLLOW,
114 }; 115 };
115 116
116 if (IS_ERR(tmp)) 117 if (IS_ERR(tmp))
117 goto out; 118 goto out;
118 119
119 file = do_filp_open(AT_FDCWD, tmp, &uselib_flags, LOOKUP_FOLLOW); 120 file = do_filp_open(AT_FDCWD, tmp, &uselib_flags);
120 putname(tmp); 121 putname(tmp);
121 error = PTR_ERR(file); 122 error = PTR_ERR(file);
122 if (IS_ERR(file)) 123 if (IS_ERR(file))
@@ -756,10 +757,11 @@ struct file *open_exec(const char *name)
756 static const struct open_flags open_exec_flags = { 757 static const struct open_flags open_exec_flags = {
757 .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, 758 .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
758 .acc_mode = MAY_EXEC | MAY_OPEN, 759 .acc_mode = MAY_EXEC | MAY_OPEN,
759 .intent = LOOKUP_OPEN 760 .intent = LOOKUP_OPEN,
761 .lookup_flags = LOOKUP_FOLLOW,
760 }; 762 };
761 763
762 file = do_filp_open(AT_FDCWD, &tmp, &open_exec_flags, LOOKUP_FOLLOW); 764 file = do_filp_open(AT_FDCWD, &tmp, &open_exec_flags);
763 if (IS_ERR(file)) 765 if (IS_ERR(file))
764 goto out; 766 goto out;
765 767
@@ -930,6 +932,7 @@ static int de_thread(struct task_struct *tsk)
930 * also take its birthdate (always earlier than our own). 932 * also take its birthdate (always earlier than our own).
931 */ 933 */
932 tsk->start_time = leader->start_time; 934 tsk->start_time = leader->start_time;
935 tsk->real_start_time = leader->real_start_time;
933 936
934 BUG_ON(!same_thread_group(leader, tsk)); 937 BUG_ON(!same_thread_group(leader, tsk));
935 BUG_ON(has_group_leader_pid(tsk)); 938 BUG_ON(has_group_leader_pid(tsk));
@@ -945,9 +948,8 @@ static int de_thread(struct task_struct *tsk)
945 * Note: The old leader also uses this pid until release_task 948 * Note: The old leader also uses this pid until release_task
946 * is called. Odd but simple and correct. 949 * is called. Odd but simple and correct.
947 */ 950 */
948 detach_pid(tsk, PIDTYPE_PID);
949 tsk->pid = leader->pid; 951 tsk->pid = leader->pid;
950 attach_pid(tsk, PIDTYPE_PID, task_pid(leader)); 952 change_pid(tsk, PIDTYPE_PID, task_pid(leader));
951 transfer_pid(leader, tsk, PIDTYPE_PGID); 953 transfer_pid(leader, tsk, PIDTYPE_PGID);
952 transfer_pid(leader, tsk, PIDTYPE_SID); 954 transfer_pid(leader, tsk, PIDTYPE_SID);
953 955
@@ -1463,7 +1465,6 @@ static int do_execve_common(const char *filename,
1463 struct files_struct *displaced; 1465 struct files_struct *displaced;
1464 bool clear_in_exec; 1466 bool clear_in_exec;
1465 int retval; 1467 int retval;
1466 const struct cred *cred = current_cred();
1467 1468
1468 /* 1469 /*
1469 * We move the actual failure in case of RLIMIT_NPROC excess from 1470 * We move the actual failure in case of RLIMIT_NPROC excess from
@@ -1472,7 +1473,7 @@ static int do_execve_common(const char *filename,
1472 * whether NPROC limit is still exceeded. 1473 * whether NPROC limit is still exceeded.
1473 */ 1474 */
1474 if ((current->flags & PF_NPROC_EXCEEDED) && 1475 if ((current->flags & PF_NPROC_EXCEEDED) &&
1475 atomic_read(&cred->user->processes) > rlimit(RLIMIT_NPROC)) { 1476 atomic_read(&current_user()->processes) > rlimit(RLIMIT_NPROC)) {
1476 retval = -EAGAIN; 1477 retval = -EAGAIN;
1477 goto out_ret; 1478 goto out_ret;
1478 } 1479 }
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 73b0d9519836..256dd5f4c1c4 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -119,6 +119,29 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, umode_t mode
119 return ext2_add_nondir(dentry, inode); 119 return ext2_add_nondir(dentry, inode);
120} 120}
121 121
122static int ext2_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
123{
124 struct inode *inode = ext2_new_inode(dir, mode, NULL);
125 if (IS_ERR(inode))
126 return PTR_ERR(inode);
127
128 inode->i_op = &ext2_file_inode_operations;
129 if (ext2_use_xip(inode->i_sb)) {
130 inode->i_mapping->a_ops = &ext2_aops_xip;
131 inode->i_fop = &ext2_xip_file_operations;
132 } else if (test_opt(inode->i_sb, NOBH)) {
133 inode->i_mapping->a_ops = &ext2_nobh_aops;
134 inode->i_fop = &ext2_file_operations;
135 } else {
136 inode->i_mapping->a_ops = &ext2_aops;
137 inode->i_fop = &ext2_file_operations;
138 }
139 mark_inode_dirty(inode);
140 d_tmpfile(dentry, inode);
141 unlock_new_inode(inode);
142 return 0;
143}
144
122static int ext2_mknod (struct inode * dir, struct dentry *dentry, umode_t mode, dev_t rdev) 145static int ext2_mknod (struct inode * dir, struct dentry *dentry, umode_t mode, dev_t rdev)
123{ 146{
124 struct inode * inode; 147 struct inode * inode;
@@ -398,6 +421,7 @@ const struct inode_operations ext2_dir_inode_operations = {
398#endif 421#endif
399 .setattr = ext2_setattr, 422 .setattr = ext2_setattr,
400 .get_acl = ext2_get_acl, 423 .get_acl = ext2_get_acl,
424 .tmpfile = ext2_tmpfile,
401}; 425};
402 426
403const struct inode_operations ext2_special_inode_operations = { 427const struct inode_operations ext2_special_inode_operations = {
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index b31dbd4c46ad..1cb9c7e10c6f 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -48,9 +48,13 @@ int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
48 48
49 trace_ext3_sync_file_enter(file, datasync); 49 trace_ext3_sync_file_enter(file, datasync);
50 50
51 if (inode->i_sb->s_flags & MS_RDONLY) 51 if (inode->i_sb->s_flags & MS_RDONLY) {
52 /* Make sure that we read updated state */
53 smp_rmb();
54 if (EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)
55 return -EROFS;
52 return 0; 56 return 0;
53 57 }
54 ret = filemap_write_and_wait_range(inode->i_mapping, start, end); 58 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
55 if (ret) 59 if (ret)
56 goto out; 60 goto out;
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index f67668f724ba..2bd85486b879 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1985,6 +1985,7 @@ static const struct address_space_operations ext3_ordered_aops = {
1985 .direct_IO = ext3_direct_IO, 1985 .direct_IO = ext3_direct_IO,
1986 .migratepage = buffer_migrate_page, 1986 .migratepage = buffer_migrate_page,
1987 .is_partially_uptodate = block_is_partially_uptodate, 1987 .is_partially_uptodate = block_is_partially_uptodate,
1988 .is_dirty_writeback = buffer_check_dirty_writeback,
1988 .error_remove_page = generic_error_remove_page, 1989 .error_remove_page = generic_error_remove_page,
1989}; 1990};
1990 1991
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index cea8ecf3e76e..998ea111e537 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1759,6 +1759,45 @@ retry:
1759 return err; 1759 return err;
1760} 1760}
1761 1761
1762static int ext3_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
1763{
1764 handle_t *handle;
1765 struct inode *inode;
1766 int err, retries = 0;
1767
1768 dquot_initialize(dir);
1769
1770retry:
1771 handle = ext3_journal_start(dir, EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
1772 4 + EXT3_XATTR_TRANS_BLOCKS);
1773
1774 if (IS_ERR(handle))
1775 return PTR_ERR(handle);
1776
1777 inode = ext3_new_inode (handle, dir, NULL, mode);
1778 err = PTR_ERR(inode);
1779 if (!IS_ERR(inode)) {
1780 inode->i_op = &ext3_file_inode_operations;
1781 inode->i_fop = &ext3_file_operations;
1782 ext3_set_aops(inode);
1783 err = ext3_orphan_add(handle, inode);
1784 if (err)
1785 goto err_drop_inode;
1786 mark_inode_dirty(inode);
1787 d_tmpfile(dentry, inode);
1788 unlock_new_inode(inode);
1789 }
1790 ext3_journal_stop(handle);
1791 if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
1792 goto retry;
1793 return err;
1794err_drop_inode:
1795 ext3_journal_stop(handle);
1796 unlock_new_inode(inode);
1797 iput(inode);
1798 return err;
1799}
1800
1762static int ext3_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) 1801static int ext3_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
1763{ 1802{
1764 handle_t *handle; 1803 handle_t *handle;
@@ -2300,7 +2339,7 @@ static int ext3_link (struct dentry * old_dentry,
2300 2339
2301retry: 2340retry:
2302 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + 2341 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
2303 EXT3_INDEX_EXTRA_TRANS_BLOCKS); 2342 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 1);
2304 if (IS_ERR(handle)) 2343 if (IS_ERR(handle))
2305 return PTR_ERR(handle); 2344 return PTR_ERR(handle);
2306 2345
@@ -2314,6 +2353,11 @@ retry:
2314 err = ext3_add_entry(handle, dentry, inode); 2353 err = ext3_add_entry(handle, dentry, inode);
2315 if (!err) { 2354 if (!err) {
2316 ext3_mark_inode_dirty(handle, inode); 2355 ext3_mark_inode_dirty(handle, inode);
2356 /* this can happen only for tmpfile being
2357 * linked the first time
2358 */
2359 if (inode->i_nlink == 1)
2360 ext3_orphan_del(handle, inode);
2317 d_instantiate(dentry, inode); 2361 d_instantiate(dentry, inode);
2318 } else { 2362 } else {
2319 drop_nlink(inode); 2363 drop_nlink(inode);
@@ -2516,6 +2560,7 @@ const struct inode_operations ext3_dir_inode_operations = {
2516 .mkdir = ext3_mkdir, 2560 .mkdir = ext3_mkdir,
2517 .rmdir = ext3_rmdir, 2561 .rmdir = ext3_rmdir,
2518 .mknod = ext3_mknod, 2562 .mknod = ext3_mknod,
2563 .tmpfile = ext3_tmpfile,
2519 .rename = ext3_rename, 2564 .rename = ext3_rename,
2520 .setattr = ext3_setattr, 2565 .setattr = ext3_setattr,
2521#ifdef CONFIG_EXT3_FS_XATTR 2566#ifdef CONFIG_EXT3_FS_XATTR
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 6356665a74bb..c47f14750722 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -174,6 +174,11 @@ static void ext3_handle_error(struct super_block *sb)
174 if (test_opt (sb, ERRORS_RO)) { 174 if (test_opt (sb, ERRORS_RO)) {
175 ext3_msg(sb, KERN_CRIT, 175 ext3_msg(sb, KERN_CRIT,
176 "error: remounting filesystem read-only"); 176 "error: remounting filesystem read-only");
177 /*
178 * Make sure updated value of ->s_mount_state will be visible
179 * before ->s_flags update.
180 */
181 smp_wmb();
177 sb->s_flags |= MS_RDONLY; 182 sb->s_flags |= MS_RDONLY;
178 } 183 }
179 ext3_commit_super(sb, es, 1); 184 ext3_commit_super(sb, es, 1);
@@ -291,8 +296,14 @@ void ext3_abort(struct super_block *sb, const char *function,
291 ext3_msg(sb, KERN_CRIT, 296 ext3_msg(sb, KERN_CRIT,
292 "error: remounting filesystem read-only"); 297 "error: remounting filesystem read-only");
293 EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS; 298 EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
294 sb->s_flags |= MS_RDONLY;
295 set_opt(EXT3_SB(sb)->s_mount_opt, ABORT); 299 set_opt(EXT3_SB(sb)->s_mount_opt, ABORT);
300 /*
301 * Make sure updated value of ->s_mount_state will be visible
302 * before ->s_flags update.
303 */
304 smp_wmb();
305 sb->s_flags |= MS_RDONLY;
306
296 if (EXT3_SB(sb)->s_journal) 307 if (EXT3_SB(sb)->s_journal)
297 journal_abort(EXT3_SB(sb)->s_journal, -EIO); 308 journal_abort(EXT3_SB(sb)->s_journal, -EIO);
298} 309}
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index b19f0a457f32..6f4cc567c382 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -494,17 +494,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
494 if (dataoff > isize) 494 if (dataoff > isize)
495 return -ENXIO; 495 return -ENXIO;
496 496
497 if (dataoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) 497 return vfs_setpos(file, dataoff, maxsize);
498 return -EINVAL;
499 if (dataoff > maxsize)
500 return -EINVAL;
501
502 if (dataoff != file->f_pos) {
503 file->f_pos = dataoff;
504 file->f_version = 0;
505 }
506
507 return dataoff;
508} 498}
509 499
510/* 500/*
@@ -580,17 +570,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
580 if (holeoff > isize) 570 if (holeoff > isize)
581 holeoff = isize; 571 holeoff = isize;
582 572
583 if (holeoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) 573 return vfs_setpos(file, holeoff, maxsize);
584 return -EINVAL;
585 if (holeoff > maxsize)
586 return -EINVAL;
587
588 if (holeoff != file->f_pos) {
589 file->f_pos = holeoff;
590 file->f_version = 0;
591 }
592
593 return holeoff;
594} 574}
595 575
596/* 576/*
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index ab2f6dc44b3a..234b834d5a97 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2296,6 +2296,45 @@ retry:
2296 return err; 2296 return err;
2297} 2297}
2298 2298
2299static int ext4_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
2300{
2301 handle_t *handle;
2302 struct inode *inode;
2303 int err, retries = 0;
2304
2305 dquot_initialize(dir);
2306
2307retry:
2308 inode = ext4_new_inode_start_handle(dir, mode,
2309 NULL, 0, NULL,
2310 EXT4_HT_DIR,
2311 EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
2312 4 + EXT4_XATTR_TRANS_BLOCKS);
2313 handle = ext4_journal_current_handle();
2314 err = PTR_ERR(inode);
2315 if (!IS_ERR(inode)) {
2316 inode->i_op = &ext4_file_inode_operations;
2317 inode->i_fop = &ext4_file_operations;
2318 ext4_set_aops(inode);
2319 err = ext4_orphan_add(handle, inode);
2320 if (err)
2321 goto err_drop_inode;
2322 mark_inode_dirty(inode);
2323 d_tmpfile(dentry, inode);
2324 unlock_new_inode(inode);
2325 }
2326 if (handle)
2327 ext4_journal_stop(handle);
2328 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
2329 goto retry;
2330 return err;
2331err_drop_inode:
2332 ext4_journal_stop(handle);
2333 unlock_new_inode(inode);
2334 iput(inode);
2335 return err;
2336}
2337
2299struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, 2338struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode,
2300 struct ext4_dir_entry_2 *de, 2339 struct ext4_dir_entry_2 *de,
2301 int blocksize, int csum_size, 2340 int blocksize, int csum_size,
@@ -2903,7 +2942,7 @@ static int ext4_link(struct dentry *old_dentry,
2903retry: 2942retry:
2904 handle = ext4_journal_start(dir, EXT4_HT_DIR, 2943 handle = ext4_journal_start(dir, EXT4_HT_DIR,
2905 (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 2944 (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2906 EXT4_INDEX_EXTRA_TRANS_BLOCKS)); 2945 EXT4_INDEX_EXTRA_TRANS_BLOCKS) + 1);
2907 if (IS_ERR(handle)) 2946 if (IS_ERR(handle))
2908 return PTR_ERR(handle); 2947 return PTR_ERR(handle);
2909 2948
@@ -2917,6 +2956,11 @@ retry:
2917 err = ext4_add_entry(handle, dentry, inode); 2956 err = ext4_add_entry(handle, dentry, inode);
2918 if (!err) { 2957 if (!err) {
2919 ext4_mark_inode_dirty(handle, inode); 2958 ext4_mark_inode_dirty(handle, inode);
2959 /* this can happen only for tmpfile being
2960 * linked the first time
2961 */
2962 if (inode->i_nlink == 1)
2963 ext4_orphan_del(handle, inode);
2920 d_instantiate(dentry, inode); 2964 d_instantiate(dentry, inode);
2921 } else { 2965 } else {
2922 drop_nlink(inode); 2966 drop_nlink(inode);
@@ -3169,6 +3213,7 @@ const struct inode_operations ext4_dir_inode_operations = {
3169 .mkdir = ext4_mkdir, 3213 .mkdir = ext4_mkdir,
3170 .rmdir = ext4_rmdir, 3214 .rmdir = ext4_rmdir,
3171 .mknod = ext4_mknod, 3215 .mknod = ext4_mknod,
3216 .tmpfile = ext4_tmpfile,
3172 .rename = ext4_rename, 3217 .rename = ext4_rename,
3173 .setattr = ext4_setattr, 3218 .setattr = ext4_setattr,
3174 .setxattr = generic_setxattr, 3219 .setxattr = generic_setxattr,
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 9d1cd423450d..62f0d5977c64 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -610,13 +610,12 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
610{ 610{
611 struct inode *inode = file_inode(file); 611 struct inode *inode = file_inode(file);
612 unsigned long npages = dir_blocks(inode); 612 unsigned long npages = dir_blocks(inode);
613 unsigned int bit_pos = 0, start_bit_pos = 0; 613 unsigned int bit_pos = 0;
614 struct f2fs_dentry_block *dentry_blk = NULL; 614 struct f2fs_dentry_block *dentry_blk = NULL;
615 struct f2fs_dir_entry *de = NULL; 615 struct f2fs_dir_entry *de = NULL;
616 struct page *dentry_page = NULL; 616 struct page *dentry_page = NULL;
617 unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK); 617 unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK);
618 unsigned char d_type = DT_UNKNOWN; 618 unsigned char d_type = DT_UNKNOWN;
619 int slots;
620 619
621 bit_pos = ((unsigned long)ctx->pos % NR_DENTRY_IN_BLOCK); 620 bit_pos = ((unsigned long)ctx->pos % NR_DENTRY_IN_BLOCK);
622 621
@@ -625,7 +624,6 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
625 if (IS_ERR(dentry_page)) 624 if (IS_ERR(dentry_page))
626 continue; 625 continue;
627 626
628 start_bit_pos = bit_pos;
629 dentry_blk = kmap(dentry_page); 627 dentry_blk = kmap(dentry_page);
630 while (bit_pos < NR_DENTRY_IN_BLOCK) { 628 while (bit_pos < NR_DENTRY_IN_BLOCK) {
631 bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, 629 bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
@@ -634,19 +632,19 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
634 if (bit_pos >= NR_DENTRY_IN_BLOCK) 632 if (bit_pos >= NR_DENTRY_IN_BLOCK)
635 break; 633 break;
636 634
637 ctx->pos += bit_pos - start_bit_pos;
638 de = &dentry_blk->dentry[bit_pos]; 635 de = &dentry_blk->dentry[bit_pos];
639 if (de->file_type < F2FS_FT_MAX) 636 if (de->file_type < F2FS_FT_MAX)
640 d_type = f2fs_filetype_table[de->file_type]; 637 d_type = f2fs_filetype_table[de->file_type];
641 else 638 else
642 d_type = DT_UNKNOWN; 639 d_type = DT_UNKNOWN;
643 if (!dir_emit(ctx, 640 if (!dir_emit(ctx,
644 dentry_blk->filename[bit_pos], 641 dentry_blk->filename[bit_pos],
645 le16_to_cpu(de->name_len), 642 le16_to_cpu(de->name_len),
646 le32_to_cpu(de->ino), d_type)) 643 le32_to_cpu(de->ino), d_type))
647 goto success; 644 goto stop;
648 slots = GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); 645
649 bit_pos += slots; 646 bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
647 ctx->pos = n * NR_DENTRY_IN_BLOCK + bit_pos;
650 } 648 }
651 bit_pos = 0; 649 bit_pos = 0;
652 ctx->pos = (n + 1) * NR_DENTRY_IN_BLOCK; 650 ctx->pos = (n + 1) * NR_DENTRY_IN_BLOCK;
@@ -654,7 +652,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
654 f2fs_put_page(dentry_page, 1); 652 f2fs_put_page(dentry_page, 1);
655 dentry_page = NULL; 653 dentry_page = NULL;
656 } 654 }
657success: 655stop:
658 if (dentry_page && !IS_ERR(dentry_page)) { 656 if (dentry_page && !IS_ERR(dentry_page)) {
659 kunmap(dentry_page); 657 kunmap(dentry_page);
660 f2fs_put_page(dentry_page, 1); 658 f2fs_put_page(dentry_page, 1);
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 21664fcf3616..4241e6f39e86 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -86,6 +86,7 @@ struct msdos_sb_info {
86 const void *dir_ops; /* Opaque; default directory operations */ 86 const void *dir_ops; /* Opaque; default directory operations */
87 int dir_per_block; /* dir entries per block */ 87 int dir_per_block; /* dir entries per block */
88 int dir_per_block_bits; /* log2(dir_per_block) */ 88 int dir_per_block_bits; /* log2(dir_per_block) */
89 unsigned int vol_id; /*volume ID*/
89 90
90 int fatent_shift; 91 int fatent_shift;
91 struct fatent_operations *fatent_ops; 92 struct fatent_operations *fatent_ops;
diff --git a/fs/fat/file.c b/fs/fat/file.c
index b0b632e50ddb..9b104f543056 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -114,6 +114,12 @@ out:
114 return err; 114 return err;
115} 115}
116 116
117static int fat_ioctl_get_volume_id(struct inode *inode, u32 __user *user_attr)
118{
119 struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
120 return put_user(sbi->vol_id, user_attr);
121}
122
117long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 123long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
118{ 124{
119 struct inode *inode = file_inode(filp); 125 struct inode *inode = file_inode(filp);
@@ -124,6 +130,8 @@ long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
124 return fat_ioctl_get_attributes(inode, user_attr); 130 return fat_ioctl_get_attributes(inode, user_attr);
125 case FAT_IOCTL_SET_ATTRIBUTES: 131 case FAT_IOCTL_SET_ATTRIBUTES:
126 return fat_ioctl_set_attributes(filp, user_attr); 132 return fat_ioctl_set_attributes(filp, user_attr);
133 case FAT_IOCTL_GET_VOLUME_ID:
134 return fat_ioctl_get_volume_id(inode, user_attr);
127 default: 135 default:
128 return -ENOTTY; /* Inappropriate ioctl for device */ 136 return -ENOTTY; /* Inappropriate ioctl for device */
129 } 137 }
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 5d4513cb1b3c..11b51bb55b42 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -1415,6 +1415,18 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
1415 brelse(fsinfo_bh); 1415 brelse(fsinfo_bh);
1416 } 1416 }
1417 1417
1418 /* interpret volume ID as a little endian 32 bit integer */
1419 if (sbi->fat_bits == 32)
1420 sbi->vol_id = (((u32)b->fat32.vol_id[0]) |
1421 ((u32)b->fat32.vol_id[1] << 8) |
1422 ((u32)b->fat32.vol_id[2] << 16) |
1423 ((u32)b->fat32.vol_id[3] << 24));
1424 else /* fat 16 or 12 */
1425 sbi->vol_id = (((u32)b->fat16.vol_id[0]) |
1426 ((u32)b->fat16.vol_id[1] << 8) |
1427 ((u32)b->fat16.vol_id[2] << 16) |
1428 ((u32)b->fat16.vol_id[3] << 24));
1429
1418 sbi->dir_per_block = sb->s_blocksize / sizeof(struct msdos_dir_entry); 1430 sbi->dir_per_block = sb->s_blocksize / sizeof(struct msdos_dir_entry);
1419 sbi->dir_per_block_bits = ffs(sbi->dir_per_block) - 1; 1431 sbi->dir_per_block_bits = ffs(sbi->dir_per_block) - 1;
1420 1432
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 359d307b5507..628e22a5a543 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -30,7 +30,7 @@ void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...)
30 va_start(args, fmt); 30 va_start(args, fmt);
31 vaf.fmt = fmt; 31 vaf.fmt = fmt;
32 vaf.va = &args; 32 vaf.va = &args;
33 printk(KERN_ERR "FAT-fs (%s): error, %pV\n", sb->s_id, &vaf); 33 fat_msg(sb, KERN_ERR, "error, %pV", &vaf);
34 va_end(args); 34 va_end(args);
35 } 35 }
36 36
@@ -38,8 +38,7 @@ void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...)
38 panic("FAT-fs (%s): fs panic from previous error\n", sb->s_id); 38 panic("FAT-fs (%s): fs panic from previous error\n", sb->s_id);
39 else if (opts->errors == FAT_ERRORS_RO && !(sb->s_flags & MS_RDONLY)) { 39 else if (opts->errors == FAT_ERRORS_RO && !(sb->s_flags & MS_RDONLY)) {
40 sb->s_flags |= MS_RDONLY; 40 sb->s_flags |= MS_RDONLY;
41 printk(KERN_ERR "FAT-fs (%s): Filesystem has been " 41 fat_msg(sb, KERN_ERR, "Filesystem has been set read-only");
42 "set read-only\n", sb->s_id);
43 } 42 }
44} 43}
45EXPORT_SYMBOL_GPL(__fat_fs_error); 44EXPORT_SYMBOL_GPL(__fat_fs_error);
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index 081b759cff83..a783b0e1272a 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -148,8 +148,7 @@ static int msdos_find(struct inode *dir, const unsigned char *name, int len,
148 * that the existing dentry can be used. The msdos fs routines will 148 * that the existing dentry can be used. The msdos fs routines will
149 * return ENOENT or EINVAL as appropriate. 149 * return ENOENT or EINVAL as appropriate.
150 */ 150 */
151static int msdos_hash(const struct dentry *dentry, const struct inode *inode, 151static int msdos_hash(const struct dentry *dentry, struct qstr *qstr)
152 struct qstr *qstr)
153{ 152{
154 struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options; 153 struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options;
155 unsigned char msdos_name[MSDOS_NAME]; 154 unsigned char msdos_name[MSDOS_NAME];
@@ -165,8 +164,7 @@ static int msdos_hash(const struct dentry *dentry, const struct inode *inode,
165 * Compare two msdos names. If either of the names are invalid, 164 * Compare two msdos names. If either of the names are invalid,
166 * we fall back to doing the standard name comparison. 165 * we fall back to doing the standard name comparison.
167 */ 166 */
168static int msdos_cmp(const struct dentry *parent, const struct inode *pinode, 167static int msdos_cmp(const struct dentry *parent, const struct dentry *dentry,
169 const struct dentry *dentry, const struct inode *inode,
170 unsigned int len, const char *str, const struct qstr *name) 168 unsigned int len, const char *str, const struct qstr *name)
171{ 169{
172 struct fat_mount_options *options = &MSDOS_SB(parent->d_sb)->options; 170 struct fat_mount_options *options = &MSDOS_SB(parent->d_sb)->options;
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 2da952036a3d..6df8d3d885e5 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -107,8 +107,7 @@ static unsigned int vfat_striptail_len(const struct qstr *qstr)
107 * that the existing dentry can be used. The vfat fs routines will 107 * that the existing dentry can be used. The vfat fs routines will
108 * return ENOENT or EINVAL as appropriate. 108 * return ENOENT or EINVAL as appropriate.
109 */ 109 */
110static int vfat_hash(const struct dentry *dentry, const struct inode *inode, 110static int vfat_hash(const struct dentry *dentry, struct qstr *qstr)
111 struct qstr *qstr)
112{ 111{
113 qstr->hash = full_name_hash(qstr->name, vfat_striptail_len(qstr)); 112 qstr->hash = full_name_hash(qstr->name, vfat_striptail_len(qstr));
114 return 0; 113 return 0;
@@ -120,8 +119,7 @@ static int vfat_hash(const struct dentry *dentry, const struct inode *inode,
120 * that the existing dentry can be used. The vfat fs routines will 119 * that the existing dentry can be used. The vfat fs routines will
121 * return ENOENT or EINVAL as appropriate. 120 * return ENOENT or EINVAL as appropriate.
122 */ 121 */
123static int vfat_hashi(const struct dentry *dentry, const struct inode *inode, 122static int vfat_hashi(const struct dentry *dentry, struct qstr *qstr)
124 struct qstr *qstr)
125{ 123{
126 struct nls_table *t = MSDOS_SB(dentry->d_sb)->nls_io; 124 struct nls_table *t = MSDOS_SB(dentry->d_sb)->nls_io;
127 const unsigned char *name; 125 const unsigned char *name;
@@ -142,8 +140,7 @@ static int vfat_hashi(const struct dentry *dentry, const struct inode *inode,
142/* 140/*
143 * Case insensitive compare of two vfat names. 141 * Case insensitive compare of two vfat names.
144 */ 142 */
145static int vfat_cmpi(const struct dentry *parent, const struct inode *pinode, 143static int vfat_cmpi(const struct dentry *parent, const struct dentry *dentry,
146 const struct dentry *dentry, const struct inode *inode,
147 unsigned int len, const char *str, const struct qstr *name) 144 unsigned int len, const char *str, const struct qstr *name)
148{ 145{
149 struct nls_table *t = MSDOS_SB(parent->d_sb)->nls_io; 146 struct nls_table *t = MSDOS_SB(parent->d_sb)->nls_io;
@@ -162,8 +159,7 @@ static int vfat_cmpi(const struct dentry *parent, const struct inode *pinode,
162/* 159/*
163 * Case sensitive compare of two vfat names. 160 * Case sensitive compare of two vfat names.
164 */ 161 */
165static int vfat_cmp(const struct dentry *parent, const struct inode *pinode, 162static int vfat_cmp(const struct dentry *parent, const struct dentry *dentry,
166 const struct dentry *dentry, const struct inode *inode,
167 unsigned int len, const char *str, const struct qstr *name) 163 unsigned int len, const char *str, const struct qstr *name)
168{ 164{
169 unsigned int alen, blen; 165 unsigned int alen, blen;
diff --git a/fs/file_table.c b/fs/file_table.c
index 485dc0eddd67..08e719b884ca 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -227,7 +227,7 @@ static void __fput(struct file *file)
227{ 227{
228 struct dentry *dentry = file->f_path.dentry; 228 struct dentry *dentry = file->f_path.dentry;
229 struct vfsmount *mnt = file->f_path.mnt; 229 struct vfsmount *mnt = file->f_path.mnt;
230 struct inode *inode = dentry->d_inode; 230 struct inode *inode = file->f_inode;
231 231
232 might_sleep(); 232 might_sleep();
233 233
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index a85ac4e33436..68851ff2fd41 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -963,7 +963,7 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
963/* 963/*
964 * Retrieve work items and do the writeback they describe 964 * Retrieve work items and do the writeback they describe
965 */ 965 */
966long wb_do_writeback(struct bdi_writeback *wb, int force_wait) 966static long wb_do_writeback(struct bdi_writeback *wb)
967{ 967{
968 struct backing_dev_info *bdi = wb->bdi; 968 struct backing_dev_info *bdi = wb->bdi;
969 struct wb_writeback_work *work; 969 struct wb_writeback_work *work;
@@ -971,12 +971,6 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
971 971
972 set_bit(BDI_writeback_running, &wb->bdi->state); 972 set_bit(BDI_writeback_running, &wb->bdi->state);
973 while ((work = get_next_work_item(bdi)) != NULL) { 973 while ((work = get_next_work_item(bdi)) != NULL) {
974 /*
975 * Override sync mode, in case we must wait for completion
976 * because this thread is exiting now.
977 */
978 if (force_wait)
979 work->sync_mode = WB_SYNC_ALL;
980 974
981 trace_writeback_exec(bdi, work); 975 trace_writeback_exec(bdi, work);
982 976
@@ -1025,7 +1019,7 @@ void bdi_writeback_workfn(struct work_struct *work)
1025 * rescuer as work_list needs to be drained. 1019 * rescuer as work_list needs to be drained.
1026 */ 1020 */
1027 do { 1021 do {
1028 pages_written = wb_do_writeback(wb, 0); 1022 pages_written = wb_do_writeback(wb);
1029 trace_writeback_pages_written(pages_written); 1023 trace_writeback_pages_written(pages_written);
1030 } while (!list_empty(&bdi->work_list)); 1024 } while (!list_empty(&bdi->work_list));
1031 } else { 1025 } else {
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 35f281033142..5c121fe19c5f 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -548,8 +548,7 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
548 res = io->bytes < 0 ? io->size : io->bytes; 548 res = io->bytes < 0 ? io->size : io->bytes;
549 549
550 if (!is_sync_kiocb(io->iocb)) { 550 if (!is_sync_kiocb(io->iocb)) {
551 struct path *path = &io->iocb->ki_filp->f_path; 551 struct inode *inode = file_inode(io->iocb->ki_filp);
552 struct inode *inode = path->dentry->d_inode;
553 struct fuse_conn *fc = get_fuse_conn(inode); 552 struct fuse_conn *fc = get_fuse_conn(inode);
554 struct fuse_inode *fi = get_fuse_inode(inode); 553 struct fuse_inode *fi = get_fuse_inode(inode);
555 554
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 9a0cdde14a08..0b578598c6ac 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -785,7 +785,7 @@ static const struct super_operations fuse_super_operations = {
785static void sanitize_global_limit(unsigned *limit) 785static void sanitize_global_limit(unsigned *limit)
786{ 786{
787 if (*limit == 0) 787 if (*limit == 0)
788 *limit = ((num_physpages << PAGE_SHIFT) >> 13) / 788 *limit = ((totalram_pages << PAGE_SHIFT) >> 13) /
789 sizeof(struct fuse_req); 789 sizeof(struct fuse_req);
790 790
791 if (*limit >= 1 << 16) 791 if (*limit >= 1 << 16)
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index 4fddb3c22d25..f2448ab2aac5 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -109,8 +109,7 @@ fail:
109 return 0; 109 return 0;
110} 110}
111 111
112static int gfs2_dhash(const struct dentry *dentry, const struct inode *inode, 112static int gfs2_dhash(const struct dentry *dentry, struct qstr *str)
113 struct qstr *str)
114{ 113{
115 str->hash = gfs2_disk_hash(str->name, str->len); 114 str->hash = gfs2_disk_hash(str->name, str->len);
116 return 0; 115 return 0;
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index f99f9e8a325f..72c3866a7320 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -912,7 +912,7 @@ out_uninit:
912 * cluster; until we do, disable leases (by just returning -EINVAL), 912 * cluster; until we do, disable leases (by just returning -EINVAL),
913 * unless the administrator has requested purely local locking. 913 * unless the administrator has requested purely local locking.
914 * 914 *
915 * Locking: called under lock_flocks 915 * Locking: called under i_lock
916 * 916 *
917 * Returns: errno 917 * Returns: errno
918 */ 918 */
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index a73b11839a41..0524cda47a6e 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -229,13 +229,10 @@ extern int hfs_part_find(struct super_block *, sector_t *, sector_t *);
229/* string.c */ 229/* string.c */
230extern const struct dentry_operations hfs_dentry_operations; 230extern const struct dentry_operations hfs_dentry_operations;
231 231
232extern int hfs_hash_dentry(const struct dentry *, const struct inode *, 232extern int hfs_hash_dentry(const struct dentry *, struct qstr *);
233 struct qstr *);
234extern int hfs_strcmp(const unsigned char *, unsigned int, 233extern int hfs_strcmp(const unsigned char *, unsigned int,
235 const unsigned char *, unsigned int); 234 const unsigned char *, unsigned int);
236extern int hfs_compare_dentry(const struct dentry *parent, 235extern int hfs_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
237 const struct inode *pinode,
238 const struct dentry *dentry, const struct inode *inode,
239 unsigned int len, const char *str, const struct qstr *name); 236 unsigned int len, const char *str, const struct qstr *name);
240 237
241/* trans.c */ 238/* trans.c */
diff --git a/fs/hfs/string.c b/fs/hfs/string.c
index 495a976a3cc9..85b610c3909f 100644
--- a/fs/hfs/string.c
+++ b/fs/hfs/string.c
@@ -51,8 +51,7 @@ static unsigned char caseorder[256] = {
51/* 51/*
52 * Hash a string to an integer in a case-independent way 52 * Hash a string to an integer in a case-independent way
53 */ 53 */
54int hfs_hash_dentry(const struct dentry *dentry, const struct inode *inode, 54int hfs_hash_dentry(const struct dentry *dentry, struct qstr *this)
55 struct qstr *this)
56{ 55{
57 const unsigned char *name = this->name; 56 const unsigned char *name = this->name;
58 unsigned int hash, len = this->len; 57 unsigned int hash, len = this->len;
@@ -93,8 +92,7 @@ int hfs_strcmp(const unsigned char *s1, unsigned int len1,
93 * Test for equality of two strings in the HFS filename character ordering. 92 * Test for equality of two strings in the HFS filename character ordering.
94 * return 1 on failure and 0 on success 93 * return 1 on failure and 0 on success
95 */ 94 */
96int hfs_compare_dentry(const struct dentry *parent, const struct inode *pinode, 95int hfs_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
97 const struct dentry *dentry, const struct inode *inode,
98 unsigned int len, const char *str, const struct qstr *name) 96 unsigned int len, const char *str, const struct qstr *name)
99{ 97{
100 const unsigned char *n1, *n2; 98 const unsigned char *n1, *n2;
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 60b0a3388b26..ede79317cfb8 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -495,11 +495,8 @@ int hfsplus_uni2asc(struct super_block *,
495 const struct hfsplus_unistr *, char *, int *); 495 const struct hfsplus_unistr *, char *, int *);
496int hfsplus_asc2uni(struct super_block *, 496int hfsplus_asc2uni(struct super_block *,
497 struct hfsplus_unistr *, int, const char *, int); 497 struct hfsplus_unistr *, int, const char *, int);
498int hfsplus_hash_dentry(const struct dentry *dentry, 498int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str);
499 const struct inode *inode, struct qstr *str); 499int hfsplus_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
500int hfsplus_compare_dentry(const struct dentry *parent,
501 const struct inode *pinode,
502 const struct dentry *dentry, const struct inode *inode,
503 unsigned int len, const char *str, const struct qstr *name); 500 unsigned int len, const char *str, const struct qstr *name);
504 501
505/* wrapper.c */ 502/* wrapper.c */
diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c
index 2c2e47dcfdd8..e8ef121a4d8b 100644
--- a/fs/hfsplus/unicode.c
+++ b/fs/hfsplus/unicode.c
@@ -334,8 +334,7 @@ int hfsplus_asc2uni(struct super_block *sb,
334 * Composed unicode characters are decomposed and case-folding is performed 334 * Composed unicode characters are decomposed and case-folding is performed
335 * if the appropriate bits are (un)set on the superblock. 335 * if the appropriate bits are (un)set on the superblock.
336 */ 336 */
337int hfsplus_hash_dentry(const struct dentry *dentry, const struct inode *inode, 337int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str)
338 struct qstr *str)
339{ 338{
340 struct super_block *sb = dentry->d_sb; 339 struct super_block *sb = dentry->d_sb;
341 const char *astr; 340 const char *astr;
@@ -386,9 +385,7 @@ int hfsplus_hash_dentry(const struct dentry *dentry, const struct inode *inode,
386 * Composed unicode characters are decomposed and case-folding is performed 385 * Composed unicode characters are decomposed and case-folding is performed
387 * if the appropriate bits are (un)set on the superblock. 386 * if the appropriate bits are (un)set on the superblock.
388 */ 387 */
389int hfsplus_compare_dentry(const struct dentry *parent, 388int hfsplus_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
390 const struct inode *pinode,
391 const struct dentry *dentry, const struct inode *inode,
392 unsigned int len, const char *str, const struct qstr *name) 389 unsigned int len, const char *str, const struct qstr *name)
393{ 390{
394 struct super_block *sb = parent->d_sb; 391 struct super_block *sb = parent->d_sb;
diff --git a/fs/hpfs/buffer.c b/fs/hpfs/buffer.c
index f49d1498aa2e..4d0a1afa058c 100644
--- a/fs/hpfs/buffer.c
+++ b/fs/hpfs/buffer.c
@@ -7,8 +7,37 @@
7 */ 7 */
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/slab.h> 9#include <linux/slab.h>
10#include <linux/blkdev.h>
10#include "hpfs_fn.h" 11#include "hpfs_fn.h"
11 12
13void hpfs_prefetch_sectors(struct super_block *s, unsigned secno, int n)
14{
15 struct buffer_head *bh;
16 struct blk_plug plug;
17
18 if (n <= 0 || unlikely(secno >= hpfs_sb(s)->sb_fs_size))
19 return;
20
21 bh = sb_find_get_block(s, secno);
22 if (bh) {
23 if (buffer_uptodate(bh)) {
24 brelse(bh);
25 return;
26 }
27 brelse(bh);
28 };
29
30 blk_start_plug(&plug);
31 while (n > 0) {
32 if (unlikely(secno >= hpfs_sb(s)->sb_fs_size))
33 break;
34 sb_breadahead(s, secno);
35 secno++;
36 n--;
37 }
38 blk_finish_plug(&plug);
39}
40
12/* Map a sector into a buffer and return pointers to it and to the buffer. */ 41/* Map a sector into a buffer and return pointers to it and to the buffer. */
13 42
14void *hpfs_map_sector(struct super_block *s, unsigned secno, struct buffer_head **bhp, 43void *hpfs_map_sector(struct super_block *s, unsigned secno, struct buffer_head **bhp,
@@ -18,6 +47,8 @@ void *hpfs_map_sector(struct super_block *s, unsigned secno, struct buffer_head
18 47
19 hpfs_lock_assert(s); 48 hpfs_lock_assert(s);
20 49
50 hpfs_prefetch_sectors(s, secno, ahead);
51
21 cond_resched(); 52 cond_resched();
22 53
23 *bhp = bh = sb_bread(s, secno); 54 *bhp = bh = sb_bread(s, secno);
@@ -67,6 +98,8 @@ void *hpfs_map_4sectors(struct super_block *s, unsigned secno, struct quad_buffe
67 return NULL; 98 return NULL;
68 } 99 }
69 100
101 hpfs_prefetch_sectors(s, secno, 4 + ahead);
102
70 qbh->data = data = kmalloc(2048, GFP_NOFS); 103 qbh->data = data = kmalloc(2048, GFP_NOFS);
71 if (!data) { 104 if (!data) {
72 printk("HPFS: hpfs_map_4sectors: out of memory\n"); 105 printk("HPFS: hpfs_map_4sectors: out of memory\n");
diff --git a/fs/hpfs/dentry.c b/fs/hpfs/dentry.c
index 05d4816e4e77..fa27980f2229 100644
--- a/fs/hpfs/dentry.c
+++ b/fs/hpfs/dentry.c
@@ -12,8 +12,7 @@
12 * Note: the dentry argument is the parent dentry. 12 * Note: the dentry argument is the parent dentry.
13 */ 13 */
14 14
15static int hpfs_hash_dentry(const struct dentry *dentry, const struct inode *inode, 15static int hpfs_hash_dentry(const struct dentry *dentry, struct qstr *qstr)
16 struct qstr *qstr)
17{ 16{
18 unsigned long hash; 17 unsigned long hash;
19 int i; 18 int i;
@@ -35,9 +34,7 @@ static int hpfs_hash_dentry(const struct dentry *dentry, const struct inode *ino
35 return 0; 34 return 0;
36} 35}
37 36
38static int hpfs_compare_dentry(const struct dentry *parent, 37static int hpfs_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
39 const struct inode *pinode,
40 const struct dentry *dentry, const struct inode *inode,
41 unsigned int len, const char *str, const struct qstr *name) 38 unsigned int len, const char *str, const struct qstr *name)
42{ 39{
43 unsigned al = len; 40 unsigned al = len;
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index e4ba5fe4c3b5..4e9dabcf1f4c 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -7,6 +7,7 @@
7 */ 7 */
8 8
9#include "hpfs_fn.h" 9#include "hpfs_fn.h"
10#include <linux/mpage.h>
10 11
11#define BLOCKS(size) (((size) + 511) >> 9) 12#define BLOCKS(size) (((size) + 511) >> 9)
12 13
@@ -34,7 +35,7 @@ int hpfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
34 * so we must ignore such errors. 35 * so we must ignore such errors.
35 */ 36 */
36 37
37static secno hpfs_bmap(struct inode *inode, unsigned file_secno) 38static secno hpfs_bmap(struct inode *inode, unsigned file_secno, unsigned *n_secs)
38{ 39{
39 struct hpfs_inode_info *hpfs_inode = hpfs_i(inode); 40 struct hpfs_inode_info *hpfs_inode = hpfs_i(inode);
40 unsigned n, disk_secno; 41 unsigned n, disk_secno;
@@ -42,11 +43,20 @@ static secno hpfs_bmap(struct inode *inode, unsigned file_secno)
42 struct buffer_head *bh; 43 struct buffer_head *bh;
43 if (BLOCKS(hpfs_i(inode)->mmu_private) <= file_secno) return 0; 44 if (BLOCKS(hpfs_i(inode)->mmu_private) <= file_secno) return 0;
44 n = file_secno - hpfs_inode->i_file_sec; 45 n = file_secno - hpfs_inode->i_file_sec;
45 if (n < hpfs_inode->i_n_secs) return hpfs_inode->i_disk_sec + n; 46 if (n < hpfs_inode->i_n_secs) {
47 *n_secs = hpfs_inode->i_n_secs - n;
48 return hpfs_inode->i_disk_sec + n;
49 }
46 if (!(fnode = hpfs_map_fnode(inode->i_sb, inode->i_ino, &bh))) return 0; 50 if (!(fnode = hpfs_map_fnode(inode->i_sb, inode->i_ino, &bh))) return 0;
47 disk_secno = hpfs_bplus_lookup(inode->i_sb, inode, &fnode->btree, file_secno, bh); 51 disk_secno = hpfs_bplus_lookup(inode->i_sb, inode, &fnode->btree, file_secno, bh);
48 if (disk_secno == -1) return 0; 52 if (disk_secno == -1) return 0;
49 if (hpfs_chk_sectors(inode->i_sb, disk_secno, 1, "bmap")) return 0; 53 if (hpfs_chk_sectors(inode->i_sb, disk_secno, 1, "bmap")) return 0;
54 n = file_secno - hpfs_inode->i_file_sec;
55 if (n < hpfs_inode->i_n_secs) {
56 *n_secs = hpfs_inode->i_n_secs - n;
57 return hpfs_inode->i_disk_sec + n;
58 }
59 *n_secs = 1;
50 return disk_secno; 60 return disk_secno;
51} 61}
52 62
@@ -67,10 +77,14 @@ static int hpfs_get_block(struct inode *inode, sector_t iblock, struct buffer_he
67{ 77{
68 int r; 78 int r;
69 secno s; 79 secno s;
80 unsigned n_secs;
70 hpfs_lock(inode->i_sb); 81 hpfs_lock(inode->i_sb);
71 s = hpfs_bmap(inode, iblock); 82 s = hpfs_bmap(inode, iblock, &n_secs);
72 if (s) { 83 if (s) {
84 if (bh_result->b_size >> 9 < n_secs)
85 n_secs = bh_result->b_size >> 9;
73 map_bh(bh_result, inode->i_sb, s); 86 map_bh(bh_result, inode->i_sb, s);
87 bh_result->b_size = n_secs << 9;
74 goto ret_0; 88 goto ret_0;
75 } 89 }
76 if (!create) goto ret_0; 90 if (!create) goto ret_0;
@@ -95,14 +109,26 @@ static int hpfs_get_block(struct inode *inode, sector_t iblock, struct buffer_he
95 return r; 109 return r;
96} 110}
97 111
112static int hpfs_readpage(struct file *file, struct page *page)
113{
114 return mpage_readpage(page, hpfs_get_block);
115}
116
98static int hpfs_writepage(struct page *page, struct writeback_control *wbc) 117static int hpfs_writepage(struct page *page, struct writeback_control *wbc)
99{ 118{
100 return block_write_full_page(page,hpfs_get_block, wbc); 119 return block_write_full_page(page, hpfs_get_block, wbc);
101} 120}
102 121
103static int hpfs_readpage(struct file *file, struct page *page) 122static int hpfs_readpages(struct file *file, struct address_space *mapping,
123 struct list_head *pages, unsigned nr_pages)
124{
125 return mpage_readpages(mapping, pages, nr_pages, hpfs_get_block);
126}
127
128static int hpfs_writepages(struct address_space *mapping,
129 struct writeback_control *wbc)
104{ 130{
105 return block_read_full_page(page,hpfs_get_block); 131 return mpage_writepages(mapping, wbc, hpfs_get_block);
106} 132}
107 133
108static void hpfs_write_failed(struct address_space *mapping, loff_t to) 134static void hpfs_write_failed(struct address_space *mapping, loff_t to)
@@ -161,6 +187,8 @@ static sector_t _hpfs_bmap(struct address_space *mapping, sector_t block)
161const struct address_space_operations hpfs_aops = { 187const struct address_space_operations hpfs_aops = {
162 .readpage = hpfs_readpage, 188 .readpage = hpfs_readpage,
163 .writepage = hpfs_writepage, 189 .writepage = hpfs_writepage,
190 .readpages = hpfs_readpages,
191 .writepages = hpfs_writepages,
164 .write_begin = hpfs_write_begin, 192 .write_begin = hpfs_write_begin,
165 .write_end = hpfs_write_end, 193 .write_end = hpfs_write_end,
166 .bmap = _hpfs_bmap 194 .bmap = _hpfs_bmap
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index b7ae286646b5..1b398636e990 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -27,8 +27,9 @@
27#define ALLOC_FWD_MAX 128 27#define ALLOC_FWD_MAX 128
28#define ALLOC_M 1 28#define ALLOC_M 1
29#define FNODE_RD_AHEAD 16 29#define FNODE_RD_AHEAD 16
30#define ANODE_RD_AHEAD 16 30#define ANODE_RD_AHEAD 0
31#define DNODE_RD_AHEAD 4 31#define DNODE_RD_AHEAD 72
32#define COUNT_RD_AHEAD 62
32 33
33#define FREE_DNODES_ADD 58 34#define FREE_DNODES_ADD 58
34#define FREE_DNODES_DEL 29 35#define FREE_DNODES_DEL 29
@@ -207,6 +208,7 @@ void hpfs_remove_fnode(struct super_block *, fnode_secno fno);
207 208
208/* buffer.c */ 209/* buffer.c */
209 210
211void hpfs_prefetch_sectors(struct super_block *, unsigned, int);
210void *hpfs_map_sector(struct super_block *, unsigned, struct buffer_head **, int); 212void *hpfs_map_sector(struct super_block *, unsigned, struct buffer_head **, int);
211void *hpfs_get_sector(struct super_block *, unsigned, struct buffer_head **); 213void *hpfs_get_sector(struct super_block *, unsigned, struct buffer_head **);
212void *hpfs_map_4sectors(struct super_block *, unsigned, struct quad_buffer_head *, int); 214void *hpfs_map_4sectors(struct super_block *, unsigned, struct quad_buffer_head *, int);
@@ -271,6 +273,7 @@ void hpfs_evict_inode(struct inode *);
271 273
272__le32 *hpfs_map_dnode_bitmap(struct super_block *, struct quad_buffer_head *); 274__le32 *hpfs_map_dnode_bitmap(struct super_block *, struct quad_buffer_head *);
273__le32 *hpfs_map_bitmap(struct super_block *, unsigned, struct quad_buffer_head *, char *); 275__le32 *hpfs_map_bitmap(struct super_block *, unsigned, struct quad_buffer_head *, char *);
276void hpfs_prefetch_bitmap(struct super_block *, unsigned);
274unsigned char *hpfs_load_code_page(struct super_block *, secno); 277unsigned char *hpfs_load_code_page(struct super_block *, secno);
275__le32 *hpfs_load_bitmap_directory(struct super_block *, secno bmp); 278__le32 *hpfs_load_bitmap_directory(struct super_block *, secno bmp);
276struct fnode *hpfs_map_fnode(struct super_block *s, ino_t, struct buffer_head **); 279struct fnode *hpfs_map_fnode(struct super_block *s, ino_t, struct buffer_head **);
diff --git a/fs/hpfs/map.c b/fs/hpfs/map.c
index 4acb19d78359..3aa66ae1031e 100644
--- a/fs/hpfs/map.c
+++ b/fs/hpfs/map.c
@@ -17,7 +17,9 @@ __le32 *hpfs_map_bitmap(struct super_block *s, unsigned bmp_block,
17 struct quad_buffer_head *qbh, char *id) 17 struct quad_buffer_head *qbh, char *id)
18{ 18{
19 secno sec; 19 secno sec;
20 if (hpfs_sb(s)->sb_chk) if (bmp_block * 16384 > hpfs_sb(s)->sb_fs_size) { 20 __le32 *ret;
21 unsigned n_bands = (hpfs_sb(s)->sb_fs_size + 0x3fff) >> 14;
22 if (hpfs_sb(s)->sb_chk) if (bmp_block >= n_bands) {
21 hpfs_error(s, "hpfs_map_bitmap called with bad parameter: %08x at %s", bmp_block, id); 23 hpfs_error(s, "hpfs_map_bitmap called with bad parameter: %08x at %s", bmp_block, id);
22 return NULL; 24 return NULL;
23 } 25 }
@@ -26,7 +28,23 @@ __le32 *hpfs_map_bitmap(struct super_block *s, unsigned bmp_block,
26 hpfs_error(s, "invalid bitmap block pointer %08x -> %08x at %s", bmp_block, sec, id); 28 hpfs_error(s, "invalid bitmap block pointer %08x -> %08x at %s", bmp_block, sec, id);
27 return NULL; 29 return NULL;
28 } 30 }
29 return hpfs_map_4sectors(s, sec, qbh, 4); 31 ret = hpfs_map_4sectors(s, sec, qbh, 4);
32 if (ret) hpfs_prefetch_bitmap(s, bmp_block + 1);
33 return ret;
34}
35
36void hpfs_prefetch_bitmap(struct super_block *s, unsigned bmp_block)
37{
38 unsigned to_prefetch, next_prefetch;
39 unsigned n_bands = (hpfs_sb(s)->sb_fs_size + 0x3fff) >> 14;
40 if (unlikely(bmp_block >= n_bands))
41 return;
42 to_prefetch = le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[bmp_block]);
43 if (unlikely(bmp_block + 1 >= n_bands))
44 next_prefetch = 0;
45 else
46 next_prefetch = le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[bmp_block + 1]);
47 hpfs_prefetch_sectors(s, to_prefetch, 4 + 4 * (to_prefetch + 4 == next_prefetch));
30} 48}
31 49
32/* 50/*
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index a0617e706957..4334cda8dba1 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -121,7 +121,7 @@ unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno)
121 unsigned long *bits; 121 unsigned long *bits;
122 unsigned count; 122 unsigned count;
123 123
124 bits = hpfs_map_4sectors(s, secno, &qbh, 4); 124 bits = hpfs_map_4sectors(s, secno, &qbh, 0);
125 if (!bits) 125 if (!bits)
126 return 0; 126 return 0;
127 count = bitmap_weight(bits, 2048 * BITS_PER_BYTE); 127 count = bitmap_weight(bits, 2048 * BITS_PER_BYTE);
@@ -134,8 +134,13 @@ static unsigned count_bitmaps(struct super_block *s)
134 unsigned n, count, n_bands; 134 unsigned n, count, n_bands;
135 n_bands = (hpfs_sb(s)->sb_fs_size + 0x3fff) >> 14; 135 n_bands = (hpfs_sb(s)->sb_fs_size + 0x3fff) >> 14;
136 count = 0; 136 count = 0;
137 for (n = 0; n < n_bands; n++) 137 for (n = 0; n < COUNT_RD_AHEAD; n++) {
138 hpfs_prefetch_bitmap(s, n);
139 }
140 for (n = 0; n < n_bands; n++) {
141 hpfs_prefetch_bitmap(s, n + COUNT_RD_AHEAD);
138 count += hpfs_count_one_bitmap(s, le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[n])); 142 count += hpfs_count_one_bitmap(s, le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[n]));
143 }
139 return count; 144 return count;
140} 145}
141 146
@@ -558,7 +563,13 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
558 sbi->sb_cp_table = NULL; 563 sbi->sb_cp_table = NULL;
559 sbi->sb_c_bitmap = -1; 564 sbi->sb_c_bitmap = -1;
560 sbi->sb_max_fwd_alloc = 0xffffff; 565 sbi->sb_max_fwd_alloc = 0xffffff;
561 566
567 if (sbi->sb_fs_size >= 0x80000000) {
568 hpfs_error(s, "invalid size in superblock: %08x",
569 (unsigned)sbi->sb_fs_size);
570 goto bail4;
571 }
572
562 /* Load bitmap directory */ 573 /* Load bitmap directory */
563 if (!(sbi->sb_bmp_dir = hpfs_load_bitmap_directory(s, le32_to_cpu(superblock->bitmaps)))) 574 if (!(sbi->sb_bmp_dir = hpfs_load_bitmap_directory(s, le32_to_cpu(superblock->bitmaps))))
564 goto bail4; 575 goto bail4;
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index fc90ab11c340..4338ff32959d 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -69,7 +69,7 @@ static char *dentry_name(struct dentry *dentry, int extra)
69 struct dentry *parent; 69 struct dentry *parent;
70 char *root, *name; 70 char *root, *name;
71 const char *seg_name; 71 const char *seg_name;
72 int len, seg_len; 72 int len, seg_len, root_len;
73 73
74 len = 0; 74 len = 0;
75 parent = dentry; 75 parent = dentry;
@@ -81,7 +81,8 @@ static char *dentry_name(struct dentry *dentry, int extra)
81 } 81 }
82 82
83 root = "proc"; 83 root = "proc";
84 len += strlen(root); 84 root_len = strlen(root);
85 len += root_len;
85 name = kmalloc(len + extra + 1, GFP_KERNEL); 86 name = kmalloc(len + extra + 1, GFP_KERNEL);
86 if (name == NULL) 87 if (name == NULL)
87 return NULL; 88 return NULL;
@@ -91,7 +92,7 @@ static char *dentry_name(struct dentry *dentry, int extra)
91 while (parent->d_parent != parent) { 92 while (parent->d_parent != parent) {
92 if (is_pid(parent)) { 93 if (is_pid(parent)) {
93 seg_name = "pid"; 94 seg_name = "pid";
94 seg_len = strlen("pid"); 95 seg_len = strlen(seg_name);
95 } 96 }
96 else { 97 else {
97 seg_name = parent->d_name.name; 98 seg_name = parent->d_name.name;
@@ -100,10 +101,10 @@ static char *dentry_name(struct dentry *dentry, int extra)
100 101
101 len -= seg_len + 1; 102 len -= seg_len + 1;
102 name[len] = '/'; 103 name[len] = '/';
103 strncpy(&name[len + 1], seg_name, seg_len); 104 memcpy(&name[len + 1], seg_name, seg_len);
104 parent = parent->d_parent; 105 parent = parent->d_parent;
105 } 106 }
106 strncpy(name, root, strlen(root)); 107 memcpy(name, root, root_len);
107 return name; 108 return name;
108} 109}
109 110
diff --git a/fs/inode.c b/fs/inode.c
index 00d5fc3b86e1..d6dfb09c8280 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -333,8 +333,10 @@ EXPORT_SYMBOL(set_nlink);
333 */ 333 */
334void inc_nlink(struct inode *inode) 334void inc_nlink(struct inode *inode)
335{ 335{
336 if (WARN_ON(inode->i_nlink == 0)) 336 if (unlikely(inode->i_nlink == 0)) {
337 WARN_ON(!(inode->i_state & I_LINKABLE));
337 atomic_long_dec(&inode->i_sb->s_remove_count); 338 atomic_long_dec(&inode->i_sb->s_remove_count);
339 }
338 340
339 inode->__i_nlink++; 341 inode->__i_nlink++;
340} 342}
diff --git a/fs/internal.h b/fs/internal.h
index 68121584ae37..7c5f01cf619d 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -96,11 +96,12 @@ struct open_flags {
96 umode_t mode; 96 umode_t mode;
97 int acc_mode; 97 int acc_mode;
98 int intent; 98 int intent;
99 int lookup_flags;
99}; 100};
100extern struct file *do_filp_open(int dfd, struct filename *pathname, 101extern struct file *do_filp_open(int dfd, struct filename *pathname,
101 const struct open_flags *op, int flags); 102 const struct open_flags *op);
102extern struct file *do_file_open_root(struct dentry *, struct vfsmount *, 103extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
103 const char *, const struct open_flags *, int lookup_flags); 104 const char *, const struct open_flags *);
104 105
105extern long do_handle_open(int mountdirfd, 106extern long do_handle_open(int mountdirfd,
106 struct file_handle __user *ufh, int open_flag); 107 struct file_handle __user *ufh, int open_flag);
@@ -130,6 +131,7 @@ extern struct dentry *__d_alloc(struct super_block *, const struct qstr *);
130 * read_write.c 131 * read_write.c
131 */ 132 */
132extern ssize_t __kernel_write(struct file *, const char *, size_t, loff_t *); 133extern ssize_t __kernel_write(struct file *, const char *, size_t, loff_t *);
134extern int rw_verify_area(int, struct file *, const loff_t *, size_t);
133 135
134/* 136/*
135 * splice.c 137 * splice.c
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index d9b8aebdeb22..c348d6d88624 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -28,31 +28,23 @@
28 28
29#define BEQUIET 29#define BEQUIET
30 30
31static int isofs_hashi(const struct dentry *parent, const struct inode *inode, 31static int isofs_hashi(const struct dentry *parent, struct qstr *qstr);
32 struct qstr *qstr); 32static int isofs_hash(const struct dentry *parent, struct qstr *qstr);
33static int isofs_hash(const struct dentry *parent, const struct inode *inode,
34 struct qstr *qstr);
35static int isofs_dentry_cmpi(const struct dentry *parent, 33static int isofs_dentry_cmpi(const struct dentry *parent,
36 const struct inode *pinode, 34 const struct dentry *dentry,
37 const struct dentry *dentry, const struct inode *inode,
38 unsigned int len, const char *str, const struct qstr *name); 35 unsigned int len, const char *str, const struct qstr *name);
39static int isofs_dentry_cmp(const struct dentry *parent, 36static int isofs_dentry_cmp(const struct dentry *parent,
40 const struct inode *pinode, 37 const struct dentry *dentry,
41 const struct dentry *dentry, const struct inode *inode,
42 unsigned int len, const char *str, const struct qstr *name); 38 unsigned int len, const char *str, const struct qstr *name);
43 39
44#ifdef CONFIG_JOLIET 40#ifdef CONFIG_JOLIET
45static int isofs_hashi_ms(const struct dentry *parent, const struct inode *inode, 41static int isofs_hashi_ms(const struct dentry *parent, struct qstr *qstr);
46 struct qstr *qstr); 42static int isofs_hash_ms(const struct dentry *parent, struct qstr *qstr);
47static int isofs_hash_ms(const struct dentry *parent, const struct inode *inode,
48 struct qstr *qstr);
49static int isofs_dentry_cmpi_ms(const struct dentry *parent, 43static int isofs_dentry_cmpi_ms(const struct dentry *parent,
50 const struct inode *pinode, 44 const struct dentry *dentry,
51 const struct dentry *dentry, const struct inode *inode,
52 unsigned int len, const char *str, const struct qstr *name); 45 unsigned int len, const char *str, const struct qstr *name);
53static int isofs_dentry_cmp_ms(const struct dentry *parent, 46static int isofs_dentry_cmp_ms(const struct dentry *parent,
54 const struct inode *pinode, 47 const struct dentry *dentry,
55 const struct dentry *dentry, const struct inode *inode,
56 unsigned int len, const char *str, const struct qstr *name); 48 unsigned int len, const char *str, const struct qstr *name);
57#endif 49#endif
58 50
@@ -265,30 +257,26 @@ static int isofs_dentry_cmp_common(
265} 257}
266 258
267static int 259static int
268isofs_hash(const struct dentry *dentry, const struct inode *inode, 260isofs_hash(const struct dentry *dentry, struct qstr *qstr)
269 struct qstr *qstr)
270{ 261{
271 return isofs_hash_common(dentry, qstr, 0); 262 return isofs_hash_common(dentry, qstr, 0);
272} 263}
273 264
274static int 265static int
275isofs_hashi(const struct dentry *dentry, const struct inode *inode, 266isofs_hashi(const struct dentry *dentry, struct qstr *qstr)
276 struct qstr *qstr)
277{ 267{
278 return isofs_hashi_common(dentry, qstr, 0); 268 return isofs_hashi_common(dentry, qstr, 0);
279} 269}
280 270
281static int 271static int
282isofs_dentry_cmp(const struct dentry *parent, const struct inode *pinode, 272isofs_dentry_cmp(const struct dentry *parent, const struct dentry *dentry,
283 const struct dentry *dentry, const struct inode *inode,
284 unsigned int len, const char *str, const struct qstr *name) 273 unsigned int len, const char *str, const struct qstr *name)
285{ 274{
286 return isofs_dentry_cmp_common(len, str, name, 0, 0); 275 return isofs_dentry_cmp_common(len, str, name, 0, 0);
287} 276}
288 277
289static int 278static int
290isofs_dentry_cmpi(const struct dentry *parent, const struct inode *pinode, 279isofs_dentry_cmpi(const struct dentry *parent, const struct dentry *dentry,
291 const struct dentry *dentry, const struct inode *inode,
292 unsigned int len, const char *str, const struct qstr *name) 280 unsigned int len, const char *str, const struct qstr *name)
293{ 281{
294 return isofs_dentry_cmp_common(len, str, name, 0, 1); 282 return isofs_dentry_cmp_common(len, str, name, 0, 1);
@@ -296,30 +284,26 @@ isofs_dentry_cmpi(const struct dentry *parent, const struct inode *pinode,
296 284
297#ifdef CONFIG_JOLIET 285#ifdef CONFIG_JOLIET
298static int 286static int
299isofs_hash_ms(const struct dentry *dentry, const struct inode *inode, 287isofs_hash_ms(const struct dentry *dentry, struct qstr *qstr)
300 struct qstr *qstr)
301{ 288{
302 return isofs_hash_common(dentry, qstr, 1); 289 return isofs_hash_common(dentry, qstr, 1);
303} 290}
304 291
305static int 292static int
306isofs_hashi_ms(const struct dentry *dentry, const struct inode *inode, 293isofs_hashi_ms(const struct dentry *dentry, struct qstr *qstr)
307 struct qstr *qstr)
308{ 294{
309 return isofs_hashi_common(dentry, qstr, 1); 295 return isofs_hashi_common(dentry, qstr, 1);
310} 296}
311 297
312static int 298static int
313isofs_dentry_cmp_ms(const struct dentry *parent, const struct inode *pinode, 299isofs_dentry_cmp_ms(const struct dentry *parent, const struct dentry *dentry,
314 const struct dentry *dentry, const struct inode *inode,
315 unsigned int len, const char *str, const struct qstr *name) 300 unsigned int len, const char *str, const struct qstr *name)
316{ 301{
317 return isofs_dentry_cmp_common(len, str, name, 1, 0); 302 return isofs_dentry_cmp_common(len, str, name, 1, 0);
318} 303}
319 304
320static int 305static int
321isofs_dentry_cmpi_ms(const struct dentry *parent, const struct inode *pinode, 306isofs_dentry_cmpi_ms(const struct dentry *parent, const struct dentry *dentry,
322 const struct dentry *dentry, const struct inode *inode,
323 unsigned int len, const char *str, const struct qstr *name) 307 unsigned int len, const char *str, const struct qstr *name)
324{ 308{
325 return isofs_dentry_cmp_common(len, str, name, 1, 1); 309 return isofs_dentry_cmp_common(len, str, name, 1, 1);
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index c167028844ed..95295640d9c8 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -37,8 +37,7 @@ isofs_cmp(struct dentry *dentry, const char *compare, int dlen)
37 37
38 qstr.name = compare; 38 qstr.name = compare;
39 qstr.len = dlen; 39 qstr.len = dlen;
40 return dentry->d_op->d_compare(NULL, NULL, NULL, NULL, 40 return dentry->d_op->d_compare(NULL, NULL, dentry->d_name.len, dentry->d_name.name, &qstr);
41 dentry->d_name.len, dentry->d_name.name, &qstr);
42} 41}
43 42
44/* 43/*
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 9a55f53be5ff..370d7b6c5942 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -346,8 +346,7 @@ int dbFree(struct inode *ip, s64 blkno, s64 nblocks)
346 printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n", 346 printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n",
347 (unsigned long long) blkno, 347 (unsigned long long) blkno,
348 (unsigned long long) nblocks); 348 (unsigned long long) nblocks);
349 jfs_error(ip->i_sb, 349 jfs_error(ip->i_sb, "block to be freed is outside the map\n");
350 "dbFree: block to be freed is outside the map");
351 return -EIO; 350 return -EIO;
352 } 351 }
353 352
@@ -384,7 +383,7 @@ int dbFree(struct inode *ip, s64 blkno, s64 nblocks)
384 383
385 /* free the blocks. */ 384 /* free the blocks. */
386 if ((rc = dbFreeDmap(bmp, dp, blkno, nb))) { 385 if ((rc = dbFreeDmap(bmp, dp, blkno, nb))) {
387 jfs_error(ip->i_sb, "dbFree: error in block map\n"); 386 jfs_error(ip->i_sb, "error in block map\n");
388 release_metapage(mp); 387 release_metapage(mp);
389 IREAD_UNLOCK(ipbmap); 388 IREAD_UNLOCK(ipbmap);
390 return (rc); 389 return (rc);
@@ -441,8 +440,7 @@ dbUpdatePMap(struct inode *ipbmap,
441 printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n", 440 printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n",
442 (unsigned long long) blkno, 441 (unsigned long long) blkno,
443 (unsigned long long) nblocks); 442 (unsigned long long) nblocks);
444 jfs_error(ipbmap->i_sb, 443 jfs_error(ipbmap->i_sb, "blocks are outside the map\n");
445 "dbUpdatePMap: blocks are outside the map");
446 return -EIO; 444 return -EIO;
447 } 445 }
448 446
@@ -726,7 +724,7 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
726 724
727 /* the hint should be within the map */ 725 /* the hint should be within the map */
728 if (hint >= mapSize) { 726 if (hint >= mapSize) {
729 jfs_error(ip->i_sb, "dbAlloc: the hint is outside the map"); 727 jfs_error(ip->i_sb, "the hint is outside the map\n");
730 return -EIO; 728 return -EIO;
731 } 729 }
732 730
@@ -1057,8 +1055,7 @@ static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks)
1057 bmp = sbi->bmap; 1055 bmp = sbi->bmap;
1058 if (lastblkno < 0 || lastblkno >= bmp->db_mapsize) { 1056 if (lastblkno < 0 || lastblkno >= bmp->db_mapsize) {
1059 IREAD_UNLOCK(ipbmap); 1057 IREAD_UNLOCK(ipbmap);
1060 jfs_error(ip->i_sb, 1058 jfs_error(ip->i_sb, "the block is outside the filesystem\n");
1061 "dbExtend: the block is outside the filesystem");
1062 return -EIO; 1059 return -EIO;
1063 } 1060 }
1064 1061
@@ -1134,8 +1131,7 @@ static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno,
1134 u32 mask; 1131 u32 mask;
1135 1132
1136 if (dp->tree.leafidx != cpu_to_le32(LEAFIND)) { 1133 if (dp->tree.leafidx != cpu_to_le32(LEAFIND)) {
1137 jfs_error(bmp->db_ipbmap->i_sb, 1134 jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmap page\n");
1138 "dbAllocNext: Corrupt dmap page");
1139 return -EIO; 1135 return -EIO;
1140 } 1136 }
1141 1137
@@ -1265,8 +1261,7 @@ dbAllocNear(struct bmap * bmp,
1265 s8 *leaf; 1261 s8 *leaf;
1266 1262
1267 if (dp->tree.leafidx != cpu_to_le32(LEAFIND)) { 1263 if (dp->tree.leafidx != cpu_to_le32(LEAFIND)) {
1268 jfs_error(bmp->db_ipbmap->i_sb, 1264 jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmap page\n");
1269 "dbAllocNear: Corrupt dmap page");
1270 return -EIO; 1265 return -EIO;
1271 } 1266 }
1272 1267
@@ -1381,8 +1376,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
1381 */ 1376 */
1382 if (l2nb > bmp->db_agl2size) { 1377 if (l2nb > bmp->db_agl2size) {
1383 jfs_error(bmp->db_ipbmap->i_sb, 1378 jfs_error(bmp->db_ipbmap->i_sb,
1384 "dbAllocAG: allocation request is larger than the " 1379 "allocation request is larger than the allocation group size\n");
1385 "allocation group size");
1386 return -EIO; 1380 return -EIO;
1387 } 1381 }
1388 1382
@@ -1417,7 +1411,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
1417 (unsigned long long) blkno, 1411 (unsigned long long) blkno,
1418 (unsigned long long) nblocks); 1412 (unsigned long long) nblocks);
1419 jfs_error(bmp->db_ipbmap->i_sb, 1413 jfs_error(bmp->db_ipbmap->i_sb,
1420 "dbAllocAG: dbAllocCtl failed in free AG"); 1414 "dbAllocCtl failed in free AG\n");
1421 } 1415 }
1422 return (rc); 1416 return (rc);
1423 } 1417 }
@@ -1433,8 +1427,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
1433 budmin = dcp->budmin; 1427 budmin = dcp->budmin;
1434 1428
1435 if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) { 1429 if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) {
1436 jfs_error(bmp->db_ipbmap->i_sb, 1430 jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmapctl page\n");
1437 "dbAllocAG: Corrupt dmapctl page");
1438 release_metapage(mp); 1431 release_metapage(mp);
1439 return -EIO; 1432 return -EIO;
1440 } 1433 }
@@ -1475,7 +1468,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
1475 } 1468 }
1476 if (n == 4) { 1469 if (n == 4) {
1477 jfs_error(bmp->db_ipbmap->i_sb, 1470 jfs_error(bmp->db_ipbmap->i_sb,
1478 "dbAllocAG: failed descending stree"); 1471 "failed descending stree\n");
1479 release_metapage(mp); 1472 release_metapage(mp);
1480 return -EIO; 1473 return -EIO;
1481 } 1474 }
@@ -1515,8 +1508,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
1515 &blkno))) { 1508 &blkno))) {
1516 if (rc == -ENOSPC) { 1509 if (rc == -ENOSPC) {
1517 jfs_error(bmp->db_ipbmap->i_sb, 1510 jfs_error(bmp->db_ipbmap->i_sb,
1518 "dbAllocAG: control page " 1511 "control page inconsistent\n");
1519 "inconsistent");
1520 return -EIO; 1512 return -EIO;
1521 } 1513 }
1522 return (rc); 1514 return (rc);
@@ -1528,7 +1520,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
1528 rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results); 1520 rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results);
1529 if (rc == -ENOSPC) { 1521 if (rc == -ENOSPC) {
1530 jfs_error(bmp->db_ipbmap->i_sb, 1522 jfs_error(bmp->db_ipbmap->i_sb,
1531 "dbAllocAG: unable to allocate blocks"); 1523 "unable to allocate blocks\n");
1532 rc = -EIO; 1524 rc = -EIO;
1533 } 1525 }
1534 return (rc); 1526 return (rc);
@@ -1587,8 +1579,7 @@ static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results)
1587 */ 1579 */
1588 rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results); 1580 rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results);
1589 if (rc == -ENOSPC) { 1581 if (rc == -ENOSPC) {
1590 jfs_error(bmp->db_ipbmap->i_sb, 1582 jfs_error(bmp->db_ipbmap->i_sb, "unable to allocate blocks\n");
1591 "dbAllocAny: unable to allocate blocks");
1592 return -EIO; 1583 return -EIO;
1593 } 1584 }
1594 return (rc); 1585 return (rc);
@@ -1652,8 +1643,7 @@ s64 dbDiscardAG(struct inode *ip, int agno, s64 minlen)
1652 range_cnt = min_t(u64, max_ranges + 1, 32 * 1024); 1643 range_cnt = min_t(u64, max_ranges + 1, 32 * 1024);
1653 totrim = kmalloc(sizeof(struct range2trim) * range_cnt, GFP_NOFS); 1644 totrim = kmalloc(sizeof(struct range2trim) * range_cnt, GFP_NOFS);
1654 if (totrim == NULL) { 1645 if (totrim == NULL) {
1655 jfs_error(bmp->db_ipbmap->i_sb, 1646 jfs_error(bmp->db_ipbmap->i_sb, "no memory for trim array\n");
1656 "dbDiscardAG: no memory for trim array");
1657 IWRITE_UNLOCK(ipbmap); 1647 IWRITE_UNLOCK(ipbmap);
1658 return 0; 1648 return 0;
1659 } 1649 }
@@ -1682,8 +1672,7 @@ s64 dbDiscardAG(struct inode *ip, int agno, s64 minlen)
1682 nblocks = 1 << l2nb; 1672 nblocks = 1 << l2nb;
1683 } else { 1673 } else {
1684 /* Trim any already allocated blocks */ 1674 /* Trim any already allocated blocks */
1685 jfs_error(bmp->db_ipbmap->i_sb, 1675 jfs_error(bmp->db_ipbmap->i_sb, "-EIO\n");
1686 "dbDiscardAG: -EIO");
1687 break; 1676 break;
1688 } 1677 }
1689 1678
@@ -1761,7 +1750,7 @@ static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno)
1761 1750
1762 if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) { 1751 if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) {
1763 jfs_error(bmp->db_ipbmap->i_sb, 1752 jfs_error(bmp->db_ipbmap->i_sb,
1764 "dbFindCtl: Corrupt dmapctl page"); 1753 "Corrupt dmapctl page\n");
1765 release_metapage(mp); 1754 release_metapage(mp);
1766 return -EIO; 1755 return -EIO;
1767 } 1756 }
@@ -1782,7 +1771,7 @@ static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno)
1782 if (rc) { 1771 if (rc) {
1783 if (lev != level) { 1772 if (lev != level) {
1784 jfs_error(bmp->db_ipbmap->i_sb, 1773 jfs_error(bmp->db_ipbmap->i_sb,
1785 "dbFindCtl: dmap inconsistent"); 1774 "dmap inconsistent\n");
1786 return -EIO; 1775 return -EIO;
1787 } 1776 }
1788 return -ENOSPC; 1777 return -ENOSPC;
@@ -1906,7 +1895,7 @@ dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results)
1906 if (dp->tree.stree[ROOT] != L2BPERDMAP) { 1895 if (dp->tree.stree[ROOT] != L2BPERDMAP) {
1907 release_metapage(mp); 1896 release_metapage(mp);
1908 jfs_error(bmp->db_ipbmap->i_sb, 1897 jfs_error(bmp->db_ipbmap->i_sb,
1909 "dbAllocCtl: the dmap is not all free"); 1898 "the dmap is not all free\n");
1910 rc = -EIO; 1899 rc = -EIO;
1911 goto backout; 1900 goto backout;
1912 } 1901 }
@@ -1953,7 +1942,7 @@ dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results)
1953 * to indicate that we have leaked blocks. 1942 * to indicate that we have leaked blocks.
1954 */ 1943 */
1955 jfs_error(bmp->db_ipbmap->i_sb, 1944 jfs_error(bmp->db_ipbmap->i_sb,
1956 "dbAllocCtl: I/O Error: Block Leakage."); 1945 "I/O Error: Block Leakage\n");
1957 continue; 1946 continue;
1958 } 1947 }
1959 dp = (struct dmap *) mp->data; 1948 dp = (struct dmap *) mp->data;
@@ -1965,8 +1954,7 @@ dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results)
1965 * to indicate that we have leaked blocks. 1954 * to indicate that we have leaked blocks.
1966 */ 1955 */
1967 release_metapage(mp); 1956 release_metapage(mp);
1968 jfs_error(bmp->db_ipbmap->i_sb, 1957 jfs_error(bmp->db_ipbmap->i_sb, "Block Leakage\n");
1969 "dbAllocCtl: Block Leakage.");
1970 continue; 1958 continue;
1971 } 1959 }
1972 1960
@@ -2263,8 +2251,7 @@ static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
2263 for (; nwords > 0; nwords -= nw) { 2251 for (; nwords > 0; nwords -= nw) {
2264 if (leaf[word] < BUDMIN) { 2252 if (leaf[word] < BUDMIN) {
2265 jfs_error(bmp->db_ipbmap->i_sb, 2253 jfs_error(bmp->db_ipbmap->i_sb,
2266 "dbAllocBits: leaf page " 2254 "leaf page corrupt\n");
2267 "corrupt");
2268 break; 2255 break;
2269 } 2256 }
2270 2257
@@ -2536,8 +2523,7 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level)
2536 dcp = (struct dmapctl *) mp->data; 2523 dcp = (struct dmapctl *) mp->data;
2537 2524
2538 if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) { 2525 if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) {
2539 jfs_error(bmp->db_ipbmap->i_sb, 2526 jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmapctl page\n");
2540 "dbAdjCtl: Corrupt dmapctl page");
2541 release_metapage(mp); 2527 release_metapage(mp);
2542 return -EIO; 2528 return -EIO;
2543 } 2529 }
@@ -2638,8 +2624,7 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level)
2638 assert(level == bmp->db_maxlevel); 2624 assert(level == bmp->db_maxlevel);
2639 if (bmp->db_maxfreebud != oldroot) { 2625 if (bmp->db_maxfreebud != oldroot) {
2640 jfs_error(bmp->db_ipbmap->i_sb, 2626 jfs_error(bmp->db_ipbmap->i_sb,
2641 "dbAdjCtl: the maximum free buddy is " 2627 "the maximum free buddy is not the old root\n");
2642 "not the old root");
2643 } 2628 }
2644 bmp->db_maxfreebud = dcp->stree[ROOT]; 2629 bmp->db_maxfreebud = dcp->stree[ROOT];
2645 } 2630 }
@@ -3481,7 +3466,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks)
3481 p = BMAPBLKNO + nbperpage; /* L2 page */ 3466 p = BMAPBLKNO + nbperpage; /* L2 page */
3482 l2mp = read_metapage(ipbmap, p, PSIZE, 0); 3467 l2mp = read_metapage(ipbmap, p, PSIZE, 0);
3483 if (!l2mp) { 3468 if (!l2mp) {
3484 jfs_error(ipbmap->i_sb, "dbExtendFS: L2 page could not be read"); 3469 jfs_error(ipbmap->i_sb, "L2 page could not be read\n");
3485 return -EIO; 3470 return -EIO;
3486 } 3471 }
3487 l2dcp = (struct dmapctl *) l2mp->data; 3472 l2dcp = (struct dmapctl *) l2mp->data;
@@ -3646,8 +3631,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks)
3646 } 3631 }
3647 } /* for each L1 in a L2 */ 3632 } /* for each L1 in a L2 */
3648 3633
3649 jfs_error(ipbmap->i_sb, 3634 jfs_error(ipbmap->i_sb, "function has not returned as expected\n");
3650 "dbExtendFS: function has not returned as expected");
3651errout: 3635errout:
3652 if (l0mp) 3636 if (l0mp)
3653 release_metapage(l0mp); 3637 release_metapage(l0mp);
@@ -3717,7 +3701,7 @@ void dbFinalizeBmap(struct inode *ipbmap)
3717 } 3701 }
3718 if (bmp->db_agpref >= bmp->db_numag) { 3702 if (bmp->db_agpref >= bmp->db_numag) {
3719 jfs_error(ipbmap->i_sb, 3703 jfs_error(ipbmap->i_sb,
3720 "cannot find ag with average freespace"); 3704 "cannot find ag with average freespace\n");
3721 } 3705 }
3722 } 3706 }
3723 3707
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index 9f4ed13d9f15..8743ba9c6742 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -124,21 +124,21 @@ struct dtsplit {
124#define DT_PAGE(IP, MP) BT_PAGE(IP, MP, dtpage_t, i_dtroot) 124#define DT_PAGE(IP, MP) BT_PAGE(IP, MP, dtpage_t, i_dtroot)
125 125
126/* get page buffer for specified block address */ 126/* get page buffer for specified block address */
127#define DT_GETPAGE(IP, BN, MP, SIZE, P, RC)\ 127#define DT_GETPAGE(IP, BN, MP, SIZE, P, RC) \
128{\ 128do { \
129 BT_GETPAGE(IP, BN, MP, dtpage_t, SIZE, P, RC, i_dtroot)\ 129 BT_GETPAGE(IP, BN, MP, dtpage_t, SIZE, P, RC, i_dtroot); \
130 if (!(RC))\ 130 if (!(RC)) { \
131 {\ 131 if (((P)->header.nextindex > \
132 if (((P)->header.nextindex > (((BN)==0)?DTROOTMAXSLOT:(P)->header.maxslot)) ||\ 132 (((BN) == 0) ? DTROOTMAXSLOT : (P)->header.maxslot)) || \
133 ((BN) && ((P)->header.maxslot > DTPAGEMAXSLOT)))\ 133 ((BN) && ((P)->header.maxslot > DTPAGEMAXSLOT))) { \
134 {\ 134 BT_PUTPAGE(MP); \
135 BT_PUTPAGE(MP);\ 135 jfs_error((IP)->i_sb, \
136 jfs_error((IP)->i_sb, "DT_GETPAGE: dtree page corrupt");\ 136 "DT_GETPAGE: dtree page corrupt\n"); \
137 MP = NULL;\ 137 MP = NULL; \
138 RC = -EIO;\ 138 RC = -EIO; \
139 }\ 139 } \
140 }\ 140 } \
141} 141} while (0)
142 142
143/* for consistency */ 143/* for consistency */
144#define DT_PUTPAGE(MP) BT_PUTPAGE(MP) 144#define DT_PUTPAGE(MP) BT_PUTPAGE(MP)
@@ -776,7 +776,7 @@ int dtSearch(struct inode *ip, struct component_name * key, ino_t * data,
776 /* Something's corrupted, mark filesystem dirty so 776 /* Something's corrupted, mark filesystem dirty so
777 * chkdsk will fix it. 777 * chkdsk will fix it.
778 */ 778 */
779 jfs_error(sb, "stack overrun in dtSearch!"); 779 jfs_error(sb, "stack overrun!\n");
780 BT_STACK_DUMP(btstack); 780 BT_STACK_DUMP(btstack);
781 rc = -EIO; 781 rc = -EIO;
782 goto out; 782 goto out;
@@ -3247,8 +3247,7 @@ int jfs_readdir(struct file *file, struct dir_context *ctx)
3247 /* Sanity Check */ 3247 /* Sanity Check */
3248 if (d_namleft == 0) { 3248 if (d_namleft == 0) {
3249 jfs_error(ip->i_sb, 3249 jfs_error(ip->i_sb,
3250 "JFS:Dtree error: ino = " 3250 "JFS:Dtree error: ino = %ld, bn=%lld, index = %d\n",
3251 "%ld, bn=%Ld, index = %d",
3252 (long)ip->i_ino, 3251 (long)ip->i_ino,
3253 (long long)bn, 3252 (long long)bn,
3254 i); 3253 i);
@@ -3368,7 +3367,7 @@ static int dtReadFirst(struct inode *ip, struct btstack * btstack)
3368 */ 3367 */
3369 if (BT_STACK_FULL(btstack)) { 3368 if (BT_STACK_FULL(btstack)) {
3370 DT_PUTPAGE(mp); 3369 DT_PUTPAGE(mp);
3371 jfs_error(ip->i_sb, "dtReadFirst: btstack overrun"); 3370 jfs_error(ip->i_sb, "btstack overrun\n");
3372 BT_STACK_DUMP(btstack); 3371 BT_STACK_DUMP(btstack);
3373 return -EIO; 3372 return -EIO;
3374 } 3373 }
diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c
index e5fe8506ed16..2ae7d59ab10a 100644
--- a/fs/jfs/jfs_extent.c
+++ b/fs/jfs/jfs_extent.c
@@ -388,7 +388,7 @@ int extHint(struct inode *ip, s64 offset, xad_t * xp)
388 388
389 if ((rc == 0) && xlen) { 389 if ((rc == 0) && xlen) {
390 if (xlen != nbperpage) { 390 if (xlen != nbperpage) {
391 jfs_error(ip->i_sb, "extHint: corrupt xtree"); 391 jfs_error(ip->i_sb, "corrupt xtree\n");
392 rc = -EIO; 392 rc = -EIO;
393 } 393 }
394 XADaddress(xp, xaddr); 394 XADaddress(xp, xaddr);
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index f7e042b63ddb..f321986e73d2 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -386,7 +386,7 @@ int diRead(struct inode *ip)
386 dp += rel_inode; 386 dp += rel_inode;
387 387
388 if (ip->i_ino != le32_to_cpu(dp->di_number)) { 388 if (ip->i_ino != le32_to_cpu(dp->di_number)) {
389 jfs_error(ip->i_sb, "diRead: i_ino != di_number"); 389 jfs_error(ip->i_sb, "i_ino != di_number\n");
390 rc = -EIO; 390 rc = -EIO;
391 } else if (le32_to_cpu(dp->di_nlink) == 0) 391 } else if (le32_to_cpu(dp->di_nlink) == 0)
392 rc = -ESTALE; 392 rc = -ESTALE;
@@ -625,7 +625,7 @@ int diWrite(tid_t tid, struct inode *ip)
625 if (!addressPXD(&(jfs_ip->ixpxd)) || 625 if (!addressPXD(&(jfs_ip->ixpxd)) ||
626 (lengthPXD(&(jfs_ip->ixpxd)) != 626 (lengthPXD(&(jfs_ip->ixpxd)) !=
627 JFS_IP(ipimap)->i_imap->im_nbperiext)) { 627 JFS_IP(ipimap)->i_imap->im_nbperiext)) {
628 jfs_error(ip->i_sb, "diWrite: ixpxd invalid"); 628 jfs_error(ip->i_sb, "ixpxd invalid\n");
629 return -EIO; 629 return -EIO;
630 } 630 }
631 631
@@ -893,8 +893,7 @@ int diFree(struct inode *ip)
893 if (iagno >= imap->im_nextiag) { 893 if (iagno >= imap->im_nextiag) {
894 print_hex_dump(KERN_ERR, "imap: ", DUMP_PREFIX_ADDRESS, 16, 4, 894 print_hex_dump(KERN_ERR, "imap: ", DUMP_PREFIX_ADDRESS, 16, 4,
895 imap, 32, 0); 895 imap, 32, 0);
896 jfs_error(ip->i_sb, 896 jfs_error(ip->i_sb, "inum = %d, iagno = %d, nextiag = %d\n",
897 "diFree: inum = %d, iagno = %d, nextiag = %d",
898 (uint) inum, iagno, imap->im_nextiag); 897 (uint) inum, iagno, imap->im_nextiag);
899 return -EIO; 898 return -EIO;
900 } 899 }
@@ -930,15 +929,14 @@ int diFree(struct inode *ip)
930 mask = HIGHORDER >> bitno; 929 mask = HIGHORDER >> bitno;
931 930
932 if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) { 931 if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) {
933 jfs_error(ip->i_sb, 932 jfs_error(ip->i_sb, "wmap shows inode already free\n");
934 "diFree: wmap shows inode already free");
935 } 933 }
936 934
937 if (!addressPXD(&iagp->inoext[extno])) { 935 if (!addressPXD(&iagp->inoext[extno])) {
938 release_metapage(mp); 936 release_metapage(mp);
939 IREAD_UNLOCK(ipimap); 937 IREAD_UNLOCK(ipimap);
940 AG_UNLOCK(imap, agno); 938 AG_UNLOCK(imap, agno);
941 jfs_error(ip->i_sb, "diFree: invalid inoext"); 939 jfs_error(ip->i_sb, "invalid inoext\n");
942 return -EIO; 940 return -EIO;
943 } 941 }
944 942
@@ -950,7 +948,7 @@ int diFree(struct inode *ip)
950 release_metapage(mp); 948 release_metapage(mp);
951 IREAD_UNLOCK(ipimap); 949 IREAD_UNLOCK(ipimap);
952 AG_UNLOCK(imap, agno); 950 AG_UNLOCK(imap, agno);
953 jfs_error(ip->i_sb, "diFree: numfree > numinos"); 951 jfs_error(ip->i_sb, "numfree > numinos\n");
954 return -EIO; 952 return -EIO;
955 } 953 }
956 /* 954 /*
@@ -1199,7 +1197,7 @@ int diFree(struct inode *ip)
1199 * for the inode being freed. 1197 * for the inode being freed.
1200 */ 1198 */
1201 if (iagp->pmap[extno] != 0) { 1199 if (iagp->pmap[extno] != 0) {
1202 jfs_error(ip->i_sb, "diFree: the pmap does not show inode free"); 1200 jfs_error(ip->i_sb, "the pmap does not show inode free\n");
1203 } 1201 }
1204 iagp->wmap[extno] = 0; 1202 iagp->wmap[extno] = 0;
1205 PXDlength(&iagp->inoext[extno], 0); 1203 PXDlength(&iagp->inoext[extno], 0);
@@ -1518,8 +1516,7 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip)
1518 release_metapage(mp); 1516 release_metapage(mp);
1519 AG_UNLOCK(imap, agno); 1517 AG_UNLOCK(imap, agno);
1520 jfs_error(ip->i_sb, 1518 jfs_error(ip->i_sb,
1521 "diAlloc: can't find free bit " 1519 "can't find free bit in wmap\n");
1522 "in wmap");
1523 return -EIO; 1520 return -EIO;
1524 } 1521 }
1525 1522
@@ -1660,7 +1657,7 @@ diAllocAG(struct inomap * imap, int agno, bool dir, struct inode *ip)
1660 numinos = imap->im_agctl[agno].numinos; 1657 numinos = imap->im_agctl[agno].numinos;
1661 1658
1662 if (numfree > numinos) { 1659 if (numfree > numinos) {
1663 jfs_error(ip->i_sb, "diAllocAG: numfree > numinos"); 1660 jfs_error(ip->i_sb, "numfree > numinos\n");
1664 return -EIO; 1661 return -EIO;
1665 } 1662 }
1666 1663
@@ -1811,8 +1808,7 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip)
1811 if (!iagp->nfreeinos) { 1808 if (!iagp->nfreeinos) {
1812 IREAD_UNLOCK(imap->im_ipimap); 1809 IREAD_UNLOCK(imap->im_ipimap);
1813 release_metapage(mp); 1810 release_metapage(mp);
1814 jfs_error(ip->i_sb, 1811 jfs_error(ip->i_sb, "nfreeinos = 0, but iag on freelist\n");
1815 "diAllocIno: nfreeinos = 0, but iag on freelist");
1816 return -EIO; 1812 return -EIO;
1817 } 1813 }
1818 1814
@@ -1824,7 +1820,7 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip)
1824 IREAD_UNLOCK(imap->im_ipimap); 1820 IREAD_UNLOCK(imap->im_ipimap);
1825 release_metapage(mp); 1821 release_metapage(mp);
1826 jfs_error(ip->i_sb, 1822 jfs_error(ip->i_sb,
1827 "diAllocIno: free inode not found in summary map"); 1823 "free inode not found in summary map\n");
1828 return -EIO; 1824 return -EIO;
1829 } 1825 }
1830 1826
@@ -1839,7 +1835,7 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip)
1839 if (rem >= EXTSPERSUM) { 1835 if (rem >= EXTSPERSUM) {
1840 IREAD_UNLOCK(imap->im_ipimap); 1836 IREAD_UNLOCK(imap->im_ipimap);
1841 release_metapage(mp); 1837 release_metapage(mp);
1842 jfs_error(ip->i_sb, "diAllocIno: no free extent found"); 1838 jfs_error(ip->i_sb, "no free extent found\n");
1843 return -EIO; 1839 return -EIO;
1844 } 1840 }
1845 extno = (sword << L2EXTSPERSUM) + rem; 1841 extno = (sword << L2EXTSPERSUM) + rem;
@@ -1850,7 +1846,7 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip)
1850 if (rem >= INOSPEREXT) { 1846 if (rem >= INOSPEREXT) {
1851 IREAD_UNLOCK(imap->im_ipimap); 1847 IREAD_UNLOCK(imap->im_ipimap);
1852 release_metapage(mp); 1848 release_metapage(mp);
1853 jfs_error(ip->i_sb, "diAllocIno: free inode not found"); 1849 jfs_error(ip->i_sb, "free inode not found\n");
1854 return -EIO; 1850 return -EIO;
1855 } 1851 }
1856 1852
@@ -1936,7 +1932,7 @@ static int diAllocExt(struct inomap * imap, int agno, struct inode *ip)
1936 IREAD_LOCK(imap->im_ipimap, RDWRLOCK_IMAP); 1932 IREAD_LOCK(imap->im_ipimap, RDWRLOCK_IMAP);
1937 if ((rc = diIAGRead(imap, iagno, &mp))) { 1933 if ((rc = diIAGRead(imap, iagno, &mp))) {
1938 IREAD_UNLOCK(imap->im_ipimap); 1934 IREAD_UNLOCK(imap->im_ipimap);
1939 jfs_error(ip->i_sb, "diAllocExt: error reading iag"); 1935 jfs_error(ip->i_sb, "error reading iag\n");
1940 return rc; 1936 return rc;
1941 } 1937 }
1942 iagp = (struct iag *) mp->data; 1938 iagp = (struct iag *) mp->data;
@@ -1948,8 +1944,7 @@ static int diAllocExt(struct inomap * imap, int agno, struct inode *ip)
1948 if (sword >= SMAPSZ) { 1944 if (sword >= SMAPSZ) {
1949 release_metapage(mp); 1945 release_metapage(mp);
1950 IREAD_UNLOCK(imap->im_ipimap); 1946 IREAD_UNLOCK(imap->im_ipimap);
1951 jfs_error(ip->i_sb, 1947 jfs_error(ip->i_sb, "free ext summary map not found\n");
1952 "diAllocExt: free ext summary map not found");
1953 return -EIO; 1948 return -EIO;
1954 } 1949 }
1955 if (~iagp->extsmap[sword]) 1950 if (~iagp->extsmap[sword])
@@ -1962,7 +1957,7 @@ static int diAllocExt(struct inomap * imap, int agno, struct inode *ip)
1962 if (rem >= EXTSPERSUM) { 1957 if (rem >= EXTSPERSUM) {
1963 release_metapage(mp); 1958 release_metapage(mp);
1964 IREAD_UNLOCK(imap->im_ipimap); 1959 IREAD_UNLOCK(imap->im_ipimap);
1965 jfs_error(ip->i_sb, "diAllocExt: free extent not found"); 1960 jfs_error(ip->i_sb, "free extent not found\n");
1966 return -EIO; 1961 return -EIO;
1967 } 1962 }
1968 extno = (sword << L2EXTSPERSUM) + rem; 1963 extno = (sword << L2EXTSPERSUM) + rem;
@@ -2081,8 +2076,7 @@ static int diAllocBit(struct inomap * imap, struct iag * iagp, int ino)
2081 if (bmp) 2076 if (bmp)
2082 release_metapage(bmp); 2077 release_metapage(bmp);
2083 2078
2084 jfs_error(imap->im_ipimap->i_sb, 2079 jfs_error(imap->im_ipimap->i_sb, "iag inconsistent\n");
2085 "diAllocBit: iag inconsistent");
2086 return -EIO; 2080 return -EIO;
2087 } 2081 }
2088 2082
@@ -2189,7 +2183,7 @@ static int diNewExt(struct inomap * imap, struct iag * iagp, int extno)
2189 /* better have free extents. 2183 /* better have free extents.
2190 */ 2184 */
2191 if (!iagp->nfreeexts) { 2185 if (!iagp->nfreeexts) {
2192 jfs_error(imap->im_ipimap->i_sb, "diNewExt: no free extents"); 2186 jfs_error(imap->im_ipimap->i_sb, "no free extents\n");
2193 return -EIO; 2187 return -EIO;
2194 } 2188 }
2195 2189
@@ -2261,7 +2255,7 @@ static int diNewExt(struct inomap * imap, struct iag * iagp, int extno)
2261 } 2255 }
2262 if (ciagp == NULL) { 2256 if (ciagp == NULL) {
2263 jfs_error(imap->im_ipimap->i_sb, 2257 jfs_error(imap->im_ipimap->i_sb,
2264 "diNewExt: ciagp == NULL"); 2258 "ciagp == NULL\n");
2265 rc = -EIO; 2259 rc = -EIO;
2266 goto error_out; 2260 goto error_out;
2267 } 2261 }
@@ -2498,7 +2492,7 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp)
2498 IWRITE_UNLOCK(ipimap); 2492 IWRITE_UNLOCK(ipimap);
2499 IAGFREE_UNLOCK(imap); 2493 IAGFREE_UNLOCK(imap);
2500 jfs_error(imap->im_ipimap->i_sb, 2494 jfs_error(imap->im_ipimap->i_sb,
2501 "diNewIAG: ipimap->i_size is wrong"); 2495 "ipimap->i_size is wrong\n");
2502 return -EIO; 2496 return -EIO;
2503 } 2497 }
2504 2498
@@ -2758,8 +2752,7 @@ diUpdatePMap(struct inode *ipimap,
2758 iagno = INOTOIAG(inum); 2752 iagno = INOTOIAG(inum);
2759 /* make sure that the iag is contained within the map */ 2753 /* make sure that the iag is contained within the map */
2760 if (iagno >= imap->im_nextiag) { 2754 if (iagno >= imap->im_nextiag) {
2761 jfs_error(ipimap->i_sb, 2755 jfs_error(ipimap->i_sb, "the iag is outside the map\n");
2762 "diUpdatePMap: the iag is outside the map");
2763 return -EIO; 2756 return -EIO;
2764 } 2757 }
2765 /* read the iag */ 2758 /* read the iag */
@@ -2788,13 +2781,13 @@ diUpdatePMap(struct inode *ipimap,
2788 */ 2781 */
2789 if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) { 2782 if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) {
2790 jfs_error(ipimap->i_sb, 2783 jfs_error(ipimap->i_sb,
2791 "diUpdatePMap: inode %ld not marked as " 2784 "inode %ld not marked as allocated in wmap!\n",
2792 "allocated in wmap!", inum); 2785 inum);
2793 } 2786 }
2794 if (!(le32_to_cpu(iagp->pmap[extno]) & mask)) { 2787 if (!(le32_to_cpu(iagp->pmap[extno]) & mask)) {
2795 jfs_error(ipimap->i_sb, 2788 jfs_error(ipimap->i_sb,
2796 "diUpdatePMap: inode %ld not marked as " 2789 "inode %ld not marked as allocated in pmap!\n",
2797 "allocated in pmap!", inum); 2790 inum);
2798 } 2791 }
2799 /* update the bitmap for the extent of the freed inode */ 2792 /* update the bitmap for the extent of the freed inode */
2800 iagp->pmap[extno] &= cpu_to_le32(~mask); 2793 iagp->pmap[extno] &= cpu_to_le32(~mask);
@@ -2809,15 +2802,13 @@ diUpdatePMap(struct inode *ipimap,
2809 if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) { 2802 if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) {
2810 release_metapage(mp); 2803 release_metapage(mp);
2811 jfs_error(ipimap->i_sb, 2804 jfs_error(ipimap->i_sb,
2812 "diUpdatePMap: the inode is not allocated in " 2805 "the inode is not allocated in the working map\n");
2813 "the working map");
2814 return -EIO; 2806 return -EIO;
2815 } 2807 }
2816 if ((le32_to_cpu(iagp->pmap[extno]) & mask) != 0) { 2808 if ((le32_to_cpu(iagp->pmap[extno]) & mask) != 0) {
2817 release_metapage(mp); 2809 release_metapage(mp);
2818 jfs_error(ipimap->i_sb, 2810 jfs_error(ipimap->i_sb,
2819 "diUpdatePMap: the inode is not free in the " 2811 "the inode is not free in the persistent map\n");
2820 "persistent map");
2821 return -EIO; 2812 return -EIO;
2822 } 2813 }
2823 /* update the bitmap for the extent of the allocated inode */ 2814 /* update the bitmap for the extent of the allocated inode */
@@ -2909,8 +2900,7 @@ int diExtendFS(struct inode *ipimap, struct inode *ipbmap)
2909 iagp = (struct iag *) bp->data; 2900 iagp = (struct iag *) bp->data;
2910 if (le32_to_cpu(iagp->iagnum) != i) { 2901 if (le32_to_cpu(iagp->iagnum) != i) {
2911 release_metapage(bp); 2902 release_metapage(bp);
2912 jfs_error(ipimap->i_sb, 2903 jfs_error(ipimap->i_sb, "unexpected value of iagnum\n");
2913 "diExtendFs: unexpected value of iagnum");
2914 return -EIO; 2904 return -EIO;
2915 } 2905 }
2916 2906
@@ -2986,8 +2976,7 @@ int diExtendFS(struct inode *ipimap, struct inode *ipbmap)
2986 2976
2987 if (xnuminos != atomic_read(&imap->im_numinos) || 2977 if (xnuminos != atomic_read(&imap->im_numinos) ||
2988 xnumfree != atomic_read(&imap->im_numfree)) { 2978 xnumfree != atomic_read(&imap->im_numfree)) {
2989 jfs_error(ipimap->i_sb, 2979 jfs_error(ipimap->i_sb, "numinos or numfree incorrect\n");
2990 "diExtendFs: numinos or numfree incorrect");
2991 return -EIO; 2980 return -EIO;
2992 } 2981 }
2993 2982
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 9e3aaff11f89..d165cde0c68d 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -647,7 +647,7 @@ struct metapage *__get_metapage(struct inode *inode, unsigned long lblock,
647 if (mp) { 647 if (mp) {
648 if (mp->logical_size != size) { 648 if (mp->logical_size != size) {
649 jfs_error(inode->i_sb, 649 jfs_error(inode->i_sb,
650 "__get_metapage: mp->logical_size != size"); 650 "get_mp->logical_size != size\n");
651 jfs_err("logical_size = %d, size = %d", 651 jfs_err("logical_size = %d, size = %d",
652 mp->logical_size, size); 652 mp->logical_size, size);
653 dump_stack(); 653 dump_stack();
@@ -658,8 +658,7 @@ struct metapage *__get_metapage(struct inode *inode, unsigned long lblock,
658 if (test_bit(META_discard, &mp->flag)) { 658 if (test_bit(META_discard, &mp->flag)) {
659 if (!new) { 659 if (!new) {
660 jfs_error(inode->i_sb, 660 jfs_error(inode->i_sb,
661 "__get_metapage: using a " 661 "using a discarded metapage\n");
662 "discarded metapage");
663 discard_metapage(mp); 662 discard_metapage(mp);
664 goto unlock; 663 goto unlock;
665 } 664 }
diff --git a/fs/jfs/jfs_superblock.h b/fs/jfs/jfs_superblock.h
index 884fc21ab8ee..04847b8d3070 100644
--- a/fs/jfs/jfs_superblock.h
+++ b/fs/jfs/jfs_superblock.h
@@ -108,6 +108,7 @@ struct jfs_superblock {
108 108
109extern int readSuper(struct super_block *, struct buffer_head **); 109extern int readSuper(struct super_block *, struct buffer_head **);
110extern int updateSuper(struct super_block *, uint); 110extern int updateSuper(struct super_block *, uint);
111__printf(2, 3)
111extern void jfs_error(struct super_block *, const char *, ...); 112extern void jfs_error(struct super_block *, const char *, ...);
112extern int jfs_mount(struct super_block *); 113extern int jfs_mount(struct super_block *);
113extern int jfs_mount_rw(struct super_block *, int); 114extern int jfs_mount_rw(struct super_block *, int);
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index 5fcc02eaa64c..564c4f279ac6 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -2684,7 +2684,7 @@ void txAbort(tid_t tid, int dirty)
2684 * mark filesystem dirty 2684 * mark filesystem dirty
2685 */ 2685 */
2686 if (dirty) 2686 if (dirty)
2687 jfs_error(tblk->sb, "txAbort"); 2687 jfs_error(tblk->sb, "\n");
2688 2688
2689 return; 2689 return;
2690} 2690}
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c
index 6c50871e6220..5ad7748860ce 100644
--- a/fs/jfs/jfs_xtree.c
+++ b/fs/jfs/jfs_xtree.c
@@ -64,22 +64,23 @@
64 64
65/* get page buffer for specified block address */ 65/* get page buffer for specified block address */
66/* ToDo: Replace this ugly macro with a function */ 66/* ToDo: Replace this ugly macro with a function */
67#define XT_GETPAGE(IP, BN, MP, SIZE, P, RC)\ 67#define XT_GETPAGE(IP, BN, MP, SIZE, P, RC) \
68{\ 68do { \
69 BT_GETPAGE(IP, BN, MP, xtpage_t, SIZE, P, RC, i_xtroot)\ 69 BT_GETPAGE(IP, BN, MP, xtpage_t, SIZE, P, RC, i_xtroot); \
70 if (!(RC))\ 70 if (!(RC)) { \
71 {\ 71 if ((le16_to_cpu((P)->header.nextindex) < XTENTRYSTART) || \
72 if ((le16_to_cpu((P)->header.nextindex) < XTENTRYSTART) ||\ 72 (le16_to_cpu((P)->header.nextindex) > \
73 (le16_to_cpu((P)->header.nextindex) > le16_to_cpu((P)->header.maxentry)) ||\ 73 le16_to_cpu((P)->header.maxentry)) || \
74 (le16_to_cpu((P)->header.maxentry) > (((BN)==0)?XTROOTMAXSLOT:PSIZE>>L2XTSLOTSIZE)))\ 74 (le16_to_cpu((P)->header.maxentry) > \
75 {\ 75 (((BN) == 0) ? XTROOTMAXSLOT : PSIZE >> L2XTSLOTSIZE))) { \
76 jfs_error((IP)->i_sb, "XT_GETPAGE: xtree page corrupt");\ 76 jfs_error((IP)->i_sb, \
77 BT_PUTPAGE(MP);\ 77 "XT_GETPAGE: xtree page corrupt\n"); \
78 MP = NULL;\ 78 BT_PUTPAGE(MP); \
79 RC = -EIO;\ 79 MP = NULL; \
80 }\ 80 RC = -EIO; \
81 }\ 81 } \
82} 82 } \
83} while (0)
83 84
84/* for consistency */ 85/* for consistency */
85#define XT_PUTPAGE(MP) BT_PUTPAGE(MP) 86#define XT_PUTPAGE(MP) BT_PUTPAGE(MP)
@@ -499,7 +500,7 @@ static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp,
499 500
500 /* push (bn, index) of the parent page/entry */ 501 /* push (bn, index) of the parent page/entry */
501 if (BT_STACK_FULL(btstack)) { 502 if (BT_STACK_FULL(btstack)) {
502 jfs_error(ip->i_sb, "stack overrun in xtSearch!"); 503 jfs_error(ip->i_sb, "stack overrun!\n");
503 XT_PUTPAGE(mp); 504 XT_PUTPAGE(mp);
504 return -EIO; 505 return -EIO;
505 } 506 }
@@ -1385,7 +1386,7 @@ int xtExtend(tid_t tid, /* transaction id */
1385 1386
1386 if (cmp != 0) { 1387 if (cmp != 0) {
1387 XT_PUTPAGE(mp); 1388 XT_PUTPAGE(mp);
1388 jfs_error(ip->i_sb, "xtExtend: xtSearch did not find extent"); 1389 jfs_error(ip->i_sb, "xtSearch did not find extent\n");
1389 return -EIO; 1390 return -EIO;
1390 } 1391 }
1391 1392
@@ -1393,7 +1394,7 @@ int xtExtend(tid_t tid, /* transaction id */
1393 xad = &p->xad[index]; 1394 xad = &p->xad[index];
1394 if ((offsetXAD(xad) + lengthXAD(xad)) != xoff) { 1395 if ((offsetXAD(xad) + lengthXAD(xad)) != xoff) {
1395 XT_PUTPAGE(mp); 1396 XT_PUTPAGE(mp);
1396 jfs_error(ip->i_sb, "xtExtend: extension is not contiguous"); 1397 jfs_error(ip->i_sb, "extension is not contiguous\n");
1397 return -EIO; 1398 return -EIO;
1398 } 1399 }
1399 1400
@@ -1552,7 +1553,7 @@ printf("xtTailgate: nxoff:0x%lx nxlen:0x%x nxaddr:0x%lx\n",
1552 1553
1553 if (cmp != 0) { 1554 if (cmp != 0) {
1554 XT_PUTPAGE(mp); 1555 XT_PUTPAGE(mp);
1555 jfs_error(ip->i_sb, "xtTailgate: couldn't find extent"); 1556 jfs_error(ip->i_sb, "couldn't find extent\n");
1556 return -EIO; 1557 return -EIO;
1557 } 1558 }
1558 1559
@@ -1560,8 +1561,7 @@ printf("xtTailgate: nxoff:0x%lx nxlen:0x%x nxaddr:0x%lx\n",
1560 nextindex = le16_to_cpu(p->header.nextindex); 1561 nextindex = le16_to_cpu(p->header.nextindex);
1561 if (index != nextindex - 1) { 1562 if (index != nextindex - 1) {
1562 XT_PUTPAGE(mp); 1563 XT_PUTPAGE(mp);
1563 jfs_error(ip->i_sb, 1564 jfs_error(ip->i_sb, "the entry found is not the last entry\n");
1564 "xtTailgate: the entry found is not the last entry");
1565 return -EIO; 1565 return -EIO;
1566 } 1566 }
1567 1567
@@ -1734,7 +1734,7 @@ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad)
1734 1734
1735 if (cmp != 0) { 1735 if (cmp != 0) {
1736 XT_PUTPAGE(mp); 1736 XT_PUTPAGE(mp);
1737 jfs_error(ip->i_sb, "xtUpdate: Could not find extent"); 1737 jfs_error(ip->i_sb, "Could not find extent\n");
1738 return -EIO; 1738 return -EIO;
1739 } 1739 }
1740 1740
@@ -1758,7 +1758,7 @@ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad)
1758 (nxoff + nxlen > xoff + xlen)) { 1758 (nxoff + nxlen > xoff + xlen)) {
1759 XT_PUTPAGE(mp); 1759 XT_PUTPAGE(mp);
1760 jfs_error(ip->i_sb, 1760 jfs_error(ip->i_sb,
1761 "xtUpdate: nXAD in not completely contained within XAD"); 1761 "nXAD in not completely contained within XAD\n");
1762 return -EIO; 1762 return -EIO;
1763 } 1763 }
1764 1764
@@ -1907,7 +1907,7 @@ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad)
1907 1907
1908 if (xoff >= nxoff) { 1908 if (xoff >= nxoff) {
1909 XT_PUTPAGE(mp); 1909 XT_PUTPAGE(mp);
1910 jfs_error(ip->i_sb, "xtUpdate: xoff >= nxoff"); 1910 jfs_error(ip->i_sb, "xoff >= nxoff\n");
1911 return -EIO; 1911 return -EIO;
1912 } 1912 }
1913/* #endif _JFS_WIP_COALESCE */ 1913/* #endif _JFS_WIP_COALESCE */
@@ -2048,14 +2048,13 @@ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad)
2048 2048
2049 if (cmp != 0) { 2049 if (cmp != 0) {
2050 XT_PUTPAGE(mp); 2050 XT_PUTPAGE(mp);
2051 jfs_error(ip->i_sb, "xtUpdate: xtSearch failed"); 2051 jfs_error(ip->i_sb, "xtSearch failed\n");
2052 return -EIO; 2052 return -EIO;
2053 } 2053 }
2054 2054
2055 if (index0 != index) { 2055 if (index0 != index) {
2056 XT_PUTPAGE(mp); 2056 XT_PUTPAGE(mp);
2057 jfs_error(ip->i_sb, 2057 jfs_error(ip->i_sb, "unexpected value of index\n");
2058 "xtUpdate: unexpected value of index");
2059 return -EIO; 2058 return -EIO;
2060 } 2059 }
2061 } 2060 }
@@ -3650,7 +3649,7 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag)
3650 getChild: 3649 getChild:
3651 /* save current parent entry for the child page */ 3650 /* save current parent entry for the child page */
3652 if (BT_STACK_FULL(&btstack)) { 3651 if (BT_STACK_FULL(&btstack)) {
3653 jfs_error(ip->i_sb, "stack overrun in xtTruncate!"); 3652 jfs_error(ip->i_sb, "stack overrun!\n");
3654 XT_PUTPAGE(mp); 3653 XT_PUTPAGE(mp);
3655 return -EIO; 3654 return -EIO;
3656 } 3655 }
@@ -3751,8 +3750,7 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size)
3751 3750
3752 if (cmp != 0) { 3751 if (cmp != 0) {
3753 XT_PUTPAGE(mp); 3752 XT_PUTPAGE(mp);
3754 jfs_error(ip->i_sb, 3753 jfs_error(ip->i_sb, "did not find extent\n");
3755 "xtTruncate_pmap: did not find extent");
3756 return -EIO; 3754 return -EIO;
3757 } 3755 }
3758 } else { 3756 } else {
@@ -3851,7 +3849,7 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size)
3851 getChild: 3849 getChild:
3852 /* save current parent entry for the child page */ 3850 /* save current parent entry for the child page */
3853 if (BT_STACK_FULL(&btstack)) { 3851 if (BT_STACK_FULL(&btstack)) {
3854 jfs_error(ip->i_sb, "stack overrun in xtTruncate_pmap!"); 3852 jfs_error(ip->i_sb, "stack overrun!\n");
3855 XT_PUTPAGE(mp); 3853 XT_PUTPAGE(mp);
3856 return -EIO; 3854 return -EIO;
3857 } 3855 }
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 89186b7b9002..aa8a3370631b 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1176,7 +1176,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1176 if (!S_ISDIR(old_ip->i_mode) && new_ip) 1176 if (!S_ISDIR(old_ip->i_mode) && new_ip)
1177 IWRITE_UNLOCK(new_ip); 1177 IWRITE_UNLOCK(new_ip);
1178 jfs_error(new_ip->i_sb, 1178 jfs_error(new_ip->i_sb,
1179 "jfs_rename: new_ip->i_nlink != 0"); 1179 "new_ip->i_nlink != 0\n");
1180 return -EIO; 1180 return -EIO;
1181 } 1181 }
1182 tblk = tid_to_tblock(tid); 1182 tblk = tid_to_tblock(tid);
@@ -1538,8 +1538,7 @@ const struct file_operations jfs_dir_operations = {
1538 .llseek = generic_file_llseek, 1538 .llseek = generic_file_llseek,
1539}; 1539};
1540 1540
1541static int jfs_ci_hash(const struct dentry *dir, const struct inode *inode, 1541static int jfs_ci_hash(const struct dentry *dir, struct qstr *this)
1542 struct qstr *this)
1543{ 1542{
1544 unsigned long hash; 1543 unsigned long hash;
1545 int i; 1544 int i;
@@ -1552,9 +1551,7 @@ static int jfs_ci_hash(const struct dentry *dir, const struct inode *inode,
1552 return 0; 1551 return 0;
1553} 1552}
1554 1553
1555static int jfs_ci_compare(const struct dentry *parent, 1554static int jfs_ci_compare(const struct dentry *parent, const struct dentry *dentry,
1556 const struct inode *pinode,
1557 const struct dentry *dentry, const struct inode *inode,
1558 unsigned int len, const char *str, const struct qstr *name) 1555 unsigned int len, const char *str, const struct qstr *name)
1559{ 1556{
1560 int i, result = 1; 1557 int i, result = 1;
diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c
index 8d0c1c7c0820..90b3bc21e9b0 100644
--- a/fs/jfs/resize.c
+++ b/fs/jfs/resize.c
@@ -530,7 +530,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
530 goto resume; 530 goto resume;
531 531
532 error_out: 532 error_out:
533 jfs_error(sb, "jfs_extendfs"); 533 jfs_error(sb, "\n");
534 534
535 resume: 535 resume:
536 /* 536 /*
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 788e0a9c1fb0..6669aa2042c3 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -92,16 +92,20 @@ static void jfs_handle_error(struct super_block *sb)
92 /* nothing is done for continue beyond marking the superblock dirty */ 92 /* nothing is done for continue beyond marking the superblock dirty */
93} 93}
94 94
95void jfs_error(struct super_block *sb, const char * function, ...) 95void jfs_error(struct super_block *sb, const char *fmt, ...)
96{ 96{
97 static char error_buf[256]; 97 struct va_format vaf;
98 va_list args; 98 va_list args;
99 99
100 va_start(args, function); 100 va_start(args, fmt);
101 vsnprintf(error_buf, sizeof(error_buf), function, args); 101
102 va_end(args); 102 vaf.fmt = fmt;
103 vaf.va = &args;
103 104
104 pr_err("ERROR: (device %s): %s\n", sb->s_id, error_buf); 105 pr_err("ERROR: (device %s): %pf: %pV\n",
106 sb->s_id, __builtin_return_address(0), &vaf);
107
108 va_end(args);
105 109
106 jfs_handle_error(sb); 110 jfs_handle_error(sb);
107} 111}
@@ -617,7 +621,7 @@ static int jfs_freeze(struct super_block *sb)
617 txQuiesce(sb); 621 txQuiesce(sb);
618 rc = lmLogShutdown(log); 622 rc = lmLogShutdown(log);
619 if (rc) { 623 if (rc) {
620 jfs_error(sb, "jfs_freeze: lmLogShutdown failed"); 624 jfs_error(sb, "lmLogShutdown failed\n");
621 625
622 /* let operations fail rather than hang */ 626 /* let operations fail rather than hang */
623 txResume(sb); 627 txResume(sb);
@@ -646,12 +650,12 @@ static int jfs_unfreeze(struct super_block *sb)
646 if (!(sb->s_flags & MS_RDONLY)) { 650 if (!(sb->s_flags & MS_RDONLY)) {
647 rc = updateSuper(sb, FM_MOUNT); 651 rc = updateSuper(sb, FM_MOUNT);
648 if (rc) { 652 if (rc) {
649 jfs_error(sb, "jfs_unfreeze: updateSuper failed"); 653 jfs_error(sb, "updateSuper failed\n");
650 goto out; 654 goto out;
651 } 655 }
652 rc = lmLogInit(log); 656 rc = lmLogInit(log);
653 if (rc) 657 if (rc)
654 jfs_error(sb, "jfs_unfreeze: lmLogInit failed"); 658 jfs_error(sb, "lmLogInit failed\n");
655out: 659out:
656 txResume(sb); 660 txResume(sb);
657 } 661 }
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 42d67f9757bf..d3472f4cd530 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -382,7 +382,7 @@ static int ea_read(struct inode *ip, struct jfs_ea_list *ealist)
382 382
383 nbytes = sizeDXD(&ji->ea); 383 nbytes = sizeDXD(&ji->ea);
384 if (!nbytes) { 384 if (!nbytes) {
385 jfs_error(sb, "ea_read: nbytes is 0"); 385 jfs_error(sb, "nbytes is 0\n");
386 return -EIO; 386 return -EIO;
387 } 387 }
388 388
@@ -482,7 +482,7 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size)
482 current_blocks = 0; 482 current_blocks = 0;
483 } else { 483 } else {
484 if (!(ji->ea.flag & DXD_EXTENT)) { 484 if (!(ji->ea.flag & DXD_EXTENT)) {
485 jfs_error(sb, "ea_get: invalid ea.flag)"); 485 jfs_error(sb, "invalid ea.flag\n");
486 return -EIO; 486 return -EIO;
487 } 487 }
488 current_blocks = (ea_size + sb->s_blocksize - 1) >> 488 current_blocks = (ea_size + sb->s_blocksize - 1) >>
@@ -1089,8 +1089,8 @@ int jfs_removexattr(struct dentry *dentry, const char *name)
1089} 1089}
1090 1090
1091#ifdef CONFIG_JFS_SECURITY 1091#ifdef CONFIG_JFS_SECURITY
1092int jfs_initxattrs(struct inode *inode, const struct xattr *xattr_array, 1092static int jfs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
1093 void *fs_info) 1093 void *fs_info)
1094{ 1094{
1095 const struct xattr *xattr; 1095 const struct xattr *xattr;
1096 tid_t *tid = fs_info; 1096 tid_t *tid = fs_info;
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index a2aa97d45670..10d6c41aecad 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -305,7 +305,7 @@ static int lockd_start_svc(struct svc_serv *serv)
305 svc_sock_update_bufs(serv); 305 svc_sock_update_bufs(serv);
306 serv->sv_maxconn = nlm_max_connections; 306 serv->sv_maxconn = nlm_max_connections;
307 307
308 nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, serv->sv_name); 308 nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, "%s", serv->sv_name);
309 if (IS_ERR(nlmsvc_task)) { 309 if (IS_ERR(nlmsvc_task)) {
310 error = PTR_ERR(nlmsvc_task); 310 error = PTR_ERR(nlmsvc_task);
311 printk(KERN_WARNING 311 printk(KERN_WARNING
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index e703318c41df..067778b0ccc9 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -276,7 +276,7 @@ static int nlmsvc_unlink_block(struct nlm_block *block)
276 dprintk("lockd: unlinking block %p...\n", block); 276 dprintk("lockd: unlinking block %p...\n", block);
277 277
278 /* Remove block from list */ 278 /* Remove block from list */
279 status = posix_unblock_lock(block->b_file->f_file, &block->b_call->a_args.lock.fl); 279 status = posix_unblock_lock(&block->b_call->a_args.lock.fl);
280 nlmsvc_remove_block(block); 280 nlmsvc_remove_block(block);
281 return status; 281 return status;
282} 282}
@@ -744,8 +744,20 @@ static int nlmsvc_same_owner(struct file_lock *fl1, struct file_lock *fl2)
744 return fl1->fl_owner == fl2->fl_owner && fl1->fl_pid == fl2->fl_pid; 744 return fl1->fl_owner == fl2->fl_owner && fl1->fl_pid == fl2->fl_pid;
745} 745}
746 746
747/*
748 * Since NLM uses two "keys" for tracking locks, we need to hash them down
749 * to one for the blocked_hash. Here, we're just xor'ing the host address
750 * with the pid in order to create a key value for picking a hash bucket.
751 */
752static unsigned long
753nlmsvc_owner_key(struct file_lock *fl)
754{
755 return (unsigned long)fl->fl_owner ^ (unsigned long)fl->fl_pid;
756}
757
747const struct lock_manager_operations nlmsvc_lock_operations = { 758const struct lock_manager_operations nlmsvc_lock_operations = {
748 .lm_compare_owner = nlmsvc_same_owner, 759 .lm_compare_owner = nlmsvc_same_owner,
760 .lm_owner_key = nlmsvc_owner_key,
749 .lm_notify = nlmsvc_notify_blocked, 761 .lm_notify = nlmsvc_notify_blocked,
750 .lm_grant = nlmsvc_grant_deferred, 762 .lm_grant = nlmsvc_grant_deferred,
751}; 763};
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index 97e87415b145..dc5c75930f0f 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -169,7 +169,7 @@ nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file,
169 169
170again: 170again:
171 file->f_locks = 0; 171 file->f_locks = 0;
172 lock_flocks(); /* protects i_flock list */ 172 spin_lock(&inode->i_lock);
173 for (fl = inode->i_flock; fl; fl = fl->fl_next) { 173 for (fl = inode->i_flock; fl; fl = fl->fl_next) {
174 if (fl->fl_lmops != &nlmsvc_lock_operations) 174 if (fl->fl_lmops != &nlmsvc_lock_operations)
175 continue; 175 continue;
@@ -181,7 +181,7 @@ again:
181 if (match(lockhost, host)) { 181 if (match(lockhost, host)) {
182 struct file_lock lock = *fl; 182 struct file_lock lock = *fl;
183 183
184 unlock_flocks(); 184 spin_unlock(&inode->i_lock);
185 lock.fl_type = F_UNLCK; 185 lock.fl_type = F_UNLCK;
186 lock.fl_start = 0; 186 lock.fl_start = 0;
187 lock.fl_end = OFFSET_MAX; 187 lock.fl_end = OFFSET_MAX;
@@ -193,7 +193,7 @@ again:
193 goto again; 193 goto again;
194 } 194 }
195 } 195 }
196 unlock_flocks(); 196 spin_unlock(&inode->i_lock);
197 197
198 return 0; 198 return 0;
199} 199}
@@ -228,14 +228,14 @@ nlm_file_inuse(struct nlm_file *file)
228 if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares) 228 if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares)
229 return 1; 229 return 1;
230 230
231 lock_flocks(); 231 spin_lock(&inode->i_lock);
232 for (fl = inode->i_flock; fl; fl = fl->fl_next) { 232 for (fl = inode->i_flock; fl; fl = fl->fl_next) {
233 if (fl->fl_lmops == &nlmsvc_lock_operations) { 233 if (fl->fl_lmops == &nlmsvc_lock_operations) {
234 unlock_flocks(); 234 spin_unlock(&inode->i_lock);
235 return 1; 235 return 1;
236 } 236 }
237 } 237 }
238 unlock_flocks(); 238 spin_unlock(&inode->i_lock);
239 file->f_locks = 0; 239 file->f_locks = 0;
240 return 0; 240 return 0;
241} 241}
diff --git a/fs/locks.c b/fs/locks.c
index cb424a4fed71..b27a3005d78d 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -126,6 +126,9 @@
126#include <linux/time.h> 126#include <linux/time.h>
127#include <linux/rcupdate.h> 127#include <linux/rcupdate.h>
128#include <linux/pid_namespace.h> 128#include <linux/pid_namespace.h>
129#include <linux/hashtable.h>
130#include <linux/percpu.h>
131#include <linux/lglock.h>
129 132
130#include <asm/uaccess.h> 133#include <asm/uaccess.h>
131 134
@@ -153,30 +156,53 @@ int lease_break_time = 45;
153#define for_each_lock(inode, lockp) \ 156#define for_each_lock(inode, lockp) \
154 for (lockp = &inode->i_flock; *lockp != NULL; lockp = &(*lockp)->fl_next) 157 for (lockp = &inode->i_flock; *lockp != NULL; lockp = &(*lockp)->fl_next)
155 158
156static LIST_HEAD(file_lock_list); 159/*
157static LIST_HEAD(blocked_list); 160 * The global file_lock_list is only used for displaying /proc/locks, so we
158static DEFINE_SPINLOCK(file_lock_lock); 161 * keep a list on each CPU, with each list protected by its own spinlock via
162 * the file_lock_lglock. Note that alterations to the list also require that
163 * the relevant i_lock is held.
164 */
165DEFINE_STATIC_LGLOCK(file_lock_lglock);
166static DEFINE_PER_CPU(struct hlist_head, file_lock_list);
159 167
160/* 168/*
161 * Protects the two list heads above, plus the inode->i_flock list 169 * The blocked_hash is used to find POSIX lock loops for deadlock detection.
170 * It is protected by blocked_lock_lock.
171 *
172 * We hash locks by lockowner in order to optimize searching for the lock a
173 * particular lockowner is waiting on.
174 *
175 * FIXME: make this value scale via some heuristic? We generally will want more
176 * buckets when we have more lockowners holding locks, but that's a little
177 * difficult to determine without knowing what the workload will look like.
162 */ 178 */
163void lock_flocks(void) 179#define BLOCKED_HASH_BITS 7
164{ 180static DEFINE_HASHTABLE(blocked_hash, BLOCKED_HASH_BITS);
165 spin_lock(&file_lock_lock);
166}
167EXPORT_SYMBOL_GPL(lock_flocks);
168 181
169void unlock_flocks(void) 182/*
170{ 183 * This lock protects the blocked_hash. Generally, if you're accessing it, you
171 spin_unlock(&file_lock_lock); 184 * want to be holding this lock.
172} 185 *
173EXPORT_SYMBOL_GPL(unlock_flocks); 186 * In addition, it also protects the fl->fl_block list, and the fl->fl_next
187 * pointer for file_lock structures that are acting as lock requests (in
188 * contrast to those that are acting as records of acquired locks).
189 *
190 * Note that when we acquire this lock in order to change the above fields,
191 * we often hold the i_lock as well. In certain cases, when reading the fields
192 * protected by this lock, we can skip acquiring it iff we already hold the
193 * i_lock.
194 *
195 * In particular, adding an entry to the fl_block list requires that you hold
196 * both the i_lock and the blocked_lock_lock (acquired in that order). Deleting
197 * an entry from the list however only requires the blocked_lock_lock.
198 */
199static DEFINE_SPINLOCK(blocked_lock_lock);
174 200
175static struct kmem_cache *filelock_cache __read_mostly; 201static struct kmem_cache *filelock_cache __read_mostly;
176 202
177static void locks_init_lock_heads(struct file_lock *fl) 203static void locks_init_lock_heads(struct file_lock *fl)
178{ 204{
179 INIT_LIST_HEAD(&fl->fl_link); 205 INIT_HLIST_NODE(&fl->fl_link);
180 INIT_LIST_HEAD(&fl->fl_block); 206 INIT_LIST_HEAD(&fl->fl_block);
181 init_waitqueue_head(&fl->fl_wait); 207 init_waitqueue_head(&fl->fl_wait);
182} 208}
@@ -210,7 +236,7 @@ void locks_free_lock(struct file_lock *fl)
210{ 236{
211 BUG_ON(waitqueue_active(&fl->fl_wait)); 237 BUG_ON(waitqueue_active(&fl->fl_wait));
212 BUG_ON(!list_empty(&fl->fl_block)); 238 BUG_ON(!list_empty(&fl->fl_block));
213 BUG_ON(!list_empty(&fl->fl_link)); 239 BUG_ON(!hlist_unhashed(&fl->fl_link));
214 240
215 locks_release_private(fl); 241 locks_release_private(fl);
216 kmem_cache_free(filelock_cache, fl); 242 kmem_cache_free(filelock_cache, fl);
@@ -484,47 +510,118 @@ static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2)
484 return fl1->fl_owner == fl2->fl_owner; 510 return fl1->fl_owner == fl2->fl_owner;
485} 511}
486 512
513/* Must be called with the i_lock held! */
514static inline void
515locks_insert_global_locks(struct file_lock *fl)
516{
517 lg_local_lock(&file_lock_lglock);
518 fl->fl_link_cpu = smp_processor_id();
519 hlist_add_head(&fl->fl_link, this_cpu_ptr(&file_lock_list));
520 lg_local_unlock(&file_lock_lglock);
521}
522
523/* Must be called with the i_lock held! */
524static inline void
525locks_delete_global_locks(struct file_lock *fl)
526{
527 /*
528 * Avoid taking lock if already unhashed. This is safe since this check
529 * is done while holding the i_lock, and new insertions into the list
530 * also require that it be held.
531 */
532 if (hlist_unhashed(&fl->fl_link))
533 return;
534 lg_local_lock_cpu(&file_lock_lglock, fl->fl_link_cpu);
535 hlist_del_init(&fl->fl_link);
536 lg_local_unlock_cpu(&file_lock_lglock, fl->fl_link_cpu);
537}
538
539static unsigned long
540posix_owner_key(struct file_lock *fl)
541{
542 if (fl->fl_lmops && fl->fl_lmops->lm_owner_key)
543 return fl->fl_lmops->lm_owner_key(fl);
544 return (unsigned long)fl->fl_owner;
545}
546
547static inline void
548locks_insert_global_blocked(struct file_lock *waiter)
549{
550 hash_add(blocked_hash, &waiter->fl_link, posix_owner_key(waiter));
551}
552
553static inline void
554locks_delete_global_blocked(struct file_lock *waiter)
555{
556 hash_del(&waiter->fl_link);
557}
558
487/* Remove waiter from blocker's block list. 559/* Remove waiter from blocker's block list.
488 * When blocker ends up pointing to itself then the list is empty. 560 * When blocker ends up pointing to itself then the list is empty.
561 *
562 * Must be called with blocked_lock_lock held.
489 */ 563 */
490static void __locks_delete_block(struct file_lock *waiter) 564static void __locks_delete_block(struct file_lock *waiter)
491{ 565{
566 locks_delete_global_blocked(waiter);
492 list_del_init(&waiter->fl_block); 567 list_del_init(&waiter->fl_block);
493 list_del_init(&waiter->fl_link);
494 waiter->fl_next = NULL; 568 waiter->fl_next = NULL;
495} 569}
496 570
497/* 571static void locks_delete_block(struct file_lock *waiter)
498 */
499void locks_delete_block(struct file_lock *waiter)
500{ 572{
501 lock_flocks(); 573 spin_lock(&blocked_lock_lock);
502 __locks_delete_block(waiter); 574 __locks_delete_block(waiter);
503 unlock_flocks(); 575 spin_unlock(&blocked_lock_lock);
504} 576}
505EXPORT_SYMBOL(locks_delete_block);
506 577
507/* Insert waiter into blocker's block list. 578/* Insert waiter into blocker's block list.
508 * We use a circular list so that processes can be easily woken up in 579 * We use a circular list so that processes can be easily woken up in
509 * the order they blocked. The documentation doesn't require this but 580 * the order they blocked. The documentation doesn't require this but
510 * it seems like the reasonable thing to do. 581 * it seems like the reasonable thing to do.
582 *
583 * Must be called with both the i_lock and blocked_lock_lock held. The fl_block
584 * list itself is protected by the blocked_lock_lock, but by ensuring that the
585 * i_lock is also held on insertions we can avoid taking the blocked_lock_lock
586 * in some cases when we see that the fl_block list is empty.
511 */ 587 */
512static void locks_insert_block(struct file_lock *blocker, 588static void __locks_insert_block(struct file_lock *blocker,
513 struct file_lock *waiter) 589 struct file_lock *waiter)
514{ 590{
515 BUG_ON(!list_empty(&waiter->fl_block)); 591 BUG_ON(!list_empty(&waiter->fl_block));
516 list_add_tail(&waiter->fl_block, &blocker->fl_block);
517 waiter->fl_next = blocker; 592 waiter->fl_next = blocker;
593 list_add_tail(&waiter->fl_block, &blocker->fl_block);
518 if (IS_POSIX(blocker)) 594 if (IS_POSIX(blocker))
519 list_add(&waiter->fl_link, &blocked_list); 595 locks_insert_global_blocked(waiter);
520} 596}
521 597
522/* Wake up processes blocked waiting for blocker. 598/* Must be called with i_lock held. */
523 * If told to wait then schedule the processes until the block list 599static void locks_insert_block(struct file_lock *blocker,
524 * is empty, otherwise empty the block list ourselves. 600 struct file_lock *waiter)
601{
602 spin_lock(&blocked_lock_lock);
603 __locks_insert_block(blocker, waiter);
604 spin_unlock(&blocked_lock_lock);
605}
606
607/*
608 * Wake up processes blocked waiting for blocker.
609 *
610 * Must be called with the inode->i_lock held!
525 */ 611 */
526static void locks_wake_up_blocks(struct file_lock *blocker) 612static void locks_wake_up_blocks(struct file_lock *blocker)
527{ 613{
614 /*
615 * Avoid taking global lock if list is empty. This is safe since new
616 * blocked requests are only added to the list under the i_lock, and
617 * the i_lock is always held here. Note that removal from the fl_block
618 * list does not require the i_lock, so we must recheck list_empty()
619 * after acquiring the blocked_lock_lock.
620 */
621 if (list_empty(&blocker->fl_block))
622 return;
623
624 spin_lock(&blocked_lock_lock);
528 while (!list_empty(&blocker->fl_block)) { 625 while (!list_empty(&blocker->fl_block)) {
529 struct file_lock *waiter; 626 struct file_lock *waiter;
530 627
@@ -536,20 +633,23 @@ static void locks_wake_up_blocks(struct file_lock *blocker)
536 else 633 else
537 wake_up(&waiter->fl_wait); 634 wake_up(&waiter->fl_wait);
538 } 635 }
636 spin_unlock(&blocked_lock_lock);
539} 637}
540 638
541/* Insert file lock fl into an inode's lock list at the position indicated 639/* Insert file lock fl into an inode's lock list at the position indicated
542 * by pos. At the same time add the lock to the global file lock list. 640 * by pos. At the same time add the lock to the global file lock list.
641 *
642 * Must be called with the i_lock held!
543 */ 643 */
544static void locks_insert_lock(struct file_lock **pos, struct file_lock *fl) 644static void locks_insert_lock(struct file_lock **pos, struct file_lock *fl)
545{ 645{
546 list_add(&fl->fl_link, &file_lock_list);
547
548 fl->fl_nspid = get_pid(task_tgid(current)); 646 fl->fl_nspid = get_pid(task_tgid(current));
549 647
550 /* insert into file's list */ 648 /* insert into file's list */
551 fl->fl_next = *pos; 649 fl->fl_next = *pos;
552 *pos = fl; 650 *pos = fl;
651
652 locks_insert_global_locks(fl);
553} 653}
554 654
555/* 655/*
@@ -557,14 +657,17 @@ static void locks_insert_lock(struct file_lock **pos, struct file_lock *fl)
557 * Wake up processes that are blocked waiting for this lock, 657 * Wake up processes that are blocked waiting for this lock,
558 * notify the FS that the lock has been cleared and 658 * notify the FS that the lock has been cleared and
559 * finally free the lock. 659 * finally free the lock.
660 *
661 * Must be called with the i_lock held!
560 */ 662 */
561static void locks_delete_lock(struct file_lock **thisfl_p) 663static void locks_delete_lock(struct file_lock **thisfl_p)
562{ 664{
563 struct file_lock *fl = *thisfl_p; 665 struct file_lock *fl = *thisfl_p;
564 666
667 locks_delete_global_locks(fl);
668
565 *thisfl_p = fl->fl_next; 669 *thisfl_p = fl->fl_next;
566 fl->fl_next = NULL; 670 fl->fl_next = NULL;
567 list_del_init(&fl->fl_link);
568 671
569 if (fl->fl_nspid) { 672 if (fl->fl_nspid) {
570 put_pid(fl->fl_nspid); 673 put_pid(fl->fl_nspid);
@@ -625,8 +728,9 @@ void
625posix_test_lock(struct file *filp, struct file_lock *fl) 728posix_test_lock(struct file *filp, struct file_lock *fl)
626{ 729{
627 struct file_lock *cfl; 730 struct file_lock *cfl;
731 struct inode *inode = file_inode(filp);
628 732
629 lock_flocks(); 733 spin_lock(&inode->i_lock);
630 for (cfl = file_inode(filp)->i_flock; cfl; cfl = cfl->fl_next) { 734 for (cfl = file_inode(filp)->i_flock; cfl; cfl = cfl->fl_next) {
631 if (!IS_POSIX(cfl)) 735 if (!IS_POSIX(cfl))
632 continue; 736 continue;
@@ -639,7 +743,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
639 fl->fl_pid = pid_vnr(cfl->fl_nspid); 743 fl->fl_pid = pid_vnr(cfl->fl_nspid);
640 } else 744 } else
641 fl->fl_type = F_UNLCK; 745 fl->fl_type = F_UNLCK;
642 unlock_flocks(); 746 spin_unlock(&inode->i_lock);
643 return; 747 return;
644} 748}
645EXPORT_SYMBOL(posix_test_lock); 749EXPORT_SYMBOL(posix_test_lock);
@@ -676,13 +780,14 @@ static struct file_lock *what_owner_is_waiting_for(struct file_lock *block_fl)
676{ 780{
677 struct file_lock *fl; 781 struct file_lock *fl;
678 782
679 list_for_each_entry(fl, &blocked_list, fl_link) { 783 hash_for_each_possible(blocked_hash, fl, fl_link, posix_owner_key(block_fl)) {
680 if (posix_same_owner(fl, block_fl)) 784 if (posix_same_owner(fl, block_fl))
681 return fl->fl_next; 785 return fl->fl_next;
682 } 786 }
683 return NULL; 787 return NULL;
684} 788}
685 789
790/* Must be called with the blocked_lock_lock held! */
686static int posix_locks_deadlock(struct file_lock *caller_fl, 791static int posix_locks_deadlock(struct file_lock *caller_fl,
687 struct file_lock *block_fl) 792 struct file_lock *block_fl)
688{ 793{
@@ -718,7 +823,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
718 return -ENOMEM; 823 return -ENOMEM;
719 } 824 }
720 825
721 lock_flocks(); 826 spin_lock(&inode->i_lock);
722 if (request->fl_flags & FL_ACCESS) 827 if (request->fl_flags & FL_ACCESS)
723 goto find_conflict; 828 goto find_conflict;
724 829
@@ -748,9 +853,9 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
748 * give it the opportunity to lock the file. 853 * give it the opportunity to lock the file.
749 */ 854 */
750 if (found) { 855 if (found) {
751 unlock_flocks(); 856 spin_unlock(&inode->i_lock);
752 cond_resched(); 857 cond_resched();
753 lock_flocks(); 858 spin_lock(&inode->i_lock);
754 } 859 }
755 860
756find_conflict: 861find_conflict:
@@ -777,7 +882,7 @@ find_conflict:
777 error = 0; 882 error = 0;
778 883
779out: 884out:
780 unlock_flocks(); 885 spin_unlock(&inode->i_lock);
781 if (new_fl) 886 if (new_fl)
782 locks_free_lock(new_fl); 887 locks_free_lock(new_fl);
783 return error; 888 return error;
@@ -791,7 +896,8 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
791 struct file_lock *left = NULL; 896 struct file_lock *left = NULL;
792 struct file_lock *right = NULL; 897 struct file_lock *right = NULL;
793 struct file_lock **before; 898 struct file_lock **before;
794 int error, added = 0; 899 int error;
900 bool added = false;
795 901
796 /* 902 /*
797 * We may need two file_lock structures for this operation, 903 * We may need two file_lock structures for this operation,
@@ -806,7 +912,12 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
806 new_fl2 = locks_alloc_lock(); 912 new_fl2 = locks_alloc_lock();
807 } 913 }
808 914
809 lock_flocks(); 915 spin_lock(&inode->i_lock);
916 /*
917 * New lock request. Walk all POSIX locks and look for conflicts. If
918 * there are any, either return error or put the request on the
919 * blocker's list of waiters and the global blocked_hash.
920 */
810 if (request->fl_type != F_UNLCK) { 921 if (request->fl_type != F_UNLCK) {
811 for_each_lock(inode, before) { 922 for_each_lock(inode, before) {
812 fl = *before; 923 fl = *before;
@@ -819,11 +930,17 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
819 error = -EAGAIN; 930 error = -EAGAIN;
820 if (!(request->fl_flags & FL_SLEEP)) 931 if (!(request->fl_flags & FL_SLEEP))
821 goto out; 932 goto out;
933 /*
934 * Deadlock detection and insertion into the blocked
935 * locks list must be done while holding the same lock!
936 */
822 error = -EDEADLK; 937 error = -EDEADLK;
823 if (posix_locks_deadlock(request, fl)) 938 spin_lock(&blocked_lock_lock);
824 goto out; 939 if (likely(!posix_locks_deadlock(request, fl))) {
825 error = FILE_LOCK_DEFERRED; 940 error = FILE_LOCK_DEFERRED;
826 locks_insert_block(fl, request); 941 __locks_insert_block(fl, request);
942 }
943 spin_unlock(&blocked_lock_lock);
827 goto out; 944 goto out;
828 } 945 }
829 } 946 }
@@ -845,7 +962,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
845 before = &fl->fl_next; 962 before = &fl->fl_next;
846 } 963 }
847 964
848 /* Process locks with this owner. */ 965 /* Process locks with this owner. */
849 while ((fl = *before) && posix_same_owner(request, fl)) { 966 while ((fl = *before) && posix_same_owner(request, fl)) {
850 /* Detect adjacent or overlapping regions (if same lock type) 967 /* Detect adjacent or overlapping regions (if same lock type)
851 */ 968 */
@@ -880,7 +997,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
880 continue; 997 continue;
881 } 998 }
882 request = fl; 999 request = fl;
883 added = 1; 1000 added = true;
884 } 1001 }
885 else { 1002 else {
886 /* Processing for different lock types is a bit 1003 /* Processing for different lock types is a bit
@@ -891,7 +1008,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
891 if (fl->fl_start > request->fl_end) 1008 if (fl->fl_start > request->fl_end)
892 break; 1009 break;
893 if (request->fl_type == F_UNLCK) 1010 if (request->fl_type == F_UNLCK)
894 added = 1; 1011 added = true;
895 if (fl->fl_start < request->fl_start) 1012 if (fl->fl_start < request->fl_start)
896 left = fl; 1013 left = fl;
897 /* If the next lock in the list has a higher end 1014 /* If the next lock in the list has a higher end
@@ -921,7 +1038,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
921 locks_release_private(fl); 1038 locks_release_private(fl);
922 locks_copy_private(fl, request); 1039 locks_copy_private(fl, request);
923 request = fl; 1040 request = fl;
924 added = 1; 1041 added = true;
925 } 1042 }
926 } 1043 }
927 /* Go on to next lock. 1044 /* Go on to next lock.
@@ -931,10 +1048,9 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
931 } 1048 }
932 1049
933 /* 1050 /*
934 * The above code only modifies existing locks in case of 1051 * The above code only modifies existing locks in case of merging or
935 * merging or replacing. If new lock(s) need to be inserted 1052 * replacing. If new lock(s) need to be inserted all modifications are
936 * all modifications are done bellow this, so it's safe yet to 1053 * done below this, so it's safe yet to bail out.
937 * bail out.
938 */ 1054 */
939 error = -ENOLCK; /* "no luck" */ 1055 error = -ENOLCK; /* "no luck" */
940 if (right && left == right && !new_fl2) 1056 if (right && left == right && !new_fl2)
@@ -974,7 +1090,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
974 locks_wake_up_blocks(left); 1090 locks_wake_up_blocks(left);
975 } 1091 }
976 out: 1092 out:
977 unlock_flocks(); 1093 spin_unlock(&inode->i_lock);
978 /* 1094 /*
979 * Free any unused locks. 1095 * Free any unused locks.
980 */ 1096 */
@@ -1049,14 +1165,14 @@ int locks_mandatory_locked(struct inode *inode)
1049 /* 1165 /*
1050 * Search the lock list for this inode for any POSIX locks. 1166 * Search the lock list for this inode for any POSIX locks.
1051 */ 1167 */
1052 lock_flocks(); 1168 spin_lock(&inode->i_lock);
1053 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { 1169 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
1054 if (!IS_POSIX(fl)) 1170 if (!IS_POSIX(fl))
1055 continue; 1171 continue;
1056 if (fl->fl_owner != owner) 1172 if (fl->fl_owner != owner)
1057 break; 1173 break;
1058 } 1174 }
1059 unlock_flocks(); 1175 spin_unlock(&inode->i_lock);
1060 return fl ? -EAGAIN : 0; 1176 return fl ? -EAGAIN : 0;
1061} 1177}
1062 1178
@@ -1199,7 +1315,7 @@ int __break_lease(struct inode *inode, unsigned int mode)
1199 if (IS_ERR(new_fl)) 1315 if (IS_ERR(new_fl))
1200 return PTR_ERR(new_fl); 1316 return PTR_ERR(new_fl);
1201 1317
1202 lock_flocks(); 1318 spin_lock(&inode->i_lock);
1203 1319
1204 time_out_leases(inode); 1320 time_out_leases(inode);
1205 1321
@@ -1249,11 +1365,11 @@ restart:
1249 break_time++; 1365 break_time++;
1250 } 1366 }
1251 locks_insert_block(flock, new_fl); 1367 locks_insert_block(flock, new_fl);
1252 unlock_flocks(); 1368 spin_unlock(&inode->i_lock);
1253 error = wait_event_interruptible_timeout(new_fl->fl_wait, 1369 error = wait_event_interruptible_timeout(new_fl->fl_wait,
1254 !new_fl->fl_next, break_time); 1370 !new_fl->fl_next, break_time);
1255 lock_flocks(); 1371 spin_lock(&inode->i_lock);
1256 __locks_delete_block(new_fl); 1372 locks_delete_block(new_fl);
1257 if (error >= 0) { 1373 if (error >= 0) {
1258 if (error == 0) 1374 if (error == 0)
1259 time_out_leases(inode); 1375 time_out_leases(inode);
@@ -1270,7 +1386,7 @@ restart:
1270 } 1386 }
1271 1387
1272out: 1388out:
1273 unlock_flocks(); 1389 spin_unlock(&inode->i_lock);
1274 locks_free_lock(new_fl); 1390 locks_free_lock(new_fl);
1275 return error; 1391 return error;
1276} 1392}
@@ -1323,9 +1439,10 @@ EXPORT_SYMBOL(lease_get_mtime);
1323int fcntl_getlease(struct file *filp) 1439int fcntl_getlease(struct file *filp)
1324{ 1440{
1325 struct file_lock *fl; 1441 struct file_lock *fl;
1442 struct inode *inode = file_inode(filp);
1326 int type = F_UNLCK; 1443 int type = F_UNLCK;
1327 1444
1328 lock_flocks(); 1445 spin_lock(&inode->i_lock);
1329 time_out_leases(file_inode(filp)); 1446 time_out_leases(file_inode(filp));
1330 for (fl = file_inode(filp)->i_flock; fl && IS_LEASE(fl); 1447 for (fl = file_inode(filp)->i_flock; fl && IS_LEASE(fl);
1331 fl = fl->fl_next) { 1448 fl = fl->fl_next) {
@@ -1334,11 +1451,11 @@ int fcntl_getlease(struct file *filp)
1334 break; 1451 break;
1335 } 1452 }
1336 } 1453 }
1337 unlock_flocks(); 1454 spin_unlock(&inode->i_lock);
1338 return type; 1455 return type;
1339} 1456}
1340 1457
1341int generic_add_lease(struct file *filp, long arg, struct file_lock **flp) 1458static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp)
1342{ 1459{
1343 struct file_lock *fl, **before, **my_before = NULL, *lease; 1460 struct file_lock *fl, **before, **my_before = NULL, *lease;
1344 struct dentry *dentry = filp->f_path.dentry; 1461 struct dentry *dentry = filp->f_path.dentry;
@@ -1351,7 +1468,7 @@ int generic_add_lease(struct file *filp, long arg, struct file_lock **flp)
1351 if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) 1468 if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
1352 goto out; 1469 goto out;
1353 if ((arg == F_WRLCK) 1470 if ((arg == F_WRLCK)
1354 && ((dentry->d_count > 1) 1471 && ((d_count(dentry) > 1)
1355 || (atomic_read(&inode->i_count) > 1))) 1472 || (atomic_read(&inode->i_count) > 1)))
1356 goto out; 1473 goto out;
1357 1474
@@ -1403,7 +1520,7 @@ out:
1403 return error; 1520 return error;
1404} 1521}
1405 1522
1406int generic_delete_lease(struct file *filp, struct file_lock **flp) 1523static int generic_delete_lease(struct file *filp, struct file_lock **flp)
1407{ 1524{
1408 struct file_lock *fl, **before; 1525 struct file_lock *fl, **before;
1409 struct dentry *dentry = filp->f_path.dentry; 1526 struct dentry *dentry = filp->f_path.dentry;
@@ -1428,7 +1545,7 @@ int generic_delete_lease(struct file *filp, struct file_lock **flp)
1428 * The (input) flp->fl_lmops->lm_break function is required 1545 * The (input) flp->fl_lmops->lm_break function is required
1429 * by break_lease(). 1546 * by break_lease().
1430 * 1547 *
1431 * Called with file_lock_lock held. 1548 * Called with inode->i_lock held.
1432 */ 1549 */
1433int generic_setlease(struct file *filp, long arg, struct file_lock **flp) 1550int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
1434{ 1551{
@@ -1497,11 +1614,12 @@ static int __vfs_setlease(struct file *filp, long arg, struct file_lock **lease)
1497 1614
1498int vfs_setlease(struct file *filp, long arg, struct file_lock **lease) 1615int vfs_setlease(struct file *filp, long arg, struct file_lock **lease)
1499{ 1616{
1617 struct inode *inode = file_inode(filp);
1500 int error; 1618 int error;
1501 1619
1502 lock_flocks(); 1620 spin_lock(&inode->i_lock);
1503 error = __vfs_setlease(filp, arg, lease); 1621 error = __vfs_setlease(filp, arg, lease);
1504 unlock_flocks(); 1622 spin_unlock(&inode->i_lock);
1505 1623
1506 return error; 1624 return error;
1507} 1625}
@@ -1519,6 +1637,7 @@ static int do_fcntl_delete_lease(struct file *filp)
1519static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg) 1637static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
1520{ 1638{
1521 struct file_lock *fl, *ret; 1639 struct file_lock *fl, *ret;
1640 struct inode *inode = file_inode(filp);
1522 struct fasync_struct *new; 1641 struct fasync_struct *new;
1523 int error; 1642 int error;
1524 1643
@@ -1532,10 +1651,10 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
1532 return -ENOMEM; 1651 return -ENOMEM;
1533 } 1652 }
1534 ret = fl; 1653 ret = fl;
1535 lock_flocks(); 1654 spin_lock(&inode->i_lock);
1536 error = __vfs_setlease(filp, arg, &ret); 1655 error = __vfs_setlease(filp, arg, &ret);
1537 if (error) { 1656 if (error) {
1538 unlock_flocks(); 1657 spin_unlock(&inode->i_lock);
1539 locks_free_lock(fl); 1658 locks_free_lock(fl);
1540 goto out_free_fasync; 1659 goto out_free_fasync;
1541 } 1660 }
@@ -1552,7 +1671,7 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
1552 new = NULL; 1671 new = NULL;
1553 1672
1554 error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); 1673 error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
1555 unlock_flocks(); 1674 spin_unlock(&inode->i_lock);
1556 1675
1557out_free_fasync: 1676out_free_fasync:
1558 if (new) 1677 if (new)
@@ -2076,7 +2195,7 @@ void locks_remove_flock(struct file *filp)
2076 fl.fl_ops->fl_release_private(&fl); 2195 fl.fl_ops->fl_release_private(&fl);
2077 } 2196 }
2078 2197
2079 lock_flocks(); 2198 spin_lock(&inode->i_lock);
2080 before = &inode->i_flock; 2199 before = &inode->i_flock;
2081 2200
2082 while ((fl = *before) != NULL) { 2201 while ((fl = *before) != NULL) {
@@ -2094,30 +2213,28 @@ void locks_remove_flock(struct file *filp)
2094 } 2213 }
2095 before = &fl->fl_next; 2214 before = &fl->fl_next;
2096 } 2215 }
2097 unlock_flocks(); 2216 spin_unlock(&inode->i_lock);
2098} 2217}
2099 2218
2100/** 2219/**
2101 * posix_unblock_lock - stop waiting for a file lock 2220 * posix_unblock_lock - stop waiting for a file lock
2102 * @filp: how the file was opened
2103 * @waiter: the lock which was waiting 2221 * @waiter: the lock which was waiting
2104 * 2222 *
2105 * lockd needs to block waiting for locks. 2223 * lockd needs to block waiting for locks.
2106 */ 2224 */
2107int 2225int
2108posix_unblock_lock(struct file *filp, struct file_lock *waiter) 2226posix_unblock_lock(struct file_lock *waiter)
2109{ 2227{
2110 int status = 0; 2228 int status = 0;
2111 2229
2112 lock_flocks(); 2230 spin_lock(&blocked_lock_lock);
2113 if (waiter->fl_next) 2231 if (waiter->fl_next)
2114 __locks_delete_block(waiter); 2232 __locks_delete_block(waiter);
2115 else 2233 else
2116 status = -ENOENT; 2234 status = -ENOENT;
2117 unlock_flocks(); 2235 spin_unlock(&blocked_lock_lock);
2118 return status; 2236 return status;
2119} 2237}
2120
2121EXPORT_SYMBOL(posix_unblock_lock); 2238EXPORT_SYMBOL(posix_unblock_lock);
2122 2239
2123/** 2240/**
@@ -2140,6 +2257,11 @@ EXPORT_SYMBOL_GPL(vfs_cancel_lock);
2140#include <linux/proc_fs.h> 2257#include <linux/proc_fs.h>
2141#include <linux/seq_file.h> 2258#include <linux/seq_file.h>
2142 2259
2260struct locks_iterator {
2261 int li_cpu;
2262 loff_t li_pos;
2263};
2264
2143static void lock_get_status(struct seq_file *f, struct file_lock *fl, 2265static void lock_get_status(struct seq_file *f, struct file_lock *fl,
2144 loff_t id, char *pfx) 2266 loff_t id, char *pfx)
2145{ 2267{
@@ -2213,37 +2335,41 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
2213 2335
2214static int locks_show(struct seq_file *f, void *v) 2336static int locks_show(struct seq_file *f, void *v)
2215{ 2337{
2338 struct locks_iterator *iter = f->private;
2216 struct file_lock *fl, *bfl; 2339 struct file_lock *fl, *bfl;
2217 2340
2218 fl = list_entry(v, struct file_lock, fl_link); 2341 fl = hlist_entry(v, struct file_lock, fl_link);
2219 2342
2220 lock_get_status(f, fl, *((loff_t *)f->private), ""); 2343 lock_get_status(f, fl, iter->li_pos, "");
2221 2344
2222 list_for_each_entry(bfl, &fl->fl_block, fl_block) 2345 list_for_each_entry(bfl, &fl->fl_block, fl_block)
2223 lock_get_status(f, bfl, *((loff_t *)f->private), " ->"); 2346 lock_get_status(f, bfl, iter->li_pos, " ->");
2224 2347
2225 return 0; 2348 return 0;
2226} 2349}
2227 2350
2228static void *locks_start(struct seq_file *f, loff_t *pos) 2351static void *locks_start(struct seq_file *f, loff_t *pos)
2229{ 2352{
2230 loff_t *p = f->private; 2353 struct locks_iterator *iter = f->private;
2231 2354
2232 lock_flocks(); 2355 iter->li_pos = *pos + 1;
2233 *p = (*pos + 1); 2356 lg_global_lock(&file_lock_lglock);
2234 return seq_list_start(&file_lock_list, *pos); 2357 spin_lock(&blocked_lock_lock);
2358 return seq_hlist_start_percpu(&file_lock_list, &iter->li_cpu, *pos);
2235} 2359}
2236 2360
2237static void *locks_next(struct seq_file *f, void *v, loff_t *pos) 2361static void *locks_next(struct seq_file *f, void *v, loff_t *pos)
2238{ 2362{
2239 loff_t *p = f->private; 2363 struct locks_iterator *iter = f->private;
2240 ++*p; 2364
2241 return seq_list_next(v, &file_lock_list, pos); 2365 ++iter->li_pos;
2366 return seq_hlist_next_percpu(v, &file_lock_list, &iter->li_cpu, pos);
2242} 2367}
2243 2368
2244static void locks_stop(struct seq_file *f, void *v) 2369static void locks_stop(struct seq_file *f, void *v)
2245{ 2370{
2246 unlock_flocks(); 2371 spin_unlock(&blocked_lock_lock);
2372 lg_global_unlock(&file_lock_lglock);
2247} 2373}
2248 2374
2249static const struct seq_operations locks_seq_operations = { 2375static const struct seq_operations locks_seq_operations = {
@@ -2255,7 +2381,8 @@ static const struct seq_operations locks_seq_operations = {
2255 2381
2256static int locks_open(struct inode *inode, struct file *filp) 2382static int locks_open(struct inode *inode, struct file *filp)
2257{ 2383{
2258 return seq_open_private(filp, &locks_seq_operations, sizeof(loff_t)); 2384 return seq_open_private(filp, &locks_seq_operations,
2385 sizeof(struct locks_iterator));
2259} 2386}
2260 2387
2261static const struct file_operations proc_locks_operations = { 2388static const struct file_operations proc_locks_operations = {
@@ -2290,7 +2417,8 @@ int lock_may_read(struct inode *inode, loff_t start, unsigned long len)
2290{ 2417{
2291 struct file_lock *fl; 2418 struct file_lock *fl;
2292 int result = 1; 2419 int result = 1;
2293 lock_flocks(); 2420
2421 spin_lock(&inode->i_lock);
2294 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { 2422 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
2295 if (IS_POSIX(fl)) { 2423 if (IS_POSIX(fl)) {
2296 if (fl->fl_type == F_RDLCK) 2424 if (fl->fl_type == F_RDLCK)
@@ -2307,7 +2435,7 @@ int lock_may_read(struct inode *inode, loff_t start, unsigned long len)
2307 result = 0; 2435 result = 0;
2308 break; 2436 break;
2309 } 2437 }
2310 unlock_flocks(); 2438 spin_unlock(&inode->i_lock);
2311 return result; 2439 return result;
2312} 2440}
2313 2441
@@ -2330,7 +2458,8 @@ int lock_may_write(struct inode *inode, loff_t start, unsigned long len)
2330{ 2458{
2331 struct file_lock *fl; 2459 struct file_lock *fl;
2332 int result = 1; 2460 int result = 1;
2333 lock_flocks(); 2461
2462 spin_lock(&inode->i_lock);
2334 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { 2463 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
2335 if (IS_POSIX(fl)) { 2464 if (IS_POSIX(fl)) {
2336 if ((fl->fl_end < start) || (fl->fl_start > (start + len))) 2465 if ((fl->fl_end < start) || (fl->fl_start > (start + len)))
@@ -2345,7 +2474,7 @@ int lock_may_write(struct inode *inode, loff_t start, unsigned long len)
2345 result = 0; 2474 result = 0;
2346 break; 2475 break;
2347 } 2476 }
2348 unlock_flocks(); 2477 spin_unlock(&inode->i_lock);
2349 return result; 2478 return result;
2350} 2479}
2351 2480
@@ -2353,9 +2482,16 @@ EXPORT_SYMBOL(lock_may_write);
2353 2482
2354static int __init filelock_init(void) 2483static int __init filelock_init(void)
2355{ 2484{
2485 int i;
2486
2356 filelock_cache = kmem_cache_create("file_lock_cache", 2487 filelock_cache = kmem_cache_create("file_lock_cache",
2357 sizeof(struct file_lock), 0, SLAB_PANIC, NULL); 2488 sizeof(struct file_lock), 0, SLAB_PANIC, NULL);
2358 2489
2490 lg_lock_init(&file_lock_lglock, "file_lock_lglock");
2491
2492 for_each_possible_cpu(i)
2493 INIT_HLIST_HEAD(per_cpu_ptr(&file_lock_list, i));
2494
2359 return 0; 2495 return 0;
2360} 2496}
2361 2497
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index 08c442902fcd..dfaf6fa9b7b5 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -93,7 +93,7 @@ static int minix_readdir(struct file *file, struct dir_context *ctx)
93 unsigned offset; 93 unsigned offset;
94 unsigned long n; 94 unsigned long n;
95 95
96 ctx->pos = pos = (pos + chunk_size-1) & ~(chunk_size-1); 96 ctx->pos = pos = ALIGN(pos, chunk_size);
97 if (pos >= inode->i_size) 97 if (pos >= inode->i_size)
98 return 0; 98 return 0;
99 99
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index 0db73d9dd668..cd950e2331b6 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -54,6 +54,18 @@ static int minix_mknod(struct inode * dir, struct dentry *dentry, umode_t mode,
54 return error; 54 return error;
55} 55}
56 56
57static int minix_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
58{
59 int error;
60 struct inode *inode = minix_new_inode(dir, mode, &error);
61 if (inode) {
62 minix_set_inode(inode, 0);
63 mark_inode_dirty(inode);
64 d_tmpfile(dentry, inode);
65 }
66 return error;
67}
68
57static int minix_create(struct inode *dir, struct dentry *dentry, umode_t mode, 69static int minix_create(struct inode *dir, struct dentry *dentry, umode_t mode,
58 bool excl) 70 bool excl)
59{ 71{
@@ -254,4 +266,5 @@ const struct inode_operations minix_dir_inode_operations = {
254 .mknod = minix_mknod, 266 .mknod = minix_mknod,
255 .rename = minix_rename, 267 .rename = minix_rename,
256 .getattr = minix_getattr, 268 .getattr = minix_getattr,
269 .tmpfile = minix_tmpfile,
257}; 270};
diff --git a/fs/namei.c b/fs/namei.c
index 9ed9361223c0..b2beee7a733f 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1352,7 +1352,7 @@ static int lookup_fast(struct nameidata *nd,
1352 */ 1352 */
1353 if (nd->flags & LOOKUP_RCU) { 1353 if (nd->flags & LOOKUP_RCU) {
1354 unsigned seq; 1354 unsigned seq;
1355 dentry = __d_lookup_rcu(parent, &nd->last, &seq, nd->inode); 1355 dentry = __d_lookup_rcu(parent, &nd->last, &seq);
1356 if (!dentry) 1356 if (!dentry)
1357 goto unlazy; 1357 goto unlazy;
1358 1358
@@ -1787,8 +1787,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
1787 struct dentry *parent = nd->path.dentry; 1787 struct dentry *parent = nd->path.dentry;
1788 nd->flags &= ~LOOKUP_JUMPED; 1788 nd->flags &= ~LOOKUP_JUMPED;
1789 if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { 1789 if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
1790 err = parent->d_op->d_hash(parent, nd->inode, 1790 err = parent->d_op->d_hash(parent, &this);
1791 &this);
1792 if (err < 0) 1791 if (err < 0)
1793 break; 1792 break;
1794 } 1793 }
@@ -2121,7 +2120,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
2121 * to use its own hash.. 2120 * to use its own hash..
2122 */ 2121 */
2123 if (base->d_flags & DCACHE_OP_HASH) { 2122 if (base->d_flags & DCACHE_OP_HASH) {
2124 int err = base->d_op->d_hash(base, base->d_inode, &this); 2123 int err = base->d_op->d_hash(base, &this);
2125 if (err < 0) 2124 if (err < 0)
2126 return ERR_PTR(err); 2125 return ERR_PTR(err);
2127 } 2126 }
@@ -2690,28 +2689,10 @@ static int do_last(struct nameidata *nd, struct path *path,
2690 nd->flags &= ~LOOKUP_PARENT; 2689 nd->flags &= ~LOOKUP_PARENT;
2691 nd->flags |= op->intent; 2690 nd->flags |= op->intent;
2692 2691
2693 switch (nd->last_type) { 2692 if (nd->last_type != LAST_NORM) {
2694 case LAST_DOTDOT:
2695 case LAST_DOT:
2696 error = handle_dots(nd, nd->last_type); 2693 error = handle_dots(nd, nd->last_type);
2697 if (error) 2694 if (error)
2698 return error; 2695 return error;
2699 /* fallthrough */
2700 case LAST_ROOT:
2701 error = complete_walk(nd);
2702 if (error)
2703 return error;
2704 audit_inode(name, nd->path.dentry, 0);
2705 if (open_flag & O_CREAT) {
2706 error = -EISDIR;
2707 goto out;
2708 }
2709 goto finish_open;
2710 case LAST_BIND:
2711 error = complete_walk(nd);
2712 if (error)
2713 return error;
2714 audit_inode(name, dir, 0);
2715 goto finish_open; 2696 goto finish_open;
2716 } 2697 }
2717 2698
@@ -2841,19 +2822,19 @@ finish_lookup:
2841 } 2822 }
2842 nd->inode = inode; 2823 nd->inode = inode;
2843 /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */ 2824 /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */
2825finish_open:
2844 error = complete_walk(nd); 2826 error = complete_walk(nd);
2845 if (error) { 2827 if (error) {
2846 path_put(&save_parent); 2828 path_put(&save_parent);
2847 return error; 2829 return error;
2848 } 2830 }
2831 audit_inode(name, nd->path.dentry, 0);
2849 error = -EISDIR; 2832 error = -EISDIR;
2850 if ((open_flag & O_CREAT) && S_ISDIR(nd->inode->i_mode)) 2833 if ((open_flag & O_CREAT) && S_ISDIR(nd->inode->i_mode))
2851 goto out; 2834 goto out;
2852 error = -ENOTDIR; 2835 error = -ENOTDIR;
2853 if ((nd->flags & LOOKUP_DIRECTORY) && !can_lookup(nd->inode)) 2836 if ((nd->flags & LOOKUP_DIRECTORY) && !can_lookup(nd->inode))
2854 goto out; 2837 goto out;
2855 audit_inode(name, nd->path.dentry, 0);
2856finish_open:
2857 if (!S_ISREG(nd->inode->i_mode)) 2838 if (!S_ISREG(nd->inode->i_mode))
2858 will_truncate = false; 2839 will_truncate = false;
2859 2840
@@ -2920,6 +2901,67 @@ stale_open:
2920 goto retry_lookup; 2901 goto retry_lookup;
2921} 2902}
2922 2903
2904static int do_tmpfile(int dfd, struct filename *pathname,
2905 struct nameidata *nd, int flags,
2906 const struct open_flags *op,
2907 struct file *file, int *opened)
2908{
2909 static const struct qstr name = QSTR_INIT("/", 1);
2910 struct dentry *dentry, *child;
2911 struct inode *dir;
2912 int error = path_lookupat(dfd, pathname->name,
2913 flags | LOOKUP_DIRECTORY, nd);
2914 if (unlikely(error))
2915 return error;
2916 error = mnt_want_write(nd->path.mnt);
2917 if (unlikely(error))
2918 goto out;
2919 /* we want directory to be writable */
2920 error = inode_permission(nd->inode, MAY_WRITE | MAY_EXEC);
2921 if (error)
2922 goto out2;
2923 dentry = nd->path.dentry;
2924 dir = dentry->d_inode;
2925 if (!dir->i_op->tmpfile) {
2926 error = -EOPNOTSUPP;
2927 goto out2;
2928 }
2929 child = d_alloc(dentry, &name);
2930 if (unlikely(!child)) {
2931 error = -ENOMEM;
2932 goto out2;
2933 }
2934 nd->flags &= ~LOOKUP_DIRECTORY;
2935 nd->flags |= op->intent;
2936 dput(nd->path.dentry);
2937 nd->path.dentry = child;
2938 error = dir->i_op->tmpfile(dir, nd->path.dentry, op->mode);
2939 if (error)
2940 goto out2;
2941 audit_inode(pathname, nd->path.dentry, 0);
2942 error = may_open(&nd->path, op->acc_mode, op->open_flag);
2943 if (error)
2944 goto out2;
2945 file->f_path.mnt = nd->path.mnt;
2946 error = finish_open(file, nd->path.dentry, NULL, opened);
2947 if (error)
2948 goto out2;
2949 error = open_check_o_direct(file);
2950 if (error) {
2951 fput(file);
2952 } else if (!(op->open_flag & O_EXCL)) {
2953 struct inode *inode = file_inode(file);
2954 spin_lock(&inode->i_lock);
2955 inode->i_state |= I_LINKABLE;
2956 spin_unlock(&inode->i_lock);
2957 }
2958out2:
2959 mnt_drop_write(nd->path.mnt);
2960out:
2961 path_put(&nd->path);
2962 return error;
2963}
2964
2923static struct file *path_openat(int dfd, struct filename *pathname, 2965static struct file *path_openat(int dfd, struct filename *pathname,
2924 struct nameidata *nd, const struct open_flags *op, int flags) 2966 struct nameidata *nd, const struct open_flags *op, int flags)
2925{ 2967{
@@ -2935,6 +2977,11 @@ static struct file *path_openat(int dfd, struct filename *pathname,
2935 2977
2936 file->f_flags = op->open_flag; 2978 file->f_flags = op->open_flag;
2937 2979
2980 if (unlikely(file->f_flags & O_TMPFILE)) {
2981 error = do_tmpfile(dfd, pathname, nd, flags, op, file, &opened);
2982 goto out;
2983 }
2984
2938 error = path_init(dfd, pathname->name, flags | LOOKUP_PARENT, nd, &base); 2985 error = path_init(dfd, pathname->name, flags | LOOKUP_PARENT, nd, &base);
2939 if (unlikely(error)) 2986 if (unlikely(error))
2940 goto out; 2987 goto out;
@@ -2987,9 +3034,10 @@ out:
2987} 3034}
2988 3035
2989struct file *do_filp_open(int dfd, struct filename *pathname, 3036struct file *do_filp_open(int dfd, struct filename *pathname,
2990 const struct open_flags *op, int flags) 3037 const struct open_flags *op)
2991{ 3038{
2992 struct nameidata nd; 3039 struct nameidata nd;
3040 int flags = op->lookup_flags;
2993 struct file *filp; 3041 struct file *filp;
2994 3042
2995 filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU); 3043 filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU);
@@ -3001,17 +3049,16 @@ struct file *do_filp_open(int dfd, struct filename *pathname,
3001} 3049}
3002 3050
3003struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt, 3051struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
3004 const char *name, const struct open_flags *op, int flags) 3052 const char *name, const struct open_flags *op)
3005{ 3053{
3006 struct nameidata nd; 3054 struct nameidata nd;
3007 struct file *file; 3055 struct file *file;
3008 struct filename filename = { .name = name }; 3056 struct filename filename = { .name = name };
3057 int flags = op->lookup_flags | LOOKUP_ROOT;
3009 3058
3010 nd.root.mnt = mnt; 3059 nd.root.mnt = mnt;
3011 nd.root.dentry = dentry; 3060 nd.root.dentry = dentry;
3012 3061
3013 flags |= LOOKUP_ROOT;
3014
3015 if (dentry->d_inode->i_op->follow_link && op->intent & LOOKUP_OPEN) 3062 if (dentry->d_inode->i_op->follow_link && op->intent & LOOKUP_OPEN)
3016 return ERR_PTR(-ELOOP); 3063 return ERR_PTR(-ELOOP);
3017 3064
@@ -3586,12 +3633,18 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
3586 3633
3587 mutex_lock(&inode->i_mutex); 3634 mutex_lock(&inode->i_mutex);
3588 /* Make sure we don't allow creating hardlink to an unlinked file */ 3635 /* Make sure we don't allow creating hardlink to an unlinked file */
3589 if (inode->i_nlink == 0) 3636 if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
3590 error = -ENOENT; 3637 error = -ENOENT;
3591 else if (max_links && inode->i_nlink >= max_links) 3638 else if (max_links && inode->i_nlink >= max_links)
3592 error = -EMLINK; 3639 error = -EMLINK;
3593 else 3640 else
3594 error = dir->i_op->link(old_dentry, dir, new_dentry); 3641 error = dir->i_op->link(old_dentry, dir, new_dentry);
3642
3643 if (!error && (inode->i_state & I_LINKABLE)) {
3644 spin_lock(&inode->i_lock);
3645 inode->i_state &= ~I_LINKABLE;
3646 spin_unlock(&inode->i_lock);
3647 }
3595 mutex_unlock(&inode->i_mutex); 3648 mutex_unlock(&inode->i_mutex);
3596 if (!error) 3649 if (!error)
3597 fsnotify_link(dir, inode, new_dentry); 3650 fsnotify_link(dir, inode, new_dentry);
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 0e7f00298213..3be047474bfc 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -73,10 +73,8 @@ const struct inode_operations ncp_dir_inode_operations =
73 * Dentry operations routines 73 * Dentry operations routines
74 */ 74 */
75static int ncp_lookup_validate(struct dentry *, unsigned int); 75static int ncp_lookup_validate(struct dentry *, unsigned int);
76static int ncp_hash_dentry(const struct dentry *, const struct inode *, 76static int ncp_hash_dentry(const struct dentry *, struct qstr *);
77 struct qstr *); 77static int ncp_compare_dentry(const struct dentry *, const struct dentry *,
78static int ncp_compare_dentry(const struct dentry *, const struct inode *,
79 const struct dentry *, const struct inode *,
80 unsigned int, const char *, const struct qstr *); 78 unsigned int, const char *, const struct qstr *);
81static int ncp_delete_dentry(const struct dentry *); 79static int ncp_delete_dentry(const struct dentry *);
82 80
@@ -119,11 +117,19 @@ static inline int ncp_case_sensitive(const struct inode *i)
119/* 117/*
120 * Note: leave the hash unchanged if the directory 118 * Note: leave the hash unchanged if the directory
121 * is case-sensitive. 119 * is case-sensitive.
120 *
121 * Accessing the parent inode can be racy under RCU pathwalking.
122 * Use ACCESS_ONCE() to make sure we use _one_ particular inode,
123 * the callers will handle races.
122 */ 124 */
123static int 125static int
124ncp_hash_dentry(const struct dentry *dentry, const struct inode *inode, 126ncp_hash_dentry(const struct dentry *dentry, struct qstr *this)
125 struct qstr *this)
126{ 127{
128 struct inode *inode = ACCESS_ONCE(dentry->d_inode);
129
130 if (!inode)
131 return 0;
132
127 if (!ncp_case_sensitive(inode)) { 133 if (!ncp_case_sensitive(inode)) {
128 struct super_block *sb = dentry->d_sb; 134 struct super_block *sb = dentry->d_sb;
129 struct nls_table *t; 135 struct nls_table *t;
@@ -140,14 +146,24 @@ ncp_hash_dentry(const struct dentry *dentry, const struct inode *inode,
140 return 0; 146 return 0;
141} 147}
142 148
149/*
150 * Accessing the parent inode can be racy under RCU pathwalking.
151 * Use ACCESS_ONCE() to make sure we use _one_ particular inode,
152 * the callers will handle races.
153 */
143static int 154static int
144ncp_compare_dentry(const struct dentry *parent, const struct inode *pinode, 155ncp_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
145 const struct dentry *dentry, const struct inode *inode,
146 unsigned int len, const char *str, const struct qstr *name) 156 unsigned int len, const char *str, const struct qstr *name)
147{ 157{
158 struct inode *pinode;
159
148 if (len != name->len) 160 if (len != name->len)
149 return 1; 161 return 1;
150 162
163 pinode = ACCESS_ONCE(parent->d_inode);
164 if (!pinode)
165 return 1;
166
151 if (ncp_case_sensitive(pinode)) 167 if (ncp_case_sensitive(pinode))
152 return strncmp(str, name->name, len); 168 return strncmp(str, name->name, len);
153 169
@@ -660,8 +676,6 @@ end_advance:
660 ctl.valid = 0; 676 ctl.valid = 0;
661 if (!ctl.filled && (ctl.fpos == ctx->pos)) { 677 if (!ctl.filled && (ctl.fpos == ctx->pos)) {
662 if (!ino) 678 if (!ino)
663 ino = find_inode_number(dentry, &qname);
664 if (!ino)
665 ino = iunique(dir->i_sb, 2); 679 ino = iunique(dir->i_sb, 2);
666 ctl.filled = !dir_emit(ctx, qname.name, qname.len, 680 ctl.filled = !dir_emit(ctx, qname.name, qname.len,
667 ino, DT_UNKNOWN); 681 ino, DT_UNKNOWN);
@@ -1123,17 +1137,6 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry,
1123 old_dentry->d_parent->d_name.name, old_dentry->d_name.name, 1137 old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
1124 new_dentry->d_parent->d_name.name, new_dentry->d_name.name); 1138 new_dentry->d_parent->d_name.name, new_dentry->d_name.name);
1125 1139
1126 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) {
1127 /*
1128 * fail with EBUSY if there are still references to this
1129 * directory.
1130 */
1131 dentry_unhash(new_dentry);
1132 error = -EBUSY;
1133 if (!d_unhashed(new_dentry))
1134 goto out;
1135 }
1136
1137 ncp_age_dentry(server, old_dentry); 1140 ncp_age_dentry(server, old_dentry);
1138 ncp_age_dentry(server, new_dentry); 1141 ncp_age_dentry(server, new_dentry);
1139 1142
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 26910c8154da..4659da67e7f6 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -403,18 +403,24 @@ static int ncp_parse_options(struct ncp_mount_data_kernel *data, char *options)
403 switch (optval) { 403 switch (optval) {
404 case 'u': 404 case 'u':
405 data->uid = make_kuid(current_user_ns(), optint); 405 data->uid = make_kuid(current_user_ns(), optint);
406 if (!uid_valid(data->uid)) 406 if (!uid_valid(data->uid)) {
407 ret = -EINVAL;
407 goto err; 408 goto err;
409 }
408 break; 410 break;
409 case 'g': 411 case 'g':
410 data->gid = make_kgid(current_user_ns(), optint); 412 data->gid = make_kgid(current_user_ns(), optint);
411 if (!gid_valid(data->gid)) 413 if (!gid_valid(data->gid)) {
414 ret = -EINVAL;
412 goto err; 415 goto err;
416 }
413 break; 417 break;
414 case 'o': 418 case 'o':
415 data->mounted_uid = make_kuid(current_user_ns(), optint); 419 data->mounted_uid = make_kuid(current_user_ns(), optint);
416 if (!uid_valid(data->mounted_uid)) 420 if (!uid_valid(data->mounted_uid)) {
421 ret = -EINVAL;
417 goto err; 422 goto err;
423 }
418 break; 424 break;
419 case 'm': 425 case 'm':
420 data->file_mode = optint; 426 data->file_mode = optint;
@@ -891,6 +897,10 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
891 if (!server) /* How this could happen? */ 897 if (!server) /* How this could happen? */
892 goto out; 898 goto out;
893 899
900 result = -EPERM;
901 if (IS_DEADDIR(dentry->d_inode))
902 goto out;
903
894 /* ageing the dentry to force validation */ 904 /* ageing the dentry to force validation */
895 ncp_age_dentry(server, dentry); 905 ncp_age_dentry(server, dentry);
896 906
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index ee24df5af1f9..3c5dd55d284c 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -117,7 +117,7 @@ int ncp_mmap(struct file *file, struct vm_area_struct *vma)
117 return -EINVAL; 117 return -EINVAL;
118 /* we do not support files bigger than 4GB... We eventually 118 /* we do not support files bigger than 4GB... We eventually
119 supports just 4GB... */ 119 supports just 4GB... */
120 if (((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff 120 if (vma_pages(vma) + vma->vm_pgoff
121 > (1U << (32 - PAGE_SHIFT))) 121 > (1U << (32 - PAGE_SHIFT)))
122 return -EFBIG; 122 return -EFBIG;
123 123
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 13ca196385f5..b5e80b0af315 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -104,6 +104,15 @@ config NFS_V4_1
104 104
105 If unsure, say N. 105 If unsure, say N.
106 106
107config NFS_V4_2
108 bool "NFS client support for NFSv4.2"
109 depends on NFS_V4_1
110 help
111 This option enables support for minor version 2 of the NFSv4 protocol
112 in the kernel's NFS client.
113
114 If unsure, say N.
115
107config PNFS_FILE_LAYOUT 116config PNFS_FILE_LAYOUT
108 tristate 117 tristate
109 depends on NFS_V4_1 118 depends on NFS_V4_1
@@ -131,6 +140,11 @@ config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN
131 If the NFS client is unchanged from the upstream kernel, this 140 If the NFS client is unchanged from the upstream kernel, this
132 option should be set to the default "kernel.org". 141 option should be set to the default "kernel.org".
133 142
143config NFS_V4_SECURITY_LABEL
144 bool
145 depends on NFS_V4_2 && SECURITY
146 default y
147
134config ROOT_NFS 148config ROOT_NFS
135 bool "Root file system on NFS" 149 bool "Root file system on NFS"
136 depends on NFS_FS=y && IP_PNP 150 depends on NFS_FS=y && IP_PNP
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index cce2c057bd2d..e0bb048e9576 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -6,8 +6,7 @@ obj-$(CONFIG_NFS_FS) += nfs.o
6 6
7nfs-y := client.o dir.o file.o getroot.o inode.o super.o \ 7nfs-y := client.o dir.o file.o getroot.o inode.o super.o \
8 direct.o pagelist.o read.o symlink.o unlink.o \ 8 direct.o pagelist.o read.o symlink.o unlink.o \
9 write.o namespace.o mount_clnt.o \ 9 write.o namespace.o mount_clnt.o
10 dns_resolve.o cache_lib.o
11nfs-$(CONFIG_ROOT_NFS) += nfsroot.o 10nfs-$(CONFIG_ROOT_NFS) += nfsroot.o
12nfs-$(CONFIG_SYSCTL) += sysctl.o 11nfs-$(CONFIG_SYSCTL) += sysctl.o
13nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o 12nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o
@@ -22,7 +21,8 @@ nfsv3-$(CONFIG_NFS_V3_ACL) += nfs3acl.o
22obj-$(CONFIG_NFS_V4) += nfsv4.o 21obj-$(CONFIG_NFS_V4) += nfsv4.o
23nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o \ 22nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o \
24 delegation.o idmap.o callback.o callback_xdr.o callback_proc.o \ 23 delegation.o idmap.o callback.o callback_xdr.o callback_proc.o \
25 nfs4namespace.o nfs4getroot.o nfs4client.o 24 nfs4namespace.o nfs4getroot.o nfs4client.o dns_resolve.o
25nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o
26nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o 26nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o
27nfsv4-$(CONFIG_NFS_V4_1) += nfs4session.o pnfs.o pnfs_dev.o 27nfsv4-$(CONFIG_NFS_V4_1) += nfs4session.o pnfs.o pnfs_dev.o
28 28
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 434b93ec0970..e242bbf72972 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -1089,9 +1089,10 @@ nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
1089 dev->pgbase = 0; 1089 dev->pgbase = 0;
1090 dev->pglen = PAGE_SIZE * max_pages; 1090 dev->pglen = PAGE_SIZE * max_pages;
1091 dev->mincount = 0; 1091 dev->mincount = 0;
1092 dev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
1092 1093
1093 dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data); 1094 dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data);
1094 rc = nfs4_proc_getdeviceinfo(server, dev); 1095 rc = nfs4_proc_getdeviceinfo(server, dev, NULL);
1095 dprintk("%s getdevice info returns %d\n", __func__, rc); 1096 dprintk("%s getdevice info returns %d\n", __func__, rc);
1096 if (rc) { 1097 if (rc) {
1097 rv = ERR_PTR(rc); 1098 rv = ERR_PTR(rc);
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index cff089a412c7..67cd73213168 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -211,7 +211,6 @@ static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt,
211 struct svc_rqst *rqstp; 211 struct svc_rqst *rqstp;
212 int (*callback_svc)(void *vrqstp); 212 int (*callback_svc)(void *vrqstp);
213 struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; 213 struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
214 char svc_name[12];
215 int ret; 214 int ret;
216 215
217 nfs_callback_bc_serv(minorversion, xprt, serv); 216 nfs_callback_bc_serv(minorversion, xprt, serv);
@@ -235,10 +234,10 @@ static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt,
235 234
236 svc_sock_update_bufs(serv); 235 svc_sock_update_bufs(serv);
237 236
238 sprintf(svc_name, "nfsv4.%u-svc", minorversion);
239 cb_info->serv = serv; 237 cb_info->serv = serv;
240 cb_info->rqst = rqstp; 238 cb_info->rqst = rqstp;
241 cb_info->task = kthread_run(callback_svc, cb_info->rqst, svc_name); 239 cb_info->task = kthread_run(callback_svc, cb_info->rqst,
240 "nfsv4.%u-svc", minorversion);
242 if (IS_ERR(cb_info->task)) { 241 if (IS_ERR(cb_info->task)) {
243 ret = PTR_ERR(cb_info->task); 242 ret = PTR_ERR(cb_info->task);
244 svc_exit_thread(cb_info->rqst); 243 svc_exit_thread(cb_info->rqst);
@@ -282,6 +281,7 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, struct n
282 ret = nfs4_callback_up_net(serv, net); 281 ret = nfs4_callback_up_net(serv, net);
283 break; 282 break;
284 case 1: 283 case 1:
284 case 2:
285 ret = nfs41_callback_up_net(serv, net); 285 ret = nfs41_callback_up_net(serv, net);
286 break; 286 break;
287 default: 287 default:
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index efd54f0a4c46..84326e9fb47a 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -32,6 +32,8 @@ enum nfs4_callback_opnum {
32 OP_CB_WANTS_CANCELLED = 12, 32 OP_CB_WANTS_CANCELLED = 12,
33 OP_CB_NOTIFY_LOCK = 13, 33 OP_CB_NOTIFY_LOCK = 13,
34 OP_CB_NOTIFY_DEVICEID = 14, 34 OP_CB_NOTIFY_DEVICEID = 14,
35/* Callback operations new to NFSv4.2 */
36 OP_CB_OFFLOAD = 15,
35 OP_CB_ILLEGAL = 10044, 37 OP_CB_ILLEGAL = 10044,
36}; 38};
37 39
@@ -39,6 +41,7 @@ struct cb_process_state {
39 __be32 drc_status; 41 __be32 drc_status;
40 struct nfs_client *clp; 42 struct nfs_client *clp;
41 u32 slotid; 43 u32 slotid;
44 u32 minorversion;
42 struct net *net; 45 struct net *net;
43}; 46};
44 47
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 0bc27684ebfa..e6ebc4c38c81 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -406,7 +406,8 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
406 int i; 406 int i;
407 __be32 status = htonl(NFS4ERR_BADSESSION); 407 __be32 status = htonl(NFS4ERR_BADSESSION);
408 408
409 clp = nfs4_find_client_sessionid(cps->net, args->csa_addr, &args->csa_sessionid); 409 clp = nfs4_find_client_sessionid(cps->net, args->csa_addr,
410 &args->csa_sessionid, cps->minorversion);
410 if (clp == NULL) 411 if (clp == NULL)
411 goto out; 412 goto out;
412 413
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index a35582c9d444..f4ccfe6521ec 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -166,9 +166,9 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound
166 if (unlikely(p == NULL)) 166 if (unlikely(p == NULL))
167 return htonl(NFS4ERR_RESOURCE); 167 return htonl(NFS4ERR_RESOURCE);
168 hdr->minorversion = ntohl(*p++); 168 hdr->minorversion = ntohl(*p++);
169 /* Check minor version is zero or one. */ 169 /* Check for minor version support */
170 if (hdr->minorversion <= 1) { 170 if (hdr->minorversion <= NFS4_MAX_MINOR_VERSION) {
171 hdr->cb_ident = ntohl(*p++); /* ignored by v4.1 */ 171 hdr->cb_ident = ntohl(*p++); /* ignored by v4.1 and v4.2 */
172 } else { 172 } else {
173 pr_warn_ratelimited("NFS: %s: NFSv4 server callback with " 173 pr_warn_ratelimited("NFS: %s: NFSv4 server callback with "
174 "illegal minor version %u!\n", 174 "illegal minor version %u!\n",
@@ -786,6 +786,26 @@ static void nfs4_cb_free_slot(struct cb_process_state *cps)
786} 786}
787#endif /* CONFIG_NFS_V4_1 */ 787#endif /* CONFIG_NFS_V4_1 */
788 788
789#ifdef CONFIG_NFS_V4_2
790static __be32
791preprocess_nfs42_op(int nop, unsigned int op_nr, struct callback_op **op)
792{
793 __be32 status = preprocess_nfs41_op(nop, op_nr, op);
794 if (status != htonl(NFS4ERR_OP_ILLEGAL))
795 return status;
796
797 if (op_nr == OP_CB_OFFLOAD)
798 return htonl(NFS4ERR_NOTSUPP);
799 return htonl(NFS4ERR_OP_ILLEGAL);
800}
801#else /* CONFIG_NFS_V4_2 */
802static __be32
803preprocess_nfs42_op(int nop, unsigned int op_nr, struct callback_op **op)
804{
805 return htonl(NFS4ERR_MINOR_VERS_MISMATCH);
806}
807#endif /* CONFIG_NFS_V4_2 */
808
789static __be32 809static __be32
790preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op) 810preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op)
791{ 811{
@@ -801,8 +821,7 @@ preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op)
801 return htonl(NFS_OK); 821 return htonl(NFS_OK);
802} 822}
803 823
804static __be32 process_op(uint32_t minorversion, int nop, 824static __be32 process_op(int nop, struct svc_rqst *rqstp,
805 struct svc_rqst *rqstp,
806 struct xdr_stream *xdr_in, void *argp, 825 struct xdr_stream *xdr_in, void *argp,
807 struct xdr_stream *xdr_out, void *resp, 826 struct xdr_stream *xdr_out, void *resp,
808 struct cb_process_state *cps) 827 struct cb_process_state *cps)
@@ -819,10 +838,22 @@ static __be32 process_op(uint32_t minorversion, int nop,
819 return status; 838 return status;
820 839
821 dprintk("%s: minorversion=%d nop=%d op_nr=%u\n", 840 dprintk("%s: minorversion=%d nop=%d op_nr=%u\n",
822 __func__, minorversion, nop, op_nr); 841 __func__, cps->minorversion, nop, op_nr);
842
843 switch (cps->minorversion) {
844 case 0:
845 status = preprocess_nfs4_op(op_nr, &op);
846 break;
847 case 1:
848 status = preprocess_nfs41_op(nop, op_nr, &op);
849 break;
850 case 2:
851 status = preprocess_nfs42_op(nop, op_nr, &op);
852 break;
853 default:
854 status = htonl(NFS4ERR_MINOR_VERS_MISMATCH);
855 }
823 856
824 status = minorversion ? preprocess_nfs41_op(nop, op_nr, &op) :
825 preprocess_nfs4_op(op_nr, &op);
826 if (status == htonl(NFS4ERR_OP_ILLEGAL)) 857 if (status == htonl(NFS4ERR_OP_ILLEGAL))
827 op_nr = OP_CB_ILLEGAL; 858 op_nr = OP_CB_ILLEGAL;
828 if (status) 859 if (status)
@@ -885,14 +916,15 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
885 return rpc_drop_reply; 916 return rpc_drop_reply;
886 } 917 }
887 918
919 cps.minorversion = hdr_arg.minorversion;
888 hdr_res.taglen = hdr_arg.taglen; 920 hdr_res.taglen = hdr_arg.taglen;
889 hdr_res.tag = hdr_arg.tag; 921 hdr_res.tag = hdr_arg.tag;
890 if (encode_compound_hdr_res(&xdr_out, &hdr_res) != 0) 922 if (encode_compound_hdr_res(&xdr_out, &hdr_res) != 0)
891 return rpc_system_err; 923 return rpc_system_err;
892 924
893 while (status == 0 && nops != hdr_arg.nops) { 925 while (status == 0 && nops != hdr_arg.nops) {
894 status = process_op(hdr_arg.minorversion, nops, rqstp, 926 status = process_op(nops, rqstp, &xdr_in,
895 &xdr_in, argp, &xdr_out, resp, &cps); 927 argp, &xdr_out, resp, &cps);
896 nops++; 928 nops++;
897 } 929 }
898 930
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index c513b0cc835f..340b1eff0267 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -753,8 +753,6 @@ static int nfs_init_server(struct nfs_server *server,
753 data->timeo, data->retrans); 753 data->timeo, data->retrans);
754 if (data->flags & NFS_MOUNT_NORESVPORT) 754 if (data->flags & NFS_MOUNT_NORESVPORT)
755 set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); 755 set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
756 if (server->options & NFS_OPTION_MIGRATION)
757 set_bit(NFS_CS_MIGRATION, &cl_init.init_flags);
758 756
759 /* Allocate or find a client reference we can use */ 757 /* Allocate or find a client reference we can use */
760 clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX); 758 clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX);
@@ -1076,7 +1074,7 @@ struct nfs_server *nfs_create_server(struct nfs_mount_info *mount_info,
1076 } 1074 }
1077 1075
1078 if (!(fattr->valid & NFS_ATTR_FATTR)) { 1076 if (!(fattr->valid & NFS_ATTR_FATTR)) {
1079 error = nfs_mod->rpc_ops->getattr(server, mount_info->mntfh, fattr); 1077 error = nfs_mod->rpc_ops->getattr(server, mount_info->mntfh, fattr, NULL);
1080 if (error < 0) { 1078 if (error < 0) {
1081 dprintk("nfs_create_server: getattr error = %d\n", -error); 1079 dprintk("nfs_create_server: getattr error = %d\n", -error);
1082 goto error; 1080 goto error;
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 57db3244f4d9..7ec4814e298d 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -73,20 +73,20 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_
73 if (inode->i_flock == NULL) 73 if (inode->i_flock == NULL)
74 goto out; 74 goto out;
75 75
76 /* Protect inode->i_flock using the file locks lock */ 76 /* Protect inode->i_flock using the i_lock */
77 lock_flocks(); 77 spin_lock(&inode->i_lock);
78 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { 78 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
79 if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK))) 79 if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
80 continue; 80 continue;
81 if (nfs_file_open_context(fl->fl_file) != ctx) 81 if (nfs_file_open_context(fl->fl_file) != ctx)
82 continue; 82 continue;
83 unlock_flocks(); 83 spin_unlock(&inode->i_lock);
84 status = nfs4_lock_delegation_recall(fl, state, stateid); 84 status = nfs4_lock_delegation_recall(fl, state, stateid);
85 if (status < 0) 85 if (status < 0)
86 goto out; 86 goto out;
87 lock_flocks(); 87 spin_lock(&inode->i_lock);
88 } 88 }
89 unlock_flocks(); 89 spin_unlock(&inode->i_lock);
90out: 90out:
91 return status; 91 return status;
92} 92}
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 5d051419527b..e474ca2b2bfe 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -33,6 +33,7 @@
33#include <linux/pagevec.h> 33#include <linux/pagevec.h>
34#include <linux/namei.h> 34#include <linux/namei.h>
35#include <linux/mount.h> 35#include <linux/mount.h>
36#include <linux/swap.h>
36#include <linux/sched.h> 37#include <linux/sched.h>
37#include <linux/kmemleak.h> 38#include <linux/kmemleak.h>
38#include <linux/xattr.h> 39#include <linux/xattr.h>
@@ -436,6 +437,7 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
436 struct dentry *alias; 437 struct dentry *alias;
437 struct inode *dir = parent->d_inode; 438 struct inode *dir = parent->d_inode;
438 struct inode *inode; 439 struct inode *inode;
440 int status;
439 441
440 if (filename.name[0] == '.') { 442 if (filename.name[0] == '.') {
441 if (filename.len == 1) 443 if (filename.len == 1)
@@ -448,7 +450,10 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
448 dentry = d_lookup(parent, &filename); 450 dentry = d_lookup(parent, &filename);
449 if (dentry != NULL) { 451 if (dentry != NULL) {
450 if (nfs_same_file(dentry, entry)) { 452 if (nfs_same_file(dentry, entry)) {
451 nfs_refresh_inode(dentry->d_inode, entry->fattr); 453 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
454 status = nfs_refresh_inode(dentry->d_inode, entry->fattr);
455 if (!status)
456 nfs_setsecurity(dentry->d_inode, entry->fattr, entry->label);
452 goto out; 457 goto out;
453 } else { 458 } else {
454 if (d_invalidate(dentry) != 0) 459 if (d_invalidate(dentry) != 0)
@@ -461,7 +466,7 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
461 if (dentry == NULL) 466 if (dentry == NULL)
462 return; 467 return;
463 468
464 inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr); 469 inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr, entry->label);
465 if (IS_ERR(inode)) 470 if (IS_ERR(inode))
466 goto out; 471 goto out;
467 472
@@ -586,10 +591,16 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
586 if (entry.fh == NULL || entry.fattr == NULL) 591 if (entry.fh == NULL || entry.fattr == NULL)
587 goto out; 592 goto out;
588 593
594 entry.label = nfs4_label_alloc(NFS_SERVER(inode), GFP_NOWAIT);
595 if (IS_ERR(entry.label)) {
596 status = PTR_ERR(entry.label);
597 goto out;
598 }
599
589 array = nfs_readdir_get_array(page); 600 array = nfs_readdir_get_array(page);
590 if (IS_ERR(array)) { 601 if (IS_ERR(array)) {
591 status = PTR_ERR(array); 602 status = PTR_ERR(array);
592 goto out; 603 goto out_label_free;
593 } 604 }
594 memset(array, 0, sizeof(struct nfs_cache_array)); 605 memset(array, 0, sizeof(struct nfs_cache_array));
595 array->eof_index = -1; 606 array->eof_index = -1;
@@ -615,6 +626,8 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
615 nfs_readdir_free_large_page(pages_ptr, pages, array_size); 626 nfs_readdir_free_large_page(pages_ptr, pages, array_size);
616out_release_array: 627out_release_array:
617 nfs_readdir_release_array(page); 628 nfs_readdir_release_array(page);
629out_label_free:
630 nfs4_label_free(entry.label);
618out: 631out:
619 nfs_free_fattr(entry.fattr); 632 nfs_free_fattr(entry.fattr);
620 nfs_free_fhandle(entry.fh); 633 nfs_free_fhandle(entry.fh);
@@ -805,7 +818,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
805 nfs_readdir_descriptor_t my_desc, 818 nfs_readdir_descriptor_t my_desc,
806 *desc = &my_desc; 819 *desc = &my_desc;
807 struct nfs_open_dir_context *dir_ctx = file->private_data; 820 struct nfs_open_dir_context *dir_ctx = file->private_data;
808 int res; 821 int res = 0;
809 822
810 dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n", 823 dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n",
811 dentry->d_parent->d_name.name, dentry->d_name.name, 824 dentry->d_parent->d_name.name, dentry->d_name.name,
@@ -827,7 +840,8 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
827 desc->plus = nfs_use_readdirplus(inode, ctx) ? 1 : 0; 840 desc->plus = nfs_use_readdirplus(inode, ctx) ? 1 : 0;
828 841
829 nfs_block_sillyrename(dentry); 842 nfs_block_sillyrename(dentry);
830 res = nfs_revalidate_mapping(inode, file->f_mapping); 843 if (ctx->pos == 0 || nfs_attribute_cache_expired(inode))
844 res = nfs_revalidate_mapping(inode, file->f_mapping);
831 if (res < 0) 845 if (res < 0)
832 goto out; 846 goto out;
833 847
@@ -1039,6 +1053,7 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
1039 struct dentry *parent; 1053 struct dentry *parent;
1040 struct nfs_fh *fhandle = NULL; 1054 struct nfs_fh *fhandle = NULL;
1041 struct nfs_fattr *fattr = NULL; 1055 struct nfs_fattr *fattr = NULL;
1056 struct nfs4_label *label = NULL;
1042 int error; 1057 int error;
1043 1058
1044 if (flags & LOOKUP_RCU) 1059 if (flags & LOOKUP_RCU)
@@ -1081,7 +1096,11 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
1081 if (fhandle == NULL || fattr == NULL) 1096 if (fhandle == NULL || fattr == NULL)
1082 goto out_error; 1097 goto out_error;
1083 1098
1084 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); 1099 label = nfs4_label_alloc(NFS_SERVER(inode), GFP_NOWAIT);
1100 if (IS_ERR(label))
1101 goto out_error;
1102
1103 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label);
1085 if (error) 1104 if (error)
1086 goto out_bad; 1105 goto out_bad;
1087 if (nfs_compare_fh(NFS_FH(inode), fhandle)) 1106 if (nfs_compare_fh(NFS_FH(inode), fhandle))
@@ -1089,8 +1108,12 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
1089 if ((error = nfs_refresh_inode(inode, fattr)) != 0) 1108 if ((error = nfs_refresh_inode(inode, fattr)) != 0)
1090 goto out_bad; 1109 goto out_bad;
1091 1110
1111 nfs_setsecurity(inode, fattr, label);
1112
1092 nfs_free_fattr(fattr); 1113 nfs_free_fattr(fattr);
1093 nfs_free_fhandle(fhandle); 1114 nfs_free_fhandle(fhandle);
1115 nfs4_label_free(label);
1116
1094out_set_verifier: 1117out_set_verifier:
1095 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 1118 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1096 out_valid: 1119 out_valid:
@@ -1107,6 +1130,7 @@ out_zap_parent:
1107 out_bad: 1130 out_bad:
1108 nfs_free_fattr(fattr); 1131 nfs_free_fattr(fattr);
1109 nfs_free_fhandle(fhandle); 1132 nfs_free_fhandle(fhandle);
1133 nfs4_label_free(label);
1110 nfs_mark_for_revalidate(dir); 1134 nfs_mark_for_revalidate(dir);
1111 if (inode && S_ISDIR(inode->i_mode)) { 1135 if (inode && S_ISDIR(inode->i_mode)) {
1112 /* Purge readdir caches. */ 1136 /* Purge readdir caches. */
@@ -1127,6 +1151,7 @@ out_zap_parent:
1127out_error: 1151out_error:
1128 nfs_free_fattr(fattr); 1152 nfs_free_fattr(fattr);
1129 nfs_free_fhandle(fhandle); 1153 nfs_free_fhandle(fhandle);
1154 nfs4_label_free(label);
1130 dput(parent); 1155 dput(parent);
1131 dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) lookup returned error %d\n", 1156 dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) lookup returned error %d\n",
1132 __func__, dentry->d_parent->d_name.name, 1157 __func__, dentry->d_parent->d_name.name,
@@ -1255,6 +1280,7 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in
1255 struct inode *inode = NULL; 1280 struct inode *inode = NULL;
1256 struct nfs_fh *fhandle = NULL; 1281 struct nfs_fh *fhandle = NULL;
1257 struct nfs_fattr *fattr = NULL; 1282 struct nfs_fattr *fattr = NULL;
1283 struct nfs4_label *label = NULL;
1258 int error; 1284 int error;
1259 1285
1260 dfprintk(VFS, "NFS: lookup(%s/%s)\n", 1286 dfprintk(VFS, "NFS: lookup(%s/%s)\n",
@@ -1281,17 +1307,21 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in
1281 if (fhandle == NULL || fattr == NULL) 1307 if (fhandle == NULL || fattr == NULL)
1282 goto out; 1308 goto out;
1283 1309
1310 label = nfs4_label_alloc(NFS_SERVER(dir), GFP_NOWAIT);
1311 if (IS_ERR(label))
1312 goto out;
1313
1284 parent = dentry->d_parent; 1314 parent = dentry->d_parent;
1285 /* Protect against concurrent sillydeletes */ 1315 /* Protect against concurrent sillydeletes */
1286 nfs_block_sillyrename(parent); 1316 nfs_block_sillyrename(parent);
1287 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); 1317 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label);
1288 if (error == -ENOENT) 1318 if (error == -ENOENT)
1289 goto no_entry; 1319 goto no_entry;
1290 if (error < 0) { 1320 if (error < 0) {
1291 res = ERR_PTR(error); 1321 res = ERR_PTR(error);
1292 goto out_unblock_sillyrename; 1322 goto out_unblock_sillyrename;
1293 } 1323 }
1294 inode = nfs_fhget(dentry->d_sb, fhandle, fattr); 1324 inode = nfs_fhget(dentry->d_sb, fhandle, fattr, label);
1295 res = ERR_CAST(inode); 1325 res = ERR_CAST(inode);
1296 if (IS_ERR(res)) 1326 if (IS_ERR(res))
1297 goto out_unblock_sillyrename; 1327 goto out_unblock_sillyrename;
@@ -1309,6 +1339,7 @@ no_entry:
1309 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 1339 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1310out_unblock_sillyrename: 1340out_unblock_sillyrename:
1311 nfs_unblock_sillyrename(parent); 1341 nfs_unblock_sillyrename(parent);
1342 nfs4_label_free(label);
1312out: 1343out:
1313 nfs_free_fattr(fattr); 1344 nfs_free_fattr(fattr);
1314 nfs_free_fhandle(fhandle); 1345 nfs_free_fhandle(fhandle);
@@ -1356,18 +1387,6 @@ static int nfs_finish_open(struct nfs_open_context *ctx,
1356{ 1387{
1357 int err; 1388 int err;
1358 1389
1359 if (ctx->dentry != dentry) {
1360 dput(ctx->dentry);
1361 ctx->dentry = dget(dentry);
1362 }
1363
1364 /* If the open_intent is for execute, we have an extra check to make */
1365 if (ctx->mode & FMODE_EXEC) {
1366 err = nfs_may_open(dentry->d_inode, ctx->cred, open_flags);
1367 if (err < 0)
1368 goto out;
1369 }
1370
1371 err = finish_open(file, dentry, do_open, opened); 1390 err = finish_open(file, dentry, do_open, opened);
1372 if (err) 1391 if (err)
1373 goto out; 1392 goto out;
@@ -1426,13 +1445,13 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
1426 1445
1427 nfs_block_sillyrename(dentry->d_parent); 1446 nfs_block_sillyrename(dentry->d_parent);
1428 inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr); 1447 inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr);
1429 d_drop(dentry); 1448 nfs_unblock_sillyrename(dentry->d_parent);
1430 if (IS_ERR(inode)) { 1449 if (IS_ERR(inode)) {
1431 nfs_unblock_sillyrename(dentry->d_parent);
1432 put_nfs_open_context(ctx); 1450 put_nfs_open_context(ctx);
1433 err = PTR_ERR(inode); 1451 err = PTR_ERR(inode);
1434 switch (err) { 1452 switch (err) {
1435 case -ENOENT: 1453 case -ENOENT:
1454 d_drop(dentry);
1436 d_add(dentry, NULL); 1455 d_add(dentry, NULL);
1437 break; 1456 break;
1438 case -EISDIR: 1457 case -EISDIR:
@@ -1448,16 +1467,8 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
1448 } 1467 }
1449 goto out; 1468 goto out;
1450 } 1469 }
1451 res = d_add_unique(dentry, inode);
1452 if (res != NULL)
1453 dentry = res;
1454
1455 nfs_unblock_sillyrename(dentry->d_parent);
1456 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1457
1458 err = nfs_finish_open(ctx, dentry, file, open_flags, opened);
1459 1470
1460 dput(res); 1471 err = nfs_finish_open(ctx, ctx->dentry, file, open_flags, opened);
1461out: 1472out:
1462 return err; 1473 return err;
1463 1474
@@ -1527,7 +1538,8 @@ no_open:
1527 * Code common to create, mkdir, and mknod. 1538 * Code common to create, mkdir, and mknod.
1528 */ 1539 */
1529int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle, 1540int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
1530 struct nfs_fattr *fattr) 1541 struct nfs_fattr *fattr,
1542 struct nfs4_label *label)
1531{ 1543{
1532 struct dentry *parent = dget_parent(dentry); 1544 struct dentry *parent = dget_parent(dentry);
1533 struct inode *dir = parent->d_inode; 1545 struct inode *dir = parent->d_inode;
@@ -1540,18 +1552,18 @@ int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
1540 if (dentry->d_inode) 1552 if (dentry->d_inode)
1541 goto out; 1553 goto out;
1542 if (fhandle->size == 0) { 1554 if (fhandle->size == 0) {
1543 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); 1555 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, NULL);
1544 if (error) 1556 if (error)
1545 goto out_error; 1557 goto out_error;
1546 } 1558 }
1547 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 1559 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1548 if (!(fattr->valid & NFS_ATTR_FATTR)) { 1560 if (!(fattr->valid & NFS_ATTR_FATTR)) {
1549 struct nfs_server *server = NFS_SB(dentry->d_sb); 1561 struct nfs_server *server = NFS_SB(dentry->d_sb);
1550 error = server->nfs_client->rpc_ops->getattr(server, fhandle, fattr); 1562 error = server->nfs_client->rpc_ops->getattr(server, fhandle, fattr, NULL);
1551 if (error < 0) 1563 if (error < 0)
1552 goto out_error; 1564 goto out_error;
1553 } 1565 }
1554 inode = nfs_fhget(dentry->d_sb, fhandle, fattr); 1566 inode = nfs_fhget(dentry->d_sb, fhandle, fattr, label);
1555 error = PTR_ERR(inode); 1567 error = PTR_ERR(inode);
1556 if (IS_ERR(inode)) 1568 if (IS_ERR(inode))
1557 goto out_error; 1569 goto out_error;
@@ -1720,7 +1732,7 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry)
1720 dir->i_ino, dentry->d_name.name); 1732 dir->i_ino, dentry->d_name.name);
1721 1733
1722 spin_lock(&dentry->d_lock); 1734 spin_lock(&dentry->d_lock);
1723 if (dentry->d_count > 1) { 1735 if (d_count(dentry) > 1) {
1724 spin_unlock(&dentry->d_lock); 1736 spin_unlock(&dentry->d_lock);
1725 /* Start asynchronous writeout of the inode */ 1737 /* Start asynchronous writeout of the inode */
1726 write_inode_now(dentry->d_inode, 0); 1738 write_inode_now(dentry->d_inode, 0);
@@ -1758,7 +1770,6 @@ EXPORT_SYMBOL_GPL(nfs_unlink);
1758 */ 1770 */
1759int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) 1771int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
1760{ 1772{
1761 struct pagevec lru_pvec;
1762 struct page *page; 1773 struct page *page;
1763 char *kaddr; 1774 char *kaddr;
1764 struct iattr attr; 1775 struct iattr attr;
@@ -1798,11 +1809,8 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
1798 * No big deal if we can't add this page to the page cache here. 1809 * No big deal if we can't add this page to the page cache here.
1799 * READLINK will get the missing page from the server if needed. 1810 * READLINK will get the missing page from the server if needed.
1800 */ 1811 */
1801 pagevec_init(&lru_pvec, 0); 1812 if (!add_to_page_cache_lru(page, dentry->d_inode->i_mapping, 0,
1802 if (!add_to_page_cache(page, dentry->d_inode->i_mapping, 0,
1803 GFP_KERNEL)) { 1813 GFP_KERNEL)) {
1804 pagevec_add(&lru_pvec, page);
1805 pagevec_lru_add_file(&lru_pvec);
1806 SetPageUptodate(page); 1814 SetPageUptodate(page);
1807 unlock_page(page); 1815 unlock_page(page);
1808 } else 1816 } else
@@ -1869,7 +1877,7 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1869 dfprintk(VFS, "NFS: rename(%s/%s -> %s/%s, ct=%d)\n", 1877 dfprintk(VFS, "NFS: rename(%s/%s -> %s/%s, ct=%d)\n",
1870 old_dentry->d_parent->d_name.name, old_dentry->d_name.name, 1878 old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
1871 new_dentry->d_parent->d_name.name, new_dentry->d_name.name, 1879 new_dentry->d_parent->d_name.name, new_dentry->d_name.name,
1872 new_dentry->d_count); 1880 d_count(new_dentry));
1873 1881
1874 /* 1882 /*
1875 * For non-directories, check whether the target is busy and if so, 1883 * For non-directories, check whether the target is busy and if so,
@@ -1887,7 +1895,7 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1887 rehash = new_dentry; 1895 rehash = new_dentry;
1888 } 1896 }
1889 1897
1890 if (new_dentry->d_count > 2) { 1898 if (d_count(new_dentry) > 2) {
1891 int err; 1899 int err;
1892 1900
1893 /* copy the target dentry's name */ 1901 /* copy the target dentry's name */
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index 945527092295..fc0f95ec7358 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -29,7 +29,6 @@ ssize_t nfs_dns_resolve_name(struct net *net, char *name, size_t namelen,
29 kfree(ip_addr); 29 kfree(ip_addr);
30 return ret; 30 return ret;
31} 31}
32EXPORT_SYMBOL_GPL(nfs_dns_resolve_name);
33 32
34#else 33#else
35 34
@@ -351,7 +350,6 @@ ssize_t nfs_dns_resolve_name(struct net *net, char *name,
351 ret = -ESRCH; 350 ret = -ESRCH;
352 return ret; 351 return ret;
353} 352}
354EXPORT_SYMBOL_GPL(nfs_dns_resolve_name);
355 353
356static struct cache_detail nfs_dns_resolve_template = { 354static struct cache_detail nfs_dns_resolve_template = {
357 .owner = THIS_MODULE, 355 .owner = THIS_MODULE,
@@ -396,6 +394,21 @@ void nfs_dns_resolver_cache_destroy(struct net *net)
396 cache_destroy_net(nn->nfs_dns_resolve, net); 394 cache_destroy_net(nn->nfs_dns_resolve, net);
397} 395}
398 396
397static int nfs4_dns_net_init(struct net *net)
398{
399 return nfs_dns_resolver_cache_init(net);
400}
401
402static void nfs4_dns_net_exit(struct net *net)
403{
404 nfs_dns_resolver_cache_destroy(net);
405}
406
407static struct pernet_operations nfs4_dns_resolver_ops = {
408 .init = nfs4_dns_net_init,
409 .exit = nfs4_dns_net_exit,
410};
411
399static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event, 412static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
400 void *ptr) 413 void *ptr)
401{ 414{
@@ -432,11 +445,24 @@ static struct notifier_block nfs_dns_resolver_block = {
432 445
433int nfs_dns_resolver_init(void) 446int nfs_dns_resolver_init(void)
434{ 447{
435 return rpc_pipefs_notifier_register(&nfs_dns_resolver_block); 448 int err;
449
450 err = register_pernet_subsys(&nfs4_dns_resolver_ops);
451 if (err < 0)
452 goto out;
453 err = rpc_pipefs_notifier_register(&nfs_dns_resolver_block);
454 if (err < 0)
455 goto out1;
456 return 0;
457out1:
458 unregister_pernet_subsys(&nfs4_dns_resolver_ops);
459out:
460 return err;
436} 461}
437 462
438void nfs_dns_resolver_destroy(void) 463void nfs_dns_resolver_destroy(void)
439{ 464{
440 rpc_pipefs_notifier_unregister(&nfs_dns_resolver_block); 465 rpc_pipefs_notifier_unregister(&nfs_dns_resolver_block);
466 unregister_pernet_subsys(&nfs4_dns_resolver_ops);
441} 467}
442#endif 468#endif
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 6b4a79f4ad1d..94e94bd11aae 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -495,6 +495,35 @@ static int nfs_release_page(struct page *page, gfp_t gfp)
495 return nfs_fscache_release_page(page, gfp); 495 return nfs_fscache_release_page(page, gfp);
496} 496}
497 497
498static void nfs_check_dirty_writeback(struct page *page,
499 bool *dirty, bool *writeback)
500{
501 struct nfs_inode *nfsi;
502 struct address_space *mapping = page_file_mapping(page);
503
504 if (!mapping || PageSwapCache(page))
505 return;
506
507 /*
508 * Check if an unstable page is currently being committed and
509 * if so, have the VM treat it as if the page is under writeback
510 * so it will not block due to pages that will shortly be freeable.
511 */
512 nfsi = NFS_I(mapping->host);
513 if (test_bit(NFS_INO_COMMIT, &nfsi->flags)) {
514 *writeback = true;
515 return;
516 }
517
518 /*
519 * If PagePrivate() is set, then the page is not freeable and as the
520 * inode is not being committed, it's not going to be cleaned in the
521 * near future so treat it as dirty
522 */
523 if (PagePrivate(page))
524 *dirty = true;
525}
526
498/* 527/*
499 * Attempt to clear the private state associated with a page when an error 528 * Attempt to clear the private state associated with a page when an error
500 * occurs that requires the cached contents of an inode to be written back or 529 * occurs that requires the cached contents of an inode to be written back or
@@ -542,6 +571,7 @@ const struct address_space_operations nfs_file_aops = {
542 .direct_IO = nfs_direct_IO, 571 .direct_IO = nfs_direct_IO,
543 .migratepage = nfs_migrate_page, 572 .migratepage = nfs_migrate_page,
544 .launder_page = nfs_launder_page, 573 .launder_page = nfs_launder_page,
574 .is_dirty_writeback = nfs_check_dirty_writeback,
545 .error_remove_page = generic_error_remove_page, 575 .error_remove_page = generic_error_remove_page,
546#ifdef CONFIG_NFS_SWAP 576#ifdef CONFIG_NFS_SWAP
547 .swap_activate = nfs_swap_activate, 577 .swap_activate = nfs_swap_activate,
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index 44efaa8c5f78..66984a9aafaa 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -95,7 +95,7 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh,
95 goto out; 95 goto out;
96 } 96 }
97 97
98 inode = nfs_fhget(sb, mntfh, fsinfo.fattr); 98 inode = nfs_fhget(sb, mntfh, fsinfo.fattr, NULL);
99 if (IS_ERR(inode)) { 99 if (IS_ERR(inode)) {
100 dprintk("nfs_get_root: get root inode failed\n"); 100 dprintk("nfs_get_root: get root inode failed\n");
101 ret = ERR_CAST(inode); 101 ret = ERR_CAST(inode);
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index c516da5873fd..c2c4163d5683 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -262,29 +262,42 @@ static ssize_t nfs_idmap_get_desc(const char *name, size_t namelen,
262 return desclen; 262 return desclen;
263} 263}
264 264
265static ssize_t nfs_idmap_request_key(struct key_type *key_type, 265static struct key *nfs_idmap_request_key(const char *name, size_t namelen,
266 const char *name, size_t namelen, 266 const char *type, struct idmap *idmap)
267 const char *type, void *data,
268 size_t data_size, struct idmap *idmap)
269{ 267{
270 const struct cred *saved_cred;
271 struct key *rkey;
272 char *desc; 268 char *desc;
273 struct user_key_payload *payload; 269 struct key *rkey;
274 ssize_t ret; 270 ssize_t ret;
275 271
276 ret = nfs_idmap_get_desc(name, namelen, type, strlen(type), &desc); 272 ret = nfs_idmap_get_desc(name, namelen, type, strlen(type), &desc);
277 if (ret <= 0) 273 if (ret <= 0)
278 goto out; 274 return ERR_PTR(ret);
275
276 rkey = request_key(&key_type_id_resolver, desc, "");
277 if (IS_ERR(rkey)) {
278 mutex_lock(&idmap->idmap_mutex);
279 rkey = request_key_with_auxdata(&key_type_id_resolver_legacy,
280 desc, "", 0, idmap);
281 mutex_unlock(&idmap->idmap_mutex);
282 }
283
284 kfree(desc);
285 return rkey;
286}
287
288static ssize_t nfs_idmap_get_key(const char *name, size_t namelen,
289 const char *type, void *data,
290 size_t data_size, struct idmap *idmap)
291{
292 const struct cred *saved_cred;
293 struct key *rkey;
294 struct user_key_payload *payload;
295 ssize_t ret;
279 296
280 saved_cred = override_creds(id_resolver_cache); 297 saved_cred = override_creds(id_resolver_cache);
281 if (idmap) 298 rkey = nfs_idmap_request_key(name, namelen, type, idmap);
282 rkey = request_key_with_auxdata(key_type, desc, "", 0, idmap);
283 else
284 rkey = request_key(&key_type_id_resolver, desc, "");
285 revert_creds(saved_cred); 299 revert_creds(saved_cred);
286 300
287 kfree(desc);
288 if (IS_ERR(rkey)) { 301 if (IS_ERR(rkey)) {
289 ret = PTR_ERR(rkey); 302 ret = PTR_ERR(rkey);
290 goto out; 303 goto out;
@@ -316,23 +329,6 @@ out:
316 return ret; 329 return ret;
317} 330}
318 331
319static ssize_t nfs_idmap_get_key(const char *name, size_t namelen,
320 const char *type, void *data,
321 size_t data_size, struct idmap *idmap)
322{
323 ssize_t ret = nfs_idmap_request_key(&key_type_id_resolver,
324 name, namelen, type, data,
325 data_size, NULL);
326 if (ret < 0) {
327 mutex_lock(&idmap->idmap_mutex);
328 ret = nfs_idmap_request_key(&key_type_id_resolver_legacy,
329 name, namelen, type, data,
330 data_size, idmap);
331 mutex_unlock(&idmap->idmap_mutex);
332 }
333 return ret;
334}
335
336/* ID -> Name */ 332/* ID -> Name */
337static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, char *buf, 333static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, char *buf,
338 size_t buflen, struct idmap *idmap) 334 size_t buflen, struct idmap *idmap)
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index c1c7a9d78722..af6e806044d7 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -48,7 +48,6 @@
48#include "iostat.h" 48#include "iostat.h"
49#include "internal.h" 49#include "internal.h"
50#include "fscache.h" 50#include "fscache.h"
51#include "dns_resolve.h"
52#include "pnfs.h" 51#include "pnfs.h"
53#include "nfs.h" 52#include "nfs.h"
54#include "netns.h" 53#include "netns.h"
@@ -79,7 +78,7 @@ int nfs_wait_bit_killable(void *word)
79{ 78{
80 if (fatal_signal_pending(current)) 79 if (fatal_signal_pending(current))
81 return -ERESTARTSYS; 80 return -ERESTARTSYS;
82 freezable_schedule(); 81 freezable_schedule_unsafe();
83 return 0; 82 return 0;
84} 83}
85EXPORT_SYMBOL_GPL(nfs_wait_bit_killable); 84EXPORT_SYMBOL_GPL(nfs_wait_bit_killable);
@@ -162,11 +161,19 @@ static void nfs_zap_caches_locked(struct inode *inode)
162 161
163 memset(NFS_I(inode)->cookieverf, 0, sizeof(NFS_I(inode)->cookieverf)); 162 memset(NFS_I(inode)->cookieverf, 0, sizeof(NFS_I(inode)->cookieverf));
164 if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) { 163 if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) {
165 nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE;
166 nfs_fscache_invalidate(inode); 164 nfs_fscache_invalidate(inode);
167 } else { 165 nfsi->cache_validity |= NFS_INO_INVALID_ATTR
168 nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE; 166 | NFS_INO_INVALID_LABEL
169 } 167 | NFS_INO_INVALID_DATA
168 | NFS_INO_INVALID_ACCESS
169 | NFS_INO_INVALID_ACL
170 | NFS_INO_REVAL_PAGECACHE;
171 } else
172 nfsi->cache_validity |= NFS_INO_INVALID_ATTR
173 | NFS_INO_INVALID_LABEL
174 | NFS_INO_INVALID_ACCESS
175 | NFS_INO_INVALID_ACL
176 | NFS_INO_REVAL_PAGECACHE;
170} 177}
171 178
172void nfs_zap_caches(struct inode *inode) 179void nfs_zap_caches(struct inode *inode)
@@ -257,12 +264,72 @@ nfs_init_locked(struct inode *inode, void *opaque)
257 return 0; 264 return 0;
258} 265}
259 266
267#ifdef CONFIG_NFS_V4_SECURITY_LABEL
268void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr,
269 struct nfs4_label *label)
270{
271 int error;
272
273 if (label == NULL)
274 return;
275
276 if (nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL) == 0)
277 return;
278
279 if (NFS_SERVER(inode)->nfs_client->cl_minorversion < 2)
280 return;
281
282 if ((fattr->valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL) && inode->i_security) {
283 error = security_inode_notifysecctx(inode, label->label,
284 label->len);
285 if (error)
286 printk(KERN_ERR "%s() %s %d "
287 "security_inode_notifysecctx() %d\n",
288 __func__,
289 (char *)label->label,
290 label->len, error);
291 }
292}
293
294struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags)
295{
296 struct nfs4_label *label = NULL;
297 int minor_version = server->nfs_client->cl_minorversion;
298
299 if (minor_version < 2)
300 return label;
301
302 if (!(server->caps & NFS_CAP_SECURITY_LABEL))
303 return label;
304
305 label = kzalloc(sizeof(struct nfs4_label), flags);
306 if (label == NULL)
307 return ERR_PTR(-ENOMEM);
308
309 label->label = kzalloc(NFS4_MAXLABELLEN, flags);
310 if (label->label == NULL) {
311 kfree(label);
312 return ERR_PTR(-ENOMEM);
313 }
314 label->len = NFS4_MAXLABELLEN;
315
316 return label;
317}
318EXPORT_SYMBOL_GPL(nfs4_label_alloc);
319#else
320void inline nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr,
321 struct nfs4_label *label)
322{
323}
324#endif
325EXPORT_SYMBOL_GPL(nfs_setsecurity);
326
260/* 327/*
261 * This is our front-end to iget that looks up inodes by file handle 328 * This is our front-end to iget that looks up inodes by file handle
262 * instead of inode number. 329 * instead of inode number.
263 */ 330 */
264struct inode * 331struct inode *
265nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) 332nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, struct nfs4_label *label)
266{ 333{
267 struct nfs_find_desc desc = { 334 struct nfs_find_desc desc = {
268 .fh = fh, 335 .fh = fh,
@@ -384,6 +451,9 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
384 */ 451 */
385 inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); 452 inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
386 } 453 }
454
455 nfs_setsecurity(inode, fattr, label);
456
387 nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); 457 nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
388 nfsi->attrtimeo_timestamp = now; 458 nfsi->attrtimeo_timestamp = now;
389 nfsi->access_cache = RB_ROOT; 459 nfsi->access_cache = RB_ROOT;
@@ -393,6 +463,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
393 unlock_new_inode(inode); 463 unlock_new_inode(inode);
394 } else 464 } else
395 nfs_refresh_inode(inode, fattr); 465 nfs_refresh_inode(inode, fattr);
466 nfs_setsecurity(inode, fattr, label);
396 dprintk("NFS: nfs_fhget(%s/%Ld fh_crc=0x%08x ct=%d)\n", 467 dprintk("NFS: nfs_fhget(%s/%Ld fh_crc=0x%08x ct=%d)\n",
397 inode->i_sb->s_id, 468 inode->i_sb->s_id,
398 (long long)NFS_FILEID(inode), 469 (long long)NFS_FILEID(inode),
@@ -449,7 +520,7 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
449 NFS_PROTO(inode)->return_delegation(inode); 520 NFS_PROTO(inode)->return_delegation(inode);
450 error = NFS_PROTO(inode)->setattr(dentry, fattr, attr); 521 error = NFS_PROTO(inode)->setattr(dentry, fattr, attr);
451 if (error == 0) 522 if (error == 0)
452 nfs_refresh_inode(inode, fattr); 523 error = nfs_refresh_inode(inode, fattr);
453 nfs_free_fattr(fattr); 524 nfs_free_fattr(fattr);
454out: 525out:
455 return error; 526 return error;
@@ -713,16 +784,23 @@ EXPORT_SYMBOL_GPL(put_nfs_open_context);
713 * Ensure that mmap has a recent RPC credential for use when writing out 784 * Ensure that mmap has a recent RPC credential for use when writing out
714 * shared pages 785 * shared pages
715 */ 786 */
716void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx) 787void nfs_inode_attach_open_context(struct nfs_open_context *ctx)
717{ 788{
718 struct inode *inode = file_inode(filp); 789 struct inode *inode = ctx->dentry->d_inode;
719 struct nfs_inode *nfsi = NFS_I(inode); 790 struct nfs_inode *nfsi = NFS_I(inode);
720 791
721 filp->private_data = get_nfs_open_context(ctx);
722 spin_lock(&inode->i_lock); 792 spin_lock(&inode->i_lock);
723 list_add(&ctx->list, &nfsi->open_files); 793 list_add(&ctx->list, &nfsi->open_files);
724 spin_unlock(&inode->i_lock); 794 spin_unlock(&inode->i_lock);
725} 795}
796EXPORT_SYMBOL_GPL(nfs_inode_attach_open_context);
797
798void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx)
799{
800 filp->private_data = get_nfs_open_context(ctx);
801 if (list_empty(&ctx->list))
802 nfs_inode_attach_open_context(ctx);
803}
726EXPORT_SYMBOL_GPL(nfs_file_set_open_context); 804EXPORT_SYMBOL_GPL(nfs_file_set_open_context);
727 805
728/* 806/*
@@ -748,10 +826,11 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_c
748 826
749static void nfs_file_clear_open_context(struct file *filp) 827static void nfs_file_clear_open_context(struct file *filp)
750{ 828{
751 struct inode *inode = file_inode(filp);
752 struct nfs_open_context *ctx = nfs_file_open_context(filp); 829 struct nfs_open_context *ctx = nfs_file_open_context(filp);
753 830
754 if (ctx) { 831 if (ctx) {
832 struct inode *inode = ctx->dentry->d_inode;
833
755 filp->private_data = NULL; 834 filp->private_data = NULL;
756 spin_lock(&inode->i_lock); 835 spin_lock(&inode->i_lock);
757 list_move_tail(&ctx->list, &NFS_I(inode)->open_files); 836 list_move_tail(&ctx->list, &NFS_I(inode)->open_files);
@@ -790,6 +869,7 @@ int
790__nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) 869__nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
791{ 870{
792 int status = -ESTALE; 871 int status = -ESTALE;
872 struct nfs4_label *label = NULL;
793 struct nfs_fattr *fattr = NULL; 873 struct nfs_fattr *fattr = NULL;
794 struct nfs_inode *nfsi = NFS_I(inode); 874 struct nfs_inode *nfsi = NFS_I(inode);
795 875
@@ -807,7 +887,14 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
807 goto out; 887 goto out;
808 888
809 nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); 889 nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
810 status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr); 890
891 label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL);
892 if (IS_ERR(label)) {
893 status = PTR_ERR(label);
894 goto out;
895 }
896
897 status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr, label);
811 if (status != 0) { 898 if (status != 0) {
812 dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n", 899 dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n",
813 inode->i_sb->s_id, 900 inode->i_sb->s_id,
@@ -817,7 +904,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
817 if (!S_ISDIR(inode->i_mode)) 904 if (!S_ISDIR(inode->i_mode))
818 set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); 905 set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
819 } 906 }
820 goto out; 907 goto err_out;
821 } 908 }
822 909
823 status = nfs_refresh_inode(inode, fattr); 910 status = nfs_refresh_inode(inode, fattr);
@@ -825,7 +912,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
825 dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n", 912 dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n",
826 inode->i_sb->s_id, 913 inode->i_sb->s_id,
827 (long long)NFS_FILEID(inode), status); 914 (long long)NFS_FILEID(inode), status);
828 goto out; 915 goto err_out;
829 } 916 }
830 917
831 if (nfsi->cache_validity & NFS_INO_INVALID_ACL) 918 if (nfsi->cache_validity & NFS_INO_INVALID_ACL)
@@ -835,7 +922,9 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
835 inode->i_sb->s_id, 922 inode->i_sb->s_id,
836 (long long)NFS_FILEID(inode)); 923 (long long)NFS_FILEID(inode));
837 924
838 out: 925err_out:
926 nfs4_label_free(label);
927out:
839 nfs_free_fattr(fattr); 928 nfs_free_fattr(fattr);
840 return status; 929 return status;
841} 930}
@@ -847,7 +936,7 @@ int nfs_attribute_timeout(struct inode *inode)
847 return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo); 936 return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo);
848} 937}
849 938
850static int nfs_attribute_cache_expired(struct inode *inode) 939int nfs_attribute_cache_expired(struct inode *inode)
851{ 940{
852 if (nfs_have_delegated_attributes(inode)) 941 if (nfs_have_delegated_attributes(inode))
853 return 0; 942 return 0;
@@ -863,7 +952,8 @@ static int nfs_attribute_cache_expired(struct inode *inode)
863 */ 952 */
864int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) 953int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
865{ 954{
866 if (!(NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATTR) 955 if (!(NFS_I(inode)->cache_validity &
956 (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL))
867 && !nfs_attribute_cache_expired(inode)) 957 && !nfs_attribute_cache_expired(inode))
868 return NFS_STALE(inode) ? -ESTALE : 0; 958 return NFS_STALE(inode) ? -ESTALE : 0;
869 return __nfs_revalidate_inode(server, inode); 959 return __nfs_revalidate_inode(server, inode);
@@ -1243,6 +1333,7 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1243 spin_lock(&inode->i_lock); 1333 spin_lock(&inode->i_lock);
1244 status = nfs_post_op_update_inode_locked(inode, fattr); 1334 status = nfs_post_op_update_inode_locked(inode, fattr);
1245 spin_unlock(&inode->i_lock); 1335 spin_unlock(&inode->i_lock);
1336
1246 return status; 1337 return status;
1247} 1338}
1248EXPORT_SYMBOL_GPL(nfs_post_op_update_inode); 1339EXPORT_SYMBOL_GPL(nfs_post_op_update_inode);
@@ -1483,7 +1574,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1483 inode->i_blocks = fattr->du.nfs2.blocks; 1574 inode->i_blocks = fattr->du.nfs2.blocks;
1484 1575
1485 /* Update attrtimeo value if we're out of the unstable period */ 1576 /* Update attrtimeo value if we're out of the unstable period */
1486 if (invalid & NFS_INO_INVALID_ATTR) { 1577 if (invalid & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL)) {
1487 nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE); 1578 nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE);
1488 nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); 1579 nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
1489 nfsi->attrtimeo_timestamp = now; 1580 nfsi->attrtimeo_timestamp = now;
@@ -1496,6 +1587,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1496 } 1587 }
1497 } 1588 }
1498 invalid &= ~NFS_INO_INVALID_ATTR; 1589 invalid &= ~NFS_INO_INVALID_ATTR;
1590 invalid &= ~NFS_INO_INVALID_LABEL;
1499 /* Don't invalidate the data if we were to blame */ 1591 /* Don't invalidate the data if we were to blame */
1500 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) 1592 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
1501 || S_ISLNK(inode->i_mode))) 1593 || S_ISLNK(inode->i_mode)))
@@ -1638,12 +1730,11 @@ EXPORT_SYMBOL_GPL(nfs_net_id);
1638static int nfs_net_init(struct net *net) 1730static int nfs_net_init(struct net *net)
1639{ 1731{
1640 nfs_clients_init(net); 1732 nfs_clients_init(net);
1641 return nfs_dns_resolver_cache_init(net); 1733 return 0;
1642} 1734}
1643 1735
1644static void nfs_net_exit(struct net *net) 1736static void nfs_net_exit(struct net *net)
1645{ 1737{
1646 nfs_dns_resolver_cache_destroy(net);
1647 nfs_cleanup_cb_ident_idr(net); 1738 nfs_cleanup_cb_ident_idr(net);
1648} 1739}
1649 1740
@@ -1661,10 +1752,6 @@ static int __init init_nfs_fs(void)
1661{ 1752{
1662 int err; 1753 int err;
1663 1754
1664 err = nfs_dns_resolver_init();
1665 if (err < 0)
1666 goto out10;;
1667
1668 err = register_pernet_subsys(&nfs_net_ops); 1755 err = register_pernet_subsys(&nfs_net_ops);
1669 if (err < 0) 1756 if (err < 0)
1670 goto out9; 1757 goto out9;
@@ -1730,8 +1817,6 @@ out7:
1730out8: 1817out8:
1731 unregister_pernet_subsys(&nfs_net_ops); 1818 unregister_pernet_subsys(&nfs_net_ops);
1732out9: 1819out9:
1733 nfs_dns_resolver_destroy();
1734out10:
1735 return err; 1820 return err;
1736} 1821}
1737 1822
@@ -1744,7 +1829,6 @@ static void __exit exit_nfs_fs(void)
1744 nfs_destroy_nfspagecache(); 1829 nfs_destroy_nfspagecache();
1745 nfs_fscache_unregister(); 1830 nfs_fscache_unregister();
1746 unregister_pernet_subsys(&nfs_net_ops); 1831 unregister_pernet_subsys(&nfs_net_ops);
1747 nfs_dns_resolver_destroy();
1748#ifdef CONFIG_PROC_FS 1832#ifdef CONFIG_PROC_FS
1749 rpc_proc_unregister(&init_net, "nfs"); 1833 rpc_proc_unregister(&init_net, "nfs");
1750#endif 1834#endif
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 91e59a39fc08..3c8373f90ab3 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -165,7 +165,7 @@ extern void nfs_free_client(struct nfs_client *);
165extern struct nfs_client *nfs4_find_client_ident(struct net *, int); 165extern struct nfs_client *nfs4_find_client_ident(struct net *, int);
166extern struct nfs_client * 166extern struct nfs_client *
167nfs4_find_client_sessionid(struct net *, const struct sockaddr *, 167nfs4_find_client_sessionid(struct net *, const struct sockaddr *,
168 struct nfs4_sessionid *); 168 struct nfs4_sessionid *, u32);
169extern struct nfs_server *nfs_create_server(struct nfs_mount_info *, 169extern struct nfs_server *nfs_create_server(struct nfs_mount_info *,
170 struct nfs_subversion *); 170 struct nfs_subversion *);
171extern struct nfs_server *nfs4_create_server( 171extern struct nfs_server *nfs4_create_server(
@@ -255,6 +255,7 @@ extern int nfs4_decode_dirent(struct xdr_stream *,
255#ifdef CONFIG_NFS_V4_1 255#ifdef CONFIG_NFS_V4_1
256extern const u32 nfs41_maxread_overhead; 256extern const u32 nfs41_maxread_overhead;
257extern const u32 nfs41_maxwrite_overhead; 257extern const u32 nfs41_maxwrite_overhead;
258extern const u32 nfs41_maxgetdevinfo_overhead;
258#endif 259#endif
259 260
260/* nfs4proc.c */ 261/* nfs4proc.c */
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 91a6faf811ac..99a45283b9ee 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -139,7 +139,10 @@ struct mnt_fhstatus {
139 * nfs_mount - Obtain an NFS file handle for the given host and path 139 * nfs_mount - Obtain an NFS file handle for the given host and path
140 * @info: pointer to mount request arguments 140 * @info: pointer to mount request arguments
141 * 141 *
142 * Uses default timeout parameters specified by underlying transport. 142 * Uses default timeout parameters specified by underlying transport. On
143 * successful return, the auth_flavs list and auth_flav_len will be populated
144 * with the list from the server or a faked-up list if the server didn't
145 * provide one.
143 */ 146 */
144int nfs_mount(struct nfs_mount_request *info) 147int nfs_mount(struct nfs_mount_request *info)
145{ 148{
@@ -195,6 +198,15 @@ int nfs_mount(struct nfs_mount_request *info)
195 dprintk("NFS: MNT request succeeded\n"); 198 dprintk("NFS: MNT request succeeded\n");
196 status = 0; 199 status = 0;
197 200
201 /*
202 * If the server didn't provide a flavor list, allow the
203 * client to try any flavor.
204 */
205 if (info->version != NFS_MNT3_VERSION || *info->auth_flav_len == 0) {
206 dprintk("NFS: Faking up auth_flavs list\n");
207 info->auth_flavs[0] = RPC_AUTH_NULL;
208 *info->auth_flav_len = 1;
209 }
198out: 210out:
199 return status; 211 return status;
200 212
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index fc8dc20fdeb9..348b535cd786 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -280,7 +280,7 @@ struct vfsmount *nfs_submount(struct nfs_server *server, struct dentry *dentry,
280 struct dentry *parent = dget_parent(dentry); 280 struct dentry *parent = dget_parent(dentry);
281 281
282 /* Look it up again to get its attributes */ 282 /* Look it up again to get its attributes */
283 err = server->nfs_client->rpc_ops->lookup(parent->d_inode, &dentry->d_name, fh, fattr); 283 err = server->nfs_client->rpc_ops->lookup(parent->d_inode, &dentry->d_name, fh, fattr, NULL);
284 dput(parent); 284 dput(parent);
285 if (err != 0) 285 if (err != 0)
286 return ERR_PTR(err); 286 return ERR_PTR(err);
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 43ea96ced28c..f5c84c3efbca 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -33,7 +33,7 @@ nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
33 res = rpc_call_sync(clnt, msg, flags); 33 res = rpc_call_sync(clnt, msg, flags);
34 if (res != -EJUKEBOX) 34 if (res != -EJUKEBOX)
35 break; 35 break;
36 freezable_schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME); 36 freezable_schedule_timeout_killable_unsafe(NFS_JUKEBOX_RETRY_TIME);
37 res = -ERESTARTSYS; 37 res = -ERESTARTSYS;
38 } while (!fatal_signal_pending(current)); 38 } while (!fatal_signal_pending(current));
39 return res; 39 return res;
@@ -98,7 +98,7 @@ nfs3_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
98 */ 98 */
99static int 99static int
100nfs3_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, 100nfs3_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
101 struct nfs_fattr *fattr) 101 struct nfs_fattr *fattr, struct nfs4_label *label)
102{ 102{
103 struct rpc_message msg = { 103 struct rpc_message msg = {
104 .rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR], 104 .rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR],
@@ -143,7 +143,8 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
143 143
144static int 144static int
145nfs3_proc_lookup(struct inode *dir, struct qstr *name, 145nfs3_proc_lookup(struct inode *dir, struct qstr *name,
146 struct nfs_fh *fhandle, struct nfs_fattr *fattr) 146 struct nfs_fh *fhandle, struct nfs_fattr *fattr,
147 struct nfs4_label *label)
147{ 148{
148 struct nfs3_diropargs arg = { 149 struct nfs3_diropargs arg = {
149 .fh = NFS_FH(dir), 150 .fh = NFS_FH(dir),
@@ -300,7 +301,7 @@ static int nfs3_do_create(struct inode *dir, struct dentry *dentry, struct nfs3_
300 status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0); 301 status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0);
301 nfs_post_op_update_inode(dir, data->res.dir_attr); 302 nfs_post_op_update_inode(dir, data->res.dir_attr);
302 if (status == 0) 303 if (status == 0)
303 status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); 304 status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);
304 return status; 305 return status;
305} 306}
306 307
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index a1dd768d0a35..ee81e354bce7 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -194,7 +194,7 @@ struct nfs4_state_recovery_ops {
194 int (*recover_lock)(struct nfs4_state *, struct file_lock *); 194 int (*recover_lock)(struct nfs4_state *, struct file_lock *);
195 int (*establish_clid)(struct nfs_client *, struct rpc_cred *); 195 int (*establish_clid)(struct nfs_client *, struct rpc_cred *);
196 struct rpc_cred * (*get_clid_cred)(struct nfs_client *); 196 struct rpc_cred * (*get_clid_cred)(struct nfs_client *);
197 int (*reclaim_complete)(struct nfs_client *); 197 int (*reclaim_complete)(struct nfs_client *, struct rpc_cred *);
198 int (*detect_trunking)(struct nfs_client *, struct nfs_client **, 198 int (*detect_trunking)(struct nfs_client *, struct nfs_client **,
199 struct rpc_cred *); 199 struct rpc_cred *);
200}; 200};
@@ -303,10 +303,10 @@ is_ds_client(struct nfs_client *clp)
303extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[]; 303extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[];
304 304
305extern const u32 nfs4_fattr_bitmap[3]; 305extern const u32 nfs4_fattr_bitmap[3];
306extern const u32 nfs4_statfs_bitmap[2]; 306extern const u32 nfs4_statfs_bitmap[3];
307extern const u32 nfs4_pathconf_bitmap[2]; 307extern const u32 nfs4_pathconf_bitmap[3];
308extern const u32 nfs4_fsinfo_bitmap[3]; 308extern const u32 nfs4_fsinfo_bitmap[3];
309extern const u32 nfs4_fs_locations_bitmap[2]; 309extern const u32 nfs4_fs_locations_bitmap[3];
310 310
311void nfs4_free_client(struct nfs_client *); 311void nfs4_free_client(struct nfs_client *);
312 312
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 4cbad5d6b276..90dce91dd5b5 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -66,6 +66,11 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init)
66 if (err) 66 if (err)
67 goto error; 67 goto error;
68 68
69 if (cl_init->minorversion > NFS4_MAX_MINOR_VERSION) {
70 err = -EINVAL;
71 goto error;
72 }
73
69 spin_lock_init(&clp->cl_lock); 74 spin_lock_init(&clp->cl_lock);
70 INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state); 75 INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state);
71 rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client"); 76 rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client");
@@ -562,14 +567,14 @@ static bool nfs4_cb_match_client(const struct sockaddr *addr,
562 */ 567 */
563struct nfs_client * 568struct nfs_client *
564nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr, 569nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr,
565 struct nfs4_sessionid *sid) 570 struct nfs4_sessionid *sid, u32 minorversion)
566{ 571{
567 struct nfs_client *clp; 572 struct nfs_client *clp;
568 struct nfs_net *nn = net_generic(net, nfs_net_id); 573 struct nfs_net *nn = net_generic(net, nfs_net_id);
569 574
570 spin_lock(&nn->nfs_client_lock); 575 spin_lock(&nn->nfs_client_lock);
571 list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) { 576 list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) {
572 if (nfs4_cb_match_client(addr, clp, 1) == false) 577 if (nfs4_cb_match_client(addr, clp, minorversion) == false)
573 continue; 578 continue;
574 579
575 if (!nfs4_has_session(clp)) 580 if (!nfs4_has_session(clp))
@@ -592,7 +597,7 @@ nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr,
592 597
593struct nfs_client * 598struct nfs_client *
594nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr, 599nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr,
595 struct nfs4_sessionid *sid) 600 struct nfs4_sessionid *sid, u32 minorversion)
596{ 601{
597 return NULL; 602 return NULL;
598} 603}
@@ -626,6 +631,8 @@ static int nfs4_set_client(struct nfs_server *server,
626 631
627 if (server->flags & NFS_MOUNT_NORESVPORT) 632 if (server->flags & NFS_MOUNT_NORESVPORT)
628 set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); 633 set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
634 if (server->options & NFS_OPTION_MIGRATION)
635 set_bit(NFS_CS_MIGRATION, &cl_init.init_flags);
629 636
630 /* Allocate or find a client reference we can use */ 637 /* Allocate or find a client reference we can use */
631 clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour); 638 clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour);
@@ -730,7 +737,7 @@ static int nfs4_server_common_setup(struct nfs_server *server,
730 return -ENOMEM; 737 return -ENOMEM;
731 738
732 /* We must ensure the session is initialised first */ 739 /* We must ensure the session is initialised first */
733 error = nfs4_init_session(server); 740 error = nfs4_init_session(server->nfs_client);
734 if (error < 0) 741 if (error < 0)
735 goto out; 742 goto out;
736 743
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 13e6bb3e3fe5..e5b804dd944c 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -69,7 +69,6 @@ nfs4_file_open(struct inode *inode, struct file *filp)
69 goto out_drop; 69 goto out_drop;
70 } 70 }
71 } 71 }
72 iput(inode);
73 if (inode != dentry->d_inode) 72 if (inode != dentry->d_inode)
74 goto out_drop; 73 goto out_drop;
75 74
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 22d10623f5ee..17ed87ef9de8 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -643,7 +643,8 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
643 d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->pnfs_curr_ld, 643 d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->pnfs_curr_ld,
644 NFS_SERVER(lo->plh_inode)->nfs_client, id); 644 NFS_SERVER(lo->plh_inode)->nfs_client, id);
645 if (d == NULL) { 645 if (d == NULL) {
646 dsaddr = filelayout_get_device_info(lo->plh_inode, id, gfp_flags); 646 dsaddr = filelayout_get_device_info(lo->plh_inode, id,
647 lo->plh_lc_cred, gfp_flags);
647 if (dsaddr == NULL) 648 if (dsaddr == NULL)
648 goto out; 649 goto out;
649 } else 650 } else
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index 235ff952d3c8..cebd20e7e923 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -150,6 +150,7 @@ struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
150extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); 150extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
151extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); 151extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
152struct nfs4_file_layout_dsaddr * 152struct nfs4_file_layout_dsaddr *
153filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags); 153filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id,
154 struct rpc_cred *cred, gfp_t gfp_flags);
154 155
155#endif /* FS_NFS_NFS4FILELAYOUT_H */ 156#endif /* FS_NFS_NFS4FILELAYOUT_H */
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index 661a0f611215..95604f64cab8 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -668,7 +668,10 @@ decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_fl
668 * of available devices, and return it. 668 * of available devices, and return it.
669 */ 669 */
670struct nfs4_file_layout_dsaddr * 670struct nfs4_file_layout_dsaddr *
671filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags) 671filelayout_get_device_info(struct inode *inode,
672 struct nfs4_deviceid *dev_id,
673 struct rpc_cred *cred,
674 gfp_t gfp_flags)
672{ 675{
673 struct pnfs_device *pdev = NULL; 676 struct pnfs_device *pdev = NULL;
674 u32 max_resp_sz; 677 u32 max_resp_sz;
@@ -708,8 +711,9 @@ filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gf
708 pdev->pgbase = 0; 711 pdev->pgbase = 0;
709 pdev->pglen = max_resp_sz; 712 pdev->pglen = max_resp_sz;
710 pdev->mincount = 0; 713 pdev->mincount = 0;
714 pdev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
711 715
712 rc = nfs4_proc_getdeviceinfo(server, pdev); 716 rc = nfs4_proc_getdeviceinfo(server, pdev, cred);
713 dprintk("%s getdevice info returns %d\n", __func__, rc); 717 dprintk("%s getdevice info returns %d\n", __func__, rc);
714 if (rc) 718 if (rc)
715 goto out_free; 719 goto out_free;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d7ba5616989c..cf11799297c4 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -77,15 +77,68 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
77static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); 77static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
78static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); 78static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *);
79static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr); 79static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr);
80static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *); 80static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, struct nfs4_label *label);
81static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr); 81static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label);
82static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, 82static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
83 struct nfs_fattr *fattr, struct iattr *sattr, 83 struct nfs_fattr *fattr, struct iattr *sattr,
84 struct nfs4_state *state); 84 struct nfs4_state *state, struct nfs4_label *ilabel,
85 struct nfs4_label *olabel);
85#ifdef CONFIG_NFS_V4_1 86#ifdef CONFIG_NFS_V4_1
86static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *); 87static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *,
87static int nfs41_free_stateid(struct nfs_server *, nfs4_stateid *); 88 struct rpc_cred *);
89static int nfs41_free_stateid(struct nfs_server *, nfs4_stateid *,
90 struct rpc_cred *);
88#endif 91#endif
92
93#ifdef CONFIG_NFS_V4_SECURITY_LABEL
94static inline struct nfs4_label *
95nfs4_label_init_security(struct inode *dir, struct dentry *dentry,
96 struct iattr *sattr, struct nfs4_label *label)
97{
98 int err;
99
100 if (label == NULL)
101 return NULL;
102
103 if (nfs_server_capable(dir, NFS_CAP_SECURITY_LABEL) == 0)
104 return NULL;
105
106 if (NFS_SERVER(dir)->nfs_client->cl_minorversion < 2)
107 return NULL;
108
109 err = security_dentry_init_security(dentry, sattr->ia_mode,
110 &dentry->d_name, (void **)&label->label, &label->len);
111 if (err == 0)
112 return label;
113
114 return NULL;
115}
116static inline void
117nfs4_label_release_security(struct nfs4_label *label)
118{
119 if (label)
120 security_release_secctx(label->label, label->len);
121}
122static inline u32 *nfs4_bitmask(struct nfs_server *server, struct nfs4_label *label)
123{
124 if (label)
125 return server->attr_bitmask;
126
127 return server->attr_bitmask_nl;
128}
129#else
130static inline struct nfs4_label *
131nfs4_label_init_security(struct inode *dir, struct dentry *dentry,
132 struct iattr *sattr, struct nfs4_label *l)
133{ return NULL; }
134static inline void
135nfs4_label_release_security(struct nfs4_label *label)
136{ return; }
137static inline u32 *
138nfs4_bitmask(struct nfs_server *server, struct nfs4_label *label)
139{ return server->attr_bitmask; }
140#endif
141
89/* Prevent leaks of NFSv4 errors into userland */ 142/* Prevent leaks of NFSv4 errors into userland */
90static int nfs4_map_errors(int err) 143static int nfs4_map_errors(int err)
91{ 144{
@@ -134,7 +187,10 @@ const u32 nfs4_fattr_bitmap[3] = {
134 | FATTR4_WORD1_SPACE_USED 187 | FATTR4_WORD1_SPACE_USED
135 | FATTR4_WORD1_TIME_ACCESS 188 | FATTR4_WORD1_TIME_ACCESS
136 | FATTR4_WORD1_TIME_METADATA 189 | FATTR4_WORD1_TIME_METADATA
137 | FATTR4_WORD1_TIME_MODIFY 190 | FATTR4_WORD1_TIME_MODIFY,
191#ifdef CONFIG_NFS_V4_SECURITY_LABEL
192 FATTR4_WORD2_SECURITY_LABEL
193#endif
138}; 194};
139 195
140static const u32 nfs4_pnfs_open_bitmap[3] = { 196static const u32 nfs4_pnfs_open_bitmap[3] = {
@@ -161,7 +217,7 @@ static const u32 nfs4_open_noattr_bitmap[3] = {
161 | FATTR4_WORD0_FILEID, 217 | FATTR4_WORD0_FILEID,
162}; 218};
163 219
164const u32 nfs4_statfs_bitmap[2] = { 220const u32 nfs4_statfs_bitmap[3] = {
165 FATTR4_WORD0_FILES_AVAIL 221 FATTR4_WORD0_FILES_AVAIL
166 | FATTR4_WORD0_FILES_FREE 222 | FATTR4_WORD0_FILES_FREE
167 | FATTR4_WORD0_FILES_TOTAL, 223 | FATTR4_WORD0_FILES_TOTAL,
@@ -170,7 +226,7 @@ const u32 nfs4_statfs_bitmap[2] = {
170 | FATTR4_WORD1_SPACE_TOTAL 226 | FATTR4_WORD1_SPACE_TOTAL
171}; 227};
172 228
173const u32 nfs4_pathconf_bitmap[2] = { 229const u32 nfs4_pathconf_bitmap[3] = {
174 FATTR4_WORD0_MAXLINK 230 FATTR4_WORD0_MAXLINK
175 | FATTR4_WORD0_MAXNAME, 231 | FATTR4_WORD0_MAXNAME,
176 0 232 0
@@ -185,7 +241,7 @@ const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE
185 FATTR4_WORD2_LAYOUT_BLKSIZE 241 FATTR4_WORD2_LAYOUT_BLKSIZE
186}; 242};
187 243
188const u32 nfs4_fs_locations_bitmap[2] = { 244const u32 nfs4_fs_locations_bitmap[3] = {
189 FATTR4_WORD0_TYPE 245 FATTR4_WORD0_TYPE
190 | FATTR4_WORD0_CHANGE 246 | FATTR4_WORD0_CHANGE
191 | FATTR4_WORD0_SIZE 247 | FATTR4_WORD0_SIZE
@@ -201,7 +257,7 @@ const u32 nfs4_fs_locations_bitmap[2] = {
201 | FATTR4_WORD1_TIME_ACCESS 257 | FATTR4_WORD1_TIME_ACCESS
202 | FATTR4_WORD1_TIME_METADATA 258 | FATTR4_WORD1_TIME_METADATA
203 | FATTR4_WORD1_TIME_MODIFY 259 | FATTR4_WORD1_TIME_MODIFY
204 | FATTR4_WORD1_MOUNTED_ON_FILEID 260 | FATTR4_WORD1_MOUNTED_ON_FILEID,
205}; 261};
206 262
207static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dentry, 263static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dentry,
@@ -268,7 +324,7 @@ static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
268 *timeout = NFS4_POLL_RETRY_MIN; 324 *timeout = NFS4_POLL_RETRY_MIN;
269 if (*timeout > NFS4_POLL_RETRY_MAX) 325 if (*timeout > NFS4_POLL_RETRY_MAX)
270 *timeout = NFS4_POLL_RETRY_MAX; 326 *timeout = NFS4_POLL_RETRY_MAX;
271 freezable_schedule_timeout_killable(*timeout); 327 freezable_schedule_timeout_killable_unsafe(*timeout);
272 if (fatal_signal_pending(current)) 328 if (fatal_signal_pending(current))
273 res = -ERESTARTSYS; 329 res = -ERESTARTSYS;
274 *timeout <<= 1; 330 *timeout <<= 1;
@@ -762,6 +818,7 @@ struct nfs4_opendata {
762 struct nfs4_string owner_name; 818 struct nfs4_string owner_name;
763 struct nfs4_string group_name; 819 struct nfs4_string group_name;
764 struct nfs_fattr f_attr; 820 struct nfs_fattr f_attr;
821 struct nfs4_label *f_label;
765 struct dentry *dir; 822 struct dentry *dir;
766 struct dentry *dentry; 823 struct dentry *dentry;
767 struct nfs4_state_owner *owner; 824 struct nfs4_state_owner *owner;
@@ -807,6 +864,7 @@ nfs4_map_atomic_open_claim(struct nfs_server *server,
807static void nfs4_init_opendata_res(struct nfs4_opendata *p) 864static void nfs4_init_opendata_res(struct nfs4_opendata *p)
808{ 865{
809 p->o_res.f_attr = &p->f_attr; 866 p->o_res.f_attr = &p->f_attr;
867 p->o_res.f_label = p->f_label;
810 p->o_res.seqid = p->o_arg.seqid; 868 p->o_res.seqid = p->o_arg.seqid;
811 p->c_res.seqid = p->c_arg.seqid; 869 p->c_res.seqid = p->c_arg.seqid;
812 p->o_res.server = p->o_arg.server; 870 p->o_res.server = p->o_arg.server;
@@ -818,6 +876,7 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p)
818static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, 876static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
819 struct nfs4_state_owner *sp, fmode_t fmode, int flags, 877 struct nfs4_state_owner *sp, fmode_t fmode, int flags,
820 const struct iattr *attrs, 878 const struct iattr *attrs,
879 struct nfs4_label *label,
821 enum open_claim_type4 claim, 880 enum open_claim_type4 claim,
822 gfp_t gfp_mask) 881 gfp_t gfp_mask)
823{ 882{
@@ -829,9 +888,14 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
829 p = kzalloc(sizeof(*p), gfp_mask); 888 p = kzalloc(sizeof(*p), gfp_mask);
830 if (p == NULL) 889 if (p == NULL)
831 goto err; 890 goto err;
891
892 p->f_label = nfs4_label_alloc(server, gfp_mask);
893 if (IS_ERR(p->f_label))
894 goto err_free_p;
895
832 p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid, gfp_mask); 896 p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid, gfp_mask);
833 if (p->o_arg.seqid == NULL) 897 if (p->o_arg.seqid == NULL)
834 goto err_free; 898 goto err_free_label;
835 nfs_sb_active(dentry->d_sb); 899 nfs_sb_active(dentry->d_sb);
836 p->dentry = dget(dentry); 900 p->dentry = dget(dentry);
837 p->dir = parent; 901 p->dir = parent;
@@ -852,8 +916,9 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
852 p->o_arg.id.uniquifier = sp->so_seqid.owner_id; 916 p->o_arg.id.uniquifier = sp->so_seqid.owner_id;
853 p->o_arg.name = &dentry->d_name; 917 p->o_arg.name = &dentry->d_name;
854 p->o_arg.server = server; 918 p->o_arg.server = server;
855 p->o_arg.bitmask = server->attr_bitmask; 919 p->o_arg.bitmask = nfs4_bitmask(server, label);
856 p->o_arg.open_bitmap = &nfs4_fattr_bitmap[0]; 920 p->o_arg.open_bitmap = &nfs4_fattr_bitmap[0];
921 p->o_arg.label = label;
857 p->o_arg.claim = nfs4_map_atomic_open_claim(server, claim); 922 p->o_arg.claim = nfs4_map_atomic_open_claim(server, claim);
858 switch (p->o_arg.claim) { 923 switch (p->o_arg.claim) {
859 case NFS4_OPEN_CLAIM_NULL: 924 case NFS4_OPEN_CLAIM_NULL:
@@ -884,7 +949,10 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
884 nfs4_init_opendata_res(p); 949 nfs4_init_opendata_res(p);
885 kref_init(&p->kref); 950 kref_init(&p->kref);
886 return p; 951 return p;
887err_free: 952
953err_free_label:
954 nfs4_label_free(p->f_label);
955err_free_p:
888 kfree(p); 956 kfree(p);
889err: 957err:
890 dput(parent); 958 dput(parent);
@@ -901,6 +969,9 @@ static void nfs4_opendata_free(struct kref *kref)
901 if (p->state != NULL) 969 if (p->state != NULL)
902 nfs4_put_open_state(p->state); 970 nfs4_put_open_state(p->state);
903 nfs4_put_state_owner(p->owner); 971 nfs4_put_state_owner(p->owner);
972
973 nfs4_label_free(p->f_label);
974
904 dput(p->dir); 975 dput(p->dir);
905 dput(p->dentry); 976 dput(p->dentry);
906 nfs_sb_deactive(sb); 977 nfs_sb_deactive(sb);
@@ -1179,6 +1250,8 @@ _nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data)
1179 if (ret) 1250 if (ret)
1180 goto err; 1251 goto err;
1181 1252
1253 nfs_setsecurity(inode, &data->f_attr, data->f_label);
1254
1182 if (data->o_res.delegation_type != 0) 1255 if (data->o_res.delegation_type != 0)
1183 nfs4_opendata_check_deleg(data, state); 1256 nfs4_opendata_check_deleg(data, state);
1184 update_open_stateid(state, &data->o_res.stateid, NULL, 1257 update_open_stateid(state, &data->o_res.stateid, NULL,
@@ -1205,7 +1278,7 @@ _nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
1205 ret = -EAGAIN; 1278 ret = -EAGAIN;
1206 if (!(data->f_attr.valid & NFS_ATTR_FATTR)) 1279 if (!(data->f_attr.valid & NFS_ATTR_FATTR))
1207 goto err; 1280 goto err;
1208 inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh, &data->f_attr); 1281 inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh, &data->f_attr, data->f_label);
1209 ret = PTR_ERR(inode); 1282 ret = PTR_ERR(inode);
1210 if (IS_ERR(inode)) 1283 if (IS_ERR(inode))
1211 goto err; 1284 goto err;
@@ -1258,7 +1331,7 @@ static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context
1258 struct nfs4_opendata *opendata; 1331 struct nfs4_opendata *opendata;
1259 1332
1260 opendata = nfs4_opendata_alloc(ctx->dentry, state->owner, 0, 0, 1333 opendata = nfs4_opendata_alloc(ctx->dentry, state->owner, 0, 0,
1261 NULL, claim, GFP_NOFS); 1334 NULL, NULL, claim, GFP_NOFS);
1262 if (opendata == NULL) 1335 if (opendata == NULL)
1263 return ERR_PTR(-ENOMEM); 1336 return ERR_PTR(-ENOMEM);
1264 opendata->state = state; 1337 opendata->state = state;
@@ -1784,7 +1857,7 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
1784 return status; 1857 return status;
1785 } 1858 }
1786 if (!(o_res->f_attr->valid & NFS_ATTR_FATTR)) 1859 if (!(o_res->f_attr->valid & NFS_ATTR_FATTR))
1787 _nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr); 1860 _nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr, o_res->f_label);
1788 return 0; 1861 return 0;
1789} 1862}
1790 1863
@@ -1855,18 +1928,30 @@ static void nfs41_clear_delegation_stateid(struct nfs4_state *state)
1855{ 1928{
1856 struct nfs_server *server = NFS_SERVER(state->inode); 1929 struct nfs_server *server = NFS_SERVER(state->inode);
1857 nfs4_stateid *stateid = &state->stateid; 1930 nfs4_stateid *stateid = &state->stateid;
1858 int status; 1931 struct nfs_delegation *delegation;
1932 struct rpc_cred *cred = NULL;
1933 int status = -NFS4ERR_BAD_STATEID;
1859 1934
1860 /* If a state reset has been done, test_stateid is unneeded */ 1935 /* If a state reset has been done, test_stateid is unneeded */
1861 if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) 1936 if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
1862 return; 1937 return;
1863 1938
1864 status = nfs41_test_stateid(server, stateid); 1939 /* Get the delegation credential for use by test/free_stateid */
1940 rcu_read_lock();
1941 delegation = rcu_dereference(NFS_I(state->inode)->delegation);
1942 if (delegation != NULL &&
1943 nfs4_stateid_match(&delegation->stateid, stateid)) {
1944 cred = get_rpccred(delegation->cred);
1945 rcu_read_unlock();
1946 status = nfs41_test_stateid(server, stateid, cred);
1947 } else
1948 rcu_read_unlock();
1949
1865 if (status != NFS_OK) { 1950 if (status != NFS_OK) {
1866 /* Free the stateid unless the server explicitly 1951 /* Free the stateid unless the server explicitly
1867 * informs us the stateid is unrecognized. */ 1952 * informs us the stateid is unrecognized. */
1868 if (status != -NFS4ERR_BAD_STATEID) 1953 if (status != -NFS4ERR_BAD_STATEID)
1869 nfs41_free_stateid(server, stateid); 1954 nfs41_free_stateid(server, stateid, cred);
1870 nfs_remove_bad_delegation(state->inode); 1955 nfs_remove_bad_delegation(state->inode);
1871 1956
1872 write_seqlock(&state->seqlock); 1957 write_seqlock(&state->seqlock);
@@ -1874,6 +1959,9 @@ static void nfs41_clear_delegation_stateid(struct nfs4_state *state)
1874 write_sequnlock(&state->seqlock); 1959 write_sequnlock(&state->seqlock);
1875 clear_bit(NFS_DELEGATED_STATE, &state->flags); 1960 clear_bit(NFS_DELEGATED_STATE, &state->flags);
1876 } 1961 }
1962
1963 if (cred != NULL)
1964 put_rpccred(cred);
1877} 1965}
1878 1966
1879/** 1967/**
@@ -1888,6 +1976,7 @@ static int nfs41_check_open_stateid(struct nfs4_state *state)
1888{ 1976{
1889 struct nfs_server *server = NFS_SERVER(state->inode); 1977 struct nfs_server *server = NFS_SERVER(state->inode);
1890 nfs4_stateid *stateid = &state->open_stateid; 1978 nfs4_stateid *stateid = &state->open_stateid;
1979 struct rpc_cred *cred = state->owner->so_cred;
1891 int status; 1980 int status;
1892 1981
1893 /* If a state reset has been done, test_stateid is unneeded */ 1982 /* If a state reset has been done, test_stateid is unneeded */
@@ -1896,12 +1985,12 @@ static int nfs41_check_open_stateid(struct nfs4_state *state)
1896 (test_bit(NFS_O_RDWR_STATE, &state->flags) == 0)) 1985 (test_bit(NFS_O_RDWR_STATE, &state->flags) == 0))
1897 return -NFS4ERR_BAD_STATEID; 1986 return -NFS4ERR_BAD_STATEID;
1898 1987
1899 status = nfs41_test_stateid(server, stateid); 1988 status = nfs41_test_stateid(server, stateid, cred);
1900 if (status != NFS_OK) { 1989 if (status != NFS_OK) {
1901 /* Free the stateid unless the server explicitly 1990 /* Free the stateid unless the server explicitly
1902 * informs us the stateid is unrecognized. */ 1991 * informs us the stateid is unrecognized. */
1903 if (status != -NFS4ERR_BAD_STATEID) 1992 if (status != -NFS4ERR_BAD_STATEID)
1904 nfs41_free_stateid(server, stateid); 1993 nfs41_free_stateid(server, stateid, cred);
1905 1994
1906 clear_bit(NFS_O_RDONLY_STATE, &state->flags); 1995 clear_bit(NFS_O_RDONLY_STATE, &state->flags);
1907 clear_bit(NFS_O_WRONLY_STATE, &state->flags); 1996 clear_bit(NFS_O_WRONLY_STATE, &state->flags);
@@ -1942,10 +2031,11 @@ static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, struct
1942static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, 2031static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
1943 fmode_t fmode, 2032 fmode_t fmode,
1944 int flags, 2033 int flags,
1945 struct nfs4_state **res) 2034 struct nfs_open_context *ctx)
1946{ 2035{
1947 struct nfs4_state_owner *sp = opendata->owner; 2036 struct nfs4_state_owner *sp = opendata->owner;
1948 struct nfs_server *server = sp->so_server; 2037 struct nfs_server *server = sp->so_server;
2038 struct dentry *dentry;
1949 struct nfs4_state *state; 2039 struct nfs4_state *state;
1950 unsigned int seq; 2040 unsigned int seq;
1951 int ret; 2041 int ret;
@@ -1963,13 +2053,31 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
1963 if (server->caps & NFS_CAP_POSIX_LOCK) 2053 if (server->caps & NFS_CAP_POSIX_LOCK)
1964 set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); 2054 set_bit(NFS_STATE_POSIX_LOCKS, &state->flags);
1965 2055
2056 dentry = opendata->dentry;
2057 if (dentry->d_inode == NULL) {
2058 /* FIXME: Is this d_drop() ever needed? */
2059 d_drop(dentry);
2060 dentry = d_add_unique(dentry, igrab(state->inode));
2061 if (dentry == NULL) {
2062 dentry = opendata->dentry;
2063 } else if (dentry != ctx->dentry) {
2064 dput(ctx->dentry);
2065 ctx->dentry = dget(dentry);
2066 }
2067 nfs_set_verifier(dentry,
2068 nfs_save_change_attribute(opendata->dir->d_inode));
2069 }
2070
1966 ret = nfs4_opendata_access(sp->so_cred, opendata, state, fmode, flags); 2071 ret = nfs4_opendata_access(sp->so_cred, opendata, state, fmode, flags);
1967 if (ret != 0) 2072 if (ret != 0)
1968 goto out; 2073 goto out;
1969 2074
1970 if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) 2075 ctx->state = state;
1971 nfs4_schedule_stateid_recovery(server, state); 2076 if (dentry->d_inode == state->inode) {
1972 *res = state; 2077 nfs_inode_attach_open_context(ctx);
2078 if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
2079 nfs4_schedule_stateid_recovery(server, state);
2080 }
1973out: 2081out:
1974 return ret; 2082 return ret;
1975} 2083}
@@ -1978,19 +2086,21 @@ out:
1978 * Returns a referenced nfs4_state 2086 * Returns a referenced nfs4_state
1979 */ 2087 */
1980static int _nfs4_do_open(struct inode *dir, 2088static int _nfs4_do_open(struct inode *dir,
1981 struct dentry *dentry, 2089 struct nfs_open_context *ctx,
1982 fmode_t fmode,
1983 int flags, 2090 int flags,
1984 struct iattr *sattr, 2091 struct iattr *sattr,
1985 struct rpc_cred *cred, 2092 struct nfs4_label *label)
1986 struct nfs4_state **res,
1987 struct nfs4_threshold **ctx_th)
1988{ 2093{
1989 struct nfs4_state_owner *sp; 2094 struct nfs4_state_owner *sp;
1990 struct nfs4_state *state = NULL; 2095 struct nfs4_state *state = NULL;
1991 struct nfs_server *server = NFS_SERVER(dir); 2096 struct nfs_server *server = NFS_SERVER(dir);
1992 struct nfs4_opendata *opendata; 2097 struct nfs4_opendata *opendata;
2098 struct dentry *dentry = ctx->dentry;
2099 struct rpc_cred *cred = ctx->cred;
2100 struct nfs4_threshold **ctx_th = &ctx->mdsthreshold;
2101 fmode_t fmode = ctx->mode & (FMODE_READ|FMODE_WRITE|FMODE_EXEC);
1993 enum open_claim_type4 claim = NFS4_OPEN_CLAIM_NULL; 2102 enum open_claim_type4 claim = NFS4_OPEN_CLAIM_NULL;
2103 struct nfs4_label *olabel = NULL;
1994 int status; 2104 int status;
1995 2105
1996 /* Protect against reboot recovery conflicts */ 2106 /* Protect against reboot recovery conflicts */
@@ -2009,22 +2119,31 @@ static int _nfs4_do_open(struct inode *dir,
2009 if (dentry->d_inode) 2119 if (dentry->d_inode)
2010 claim = NFS4_OPEN_CLAIM_FH; 2120 claim = NFS4_OPEN_CLAIM_FH;
2011 opendata = nfs4_opendata_alloc(dentry, sp, fmode, flags, sattr, 2121 opendata = nfs4_opendata_alloc(dentry, sp, fmode, flags, sattr,
2012 claim, GFP_KERNEL); 2122 label, claim, GFP_KERNEL);
2013 if (opendata == NULL) 2123 if (opendata == NULL)
2014 goto err_put_state_owner; 2124 goto err_put_state_owner;
2015 2125
2126 if (label) {
2127 olabel = nfs4_label_alloc(server, GFP_KERNEL);
2128 if (IS_ERR(olabel)) {
2129 status = PTR_ERR(olabel);
2130 goto err_opendata_put;
2131 }
2132 }
2133
2016 if (ctx_th && server->attr_bitmask[2] & FATTR4_WORD2_MDSTHRESHOLD) { 2134 if (ctx_th && server->attr_bitmask[2] & FATTR4_WORD2_MDSTHRESHOLD) {
2017 opendata->f_attr.mdsthreshold = pnfs_mdsthreshold_alloc(); 2135 opendata->f_attr.mdsthreshold = pnfs_mdsthreshold_alloc();
2018 if (!opendata->f_attr.mdsthreshold) 2136 if (!opendata->f_attr.mdsthreshold)
2019 goto err_opendata_put; 2137 goto err_free_label;
2020 opendata->o_arg.open_bitmap = &nfs4_pnfs_open_bitmap[0]; 2138 opendata->o_arg.open_bitmap = &nfs4_pnfs_open_bitmap[0];
2021 } 2139 }
2022 if (dentry->d_inode != NULL) 2140 if (dentry->d_inode != NULL)
2023 opendata->state = nfs4_get_open_state(dentry->d_inode, sp); 2141 opendata->state = nfs4_get_open_state(dentry->d_inode, sp);
2024 2142
2025 status = _nfs4_open_and_get_state(opendata, fmode, flags, &state); 2143 status = _nfs4_open_and_get_state(opendata, fmode, flags, ctx);
2026 if (status != 0) 2144 if (status != 0)
2027 goto err_opendata_put; 2145 goto err_free_label;
2146 state = ctx->state;
2028 2147
2029 if ((opendata->o_arg.open_flags & O_EXCL) && 2148 if ((opendata->o_arg.open_flags & O_EXCL) &&
2030 (opendata->o_arg.createmode != NFS4_CREATE_GUARDED)) { 2149 (opendata->o_arg.createmode != NFS4_CREATE_GUARDED)) {
@@ -2033,10 +2152,12 @@ static int _nfs4_do_open(struct inode *dir,
2033 nfs_fattr_init(opendata->o_res.f_attr); 2152 nfs_fattr_init(opendata->o_res.f_attr);
2034 status = nfs4_do_setattr(state->inode, cred, 2153 status = nfs4_do_setattr(state->inode, cred,
2035 opendata->o_res.f_attr, sattr, 2154 opendata->o_res.f_attr, sattr,
2036 state); 2155 state, label, olabel);
2037 if (status == 0) 2156 if (status == 0) {
2038 nfs_setattr_update_inode(state->inode, sattr); 2157 nfs_setattr_update_inode(state->inode, sattr);
2039 nfs_post_op_update_inode(state->inode, opendata->o_res.f_attr); 2158 nfs_post_op_update_inode(state->inode, opendata->o_res.f_attr);
2159 nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel);
2160 }
2040 } 2161 }
2041 2162
2042 if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server)) 2163 if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server))
@@ -2045,38 +2166,37 @@ static int _nfs4_do_open(struct inode *dir,
2045 kfree(opendata->f_attr.mdsthreshold); 2166 kfree(opendata->f_attr.mdsthreshold);
2046 opendata->f_attr.mdsthreshold = NULL; 2167 opendata->f_attr.mdsthreshold = NULL;
2047 2168
2169 nfs4_label_free(olabel);
2170
2048 nfs4_opendata_put(opendata); 2171 nfs4_opendata_put(opendata);
2049 nfs4_put_state_owner(sp); 2172 nfs4_put_state_owner(sp);
2050 *res = state;
2051 return 0; 2173 return 0;
2174err_free_label:
2175 nfs4_label_free(olabel);
2052err_opendata_put: 2176err_opendata_put:
2053 kfree(opendata->f_attr.mdsthreshold); 2177 kfree(opendata->f_attr.mdsthreshold);
2054 nfs4_opendata_put(opendata); 2178 nfs4_opendata_put(opendata);
2055err_put_state_owner: 2179err_put_state_owner:
2056 nfs4_put_state_owner(sp); 2180 nfs4_put_state_owner(sp);
2057out_err: 2181out_err:
2058 *res = NULL;
2059 return status; 2182 return status;
2060} 2183}
2061 2184
2062 2185
2063static struct nfs4_state *nfs4_do_open(struct inode *dir, 2186static struct nfs4_state *nfs4_do_open(struct inode *dir,
2064 struct dentry *dentry, 2187 struct nfs_open_context *ctx,
2065 fmode_t fmode,
2066 int flags, 2188 int flags,
2067 struct iattr *sattr, 2189 struct iattr *sattr,
2068 struct rpc_cred *cred, 2190 struct nfs4_label *label)
2069 struct nfs4_threshold **ctx_th)
2070{ 2191{
2071 struct nfs_server *server = NFS_SERVER(dir); 2192 struct nfs_server *server = NFS_SERVER(dir);
2072 struct nfs4_exception exception = { }; 2193 struct nfs4_exception exception = { };
2073 struct nfs4_state *res; 2194 struct nfs4_state *res;
2074 int status; 2195 int status;
2075 2196
2076 fmode &= FMODE_READ|FMODE_WRITE|FMODE_EXEC;
2077 do { 2197 do {
2078 status = _nfs4_do_open(dir, dentry, fmode, flags, sattr, cred, 2198 status = _nfs4_do_open(dir, ctx, flags, sattr, label);
2079 &res, ctx_th); 2199 res = ctx->state;
2080 if (status == 0) 2200 if (status == 0)
2081 break; 2201 break;
2082 /* NOTE: BAD_SEQID means the server and client disagree about the 2202 /* NOTE: BAD_SEQID means the server and client disagree about the
@@ -2122,7 +2242,8 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir,
2122 2242
2123static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, 2243static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
2124 struct nfs_fattr *fattr, struct iattr *sattr, 2244 struct nfs_fattr *fattr, struct iattr *sattr,
2125 struct nfs4_state *state) 2245 struct nfs4_state *state, struct nfs4_label *ilabel,
2246 struct nfs4_label *olabel)
2126{ 2247{
2127 struct nfs_server *server = NFS_SERVER(inode); 2248 struct nfs_server *server = NFS_SERVER(inode);
2128 struct nfs_setattrargs arg = { 2249 struct nfs_setattrargs arg = {
@@ -2130,9 +2251,11 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
2130 .iap = sattr, 2251 .iap = sattr,
2131 .server = server, 2252 .server = server,
2132 .bitmask = server->attr_bitmask, 2253 .bitmask = server->attr_bitmask,
2254 .label = ilabel,
2133 }; 2255 };
2134 struct nfs_setattrres res = { 2256 struct nfs_setattrres res = {
2135 .fattr = fattr, 2257 .fattr = fattr,
2258 .label = olabel,
2136 .server = server, 2259 .server = server,
2137 }; 2260 };
2138 struct rpc_message msg = { 2261 struct rpc_message msg = {
@@ -2146,6 +2269,10 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
2146 bool truncate; 2269 bool truncate;
2147 int status; 2270 int status;
2148 2271
2272 arg.bitmask = nfs4_bitmask(server, ilabel);
2273 if (ilabel)
2274 arg.bitmask = nfs4_bitmask(server, olabel);
2275
2149 nfs_fattr_init(fattr); 2276 nfs_fattr_init(fattr);
2150 2277
2151 /* Servers should only apply open mode checks for file size changes */ 2278 /* Servers should only apply open mode checks for file size changes */
@@ -2172,7 +2299,8 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
2172 2299
2173static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, 2300static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
2174 struct nfs_fattr *fattr, struct iattr *sattr, 2301 struct nfs_fattr *fattr, struct iattr *sattr,
2175 struct nfs4_state *state) 2302 struct nfs4_state *state, struct nfs4_label *ilabel,
2303 struct nfs4_label *olabel)
2176{ 2304{
2177 struct nfs_server *server = NFS_SERVER(inode); 2305 struct nfs_server *server = NFS_SERVER(inode);
2178 struct nfs4_exception exception = { 2306 struct nfs4_exception exception = {
@@ -2181,7 +2309,7 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
2181 }; 2309 };
2182 int err; 2310 int err;
2183 do { 2311 do {
2184 err = _nfs4_do_setattr(inode, cred, fattr, sattr, state); 2312 err = _nfs4_do_setattr(inode, cred, fattr, sattr, state, ilabel, olabel);
2185 switch (err) { 2313 switch (err) {
2186 case -NFS4ERR_OPENMODE: 2314 case -NFS4ERR_OPENMODE:
2187 if (!(sattr->ia_valid & ATTR_SIZE)) { 2315 if (!(sattr->ia_valid & ATTR_SIZE)) {
@@ -2426,14 +2554,18 @@ static struct inode *
2426nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags, struct iattr *attr) 2554nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags, struct iattr *attr)
2427{ 2555{
2428 struct nfs4_state *state; 2556 struct nfs4_state *state;
2557 struct nfs4_label l = {0, 0, 0, NULL}, *label = NULL;
2558
2559 label = nfs4_label_init_security(dir, ctx->dentry, attr, &l);
2429 2560
2430 /* Protect against concurrent sillydeletes */ 2561 /* Protect against concurrent sillydeletes */
2431 state = nfs4_do_open(dir, ctx->dentry, ctx->mode, open_flags, attr, 2562 state = nfs4_do_open(dir, ctx, open_flags, attr, label);
2432 ctx->cred, &ctx->mdsthreshold); 2563
2564 nfs4_label_release_security(label);
2565
2433 if (IS_ERR(state)) 2566 if (IS_ERR(state))
2434 return ERR_CAST(state); 2567 return ERR_CAST(state);
2435 ctx->state = state; 2568 return state->inode;
2436 return igrab(state->inode);
2437} 2569}
2438 2570
2439static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync) 2571static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
@@ -2489,7 +2621,17 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
2489 server->caps |= NFS_CAP_CTIME; 2621 server->caps |= NFS_CAP_CTIME;
2490 if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY) 2622 if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY)
2491 server->caps |= NFS_CAP_MTIME; 2623 server->caps |= NFS_CAP_MTIME;
2624#ifdef CONFIG_NFS_V4_SECURITY_LABEL
2625 if (res.attr_bitmask[2] & FATTR4_WORD2_SECURITY_LABEL)
2626 server->caps |= NFS_CAP_SECURITY_LABEL;
2627#endif
2628 memcpy(server->attr_bitmask_nl, res.attr_bitmask,
2629 sizeof(server->attr_bitmask));
2492 2630
2631 if (server->caps & NFS_CAP_SECURITY_LABEL) {
2632 server->attr_bitmask_nl[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
2633 res.attr_bitmask[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
2634 }
2493 memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask)); 2635 memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask));
2494 server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE; 2636 server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE;
2495 server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; 2637 server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
@@ -2515,8 +2657,9 @@ int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
2515static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, 2657static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
2516 struct nfs_fsinfo *info) 2658 struct nfs_fsinfo *info)
2517{ 2659{
2660 u32 bitmask[3];
2518 struct nfs4_lookup_root_arg args = { 2661 struct nfs4_lookup_root_arg args = {
2519 .bitmask = nfs4_fattr_bitmap, 2662 .bitmask = bitmask,
2520 }; 2663 };
2521 struct nfs4_lookup_res res = { 2664 struct nfs4_lookup_res res = {
2522 .server = server, 2665 .server = server,
@@ -2529,6 +2672,13 @@ static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
2529 .rpc_resp = &res, 2672 .rpc_resp = &res,
2530 }; 2673 };
2531 2674
2675 bitmask[0] = nfs4_fattr_bitmap[0];
2676 bitmask[1] = nfs4_fattr_bitmap[1];
2677 /*
2678 * Process the label in the upcoming getfattr
2679 */
2680 bitmask[2] = nfs4_fattr_bitmap[2] & ~FATTR4_WORD2_SECURITY_LABEL;
2681
2532 nfs_fattr_init(info->fattr); 2682 nfs_fattr_init(info->fattr);
2533 return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); 2683 return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
2534} 2684}
@@ -2648,6 +2798,7 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *mntfh,
2648{ 2798{
2649 int error; 2799 int error;
2650 struct nfs_fattr *fattr = info->fattr; 2800 struct nfs_fattr *fattr = info->fattr;
2801 struct nfs4_label *label = NULL;
2651 2802
2652 error = nfs4_server_capabilities(server, mntfh); 2803 error = nfs4_server_capabilities(server, mntfh);
2653 if (error < 0) { 2804 if (error < 0) {
@@ -2655,16 +2806,23 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *mntfh,
2655 return error; 2806 return error;
2656 } 2807 }
2657 2808
2658 error = nfs4_proc_getattr(server, mntfh, fattr); 2809 label = nfs4_label_alloc(server, GFP_KERNEL);
2810 if (IS_ERR(label))
2811 return PTR_ERR(label);
2812
2813 error = nfs4_proc_getattr(server, mntfh, fattr, label);
2659 if (error < 0) { 2814 if (error < 0) {
2660 dprintk("nfs4_get_root: getattr error = %d\n", -error); 2815 dprintk("nfs4_get_root: getattr error = %d\n", -error);
2661 return error; 2816 goto err_free_label;
2662 } 2817 }
2663 2818
2664 if (fattr->valid & NFS_ATTR_FATTR_FSID && 2819 if (fattr->valid & NFS_ATTR_FATTR_FSID &&
2665 !nfs_fsid_equal(&server->fsid, &fattr->fsid)) 2820 !nfs_fsid_equal(&server->fsid, &fattr->fsid))
2666 memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid)); 2821 memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid));
2667 2822
2823err_free_label:
2824 nfs4_label_free(label);
2825
2668 return error; 2826 return error;
2669} 2827}
2670 2828
@@ -2711,7 +2869,8 @@ out:
2711 return status; 2869 return status;
2712} 2870}
2713 2871
2714static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr) 2872static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
2873 struct nfs_fattr *fattr, struct nfs4_label *label)
2715{ 2874{
2716 struct nfs4_getattr_arg args = { 2875 struct nfs4_getattr_arg args = {
2717 .fh = fhandle, 2876 .fh = fhandle,
@@ -2719,6 +2878,7 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
2719 }; 2878 };
2720 struct nfs4_getattr_res res = { 2879 struct nfs4_getattr_res res = {
2721 .fattr = fattr, 2880 .fattr = fattr,
2881 .label = label,
2722 .server = server, 2882 .server = server,
2723 }; 2883 };
2724 struct rpc_message msg = { 2884 struct rpc_message msg = {
@@ -2726,18 +2886,21 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
2726 .rpc_argp = &args, 2886 .rpc_argp = &args,
2727 .rpc_resp = &res, 2887 .rpc_resp = &res,
2728 }; 2888 };
2729 2889
2890 args.bitmask = nfs4_bitmask(server, label);
2891
2730 nfs_fattr_init(fattr); 2892 nfs_fattr_init(fattr);
2731 return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); 2893 return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
2732} 2894}
2733 2895
2734static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr) 2896static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
2897 struct nfs_fattr *fattr, struct nfs4_label *label)
2735{ 2898{
2736 struct nfs4_exception exception = { }; 2899 struct nfs4_exception exception = { };
2737 int err; 2900 int err;
2738 do { 2901 do {
2739 err = nfs4_handle_exception(server, 2902 err = nfs4_handle_exception(server,
2740 _nfs4_proc_getattr(server, fhandle, fattr), 2903 _nfs4_proc_getattr(server, fhandle, fattr, label),
2741 &exception); 2904 &exception);
2742 } while (exception.retry); 2905 } while (exception.retry);
2743 return err; 2906 return err;
@@ -2767,6 +2930,7 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
2767 struct inode *inode = dentry->d_inode; 2930 struct inode *inode = dentry->d_inode;
2768 struct rpc_cred *cred = NULL; 2931 struct rpc_cred *cred = NULL;
2769 struct nfs4_state *state = NULL; 2932 struct nfs4_state *state = NULL;
2933 struct nfs4_label *label = NULL;
2770 int status; 2934 int status;
2771 2935
2772 if (pnfs_ld_layoutret_on_setattr(inode)) 2936 if (pnfs_ld_layoutret_on_setattr(inode))
@@ -2793,15 +2957,22 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
2793 } 2957 }
2794 } 2958 }
2795 2959
2796 status = nfs4_do_setattr(inode, cred, fattr, sattr, state); 2960 label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL);
2797 if (status == 0) 2961 if (IS_ERR(label))
2962 return PTR_ERR(label);
2963
2964 status = nfs4_do_setattr(inode, cred, fattr, sattr, state, NULL, label);
2965 if (status == 0) {
2798 nfs_setattr_update_inode(inode, sattr); 2966 nfs_setattr_update_inode(inode, sattr);
2967 nfs_setsecurity(inode, fattr, label);
2968 }
2969 nfs4_label_free(label);
2799 return status; 2970 return status;
2800} 2971}
2801 2972
2802static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, 2973static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir,
2803 const struct qstr *name, struct nfs_fh *fhandle, 2974 const struct qstr *name, struct nfs_fh *fhandle,
2804 struct nfs_fattr *fattr) 2975 struct nfs_fattr *fattr, struct nfs4_label *label)
2805{ 2976{
2806 struct nfs_server *server = NFS_SERVER(dir); 2977 struct nfs_server *server = NFS_SERVER(dir);
2807 int status; 2978 int status;
@@ -2813,6 +2984,7 @@ static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir,
2813 struct nfs4_lookup_res res = { 2984 struct nfs4_lookup_res res = {
2814 .server = server, 2985 .server = server,
2815 .fattr = fattr, 2986 .fattr = fattr,
2987 .label = label,
2816 .fh = fhandle, 2988 .fh = fhandle,
2817 }; 2989 };
2818 struct rpc_message msg = { 2990 struct rpc_message msg = {
@@ -2821,6 +2993,8 @@ static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir,
2821 .rpc_resp = &res, 2993 .rpc_resp = &res,
2822 }; 2994 };
2823 2995
2996 args.bitmask = nfs4_bitmask(server, label);
2997
2824 nfs_fattr_init(fattr); 2998 nfs_fattr_init(fattr);
2825 2999
2826 dprintk("NFS call lookup %s\n", name->name); 3000 dprintk("NFS call lookup %s\n", name->name);
@@ -2839,13 +3013,13 @@ static void nfs_fixup_secinfo_attributes(struct nfs_fattr *fattr)
2839 3013
2840static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir, 3014static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir,
2841 struct qstr *name, struct nfs_fh *fhandle, 3015 struct qstr *name, struct nfs_fh *fhandle,
2842 struct nfs_fattr *fattr) 3016 struct nfs_fattr *fattr, struct nfs4_label *label)
2843{ 3017{
2844 struct nfs4_exception exception = { }; 3018 struct nfs4_exception exception = { };
2845 struct rpc_clnt *client = *clnt; 3019 struct rpc_clnt *client = *clnt;
2846 int err; 3020 int err;
2847 do { 3021 do {
2848 err = _nfs4_proc_lookup(client, dir, name, fhandle, fattr); 3022 err = _nfs4_proc_lookup(client, dir, name, fhandle, fattr, label);
2849 switch (err) { 3023 switch (err) {
2850 case -NFS4ERR_BADNAME: 3024 case -NFS4ERR_BADNAME:
2851 err = -ENOENT; 3025 err = -ENOENT;
@@ -2879,12 +3053,13 @@ out:
2879} 3053}
2880 3054
2881static int nfs4_proc_lookup(struct inode *dir, struct qstr *name, 3055static int nfs4_proc_lookup(struct inode *dir, struct qstr *name,
2882 struct nfs_fh *fhandle, struct nfs_fattr *fattr) 3056 struct nfs_fh *fhandle, struct nfs_fattr *fattr,
3057 struct nfs4_label *label)
2883{ 3058{
2884 int status; 3059 int status;
2885 struct rpc_clnt *client = NFS_CLIENT(dir); 3060 struct rpc_clnt *client = NFS_CLIENT(dir);
2886 3061
2887 status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr); 3062 status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr, label);
2888 if (client != NFS_CLIENT(dir)) { 3063 if (client != NFS_CLIENT(dir)) {
2889 rpc_shutdown_client(client); 3064 rpc_shutdown_client(client);
2890 nfs_fixup_secinfo_attributes(fattr); 3065 nfs_fixup_secinfo_attributes(fattr);
@@ -2899,7 +3074,7 @@ nfs4_proc_lookup_mountpoint(struct inode *dir, struct qstr *name,
2899 int status; 3074 int status;
2900 struct rpc_clnt *client = rpc_clone_client(NFS_CLIENT(dir)); 3075 struct rpc_clnt *client = rpc_clone_client(NFS_CLIENT(dir));
2901 3076
2902 status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr); 3077 status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr, NULL);
2903 if (status < 0) { 3078 if (status < 0) {
2904 rpc_shutdown_client(client); 3079 rpc_shutdown_client(client);
2905 return ERR_PTR(status); 3080 return ERR_PTR(status);
@@ -2924,7 +3099,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry
2924 .rpc_cred = entry->cred, 3099 .rpc_cred = entry->cred,
2925 }; 3100 };
2926 int mode = entry->mask; 3101 int mode = entry->mask;
2927 int status; 3102 int status = 0;
2928 3103
2929 /* 3104 /*
2930 * Determine which access bits we want to ask for... 3105 * Determine which access bits we want to ask for...
@@ -3029,6 +3204,7 @@ static int
3029nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, 3204nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
3030 int flags) 3205 int flags)
3031{ 3206{
3207 struct nfs4_label l, *ilabel = NULL;
3032 struct nfs_open_context *ctx; 3208 struct nfs_open_context *ctx;
3033 struct nfs4_state *state; 3209 struct nfs4_state *state;
3034 int status = 0; 3210 int status = 0;
@@ -3037,19 +3213,16 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
3037 if (IS_ERR(ctx)) 3213 if (IS_ERR(ctx))
3038 return PTR_ERR(ctx); 3214 return PTR_ERR(ctx);
3039 3215
3216 ilabel = nfs4_label_init_security(dir, dentry, sattr, &l);
3217
3040 sattr->ia_mode &= ~current_umask(); 3218 sattr->ia_mode &= ~current_umask();
3041 state = nfs4_do_open(dir, dentry, ctx->mode, 3219 state = nfs4_do_open(dir, ctx, flags, sattr, ilabel);
3042 flags, sattr, ctx->cred,
3043 &ctx->mdsthreshold);
3044 d_drop(dentry);
3045 if (IS_ERR(state)) { 3220 if (IS_ERR(state)) {
3046 status = PTR_ERR(state); 3221 status = PTR_ERR(state);
3047 goto out; 3222 goto out;
3048 } 3223 }
3049 d_add(dentry, igrab(state->inode));
3050 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
3051 ctx->state = state;
3052out: 3224out:
3225 nfs4_label_release_security(ilabel);
3053 put_nfs_open_context(ctx); 3226 put_nfs_open_context(ctx);
3054 return status; 3227 return status;
3055} 3228}
@@ -3098,6 +3271,8 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
3098 res->server = server; 3271 res->server = server;
3099 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE]; 3272 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE];
3100 nfs41_init_sequence(&args->seq_args, &res->seq_res, 1); 3273 nfs41_init_sequence(&args->seq_args, &res->seq_res, 1);
3274
3275 nfs_fattr_init(res->dir_attr);
3101} 3276}
3102 3277
3103static void nfs4_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data) 3278static void nfs4_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data)
@@ -3173,7 +3348,7 @@ static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
3173 .rpc_resp = &res, 3348 .rpc_resp = &res,
3174 }; 3349 };
3175 int status = -ENOMEM; 3350 int status = -ENOMEM;
3176 3351
3177 status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); 3352 status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
3178 if (!status) { 3353 if (!status) {
3179 update_changeattr(old_dir, &res.old_cinfo); 3354 update_changeattr(old_dir, &res.old_cinfo);
@@ -3207,6 +3382,7 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *
3207 }; 3382 };
3208 struct nfs4_link_res res = { 3383 struct nfs4_link_res res = {
3209 .server = server, 3384 .server = server,
3385 .label = NULL,
3210 }; 3386 };
3211 struct rpc_message msg = { 3387 struct rpc_message msg = {
3212 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LINK], 3388 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LINK],
@@ -3219,11 +3395,24 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *
3219 if (res.fattr == NULL) 3395 if (res.fattr == NULL)
3220 goto out; 3396 goto out;
3221 3397
3398 res.label = nfs4_label_alloc(server, GFP_KERNEL);
3399 if (IS_ERR(res.label)) {
3400 status = PTR_ERR(res.label);
3401 goto out;
3402 }
3403 arg.bitmask = nfs4_bitmask(server, res.label);
3404
3222 status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); 3405 status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
3223 if (!status) { 3406 if (!status) {
3224 update_changeattr(dir, &res.cinfo); 3407 update_changeattr(dir, &res.cinfo);
3225 nfs_post_op_update_inode(inode, res.fattr); 3408 status = nfs_post_op_update_inode(inode, res.fattr);
3409 if (!status)
3410 nfs_setsecurity(inode, res.fattr, res.label);
3226 } 3411 }
3412
3413
3414 nfs4_label_free(res.label);
3415
3227out: 3416out:
3228 nfs_free_fattr(res.fattr); 3417 nfs_free_fattr(res.fattr);
3229 return status; 3418 return status;
@@ -3247,6 +3436,7 @@ struct nfs4_createdata {
3247 struct nfs4_create_res res; 3436 struct nfs4_create_res res;
3248 struct nfs_fh fh; 3437 struct nfs_fh fh;
3249 struct nfs_fattr fattr; 3438 struct nfs_fattr fattr;
3439 struct nfs4_label *label;
3250}; 3440};
3251 3441
3252static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir, 3442static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
@@ -3258,6 +3448,10 @@ static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
3258 if (data != NULL) { 3448 if (data != NULL) {
3259 struct nfs_server *server = NFS_SERVER(dir); 3449 struct nfs_server *server = NFS_SERVER(dir);
3260 3450
3451 data->label = nfs4_label_alloc(server, GFP_KERNEL);
3452 if (IS_ERR(data->label))
3453 goto out_free;
3454
3261 data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE]; 3455 data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE];
3262 data->msg.rpc_argp = &data->arg; 3456 data->msg.rpc_argp = &data->arg;
3263 data->msg.rpc_resp = &data->res; 3457 data->msg.rpc_resp = &data->res;
@@ -3266,13 +3460,17 @@ static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
3266 data->arg.name = name; 3460 data->arg.name = name;
3267 data->arg.attrs = sattr; 3461 data->arg.attrs = sattr;
3268 data->arg.ftype = ftype; 3462 data->arg.ftype = ftype;
3269 data->arg.bitmask = server->attr_bitmask; 3463 data->arg.bitmask = nfs4_bitmask(server, data->label);
3270 data->res.server = server; 3464 data->res.server = server;
3271 data->res.fh = &data->fh; 3465 data->res.fh = &data->fh;
3272 data->res.fattr = &data->fattr; 3466 data->res.fattr = &data->fattr;
3467 data->res.label = data->label;
3273 nfs_fattr_init(data->res.fattr); 3468 nfs_fattr_init(data->res.fattr);
3274 } 3469 }
3275 return data; 3470 return data;
3471out_free:
3472 kfree(data);
3473 return NULL;
3276} 3474}
3277 3475
3278static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_createdata *data) 3476static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_createdata *data)
@@ -3281,18 +3479,20 @@ static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_
3281 &data->arg.seq_args, &data->res.seq_res, 1); 3479 &data->arg.seq_args, &data->res.seq_res, 1);
3282 if (status == 0) { 3480 if (status == 0) {
3283 update_changeattr(dir, &data->res.dir_cinfo); 3481 update_changeattr(dir, &data->res.dir_cinfo);
3284 status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); 3482 status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, data->res.label);
3285 } 3483 }
3286 return status; 3484 return status;
3287} 3485}
3288 3486
3289static void nfs4_free_createdata(struct nfs4_createdata *data) 3487static void nfs4_free_createdata(struct nfs4_createdata *data)
3290{ 3488{
3489 nfs4_label_free(data->label);
3291 kfree(data); 3490 kfree(data);
3292} 3491}
3293 3492
3294static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry, 3493static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
3295 struct page *page, unsigned int len, struct iattr *sattr) 3494 struct page *page, unsigned int len, struct iattr *sattr,
3495 struct nfs4_label *label)
3296{ 3496{
3297 struct nfs4_createdata *data; 3497 struct nfs4_createdata *data;
3298 int status = -ENAMETOOLONG; 3498 int status = -ENAMETOOLONG;
@@ -3308,6 +3508,7 @@ static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
3308 data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SYMLINK]; 3508 data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SYMLINK];
3309 data->arg.u.symlink.pages = &page; 3509 data->arg.u.symlink.pages = &page;
3310 data->arg.u.symlink.len = len; 3510 data->arg.u.symlink.len = len;
3511 data->arg.label = label;
3311 3512
3312 status = nfs4_do_create(dir, dentry, data); 3513 status = nfs4_do_create(dir, dentry, data);
3313 3514
@@ -3320,18 +3521,24 @@ static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
3320 struct page *page, unsigned int len, struct iattr *sattr) 3521 struct page *page, unsigned int len, struct iattr *sattr)
3321{ 3522{
3322 struct nfs4_exception exception = { }; 3523 struct nfs4_exception exception = { };
3524 struct nfs4_label l, *label = NULL;
3323 int err; 3525 int err;
3526
3527 label = nfs4_label_init_security(dir, dentry, sattr, &l);
3528
3324 do { 3529 do {
3325 err = nfs4_handle_exception(NFS_SERVER(dir), 3530 err = nfs4_handle_exception(NFS_SERVER(dir),
3326 _nfs4_proc_symlink(dir, dentry, page, 3531 _nfs4_proc_symlink(dir, dentry, page,
3327 len, sattr), 3532 len, sattr, label),
3328 &exception); 3533 &exception);
3329 } while (exception.retry); 3534 } while (exception.retry);
3535
3536 nfs4_label_release_security(label);
3330 return err; 3537 return err;
3331} 3538}
3332 3539
3333static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, 3540static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
3334 struct iattr *sattr) 3541 struct iattr *sattr, struct nfs4_label *label)
3335{ 3542{
3336 struct nfs4_createdata *data; 3543 struct nfs4_createdata *data;
3337 int status = -ENOMEM; 3544 int status = -ENOMEM;
@@ -3340,6 +3547,7 @@ static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
3340 if (data == NULL) 3547 if (data == NULL)
3341 goto out; 3548 goto out;
3342 3549
3550 data->arg.label = label;
3343 status = nfs4_do_create(dir, dentry, data); 3551 status = nfs4_do_create(dir, dentry, data);
3344 3552
3345 nfs4_free_createdata(data); 3553 nfs4_free_createdata(data);
@@ -3351,14 +3559,19 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
3351 struct iattr *sattr) 3559 struct iattr *sattr)
3352{ 3560{
3353 struct nfs4_exception exception = { }; 3561 struct nfs4_exception exception = { };
3562 struct nfs4_label l, *label = NULL;
3354 int err; 3563 int err;
3355 3564
3565 label = nfs4_label_init_security(dir, dentry, sattr, &l);
3566
3356 sattr->ia_mode &= ~current_umask(); 3567 sattr->ia_mode &= ~current_umask();
3357 do { 3568 do {
3358 err = nfs4_handle_exception(NFS_SERVER(dir), 3569 err = nfs4_handle_exception(NFS_SERVER(dir),
3359 _nfs4_proc_mkdir(dir, dentry, sattr), 3570 _nfs4_proc_mkdir(dir, dentry, sattr, label),
3360 &exception); 3571 &exception);
3361 } while (exception.retry); 3572 } while (exception.retry);
3573 nfs4_label_release_security(label);
3574
3362 return err; 3575 return err;
3363} 3576}
3364 3577
@@ -3416,7 +3629,7 @@ static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
3416} 3629}
3417 3630
3418static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry, 3631static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
3419 struct iattr *sattr, dev_t rdev) 3632 struct iattr *sattr, struct nfs4_label *label, dev_t rdev)
3420{ 3633{
3421 struct nfs4_createdata *data; 3634 struct nfs4_createdata *data;
3422 int mode = sattr->ia_mode; 3635 int mode = sattr->ia_mode;
@@ -3441,7 +3654,8 @@ static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
3441 status = -EINVAL; 3654 status = -EINVAL;
3442 goto out_free; 3655 goto out_free;
3443 } 3656 }
3444 3657
3658 data->arg.label = label;
3445 status = nfs4_do_create(dir, dentry, data); 3659 status = nfs4_do_create(dir, dentry, data);
3446out_free: 3660out_free:
3447 nfs4_free_createdata(data); 3661 nfs4_free_createdata(data);
@@ -3453,14 +3667,20 @@ static int nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
3453 struct iattr *sattr, dev_t rdev) 3667 struct iattr *sattr, dev_t rdev)
3454{ 3668{
3455 struct nfs4_exception exception = { }; 3669 struct nfs4_exception exception = { };
3670 struct nfs4_label l, *label = NULL;
3456 int err; 3671 int err;
3457 3672
3673 label = nfs4_label_init_security(dir, dentry, sattr, &l);
3674
3458 sattr->ia_mode &= ~current_umask(); 3675 sattr->ia_mode &= ~current_umask();
3459 do { 3676 do {
3460 err = nfs4_handle_exception(NFS_SERVER(dir), 3677 err = nfs4_handle_exception(NFS_SERVER(dir),
3461 _nfs4_proc_mknod(dir, dentry, sattr, rdev), 3678 _nfs4_proc_mknod(dir, dentry, sattr, label, rdev),
3462 &exception); 3679 &exception);
3463 } while (exception.retry); 3680 } while (exception.retry);
3681
3682 nfs4_label_release_security(label);
3683
3464 return err; 3684 return err;
3465} 3685}
3466 3686
@@ -4187,6 +4407,155 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen
4187 return err; 4407 return err;
4188} 4408}
4189 4409
4410#ifdef CONFIG_NFS_V4_SECURITY_LABEL
4411static int _nfs4_get_security_label(struct inode *inode, void *buf,
4412 size_t buflen)
4413{
4414 struct nfs_server *server = NFS_SERVER(inode);
4415 struct nfs_fattr fattr;
4416 struct nfs4_label label = {0, 0, buflen, buf};
4417
4418 u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL };
4419 struct nfs4_getattr_arg args = {
4420 .fh = NFS_FH(inode),
4421 .bitmask = bitmask,
4422 };
4423 struct nfs4_getattr_res res = {
4424 .fattr = &fattr,
4425 .label = &label,
4426 .server = server,
4427 };
4428 struct rpc_message msg = {
4429 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETATTR],
4430 .rpc_argp = &args,
4431 .rpc_resp = &res,
4432 };
4433 int ret;
4434
4435 nfs_fattr_init(&fattr);
4436
4437 ret = rpc_call_sync(server->client, &msg, 0);
4438 if (ret)
4439 return ret;
4440 if (!(fattr.valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL))
4441 return -ENOENT;
4442 if (buflen < label.len)
4443 return -ERANGE;
4444 return 0;
4445}
4446
4447static int nfs4_get_security_label(struct inode *inode, void *buf,
4448 size_t buflen)
4449{
4450 struct nfs4_exception exception = { };
4451 int err;
4452
4453 if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL))
4454 return -EOPNOTSUPP;
4455
4456 do {
4457 err = nfs4_handle_exception(NFS_SERVER(inode),
4458 _nfs4_get_security_label(inode, buf, buflen),
4459 &exception);
4460 } while (exception.retry);
4461 return err;
4462}
4463
4464static int _nfs4_do_set_security_label(struct inode *inode,
4465 struct nfs4_label *ilabel,
4466 struct nfs_fattr *fattr,
4467 struct nfs4_label *olabel)
4468{
4469
4470 struct iattr sattr = {0};
4471 struct nfs_server *server = NFS_SERVER(inode);
4472 const u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL };
4473 struct nfs_setattrargs args = {
4474 .fh = NFS_FH(inode),
4475 .iap = &sattr,
4476 .server = server,
4477 .bitmask = bitmask,
4478 .label = ilabel,
4479 };
4480 struct nfs_setattrres res = {
4481 .fattr = fattr,
4482 .label = olabel,
4483 .server = server,
4484 };
4485 struct rpc_message msg = {
4486 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR],
4487 .rpc_argp = &args,
4488 .rpc_resp = &res,
4489 };
4490 int status;
4491
4492 nfs4_stateid_copy(&args.stateid, &zero_stateid);
4493
4494 status = rpc_call_sync(server->client, &msg, 0);
4495 if (status)
4496 dprintk("%s failed: %d\n", __func__, status);
4497
4498 return status;
4499}
4500
4501static int nfs4_do_set_security_label(struct inode *inode,
4502 struct nfs4_label *ilabel,
4503 struct nfs_fattr *fattr,
4504 struct nfs4_label *olabel)
4505{
4506 struct nfs4_exception exception = { };
4507 int err;
4508
4509 do {
4510 err = nfs4_handle_exception(NFS_SERVER(inode),
4511 _nfs4_do_set_security_label(inode, ilabel,
4512 fattr, olabel),
4513 &exception);
4514 } while (exception.retry);
4515 return err;
4516}
4517
4518static int
4519nfs4_set_security_label(struct dentry *dentry, const void *buf, size_t buflen)
4520{
4521 struct nfs4_label ilabel, *olabel = NULL;
4522 struct nfs_fattr fattr;
4523 struct rpc_cred *cred;
4524 struct inode *inode = dentry->d_inode;
4525 int status;
4526
4527 if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL))
4528 return -EOPNOTSUPP;
4529
4530 nfs_fattr_init(&fattr);
4531
4532 ilabel.pi = 0;
4533 ilabel.lfs = 0;
4534 ilabel.label = (char *)buf;
4535 ilabel.len = buflen;
4536
4537 cred = rpc_lookup_cred();
4538 if (IS_ERR(cred))
4539 return PTR_ERR(cred);
4540
4541 olabel = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL);
4542 if (IS_ERR(olabel)) {
4543 status = -PTR_ERR(olabel);
4544 goto out;
4545 }
4546
4547 status = nfs4_do_set_security_label(inode, &ilabel, &fattr, olabel);
4548 if (status == 0)
4549 nfs_setsecurity(inode, &fattr, olabel);
4550
4551 nfs4_label_free(olabel);
4552out:
4553 put_rpccred(cred);
4554 return status;
4555}
4556#endif /* CONFIG_NFS_V4_SECURITY_LABEL */
4557
4558
4190static int 4559static int
4191nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state) 4560nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state)
4192{ 4561{
@@ -4345,7 +4714,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
4345 /* cb_client4 */ 4714 /* cb_client4 */
4346 rcu_read_lock(); 4715 rcu_read_lock();
4347 setclientid.sc_netid_len = scnprintf(setclientid.sc_netid, 4716 setclientid.sc_netid_len = scnprintf(setclientid.sc_netid,
4348 sizeof(setclientid.sc_netid), 4717 sizeof(setclientid.sc_netid), "%s",
4349 rpc_peeraddr2str(clp->cl_rpcclient, 4718 rpc_peeraddr2str(clp->cl_rpcclient,
4350 RPC_DISPLAY_NETID)); 4719 RPC_DISPLAY_NETID));
4351 rcu_read_unlock(); 4720 rcu_read_unlock();
@@ -4528,7 +4897,7 @@ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4
4528static unsigned long 4897static unsigned long
4529nfs4_set_lock_task_retry(unsigned long timeout) 4898nfs4_set_lock_task_retry(unsigned long timeout)
4530{ 4899{
4531 freezable_schedule_timeout_killable(timeout); 4900 freezable_schedule_timeout_killable_unsafe(timeout);
4532 timeout <<= 1; 4901 timeout <<= 1;
4533 if (timeout > NFS4_LOCK_MAXTIMEOUT) 4902 if (timeout > NFS4_LOCK_MAXTIMEOUT)
4534 return NFS4_LOCK_MAXTIMEOUT; 4903 return NFS4_LOCK_MAXTIMEOUT;
@@ -5056,13 +5425,18 @@ static int nfs41_check_expired_locks(struct nfs4_state *state)
5056 5425
5057 list_for_each_entry(lsp, &state->lock_states, ls_locks) { 5426 list_for_each_entry(lsp, &state->lock_states, ls_locks) {
5058 if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) { 5427 if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) {
5059 status = nfs41_test_stateid(server, &lsp->ls_stateid); 5428 struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
5429
5430 status = nfs41_test_stateid(server,
5431 &lsp->ls_stateid,
5432 cred);
5060 if (status != NFS_OK) { 5433 if (status != NFS_OK) {
5061 /* Free the stateid unless the server 5434 /* Free the stateid unless the server
5062 * informs us the stateid is unrecognized. */ 5435 * informs us the stateid is unrecognized. */
5063 if (status != -NFS4ERR_BAD_STATEID) 5436 if (status != -NFS4ERR_BAD_STATEID)
5064 nfs41_free_stateid(server, 5437 nfs41_free_stateid(server,
5065 &lsp->ls_stateid); 5438 &lsp->ls_stateid,
5439 cred);
5066 clear_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags); 5440 clear_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags);
5067 ret = status; 5441 ret = status;
5068 } 5442 }
@@ -5295,6 +5669,53 @@ static size_t nfs4_xattr_list_nfs4_acl(struct dentry *dentry, char *list,
5295 return len; 5669 return len;
5296} 5670}
5297 5671
5672#ifdef CONFIG_NFS_V4_SECURITY_LABEL
5673static inline int nfs4_server_supports_labels(struct nfs_server *server)
5674{
5675 return server->caps & NFS_CAP_SECURITY_LABEL;
5676}
5677
5678static int nfs4_xattr_set_nfs4_label(struct dentry *dentry, const char *key,
5679 const void *buf, size_t buflen,
5680 int flags, int type)
5681{
5682 if (security_ismaclabel(key))
5683 return nfs4_set_security_label(dentry, buf, buflen);
5684
5685 return -EOPNOTSUPP;
5686}
5687
5688static int nfs4_xattr_get_nfs4_label(struct dentry *dentry, const char *key,
5689 void *buf, size_t buflen, int type)
5690{
5691 if (security_ismaclabel(key))
5692 return nfs4_get_security_label(dentry->d_inode, buf, buflen);
5693 return -EOPNOTSUPP;
5694}
5695
5696static size_t nfs4_xattr_list_nfs4_label(struct dentry *dentry, char *list,
5697 size_t list_len, const char *name,
5698 size_t name_len, int type)
5699{
5700 size_t len = 0;
5701
5702 if (nfs_server_capable(dentry->d_inode, NFS_CAP_SECURITY_LABEL)) {
5703 len = security_inode_listsecurity(dentry->d_inode, NULL, 0);
5704 if (list && len <= list_len)
5705 security_inode_listsecurity(dentry->d_inode, list, len);
5706 }
5707 return len;
5708}
5709
5710static const struct xattr_handler nfs4_xattr_nfs4_label_handler = {
5711 .prefix = XATTR_SECURITY_PREFIX,
5712 .list = nfs4_xattr_list_nfs4_label,
5713 .get = nfs4_xattr_get_nfs4_label,
5714 .set = nfs4_xattr_set_nfs4_label,
5715};
5716#endif
5717
5718
5298/* 5719/*
5299 * nfs_fhget will use either the mounted_on_fileid or the fileid 5720 * nfs_fhget will use either the mounted_on_fileid or the fileid
5300 */ 5721 */
@@ -5318,7 +5739,7 @@ static int _nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir,
5318 struct page *page) 5739 struct page *page)
5319{ 5740{
5320 struct nfs_server *server = NFS_SERVER(dir); 5741 struct nfs_server *server = NFS_SERVER(dir);
5321 u32 bitmask[2] = { 5742 u32 bitmask[3] = {
5322 [0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS, 5743 [0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS,
5323 }; 5744 };
5324 struct nfs4_fs_locations_arg args = { 5745 struct nfs4_fs_locations_arg args = {
@@ -5505,7 +5926,8 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
5505 struct nfs41_exchange_id_args args = { 5926 struct nfs41_exchange_id_args args = {
5506 .verifier = &verifier, 5927 .verifier = &verifier,
5507 .client = clp, 5928 .client = clp,
5508 .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER, 5929 .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER |
5930 EXCHGID4_FLAG_BIND_PRINC_STATEID,
5509 }; 5931 };
5510 struct nfs41_exchange_id_res res = { 5932 struct nfs41_exchange_id_res res = {
5511 0 5933 0
@@ -5762,17 +6184,14 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
5762 */ 6184 */
5763static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args) 6185static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
5764{ 6186{
5765 struct nfs4_session *session = args->client->cl_session; 6187 unsigned int max_rqst_sz, max_resp_sz;
5766 unsigned int mxrqst_sz = session->fc_target_max_rqst_sz, 6188
5767 mxresp_sz = session->fc_target_max_resp_sz; 6189 max_rqst_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxwrite_overhead;
6190 max_resp_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxread_overhead;
5768 6191
5769 if (mxrqst_sz == 0)
5770 mxrqst_sz = NFS_MAX_FILE_IO_SIZE;
5771 if (mxresp_sz == 0)
5772 mxresp_sz = NFS_MAX_FILE_IO_SIZE;
5773 /* Fore channel attributes */ 6192 /* Fore channel attributes */
5774 args->fc_attrs.max_rqst_sz = mxrqst_sz; 6193 args->fc_attrs.max_rqst_sz = max_rqst_sz;
5775 args->fc_attrs.max_resp_sz = mxresp_sz; 6194 args->fc_attrs.max_resp_sz = max_resp_sz;
5776 args->fc_attrs.max_ops = NFS4_MAX_OPS; 6195 args->fc_attrs.max_ops = NFS4_MAX_OPS;
5777 args->fc_attrs.max_reqs = max_session_slots; 6196 args->fc_attrs.max_reqs = max_session_slots;
5778 6197
@@ -6159,12 +6578,14 @@ static const struct rpc_call_ops nfs4_reclaim_complete_call_ops = {
6159/* 6578/*
6160 * Issue a global reclaim complete. 6579 * Issue a global reclaim complete.
6161 */ 6580 */
6162static int nfs41_proc_reclaim_complete(struct nfs_client *clp) 6581static int nfs41_proc_reclaim_complete(struct nfs_client *clp,
6582 struct rpc_cred *cred)
6163{ 6583{
6164 struct nfs4_reclaim_complete_data *calldata; 6584 struct nfs4_reclaim_complete_data *calldata;
6165 struct rpc_task *task; 6585 struct rpc_task *task;
6166 struct rpc_message msg = { 6586 struct rpc_message msg = {
6167 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RECLAIM_COMPLETE], 6587 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RECLAIM_COMPLETE],
6588 .rpc_cred = cred,
6168 }; 6589 };
6169 struct rpc_task_setup task_setup_data = { 6590 struct rpc_task_setup task_setup_data = {
6170 .rpc_client = clp->cl_rpcclient, 6591 .rpc_client = clp->cl_rpcclient,
@@ -6348,6 +6769,7 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
6348 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTGET], 6769 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTGET],
6349 .rpc_argp = &lgp->args, 6770 .rpc_argp = &lgp->args,
6350 .rpc_resp = &lgp->res, 6771 .rpc_resp = &lgp->res,
6772 .rpc_cred = lgp->cred,
6351 }; 6773 };
6352 struct rpc_task_setup task_setup_data = { 6774 struct rpc_task_setup task_setup_data = {
6353 .rpc_client = server->client, 6775 .rpc_client = server->client,
@@ -6451,6 +6873,7 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)
6451 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTRETURN], 6873 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTRETURN],
6452 .rpc_argp = &lrp->args, 6874 .rpc_argp = &lrp->args,
6453 .rpc_resp = &lrp->res, 6875 .rpc_resp = &lrp->res,
6876 .rpc_cred = lrp->cred,
6454 }; 6877 };
6455 struct rpc_task_setup task_setup_data = { 6878 struct rpc_task_setup task_setup_data = {
6456 .rpc_client = lrp->clp->cl_rpcclient, 6879 .rpc_client = lrp->clp->cl_rpcclient,
@@ -6520,7 +6943,9 @@ int nfs4_proc_getdevicelist(struct nfs_server *server,
6520EXPORT_SYMBOL_GPL(nfs4_proc_getdevicelist); 6943EXPORT_SYMBOL_GPL(nfs4_proc_getdevicelist);
6521 6944
6522static int 6945static int
6523_nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) 6946_nfs4_proc_getdeviceinfo(struct nfs_server *server,
6947 struct pnfs_device *pdev,
6948 struct rpc_cred *cred)
6524{ 6949{
6525 struct nfs4_getdeviceinfo_args args = { 6950 struct nfs4_getdeviceinfo_args args = {
6526 .pdev = pdev, 6951 .pdev = pdev,
@@ -6532,6 +6957,7 @@ _nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
6532 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICEINFO], 6957 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICEINFO],
6533 .rpc_argp = &args, 6958 .rpc_argp = &args,
6534 .rpc_resp = &res, 6959 .rpc_resp = &res,
6960 .rpc_cred = cred,
6535 }; 6961 };
6536 int status; 6962 int status;
6537 6963
@@ -6542,14 +6968,16 @@ _nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
6542 return status; 6968 return status;
6543} 6969}
6544 6970
6545int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) 6971int nfs4_proc_getdeviceinfo(struct nfs_server *server,
6972 struct pnfs_device *pdev,
6973 struct rpc_cred *cred)
6546{ 6974{
6547 struct nfs4_exception exception = { }; 6975 struct nfs4_exception exception = { };
6548 int err; 6976 int err;
6549 6977
6550 do { 6978 do {
6551 err = nfs4_handle_exception(server, 6979 err = nfs4_handle_exception(server,
6552 _nfs4_proc_getdeviceinfo(server, pdev), 6980 _nfs4_proc_getdeviceinfo(server, pdev, cred),
6553 &exception); 6981 &exception);
6554 } while (exception.retry); 6982 } while (exception.retry);
6555 return err; 6983 return err;
@@ -6733,7 +7161,9 @@ out:
6733 return err; 7161 return err;
6734} 7162}
6735 7163
6736static int _nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid) 7164static int _nfs41_test_stateid(struct nfs_server *server,
7165 nfs4_stateid *stateid,
7166 struct rpc_cred *cred)
6737{ 7167{
6738 int status; 7168 int status;
6739 struct nfs41_test_stateid_args args = { 7169 struct nfs41_test_stateid_args args = {
@@ -6744,6 +7174,7 @@ static int _nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid)
6744 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_TEST_STATEID], 7174 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_TEST_STATEID],
6745 .rpc_argp = &args, 7175 .rpc_argp = &args,
6746 .rpc_resp = &res, 7176 .rpc_resp = &res,
7177 .rpc_cred = cred,
6747 }; 7178 };
6748 7179
6749 dprintk("NFS call test_stateid %p\n", stateid); 7180 dprintk("NFS call test_stateid %p\n", stateid);
@@ -6764,17 +7195,20 @@ static int _nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid)
6764 * 7195 *
6765 * @server: server / transport on which to perform the operation 7196 * @server: server / transport on which to perform the operation
6766 * @stateid: state ID to test 7197 * @stateid: state ID to test
7198 * @cred: credential
6767 * 7199 *
6768 * Returns NFS_OK if the server recognizes that "stateid" is valid. 7200 * Returns NFS_OK if the server recognizes that "stateid" is valid.
6769 * Otherwise a negative NFS4ERR value is returned if the operation 7201 * Otherwise a negative NFS4ERR value is returned if the operation
6770 * failed or the state ID is not currently valid. 7202 * failed or the state ID is not currently valid.
6771 */ 7203 */
6772static int nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid) 7204static int nfs41_test_stateid(struct nfs_server *server,
7205 nfs4_stateid *stateid,
7206 struct rpc_cred *cred)
6773{ 7207{
6774 struct nfs4_exception exception = { }; 7208 struct nfs4_exception exception = { };
6775 int err; 7209 int err;
6776 do { 7210 do {
6777 err = _nfs41_test_stateid(server, stateid); 7211 err = _nfs41_test_stateid(server, stateid, cred);
6778 if (err != -NFS4ERR_DELAY) 7212 if (err != -NFS4ERR_DELAY)
6779 break; 7213 break;
6780 nfs4_handle_exception(server, err, &exception); 7214 nfs4_handle_exception(server, err, &exception);
@@ -6823,10 +7257,12 @@ const struct rpc_call_ops nfs41_free_stateid_ops = {
6823 7257
6824static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server, 7258static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server,
6825 nfs4_stateid *stateid, 7259 nfs4_stateid *stateid,
7260 struct rpc_cred *cred,
6826 bool privileged) 7261 bool privileged)
6827{ 7262{
6828 struct rpc_message msg = { 7263 struct rpc_message msg = {
6829 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FREE_STATEID], 7264 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FREE_STATEID],
7265 .rpc_cred = cred,
6830 }; 7266 };
6831 struct rpc_task_setup task_setup = { 7267 struct rpc_task_setup task_setup = {
6832 .rpc_client = server->client, 7268 .rpc_client = server->client,
@@ -6859,16 +7295,19 @@ static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server,
6859 * 7295 *
6860 * @server: server / transport on which to perform the operation 7296 * @server: server / transport on which to perform the operation
6861 * @stateid: state ID to release 7297 * @stateid: state ID to release
7298 * @cred: credential
6862 * 7299 *
6863 * Returns NFS_OK if the server freed "stateid". Otherwise a 7300 * Returns NFS_OK if the server freed "stateid". Otherwise a
6864 * negative NFS4ERR value is returned. 7301 * negative NFS4ERR value is returned.
6865 */ 7302 */
6866static int nfs41_free_stateid(struct nfs_server *server, nfs4_stateid *stateid) 7303static int nfs41_free_stateid(struct nfs_server *server,
7304 nfs4_stateid *stateid,
7305 struct rpc_cred *cred)
6867{ 7306{
6868 struct rpc_task *task; 7307 struct rpc_task *task;
6869 int ret; 7308 int ret;
6870 7309
6871 task = _nfs41_free_stateid(server, stateid, true); 7310 task = _nfs41_free_stateid(server, stateid, cred, true);
6872 if (IS_ERR(task)) 7311 if (IS_ERR(task))
6873 return PTR_ERR(task); 7312 return PTR_ERR(task);
6874 ret = rpc_wait_for_completion_task(task); 7313 ret = rpc_wait_for_completion_task(task);
@@ -6881,8 +7320,9 @@ static int nfs41_free_stateid(struct nfs_server *server, nfs4_stateid *stateid)
6881static int nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp) 7320static int nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
6882{ 7321{
6883 struct rpc_task *task; 7322 struct rpc_task *task;
7323 struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
6884 7324
6885 task = _nfs41_free_stateid(server, &lsp->ls_stateid, false); 7325 task = _nfs41_free_stateid(server, &lsp->ls_stateid, cred, false);
6886 nfs4_free_lock_state(server, lsp); 7326 nfs4_free_lock_state(server, lsp);
6887 if (IS_ERR(task)) 7327 if (IS_ERR(task))
6888 return PTR_ERR(task); 7328 return PTR_ERR(task);
@@ -7004,11 +7444,33 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
7004}; 7444};
7005#endif 7445#endif
7006 7446
7447#if defined(CONFIG_NFS_V4_2)
7448static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
7449 .minor_version = 2,
7450 .init_caps = NFS_CAP_READDIRPLUS
7451 | NFS_CAP_ATOMIC_OPEN
7452 | NFS_CAP_CHANGE_ATTR
7453 | NFS_CAP_POSIX_LOCK
7454 | NFS_CAP_STATEID_NFSV41
7455 | NFS_CAP_ATOMIC_OPEN_V1,
7456 .call_sync = nfs4_call_sync_sequence,
7457 .match_stateid = nfs41_match_stateid,
7458 .find_root_sec = nfs41_find_root_sec,
7459 .free_lock_state = nfs41_free_lock_state,
7460 .reboot_recovery_ops = &nfs41_reboot_recovery_ops,
7461 .nograce_recovery_ops = &nfs41_nograce_recovery_ops,
7462 .state_renewal_ops = &nfs41_state_renewal_ops,
7463};
7464#endif
7465
7007const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = { 7466const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = {
7008 [0] = &nfs_v4_0_minor_ops, 7467 [0] = &nfs_v4_0_minor_ops,
7009#if defined(CONFIG_NFS_V4_1) 7468#if defined(CONFIG_NFS_V4_1)
7010 [1] = &nfs_v4_1_minor_ops, 7469 [1] = &nfs_v4_1_minor_ops,
7011#endif 7470#endif
7471#if defined(CONFIG_NFS_V4_2)
7472 [2] = &nfs_v4_2_minor_ops,
7473#endif
7012}; 7474};
7013 7475
7014const struct inode_operations nfs4_dir_inode_operations = { 7476const struct inode_operations nfs4_dir_inode_operations = {
@@ -7108,6 +7570,9 @@ static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = {
7108 7570
7109const struct xattr_handler *nfs4_xattr_handlers[] = { 7571const struct xattr_handler *nfs4_xattr_handlers[] = {
7110 &nfs4_xattr_nfs4_acl_handler, 7572 &nfs4_xattr_nfs4_acl_handler,
7573#ifdef CONFIG_NFS_V4_SECURITY_LABEL
7574 &nfs4_xattr_nfs4_label_handler,
7575#endif
7111 NULL 7576 NULL
7112}; 7577};
7113 7578
diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c
index c4e225e4a9af..36e21cb29d65 100644
--- a/fs/nfs/nfs4session.c
+++ b/fs/nfs/nfs4session.c
@@ -478,48 +478,12 @@ static int nfs41_check_session_ready(struct nfs_client *clp)
478 return 0; 478 return 0;
479} 479}
480 480
481int nfs4_init_session(struct nfs_server *server) 481int nfs4_init_session(struct nfs_client *clp)
482{ 482{
483 struct nfs_client *clp = server->nfs_client;
484 struct nfs4_session *session;
485 unsigned int target_max_rqst_sz = NFS_MAX_FILE_IO_SIZE;
486 unsigned int target_max_resp_sz = NFS_MAX_FILE_IO_SIZE;
487
488 if (!nfs4_has_session(clp)) 483 if (!nfs4_has_session(clp))
489 return 0; 484 return 0;
490 485
491 if (server->rsize != 0) 486 clear_bit(NFS4_SESSION_INITING, &clp->cl_session->session_state);
492 target_max_resp_sz = server->rsize;
493 target_max_resp_sz += nfs41_maxread_overhead;
494
495 if (server->wsize != 0)
496 target_max_rqst_sz = server->wsize;
497 target_max_rqst_sz += nfs41_maxwrite_overhead;
498
499 session = clp->cl_session;
500 spin_lock(&clp->cl_lock);
501 if (test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) {
502 /* Initialise targets and channel attributes */
503 session->fc_target_max_rqst_sz = target_max_rqst_sz;
504 session->fc_attrs.max_rqst_sz = target_max_rqst_sz;
505 session->fc_target_max_resp_sz = target_max_resp_sz;
506 session->fc_attrs.max_resp_sz = target_max_resp_sz;
507 } else {
508 /* Just adjust the targets */
509 if (target_max_rqst_sz > session->fc_target_max_rqst_sz) {
510 session->fc_target_max_rqst_sz = target_max_rqst_sz;
511 set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
512 }
513 if (target_max_resp_sz > session->fc_target_max_resp_sz) {
514 session->fc_target_max_resp_sz = target_max_resp_sz;
515 set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
516 }
517 }
518 spin_unlock(&clp->cl_lock);
519
520 if (test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state))
521 nfs4_schedule_lease_recovery(clp);
522
523 return nfs41_check_session_ready(clp); 487 return nfs41_check_session_ready(clp);
524} 488}
525 489
diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h
index ff7d9f0f8a65..3a153d82b90c 100644
--- a/fs/nfs/nfs4session.h
+++ b/fs/nfs/nfs4session.h
@@ -66,9 +66,6 @@ struct nfs4_session {
66 struct nfs4_channel_attrs bc_attrs; 66 struct nfs4_channel_attrs bc_attrs;
67 struct nfs4_slot_table bc_slot_table; 67 struct nfs4_slot_table bc_slot_table;
68 struct nfs_client *clp; 68 struct nfs_client *clp;
69 /* Create session arguments */
70 unsigned int fc_target_max_rqst_sz;
71 unsigned int fc_target_max_resp_sz;
72}; 69};
73 70
74enum nfs4_session_state { 71enum nfs4_session_state {
@@ -89,7 +86,7 @@ extern int nfs4_setup_session_slot_tables(struct nfs4_session *ses);
89 86
90extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp); 87extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp);
91extern void nfs4_destroy_session(struct nfs4_session *session); 88extern void nfs4_destroy_session(struct nfs4_session *session);
92extern int nfs4_init_session(struct nfs_server *server); 89extern int nfs4_init_session(struct nfs_client *clp);
93extern int nfs4_init_ds_session(struct nfs_client *, unsigned long); 90extern int nfs4_init_ds_session(struct nfs_client *, unsigned long);
94 91
95extern void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl); 92extern void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl);
@@ -122,7 +119,7 @@ static inline int nfs4_has_persistent_session(const struct nfs_client *clp)
122 119
123#else /* defined(CONFIG_NFS_V4_1) */ 120#else /* defined(CONFIG_NFS_V4_1) */
124 121
125static inline int nfs4_init_session(struct nfs_server *server) 122static inline int nfs4_init_session(struct nfs_client *clp)
126{ 123{
127 return 0; 124 return 0;
128} 125}
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 1fab140764c4..e22862f13564 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -228,19 +228,8 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp)
228 return status; 228 return status;
229} 229}
230 230
231/* 231static void nfs4_end_drain_slot_table(struct nfs4_slot_table *tbl)
232 * Back channel returns NFS4ERR_DELAY for new requests when
233 * NFS4_SESSION_DRAINING is set so there is no work to be done when draining
234 * is ended.
235 */
236static void nfs4_end_drain_session(struct nfs_client *clp)
237{ 232{
238 struct nfs4_session *ses = clp->cl_session;
239 struct nfs4_slot_table *tbl;
240
241 if (ses == NULL)
242 return;
243 tbl = &ses->fc_slot_table;
244 if (test_and_clear_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) { 233 if (test_and_clear_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) {
245 spin_lock(&tbl->slot_tbl_lock); 234 spin_lock(&tbl->slot_tbl_lock);
246 nfs41_wake_slot_table(tbl); 235 nfs41_wake_slot_table(tbl);
@@ -248,6 +237,16 @@ static void nfs4_end_drain_session(struct nfs_client *clp)
248 } 237 }
249} 238}
250 239
240static void nfs4_end_drain_session(struct nfs_client *clp)
241{
242 struct nfs4_session *ses = clp->cl_session;
243
244 if (ses != NULL) {
245 nfs4_end_drain_slot_table(&ses->bc_slot_table);
246 nfs4_end_drain_slot_table(&ses->fc_slot_table);
247 }
248}
249
251/* 250/*
252 * Signal state manager thread if session fore channel is drained 251 * Signal state manager thread if session fore channel is drained
253 */ 252 */
@@ -1194,7 +1193,7 @@ void nfs4_schedule_state_manager(struct nfs_client *clp)
1194 snprintf(buf, sizeof(buf), "%s-manager", 1193 snprintf(buf, sizeof(buf), "%s-manager",
1195 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); 1194 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR));
1196 rcu_read_unlock(); 1195 rcu_read_unlock();
1197 task = kthread_run(nfs4_run_state_manager, clp, buf); 1196 task = kthread_run(nfs4_run_state_manager, clp, "%s", buf);
1198 if (IS_ERR(task)) { 1197 if (IS_ERR(task)) {
1199 printk(KERN_ERR "%s: kthread_run: %ld\n", 1198 printk(KERN_ERR "%s: kthread_run: %ld\n",
1200 __func__, PTR_ERR(task)); 1199 __func__, PTR_ERR(task));
@@ -1373,13 +1372,13 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
1373 /* Guard against delegation returns and new lock/unlock calls */ 1372 /* Guard against delegation returns and new lock/unlock calls */
1374 down_write(&nfsi->rwsem); 1373 down_write(&nfsi->rwsem);
1375 /* Protect inode->i_flock using the BKL */ 1374 /* Protect inode->i_flock using the BKL */
1376 lock_flocks(); 1375 spin_lock(&inode->i_lock);
1377 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { 1376 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
1378 if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK))) 1377 if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
1379 continue; 1378 continue;
1380 if (nfs_file_open_context(fl->fl_file)->state != state) 1379 if (nfs_file_open_context(fl->fl_file)->state != state)
1381 continue; 1380 continue;
1382 unlock_flocks(); 1381 spin_unlock(&inode->i_lock);
1383 status = ops->recover_lock(state, fl); 1382 status = ops->recover_lock(state, fl);
1384 switch (status) { 1383 switch (status) {
1385 case 0: 1384 case 0:
@@ -1406,9 +1405,9 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
1406 /* kill_proc(fl->fl_pid, SIGLOST, 1); */ 1405 /* kill_proc(fl->fl_pid, SIGLOST, 1); */
1407 status = 0; 1406 status = 0;
1408 } 1407 }
1409 lock_flocks(); 1408 spin_lock(&inode->i_lock);
1410 } 1409 }
1411 unlock_flocks(); 1410 spin_unlock(&inode->i_lock);
1412out: 1411out:
1413 up_write(&nfsi->rwsem); 1412 up_write(&nfsi->rwsem);
1414 return status; 1413 return status;
@@ -1563,11 +1562,12 @@ static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp)
1563} 1562}
1564 1563
1565static void nfs4_reclaim_complete(struct nfs_client *clp, 1564static void nfs4_reclaim_complete(struct nfs_client *clp,
1566 const struct nfs4_state_recovery_ops *ops) 1565 const struct nfs4_state_recovery_ops *ops,
1566 struct rpc_cred *cred)
1567{ 1567{
1568 /* Notify the server we're done reclaiming our state */ 1568 /* Notify the server we're done reclaiming our state */
1569 if (ops->reclaim_complete) 1569 if (ops->reclaim_complete)
1570 (void)ops->reclaim_complete(clp); 1570 (void)ops->reclaim_complete(clp, cred);
1571} 1571}
1572 1572
1573static void nfs4_clear_reclaim_server(struct nfs_server *server) 1573static void nfs4_clear_reclaim_server(struct nfs_server *server)
@@ -1612,9 +1612,15 @@ static int nfs4_state_clear_reclaim_reboot(struct nfs_client *clp)
1612 1612
1613static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp) 1613static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
1614{ 1614{
1615 const struct nfs4_state_recovery_ops *ops;
1616 struct rpc_cred *cred;
1617
1615 if (!nfs4_state_clear_reclaim_reboot(clp)) 1618 if (!nfs4_state_clear_reclaim_reboot(clp))
1616 return; 1619 return;
1617 nfs4_reclaim_complete(clp, clp->cl_mvops->reboot_recovery_ops); 1620 ops = clp->cl_mvops->reboot_recovery_ops;
1621 cred = ops->get_clid_cred(clp);
1622 nfs4_reclaim_complete(clp, ops, cred);
1623 put_rpccred(cred);
1618} 1624}
1619 1625
1620static void nfs_delegation_clear_all(struct nfs_client *clp) 1626static void nfs_delegation_clear_all(struct nfs_client *clp)
diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c
index a5e1a3026d48..5dbe2d269210 100644
--- a/fs/nfs/nfs4super.c
+++ b/fs/nfs/nfs4super.c
@@ -9,6 +9,7 @@
9#include "delegation.h" 9#include "delegation.h"
10#include "internal.h" 10#include "internal.h"
11#include "nfs4_fs.h" 11#include "nfs4_fs.h"
12#include "dns_resolve.h"
12#include "pnfs.h" 13#include "pnfs.h"
13#include "nfs.h" 14#include "nfs.h"
14 15
@@ -331,18 +332,24 @@ static int __init init_nfs_v4(void)
331{ 332{
332 int err; 333 int err;
333 334
334 err = nfs_idmap_init(); 335 err = nfs_dns_resolver_init();
335 if (err) 336 if (err)
336 goto out; 337 goto out;
337 338
338 err = nfs4_register_sysctl(); 339 err = nfs_idmap_init();
339 if (err) 340 if (err)
340 goto out1; 341 goto out1;
341 342
343 err = nfs4_register_sysctl();
344 if (err)
345 goto out2;
346
342 register_nfs_version(&nfs_v4); 347 register_nfs_version(&nfs_v4);
343 return 0; 348 return 0;
344out1: 349out2:
345 nfs_idmap_quit(); 350 nfs_idmap_quit();
351out1:
352 nfs_dns_resolver_destroy();
346out: 353out:
347 return err; 354 return err;
348} 355}
@@ -352,6 +359,7 @@ static void __exit exit_nfs_v4(void)
352 unregister_nfs_version(&nfs_v4); 359 unregister_nfs_version(&nfs_v4);
353 nfs4_unregister_sysctl(); 360 nfs4_unregister_sysctl();
354 nfs_idmap_quit(); 361 nfs_idmap_quit();
362 nfs_dns_resolver_destroy();
355} 363}
356 364
357MODULE_LICENSE("GPL"); 365MODULE_LICENSE("GPL");
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 4be8d135ed61..0abfb8466e79 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -102,12 +102,23 @@ static int nfs4_stat_to_errno(int);
102#define nfs4_path_maxsz (1 + ((3 + NFS4_MAXPATHLEN) >> 2)) 102#define nfs4_path_maxsz (1 + ((3 + NFS4_MAXPATHLEN) >> 2))
103#define nfs4_owner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) 103#define nfs4_owner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ))
104#define nfs4_group_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) 104#define nfs4_group_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ))
105#ifdef CONFIG_NFS_V4_SECURITY_LABEL
106/* PI(4 bytes) + LFS(4 bytes) + 1(for null terminator?) + MAXLABELLEN */
107#define nfs4_label_maxsz (4 + 4 + 1 + XDR_QUADLEN(NFS4_MAXLABELLEN))
108#define encode_readdir_space 24
109#define encode_readdir_bitmask_sz 3
110#else
111#define nfs4_label_maxsz 0
112#define encode_readdir_space 20
113#define encode_readdir_bitmask_sz 2
114#endif
105/* We support only one layout type per file system */ 115/* We support only one layout type per file system */
106#define decode_mdsthreshold_maxsz (1 + 1 + nfs4_fattr_bitmap_maxsz + 1 + 8) 116#define decode_mdsthreshold_maxsz (1 + 1 + nfs4_fattr_bitmap_maxsz + 1 + 8)
107/* This is based on getfattr, which uses the most attributes: */ 117/* This is based on getfattr, which uses the most attributes: */
108#define nfs4_fattr_value_maxsz (1 + (1 + 2 + 2 + 4 + 2 + 1 + 1 + 2 + 2 + \ 118#define nfs4_fattr_value_maxsz (1 + (1 + 2 + 2 + 4 + 2 + 1 + 1 + 2 + 2 + \
109 3 + 3 + 3 + nfs4_owner_maxsz + \ 119 3 + 3 + 3 + nfs4_owner_maxsz + \
110 nfs4_group_maxsz + decode_mdsthreshold_maxsz)) 120 nfs4_group_maxsz + nfs4_label_maxsz + \
121 decode_mdsthreshold_maxsz))
111#define nfs4_fattr_maxsz (nfs4_fattr_bitmap_maxsz + \ 122#define nfs4_fattr_maxsz (nfs4_fattr_bitmap_maxsz + \
112 nfs4_fattr_value_maxsz) 123 nfs4_fattr_value_maxsz)
113#define decode_getattr_maxsz (op_decode_hdr_maxsz + nfs4_fattr_maxsz) 124#define decode_getattr_maxsz (op_decode_hdr_maxsz + nfs4_fattr_maxsz)
@@ -115,6 +126,7 @@ static int nfs4_stat_to_errno(int);
115 1 + 2 + 1 + \ 126 1 + 2 + 1 + \
116 nfs4_owner_maxsz + \ 127 nfs4_owner_maxsz + \
117 nfs4_group_maxsz + \ 128 nfs4_group_maxsz + \
129 nfs4_label_maxsz + \
118 4 + 4) 130 4 + 4)
119#define encode_savefh_maxsz (op_encode_hdr_maxsz) 131#define encode_savefh_maxsz (op_encode_hdr_maxsz)
120#define decode_savefh_maxsz (op_decode_hdr_maxsz) 132#define decode_savefh_maxsz (op_decode_hdr_maxsz)
@@ -192,9 +204,11 @@ static int nfs4_stat_to_errno(int);
192 encode_stateid_maxsz + 3) 204 encode_stateid_maxsz + 3)
193#define decode_read_maxsz (op_decode_hdr_maxsz + 2) 205#define decode_read_maxsz (op_decode_hdr_maxsz + 2)
194#define encode_readdir_maxsz (op_encode_hdr_maxsz + \ 206#define encode_readdir_maxsz (op_encode_hdr_maxsz + \
195 2 + encode_verifier_maxsz + 5) 207 2 + encode_verifier_maxsz + 5 + \
208 nfs4_label_maxsz)
196#define decode_readdir_maxsz (op_decode_hdr_maxsz + \ 209#define decode_readdir_maxsz (op_decode_hdr_maxsz + \
197 decode_verifier_maxsz) 210 decode_verifier_maxsz + \
211 nfs4_label_maxsz + nfs4_fattr_maxsz)
198#define encode_readlink_maxsz (op_encode_hdr_maxsz) 212#define encode_readlink_maxsz (op_encode_hdr_maxsz)
199#define decode_readlink_maxsz (op_decode_hdr_maxsz + 1) 213#define decode_readlink_maxsz (op_decode_hdr_maxsz + 1)
200#define encode_write_maxsz (op_encode_hdr_maxsz + \ 214#define encode_write_maxsz (op_encode_hdr_maxsz + \
@@ -853,6 +867,12 @@ const u32 nfs41_maxread_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
853 decode_sequence_maxsz + 867 decode_sequence_maxsz +
854 decode_putfh_maxsz) * 868 decode_putfh_maxsz) *
855 XDR_UNIT); 869 XDR_UNIT);
870
871const u32 nfs41_maxgetdevinfo_overhead = ((RPC_MAX_REPHEADER_WITH_AUTH +
872 compound_decode_hdr_maxsz +
873 decode_sequence_maxsz) *
874 XDR_UNIT);
875EXPORT_SYMBOL_GPL(nfs41_maxgetdevinfo_overhead);
856#endif /* CONFIG_NFS_V4_1 */ 876#endif /* CONFIG_NFS_V4_1 */
857 877
858static const umode_t nfs_type2fmt[] = { 878static const umode_t nfs_type2fmt[] = {
@@ -968,7 +988,9 @@ static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *ve
968 encode_opaque_fixed(xdr, verf->data, NFS4_VERIFIER_SIZE); 988 encode_opaque_fixed(xdr, verf->data, NFS4_VERIFIER_SIZE);
969} 989}
970 990
971static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const struct nfs_server *server) 991static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
992 const struct nfs4_label *label,
993 const struct nfs_server *server)
972{ 994{
973 char owner_name[IDMAP_NAMESZ]; 995 char owner_name[IDMAP_NAMESZ];
974 char owner_group[IDMAP_NAMESZ]; 996 char owner_group[IDMAP_NAMESZ];
@@ -979,15 +1001,16 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
979 int len; 1001 int len;
980 uint32_t bmval0 = 0; 1002 uint32_t bmval0 = 0;
981 uint32_t bmval1 = 0; 1003 uint32_t bmval1 = 0;
1004 uint32_t bmval2 = 0;
982 1005
983 /* 1006 /*
984 * We reserve enough space to write the entire attribute buffer at once. 1007 * We reserve enough space to write the entire attribute buffer at once.
985 * In the worst-case, this would be 1008 * In the worst-case, this would be
986 * 12(bitmap) + 4(attrlen) + 8(size) + 4(mode) + 4(atime) + 4(mtime) 1009 * 16(bitmap) + 4(attrlen) + 8(size) + 4(mode) + 4(atime) + 4(mtime)
987 * = 36 bytes, plus any contribution from variable-length fields 1010 * = 40 bytes, plus any contribution from variable-length fields
988 * such as owner/group. 1011 * such as owner/group.
989 */ 1012 */
990 len = 16; 1013 len = 20;
991 1014
992 /* Sigh */ 1015 /* Sigh */
993 if (iap->ia_valid & ATTR_SIZE) 1016 if (iap->ia_valid & ATTR_SIZE)
@@ -1017,6 +1040,8 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
1017 } 1040 }
1018 len += 4 + (XDR_QUADLEN(owner_grouplen) << 2); 1041 len += 4 + (XDR_QUADLEN(owner_grouplen) << 2);
1019 } 1042 }
1043 if (label)
1044 len += 4 + 4 + 4 + (XDR_QUADLEN(label->len) << 2);
1020 if (iap->ia_valid & ATTR_ATIME_SET) 1045 if (iap->ia_valid & ATTR_ATIME_SET)
1021 len += 16; 1046 len += 16;
1022 else if (iap->ia_valid & ATTR_ATIME) 1047 else if (iap->ia_valid & ATTR_ATIME)
@@ -1031,9 +1056,9 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
1031 * We write the bitmap length now, but leave the bitmap and the attribute 1056 * We write the bitmap length now, but leave the bitmap and the attribute
1032 * buffer length to be backfilled at the end of this routine. 1057 * buffer length to be backfilled at the end of this routine.
1033 */ 1058 */
1034 *p++ = cpu_to_be32(2); 1059 *p++ = cpu_to_be32(3);
1035 q = p; 1060 q = p;
1036 p += 3; 1061 p += 4;
1037 1062
1038 if (iap->ia_valid & ATTR_SIZE) { 1063 if (iap->ia_valid & ATTR_SIZE) {
1039 bmval0 |= FATTR4_WORD0_SIZE; 1064 bmval0 |= FATTR4_WORD0_SIZE;
@@ -1071,6 +1096,13 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
1071 bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET; 1096 bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET;
1072 *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME); 1097 *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME);
1073 } 1098 }
1099 if (label) {
1100 bmval2 |= FATTR4_WORD2_SECURITY_LABEL;
1101 *p++ = cpu_to_be32(label->lfs);
1102 *p++ = cpu_to_be32(label->pi);
1103 *p++ = cpu_to_be32(label->len);
1104 p = xdr_encode_opaque_fixed(p, label->label, label->len);
1105 }
1074 1106
1075 /* 1107 /*
1076 * Now we backfill the bitmap and the attribute buffer length. 1108 * Now we backfill the bitmap and the attribute buffer length.
@@ -1080,9 +1112,10 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
1080 len, ((char *)p - (char *)q) + 4); 1112 len, ((char *)p - (char *)q) + 4);
1081 BUG(); 1113 BUG();
1082 } 1114 }
1083 len = (char *)p - (char *)q - 12; 1115 len = (char *)p - (char *)q - 16;
1084 *q++ = htonl(bmval0); 1116 *q++ = htonl(bmval0);
1085 *q++ = htonl(bmval1); 1117 *q++ = htonl(bmval1);
1118 *q++ = htonl(bmval2);
1086 *q = htonl(len); 1119 *q = htonl(len);
1087 1120
1088/* out: */ 1121/* out: */
@@ -1136,7 +1169,7 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
1136 } 1169 }
1137 1170
1138 encode_string(xdr, create->name->len, create->name->name); 1171 encode_string(xdr, create->name->len, create->name->name);
1139 encode_attrs(xdr, create->attrs, create->server); 1172 encode_attrs(xdr, create->attrs, create->label, create->server);
1140} 1173}
1141 1174
1142static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct compound_hdr *hdr) 1175static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct compound_hdr *hdr)
@@ -1188,8 +1221,10 @@ encode_getattr_three(struct xdr_stream *xdr,
1188 1221
1189static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) 1222static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
1190{ 1223{
1191 encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0], 1224 encode_getattr_three(xdr, bitmask[0] & nfs4_fattr_bitmap[0],
1192 bitmask[1] & nfs4_fattr_bitmap[1], hdr); 1225 bitmask[1] & nfs4_fattr_bitmap[1],
1226 bitmask[2] & nfs4_fattr_bitmap[2],
1227 hdr);
1193} 1228}
1194 1229
1195static void encode_getfattr_open(struct xdr_stream *xdr, const u32 *bitmask, 1230static void encode_getfattr_open(struct xdr_stream *xdr, const u32 *bitmask,
@@ -1367,11 +1402,11 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
1367 switch(arg->createmode) { 1402 switch(arg->createmode) {
1368 case NFS4_CREATE_UNCHECKED: 1403 case NFS4_CREATE_UNCHECKED:
1369 *p = cpu_to_be32(NFS4_CREATE_UNCHECKED); 1404 *p = cpu_to_be32(NFS4_CREATE_UNCHECKED);
1370 encode_attrs(xdr, arg->u.attrs, arg->server); 1405 encode_attrs(xdr, arg->u.attrs, arg->label, arg->server);
1371 break; 1406 break;
1372 case NFS4_CREATE_GUARDED: 1407 case NFS4_CREATE_GUARDED:
1373 *p = cpu_to_be32(NFS4_CREATE_GUARDED); 1408 *p = cpu_to_be32(NFS4_CREATE_GUARDED);
1374 encode_attrs(xdr, arg->u.attrs, arg->server); 1409 encode_attrs(xdr, arg->u.attrs, arg->label, arg->server);
1375 break; 1410 break;
1376 case NFS4_CREATE_EXCLUSIVE: 1411 case NFS4_CREATE_EXCLUSIVE:
1377 *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE); 1412 *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE);
@@ -1381,7 +1416,7 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
1381 *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1); 1416 *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1);
1382 encode_nfs4_verifier(xdr, &arg->u.verifier); 1417 encode_nfs4_verifier(xdr, &arg->u.verifier);
1383 dummy.ia_valid = 0; 1418 dummy.ia_valid = 0;
1384 encode_attrs(xdr, &dummy, arg->server); 1419 encode_attrs(xdr, &dummy, arg->label, arg->server);
1385 } 1420 }
1386} 1421}
1387 1422
@@ -1532,7 +1567,7 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args,
1532 1567
1533static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr) 1568static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr)
1534{ 1569{
1535 uint32_t attrs[2] = { 1570 uint32_t attrs[3] = {
1536 FATTR4_WORD0_RDATTR_ERROR, 1571 FATTR4_WORD0_RDATTR_ERROR,
1537 FATTR4_WORD1_MOUNTED_ON_FILEID, 1572 FATTR4_WORD1_MOUNTED_ON_FILEID,
1538 }; 1573 };
@@ -1555,20 +1590,26 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
1555 encode_op_hdr(xdr, OP_READDIR, decode_readdir_maxsz, hdr); 1590 encode_op_hdr(xdr, OP_READDIR, decode_readdir_maxsz, hdr);
1556 encode_uint64(xdr, readdir->cookie); 1591 encode_uint64(xdr, readdir->cookie);
1557 encode_nfs4_verifier(xdr, &readdir->verifier); 1592 encode_nfs4_verifier(xdr, &readdir->verifier);
1558 p = reserve_space(xdr, 20); 1593 p = reserve_space(xdr, encode_readdir_space);
1559 *p++ = cpu_to_be32(dircount); 1594 *p++ = cpu_to_be32(dircount);
1560 *p++ = cpu_to_be32(readdir->count); 1595 *p++ = cpu_to_be32(readdir->count);
1561 *p++ = cpu_to_be32(2); 1596 *p++ = cpu_to_be32(encode_readdir_bitmask_sz);
1562
1563 *p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]); 1597 *p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]);
1564 *p = cpu_to_be32(attrs[1] & readdir->bitmask[1]); 1598 *p = cpu_to_be32(attrs[1] & readdir->bitmask[1]);
1599 if (encode_readdir_bitmask_sz > 2) {
1600 if (hdr->minorversion > 1)
1601 attrs[2] |= FATTR4_WORD2_SECURITY_LABEL;
1602 p++, *p++ = cpu_to_be32(attrs[2] & readdir->bitmask[2]);
1603 }
1565 memcpy(verf, readdir->verifier.data, sizeof(verf)); 1604 memcpy(verf, readdir->verifier.data, sizeof(verf));
1566 dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n", 1605
1606 dprintk("%s: cookie = %llu, verifier = %08x:%08x, bitmap = %08x:%08x:%08x\n",
1567 __func__, 1607 __func__,
1568 (unsigned long long)readdir->cookie, 1608 (unsigned long long)readdir->cookie,
1569 verf[0], verf[1], 1609 verf[0], verf[1],
1570 attrs[0] & readdir->bitmask[0], 1610 attrs[0] & readdir->bitmask[0],
1571 attrs[1] & readdir->bitmask[1]); 1611 attrs[1] & readdir->bitmask[1],
1612 attrs[2] & readdir->bitmask[2]);
1572} 1613}
1573 1614
1574static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *readlink, struct rpc_rqst *req, struct compound_hdr *hdr) 1615static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *readlink, struct rpc_rqst *req, struct compound_hdr *hdr)
@@ -1627,7 +1668,7 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs
1627{ 1668{
1628 encode_op_hdr(xdr, OP_SETATTR, decode_setattr_maxsz, hdr); 1669 encode_op_hdr(xdr, OP_SETATTR, decode_setattr_maxsz, hdr);
1629 encode_nfs4_stateid(xdr, &arg->stateid); 1670 encode_nfs4_stateid(xdr, &arg->stateid);
1630 encode_attrs(xdr, arg->iap, server); 1671 encode_attrs(xdr, arg->iap, arg->label, server);
1631} 1672}
1632 1673
1633static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid, struct compound_hdr *hdr) 1674static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid, struct compound_hdr *hdr)
@@ -1889,7 +1930,7 @@ encode_getdeviceinfo(struct xdr_stream *xdr,
1889 p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data, 1930 p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data,
1890 NFS4_DEVICEID4_SIZE); 1931 NFS4_DEVICEID4_SIZE);
1891 *p++ = cpu_to_be32(args->pdev->layout_type); 1932 *p++ = cpu_to_be32(args->pdev->layout_type);
1892 *p++ = cpu_to_be32(args->pdev->pglen); /* gdia_maxcount */ 1933 *p++ = cpu_to_be32(args->pdev->maxcount); /* gdia_maxcount */
1893 *p++ = cpu_to_be32(0); /* bitmap length 0 */ 1934 *p++ = cpu_to_be32(0); /* bitmap length 0 */
1894} 1935}
1895 1936
@@ -4038,6 +4079,56 @@ static int decode_attr_time_delta(struct xdr_stream *xdr, uint32_t *bitmap,
4038 return status; 4079 return status;
4039} 4080}
4040 4081
4082static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap,
4083 struct nfs4_label *label)
4084{
4085 uint32_t pi = 0;
4086 uint32_t lfs = 0;
4087 __u32 len;
4088 __be32 *p;
4089 int status = 0;
4090
4091 if (unlikely(bitmap[2] & (FATTR4_WORD2_SECURITY_LABEL - 1U)))
4092 return -EIO;
4093 if (likely(bitmap[2] & FATTR4_WORD2_SECURITY_LABEL)) {
4094 p = xdr_inline_decode(xdr, 4);
4095 if (unlikely(!p))
4096 goto out_overflow;
4097 lfs = be32_to_cpup(p++);
4098 p = xdr_inline_decode(xdr, 4);
4099 if (unlikely(!p))
4100 goto out_overflow;
4101 pi = be32_to_cpup(p++);
4102 p = xdr_inline_decode(xdr, 4);
4103 if (unlikely(!p))
4104 goto out_overflow;
4105 len = be32_to_cpup(p++);
4106 p = xdr_inline_decode(xdr, len);
4107 if (unlikely(!p))
4108 goto out_overflow;
4109 if (len < NFS4_MAXLABELLEN) {
4110 if (label) {
4111 memcpy(label->label, p, len);
4112 label->len = len;
4113 label->pi = pi;
4114 label->lfs = lfs;
4115 status = NFS_ATTR_FATTR_V4_SECURITY_LABEL;
4116 }
4117 bitmap[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
4118 } else
4119 printk(KERN_WARNING "%s: label too long (%u)!\n",
4120 __func__, len);
4121 }
4122 if (label && label->label)
4123 dprintk("%s: label=%s, len=%d, PI=%d, LFS=%d\n", __func__,
4124 (char *)label->label, label->len, label->pi, label->lfs);
4125 return status;
4126
4127out_overflow:
4128 print_overflow_msg(__func__, xdr);
4129 return -EIO;
4130}
4131
4041static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time) 4132static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time)
4042{ 4133{
4043 int status = 0; 4134 int status = 0;
@@ -4380,7 +4471,7 @@ out_overflow:
4380 4471
4381static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap, 4472static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
4382 struct nfs_fattr *fattr, struct nfs_fh *fh, 4473 struct nfs_fattr *fattr, struct nfs_fh *fh,
4383 struct nfs4_fs_locations *fs_loc, 4474 struct nfs4_fs_locations *fs_loc, struct nfs4_label *label,
4384 const struct nfs_server *server) 4475 const struct nfs_server *server)
4385{ 4476{
4386 int status; 4477 int status;
@@ -4488,6 +4579,13 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
4488 if (status < 0) 4579 if (status < 0)
4489 goto xdr_error; 4580 goto xdr_error;
4490 4581
4582 if (label) {
4583 status = decode_attr_security_label(xdr, bitmap, label);
4584 if (status < 0)
4585 goto xdr_error;
4586 fattr->valid |= status;
4587 }
4588
4491xdr_error: 4589xdr_error:
4492 dprintk("%s: xdr returned %d\n", __func__, -status); 4590 dprintk("%s: xdr returned %d\n", __func__, -status);
4493 return status; 4591 return status;
@@ -4495,7 +4593,7 @@ xdr_error:
4495 4593
4496static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fattr, 4594static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fattr,
4497 struct nfs_fh *fh, struct nfs4_fs_locations *fs_loc, 4595 struct nfs_fh *fh, struct nfs4_fs_locations *fs_loc,
4498 const struct nfs_server *server) 4596 struct nfs4_label *label, const struct nfs_server *server)
4499{ 4597{
4500 unsigned int savep; 4598 unsigned int savep;
4501 uint32_t attrlen, 4599 uint32_t attrlen,
@@ -4514,7 +4612,8 @@ static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fat
4514 if (status < 0) 4612 if (status < 0)
4515 goto xdr_error; 4613 goto xdr_error;
4516 4614
4517 status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, fs_loc, server); 4615 status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, fs_loc,
4616 label, server);
4518 if (status < 0) 4617 if (status < 0)
4519 goto xdr_error; 4618 goto xdr_error;
4520 4619
@@ -4524,10 +4623,16 @@ xdr_error:
4524 return status; 4623 return status;
4525} 4624}
4526 4625
4626static int decode_getfattr_label(struct xdr_stream *xdr, struct nfs_fattr *fattr,
4627 struct nfs4_label *label, const struct nfs_server *server)
4628{
4629 return decode_getfattr_generic(xdr, fattr, NULL, NULL, label, server);
4630}
4631
4527static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, 4632static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
4528 const struct nfs_server *server) 4633 const struct nfs_server *server)
4529{ 4634{
4530 return decode_getfattr_generic(xdr, fattr, NULL, NULL, server); 4635 return decode_getfattr_generic(xdr, fattr, NULL, NULL, NULL, server);
4531} 4636}
4532 4637
4533/* 4638/*
@@ -5919,7 +6024,7 @@ static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5919 status = decode_getfh(xdr, res->fh); 6024 status = decode_getfh(xdr, res->fh);
5920 if (status) 6025 if (status)
5921 goto out; 6026 goto out;
5922 status = decode_getfattr(xdr, res->fattr, res->server); 6027 status = decode_getfattr_label(xdr, res->fattr, res->label, res->server);
5923out: 6028out:
5924 return status; 6029 return status;
5925} 6030}
@@ -5945,7 +6050,8 @@ static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp,
5945 goto out; 6050 goto out;
5946 status = decode_getfh(xdr, res->fh); 6051 status = decode_getfh(xdr, res->fh);
5947 if (status == 0) 6052 if (status == 0)
5948 status = decode_getfattr(xdr, res->fattr, res->server); 6053 status = decode_getfattr_label(xdr, res->fattr,
6054 res->label, res->server);
5949out: 6055out:
5950 return status; 6056 return status;
5951} 6057}
@@ -6036,7 +6142,7 @@ static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
6036 status = decode_restorefh(xdr); 6142 status = decode_restorefh(xdr);
6037 if (status) 6143 if (status)
6038 goto out; 6144 goto out;
6039 decode_getfattr(xdr, res->fattr, res->server); 6145 decode_getfattr_label(xdr, res->fattr, res->label, res->server);
6040out: 6146out:
6041 return status; 6147 return status;
6042} 6148}
@@ -6065,7 +6171,7 @@ static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
6065 status = decode_getfh(xdr, res->fh); 6171 status = decode_getfh(xdr, res->fh);
6066 if (status) 6172 if (status)
6067 goto out; 6173 goto out;
6068 decode_getfattr(xdr, res->fattr, res->server); 6174 decode_getfattr_label(xdr, res->fattr, res->label, res->server);
6069out: 6175out:
6070 return status; 6176 return status;
6071} 6177}
@@ -6097,7 +6203,7 @@ static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
6097 status = decode_putfh(xdr); 6203 status = decode_putfh(xdr);
6098 if (status) 6204 if (status)
6099 goto out; 6205 goto out;
6100 status = decode_getfattr(xdr, res->fattr, res->server); 6206 status = decode_getfattr_label(xdr, res->fattr, res->label, res->server);
6101out: 6207out:
6102 return status; 6208 return status;
6103} 6209}
@@ -6230,7 +6336,7 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
6230 goto out; 6336 goto out;
6231 if (res->access_request) 6337 if (res->access_request)
6232 decode_access(xdr, &res->access_supported, &res->access_result); 6338 decode_access(xdr, &res->access_supported, &res->access_result);
6233 decode_getfattr(xdr, res->f_attr, res->server); 6339 decode_getfattr_label(xdr, res->f_attr, res->f_label, res->server);
6234out: 6340out:
6235 return status; 6341 return status;
6236} 6342}
@@ -6307,7 +6413,7 @@ static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp,
6307 status = decode_setattr(xdr); 6413 status = decode_setattr(xdr);
6308 if (status) 6414 if (status)
6309 goto out; 6415 goto out;
6310 decode_getfattr(xdr, res->fattr, res->server); 6416 decode_getfattr_label(xdr, res->fattr, res->label, res->server);
6311out: 6417out:
6312 return status; 6418 return status;
6313} 6419}
@@ -6696,7 +6802,7 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req,
6696 xdr_enter_page(xdr, PAGE_SIZE); 6802 xdr_enter_page(xdr, PAGE_SIZE);
6697 status = decode_getfattr_generic(xdr, &res->fs_locations->fattr, 6803 status = decode_getfattr_generic(xdr, &res->fs_locations->fattr,
6698 NULL, res->fs_locations, 6804 NULL, res->fs_locations,
6699 res->fs_locations->server); 6805 NULL, res->fs_locations->server);
6700out: 6806out:
6701 return status; 6807 return status;
6702} 6808}
@@ -7109,7 +7215,7 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
7109 goto out_overflow; 7215 goto out_overflow;
7110 7216
7111 if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh, 7217 if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh,
7112 NULL, entry->server) < 0) 7218 NULL, entry->label, entry->server) < 0)
7113 goto out_overflow; 7219 goto out_overflow;
7114 if (entry->fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) 7220 if (entry->fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID)
7115 entry->ino = entry->fattr->mounted_on_fileid; 7221 entry->ino = entry->fattr->mounted_on_fileid;
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index a9ebd817278b..e4f9cbfec67b 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -613,8 +613,10 @@ int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
613 pd.pgbase = 0; 613 pd.pgbase = 0;
614 pd.pglen = PAGE_SIZE; 614 pd.pglen = PAGE_SIZE;
615 pd.mincount = 0; 615 pd.mincount = 0;
616 pd.maxcount = PAGE_SIZE;
616 617
617 err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd); 618 err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd,
619 pnfslay->plh_lc_cred);
618 dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err); 620 dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err);
619 if (err) 621 if (err)
620 goto err_out; 622 goto err_out;
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index c5bd758e5637..3a3a79d6bf15 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -360,7 +360,7 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg)
360} 360}
361EXPORT_SYMBOL_GPL(pnfs_put_lseg); 361EXPORT_SYMBOL_GPL(pnfs_put_lseg);
362 362
363static inline u64 363static u64
364end_offset(u64 start, u64 len) 364end_offset(u64 start, u64 len)
365{ 365{
366 u64 end; 366 u64 end;
@@ -376,9 +376,9 @@ end_offset(u64 start, u64 len)
376 * start2 end2 376 * start2 end2
377 * [----------------) 377 * [----------------)
378 */ 378 */
379static inline int 379static bool
380lo_seg_contained(struct pnfs_layout_range *l1, 380pnfs_lseg_range_contained(const struct pnfs_layout_range *l1,
381 struct pnfs_layout_range *l2) 381 const struct pnfs_layout_range *l2)
382{ 382{
383 u64 start1 = l1->offset; 383 u64 start1 = l1->offset;
384 u64 end1 = end_offset(start1, l1->length); 384 u64 end1 = end_offset(start1, l1->length);
@@ -395,9 +395,9 @@ lo_seg_contained(struct pnfs_layout_range *l1,
395 * start2 end2 395 * start2 end2
396 * [----------------) 396 * [----------------)
397 */ 397 */
398static inline int 398static bool
399lo_seg_intersecting(struct pnfs_layout_range *l1, 399pnfs_lseg_range_intersecting(const struct pnfs_layout_range *l1,
400 struct pnfs_layout_range *l2) 400 const struct pnfs_layout_range *l2)
401{ 401{
402 u64 start1 = l1->offset; 402 u64 start1 = l1->offset;
403 u64 end1 = end_offset(start1, l1->length); 403 u64 end1 = end_offset(start1, l1->length);
@@ -409,12 +409,12 @@ lo_seg_intersecting(struct pnfs_layout_range *l1,
409} 409}
410 410
411static bool 411static bool
412should_free_lseg(struct pnfs_layout_range *lseg_range, 412should_free_lseg(const struct pnfs_layout_range *lseg_range,
413 struct pnfs_layout_range *recall_range) 413 const struct pnfs_layout_range *recall_range)
414{ 414{
415 return (recall_range->iomode == IOMODE_ANY || 415 return (recall_range->iomode == IOMODE_ANY ||
416 lseg_range->iomode == recall_range->iomode) && 416 lseg_range->iomode == recall_range->iomode) &&
417 lo_seg_intersecting(lseg_range, recall_range); 417 pnfs_lseg_range_intersecting(lseg_range, recall_range);
418} 418}
419 419
420static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg, 420static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg,
@@ -766,6 +766,7 @@ send_layoutget(struct pnfs_layout_hdr *lo,
766 lgp->args.inode = ino; 766 lgp->args.inode = ino;
767 lgp->args.ctx = get_nfs_open_context(ctx); 767 lgp->args.ctx = get_nfs_open_context(ctx);
768 lgp->gfp_flags = gfp_flags; 768 lgp->gfp_flags = gfp_flags;
769 lgp->cred = lo->plh_lc_cred;
769 770
770 /* Synchronously retrieve layout information from server and 771 /* Synchronously retrieve layout information from server and
771 * store in lseg. 772 * store in lseg.
@@ -860,6 +861,7 @@ _pnfs_return_layout(struct inode *ino)
860 lrp->args.inode = ino; 861 lrp->args.inode = ino;
861 lrp->args.layout = lo; 862 lrp->args.layout = lo;
862 lrp->clp = NFS_SERVER(ino)->nfs_client; 863 lrp->clp = NFS_SERVER(ino)->nfs_client;
864 lrp->cred = lo->plh_lc_cred;
863 865
864 status = nfs4_proc_layoutreturn(lrp); 866 status = nfs4_proc_layoutreturn(lrp);
865out: 867out:
@@ -984,8 +986,8 @@ out:
984 * are seen first. 986 * are seen first.
985 */ 987 */
986static s64 988static s64
987cmp_layout(struct pnfs_layout_range *l1, 989pnfs_lseg_range_cmp(const struct pnfs_layout_range *l1,
988 struct pnfs_layout_range *l2) 990 const struct pnfs_layout_range *l2)
989{ 991{
990 s64 d; 992 s64 d;
991 993
@@ -1012,7 +1014,7 @@ pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo,
1012 dprintk("%s:Begin\n", __func__); 1014 dprintk("%s:Begin\n", __func__);
1013 1015
1014 list_for_each_entry(lp, &lo->plh_segs, pls_list) { 1016 list_for_each_entry(lp, &lo->plh_segs, pls_list) {
1015 if (cmp_layout(&lseg->pls_range, &lp->pls_range) > 0) 1017 if (pnfs_lseg_range_cmp(&lseg->pls_range, &lp->pls_range) > 0)
1016 continue; 1018 continue;
1017 list_add_tail(&lseg->pls_list, &lp->pls_list); 1019 list_add_tail(&lseg->pls_list, &lp->pls_list);
1018 dprintk("%s: inserted lseg %p " 1020 dprintk("%s: inserted lseg %p "
@@ -1050,7 +1052,7 @@ alloc_init_layout_hdr(struct inode *ino,
1050 INIT_LIST_HEAD(&lo->plh_segs); 1052 INIT_LIST_HEAD(&lo->plh_segs);
1051 INIT_LIST_HEAD(&lo->plh_bulk_destroy); 1053 INIT_LIST_HEAD(&lo->plh_bulk_destroy);
1052 lo->plh_inode = ino; 1054 lo->plh_inode = ino;
1053 lo->plh_lc_cred = get_rpccred(ctx->state->owner->so_cred); 1055 lo->plh_lc_cred = get_rpccred(ctx->cred);
1054 return lo; 1056 return lo;
1055} 1057}
1056 1058
@@ -1091,21 +1093,21 @@ out_existing:
1091 * READ READ true 1093 * READ READ true
1092 * READ RW true 1094 * READ RW true
1093 */ 1095 */
1094static int 1096static bool
1095is_matching_lseg(struct pnfs_layout_range *ls_range, 1097pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range,
1096 struct pnfs_layout_range *range) 1098 const struct pnfs_layout_range *range)
1097{ 1099{
1098 struct pnfs_layout_range range1; 1100 struct pnfs_layout_range range1;
1099 1101
1100 if ((range->iomode == IOMODE_RW && 1102 if ((range->iomode == IOMODE_RW &&
1101 ls_range->iomode != IOMODE_RW) || 1103 ls_range->iomode != IOMODE_RW) ||
1102 !lo_seg_intersecting(ls_range, range)) 1104 !pnfs_lseg_range_intersecting(ls_range, range))
1103 return 0; 1105 return 0;
1104 1106
1105 /* range1 covers only the first byte in the range */ 1107 /* range1 covers only the first byte in the range */
1106 range1 = *range; 1108 range1 = *range;
1107 range1.length = 1; 1109 range1.length = 1;
1108 return lo_seg_contained(ls_range, &range1); 1110 return pnfs_lseg_range_contained(ls_range, &range1);
1109} 1111}
1110 1112
1111/* 1113/*
@@ -1121,7 +1123,7 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo,
1121 1123
1122 list_for_each_entry(lseg, &lo->plh_segs, pls_list) { 1124 list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
1123 if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) && 1125 if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
1124 is_matching_lseg(&lseg->pls_range, range)) { 1126 pnfs_lseg_range_match(&lseg->pls_range, range)) {
1125 ret = pnfs_get_lseg(lseg); 1127 ret = pnfs_get_lseg(lseg);
1126 break; 1128 break;
1127 } 1129 }
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index f5f8a470a647..a4f41810a7f4 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -149,9 +149,10 @@ struct pnfs_device {
149 struct nfs4_deviceid dev_id; 149 struct nfs4_deviceid dev_id;
150 unsigned int layout_type; 150 unsigned int layout_type;
151 unsigned int mincount; 151 unsigned int mincount;
152 unsigned int maxcount; /* gdia_maxcount */
152 struct page **pages; 153 struct page **pages;
153 unsigned int pgbase; 154 unsigned int pgbase;
154 unsigned int pglen; 155 unsigned int pglen; /* reply buffer length */
155}; 156};
156 157
157#define NFS4_PNFS_GETDEVLIST_MAXNUM 16 158#define NFS4_PNFS_GETDEVLIST_MAXNUM 16
@@ -170,7 +171,8 @@ extern int nfs4_proc_getdevicelist(struct nfs_server *server,
170 const struct nfs_fh *fh, 171 const struct nfs_fh *fh,
171 struct pnfs_devicelist *devlist); 172 struct pnfs_devicelist *devlist);
172extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, 173extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
173 struct pnfs_device *dev); 174 struct pnfs_device *dev,
175 struct rpc_cred *cred);
174extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags); 176extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags);
175extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp); 177extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp);
176 178
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index fc8de9016acf..c041c41f7a52 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -98,7 +98,7 @@ nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
98 */ 98 */
99static int 99static int
100nfs_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, 100nfs_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
101 struct nfs_fattr *fattr) 101 struct nfs_fattr *fattr, struct nfs4_label *label)
102{ 102{
103 struct rpc_message msg = { 103 struct rpc_message msg = {
104 .rpc_proc = &nfs_procedures[NFSPROC_GETATTR], 104 .rpc_proc = &nfs_procedures[NFSPROC_GETATTR],
@@ -146,7 +146,8 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
146 146
147static int 147static int
148nfs_proc_lookup(struct inode *dir, struct qstr *name, 148nfs_proc_lookup(struct inode *dir, struct qstr *name,
149 struct nfs_fh *fhandle, struct nfs_fattr *fattr) 149 struct nfs_fh *fhandle, struct nfs_fattr *fattr,
150 struct nfs4_label *label)
150{ 151{
151 struct nfs_diropargs arg = { 152 struct nfs_diropargs arg = {
152 .fh = NFS_FH(dir), 153 .fh = NFS_FH(dir),
@@ -243,7 +244,7 @@ nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
243 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 244 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
244 nfs_mark_for_revalidate(dir); 245 nfs_mark_for_revalidate(dir);
245 if (status == 0) 246 if (status == 0)
246 status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); 247 status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);
247 nfs_free_createdata(data); 248 nfs_free_createdata(data);
248out: 249out:
249 dprintk("NFS reply create: %d\n", status); 250 dprintk("NFS reply create: %d\n", status);
@@ -290,7 +291,7 @@ nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
290 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 291 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
291 } 292 }
292 if (status == 0) 293 if (status == 0)
293 status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); 294 status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);
294 nfs_free_createdata(data); 295 nfs_free_createdata(data);
295out: 296out:
296 dprintk("NFS reply mknod: %d\n", status); 297 dprintk("NFS reply mknod: %d\n", status);
@@ -442,7 +443,7 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
442 * should fill in the data with a LOOKUP call on the wire. 443 * should fill in the data with a LOOKUP call on the wire.
443 */ 444 */
444 if (status == 0) 445 if (status == 0)
445 status = nfs_instantiate(dentry, fh, fattr); 446 status = nfs_instantiate(dentry, fh, fattr, NULL);
446 447
447out_free: 448out_free:
448 nfs_free_fattr(fattr); 449 nfs_free_fattr(fattr);
@@ -471,7 +472,7 @@ nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
471 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 472 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
472 nfs_mark_for_revalidate(dir); 473 nfs_mark_for_revalidate(dir);
473 if (status == 0) 474 if (status == 0)
474 status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); 475 status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);
475 nfs_free_createdata(data); 476 nfs_free_createdata(data);
476out: 477out:
477 dprintk("NFS reply mkdir: %d\n", status); 478 dprintk("NFS reply mkdir: %d\n", status);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 2d7525fbcf25..71fdc0dfa0d2 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -269,7 +269,7 @@ static match_table_t nfs_local_lock_tokens = {
269 269
270enum { 270enum {
271 Opt_vers_2, Opt_vers_3, Opt_vers_4, Opt_vers_4_0, 271 Opt_vers_2, Opt_vers_3, Opt_vers_4, Opt_vers_4_0,
272 Opt_vers_4_1, 272 Opt_vers_4_1, Opt_vers_4_2,
273 273
274 Opt_vers_err 274 Opt_vers_err
275}; 275};
@@ -280,6 +280,7 @@ static match_table_t nfs_vers_tokens = {
280 { Opt_vers_4, "4" }, 280 { Opt_vers_4, "4" },
281 { Opt_vers_4_0, "4.0" }, 281 { Opt_vers_4_0, "4.0" },
282 { Opt_vers_4_1, "4.1" }, 282 { Opt_vers_4_1, "4.1" },
283 { Opt_vers_4_2, "4.2" },
283 284
284 { Opt_vers_err, NULL } 285 { Opt_vers_err, NULL }
285}; 286};
@@ -832,6 +833,7 @@ int nfs_show_stats(struct seq_file *m, struct dentry *root)
832 seq_printf(m, "\n\tnfsv4:\t"); 833 seq_printf(m, "\n\tnfsv4:\t");
833 seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]); 834 seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]);
834 seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]); 835 seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]);
836 seq_printf(m, ",bm2=0x%x", nfss->attr_bitmask[2]);
835 seq_printf(m, ",acl=0x%x", nfss->acl_bitmask); 837 seq_printf(m, ",acl=0x%x", nfss->acl_bitmask);
836 show_sessions(m, nfss); 838 show_sessions(m, nfss);
837 show_pnfs(m, nfss); 839 show_pnfs(m, nfss);
@@ -1097,6 +1099,10 @@ static int nfs_parse_version_string(char *string,
1097 mnt->version = 4; 1099 mnt->version = 4;
1098 mnt->minorversion = 1; 1100 mnt->minorversion = 1;
1099 break; 1101 break;
1102 case Opt_vers_4_2:
1103 mnt->version = 4;
1104 mnt->minorversion = 2;
1105 break;
1100 default: 1106 default:
1101 return 0; 1107 return 0;
1102 } 1108 }
@@ -1608,29 +1614,13 @@ out_security_failure:
1608} 1614}
1609 1615
1610/* 1616/*
1611 * Select a security flavor for this mount. The selected flavor 1617 * Ensure that the specified authtype in args->auth_flavors[0] is supported by
1612 * is planted in args->auth_flavors[0]. 1618 * the server. Returns 0 if it's ok, and -EACCES if not.
1613 *
1614 * Returns 0 on success, -EACCES on failure.
1615 */ 1619 */
1616static int nfs_select_flavor(struct nfs_parsed_mount_data *args, 1620static int nfs_verify_authflavor(struct nfs_parsed_mount_data *args,
1617 struct nfs_mount_request *request) 1621 rpc_authflavor_t *server_authlist, unsigned int count)
1618{ 1622{
1619 unsigned int i, count = *(request->auth_flav_len); 1623 unsigned int i;
1620 rpc_authflavor_t flavor;
1621
1622 /*
1623 * The NFSv2 MNT operation does not return a flavor list.
1624 */
1625 if (args->mount_server.version != NFS_MNT3_VERSION)
1626 goto out_default;
1627
1628 /*
1629 * Certain releases of Linux's mountd return an empty
1630 * flavor list in some cases.
1631 */
1632 if (count == 0)
1633 goto out_default;
1634 1624
1635 /* 1625 /*
1636 * If the sec= mount option is used, the specified flavor or AUTH_NULL 1626 * If the sec= mount option is used, the specified flavor or AUTH_NULL
@@ -1640,60 +1630,19 @@ static int nfs_select_flavor(struct nfs_parsed_mount_data *args,
1640 * means that the server will ignore the rpc creds, so any flavor 1630 * means that the server will ignore the rpc creds, so any flavor
1641 * can be used. 1631 * can be used.
1642 */ 1632 */
1643 if (args->auth_flavors[0] != RPC_AUTH_MAXFLAVOR) {
1644 for (i = 0; i < count; i++) {
1645 if (args->auth_flavors[0] == request->auth_flavs[i] ||
1646 request->auth_flavs[i] == RPC_AUTH_NULL)
1647 goto out;
1648 }
1649 dfprintk(MOUNT, "NFS: auth flavor %d not supported by server\n",
1650 args->auth_flavors[0]);
1651 goto out_err;
1652 }
1653
1654 /*
1655 * RFC 2623, section 2.7 suggests we SHOULD prefer the
1656 * flavor listed first. However, some servers list
1657 * AUTH_NULL first. Avoid ever choosing AUTH_NULL.
1658 */
1659 for (i = 0; i < count; i++) { 1633 for (i = 0; i < count; i++) {
1660 struct rpcsec_gss_info info; 1634 if (args->auth_flavors[0] == server_authlist[i] ||
1661 1635 server_authlist[i] == RPC_AUTH_NULL)
1662 flavor = request->auth_flavs[i]; 1636 goto out;
1663 switch (flavor) {
1664 case RPC_AUTH_UNIX:
1665 goto out_set;
1666 case RPC_AUTH_NULL:
1667 continue;
1668 default:
1669 if (rpcauth_get_gssinfo(flavor, &info) == 0)
1670 goto out_set;
1671 }
1672 } 1637 }
1673 1638
1674 /* 1639 dfprintk(MOUNT, "NFS: auth flavor %u not supported by server\n",
1675 * As a last chance, see if the server list contains AUTH_NULL - 1640 args->auth_flavors[0]);
1676 * if it does, use the default flavor. 1641 return -EACCES;
1677 */
1678 for (i = 0; i < count; i++) {
1679 if (request->auth_flavs[i] == RPC_AUTH_NULL)
1680 goto out_default;
1681 }
1682
1683 dfprintk(MOUNT, "NFS: no auth flavors in common with server\n");
1684 goto out_err;
1685 1642
1686out_default:
1687 /* use default if flavor not already set */
1688 flavor = (args->auth_flavors[0] == RPC_AUTH_MAXFLAVOR) ?
1689 RPC_AUTH_UNIX : args->auth_flavors[0];
1690out_set:
1691 args->auth_flavors[0] = flavor;
1692out: 1643out:
1693 dfprintk(MOUNT, "NFS: using auth flavor %d\n", args->auth_flavors[0]); 1644 dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->auth_flavors[0]);
1694 return 0; 1645 return 0;
1695out_err:
1696 return -EACCES;
1697} 1646}
1698 1647
1699/* 1648/*
@@ -1701,10 +1650,10 @@ out_err:
1701 * corresponding to the provided path. 1650 * corresponding to the provided path.
1702 */ 1651 */
1703static int nfs_request_mount(struct nfs_parsed_mount_data *args, 1652static int nfs_request_mount(struct nfs_parsed_mount_data *args,
1704 struct nfs_fh *root_fh) 1653 struct nfs_fh *root_fh,
1654 rpc_authflavor_t *server_authlist,
1655 unsigned int *server_authlist_len)
1705{ 1656{
1706 rpc_authflavor_t server_authlist[NFS_MAX_SECFLAVORS];
1707 unsigned int server_authlist_len = ARRAY_SIZE(server_authlist);
1708 struct nfs_mount_request request = { 1657 struct nfs_mount_request request = {
1709 .sap = (struct sockaddr *) 1658 .sap = (struct sockaddr *)
1710 &args->mount_server.address, 1659 &args->mount_server.address,
@@ -1712,7 +1661,7 @@ static int nfs_request_mount(struct nfs_parsed_mount_data *args,
1712 .protocol = args->mount_server.protocol, 1661 .protocol = args->mount_server.protocol,
1713 .fh = root_fh, 1662 .fh = root_fh,
1714 .noresvport = args->flags & NFS_MOUNT_NORESVPORT, 1663 .noresvport = args->flags & NFS_MOUNT_NORESVPORT,
1715 .auth_flav_len = &server_authlist_len, 1664 .auth_flav_len = server_authlist_len,
1716 .auth_flavs = server_authlist, 1665 .auth_flavs = server_authlist,
1717 .net = args->net, 1666 .net = args->net,
1718 }; 1667 };
@@ -1756,24 +1705,92 @@ static int nfs_request_mount(struct nfs_parsed_mount_data *args,
1756 return status; 1705 return status;
1757 } 1706 }
1758 1707
1759 return nfs_select_flavor(args, &request); 1708 return 0;
1760} 1709}
1761 1710
1762struct dentry *nfs_try_mount(int flags, const char *dev_name, 1711static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_info,
1763 struct nfs_mount_info *mount_info, 1712 struct nfs_subversion *nfs_mod)
1764 struct nfs_subversion *nfs_mod)
1765{ 1713{
1766 int status; 1714 int status;
1767 struct nfs_server *server; 1715 unsigned int i;
1716 bool tried_auth_unix = false;
1717 bool auth_null_in_list = false;
1718 struct nfs_server *server = ERR_PTR(-EACCES);
1719 struct nfs_parsed_mount_data *args = mount_info->parsed;
1720 rpc_authflavor_t authlist[NFS_MAX_SECFLAVORS];
1721 unsigned int authlist_len = ARRAY_SIZE(authlist);
1722
1723 status = nfs_request_mount(args, mount_info->mntfh, authlist,
1724 &authlist_len);
1725 if (status)
1726 return ERR_PTR(status);
1768 1727
1769 if (mount_info->parsed->need_mount) { 1728 /*
1770 status = nfs_request_mount(mount_info->parsed, mount_info->mntfh); 1729 * Was a sec= authflavor specified in the options? First, verify
1730 * whether the server supports it, and then just try to use it if so.
1731 */
1732 if (args->auth_flavors[0] != RPC_AUTH_MAXFLAVOR) {
1733 status = nfs_verify_authflavor(args, authlist, authlist_len);
1734 dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->auth_flavors[0]);
1771 if (status) 1735 if (status)
1772 return ERR_PTR(status); 1736 return ERR_PTR(status);
1737 return nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
1738 }
1739
1740 /*
1741 * No sec= option was provided. RFC 2623, section 2.7 suggests we
1742 * SHOULD prefer the flavor listed first. However, some servers list
1743 * AUTH_NULL first. Avoid ever choosing AUTH_NULL.
1744 */
1745 for (i = 0; i < authlist_len; ++i) {
1746 rpc_authflavor_t flavor;
1747 struct rpcsec_gss_info info;
1748
1749 flavor = authlist[i];
1750 switch (flavor) {
1751 case RPC_AUTH_UNIX:
1752 tried_auth_unix = true;
1753 break;
1754 case RPC_AUTH_NULL:
1755 auth_null_in_list = true;
1756 continue;
1757 default:
1758 if (rpcauth_get_gssinfo(flavor, &info) != 0)
1759 continue;
1760 /* Fallthrough */
1761 }
1762 dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", flavor);
1763 args->auth_flavors[0] = flavor;
1764 server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
1765 if (!IS_ERR(server))
1766 return server;
1773 } 1767 }
1774 1768
1775 /* Get a volume representation */ 1769 /*
1776 server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod); 1770 * Nothing we tried so far worked. At this point, give up if we've
1771 * already tried AUTH_UNIX or if the server's list doesn't contain
1772 * AUTH_NULL
1773 */
1774 if (tried_auth_unix || !auth_null_in_list)
1775 return server;
1776
1777 /* Last chance! Try AUTH_UNIX */
1778 dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", RPC_AUTH_UNIX);
1779 args->auth_flavors[0] = RPC_AUTH_UNIX;
1780 return nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
1781}
1782
1783struct dentry *nfs_try_mount(int flags, const char *dev_name,
1784 struct nfs_mount_info *mount_info,
1785 struct nfs_subversion *nfs_mod)
1786{
1787 struct nfs_server *server;
1788
1789 if (mount_info->parsed->need_mount)
1790 server = nfs_try_mount_request(mount_info, nfs_mod);
1791 else
1792 server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
1793
1777 if (IS_ERR(server)) 1794 if (IS_ERR(server))
1778 return ERR_CAST(server); 1795 return ERR_CAST(server);
1779 1796
@@ -2412,7 +2429,21 @@ static int nfs_bdi_register(struct nfs_server *server)
2412int nfs_set_sb_security(struct super_block *s, struct dentry *mntroot, 2429int nfs_set_sb_security(struct super_block *s, struct dentry *mntroot,
2413 struct nfs_mount_info *mount_info) 2430 struct nfs_mount_info *mount_info)
2414{ 2431{
2415 return security_sb_set_mnt_opts(s, &mount_info->parsed->lsm_opts); 2432 int error;
2433 unsigned long kflags = 0, kflags_out = 0;
2434 if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL)
2435 kflags |= SECURITY_LSM_NATIVE_LABELS;
2436
2437 error = security_sb_set_mnt_opts(s, &mount_info->parsed->lsm_opts,
2438 kflags, &kflags_out);
2439 if (error)
2440 goto err;
2441
2442 if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL &&
2443 !(kflags_out & SECURITY_LSM_NATIVE_LABELS))
2444 NFS_SB(s)->caps &= ~NFS_CAP_SECURITY_LABEL;
2445err:
2446 return error;
2416} 2447}
2417EXPORT_SYMBOL_GPL(nfs_set_sb_security); 2448EXPORT_SYMBOL_GPL(nfs_set_sb_security);
2418 2449
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 1f1f38f0c5d5..60395ad3a2e4 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -479,7 +479,7 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)
479 479
480 dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n", 480 dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n",
481 dentry->d_parent->d_name.name, dentry->d_name.name, 481 dentry->d_parent->d_name.name, dentry->d_name.name,
482 dentry->d_count); 482 d_count(dentry));
483 nfs_inc_stats(dir, NFSIOS_SILLYRENAME); 483 nfs_inc_stats(dir, NFSIOS_SILLYRENAME);
484 484
485 /* 485 /*
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index a2c7c28049d5..f1bdb7254776 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -888,6 +888,28 @@ out:
888 return PageUptodate(page) != 0; 888 return PageUptodate(page) != 0;
889} 889}
890 890
891/* If we know the page is up to date, and we're not using byte range locks (or
892 * if we have the whole file locked for writing), it may be more efficient to
893 * extend the write to cover the entire page in order to avoid fragmentation
894 * inefficiencies.
895 *
896 * If the file is opened for synchronous writes or if we have a write delegation
897 * from the server then we can just skip the rest of the checks.
898 */
899static int nfs_can_extend_write(struct file *file, struct page *page, struct inode *inode)
900{
901 if (file->f_flags & O_DSYNC)
902 return 0;
903 if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE))
904 return 1;
905 if (nfs_write_pageuptodate(page, inode) && (inode->i_flock == NULL ||
906 (inode->i_flock->fl_start == 0 &&
907 inode->i_flock->fl_end == OFFSET_MAX &&
908 inode->i_flock->fl_type != F_RDLCK)))
909 return 1;
910 return 0;
911}
912
891/* 913/*
892 * Update and possibly write a cached page of an NFS file. 914 * Update and possibly write a cached page of an NFS file.
893 * 915 *
@@ -908,14 +930,7 @@ int nfs_updatepage(struct file *file, struct page *page,
908 file->f_path.dentry->d_name.name, count, 930 file->f_path.dentry->d_name.name, count,
909 (long long)(page_file_offset(page) + offset)); 931 (long long)(page_file_offset(page) + offset));
910 932
911 /* If we're not using byte range locks, and we know the page 933 if (nfs_can_extend_write(file, page, inode)) {
912 * is up to date, it may be more efficient to extend the write
913 * to cover the entire page in order to avoid fragmentation
914 * inefficiencies.
915 */
916 if (nfs_write_pageuptodate(page, inode) &&
917 inode->i_flock == NULL &&
918 !(file->f_flags & O_DSYNC)) {
919 count = max(count + offset, nfs_page_length(page)); 934 count = max(count + offset, nfs_page_length(page));
920 offset = 0; 935 offset = 0;
921 } 936 }
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 430b6872806f..dc8f1ef665ce 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -81,6 +81,22 @@ config NFSD_V4
81 81
82 If unsure, say N. 82 If unsure, say N.
83 83
84config NFSD_V4_SECURITY_LABEL
85 bool "Provide Security Label support for NFSv4 server"
86 depends on NFSD_V4 && SECURITY
87 help
88
89 Say Y here if you want enable fine-grained security label attribute
90 support for NFS version 4. Security labels allow security modules like
91 SELinux and Smack to label files to facilitate enforcement of their policies.
92 Without this an NFSv4 mount will have the same label on each file.
93
94 If you do not wish to enable fine-grained security labels SELinux or
95 Smack policies on NFSv4 files, say N.
96
97 WARNING: there is still a chance of backwards-incompatible protocol changes.
98 For now we recommend "Y" only for developers and testers."
99
84config NFSD_FAULT_INJECTION 100config NFSD_FAULT_INJECTION
85 bool "NFS server manual fault injection" 101 bool "NFS server manual fault injection"
86 depends on NFSD_V4 && DEBUG_KERNEL 102 depends on NFSD_V4 && DEBUG_KERNEL
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 27d74a294515..a7cee864e7b2 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -42,6 +42,36 @@
42#include "current_stateid.h" 42#include "current_stateid.h"
43#include "netns.h" 43#include "netns.h"
44 44
45#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
46#include <linux/security.h>
47
48static inline void
49nfsd4_security_inode_setsecctx(struct svc_fh *resfh, struct xdr_netobj *label, u32 *bmval)
50{
51 struct inode *inode = resfh->fh_dentry->d_inode;
52 int status;
53
54 mutex_lock(&inode->i_mutex);
55 status = security_inode_setsecctx(resfh->fh_dentry,
56 label->data, label->len);
57 mutex_unlock(&inode->i_mutex);
58
59 if (status)
60 /*
61 * XXX: We should really fail the whole open, but we may
62 * already have created a new file, so it may be too
63 * late. For now this seems the least of evils:
64 */
65 bmval[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
66
67 return;
68}
69#else
70static inline void
71nfsd4_security_inode_setsecctx(struct svc_fh *resfh, struct xdr_netobj *label, u32 *bmval)
72{ }
73#endif
74
45#define NFSDDBG_FACILITY NFSDDBG_PROC 75#define NFSDDBG_FACILITY NFSDDBG_PROC
46 76
47static u32 nfsd_attrmask[] = { 77static u32 nfsd_attrmask[] = {
@@ -239,6 +269,9 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru
239 (u32 *)open->op_verf.data, 269 (u32 *)open->op_verf.data,
240 &open->op_truncate, &open->op_created); 270 &open->op_truncate, &open->op_created);
241 271
272 if (!status && open->op_label.len)
273 nfsd4_security_inode_setsecctx(resfh, &open->op_label, open->op_bmval);
274
242 /* 275 /*
243 * Following rfc 3530 14.2.16, use the returned bitmask 276 * Following rfc 3530 14.2.16, use the returned bitmask
244 * to indicate which attributes we used to store the 277 * to indicate which attributes we used to store the
@@ -263,7 +296,8 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru
263 296
264 nfsd4_set_open_owner_reply_cache(cstate, open, resfh); 297 nfsd4_set_open_owner_reply_cache(cstate, open, resfh);
265 accmode = NFSD_MAY_NOP; 298 accmode = NFSD_MAY_NOP;
266 if (open->op_created) 299 if (open->op_created ||
300 open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR)
267 accmode |= NFSD_MAY_OWNER_OVERRIDE; 301 accmode |= NFSD_MAY_OWNER_OVERRIDE;
268 status = do_open_permission(rqstp, resfh, open, accmode); 302 status = do_open_permission(rqstp, resfh, open, accmode);
269 set_change_info(&open->op_cinfo, current_fh); 303 set_change_info(&open->op_cinfo, current_fh);
@@ -637,6 +671,9 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
637 if (status) 671 if (status)
638 goto out; 672 goto out;
639 673
674 if (create->cr_label.len)
675 nfsd4_security_inode_setsecctx(&resfh, &create->cr_label, create->cr_bmval);
676
640 if (create->cr_acl != NULL) 677 if (create->cr_acl != NULL)
641 do_set_nfs4_acl(rqstp, &resfh, create->cr_acl, 678 do_set_nfs4_acl(rqstp, &resfh, create->cr_acl,
642 create->cr_bmval); 679 create->cr_bmval);
@@ -916,6 +953,11 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
916 setattr->sa_acl); 953 setattr->sa_acl);
917 if (status) 954 if (status)
918 goto out; 955 goto out;
956 if (setattr->sa_label.len)
957 status = nfsd4_set_nfs4_label(rqstp, &cstate->current_fh,
958 &setattr->sa_label);
959 if (status)
960 goto out;
919 status = nfsd_setattr(rqstp, &cstate->current_fh, &setattr->sa_iattr, 961 status = nfsd_setattr(rqstp, &cstate->current_fh, &setattr->sa_iattr,
920 0, (time_t)0); 962 0, (time_t)0);
921out: 963out:
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 316ec843dec2..280acef6f0dc 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -97,19 +97,20 @@ nfs4_lock_state(void)
97 97
98static void free_session(struct nfsd4_session *); 98static void free_session(struct nfsd4_session *);
99 99
100void nfsd4_put_session(struct nfsd4_session *ses) 100static bool is_session_dead(struct nfsd4_session *ses)
101{ 101{
102 atomic_dec(&ses->se_ref); 102 return ses->se_flags & NFS4_SESSION_DEAD;
103} 103}
104 104
105static bool is_session_dead(struct nfsd4_session *ses) 105void nfsd4_put_session(struct nfsd4_session *ses)
106{ 106{
107 return ses->se_flags & NFS4_SESSION_DEAD; 107 if (atomic_dec_and_test(&ses->se_ref) && is_session_dead(ses))
108 free_session(ses);
108} 109}
109 110
110static __be32 mark_session_dead_locked(struct nfsd4_session *ses) 111static __be32 mark_session_dead_locked(struct nfsd4_session *ses, int ref_held_by_me)
111{ 112{
112 if (atomic_read(&ses->se_ref)) 113 if (atomic_read(&ses->se_ref) > ref_held_by_me)
113 return nfserr_jukebox; 114 return nfserr_jukebox;
114 ses->se_flags |= NFS4_SESSION_DEAD; 115 ses->se_flags |= NFS4_SESSION_DEAD;
115 return nfs_ok; 116 return nfs_ok;
@@ -364,19 +365,12 @@ static struct nfs4_ol_stateid * nfs4_alloc_stateid(struct nfs4_client *clp)
364} 365}
365 366
366static struct nfs4_delegation * 367static struct nfs4_delegation *
367alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct svc_fh *current_fh, u32 type) 368alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct svc_fh *current_fh)
368{ 369{
369 struct nfs4_delegation *dp; 370 struct nfs4_delegation *dp;
370 struct nfs4_file *fp = stp->st_file; 371 struct nfs4_file *fp = stp->st_file;
371 372
372 dprintk("NFSD alloc_init_deleg\n"); 373 dprintk("NFSD alloc_init_deleg\n");
373 /*
374 * Major work on the lease subsystem (for example, to support
375 * calbacks on stat) will be required before we can support
376 * write delegations properly.
377 */
378 if (type != NFS4_OPEN_DELEGATE_READ)
379 return NULL;
380 if (fp->fi_had_conflict) 374 if (fp->fi_had_conflict)
381 return NULL; 375 return NULL;
382 if (num_delegations > max_delegations) 376 if (num_delegations > max_delegations)
@@ -397,7 +391,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct sv
397 INIT_LIST_HEAD(&dp->dl_recall_lru); 391 INIT_LIST_HEAD(&dp->dl_recall_lru);
398 get_nfs4_file(fp); 392 get_nfs4_file(fp);
399 dp->dl_file = fp; 393 dp->dl_file = fp;
400 dp->dl_type = type; 394 dp->dl_type = NFS4_OPEN_DELEGATE_READ;
401 fh_copy_shallow(&dp->dl_fh, &current_fh->fh_handle); 395 fh_copy_shallow(&dp->dl_fh, &current_fh->fh_handle);
402 dp->dl_time = 0; 396 dp->dl_time = 0;
403 atomic_set(&dp->dl_count, 1); 397 atomic_set(&dp->dl_count, 1);
@@ -1188,6 +1182,9 @@ static int copy_cred(struct svc_cred *target, struct svc_cred *source)
1188 target->cr_gid = source->cr_gid; 1182 target->cr_gid = source->cr_gid;
1189 target->cr_group_info = source->cr_group_info; 1183 target->cr_group_info = source->cr_group_info;
1190 get_group_info(target->cr_group_info); 1184 get_group_info(target->cr_group_info);
1185 target->cr_gss_mech = source->cr_gss_mech;
1186 if (source->cr_gss_mech)
1187 gss_mech_get(source->cr_gss_mech);
1191 return 0; 1188 return 0;
1192} 1189}
1193 1190
@@ -1262,6 +1259,31 @@ same_creds(struct svc_cred *cr1, struct svc_cred *cr2)
1262 return 0 == strcmp(cr1->cr_principal, cr2->cr_principal); 1259 return 0 == strcmp(cr1->cr_principal, cr2->cr_principal);
1263} 1260}
1264 1261
1262static bool svc_rqst_integrity_protected(struct svc_rqst *rqstp)
1263{
1264 struct svc_cred *cr = &rqstp->rq_cred;
1265 u32 service;
1266
1267 service = gss_pseudoflavor_to_service(cr->cr_gss_mech, cr->cr_flavor);
1268 return service == RPC_GSS_SVC_INTEGRITY ||
1269 service == RPC_GSS_SVC_PRIVACY;
1270}
1271
1272static bool mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp)
1273{
1274 struct svc_cred *cr = &rqstp->rq_cred;
1275
1276 if (!cl->cl_mach_cred)
1277 return true;
1278 if (cl->cl_cred.cr_gss_mech != cr->cr_gss_mech)
1279 return false;
1280 if (!svc_rqst_integrity_protected(rqstp))
1281 return false;
1282 if (!cr->cr_principal)
1283 return false;
1284 return 0 == strcmp(cl->cl_cred.cr_principal, cr->cr_principal);
1285}
1286
1265static void gen_clid(struct nfs4_client *clp, struct nfsd_net *nn) 1287static void gen_clid(struct nfs4_client *clp, struct nfsd_net *nn)
1266{ 1288{
1267 static u32 current_clientid = 1; 1289 static u32 current_clientid = 1;
@@ -1639,16 +1661,16 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
1639 if (exid->flags & ~EXCHGID4_FLAG_MASK_A) 1661 if (exid->flags & ~EXCHGID4_FLAG_MASK_A)
1640 return nfserr_inval; 1662 return nfserr_inval;
1641 1663
1642 /* Currently only support SP4_NONE */
1643 switch (exid->spa_how) { 1664 switch (exid->spa_how) {
1665 case SP4_MACH_CRED:
1666 if (!svc_rqst_integrity_protected(rqstp))
1667 return nfserr_inval;
1644 case SP4_NONE: 1668 case SP4_NONE:
1645 break; 1669 break;
1646 default: /* checked by xdr code */ 1670 default: /* checked by xdr code */
1647 WARN_ON_ONCE(1); 1671 WARN_ON_ONCE(1);
1648 case SP4_SSV: 1672 case SP4_SSV:
1649 return nfserr_encr_alg_unsupp; 1673 return nfserr_encr_alg_unsupp;
1650 case SP4_MACH_CRED:
1651 return nfserr_serverfault; /* no excuse :-/ */
1652 } 1674 }
1653 1675
1654 /* Cases below refer to rfc 5661 section 18.35.4: */ 1676 /* Cases below refer to rfc 5661 section 18.35.4: */
@@ -1663,6 +1685,10 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
1663 status = nfserr_inval; 1685 status = nfserr_inval;
1664 goto out; 1686 goto out;
1665 } 1687 }
1688 if (!mach_creds_match(conf, rqstp)) {
1689 status = nfserr_wrong_cred;
1690 goto out;
1691 }
1666 if (!creds_match) { /* case 9 */ 1692 if (!creds_match) { /* case 9 */
1667 status = nfserr_perm; 1693 status = nfserr_perm;
1668 goto out; 1694 goto out;
@@ -1709,7 +1735,8 @@ out_new:
1709 status = nfserr_jukebox; 1735 status = nfserr_jukebox;
1710 goto out; 1736 goto out;
1711 } 1737 }
1712 new->cl_minorversion = 1; 1738 new->cl_minorversion = cstate->minorversion;
1739 new->cl_mach_cred = (exid->spa_how == SP4_MACH_CRED);
1713 1740
1714 gen_clid(new, nn); 1741 gen_clid(new, nn);
1715 add_to_unconfirmed(new); 1742 add_to_unconfirmed(new);
@@ -1839,6 +1866,24 @@ static __be32 check_backchannel_attrs(struct nfsd4_channel_attrs *ca)
1839 return nfs_ok; 1866 return nfs_ok;
1840} 1867}
1841 1868
1869static __be32 nfsd4_check_cb_sec(struct nfsd4_cb_sec *cbs)
1870{
1871 switch (cbs->flavor) {
1872 case RPC_AUTH_NULL:
1873 case RPC_AUTH_UNIX:
1874 return nfs_ok;
1875 default:
1876 /*
1877 * GSS case: the spec doesn't allow us to return this
1878 * error. But it also doesn't allow us not to support
1879 * GSS.
1880 * I'd rather this fail hard than return some error the
1881 * client might think it can already handle:
1882 */
1883 return nfserr_encr_alg_unsupp;
1884 }
1885}
1886
1842__be32 1887__be32
1843nfsd4_create_session(struct svc_rqst *rqstp, 1888nfsd4_create_session(struct svc_rqst *rqstp,
1844 struct nfsd4_compound_state *cstate, 1889 struct nfsd4_compound_state *cstate,
@@ -1854,6 +1899,9 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1854 1899
1855 if (cr_ses->flags & ~SESSION4_FLAG_MASK_A) 1900 if (cr_ses->flags & ~SESSION4_FLAG_MASK_A)
1856 return nfserr_inval; 1901 return nfserr_inval;
1902 status = nfsd4_check_cb_sec(&cr_ses->cb_sec);
1903 if (status)
1904 return status;
1857 status = check_forechannel_attrs(&cr_ses->fore_channel, nn); 1905 status = check_forechannel_attrs(&cr_ses->fore_channel, nn);
1858 if (status) 1906 if (status)
1859 return status; 1907 return status;
@@ -1874,6 +1922,9 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1874 WARN_ON_ONCE(conf && unconf); 1922 WARN_ON_ONCE(conf && unconf);
1875 1923
1876 if (conf) { 1924 if (conf) {
1925 status = nfserr_wrong_cred;
1926 if (!mach_creds_match(conf, rqstp))
1927 goto out_free_conn;
1877 cs_slot = &conf->cl_cs_slot; 1928 cs_slot = &conf->cl_cs_slot;
1878 status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); 1929 status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
1879 if (status == nfserr_replay_cache) { 1930 if (status == nfserr_replay_cache) {
@@ -1890,6 +1941,9 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1890 status = nfserr_clid_inuse; 1941 status = nfserr_clid_inuse;
1891 goto out_free_conn; 1942 goto out_free_conn;
1892 } 1943 }
1944 status = nfserr_wrong_cred;
1945 if (!mach_creds_match(unconf, rqstp))
1946 goto out_free_conn;
1893 cs_slot = &unconf->cl_cs_slot; 1947 cs_slot = &unconf->cl_cs_slot;
1894 status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); 1948 status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
1895 if (status) { 1949 if (status) {
@@ -1957,7 +2011,11 @@ __be32 nfsd4_backchannel_ctl(struct svc_rqst *rqstp, struct nfsd4_compound_state
1957{ 2011{
1958 struct nfsd4_session *session = cstate->session; 2012 struct nfsd4_session *session = cstate->session;
1959 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); 2013 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
2014 __be32 status;
1960 2015
2016 status = nfsd4_check_cb_sec(&bc->bc_cb_sec);
2017 if (status)
2018 return status;
1961 spin_lock(&nn->client_lock); 2019 spin_lock(&nn->client_lock);
1962 session->se_cb_prog = bc->bc_cb_program; 2020 session->se_cb_prog = bc->bc_cb_program;
1963 session->se_cb_sec = bc->bc_cb_sec; 2021 session->se_cb_sec = bc->bc_cb_sec;
@@ -1986,6 +2044,9 @@ __be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp,
1986 status = nfserr_badsession; 2044 status = nfserr_badsession;
1987 if (!session) 2045 if (!session)
1988 goto out; 2046 goto out;
2047 status = nfserr_wrong_cred;
2048 if (!mach_creds_match(session->se_client, rqstp))
2049 goto out;
1989 status = nfsd4_map_bcts_dir(&bcts->dir); 2050 status = nfsd4_map_bcts_dir(&bcts->dir);
1990 if (status) 2051 if (status)
1991 goto out; 2052 goto out;
@@ -2014,6 +2075,7 @@ nfsd4_destroy_session(struct svc_rqst *r,
2014{ 2075{
2015 struct nfsd4_session *ses; 2076 struct nfsd4_session *ses;
2016 __be32 status; 2077 __be32 status;
2078 int ref_held_by_me = 0;
2017 struct nfsd_net *nn = net_generic(SVC_NET(r), nfsd_net_id); 2079 struct nfsd_net *nn = net_generic(SVC_NET(r), nfsd_net_id);
2018 2080
2019 nfs4_lock_state(); 2081 nfs4_lock_state();
@@ -2021,6 +2083,7 @@ nfsd4_destroy_session(struct svc_rqst *r,
2021 if (nfsd4_compound_in_session(cstate->session, &sessionid->sessionid)) { 2083 if (nfsd4_compound_in_session(cstate->session, &sessionid->sessionid)) {
2022 if (!nfsd4_last_compound_op(r)) 2084 if (!nfsd4_last_compound_op(r))
2023 goto out; 2085 goto out;
2086 ref_held_by_me++;
2024 } 2087 }
2025 dump_sessionid(__func__, &sessionid->sessionid); 2088 dump_sessionid(__func__, &sessionid->sessionid);
2026 spin_lock(&nn->client_lock); 2089 spin_lock(&nn->client_lock);
@@ -2028,17 +2091,22 @@ nfsd4_destroy_session(struct svc_rqst *r,
2028 status = nfserr_badsession; 2091 status = nfserr_badsession;
2029 if (!ses) 2092 if (!ses)
2030 goto out_client_lock; 2093 goto out_client_lock;
2031 status = mark_session_dead_locked(ses); 2094 status = nfserr_wrong_cred;
2032 if (status) 2095 if (!mach_creds_match(ses->se_client, r))
2033 goto out_client_lock; 2096 goto out_client_lock;
2097 nfsd4_get_session_locked(ses);
2098 status = mark_session_dead_locked(ses, 1 + ref_held_by_me);
2099 if (status)
2100 goto out_put_session;
2034 unhash_session(ses); 2101 unhash_session(ses);
2035 spin_unlock(&nn->client_lock); 2102 spin_unlock(&nn->client_lock);
2036 2103
2037 nfsd4_probe_callback_sync(ses->se_client); 2104 nfsd4_probe_callback_sync(ses->se_client);
2038 2105
2039 spin_lock(&nn->client_lock); 2106 spin_lock(&nn->client_lock);
2040 free_session(ses);
2041 status = nfs_ok; 2107 status = nfs_ok;
2108out_put_session:
2109 nfsd4_put_session(ses);
2042out_client_lock: 2110out_client_lock:
2043 spin_unlock(&nn->client_lock); 2111 spin_unlock(&nn->client_lock);
2044out: 2112out:
@@ -2058,26 +2126,31 @@ static struct nfsd4_conn *__nfsd4_find_conn(struct svc_xprt *xpt, struct nfsd4_s
2058 return NULL; 2126 return NULL;
2059} 2127}
2060 2128
2061static void nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_session *ses) 2129static __be32 nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_session *ses)
2062{ 2130{
2063 struct nfs4_client *clp = ses->se_client; 2131 struct nfs4_client *clp = ses->se_client;
2064 struct nfsd4_conn *c; 2132 struct nfsd4_conn *c;
2133 __be32 status = nfs_ok;
2065 int ret; 2134 int ret;
2066 2135
2067 spin_lock(&clp->cl_lock); 2136 spin_lock(&clp->cl_lock);
2068 c = __nfsd4_find_conn(new->cn_xprt, ses); 2137 c = __nfsd4_find_conn(new->cn_xprt, ses);
2069 if (c) { 2138 if (c)
2070 spin_unlock(&clp->cl_lock); 2139 goto out_free;
2071 free_conn(new); 2140 status = nfserr_conn_not_bound_to_session;
2072 return; 2141 if (clp->cl_mach_cred)
2073 } 2142 goto out_free;
2074 __nfsd4_hash_conn(new, ses); 2143 __nfsd4_hash_conn(new, ses);
2075 spin_unlock(&clp->cl_lock); 2144 spin_unlock(&clp->cl_lock);
2076 ret = nfsd4_register_conn(new); 2145 ret = nfsd4_register_conn(new);
2077 if (ret) 2146 if (ret)
2078 /* oops; xprt is already down: */ 2147 /* oops; xprt is already down: */
2079 nfsd4_conn_lost(&new->cn_xpt_user); 2148 nfsd4_conn_lost(&new->cn_xpt_user);
2080 return; 2149 return nfs_ok;
2150out_free:
2151 spin_unlock(&clp->cl_lock);
2152 free_conn(new);
2153 return status;
2081} 2154}
2082 2155
2083static bool nfsd4_session_too_many_ops(struct svc_rqst *rqstp, struct nfsd4_session *session) 2156static bool nfsd4_session_too_many_ops(struct svc_rqst *rqstp, struct nfsd4_session *session)
@@ -2169,8 +2242,10 @@ nfsd4_sequence(struct svc_rqst *rqstp,
2169 if (status) 2242 if (status)
2170 goto out_put_session; 2243 goto out_put_session;
2171 2244
2172 nfsd4_sequence_check_conn(conn, session); 2245 status = nfsd4_sequence_check_conn(conn, session);
2173 conn = NULL; 2246 conn = NULL;
2247 if (status)
2248 goto out_put_session;
2174 2249
2175 /* Success! bump slot seqid */ 2250 /* Success! bump slot seqid */
2176 slot->sl_seqid = seq->seqid; 2251 slot->sl_seqid = seq->seqid;
@@ -2232,7 +2307,10 @@ nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
2232 status = nfserr_stale_clientid; 2307 status = nfserr_stale_clientid;
2233 goto out; 2308 goto out;
2234 } 2309 }
2235 2310 if (!mach_creds_match(clp, rqstp)) {
2311 status = nfserr_wrong_cred;
2312 goto out;
2313 }
2236 expire_client(clp); 2314 expire_client(clp);
2237out: 2315out:
2238 nfs4_unlock_state(); 2316 nfs4_unlock_state();
@@ -2645,13 +2723,13 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
2645 2723
2646 list_add_tail(&dp->dl_recall_lru, &nn->del_recall_lru); 2724 list_add_tail(&dp->dl_recall_lru, &nn->del_recall_lru);
2647 2725
2648 /* only place dl_time is set. protected by lock_flocks*/ 2726 /* Only place dl_time is set; protected by i_lock: */
2649 dp->dl_time = get_seconds(); 2727 dp->dl_time = get_seconds();
2650 2728
2651 nfsd4_cb_recall(dp); 2729 nfsd4_cb_recall(dp);
2652} 2730}
2653 2731
2654/* Called from break_lease() with lock_flocks() held. */ 2732/* Called from break_lease() with i_lock held. */
2655static void nfsd_break_deleg_cb(struct file_lock *fl) 2733static void nfsd_break_deleg_cb(struct file_lock *fl)
2656{ 2734{
2657 struct nfs4_file *fp = (struct nfs4_file *)fl->fl_owner; 2735 struct nfs4_file *fp = (struct nfs4_file *)fl->fl_owner;
@@ -2940,13 +3018,13 @@ static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp, int f
2940 return fl; 3018 return fl;
2941} 3019}
2942 3020
2943static int nfs4_setlease(struct nfs4_delegation *dp, int flag) 3021static int nfs4_setlease(struct nfs4_delegation *dp)
2944{ 3022{
2945 struct nfs4_file *fp = dp->dl_file; 3023 struct nfs4_file *fp = dp->dl_file;
2946 struct file_lock *fl; 3024 struct file_lock *fl;
2947 int status; 3025 int status;
2948 3026
2949 fl = nfs4_alloc_init_lease(dp, flag); 3027 fl = nfs4_alloc_init_lease(dp, NFS4_OPEN_DELEGATE_READ);
2950 if (!fl) 3028 if (!fl)
2951 return -ENOMEM; 3029 return -ENOMEM;
2952 fl->fl_file = find_readable_file(fp); 3030 fl->fl_file = find_readable_file(fp);
@@ -2964,12 +3042,12 @@ static int nfs4_setlease(struct nfs4_delegation *dp, int flag)
2964 return 0; 3042 return 0;
2965} 3043}
2966 3044
2967static int nfs4_set_delegation(struct nfs4_delegation *dp, int flag) 3045static int nfs4_set_delegation(struct nfs4_delegation *dp)
2968{ 3046{
2969 struct nfs4_file *fp = dp->dl_file; 3047 struct nfs4_file *fp = dp->dl_file;
2970 3048
2971 if (!fp->fi_lease) 3049 if (!fp->fi_lease)
2972 return nfs4_setlease(dp, flag); 3050 return nfs4_setlease(dp);
2973 spin_lock(&recall_lock); 3051 spin_lock(&recall_lock);
2974 if (fp->fi_had_conflict) { 3052 if (fp->fi_had_conflict) {
2975 spin_unlock(&recall_lock); 3053 spin_unlock(&recall_lock);
@@ -3005,6 +3083,9 @@ static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status)
3005 3083
3006/* 3084/*
3007 * Attempt to hand out a delegation. 3085 * Attempt to hand out a delegation.
3086 *
3087 * Note we don't support write delegations, and won't until the vfs has
3088 * proper support for them.
3008 */ 3089 */
3009static void 3090static void
3010nfs4_open_delegation(struct net *net, struct svc_fh *fh, 3091nfs4_open_delegation(struct net *net, struct svc_fh *fh,
@@ -3013,39 +3094,45 @@ nfs4_open_delegation(struct net *net, struct svc_fh *fh,
3013 struct nfs4_delegation *dp; 3094 struct nfs4_delegation *dp;
3014 struct nfs4_openowner *oo = container_of(stp->st_stateowner, struct nfs4_openowner, oo_owner); 3095 struct nfs4_openowner *oo = container_of(stp->st_stateowner, struct nfs4_openowner, oo_owner);
3015 int cb_up; 3096 int cb_up;
3016 int status = 0, flag = 0; 3097 int status = 0;
3017 3098
3018 cb_up = nfsd4_cb_channel_good(oo->oo_owner.so_client); 3099 cb_up = nfsd4_cb_channel_good(oo->oo_owner.so_client);
3019 flag = NFS4_OPEN_DELEGATE_NONE;
3020 open->op_recall = 0; 3100 open->op_recall = 0;
3021 switch (open->op_claim_type) { 3101 switch (open->op_claim_type) {
3022 case NFS4_OPEN_CLAIM_PREVIOUS: 3102 case NFS4_OPEN_CLAIM_PREVIOUS:
3023 if (!cb_up) 3103 if (!cb_up)
3024 open->op_recall = 1; 3104 open->op_recall = 1;
3025 flag = open->op_delegate_type; 3105 if (open->op_delegate_type != NFS4_OPEN_DELEGATE_READ)
3026 if (flag == NFS4_OPEN_DELEGATE_NONE) 3106 goto out_no_deleg;
3027 goto out;
3028 break; 3107 break;
3029 case NFS4_OPEN_CLAIM_NULL: 3108 case NFS4_OPEN_CLAIM_NULL:
3030 /* Let's not give out any delegations till everyone's 3109 /*
3031 * had the chance to reclaim theirs.... */ 3110 * Let's not give out any delegations till everyone's
3111 * had the chance to reclaim theirs....
3112 */
3032 if (locks_in_grace(net)) 3113 if (locks_in_grace(net))
3033 goto out; 3114 goto out_no_deleg;
3034 if (!cb_up || !(oo->oo_flags & NFS4_OO_CONFIRMED)) 3115 if (!cb_up || !(oo->oo_flags & NFS4_OO_CONFIRMED))
3035 goto out; 3116 goto out_no_deleg;
3117 /*
3118 * Also, if the file was opened for write or
3119 * create, there's a good chance the client's
3120 * about to write to it, resulting in an
3121 * immediate recall (since we don't support
3122 * write delegations):
3123 */
3036 if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) 3124 if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
3037 flag = NFS4_OPEN_DELEGATE_WRITE; 3125 goto out_no_deleg;
3038 else 3126 if (open->op_create == NFS4_OPEN_CREATE)
3039 flag = NFS4_OPEN_DELEGATE_READ; 3127 goto out_no_deleg;
3040 break; 3128 break;
3041 default: 3129 default:
3042 goto out; 3130 goto out_no_deleg;
3043 } 3131 }
3044 3132 dp = alloc_init_deleg(oo->oo_owner.so_client, stp, fh);
3045 dp = alloc_init_deleg(oo->oo_owner.so_client, stp, fh, flag);
3046 if (dp == NULL) 3133 if (dp == NULL)
3047 goto out_no_deleg; 3134 goto out_no_deleg;
3048 status = nfs4_set_delegation(dp, flag); 3135 status = nfs4_set_delegation(dp);
3049 if (status) 3136 if (status)
3050 goto out_free; 3137 goto out_free;
3051 3138
@@ -3053,24 +3140,23 @@ nfs4_open_delegation(struct net *net, struct svc_fh *fh,
3053 3140
3054 dprintk("NFSD: delegation stateid=" STATEID_FMT "\n", 3141 dprintk("NFSD: delegation stateid=" STATEID_FMT "\n",
3055 STATEID_VAL(&dp->dl_stid.sc_stateid)); 3142 STATEID_VAL(&dp->dl_stid.sc_stateid));
3056out: 3143 open->op_delegate_type = NFS4_OPEN_DELEGATE_READ;
3057 open->op_delegate_type = flag;
3058 if (flag == NFS4_OPEN_DELEGATE_NONE) {
3059 if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS &&
3060 open->op_delegate_type != NFS4_OPEN_DELEGATE_NONE)
3061 dprintk("NFSD: WARNING: refusing delegation reclaim\n");
3062
3063 /* 4.1 client asking for a delegation? */
3064 if (open->op_deleg_want)
3065 nfsd4_open_deleg_none_ext(open, status);
3066 }
3067 return; 3144 return;
3068out_free: 3145out_free:
3069 unhash_stid(&dp->dl_stid); 3146 unhash_stid(&dp->dl_stid);
3070 nfs4_put_delegation(dp); 3147 nfs4_put_delegation(dp);
3071out_no_deleg: 3148out_no_deleg:
3072 flag = NFS4_OPEN_DELEGATE_NONE; 3149 open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE;
3073 goto out; 3150 if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS &&
3151 open->op_delegate_type != NFS4_OPEN_DELEGATE_NONE) {
3152 dprintk("NFSD: WARNING: refusing delegation reclaim\n");
3153 open->op_recall = 1;
3154 }
3155
3156 /* 4.1 client asking for a delegation? */
3157 if (open->op_deleg_want)
3158 nfsd4_open_deleg_none_ext(open, status);
3159 return;
3074} 3160}
3075 3161
3076static void nfsd4_deleg_xgrade_none_ext(struct nfsd4_open *open, 3162static void nfsd4_deleg_xgrade_none_ext(struct nfsd4_open *open,
@@ -3427,7 +3513,7 @@ grace_disallows_io(struct net *net, struct inode *inode)
3427/* Returns true iff a is later than b: */ 3513/* Returns true iff a is later than b: */
3428static bool stateid_generation_after(stateid_t *a, stateid_t *b) 3514static bool stateid_generation_after(stateid_t *a, stateid_t *b)
3429{ 3515{
3430 return (s32)a->si_generation - (s32)b->si_generation > 0; 3516 return (s32)(a->si_generation - b->si_generation) > 0;
3431} 3517}
3432 3518
3433static __be32 check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_session) 3519static __be32 check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_session)
@@ -4435,7 +4521,6 @@ __be32
4435nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 4521nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4436 struct nfsd4_locku *locku) 4522 struct nfsd4_locku *locku)
4437{ 4523{
4438 struct nfs4_lockowner *lo;
4439 struct nfs4_ol_stateid *stp; 4524 struct nfs4_ol_stateid *stp;
4440 struct file *filp = NULL; 4525 struct file *filp = NULL;
4441 struct file_lock *file_lock = NULL; 4526 struct file_lock *file_lock = NULL;
@@ -4468,10 +4553,9 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4468 status = nfserr_jukebox; 4553 status = nfserr_jukebox;
4469 goto out; 4554 goto out;
4470 } 4555 }
4471 lo = lockowner(stp->st_stateowner);
4472 locks_init_lock(file_lock); 4556 locks_init_lock(file_lock);
4473 file_lock->fl_type = F_UNLCK; 4557 file_lock->fl_type = F_UNLCK;
4474 file_lock->fl_owner = (fl_owner_t)lo; 4558 file_lock->fl_owner = (fl_owner_t)lockowner(stp->st_stateowner);
4475 file_lock->fl_pid = current->tgid; 4559 file_lock->fl_pid = current->tgid;
4476 file_lock->fl_file = filp; 4560 file_lock->fl_file = filp;
4477 file_lock->fl_flags = FL_POSIX; 4561 file_lock->fl_flags = FL_POSIX;
@@ -4490,11 +4574,6 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4490 update_stateid(&stp->st_stid.sc_stateid); 4574 update_stateid(&stp->st_stid.sc_stateid);
4491 memcpy(&locku->lu_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); 4575 memcpy(&locku->lu_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
4492 4576
4493 if (nfsd4_has_session(cstate) && !check_for_locks(stp->st_file, lo)) {
4494 WARN_ON_ONCE(cstate->replay_owner);
4495 release_lockowner(lo);
4496 }
4497
4498out: 4577out:
4499 nfsd4_bump_seqid(cstate, status); 4578 nfsd4_bump_seqid(cstate, status);
4500 if (!cstate->replay_owner) 4579 if (!cstate->replay_owner)
@@ -4520,7 +4599,7 @@ check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner)
4520 struct inode *inode = filp->fi_inode; 4599 struct inode *inode = filp->fi_inode;
4521 int status = 0; 4600 int status = 0;
4522 4601
4523 lock_flocks(); 4602 spin_lock(&inode->i_lock);
4524 for (flpp = &inode->i_flock; *flpp != NULL; flpp = &(*flpp)->fl_next) { 4603 for (flpp = &inode->i_flock; *flpp != NULL; flpp = &(*flpp)->fl_next) {
4525 if ((*flpp)->fl_owner == (fl_owner_t)lowner) { 4604 if ((*flpp)->fl_owner == (fl_owner_t)lowner) {
4526 status = 1; 4605 status = 1;
@@ -4528,7 +4607,7 @@ check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner)
4528 } 4607 }
4529 } 4608 }
4530out: 4609out:
4531 unlock_flocks(); 4610 spin_unlock(&inode->i_lock);
4532 return status; 4611 return status;
4533} 4612}
4534 4613
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 6cd86e0fe450..0c0f3ea90de5 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -55,6 +55,11 @@
55#include "cache.h" 55#include "cache.h"
56#include "netns.h" 56#include "netns.h"
57 57
58#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
59#include <linux/security.h>
60#endif
61
62
58#define NFSDDBG_FACILITY NFSDDBG_XDR 63#define NFSDDBG_FACILITY NFSDDBG_XDR
59 64
60/* 65/*
@@ -134,6 +139,19 @@ xdr_error: \
134 } \ 139 } \
135} while (0) 140} while (0)
136 141
142static void next_decode_page(struct nfsd4_compoundargs *argp)
143{
144 argp->pagelist++;
145 argp->p = page_address(argp->pagelist[0]);
146 if (argp->pagelen < PAGE_SIZE) {
147 argp->end = argp->p + (argp->pagelen>>2);
148 argp->pagelen = 0;
149 } else {
150 argp->end = argp->p + (PAGE_SIZE>>2);
151 argp->pagelen -= PAGE_SIZE;
152 }
153}
154
137static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes) 155static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes)
138{ 156{
139 /* We want more bytes than seem to be available. 157 /* We want more bytes than seem to be available.
@@ -161,16 +179,7 @@ static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes)
161 * guarantee p points to at least nbytes bytes. 179 * guarantee p points to at least nbytes bytes.
162 */ 180 */
163 memcpy(p, argp->p, avail); 181 memcpy(p, argp->p, avail);
164 /* step to next page */ 182 next_decode_page(argp);
165 argp->p = page_address(argp->pagelist[0]);
166 argp->pagelist++;
167 if (argp->pagelen < PAGE_SIZE) {
168 argp->end = argp->p + (argp->pagelen>>2);
169 argp->pagelen = 0;
170 } else {
171 argp->end = argp->p + (PAGE_SIZE>>2);
172 argp->pagelen -= PAGE_SIZE;
173 }
174 memcpy(((char*)p)+avail, argp->p, (nbytes - avail)); 183 memcpy(((char*)p)+avail, argp->p, (nbytes - avail));
175 argp->p += XDR_QUADLEN(nbytes - avail); 184 argp->p += XDR_QUADLEN(nbytes - avail);
176 return p; 185 return p;
@@ -242,7 +251,8 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
242 251
243static __be32 252static __be32
244nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, 253nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
245 struct iattr *iattr, struct nfs4_acl **acl) 254 struct iattr *iattr, struct nfs4_acl **acl,
255 struct xdr_netobj *label)
246{ 256{
247 int expected_len, len = 0; 257 int expected_len, len = 0;
248 u32 dummy32; 258 u32 dummy32;
@@ -380,6 +390,32 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
380 goto xdr_error; 390 goto xdr_error;
381 } 391 }
382 } 392 }
393
394 label->len = 0;
395#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
396 if (bmval[2] & FATTR4_WORD2_SECURITY_LABEL) {
397 READ_BUF(4);
398 len += 4;
399 READ32(dummy32); /* lfs: we don't use it */
400 READ_BUF(4);
401 len += 4;
402 READ32(dummy32); /* pi: we don't use it either */
403 READ_BUF(4);
404 len += 4;
405 READ32(dummy32);
406 READ_BUF(dummy32);
407 if (dummy32 > NFSD4_MAX_SEC_LABEL_LEN)
408 return nfserr_badlabel;
409 len += (XDR_QUADLEN(dummy32) << 2);
410 READMEM(buf, dummy32);
411 label->data = kzalloc(dummy32 + 1, GFP_KERNEL);
412 if (!label->data)
413 return nfserr_jukebox;
414 defer_free(argp, kfree, label->data);
415 memcpy(label->data, buf, dummy32);
416 }
417#endif
418
383 if (bmval[0] & ~NFSD_WRITEABLE_ATTRS_WORD0 419 if (bmval[0] & ~NFSD_WRITEABLE_ATTRS_WORD0
384 || bmval[1] & ~NFSD_WRITEABLE_ATTRS_WORD1 420 || bmval[1] & ~NFSD_WRITEABLE_ATTRS_WORD1
385 || bmval[2] & ~NFSD_WRITEABLE_ATTRS_WORD2) 421 || bmval[2] & ~NFSD_WRITEABLE_ATTRS_WORD2)
@@ -428,7 +464,11 @@ static __be32 nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_
428 /* callback_sec_params4 */ 464 /* callback_sec_params4 */
429 READ_BUF(4); 465 READ_BUF(4);
430 READ32(nr_secflavs); 466 READ32(nr_secflavs);
431 cbs->flavor = (u32)(-1); 467 if (nr_secflavs)
468 cbs->flavor = (u32)(-1);
469 else
470 /* Is this legal? Be generous, take it to mean AUTH_NONE: */
471 cbs->flavor = 0;
432 for (i = 0; i < nr_secflavs; ++i) { 472 for (i = 0; i < nr_secflavs; ++i) {
433 READ_BUF(4); 473 READ_BUF(4);
434 READ32(dummy); 474 READ32(dummy);
@@ -576,7 +616,7 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
576 return status; 616 return status;
577 617
578 status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr, 618 status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr,
579 &create->cr_acl); 619 &create->cr_acl, &create->cr_label);
580 if (status) 620 if (status)
581 goto out; 621 goto out;
582 622
@@ -827,7 +867,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
827 case NFS4_CREATE_UNCHECKED: 867 case NFS4_CREATE_UNCHECKED:
828 case NFS4_CREATE_GUARDED: 868 case NFS4_CREATE_GUARDED:
829 status = nfsd4_decode_fattr(argp, open->op_bmval, 869 status = nfsd4_decode_fattr(argp, open->op_bmval,
830 &open->op_iattr, &open->op_acl); 870 &open->op_iattr, &open->op_acl, &open->op_label);
831 if (status) 871 if (status)
832 goto out; 872 goto out;
833 break; 873 break;
@@ -841,7 +881,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
841 READ_BUF(NFS4_VERIFIER_SIZE); 881 READ_BUF(NFS4_VERIFIER_SIZE);
842 COPYMEM(open->op_verf.data, NFS4_VERIFIER_SIZE); 882 COPYMEM(open->op_verf.data, NFS4_VERIFIER_SIZE);
843 status = nfsd4_decode_fattr(argp, open->op_bmval, 883 status = nfsd4_decode_fattr(argp, open->op_bmval,
844 &open->op_iattr, &open->op_acl); 884 &open->op_iattr, &open->op_acl, &open->op_label);
845 if (status) 885 if (status)
846 goto out; 886 goto out;
847 break; 887 break;
@@ -1063,7 +1103,7 @@ nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *seta
1063 if (status) 1103 if (status)
1064 return status; 1104 return status;
1065 return nfsd4_decode_fattr(argp, setattr->sa_bmval, &setattr->sa_iattr, 1105 return nfsd4_decode_fattr(argp, setattr->sa_bmval, &setattr->sa_iattr,
1066 &setattr->sa_acl); 1106 &setattr->sa_acl, &setattr->sa_label);
1067} 1107}
1068 1108
1069static __be32 1109static __be32
@@ -1567,6 +1607,7 @@ struct nfsd4_minorversion_ops {
1567static struct nfsd4_minorversion_ops nfsd4_minorversion[] = { 1607static struct nfsd4_minorversion_ops nfsd4_minorversion[] = {
1568 [0] = { nfsd4_dec_ops, ARRAY_SIZE(nfsd4_dec_ops) }, 1608 [0] = { nfsd4_dec_ops, ARRAY_SIZE(nfsd4_dec_ops) },
1569 [1] = { nfsd41_dec_ops, ARRAY_SIZE(nfsd41_dec_ops) }, 1609 [1] = { nfsd41_dec_ops, ARRAY_SIZE(nfsd41_dec_ops) },
1610 [2] = { nfsd41_dec_ops, ARRAY_SIZE(nfsd41_dec_ops) },
1570}; 1611};
1571 1612
1572static __be32 1613static __be32
@@ -1953,6 +1994,36 @@ nfsd4_encode_aclname(struct svc_rqst *rqstp, struct nfs4_ace *ace,
1953 FATTR4_WORD0_RDATTR_ERROR) 1994 FATTR4_WORD0_RDATTR_ERROR)
1954#define WORD1_ABSENT_FS_ATTRS FATTR4_WORD1_MOUNTED_ON_FILEID 1995#define WORD1_ABSENT_FS_ATTRS FATTR4_WORD1_MOUNTED_ON_FILEID
1955 1996
1997#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
1998static inline __be32
1999nfsd4_encode_security_label(struct svc_rqst *rqstp, void *context, int len, __be32 **pp, int *buflen)
2000{
2001 __be32 *p = *pp;
2002
2003 if (*buflen < ((XDR_QUADLEN(len) << 2) + 4 + 4 + 4))
2004 return nfserr_resource;
2005
2006 /*
2007 * For now we use a 0 here to indicate the null translation; in
2008 * the future we may place a call to translation code here.
2009 */
2010 if ((*buflen -= 8) < 0)
2011 return nfserr_resource;
2012
2013 WRITE32(0); /* lfs */
2014 WRITE32(0); /* pi */
2015 p = xdr_encode_opaque(p, context, len);
2016 *buflen -= (XDR_QUADLEN(len) << 2) + 4;
2017
2018 *pp = p;
2019 return 0;
2020}
2021#else
2022static inline __be32
2023nfsd4_encode_security_label(struct svc_rqst *rqstp, void *context, int len, __be32 **pp, int *buflen)
2024{ return 0; }
2025#endif
2026
1956static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *rdattr_err) 2027static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *rdattr_err)
1957{ 2028{
1958 /* As per referral draft: */ 2029 /* As per referral draft: */
@@ -2012,6 +2083,9 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
2012 int err; 2083 int err;
2013 int aclsupport = 0; 2084 int aclsupport = 0;
2014 struct nfs4_acl *acl = NULL; 2085 struct nfs4_acl *acl = NULL;
2086 void *context = NULL;
2087 int contextlen;
2088 bool contextsupport = false;
2015 struct nfsd4_compoundres *resp = rqstp->rq_resp; 2089 struct nfsd4_compoundres *resp = rqstp->rq_resp;
2016 u32 minorversion = resp->cstate.minorversion; 2090 u32 minorversion = resp->cstate.minorversion;
2017 struct path path = { 2091 struct path path = {
@@ -2065,6 +2139,21 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
2065 } 2139 }
2066 } 2140 }
2067 2141
2142#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
2143 if ((bmval[2] & FATTR4_WORD2_SECURITY_LABEL) ||
2144 bmval[0] & FATTR4_WORD0_SUPPORTED_ATTRS) {
2145 err = security_inode_getsecctx(dentry->d_inode,
2146 &context, &contextlen);
2147 contextsupport = (err == 0);
2148 if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) {
2149 if (err == -EOPNOTSUPP)
2150 bmval2 &= ~FATTR4_WORD2_SECURITY_LABEL;
2151 else if (err)
2152 goto out_nfserr;
2153 }
2154 }
2155#endif /* CONFIG_NFSD_V4_SECURITY_LABEL */
2156
2068 if (bmval2) { 2157 if (bmval2) {
2069 if ((buflen -= 16) < 0) 2158 if ((buflen -= 16) < 0)
2070 goto out_resource; 2159 goto out_resource;
@@ -2093,6 +2182,8 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
2093 2182
2094 if (!aclsupport) 2183 if (!aclsupport)
2095 word0 &= ~FATTR4_WORD0_ACL; 2184 word0 &= ~FATTR4_WORD0_ACL;
2185 if (!contextsupport)
2186 word2 &= ~FATTR4_WORD2_SECURITY_LABEL;
2096 if (!word2) { 2187 if (!word2) {
2097 if ((buflen -= 12) < 0) 2188 if ((buflen -= 12) < 0)
2098 goto out_resource; 2189 goto out_resource;
@@ -2400,6 +2491,12 @@ out_acl:
2400 get_parent_attributes(exp, &stat); 2491 get_parent_attributes(exp, &stat);
2401 WRITE64(stat.ino); 2492 WRITE64(stat.ino);
2402 } 2493 }
2494 if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) {
2495 status = nfsd4_encode_security_label(rqstp, context,
2496 contextlen, &p, &buflen);
2497 if (status)
2498 goto out;
2499 }
2403 if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) { 2500 if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) {
2404 WRITE32(3); 2501 WRITE32(3);
2405 WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0); 2502 WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0);
@@ -2412,6 +2509,10 @@ out_acl:
2412 status = nfs_ok; 2509 status = nfs_ok;
2413 2510
2414out: 2511out:
2512#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
2513 if (context)
2514 security_release_secctx(context, contextlen);
2515#endif /* CONFIG_NFSD_V4_SECURITY_LABEL */
2415 kfree(acl); 2516 kfree(acl);
2416 if (fhp == &tempfh) 2517 if (fhp == &tempfh)
2417 fh_put(&tempfh); 2518 fh_put(&tempfh);
@@ -3176,16 +3277,18 @@ nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
3176{ 3277{
3177 __be32 *p; 3278 __be32 *p;
3178 3279
3179 RESERVE_SPACE(12); 3280 RESERVE_SPACE(16);
3180 if (nfserr) { 3281 if (nfserr) {
3181 WRITE32(2); 3282 WRITE32(3);
3283 WRITE32(0);
3182 WRITE32(0); 3284 WRITE32(0);
3183 WRITE32(0); 3285 WRITE32(0);
3184 } 3286 }
3185 else { 3287 else {
3186 WRITE32(2); 3288 WRITE32(3);
3187 WRITE32(setattr->sa_bmval[0]); 3289 WRITE32(setattr->sa_bmval[0]);
3188 WRITE32(setattr->sa_bmval[1]); 3290 WRITE32(setattr->sa_bmval[1]);
3291 WRITE32(setattr->sa_bmval[2]);
3189 } 3292 }
3190 ADJUST_ARGS(); 3293 ADJUST_ARGS();
3191 return nfserr; 3294 return nfserr;
@@ -3226,6 +3329,14 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_w
3226 return nfserr; 3329 return nfserr;
3227} 3330}
3228 3331
3332static const u32 nfs4_minimal_spo_must_enforce[2] = {
3333 [1] = 1 << (OP_BIND_CONN_TO_SESSION - 32) |
3334 1 << (OP_EXCHANGE_ID - 32) |
3335 1 << (OP_CREATE_SESSION - 32) |
3336 1 << (OP_DESTROY_SESSION - 32) |
3337 1 << (OP_DESTROY_CLIENTID - 32)
3338};
3339
3229static __be32 3340static __be32
3230nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, 3341nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
3231 struct nfsd4_exchange_id *exid) 3342 struct nfsd4_exchange_id *exid)
@@ -3264,6 +3375,20 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
3264 /* state_protect4_r. Currently only support SP4_NONE */ 3375 /* state_protect4_r. Currently only support SP4_NONE */
3265 BUG_ON(exid->spa_how != SP4_NONE); 3376 BUG_ON(exid->spa_how != SP4_NONE);
3266 WRITE32(exid->spa_how); 3377 WRITE32(exid->spa_how);
3378 switch (exid->spa_how) {
3379 case SP4_NONE:
3380 break;
3381 case SP4_MACH_CRED:
3382 /* spo_must_enforce bitmap: */
3383 WRITE32(2);
3384 WRITE32(nfs4_minimal_spo_must_enforce[0]);
3385 WRITE32(nfs4_minimal_spo_must_enforce[1]);
3386 /* empty spo_must_allow bitmap: */
3387 WRITE32(0);
3388 break;
3389 default:
3390 WARN_ON_ONCE(1);
3391 }
3267 3392
3268 /* The server_owner struct */ 3393 /* The server_owner struct */
3269 WRITE64(minor_id); /* Minor id */ 3394 WRITE64(minor_id); /* Minor id */
@@ -3635,13 +3760,17 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
3635 iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base; 3760 iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base;
3636 BUG_ON(iov->iov_len > PAGE_SIZE); 3761 BUG_ON(iov->iov_len > PAGE_SIZE);
3637 if (nfsd4_has_session(cs)) { 3762 if (nfsd4_has_session(cs)) {
3763 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
3764 struct nfs4_client *clp = cs->session->se_client;
3638 if (cs->status != nfserr_replay_cache) { 3765 if (cs->status != nfserr_replay_cache) {
3639 nfsd4_store_cache_entry(resp); 3766 nfsd4_store_cache_entry(resp);
3640 cs->slot->sl_flags &= ~NFSD4_SLOT_INUSE; 3767 cs->slot->sl_flags &= ~NFSD4_SLOT_INUSE;
3641 } 3768 }
3642 /* Renew the clientid on success and on replay */ 3769 /* Renew the clientid on success and on replay */
3643 put_client_renew(cs->session->se_client); 3770 spin_lock(&nn->client_lock);
3644 nfsd4_put_session(cs->session); 3771 nfsd4_put_session(cs->session);
3772 spin_unlock(&nn->client_lock);
3773 put_client_renew(clp);
3645 } 3774 }
3646 return 1; 3775 return 1;
3647} 3776}
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 07a473fd49bc..2bbd94e51efc 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -24,7 +24,7 @@
24/* 24/*
25 * nfsd version 25 * nfsd version
26 */ 26 */
27#define NFSD_SUPPORTED_MINOR_VERSION 1 27#define NFSD_SUPPORTED_MINOR_VERSION 2
28/* 28/*
29 * Maximum blocksizes supported by daemon under various circumstances. 29 * Maximum blocksizes supported by daemon under various circumstances.
30 */ 30 */
@@ -243,6 +243,12 @@ void nfsd_lockd_shutdown(void);
243#define nfserr_reject_deleg cpu_to_be32(NFS4ERR_REJECT_DELEG) 243#define nfserr_reject_deleg cpu_to_be32(NFS4ERR_REJECT_DELEG)
244#define nfserr_returnconflict cpu_to_be32(NFS4ERR_RETURNCONFLICT) 244#define nfserr_returnconflict cpu_to_be32(NFS4ERR_RETURNCONFLICT)
245#define nfserr_deleg_revoked cpu_to_be32(NFS4ERR_DELEG_REVOKED) 245#define nfserr_deleg_revoked cpu_to_be32(NFS4ERR_DELEG_REVOKED)
246#define nfserr_partner_notsupp cpu_to_be32(NFS4ERR_PARTNER_NOTSUPP)
247#define nfserr_partner_no_auth cpu_to_be32(NFS4ERR_PARTNER_NO_AUTH)
248#define nfserr_metadata_notsupp cpu_to_be32(NFS4ERR_METADATA_NOTSUPP)
249#define nfserr_offload_denied cpu_to_be32(NFS4ERR_OFFLOAD_DENIED)
250#define nfserr_wrong_lfs cpu_to_be32(NFS4ERR_WRONG_LFS)
251#define nfserr_badlabel cpu_to_be32(NFS4ERR_BADLABEL)
246 252
247/* error codes for internal use */ 253/* error codes for internal use */
248/* if a request fails due to kmalloc failure, it gets dropped. 254/* if a request fails due to kmalloc failure, it gets dropped.
@@ -322,6 +328,13 @@ void nfsd_lockd_shutdown(void);
322#define NFSD4_1_SUPPORTED_ATTRS_WORD2 \ 328#define NFSD4_1_SUPPORTED_ATTRS_WORD2 \
323 (NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT) 329 (NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT)
324 330
331#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
332#define NFSD4_2_SUPPORTED_ATTRS_WORD2 \
333 (NFSD4_1_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SECURITY_LABEL)
334#else
335#define NFSD4_2_SUPPORTED_ATTRS_WORD2 0
336#endif
337
325static inline u32 nfsd_suppattrs0(u32 minorversion) 338static inline u32 nfsd_suppattrs0(u32 minorversion)
326{ 339{
327 return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD0 340 return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD0
@@ -336,8 +349,11 @@ static inline u32 nfsd_suppattrs1(u32 minorversion)
336 349
337static inline u32 nfsd_suppattrs2(u32 minorversion) 350static inline u32 nfsd_suppattrs2(u32 minorversion)
338{ 351{
339 return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD2 352 switch (minorversion) {
340 : NFSD4_SUPPORTED_ATTRS_WORD2; 353 default: return NFSD4_2_SUPPORTED_ATTRS_WORD2;
354 case 1: return NFSD4_1_SUPPORTED_ATTRS_WORD2;
355 case 0: return NFSD4_SUPPORTED_ATTRS_WORD2;
356 }
341} 357}
342 358
343/* These will return ERR_INVAL if specified in GETATTR or READDIR. */ 359/* These will return ERR_INVAL if specified in GETATTR or READDIR. */
@@ -350,7 +366,11 @@ static inline u32 nfsd_suppattrs2(u32 minorversion)
350#define NFSD_WRITEABLE_ATTRS_WORD1 \ 366#define NFSD_WRITEABLE_ATTRS_WORD1 \
351 (FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \ 367 (FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \
352 | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET) 368 | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET)
369#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
370#define NFSD_WRITEABLE_ATTRS_WORD2 FATTR4_WORD2_SECURITY_LABEL
371#else
353#define NFSD_WRITEABLE_ATTRS_WORD2 0 372#define NFSD_WRITEABLE_ATTRS_WORD2 0
373#endif
354 374
355#define NFSD_SUPPATTR_EXCLCREAT_WORD0 \ 375#define NFSD_SUPPATTR_EXCLCREAT_WORD0 \
356 NFSD_WRITEABLE_ATTRS_WORD0 376 NFSD_WRITEABLE_ATTRS_WORD0
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 262df5ccbf59..6b9f48ca4c25 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -116,7 +116,7 @@ struct svc_program nfsd_program = {
116 116
117}; 117};
118 118
119u32 nfsd_supported_minorversion; 119u32 nfsd_supported_minorversion = 1;
120 120
121int nfsd_vers(int vers, enum vers_op change) 121int nfsd_vers(int vers, enum vers_op change)
122{ 122{
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 274e2a114e05..424d8f5f2317 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -246,6 +246,7 @@ struct nfs4_client {
246 nfs4_verifier cl_verifier; /* generated by client */ 246 nfs4_verifier cl_verifier; /* generated by client */
247 time_t cl_time; /* time of last lease renewal */ 247 time_t cl_time; /* time of last lease renewal */
248 struct sockaddr_storage cl_addr; /* client ipaddress */ 248 struct sockaddr_storage cl_addr; /* client ipaddress */
249 bool cl_mach_cred; /* SP4_MACH_CRED in force */
249 struct svc_cred cl_cred; /* setclientid principal */ 250 struct svc_cred cl_cred; /* setclientid principal */
250 clientid_t cl_clientid; /* generated by server */ 251 clientid_t cl_clientid; /* generated by server */
251 nfs4_verifier cl_confirm; /* generated by server */ 252 nfs4_verifier cl_confirm; /* generated by server */
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index a6bc8a7423db..8ff6a0019b0b 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -28,6 +28,7 @@
28#include <asm/uaccess.h> 28#include <asm/uaccess.h>
29#include <linux/exportfs.h> 29#include <linux/exportfs.h>
30#include <linux/writeback.h> 30#include <linux/writeback.h>
31#include <linux/security.h>
31 32
32#ifdef CONFIG_NFSD_V3 33#ifdef CONFIG_NFSD_V3
33#include "xdr3.h" 34#include "xdr3.h"
@@ -621,6 +622,33 @@ int nfsd4_is_junction(struct dentry *dentry)
621 return 0; 622 return 0;
622 return 1; 623 return 1;
623} 624}
625#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
626__be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp,
627 struct xdr_netobj *label)
628{
629 __be32 error;
630 int host_error;
631 struct dentry *dentry;
632
633 error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, NFSD_MAY_SATTR);
634 if (error)
635 return error;
636
637 dentry = fhp->fh_dentry;
638
639 mutex_lock(&dentry->d_inode->i_mutex);
640 host_error = security_inode_setsecctx(dentry, label->data, label->len);
641 mutex_unlock(&dentry->d_inode->i_mutex);
642 return nfserrno(host_error);
643}
644#else
645__be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp,
646 struct xdr_netobj *label)
647{
648 return nfserr_notsupp;
649}
650#endif
651
624#endif /* defined(CONFIG_NFSD_V4) */ 652#endif /* defined(CONFIG_NFSD_V4) */
625 653
626#ifdef CONFIG_NFSD_V3 654#ifdef CONFIG_NFSD_V3
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 5b5894159f22..a4be2e389670 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -39,7 +39,6 @@
39typedef int (*nfsd_dirop_t)(struct inode *, struct dentry *, int, int); 39typedef int (*nfsd_dirop_t)(struct inode *, struct dentry *, int, int);
40 40
41/* nfsd/vfs.c */ 41/* nfsd/vfs.c */
42int fh_lock_parent(struct svc_fh *, struct dentry *);
43int nfsd_racache_init(int); 42int nfsd_racache_init(int);
44void nfsd_racache_shutdown(void); 43void nfsd_racache_shutdown(void);
45int nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, 44int nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
@@ -56,6 +55,8 @@ int nfsd_mountpoint(struct dentry *, struct svc_export *);
56__be32 nfsd4_set_nfs4_acl(struct svc_rqst *, struct svc_fh *, 55__be32 nfsd4_set_nfs4_acl(struct svc_rqst *, struct svc_fh *,
57 struct nfs4_acl *); 56 struct nfs4_acl *);
58int nfsd4_get_nfs4_acl(struct svc_rqst *, struct dentry *, struct nfs4_acl **); 57int nfsd4_get_nfs4_acl(struct svc_rqst *, struct dentry *, struct nfs4_acl **);
58__be32 nfsd4_set_nfs4_label(struct svc_rqst *, struct svc_fh *,
59 struct xdr_netobj *);
59#endif /* CONFIG_NFSD_V4 */ 60#endif /* CONFIG_NFSD_V4 */
60__be32 nfsd_create(struct svc_rqst *, struct svc_fh *, 61__be32 nfsd_create(struct svc_rqst *, struct svc_fh *,
61 char *name, int len, struct iattr *attrs, 62 char *name, int len, struct iattr *attrs,
@@ -92,17 +93,13 @@ __be32 nfsd_remove(struct svc_rqst *,
92 struct svc_fh *, char *, int); 93 struct svc_fh *, char *, int);
93__be32 nfsd_unlink(struct svc_rqst *, struct svc_fh *, int type, 94__be32 nfsd_unlink(struct svc_rqst *, struct svc_fh *, int type,
94 char *name, int len); 95 char *name, int len);
95int nfsd_truncate(struct svc_rqst *, struct svc_fh *,
96 unsigned long size);
97__be32 nfsd_readdir(struct svc_rqst *, struct svc_fh *, 96__be32 nfsd_readdir(struct svc_rqst *, struct svc_fh *,
98 loff_t *, struct readdir_cd *, filldir_t); 97 loff_t *, struct readdir_cd *, filldir_t);
99__be32 nfsd_statfs(struct svc_rqst *, struct svc_fh *, 98__be32 nfsd_statfs(struct svc_rqst *, struct svc_fh *,
100 struct kstatfs *, int access); 99 struct kstatfs *, int access);
101 100
102int nfsd_notify_change(struct inode *, struct iattr *);
103__be32 nfsd_permission(struct svc_rqst *, struct svc_export *, 101__be32 nfsd_permission(struct svc_rqst *, struct svc_export *,
104 struct dentry *, int); 102 struct dentry *, int);
105int nfsd_sync_dir(struct dentry *dp);
106 103
107#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) 104#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
108struct posix_acl *nfsd_get_posix_acl(struct svc_fh *, int); 105struct posix_acl *nfsd_get_posix_acl(struct svc_fh *, int);
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 3b271d2092b6..b3ed6446ed8e 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -40,6 +40,7 @@
40#include "state.h" 40#include "state.h"
41#include "nfsd.h" 41#include "nfsd.h"
42 42
43#define NFSD4_MAX_SEC_LABEL_LEN 2048
43#define NFSD4_MAX_TAGLEN 128 44#define NFSD4_MAX_TAGLEN 128
44#define XDR_LEN(n) (((n) + 3) & ~3) 45#define XDR_LEN(n) (((n) + 3) & ~3)
45 46
@@ -118,6 +119,7 @@ struct nfsd4_create {
118 struct iattr cr_iattr; /* request */ 119 struct iattr cr_iattr; /* request */
119 struct nfsd4_change_info cr_cinfo; /* response */ 120 struct nfsd4_change_info cr_cinfo; /* response */
120 struct nfs4_acl *cr_acl; 121 struct nfs4_acl *cr_acl;
122 struct xdr_netobj cr_label;
121}; 123};
122#define cr_linklen u.link.namelen 124#define cr_linklen u.link.namelen
123#define cr_linkname u.link.name 125#define cr_linkname u.link.name
@@ -246,6 +248,7 @@ struct nfsd4_open {
246 struct nfs4_file *op_file; /* used during processing */ 248 struct nfs4_file *op_file; /* used during processing */
247 struct nfs4_ol_stateid *op_stp; /* used during processing */ 249 struct nfs4_ol_stateid *op_stp; /* used during processing */
248 struct nfs4_acl *op_acl; 250 struct nfs4_acl *op_acl;
251 struct xdr_netobj op_label;
249}; 252};
250#define op_iattr iattr 253#define op_iattr iattr
251 254
@@ -330,6 +333,7 @@ struct nfsd4_setattr {
330 u32 sa_bmval[3]; /* request */ 333 u32 sa_bmval[3]; /* request */
331 struct iattr sa_iattr; /* request */ 334 struct iattr sa_iattr; /* request */
332 struct nfs4_acl *sa_acl; 335 struct nfs4_acl *sa_acl;
336 struct xdr_netobj sa_label;
333}; 337};
334 338
335struct nfsd4_setclientid { 339struct nfsd4_setclientid {
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index eed4d7b26249..741fd02e0444 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -398,6 +398,69 @@ nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode,
398} 398}
399 399
400/** 400/**
401 * nilfs_palloc_count_desc_blocks - count descriptor blocks number
402 * @inode: inode of metadata file using this allocator
403 * @desc_blocks: descriptor blocks number [out]
404 */
405static int nilfs_palloc_count_desc_blocks(struct inode *inode,
406 unsigned long *desc_blocks)
407{
408 unsigned long blknum;
409 int ret;
410
411 ret = nilfs_bmap_last_key(NILFS_I(inode)->i_bmap, &blknum);
412 if (likely(!ret))
413 *desc_blocks = DIV_ROUND_UP(
414 blknum, NILFS_MDT(inode)->mi_blocks_per_desc_block);
415 return ret;
416}
417
418/**
419 * nilfs_palloc_mdt_file_can_grow - check potential opportunity for
420 * MDT file growing
421 * @inode: inode of metadata file using this allocator
422 * @desc_blocks: known current descriptor blocks count
423 */
424static inline bool nilfs_palloc_mdt_file_can_grow(struct inode *inode,
425 unsigned long desc_blocks)
426{
427 return (nilfs_palloc_groups_per_desc_block(inode) * desc_blocks) <
428 nilfs_palloc_groups_count(inode);
429}
430
431/**
432 * nilfs_palloc_count_max_entries - count max number of entries that can be
433 * described by descriptor blocks count
434 * @inode: inode of metadata file using this allocator
435 * @nused: current number of used entries
436 * @nmaxp: max number of entries [out]
437 */
438int nilfs_palloc_count_max_entries(struct inode *inode, u64 nused, u64 *nmaxp)
439{
440 unsigned long desc_blocks = 0;
441 u64 entries_per_desc_block, nmax;
442 int err;
443
444 err = nilfs_palloc_count_desc_blocks(inode, &desc_blocks);
445 if (unlikely(err))
446 return err;
447
448 entries_per_desc_block = (u64)nilfs_palloc_entries_per_group(inode) *
449 nilfs_palloc_groups_per_desc_block(inode);
450 nmax = entries_per_desc_block * desc_blocks;
451
452 if (nused == nmax &&
453 nilfs_palloc_mdt_file_can_grow(inode, desc_blocks))
454 nmax += entries_per_desc_block;
455
456 if (nused > nmax)
457 return -ERANGE;
458
459 *nmaxp = nmax;
460 return 0;
461}
462
463/**
401 * nilfs_palloc_prepare_alloc_entry - prepare to allocate a persistent object 464 * nilfs_palloc_prepare_alloc_entry - prepare to allocate a persistent object
402 * @inode: inode of metadata file using this allocator 465 * @inode: inode of metadata file using this allocator
403 * @req: nilfs_palloc_req structure exchanged for the allocation 466 * @req: nilfs_palloc_req structure exchanged for the allocation
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
index fb7238100548..4bd6451b5703 100644
--- a/fs/nilfs2/alloc.h
+++ b/fs/nilfs2/alloc.h
@@ -48,6 +48,8 @@ int nilfs_palloc_get_entry_block(struct inode *, __u64, int,
48void *nilfs_palloc_block_get_entry(const struct inode *, __u64, 48void *nilfs_palloc_block_get_entry(const struct inode *, __u64,
49 const struct buffer_head *, void *); 49 const struct buffer_head *, void *);
50 50
51int nilfs_palloc_count_max_entries(struct inode *, u64, u64 *);
52
51/** 53/**
52 * nilfs_palloc_req - persistent allocator request and reply 54 * nilfs_palloc_req - persistent allocator request and reply
53 * @pr_entry_nr: entry number (vblocknr or inode number) 55 * @pr_entry_nr: entry number (vblocknr or inode number)
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index d8e65bde083c..6548c7851b48 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -160,6 +160,28 @@ int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino,
160} 160}
161 161
162/** 162/**
163 * nilfs_ifile_count_free_inodes - calculate free inodes count
164 * @ifile: ifile inode
165 * @nmaxinodes: current maximum of available inodes count [out]
166 * @nfreeinodes: free inodes count [out]
167 */
168int nilfs_ifile_count_free_inodes(struct inode *ifile,
169 u64 *nmaxinodes, u64 *nfreeinodes)
170{
171 u64 nused;
172 int err;
173
174 *nmaxinodes = 0;
175 *nfreeinodes = 0;
176
177 nused = atomic64_read(&NILFS_I(ifile)->i_root->inodes_count);
178 err = nilfs_palloc_count_max_entries(ifile, nused, nmaxinodes);
179 if (likely(!err))
180 *nfreeinodes = *nmaxinodes - nused;
181 return err;
182}
183
184/**
163 * nilfs_ifile_read - read or get ifile inode 185 * nilfs_ifile_read - read or get ifile inode
164 * @sb: super block instance 186 * @sb: super block instance
165 * @root: root object 187 * @root: root object
diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h
index 59b6f2b51df6..679674d13372 100644
--- a/fs/nilfs2/ifile.h
+++ b/fs/nilfs2/ifile.h
@@ -49,6 +49,8 @@ int nilfs_ifile_create_inode(struct inode *, ino_t *, struct buffer_head **);
49int nilfs_ifile_delete_inode(struct inode *, ino_t); 49int nilfs_ifile_delete_inode(struct inode *, ino_t);
50int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **); 50int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **);
51 51
52int nilfs_ifile_count_free_inodes(struct inode *, u64 *, u64 *);
53
52int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root, 54int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root,
53 size_t inode_size, struct nilfs_inode *raw_inode, 55 size_t inode_size, struct nilfs_inode *raw_inode,
54 struct inode **inodep); 56 struct inode **inodep);
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index bccfec8343c5..b1a5277cfd18 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -54,7 +54,7 @@ void nilfs_inode_add_blocks(struct inode *inode, int n)
54 54
55 inode_add_bytes(inode, (1 << inode->i_blkbits) * n); 55 inode_add_bytes(inode, (1 << inode->i_blkbits) * n);
56 if (root) 56 if (root)
57 atomic_add(n, &root->blocks_count); 57 atomic64_add(n, &root->blocks_count);
58} 58}
59 59
60void nilfs_inode_sub_blocks(struct inode *inode, int n) 60void nilfs_inode_sub_blocks(struct inode *inode, int n)
@@ -63,7 +63,7 @@ void nilfs_inode_sub_blocks(struct inode *inode, int n)
63 63
64 inode_sub_bytes(inode, (1 << inode->i_blkbits) * n); 64 inode_sub_bytes(inode, (1 << inode->i_blkbits) * n);
65 if (root) 65 if (root)
66 atomic_sub(n, &root->blocks_count); 66 atomic64_sub(n, &root->blocks_count);
67} 67}
68 68
69/** 69/**
@@ -369,7 +369,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
369 goto failed_ifile_create_inode; 369 goto failed_ifile_create_inode;
370 /* reference count of i_bh inherits from nilfs_mdt_read_block() */ 370 /* reference count of i_bh inherits from nilfs_mdt_read_block() */
371 371
372 atomic_inc(&root->inodes_count); 372 atomic64_inc(&root->inodes_count);
373 inode_init_owner(inode, dir, mode); 373 inode_init_owner(inode, dir, mode);
374 inode->i_ino = ino; 374 inode->i_ino = ino;
375 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 375 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
@@ -801,7 +801,7 @@ void nilfs_evict_inode(struct inode *inode)
801 801
802 ret = nilfs_ifile_delete_inode(ii->i_root->ifile, inode->i_ino); 802 ret = nilfs_ifile_delete_inode(ii->i_root->ifile, inode->i_ino);
803 if (!ret) 803 if (!ret)
804 atomic_dec(&ii->i_root->inodes_count); 804 atomic64_dec(&ii->i_root->inodes_count);
805 805
806 nilfs_clear_inode(inode); 806 nilfs_clear_inode(inode);
807 807
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index a5752a589932..bd88a7461063 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -835,9 +835,9 @@ static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci)
835 raw_cp->cp_snapshot_list.ssl_next = 0; 835 raw_cp->cp_snapshot_list.ssl_next = 0;
836 raw_cp->cp_snapshot_list.ssl_prev = 0; 836 raw_cp->cp_snapshot_list.ssl_prev = 0;
837 raw_cp->cp_inodes_count = 837 raw_cp->cp_inodes_count =
838 cpu_to_le64(atomic_read(&sci->sc_root->inodes_count)); 838 cpu_to_le64(atomic64_read(&sci->sc_root->inodes_count));
839 raw_cp->cp_blocks_count = 839 raw_cp->cp_blocks_count =
840 cpu_to_le64(atomic_read(&sci->sc_root->blocks_count)); 840 cpu_to_le64(atomic64_read(&sci->sc_root->blocks_count));
841 raw_cp->cp_nblk_inc = 841 raw_cp->cp_nblk_inc =
842 cpu_to_le64(sci->sc_nblk_inc + sci->sc_nblk_this_inc); 842 cpu_to_le64(sci->sc_nblk_inc + sci->sc_nblk_this_inc);
843 raw_cp->cp_create = cpu_to_le64(sci->sc_seg_ctime); 843 raw_cp->cp_create = cpu_to_le64(sci->sc_seg_ctime);
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index c7d1f9f18b09..af3ba0478cdf 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -554,8 +554,10 @@ int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt,
554 if (err) 554 if (err)
555 goto failed_bh; 555 goto failed_bh;
556 556
557 atomic_set(&root->inodes_count, le64_to_cpu(raw_cp->cp_inodes_count)); 557 atomic64_set(&root->inodes_count,
558 atomic_set(&root->blocks_count, le64_to_cpu(raw_cp->cp_blocks_count)); 558 le64_to_cpu(raw_cp->cp_inodes_count));
559 atomic64_set(&root->blocks_count,
560 le64_to_cpu(raw_cp->cp_blocks_count));
559 561
560 nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp); 562 nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp);
561 563
@@ -609,6 +611,7 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
609 unsigned long overhead; 611 unsigned long overhead;
610 unsigned long nrsvblocks; 612 unsigned long nrsvblocks;
611 sector_t nfreeblocks; 613 sector_t nfreeblocks;
614 u64 nmaxinodes, nfreeinodes;
612 int err; 615 int err;
613 616
614 /* 617 /*
@@ -633,14 +636,34 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
633 if (unlikely(err)) 636 if (unlikely(err))
634 return err; 637 return err;
635 638
639 err = nilfs_ifile_count_free_inodes(root->ifile,
640 &nmaxinodes, &nfreeinodes);
641 if (unlikely(err)) {
642 printk(KERN_WARNING
643 "NILFS warning: fail to count free inodes: err %d.\n",
644 err);
645 if (err == -ERANGE) {
646 /*
647 * If nilfs_palloc_count_max_entries() returns
648 * -ERANGE error code then we simply treat
649 * current inodes count as maximum possible and
650 * zero as free inodes value.
651 */
652 nmaxinodes = atomic64_read(&root->inodes_count);
653 nfreeinodes = 0;
654 err = 0;
655 } else
656 return err;
657 }
658
636 buf->f_type = NILFS_SUPER_MAGIC; 659 buf->f_type = NILFS_SUPER_MAGIC;
637 buf->f_bsize = sb->s_blocksize; 660 buf->f_bsize = sb->s_blocksize;
638 buf->f_blocks = blocks - overhead; 661 buf->f_blocks = blocks - overhead;
639 buf->f_bfree = nfreeblocks; 662 buf->f_bfree = nfreeblocks;
640 buf->f_bavail = (buf->f_bfree >= nrsvblocks) ? 663 buf->f_bavail = (buf->f_bfree >= nrsvblocks) ?
641 (buf->f_bfree - nrsvblocks) : 0; 664 (buf->f_bfree - nrsvblocks) : 0;
642 buf->f_files = atomic_read(&root->inodes_count); 665 buf->f_files = nmaxinodes;
643 buf->f_ffree = 0; /* nilfs_count_free_inodes(sb); */ 666 buf->f_ffree = nfreeinodes;
644 buf->f_namelen = NILFS_NAME_LEN; 667 buf->f_namelen = NILFS_NAME_LEN;
645 buf->f_fsid.val[0] = (u32)id; 668 buf->f_fsid.val[0] = (u32)id;
646 buf->f_fsid.val[1] = (u32)(id >> 32); 669 buf->f_fsid.val[1] = (u32)(id >> 32);
@@ -973,7 +996,7 @@ static int nilfs_attach_snapshot(struct super_block *s, __u64 cno,
973 996
974static int nilfs_tree_was_touched(struct dentry *root_dentry) 997static int nilfs_tree_was_touched(struct dentry *root_dentry)
975{ 998{
976 return root_dentry->d_count > 1; 999 return d_count(root_dentry) > 1;
977} 1000}
978 1001
979/** 1002/**
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 41e6a04a561f..94c451ce6d24 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -764,8 +764,8 @@ nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno)
764 new->ifile = NULL; 764 new->ifile = NULL;
765 new->nilfs = nilfs; 765 new->nilfs = nilfs;
766 atomic_set(&new->count, 1); 766 atomic_set(&new->count, 1);
767 atomic_set(&new->inodes_count, 0); 767 atomic64_set(&new->inodes_count, 0);
768 atomic_set(&new->blocks_count, 0); 768 atomic64_set(&new->blocks_count, 0);
769 769
770 rb_link_node(&new->rb_node, parent, p); 770 rb_link_node(&new->rb_node, parent, p);
771 rb_insert_color(&new->rb_node, &nilfs->ns_cptree); 771 rb_insert_color(&new->rb_node, &nilfs->ns_cptree);
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index be1267a34cea..de8cc53b4a5c 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -241,8 +241,8 @@ struct nilfs_root {
241 struct the_nilfs *nilfs; 241 struct the_nilfs *nilfs;
242 struct inode *ifile; 242 struct inode *ifile;
243 243
244 atomic_t inodes_count; 244 atomic64_t inodes_count;
245 atomic_t blocks_count; 245 atomic64_t blocks_count;
246}; 246};
247 247
248/* Special checkpoint number */ 248/* Special checkpoint number */
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 2bfe6dc413a0..1fedd5f7ccc4 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -31,7 +31,6 @@ int dir_notify_enable __read_mostly = 1;
31static struct kmem_cache *dnotify_struct_cache __read_mostly; 31static struct kmem_cache *dnotify_struct_cache __read_mostly;
32static struct kmem_cache *dnotify_mark_cache __read_mostly; 32static struct kmem_cache *dnotify_mark_cache __read_mostly;
33static struct fsnotify_group *dnotify_group __read_mostly; 33static struct fsnotify_group *dnotify_group __read_mostly;
34static DEFINE_MUTEX(dnotify_mark_mutex);
35 34
36/* 35/*
37 * dnotify will attach one of these to each inode (i_fsnotify_marks) which 36 * dnotify will attach one of these to each inode (i_fsnotify_marks) which
@@ -183,7 +182,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
183 return; 182 return;
184 dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark); 183 dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
185 184
186 mutex_lock(&dnotify_mark_mutex); 185 mutex_lock(&dnotify_group->mark_mutex);
187 186
188 spin_lock(&fsn_mark->lock); 187 spin_lock(&fsn_mark->lock);
189 prev = &dn_mark->dn; 188 prev = &dn_mark->dn;
@@ -199,11 +198,12 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
199 198
200 spin_unlock(&fsn_mark->lock); 199 spin_unlock(&fsn_mark->lock);
201 200
202 /* nothing else could have found us thanks to the dnotify_mark_mutex */ 201 /* nothing else could have found us thanks to the dnotify_group's
202 mark_mutex */
203 if (dn_mark->dn == NULL) 203 if (dn_mark->dn == NULL)
204 fsnotify_destroy_mark(fsn_mark, dnotify_group); 204 fsnotify_destroy_mark_locked(fsn_mark, dnotify_group);
205 205
206 mutex_unlock(&dnotify_mark_mutex); 206 mutex_unlock(&dnotify_group->mark_mutex);
207 207
208 fsnotify_put_mark(fsn_mark); 208 fsnotify_put_mark(fsn_mark);
209} 209}
@@ -326,7 +326,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
326 new_dn_mark->dn = NULL; 326 new_dn_mark->dn = NULL;
327 327
328 /* this is needed to prevent the fcntl/close race described below */ 328 /* this is needed to prevent the fcntl/close race described below */
329 mutex_lock(&dnotify_mark_mutex); 329 mutex_lock(&dnotify_group->mark_mutex);
330 330
331 /* add the new_fsn_mark or find an old one. */ 331 /* add the new_fsn_mark or find an old one. */
332 fsn_mark = fsnotify_find_inode_mark(dnotify_group, inode); 332 fsn_mark = fsnotify_find_inode_mark(dnotify_group, inode);
@@ -334,7 +334,8 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
334 dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark); 334 dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
335 spin_lock(&fsn_mark->lock); 335 spin_lock(&fsn_mark->lock);
336 } else { 336 } else {
337 fsnotify_add_mark(new_fsn_mark, dnotify_group, inode, NULL, 0); 337 fsnotify_add_mark_locked(new_fsn_mark, dnotify_group, inode,
338 NULL, 0);
338 spin_lock(&new_fsn_mark->lock); 339 spin_lock(&new_fsn_mark->lock);
339 fsn_mark = new_fsn_mark; 340 fsn_mark = new_fsn_mark;
340 dn_mark = new_dn_mark; 341 dn_mark = new_dn_mark;
@@ -348,9 +349,9 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
348 349
349 /* if (f != filp) means that we lost a race and another task/thread 350 /* if (f != filp) means that we lost a race and another task/thread
350 * actually closed the fd we are still playing with before we grabbed 351 * actually closed the fd we are still playing with before we grabbed
351 * the dnotify_mark_mutex and fsn_mark->lock. Since closing the fd is the 352 * the dnotify_group's mark_mutex and fsn_mark->lock. Since closing the
352 * only time we clean up the marks we need to get our mark off 353 * fd is the only time we clean up the marks we need to get our mark
353 * the list. */ 354 * off the list. */
354 if (f != filp) { 355 if (f != filp) {
355 /* if we added ourselves, shoot ourselves, it's possible that 356 /* if we added ourselves, shoot ourselves, it's possible that
356 * the flush actually did shoot this fsn_mark. That's fine too 357 * the flush actually did shoot this fsn_mark. That's fine too
@@ -385,9 +386,9 @@ out:
385 spin_unlock(&fsn_mark->lock); 386 spin_unlock(&fsn_mark->lock);
386 387
387 if (destroy) 388 if (destroy)
388 fsnotify_destroy_mark(fsn_mark, dnotify_group); 389 fsnotify_destroy_mark_locked(fsn_mark, dnotify_group);
389 390
390 mutex_unlock(&dnotify_mark_mutex); 391 mutex_unlock(&dnotify_group->mark_mutex);
391 fsnotify_put_mark(fsn_mark); 392 fsnotify_put_mark(fsn_mark);
392out_err: 393out_err:
393 if (new_fsn_mark) 394 if (new_fsn_mark)
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 6c80083a984f..e44cb6427df3 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -122,6 +122,7 @@ static int fill_event_metadata(struct fsnotify_group *group,
122 metadata->event_len = FAN_EVENT_METADATA_LEN; 122 metadata->event_len = FAN_EVENT_METADATA_LEN;
123 metadata->metadata_len = FAN_EVENT_METADATA_LEN; 123 metadata->metadata_len = FAN_EVENT_METADATA_LEN;
124 metadata->vers = FANOTIFY_METADATA_VERSION; 124 metadata->vers = FANOTIFY_METADATA_VERSION;
125 metadata->reserved = 0;
125 metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS; 126 metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS;
126 metadata->pid = pid_vnr(event->tgid); 127 metadata->pid = pid_vnr(event->tgid);
127 if (unlikely(event->mask & FAN_Q_OVERFLOW)) 128 if (unlikely(event->mask & FAN_Q_OVERFLOW))
@@ -399,9 +400,6 @@ static int fanotify_release(struct inode *ignored, struct file *file)
399 wake_up(&group->fanotify_data.access_waitq); 400 wake_up(&group->fanotify_data.access_waitq);
400#endif 401#endif
401 402
402 if (file->f_flags & FASYNC)
403 fsnotify_fasync(-1, file, 0);
404
405 /* matches the fanotify_init->fsnotify_alloc_group */ 403 /* matches the fanotify_init->fsnotify_alloc_group */
406 fsnotify_destroy_group(group); 404 fsnotify_destroy_group(group);
407 405
@@ -526,14 +524,18 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
526 __u32 removed; 524 __u32 removed;
527 int destroy_mark; 525 int destroy_mark;
528 526
527 mutex_lock(&group->mark_mutex);
529 fsn_mark = fsnotify_find_vfsmount_mark(group, mnt); 528 fsn_mark = fsnotify_find_vfsmount_mark(group, mnt);
530 if (!fsn_mark) 529 if (!fsn_mark) {
530 mutex_unlock(&group->mark_mutex);
531 return -ENOENT; 531 return -ENOENT;
532 }
532 533
533 removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags, 534 removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
534 &destroy_mark); 535 &destroy_mark);
535 if (destroy_mark) 536 if (destroy_mark)
536 fsnotify_destroy_mark(fsn_mark, group); 537 fsnotify_destroy_mark_locked(fsn_mark, group);
538 mutex_unlock(&group->mark_mutex);
537 539
538 fsnotify_put_mark(fsn_mark); 540 fsnotify_put_mark(fsn_mark);
539 if (removed & real_mount(mnt)->mnt_fsnotify_mask) 541 if (removed & real_mount(mnt)->mnt_fsnotify_mask)
@@ -550,14 +552,19 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group,
550 __u32 removed; 552 __u32 removed;
551 int destroy_mark; 553 int destroy_mark;
552 554
555 mutex_lock(&group->mark_mutex);
553 fsn_mark = fsnotify_find_inode_mark(group, inode); 556 fsn_mark = fsnotify_find_inode_mark(group, inode);
554 if (!fsn_mark) 557 if (!fsn_mark) {
558 mutex_unlock(&group->mark_mutex);
555 return -ENOENT; 559 return -ENOENT;
560 }
556 561
557 removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags, 562 removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
558 &destroy_mark); 563 &destroy_mark);
559 if (destroy_mark) 564 if (destroy_mark)
560 fsnotify_destroy_mark(fsn_mark, group); 565 fsnotify_destroy_mark_locked(fsn_mark, group);
566 mutex_unlock(&group->mark_mutex);
567
561 /* matches the fsnotify_find_inode_mark() */ 568 /* matches the fsnotify_find_inode_mark() */
562 fsnotify_put_mark(fsn_mark); 569 fsnotify_put_mark(fsn_mark);
563 if (removed & inode->i_fsnotify_mask) 570 if (removed & inode->i_fsnotify_mask)
@@ -593,35 +600,55 @@ static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
593 return mask & ~oldmask; 600 return mask & ~oldmask;
594} 601}
595 602
603static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
604 struct inode *inode,
605 struct vfsmount *mnt)
606{
607 struct fsnotify_mark *mark;
608 int ret;
609
610 if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
611 return ERR_PTR(-ENOSPC);
612
613 mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
614 if (!mark)
615 return ERR_PTR(-ENOMEM);
616
617 fsnotify_init_mark(mark, fanotify_free_mark);
618 ret = fsnotify_add_mark_locked(mark, group, inode, mnt, 0);
619 if (ret) {
620 fsnotify_put_mark(mark);
621 return ERR_PTR(ret);
622 }
623
624 return mark;
625}
626
627
596static int fanotify_add_vfsmount_mark(struct fsnotify_group *group, 628static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
597 struct vfsmount *mnt, __u32 mask, 629 struct vfsmount *mnt, __u32 mask,
598 unsigned int flags) 630 unsigned int flags)
599{ 631{
600 struct fsnotify_mark *fsn_mark; 632 struct fsnotify_mark *fsn_mark;
601 __u32 added; 633 __u32 added;
602 int ret = 0;
603 634
635 mutex_lock(&group->mark_mutex);
604 fsn_mark = fsnotify_find_vfsmount_mark(group, mnt); 636 fsn_mark = fsnotify_find_vfsmount_mark(group, mnt);
605 if (!fsn_mark) { 637 if (!fsn_mark) {
606 if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks) 638 fsn_mark = fanotify_add_new_mark(group, NULL, mnt);
607 return -ENOSPC; 639 if (IS_ERR(fsn_mark)) {
608 640 mutex_unlock(&group->mark_mutex);
609 fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); 641 return PTR_ERR(fsn_mark);
610 if (!fsn_mark) 642 }
611 return -ENOMEM;
612
613 fsnotify_init_mark(fsn_mark, fanotify_free_mark);
614 ret = fsnotify_add_mark(fsn_mark, group, NULL, mnt, 0);
615 if (ret)
616 goto err;
617 } 643 }
618 added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); 644 added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
645 mutex_unlock(&group->mark_mutex);
619 646
620 if (added & ~real_mount(mnt)->mnt_fsnotify_mask) 647 if (added & ~real_mount(mnt)->mnt_fsnotify_mask)
621 fsnotify_recalc_vfsmount_mask(mnt); 648 fsnotify_recalc_vfsmount_mask(mnt);
622err: 649
623 fsnotify_put_mark(fsn_mark); 650 fsnotify_put_mark(fsn_mark);
624 return ret; 651 return 0;
625} 652}
626 653
627static int fanotify_add_inode_mark(struct fsnotify_group *group, 654static int fanotify_add_inode_mark(struct fsnotify_group *group,
@@ -630,7 +657,6 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
630{ 657{
631 struct fsnotify_mark *fsn_mark; 658 struct fsnotify_mark *fsn_mark;
632 __u32 added; 659 __u32 added;
633 int ret = 0;
634 660
635 pr_debug("%s: group=%p inode=%p\n", __func__, group, inode); 661 pr_debug("%s: group=%p inode=%p\n", __func__, group, inode);
636 662
@@ -644,27 +670,23 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
644 (atomic_read(&inode->i_writecount) > 0)) 670 (atomic_read(&inode->i_writecount) > 0))
645 return 0; 671 return 0;
646 672
673 mutex_lock(&group->mark_mutex);
647 fsn_mark = fsnotify_find_inode_mark(group, inode); 674 fsn_mark = fsnotify_find_inode_mark(group, inode);
648 if (!fsn_mark) { 675 if (!fsn_mark) {
649 if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks) 676 fsn_mark = fanotify_add_new_mark(group, inode, NULL);
650 return -ENOSPC; 677 if (IS_ERR(fsn_mark)) {
651 678 mutex_unlock(&group->mark_mutex);
652 fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); 679 return PTR_ERR(fsn_mark);
653 if (!fsn_mark) 680 }
654 return -ENOMEM;
655
656 fsnotify_init_mark(fsn_mark, fanotify_free_mark);
657 ret = fsnotify_add_mark(fsn_mark, group, inode, NULL, 0);
658 if (ret)
659 goto err;
660 } 681 }
661 added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); 682 added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
683 mutex_unlock(&group->mark_mutex);
662 684
663 if (added & ~inode->i_fsnotify_mask) 685 if (added & ~inode->i_fsnotify_mask)
664 fsnotify_recalc_inode_mask(inode); 686 fsnotify_recalc_inode_mask(inode);
665err: 687
666 fsnotify_put_mark(fsn_mark); 688 fsnotify_put_mark(fsn_mark);
667 return ret; 689 return 0;
668} 690}
669 691
670/* fanotify syscalls */ 692/* fanotify syscalls */
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 959815c1e017..60f954a891ab 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -636,7 +636,8 @@ static int inotify_new_watch(struct fsnotify_group *group,
636 goto out_err; 636 goto out_err;
637 637
638 /* we are on the idr, now get on the inode */ 638 /* we are on the idr, now get on the inode */
639 ret = fsnotify_add_mark(&tmp_i_mark->fsn_mark, group, inode, NULL, 0); 639 ret = fsnotify_add_mark_locked(&tmp_i_mark->fsn_mark, group, inode,
640 NULL, 0);
640 if (ret) { 641 if (ret) {
641 /* we failed to get on the inode, get off the idr */ 642 /* we failed to get on the inode, get off the idr */
642 inotify_remove_from_idr(group, tmp_i_mark); 643 inotify_remove_from_idr(group, tmp_i_mark);
@@ -660,19 +661,13 @@ static int inotify_update_watch(struct fsnotify_group *group, struct inode *inod
660{ 661{
661 int ret = 0; 662 int ret = 0;
662 663
663retry: 664 mutex_lock(&group->mark_mutex);
664 /* try to update an existing watch with the new arg */ 665 /* try to update an existing watch with the new arg */
665 ret = inotify_update_existing_watch(group, inode, arg); 666 ret = inotify_update_existing_watch(group, inode, arg);
666 /* no mark present, try to add a new one */ 667 /* no mark present, try to add a new one */
667 if (ret == -ENOENT) 668 if (ret == -ENOENT)
668 ret = inotify_new_watch(group, inode, arg); 669 ret = inotify_new_watch(group, inode, arg);
669 /* 670 mutex_unlock(&group->mark_mutex);
670 * inotify_new_watch could race with another thread which did an
671 * inotify_new_watch between the update_existing and the add watch
672 * here, go back and try to update an existing mark again.
673 */
674 if (ret == -EEXIST)
675 goto retry;
676 671
677 return ret; 672 return ret;
678} 673}
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index fc6b49bf7360..923fe4a5f503 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -20,28 +20,29 @@
20 * fsnotify inode mark locking/lifetime/and refcnting 20 * fsnotify inode mark locking/lifetime/and refcnting
21 * 21 *
22 * REFCNT: 22 * REFCNT:
23 * The mark->refcnt tells how many "things" in the kernel currently are 23 * The group->refcnt and mark->refcnt tell how many "things" in the kernel
24 * referencing this object. The object typically will live inside the kernel 24 * currently are referencing the objects. Both kinds of objects typically will
25 * with a refcnt of 2, one for each list it is on (i_list, g_list). Any task 25 * live inside the kernel with a refcnt of 2, one for its creation and one for
26 * which can find this object holding the appropriete locks, can take a reference 26 * the reference a group and a mark hold to each other.
27 * and the object itself is guaranteed to survive until the reference is dropped. 27 * If you are holding the appropriate locks, you can take a reference and the
28 * object itself is guaranteed to survive until the reference is dropped.
28 * 29 *
29 * LOCKING: 30 * LOCKING:
30 * There are 3 spinlocks involved with fsnotify inode marks and they MUST 31 * There are 3 locks involved with fsnotify inode marks and they MUST be taken
31 * be taken in order as follows: 32 * in order as follows:
32 * 33 *
34 * group->mark_mutex
33 * mark->lock 35 * mark->lock
34 * group->mark_lock
35 * inode->i_lock 36 * inode->i_lock
36 * 37 *
37 * mark->lock protects 2 things, mark->group and mark->inode. You must hold 38 * group->mark_mutex protects the marks_list anchored inside a given group and
38 * that lock to dereference either of these things (they could be NULL even with 39 * each mark is hooked via the g_list. It also protects the group's private
39 * the lock) 40 * data (i.e. group limits).
40 * 41
41 * group->mark_lock protects the marks_list anchored inside a given group 42 * mark->lock protects the mark's attributes like its masks and flags.
42 * and each mark is hooked via the g_list. It also sorta protects the 43 * Furthermore it protects the access to a reference of the group that the mark
43 * free_g_list, which when used is anchored by a private list on the stack of the 44 * is assigned to as well as the access to a reference of the inode/vfsmount
44 * task which held the group->mark_lock. 45 * that is being watched by the mark.
45 * 46 *
46 * inode->i_lock protects the i_fsnotify_marks list anchored inside a 47 * inode->i_lock protects the i_fsnotify_marks list anchored inside a
47 * given inode and each mark is hooked via the i_list. (and sorta the 48 * given inode and each mark is hooked via the i_list. (and sorta the
@@ -64,18 +65,11 @@
64 * inode. We take i_lock and walk the i_fsnotify_marks safely. For each 65 * inode. We take i_lock and walk the i_fsnotify_marks safely. For each
65 * mark on the list we take a reference (so the mark can't disappear under us). 66 * mark on the list we take a reference (so the mark can't disappear under us).
66 * We remove that mark from the inode's list of marks and we add this mark to a 67 * We remove that mark from the inode's list of marks and we add this mark to a
67 * private list anchored on the stack using i_free_list; At this point we no 68 * private list anchored on the stack using i_free_list; we walk i_free_list
68 * longer fear anything finding the mark using the inode's list of marks. 69 * and before we destroy the mark we make sure that we don't race with a
69 * 70 * concurrent destroy_group by getting a ref to the mark's group and taking the
70 * We can safely and locklessly run the private list on the stack of everything 71 * group's mutex.
71 * we just unattached from the original inode. For each mark on the private list 72
72 * we grab the mark-> and can thus dereference mark->group and mark->inode. If
73 * we see the group and inode are not NULL we take those locks. Now holding all
74 * 3 locks we can completely remove the mark from other tasks finding it in the
75 * future. Remember, 10 things might already be referencing this mark, but they
76 * better be holding a ref. We drop our reference we took before we unhooked it
77 * from the inode. When the ref hits 0 we can free the mark.
78 *
79 * Very similarly for freeing by group, except we use free_g_list. 73 * Very similarly for freeing by group, except we use free_g_list.
80 * 74 *
81 * This has the very interesting property of being able to run concurrently with 75 * This has the very interesting property of being able to run concurrently with
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index b8a9d87231b1..17e6bdde96c5 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5655,7 +5655,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
5655 &ref_tree, NULL); 5655 &ref_tree, NULL);
5656 if (ret) { 5656 if (ret) {
5657 mlog_errno(ret); 5657 mlog_errno(ret);
5658 goto out; 5658 goto bail;
5659 } 5659 }
5660 5660
5661 ret = ocfs2_prepare_refcount_change_for_del(inode, 5661 ret = ocfs2_prepare_refcount_change_for_del(inode,
@@ -5666,7 +5666,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
5666 &extra_blocks); 5666 &extra_blocks);
5667 if (ret < 0) { 5667 if (ret < 0) {
5668 mlog_errno(ret); 5668 mlog_errno(ret);
5669 goto out; 5669 goto bail;
5670 } 5670 }
5671 } 5671 }
5672 5672
@@ -5674,7 +5674,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
5674 extra_blocks); 5674 extra_blocks);
5675 if (ret) { 5675 if (ret) {
5676 mlog_errno(ret); 5676 mlog_errno(ret);
5677 return ret; 5677 goto bail;
5678 } 5678 }
5679 5679
5680 mutex_lock(&tl_inode->i_mutex); 5680 mutex_lock(&tl_inode->i_mutex);
@@ -5734,7 +5734,7 @@ out_commit:
5734 ocfs2_commit_trans(osb, handle); 5734 ocfs2_commit_trans(osb, handle);
5735out: 5735out:
5736 mutex_unlock(&tl_inode->i_mutex); 5736 mutex_unlock(&tl_inode->i_mutex);
5737 5737bail:
5738 if (meta_ac) 5738 if (meta_ac)
5739 ocfs2_free_alloc_context(meta_ac); 5739 ocfs2_free_alloc_context(meta_ac);
5740 5740
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 42252bf64b51..5c1c864e81cc 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -176,7 +176,7 @@ static void o2hb_dead_threshold_set(unsigned int threshold)
176 } 176 }
177} 177}
178 178
179static int o2hb_global_hearbeat_mode_set(unsigned int hb_mode) 179static int o2hb_global_heartbeat_mode_set(unsigned int hb_mode)
180{ 180{
181 int ret = -1; 181 int ret = -1;
182 182
@@ -500,7 +500,7 @@ static int o2hb_issue_node_write(struct o2hb_region *reg,
500 } 500 }
501 501
502 atomic_inc(&write_wc->wc_num_reqs); 502 atomic_inc(&write_wc->wc_num_reqs);
503 submit_bio(WRITE, bio); 503 submit_bio(WRITE_SYNC, bio);
504 504
505 status = 0; 505 status = 0;
506bail: 506bail:
@@ -2271,7 +2271,7 @@ ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group,
2271 if (strnicmp(page, o2hb_heartbeat_mode_desc[i], len)) 2271 if (strnicmp(page, o2hb_heartbeat_mode_desc[i], len))
2272 continue; 2272 continue;
2273 2273
2274 ret = o2hb_global_hearbeat_mode_set(i); 2274 ret = o2hb_global_heartbeat_mode_set(i);
2275 if (!ret) 2275 if (!ret)
2276 printk(KERN_NOTICE "o2hb: Heartbeat mode set to %s\n", 2276 printk(KERN_NOTICE "o2hb: Heartbeat mode set to %s\n",
2277 o2hb_heartbeat_mode_desc[i]); 2277 o2hb_heartbeat_mode_desc[i]);
@@ -2304,7 +2304,7 @@ static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = {
2304 NULL, 2304 NULL,
2305}; 2305};
2306 2306
2307static struct configfs_item_operations o2hb_hearbeat_group_item_ops = { 2307static struct configfs_item_operations o2hb_heartbeat_group_item_ops = {
2308 .show_attribute = o2hb_heartbeat_group_show, 2308 .show_attribute = o2hb_heartbeat_group_show,
2309 .store_attribute = o2hb_heartbeat_group_store, 2309 .store_attribute = o2hb_heartbeat_group_store,
2310}; 2310};
@@ -2316,7 +2316,7 @@ static struct configfs_group_operations o2hb_heartbeat_group_group_ops = {
2316 2316
2317static struct config_item_type o2hb_heartbeat_group_type = { 2317static struct config_item_type o2hb_heartbeat_group_type = {
2318 .ct_group_ops = &o2hb_heartbeat_group_group_ops, 2318 .ct_group_ops = &o2hb_heartbeat_group_group_ops,
2319 .ct_item_ops = &o2hb_hearbeat_group_item_ops, 2319 .ct_item_ops = &o2hb_heartbeat_group_item_ops,
2320 .ct_attrs = o2hb_heartbeat_group_attrs, 2320 .ct_attrs = o2hb_heartbeat_group_attrs,
2321 .ct_owner = THIS_MODULE, 2321 .ct_owner = THIS_MODULE,
2322}; 2322};
@@ -2389,6 +2389,9 @@ static int o2hb_region_pin(const char *region_uuid)
2389 assert_spin_locked(&o2hb_live_lock); 2389 assert_spin_locked(&o2hb_live_lock);
2390 2390
2391 list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) { 2391 list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
2392 if (reg->hr_item_dropped)
2393 continue;
2394
2392 uuid = config_item_name(&reg->hr_item); 2395 uuid = config_item_name(&reg->hr_item);
2393 2396
2394 /* local heartbeat */ 2397 /* local heartbeat */
@@ -2439,6 +2442,9 @@ static void o2hb_region_unpin(const char *region_uuid)
2439 assert_spin_locked(&o2hb_live_lock); 2442 assert_spin_locked(&o2hb_live_lock);
2440 2443
2441 list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) { 2444 list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
2445 if (reg->hr_item_dropped)
2446 continue;
2447
2442 uuid = config_item_name(&reg->hr_item); 2448 uuid = config_item_name(&reg->hr_item);
2443 if (region_uuid) { 2449 if (region_uuid) {
2444 if (strcmp(region_uuid, uuid)) 2450 if (strcmp(region_uuid, uuid))
@@ -2654,6 +2660,9 @@ int o2hb_get_all_regions(char *region_uuids, u8 max_regions)
2654 2660
2655 p = region_uuids; 2661 p = region_uuids;
2656 list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) { 2662 list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
2663 if (reg->hr_item_dropped)
2664 continue;
2665
2657 mlog(0, "Region: %s\n", config_item_name(&reg->hr_item)); 2666 mlog(0, "Region: %s\n", config_item_name(&reg->hr_item));
2658 if (numregs < max_regions) { 2667 if (numregs < max_regions) {
2659 memcpy(p, config_item_name(&reg->hr_item), 2668 memcpy(p, config_item_name(&reg->hr_item),
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index c19897d0fe14..1ec141e758d7 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -264,7 +264,7 @@ void o2quo_hb_still_up(u8 node)
264/* This is analogous to hb_up. as a node's connection comes up we delay the 264/* This is analogous to hb_up. as a node's connection comes up we delay the
265 * quorum decision until we see it heartbeating. the hold will be dropped in 265 * quorum decision until we see it heartbeating. the hold will be dropped in
266 * hb_up or hb_down. it might be perpetuated by con_err until hb_down. if 266 * hb_up or hb_down. it might be perpetuated by con_err until hb_down. if
267 * it's already heartbeating we we might be dropping a hold that conn_up got. 267 * it's already heartbeating we might be dropping a hold that conn_up got.
268 * */ 268 * */
269void o2quo_conn_up(u8 node) 269void o2quo_conn_up(u8 node)
270{ 270{
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index aa88bd8bcedc..d644dc611425 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -406,6 +406,9 @@ static void sc_kref_release(struct kref *kref)
406 sc->sc_node = NULL; 406 sc->sc_node = NULL;
407 407
408 o2net_debug_del_sc(sc); 408 o2net_debug_del_sc(sc);
409
410 if (sc->sc_page)
411 __free_page(sc->sc_page);
409 kfree(sc); 412 kfree(sc);
410} 413}
411 414
@@ -630,19 +633,19 @@ static void o2net_state_change(struct sock *sk)
630 state_change = sc->sc_state_change; 633 state_change = sc->sc_state_change;
631 634
632 switch(sk->sk_state) { 635 switch(sk->sk_state) {
633 /* ignore connecting sockets as they make progress */ 636 /* ignore connecting sockets as they make progress */
634 case TCP_SYN_SENT: 637 case TCP_SYN_SENT:
635 case TCP_SYN_RECV: 638 case TCP_SYN_RECV:
636 break; 639 break;
637 case TCP_ESTABLISHED: 640 case TCP_ESTABLISHED:
638 o2net_sc_queue_work(sc, &sc->sc_connect_work); 641 o2net_sc_queue_work(sc, &sc->sc_connect_work);
639 break; 642 break;
640 default: 643 default:
641 printk(KERN_INFO "o2net: Connection to " SC_NODEF_FMT 644 printk(KERN_INFO "o2net: Connection to " SC_NODEF_FMT
642 " shutdown, state %d\n", 645 " shutdown, state %d\n",
643 SC_NODEF_ARGS(sc), sk->sk_state); 646 SC_NODEF_ARGS(sc), sk->sk_state);
644 o2net_sc_queue_work(sc, &sc->sc_shutdown_work); 647 o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
645 break; 648 break;
646 } 649 }
647out: 650out:
648 read_unlock(&sk->sk_callback_lock); 651 read_unlock(&sk->sk_callback_lock);
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 975810b98492..47e67c2d228f 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -178,6 +178,7 @@ static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm,
178 lock->ml.node); 178 lock->ml.node);
179 } 179 }
180 } else { 180 } else {
181 status = DLM_NORMAL;
181 dlm_lock_get(lock); 182 dlm_lock_get(lock);
182 list_add_tail(&lock->list, &res->blocked); 183 list_add_tail(&lock->list, &res->blocked);
183 kick_thread = 1; 184 kick_thread = 1;
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index e68588e6b1e8..773bd32bfd8c 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -55,9 +55,6 @@
55static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node); 55static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node);
56 56
57static int dlm_recovery_thread(void *data); 57static int dlm_recovery_thread(void *data);
58void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
59int dlm_launch_recovery_thread(struct dlm_ctxt *dlm);
60void dlm_kick_recovery_thread(struct dlm_ctxt *dlm);
61static int dlm_do_recovery(struct dlm_ctxt *dlm); 58static int dlm_do_recovery(struct dlm_ctxt *dlm);
62 59
63static int dlm_pick_recovery_master(struct dlm_ctxt *dlm); 60static int dlm_pick_recovery_master(struct dlm_ctxt *dlm);
@@ -789,7 +786,7 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
789 u8 dead_node) 786 u8 dead_node)
790{ 787{
791 struct dlm_lock_request lr; 788 struct dlm_lock_request lr;
792 enum dlm_status ret; 789 int ret;
793 790
794 mlog(0, "\n"); 791 mlog(0, "\n");
795 792
@@ -802,7 +799,6 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
802 lr.dead_node = dead_node; 799 lr.dead_node = dead_node;
803 800
804 // send message 801 // send message
805 ret = DLM_NOLOCKMGR;
806 ret = o2net_send_message(DLM_LOCK_REQUEST_MSG, dlm->key, 802 ret = o2net_send_message(DLM_LOCK_REQUEST_MSG, dlm->key,
807 &lr, sizeof(lr), request_from, NULL); 803 &lr, sizeof(lr), request_from, NULL);
808 804
@@ -2696,6 +2692,7 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data,
2696 dlm->name, br->node_idx, br->dead_node, 2692 dlm->name, br->node_idx, br->dead_node,
2697 dlm->reco.dead_node, dlm->reco.new_master); 2693 dlm->reco.dead_node, dlm->reco.new_master);
2698 spin_unlock(&dlm->spinlock); 2694 spin_unlock(&dlm->spinlock);
2695 dlm_put(dlm);
2699 return -EAGAIN; 2696 return -EAGAIN;
2700 } 2697 }
2701 spin_unlock(&dlm->spinlock); 2698 spin_unlock(&dlm->spinlock);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 8a38714f1d92..41000f223ca4 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2646,17 +2646,7 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence)
2646 goto out; 2646 goto out;
2647 } 2647 }
2648 2648
2649 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) 2649 offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
2650 ret = -EINVAL;
2651 if (!ret && offset > inode->i_sb->s_maxbytes)
2652 ret = -EINVAL;
2653 if (ret)
2654 goto out;
2655
2656 if (offset != file->f_pos) {
2657 file->f_pos = offset;
2658 file->f_version = 0;
2659 }
2660 2650
2661out: 2651out:
2662 mutex_unlock(&inode->i_mutex); 2652 mutex_unlock(&inode->i_mutex);
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index a3385b63ff5e..96f9ac237e86 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -200,7 +200,6 @@ void ocfs2_complete_quota_recovery(struct ocfs2_super *osb);
200 200
201static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb) 201static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb)
202{ 202{
203 atomic_set(&osb->needs_checkpoint, 1);
204 wake_up(&osb->checkpoint_event); 203 wake_up(&osb->checkpoint_event);
205} 204}
206 205
@@ -538,7 +537,7 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb,
538 extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth); 537 extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth);
539 538
540 return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks + 539 return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks +
541 ocfs2_quota_trans_credits(sb); 540 ocfs2_quota_trans_credits(sb) + bits_wanted;
542} 541}
543 542
544static inline int ocfs2_calc_symlink_credits(struct super_block *sb) 543static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index b4a5cdf9dbc5..be3f8676a438 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -522,7 +522,7 @@ static int __ocfs2_mknod_locked(struct inode *dir,
522 522
523 fe->i_last_eb_blk = 0; 523 fe->i_last_eb_blk = 0;
524 strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE); 524 strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE);
525 le32_add_cpu(&fe->i_flags, OCFS2_VALID_FL); 525 fe->i_flags |= cpu_to_le32(OCFS2_VALID_FL);
526 fe->i_atime = fe->i_ctime = fe->i_mtime = 526 fe->i_atime = fe->i_ctime = fe->i_mtime =
527 cpu_to_le64(CURRENT_TIME.tv_sec); 527 cpu_to_le64(CURRENT_TIME.tv_sec);
528 fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec = 528 fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec =
@@ -773,7 +773,7 @@ static int ocfs2_remote_dentry_delete(struct dentry *dentry)
773 return ret; 773 return ret;
774} 774}
775 775
776static inline int inode_is_unlinkable(struct inode *inode) 776static inline int ocfs2_inode_is_unlinkable(struct inode *inode)
777{ 777{
778 if (S_ISDIR(inode->i_mode)) { 778 if (S_ISDIR(inode->i_mode)) {
779 if (inode->i_nlink == 2) 779 if (inode->i_nlink == 2)
@@ -791,6 +791,7 @@ static int ocfs2_unlink(struct inode *dir,
791{ 791{
792 int status; 792 int status;
793 int child_locked = 0; 793 int child_locked = 0;
794 bool is_unlinkable = false;
794 struct inode *inode = dentry->d_inode; 795 struct inode *inode = dentry->d_inode;
795 struct inode *orphan_dir = NULL; 796 struct inode *orphan_dir = NULL;
796 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); 797 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
@@ -865,7 +866,7 @@ static int ocfs2_unlink(struct inode *dir,
865 goto leave; 866 goto leave;
866 } 867 }
867 868
868 if (inode_is_unlinkable(inode)) { 869 if (ocfs2_inode_is_unlinkable(inode)) {
869 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, 870 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
870 OCFS2_I(inode)->ip_blkno, 871 OCFS2_I(inode)->ip_blkno,
871 orphan_name, &orphan_insert); 872 orphan_name, &orphan_insert);
@@ -873,6 +874,7 @@ static int ocfs2_unlink(struct inode *dir,
873 mlog_errno(status); 874 mlog_errno(status);
874 goto leave; 875 goto leave;
875 } 876 }
877 is_unlinkable = true;
876 } 878 }
877 879
878 handle = ocfs2_start_trans(osb, ocfs2_unlink_credits(osb->sb)); 880 handle = ocfs2_start_trans(osb, ocfs2_unlink_credits(osb->sb));
@@ -892,15 +894,6 @@ static int ocfs2_unlink(struct inode *dir,
892 894
893 fe = (struct ocfs2_dinode *) fe_bh->b_data; 895 fe = (struct ocfs2_dinode *) fe_bh->b_data;
894 896
895 if (inode_is_unlinkable(inode)) {
896 status = ocfs2_orphan_add(osb, handle, inode, fe_bh, orphan_name,
897 &orphan_insert, orphan_dir);
898 if (status < 0) {
899 mlog_errno(status);
900 goto leave;
901 }
902 }
903
904 /* delete the name from the parent dir */ 897 /* delete the name from the parent dir */
905 status = ocfs2_delete_entry(handle, dir, &lookup); 898 status = ocfs2_delete_entry(handle, dir, &lookup);
906 if (status < 0) { 899 if (status < 0) {
@@ -923,6 +916,14 @@ static int ocfs2_unlink(struct inode *dir,
923 mlog_errno(status); 916 mlog_errno(status);
924 if (S_ISDIR(inode->i_mode)) 917 if (S_ISDIR(inode->i_mode))
925 inc_nlink(dir); 918 inc_nlink(dir);
919 goto leave;
920 }
921
922 if (is_unlinkable) {
923 status = ocfs2_orphan_add(osb, handle, inode, fe_bh,
924 orphan_name, &orphan_insert, orphan_dir);
925 if (status < 0)
926 mlog_errno(status);
926 } 927 }
927 928
928leave: 929leave:
@@ -2012,6 +2013,21 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
2012 goto leave; 2013 goto leave;
2013 } 2014 }
2014 2015
2016 /*
2017 * We're going to journal the change of i_flags and i_orphaned_slot.
2018 * It's safe anyway, though some callers may duplicate the journaling.
2019 * Journaling within the func just make the logic look more
2020 * straightforward.
2021 */
2022 status = ocfs2_journal_access_di(handle,
2023 INODE_CACHE(inode),
2024 fe_bh,
2025 OCFS2_JOURNAL_ACCESS_WRITE);
2026 if (status < 0) {
2027 mlog_errno(status);
2028 goto leave;
2029 }
2030
2015 /* we're a cluster, and nlink can change on disk from 2031 /* we're a cluster, and nlink can change on disk from
2016 * underneath us... */ 2032 * underneath us... */
2017 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; 2033 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
@@ -2026,25 +2042,10 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
2026 orphan_dir_bh, lookup); 2042 orphan_dir_bh, lookup);
2027 if (status < 0) { 2043 if (status < 0) {
2028 mlog_errno(status); 2044 mlog_errno(status);
2029 goto leave; 2045 goto rollback;
2030 }
2031
2032 /*
2033 * We're going to journal the change of i_flags and i_orphaned_slot.
2034 * It's safe anyway, though some callers may duplicate the journaling.
2035 * Journaling within the func just make the logic look more
2036 * straightforward.
2037 */
2038 status = ocfs2_journal_access_di(handle,
2039 INODE_CACHE(inode),
2040 fe_bh,
2041 OCFS2_JOURNAL_ACCESS_WRITE);
2042 if (status < 0) {
2043 mlog_errno(status);
2044 goto leave;
2045 } 2046 }
2046 2047
2047 le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL); 2048 fe->i_flags |= cpu_to_le32(OCFS2_ORPHANED_FL);
2048 OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR; 2049 OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR;
2049 2050
2050 /* Record which orphan dir our inode now resides 2051 /* Record which orphan dir our inode now resides
@@ -2057,11 +2058,16 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
2057 trace_ocfs2_orphan_add_end((unsigned long long)OCFS2_I(inode)->ip_blkno, 2058 trace_ocfs2_orphan_add_end((unsigned long long)OCFS2_I(inode)->ip_blkno,
2058 osb->slot_num); 2059 osb->slot_num);
2059 2060
2061rollback:
2062 if (status < 0) {
2063 if (S_ISDIR(inode->i_mode))
2064 ocfs2_add_links_count(orphan_fe, -1);
2065 set_nlink(orphan_dir_inode, ocfs2_read_links_count(orphan_fe));
2066 }
2067
2060leave: 2068leave:
2061 brelse(orphan_dir_bh); 2069 brelse(orphan_dir_bh);
2062 2070
2063 if (status)
2064 mlog_errno(status);
2065 return status; 2071 return status;
2066} 2072}
2067 2073
@@ -2434,7 +2440,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
2434 } 2440 }
2435 2441
2436 di = (struct ocfs2_dinode *)di_bh->b_data; 2442 di = (struct ocfs2_dinode *)di_bh->b_data;
2437 le32_add_cpu(&di->i_flags, -OCFS2_ORPHANED_FL); 2443 di->i_flags &= ~cpu_to_le32(OCFS2_ORPHANED_FL);
2438 di->i_orphaned_slot = 0; 2444 di->i_orphaned_slot = 0;
2439 set_nlink(inode, 1); 2445 set_nlink(inode, 1);
2440 ocfs2_set_links_count(di, inode->i_nlink); 2446 ocfs2_set_links_count(di, inode->i_nlink);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index d355e6e36b36..3a903470c794 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -347,7 +347,6 @@ struct ocfs2_super
347 struct task_struct *recovery_thread_task; 347 struct task_struct *recovery_thread_task;
348 int disable_recovery; 348 int disable_recovery;
349 wait_queue_head_t checkpoint_event; 349 wait_queue_head_t checkpoint_event;
350 atomic_t needs_checkpoint;
351 struct ocfs2_journal *journal; 350 struct ocfs2_journal *journal;
352 unsigned long osb_commit_interval; 351 unsigned long osb_commit_interval;
353 352
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index b7e74b580c0f..5397c07ce608 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1422,7 +1422,7 @@ static int ocfs2_relink_block_group(handle_t *handle,
1422 int status; 1422 int status;
1423 /* there is a really tiny chance the journal calls could fail, 1423 /* there is a really tiny chance the journal calls could fail,
1424 * but we wouldn't want inconsistent blocks in *any* case. */ 1424 * but we wouldn't want inconsistent blocks in *any* case. */
1425 u64 fe_ptr, bg_ptr, prev_bg_ptr; 1425 u64 bg_ptr, prev_bg_ptr;
1426 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; 1426 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
1427 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; 1427 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
1428 struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data; 1428 struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
@@ -1437,51 +1437,44 @@ static int ocfs2_relink_block_group(handle_t *handle,
1437 (unsigned long long)le64_to_cpu(bg->bg_blkno), 1437 (unsigned long long)le64_to_cpu(bg->bg_blkno),
1438 (unsigned long long)le64_to_cpu(prev_bg->bg_blkno)); 1438 (unsigned long long)le64_to_cpu(prev_bg->bg_blkno));
1439 1439
1440 fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno);
1441 bg_ptr = le64_to_cpu(bg->bg_next_group); 1440 bg_ptr = le64_to_cpu(bg->bg_next_group);
1442 prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group); 1441 prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
1443 1442
1444 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), 1443 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1445 prev_bg_bh, 1444 prev_bg_bh,
1446 OCFS2_JOURNAL_ACCESS_WRITE); 1445 OCFS2_JOURNAL_ACCESS_WRITE);
1447 if (status < 0) { 1446 if (status < 0)
1448 mlog_errno(status); 1447 goto out;
1449 goto out_rollback;
1450 }
1451 1448
1452 prev_bg->bg_next_group = bg->bg_next_group; 1449 prev_bg->bg_next_group = bg->bg_next_group;
1453 ocfs2_journal_dirty(handle, prev_bg_bh); 1450 ocfs2_journal_dirty(handle, prev_bg_bh);
1454 1451
1455 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), 1452 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1456 bg_bh, OCFS2_JOURNAL_ACCESS_WRITE); 1453 bg_bh, OCFS2_JOURNAL_ACCESS_WRITE);
1457 if (status < 0) { 1454 if (status < 0)
1458 mlog_errno(status); 1455 goto out_rollback_prev_bg;
1459 goto out_rollback;
1460 }
1461 1456
1462 bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno; 1457 bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
1463 ocfs2_journal_dirty(handle, bg_bh); 1458 ocfs2_journal_dirty(handle, bg_bh);
1464 1459
1465 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), 1460 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
1466 fe_bh, OCFS2_JOURNAL_ACCESS_WRITE); 1461 fe_bh, OCFS2_JOURNAL_ACCESS_WRITE);
1467 if (status < 0) { 1462 if (status < 0)
1468 mlog_errno(status); 1463 goto out_rollback_bg;
1469 goto out_rollback;
1470 }
1471 1464
1472 fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno; 1465 fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
1473 ocfs2_journal_dirty(handle, fe_bh); 1466 ocfs2_journal_dirty(handle, fe_bh);
1474 1467
1475out_rollback: 1468out:
1476 if (status < 0) { 1469 if (status < 0)
1477 fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr);
1478 bg->bg_next_group = cpu_to_le64(bg_ptr);
1479 prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
1480 }
1481
1482 if (status)
1483 mlog_errno(status); 1470 mlog_errno(status);
1484 return status; 1471 return status;
1472
1473out_rollback_bg:
1474 bg->bg_next_group = cpu_to_le64(bg_ptr);
1475out_rollback_prev_bg:
1476 prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
1477 goto out;
1485} 1478}
1486 1479
1487static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg, 1480static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 01b85165552b..854d80955bf8 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -286,10 +286,9 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
286 spin_unlock(&osb->osb_lock); 286 spin_unlock(&osb->osb_lock);
287 287
288 out += snprintf(buf + out, len - out, 288 out += snprintf(buf + out, len - out,
289 "%10s => Pid: %d Interval: %lu Needs: %d\n", "Commit", 289 "%10s => Pid: %d Interval: %lu\n", "Commit",
290 (osb->commit_task ? task_pid_nr(osb->commit_task) : -1), 290 (osb->commit_task ? task_pid_nr(osb->commit_task) : -1),
291 osb->osb_commit_interval, 291 osb->osb_commit_interval);
292 atomic_read(&osb->needs_checkpoint));
293 292
294 out += snprintf(buf + out, len - out, 293 out += snprintf(buf + out, len - out,
295 "%10s => State: %d TxnId: %lu NumTxns: %d\n", 294 "%10s => State: %d TxnId: %lu NumTxns: %d\n",
@@ -2154,7 +2153,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
2154 } 2153 }
2155 2154
2156 init_waitqueue_head(&osb->checkpoint_event); 2155 init_waitqueue_head(&osb->checkpoint_event);
2157 atomic_set(&osb->needs_checkpoint, 0);
2158 2156
2159 osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; 2157 osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
2160 2158
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 2e3ea308c144..317ef0abccbb 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -2751,7 +2751,6 @@ static int ocfs2_xattr_ibody_set(struct inode *inode,
2751{ 2751{
2752 int ret; 2752 int ret;
2753 struct ocfs2_inode_info *oi = OCFS2_I(inode); 2753 struct ocfs2_inode_info *oi = OCFS2_I(inode);
2754 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
2755 struct ocfs2_xa_loc loc; 2754 struct ocfs2_xa_loc loc;
2756 2755
2757 if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE) 2756 if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE)
@@ -2759,13 +2758,6 @@ static int ocfs2_xattr_ibody_set(struct inode *inode,
2759 2758
2760 down_write(&oi->ip_alloc_sem); 2759 down_write(&oi->ip_alloc_sem);
2761 if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) { 2760 if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) {
2762 if (!ocfs2_xattr_has_space_inline(inode, di)) {
2763 ret = -ENOSPC;
2764 goto out;
2765 }
2766 }
2767
2768 if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) {
2769 ret = ocfs2_xattr_ibody_init(inode, xs->inode_bh, ctxt); 2761 ret = ocfs2_xattr_ibody_init(inode, xs->inode_bh, ctxt);
2770 if (ret) { 2762 if (ret) {
2771 if (ret != -ENOSPC) 2763 if (ret != -ENOSPC)
@@ -6499,6 +6491,16 @@ static int ocfs2_reflink_xattr_inline(struct ocfs2_xattr_reflink *args)
6499 } 6491 }
6500 6492
6501 new_oi = OCFS2_I(args->new_inode); 6493 new_oi = OCFS2_I(args->new_inode);
6494 /*
6495 * Adjust extent record count to reserve space for extended attribute.
6496 * Inline data count had been adjusted in ocfs2_duplicate_inline_data().
6497 */
6498 if (!(new_oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) &&
6499 !(ocfs2_inode_is_fast_symlink(args->new_inode))) {
6500 struct ocfs2_extent_list *el = &new_di->id2.i_list;
6501 le16_add_cpu(&el->l_count, -(inline_size /
6502 sizeof(struct ocfs2_extent_rec)));
6503 }
6502 spin_lock(&new_oi->ip_lock); 6504 spin_lock(&new_oi->ip_lock);
6503 new_oi->ip_dyn_features |= OCFS2_HAS_XATTR_FL | OCFS2_INLINE_XATTR_FL; 6505 new_oi->ip_dyn_features |= OCFS2_HAS_XATTR_FL | OCFS2_INLINE_XATTR_FL;
6504 new_di->i_dyn_features = cpu_to_le16(new_oi->ip_dyn_features); 6506 new_di->i_dyn_features = cpu_to_le16(new_oi->ip_dyn_features);
diff --git a/fs/open.c b/fs/open.c
index 8c741002f947..fca72c4d3f17 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -840,11 +840,15 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o
840 if (flags & __O_SYNC) 840 if (flags & __O_SYNC)
841 flags |= O_DSYNC; 841 flags |= O_DSYNC;
842 842
843 /* 843 if (flags & O_TMPFILE) {
844 * If we have O_PATH in the open flag. Then we 844 if (!(flags & O_CREAT))
845 * cannot have anything other than the below set of flags 845 return -EINVAL;
846 */ 846 acc_mode = MAY_OPEN | ACC_MODE(flags);
847 if (flags & O_PATH) { 847 } else if (flags & O_PATH) {
848 /*
849 * If we have O_PATH in the open flag. Then we
850 * cannot have anything other than the below set of flags
851 */
848 flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH; 852 flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
849 acc_mode = 0; 853 acc_mode = 0;
850 } else { 854 } else {
@@ -876,7 +880,8 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o
876 lookup_flags |= LOOKUP_DIRECTORY; 880 lookup_flags |= LOOKUP_DIRECTORY;
877 if (!(flags & O_NOFOLLOW)) 881 if (!(flags & O_NOFOLLOW))
878 lookup_flags |= LOOKUP_FOLLOW; 882 lookup_flags |= LOOKUP_FOLLOW;
879 return lookup_flags; 883 op->lookup_flags = lookup_flags;
884 return 0;
880} 885}
881 886
882/** 887/**
@@ -893,8 +898,8 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o
893struct file *file_open_name(struct filename *name, int flags, umode_t mode) 898struct file *file_open_name(struct filename *name, int flags, umode_t mode)
894{ 899{
895 struct open_flags op; 900 struct open_flags op;
896 int lookup = build_open_flags(flags, mode, &op); 901 int err = build_open_flags(flags, mode, &op);
897 return do_filp_open(AT_FDCWD, name, &op, lookup); 902 return err ? ERR_PTR(err) : do_filp_open(AT_FDCWD, name, &op);
898} 903}
899 904
900/** 905/**
@@ -919,37 +924,43 @@ struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt,
919 const char *filename, int flags) 924 const char *filename, int flags)
920{ 925{
921 struct open_flags op; 926 struct open_flags op;
922 int lookup = build_open_flags(flags, 0, &op); 927 int err = build_open_flags(flags, 0, &op);
928 if (err)
929 return ERR_PTR(err);
923 if (flags & O_CREAT) 930 if (flags & O_CREAT)
924 return ERR_PTR(-EINVAL); 931 return ERR_PTR(-EINVAL);
925 if (!filename && (flags & O_DIRECTORY)) 932 if (!filename && (flags & O_DIRECTORY))
926 if (!dentry->d_inode->i_op->lookup) 933 if (!dentry->d_inode->i_op->lookup)
927 return ERR_PTR(-ENOTDIR); 934 return ERR_PTR(-ENOTDIR);
928 return do_file_open_root(dentry, mnt, filename, &op, lookup); 935 return do_file_open_root(dentry, mnt, filename, &op);
929} 936}
930EXPORT_SYMBOL(file_open_root); 937EXPORT_SYMBOL(file_open_root);
931 938
932long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) 939long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
933{ 940{
934 struct open_flags op; 941 struct open_flags op;
935 int lookup = build_open_flags(flags, mode, &op); 942 int fd = build_open_flags(flags, mode, &op);
936 struct filename *tmp = getname(filename); 943 struct filename *tmp;
937 int fd = PTR_ERR(tmp); 944
938 945 if (fd)
939 if (!IS_ERR(tmp)) { 946 return fd;
940 fd = get_unused_fd_flags(flags); 947
941 if (fd >= 0) { 948 tmp = getname(filename);
942 struct file *f = do_filp_open(dfd, tmp, &op, lookup); 949 if (IS_ERR(tmp))
943 if (IS_ERR(f)) { 950 return PTR_ERR(tmp);
944 put_unused_fd(fd); 951
945 fd = PTR_ERR(f); 952 fd = get_unused_fd_flags(flags);
946 } else { 953 if (fd >= 0) {
947 fsnotify_open(f); 954 struct file *f = do_filp_open(dfd, tmp, &op);
948 fd_install(fd, f); 955 if (IS_ERR(f)) {
949 } 956 put_unused_fd(fd);
957 fd = PTR_ERR(f);
958 } else {
959 fsnotify_open(f);
960 fd_install(fd, f);
950 } 961 }
951 putname(tmp);
952 } 962 }
963 putname(tmp);
953 return fd; 964 return fd;
954} 965}
955 966
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 0016350ad95e..1485e38daaa3 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1686,41 +1686,29 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx,
1686 instantiate_t instantiate, struct task_struct *task, const void *ptr) 1686 instantiate_t instantiate, struct task_struct *task, const void *ptr)
1687{ 1687{
1688 struct dentry *child, *dir = file->f_path.dentry; 1688 struct dentry *child, *dir = file->f_path.dentry;
1689 struct qstr qname = QSTR_INIT(name, len);
1689 struct inode *inode; 1690 struct inode *inode;
1690 struct qstr qname; 1691 unsigned type;
1691 ino_t ino = 0; 1692 ino_t ino;
1692 unsigned type = DT_UNKNOWN;
1693 1693
1694 qname.name = name; 1694 child = d_hash_and_lookup(dir, &qname);
1695 qname.len = len;
1696 qname.hash = full_name_hash(name, len);
1697
1698 child = d_lookup(dir, &qname);
1699 if (!child) { 1695 if (!child) {
1700 struct dentry *new; 1696 child = d_alloc(dir, &qname);
1701 new = d_alloc(dir, &qname); 1697 if (!child)
1702 if (new) { 1698 goto end_instantiate;
1703 child = instantiate(dir->d_inode, new, task, ptr); 1699 if (instantiate(dir->d_inode, child, task, ptr) < 0) {
1704 if (child) 1700 dput(child);
1705 dput(new); 1701 goto end_instantiate;
1706 else
1707 child = new;
1708 } 1702 }
1709 } 1703 }
1710 if (!child || IS_ERR(child) || !child->d_inode)
1711 goto end_instantiate;
1712 inode = child->d_inode; 1704 inode = child->d_inode;
1713 if (inode) { 1705 ino = inode->i_ino;
1714 ino = inode->i_ino; 1706 type = inode->i_mode >> 12;
1715 type = inode->i_mode >> 12;
1716 }
1717 dput(child); 1707 dput(child);
1718end_instantiate:
1719 if (!ino)
1720 ino = find_inode_number(dir, &qname);
1721 if (!ino)
1722 ino = 1;
1723 return dir_emit(ctx, name, len, ino, type); 1708 return dir_emit(ctx, name, len, ino, type);
1709
1710end_instantiate:
1711 return dir_emit(ctx, name, len, 1, DT_UNKNOWN);
1724} 1712}
1725 1713
1726#ifdef CONFIG_CHECKPOINT_RESTORE 1714#ifdef CONFIG_CHECKPOINT_RESTORE
@@ -1846,7 +1834,7 @@ struct map_files_info {
1846 unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */ 1834 unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
1847}; 1835};
1848 1836
1849static struct dentry * 1837static int
1850proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, 1838proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
1851 struct task_struct *task, const void *ptr) 1839 struct task_struct *task, const void *ptr)
1852{ 1840{
@@ -1856,7 +1844,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
1856 1844
1857 inode = proc_pid_make_inode(dir->i_sb, task); 1845 inode = proc_pid_make_inode(dir->i_sb, task);
1858 if (!inode) 1846 if (!inode)
1859 return ERR_PTR(-ENOENT); 1847 return -ENOENT;
1860 1848
1861 ei = PROC_I(inode); 1849 ei = PROC_I(inode);
1862 ei->op.proc_get_link = proc_map_files_get_link; 1850 ei->op.proc_get_link = proc_map_files_get_link;
@@ -1873,7 +1861,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
1873 d_set_d_op(dentry, &tid_map_files_dentry_operations); 1861 d_set_d_op(dentry, &tid_map_files_dentry_operations);
1874 d_add(dentry, inode); 1862 d_add(dentry, inode);
1875 1863
1876 return NULL; 1864 return 0;
1877} 1865}
1878 1866
1879static struct dentry *proc_map_files_lookup(struct inode *dir, 1867static struct dentry *proc_map_files_lookup(struct inode *dir,
@@ -1882,23 +1870,23 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
1882 unsigned long vm_start, vm_end; 1870 unsigned long vm_start, vm_end;
1883 struct vm_area_struct *vma; 1871 struct vm_area_struct *vma;
1884 struct task_struct *task; 1872 struct task_struct *task;
1885 struct dentry *result; 1873 int result;
1886 struct mm_struct *mm; 1874 struct mm_struct *mm;
1887 1875
1888 result = ERR_PTR(-EPERM); 1876 result = -EPERM;
1889 if (!capable(CAP_SYS_ADMIN)) 1877 if (!capable(CAP_SYS_ADMIN))
1890 goto out; 1878 goto out;
1891 1879
1892 result = ERR_PTR(-ENOENT); 1880 result = -ENOENT;
1893 task = get_proc_task(dir); 1881 task = get_proc_task(dir);
1894 if (!task) 1882 if (!task)
1895 goto out; 1883 goto out;
1896 1884
1897 result = ERR_PTR(-EACCES); 1885 result = -EACCES;
1898 if (!ptrace_may_access(task, PTRACE_MODE_READ)) 1886 if (!ptrace_may_access(task, PTRACE_MODE_READ))
1899 goto out_put_task; 1887 goto out_put_task;
1900 1888
1901 result = ERR_PTR(-ENOENT); 1889 result = -ENOENT;
1902 if (dname_to_vma_addr(dentry, &vm_start, &vm_end)) 1890 if (dname_to_vma_addr(dentry, &vm_start, &vm_end))
1903 goto out_put_task; 1891 goto out_put_task;
1904 1892
@@ -1921,7 +1909,7 @@ out_no_vma:
1921out_put_task: 1909out_put_task:
1922 put_task_struct(task); 1910 put_task_struct(task);
1923out: 1911out:
1924 return result; 1912 return ERR_PTR(result);
1925} 1913}
1926 1914
1927static const struct inode_operations proc_map_files_inode_operations = { 1915static const struct inode_operations proc_map_files_inode_operations = {
@@ -2135,13 +2123,12 @@ static const struct file_operations proc_timers_operations = {
2135}; 2123};
2136#endif /* CONFIG_CHECKPOINT_RESTORE */ 2124#endif /* CONFIG_CHECKPOINT_RESTORE */
2137 2125
2138static struct dentry *proc_pident_instantiate(struct inode *dir, 2126static int proc_pident_instantiate(struct inode *dir,
2139 struct dentry *dentry, struct task_struct *task, const void *ptr) 2127 struct dentry *dentry, struct task_struct *task, const void *ptr)
2140{ 2128{
2141 const struct pid_entry *p = ptr; 2129 const struct pid_entry *p = ptr;
2142 struct inode *inode; 2130 struct inode *inode;
2143 struct proc_inode *ei; 2131 struct proc_inode *ei;
2144 struct dentry *error = ERR_PTR(-ENOENT);
2145 2132
2146 inode = proc_pid_make_inode(dir->i_sb, task); 2133 inode = proc_pid_make_inode(dir->i_sb, task);
2147 if (!inode) 2134 if (!inode)
@@ -2160,9 +2147,9 @@ static struct dentry *proc_pident_instantiate(struct inode *dir,
2160 d_add(dentry, inode); 2147 d_add(dentry, inode);
2161 /* Close the race of the process dying before we return the dentry */ 2148 /* Close the race of the process dying before we return the dentry */
2162 if (pid_revalidate(dentry, 0)) 2149 if (pid_revalidate(dentry, 0))
2163 error = NULL; 2150 return 0;
2164out: 2151out:
2165 return error; 2152 return -ENOENT;
2166} 2153}
2167 2154
2168static struct dentry *proc_pident_lookup(struct inode *dir, 2155static struct dentry *proc_pident_lookup(struct inode *dir,
@@ -2170,11 +2157,11 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
2170 const struct pid_entry *ents, 2157 const struct pid_entry *ents,
2171 unsigned int nents) 2158 unsigned int nents)
2172{ 2159{
2173 struct dentry *error; 2160 int error;
2174 struct task_struct *task = get_proc_task(dir); 2161 struct task_struct *task = get_proc_task(dir);
2175 const struct pid_entry *p, *last; 2162 const struct pid_entry *p, *last;
2176 2163
2177 error = ERR_PTR(-ENOENT); 2164 error = -ENOENT;
2178 2165
2179 if (!task) 2166 if (!task)
2180 goto out_no_task; 2167 goto out_no_task;
@@ -2197,7 +2184,7 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
2197out: 2184out:
2198 put_task_struct(task); 2185 put_task_struct(task);
2199out_no_task: 2186out_no_task:
2200 return error; 2187 return ERR_PTR(error);
2201} 2188}
2202 2189
2203static int proc_pident_readdir(struct file *file, struct dir_context *ctx, 2190static int proc_pident_readdir(struct file *file, struct dir_context *ctx,
@@ -2780,11 +2767,10 @@ void proc_flush_task(struct task_struct *task)
2780 } 2767 }
2781} 2768}
2782 2769
2783static struct dentry *proc_pid_instantiate(struct inode *dir, 2770static int proc_pid_instantiate(struct inode *dir,
2784 struct dentry * dentry, 2771 struct dentry * dentry,
2785 struct task_struct *task, const void *ptr) 2772 struct task_struct *task, const void *ptr)
2786{ 2773{
2787 struct dentry *error = ERR_PTR(-ENOENT);
2788 struct inode *inode; 2774 struct inode *inode;
2789 2775
2790 inode = proc_pid_make_inode(dir->i_sb, task); 2776 inode = proc_pid_make_inode(dir->i_sb, task);
@@ -2804,14 +2790,14 @@ static struct dentry *proc_pid_instantiate(struct inode *dir,
2804 d_add(dentry, inode); 2790 d_add(dentry, inode);
2805 /* Close the race of the process dying before we return the dentry */ 2791 /* Close the race of the process dying before we return the dentry */
2806 if (pid_revalidate(dentry, 0)) 2792 if (pid_revalidate(dentry, 0))
2807 error = NULL; 2793 return 0;
2808out: 2794out:
2809 return error; 2795 return -ENOENT;
2810} 2796}
2811 2797
2812struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) 2798struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
2813{ 2799{
2814 struct dentry *result = NULL; 2800 int result = 0;
2815 struct task_struct *task; 2801 struct task_struct *task;
2816 unsigned tgid; 2802 unsigned tgid;
2817 struct pid_namespace *ns; 2803 struct pid_namespace *ns;
@@ -2832,7 +2818,7 @@ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsign
2832 result = proc_pid_instantiate(dir, dentry, task, NULL); 2818 result = proc_pid_instantiate(dir, dentry, task, NULL);
2833 put_task_struct(task); 2819 put_task_struct(task);
2834out: 2820out:
2835 return result; 2821 return ERR_PTR(result);
2836} 2822}
2837 2823
2838/* 2824/*
@@ -2884,21 +2870,21 @@ retry:
2884int proc_pid_readdir(struct file *file, struct dir_context *ctx) 2870int proc_pid_readdir(struct file *file, struct dir_context *ctx)
2885{ 2871{
2886 struct tgid_iter iter; 2872 struct tgid_iter iter;
2887 struct pid_namespace *ns; 2873 struct pid_namespace *ns = file->f_dentry->d_sb->s_fs_info;
2888 loff_t pos = ctx->pos; 2874 loff_t pos = ctx->pos;
2889 2875
2890 if (pos >= PID_MAX_LIMIT + TGID_OFFSET) 2876 if (pos >= PID_MAX_LIMIT + TGID_OFFSET)
2891 return 0; 2877 return 0;
2892 2878
2893 if (pos == TGID_OFFSET - 1) { 2879 if (pos == TGID_OFFSET - 1) {
2894 if (!proc_fill_cache(file, ctx, "self", 4, NULL, NULL, NULL)) 2880 struct inode *inode = ns->proc_self->d_inode;
2881 if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK))
2895 return 0; 2882 return 0;
2896 iter.tgid = 0; 2883 iter.tgid = 0;
2897 } else { 2884 } else {
2898 iter.tgid = pos - TGID_OFFSET; 2885 iter.tgid = pos - TGID_OFFSET;
2899 } 2886 }
2900 iter.task = NULL; 2887 iter.task = NULL;
2901 ns = file->f_dentry->d_sb->s_fs_info;
2902 for (iter = next_tgid(ns, iter); 2888 for (iter = next_tgid(ns, iter);
2903 iter.task; 2889 iter.task;
2904 iter.tgid += 1, iter = next_tgid(ns, iter)) { 2890 iter.tgid += 1, iter = next_tgid(ns, iter)) {
@@ -3027,10 +3013,9 @@ static const struct inode_operations proc_tid_base_inode_operations = {
3027 .setattr = proc_setattr, 3013 .setattr = proc_setattr,
3028}; 3014};
3029 3015
3030static struct dentry *proc_task_instantiate(struct inode *dir, 3016static int proc_task_instantiate(struct inode *dir,
3031 struct dentry *dentry, struct task_struct *task, const void *ptr) 3017 struct dentry *dentry, struct task_struct *task, const void *ptr)
3032{ 3018{
3033 struct dentry *error = ERR_PTR(-ENOENT);
3034 struct inode *inode; 3019 struct inode *inode;
3035 inode = proc_pid_make_inode(dir->i_sb, task); 3020 inode = proc_pid_make_inode(dir->i_sb, task);
3036 3021
@@ -3049,14 +3034,14 @@ static struct dentry *proc_task_instantiate(struct inode *dir,
3049 d_add(dentry, inode); 3034 d_add(dentry, inode);
3050 /* Close the race of the process dying before we return the dentry */ 3035 /* Close the race of the process dying before we return the dentry */
3051 if (pid_revalidate(dentry, 0)) 3036 if (pid_revalidate(dentry, 0))
3052 error = NULL; 3037 return 0;
3053out: 3038out:
3054 return error; 3039 return -ENOENT;
3055} 3040}
3056 3041
3057static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) 3042static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
3058{ 3043{
3059 struct dentry *result = ERR_PTR(-ENOENT); 3044 int result = -ENOENT;
3060 struct task_struct *task; 3045 struct task_struct *task;
3061 struct task_struct *leader = get_proc_task(dir); 3046 struct task_struct *leader = get_proc_task(dir);
3062 unsigned tid; 3047 unsigned tid;
@@ -3086,7 +3071,7 @@ out_drop_task:
3086out: 3071out:
3087 put_task_struct(leader); 3072 put_task_struct(leader);
3088out_no_task: 3073out_no_task:
3089 return result; 3074 return ERR_PTR(result);
3090} 3075}
3091 3076
3092/* 3077/*
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 1441f143c43b..75f2890abbd8 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -167,11 +167,10 @@ static int proc_fd_link(struct dentry *dentry, struct path *path)
167 return ret; 167 return ret;
168} 168}
169 169
170static struct dentry * 170static int
171proc_fd_instantiate(struct inode *dir, struct dentry *dentry, 171proc_fd_instantiate(struct inode *dir, struct dentry *dentry,
172 struct task_struct *task, const void *ptr) 172 struct task_struct *task, const void *ptr)
173{ 173{
174 struct dentry *error = ERR_PTR(-ENOENT);
175 unsigned fd = (unsigned long)ptr; 174 unsigned fd = (unsigned long)ptr;
176 struct proc_inode *ei; 175 struct proc_inode *ei;
177 struct inode *inode; 176 struct inode *inode;
@@ -194,9 +193,9 @@ proc_fd_instantiate(struct inode *dir, struct dentry *dentry,
194 193
195 /* Close the race of the process dying before we return the dentry */ 194 /* Close the race of the process dying before we return the dentry */
196 if (tid_fd_revalidate(dentry, 0)) 195 if (tid_fd_revalidate(dentry, 0))
197 error = NULL; 196 return 0;
198 out: 197 out:
199 return error; 198 return -ENOENT;
200} 199}
201 200
202static struct dentry *proc_lookupfd_common(struct inode *dir, 201static struct dentry *proc_lookupfd_common(struct inode *dir,
@@ -204,7 +203,7 @@ static struct dentry *proc_lookupfd_common(struct inode *dir,
204 instantiate_t instantiate) 203 instantiate_t instantiate)
205{ 204{
206 struct task_struct *task = get_proc_task(dir); 205 struct task_struct *task = get_proc_task(dir);
207 struct dentry *result = ERR_PTR(-ENOENT); 206 int result = -ENOENT;
208 unsigned fd = name_to_int(dentry); 207 unsigned fd = name_to_int(dentry);
209 208
210 if (!task) 209 if (!task)
@@ -216,7 +215,7 @@ static struct dentry *proc_lookupfd_common(struct inode *dir,
216out: 215out:
217 put_task_struct(task); 216 put_task_struct(task);
218out_no_task: 217out_no_task:
219 return result; 218 return ERR_PTR(result);
220} 219}
221 220
222static int proc_readfd_common(struct file *file, struct dir_context *ctx, 221static int proc_readfd_common(struct file *file, struct dir_context *ctx,
@@ -300,11 +299,10 @@ const struct inode_operations proc_fd_inode_operations = {
300 .setattr = proc_setattr, 299 .setattr = proc_setattr,
301}; 300};
302 301
303static struct dentry * 302static int
304proc_fdinfo_instantiate(struct inode *dir, struct dentry *dentry, 303proc_fdinfo_instantiate(struct inode *dir, struct dentry *dentry,
305 struct task_struct *task, const void *ptr) 304 struct task_struct *task, const void *ptr)
306{ 305{
307 struct dentry *error = ERR_PTR(-ENOENT);
308 unsigned fd = (unsigned long)ptr; 306 unsigned fd = (unsigned long)ptr;
309 struct proc_inode *ei; 307 struct proc_inode *ei;
310 struct inode *inode; 308 struct inode *inode;
@@ -324,9 +322,9 @@ proc_fdinfo_instantiate(struct inode *dir, struct dentry *dentry,
324 322
325 /* Close the race of the process dying before we return the dentry */ 323 /* Close the race of the process dying before we return the dentry */
326 if (tid_fd_revalidate(dentry, 0)) 324 if (tid_fd_revalidate(dentry, 0))
327 error = NULL; 325 return 0;
328 out: 326 out:
329 return error; 327 return -ENOENT;
330} 328}
331 329
332static struct dentry * 330static struct dentry *
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 4eae2e149f31..651d09a11dde 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -170,7 +170,7 @@ extern struct dentry *proc_pid_lookup(struct inode *, struct dentry *, unsigned
170extern loff_t mem_lseek(struct file *, loff_t, int); 170extern loff_t mem_lseek(struct file *, loff_t, int);
171 171
172/* Lookups */ 172/* Lookups */
173typedef struct dentry *instantiate_t(struct inode *, struct dentry *, 173typedef int instantiate_t(struct inode *, struct dentry *,
174 struct task_struct *, const void *); 174 struct task_struct *, const void *);
175extern bool proc_fill_cache(struct file *, struct dir_context *, const char *, int, 175extern bool proc_fill_cache(struct file *, struct dir_context *, const char *, int,
176 instantiate_t, struct task_struct *, const void *); 176 instantiate_t, struct task_struct *, const void *);
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 0a22194e5d58..06ea155e1a59 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -408,7 +408,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff)
408 prpsinfo.pr_zomb = 0; 408 prpsinfo.pr_zomb = 0;
409 409
410 strcpy(prpsinfo.pr_fname, "vmlinux"); 410 strcpy(prpsinfo.pr_fname, "vmlinux");
411 strncpy(prpsinfo.pr_psargs, saved_command_line, ELF_PRARGSZ); 411 strlcpy(prpsinfo.pr_psargs, saved_command_line, sizeof(prpsinfo.pr_psargs));
412 412
413 nhdr->p_filesz += notesize(&notes[1]); 413 nhdr->p_filesz += notesize(&notes[1]);
414 bufp = storenote(&notes[1], bufp); 414 bufp = storenote(&notes[1], bufp);
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index f6abbbbfad8a..49a7fff2e83a 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -187,13 +187,12 @@ static const struct inode_operations proc_ns_link_inode_operations = {
187 .setattr = proc_setattr, 187 .setattr = proc_setattr,
188}; 188};
189 189
190static struct dentry *proc_ns_instantiate(struct inode *dir, 190static int proc_ns_instantiate(struct inode *dir,
191 struct dentry *dentry, struct task_struct *task, const void *ptr) 191 struct dentry *dentry, struct task_struct *task, const void *ptr)
192{ 192{
193 const struct proc_ns_operations *ns_ops = ptr; 193 const struct proc_ns_operations *ns_ops = ptr;
194 struct inode *inode; 194 struct inode *inode;
195 struct proc_inode *ei; 195 struct proc_inode *ei;
196 struct dentry *error = ERR_PTR(-ENOENT);
197 196
198 inode = proc_pid_make_inode(dir->i_sb, task); 197 inode = proc_pid_make_inode(dir->i_sb, task);
199 if (!inode) 198 if (!inode)
@@ -208,9 +207,9 @@ static struct dentry *proc_ns_instantiate(struct inode *dir,
208 d_add(dentry, inode); 207 d_add(dentry, inode);
209 /* Close the race of the process dying before we return the dentry */ 208 /* Close the race of the process dying before we return the dentry */
210 if (pid_revalidate(dentry, 0)) 209 if (pid_revalidate(dentry, 0))
211 error = NULL; 210 return 0;
212out: 211out:
213 return error; 212 return -ENOENT;
214} 213}
215 214
216static int proc_ns_dir_readdir(struct file *file, struct dir_context *ctx) 215static int proc_ns_dir_readdir(struct file *file, struct dir_context *ctx)
@@ -248,12 +247,12 @@ const struct file_operations proc_ns_dir_operations = {
248static struct dentry *proc_ns_dir_lookup(struct inode *dir, 247static struct dentry *proc_ns_dir_lookup(struct inode *dir,
249 struct dentry *dentry, unsigned int flags) 248 struct dentry *dentry, unsigned int flags)
250{ 249{
251 struct dentry *error; 250 int error;
252 struct task_struct *task = get_proc_task(dir); 251 struct task_struct *task = get_proc_task(dir);
253 const struct proc_ns_operations **entry, **last; 252 const struct proc_ns_operations **entry, **last;
254 unsigned int len = dentry->d_name.len; 253 unsigned int len = dentry->d_name.len;
255 254
256 error = ERR_PTR(-ENOENT); 255 error = -ENOENT;
257 256
258 if (!task) 257 if (!task)
259 goto out_no_task; 258 goto out_no_task;
@@ -272,7 +271,7 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir,
272out: 271out:
273 put_task_struct(task); 272 put_task_struct(task);
274out_no_task: 273out_no_task:
275 return error; 274 return ERR_PTR(error);
276} 275}
277 276
278const struct inode_operations proc_ns_dir_inode_operations = { 277const struct inode_operations proc_ns_dir_inode_operations = {
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index f3a570e7c257..71290463a1d3 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -796,15 +796,16 @@ static int sysctl_is_seen(struct ctl_table_header *p)
796 return res; 796 return res;
797} 797}
798 798
799static int proc_sys_compare(const struct dentry *parent, 799static int proc_sys_compare(const struct dentry *parent, const struct dentry *dentry,
800 const struct inode *pinode,
801 const struct dentry *dentry, const struct inode *inode,
802 unsigned int len, const char *str, const struct qstr *name) 800 unsigned int len, const char *str, const struct qstr *name)
803{ 801{
804 struct ctl_table_header *head; 802 struct ctl_table_header *head;
803 struct inode *inode;
804
805 /* Although proc doesn't have negative dentries, rcu-walk means 805 /* Although proc doesn't have negative dentries, rcu-walk means
806 * that inode here can be NULL */ 806 * that inode here can be NULL */
807 /* AV: can it, indeed? */ 807 /* AV: can it, indeed? */
808 inode = ACCESS_ONCE(dentry->d_inode);
808 if (!inode) 809 if (!inode)
809 return 1; 810 return 1;
810 if (name->len != len) 811 if (name->len != len)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 3e636d864d56..dbf61f6174f0 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -11,6 +11,7 @@
11#include <linux/rmap.h> 11#include <linux/rmap.h>
12#include <linux/swap.h> 12#include <linux/swap.h>
13#include <linux/swapops.h> 13#include <linux/swapops.h>
14#include <linux/mmu_notifier.h>
14 15
15#include <asm/elf.h> 16#include <asm/elf.h>
16#include <asm/uaccess.h> 17#include <asm/uaccess.h>
@@ -688,10 +689,58 @@ const struct file_operations proc_tid_smaps_operations = {
688 .release = seq_release_private, 689 .release = seq_release_private,
689}; 690};
690 691
692/*
693 * We do not want to have constant page-shift bits sitting in
694 * pagemap entries and are about to reuse them some time soon.
695 *
696 * Here's the "migration strategy":
697 * 1. when the system boots these bits remain what they are,
698 * but a warning about future change is printed in log;
699 * 2. once anyone clears soft-dirty bits via clear_refs file,
700 * these flag is set to denote, that user is aware of the
701 * new API and those page-shift bits change their meaning.
702 * The respective warning is printed in dmesg;
703 * 3. In a couple of releases we will remove all the mentions
704 * of page-shift in pagemap entries.
705 */
706
707static bool soft_dirty_cleared __read_mostly;
708
709enum clear_refs_types {
710 CLEAR_REFS_ALL = 1,
711 CLEAR_REFS_ANON,
712 CLEAR_REFS_MAPPED,
713 CLEAR_REFS_SOFT_DIRTY,
714 CLEAR_REFS_LAST,
715};
716
717struct clear_refs_private {
718 struct vm_area_struct *vma;
719 enum clear_refs_types type;
720};
721
722static inline void clear_soft_dirty(struct vm_area_struct *vma,
723 unsigned long addr, pte_t *pte)
724{
725#ifdef CONFIG_MEM_SOFT_DIRTY
726 /*
727 * The soft-dirty tracker uses #PF-s to catch writes
728 * to pages, so write-protect the pte as well. See the
729 * Documentation/vm/soft-dirty.txt for full description
730 * of how soft-dirty works.
731 */
732 pte_t ptent = *pte;
733 ptent = pte_wrprotect(ptent);
734 ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY);
735 set_pte_at(vma->vm_mm, addr, pte, ptent);
736#endif
737}
738
691static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, 739static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
692 unsigned long end, struct mm_walk *walk) 740 unsigned long end, struct mm_walk *walk)
693{ 741{
694 struct vm_area_struct *vma = walk->private; 742 struct clear_refs_private *cp = walk->private;
743 struct vm_area_struct *vma = cp->vma;
695 pte_t *pte, ptent; 744 pte_t *pte, ptent;
696 spinlock_t *ptl; 745 spinlock_t *ptl;
697 struct page *page; 746 struct page *page;
@@ -706,6 +755,11 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
706 if (!pte_present(ptent)) 755 if (!pte_present(ptent))
707 continue; 756 continue;
708 757
758 if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
759 clear_soft_dirty(vma, addr, pte);
760 continue;
761 }
762
709 page = vm_normal_page(vma, addr, ptent); 763 page = vm_normal_page(vma, addr, ptent);
710 if (!page) 764 if (!page)
711 continue; 765 continue;
@@ -719,10 +773,6 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
719 return 0; 773 return 0;
720} 774}
721 775
722#define CLEAR_REFS_ALL 1
723#define CLEAR_REFS_ANON 2
724#define CLEAR_REFS_MAPPED 3
725
726static ssize_t clear_refs_write(struct file *file, const char __user *buf, 776static ssize_t clear_refs_write(struct file *file, const char __user *buf,
727 size_t count, loff_t *ppos) 777 size_t count, loff_t *ppos)
728{ 778{
@@ -730,7 +780,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
730 char buffer[PROC_NUMBUF]; 780 char buffer[PROC_NUMBUF];
731 struct mm_struct *mm; 781 struct mm_struct *mm;
732 struct vm_area_struct *vma; 782 struct vm_area_struct *vma;
733 int type; 783 enum clear_refs_types type;
784 int itype;
734 int rv; 785 int rv;
735 786
736 memset(buffer, 0, sizeof(buffer)); 787 memset(buffer, 0, sizeof(buffer));
@@ -738,23 +789,37 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
738 count = sizeof(buffer) - 1; 789 count = sizeof(buffer) - 1;
739 if (copy_from_user(buffer, buf, count)) 790 if (copy_from_user(buffer, buf, count))
740 return -EFAULT; 791 return -EFAULT;
741 rv = kstrtoint(strstrip(buffer), 10, &type); 792 rv = kstrtoint(strstrip(buffer), 10, &itype);
742 if (rv < 0) 793 if (rv < 0)
743 return rv; 794 return rv;
744 if (type < CLEAR_REFS_ALL || type > CLEAR_REFS_MAPPED) 795 type = (enum clear_refs_types)itype;
796 if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST)
745 return -EINVAL; 797 return -EINVAL;
798
799 if (type == CLEAR_REFS_SOFT_DIRTY) {
800 soft_dirty_cleared = true;
801 pr_warn_once("The pagemap bits 55-60 has changed their meaning! "
802 "See the linux/Documentation/vm/pagemap.txt for details.\n");
803 }
804
746 task = get_proc_task(file_inode(file)); 805 task = get_proc_task(file_inode(file));
747 if (!task) 806 if (!task)
748 return -ESRCH; 807 return -ESRCH;
749 mm = get_task_mm(task); 808 mm = get_task_mm(task);
750 if (mm) { 809 if (mm) {
810 struct clear_refs_private cp = {
811 .type = type,
812 };
751 struct mm_walk clear_refs_walk = { 813 struct mm_walk clear_refs_walk = {
752 .pmd_entry = clear_refs_pte_range, 814 .pmd_entry = clear_refs_pte_range,
753 .mm = mm, 815 .mm = mm,
816 .private = &cp,
754 }; 817 };
755 down_read(&mm->mmap_sem); 818 down_read(&mm->mmap_sem);
819 if (type == CLEAR_REFS_SOFT_DIRTY)
820 mmu_notifier_invalidate_range_start(mm, 0, -1);
756 for (vma = mm->mmap; vma; vma = vma->vm_next) { 821 for (vma = mm->mmap; vma; vma = vma->vm_next) {
757 clear_refs_walk.private = vma; 822 cp.vma = vma;
758 if (is_vm_hugetlb_page(vma)) 823 if (is_vm_hugetlb_page(vma))
759 continue; 824 continue;
760 /* 825 /*
@@ -773,6 +838,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
773 walk_page_range(vma->vm_start, vma->vm_end, 838 walk_page_range(vma->vm_start, vma->vm_end,
774 &clear_refs_walk); 839 &clear_refs_walk);
775 } 840 }
841 if (type == CLEAR_REFS_SOFT_DIRTY)
842 mmu_notifier_invalidate_range_end(mm, 0, -1);
776 flush_tlb_mm(mm); 843 flush_tlb_mm(mm);
777 up_read(&mm->mmap_sem); 844 up_read(&mm->mmap_sem);
778 mmput(mm); 845 mmput(mm);
@@ -794,6 +861,7 @@ typedef struct {
794struct pagemapread { 861struct pagemapread {
795 int pos, len; 862 int pos, len;
796 pagemap_entry_t *buffer; 863 pagemap_entry_t *buffer;
864 bool v2;
797}; 865};
798 866
799#define PAGEMAP_WALK_SIZE (PMD_SIZE) 867#define PAGEMAP_WALK_SIZE (PMD_SIZE)
@@ -807,14 +875,17 @@ struct pagemapread {
807#define PM_PSHIFT_BITS 6 875#define PM_PSHIFT_BITS 6
808#define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS) 876#define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS)
809#define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET) 877#define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET)
810#define PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK) 878#define __PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
811#define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1) 879#define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1)
812#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK) 880#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK)
881/* in "new" pagemap pshift bits are occupied with more status bits */
882#define PM_STATUS2(v2, x) (__PM_PSHIFT(v2 ? x : PAGE_SHIFT))
813 883
884#define __PM_SOFT_DIRTY (1LL)
814#define PM_PRESENT PM_STATUS(4LL) 885#define PM_PRESENT PM_STATUS(4LL)
815#define PM_SWAP PM_STATUS(2LL) 886#define PM_SWAP PM_STATUS(2LL)
816#define PM_FILE PM_STATUS(1LL) 887#define PM_FILE PM_STATUS(1LL)
817#define PM_NOT_PRESENT PM_PSHIFT(PAGE_SHIFT) 888#define PM_NOT_PRESENT(v2) PM_STATUS2(v2, 0)
818#define PM_END_OF_BUFFER 1 889#define PM_END_OF_BUFFER 1
819 890
820static inline pagemap_entry_t make_pme(u64 val) 891static inline pagemap_entry_t make_pme(u64 val)
@@ -837,7 +908,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
837 struct pagemapread *pm = walk->private; 908 struct pagemapread *pm = walk->private;
838 unsigned long addr; 909 unsigned long addr;
839 int err = 0; 910 int err = 0;
840 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT); 911 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
841 912
842 for (addr = start; addr < end; addr += PAGE_SIZE) { 913 for (addr = start; addr < end; addr += PAGE_SIZE) {
843 err = add_to_pagemap(addr, &pme, pm); 914 err = add_to_pagemap(addr, &pme, pm);
@@ -847,11 +918,12 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
847 return err; 918 return err;
848} 919}
849 920
850static void pte_to_pagemap_entry(pagemap_entry_t *pme, 921static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
851 struct vm_area_struct *vma, unsigned long addr, pte_t pte) 922 struct vm_area_struct *vma, unsigned long addr, pte_t pte)
852{ 923{
853 u64 frame, flags; 924 u64 frame, flags;
854 struct page *page = NULL; 925 struct page *page = NULL;
926 int flags2 = 0;
855 927
856 if (pte_present(pte)) { 928 if (pte_present(pte)) {
857 frame = pte_pfn(pte); 929 frame = pte_pfn(pte);
@@ -866,19 +938,21 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme,
866 if (is_migration_entry(entry)) 938 if (is_migration_entry(entry))
867 page = migration_entry_to_page(entry); 939 page = migration_entry_to_page(entry);
868 } else { 940 } else {
869 *pme = make_pme(PM_NOT_PRESENT); 941 *pme = make_pme(PM_NOT_PRESENT(pm->v2));
870 return; 942 return;
871 } 943 }
872 944
873 if (page && !PageAnon(page)) 945 if (page && !PageAnon(page))
874 flags |= PM_FILE; 946 flags |= PM_FILE;
947 if (pte_soft_dirty(pte))
948 flags2 |= __PM_SOFT_DIRTY;
875 949
876 *pme = make_pme(PM_PFRAME(frame) | PM_PSHIFT(PAGE_SHIFT) | flags); 950 *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags);
877} 951}
878 952
879#ifdef CONFIG_TRANSPARENT_HUGEPAGE 953#ifdef CONFIG_TRANSPARENT_HUGEPAGE
880static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, 954static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
881 pmd_t pmd, int offset) 955 pmd_t pmd, int offset, int pmd_flags2)
882{ 956{
883 /* 957 /*
884 * Currently pmd for thp is always present because thp can not be 958 * Currently pmd for thp is always present because thp can not be
@@ -887,13 +961,13 @@ static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme,
887 */ 961 */
888 if (pmd_present(pmd)) 962 if (pmd_present(pmd))
889 *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset) 963 *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset)
890 | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT); 964 | PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT);
891 else 965 else
892 *pme = make_pme(PM_NOT_PRESENT); 966 *pme = make_pme(PM_NOT_PRESENT(pm->v2));
893} 967}
894#else 968#else
895static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, 969static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
896 pmd_t pmd, int offset) 970 pmd_t pmd, int offset, int pmd_flags2)
897{ 971{
898} 972}
899#endif 973#endif
@@ -905,17 +979,20 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
905 struct pagemapread *pm = walk->private; 979 struct pagemapread *pm = walk->private;
906 pte_t *pte; 980 pte_t *pte;
907 int err = 0; 981 int err = 0;
908 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT); 982 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
909 983
910 /* find the first VMA at or above 'addr' */ 984 /* find the first VMA at or above 'addr' */
911 vma = find_vma(walk->mm, addr); 985 vma = find_vma(walk->mm, addr);
912 if (vma && pmd_trans_huge_lock(pmd, vma) == 1) { 986 if (vma && pmd_trans_huge_lock(pmd, vma) == 1) {
987 int pmd_flags2;
988
989 pmd_flags2 = (pmd_soft_dirty(*pmd) ? __PM_SOFT_DIRTY : 0);
913 for (; addr != end; addr += PAGE_SIZE) { 990 for (; addr != end; addr += PAGE_SIZE) {
914 unsigned long offset; 991 unsigned long offset;
915 992
916 offset = (addr & ~PAGEMAP_WALK_MASK) >> 993 offset = (addr & ~PAGEMAP_WALK_MASK) >>
917 PAGE_SHIFT; 994 PAGE_SHIFT;
918 thp_pmd_to_pagemap_entry(&pme, *pmd, offset); 995 thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset, pmd_flags2);
919 err = add_to_pagemap(addr, &pme, pm); 996 err = add_to_pagemap(addr, &pme, pm);
920 if (err) 997 if (err)
921 break; 998 break;
@@ -932,7 +1009,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
932 * and need a new, higher one */ 1009 * and need a new, higher one */
933 if (vma && (addr >= vma->vm_end)) { 1010 if (vma && (addr >= vma->vm_end)) {
934 vma = find_vma(walk->mm, addr); 1011 vma = find_vma(walk->mm, addr);
935 pme = make_pme(PM_NOT_PRESENT); 1012 pme = make_pme(PM_NOT_PRESENT(pm->v2));
936 } 1013 }
937 1014
938 /* check that 'vma' actually covers this address, 1015 /* check that 'vma' actually covers this address,
@@ -940,7 +1017,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
940 if (vma && (vma->vm_start <= addr) && 1017 if (vma && (vma->vm_start <= addr) &&
941 !is_vm_hugetlb_page(vma)) { 1018 !is_vm_hugetlb_page(vma)) {
942 pte = pte_offset_map(pmd, addr); 1019 pte = pte_offset_map(pmd, addr);
943 pte_to_pagemap_entry(&pme, vma, addr, *pte); 1020 pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);
944 /* unmap before userspace copy */ 1021 /* unmap before userspace copy */
945 pte_unmap(pte); 1022 pte_unmap(pte);
946 } 1023 }
@@ -955,14 +1032,14 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
955} 1032}
956 1033
957#ifdef CONFIG_HUGETLB_PAGE 1034#ifdef CONFIG_HUGETLB_PAGE
958static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, 1035static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
959 pte_t pte, int offset) 1036 pte_t pte, int offset)
960{ 1037{
961 if (pte_present(pte)) 1038 if (pte_present(pte))
962 *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) 1039 *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset)
963 | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT); 1040 | PM_STATUS2(pm->v2, 0) | PM_PRESENT);
964 else 1041 else
965 *pme = make_pme(PM_NOT_PRESENT); 1042 *pme = make_pme(PM_NOT_PRESENT(pm->v2));
966} 1043}
967 1044
968/* This function walks within one hugetlb entry in the single call */ 1045/* This function walks within one hugetlb entry in the single call */
@@ -976,7 +1053,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
976 1053
977 for (; addr != end; addr += PAGE_SIZE) { 1054 for (; addr != end; addr += PAGE_SIZE) {
978 int offset = (addr & ~hmask) >> PAGE_SHIFT; 1055 int offset = (addr & ~hmask) >> PAGE_SHIFT;
979 huge_pte_to_pagemap_entry(&pme, *pte, offset); 1056 huge_pte_to_pagemap_entry(&pme, pm, *pte, offset);
980 err = add_to_pagemap(addr, &pme, pm); 1057 err = add_to_pagemap(addr, &pme, pm);
981 if (err) 1058 if (err)
982 return err; 1059 return err;
@@ -1038,6 +1115,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
1038 if (!count) 1115 if (!count)
1039 goto out_task; 1116 goto out_task;
1040 1117
1118 pm.v2 = soft_dirty_cleared;
1041 pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); 1119 pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
1042 pm.buffer = kmalloc(pm.len, GFP_TEMPORARY); 1120 pm.buffer = kmalloc(pm.len, GFP_TEMPORARY);
1043 ret = -ENOMEM; 1121 ret = -ENOMEM;
@@ -1110,9 +1188,18 @@ out:
1110 return ret; 1188 return ret;
1111} 1189}
1112 1190
1191static int pagemap_open(struct inode *inode, struct file *file)
1192{
1193 pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about "
1194 "to stop being page-shift some time soon. See the "
1195 "linux/Documentation/vm/pagemap.txt for details.\n");
1196 return 0;
1197}
1198
1113const struct file_operations proc_pagemap_operations = { 1199const struct file_operations proc_pagemap_operations = {
1114 .llseek = mem_lseek, /* borrow this */ 1200 .llseek = mem_lseek, /* borrow this */
1115 .read = pagemap_read, 1201 .read = pagemap_read,
1202 .open = pagemap_open,
1116}; 1203};
1117#endif /* CONFIG_PROC_PAGE_MONITOR */ 1204#endif /* CONFIG_PROC_PAGE_MONITOR */
1118 1205
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index 9610ac772d7e..061894625903 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -20,8 +20,7 @@ static int uptime_proc_show(struct seq_file *m, void *v)
20 for_each_possible_cpu(i) 20 for_each_possible_cpu(i)
21 idletime += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE]; 21 idletime += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE];
22 22
23 do_posix_clock_monotonic_gettime(&uptime); 23 get_monotonic_boottime(&uptime);
24 monotonic_to_bootbased(&uptime);
25 nsec = cputime64_to_jiffies64(idletime) * TICK_NSEC; 24 nsec = cputime64_to_jiffies64(idletime) * TICK_NSEC;
26 idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem); 25 idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem);
27 idle.tv_nsec = rem; 26 idle.tv_nsec = rem;
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 17f7e080d7ff..28503172f2e4 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -20,6 +20,7 @@
20#include <linux/init.h> 20#include <linux/init.h>
21#include <linux/crash_dump.h> 21#include <linux/crash_dump.h>
22#include <linux/list.h> 22#include <linux/list.h>
23#include <linux/vmalloc.h>
23#include <asm/uaccess.h> 24#include <asm/uaccess.h>
24#include <asm/io.h> 25#include <asm/io.h>
25#include "internal.h" 26#include "internal.h"
@@ -32,6 +33,10 @@ static LIST_HEAD(vmcore_list);
32/* Stores the pointer to the buffer containing kernel elf core headers. */ 33/* Stores the pointer to the buffer containing kernel elf core headers. */
33static char *elfcorebuf; 34static char *elfcorebuf;
34static size_t elfcorebuf_sz; 35static size_t elfcorebuf_sz;
36static size_t elfcorebuf_sz_orig;
37
38static char *elfnotes_buf;
39static size_t elfnotes_sz;
35 40
36/* Total size of vmcore file. */ 41/* Total size of vmcore file. */
37static u64 vmcore_size; 42static u64 vmcore_size;
@@ -118,27 +123,6 @@ static ssize_t read_from_oldmem(char *buf, size_t count,
118 return read; 123 return read;
119} 124}
120 125
121/* Maps vmcore file offset to respective physical address in memroy. */
122static u64 map_offset_to_paddr(loff_t offset, struct list_head *vc_list,
123 struct vmcore **m_ptr)
124{
125 struct vmcore *m;
126 u64 paddr;
127
128 list_for_each_entry(m, vc_list, list) {
129 u64 start, end;
130 start = m->offset;
131 end = m->offset + m->size - 1;
132 if (offset >= start && offset <= end) {
133 paddr = m->paddr + offset - start;
134 *m_ptr = m;
135 return paddr;
136 }
137 }
138 *m_ptr = NULL;
139 return 0;
140}
141
142/* Read from the ELF header and then the crash dump. On error, negative value is 126/* Read from the ELF header and then the crash dump. On error, negative value is
143 * returned otherwise number of bytes read are returned. 127 * returned otherwise number of bytes read are returned.
144 */ 128 */
@@ -147,8 +131,8 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
147{ 131{
148 ssize_t acc = 0, tmp; 132 ssize_t acc = 0, tmp;
149 size_t tsz; 133 size_t tsz;
150 u64 start, nr_bytes; 134 u64 start;
151 struct vmcore *curr_m = NULL; 135 struct vmcore *m = NULL;
152 136
153 if (buflen == 0 || *fpos >= vmcore_size) 137 if (buflen == 0 || *fpos >= vmcore_size)
154 return 0; 138 return 0;
@@ -159,9 +143,7 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
159 143
160 /* Read ELF core header */ 144 /* Read ELF core header */
161 if (*fpos < elfcorebuf_sz) { 145 if (*fpos < elfcorebuf_sz) {
162 tsz = elfcorebuf_sz - *fpos; 146 tsz = min(elfcorebuf_sz - (size_t)*fpos, buflen);
163 if (buflen < tsz)
164 tsz = buflen;
165 if (copy_to_user(buffer, elfcorebuf + *fpos, tsz)) 147 if (copy_to_user(buffer, elfcorebuf + *fpos, tsz))
166 return -EFAULT; 148 return -EFAULT;
167 buflen -= tsz; 149 buflen -= tsz;
@@ -174,39 +156,161 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
174 return acc; 156 return acc;
175 } 157 }
176 158
177 start = map_offset_to_paddr(*fpos, &vmcore_list, &curr_m); 159 /* Read Elf note segment */
178 if (!curr_m) 160 if (*fpos < elfcorebuf_sz + elfnotes_sz) {
179 return -EINVAL; 161 void *kaddr;
180
181 while (buflen) {
182 tsz = min_t(size_t, buflen, PAGE_SIZE - (start & ~PAGE_MASK));
183 162
184 /* Calculate left bytes in current memory segment. */ 163 tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)*fpos, buflen);
185 nr_bytes = (curr_m->size - (start - curr_m->paddr)); 164 kaddr = elfnotes_buf + *fpos - elfcorebuf_sz;
186 if (tsz > nr_bytes) 165 if (copy_to_user(buffer, kaddr, tsz))
187 tsz = nr_bytes; 166 return -EFAULT;
188
189 tmp = read_from_oldmem(buffer, tsz, &start, 1);
190 if (tmp < 0)
191 return tmp;
192 buflen -= tsz; 167 buflen -= tsz;
193 *fpos += tsz; 168 *fpos += tsz;
194 buffer += tsz; 169 buffer += tsz;
195 acc += tsz; 170 acc += tsz;
196 if (start >= (curr_m->paddr + curr_m->size)) { 171
197 if (curr_m->list.next == &vmcore_list) 172 /* leave now if filled buffer already */
198 return acc; /*EOF*/ 173 if (buflen == 0)
199 curr_m = list_entry(curr_m->list.next, 174 return acc;
200 struct vmcore, list); 175 }
201 start = curr_m->paddr; 176
177 list_for_each_entry(m, &vmcore_list, list) {
178 if (*fpos < m->offset + m->size) {
179 tsz = min_t(size_t, m->offset + m->size - *fpos, buflen);
180 start = m->paddr + *fpos - m->offset;
181 tmp = read_from_oldmem(buffer, tsz, &start, 1);
182 if (tmp < 0)
183 return tmp;
184 buflen -= tsz;
185 *fpos += tsz;
186 buffer += tsz;
187 acc += tsz;
188
189 /* leave now if filled buffer already */
190 if (buflen == 0)
191 return acc;
202 } 192 }
203 } 193 }
194
204 return acc; 195 return acc;
205} 196}
206 197
198/**
199 * alloc_elfnotes_buf - allocate buffer for ELF note segment in
200 * vmalloc memory
201 *
202 * @notes_sz: size of buffer
203 *
204 * If CONFIG_MMU is defined, use vmalloc_user() to allow users to mmap
205 * the buffer to user-space by means of remap_vmalloc_range().
206 *
207 * If CONFIG_MMU is not defined, use vzalloc() since mmap_vmcore() is
208 * disabled and there's no need to allow users to mmap the buffer.
209 */
210static inline char *alloc_elfnotes_buf(size_t notes_sz)
211{
212#ifdef CONFIG_MMU
213 return vmalloc_user(notes_sz);
214#else
215 return vzalloc(notes_sz);
216#endif
217}
218
219/*
220 * Disable mmap_vmcore() if CONFIG_MMU is not defined. MMU is
221 * essential for mmap_vmcore() in order to map physically
222 * non-contiguous objects (ELF header, ELF note segment and memory
223 * regions in the 1st kernel pointed to by PT_LOAD entries) into
224 * virtually contiguous user-space in ELF layout.
225 */
226#ifdef CONFIG_MMU
227static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
228{
229 size_t size = vma->vm_end - vma->vm_start;
230 u64 start, end, len, tsz;
231 struct vmcore *m;
232
233 start = (u64)vma->vm_pgoff << PAGE_SHIFT;
234 end = start + size;
235
236 if (size > vmcore_size || end > vmcore_size)
237 return -EINVAL;
238
239 if (vma->vm_flags & (VM_WRITE | VM_EXEC))
240 return -EPERM;
241
242 vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC);
243 vma->vm_flags |= VM_MIXEDMAP;
244
245 len = 0;
246
247 if (start < elfcorebuf_sz) {
248 u64 pfn;
249
250 tsz = min(elfcorebuf_sz - (size_t)start, size);
251 pfn = __pa(elfcorebuf + start) >> PAGE_SHIFT;
252 if (remap_pfn_range(vma, vma->vm_start, pfn, tsz,
253 vma->vm_page_prot))
254 return -EAGAIN;
255 size -= tsz;
256 start += tsz;
257 len += tsz;
258
259 if (size == 0)
260 return 0;
261 }
262
263 if (start < elfcorebuf_sz + elfnotes_sz) {
264 void *kaddr;
265
266 tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)start, size);
267 kaddr = elfnotes_buf + start - elfcorebuf_sz;
268 if (remap_vmalloc_range_partial(vma, vma->vm_start + len,
269 kaddr, tsz))
270 goto fail;
271 size -= tsz;
272 start += tsz;
273 len += tsz;
274
275 if (size == 0)
276 return 0;
277 }
278
279 list_for_each_entry(m, &vmcore_list, list) {
280 if (start < m->offset + m->size) {
281 u64 paddr = 0;
282
283 tsz = min_t(size_t, m->offset + m->size - start, size);
284 paddr = m->paddr + start - m->offset;
285 if (remap_pfn_range(vma, vma->vm_start + len,
286 paddr >> PAGE_SHIFT, tsz,
287 vma->vm_page_prot))
288 goto fail;
289 size -= tsz;
290 start += tsz;
291 len += tsz;
292
293 if (size == 0)
294 return 0;
295 }
296 }
297
298 return 0;
299fail:
300 do_munmap(vma->vm_mm, vma->vm_start, len);
301 return -EAGAIN;
302}
303#else
304static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
305{
306 return -ENOSYS;
307}
308#endif
309
207static const struct file_operations proc_vmcore_operations = { 310static const struct file_operations proc_vmcore_operations = {
208 .read = read_vmcore, 311 .read = read_vmcore,
209 .llseek = default_llseek, 312 .llseek = default_llseek,
313 .mmap = mmap_vmcore,
210}; 314};
211 315
212static struct vmcore* __init get_new_element(void) 316static struct vmcore* __init get_new_element(void)
@@ -214,61 +318,40 @@ static struct vmcore* __init get_new_element(void)
214 return kzalloc(sizeof(struct vmcore), GFP_KERNEL); 318 return kzalloc(sizeof(struct vmcore), GFP_KERNEL);
215} 319}
216 320
217static u64 __init get_vmcore_size_elf64(char *elfptr) 321static u64 __init get_vmcore_size(size_t elfsz, size_t elfnotesegsz,
322 struct list_head *vc_list)
218{ 323{
219 int i;
220 u64 size;
221 Elf64_Ehdr *ehdr_ptr;
222 Elf64_Phdr *phdr_ptr;
223
224 ehdr_ptr = (Elf64_Ehdr *)elfptr;
225 phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr));
226 size = sizeof(Elf64_Ehdr) + ((ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr));
227 for (i = 0; i < ehdr_ptr->e_phnum; i++) {
228 size += phdr_ptr->p_memsz;
229 phdr_ptr++;
230 }
231 return size;
232}
233
234static u64 __init get_vmcore_size_elf32(char *elfptr)
235{
236 int i;
237 u64 size; 324 u64 size;
238 Elf32_Ehdr *ehdr_ptr; 325 struct vmcore *m;
239 Elf32_Phdr *phdr_ptr;
240 326
241 ehdr_ptr = (Elf32_Ehdr *)elfptr; 327 size = elfsz + elfnotesegsz;
242 phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); 328 list_for_each_entry(m, vc_list, list) {
243 size = sizeof(Elf32_Ehdr) + ((ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr)); 329 size += m->size;
244 for (i = 0; i < ehdr_ptr->e_phnum; i++) {
245 size += phdr_ptr->p_memsz;
246 phdr_ptr++;
247 } 330 }
248 return size; 331 return size;
249} 332}
250 333
251/* Merges all the PT_NOTE headers into one. */ 334/**
252static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz, 335 * update_note_header_size_elf64 - update p_memsz member of each PT_NOTE entry
253 struct list_head *vc_list) 336 *
337 * @ehdr_ptr: ELF header
338 *
339 * This function updates p_memsz member of each PT_NOTE entry in the
340 * program header table pointed to by @ehdr_ptr to real size of ELF
341 * note segment.
342 */
343static int __init update_note_header_size_elf64(const Elf64_Ehdr *ehdr_ptr)
254{ 344{
255 int i, nr_ptnote=0, rc=0; 345 int i, rc=0;
256 char *tmp; 346 Elf64_Phdr *phdr_ptr;
257 Elf64_Ehdr *ehdr_ptr;
258 Elf64_Phdr phdr, *phdr_ptr;
259 Elf64_Nhdr *nhdr_ptr; 347 Elf64_Nhdr *nhdr_ptr;
260 u64 phdr_sz = 0, note_off;
261 348
262 ehdr_ptr = (Elf64_Ehdr *)elfptr; 349 phdr_ptr = (Elf64_Phdr *)(ehdr_ptr + 1);
263 phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr));
264 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { 350 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
265 int j;
266 void *notes_section; 351 void *notes_section;
267 struct vmcore *new;
268 u64 offset, max_sz, sz, real_sz = 0; 352 u64 offset, max_sz, sz, real_sz = 0;
269 if (phdr_ptr->p_type != PT_NOTE) 353 if (phdr_ptr->p_type != PT_NOTE)
270 continue; 354 continue;
271 nr_ptnote++;
272 max_sz = phdr_ptr->p_memsz; 355 max_sz = phdr_ptr->p_memsz;
273 offset = phdr_ptr->p_offset; 356 offset = phdr_ptr->p_offset;
274 notes_section = kmalloc(max_sz, GFP_KERNEL); 357 notes_section = kmalloc(max_sz, GFP_KERNEL);
@@ -280,7 +363,7 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
280 return rc; 363 return rc;
281 } 364 }
282 nhdr_ptr = notes_section; 365 nhdr_ptr = notes_section;
283 for (j = 0; j < max_sz; j += sz) { 366 while (real_sz < max_sz) {
284 if (nhdr_ptr->n_namesz == 0) 367 if (nhdr_ptr->n_namesz == 0)
285 break; 368 break;
286 sz = sizeof(Elf64_Nhdr) + 369 sz = sizeof(Elf64_Nhdr) +
@@ -289,26 +372,122 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
289 real_sz += sz; 372 real_sz += sz;
290 nhdr_ptr = (Elf64_Nhdr*)((char*)nhdr_ptr + sz); 373 nhdr_ptr = (Elf64_Nhdr*)((char*)nhdr_ptr + sz);
291 } 374 }
292
293 /* Add this contiguous chunk of notes section to vmcore list.*/
294 new = get_new_element();
295 if (!new) {
296 kfree(notes_section);
297 return -ENOMEM;
298 }
299 new->paddr = phdr_ptr->p_offset;
300 new->size = real_sz;
301 list_add_tail(&new->list, vc_list);
302 phdr_sz += real_sz;
303 kfree(notes_section); 375 kfree(notes_section);
376 phdr_ptr->p_memsz = real_sz;
377 }
378
379 return 0;
380}
381
382/**
383 * get_note_number_and_size_elf64 - get the number of PT_NOTE program
384 * headers and sum of real size of their ELF note segment headers and
385 * data.
386 *
387 * @ehdr_ptr: ELF header
388 * @nr_ptnote: buffer for the number of PT_NOTE program headers
389 * @sz_ptnote: buffer for size of unique PT_NOTE program header
390 *
391 * This function is used to merge multiple PT_NOTE program headers
392 * into a unique single one. The resulting unique entry will have
393 * @sz_ptnote in its phdr->p_mem.
394 *
395 * It is assumed that program headers with PT_NOTE type pointed to by
396 * @ehdr_ptr has already been updated by update_note_header_size_elf64
397 * and each of PT_NOTE program headers has actual ELF note segment
398 * size in its p_memsz member.
399 */
400static int __init get_note_number_and_size_elf64(const Elf64_Ehdr *ehdr_ptr,
401 int *nr_ptnote, u64 *sz_ptnote)
402{
403 int i;
404 Elf64_Phdr *phdr_ptr;
405
406 *nr_ptnote = *sz_ptnote = 0;
407
408 phdr_ptr = (Elf64_Phdr *)(ehdr_ptr + 1);
409 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
410 if (phdr_ptr->p_type != PT_NOTE)
411 continue;
412 *nr_ptnote += 1;
413 *sz_ptnote += phdr_ptr->p_memsz;
414 }
415
416 return 0;
417}
418
419/**
420 * copy_notes_elf64 - copy ELF note segments in a given buffer
421 *
422 * @ehdr_ptr: ELF header
423 * @notes_buf: buffer into which ELF note segments are copied
424 *
425 * This function is used to copy ELF note segment in the 1st kernel
426 * into the buffer @notes_buf in the 2nd kernel. It is assumed that
427 * size of the buffer @notes_buf is equal to or larger than sum of the
428 * real ELF note segment headers and data.
429 *
430 * It is assumed that program headers with PT_NOTE type pointed to by
431 * @ehdr_ptr has already been updated by update_note_header_size_elf64
432 * and each of PT_NOTE program headers has actual ELF note segment
433 * size in its p_memsz member.
434 */
435static int __init copy_notes_elf64(const Elf64_Ehdr *ehdr_ptr, char *notes_buf)
436{
437 int i, rc=0;
438 Elf64_Phdr *phdr_ptr;
439
440 phdr_ptr = (Elf64_Phdr*)(ehdr_ptr + 1);
441
442 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
443 u64 offset;
444 if (phdr_ptr->p_type != PT_NOTE)
445 continue;
446 offset = phdr_ptr->p_offset;
447 rc = read_from_oldmem(notes_buf, phdr_ptr->p_memsz, &offset, 0);
448 if (rc < 0)
449 return rc;
450 notes_buf += phdr_ptr->p_memsz;
304 } 451 }
305 452
453 return 0;
454}
455
456/* Merges all the PT_NOTE headers into one. */
457static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
458 char **notes_buf, size_t *notes_sz)
459{
460 int i, nr_ptnote=0, rc=0;
461 char *tmp;
462 Elf64_Ehdr *ehdr_ptr;
463 Elf64_Phdr phdr;
464 u64 phdr_sz = 0, note_off;
465
466 ehdr_ptr = (Elf64_Ehdr *)elfptr;
467
468 rc = update_note_header_size_elf64(ehdr_ptr);
469 if (rc < 0)
470 return rc;
471
472 rc = get_note_number_and_size_elf64(ehdr_ptr, &nr_ptnote, &phdr_sz);
473 if (rc < 0)
474 return rc;
475
476 *notes_sz = roundup(phdr_sz, PAGE_SIZE);
477 *notes_buf = alloc_elfnotes_buf(*notes_sz);
478 if (!*notes_buf)
479 return -ENOMEM;
480
481 rc = copy_notes_elf64(ehdr_ptr, *notes_buf);
482 if (rc < 0)
483 return rc;
484
306 /* Prepare merged PT_NOTE program header. */ 485 /* Prepare merged PT_NOTE program header. */
307 phdr.p_type = PT_NOTE; 486 phdr.p_type = PT_NOTE;
308 phdr.p_flags = 0; 487 phdr.p_flags = 0;
309 note_off = sizeof(Elf64_Ehdr) + 488 note_off = sizeof(Elf64_Ehdr) +
310 (ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf64_Phdr); 489 (ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf64_Phdr);
311 phdr.p_offset = note_off; 490 phdr.p_offset = roundup(note_off, PAGE_SIZE);
312 phdr.p_vaddr = phdr.p_paddr = 0; 491 phdr.p_vaddr = phdr.p_paddr = 0;
313 phdr.p_filesz = phdr.p_memsz = phdr_sz; 492 phdr.p_filesz = phdr.p_memsz = phdr_sz;
314 phdr.p_align = 0; 493 phdr.p_align = 0;
@@ -322,6 +501,8 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
322 i = (nr_ptnote - 1) * sizeof(Elf64_Phdr); 501 i = (nr_ptnote - 1) * sizeof(Elf64_Phdr);
323 *elfsz = *elfsz - i; 502 *elfsz = *elfsz - i;
324 memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf64_Ehdr)-sizeof(Elf64_Phdr))); 503 memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf64_Ehdr)-sizeof(Elf64_Phdr)));
504 memset(elfptr + *elfsz, 0, i);
505 *elfsz = roundup(*elfsz, PAGE_SIZE);
325 506
326 /* Modify e_phnum to reflect merged headers. */ 507 /* Modify e_phnum to reflect merged headers. */
327 ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1; 508 ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1;
@@ -329,27 +510,27 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
329 return 0; 510 return 0;
330} 511}
331 512
332/* Merges all the PT_NOTE headers into one. */ 513/**
333static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz, 514 * update_note_header_size_elf32 - update p_memsz member of each PT_NOTE entry
334 struct list_head *vc_list) 515 *
516 * @ehdr_ptr: ELF header
517 *
518 * This function updates p_memsz member of each PT_NOTE entry in the
519 * program header table pointed to by @ehdr_ptr to real size of ELF
520 * note segment.
521 */
522static int __init update_note_header_size_elf32(const Elf32_Ehdr *ehdr_ptr)
335{ 523{
336 int i, nr_ptnote=0, rc=0; 524 int i, rc=0;
337 char *tmp; 525 Elf32_Phdr *phdr_ptr;
338 Elf32_Ehdr *ehdr_ptr;
339 Elf32_Phdr phdr, *phdr_ptr;
340 Elf32_Nhdr *nhdr_ptr; 526 Elf32_Nhdr *nhdr_ptr;
341 u64 phdr_sz = 0, note_off;
342 527
343 ehdr_ptr = (Elf32_Ehdr *)elfptr; 528 phdr_ptr = (Elf32_Phdr *)(ehdr_ptr + 1);
344 phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr));
345 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { 529 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
346 int j;
347 void *notes_section; 530 void *notes_section;
348 struct vmcore *new;
349 u64 offset, max_sz, sz, real_sz = 0; 531 u64 offset, max_sz, sz, real_sz = 0;
350 if (phdr_ptr->p_type != PT_NOTE) 532 if (phdr_ptr->p_type != PT_NOTE)
351 continue; 533 continue;
352 nr_ptnote++;
353 max_sz = phdr_ptr->p_memsz; 534 max_sz = phdr_ptr->p_memsz;
354 offset = phdr_ptr->p_offset; 535 offset = phdr_ptr->p_offset;
355 notes_section = kmalloc(max_sz, GFP_KERNEL); 536 notes_section = kmalloc(max_sz, GFP_KERNEL);
@@ -361,7 +542,7 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
361 return rc; 542 return rc;
362 } 543 }
363 nhdr_ptr = notes_section; 544 nhdr_ptr = notes_section;
364 for (j = 0; j < max_sz; j += sz) { 545 while (real_sz < max_sz) {
365 if (nhdr_ptr->n_namesz == 0) 546 if (nhdr_ptr->n_namesz == 0)
366 break; 547 break;
367 sz = sizeof(Elf32_Nhdr) + 548 sz = sizeof(Elf32_Nhdr) +
@@ -370,26 +551,122 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
370 real_sz += sz; 551 real_sz += sz;
371 nhdr_ptr = (Elf32_Nhdr*)((char*)nhdr_ptr + sz); 552 nhdr_ptr = (Elf32_Nhdr*)((char*)nhdr_ptr + sz);
372 } 553 }
373
374 /* Add this contiguous chunk of notes section to vmcore list.*/
375 new = get_new_element();
376 if (!new) {
377 kfree(notes_section);
378 return -ENOMEM;
379 }
380 new->paddr = phdr_ptr->p_offset;
381 new->size = real_sz;
382 list_add_tail(&new->list, vc_list);
383 phdr_sz += real_sz;
384 kfree(notes_section); 554 kfree(notes_section);
555 phdr_ptr->p_memsz = real_sz;
556 }
557
558 return 0;
559}
560
561/**
562 * get_note_number_and_size_elf32 - get the number of PT_NOTE program
563 * headers and sum of real size of their ELF note segment headers and
564 * data.
565 *
566 * @ehdr_ptr: ELF header
567 * @nr_ptnote: buffer for the number of PT_NOTE program headers
568 * @sz_ptnote: buffer for size of unique PT_NOTE program header
569 *
570 * This function is used to merge multiple PT_NOTE program headers
571 * into a unique single one. The resulting unique entry will have
572 * @sz_ptnote in its phdr->p_mem.
573 *
574 * It is assumed that program headers with PT_NOTE type pointed to by
575 * @ehdr_ptr has already been updated by update_note_header_size_elf32
576 * and each of PT_NOTE program headers has actual ELF note segment
577 * size in its p_memsz member.
578 */
579static int __init get_note_number_and_size_elf32(const Elf32_Ehdr *ehdr_ptr,
580 int *nr_ptnote, u64 *sz_ptnote)
581{
582 int i;
583 Elf32_Phdr *phdr_ptr;
584
585 *nr_ptnote = *sz_ptnote = 0;
586
587 phdr_ptr = (Elf32_Phdr *)(ehdr_ptr + 1);
588 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
589 if (phdr_ptr->p_type != PT_NOTE)
590 continue;
591 *nr_ptnote += 1;
592 *sz_ptnote += phdr_ptr->p_memsz;
593 }
594
595 return 0;
596}
597
598/**
599 * copy_notes_elf32 - copy ELF note segments in a given buffer
600 *
601 * @ehdr_ptr: ELF header
602 * @notes_buf: buffer into which ELF note segments are copied
603 *
604 * This function is used to copy ELF note segment in the 1st kernel
605 * into the buffer @notes_buf in the 2nd kernel. It is assumed that
606 * size of the buffer @notes_buf is equal to or larger than sum of the
607 * real ELF note segment headers and data.
608 *
609 * It is assumed that program headers with PT_NOTE type pointed to by
610 * @ehdr_ptr has already been updated by update_note_header_size_elf32
611 * and each of PT_NOTE program headers has actual ELF note segment
612 * size in its p_memsz member.
613 */
614static int __init copy_notes_elf32(const Elf32_Ehdr *ehdr_ptr, char *notes_buf)
615{
616 int i, rc=0;
617 Elf32_Phdr *phdr_ptr;
618
619 phdr_ptr = (Elf32_Phdr*)(ehdr_ptr + 1);
620
621 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
622 u64 offset;
623 if (phdr_ptr->p_type != PT_NOTE)
624 continue;
625 offset = phdr_ptr->p_offset;
626 rc = read_from_oldmem(notes_buf, phdr_ptr->p_memsz, &offset, 0);
627 if (rc < 0)
628 return rc;
629 notes_buf += phdr_ptr->p_memsz;
385 } 630 }
386 631
632 return 0;
633}
634
635/* Merges all the PT_NOTE headers into one. */
636static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
637 char **notes_buf, size_t *notes_sz)
638{
639 int i, nr_ptnote=0, rc=0;
640 char *tmp;
641 Elf32_Ehdr *ehdr_ptr;
642 Elf32_Phdr phdr;
643 u64 phdr_sz = 0, note_off;
644
645 ehdr_ptr = (Elf32_Ehdr *)elfptr;
646
647 rc = update_note_header_size_elf32(ehdr_ptr);
648 if (rc < 0)
649 return rc;
650
651 rc = get_note_number_and_size_elf32(ehdr_ptr, &nr_ptnote, &phdr_sz);
652 if (rc < 0)
653 return rc;
654
655 *notes_sz = roundup(phdr_sz, PAGE_SIZE);
656 *notes_buf = alloc_elfnotes_buf(*notes_sz);
657 if (!*notes_buf)
658 return -ENOMEM;
659
660 rc = copy_notes_elf32(ehdr_ptr, *notes_buf);
661 if (rc < 0)
662 return rc;
663
387 /* Prepare merged PT_NOTE program header. */ 664 /* Prepare merged PT_NOTE program header. */
388 phdr.p_type = PT_NOTE; 665 phdr.p_type = PT_NOTE;
389 phdr.p_flags = 0; 666 phdr.p_flags = 0;
390 note_off = sizeof(Elf32_Ehdr) + 667 note_off = sizeof(Elf32_Ehdr) +
391 (ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf32_Phdr); 668 (ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf32_Phdr);
392 phdr.p_offset = note_off; 669 phdr.p_offset = roundup(note_off, PAGE_SIZE);
393 phdr.p_vaddr = phdr.p_paddr = 0; 670 phdr.p_vaddr = phdr.p_paddr = 0;
394 phdr.p_filesz = phdr.p_memsz = phdr_sz; 671 phdr.p_filesz = phdr.p_memsz = phdr_sz;
395 phdr.p_align = 0; 672 phdr.p_align = 0;
@@ -403,6 +680,8 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
403 i = (nr_ptnote - 1) * sizeof(Elf32_Phdr); 680 i = (nr_ptnote - 1) * sizeof(Elf32_Phdr);
404 *elfsz = *elfsz - i; 681 *elfsz = *elfsz - i;
405 memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf32_Ehdr)-sizeof(Elf32_Phdr))); 682 memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf32_Ehdr)-sizeof(Elf32_Phdr)));
683 memset(elfptr + *elfsz, 0, i);
684 *elfsz = roundup(*elfsz, PAGE_SIZE);
406 685
407 /* Modify e_phnum to reflect merged headers. */ 686 /* Modify e_phnum to reflect merged headers. */
408 ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1; 687 ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1;
@@ -414,6 +693,7 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
414 * the new offset fields of exported program headers. */ 693 * the new offset fields of exported program headers. */
415static int __init process_ptload_program_headers_elf64(char *elfptr, 694static int __init process_ptload_program_headers_elf64(char *elfptr,
416 size_t elfsz, 695 size_t elfsz,
696 size_t elfnotes_sz,
417 struct list_head *vc_list) 697 struct list_head *vc_list)
418{ 698{
419 int i; 699 int i;
@@ -425,32 +705,38 @@ static int __init process_ptload_program_headers_elf64(char *elfptr,
425 ehdr_ptr = (Elf64_Ehdr *)elfptr; 705 ehdr_ptr = (Elf64_Ehdr *)elfptr;
426 phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); /* PT_NOTE hdr */ 706 phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); /* PT_NOTE hdr */
427 707
428 /* First program header is PT_NOTE header. */ 708 /* Skip Elf header, program headers and Elf note segment. */
429 vmcore_off = sizeof(Elf64_Ehdr) + 709 vmcore_off = elfsz + elfnotes_sz;
430 (ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr) +
431 phdr_ptr->p_memsz; /* Note sections */
432 710
433 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { 711 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
712 u64 paddr, start, end, size;
713
434 if (phdr_ptr->p_type != PT_LOAD) 714 if (phdr_ptr->p_type != PT_LOAD)
435 continue; 715 continue;
436 716
717 paddr = phdr_ptr->p_offset;
718 start = rounddown(paddr, PAGE_SIZE);
719 end = roundup(paddr + phdr_ptr->p_memsz, PAGE_SIZE);
720 size = end - start;
721
437 /* Add this contiguous chunk of memory to vmcore list.*/ 722 /* Add this contiguous chunk of memory to vmcore list.*/
438 new = get_new_element(); 723 new = get_new_element();
439 if (!new) 724 if (!new)
440 return -ENOMEM; 725 return -ENOMEM;
441 new->paddr = phdr_ptr->p_offset; 726 new->paddr = start;
442 new->size = phdr_ptr->p_memsz; 727 new->size = size;
443 list_add_tail(&new->list, vc_list); 728 list_add_tail(&new->list, vc_list);
444 729
445 /* Update the program header offset. */ 730 /* Update the program header offset. */
446 phdr_ptr->p_offset = vmcore_off; 731 phdr_ptr->p_offset = vmcore_off + (paddr - start);
447 vmcore_off = vmcore_off + phdr_ptr->p_memsz; 732 vmcore_off = vmcore_off + size;
448 } 733 }
449 return 0; 734 return 0;
450} 735}
451 736
452static int __init process_ptload_program_headers_elf32(char *elfptr, 737static int __init process_ptload_program_headers_elf32(char *elfptr,
453 size_t elfsz, 738 size_t elfsz,
739 size_t elfnotes_sz,
454 struct list_head *vc_list) 740 struct list_head *vc_list)
455{ 741{
456 int i; 742 int i;
@@ -462,43 +748,44 @@ static int __init process_ptload_program_headers_elf32(char *elfptr,
462 ehdr_ptr = (Elf32_Ehdr *)elfptr; 748 ehdr_ptr = (Elf32_Ehdr *)elfptr;
463 phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); /* PT_NOTE hdr */ 749 phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); /* PT_NOTE hdr */
464 750
465 /* First program header is PT_NOTE header. */ 751 /* Skip Elf header, program headers and Elf note segment. */
466 vmcore_off = sizeof(Elf32_Ehdr) + 752 vmcore_off = elfsz + elfnotes_sz;
467 (ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr) +
468 phdr_ptr->p_memsz; /* Note sections */
469 753
470 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { 754 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
755 u64 paddr, start, end, size;
756
471 if (phdr_ptr->p_type != PT_LOAD) 757 if (phdr_ptr->p_type != PT_LOAD)
472 continue; 758 continue;
473 759
760 paddr = phdr_ptr->p_offset;
761 start = rounddown(paddr, PAGE_SIZE);
762 end = roundup(paddr + phdr_ptr->p_memsz, PAGE_SIZE);
763 size = end - start;
764
474 /* Add this contiguous chunk of memory to vmcore list.*/ 765 /* Add this contiguous chunk of memory to vmcore list.*/
475 new = get_new_element(); 766 new = get_new_element();
476 if (!new) 767 if (!new)
477 return -ENOMEM; 768 return -ENOMEM;
478 new->paddr = phdr_ptr->p_offset; 769 new->paddr = start;
479 new->size = phdr_ptr->p_memsz; 770 new->size = size;
480 list_add_tail(&new->list, vc_list); 771 list_add_tail(&new->list, vc_list);
481 772
482 /* Update the program header offset */ 773 /* Update the program header offset */
483 phdr_ptr->p_offset = vmcore_off; 774 phdr_ptr->p_offset = vmcore_off + (paddr - start);
484 vmcore_off = vmcore_off + phdr_ptr->p_memsz; 775 vmcore_off = vmcore_off + size;
485 } 776 }
486 return 0; 777 return 0;
487} 778}
488 779
489/* Sets offset fields of vmcore elements. */ 780/* Sets offset fields of vmcore elements. */
490static void __init set_vmcore_list_offsets_elf64(char *elfptr, 781static void __init set_vmcore_list_offsets(size_t elfsz, size_t elfnotes_sz,
491 struct list_head *vc_list) 782 struct list_head *vc_list)
492{ 783{
493 loff_t vmcore_off; 784 loff_t vmcore_off;
494 Elf64_Ehdr *ehdr_ptr;
495 struct vmcore *m; 785 struct vmcore *m;
496 786
497 ehdr_ptr = (Elf64_Ehdr *)elfptr; 787 /* Skip Elf header, program headers and Elf note segment. */
498 788 vmcore_off = elfsz + elfnotes_sz;
499 /* Skip Elf header and program headers. */
500 vmcore_off = sizeof(Elf64_Ehdr) +
501 (ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr);
502 789
503 list_for_each_entry(m, vc_list, list) { 790 list_for_each_entry(m, vc_list, list) {
504 m->offset = vmcore_off; 791 m->offset = vmcore_off;
@@ -506,24 +793,12 @@ static void __init set_vmcore_list_offsets_elf64(char *elfptr,
506 } 793 }
507} 794}
508 795
509/* Sets offset fields of vmcore elements. */ 796static void free_elfcorebuf(void)
510static void __init set_vmcore_list_offsets_elf32(char *elfptr,
511 struct list_head *vc_list)
512{ 797{
513 loff_t vmcore_off; 798 free_pages((unsigned long)elfcorebuf, get_order(elfcorebuf_sz_orig));
514 Elf32_Ehdr *ehdr_ptr; 799 elfcorebuf = NULL;
515 struct vmcore *m; 800 vfree(elfnotes_buf);
516 801 elfnotes_buf = NULL;
517 ehdr_ptr = (Elf32_Ehdr *)elfptr;
518
519 /* Skip Elf header and program headers. */
520 vmcore_off = sizeof(Elf32_Ehdr) +
521 (ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr);
522
523 list_for_each_entry(m, vc_list, list) {
524 m->offset = vmcore_off;
525 vmcore_off += m->size;
526 }
527} 802}
528 803
529static int __init parse_crash_elf64_headers(void) 804static int __init parse_crash_elf64_headers(void)
@@ -554,31 +829,32 @@ static int __init parse_crash_elf64_headers(void)
554 } 829 }
555 830
556 /* Read in all elf headers. */ 831 /* Read in all elf headers. */
557 elfcorebuf_sz = sizeof(Elf64_Ehdr) + ehdr.e_phnum * sizeof(Elf64_Phdr); 832 elfcorebuf_sz_orig = sizeof(Elf64_Ehdr) +
558 elfcorebuf = kmalloc(elfcorebuf_sz, GFP_KERNEL); 833 ehdr.e_phnum * sizeof(Elf64_Phdr);
834 elfcorebuf_sz = elfcorebuf_sz_orig;
835 elfcorebuf = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
836 get_order(elfcorebuf_sz_orig));
559 if (!elfcorebuf) 837 if (!elfcorebuf)
560 return -ENOMEM; 838 return -ENOMEM;
561 addr = elfcorehdr_addr; 839 addr = elfcorehdr_addr;
562 rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz, &addr, 0); 840 rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz_orig, &addr, 0);
563 if (rc < 0) { 841 if (rc < 0)
564 kfree(elfcorebuf); 842 goto fail;
565 return rc;
566 }
567 843
568 /* Merge all PT_NOTE headers into one. */ 844 /* Merge all PT_NOTE headers into one. */
569 rc = merge_note_headers_elf64(elfcorebuf, &elfcorebuf_sz, &vmcore_list); 845 rc = merge_note_headers_elf64(elfcorebuf, &elfcorebuf_sz,
570 if (rc) { 846 &elfnotes_buf, &elfnotes_sz);
571 kfree(elfcorebuf); 847 if (rc)
572 return rc; 848 goto fail;
573 }
574 rc = process_ptload_program_headers_elf64(elfcorebuf, elfcorebuf_sz, 849 rc = process_ptload_program_headers_elf64(elfcorebuf, elfcorebuf_sz,
575 &vmcore_list); 850 elfnotes_sz, &vmcore_list);
576 if (rc) { 851 if (rc)
577 kfree(elfcorebuf); 852 goto fail;
578 return rc; 853 set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list);
579 }
580 set_vmcore_list_offsets_elf64(elfcorebuf, &vmcore_list);
581 return 0; 854 return 0;
855fail:
856 free_elfcorebuf();
857 return rc;
582} 858}
583 859
584static int __init parse_crash_elf32_headers(void) 860static int __init parse_crash_elf32_headers(void)
@@ -609,31 +885,31 @@ static int __init parse_crash_elf32_headers(void)
609 } 885 }
610 886
611 /* Read in all elf headers. */ 887 /* Read in all elf headers. */
612 elfcorebuf_sz = sizeof(Elf32_Ehdr) + ehdr.e_phnum * sizeof(Elf32_Phdr); 888 elfcorebuf_sz_orig = sizeof(Elf32_Ehdr) + ehdr.e_phnum * sizeof(Elf32_Phdr);
613 elfcorebuf = kmalloc(elfcorebuf_sz, GFP_KERNEL); 889 elfcorebuf_sz = elfcorebuf_sz_orig;
890 elfcorebuf = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
891 get_order(elfcorebuf_sz_orig));
614 if (!elfcorebuf) 892 if (!elfcorebuf)
615 return -ENOMEM; 893 return -ENOMEM;
616 addr = elfcorehdr_addr; 894 addr = elfcorehdr_addr;
617 rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz, &addr, 0); 895 rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz_orig, &addr, 0);
618 if (rc < 0) { 896 if (rc < 0)
619 kfree(elfcorebuf); 897 goto fail;
620 return rc;
621 }
622 898
623 /* Merge all PT_NOTE headers into one. */ 899 /* Merge all PT_NOTE headers into one. */
624 rc = merge_note_headers_elf32(elfcorebuf, &elfcorebuf_sz, &vmcore_list); 900 rc = merge_note_headers_elf32(elfcorebuf, &elfcorebuf_sz,
625 if (rc) { 901 &elfnotes_buf, &elfnotes_sz);
626 kfree(elfcorebuf); 902 if (rc)
627 return rc; 903 goto fail;
628 }
629 rc = process_ptload_program_headers_elf32(elfcorebuf, elfcorebuf_sz, 904 rc = process_ptload_program_headers_elf32(elfcorebuf, elfcorebuf_sz,
630 &vmcore_list); 905 elfnotes_sz, &vmcore_list);
631 if (rc) { 906 if (rc)
632 kfree(elfcorebuf); 907 goto fail;
633 return rc; 908 set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list);
634 }
635 set_vmcore_list_offsets_elf32(elfcorebuf, &vmcore_list);
636 return 0; 909 return 0;
910fail:
911 free_elfcorebuf();
912 return rc;
637} 913}
638 914
639static int __init parse_crash_elf_headers(void) 915static int __init parse_crash_elf_headers(void)
@@ -655,20 +931,19 @@ static int __init parse_crash_elf_headers(void)
655 rc = parse_crash_elf64_headers(); 931 rc = parse_crash_elf64_headers();
656 if (rc) 932 if (rc)
657 return rc; 933 return rc;
658
659 /* Determine vmcore size. */
660 vmcore_size = get_vmcore_size_elf64(elfcorebuf);
661 } else if (e_ident[EI_CLASS] == ELFCLASS32) { 934 } else if (e_ident[EI_CLASS] == ELFCLASS32) {
662 rc = parse_crash_elf32_headers(); 935 rc = parse_crash_elf32_headers();
663 if (rc) 936 if (rc)
664 return rc; 937 return rc;
665
666 /* Determine vmcore size. */
667 vmcore_size = get_vmcore_size_elf32(elfcorebuf);
668 } else { 938 } else {
669 pr_warn("Warning: Core image elf header is not sane\n"); 939 pr_warn("Warning: Core image elf header is not sane\n");
670 return -EINVAL; 940 return -EINVAL;
671 } 941 }
942
943 /* Determine vmcore size. */
944 vmcore_size = get_vmcore_size(elfcorebuf_sz, elfnotes_sz,
945 &vmcore_list);
946
672 return 0; 947 return 0;
673} 948}
674 949
@@ -711,7 +986,6 @@ void vmcore_cleanup(void)
711 list_del(&m->list); 986 list_del(&m->list);
712 kfree(m); 987 kfree(m);
713 } 988 }
714 kfree(elfcorebuf); 989 free_elfcorebuf();
715 elfcorebuf = NULL;
716} 990}
717EXPORT_SYMBOL_GPL(vmcore_cleanup); 991EXPORT_SYMBOL_GPL(vmcore_cleanup);
diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c
index 43b12807a51d..76a4eeb92982 100644
--- a/fs/pstore/ftrace.c
+++ b/fs/pstore/ftrace.c
@@ -44,7 +44,7 @@ static void notrace pstore_ftrace_call(unsigned long ip,
44 rec.parent_ip = parent_ip; 44 rec.parent_ip = parent_ip;
45 pstore_ftrace_encode_cpu(&rec, raw_smp_processor_id()); 45 pstore_ftrace_encode_cpu(&rec, raw_smp_processor_id());
46 psinfo->write_buf(PSTORE_TYPE_FTRACE, 0, NULL, 0, (void *)&rec, 46 psinfo->write_buf(PSTORE_TYPE_FTRACE, 0, NULL, 0, (void *)&rec,
47 sizeof(rec), psinfo); 47 0, sizeof(rec), psinfo);
48 48
49 local_irq_restore(flags); 49 local_irq_restore(flags);
50} 50}
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index e4bcb2cf055a..71bf5f4ae84c 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -178,6 +178,8 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry)
178 if (p->psi->erase) 178 if (p->psi->erase)
179 p->psi->erase(p->type, p->id, p->count, 179 p->psi->erase(p->type, p->id, p->count,
180 dentry->d_inode->i_ctime, p->psi); 180 dentry->d_inode->i_ctime, p->psi);
181 else
182 return -EPERM;
181 183
182 return simple_unlink(dir, dentry); 184 return simple_unlink(dir, dentry);
183} 185}
@@ -324,6 +326,15 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
324 case PSTORE_TYPE_MCE: 326 case PSTORE_TYPE_MCE:
325 sprintf(name, "mce-%s-%lld", psname, id); 327 sprintf(name, "mce-%s-%lld", psname, id);
326 break; 328 break;
329 case PSTORE_TYPE_PPC_RTAS:
330 sprintf(name, "rtas-%s-%lld", psname, id);
331 break;
332 case PSTORE_TYPE_PPC_OF:
333 sprintf(name, "powerpc-ofw-%s-%lld", psname, id);
334 break;
335 case PSTORE_TYPE_PPC_COMMON:
336 sprintf(name, "powerpc-common-%s-%lld", psname, id);
337 break;
327 case PSTORE_TYPE_UNKNOWN: 338 case PSTORE_TYPE_UNKNOWN:
328 sprintf(name, "unknown-%s-%lld", psname, id); 339 sprintf(name, "unknown-%s-%lld", psname, id);
329 break; 340 break;
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 86d1038b5a12..422962ae9fc2 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -159,7 +159,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
159 break; 159 break;
160 160
161 ret = psinfo->write(PSTORE_TYPE_DMESG, reason, &id, part, 161 ret = psinfo->write(PSTORE_TYPE_DMESG, reason, &id, part,
162 oopscount, hsize + len, psinfo); 162 oopscount, hsize, hsize + len, psinfo);
163 if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted()) 163 if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted())
164 pstore_new_entry = 1; 164 pstore_new_entry = 1;
165 165
@@ -196,7 +196,7 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c)
196 spin_lock_irqsave(&psinfo->buf_lock, flags); 196 spin_lock_irqsave(&psinfo->buf_lock, flags);
197 } 197 }
198 memcpy(psinfo->buf, s, c); 198 memcpy(psinfo->buf, s, c);
199 psinfo->write(PSTORE_TYPE_CONSOLE, 0, &id, 0, 0, c, psinfo); 199 psinfo->write(PSTORE_TYPE_CONSOLE, 0, &id, 0, 0, 0, c, psinfo);
200 spin_unlock_irqrestore(&psinfo->buf_lock, flags); 200 spin_unlock_irqrestore(&psinfo->buf_lock, flags);
201 s += c; 201 s += c;
202 c = e - s; 202 c = e - s;
@@ -221,9 +221,11 @@ static void pstore_register_console(void) {}
221static int pstore_write_compat(enum pstore_type_id type, 221static int pstore_write_compat(enum pstore_type_id type,
222 enum kmsg_dump_reason reason, 222 enum kmsg_dump_reason reason,
223 u64 *id, unsigned int part, int count, 223 u64 *id, unsigned int part, int count,
224 size_t size, struct pstore_info *psi) 224 size_t hsize, size_t size,
225 struct pstore_info *psi)
225{ 226{
226 return psi->write_buf(type, reason, id, part, psinfo->buf, size, psi); 227 return psi->write_buf(type, reason, id, part, psinfo->buf, hsize,
228 size, psi);
227} 229}
228 230
229/* 231/*
@@ -239,17 +241,15 @@ int pstore_register(struct pstore_info *psi)
239{ 241{
240 struct module *owner = psi->owner; 242 struct module *owner = psi->owner;
241 243
244 if (backend && strcmp(backend, psi->name))
245 return -EPERM;
246
242 spin_lock(&pstore_lock); 247 spin_lock(&pstore_lock);
243 if (psinfo) { 248 if (psinfo) {
244 spin_unlock(&pstore_lock); 249 spin_unlock(&pstore_lock);
245 return -EBUSY; 250 return -EBUSY;
246 } 251 }
247 252
248 if (backend && strcmp(backend, psi->name)) {
249 spin_unlock(&pstore_lock);
250 return -EINVAL;
251 }
252
253 if (!psi->write) 253 if (!psi->write)
254 psi->write = pstore_write_compat; 254 psi->write = pstore_write_compat;
255 psinfo = psi; 255 psinfo = psi;
@@ -274,6 +274,9 @@ int pstore_register(struct pstore_info *psi)
274 add_timer(&pstore_timer); 274 add_timer(&pstore_timer);
275 } 275 }
276 276
277 pr_info("pstore: Registered %s as persistent store backend\n",
278 psi->name);
279
277 return 0; 280 return 0;
278} 281}
279EXPORT_SYMBOL_GPL(pstore_register); 282EXPORT_SYMBOL_GPL(pstore_register);
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 1376e5a8f0d6..a6119f9469e2 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -195,7 +195,8 @@ static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz)
195static int notrace ramoops_pstore_write_buf(enum pstore_type_id type, 195static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
196 enum kmsg_dump_reason reason, 196 enum kmsg_dump_reason reason,
197 u64 *id, unsigned int part, 197 u64 *id, unsigned int part,
198 const char *buf, size_t size, 198 const char *buf,
199 size_t hsize, size_t size,
199 struct pstore_info *psi) 200 struct pstore_info *psi)
200{ 201{
201 struct ramoops_context *cxt = psi->data; 202 struct ramoops_context *cxt = psi->data;
@@ -399,8 +400,6 @@ static int ramoops_probe(struct platform_device *pdev)
399 goto fail_out; 400 goto fail_out;
400 } 401 }
401 402
402 if (!is_power_of_2(pdata->mem_size))
403 pdata->mem_size = rounddown_pow_of_two(pdata->mem_size);
404 if (!is_power_of_2(pdata->record_size)) 403 if (!is_power_of_2(pdata->record_size))
405 pdata->record_size = rounddown_pow_of_two(pdata->record_size); 404 pdata->record_size = rounddown_pow_of_two(pdata->record_size);
406 if (!is_power_of_2(pdata->console_size)) 405 if (!is_power_of_2(pdata->console_size))
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index 59337326e288..de272d426763 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -46,7 +46,7 @@ static inline size_t buffer_start(struct persistent_ram_zone *prz)
46} 46}
47 47
48/* increase and wrap the start pointer, returning the old value */ 48/* increase and wrap the start pointer, returning the old value */
49static inline size_t buffer_start_add(struct persistent_ram_zone *prz, size_t a) 49static size_t buffer_start_add_atomic(struct persistent_ram_zone *prz, size_t a)
50{ 50{
51 int old; 51 int old;
52 int new; 52 int new;
@@ -62,7 +62,7 @@ static inline size_t buffer_start_add(struct persistent_ram_zone *prz, size_t a)
62} 62}
63 63
64/* increase the size counter until it hits the max size */ 64/* increase the size counter until it hits the max size */
65static inline void buffer_size_add(struct persistent_ram_zone *prz, size_t a) 65static void buffer_size_add_atomic(struct persistent_ram_zone *prz, size_t a)
66{ 66{
67 size_t old; 67 size_t old;
68 size_t new; 68 size_t new;
@@ -78,6 +78,53 @@ static inline void buffer_size_add(struct persistent_ram_zone *prz, size_t a)
78 } while (atomic_cmpxchg(&prz->buffer->size, old, new) != old); 78 } while (atomic_cmpxchg(&prz->buffer->size, old, new) != old);
79} 79}
80 80
81static DEFINE_RAW_SPINLOCK(buffer_lock);
82
83/* increase and wrap the start pointer, returning the old value */
84static size_t buffer_start_add_locked(struct persistent_ram_zone *prz, size_t a)
85{
86 int old;
87 int new;
88 unsigned long flags;
89
90 raw_spin_lock_irqsave(&buffer_lock, flags);
91
92 old = atomic_read(&prz->buffer->start);
93 new = old + a;
94 while (unlikely(new > prz->buffer_size))
95 new -= prz->buffer_size;
96 atomic_set(&prz->buffer->start, new);
97
98 raw_spin_unlock_irqrestore(&buffer_lock, flags);
99
100 return old;
101}
102
103/* increase the size counter until it hits the max size */
104static void buffer_size_add_locked(struct persistent_ram_zone *prz, size_t a)
105{
106 size_t old;
107 size_t new;
108 unsigned long flags;
109
110 raw_spin_lock_irqsave(&buffer_lock, flags);
111
112 old = atomic_read(&prz->buffer->size);
113 if (old == prz->buffer_size)
114 goto exit;
115
116 new = old + a;
117 if (new > prz->buffer_size)
118 new = prz->buffer_size;
119 atomic_set(&prz->buffer->size, new);
120
121exit:
122 raw_spin_unlock_irqrestore(&buffer_lock, flags);
123}
124
125static size_t (*buffer_start_add)(struct persistent_ram_zone *, size_t) = buffer_start_add_atomic;
126static void (*buffer_size_add)(struct persistent_ram_zone *, size_t) = buffer_size_add_atomic;
127
81static void notrace persistent_ram_encode_rs8(struct persistent_ram_zone *prz, 128static void notrace persistent_ram_encode_rs8(struct persistent_ram_zone *prz,
82 uint8_t *data, size_t len, uint8_t *ecc) 129 uint8_t *data, size_t len, uint8_t *ecc)
83{ 130{
@@ -372,6 +419,9 @@ static void *persistent_ram_iomap(phys_addr_t start, size_t size)
372 return NULL; 419 return NULL;
373 } 420 }
374 421
422 buffer_start_add = buffer_start_add_locked;
423 buffer_size_add = buffer_size_add_locked;
424
375 return ioremap(start, size); 425 return ioremap(start, size);
376} 426}
377 427
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 3e64169ef527..fbad622841f9 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2585,7 +2585,7 @@ static int do_proc_dqstats(struct ctl_table *table, int write,
2585 return proc_dointvec(table, write, buffer, lenp, ppos); 2585 return proc_dointvec(table, write, buffer, lenp, ppos);
2586} 2586}
2587 2587
2588static ctl_table fs_dqstats_table[] = { 2588static struct ctl_table fs_dqstats_table[] = {
2589 { 2589 {
2590 .procname = "lookups", 2590 .procname = "lookups",
2591 .data = &dqstats.stat[DQST_LOOKUPS], 2591 .data = &dqstats.stat[DQST_LOOKUPS],
@@ -2654,7 +2654,7 @@ static ctl_table fs_dqstats_table[] = {
2654 { }, 2654 { },
2655}; 2655};
2656 2656
2657static ctl_table fs_table[] = { 2657static struct ctl_table fs_table[] = {
2658 { 2658 {
2659 .procname = "quota", 2659 .procname = "quota",
2660 .mode = 0555, 2660 .mode = 0555,
@@ -2663,7 +2663,7 @@ static ctl_table fs_table[] = {
2663 { }, 2663 { },
2664}; 2664};
2665 2665
2666static ctl_table sys_table[] = { 2666static struct ctl_table sys_table[] = {
2667 { 2667 {
2668 .procname = "fs", 2668 .procname = "fs",
2669 .mode = 0555, 2669 .mode = 0555,
diff --git a/fs/read_write.c b/fs/read_write.c
index 2cefa417be34..122a3846d9e1 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -41,8 +41,19 @@ static inline int unsigned_offsets(struct file *file)
41 return file->f_mode & FMODE_UNSIGNED_OFFSET; 41 return file->f_mode & FMODE_UNSIGNED_OFFSET;
42} 42}
43 43
44static loff_t lseek_execute(struct file *file, struct inode *inode, 44/**
45 loff_t offset, loff_t maxsize) 45 * vfs_setpos - update the file offset for lseek
46 * @file: file structure in question
47 * @offset: file offset to seek to
48 * @maxsize: maximum file size
49 *
50 * This is a low-level filesystem helper for updating the file offset to
51 * the value specified by @offset if the given offset is valid and it is
52 * not equal to the current file offset.
53 *
54 * Return the specified offset on success and -EINVAL on invalid offset.
55 */
56loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
46{ 57{
47 if (offset < 0 && !unsigned_offsets(file)) 58 if (offset < 0 && !unsigned_offsets(file))
48 return -EINVAL; 59 return -EINVAL;
@@ -55,6 +66,7 @@ static loff_t lseek_execute(struct file *file, struct inode *inode,
55 } 66 }
56 return offset; 67 return offset;
57} 68}
69EXPORT_SYMBOL(vfs_setpos);
58 70
59/** 71/**
60 * generic_file_llseek_size - generic llseek implementation for regular files 72 * generic_file_llseek_size - generic llseek implementation for regular files
@@ -76,8 +88,6 @@ loff_t
76generic_file_llseek_size(struct file *file, loff_t offset, int whence, 88generic_file_llseek_size(struct file *file, loff_t offset, int whence,
77 loff_t maxsize, loff_t eof) 89 loff_t maxsize, loff_t eof)
78{ 90{
79 struct inode *inode = file->f_mapping->host;
80
81 switch (whence) { 91 switch (whence) {
82 case SEEK_END: 92 case SEEK_END:
83 offset += eof; 93 offset += eof;
@@ -97,8 +107,7 @@ generic_file_llseek_size(struct file *file, loff_t offset, int whence,
97 * like SEEK_SET. 107 * like SEEK_SET.
98 */ 108 */
99 spin_lock(&file->f_lock); 109 spin_lock(&file->f_lock);
100 offset = lseek_execute(file, inode, file->f_pos + offset, 110 offset = vfs_setpos(file, file->f_pos + offset, maxsize);
101 maxsize);
102 spin_unlock(&file->f_lock); 111 spin_unlock(&file->f_lock);
103 return offset; 112 return offset;
104 case SEEK_DATA: 113 case SEEK_DATA:
@@ -120,7 +129,7 @@ generic_file_llseek_size(struct file *file, loff_t offset, int whence,
120 break; 129 break;
121 } 130 }
122 131
123 return lseek_execute(file, inode, offset, maxsize); 132 return vfs_setpos(file, offset, maxsize);
124} 133}
125EXPORT_SYMBOL(generic_file_llseek_size); 134EXPORT_SYMBOL(generic_file_llseek_size);
126 135
@@ -145,6 +154,26 @@ loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
145EXPORT_SYMBOL(generic_file_llseek); 154EXPORT_SYMBOL(generic_file_llseek);
146 155
147/** 156/**
157 * fixed_size_llseek - llseek implementation for fixed-sized devices
158 * @file: file structure to seek on
159 * @offset: file offset to seek to
160 * @whence: type of seek
161 * @size: size of the file
162 *
163 */
164loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size)
165{
166 switch (whence) {
167 case SEEK_SET: case SEEK_CUR: case SEEK_END:
168 return generic_file_llseek_size(file, offset, whence,
169 size, size);
170 default:
171 return -EINVAL;
172 }
173}
174EXPORT_SYMBOL(fixed_size_llseek);
175
176/**
148 * noop_llseek - No Operation Performed llseek implementation 177 * noop_llseek - No Operation Performed llseek implementation
149 * @file: file structure to seek on 178 * @file: file structure to seek on
150 * @offset: file offset to seek to 179 * @offset: file offset to seek to
@@ -296,7 +325,7 @@ out_putf:
296 * them to something that fits in "int" so that others 325 * them to something that fits in "int" so that others
297 * won't have to do range checks all the time. 326 * won't have to do range checks all the time.
298 */ 327 */
299int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count) 328int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
300{ 329{
301 struct inode *inode; 330 struct inode *inode;
302 loff_t pos; 331 loff_t pos;
@@ -477,7 +506,8 @@ SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
477 if (f.file) { 506 if (f.file) {
478 loff_t pos = file_pos_read(f.file); 507 loff_t pos = file_pos_read(f.file);
479 ret = vfs_read(f.file, buf, count, &pos); 508 ret = vfs_read(f.file, buf, count, &pos);
480 file_pos_write(f.file, pos); 509 if (ret >= 0)
510 file_pos_write(f.file, pos);
481 fdput(f); 511 fdput(f);
482 } 512 }
483 return ret; 513 return ret;
@@ -492,7 +522,8 @@ SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
492 if (f.file) { 522 if (f.file) {
493 loff_t pos = file_pos_read(f.file); 523 loff_t pos = file_pos_read(f.file);
494 ret = vfs_write(f.file, buf, count, &pos); 524 ret = vfs_write(f.file, buf, count, &pos);
495 file_pos_write(f.file, pos); 525 if (ret >= 0)
526 file_pos_write(f.file, pos);
496 fdput(f); 527 fdput(f);
497 } 528 }
498 529
@@ -780,7 +811,8 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
780 if (f.file) { 811 if (f.file) {
781 loff_t pos = file_pos_read(f.file); 812 loff_t pos = file_pos_read(f.file);
782 ret = vfs_readv(f.file, vec, vlen, &pos); 813 ret = vfs_readv(f.file, vec, vlen, &pos);
783 file_pos_write(f.file, pos); 814 if (ret >= 0)
815 file_pos_write(f.file, pos);
784 fdput(f); 816 fdput(f);
785 } 817 }
786 818
@@ -799,7 +831,8 @@ SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
799 if (f.file) { 831 if (f.file) {
800 loff_t pos = file_pos_read(f.file); 832 loff_t pos = file_pos_read(f.file);
801 ret = vfs_writev(f.file, vec, vlen, &pos); 833 ret = vfs_writev(f.file, vec, vlen, &pos);
802 file_pos_write(f.file, pos); 834 if (ret >= 0)
835 file_pos_write(f.file, pos);
803 fdput(f); 836 fdput(f);
804 } 837 }
805 838
@@ -959,7 +992,8 @@ COMPAT_SYSCALL_DEFINE3(readv, unsigned long, fd,
959 return -EBADF; 992 return -EBADF;
960 pos = f.file->f_pos; 993 pos = f.file->f_pos;
961 ret = compat_readv(f.file, vec, vlen, &pos); 994 ret = compat_readv(f.file, vec, vlen, &pos);
962 f.file->f_pos = pos; 995 if (ret >= 0)
996 f.file->f_pos = pos;
963 fdput(f); 997 fdput(f);
964 return ret; 998 return ret;
965} 999}
@@ -1025,7 +1059,8 @@ COMPAT_SYSCALL_DEFINE3(writev, unsigned long, fd,
1025 return -EBADF; 1059 return -EBADF;
1026 pos = f.file->f_pos; 1060 pos = f.file->f_pos;
1027 ret = compat_writev(f.file, vec, vlen, &pos); 1061 ret = compat_writev(f.file, vec, vlen, &pos);
1028 f.file->f_pos = pos; 1062 if (ret >= 0)
1063 f.file->f_pos = pos;
1029 fdput(f); 1064 fdput(f);
1030 return ret; 1065 return ret;
1031} 1066}
@@ -1129,7 +1164,9 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
1129 if (in.file->f_flags & O_NONBLOCK) 1164 if (in.file->f_flags & O_NONBLOCK)
1130 fl = SPLICE_F_NONBLOCK; 1165 fl = SPLICE_F_NONBLOCK;
1131#endif 1166#endif
1167 file_start_write(out.file);
1132 retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl); 1168 retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl);
1169 file_end_write(out.file);
1133 1170
1134 if (retval > 0) { 1171 if (retval > 0) {
1135 add_rchar(current, retval); 1172 add_rchar(current, retval);
diff --git a/fs/select.c b/fs/select.c
index 8c1c96c27062..f9f49c40cfd4 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -27,6 +27,8 @@
27#include <linux/rcupdate.h> 27#include <linux/rcupdate.h>
28#include <linux/hrtimer.h> 28#include <linux/hrtimer.h>
29#include <linux/sched/rt.h> 29#include <linux/sched/rt.h>
30#include <linux/freezer.h>
31#include <net/ll_poll.h>
30 32
31#include <asm/uaccess.h> 33#include <asm/uaccess.h>
32 34
@@ -236,7 +238,8 @@ int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
236 238
237 set_current_state(state); 239 set_current_state(state);
238 if (!pwq->triggered) 240 if (!pwq->triggered)
239 rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS); 241 rc = freezable_schedule_hrtimeout_range(expires, slack,
242 HRTIMER_MODE_ABS);
240 __set_current_state(TASK_RUNNING); 243 __set_current_state(TASK_RUNNING);
241 244
242 /* 245 /*
@@ -384,9 +387,10 @@ get_max:
384#define POLLEX_SET (POLLPRI) 387#define POLLEX_SET (POLLPRI)
385 388
386static inline void wait_key_set(poll_table *wait, unsigned long in, 389static inline void wait_key_set(poll_table *wait, unsigned long in,
387 unsigned long out, unsigned long bit) 390 unsigned long out, unsigned long bit,
391 unsigned int ll_flag)
388{ 392{
389 wait->_key = POLLEX_SET; 393 wait->_key = POLLEX_SET | ll_flag;
390 if (in & bit) 394 if (in & bit)
391 wait->_key |= POLLIN_SET; 395 wait->_key |= POLLIN_SET;
392 if (out & bit) 396 if (out & bit)
@@ -400,6 +404,8 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
400 poll_table *wait; 404 poll_table *wait;
401 int retval, i, timed_out = 0; 405 int retval, i, timed_out = 0;
402 unsigned long slack = 0; 406 unsigned long slack = 0;
407 unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
408 unsigned long busy_end = 0;
403 409
404 rcu_read_lock(); 410 rcu_read_lock();
405 retval = max_select_fd(n, fds); 411 retval = max_select_fd(n, fds);
@@ -422,6 +428,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
422 retval = 0; 428 retval = 0;
423 for (;;) { 429 for (;;) {
424 unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp; 430 unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
431 bool can_busy_loop = false;
425 432
426 inp = fds->in; outp = fds->out; exp = fds->ex; 433 inp = fds->in; outp = fds->out; exp = fds->ex;
427 rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex; 434 rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;
@@ -449,7 +456,8 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
449 f_op = f.file->f_op; 456 f_op = f.file->f_op;
450 mask = DEFAULT_POLLMASK; 457 mask = DEFAULT_POLLMASK;
451 if (f_op && f_op->poll) { 458 if (f_op && f_op->poll) {
452 wait_key_set(wait, in, out, bit); 459 wait_key_set(wait, in, out,
460 bit, busy_flag);
453 mask = (*f_op->poll)(f.file, wait); 461 mask = (*f_op->poll)(f.file, wait);
454 } 462 }
455 fdput(f); 463 fdput(f);
@@ -468,6 +476,18 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
468 retval++; 476 retval++;
469 wait->_qproc = NULL; 477 wait->_qproc = NULL;
470 } 478 }
479 /* got something, stop busy polling */
480 if (retval) {
481 can_busy_loop = false;
482 busy_flag = 0;
483
484 /*
485 * only remember a returned
486 * POLL_BUSY_LOOP if we asked for it
487 */
488 } else if (busy_flag & mask)
489 can_busy_loop = true;
490
471 } 491 }
472 } 492 }
473 if (res_in) 493 if (res_in)
@@ -486,6 +506,17 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
486 break; 506 break;
487 } 507 }
488 508
509 /* only if found POLL_BUSY_LOOP sockets && not out of time */
510 if (can_busy_loop && !need_resched()) {
511 if (!busy_end) {
512 busy_end = busy_loop_end_time();
513 continue;
514 }
515 if (!busy_loop_timeout(busy_end))
516 continue;
517 }
518 busy_flag = 0;
519
489 /* 520 /*
490 * If this is the first loop and we have a timeout 521 * If this is the first loop and we have a timeout
491 * given, then we convert to ktime_t and set the to 522 * given, then we convert to ktime_t and set the to
@@ -717,7 +748,9 @@ struct poll_list {
717 * pwait poll_table will be used by the fd-provided poll handler for waiting, 748 * pwait poll_table will be used by the fd-provided poll handler for waiting,
718 * if pwait->_qproc is non-NULL. 749 * if pwait->_qproc is non-NULL.
719 */ 750 */
720static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait) 751static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait,
752 bool *can_busy_poll,
753 unsigned int busy_flag)
721{ 754{
722 unsigned int mask; 755 unsigned int mask;
723 int fd; 756 int fd;
@@ -731,7 +764,10 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
731 mask = DEFAULT_POLLMASK; 764 mask = DEFAULT_POLLMASK;
732 if (f.file->f_op && f.file->f_op->poll) { 765 if (f.file->f_op && f.file->f_op->poll) {
733 pwait->_key = pollfd->events|POLLERR|POLLHUP; 766 pwait->_key = pollfd->events|POLLERR|POLLHUP;
767 pwait->_key |= busy_flag;
734 mask = f.file->f_op->poll(f.file, pwait); 768 mask = f.file->f_op->poll(f.file, pwait);
769 if (mask & busy_flag)
770 *can_busy_poll = true;
735 } 771 }
736 /* Mask out unneeded events. */ 772 /* Mask out unneeded events. */
737 mask &= pollfd->events | POLLERR | POLLHUP; 773 mask &= pollfd->events | POLLERR | POLLHUP;
@@ -750,6 +786,8 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
750 ktime_t expire, *to = NULL; 786 ktime_t expire, *to = NULL;
751 int timed_out = 0, count = 0; 787 int timed_out = 0, count = 0;
752 unsigned long slack = 0; 788 unsigned long slack = 0;
789 unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
790 unsigned long busy_end = 0;
753 791
754 /* Optimise the no-wait case */ 792 /* Optimise the no-wait case */
755 if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { 793 if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
@@ -762,6 +800,7 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
762 800
763 for (;;) { 801 for (;;) {
764 struct poll_list *walk; 802 struct poll_list *walk;
803 bool can_busy_loop = false;
765 804
766 for (walk = list; walk != NULL; walk = walk->next) { 805 for (walk = list; walk != NULL; walk = walk->next) {
767 struct pollfd * pfd, * pfd_end; 806 struct pollfd * pfd, * pfd_end;
@@ -776,9 +815,13 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
776 * this. They'll get immediately deregistered 815 * this. They'll get immediately deregistered
777 * when we break out and return. 816 * when we break out and return.
778 */ 817 */
779 if (do_pollfd(pfd, pt)) { 818 if (do_pollfd(pfd, pt, &can_busy_loop,
819 busy_flag)) {
780 count++; 820 count++;
781 pt->_qproc = NULL; 821 pt->_qproc = NULL;
822 /* found something, stop busy polling */
823 busy_flag = 0;
824 can_busy_loop = false;
782 } 825 }
783 } 826 }
784 } 827 }
@@ -795,6 +838,17 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
795 if (count || timed_out) 838 if (count || timed_out)
796 break; 839 break;
797 840
841 /* only if found POLL_BUSY_LOOP sockets && not out of time */
842 if (can_busy_loop && !need_resched()) {
843 if (!busy_end) {
844 busy_end = busy_loop_end_time();
845 continue;
846 }
847 if (!busy_loop_timeout(busy_end))
848 continue;
849 }
850 busy_flag = 0;
851
798 /* 852 /*
799 * If this is the first loop and we have a timeout 853 * If this is the first loop and we have a timeout
800 * given, then we convert to ktime_t and set the to 854 * given, then we convert to ktime_t and set the to
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 774c1eb7f1c9..3135c2525c76 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -921,3 +921,57 @@ struct hlist_node *seq_hlist_next_rcu(void *v,
921 return rcu_dereference(node->next); 921 return rcu_dereference(node->next);
922} 922}
923EXPORT_SYMBOL(seq_hlist_next_rcu); 923EXPORT_SYMBOL(seq_hlist_next_rcu);
924
925/**
926 * seq_hlist_start_percpu - start an iteration of a percpu hlist array
927 * @head: pointer to percpu array of struct hlist_heads
928 * @cpu: pointer to cpu "cursor"
929 * @pos: start position of sequence
930 *
931 * Called at seq_file->op->start().
932 */
933struct hlist_node *
934seq_hlist_start_percpu(struct hlist_head __percpu *head, int *cpu, loff_t pos)
935{
936 struct hlist_node *node;
937
938 for_each_possible_cpu(*cpu) {
939 hlist_for_each(node, per_cpu_ptr(head, *cpu)) {
940 if (pos-- == 0)
941 return node;
942 }
943 }
944 return NULL;
945}
946EXPORT_SYMBOL(seq_hlist_start_percpu);
947
948/**
949 * seq_hlist_next_percpu - move to the next position of the percpu hlist array
950 * @v: pointer to current hlist_node
951 * @head: pointer to percpu array of struct hlist_heads
952 * @cpu: pointer to cpu "cursor"
953 * @pos: pointer to the current position of sequence (incremented here)
954 *
955 * Called at seq_file->op->next().
956 */
957struct hlist_node *
958seq_hlist_next_percpu(void *v, struct hlist_head __percpu *head,
959 int *cpu, loff_t *pos)
960{
961 struct hlist_node *node = v;
962
963 ++*pos;
964
965 if (node->next)
966 return node->next;
967
968 for (*cpu = cpumask_next(*cpu, cpu_possible_mask); *cpu < nr_cpu_ids;
969 *cpu = cpumask_next(*cpu, cpu_possible_mask)) {
970 struct hlist_head *bucket = per_cpu_ptr(head, *cpu);
971
972 if (!hlist_empty(bucket))
973 return bucket->first;
974 }
975 return NULL;
976}
977EXPORT_SYMBOL(seq_hlist_next_percpu);
diff --git a/fs/splice.c b/fs/splice.c
index d37431dd60a1..3b7ee656f3aa 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1098,27 +1098,13 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
1098{ 1098{
1099 ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, 1099 ssize_t (*splice_write)(struct pipe_inode_info *, struct file *,
1100 loff_t *, size_t, unsigned int); 1100 loff_t *, size_t, unsigned int);
1101 int ret;
1102
1103 if (unlikely(!(out->f_mode & FMODE_WRITE)))
1104 return -EBADF;
1105
1106 if (unlikely(out->f_flags & O_APPEND))
1107 return -EINVAL;
1108
1109 ret = rw_verify_area(WRITE, out, ppos, len);
1110 if (unlikely(ret < 0))
1111 return ret;
1112 1101
1113 if (out->f_op && out->f_op->splice_write) 1102 if (out->f_op && out->f_op->splice_write)
1114 splice_write = out->f_op->splice_write; 1103 splice_write = out->f_op->splice_write;
1115 else 1104 else
1116 splice_write = default_file_splice_write; 1105 splice_write = default_file_splice_write;
1117 1106
1118 file_start_write(out); 1107 return splice_write(pipe, out, ppos, len, flags);
1119 ret = splice_write(pipe, out, ppos, len, flags);
1120 file_end_write(out);
1121 return ret;
1122} 1108}
1123 1109
1124/* 1110/*
@@ -1307,6 +1293,16 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
1307 }; 1293 };
1308 long ret; 1294 long ret;
1309 1295
1296 if (unlikely(!(out->f_mode & FMODE_WRITE)))
1297 return -EBADF;
1298
1299 if (unlikely(out->f_flags & O_APPEND))
1300 return -EINVAL;
1301
1302 ret = rw_verify_area(WRITE, out, opos, len);
1303 if (unlikely(ret < 0))
1304 return ret;
1305
1310 ret = splice_direct_to_actor(in, &sd, direct_splice_actor); 1306 ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
1311 if (ret > 0) 1307 if (ret > 0)
1312 *ppos = sd.pos; 1308 *ppos = sd.pos;
@@ -1362,7 +1358,19 @@ static long do_splice(struct file *in, loff_t __user *off_in,
1362 offset = out->f_pos; 1358 offset = out->f_pos;
1363 } 1359 }
1364 1360
1361 if (unlikely(!(out->f_mode & FMODE_WRITE)))
1362 return -EBADF;
1363
1364 if (unlikely(out->f_flags & O_APPEND))
1365 return -EINVAL;
1366
1367 ret = rw_verify_area(WRITE, out, &offset, len);
1368 if (unlikely(ret < 0))
1369 return ret;
1370
1371 file_start_write(out);
1365 ret = do_splice_from(ipipe, out, &offset, len, flags); 1372 ret = do_splice_from(ipipe, out, &offset, len, flags);
1373 file_end_write(out);
1366 1374
1367 if (!off_out) 1375 if (!off_out)
1368 out->f_pos = offset; 1376 out->f_pos = offset;
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index 1c0d5f264767..731b2bbcaab3 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -27,8 +27,7 @@ static int add_nondir(struct dentry *dentry, struct inode *inode)
27 return err; 27 return err;
28} 28}
29 29
30static int sysv_hash(const struct dentry *dentry, const struct inode *inode, 30static int sysv_hash(const struct dentry *dentry, struct qstr *qstr)
31 struct qstr *qstr)
32{ 31{
33 /* Truncate the name in place, avoids having to define a compare 32 /* Truncate the name in place, avoids having to define a compare
34 function. */ 33 function. */
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 32b644f03690..929312180dd0 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -8,6 +8,7 @@
8 * 8 *
9 */ 9 */
10 10
11#include <linux/alarmtimer.h>
11#include <linux/file.h> 12#include <linux/file.h>
12#include <linux/poll.h> 13#include <linux/poll.h>
13#include <linux/init.h> 14#include <linux/init.h>
@@ -26,7 +27,10 @@
26#include <linux/rcupdate.h> 27#include <linux/rcupdate.h>
27 28
28struct timerfd_ctx { 29struct timerfd_ctx {
29 struct hrtimer tmr; 30 union {
31 struct hrtimer tmr;
32 struct alarm alarm;
33 } t;
30 ktime_t tintv; 34 ktime_t tintv;
31 ktime_t moffs; 35 ktime_t moffs;
32 wait_queue_head_t wqh; 36 wait_queue_head_t wqh;
@@ -41,14 +45,19 @@ struct timerfd_ctx {
41static LIST_HEAD(cancel_list); 45static LIST_HEAD(cancel_list);
42static DEFINE_SPINLOCK(cancel_lock); 46static DEFINE_SPINLOCK(cancel_lock);
43 47
48static inline bool isalarm(struct timerfd_ctx *ctx)
49{
50 return ctx->clockid == CLOCK_REALTIME_ALARM ||
51 ctx->clockid == CLOCK_BOOTTIME_ALARM;
52}
53
44/* 54/*
45 * This gets called when the timer event triggers. We set the "expired" 55 * This gets called when the timer event triggers. We set the "expired"
46 * flag, but we do not re-arm the timer (in case it's necessary, 56 * flag, but we do not re-arm the timer (in case it's necessary,
47 * tintv.tv64 != 0) until the timer is accessed. 57 * tintv.tv64 != 0) until the timer is accessed.
48 */ 58 */
49static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr) 59static void timerfd_triggered(struct timerfd_ctx *ctx)
50{ 60{
51 struct timerfd_ctx *ctx = container_of(htmr, struct timerfd_ctx, tmr);
52 unsigned long flags; 61 unsigned long flags;
53 62
54 spin_lock_irqsave(&ctx->wqh.lock, flags); 63 spin_lock_irqsave(&ctx->wqh.lock, flags);
@@ -56,10 +65,25 @@ static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr)
56 ctx->ticks++; 65 ctx->ticks++;
57 wake_up_locked(&ctx->wqh); 66 wake_up_locked(&ctx->wqh);
58 spin_unlock_irqrestore(&ctx->wqh.lock, flags); 67 spin_unlock_irqrestore(&ctx->wqh.lock, flags);
68}
59 69
70static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr)
71{
72 struct timerfd_ctx *ctx = container_of(htmr, struct timerfd_ctx,
73 t.tmr);
74 timerfd_triggered(ctx);
60 return HRTIMER_NORESTART; 75 return HRTIMER_NORESTART;
61} 76}
62 77
78static enum alarmtimer_restart timerfd_alarmproc(struct alarm *alarm,
79 ktime_t now)
80{
81 struct timerfd_ctx *ctx = container_of(alarm, struct timerfd_ctx,
82 t.alarm);
83 timerfd_triggered(ctx);
84 return ALARMTIMER_NORESTART;
85}
86
63/* 87/*
64 * Called when the clock was set to cancel the timers in the cancel 88 * Called when the clock was set to cancel the timers in the cancel
65 * list. This will wake up processes waiting on these timers. The 89 * list. This will wake up processes waiting on these timers. The
@@ -107,8 +131,9 @@ static bool timerfd_canceled(struct timerfd_ctx *ctx)
107 131
108static void timerfd_setup_cancel(struct timerfd_ctx *ctx, int flags) 132static void timerfd_setup_cancel(struct timerfd_ctx *ctx, int flags)
109{ 133{
110 if (ctx->clockid == CLOCK_REALTIME && (flags & TFD_TIMER_ABSTIME) && 134 if ((ctx->clockid == CLOCK_REALTIME ||
111 (flags & TFD_TIMER_CANCEL_ON_SET)) { 135 ctx->clockid == CLOCK_REALTIME_ALARM) &&
136 (flags & TFD_TIMER_ABSTIME) && (flags & TFD_TIMER_CANCEL_ON_SET)) {
112 if (!ctx->might_cancel) { 137 if (!ctx->might_cancel) {
113 ctx->might_cancel = true; 138 ctx->might_cancel = true;
114 spin_lock(&cancel_lock); 139 spin_lock(&cancel_lock);
@@ -124,7 +149,11 @@ static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx)
124{ 149{
125 ktime_t remaining; 150 ktime_t remaining;
126 151
127 remaining = hrtimer_expires_remaining(&ctx->tmr); 152 if (isalarm(ctx))
153 remaining = alarm_expires_remaining(&ctx->t.alarm);
154 else
155 remaining = hrtimer_expires_remaining(&ctx->t.tmr);
156
128 return remaining.tv64 < 0 ? ktime_set(0, 0): remaining; 157 return remaining.tv64 < 0 ? ktime_set(0, 0): remaining;
129} 158}
130 159
@@ -142,11 +171,28 @@ static int timerfd_setup(struct timerfd_ctx *ctx, int flags,
142 ctx->expired = 0; 171 ctx->expired = 0;
143 ctx->ticks = 0; 172 ctx->ticks = 0;
144 ctx->tintv = timespec_to_ktime(ktmr->it_interval); 173 ctx->tintv = timespec_to_ktime(ktmr->it_interval);
145 hrtimer_init(&ctx->tmr, clockid, htmode); 174
146 hrtimer_set_expires(&ctx->tmr, texp); 175 if (isalarm(ctx)) {
147 ctx->tmr.function = timerfd_tmrproc; 176 alarm_init(&ctx->t.alarm,
177 ctx->clockid == CLOCK_REALTIME_ALARM ?
178 ALARM_REALTIME : ALARM_BOOTTIME,
179 timerfd_alarmproc);
180 } else {
181 hrtimer_init(&ctx->t.tmr, clockid, htmode);
182 hrtimer_set_expires(&ctx->t.tmr, texp);
183 ctx->t.tmr.function = timerfd_tmrproc;
184 }
185
148 if (texp.tv64 != 0) { 186 if (texp.tv64 != 0) {
149 hrtimer_start(&ctx->tmr, texp, htmode); 187 if (isalarm(ctx)) {
188 if (flags & TFD_TIMER_ABSTIME)
189 alarm_start(&ctx->t.alarm, texp);
190 else
191 alarm_start_relative(&ctx->t.alarm, texp);
192 } else {
193 hrtimer_start(&ctx->t.tmr, texp, htmode);
194 }
195
150 if (timerfd_canceled(ctx)) 196 if (timerfd_canceled(ctx))
151 return -ECANCELED; 197 return -ECANCELED;
152 } 198 }
@@ -158,7 +204,11 @@ static int timerfd_release(struct inode *inode, struct file *file)
158 struct timerfd_ctx *ctx = file->private_data; 204 struct timerfd_ctx *ctx = file->private_data;
159 205
160 timerfd_remove_cancel(ctx); 206 timerfd_remove_cancel(ctx);
161 hrtimer_cancel(&ctx->tmr); 207
208 if (isalarm(ctx))
209 alarm_cancel(&ctx->t.alarm);
210 else
211 hrtimer_cancel(&ctx->t.tmr);
162 kfree_rcu(ctx, rcu); 212 kfree_rcu(ctx, rcu);
163 return 0; 213 return 0;
164} 214}
@@ -215,9 +265,15 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count,
215 * callback to avoid DoS attacks specifying a very 265 * callback to avoid DoS attacks specifying a very
216 * short timer period. 266 * short timer period.
217 */ 267 */
218 ticks += hrtimer_forward_now(&ctx->tmr, 268 if (isalarm(ctx)) {
219 ctx->tintv) - 1; 269 ticks += alarm_forward_now(
220 hrtimer_restart(&ctx->tmr); 270 &ctx->t.alarm, ctx->tintv) - 1;
271 alarm_restart(&ctx->t.alarm);
272 } else {
273 ticks += hrtimer_forward_now(&ctx->t.tmr,
274 ctx->tintv) - 1;
275 hrtimer_restart(&ctx->t.tmr);
276 }
221 } 277 }
222 ctx->expired = 0; 278 ctx->expired = 0;
223 ctx->ticks = 0; 279 ctx->ticks = 0;
@@ -259,7 +315,9 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
259 315
260 if ((flags & ~TFD_CREATE_FLAGS) || 316 if ((flags & ~TFD_CREATE_FLAGS) ||
261 (clockid != CLOCK_MONOTONIC && 317 (clockid != CLOCK_MONOTONIC &&
262 clockid != CLOCK_REALTIME)) 318 clockid != CLOCK_REALTIME &&
319 clockid != CLOCK_REALTIME_ALARM &&
320 clockid != CLOCK_BOOTTIME_ALARM))
263 return -EINVAL; 321 return -EINVAL;
264 322
265 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); 323 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
@@ -268,7 +326,15 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
268 326
269 init_waitqueue_head(&ctx->wqh); 327 init_waitqueue_head(&ctx->wqh);
270 ctx->clockid = clockid; 328 ctx->clockid = clockid;
271 hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS); 329
330 if (isalarm(ctx))
331 alarm_init(&ctx->t.alarm,
332 ctx->clockid == CLOCK_REALTIME_ALARM ?
333 ALARM_REALTIME : ALARM_BOOTTIME,
334 timerfd_alarmproc);
335 else
336 hrtimer_init(&ctx->t.tmr, clockid, HRTIMER_MODE_ABS);
337
272 ctx->moffs = ktime_get_monotonic_offset(); 338 ctx->moffs = ktime_get_monotonic_offset();
273 339
274 ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx, 340 ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx,
@@ -305,8 +371,14 @@ static int do_timerfd_settime(int ufd, int flags,
305 */ 371 */
306 for (;;) { 372 for (;;) {
307 spin_lock_irq(&ctx->wqh.lock); 373 spin_lock_irq(&ctx->wqh.lock);
308 if (hrtimer_try_to_cancel(&ctx->tmr) >= 0) 374
309 break; 375 if (isalarm(ctx)) {
376 if (alarm_try_to_cancel(&ctx->t.alarm) >= 0)
377 break;
378 } else {
379 if (hrtimer_try_to_cancel(&ctx->t.tmr) >= 0)
380 break;
381 }
310 spin_unlock_irq(&ctx->wqh.lock); 382 spin_unlock_irq(&ctx->wqh.lock);
311 cpu_relax(); 383 cpu_relax();
312 } 384 }
@@ -317,8 +389,12 @@ static int do_timerfd_settime(int ufd, int flags,
317 * We do not update "ticks" and "expired" since the timer will be 389 * We do not update "ticks" and "expired" since the timer will be
318 * re-programmed again in the following timerfd_setup() call. 390 * re-programmed again in the following timerfd_setup() call.
319 */ 391 */
320 if (ctx->expired && ctx->tintv.tv64) 392 if (ctx->expired && ctx->tintv.tv64) {
321 hrtimer_forward_now(&ctx->tmr, ctx->tintv); 393 if (isalarm(ctx))
394 alarm_forward_now(&ctx->t.alarm, ctx->tintv);
395 else
396 hrtimer_forward_now(&ctx->t.tmr, ctx->tintv);
397 }
322 398
323 old->it_value = ktime_to_timespec(timerfd_get_remaining(ctx)); 399 old->it_value = ktime_to_timespec(timerfd_get_remaining(ctx));
324 old->it_interval = ktime_to_timespec(ctx->tintv); 400 old->it_interval = ktime_to_timespec(ctx->tintv);
@@ -345,9 +421,18 @@ static int do_timerfd_gettime(int ufd, struct itimerspec *t)
345 spin_lock_irq(&ctx->wqh.lock); 421 spin_lock_irq(&ctx->wqh.lock);
346 if (ctx->expired && ctx->tintv.tv64) { 422 if (ctx->expired && ctx->tintv.tv64) {
347 ctx->expired = 0; 423 ctx->expired = 0;
348 ctx->ticks += 424
349 hrtimer_forward_now(&ctx->tmr, ctx->tintv) - 1; 425 if (isalarm(ctx)) {
350 hrtimer_restart(&ctx->tmr); 426 ctx->ticks +=
427 alarm_forward_now(
428 &ctx->t.alarm, ctx->tintv) - 1;
429 alarm_restart(&ctx->t.alarm);
430 } else {
431 ctx->ticks +=
432 hrtimer_forward_now(&ctx->t.tmr, ctx->tintv)
433 - 1;
434 hrtimer_restart(&ctx->t.tmr);
435 }
351 } 436 }
352 t->it_value = ktime_to_timespec(timerfd_get_remaining(ctx)); 437 t->it_value = ktime_to_timespec(timerfd_get_remaining(ctx));
353 t->it_interval = ktime_to_timespec(ctx->tintv); 438 t->it_interval = ktime_to_timespec(ctx->tintv);
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index f21acf0ef01f..879b9976c12b 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1412,7 +1412,7 @@ static int mount_ubifs(struct ubifs_info *c)
1412 1412
1413 ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"%s", 1413 ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"%s",
1414 c->vi.ubi_num, c->vi.vol_id, c->vi.name, 1414 c->vi.ubi_num, c->vi.vol_id, c->vi.name,
1415 c->ro_mount ? ", R/O mode" : NULL); 1415 c->ro_mount ? ", R/O mode" : "");
1416 x = (long long)c->main_lebs * c->leb_size; 1416 x = (long long)c->main_lebs * c->leb_size;
1417 y = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes; 1417 y = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes;
1418 ubifs_msg("LEB size: %d bytes (%d KiB), min./max. I/O unit sizes: %d bytes/%d bytes", 1418 ubifs_msg("LEB size: %d bytes (%d KiB), min./max. I/O unit sizes: %d bytes/%d bytes",
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 102c072c6bbf..5f6fc17d6bc5 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -594,6 +594,29 @@ static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode,
594 return 0; 594 return 0;
595} 595}
596 596
597static int udf_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
598{
599 struct inode *inode;
600 struct udf_inode_info *iinfo;
601 int err;
602
603 inode = udf_new_inode(dir, mode, &err);
604 if (!inode)
605 return err;
606
607 iinfo = UDF_I(inode);
608 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
609 inode->i_data.a_ops = &udf_adinicb_aops;
610 else
611 inode->i_data.a_ops = &udf_aops;
612 inode->i_op = &udf_file_inode_operations;
613 inode->i_fop = &udf_file_operations;
614 mark_inode_dirty(inode);
615
616 d_tmpfile(dentry, inode);
617 return 0;
618}
619
597static int udf_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, 620static int udf_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
598 dev_t rdev) 621 dev_t rdev)
599{ 622{
@@ -1311,6 +1334,7 @@ const struct inode_operations udf_dir_inode_operations = {
1311 .rmdir = udf_rmdir, 1334 .rmdir = udf_rmdir,
1312 .mknod = udf_mknod, 1335 .mknod = udf_mknod,
1313 .rename = udf_rename, 1336 .rename = udf_rename,
1337 .tmpfile = udf_tmpfile,
1314}; 1338};
1315const struct inode_operations udf_symlink_inode_operations = { 1339const struct inode_operations udf_symlink_inode_operations = {
1316 .readlink = generic_readlink, 1340 .readlink = generic_readlink,
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 6313b69b6644..4a4508023a3c 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -71,6 +71,7 @@ xfs-y += xfs_alloc.o \
71 xfs_dir2_sf.o \ 71 xfs_dir2_sf.o \
72 xfs_ialloc.o \ 72 xfs_ialloc.o \
73 xfs_ialloc_btree.o \ 73 xfs_ialloc_btree.o \
74 xfs_icreate_item.o \
74 xfs_inode.o \ 75 xfs_inode.o \
75 xfs_log_recover.o \ 76 xfs_log_recover.o \
76 xfs_mount.o \ 77 xfs_mount.o \
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 5673bcfda2f0..71596e57283a 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -175,6 +175,7 @@ xfs_alloc_compute_diff(
175 xfs_agblock_t wantbno, /* target starting block */ 175 xfs_agblock_t wantbno, /* target starting block */
176 xfs_extlen_t wantlen, /* target length */ 176 xfs_extlen_t wantlen, /* target length */
177 xfs_extlen_t alignment, /* target alignment */ 177 xfs_extlen_t alignment, /* target alignment */
178 char userdata, /* are we allocating data? */
178 xfs_agblock_t freebno, /* freespace's starting block */ 179 xfs_agblock_t freebno, /* freespace's starting block */
179 xfs_extlen_t freelen, /* freespace's length */ 180 xfs_extlen_t freelen, /* freespace's length */
180 xfs_agblock_t *newbnop) /* result: best start block from free */ 181 xfs_agblock_t *newbnop) /* result: best start block from free */
@@ -189,7 +190,14 @@ xfs_alloc_compute_diff(
189 ASSERT(freelen >= wantlen); 190 ASSERT(freelen >= wantlen);
190 freeend = freebno + freelen; 191 freeend = freebno + freelen;
191 wantend = wantbno + wantlen; 192 wantend = wantbno + wantlen;
192 if (freebno >= wantbno) { 193 /*
194 * We want to allocate from the start of a free extent if it is past
195 * the desired block or if we are allocating user data and the free
196 * extent is before desired block. The second case is there to allow
197 * for contiguous allocation from the remaining free space if the file
198 * grows in the short term.
199 */
200 if (freebno >= wantbno || (userdata && freeend < wantend)) {
193 if ((newbno1 = roundup(freebno, alignment)) >= freeend) 201 if ((newbno1 = roundup(freebno, alignment)) >= freeend)
194 newbno1 = NULLAGBLOCK; 202 newbno1 = NULLAGBLOCK;
195 } else if (freeend >= wantend && alignment > 1) { 203 } else if (freeend >= wantend && alignment > 1) {
@@ -805,7 +813,8 @@ xfs_alloc_find_best_extent(
805 xfs_alloc_fix_len(args); 813 xfs_alloc_fix_len(args);
806 814
807 sdiff = xfs_alloc_compute_diff(args->agbno, args->len, 815 sdiff = xfs_alloc_compute_diff(args->agbno, args->len,
808 args->alignment, *sbnoa, 816 args->alignment,
817 args->userdata, *sbnoa,
809 *slena, &new); 818 *slena, &new);
810 819
811 /* 820 /*
@@ -976,7 +985,8 @@ restart:
976 if (args->len < blen) 985 if (args->len < blen)
977 continue; 986 continue;
978 ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, 987 ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
979 args->alignment, ltbnoa, ltlena, &ltnew); 988 args->alignment, args->userdata, ltbnoa,
989 ltlena, &ltnew);
980 if (ltnew != NULLAGBLOCK && 990 if (ltnew != NULLAGBLOCK &&
981 (args->len > blen || ltdiff < bdiff)) { 991 (args->len > blen || ltdiff < bdiff)) {
982 bdiff = ltdiff; 992 bdiff = ltdiff;
@@ -1128,7 +1138,8 @@ restart:
1128 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); 1138 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
1129 xfs_alloc_fix_len(args); 1139 xfs_alloc_fix_len(args);
1130 ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, 1140 ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
1131 args->alignment, ltbnoa, ltlena, &ltnew); 1141 args->alignment, args->userdata, ltbnoa,
1142 ltlena, &ltnew);
1132 1143
1133 error = xfs_alloc_find_best_extent(args, 1144 error = xfs_alloc_find_best_extent(args,
1134 &bno_cur_lt, &bno_cur_gt, 1145 &bno_cur_lt, &bno_cur_gt,
@@ -1144,7 +1155,8 @@ restart:
1144 args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen); 1155 args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
1145 xfs_alloc_fix_len(args); 1156 xfs_alloc_fix_len(args);
1146 gtdiff = xfs_alloc_compute_diff(args->agbno, args->len, 1157 gtdiff = xfs_alloc_compute_diff(args->agbno, args->len,
1147 args->alignment, gtbnoa, gtlena, &gtnew); 1158 args->alignment, args->userdata, gtbnoa,
1159 gtlena, &gtnew);
1148 1160
1149 error = xfs_alloc_find_best_extent(args, 1161 error = xfs_alloc_find_best_extent(args,
1150 &bno_cur_gt, &bno_cur_lt, 1162 &bno_cur_gt, &bno_cur_lt,
@@ -1203,7 +1215,7 @@ restart:
1203 } 1215 }
1204 rlen = args->len; 1216 rlen = args->len;
1205 (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, 1217 (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment,
1206 ltbnoa, ltlena, &ltnew); 1218 args->userdata, ltbnoa, ltlena, &ltnew);
1207 ASSERT(ltnew >= ltbno); 1219 ASSERT(ltnew >= ltbno);
1208 ASSERT(ltnew + rlen <= ltbnoa + ltlena); 1220 ASSERT(ltnew + rlen <= ltbnoa + ltlena);
1209 ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); 1221 ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index 70c43d9f72c1..1b726d626941 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -196,6 +196,8 @@ typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t;
196#define XFS_BMDR_SPACE_CALC(nrecs) \ 196#define XFS_BMDR_SPACE_CALC(nrecs) \
197 (int)(sizeof(xfs_bmdr_block_t) + \ 197 (int)(sizeof(xfs_bmdr_block_t) + \
198 ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)))) 198 ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t))))
199#define XFS_BMAP_BMDR_SPACE(bb) \
200 (XFS_BMDR_SPACE_CALC(be16_to_cpu((bb)->bb_numrecs)))
199 201
200/* 202/*
201 * Maximum number of bmap btree levels. 203 * Maximum number of bmap btree levels.
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 4ec431777048..bfc4e0c26fd3 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -140,6 +140,16 @@ xfs_buf_item_size(
140 140
141 ASSERT(bip->bli_flags & XFS_BLI_LOGGED); 141 ASSERT(bip->bli_flags & XFS_BLI_LOGGED);
142 142
143 if (bip->bli_flags & XFS_BLI_ORDERED) {
144 /*
145 * The buffer has been logged just to order it.
146 * It is not being included in the transaction
147 * commit, so no vectors are used at all.
148 */
149 trace_xfs_buf_item_size_ordered(bip);
150 return XFS_LOG_VEC_ORDERED;
151 }
152
143 /* 153 /*
144 * the vector count is based on the number of buffer vectors we have 154 * the vector count is based on the number of buffer vectors we have
145 * dirty bits in. This will only be greater than one when we have a 155 * dirty bits in. This will only be greater than one when we have a
@@ -212,6 +222,7 @@ xfs_buf_item_format_segment(
212 goto out; 222 goto out;
213 } 223 }
214 224
225
215 /* 226 /*
216 * Fill in an iovec for each set of contiguous chunks. 227 * Fill in an iovec for each set of contiguous chunks.
217 */ 228 */
@@ -299,18 +310,36 @@ xfs_buf_item_format(
299 310
300 /* 311 /*
301 * If it is an inode buffer, transfer the in-memory state to the 312 * If it is an inode buffer, transfer the in-memory state to the
302 * format flags and clear the in-memory state. We do not transfer 313 * format flags and clear the in-memory state.
314 *
315 * For buffer based inode allocation, we do not transfer
303 * this state if the inode buffer allocation has not yet been committed 316 * this state if the inode buffer allocation has not yet been committed
304 * to the log as setting the XFS_BLI_INODE_BUF flag will prevent 317 * to the log as setting the XFS_BLI_INODE_BUF flag will prevent
305 * correct replay of the inode allocation. 318 * correct replay of the inode allocation.
319 *
320 * For icreate item based inode allocation, the buffers aren't written
321 * to the journal during allocation, and hence we should always tag the
322 * buffer as an inode buffer so that the correct unlinked list replay
323 * occurs during recovery.
306 */ 324 */
307 if (bip->bli_flags & XFS_BLI_INODE_BUF) { 325 if (bip->bli_flags & XFS_BLI_INODE_BUF) {
308 if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && 326 if (xfs_sb_version_hascrc(&lip->li_mountp->m_sb) ||
327 !((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
309 xfs_log_item_in_current_chkpt(lip))) 328 xfs_log_item_in_current_chkpt(lip)))
310 bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF; 329 bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF;
311 bip->bli_flags &= ~XFS_BLI_INODE_BUF; 330 bip->bli_flags &= ~XFS_BLI_INODE_BUF;
312 } 331 }
313 332
333 if ((bip->bli_flags & (XFS_BLI_ORDERED|XFS_BLI_STALE)) ==
334 XFS_BLI_ORDERED) {
335 /*
336 * The buffer has been logged just to order it. It is not being
337 * included in the transaction commit, so don't format it.
338 */
339 trace_xfs_buf_item_format_ordered(bip);
340 return;
341 }
342
314 for (i = 0; i < bip->bli_format_count; i++) { 343 for (i = 0; i < bip->bli_format_count; i++) {
315 vecp = xfs_buf_item_format_segment(bip, vecp, offset, 344 vecp = xfs_buf_item_format_segment(bip, vecp, offset,
316 &bip->bli_formats[i]); 345 &bip->bli_formats[i]);
@@ -340,6 +369,7 @@ xfs_buf_item_pin(
340 369
341 ASSERT(atomic_read(&bip->bli_refcount) > 0); 370 ASSERT(atomic_read(&bip->bli_refcount) > 0);
342 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || 371 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
372 (bip->bli_flags & XFS_BLI_ORDERED) ||
343 (bip->bli_flags & XFS_BLI_STALE)); 373 (bip->bli_flags & XFS_BLI_STALE));
344 374
345 trace_xfs_buf_item_pin(bip); 375 trace_xfs_buf_item_pin(bip);
@@ -512,8 +542,9 @@ xfs_buf_item_unlock(
512{ 542{
513 struct xfs_buf_log_item *bip = BUF_ITEM(lip); 543 struct xfs_buf_log_item *bip = BUF_ITEM(lip);
514 struct xfs_buf *bp = bip->bli_buf; 544 struct xfs_buf *bp = bip->bli_buf;
515 int aborted, clean, i; 545 bool clean;
516 uint hold; 546 bool aborted;
547 int flags;
517 548
518 /* Clear the buffer's association with this transaction. */ 549 /* Clear the buffer's association with this transaction. */
519 bp->b_transp = NULL; 550 bp->b_transp = NULL;
@@ -524,23 +555,21 @@ xfs_buf_item_unlock(
524 * (cancelled) buffers at unpin time, but we'll never go through the 555 * (cancelled) buffers at unpin time, but we'll never go through the
525 * pin/unpin cycle if we abort inside commit. 556 * pin/unpin cycle if we abort inside commit.
526 */ 557 */
527 aborted = (lip->li_flags & XFS_LI_ABORTED) != 0; 558 aborted = (lip->li_flags & XFS_LI_ABORTED) ? true : false;
528
529 /* 559 /*
530 * Before possibly freeing the buf item, determine if we should 560 * Before possibly freeing the buf item, copy the per-transaction state
531 * release the buffer at the end of this routine. 561 * so we can reference it safely later after clearing it from the
562 * buffer log item.
532 */ 563 */
533 hold = bip->bli_flags & XFS_BLI_HOLD; 564 flags = bip->bli_flags;
534 565 bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED);
535 /* Clear the per transaction state. */
536 bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD);
537 566
538 /* 567 /*
539 * If the buf item is marked stale, then don't do anything. We'll 568 * If the buf item is marked stale, then don't do anything. We'll
540 * unlock the buffer and free the buf item when the buffer is unpinned 569 * unlock the buffer and free the buf item when the buffer is unpinned
541 * for the last time. 570 * for the last time.
542 */ 571 */
543 if (bip->bli_flags & XFS_BLI_STALE) { 572 if (flags & XFS_BLI_STALE) {
544 trace_xfs_buf_item_unlock_stale(bip); 573 trace_xfs_buf_item_unlock_stale(bip);
545 ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); 574 ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
546 if (!aborted) { 575 if (!aborted) {
@@ -557,13 +586,19 @@ xfs_buf_item_unlock(
557 * be the only reference to the buf item, so we free it anyway 586 * be the only reference to the buf item, so we free it anyway
558 * regardless of whether it is dirty or not. A dirty abort implies a 587 * regardless of whether it is dirty or not. A dirty abort implies a
559 * shutdown, anyway. 588 * shutdown, anyway.
589 *
590 * Ordered buffers are dirty but may have no recorded changes, so ensure
591 * we only release clean items here.
560 */ 592 */
561 clean = 1; 593 clean = (flags & XFS_BLI_DIRTY) ? false : true;
562 for (i = 0; i < bip->bli_format_count; i++) { 594 if (clean) {
563 if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map, 595 int i;
564 bip->bli_formats[i].blf_map_size)) { 596 for (i = 0; i < bip->bli_format_count; i++) {
565 clean = 0; 597 if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
566 break; 598 bip->bli_formats[i].blf_map_size)) {
599 clean = false;
600 break;
601 }
567 } 602 }
568 } 603 }
569 if (clean) 604 if (clean)
@@ -576,7 +611,7 @@ xfs_buf_item_unlock(
576 } else 611 } else
577 atomic_dec(&bip->bli_refcount); 612 atomic_dec(&bip->bli_refcount);
578 613
579 if (!hold) 614 if (!(flags & XFS_BLI_HOLD))
580 xfs_buf_relse(bp); 615 xfs_buf_relse(bp);
581} 616}
582 617
@@ -842,12 +877,6 @@ xfs_buf_item_log(
842 struct xfs_buf *bp = bip->bli_buf; 877 struct xfs_buf *bp = bip->bli_buf;
843 878
844 /* 879 /*
845 * Mark the item as having some dirty data for
846 * quick reference in xfs_buf_item_dirty.
847 */
848 bip->bli_flags |= XFS_BLI_DIRTY;
849
850 /*
851 * walk each buffer segment and mark them dirty appropriately. 880 * walk each buffer segment and mark them dirty appropriately.
852 */ 881 */
853 start = 0; 882 start = 0;
@@ -873,7 +902,7 @@ xfs_buf_item_log(
873 902
874 903
875/* 904/*
876 * Return 1 if the buffer has some data that has been logged (at any 905 * Return 1 if the buffer has been logged or ordered in a transaction (at any
877 * point, not just the current transaction) and 0 if not. 906 * point, not just the current transaction) and 0 if not.
878 */ 907 */
879uint 908uint
@@ -907,11 +936,11 @@ void
907xfs_buf_item_relse( 936xfs_buf_item_relse(
908 xfs_buf_t *bp) 937 xfs_buf_t *bp)
909{ 938{
910 xfs_buf_log_item_t *bip; 939 xfs_buf_log_item_t *bip = bp->b_fspriv;
911 940
912 trace_xfs_buf_item_relse(bp, _RET_IP_); 941 trace_xfs_buf_item_relse(bp, _RET_IP_);
942 ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL));
913 943
914 bip = bp->b_fspriv;
915 bp->b_fspriv = bip->bli_item.li_bio_list; 944 bp->b_fspriv = bip->bli_item.li_bio_list;
916 if (bp->b_fspriv == NULL) 945 if (bp->b_fspriv == NULL)
917 bp->b_iodone = NULL; 946 bp->b_iodone = NULL;
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 2573d2a75fc8..0f1c247dc680 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -120,6 +120,7 @@ xfs_blft_from_flags(struct xfs_buf_log_format *blf)
120#define XFS_BLI_INODE_ALLOC_BUF 0x10 120#define XFS_BLI_INODE_ALLOC_BUF 0x10
121#define XFS_BLI_STALE_INODE 0x20 121#define XFS_BLI_STALE_INODE 0x20
122#define XFS_BLI_INODE_BUF 0x40 122#define XFS_BLI_INODE_BUF 0x40
123#define XFS_BLI_ORDERED 0x80
123 124
124#define XFS_BLI_FLAGS \ 125#define XFS_BLI_FLAGS \
125 { XFS_BLI_HOLD, "HOLD" }, \ 126 { XFS_BLI_HOLD, "HOLD" }, \
@@ -128,7 +129,8 @@ xfs_blft_from_flags(struct xfs_buf_log_format *blf)
128 { XFS_BLI_LOGGED, "LOGGED" }, \ 129 { XFS_BLI_LOGGED, "LOGGED" }, \
129 { XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \ 130 { XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \
130 { XFS_BLI_STALE_INODE, "STALE_INODE" }, \ 131 { XFS_BLI_STALE_INODE, "STALE_INODE" }, \
131 { XFS_BLI_INODE_BUF, "INODE_BUF" } 132 { XFS_BLI_INODE_BUF, "INODE_BUF" }, \
133 { XFS_BLI_ORDERED, "ORDERED" }
132 134
133 135
134#ifdef __KERNEL__ 136#ifdef __KERNEL__
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index c407e1ccff43..e36445ceaf80 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -24,6 +24,9 @@
24#include "xfs_ag.h" 24#include "xfs_ag.h"
25#include "xfs_mount.h" 25#include "xfs_mount.h"
26#include "xfs_bmap_btree.h" 26#include "xfs_bmap_btree.h"
27#include "xfs_alloc_btree.h"
28#include "xfs_ialloc_btree.h"
29#include "xfs_btree.h"
27#include "xfs_dinode.h" 30#include "xfs_dinode.h"
28#include "xfs_inode.h" 31#include "xfs_inode.h"
29#include "xfs_inode_item.h" 32#include "xfs_inode_item.h"
@@ -182,7 +185,7 @@ xfs_swap_extents_check_format(
182 */ 185 */
183 if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) { 186 if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
184 if (XFS_IFORK_BOFF(ip) && 187 if (XFS_IFORK_BOFF(ip) &&
185 tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) 188 XFS_BMAP_BMDR_SPACE(tip->i_df.if_broot) > XFS_IFORK_BOFF(ip))
186 return EINVAL; 189 return EINVAL;
187 if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= 190 if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
188 XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) 191 XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
@@ -192,9 +195,8 @@ xfs_swap_extents_check_format(
192 /* Reciprocal target->temp btree format checks */ 195 /* Reciprocal target->temp btree format checks */
193 if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { 196 if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
194 if (XFS_IFORK_BOFF(tip) && 197 if (XFS_IFORK_BOFF(tip) &&
195 ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) 198 XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip))
196 return EINVAL; 199 return EINVAL;
197
198 if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= 200 if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <=
199 XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) 201 XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
200 return EINVAL; 202 return EINVAL;
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index e0cc1243a8aa..2aed25cae04d 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -1108,6 +1108,7 @@ xfs_dir2_leaf_readbuf(
1108 struct xfs_mount *mp = dp->i_mount; 1108 struct xfs_mount *mp = dp->i_mount;
1109 struct xfs_buf *bp = *bpp; 1109 struct xfs_buf *bp = *bpp;
1110 struct xfs_bmbt_irec *map = mip->map; 1110 struct xfs_bmbt_irec *map = mip->map;
1111 struct blk_plug plug;
1111 int error = 0; 1112 int error = 0;
1112 int length; 1113 int length;
1113 int i; 1114 int i;
@@ -1236,6 +1237,7 @@ xfs_dir2_leaf_readbuf(
1236 /* 1237 /*
1237 * Do we need more readahead? 1238 * Do we need more readahead?
1238 */ 1239 */
1240 blk_start_plug(&plug);
1239 for (mip->ra_index = mip->ra_offset = i = 0; 1241 for (mip->ra_index = mip->ra_offset = i = 0;
1240 mip->ra_want > mip->ra_current && i < mip->map_blocks; 1242 mip->ra_want > mip->ra_current && i < mip->map_blocks;
1241 i += mp->m_dirblkfsbs) { 1243 i += mp->m_dirblkfsbs) {
@@ -1287,6 +1289,7 @@ xfs_dir2_leaf_readbuf(
1287 } 1289 }
1288 } 1290 }
1289 } 1291 }
1292 blk_finish_plug(&plug);
1290 1293
1291out: 1294out:
1292 *bpp = bp; 1295 *bpp = bp;
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 044e97a33c8d..f01012de06d0 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -570,13 +570,13 @@ xfs_qm_dqtobp(
570 xfs_buf_t **O_bpp, 570 xfs_buf_t **O_bpp,
571 uint flags) 571 uint flags)
572{ 572{
573 xfs_bmbt_irec_t map; 573 struct xfs_bmbt_irec map;
574 int nmaps = 1, error; 574 int nmaps = 1, error;
575 xfs_buf_t *bp; 575 struct xfs_buf *bp;
576 xfs_inode_t *quotip = XFS_DQ_TO_QIP(dqp); 576 struct xfs_inode *quotip = xfs_dq_to_quota_inode(dqp);
577 xfs_mount_t *mp = dqp->q_mount; 577 struct xfs_mount *mp = dqp->q_mount;
578 xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id); 578 xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id);
579 xfs_trans_t *tp = (tpp ? *tpp : NULL); 579 struct xfs_trans *tp = (tpp ? *tpp : NULL);
580 580
581 dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk; 581 dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk;
582 582
@@ -804,7 +804,7 @@ xfs_qm_dqget(
804 xfs_dquot_t **O_dqpp) /* OUT : locked incore dquot */ 804 xfs_dquot_t **O_dqpp) /* OUT : locked incore dquot */
805{ 805{
806 struct xfs_quotainfo *qi = mp->m_quotainfo; 806 struct xfs_quotainfo *qi = mp->m_quotainfo;
807 struct radix_tree_root *tree = XFS_DQUOT_TREE(qi, type); 807 struct radix_tree_root *tree = xfs_dquot_tree(qi, type);
808 struct xfs_dquot *dqp; 808 struct xfs_dquot *dqp;
809 int error; 809 int error;
810 810
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 4f0ebfc43cc9..b596626249b8 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -143,10 +143,6 @@ static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type)
143#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER) 143#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER)
144#define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ) 144#define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ)
145#define XFS_QM_ISGDQ(dqp) ((dqp)->dq_flags & XFS_DQ_GROUP) 145#define XFS_QM_ISGDQ(dqp) ((dqp)->dq_flags & XFS_DQ_GROUP)
146#define XFS_DQ_TO_QINF(dqp) ((dqp)->q_mount->m_quotainfo)
147#define XFS_DQ_TO_QIP(dqp) (XFS_QM_ISUDQ(dqp) ? \
148 XFS_DQ_TO_QINF(dqp)->qi_uquotaip : \
149 XFS_DQ_TO_QINF(dqp)->qi_gquotaip)
150 146
151extern int xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint, 147extern int xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint,
152 uint, struct xfs_dquot **); 148 uint, struct xfs_dquot **);
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 0ad2b95fca12..de3dc98f4e8f 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1268,8 +1268,7 @@ xfs_seek_data(
1268 } 1268 }
1269 1269
1270out: 1270out:
1271 if (offset != file->f_pos) 1271 offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
1272 file->f_pos = offset;
1273 1272
1274out_unlock: 1273out_unlock:
1275 xfs_iunlock_map_shared(ip, lock); 1274 xfs_iunlock_map_shared(ip, lock);
@@ -1377,8 +1376,7 @@ out:
1377 * situation in particular. 1376 * situation in particular.
1378 */ 1377 */
1379 offset = min_t(loff_t, offset, isize); 1378 offset = min_t(loff_t, offset, isize);
1380 if (offset != file->f_pos) 1379 offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
1381 file->f_pos = offset;
1382 1380
1383out_unlock: 1381out_unlock:
1384 xfs_iunlock_map_shared(ip, lock); 1382 xfs_iunlock_map_shared(ip, lock);
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 3c3644ea825b..614eb0cc3608 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -176,7 +176,7 @@ xfs_growfs_data_private(
176 if (!bp) 176 if (!bp)
177 return EIO; 177 return EIO;
178 if (bp->b_error) { 178 if (bp->b_error) {
179 int error = bp->b_error; 179 error = bp->b_error;
180 xfs_buf_relse(bp); 180 xfs_buf_relse(bp);
181 return error; 181 return error;
182 } 182 }
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index c8f5ae1debf2..7a0c17d7ec09 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -38,6 +38,7 @@
38#include "xfs_bmap.h" 38#include "xfs_bmap.h"
39#include "xfs_cksum.h" 39#include "xfs_cksum.h"
40#include "xfs_buf_item.h" 40#include "xfs_buf_item.h"
41#include "xfs_icreate_item.h"
41 42
42 43
43/* 44/*
@@ -150,12 +151,16 @@ xfs_check_agi_freecount(
150#endif 151#endif
151 152
152/* 153/*
153 * Initialise a new set of inodes. 154 * Initialise a new set of inodes. When called without a transaction context
155 * (e.g. from recovery) we initiate a delayed write of the inode buffers rather
156 * than logging them (which in a transaction context puts them into the AIL
157 * for writeback rather than the xfsbufd queue).
154 */ 158 */
155STATIC int 159int
156xfs_ialloc_inode_init( 160xfs_ialloc_inode_init(
157 struct xfs_mount *mp, 161 struct xfs_mount *mp,
158 struct xfs_trans *tp, 162 struct xfs_trans *tp,
163 struct list_head *buffer_list,
159 xfs_agnumber_t agno, 164 xfs_agnumber_t agno,
160 xfs_agblock_t agbno, 165 xfs_agblock_t agbno,
161 xfs_agblock_t length, 166 xfs_agblock_t length,
@@ -208,6 +213,18 @@ xfs_ialloc_inode_init(
208 version = 3; 213 version = 3;
209 ino = XFS_AGINO_TO_INO(mp, agno, 214 ino = XFS_AGINO_TO_INO(mp, agno,
210 XFS_OFFBNO_TO_AGINO(mp, agbno, 0)); 215 XFS_OFFBNO_TO_AGINO(mp, agbno, 0));
216
217 /*
218 * log the initialisation that is about to take place as a
219 * logical operation. This means the transaction does not
220 * need to log the physical changes to the inode buffers as log
221 * recovery will know what initialisation is actually needed.
222 * Hence we only need to log the buffers as "ordered" buffers so
223 * they track in the AIL as if they were physically logged.
224 */
225 if (tp)
226 xfs_icreate_log(tp, agno, agbno, XFS_IALLOC_INODES(mp),
227 mp->m_sb.sb_inodesize, length, gen);
211 } else if (xfs_sb_version_hasnlink(&mp->m_sb)) 228 } else if (xfs_sb_version_hasnlink(&mp->m_sb))
212 version = 2; 229 version = 2;
213 else 230 else
@@ -223,13 +240,8 @@ xfs_ialloc_inode_init(
223 XBF_UNMAPPED); 240 XBF_UNMAPPED);
224 if (!fbuf) 241 if (!fbuf)
225 return ENOMEM; 242 return ENOMEM;
226 /* 243
227 * Initialize all inodes in this buffer and then log them. 244 /* Initialize the inode buffers and log them appropriately. */
228 *
229 * XXX: It would be much better if we had just one transaction
230 * to log a whole cluster of inodes instead of all the
231 * individual transactions causing a lot of log traffic.
232 */
233 fbuf->b_ops = &xfs_inode_buf_ops; 245 fbuf->b_ops = &xfs_inode_buf_ops;
234 xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length)); 246 xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length));
235 for (i = 0; i < ninodes; i++) { 247 for (i = 0; i < ninodes; i++) {
@@ -247,18 +259,39 @@ xfs_ialloc_inode_init(
247 ino++; 259 ino++;
248 uuid_copy(&free->di_uuid, &mp->m_sb.sb_uuid); 260 uuid_copy(&free->di_uuid, &mp->m_sb.sb_uuid);
249 xfs_dinode_calc_crc(mp, free); 261 xfs_dinode_calc_crc(mp, free);
250 } else { 262 } else if (tp) {
251 /* just log the inode core */ 263 /* just log the inode core */
252 xfs_trans_log_buf(tp, fbuf, ioffset, 264 xfs_trans_log_buf(tp, fbuf, ioffset,
253 ioffset + isize - 1); 265 ioffset + isize - 1);
254 } 266 }
255 } 267 }
256 if (version == 3) { 268
257 /* need to log the entire buffer */ 269 if (tp) {
258 xfs_trans_log_buf(tp, fbuf, 0, 270 /*
259 BBTOB(fbuf->b_length) - 1); 271 * Mark the buffer as an inode allocation buffer so it
272 * sticks in AIL at the point of this allocation
273 * transaction. This ensures that they are on disk before
274 * the tail of the log can be moved past this
275 * transaction (i.e. by preventing relogging from moving
276 * it forward in the log).
277 */
278 xfs_trans_inode_alloc_buf(tp, fbuf);
279 if (version == 3) {
280 /*
281 * Mark the buffer as ordered so that it is
282 * not physically logged in the transaction but
283 * still tracked in the AIL as part of the
284 * transaction and pin the log appropriately.
285 */
286 xfs_trans_ordered_buf(tp, fbuf);
287 xfs_trans_log_buf(tp, fbuf, 0,
288 BBTOB(fbuf->b_length) - 1);
289 }
290 } else {
291 fbuf->b_flags |= XBF_DONE;
292 xfs_buf_delwri_queue(fbuf, buffer_list);
293 xfs_buf_relse(fbuf);
260 } 294 }
261 xfs_trans_inode_alloc_buf(tp, fbuf);
262 } 295 }
263 return 0; 296 return 0;
264} 297}
@@ -303,7 +336,7 @@ xfs_ialloc_ag_alloc(
303 * First try to allocate inodes contiguous with the last-allocated 336 * First try to allocate inodes contiguous with the last-allocated
304 * chunk of inodes. If the filesystem is striped, this will fill 337 * chunk of inodes. If the filesystem is striped, this will fill
305 * an entire stripe unit with inodes. 338 * an entire stripe unit with inodes.
306 */ 339 */
307 agi = XFS_BUF_TO_AGI(agbp); 340 agi = XFS_BUF_TO_AGI(agbp);
308 newino = be32_to_cpu(agi->agi_newino); 341 newino = be32_to_cpu(agi->agi_newino);
309 agno = be32_to_cpu(agi->agi_seqno); 342 agno = be32_to_cpu(agi->agi_seqno);
@@ -402,7 +435,7 @@ xfs_ialloc_ag_alloc(
402 * rather than a linear progression to prevent the next generation 435 * rather than a linear progression to prevent the next generation
403 * number from being easily guessable. 436 * number from being easily guessable.
404 */ 437 */
405 error = xfs_ialloc_inode_init(args.mp, tp, agno, args.agbno, 438 error = xfs_ialloc_inode_init(args.mp, tp, NULL, agno, args.agbno,
406 args.len, prandom_u32()); 439 args.len, prandom_u32());
407 440
408 if (error) 441 if (error)
@@ -615,8 +648,7 @@ xfs_ialloc_get_rec(
615 struct xfs_btree_cur *cur, 648 struct xfs_btree_cur *cur,
616 xfs_agino_t agino, 649 xfs_agino_t agino,
617 xfs_inobt_rec_incore_t *rec, 650 xfs_inobt_rec_incore_t *rec,
618 int *done, 651 int *done)
619 int left)
620{ 652{
621 int error; 653 int error;
622 int i; 654 int i;
@@ -724,12 +756,12 @@ xfs_dialloc_ag(
724 pag->pagl_leftrec != NULLAGINO && 756 pag->pagl_leftrec != NULLAGINO &&
725 pag->pagl_rightrec != NULLAGINO) { 757 pag->pagl_rightrec != NULLAGINO) {
726 error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec, 758 error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec,
727 &trec, &doneleft, 1); 759 &trec, &doneleft);
728 if (error) 760 if (error)
729 goto error1; 761 goto error1;
730 762
731 error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec, 763 error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec,
732 &rec, &doneright, 0); 764 &rec, &doneright);
733 if (error) 765 if (error)
734 goto error1; 766 goto error1;
735 } else { 767 } else {
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
index c8da3df271e6..68c07320f096 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/xfs_ialloc.h
@@ -150,6 +150,14 @@ int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino,
150int xfs_inobt_get_rec(struct xfs_btree_cur *cur, 150int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
151 xfs_inobt_rec_incore_t *rec, int *stat); 151 xfs_inobt_rec_incore_t *rec, int *stat);
152 152
153/*
154 * Inode chunk initialisation routine
155 */
156int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp,
157 struct list_head *buffer_list,
158 xfs_agnumber_t agno, xfs_agblock_t agbno,
159 xfs_agblock_t length, unsigned int gen);
160
153extern const struct xfs_buf_ops xfs_agi_buf_ops; 161extern const struct xfs_buf_ops xfs_agi_buf_ops;
154 162
155#endif /* __XFS_IALLOC_H__ */ 163#endif /* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 96e344e3e927..9560dc1f15a9 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -335,7 +335,8 @@ xfs_iget_cache_miss(
335 iflags = XFS_INEW; 335 iflags = XFS_INEW;
336 if (flags & XFS_IGET_DONTCACHE) 336 if (flags & XFS_IGET_DONTCACHE)
337 iflags |= XFS_IDONTCACHE; 337 iflags |= XFS_IDONTCACHE;
338 ip->i_udquot = ip->i_gdquot = NULL; 338 ip->i_udquot = NULL;
339 ip->i_gdquot = NULL;
339 xfs_iflags_set(ip, iflags); 340 xfs_iflags_set(ip, iflags);
340 341
341 /* insert the new inode */ 342 /* insert the new inode */
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index e0f138c70a2f..a01afbb3909a 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -40,7 +40,6 @@ void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip);
40int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *); 40int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *);
41void xfs_eofblocks_worker(struct work_struct *); 41void xfs_eofblocks_worker(struct work_struct *);
42 42
43int xfs_sync_inode_grab(struct xfs_inode *ip);
44int xfs_inode_ag_iterator(struct xfs_mount *mp, 43int xfs_inode_ag_iterator(struct xfs_mount *mp,
45 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, 44 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag,
46 int flags, void *args), 45 int flags, void *args),
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
new file mode 100644
index 000000000000..7716a4e7375e
--- /dev/null
+++ b/fs/xfs/xfs_icreate_item.c
@@ -0,0 +1,195 @@
1/*
2 * Copyright (c) 2008-2010, 2013 Dave Chinner
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_bit.h"
22#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h"
25#include "xfs_buf_item.h"
26#include "xfs_sb.h"
27#include "xfs_ag.h"
28#include "xfs_dir2.h"
29#include "xfs_mount.h"
30#include "xfs_trans_priv.h"
31#include "xfs_bmap_btree.h"
32#include "xfs_alloc_btree.h"
33#include "xfs_ialloc_btree.h"
34#include "xfs_attr_sf.h"
35#include "xfs_dinode.h"
36#include "xfs_inode.h"
37#include "xfs_inode_item.h"
38#include "xfs_btree.h"
39#include "xfs_ialloc.h"
40#include "xfs_error.h"
41#include "xfs_icreate_item.h"
42
43kmem_zone_t *xfs_icreate_zone; /* inode create item zone */
44
45static inline struct xfs_icreate_item *ICR_ITEM(struct xfs_log_item *lip)
46{
47 return container_of(lip, struct xfs_icreate_item, ic_item);
48}
49
50/*
51 * This returns the number of iovecs needed to log the given inode create item.
52 *
53 * We only need one iovec for the icreate log structure.
54 */
55STATIC uint
56xfs_icreate_item_size(
57 struct xfs_log_item *lip)
58{
59 return 1;
60}
61
62/*
63 * This is called to fill in the vector of log iovecs for the
64 * given inode create log item.
65 */
66STATIC void
67xfs_icreate_item_format(
68 struct xfs_log_item *lip,
69 struct xfs_log_iovec *log_vector)
70{
71 struct xfs_icreate_item *icp = ICR_ITEM(lip);
72
73 log_vector->i_addr = (xfs_caddr_t)&icp->ic_format;
74 log_vector->i_len = sizeof(struct xfs_icreate_log);
75 log_vector->i_type = XLOG_REG_TYPE_ICREATE;
76}
77
78
79/* Pinning has no meaning for the create item, so just return. */
80STATIC void
81xfs_icreate_item_pin(
82 struct xfs_log_item *lip)
83{
84}
85
86
87/* pinning has no meaning for the create item, so just return. */
88STATIC void
89xfs_icreate_item_unpin(
90 struct xfs_log_item *lip,
91 int remove)
92{
93}
94
95STATIC void
96xfs_icreate_item_unlock(
97 struct xfs_log_item *lip)
98{
99 struct xfs_icreate_item *icp = ICR_ITEM(lip);
100
101 if (icp->ic_item.li_flags & XFS_LI_ABORTED)
102 kmem_zone_free(xfs_icreate_zone, icp);
103 return;
104}
105
106/*
107 * Because we have ordered buffers being tracked in the AIL for the inode
108 * creation, we don't need the create item after this. Hence we can free
109 * the log item and return -1 to tell the caller we're done with the item.
110 */
111STATIC xfs_lsn_t
112xfs_icreate_item_committed(
113 struct xfs_log_item *lip,
114 xfs_lsn_t lsn)
115{
116 struct xfs_icreate_item *icp = ICR_ITEM(lip);
117
118 kmem_zone_free(xfs_icreate_zone, icp);
119 return (xfs_lsn_t)-1;
120}
121
122/* item can never get into the AIL */
123STATIC uint
124xfs_icreate_item_push(
125 struct xfs_log_item *lip,
126 struct list_head *buffer_list)
127{
128 ASSERT(0);
129 return XFS_ITEM_SUCCESS;
130}
131
132/* Ordered buffers do the dependency tracking here, so this does nothing. */
133STATIC void
134xfs_icreate_item_committing(
135 struct xfs_log_item *lip,
136 xfs_lsn_t lsn)
137{
138}
139
140/*
141 * This is the ops vector shared by all icreate log items.
142 */
143static struct xfs_item_ops xfs_icreate_item_ops = {
144 .iop_size = xfs_icreate_item_size,
145 .iop_format = xfs_icreate_item_format,
146 .iop_pin = xfs_icreate_item_pin,
147 .iop_unpin = xfs_icreate_item_unpin,
148 .iop_push = xfs_icreate_item_push,
149 .iop_unlock = xfs_icreate_item_unlock,
150 .iop_committed = xfs_icreate_item_committed,
151 .iop_committing = xfs_icreate_item_committing,
152};
153
154
155/*
156 * Initialize the inode create log item for a newly allocated inode chunk.
157 *
158 * Inode extents can only reside within an AG. Hence specify the starting
159 * block for the inode chunk by offset within an AG as well as the
160 * length of the allocated extent.
161 *
162 * This joins the item to the transaction and marks it dirty so
163 * that we don't need a separate call to do this, nor does the
164 * caller need to know anything about the icreate item.
165 */
166void
167xfs_icreate_log(
168 struct xfs_trans *tp,
169 xfs_agnumber_t agno,
170 xfs_agblock_t agbno,
171 unsigned int count,
172 unsigned int inode_size,
173 xfs_agblock_t length,
174 unsigned int generation)
175{
176 struct xfs_icreate_item *icp;
177
178 icp = kmem_zone_zalloc(xfs_icreate_zone, KM_SLEEP);
179
180 xfs_log_item_init(tp->t_mountp, &icp->ic_item, XFS_LI_ICREATE,
181 &xfs_icreate_item_ops);
182
183 icp->ic_format.icl_type = XFS_LI_ICREATE;
184 icp->ic_format.icl_size = 1; /* single vector */
185 icp->ic_format.icl_ag = cpu_to_be32(agno);
186 icp->ic_format.icl_agbno = cpu_to_be32(agbno);
187 icp->ic_format.icl_count = cpu_to_be32(count);
188 icp->ic_format.icl_isize = cpu_to_be32(inode_size);
189 icp->ic_format.icl_length = cpu_to_be32(length);
190 icp->ic_format.icl_gen = cpu_to_be32(generation);
191
192 xfs_trans_add_item(tp, &icp->ic_item);
193 tp->t_flags |= XFS_TRANS_DIRTY;
194 icp->ic_item.li_desc->lid_flags |= XFS_LID_DIRTY;
195}
diff --git a/fs/xfs/xfs_icreate_item.h b/fs/xfs/xfs_icreate_item.h
new file mode 100644
index 000000000000..88ba8aa0bc41
--- /dev/null
+++ b/fs/xfs/xfs_icreate_item.h
@@ -0,0 +1,52 @@
1/*
2 * Copyright (c) 2008-2010, Dave Chinner
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef XFS_ICREATE_ITEM_H
19#define XFS_ICREATE_ITEM_H 1
20
21/*
22 * on disk log item structure
23 *
24 * Log recovery assumes the first two entries are the type and size and they fit
25 * in 32 bits. Also in host order (ugh) so they have to be 32 bit aligned so
26 * decoding can be done correctly.
27 */
28struct xfs_icreate_log {
29 __uint16_t icl_type; /* type of log format structure */
30 __uint16_t icl_size; /* size of log format structure */
31 __be32 icl_ag; /* ag being allocated in */
32 __be32 icl_agbno; /* start block of inode range */
33 __be32 icl_count; /* number of inodes to initialise */
34 __be32 icl_isize; /* size of inodes */
35 __be32 icl_length; /* length of extent to initialise */
36 __be32 icl_gen; /* inode generation number to use */
37};
38
39/* in memory log item structure */
40struct xfs_icreate_item {
41 struct xfs_log_item ic_item;
42 struct xfs_icreate_log ic_format;
43};
44
45extern kmem_zone_t *xfs_icreate_zone; /* inode create item zone */
46
47void xfs_icreate_log(struct xfs_trans *tp, xfs_agnumber_t agno,
48 xfs_agblock_t agbno, unsigned int count,
49 unsigned int inode_size, xfs_agblock_t length,
50 unsigned int generation);
51
52#endif /* XFS_ICREATE_ITEM_H */
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 7f7be5f98f52..9ecfe1e559fc 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1028,6 +1028,11 @@ xfs_dinode_calc_crc(
1028 1028
1029/* 1029/*
1030 * Read the disk inode attributes into the in-core inode structure. 1030 * Read the disk inode attributes into the in-core inode structure.
1031 *
1032 * If we are initialising a new inode and we are not utilising the
1033 * XFS_MOUNT_IKEEP inode cluster mode, we can simply build the new inode core
1034 * with a random generation number. If we are keeping inodes around, we need to
1035 * read the inode cluster to get the existing generation number off disk.
1031 */ 1036 */
1032int 1037int
1033xfs_iread( 1038xfs_iread(
@@ -1047,6 +1052,22 @@ xfs_iread(
1047 if (error) 1052 if (error)
1048 return error; 1053 return error;
1049 1054
1055 /* shortcut IO on inode allocation if possible */
1056 if ((iget_flags & XFS_IGET_CREATE) &&
1057 !(mp->m_flags & XFS_MOUNT_IKEEP)) {
1058 /* initialise the on-disk inode core */
1059 memset(&ip->i_d, 0, sizeof(ip->i_d));
1060 ip->i_d.di_magic = XFS_DINODE_MAGIC;
1061 ip->i_d.di_gen = prandom_u32();
1062 if (xfs_sb_version_hascrc(&mp->m_sb)) {
1063 ip->i_d.di_version = 3;
1064 ip->i_d.di_ino = ip->i_ino;
1065 uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid);
1066 } else
1067 ip->i_d.di_version = 2;
1068 return 0;
1069 }
1070
1050 /* 1071 /*
1051 * Get pointers to the on-disk inode and the buffer containing it. 1072 * Get pointers to the on-disk inode and the buffer containing it.
1052 */ 1073 */
@@ -1133,17 +1154,16 @@ xfs_iread(
1133 xfs_buf_set_ref(bp, XFS_INO_REF); 1154 xfs_buf_set_ref(bp, XFS_INO_REF);
1134 1155
1135 /* 1156 /*
1136 * Use xfs_trans_brelse() to release the buffer containing the 1157 * Use xfs_trans_brelse() to release the buffer containing the on-disk
1137 * on-disk inode, because it was acquired with xfs_trans_read_buf() 1158 * inode, because it was acquired with xfs_trans_read_buf() in
1138 * in xfs_imap_to_bp() above. If tp is NULL, this is just a normal 1159 * xfs_imap_to_bp() above. If tp is NULL, this is just a normal
1139 * brelse(). If we're within a transaction, then xfs_trans_brelse() 1160 * brelse(). If we're within a transaction, then xfs_trans_brelse()
1140 * will only release the buffer if it is not dirty within the 1161 * will only release the buffer if it is not dirty within the
1141 * transaction. It will be OK to release the buffer in this case, 1162 * transaction. It will be OK to release the buffer in this case,
1142 * because inodes on disk are never destroyed and we will be 1163 * because inodes on disk are never destroyed and we will be locking the
1143 * locking the new in-core inode before putting it in the hash 1164 * new in-core inode before putting it in the cache where other
1144 * table where other processes can find it. Thus we don't have 1165 * processes can find it. Thus we don't have to worry about the inode
1145 * to worry about the inode being changed just because we released 1166 * being changed just because we released the buffer.
1146 * the buffer.
1147 */ 1167 */
1148 out_brelse: 1168 out_brelse:
1149 xfs_trans_brelse(tp, bp); 1169 xfs_trans_brelse(tp, bp);
@@ -2028,8 +2048,6 @@ xfs_ifree(
2028 int error; 2048 int error;
2029 int delete; 2049 int delete;
2030 xfs_ino_t first_ino; 2050 xfs_ino_t first_ino;
2031 xfs_dinode_t *dip;
2032 xfs_buf_t *ibp;
2033 2051
2034 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 2052 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
2035 ASSERT(ip->i_d.di_nlink == 0); 2053 ASSERT(ip->i_d.di_nlink == 0);
@@ -2042,14 +2060,13 @@ xfs_ifree(
2042 * Pull the on-disk inode from the AGI unlinked list. 2060 * Pull the on-disk inode from the AGI unlinked list.
2043 */ 2061 */
2044 error = xfs_iunlink_remove(tp, ip); 2062 error = xfs_iunlink_remove(tp, ip);
2045 if (error != 0) { 2063 if (error)
2046 return error; 2064 return error;
2047 }
2048 2065
2049 error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino); 2066 error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino);
2050 if (error != 0) { 2067 if (error)
2051 return error; 2068 return error;
2052 } 2069
2053 ip->i_d.di_mode = 0; /* mark incore inode as free */ 2070 ip->i_d.di_mode = 0; /* mark incore inode as free */
2054 ip->i_d.di_flags = 0; 2071 ip->i_d.di_flags = 0;
2055 ip->i_d.di_dmevmask = 0; 2072 ip->i_d.di_dmevmask = 0;
@@ -2061,31 +2078,10 @@ xfs_ifree(
2061 * by reincarnations of this inode. 2078 * by reincarnations of this inode.
2062 */ 2079 */
2063 ip->i_d.di_gen++; 2080 ip->i_d.di_gen++;
2064
2065 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 2081 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2066 2082
2067 error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, &dip, &ibp, 2083 if (delete)
2068 0, 0);
2069 if (error)
2070 return error;
2071
2072 /*
2073 * Clear the on-disk di_mode. This is to prevent xfs_bulkstat
2074 * from picking up this inode when it is reclaimed (its incore state
2075 * initialized but not flushed to disk yet). The in-core di_mode is
2076 * already cleared and a corresponding transaction logged.
2077 * The hack here just synchronizes the in-core to on-disk
2078 * di_mode value in advance before the actual inode sync to disk.
2079 * This is OK because the inode is already unlinked and would never
2080 * change its di_mode again for this inode generation.
2081 * This is a temporary hack that would require a proper fix
2082 * in the future.
2083 */
2084 dip->di_mode = 0;
2085
2086 if (delete) {
2087 error = xfs_ifree_cluster(ip, tp, first_ino); 2084 error = xfs_ifree_cluster(ip, tp, first_ino);
2088 }
2089 2085
2090 return error; 2086 return error;
2091} 2087}
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 8f8aaee7f379..6a7096422295 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -284,6 +284,15 @@ xfs_iomap_eof_want_preallocate(
284 return 0; 284 return 0;
285 285
286 /* 286 /*
287 * If the file is smaller than the minimum prealloc and we are using
288 * dynamic preallocation, don't do any preallocation at all as it is
289 * likely this is the only write to the file that is going to be done.
290 */
291 if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) &&
292 XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_writeio_blocks))
293 return 0;
294
295 /*
287 * If there are any real blocks past eof, then don't 296 * If there are any real blocks past eof, then don't
288 * do any speculative allocation. 297 * do any speculative allocation.
289 */ 298 */
@@ -345,6 +354,10 @@ xfs_iomap_eof_prealloc_initial_size(
345 if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) 354 if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)
346 return 0; 355 return 0;
347 356
357 /* If the file is small, then use the minimum prealloc */
358 if (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign))
359 return 0;
360
348 /* 361 /*
349 * As we write multiple pages, the offset will always align to the 362 * As we write multiple pages, the offset will always align to the
350 * start of a page and hence point to a hole at EOF. i.e. if the size is 363 * start of a page and hence point to a hole at EOF. i.e. if the size is
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index ca9ecaa81112..c69bbc493cb0 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -987,7 +987,8 @@ xfs_fiemap_format(
987 if (bmv->bmv_oflags & BMV_OF_PREALLOC) 987 if (bmv->bmv_oflags & BMV_OF_PREALLOC)
988 fiemap_flags |= FIEMAP_EXTENT_UNWRITTEN; 988 fiemap_flags |= FIEMAP_EXTENT_UNWRITTEN;
989 else if (bmv->bmv_oflags & BMV_OF_DELALLOC) { 989 else if (bmv->bmv_oflags & BMV_OF_DELALLOC) {
990 fiemap_flags |= FIEMAP_EXTENT_DELALLOC; 990 fiemap_flags |= (FIEMAP_EXTENT_DELALLOC |
991 FIEMAP_EXTENT_UNKNOWN);
991 physical = 0; /* no block yet */ 992 physical = 0; /* no block yet */
992 } 993 }
993 if (bmv->bmv_oflags & BMV_OF_LAST) 994 if (bmv->bmv_oflags & BMV_OF_LAST)
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 2ea7d402188d..bc92c5306a17 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -43,7 +43,7 @@ xfs_internal_inum(
43{ 43{
44 return (ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino || 44 return (ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino ||
45 (xfs_sb_version_hasquota(&mp->m_sb) && 45 (xfs_sb_version_hasquota(&mp->m_sb) &&
46 (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino))); 46 xfs_is_quota_inode(&mp->m_sb, ino)));
47} 47}
48 48
49/* 49/*
@@ -383,11 +383,13 @@ xfs_bulkstat(
383 * Also start read-ahead now for this chunk. 383 * Also start read-ahead now for this chunk.
384 */ 384 */
385 if (r.ir_freecount < XFS_INODES_PER_CHUNK) { 385 if (r.ir_freecount < XFS_INODES_PER_CHUNK) {
386 struct blk_plug plug;
386 /* 387 /*
387 * Loop over all clusters in the next chunk. 388 * Loop over all clusters in the next chunk.
388 * Do a readahead if there are any allocated 389 * Do a readahead if there are any allocated
389 * inodes in that cluster. 390 * inodes in that cluster.
390 */ 391 */
392 blk_start_plug(&plug);
391 agbno = XFS_AGINO_TO_AGBNO(mp, r.ir_startino); 393 agbno = XFS_AGINO_TO_AGBNO(mp, r.ir_startino);
392 for (chunkidx = 0; 394 for (chunkidx = 0;
393 chunkidx < XFS_INODES_PER_CHUNK; 395 chunkidx < XFS_INODES_PER_CHUNK;
@@ -399,6 +401,7 @@ xfs_bulkstat(
399 agbno, nbcluster, 401 agbno, nbcluster,
400 &xfs_inode_buf_ops); 402 &xfs_inode_buf_ops);
401 } 403 }
404 blk_finish_plug(&plug);
402 irbp->ir_startino = r.ir_startino; 405 irbp->ir_startino = r.ir_startino;
403 irbp->ir_freecount = r.ir_freecount; 406 irbp->ir_freecount = r.ir_freecount;
404 irbp->ir_free = r.ir_free; 407 irbp->ir_free = r.ir_free;
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index b345a7c85153..d852a2b3e1fd 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1963,6 +1963,10 @@ xlog_write_calc_vec_length(
1963 headers++; 1963 headers++;
1964 1964
1965 for (lv = log_vector; lv; lv = lv->lv_next) { 1965 for (lv = log_vector; lv; lv = lv->lv_next) {
1966 /* we don't write ordered log vectors */
1967 if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED)
1968 continue;
1969
1966 headers += lv->lv_niovecs; 1970 headers += lv->lv_niovecs;
1967 1971
1968 for (i = 0; i < lv->lv_niovecs; i++) { 1972 for (i = 0; i < lv->lv_niovecs; i++) {
@@ -2216,7 +2220,7 @@ xlog_write(
2216 index = 0; 2220 index = 0;
2217 lv = log_vector; 2221 lv = log_vector;
2218 vecp = lv->lv_iovecp; 2222 vecp = lv->lv_iovecp;
2219 while (lv && index < lv->lv_niovecs) { 2223 while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
2220 void *ptr; 2224 void *ptr;
2221 int log_offset; 2225 int log_offset;
2222 2226
@@ -2236,13 +2240,22 @@ xlog_write(
2236 * This loop writes out as many regions as can fit in the amount 2240 * This loop writes out as many regions as can fit in the amount
2237 * of space which was allocated by xlog_state_get_iclog_space(). 2241 * of space which was allocated by xlog_state_get_iclog_space().
2238 */ 2242 */
2239 while (lv && index < lv->lv_niovecs) { 2243 while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
2240 struct xfs_log_iovec *reg = &vecp[index]; 2244 struct xfs_log_iovec *reg;
2241 struct xlog_op_header *ophdr; 2245 struct xlog_op_header *ophdr;
2242 int start_rec_copy; 2246 int start_rec_copy;
2243 int copy_len; 2247 int copy_len;
2244 int copy_off; 2248 int copy_off;
2249 bool ordered = false;
2250
2251 /* ordered log vectors have no regions to write */
2252 if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) {
2253 ASSERT(lv->lv_niovecs == 0);
2254 ordered = true;
2255 goto next_lv;
2256 }
2245 2257
2258 reg = &vecp[index];
2246 ASSERT(reg->i_len % sizeof(__int32_t) == 0); 2259 ASSERT(reg->i_len % sizeof(__int32_t) == 0);
2247 ASSERT((unsigned long)ptr % sizeof(__int32_t) == 0); 2260 ASSERT((unsigned long)ptr % sizeof(__int32_t) == 0);
2248 2261
@@ -2302,12 +2315,13 @@ xlog_write(
2302 break; 2315 break;
2303 2316
2304 if (++index == lv->lv_niovecs) { 2317 if (++index == lv->lv_niovecs) {
2318next_lv:
2305 lv = lv->lv_next; 2319 lv = lv->lv_next;
2306 index = 0; 2320 index = 0;
2307 if (lv) 2321 if (lv)
2308 vecp = lv->lv_iovecp; 2322 vecp = lv->lv_iovecp;
2309 } 2323 }
2310 if (record_cnt == 0) { 2324 if (record_cnt == 0 && ordered == false) {
2311 if (!lv) 2325 if (!lv)
2312 return 0; 2326 return 0;
2313 break; 2327 break;
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 5caee96059df..fb630e496c12 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -88,7 +88,8 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
88#define XLOG_REG_TYPE_UNMOUNT 17 88#define XLOG_REG_TYPE_UNMOUNT 17
89#define XLOG_REG_TYPE_COMMIT 18 89#define XLOG_REG_TYPE_COMMIT 18
90#define XLOG_REG_TYPE_TRANSHDR 19 90#define XLOG_REG_TYPE_TRANSHDR 19
91#define XLOG_REG_TYPE_MAX 19 91#define XLOG_REG_TYPE_ICREATE 20
92#define XLOG_REG_TYPE_MAX 20
92 93
93typedef struct xfs_log_iovec { 94typedef struct xfs_log_iovec {
94 void *i_addr; /* beginning address of region */ 95 void *i_addr; /* beginning address of region */
@@ -105,6 +106,8 @@ struct xfs_log_vec {
105 int lv_buf_len; /* size of formatted buffer */ 106 int lv_buf_len; /* size of formatted buffer */
106}; 107};
107 108
109#define XFS_LOG_VEC_ORDERED (-1)
110
108/* 111/*
109 * Structure used to pass callback function and the function's argument 112 * Structure used to pass callback function and the function's argument
110 * to the log manager. 113 * to the log manager.
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index d0833b54e55d..02b9cf3f8252 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -127,6 +127,7 @@ xlog_cil_prepare_log_vecs(
127 int index; 127 int index;
128 int len = 0; 128 int len = 0;
129 uint niovecs; 129 uint niovecs;
130 bool ordered = false;
130 131
131 /* Skip items which aren't dirty in this transaction. */ 132 /* Skip items which aren't dirty in this transaction. */
132 if (!(lidp->lid_flags & XFS_LID_DIRTY)) 133 if (!(lidp->lid_flags & XFS_LID_DIRTY))
@@ -137,14 +138,30 @@ xlog_cil_prepare_log_vecs(
137 if (!niovecs) 138 if (!niovecs)
138 continue; 139 continue;
139 140
141 /*
142 * Ordered items need to be tracked but we do not wish to write
143 * them. We need a logvec to track the object, but we do not
144 * need an iovec or buffer to be allocated for copying data.
145 */
146 if (niovecs == XFS_LOG_VEC_ORDERED) {
147 ordered = true;
148 niovecs = 0;
149 }
150
140 new_lv = kmem_zalloc(sizeof(*new_lv) + 151 new_lv = kmem_zalloc(sizeof(*new_lv) +
141 niovecs * sizeof(struct xfs_log_iovec), 152 niovecs * sizeof(struct xfs_log_iovec),
142 KM_SLEEP|KM_NOFS); 153 KM_SLEEP|KM_NOFS);
143 154
155 new_lv->lv_item = lidp->lid_item;
156 new_lv->lv_niovecs = niovecs;
157 if (ordered) {
158 /* track as an ordered logvec */
159 new_lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
160 goto next;
161 }
162
144 /* The allocated iovec region lies beyond the log vector. */ 163 /* The allocated iovec region lies beyond the log vector. */
145 new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1]; 164 new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1];
146 new_lv->lv_niovecs = niovecs;
147 new_lv->lv_item = lidp->lid_item;
148 165
149 /* build the vector array and calculate its length */ 166 /* build the vector array and calculate its length */
150 IOP_FORMAT(new_lv->lv_item, new_lv->lv_iovecp); 167 IOP_FORMAT(new_lv->lv_item, new_lv->lv_iovecp);
@@ -165,6 +182,7 @@ xlog_cil_prepare_log_vecs(
165 } 182 }
166 ASSERT(ptr == new_lv->lv_buf + new_lv->lv_buf_len); 183 ASSERT(ptr == new_lv->lv_buf + new_lv->lv_buf_len);
167 184
185next:
168 if (!ret_lv) 186 if (!ret_lv)
169 ret_lv = new_lv; 187 ret_lv = new_lv;
170 else 188 else
@@ -191,8 +209,18 @@ xfs_cil_prepare_item(
191 209
192 if (old) { 210 if (old) {
193 /* existing lv on log item, space used is a delta */ 211 /* existing lv on log item, space used is a delta */
194 ASSERT(!list_empty(&lv->lv_item->li_cil)); 212 ASSERT((old->lv_buf && old->lv_buf_len && old->lv_niovecs) ||
195 ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs); 213 old->lv_buf_len == XFS_LOG_VEC_ORDERED);
214
215 /*
216 * If the new item is ordered, keep the old one that is already
217 * tracking dirty or ordered regions
218 */
219 if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) {
220 ASSERT(!lv->lv_buf);
221 kmem_free(lv);
222 return;
223 }
196 224
197 *len += lv->lv_buf_len - old->lv_buf_len; 225 *len += lv->lv_buf_len - old->lv_buf_len;
198 *diff_iovecs += lv->lv_niovecs - old->lv_niovecs; 226 *diff_iovecs += lv->lv_niovecs - old->lv_niovecs;
@@ -201,10 +229,11 @@ xfs_cil_prepare_item(
201 } else { 229 } else {
202 /* new lv, must pin the log item */ 230 /* new lv, must pin the log item */
203 ASSERT(!lv->lv_item->li_lv); 231 ASSERT(!lv->lv_item->li_lv);
204 ASSERT(list_empty(&lv->lv_item->li_cil));
205 232
206 *len += lv->lv_buf_len; 233 if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) {
207 *diff_iovecs += lv->lv_niovecs; 234 *len += lv->lv_buf_len;
235 *diff_iovecs += lv->lv_niovecs;
236 }
208 IOP_PIN(lv->lv_item); 237 IOP_PIN(lv->lv_item);
209 238
210 } 239 }
@@ -259,18 +288,24 @@ xlog_cil_insert_items(
259 * We can do this safely because the context can't checkpoint until we 288 * We can do this safely because the context can't checkpoint until we
260 * are done so it doesn't matter exactly how we update the CIL. 289 * are done so it doesn't matter exactly how we update the CIL.
261 */ 290 */
262 for (lv = log_vector; lv; lv = lv->lv_next)
263 xfs_cil_prepare_item(log, lv, &len, &diff_iovecs);
264
265 /* account for space used by new iovec headers */
266 len += diff_iovecs * sizeof(xlog_op_header_t);
267
268 spin_lock(&cil->xc_cil_lock); 291 spin_lock(&cil->xc_cil_lock);
292 for (lv = log_vector; lv; ) {
293 struct xfs_log_vec *next = lv->lv_next;
269 294
270 /* move the items to the tail of the CIL */ 295 ASSERT(lv->lv_item->li_lv || list_empty(&lv->lv_item->li_cil));
271 for (lv = log_vector; lv; lv = lv->lv_next) 296 lv->lv_next = NULL;
297
298 /*
299 * xfs_cil_prepare_item() may free the lv, so move the item on
300 * the CIL first.
301 */
272 list_move_tail(&lv->lv_item->li_cil, &cil->xc_cil); 302 list_move_tail(&lv->lv_item->li_cil, &cil->xc_cil);
303 xfs_cil_prepare_item(log, lv, &len, &diff_iovecs);
304 lv = next;
305 }
273 306
307 /* account for space used by new iovec headers */
308 len += diff_iovecs * sizeof(xlog_op_header_t);
274 ctx->nvecs += diff_iovecs; 309 ctx->nvecs += diff_iovecs;
275 310
276 /* 311 /*
@@ -381,9 +416,7 @@ xlog_cil_push(
381 struct xfs_cil_ctx *new_ctx; 416 struct xfs_cil_ctx *new_ctx;
382 struct xlog_in_core *commit_iclog; 417 struct xlog_in_core *commit_iclog;
383 struct xlog_ticket *tic; 418 struct xlog_ticket *tic;
384 int num_lv;
385 int num_iovecs; 419 int num_iovecs;
386 int len;
387 int error = 0; 420 int error = 0;
388 struct xfs_trans_header thdr; 421 struct xfs_trans_header thdr;
389 struct xfs_log_iovec lhdr; 422 struct xfs_log_iovec lhdr;
@@ -428,12 +461,9 @@ xlog_cil_push(
428 * side which is currently locked out by the flush lock. 461 * side which is currently locked out by the flush lock.
429 */ 462 */
430 lv = NULL; 463 lv = NULL;
431 num_lv = 0;
432 num_iovecs = 0; 464 num_iovecs = 0;
433 len = 0;
434 while (!list_empty(&cil->xc_cil)) { 465 while (!list_empty(&cil->xc_cil)) {
435 struct xfs_log_item *item; 466 struct xfs_log_item *item;
436 int i;
437 467
438 item = list_first_entry(&cil->xc_cil, 468 item = list_first_entry(&cil->xc_cil,
439 struct xfs_log_item, li_cil); 469 struct xfs_log_item, li_cil);
@@ -444,11 +474,7 @@ xlog_cil_push(
444 lv->lv_next = item->li_lv; 474 lv->lv_next = item->li_lv;
445 lv = item->li_lv; 475 lv = item->li_lv;
446 item->li_lv = NULL; 476 item->li_lv = NULL;
447
448 num_lv++;
449 num_iovecs += lv->lv_niovecs; 477 num_iovecs += lv->lv_niovecs;
450 for (i = 0; i < lv->lv_niovecs; i++)
451 len += lv->lv_iovecp[i].i_len;
452 } 478 }
453 479
454 /* 480 /*
@@ -701,6 +727,7 @@ xfs_log_commit_cil(
701 if (commit_lsn) 727 if (commit_lsn)
702 *commit_lsn = log->l_cilp->xc_ctx->sequence; 728 *commit_lsn = log->l_cilp->xc_ctx->sequence;
703 729
730 /* xlog_cil_insert_items() destroys log_vector list */
704 xlog_cil_insert_items(log, log_vector, tp->t_ticket); 731 xlog_cil_insert_items(log, log_vector, tp->t_ticket);
705 732
706 /* check we didn't blow the reservation */ 733 /* check we didn't blow the reservation */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 7cf5e4eafe28..6fcc910a50b9 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -45,6 +45,7 @@
45#include "xfs_cksum.h" 45#include "xfs_cksum.h"
46#include "xfs_trace.h" 46#include "xfs_trace.h"
47#include "xfs_icache.h" 47#include "xfs_icache.h"
48#include "xfs_icreate_item.h"
48 49
49/* Need all the magic numbers and buffer ops structures from these headers */ 50/* Need all the magic numbers and buffer ops structures from these headers */
50#include "xfs_symlink.h" 51#include "xfs_symlink.h"
@@ -1617,7 +1618,10 @@ xlog_recover_add_to_trans(
1617 * form the cancelled buffer table. Hence they have to be done last. 1618 * form the cancelled buffer table. Hence they have to be done last.
1618 * 1619 *
1619 * 3. Inode allocation buffers must be replayed before inode items that 1620 * 3. Inode allocation buffers must be replayed before inode items that
1620 * read the buffer and replay changes into it. 1621 * read the buffer and replay changes into it. For filesystems using the
1622 * ICREATE transactions, this means XFS_LI_ICREATE objects need to get
1623 * treated the same as inode allocation buffers as they create and
1624 * initialise the buffers directly.
1621 * 1625 *
1622 * 4. Inode unlink buffers must be replayed after inode items are replayed. 1626 * 4. Inode unlink buffers must be replayed after inode items are replayed.
1623 * This ensures that inodes are completely flushed to the inode buffer 1627 * This ensures that inodes are completely flushed to the inode buffer
@@ -1632,10 +1636,17 @@ xlog_recover_add_to_trans(
1632 * from all the other buffers and move them to last. 1636 * from all the other buffers and move them to last.
1633 * 1637 *
1634 * Hence, 4 lists, in order from head to tail: 1638 * Hence, 4 lists, in order from head to tail:
1635 * - buffer_list for all buffers except cancelled/inode unlink buffers 1639 * - buffer_list for all buffers except cancelled/inode unlink buffers
1636 * - item_list for all non-buffer items 1640 * - item_list for all non-buffer items
1637 * - inode_buffer_list for inode unlink buffers 1641 * - inode_buffer_list for inode unlink buffers
1638 * - cancel_list for the cancelled buffers 1642 * - cancel_list for the cancelled buffers
1643 *
1644 * Note that we add objects to the tail of the lists so that first-to-last
1645 * ordering is preserved within the lists. Adding objects to the head of the
1646 * list means when we traverse from the head we walk them in last-to-first
1647 * order. For cancelled buffers and inode unlink buffers this doesn't matter,
1648 * but for all other items there may be specific ordering that we need to
1649 * preserve.
1639 */ 1650 */
1640STATIC int 1651STATIC int
1641xlog_recover_reorder_trans( 1652xlog_recover_reorder_trans(
@@ -1655,6 +1666,9 @@ xlog_recover_reorder_trans(
1655 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; 1666 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
1656 1667
1657 switch (ITEM_TYPE(item)) { 1668 switch (ITEM_TYPE(item)) {
1669 case XFS_LI_ICREATE:
1670 list_move_tail(&item->ri_list, &buffer_list);
1671 break;
1658 case XFS_LI_BUF: 1672 case XFS_LI_BUF:
1659 if (buf_f->blf_flags & XFS_BLF_CANCEL) { 1673 if (buf_f->blf_flags & XFS_BLF_CANCEL) {
1660 trace_xfs_log_recover_item_reorder_head(log, 1674 trace_xfs_log_recover_item_reorder_head(log,
@@ -2982,6 +2996,93 @@ xlog_recover_efd_pass2(
2982} 2996}
2983 2997
2984/* 2998/*
2999 * This routine is called when an inode create format structure is found in a
3000 * committed transaction in the log. Its purpose is to initialise the inodes
3001 * being allocated on disk. This requires us to get inode cluster buffers that
3002 * match the range to be initialised, stamped with inode templates and written
3003 * by delayed write so that subsequent modifications will hit the cached buffer
3004 * and only need writing out at the end of recovery.
3005 */
3006STATIC int
3007xlog_recover_do_icreate_pass2(
3008 struct xlog *log,
3009 struct list_head *buffer_list,
3010 xlog_recover_item_t *item)
3011{
3012 struct xfs_mount *mp = log->l_mp;
3013 struct xfs_icreate_log *icl;
3014 xfs_agnumber_t agno;
3015 xfs_agblock_t agbno;
3016 unsigned int count;
3017 unsigned int isize;
3018 xfs_agblock_t length;
3019
3020 icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr;
3021 if (icl->icl_type != XFS_LI_ICREATE) {
3022 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type");
3023 return EINVAL;
3024 }
3025
3026 if (icl->icl_size != 1) {
3027 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size");
3028 return EINVAL;
3029 }
3030
3031 agno = be32_to_cpu(icl->icl_ag);
3032 if (agno >= mp->m_sb.sb_agcount) {
3033 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno");
3034 return EINVAL;
3035 }
3036 agbno = be32_to_cpu(icl->icl_agbno);
3037 if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) {
3038 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno");
3039 return EINVAL;
3040 }
3041 isize = be32_to_cpu(icl->icl_isize);
3042 if (isize != mp->m_sb.sb_inodesize) {
3043 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize");
3044 return EINVAL;
3045 }
3046 count = be32_to_cpu(icl->icl_count);
3047 if (!count) {
3048 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count");
3049 return EINVAL;
3050 }
3051 length = be32_to_cpu(icl->icl_length);
3052 if (!length || length >= mp->m_sb.sb_agblocks) {
3053 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length");
3054 return EINVAL;
3055 }
3056
3057 /* existing allocation is fixed value */
3058 ASSERT(count == XFS_IALLOC_INODES(mp));
3059 ASSERT(length == XFS_IALLOC_BLOCKS(mp));
3060 if (count != XFS_IALLOC_INODES(mp) ||
3061 length != XFS_IALLOC_BLOCKS(mp)) {
3062 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2");
3063 return EINVAL;
3064 }
3065
3066 /*
3067 * Inode buffers can be freed. Do not replay the inode initialisation as
3068 * we could be overwriting something written after this inode buffer was
3069 * cancelled.
3070 *
3071 * XXX: we need to iterate all buffers and only init those that are not
3072 * cancelled. I think that a more fine grained factoring of
3073 * xfs_ialloc_inode_init may be appropriate here to enable this to be
3074 * done easily.
3075 */
3076 if (xlog_check_buffer_cancelled(log,
3077 XFS_AGB_TO_DADDR(mp, agno, agbno), length, 0))
3078 return 0;
3079
3080 xfs_ialloc_inode_init(mp, NULL, buffer_list, agno, agbno, length,
3081 be32_to_cpu(icl->icl_gen));
3082 return 0;
3083}
3084
3085/*
2985 * Free up any resources allocated by the transaction 3086 * Free up any resources allocated by the transaction
2986 * 3087 *
2987 * Remember that EFIs, EFDs, and IUNLINKs are handled later. 3088 * Remember that EFIs, EFDs, and IUNLINKs are handled later.
@@ -3023,6 +3124,7 @@ xlog_recover_commit_pass1(
3023 case XFS_LI_EFI: 3124 case XFS_LI_EFI:
3024 case XFS_LI_EFD: 3125 case XFS_LI_EFD:
3025 case XFS_LI_DQUOT: 3126 case XFS_LI_DQUOT:
3127 case XFS_LI_ICREATE:
3026 /* nothing to do in pass 1 */ 3128 /* nothing to do in pass 1 */
3027 return 0; 3129 return 0;
3028 default: 3130 default:
@@ -3053,6 +3155,8 @@ xlog_recover_commit_pass2(
3053 return xlog_recover_efd_pass2(log, item); 3155 return xlog_recover_efd_pass2(log, item);
3054 case XFS_LI_DQUOT: 3156 case XFS_LI_DQUOT:
3055 return xlog_recover_dquot_pass2(log, buffer_list, item); 3157 return xlog_recover_dquot_pass2(log, buffer_list, item);
3158 case XFS_LI_ICREATE:
3159 return xlog_recover_do_icreate_pass2(log, buffer_list, item);
3056 case XFS_LI_QUOTAOFF: 3160 case XFS_LI_QUOTAOFF:
3057 /* nothing to do in pass2 */ 3161 /* nothing to do in pass2 */
3058 return 0; 3162 return 0;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index e8e310c05097..2b0ba3581656 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -336,6 +336,14 @@ xfs_mount_validate_sb(
336 return XFS_ERROR(EWRONGFS); 336 return XFS_ERROR(EWRONGFS);
337 } 337 }
338 338
339 if ((sbp->sb_qflags & (XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD)) &&
340 (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD |
341 XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD))) {
342 xfs_notice(mp,
343"Super block has XFS_OQUOTA bits along with XFS_PQUOTA and/or XFS_GQUOTA bits.\n");
344 return XFS_ERROR(EFSCORRUPTED);
345 }
346
339 /* 347 /*
340 * Version 5 superblock feature mask validation. Reject combinations the 348 * Version 5 superblock feature mask validation. Reject combinations the
341 * kernel cannot support up front before checking anything else. For 349 * kernel cannot support up front before checking anything else. For
@@ -561,6 +569,18 @@ out_unwind:
561 return error; 569 return error;
562} 570}
563 571
572static void
573xfs_sb_quota_from_disk(struct xfs_sb *sbp)
574{
575 if (sbp->sb_qflags & XFS_OQUOTA_ENFD)
576 sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ?
577 XFS_PQUOTA_ENFD : XFS_GQUOTA_ENFD;
578 if (sbp->sb_qflags & XFS_OQUOTA_CHKD)
579 sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ?
580 XFS_PQUOTA_CHKD : XFS_GQUOTA_CHKD;
581 sbp->sb_qflags &= ~(XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD);
582}
583
564void 584void
565xfs_sb_from_disk( 585xfs_sb_from_disk(
566 struct xfs_sb *to, 586 struct xfs_sb *to,
@@ -622,6 +642,35 @@ xfs_sb_from_disk(
622 to->sb_lsn = be64_to_cpu(from->sb_lsn); 642 to->sb_lsn = be64_to_cpu(from->sb_lsn);
623} 643}
624 644
645static inline void
646xfs_sb_quota_to_disk(
647 xfs_dsb_t *to,
648 xfs_sb_t *from,
649 __int64_t *fields)
650{
651 __uint16_t qflags = from->sb_qflags;
652
653 if (*fields & XFS_SB_QFLAGS) {
654 /*
655 * The in-core version of sb_qflags do not have
656 * XFS_OQUOTA_* flags, whereas the on-disk version
657 * does. So, convert incore XFS_{PG}QUOTA_* flags
658 * to on-disk XFS_OQUOTA_* flags.
659 */
660 qflags &= ~(XFS_PQUOTA_ENFD | XFS_PQUOTA_CHKD |
661 XFS_GQUOTA_ENFD | XFS_GQUOTA_CHKD);
662
663 if (from->sb_qflags &
664 (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD))
665 qflags |= XFS_OQUOTA_ENFD;
666 if (from->sb_qflags &
667 (XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD))
668 qflags |= XFS_OQUOTA_CHKD;
669 to->sb_qflags = cpu_to_be16(qflags);
670 *fields &= ~XFS_SB_QFLAGS;
671 }
672}
673
625/* 674/*
626 * Copy in core superblock to ondisk one. 675 * Copy in core superblock to ondisk one.
627 * 676 *
@@ -643,6 +692,7 @@ xfs_sb_to_disk(
643 if (!fields) 692 if (!fields)
644 return; 693 return;
645 694
695 xfs_sb_quota_to_disk(to, from, &fields);
646 while (fields) { 696 while (fields) {
647 f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields); 697 f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
648 first = xfs_sb_info[f].offset; 698 first = xfs_sb_info[f].offset;
@@ -835,6 +885,7 @@ reread:
835 */ 885 */
836 xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp)); 886 xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp));
837 887
888 xfs_sb_quota_from_disk(&mp->m_sb);
838 /* 889 /*
839 * We must be able to do sector-sized and sector-aligned IO. 890 * We must be able to do sector-sized and sector-aligned IO.
840 */ 891 */
@@ -987,42 +1038,27 @@ xfs_update_alignment(xfs_mount_t *mp)
987 */ 1038 */
988 if ((BBTOB(mp->m_dalign) & mp->m_blockmask) || 1039 if ((BBTOB(mp->m_dalign) & mp->m_blockmask) ||
989 (BBTOB(mp->m_swidth) & mp->m_blockmask)) { 1040 (BBTOB(mp->m_swidth) & mp->m_blockmask)) {
990 if (mp->m_flags & XFS_MOUNT_RETERR) { 1041 xfs_warn(mp,
991 xfs_warn(mp, "alignment check failed: " 1042 "alignment check failed: sunit/swidth vs. blocksize(%d)",
992 "(sunit/swidth vs. blocksize)"); 1043 sbp->sb_blocksize);
993 return XFS_ERROR(EINVAL); 1044 return XFS_ERROR(EINVAL);
994 }
995 mp->m_dalign = mp->m_swidth = 0;
996 } else { 1045 } else {
997 /* 1046 /*
998 * Convert the stripe unit and width to FSBs. 1047 * Convert the stripe unit and width to FSBs.
999 */ 1048 */
1000 mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign); 1049 mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign);
1001 if (mp->m_dalign && (sbp->sb_agblocks % mp->m_dalign)) { 1050 if (mp->m_dalign && (sbp->sb_agblocks % mp->m_dalign)) {
1002 if (mp->m_flags & XFS_MOUNT_RETERR) {
1003 xfs_warn(mp, "alignment check failed: "
1004 "(sunit/swidth vs. ag size)");
1005 return XFS_ERROR(EINVAL);
1006 }
1007 xfs_warn(mp, 1051 xfs_warn(mp,
1008 "stripe alignment turned off: sunit(%d)/swidth(%d) " 1052 "alignment check failed: sunit/swidth vs. agsize(%d)",
1009 "incompatible with agsize(%d)", 1053 sbp->sb_agblocks);
1010 mp->m_dalign, mp->m_swidth, 1054 return XFS_ERROR(EINVAL);
1011 sbp->sb_agblocks);
1012
1013 mp->m_dalign = 0;
1014 mp->m_swidth = 0;
1015 } else if (mp->m_dalign) { 1055 } else if (mp->m_dalign) {
1016 mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth); 1056 mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
1017 } else { 1057 } else {
1018 if (mp->m_flags & XFS_MOUNT_RETERR) { 1058 xfs_warn(mp,
1019 xfs_warn(mp, "alignment check failed: " 1059 "alignment check failed: sunit(%d) less than bsize(%d)",
1020 "sunit(%d) less than bsize(%d)", 1060 mp->m_dalign, sbp->sb_blocksize);
1021 mp->m_dalign, 1061 return XFS_ERROR(EINVAL);
1022 mp->m_blockmask +1);
1023 return XFS_ERROR(EINVAL);
1024 }
1025 mp->m_swidth = 0;
1026 } 1062 }
1027 } 1063 }
1028 1064
@@ -1039,6 +1075,10 @@ xfs_update_alignment(xfs_mount_t *mp)
1039 sbp->sb_width = mp->m_swidth; 1075 sbp->sb_width = mp->m_swidth;
1040 mp->m_update_flags |= XFS_SB_WIDTH; 1076 mp->m_update_flags |= XFS_SB_WIDTH;
1041 } 1077 }
1078 } else {
1079 xfs_warn(mp,
1080 "cannot change alignment: superblock does not support data alignment");
1081 return XFS_ERROR(EINVAL);
1042 } 1082 }
1043 } else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN && 1083 } else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN &&
1044 xfs_sb_version_hasdalign(&mp->m_sb)) { 1084 xfs_sb_version_hasdalign(&mp->m_sb)) {
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index b004cecdfb04..4e374d4a9189 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -192,8 +192,6 @@ typedef struct xfs_mount {
192 xfs_dablk_t m_dirleafblk; /* blockno of dir non-data v2 */ 192 xfs_dablk_t m_dirleafblk; /* blockno of dir non-data v2 */
193 xfs_dablk_t m_dirfreeblk; /* blockno of dirfreeindex v2 */ 193 xfs_dablk_t m_dirfreeblk; /* blockno of dirfreeindex v2 */
194 uint m_chsize; /* size of next field */ 194 uint m_chsize; /* size of next field */
195 struct xfs_chash *m_chash; /* fs private inode per-cluster
196 * hash table */
197 atomic_t m_active_trans; /* number trans frozen */ 195 atomic_t m_active_trans; /* number trans frozen */
198#ifdef HAVE_PERCPU_SB 196#ifdef HAVE_PERCPU_SB
199 xfs_icsb_cnts_t __percpu *m_sb_cnts; /* per-cpu superblock counters */ 197 xfs_icsb_cnts_t __percpu *m_sb_cnts; /* per-cpu superblock counters */
@@ -229,8 +227,6 @@ typedef struct xfs_mount {
229 operations, typically for 227 operations, typically for
230 disk errors in metadata */ 228 disk errors in metadata */
231#define XFS_MOUNT_DISCARD (1ULL << 5) /* discard unused blocks */ 229#define XFS_MOUNT_DISCARD (1ULL << 5) /* discard unused blocks */
232#define XFS_MOUNT_RETERR (1ULL << 6) /* return alignment errors to
233 user */
234#define XFS_MOUNT_NOALIGN (1ULL << 7) /* turn off stripe alignment 230#define XFS_MOUNT_NOALIGN (1ULL << 7) /* turn off stripe alignment
235 allocations */ 231 allocations */
236#define XFS_MOUNT_ATTR2 (1ULL << 8) /* allow use of attr2 format */ 232#define XFS_MOUNT_ATTR2 (1ULL << 8) /* allow use of attr2 format */
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index b75c9bb6e71e..7a3e007b49f4 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -70,7 +70,7 @@ xfs_qm_dquot_walk(
70 void *data) 70 void *data)
71{ 71{
72 struct xfs_quotainfo *qi = mp->m_quotainfo; 72 struct xfs_quotainfo *qi = mp->m_quotainfo;
73 struct radix_tree_root *tree = XFS_DQUOT_TREE(qi, type); 73 struct radix_tree_root *tree = xfs_dquot_tree(qi, type);
74 uint32_t next_index; 74 uint32_t next_index;
75 int last_error = 0; 75 int last_error = 0;
76 int skipped; 76 int skipped;
@@ -189,7 +189,7 @@ xfs_qm_dqpurge(
189 xfs_dqfunlock(dqp); 189 xfs_dqfunlock(dqp);
190 xfs_dqunlock(dqp); 190 xfs_dqunlock(dqp);
191 191
192 radix_tree_delete(XFS_DQUOT_TREE(qi, dqp->q_core.d_flags), 192 radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags),
193 be32_to_cpu(dqp->q_core.d_id)); 193 be32_to_cpu(dqp->q_core.d_id));
194 qi->qi_dquots--; 194 qi->qi_dquots--;
195 195
@@ -299,8 +299,10 @@ xfs_qm_mount_quotas(
299 */ 299 */
300 if (!XFS_IS_UQUOTA_ON(mp)) 300 if (!XFS_IS_UQUOTA_ON(mp))
301 mp->m_qflags &= ~XFS_UQUOTA_CHKD; 301 mp->m_qflags &= ~XFS_UQUOTA_CHKD;
302 if (!(XFS_IS_GQUOTA_ON(mp) || XFS_IS_PQUOTA_ON(mp))) 302 if (!XFS_IS_GQUOTA_ON(mp))
303 mp->m_qflags &= ~XFS_OQUOTA_CHKD; 303 mp->m_qflags &= ~XFS_GQUOTA_CHKD;
304 if (!XFS_IS_PQUOTA_ON(mp))
305 mp->m_qflags &= ~XFS_PQUOTA_CHKD;
304 306
305 write_changes: 307 write_changes:
306 /* 308 /*
@@ -489,8 +491,7 @@ xfs_qm_need_dqattach(
489 return false; 491 return false;
490 if (!XFS_NOT_DQATTACHED(mp, ip)) 492 if (!XFS_NOT_DQATTACHED(mp, ip))
491 return false; 493 return false;
492 if (ip->i_ino == mp->m_sb.sb_uquotino || 494 if (xfs_is_quota_inode(&mp->m_sb, ip->i_ino))
493 ip->i_ino == mp->m_sb.sb_gquotino)
494 return false; 495 return false;
495 return true; 496 return true;
496} 497}
@@ -606,8 +607,7 @@ xfs_qm_dqdetach(
606 607
607 trace_xfs_dquot_dqdetach(ip); 608 trace_xfs_dquot_dqdetach(ip);
608 609
609 ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_uquotino); 610 ASSERT(!xfs_is_quota_inode(&ip->i_mount->m_sb, ip->i_ino));
610 ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_gquotino);
611 if (ip->i_udquot) { 611 if (ip->i_udquot) {
612 xfs_qm_dqrele(ip->i_udquot); 612 xfs_qm_dqrele(ip->i_udquot);
613 ip->i_udquot = NULL; 613 ip->i_udquot = NULL;
@@ -1152,7 +1152,7 @@ xfs_qm_dqusage_adjust(
1152 * rootino must have its resources accounted for, not so with the quota 1152 * rootino must have its resources accounted for, not so with the quota
1153 * inodes. 1153 * inodes.
1154 */ 1154 */
1155 if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) { 1155 if (xfs_is_quota_inode(&mp->m_sb, ino)) {
1156 *res = BULKSTAT_RV_NOTHING; 1156 *res = BULKSTAT_RV_NOTHING;
1157 return XFS_ERROR(EINVAL); 1157 return XFS_ERROR(EINVAL);
1158 } 1158 }
@@ -1262,19 +1262,20 @@ int
1262xfs_qm_quotacheck( 1262xfs_qm_quotacheck(
1263 xfs_mount_t *mp) 1263 xfs_mount_t *mp)
1264{ 1264{
1265 int done, count, error, error2; 1265 int done, count, error, error2;
1266 xfs_ino_t lastino; 1266 xfs_ino_t lastino;
1267 size_t structsz; 1267 size_t structsz;
1268 xfs_inode_t *uip, *gip; 1268 uint flags;
1269 uint flags; 1269 LIST_HEAD (buffer_list);
1270 LIST_HEAD (buffer_list); 1270 struct xfs_inode *uip = mp->m_quotainfo->qi_uquotaip;
1271 struct xfs_inode *gip = mp->m_quotainfo->qi_gquotaip;
1271 1272
1272 count = INT_MAX; 1273 count = INT_MAX;
1273 structsz = 1; 1274 structsz = 1;
1274 lastino = 0; 1275 lastino = 0;
1275 flags = 0; 1276 flags = 0;
1276 1277
1277 ASSERT(mp->m_quotainfo->qi_uquotaip || mp->m_quotainfo->qi_gquotaip); 1278 ASSERT(uip || gip);
1278 ASSERT(XFS_IS_QUOTA_RUNNING(mp)); 1279 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
1279 1280
1280 xfs_notice(mp, "Quotacheck needed: Please wait."); 1281 xfs_notice(mp, "Quotacheck needed: Please wait.");
@@ -1284,7 +1285,6 @@ xfs_qm_quotacheck(
1284 * their counters to zero. We need a clean slate. 1285 * their counters to zero. We need a clean slate.
1285 * We don't log our changes till later. 1286 * We don't log our changes till later.
1286 */ 1287 */
1287 uip = mp->m_quotainfo->qi_uquotaip;
1288 if (uip) { 1288 if (uip) {
1289 error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA, 1289 error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA,
1290 &buffer_list); 1290 &buffer_list);
@@ -1293,14 +1293,14 @@ xfs_qm_quotacheck(
1293 flags |= XFS_UQUOTA_CHKD; 1293 flags |= XFS_UQUOTA_CHKD;
1294 } 1294 }
1295 1295
1296 gip = mp->m_quotainfo->qi_gquotaip;
1297 if (gip) { 1296 if (gip) {
1298 error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ? 1297 error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ?
1299 XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA, 1298 XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA,
1300 &buffer_list); 1299 &buffer_list);
1301 if (error) 1300 if (error)
1302 goto error_return; 1301 goto error_return;
1303 flags |= XFS_OQUOTA_CHKD; 1302 flags |= XFS_IS_GQUOTA_ON(mp) ?
1303 XFS_GQUOTA_CHKD : XFS_PQUOTA_CHKD;
1304 } 1304 }
1305 1305
1306 do { 1306 do {
@@ -1395,15 +1395,13 @@ STATIC int
1395xfs_qm_init_quotainos( 1395xfs_qm_init_quotainos(
1396 xfs_mount_t *mp) 1396 xfs_mount_t *mp)
1397{ 1397{
1398 xfs_inode_t *uip, *gip; 1398 struct xfs_inode *uip = NULL;
1399 int error; 1399 struct xfs_inode *gip = NULL;
1400 __int64_t sbflags; 1400 int error;
1401 uint flags; 1401 __int64_t sbflags = 0;
1402 uint flags = 0;
1402 1403
1403 ASSERT(mp->m_quotainfo); 1404 ASSERT(mp->m_quotainfo);
1404 uip = gip = NULL;
1405 sbflags = 0;
1406 flags = 0;
1407 1405
1408 /* 1406 /*
1409 * Get the uquota and gquota inodes 1407 * Get the uquota and gquota inodes
@@ -1412,19 +1410,18 @@ xfs_qm_init_quotainos(
1412 if (XFS_IS_UQUOTA_ON(mp) && 1410 if (XFS_IS_UQUOTA_ON(mp) &&
1413 mp->m_sb.sb_uquotino != NULLFSINO) { 1411 mp->m_sb.sb_uquotino != NULLFSINO) {
1414 ASSERT(mp->m_sb.sb_uquotino > 0); 1412 ASSERT(mp->m_sb.sb_uquotino > 0);
1415 if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 1413 error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
1416 0, 0, &uip))) 1414 0, 0, &uip);
1415 if (error)
1417 return XFS_ERROR(error); 1416 return XFS_ERROR(error);
1418 } 1417 }
1419 if (XFS_IS_OQUOTA_ON(mp) && 1418 if (XFS_IS_OQUOTA_ON(mp) &&
1420 mp->m_sb.sb_gquotino != NULLFSINO) { 1419 mp->m_sb.sb_gquotino != NULLFSINO) {
1421 ASSERT(mp->m_sb.sb_gquotino > 0); 1420 ASSERT(mp->m_sb.sb_gquotino > 0);
1422 if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 1421 error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
1423 0, 0, &gip))) { 1422 0, 0, &gip);
1424 if (uip) 1423 if (error)
1425 IRELE(uip); 1424 goto error_rele;
1426 return XFS_ERROR(error);
1427 }
1428 } 1425 }
1429 } else { 1426 } else {
1430 flags |= XFS_QMOPT_SBVERSION; 1427 flags |= XFS_QMOPT_SBVERSION;
@@ -1439,10 +1436,11 @@ xfs_qm_init_quotainos(
1439 * temporarily switch to read-write to do this. 1436 * temporarily switch to read-write to do this.
1440 */ 1437 */
1441 if (XFS_IS_UQUOTA_ON(mp) && uip == NULL) { 1438 if (XFS_IS_UQUOTA_ON(mp) && uip == NULL) {
1442 if ((error = xfs_qm_qino_alloc(mp, &uip, 1439 error = xfs_qm_qino_alloc(mp, &uip,
1443 sbflags | XFS_SB_UQUOTINO, 1440 sbflags | XFS_SB_UQUOTINO,
1444 flags | XFS_QMOPT_UQUOTA))) 1441 flags | XFS_QMOPT_UQUOTA);
1445 return XFS_ERROR(error); 1442 if (error)
1443 goto error_rele;
1446 1444
1447 flags &= ~XFS_QMOPT_SBVERSION; 1445 flags &= ~XFS_QMOPT_SBVERSION;
1448 } 1446 }
@@ -1451,18 +1449,21 @@ xfs_qm_init_quotainos(
1451 XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA); 1449 XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA);
1452 error = xfs_qm_qino_alloc(mp, &gip, 1450 error = xfs_qm_qino_alloc(mp, &gip,
1453 sbflags | XFS_SB_GQUOTINO, flags); 1451 sbflags | XFS_SB_GQUOTINO, flags);
1454 if (error) { 1452 if (error)
1455 if (uip) 1453 goto error_rele;
1456 IRELE(uip);
1457
1458 return XFS_ERROR(error);
1459 }
1460 } 1454 }
1461 1455
1462 mp->m_quotainfo->qi_uquotaip = uip; 1456 mp->m_quotainfo->qi_uquotaip = uip;
1463 mp->m_quotainfo->qi_gquotaip = gip; 1457 mp->m_quotainfo->qi_gquotaip = gip;
1464 1458
1465 return 0; 1459 return 0;
1460
1461error_rele:
1462 if (uip)
1463 IRELE(uip);
1464 if (gip)
1465 IRELE(gip);
1466 return XFS_ERROR(error);
1466} 1467}
1467 1468
1468STATIC void 1469STATIC void
@@ -1473,7 +1474,7 @@ xfs_qm_dqfree_one(
1473 struct xfs_quotainfo *qi = mp->m_quotainfo; 1474 struct xfs_quotainfo *qi = mp->m_quotainfo;
1474 1475
1475 mutex_lock(&qi->qi_tree_lock); 1476 mutex_lock(&qi->qi_tree_lock);
1476 radix_tree_delete(XFS_DQUOT_TREE(qi, dqp->q_core.d_flags), 1477 radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags),
1477 be32_to_cpu(dqp->q_core.d_id)); 1478 be32_to_cpu(dqp->q_core.d_id));
1478 1479
1479 qi->qi_dquots--; 1480 qi->qi_dquots--;
@@ -1659,7 +1660,8 @@ xfs_qm_vop_dqalloc(
1659 struct xfs_dquot **O_gdqpp) 1660 struct xfs_dquot **O_gdqpp)
1660{ 1661{
1661 struct xfs_mount *mp = ip->i_mount; 1662 struct xfs_mount *mp = ip->i_mount;
1662 struct xfs_dquot *uq, *gq; 1663 struct xfs_dquot *uq = NULL;
1664 struct xfs_dquot *gq = NULL;
1663 int error; 1665 int error;
1664 uint lockflags; 1666 uint lockflags;
1665 1667
@@ -1684,7 +1686,6 @@ xfs_qm_vop_dqalloc(
1684 } 1686 }
1685 } 1687 }
1686 1688
1687 uq = gq = NULL;
1688 if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) { 1689 if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) {
1689 if (ip->i_d.di_uid != uid) { 1690 if (ip->i_d.di_uid != uid) {
1690 /* 1691 /*
@@ -1697,11 +1698,12 @@ xfs_qm_vop_dqalloc(
1697 * holding ilock. 1698 * holding ilock.
1698 */ 1699 */
1699 xfs_iunlock(ip, lockflags); 1700 xfs_iunlock(ip, lockflags);
1700 if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t) uid, 1701 error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t) uid,
1701 XFS_DQ_USER, 1702 XFS_DQ_USER,
1702 XFS_QMOPT_DQALLOC | 1703 XFS_QMOPT_DQALLOC |
1703 XFS_QMOPT_DOWARN, 1704 XFS_QMOPT_DOWARN,
1704 &uq))) { 1705 &uq);
1706 if (error) {
1705 ASSERT(error != ENOENT); 1707 ASSERT(error != ENOENT);
1706 return error; 1708 return error;
1707 } 1709 }
@@ -1723,15 +1725,14 @@ xfs_qm_vop_dqalloc(
1723 if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) { 1725 if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) {
1724 if (ip->i_d.di_gid != gid) { 1726 if (ip->i_d.di_gid != gid) {
1725 xfs_iunlock(ip, lockflags); 1727 xfs_iunlock(ip, lockflags);
1726 if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)gid, 1728 error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)gid,
1727 XFS_DQ_GROUP, 1729 XFS_DQ_GROUP,
1728 XFS_QMOPT_DQALLOC | 1730 XFS_QMOPT_DQALLOC |
1729 XFS_QMOPT_DOWARN, 1731 XFS_QMOPT_DOWARN,
1730 &gq))) { 1732 &gq);
1731 if (uq) 1733 if (error) {
1732 xfs_qm_dqrele(uq);
1733 ASSERT(error != ENOENT); 1734 ASSERT(error != ENOENT);
1734 return error; 1735 goto error_rele;
1735 } 1736 }
1736 xfs_dqunlock(gq); 1737 xfs_dqunlock(gq);
1737 lockflags = XFS_ILOCK_SHARED; 1738 lockflags = XFS_ILOCK_SHARED;
@@ -1743,15 +1744,14 @@ xfs_qm_vop_dqalloc(
1743 } else if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) { 1744 } else if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) {
1744 if (xfs_get_projid(ip) != prid) { 1745 if (xfs_get_projid(ip) != prid) {
1745 xfs_iunlock(ip, lockflags); 1746 xfs_iunlock(ip, lockflags);
1746 if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid, 1747 error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid,
1747 XFS_DQ_PROJ, 1748 XFS_DQ_PROJ,
1748 XFS_QMOPT_DQALLOC | 1749 XFS_QMOPT_DQALLOC |
1749 XFS_QMOPT_DOWARN, 1750 XFS_QMOPT_DOWARN,
1750 &gq))) { 1751 &gq);
1751 if (uq) 1752 if (error) {
1752 xfs_qm_dqrele(uq);
1753 ASSERT(error != ENOENT); 1753 ASSERT(error != ENOENT);
1754 return (error); 1754 goto error_rele;
1755 } 1755 }
1756 xfs_dqunlock(gq); 1756 xfs_dqunlock(gq);
1757 lockflags = XFS_ILOCK_SHARED; 1757 lockflags = XFS_ILOCK_SHARED;
@@ -1774,6 +1774,11 @@ xfs_qm_vop_dqalloc(
1774 else if (gq) 1774 else if (gq)
1775 xfs_qm_dqrele(gq); 1775 xfs_qm_dqrele(gq);
1776 return 0; 1776 return 0;
1777
1778error_rele:
1779 if (uq)
1780 xfs_qm_dqrele(uq);
1781 return error;
1777} 1782}
1778 1783
1779/* 1784/*
@@ -1821,29 +1826,31 @@ xfs_qm_vop_chown(
1821 */ 1826 */
1822int 1827int
1823xfs_qm_vop_chown_reserve( 1828xfs_qm_vop_chown_reserve(
1824 xfs_trans_t *tp, 1829 struct xfs_trans *tp,
1825 xfs_inode_t *ip, 1830 struct xfs_inode *ip,
1826 xfs_dquot_t *udqp, 1831 struct xfs_dquot *udqp,
1827 xfs_dquot_t *gdqp, 1832 struct xfs_dquot *gdqp,
1828 uint flags) 1833 uint flags)
1829{ 1834{
1830 xfs_mount_t *mp = ip->i_mount; 1835 struct xfs_mount *mp = ip->i_mount;
1831 uint delblks, blkflags, prjflags = 0; 1836 uint delblks, blkflags, prjflags = 0;
1832 xfs_dquot_t *unresudq, *unresgdq, *delblksudq, *delblksgdq; 1837 struct xfs_dquot *udq_unres = NULL;
1833 int error; 1838 struct xfs_dquot *gdq_unres = NULL;
1839 struct xfs_dquot *udq_delblks = NULL;
1840 struct xfs_dquot *gdq_delblks = NULL;
1841 int error;
1834 1842
1835 1843
1836 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 1844 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
1837 ASSERT(XFS_IS_QUOTA_RUNNING(mp)); 1845 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
1838 1846
1839 delblks = ip->i_delayed_blks; 1847 delblks = ip->i_delayed_blks;
1840 delblksudq = delblksgdq = unresudq = unresgdq = NULL;
1841 blkflags = XFS_IS_REALTIME_INODE(ip) ? 1848 blkflags = XFS_IS_REALTIME_INODE(ip) ?
1842 XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS; 1849 XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS;
1843 1850
1844 if (XFS_IS_UQUOTA_ON(mp) && udqp && 1851 if (XFS_IS_UQUOTA_ON(mp) && udqp &&
1845 ip->i_d.di_uid != (uid_t)be32_to_cpu(udqp->q_core.d_id)) { 1852 ip->i_d.di_uid != (uid_t)be32_to_cpu(udqp->q_core.d_id)) {
1846 delblksudq = udqp; 1853 udq_delblks = udqp;
1847 /* 1854 /*
1848 * If there are delayed allocation blocks, then we have to 1855 * If there are delayed allocation blocks, then we have to
1849 * unreserve those from the old dquot, and add them to the 1856 * unreserve those from the old dquot, and add them to the
@@ -1851,7 +1858,7 @@ xfs_qm_vop_chown_reserve(
1851 */ 1858 */
1852 if (delblks) { 1859 if (delblks) {
1853 ASSERT(ip->i_udquot); 1860 ASSERT(ip->i_udquot);
1854 unresudq = ip->i_udquot; 1861 udq_unres = ip->i_udquot;
1855 } 1862 }
1856 } 1863 }
1857 if (XFS_IS_OQUOTA_ON(ip->i_mount) && gdqp) { 1864 if (XFS_IS_OQUOTA_ON(ip->i_mount) && gdqp) {
@@ -1862,18 +1869,19 @@ xfs_qm_vop_chown_reserve(
1862 if (prjflags || 1869 if (prjflags ||
1863 (XFS_IS_GQUOTA_ON(ip->i_mount) && 1870 (XFS_IS_GQUOTA_ON(ip->i_mount) &&
1864 ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id))) { 1871 ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id))) {
1865 delblksgdq = gdqp; 1872 gdq_delblks = gdqp;
1866 if (delblks) { 1873 if (delblks) {
1867 ASSERT(ip->i_gdquot); 1874 ASSERT(ip->i_gdquot);
1868 unresgdq = ip->i_gdquot; 1875 gdq_unres = ip->i_gdquot;
1869 } 1876 }
1870 } 1877 }
1871 } 1878 }
1872 1879
1873 if ((error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount, 1880 error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount,
1874 delblksudq, delblksgdq, ip->i_d.di_nblocks, 1, 1881 udq_delblks, gdq_delblks, ip->i_d.di_nblocks, 1,
1875 flags | blkflags | prjflags))) 1882 flags | blkflags | prjflags);
1876 return (error); 1883 if (error)
1884 return error;
1877 1885
1878 /* 1886 /*
1879 * Do the delayed blks reservations/unreservations now. Since, these 1887 * Do the delayed blks reservations/unreservations now. Since, these
@@ -1885,14 +1893,15 @@ xfs_qm_vop_chown_reserve(
1885 /* 1893 /*
1886 * Do the reservations first. Unreservation can't fail. 1894 * Do the reservations first. Unreservation can't fail.
1887 */ 1895 */
1888 ASSERT(delblksudq || delblksgdq); 1896 ASSERT(udq_delblks || gdq_delblks);
1889 ASSERT(unresudq || unresgdq); 1897 ASSERT(udq_unres || gdq_unres);
1890 if ((error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, 1898 error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
1891 delblksudq, delblksgdq, (xfs_qcnt_t)delblks, 0, 1899 udq_delblks, gdq_delblks, (xfs_qcnt_t)delblks, 0,
1892 flags | blkflags | prjflags))) 1900 flags | blkflags | prjflags);
1893 return (error); 1901 if (error)
1902 return error;
1894 xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, 1903 xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
1895 unresudq, unresgdq, -((xfs_qcnt_t)delblks), 0, 1904 udq_unres, gdq_unres, -((xfs_qcnt_t)delblks), 0,
1896 blkflags); 1905 blkflags);
1897 } 1906 }
1898 1907
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 5d16a6e6900f..bdb4f8b95207 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -69,30 +69,62 @@ typedef struct xfs_quotainfo {
69 struct shrinker qi_shrinker; 69 struct shrinker qi_shrinker;
70} xfs_quotainfo_t; 70} xfs_quotainfo_t;
71 71
72#define XFS_DQUOT_TREE(qi, type) \ 72static inline struct radix_tree_root *
73 ((type & XFS_DQ_USER) ? \ 73xfs_dquot_tree(
74 &((qi)->qi_uquota_tree) : \ 74 struct xfs_quotainfo *qi,
75 &((qi)->qi_gquota_tree)) 75 int type)
76{
77 switch (type) {
78 case XFS_DQ_USER:
79 return &qi->qi_uquota_tree;
80 case XFS_DQ_GROUP:
81 case XFS_DQ_PROJ:
82 return &qi->qi_gquota_tree;
83 default:
84 ASSERT(0);
85 }
86 return NULL;
87}
76 88
89static inline struct xfs_inode *
90xfs_dq_to_quota_inode(struct xfs_dquot *dqp)
91{
92 switch (dqp->dq_flags & XFS_DQ_ALLTYPES) {
93 case XFS_DQ_USER:
94 return dqp->q_mount->m_quotainfo->qi_uquotaip;
95 case XFS_DQ_GROUP:
96 case XFS_DQ_PROJ:
97 return dqp->q_mount->m_quotainfo->qi_gquotaip;
98 default:
99 ASSERT(0);
100 }
101 return NULL;
102}
77 103
78extern int xfs_qm_calc_dquots_per_chunk(struct xfs_mount *mp, 104extern int xfs_qm_calc_dquots_per_chunk(struct xfs_mount *mp,
79 unsigned int nbblks); 105 unsigned int nbblks);
80extern void xfs_trans_mod_dquot(xfs_trans_t *, xfs_dquot_t *, uint, long); 106extern void xfs_trans_mod_dquot(struct xfs_trans *,
81extern int xfs_trans_reserve_quota_bydquots(xfs_trans_t *, xfs_mount_t *, 107 struct xfs_dquot *, uint, long);
82 xfs_dquot_t *, xfs_dquot_t *, long, long, uint); 108extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
83extern void xfs_trans_dqjoin(xfs_trans_t *, xfs_dquot_t *); 109 struct xfs_mount *, struct xfs_dquot *,
84extern void xfs_trans_log_dquot(xfs_trans_t *, xfs_dquot_t *); 110 struct xfs_dquot *, long, long, uint);
111extern void xfs_trans_dqjoin(struct xfs_trans *, struct xfs_dquot *);
112extern void xfs_trans_log_dquot(struct xfs_trans *, struct xfs_dquot *);
85 113
86/* 114/*
87 * We keep the usr and grp dquots separately so that locking will be easier 115 * We keep the usr and grp dquots separately so that locking will be easier
88 * to do at commit time. All transactions that we know of at this point 116 * to do at commit time. All transactions that we know of at this point
89 * affect no more than two dquots of one type. Hence, the TRANS_MAXDQS value. 117 * affect no more than two dquots of one type. Hence, the TRANS_MAXDQS value.
90 */ 118 */
119enum {
120 XFS_QM_TRANS_USR = 0,
121 XFS_QM_TRANS_GRP,
122 XFS_QM_TRANS_DQTYPES
123};
91#define XFS_QM_TRANS_MAXDQS 2 124#define XFS_QM_TRANS_MAXDQS 2
92typedef struct xfs_dquot_acct { 125struct xfs_dquot_acct {
93 xfs_dqtrx_t dqa_usrdquots[XFS_QM_TRANS_MAXDQS]; 126 struct xfs_dqtrx dqs[XFS_QM_TRANS_DQTYPES][XFS_QM_TRANS_MAXDQS];
94 xfs_dqtrx_t dqa_grpdquots[XFS_QM_TRANS_MAXDQS]; 127};
95} xfs_dquot_acct_t;
96 128
97/* 129/*
98 * Users are allowed to have a usage exceeding their softlimit for 130 * Users are allowed to have a usage exceeding their softlimit for
@@ -106,22 +138,23 @@ typedef struct xfs_dquot_acct {
106#define XFS_QM_IWARNLIMIT 5 138#define XFS_QM_IWARNLIMIT 5
107#define XFS_QM_RTBWARNLIMIT 5 139#define XFS_QM_RTBWARNLIMIT 5
108 140
109extern void xfs_qm_destroy_quotainfo(xfs_mount_t *); 141extern void xfs_qm_destroy_quotainfo(struct xfs_mount *);
110extern int xfs_qm_quotacheck(xfs_mount_t *); 142extern int xfs_qm_quotacheck(struct xfs_mount *);
111extern int xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t); 143extern int xfs_qm_write_sb_changes(struct xfs_mount *, __int64_t);
112 144
113/* dquot stuff */ 145/* dquot stuff */
114extern void xfs_qm_dqpurge_all(xfs_mount_t *, uint); 146extern void xfs_qm_dqpurge_all(struct xfs_mount *, uint);
115extern void xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint); 147extern void xfs_qm_dqrele_all_inodes(struct xfs_mount *, uint);
116 148
117/* quota ops */ 149/* quota ops */
118extern int xfs_qm_scall_trunc_qfiles(xfs_mount_t *, uint); 150extern int xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint);
119extern int xfs_qm_scall_getquota(xfs_mount_t *, xfs_dqid_t, uint, 151extern int xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t,
120 fs_disk_quota_t *); 152 uint, struct fs_disk_quota *);
121extern int xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint, 153extern int xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint,
122 fs_disk_quota_t *); 154 struct fs_disk_quota *);
123extern int xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *); 155extern int xfs_qm_scall_getqstat(struct xfs_mount *,
124extern int xfs_qm_scall_quotaon(xfs_mount_t *, uint); 156 struct fs_quota_stat *);
125extern int xfs_qm_scall_quotaoff(xfs_mount_t *, uint); 157extern int xfs_qm_scall_quotaon(struct xfs_mount *, uint);
158extern int xfs_qm_scall_quotaoff(struct xfs_mount *, uint);
126 159
127#endif /* __XFS_QM_H__ */ 160#endif /* __XFS_QM_H__ */
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 6cdf6ffc36a1..a08801ae24e2 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -117,11 +117,11 @@ xfs_qm_scall_quotaoff(
117 } 117 }
118 if (flags & XFS_GQUOTA_ACCT) { 118 if (flags & XFS_GQUOTA_ACCT) {
119 dqtype |= XFS_QMOPT_GQUOTA; 119 dqtype |= XFS_QMOPT_GQUOTA;
120 flags |= (XFS_OQUOTA_CHKD | XFS_OQUOTA_ENFD); 120 flags |= (XFS_GQUOTA_CHKD | XFS_GQUOTA_ENFD);
121 inactivate_flags |= XFS_GQUOTA_ACTIVE; 121 inactivate_flags |= XFS_GQUOTA_ACTIVE;
122 } else if (flags & XFS_PQUOTA_ACCT) { 122 } else if (flags & XFS_PQUOTA_ACCT) {
123 dqtype |= XFS_QMOPT_PQUOTA; 123 dqtype |= XFS_QMOPT_PQUOTA;
124 flags |= (XFS_OQUOTA_CHKD | XFS_OQUOTA_ENFD); 124 flags |= (XFS_PQUOTA_CHKD | XFS_PQUOTA_ENFD);
125 inactivate_flags |= XFS_PQUOTA_ACTIVE; 125 inactivate_flags |= XFS_PQUOTA_ACTIVE;
126 } 126 }
127 127
@@ -335,14 +335,14 @@ xfs_qm_scall_quotaon(
335 * quota acct on ondisk without m_qflags' knowing. 335 * quota acct on ondisk without m_qflags' knowing.
336 */ 336 */
337 if (((flags & XFS_UQUOTA_ACCT) == 0 && 337 if (((flags & XFS_UQUOTA_ACCT) == 0 &&
338 (mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 && 338 (mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 &&
339 (flags & XFS_UQUOTA_ENFD)) 339 (flags & XFS_UQUOTA_ENFD)) ||
340 || 340 ((flags & XFS_GQUOTA_ACCT) == 0 &&
341 (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 &&
342 (flags & XFS_GQUOTA_ENFD)) ||
341 ((flags & XFS_PQUOTA_ACCT) == 0 && 343 ((flags & XFS_PQUOTA_ACCT) == 0 &&
342 (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 && 344 (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 &&
343 (flags & XFS_GQUOTA_ACCT) == 0 && 345 (flags & XFS_PQUOTA_ENFD))) {
344 (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 &&
345 (flags & XFS_OQUOTA_ENFD))) {
346 xfs_debug(mp, 346 xfs_debug(mp,
347 "%s: Can't enforce without acct, flags=%x sbflags=%x\n", 347 "%s: Can't enforce without acct, flags=%x sbflags=%x\n",
348 __func__, flags, mp->m_sb.sb_qflags); 348 __func__, flags, mp->m_sb.sb_qflags);
@@ -407,11 +407,11 @@ xfs_qm_scall_getqstat(
407 struct fs_quota_stat *out) 407 struct fs_quota_stat *out)
408{ 408{
409 struct xfs_quotainfo *q = mp->m_quotainfo; 409 struct xfs_quotainfo *q = mp->m_quotainfo;
410 struct xfs_inode *uip, *gip; 410 struct xfs_inode *uip = NULL;
411 bool tempuqip, tempgqip; 411 struct xfs_inode *gip = NULL;
412 bool tempuqip = false;
413 bool tempgqip = false;
412 414
413 uip = gip = NULL;
414 tempuqip = tempgqip = false;
415 memset(out, 0, sizeof(fs_quota_stat_t)); 415 memset(out, 0, sizeof(fs_quota_stat_t));
416 416
417 out->qs_version = FS_QSTAT_VERSION; 417 out->qs_version = FS_QSTAT_VERSION;
@@ -776,9 +776,12 @@ xfs_qm_scall_getquota(
776 * gets turned off. No need to confuse the user level code, 776 * gets turned off. No need to confuse the user level code,
777 * so return zeroes in that case. 777 * so return zeroes in that case.
778 */ 778 */
779 if ((!XFS_IS_UQUOTA_ENFORCED(mp) && dqp->q_core.d_flags == XFS_DQ_USER) || 779 if ((!XFS_IS_UQUOTA_ENFORCED(mp) &&
780 (!XFS_IS_OQUOTA_ENFORCED(mp) && 780 dqp->q_core.d_flags == XFS_DQ_USER) ||
781 (dqp->q_core.d_flags & (XFS_DQ_PROJ | XFS_DQ_GROUP)))) { 781 (!XFS_IS_GQUOTA_ENFORCED(mp) &&
782 dqp->q_core.d_flags == XFS_DQ_GROUP) ||
783 (!XFS_IS_PQUOTA_ENFORCED(mp) &&
784 dqp->q_core.d_flags == XFS_DQ_PROJ)) {
782 dst->d_btimer = 0; 785 dst->d_btimer = 0;
783 dst->d_itimer = 0; 786 dst->d_itimer = 0;
784 dst->d_rtbtimer = 0; 787 dst->d_rtbtimer = 0;
@@ -786,8 +789,8 @@ xfs_qm_scall_getquota(
786 789
787#ifdef DEBUG 790#ifdef DEBUG
788 if (((XFS_IS_UQUOTA_ENFORCED(mp) && dst->d_flags == FS_USER_QUOTA) || 791 if (((XFS_IS_UQUOTA_ENFORCED(mp) && dst->d_flags == FS_USER_QUOTA) ||
789 (XFS_IS_OQUOTA_ENFORCED(mp) && 792 (XFS_IS_GQUOTA_ENFORCED(mp) && dst->d_flags == FS_GROUP_QUOTA) ||
790 (dst->d_flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)))) && 793 (XFS_IS_PQUOTA_ENFORCED(mp) && dst->d_flags == FS_PROJ_QUOTA)) &&
791 dst->d_id != 0) { 794 dst->d_id != 0) {
792 if ((dst->d_bcount > dst->d_blk_softlimit) && 795 if ((dst->d_bcount > dst->d_blk_softlimit) &&
793 (dst->d_blk_softlimit > 0)) { 796 (dst->d_blk_softlimit > 0)) {
@@ -833,16 +836,16 @@ xfs_qm_export_flags(
833 uflags = 0; 836 uflags = 0;
834 if (flags & XFS_UQUOTA_ACCT) 837 if (flags & XFS_UQUOTA_ACCT)
835 uflags |= FS_QUOTA_UDQ_ACCT; 838 uflags |= FS_QUOTA_UDQ_ACCT;
836 if (flags & XFS_PQUOTA_ACCT)
837 uflags |= FS_QUOTA_PDQ_ACCT;
838 if (flags & XFS_GQUOTA_ACCT) 839 if (flags & XFS_GQUOTA_ACCT)
839 uflags |= FS_QUOTA_GDQ_ACCT; 840 uflags |= FS_QUOTA_GDQ_ACCT;
841 if (flags & XFS_PQUOTA_ACCT)
842 uflags |= FS_QUOTA_PDQ_ACCT;
840 if (flags & XFS_UQUOTA_ENFD) 843 if (flags & XFS_UQUOTA_ENFD)
841 uflags |= FS_QUOTA_UDQ_ENFD; 844 uflags |= FS_QUOTA_UDQ_ENFD;
842 if (flags & (XFS_OQUOTA_ENFD)) { 845 if (flags & XFS_GQUOTA_ENFD)
843 uflags |= (flags & XFS_GQUOTA_ACCT) ? 846 uflags |= FS_QUOTA_GDQ_ENFD;
844 FS_QUOTA_GDQ_ENFD : FS_QUOTA_PDQ_ENFD; 847 if (flags & XFS_PQUOTA_ENFD)
845 } 848 uflags |= FS_QUOTA_PDQ_ENFD;
846 return (uflags); 849 return (uflags);
847} 850}
848 851
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index c38068f26c55..c3483bab9cde 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -161,30 +161,42 @@ typedef struct xfs_qoff_logformat {
161#define XFS_GQUOTA_ACCT 0x0040 /* group quota accounting ON */ 161#define XFS_GQUOTA_ACCT 0x0040 /* group quota accounting ON */
162 162
163/* 163/*
164 * Conversion to and from the combined OQUOTA flag (if necessary)
165 * is done only in xfs_sb_qflags_to_disk() and xfs_sb_qflags_from_disk()
166 */
167#define XFS_GQUOTA_ENFD 0x0080 /* group quota limits enforced */
168#define XFS_GQUOTA_CHKD 0x0100 /* quotacheck run on group quotas */
169#define XFS_PQUOTA_ENFD 0x0200 /* project quota limits enforced */
170#define XFS_PQUOTA_CHKD 0x0400 /* quotacheck run on project quotas */
171
172/*
164 * Quota Accounting/Enforcement flags 173 * Quota Accounting/Enforcement flags
165 */ 174 */
166#define XFS_ALL_QUOTA_ACCT \ 175#define XFS_ALL_QUOTA_ACCT \
167 (XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT | XFS_PQUOTA_ACCT) 176 (XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT | XFS_PQUOTA_ACCT)
168#define XFS_ALL_QUOTA_ENFD (XFS_UQUOTA_ENFD | XFS_OQUOTA_ENFD) 177#define XFS_ALL_QUOTA_ENFD \
169#define XFS_ALL_QUOTA_CHKD (XFS_UQUOTA_CHKD | XFS_OQUOTA_CHKD) 178 (XFS_UQUOTA_ENFD | XFS_GQUOTA_ENFD | XFS_PQUOTA_ENFD)
179#define XFS_ALL_QUOTA_CHKD \
180 (XFS_UQUOTA_CHKD | XFS_GQUOTA_CHKD | XFS_PQUOTA_CHKD)
170 181
171#define XFS_IS_QUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT) 182#define XFS_IS_QUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT)
172#define XFS_IS_UQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT) 183#define XFS_IS_UQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT)
173#define XFS_IS_PQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_PQUOTA_ACCT) 184#define XFS_IS_PQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_PQUOTA_ACCT)
174#define XFS_IS_GQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT) 185#define XFS_IS_GQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT)
175#define XFS_IS_UQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_UQUOTA_ENFD) 186#define XFS_IS_UQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_UQUOTA_ENFD)
176#define XFS_IS_OQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_OQUOTA_ENFD) 187#define XFS_IS_GQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_GQUOTA_ENFD)
188#define XFS_IS_PQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_PQUOTA_ENFD)
177 189
178/* 190/*
179 * Incore only flags for quotaoff - these bits get cleared when quota(s) 191 * Incore only flags for quotaoff - these bits get cleared when quota(s)
180 * are in the process of getting turned off. These flags are in m_qflags but 192 * are in the process of getting turned off. These flags are in m_qflags but
181 * never in sb_qflags. 193 * never in sb_qflags.
182 */ 194 */
183#define XFS_UQUOTA_ACTIVE 0x0100 /* uquotas are being turned off */ 195#define XFS_UQUOTA_ACTIVE 0x1000 /* uquotas are being turned off */
184#define XFS_PQUOTA_ACTIVE 0x0200 /* pquotas are being turned off */ 196#define XFS_GQUOTA_ACTIVE 0x2000 /* gquotas are being turned off */
185#define XFS_GQUOTA_ACTIVE 0x0400 /* gquotas are being turned off */ 197#define XFS_PQUOTA_ACTIVE 0x4000 /* pquotas are being turned off */
186#define XFS_ALL_QUOTA_ACTIVE \ 198#define XFS_ALL_QUOTA_ACTIVE \
187 (XFS_UQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE) 199 (XFS_UQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE)
188 200
189/* 201/*
190 * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees 202 * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees
@@ -268,24 +280,23 @@ typedef struct xfs_qoff_logformat {
268 ((XFS_IS_UQUOTA_ON(mp) && \ 280 ((XFS_IS_UQUOTA_ON(mp) && \
269 (mp->m_sb.sb_qflags & XFS_UQUOTA_CHKD) == 0) || \ 281 (mp->m_sb.sb_qflags & XFS_UQUOTA_CHKD) == 0) || \
270 (XFS_IS_GQUOTA_ON(mp) && \ 282 (XFS_IS_GQUOTA_ON(mp) && \
271 ((mp->m_sb.sb_qflags & XFS_OQUOTA_CHKD) == 0 || \ 283 (mp->m_sb.sb_qflags & XFS_GQUOTA_CHKD) == 0) || \
272 (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT))) || \
273 (XFS_IS_PQUOTA_ON(mp) && \ 284 (XFS_IS_PQUOTA_ON(mp) && \
274 ((mp->m_sb.sb_qflags & XFS_OQUOTA_CHKD) == 0 || \ 285 (mp->m_sb.sb_qflags & XFS_PQUOTA_CHKD) == 0))
275 (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT))))
276 286
277#define XFS_MOUNT_QUOTA_SET1 (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\ 287#define XFS_MOUNT_QUOTA_SET1 (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
278 XFS_UQUOTA_CHKD|XFS_PQUOTA_ACCT|\ 288 XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\
279 XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD) 289 XFS_GQUOTA_ENFD|XFS_GQUOTA_CHKD)
280 290
281#define XFS_MOUNT_QUOTA_SET2 (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\ 291#define XFS_MOUNT_QUOTA_SET2 (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
282 XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\ 292 XFS_UQUOTA_CHKD|XFS_PQUOTA_ACCT|\
283 XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD) 293 XFS_PQUOTA_ENFD|XFS_PQUOTA_CHKD)
284 294
285#define XFS_MOUNT_QUOTA_ALL (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\ 295#define XFS_MOUNT_QUOTA_ALL (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
286 XFS_UQUOTA_CHKD|XFS_PQUOTA_ACCT|\ 296 XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\
287 XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD|\ 297 XFS_GQUOTA_ENFD|XFS_GQUOTA_CHKD|\
288 XFS_GQUOTA_ACCT) 298 XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD|\
299 XFS_PQUOTA_CHKD)
289 300
290 301
291/* 302/*
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index 71926d630527..20e30f93b0c7 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -75,8 +75,10 @@ xfs_fs_set_xstate(
75 flags |= XFS_GQUOTA_ACCT; 75 flags |= XFS_GQUOTA_ACCT;
76 if (uflags & FS_QUOTA_UDQ_ENFD) 76 if (uflags & FS_QUOTA_UDQ_ENFD)
77 flags |= XFS_UQUOTA_ENFD; 77 flags |= XFS_UQUOTA_ENFD;
78 if (uflags & (FS_QUOTA_PDQ_ENFD|FS_QUOTA_GDQ_ENFD)) 78 if (uflags & FS_QUOTA_GDQ_ENFD)
79 flags |= XFS_OQUOTA_ENFD; 79 flags |= XFS_GQUOTA_ENFD;
80 if (uflags & FS_QUOTA_PDQ_ENFD)
81 flags |= XFS_PQUOTA_ENFD;
80 82
81 switch (op) { 83 switch (op) {
82 case Q_XQUOTAON: 84 case Q_XQUOTAON:
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index 2de58a85833c..78f9e70b80c7 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -618,6 +618,12 @@ xfs_sb_has_incompat_log_feature(
618 return (sbp->sb_features_log_incompat & feature) != 0; 618 return (sbp->sb_features_log_incompat & feature) != 0;
619} 619}
620 620
621static inline bool
622xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino)
623{
624 return (ino == sbp->sb_uquotino || ino == sbp->sb_gquotino);
625}
626
621/* 627/*
622 * end of superblock version macros 628 * end of superblock version macros
623 */ 629 */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 3033ba5e9762..1d68ffcdeaa7 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -51,6 +51,7 @@
51#include "xfs_inode_item.h" 51#include "xfs_inode_item.h"
52#include "xfs_icache.h" 52#include "xfs_icache.h"
53#include "xfs_trace.h" 53#include "xfs_trace.h"
54#include "xfs_icreate_item.h"
54 55
55#include <linux/namei.h> 56#include <linux/namei.h>
56#include <linux/init.h> 57#include <linux/init.h>
@@ -359,17 +360,17 @@ xfs_parseargs(
359 } else if (!strcmp(this_char, MNTOPT_PQUOTA) || 360 } else if (!strcmp(this_char, MNTOPT_PQUOTA) ||
360 !strcmp(this_char, MNTOPT_PRJQUOTA)) { 361 !strcmp(this_char, MNTOPT_PRJQUOTA)) {
361 mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE | 362 mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE |
362 XFS_OQUOTA_ENFD); 363 XFS_PQUOTA_ENFD);
363 } else if (!strcmp(this_char, MNTOPT_PQUOTANOENF)) { 364 } else if (!strcmp(this_char, MNTOPT_PQUOTANOENF)) {
364 mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE); 365 mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE);
365 mp->m_qflags &= ~XFS_OQUOTA_ENFD; 366 mp->m_qflags &= ~XFS_PQUOTA_ENFD;
366 } else if (!strcmp(this_char, MNTOPT_GQUOTA) || 367 } else if (!strcmp(this_char, MNTOPT_GQUOTA) ||
367 !strcmp(this_char, MNTOPT_GRPQUOTA)) { 368 !strcmp(this_char, MNTOPT_GRPQUOTA)) {
368 mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE | 369 mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE |
369 XFS_OQUOTA_ENFD); 370 XFS_GQUOTA_ENFD);
370 } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) { 371 } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) {
371 mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE); 372 mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
372 mp->m_qflags &= ~XFS_OQUOTA_ENFD; 373 mp->m_qflags &= ~XFS_GQUOTA_ENFD;
373 } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) { 374 } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) {
374 xfs_warn(mp, 375 xfs_warn(mp,
375 "delaylog is the default now, option is deprecated."); 376 "delaylog is the default now, option is deprecated.");
@@ -439,20 +440,15 @@ xfs_parseargs(
439 } 440 }
440 441
441done: 442done:
442 if (!(mp->m_flags & XFS_MOUNT_NOALIGN)) { 443 if (dsunit && !(mp->m_flags & XFS_MOUNT_NOALIGN)) {
443 /* 444 /*
444 * At this point the superblock has not been read 445 * At this point the superblock has not been read
445 * in, therefore we do not know the block size. 446 * in, therefore we do not know the block size.
446 * Before the mount call ends we will convert 447 * Before the mount call ends we will convert
447 * these to FSBs. 448 * these to FSBs.
448 */ 449 */
449 if (dsunit) { 450 mp->m_dalign = dsunit;
450 mp->m_dalign = dsunit; 451 mp->m_swidth = dswidth;
451 mp->m_flags |= XFS_MOUNT_RETERR;
452 }
453
454 if (dswidth)
455 mp->m_swidth = dswidth;
456 } 452 }
457 453
458 if (mp->m_logbufs != -1 && 454 if (mp->m_logbufs != -1 &&
@@ -563,12 +559,12 @@ xfs_showargs(
563 /* Either project or group quotas can be active, not both */ 559 /* Either project or group quotas can be active, not both */
564 560
565 if (mp->m_qflags & XFS_PQUOTA_ACCT) { 561 if (mp->m_qflags & XFS_PQUOTA_ACCT) {
566 if (mp->m_qflags & XFS_OQUOTA_ENFD) 562 if (mp->m_qflags & XFS_PQUOTA_ENFD)
567 seq_puts(m, "," MNTOPT_PRJQUOTA); 563 seq_puts(m, "," MNTOPT_PRJQUOTA);
568 else 564 else
569 seq_puts(m, "," MNTOPT_PQUOTANOENF); 565 seq_puts(m, "," MNTOPT_PQUOTANOENF);
570 } else if (mp->m_qflags & XFS_GQUOTA_ACCT) { 566 } else if (mp->m_qflags & XFS_GQUOTA_ACCT) {
571 if (mp->m_qflags & XFS_OQUOTA_ENFD) 567 if (mp->m_qflags & XFS_GQUOTA_ENFD)
572 seq_puts(m, "," MNTOPT_GRPQUOTA); 568 seq_puts(m, "," MNTOPT_GRPQUOTA);
573 else 569 else
574 seq_puts(m, "," MNTOPT_GQUOTANOENF); 570 seq_puts(m, "," MNTOPT_GQUOTANOENF);
@@ -1136,8 +1132,8 @@ xfs_fs_statfs(
1136 spin_unlock(&mp->m_sb_lock); 1132 spin_unlock(&mp->m_sb_lock);
1137 1133
1138 if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && 1134 if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
1139 ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))) == 1135 ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))) ==
1140 (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD)) 1136 (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))
1141 xfs_qm_statvfs(ip, statp); 1137 xfs_qm_statvfs(ip, statp);
1142 return 0; 1138 return 0;
1143} 1139}
@@ -1481,6 +1477,10 @@ xfs_fs_fill_super(
1481 sb->s_time_gran = 1; 1477 sb->s_time_gran = 1;
1482 set_posix_acl_flag(sb); 1478 set_posix_acl_flag(sb);
1483 1479
1480 /* version 5 superblocks support inode version counters. */
1481 if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5)
1482 sb->s_flags |= MS_I_VERSION;
1483
1484 error = xfs_mountfs(mp); 1484 error = xfs_mountfs(mp);
1485 if (error) 1485 if (error)
1486 goto out_filestream_unmount; 1486 goto out_filestream_unmount;
@@ -1655,9 +1655,15 @@ xfs_init_zones(void)
1655 KM_ZONE_SPREAD, NULL); 1655 KM_ZONE_SPREAD, NULL);
1656 if (!xfs_ili_zone) 1656 if (!xfs_ili_zone)
1657 goto out_destroy_inode_zone; 1657 goto out_destroy_inode_zone;
1658 xfs_icreate_zone = kmem_zone_init(sizeof(struct xfs_icreate_item),
1659 "xfs_icr");
1660 if (!xfs_icreate_zone)
1661 goto out_destroy_ili_zone;
1658 1662
1659 return 0; 1663 return 0;
1660 1664
1665 out_destroy_ili_zone:
1666 kmem_zone_destroy(xfs_ili_zone);
1661 out_destroy_inode_zone: 1667 out_destroy_inode_zone:
1662 kmem_zone_destroy(xfs_inode_zone); 1668 kmem_zone_destroy(xfs_inode_zone);
1663 out_destroy_efi_zone: 1669 out_destroy_efi_zone:
@@ -1696,6 +1702,7 @@ xfs_destroy_zones(void)
1696 * destroy caches. 1702 * destroy caches.
1697 */ 1703 */
1698 rcu_barrier(); 1704 rcu_barrier();
1705 kmem_zone_destroy(xfs_icreate_zone);
1699 kmem_zone_destroy(xfs_ili_zone); 1706 kmem_zone_destroy(xfs_ili_zone);
1700 kmem_zone_destroy(xfs_inode_zone); 1707 kmem_zone_destroy(xfs_inode_zone);
1701 kmem_zone_destroy(xfs_efi_zone); 1708 kmem_zone_destroy(xfs_efi_zone);
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 195a403e1522..e830fb56e27f 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -358,7 +358,8 @@ xfs_symlink(
358 int n; 358 int n;
359 xfs_buf_t *bp; 359 xfs_buf_t *bp;
360 prid_t prid; 360 prid_t prid;
361 struct xfs_dquot *udqp, *gdqp; 361 struct xfs_dquot *udqp = NULL;
362 struct xfs_dquot *gdqp = NULL;
362 uint resblks; 363 uint resblks;
363 364
364 *ipp = NULL; 365 *ipp = NULL;
@@ -585,7 +586,7 @@ xfs_symlink(
585/* 586/*
586 * Free a symlink that has blocks associated with it. 587 * Free a symlink that has blocks associated with it.
587 */ 588 */
588int 589STATIC int
589xfs_inactive_symlink_rmt( 590xfs_inactive_symlink_rmt(
590 xfs_inode_t *ip, 591 xfs_inode_t *ip,
591 xfs_trans_t **tpp) 592 xfs_trans_t **tpp)
@@ -606,7 +607,7 @@ xfs_inactive_symlink_rmt(
606 607
607 tp = *tpp; 608 tp = *tpp;
608 mp = ip->i_mount; 609 mp = ip->i_mount;
609 ASSERT(ip->i_d.di_size > XFS_IFORK_DSIZE(ip)); 610 ASSERT(ip->i_df.if_flags & XFS_IFEXTENTS);
610 /* 611 /*
611 * We're freeing a symlink that has some 612 * We're freeing a symlink that has some
612 * blocks allocated to it. Free the 613 * blocks allocated to it. Free the
@@ -720,3 +721,47 @@ xfs_inactive_symlink_rmt(
720 error0: 721 error0:
721 return error; 722 return error;
722} 723}
724
725/*
726 * xfs_inactive_symlink - free a symlink
727 */
728int
729xfs_inactive_symlink(
730 struct xfs_inode *ip,
731 struct xfs_trans **tp)
732{
733 struct xfs_mount *mp = ip->i_mount;
734 int pathlen;
735
736 trace_xfs_inactive_symlink(ip);
737
738 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
739
740 if (XFS_FORCED_SHUTDOWN(mp))
741 return XFS_ERROR(EIO);
742
743 /*
744 * Zero length symlinks _can_ exist.
745 */
746 pathlen = (int)ip->i_d.di_size;
747 if (!pathlen)
748 return 0;
749
750 if (pathlen < 0 || pathlen > MAXPATHLEN) {
751 xfs_alert(mp, "%s: inode (0x%llx) bad symlink length (%d)",
752 __func__, (unsigned long long)ip->i_ino, pathlen);
753 ASSERT(0);
754 return XFS_ERROR(EFSCORRUPTED);
755 }
756
757 if (ip->i_df.if_flags & XFS_IFINLINE) {
758 if (ip->i_df.if_bytes > 0)
759 xfs_idata_realloc(ip, -(ip->i_df.if_bytes),
760 XFS_DATA_FORK);
761 ASSERT(ip->i_df.if_bytes == 0);
762 return 0;
763 }
764
765 /* remove the remote symlink */
766 return xfs_inactive_symlink_rmt(ip, tp);
767}
diff --git a/fs/xfs/xfs_symlink.h b/fs/xfs/xfs_symlink.h
index b39398d2097c..374394880c01 100644
--- a/fs/xfs/xfs_symlink.h
+++ b/fs/xfs/xfs_symlink.h
@@ -60,7 +60,7 @@ extern const struct xfs_buf_ops xfs_symlink_buf_ops;
60int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name, 60int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
61 const char *target_path, umode_t mode, struct xfs_inode **ipp); 61 const char *target_path, umode_t mode, struct xfs_inode **ipp);
62int xfs_readlink(struct xfs_inode *ip, char *link); 62int xfs_readlink(struct xfs_inode *ip, char *link);
63int xfs_inactive_symlink_rmt(struct xfs_inode *ip, struct xfs_trans **tpp); 63int xfs_inactive_symlink(struct xfs_inode *ip, struct xfs_trans **tpp);
64 64
65#endif /* __KERNEL__ */ 65#endif /* __KERNEL__ */
66#endif /* __XFS_SYMLINK_H */ 66#endif /* __XFS_SYMLINK_H */
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index 2801b5ce6cdb..1743b9f8e23d 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -25,11 +25,11 @@ static struct ctl_table_header *xfs_table_header;
25#ifdef CONFIG_PROC_FS 25#ifdef CONFIG_PROC_FS
26STATIC int 26STATIC int
27xfs_stats_clear_proc_handler( 27xfs_stats_clear_proc_handler(
28 ctl_table *ctl, 28 struct ctl_table *ctl,
29 int write, 29 int write,
30 void __user *buffer, 30 void __user *buffer,
31 size_t *lenp, 31 size_t *lenp,
32 loff_t *ppos) 32 loff_t *ppos)
33{ 33{
34 int c, ret, *valp = ctl->data; 34 int c, ret, *valp = ctl->data;
35 __uint32_t vn_active; 35 __uint32_t vn_active;
@@ -55,11 +55,11 @@ xfs_stats_clear_proc_handler(
55 55
56STATIC int 56STATIC int
57xfs_panic_mask_proc_handler( 57xfs_panic_mask_proc_handler(
58 ctl_table *ctl, 58 struct ctl_table *ctl,
59 int write, 59 int write,
60 void __user *buffer, 60 void __user *buffer,
61 size_t *lenp, 61 size_t *lenp,
62 loff_t *ppos) 62 loff_t *ppos)
63{ 63{
64 int ret, *valp = ctl->data; 64 int ret, *valp = ctl->data;
65 65
@@ -74,7 +74,7 @@ xfs_panic_mask_proc_handler(
74} 74}
75#endif /* CONFIG_PROC_FS */ 75#endif /* CONFIG_PROC_FS */
76 76
77static ctl_table xfs_table[] = { 77static struct ctl_table xfs_table[] = {
78 { 78 {
79 .procname = "irix_sgid_inherit", 79 .procname = "irix_sgid_inherit",
80 .data = &xfs_params.sgid_inherit.val, 80 .data = &xfs_params.sgid_inherit.val,
@@ -227,7 +227,7 @@ static ctl_table xfs_table[] = {
227 {} 227 {}
228}; 228};
229 229
230static ctl_table xfs_dir_table[] = { 230static struct ctl_table xfs_dir_table[] = {
231 { 231 {
232 .procname = "xfs", 232 .procname = "xfs",
233 .mode = 0555, 233 .mode = 0555,
@@ -236,7 +236,7 @@ static ctl_table xfs_dir_table[] = {
236 {} 236 {}
237}; 237};
238 238
239static ctl_table xfs_root_table[] = { 239static struct ctl_table xfs_root_table[] = {
240 { 240 {
241 .procname = "fs", 241 .procname = "fs",
242 .mode = 0555, 242 .mode = 0555,
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index a04701de6bbd..47910e638c18 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -486,9 +486,12 @@ DEFINE_EVENT(xfs_buf_item_class, name, \
486 TP_PROTO(struct xfs_buf_log_item *bip), \ 486 TP_PROTO(struct xfs_buf_log_item *bip), \
487 TP_ARGS(bip)) 487 TP_ARGS(bip))
488DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size); 488DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size);
489DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_ordered);
489DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale); 490DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale);
490DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format); 491DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format);
492DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_ordered);
491DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale); 493DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale);
494DEFINE_BUF_ITEM_EVENT(xfs_buf_item_ordered);
492DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin); 495DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin);
493DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin); 496DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin);
494DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale); 497DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale);
@@ -508,6 +511,7 @@ DEFINE_BUF_ITEM_EVENT(xfs_trans_bjoin);
508DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold); 511DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold);
509DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release); 512DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release);
510DEFINE_BUF_ITEM_EVENT(xfs_trans_binval); 513DEFINE_BUF_ITEM_EVENT(xfs_trans_binval);
514DEFINE_BUF_ITEM_EVENT(xfs_trans_buf_ordered);
511 515
512DECLARE_EVENT_CLASS(xfs_lock_class, 516DECLARE_EVENT_CLASS(xfs_lock_class,
513 TP_PROTO(struct xfs_inode *ip, unsigned lock_flags, 517 TP_PROTO(struct xfs_inode *ip, unsigned lock_flags,
@@ -571,6 +575,7 @@ DEFINE_INODE_EVENT(xfs_iget_miss);
571DEFINE_INODE_EVENT(xfs_getattr); 575DEFINE_INODE_EVENT(xfs_getattr);
572DEFINE_INODE_EVENT(xfs_setattr); 576DEFINE_INODE_EVENT(xfs_setattr);
573DEFINE_INODE_EVENT(xfs_readlink); 577DEFINE_INODE_EVENT(xfs_readlink);
578DEFINE_INODE_EVENT(xfs_inactive_symlink);
574DEFINE_INODE_EVENT(xfs_alloc_file_space); 579DEFINE_INODE_EVENT(xfs_alloc_file_space);
575DEFINE_INODE_EVENT(xfs_free_file_space); 580DEFINE_INODE_EVENT(xfs_free_file_space);
576DEFINE_INODE_EVENT(xfs_readdir); 581DEFINE_INODE_EVENT(xfs_readdir);
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 2fd7c1ff1d21..35a229981354 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -234,71 +234,93 @@ xfs_calc_remove_reservation(
234} 234}
235 235
236/* 236/*
237 * For symlink we can modify: 237 * For create, break it in to the two cases that the transaction
238 * covers. We start with the modify case - allocation done by modification
239 * of the state of existing inodes - and the allocation case.
240 */
241
242/*
243 * For create we can modify:
238 * the parent directory inode: inode size 244 * the parent directory inode: inode size
239 * the new inode: inode size 245 * the new inode: inode size
240 * the inode btree entry: 1 block 246 * the inode btree entry: block size
247 * the superblock for the nlink flag: sector size
241 * the directory btree: (max depth + v2) * dir block size 248 * the directory btree: (max depth + v2) * dir block size
242 * the directory inode's bmap btree: (max depth + v2) * block size 249 * the directory inode's bmap btree: (max depth + v2) * block size
243 * the blocks for the symlink: 1 kB 250 */
244 * Or in the first xact we allocate some inodes giving: 251STATIC uint
252xfs_calc_create_resv_modify(
253 struct xfs_mount *mp)
254{
255 return xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) +
256 xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
257 (uint)XFS_FSB_TO_B(mp, 1) +
258 xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1));
259}
260
261/*
262 * For create we can allocate some inodes giving:
245 * the agi and agf of the ag getting the new inodes: 2 * sectorsize 263 * the agi and agf of the ag getting the new inodes: 2 * sectorsize
264 * the superblock for the nlink flag: sector size
246 * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize 265 * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
247 * the inode btree: max depth * blocksize 266 * the inode btree: max depth * blocksize
248 * the allocation btrees: 2 trees * (2 * max depth - 1) * block size 267 * the allocation btrees: 2 trees * (max depth - 1) * block size
249 */ 268 */
250STATIC uint 269STATIC uint
251xfs_calc_symlink_reservation( 270xfs_calc_create_resv_alloc(
271 struct xfs_mount *mp)
272{
273 return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
274 mp->m_sb.sb_sectsize +
275 xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp), XFS_FSB_TO_B(mp, 1)) +
276 xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
277 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
278 XFS_FSB_TO_B(mp, 1));
279}
280
281STATIC uint
282__xfs_calc_create_reservation(
252 struct xfs_mount *mp) 283 struct xfs_mount *mp)
253{ 284{
254 return XFS_DQUOT_LOGRES(mp) + 285 return XFS_DQUOT_LOGRES(mp) +
255 MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) + 286 MAX(xfs_calc_create_resv_alloc(mp),
256 xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) + 287 xfs_calc_create_resv_modify(mp));
257 xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
258 XFS_FSB_TO_B(mp, 1)) +
259 xfs_calc_buf_res(1, 1024)),
260 (xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
261 xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp),
262 XFS_FSB_TO_B(mp, 1)) +
263 xfs_calc_buf_res(mp->m_in_maxlevels,
264 XFS_FSB_TO_B(mp, 1)) +
265 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
266 XFS_FSB_TO_B(mp, 1))));
267} 288}
268 289
269/* 290/*
270 * For create we can modify: 291 * For icreate we can allocate some inodes giving:
271 * the parent directory inode: inode size
272 * the new inode: inode size
273 * the inode btree entry: block size
274 * the superblock for the nlink flag: sector size
275 * the directory btree: (max depth + v2) * dir block size
276 * the directory inode's bmap btree: (max depth + v2) * block size
277 * Or in the first xact we allocate some inodes giving:
278 * the agi and agf of the ag getting the new inodes: 2 * sectorsize 292 * the agi and agf of the ag getting the new inodes: 2 * sectorsize
279 * the superblock for the nlink flag: sector size 293 * the superblock for the nlink flag: sector size
280 * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
281 * the inode btree: max depth * blocksize 294 * the inode btree: max depth * blocksize
282 * the allocation btrees: 2 trees * (max depth - 1) * block size 295 * the allocation btrees: 2 trees * (max depth - 1) * block size
283 */ 296 */
284STATIC uint 297STATIC uint
285xfs_calc_create_reservation( 298xfs_calc_icreate_resv_alloc(
286 struct xfs_mount *mp) 299 struct xfs_mount *mp)
287{ 300{
301 return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
302 mp->m_sb.sb_sectsize +
303 xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
304 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
305 XFS_FSB_TO_B(mp, 1));
306}
307
308STATIC uint
309xfs_calc_icreate_reservation(xfs_mount_t *mp)
310{
288 return XFS_DQUOT_LOGRES(mp) + 311 return XFS_DQUOT_LOGRES(mp) +
289 MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) + 312 MAX(xfs_calc_icreate_resv_alloc(mp),
290 xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + 313 xfs_calc_create_resv_modify(mp));
291 (uint)XFS_FSB_TO_B(mp, 1) + 314}
292 xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), 315
293 XFS_FSB_TO_B(mp, 1))), 316STATIC uint
294 (xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + 317xfs_calc_create_reservation(
295 mp->m_sb.sb_sectsize + 318 struct xfs_mount *mp)
296 xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp), 319{
297 XFS_FSB_TO_B(mp, 1)) + 320 if (xfs_sb_version_hascrc(&mp->m_sb))
298 xfs_calc_buf_res(mp->m_in_maxlevels, 321 return xfs_calc_icreate_reservation(mp);
299 XFS_FSB_TO_B(mp, 1)) + 322 return __xfs_calc_create_reservation(mp);
300 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), 323
301 XFS_FSB_TO_B(mp, 1))));
302} 324}
303 325
304/* 326/*
@@ -311,6 +333,20 @@ xfs_calc_mkdir_reservation(
311 return xfs_calc_create_reservation(mp); 333 return xfs_calc_create_reservation(mp);
312} 334}
313 335
336
337/*
338 * Making a new symplink is the same as creating a new file, but
339 * with the added blocks for remote symlink data which can be up to 1kB in
340 * length (MAXPATHLEN).
341 */
342STATIC uint
343xfs_calc_symlink_reservation(
344 struct xfs_mount *mp)
345{
346 return xfs_calc_create_reservation(mp) +
347 xfs_calc_buf_res(1, MAXPATHLEN);
348}
349
314/* 350/*
315 * In freeing an inode we can modify: 351 * In freeing an inode we can modify:
316 * the inode being freed: inode size 352 * the inode being freed: inode size
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index a44dba5b2cdb..2b4946393e30 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -48,6 +48,7 @@ typedef struct xfs_trans_header {
48#define XFS_LI_BUF 0x123c /* v2 bufs, variable sized inode bufs */ 48#define XFS_LI_BUF 0x123c /* v2 bufs, variable sized inode bufs */
49#define XFS_LI_DQUOT 0x123d 49#define XFS_LI_DQUOT 0x123d
50#define XFS_LI_QUOTAOFF 0x123e 50#define XFS_LI_QUOTAOFF 0x123e
51#define XFS_LI_ICREATE 0x123f
51 52
52#define XFS_LI_TYPE_DESC \ 53#define XFS_LI_TYPE_DESC \
53 { XFS_LI_EFI, "XFS_LI_EFI" }, \ 54 { XFS_LI_EFI, "XFS_LI_EFI" }, \
@@ -107,7 +108,8 @@ typedef struct xfs_trans_header {
107#define XFS_TRANS_SWAPEXT 40 108#define XFS_TRANS_SWAPEXT 40
108#define XFS_TRANS_SB_COUNT 41 109#define XFS_TRANS_SB_COUNT 41
109#define XFS_TRANS_CHECKPOINT 42 110#define XFS_TRANS_CHECKPOINT 42
110#define XFS_TRANS_TYPE_MAX 42 111#define XFS_TRANS_ICREATE 43
112#define XFS_TRANS_TYPE_MAX 43
111/* new transaction types need to be reflected in xfs_logprint(8) */ 113/* new transaction types need to be reflected in xfs_logprint(8) */
112 114
113#define XFS_TRANS_TYPES \ 115#define XFS_TRANS_TYPES \
@@ -210,23 +212,18 @@ struct xfs_log_item_desc {
210/* 212/*
211 * Per-extent log reservation for the allocation btree changes 213 * Per-extent log reservation for the allocation btree changes
212 * involved in freeing or allocating an extent. 214 * involved in freeing or allocating an extent.
213 * 2 trees * (2 blocks/level * max depth - 1) * block size 215 * 2 trees * (2 blocks/level * max depth - 1)
214 */ 216 */
215#define XFS_ALLOCFREE_LOG_RES(mp,nx) \
216 ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * XFS_AG_MAXLEVELS(mp) - 1)))
217#define XFS_ALLOCFREE_LOG_COUNT(mp,nx) \ 217#define XFS_ALLOCFREE_LOG_COUNT(mp,nx) \
218 ((nx) * (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1))) 218 ((nx) * (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1)))
219 219
220/* 220/*
221 * Per-directory log reservation for any directory change. 221 * Per-directory log reservation for any directory change.
222 * dir blocks: (1 btree block per level + data block + free block) * dblock size 222 * dir blocks: (1 btree block per level + data block + free block)
223 * bmap btree: (levels + 2) * max depth * block size 223 * bmap btree: (levels + 2) * max depth
224 * v2 directory blocks can be fragmented below the dirblksize down to the fsb 224 * v2 directory blocks can be fragmented below the dirblksize down to the fsb
225 * size, so account for that in the DAENTER macros. 225 * size, so account for that in the DAENTER macros.
226 */ 226 */
227#define XFS_DIROP_LOG_RES(mp) \
228 (XFS_FSB_TO_B(mp, XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK)) + \
229 (XFS_FSB_TO_B(mp, XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)))
230#define XFS_DIROP_LOG_COUNT(mp) \ 227#define XFS_DIROP_LOG_COUNT(mp) \
231 (XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \ 228 (XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \
232 XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1) 229 XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)
@@ -503,6 +500,7 @@ void xfs_trans_bhold_release(xfs_trans_t *, struct xfs_buf *);
503void xfs_trans_binval(xfs_trans_t *, struct xfs_buf *); 500void xfs_trans_binval(xfs_trans_t *, struct xfs_buf *);
504void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *); 501void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *);
505void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *); 502void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *);
503void xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *);
506void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint); 504void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
507void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); 505void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
508void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int); 506void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int);
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 73a5fa457e16..aa5a04b844d6 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -397,7 +397,6 @@ shutdown_abort:
397 return XFS_ERROR(EIO); 397 return XFS_ERROR(EIO);
398} 398}
399 399
400
401/* 400/*
402 * Release the buffer bp which was previously acquired with one of the 401 * Release the buffer bp which was previously acquired with one of the
403 * xfs_trans_... buffer allocation routines if the buffer has not 402 * xfs_trans_... buffer allocation routines if the buffer has not
@@ -603,8 +602,14 @@ xfs_trans_log_buf(xfs_trans_t *tp,
603 602
604 tp->t_flags |= XFS_TRANS_DIRTY; 603 tp->t_flags |= XFS_TRANS_DIRTY;
605 bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY; 604 bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
606 bip->bli_flags |= XFS_BLI_LOGGED; 605
607 xfs_buf_item_log(bip, first, last); 606 /*
607 * If we have an ordered buffer we are not logging any dirty range but
608 * it still needs to be marked dirty and that it has been logged.
609 */
610 bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED;
611 if (!(bip->bli_flags & XFS_BLI_ORDERED))
612 xfs_buf_item_log(bip, first, last);
608} 613}
609 614
610 615
@@ -757,6 +762,29 @@ xfs_trans_inode_alloc_buf(
757} 762}
758 763
759/* 764/*
765 * Mark the buffer as ordered for this transaction. This means
766 * that the contents of the buffer are not recorded in the transaction
767 * but it is tracked in the AIL as though it was. This allows us
768 * to record logical changes in transactions rather than the physical
769 * changes we make to the buffer without changing writeback ordering
770 * constraints of metadata buffers.
771 */
772void
773xfs_trans_ordered_buf(
774 struct xfs_trans *tp,
775 struct xfs_buf *bp)
776{
777 struct xfs_buf_log_item *bip = bp->b_fspriv;
778
779 ASSERT(bp->b_transp == tp);
780 ASSERT(bip != NULL);
781 ASSERT(atomic_read(&bip->bli_refcount) > 0);
782
783 bip->bli_flags |= XFS_BLI_ORDERED;
784 trace_xfs_buf_item_ordered(bip);
785}
786
787/*
760 * Set the type of the buffer for log recovery so that it can correctly identify 788 * Set the type of the buffer for log recovery so that it can correctly identify
761 * and hence attach the correct buffer ops to the buffer after replay. 789 * and hence attach the correct buffer ops to the buffer after replay.
762 */ 790 */
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index fec75d023703..3ba64d540168 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -103,8 +103,6 @@ xfs_trans_dup_dqinfo(
103 return; 103 return;
104 104
105 xfs_trans_alloc_dqinfo(ntp); 105 xfs_trans_alloc_dqinfo(ntp);
106 oqa = otp->t_dqinfo->dqa_usrdquots;
107 nqa = ntp->t_dqinfo->dqa_usrdquots;
108 106
109 /* 107 /*
110 * Because the quota blk reservation is carried forward, 108 * Because the quota blk reservation is carried forward,
@@ -113,7 +111,9 @@ xfs_trans_dup_dqinfo(
113 if(otp->t_flags & XFS_TRANS_DQ_DIRTY) 111 if(otp->t_flags & XFS_TRANS_DQ_DIRTY)
114 ntp->t_flags |= XFS_TRANS_DQ_DIRTY; 112 ntp->t_flags |= XFS_TRANS_DQ_DIRTY;
115 113
116 for (j = 0; j < 2; j++) { 114 for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) {
115 oqa = otp->t_dqinfo->dqs[j];
116 nqa = ntp->t_dqinfo->dqs[j];
117 for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { 117 for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
118 if (oqa[i].qt_dquot == NULL) 118 if (oqa[i].qt_dquot == NULL)
119 break; 119 break;
@@ -138,8 +138,6 @@ xfs_trans_dup_dqinfo(
138 oq->qt_ino_res = oq->qt_ino_res_used; 138 oq->qt_ino_res = oq->qt_ino_res_used;
139 139
140 } 140 }
141 oqa = otp->t_dqinfo->dqa_grpdquots;
142 nqa = ntp->t_dqinfo->dqa_grpdquots;
143 } 141 }
144} 142}
145 143
@@ -157,8 +155,7 @@ xfs_trans_mod_dquot_byino(
157 155
158 if (!XFS_IS_QUOTA_RUNNING(mp) || 156 if (!XFS_IS_QUOTA_RUNNING(mp) ||
159 !XFS_IS_QUOTA_ON(mp) || 157 !XFS_IS_QUOTA_ON(mp) ||
160 ip->i_ino == mp->m_sb.sb_uquotino || 158 xfs_is_quota_inode(&mp->m_sb, ip->i_ino))
161 ip->i_ino == mp->m_sb.sb_gquotino)
162 return; 159 return;
163 160
164 if (tp->t_dqinfo == NULL) 161 if (tp->t_dqinfo == NULL)
@@ -170,16 +167,18 @@ xfs_trans_mod_dquot_byino(
170 (void) xfs_trans_mod_dquot(tp, ip->i_gdquot, field, delta); 167 (void) xfs_trans_mod_dquot(tp, ip->i_gdquot, field, delta);
171} 168}
172 169
173STATIC xfs_dqtrx_t * 170STATIC struct xfs_dqtrx *
174xfs_trans_get_dqtrx( 171xfs_trans_get_dqtrx(
175 xfs_trans_t *tp, 172 struct xfs_trans *tp,
176 xfs_dquot_t *dqp) 173 struct xfs_dquot *dqp)
177{ 174{
178 int i; 175 int i;
179 xfs_dqtrx_t *qa; 176 struct xfs_dqtrx *qa;
180 177
181 qa = XFS_QM_ISUDQ(dqp) ? 178 if (XFS_QM_ISUDQ(dqp))
182 tp->t_dqinfo->dqa_usrdquots : tp->t_dqinfo->dqa_grpdquots; 179 qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_USR];
180 else
181 qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_GRP];
183 182
184 for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { 183 for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
185 if (qa[i].qt_dquot == NULL || 184 if (qa[i].qt_dquot == NULL ||
@@ -339,12 +338,10 @@ xfs_trans_apply_dquot_deltas(
339 return; 338 return;
340 339
341 ASSERT(tp->t_dqinfo); 340 ASSERT(tp->t_dqinfo);
342 qa = tp->t_dqinfo->dqa_usrdquots; 341 for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) {
343 for (j = 0; j < 2; j++) { 342 qa = tp->t_dqinfo->dqs[j];
344 if (qa[0].qt_dquot == NULL) { 343 if (qa[0].qt_dquot == NULL)
345 qa = tp->t_dqinfo->dqa_grpdquots;
346 continue; 344 continue;
347 }
348 345
349 /* 346 /*
350 * Lock all of the dquots and join them to the transaction. 347 * Lock all of the dquots and join them to the transaction.
@@ -495,10 +492,6 @@ xfs_trans_apply_dquot_deltas(
495 ASSERT(dqp->q_res_rtbcount >= 492 ASSERT(dqp->q_res_rtbcount >=
496 be64_to_cpu(dqp->q_core.d_rtbcount)); 493 be64_to_cpu(dqp->q_core.d_rtbcount));
497 } 494 }
498 /*
499 * Do the group quotas next
500 */
501 qa = tp->t_dqinfo->dqa_grpdquots;
502 } 495 }
503} 496}
504 497
@@ -521,9 +514,9 @@ xfs_trans_unreserve_and_mod_dquots(
521 if (!tp->t_dqinfo || !(tp->t_flags & XFS_TRANS_DQ_DIRTY)) 514 if (!tp->t_dqinfo || !(tp->t_flags & XFS_TRANS_DQ_DIRTY))
522 return; 515 return;
523 516
524 qa = tp->t_dqinfo->dqa_usrdquots; 517 for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) {
518 qa = tp->t_dqinfo->dqs[j];
525 519
526 for (j = 0; j < 2; j++) {
527 for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { 520 for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
528 qtrx = &qa[i]; 521 qtrx = &qa[i];
529 /* 522 /*
@@ -565,7 +558,6 @@ xfs_trans_unreserve_and_mod_dquots(
565 xfs_dqunlock(dqp); 558 xfs_dqunlock(dqp);
566 559
567 } 560 }
568 qa = tp->t_dqinfo->dqa_grpdquots;
569 } 561 }
570} 562}
571 563
@@ -640,8 +632,8 @@ xfs_trans_dqresv(
640 if ((flags & XFS_QMOPT_FORCE_RES) == 0 && 632 if ((flags & XFS_QMOPT_FORCE_RES) == 0 &&
641 dqp->q_core.d_id && 633 dqp->q_core.d_id &&
642 ((XFS_IS_UQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISUDQ(dqp)) || 634 ((XFS_IS_UQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISUDQ(dqp)) ||
643 (XFS_IS_OQUOTA_ENFORCED(dqp->q_mount) && 635 (XFS_IS_GQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISGDQ(dqp)) ||
644 (XFS_QM_ISPDQ(dqp) || XFS_QM_ISGDQ(dqp))))) { 636 (XFS_IS_PQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISPDQ(dqp)))) {
645 if (nblks > 0) { 637 if (nblks > 0) {
646 /* 638 /*
647 * dquot is locked already. See if we'd go over the 639 * dquot is locked already. See if we'd go over the
@@ -748,15 +740,15 @@ error_return:
748 */ 740 */
749int 741int
750xfs_trans_reserve_quota_bydquots( 742xfs_trans_reserve_quota_bydquots(
751 xfs_trans_t *tp, 743 struct xfs_trans *tp,
752 xfs_mount_t *mp, 744 struct xfs_mount *mp,
753 xfs_dquot_t *udqp, 745 struct xfs_dquot *udqp,
754 xfs_dquot_t *gdqp, 746 struct xfs_dquot *gdqp,
755 long nblks, 747 long nblks,
756 long ninos, 748 long ninos,
757 uint flags) 749 uint flags)
758{ 750{
759 int resvd = 0, error; 751 int error;
760 752
761 if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) 753 if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
762 return 0; 754 return 0;
@@ -771,28 +763,24 @@ xfs_trans_reserve_quota_bydquots(
771 (flags & ~XFS_QMOPT_ENOSPC)); 763 (flags & ~XFS_QMOPT_ENOSPC));
772 if (error) 764 if (error)
773 return error; 765 return error;
774 resvd = 1;
775 } 766 }
776 767
777 if (gdqp) { 768 if (gdqp) {
778 error = xfs_trans_dqresv(tp, mp, gdqp, nblks, ninos, flags); 769 error = xfs_trans_dqresv(tp, mp, gdqp, nblks, ninos, flags);
779 if (error) { 770 if (error)
780 /* 771 goto unwind_usr;
781 * can't do it, so backout previous reservation
782 */
783 if (resvd) {
784 flags |= XFS_QMOPT_FORCE_RES;
785 xfs_trans_dqresv(tp, mp, udqp,
786 -nblks, -ninos, flags);
787 }
788 return error;
789 }
790 } 772 }
791 773
792 /* 774 /*
793 * Didn't change anything critical, so, no need to log 775 * Didn't change anything critical, so, no need to log
794 */ 776 */
795 return 0; 777 return 0;
778
779unwind_usr:
780 flags |= XFS_QMOPT_FORCE_RES;
781 if (udqp)
782 xfs_trans_dqresv(tp, mp, udqp, -nblks, -ninos, flags);
783 return error;
796} 784}
797 785
798 786
@@ -816,8 +804,7 @@ xfs_trans_reserve_quota_nblks(
816 if (XFS_IS_PQUOTA_ON(mp)) 804 if (XFS_IS_PQUOTA_ON(mp))
817 flags |= XFS_QMOPT_ENOSPC; 805 flags |= XFS_QMOPT_ENOSPC;
818 806
819 ASSERT(ip->i_ino != mp->m_sb.sb_uquotino); 807 ASSERT(!xfs_is_quota_inode(&mp->m_sb, ip->i_ino));
820 ASSERT(ip->i_ino != mp->m_sb.sb_gquotino);
821 808
822 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 809 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
823 ASSERT((flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) == 810 ASSERT((flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) ==
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index ac6d567704db..53dfe46f3680 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -112,6 +112,17 @@ xfs_trans_log_inode(
112 ASSERT(ip->i_itemp != NULL); 112 ASSERT(ip->i_itemp != NULL);
113 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 113 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
114 114
115 /*
116 * First time we log the inode in a transaction, bump the inode change
117 * counter if it is configured for this to occur.
118 */
119 if (!(ip->i_itemp->ili_item.li_desc->lid_flags & XFS_LID_DIRTY) &&
120 IS_I_VERSION(VFS_I(ip))) {
121 inode_inc_iversion(VFS_I(ip));
122 ip->i_d.di_changecount = VFS_I(ip)->i_version;
123 flags |= XFS_ILOG_CORE;
124 }
125
115 tp->t_flags |= XFS_TRANS_DIRTY; 126 tp->t_flags |= XFS_TRANS_DIRTY;
116 ip->i_itemp->ili_item.li_desc->lid_flags |= XFS_LID_DIRTY; 127 ip->i_itemp->ili_item.li_desc->lid_flags |= XFS_LID_DIRTY;
117 128
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 0176bb21f09a..42c0ef288aeb 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -322,18 +322,9 @@ xfs_inactive(
322 xfs_trans_ijoin(tp, ip, 0); 322 xfs_trans_ijoin(tp, ip, 0);
323 323
324 if (S_ISLNK(ip->i_d.di_mode)) { 324 if (S_ISLNK(ip->i_d.di_mode)) {
325 /* 325 error = xfs_inactive_symlink(ip, &tp);
326 * Zero length symlinks _can_ exist. 326 if (error)
327 */ 327 goto out_cancel;
328 if (ip->i_d.di_size > XFS_IFORK_DSIZE(ip)) {
329 error = xfs_inactive_symlink_rmt(ip, &tp);
330 if (error)
331 goto out_cancel;
332 } else if (ip->i_df.if_bytes > 0) {
333 xfs_idata_realloc(ip, -(ip->i_df.if_bytes),
334 XFS_DATA_FORK);
335 ASSERT(ip->i_df.if_bytes == 0);
336 }
337 } else if (truncate) { 328 } else if (truncate) {
338 ip->i_d.di_size = 0; 329 ip->i_d.di_size = 0;
339 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 330 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);