aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/Kconfig13
-rw-r--r--fs/9p/Makefile4
-rw-r--r--fs/9p/vfs_inode.c2
-rw-r--r--fs/9p/xattr.c4
-rw-r--r--fs/9p/xattr.h2
-rw-r--r--fs/9p/xattr_security.c80
-rw-r--r--fs/9p/xattr_trusted.c80
-rw-r--r--fs/autofs4/expire.c8
-rw-r--r--fs/autofs4/root.c2
-rw-r--r--fs/binfmt_aout.c2
-rw-r--r--fs/binfmt_elf.c2
-rw-r--r--fs/block_dev.c9
-rw-r--r--fs/btrfs/backref.c72
-rw-r--r--fs/btrfs/backref.h2
-rw-r--r--fs/btrfs/ctree.c120
-rw-r--r--fs/btrfs/ctree.h105
-rw-r--r--fs/btrfs/delayed-inode.c14
-rw-r--r--fs/btrfs/dev-replace.c6
-rw-r--r--fs/btrfs/disk-io.c483
-rw-r--r--fs/btrfs/disk-io.h32
-rw-r--r--fs/btrfs/export.c5
-rw-r--r--fs/btrfs/extent-tree.c340
-rw-r--r--fs/btrfs/extent_io.c41
-rw-r--r--fs/btrfs/extent_io.h1
-rw-r--r--fs/btrfs/file-item.c144
-rw-r--r--fs/btrfs/file.c150
-rw-r--r--fs/btrfs/free-space-cache.c103
-rw-r--r--fs/btrfs/free-space-cache.h2
-rw-r--r--fs/btrfs/inode.c501
-rw-r--r--fs/btrfs/ioctl.c74
-rw-r--r--fs/btrfs/lzo.c4
-rw-r--r--fs/btrfs/ordered-data.c128
-rw-r--r--fs/btrfs/ordered-data.h27
-rw-r--r--fs/btrfs/qgroup.c283
-rw-r--r--fs/btrfs/relocation.c102
-rw-r--r--fs/btrfs/root-tree.c201
-rw-r--r--fs/btrfs/scrub.c92
-rw-r--r--fs/btrfs/send.c235
-rw-r--r--fs/btrfs/super.c25
-rw-r--r--fs/btrfs/transaction.c322
-rw-r--r--fs/btrfs/transaction.h50
-rw-r--r--fs/btrfs/tree-log.c41
-rw-r--r--fs/btrfs/ulist.c15
-rw-r--r--fs/btrfs/version.h4
-rw-r--r--fs/btrfs/volumes.c351
-rw-r--r--fs/btrfs/volumes.h7
-rw-r--r--fs/ceph/addr.c88
-rw-r--r--fs/ceph/caps.c102
-rw-r--r--fs/ceph/file.c4
-rw-r--r--fs/ceph/inode.c18
-rw-r--r--fs/ceph/locks.c2
-rw-r--r--fs/ceph/mds_client.c8
-rw-r--r--fs/ceph/mdsmap.c42
-rw-r--r--fs/ceph/super.c2
-rw-r--r--fs/ceph/super.h4
-rw-r--r--fs/ceph/xattr.c9
-rw-r--r--fs/cifs/cifsencrypt.c195
-rw-r--r--fs/cifs/cifsglob.h17
-rw-r--r--fs/cifs/cifsproto.h1
-rw-r--r--fs/cifs/connect.c6
-rw-r--r--fs/cifs/dir.c14
-rw-r--r--fs/cifs/file.c54
-rw-r--r--fs/cifs/inode.c5
-rw-r--r--fs/cifs/smb1ops.c29
-rw-r--r--fs/cifs/smb2file.c24
-rw-r--r--fs/cifs/smb2inode.c57
-rw-r--r--fs/cifs/smb2ops.c54
-rw-r--r--fs/cifs/smb2pdu.c220
-rw-r--r--fs/cifs/smb2pdu.h14
-rw-r--r--fs/cifs/smb2proto.h16
-rw-r--r--fs/cifs/smb2transport.c90
-rw-r--r--fs/coda/dir.c2
-rw-r--r--fs/configfs/dir.c15
-rw-r--r--fs/ecryptfs/crypto.c337
-rw-r--r--fs/ecryptfs/file.c7
-rw-r--r--fs/ecryptfs/inode.c2
-rw-r--r--fs/ecryptfs/main.c7
-rw-r--r--fs/ecryptfs/messaging.c3
-rw-r--r--fs/efivarfs/inode.c14
-rw-r--r--fs/ext3/fsync.c8
-rw-r--r--fs/ext3/namei.c2
-rw-r--r--fs/ext3/super.c13
-rw-r--r--fs/ext4/balloc.c4
-rw-r--r--fs/ext4/extents.c23
-rw-r--r--fs/ext4/extents_status.c73
-rw-r--r--fs/ext4/inode.c19
-rw-r--r--fs/ext4/mballoc.c11
-rw-r--r--fs/ext4/namei.c2
-rw-r--r--fs/ext4/page-io.c35
-rw-r--r--fs/ext4/super.c14
-rw-r--r--fs/f2fs/dir.c20
-rw-r--r--fs/fat/fat.h1
-rw-r--r--fs/fat/file.c8
-rw-r--r--fs/fat/inode.c12
-rw-r--r--fs/file_table.c31
-rw-r--r--fs/fs-writeback.c10
-rw-r--r--fs/fuse/dir.c51
-rw-r--r--fs/jfs/jfs_dmap.c70
-rw-r--r--fs/jfs/jfs_dtree.c37
-rw-r--r--fs/jfs/jfs_extent.c2
-rw-r--r--fs/jfs/jfs_imap.c69
-rw-r--r--fs/jfs/jfs_metapage.c5
-rw-r--r--fs/jfs/jfs_superblock.h1
-rw-r--r--fs/jfs/jfs_txnmgr.c2
-rw-r--r--fs/jfs/jfs_xtree.c62
-rw-r--r--fs/jfs/namei.c2
-rw-r--r--fs/jfs/resize.c2
-rw-r--r--fs/jfs/super.c22
-rw-r--r--fs/jfs/xattr.c8
-rw-r--r--fs/libfs.c3
-rw-r--r--fs/lockd/svclock.c4
-rw-r--r--fs/locks.c71
-rw-r--r--fs/namei.c2
-rw-r--r--fs/ncpfs/inode.c12
-rw-r--r--fs/nfs/Kconfig14
-rw-r--r--fs/nfs/Makefile6
-rw-r--r--fs/nfs/blocklayout/blocklayout.c3
-rw-r--r--fs/nfs/callback.c1
-rw-r--r--fs/nfs/callback.h3
-rw-r--r--fs/nfs/callback_proc.c3
-rw-r--r--fs/nfs/callback_xdr.c52
-rw-r--r--fs/nfs/client.c4
-rw-r--r--fs/nfs/dir.c87
-rw-r--r--fs/nfs/dns_resolve.c32
-rw-r--r--fs/nfs/getroot.c2
-rw-r--r--fs/nfs/idmap.c56
-rw-r--r--fs/nfs/inode.c138
-rw-r--r--fs/nfs/internal.h3
-rw-r--r--fs/nfs/mount_clnt.c14
-rw-r--r--fs/nfs/namespace.c2
-rw-r--r--fs/nfs/nfs3proc.c7
-rw-r--r--fs/nfs/nfs4_fs.h8
-rw-r--r--fs/nfs/nfs4client.c15
-rw-r--r--fs/nfs/nfs4file.c1
-rw-r--r--fs/nfs/nfs4filelayout.c3
-rw-r--r--fs/nfs/nfs4filelayout.h3
-rw-r--r--fs/nfs/nfs4filelayoutdev.c8
-rw-r--r--fs/nfs/nfs4proc.c687
-rw-r--r--fs/nfs/nfs4session.c40
-rw-r--r--fs/nfs/nfs4session.h7
-rw-r--r--fs/nfs/nfs4state.c36
-rw-r--r--fs/nfs/nfs4super.c14
-rw-r--r--fs/nfs/nfs4xdr.c189
-rw-r--r--fs/nfs/objlayout/objlayout.c4
-rw-r--r--fs/nfs/pnfs.c42
-rw-r--r--fs/nfs/pnfs.h6
-rw-r--r--fs/nfs/proc.c13
-rw-r--r--fs/nfs/super.c199
-rw-r--r--fs/nfs/unlink.c2
-rw-r--r--fs/nfs/write.c31
-rw-r--r--fs/nfsd/Kconfig16
-rw-r--r--fs/nfsd/nfs4proc.c46
-rw-r--r--fs/nfsd/nfs4state.c225
-rw-r--r--fs/nfsd/nfs4xdr.c169
-rw-r--r--fs/nfsd/nfsd.h27
-rw-r--r--fs/nfsd/nfssvc.c13
-rw-r--r--fs/nfsd/state.h1
-rw-r--r--fs/nfsd/vfs.c33
-rw-r--r--fs/nfsd/vfs.h7
-rw-r--r--fs/nfsd/xdr4.h4
-rw-r--r--fs/nilfs2/super.c2
-rw-r--r--fs/notify/dnotify/dnotify.c25
-rw-r--r--fs/notify/fanotify/fanotify_user.c89
-rw-r--r--fs/notify/inotify/inotify_user.c13
-rw-r--r--fs/notify/mark.c50
-rw-r--r--fs/ocfs2/refcounttree.c5
-rw-r--r--fs/open.c6
-rw-r--r--fs/proc/vmcore.c2
-rw-r--r--fs/quota/dquot.c6
-rw-r--r--fs/select.c62
-rw-r--r--fs/seq_file.c54
-rw-r--r--fs/super.c25
-rw-r--r--fs/sysfs/group.c70
-rw-r--r--fs/timerfd.c131
-rw-r--r--fs/ubifs/super.c2
-rw-r--r--fs/xfs/Makefile1
-rw-r--r--fs/xfs/xfs_alloc.c24
-rw-r--r--fs/xfs/xfs_attr_leaf.c2
-rw-r--r--fs/xfs/xfs_bmap.c199
-rw-r--r--fs/xfs/xfs_bmap.h1
-rw-r--r--fs/xfs/xfs_bmap_btree.h2
-rw-r--r--fs/xfs/xfs_buf_item.c87
-rw-r--r--fs/xfs/xfs_buf_item.h4
-rw-r--r--fs/xfs/xfs_dfrag.c8
-rw-r--r--fs/xfs/xfs_dinode.h6
-rw-r--r--fs/xfs/xfs_dir2_block.c20
-rw-r--r--fs/xfs/xfs_dir2_leaf.c3
-rw-r--r--fs/xfs/xfs_dquot.c31
-rw-r--r--fs/xfs/xfs_dquot.h11
-rw-r--r--fs/xfs/xfs_fsops.c2
-rw-r--r--fs/xfs/xfs_ialloc.c74
-rw-r--r--fs/xfs/xfs_ialloc.h8
-rw-r--r--fs/xfs/xfs_icache.c4
-rw-r--r--fs/xfs/xfs_icache.h1
-rw-r--r--fs/xfs/xfs_icreate_item.c195
-rw-r--r--fs/xfs/xfs_icreate_item.h52
-rw-r--r--fs/xfs/xfs_inode.c105
-rw-r--r--fs/xfs/xfs_inode.h1
-rw-r--r--fs/xfs/xfs_ioctl.c16
-rw-r--r--fs/xfs/xfs_iomap.c13
-rw-r--r--fs/xfs/xfs_iops.c27
-rw-r--r--fs/xfs/xfs_itable.c33
-rw-r--r--fs/xfs/xfs_log.c22
-rw-r--r--fs/xfs/xfs_log.h5
-rw-r--r--fs/xfs/xfs_log_cil.c75
-rw-r--r--fs/xfs/xfs_log_recover.c127
-rw-r--r--fs/xfs/xfs_mount.c92
-rw-r--r--fs/xfs/xfs_mount.h4
-rw-r--r--fs/xfs/xfs_qm.c394
-rw-r--r--fs/xfs/xfs_qm.h97
-rw-r--r--fs/xfs/xfs_qm_bhv.c10
-rw-r--r--fs/xfs/xfs_qm_syscalls.c75
-rw-r--r--fs/xfs/xfs_quota.h104
-rw-r--r--fs/xfs/xfs_quotaops.c6
-rw-r--r--fs/xfs/xfs_sb.h6
-rw-r--r--fs/xfs/xfs_super.c39
-rw-r--r--fs/xfs/xfs_symlink.c61
-rw-r--r--fs/xfs/xfs_symlink.h2
-rw-r--r--fs/xfs/xfs_sysctl.c26
-rw-r--r--fs/xfs/xfs_trace.h5
-rw-r--r--fs/xfs/xfs_trans.c118
-rw-r--r--fs/xfs/xfs_trans.h16
-rw-r--r--fs/xfs/xfs_trans_buf.c34
-rw-r--r--fs/xfs/xfs_trans_dquot.c122
-rw-r--r--fs/xfs/xfs_trans_inode.c11
-rw-r--r--fs/xfs/xfs_vnodeops.c28
226 files changed, 7423 insertions, 4267 deletions
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig
index 55abfd62654a..6489e1fc1afd 100644
--- a/fs/9p/Kconfig
+++ b/fs/9p/Kconfig
@@ -31,3 +31,16 @@ config 9P_FS_POSIX_ACL
31 If you don't know what Access Control Lists are, say N 31 If you don't know what Access Control Lists are, say N
32 32
33endif 33endif
34
35
36config 9P_FS_SECURITY
37 bool "9P Security Labels"
38 depends on 9P_FS
39 help
40 Security labels support alternative access control models
41 implemented by security modules like SELinux. This option
42 enables an extended attribute handler for file security
43 labels in the 9P filesystem.
44
45 If you are not using a security module that requires using
46 extended attributes for file security labels, say N.
diff --git a/fs/9p/Makefile b/fs/9p/Makefile
index ab8c12780634..ff7be98f84f2 100644
--- a/fs/9p/Makefile
+++ b/fs/9p/Makefile
@@ -11,7 +11,9 @@ obj-$(CONFIG_9P_FS) := 9p.o
11 v9fs.o \ 11 v9fs.o \
12 fid.o \ 12 fid.o \
13 xattr.o \ 13 xattr.o \
14 xattr_user.o 14 xattr_user.o \
15 xattr_trusted.o
15 16
169p-$(CONFIG_9P_FSCACHE) += cache.o 179p-$(CONFIG_9P_FSCACHE) += cache.o
179p-$(CONFIG_9P_FS_POSIX_ACL) += acl.o 189p-$(CONFIG_9P_FS_POSIX_ACL) += acl.o
199p-$(CONFIG_9P_FS_SECURITY) += xattr_security.o
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index d86edc8d3fd0..25b018efb8ab 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -1054,13 +1054,11 @@ static int
1054v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, 1054v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
1055 struct kstat *stat) 1055 struct kstat *stat)
1056{ 1056{
1057 int err;
1058 struct v9fs_session_info *v9ses; 1057 struct v9fs_session_info *v9ses;
1059 struct p9_fid *fid; 1058 struct p9_fid *fid;
1060 struct p9_wstat *st; 1059 struct p9_wstat *st;
1061 1060
1062 p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry); 1061 p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry);
1063 err = -EPERM;
1064 v9ses = v9fs_dentry2v9ses(dentry); 1062 v9ses = v9fs_dentry2v9ses(dentry);
1065 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { 1063 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
1066 generic_fillattr(dentry->d_inode, stat); 1064 generic_fillattr(dentry->d_inode, stat);
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index c45e016b190f..3c28cdfb8c47 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -167,9 +167,13 @@ ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
167 167
168const struct xattr_handler *v9fs_xattr_handlers[] = { 168const struct xattr_handler *v9fs_xattr_handlers[] = {
169 &v9fs_xattr_user_handler, 169 &v9fs_xattr_user_handler,
170 &v9fs_xattr_trusted_handler,
170#ifdef CONFIG_9P_FS_POSIX_ACL 171#ifdef CONFIG_9P_FS_POSIX_ACL
171 &v9fs_xattr_acl_access_handler, 172 &v9fs_xattr_acl_access_handler,
172 &v9fs_xattr_acl_default_handler, 173 &v9fs_xattr_acl_default_handler,
173#endif 174#endif
175#ifdef CONFIG_9P_FS_SECURITY
176 &v9fs_xattr_security_handler,
177#endif
174 NULL 178 NULL
175}; 179};
diff --git a/fs/9p/xattr.h b/fs/9p/xattr.h
index eec348a3df71..d3e2ea3840be 100644
--- a/fs/9p/xattr.h
+++ b/fs/9p/xattr.h
@@ -20,6 +20,8 @@
20 20
21extern const struct xattr_handler *v9fs_xattr_handlers[]; 21extern const struct xattr_handler *v9fs_xattr_handlers[];
22extern struct xattr_handler v9fs_xattr_user_handler; 22extern struct xattr_handler v9fs_xattr_user_handler;
23extern struct xattr_handler v9fs_xattr_trusted_handler;
24extern struct xattr_handler v9fs_xattr_security_handler;
23extern const struct xattr_handler v9fs_xattr_acl_access_handler; 25extern const struct xattr_handler v9fs_xattr_acl_access_handler;
24extern const struct xattr_handler v9fs_xattr_acl_default_handler; 26extern const struct xattr_handler v9fs_xattr_acl_default_handler;
25 27
diff --git a/fs/9p/xattr_security.c b/fs/9p/xattr_security.c
new file mode 100644
index 000000000000..cb247a142a6e
--- /dev/null
+++ b/fs/9p/xattr_security.c
@@ -0,0 +1,80 @@
1/*
2 * Copyright IBM Corporation, 2010
3 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of version 2.1 of the GNU Lesser General Public License
7 * as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 *
13 */
14
15
16#include <linux/module.h>
17#include <linux/string.h>
18#include <linux/fs.h>
19#include <linux/slab.h>
20#include "xattr.h"
21
22static int v9fs_xattr_security_get(struct dentry *dentry, const char *name,
23 void *buffer, size_t size, int type)
24{
25 int retval;
26 char *full_name;
27 size_t name_len;
28 size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
29
30 if (name == NULL)
31 return -EINVAL;
32
33 if (strcmp(name, "") == 0)
34 return -EINVAL;
35
36 name_len = strlen(name);
37 full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
38 if (!full_name)
39 return -ENOMEM;
40 memcpy(full_name, XATTR_SECURITY_PREFIX, prefix_len);
41 memcpy(full_name+prefix_len, name, name_len);
42 full_name[prefix_len + name_len] = '\0';
43
44 retval = v9fs_xattr_get(dentry, full_name, buffer, size);
45 kfree(full_name);
46 return retval;
47}
48
49static int v9fs_xattr_security_set(struct dentry *dentry, const char *name,
50 const void *value, size_t size, int flags, int type)
51{
52 int retval;
53 char *full_name;
54 size_t name_len;
55 size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
56
57 if (name == NULL)
58 return -EINVAL;
59
60 if (strcmp(name, "") == 0)
61 return -EINVAL;
62
63 name_len = strlen(name);
64 full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
65 if (!full_name)
66 return -ENOMEM;
67 memcpy(full_name, XATTR_SECURITY_PREFIX, prefix_len);
68 memcpy(full_name + prefix_len, name, name_len);
69 full_name[prefix_len + name_len] = '\0';
70
71 retval = v9fs_xattr_set(dentry, full_name, value, size, flags);
72 kfree(full_name);
73 return retval;
74}
75
76struct xattr_handler v9fs_xattr_security_handler = {
77 .prefix = XATTR_SECURITY_PREFIX,
78 .get = v9fs_xattr_security_get,
79 .set = v9fs_xattr_security_set,
80};
diff --git a/fs/9p/xattr_trusted.c b/fs/9p/xattr_trusted.c
new file mode 100644
index 000000000000..e30d33b8a3fb
--- /dev/null
+++ b/fs/9p/xattr_trusted.c
@@ -0,0 +1,80 @@
1/*
2 * Copyright IBM Corporation, 2010
3 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of version 2.1 of the GNU Lesser General Public License
7 * as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 *
13 */
14
15
16#include <linux/module.h>
17#include <linux/string.h>
18#include <linux/fs.h>
19#include <linux/slab.h>
20#include "xattr.h"
21
22static int v9fs_xattr_trusted_get(struct dentry *dentry, const char *name,
23 void *buffer, size_t size, int type)
24{
25 int retval;
26 char *full_name;
27 size_t name_len;
28 size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
29
30 if (name == NULL)
31 return -EINVAL;
32
33 if (strcmp(name, "") == 0)
34 return -EINVAL;
35
36 name_len = strlen(name);
37 full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
38 if (!full_name)
39 return -ENOMEM;
40 memcpy(full_name, XATTR_TRUSTED_PREFIX, prefix_len);
41 memcpy(full_name+prefix_len, name, name_len);
42 full_name[prefix_len + name_len] = '\0';
43
44 retval = v9fs_xattr_get(dentry, full_name, buffer, size);
45 kfree(full_name);
46 return retval;
47}
48
49static int v9fs_xattr_trusted_set(struct dentry *dentry, const char *name,
50 const void *value, size_t size, int flags, int type)
51{
52 int retval;
53 char *full_name;
54 size_t name_len;
55 size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
56
57 if (name == NULL)
58 return -EINVAL;
59
60 if (strcmp(name, "") == 0)
61 return -EINVAL;
62
63 name_len = strlen(name);
64 full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
65 if (!full_name)
66 return -ENOMEM;
67 memcpy(full_name, XATTR_TRUSTED_PREFIX, prefix_len);
68 memcpy(full_name + prefix_len, name, name_len);
69 full_name[prefix_len + name_len] = '\0';
70
71 retval = v9fs_xattr_set(dentry, full_name, value, size, flags);
72 kfree(full_name);
73 return retval;
74}
75
76struct xattr_handler v9fs_xattr_trusted_handler = {
77 .prefix = XATTR_TRUSTED_PREFIX,
78 .get = v9fs_xattr_trusted_get,
79 .set = v9fs_xattr_trusted_set,
80};
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 13ddec92341c..3d9d3f5d5dda 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -109,7 +109,7 @@ cont:
109 109
110 spin_lock_nested(&q->d_lock, DENTRY_D_LOCK_NESTED); 110 spin_lock_nested(&q->d_lock, DENTRY_D_LOCK_NESTED);
111 /* Already gone or negative dentry (under construction) - try next */ 111 /* Already gone or negative dentry (under construction) - try next */
112 if (q->d_count == 0 || !simple_positive(q)) { 112 if (!d_count(q) || !simple_positive(q)) {
113 spin_unlock(&q->d_lock); 113 spin_unlock(&q->d_lock);
114 next = q->d_u.d_child.next; 114 next = q->d_u.d_child.next;
115 goto cont; 115 goto cont;
@@ -267,7 +267,7 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
267 else 267 else
268 ino_count++; 268 ino_count++;
269 269
270 if (p->d_count > ino_count) { 270 if (d_count(p) > ino_count) {
271 top_ino->last_used = jiffies; 271 top_ino->last_used = jiffies;
272 dput(p); 272 dput(p);
273 return 1; 273 return 1;
@@ -409,7 +409,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
409 if (!exp_leaves) { 409 if (!exp_leaves) {
410 /* Path walk currently on this dentry? */ 410 /* Path walk currently on this dentry? */
411 ino_count = atomic_read(&ino->count) + 1; 411 ino_count = atomic_read(&ino->count) + 1;
412 if (dentry->d_count > ino_count) 412 if (d_count(dentry) > ino_count)
413 goto next; 413 goto next;
414 414
415 if (!autofs4_tree_busy(mnt, dentry, timeout, do_now)) { 415 if (!autofs4_tree_busy(mnt, dentry, timeout, do_now)) {
@@ -423,7 +423,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
423 } else { 423 } else {
424 /* Path walk currently on this dentry? */ 424 /* Path walk currently on this dentry? */
425 ino_count = atomic_read(&ino->count) + 1; 425 ino_count = atomic_read(&ino->count) + 1;
426 if (dentry->d_count > ino_count) 426 if (d_count(dentry) > ino_count)
427 goto next; 427 goto next;
428 428
429 expired = autofs4_check_leaves(mnt, dentry, timeout, do_now); 429 expired = autofs4_check_leaves(mnt, dentry, timeout, do_now);
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index ca8e55548d98..92ef341ba0cf 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -179,7 +179,7 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
179 spin_lock(&active->d_lock); 179 spin_lock(&active->d_lock);
180 180
181 /* Already gone? */ 181 /* Already gone? */
182 if (active->d_count == 0) 182 if (!d_count(active))
183 goto next; 183 goto next;
184 184
185 qstr = &active->d_name; 185 qstr = &active->d_name;
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index bce87694f7b0..89dec7f789a4 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -255,8 +255,6 @@ static int load_aout_binary(struct linux_binprm * bprm)
255 (current->mm->start_data = N_DATADDR(ex)); 255 (current->mm->start_data = N_DATADDR(ex));
256 current->mm->brk = ex.a_bss + 256 current->mm->brk = ex.a_bss +
257 (current->mm->start_brk = N_BSSADDR(ex)); 257 (current->mm->start_brk = N_BSSADDR(ex));
258 current->mm->free_area_cache = current->mm->mmap_base;
259 current->mm->cached_hole_size = 0;
260 258
261 retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT); 259 retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT);
262 if (retval < 0) { 260 if (retval < 0) {
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index f8a0b0efda44..100edcc5e312 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -738,8 +738,6 @@ static int load_elf_binary(struct linux_binprm *bprm)
738 738
739 /* Do this so that we can load the interpreter, if need be. We will 739 /* Do this so that we can load the interpreter, if need be. We will
740 change some of these later */ 740 change some of these later */
741 current->mm->free_area_cache = current->mm->mmap_base;
742 current->mm->cached_hole_size = 0;
743 retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP), 741 retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP),
744 executable_stack); 742 executable_stack);
745 if (retval < 0) { 743 if (retval < 0) {
diff --git a/fs/block_dev.c b/fs/block_dev.c
index bb43ce081d6e..c7bda5cd3da7 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -58,17 +58,24 @@ static void bdev_inode_switch_bdi(struct inode *inode,
58 struct backing_dev_info *dst) 58 struct backing_dev_info *dst)
59{ 59{
60 struct backing_dev_info *old = inode->i_data.backing_dev_info; 60 struct backing_dev_info *old = inode->i_data.backing_dev_info;
61 bool wakeup_bdi = false;
61 62
62 if (unlikely(dst == old)) /* deadlock avoidance */ 63 if (unlikely(dst == old)) /* deadlock avoidance */
63 return; 64 return;
64 bdi_lock_two(&old->wb, &dst->wb); 65 bdi_lock_two(&old->wb, &dst->wb);
65 spin_lock(&inode->i_lock); 66 spin_lock(&inode->i_lock);
66 inode->i_data.backing_dev_info = dst; 67 inode->i_data.backing_dev_info = dst;
67 if (inode->i_state & I_DIRTY) 68 if (inode->i_state & I_DIRTY) {
69 if (bdi_cap_writeback_dirty(dst) && !wb_has_dirty_io(&dst->wb))
70 wakeup_bdi = true;
68 list_move(&inode->i_wb_list, &dst->wb.b_dirty); 71 list_move(&inode->i_wb_list, &dst->wb.b_dirty);
72 }
69 spin_unlock(&inode->i_lock); 73 spin_unlock(&inode->i_lock);
70 spin_unlock(&old->wb.list_lock); 74 spin_unlock(&old->wb.list_lock);
71 spin_unlock(&dst->wb.list_lock); 75 spin_unlock(&dst->wb.list_lock);
76
77 if (wakeup_bdi)
78 bdi_wakeup_thread_delayed(dst);
72} 79}
73 80
74/* Kill _all_ buffers and pagecache , dirty or not.. */ 81/* Kill _all_ buffers and pagecache , dirty or not.. */
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 290e347b6db3..eaf133384a8f 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -255,13 +255,11 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
255 * to a logical address 255 * to a logical address
256 */ 256 */
257static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, 257static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
258 int search_commit_root, 258 struct btrfs_path *path, u64 time_seq,
259 u64 time_seq, 259 struct __prelim_ref *ref,
260 struct __prelim_ref *ref, 260 struct ulist *parents,
261 struct ulist *parents, 261 const u64 *extent_item_pos)
262 const u64 *extent_item_pos)
263{ 262{
264 struct btrfs_path *path;
265 struct btrfs_root *root; 263 struct btrfs_root *root;
266 struct btrfs_key root_key; 264 struct btrfs_key root_key;
267 struct extent_buffer *eb; 265 struct extent_buffer *eb;
@@ -269,11 +267,6 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
269 int root_level; 267 int root_level;
270 int level = ref->level; 268 int level = ref->level;
271 269
272 path = btrfs_alloc_path();
273 if (!path)
274 return -ENOMEM;
275 path->search_commit_root = !!search_commit_root;
276
277 root_key.objectid = ref->root_id; 270 root_key.objectid = ref->root_id;
278 root_key.type = BTRFS_ROOT_ITEM_KEY; 271 root_key.type = BTRFS_ROOT_ITEM_KEY;
279 root_key.offset = (u64)-1; 272 root_key.offset = (u64)-1;
@@ -314,7 +307,8 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
314 time_seq, ref->wanted_disk_byte, 307 time_seq, ref->wanted_disk_byte,
315 extent_item_pos); 308 extent_item_pos);
316out: 309out:
317 btrfs_free_path(path); 310 path->lowest_level = 0;
311 btrfs_release_path(path);
318 return ret; 312 return ret;
319} 313}
320 314
@@ -322,7 +316,7 @@ out:
322 * resolve all indirect backrefs from the list 316 * resolve all indirect backrefs from the list
323 */ 317 */
324static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, 318static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
325 int search_commit_root, u64 time_seq, 319 struct btrfs_path *path, u64 time_seq,
326 struct list_head *head, 320 struct list_head *head,
327 const u64 *extent_item_pos) 321 const u64 *extent_item_pos)
328{ 322{
@@ -349,9 +343,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
349 continue; 343 continue;
350 if (ref->count == 0) 344 if (ref->count == 0)
351 continue; 345 continue;
352 err = __resolve_indirect_ref(fs_info, search_commit_root, 346 err = __resolve_indirect_ref(fs_info, path, time_seq, ref,
353 time_seq, ref, parents, 347 parents, extent_item_pos);
354 extent_item_pos);
355 if (err == -ENOMEM) 348 if (err == -ENOMEM)
356 goto out; 349 goto out;
357 if (err) 350 if (err)
@@ -604,6 +597,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
604 int slot; 597 int slot;
605 struct extent_buffer *leaf; 598 struct extent_buffer *leaf;
606 struct btrfs_key key; 599 struct btrfs_key key;
600 struct btrfs_key found_key;
607 unsigned long ptr; 601 unsigned long ptr;
608 unsigned long end; 602 unsigned long end;
609 struct btrfs_extent_item *ei; 603 struct btrfs_extent_item *ei;
@@ -621,17 +615,21 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
621 615
622 ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); 616 ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
623 flags = btrfs_extent_flags(leaf, ei); 617 flags = btrfs_extent_flags(leaf, ei);
618 btrfs_item_key_to_cpu(leaf, &found_key, slot);
624 619
625 ptr = (unsigned long)(ei + 1); 620 ptr = (unsigned long)(ei + 1);
626 end = (unsigned long)ei + item_size; 621 end = (unsigned long)ei + item_size;
627 622
628 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 623 if (found_key.type == BTRFS_EXTENT_ITEM_KEY &&
624 flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
629 struct btrfs_tree_block_info *info; 625 struct btrfs_tree_block_info *info;
630 626
631 info = (struct btrfs_tree_block_info *)ptr; 627 info = (struct btrfs_tree_block_info *)ptr;
632 *info_level = btrfs_tree_block_level(leaf, info); 628 *info_level = btrfs_tree_block_level(leaf, info);
633 ptr += sizeof(struct btrfs_tree_block_info); 629 ptr += sizeof(struct btrfs_tree_block_info);
634 BUG_ON(ptr > end); 630 BUG_ON(ptr > end);
631 } else if (found_key.type == BTRFS_METADATA_ITEM_KEY) {
632 *info_level = found_key.offset;
635 } else { 633 } else {
636 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA)); 634 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
637 } 635 }
@@ -795,7 +793,6 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
795 struct btrfs_delayed_ref_head *head; 793 struct btrfs_delayed_ref_head *head;
796 int info_level = 0; 794 int info_level = 0;
797 int ret; 795 int ret;
798 int search_commit_root = (trans == BTRFS_BACKREF_SEARCH_COMMIT_ROOT);
799 struct list_head prefs_delayed; 796 struct list_head prefs_delayed;
800 struct list_head prefs; 797 struct list_head prefs;
801 struct __prelim_ref *ref; 798 struct __prelim_ref *ref;
@@ -804,13 +801,17 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
804 INIT_LIST_HEAD(&prefs_delayed); 801 INIT_LIST_HEAD(&prefs_delayed);
805 802
806 key.objectid = bytenr; 803 key.objectid = bytenr;
807 key.type = BTRFS_EXTENT_ITEM_KEY;
808 key.offset = (u64)-1; 804 key.offset = (u64)-1;
805 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
806 key.type = BTRFS_METADATA_ITEM_KEY;
807 else
808 key.type = BTRFS_EXTENT_ITEM_KEY;
809 809
810 path = btrfs_alloc_path(); 810 path = btrfs_alloc_path();
811 if (!path) 811 if (!path)
812 return -ENOMEM; 812 return -ENOMEM;
813 path->search_commit_root = !!search_commit_root; 813 if (!trans)
814 path->search_commit_root = 1;
814 815
815 /* 816 /*
816 * grab both a lock on the path and a lock on the delayed ref head. 817 * grab both a lock on the path and a lock on the delayed ref head.
@@ -825,7 +826,7 @@ again:
825 goto out; 826 goto out;
826 BUG_ON(ret == 0); 827 BUG_ON(ret == 0);
827 828
828 if (trans != BTRFS_BACKREF_SEARCH_COMMIT_ROOT) { 829 if (trans) {
829 /* 830 /*
830 * look if there are updates for this ref queued and lock the 831 * look if there are updates for this ref queued and lock the
831 * head 832 * head
@@ -869,7 +870,8 @@ again:
869 slot = path->slots[0]; 870 slot = path->slots[0];
870 btrfs_item_key_to_cpu(leaf, &key, slot); 871 btrfs_item_key_to_cpu(leaf, &key, slot);
871 if (key.objectid == bytenr && 872 if (key.objectid == bytenr &&
872 key.type == BTRFS_EXTENT_ITEM_KEY) { 873 (key.type == BTRFS_EXTENT_ITEM_KEY ||
874 key.type == BTRFS_METADATA_ITEM_KEY)) {
873 ret = __add_inline_refs(fs_info, path, bytenr, 875 ret = __add_inline_refs(fs_info, path, bytenr,
874 &info_level, &prefs); 876 &info_level, &prefs);
875 if (ret) 877 if (ret)
@@ -890,8 +892,8 @@ again:
890 892
891 __merge_refs(&prefs, 1); 893 __merge_refs(&prefs, 1);
892 894
893 ret = __resolve_indirect_refs(fs_info, search_commit_root, time_seq, 895 ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs,
894 &prefs, extent_item_pos); 896 extent_item_pos);
895 if (ret) 897 if (ret)
896 goto out; 898 goto out;
897 899
@@ -1283,12 +1285,16 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
1283{ 1285{
1284 int ret; 1286 int ret;
1285 u64 flags; 1287 u64 flags;
1288 u64 size = 0;
1286 u32 item_size; 1289 u32 item_size;
1287 struct extent_buffer *eb; 1290 struct extent_buffer *eb;
1288 struct btrfs_extent_item *ei; 1291 struct btrfs_extent_item *ei;
1289 struct btrfs_key key; 1292 struct btrfs_key key;
1290 1293
1291 key.type = BTRFS_EXTENT_ITEM_KEY; 1294 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
1295 key.type = BTRFS_METADATA_ITEM_KEY;
1296 else
1297 key.type = BTRFS_EXTENT_ITEM_KEY;
1292 key.objectid = logical; 1298 key.objectid = logical;
1293 key.offset = (u64)-1; 1299 key.offset = (u64)-1;
1294 1300
@@ -1301,9 +1307,15 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
1301 return ret; 1307 return ret;
1302 1308
1303 btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]); 1309 btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
1304 if (found_key->type != BTRFS_EXTENT_ITEM_KEY || 1310 if (found_key->type == BTRFS_METADATA_ITEM_KEY)
1311 size = fs_info->extent_root->leafsize;
1312 else if (found_key->type == BTRFS_EXTENT_ITEM_KEY)
1313 size = found_key->offset;
1314
1315 if ((found_key->type != BTRFS_EXTENT_ITEM_KEY &&
1316 found_key->type != BTRFS_METADATA_ITEM_KEY) ||
1305 found_key->objectid > logical || 1317 found_key->objectid > logical ||
1306 found_key->objectid + found_key->offset <= logical) { 1318 found_key->objectid + size <= logical) {
1307 pr_debug("logical %llu is not within any extent\n", 1319 pr_debug("logical %llu is not within any extent\n",
1308 (unsigned long long)logical); 1320 (unsigned long long)logical);
1309 return -ENOENT; 1321 return -ENOENT;
@@ -1459,7 +1471,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
1459 iterate_extent_inodes_t *iterate, void *ctx) 1471 iterate_extent_inodes_t *iterate, void *ctx)
1460{ 1472{
1461 int ret; 1473 int ret;
1462 struct btrfs_trans_handle *trans; 1474 struct btrfs_trans_handle *trans = NULL;
1463 struct ulist *refs = NULL; 1475 struct ulist *refs = NULL;
1464 struct ulist *roots = NULL; 1476 struct ulist *roots = NULL;
1465 struct ulist_node *ref_node = NULL; 1477 struct ulist_node *ref_node = NULL;
@@ -1471,9 +1483,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
1471 pr_debug("resolving all inodes for extent %llu\n", 1483 pr_debug("resolving all inodes for extent %llu\n",
1472 extent_item_objectid); 1484 extent_item_objectid);
1473 1485
1474 if (search_commit_root) { 1486 if (!search_commit_root) {
1475 trans = BTRFS_BACKREF_SEARCH_COMMIT_ROOT;
1476 } else {
1477 trans = btrfs_join_transaction(fs_info->extent_root); 1487 trans = btrfs_join_transaction(fs_info->extent_root);
1478 if (IS_ERR(trans)) 1488 if (IS_ERR(trans))
1479 return PTR_ERR(trans); 1489 return PTR_ERR(trans);
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 0f446d7ca2c0..8f2e76702932 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -23,8 +23,6 @@
23#include "ulist.h" 23#include "ulist.h"
24#include "extent_io.h" 24#include "extent_io.h"
25 25
26#define BTRFS_BACKREF_SEARCH_COMMIT_ROOT ((struct btrfs_trans_handle *)0)
27
28struct inode_fs_paths { 26struct inode_fs_paths {
29 struct btrfs_path *btrfs_path; 27 struct btrfs_path *btrfs_path;
30 struct btrfs_root *fs_root; 28 struct btrfs_root *fs_root;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 02fae7f7e42c..5bf4c39e2ad6 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1089,7 +1089,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
1089 btrfs_set_node_ptr_generation(parent, parent_slot, 1089 btrfs_set_node_ptr_generation(parent, parent_slot,
1090 trans->transid); 1090 trans->transid);
1091 btrfs_mark_buffer_dirty(parent); 1091 btrfs_mark_buffer_dirty(parent);
1092 tree_mod_log_free_eb(root->fs_info, buf); 1092 if (last_ref)
1093 tree_mod_log_free_eb(root->fs_info, buf);
1093 btrfs_free_tree_block(trans, root, buf, parent_start, 1094 btrfs_free_tree_block(trans, root, buf, parent_start,
1094 last_ref); 1095 last_ref);
1095 } 1096 }
@@ -1161,8 +1162,8 @@ __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
1161 * time_seq). 1162 * time_seq).
1162 */ 1163 */
1163static void 1164static void
1164__tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq, 1165__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
1165 struct tree_mod_elem *first_tm) 1166 u64 time_seq, struct tree_mod_elem *first_tm)
1166{ 1167{
1167 u32 n; 1168 u32 n;
1168 struct rb_node *next; 1169 struct rb_node *next;
@@ -1172,6 +1173,7 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
1172 unsigned long p_size = sizeof(struct btrfs_key_ptr); 1173 unsigned long p_size = sizeof(struct btrfs_key_ptr);
1173 1174
1174 n = btrfs_header_nritems(eb); 1175 n = btrfs_header_nritems(eb);
1176 tree_mod_log_read_lock(fs_info);
1175 while (tm && tm->seq >= time_seq) { 1177 while (tm && tm->seq >= time_seq) {
1176 /* 1178 /*
1177 * all the operations are recorded with the operator used for 1179 * all the operations are recorded with the operator used for
@@ -1226,6 +1228,7 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
1226 if (tm->index != first_tm->index) 1228 if (tm->index != first_tm->index)
1227 break; 1229 break;
1228 } 1230 }
1231 tree_mod_log_read_unlock(fs_info);
1229 btrfs_set_header_nritems(eb, n); 1232 btrfs_set_header_nritems(eb, n);
1230} 1233}
1231 1234
@@ -1274,7 +1277,7 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
1274 1277
1275 extent_buffer_get(eb_rewin); 1278 extent_buffer_get(eb_rewin);
1276 btrfs_tree_read_lock(eb_rewin); 1279 btrfs_tree_read_lock(eb_rewin);
1277 __tree_mod_log_rewind(eb_rewin, time_seq, tm); 1280 __tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
1278 WARN_ON(btrfs_header_nritems(eb_rewin) > 1281 WARN_ON(btrfs_header_nritems(eb_rewin) >
1279 BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root)); 1282 BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root));
1280 1283
@@ -1350,7 +1353,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
1350 btrfs_set_header_generation(eb, old_generation); 1353 btrfs_set_header_generation(eb, old_generation);
1351 } 1354 }
1352 if (tm) 1355 if (tm)
1353 __tree_mod_log_rewind(eb, time_seq, tm); 1356 __tree_mod_log_rewind(root->fs_info, eb, time_seq, tm);
1354 else 1357 else
1355 WARN_ON(btrfs_header_level(eb) != 0); 1358 WARN_ON(btrfs_header_level(eb) != 0);
1356 WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root)); 1359 WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root));
@@ -2178,12 +2181,8 @@ static void reada_for_search(struct btrfs_root *root,
2178 } 2181 }
2179} 2182}
2180 2183
2181/* 2184static noinline void reada_for_balance(struct btrfs_root *root,
2182 * returns -EAGAIN if it had to drop the path, or zero if everything was in 2185 struct btrfs_path *path, int level)
2183 * cache
2184 */
2185static noinline int reada_for_balance(struct btrfs_root *root,
2186 struct btrfs_path *path, int level)
2187{ 2186{
2188 int slot; 2187 int slot;
2189 int nritems; 2188 int nritems;
@@ -2192,12 +2191,11 @@ static noinline int reada_for_balance(struct btrfs_root *root,
2192 u64 gen; 2191 u64 gen;
2193 u64 block1 = 0; 2192 u64 block1 = 0;
2194 u64 block2 = 0; 2193 u64 block2 = 0;
2195 int ret = 0;
2196 int blocksize; 2194 int blocksize;
2197 2195
2198 parent = path->nodes[level + 1]; 2196 parent = path->nodes[level + 1];
2199 if (!parent) 2197 if (!parent)
2200 return 0; 2198 return;
2201 2199
2202 nritems = btrfs_header_nritems(parent); 2200 nritems = btrfs_header_nritems(parent);
2203 slot = path->slots[level + 1]; 2201 slot = path->slots[level + 1];
@@ -2224,28 +2222,11 @@ static noinline int reada_for_balance(struct btrfs_root *root,
2224 block2 = 0; 2222 block2 = 0;
2225 free_extent_buffer(eb); 2223 free_extent_buffer(eb);
2226 } 2224 }
2227 if (block1 || block2) {
2228 ret = -EAGAIN;
2229
2230 /* release the whole path */
2231 btrfs_release_path(path);
2232
2233 /* read the blocks */
2234 if (block1)
2235 readahead_tree_block(root, block1, blocksize, 0);
2236 if (block2)
2237 readahead_tree_block(root, block2, blocksize, 0);
2238 2225
2239 if (block1) { 2226 if (block1)
2240 eb = read_tree_block(root, block1, blocksize, 0); 2227 readahead_tree_block(root, block1, blocksize, 0);
2241 free_extent_buffer(eb); 2228 if (block2)
2242 } 2229 readahead_tree_block(root, block2, blocksize, 0);
2243 if (block2) {
2244 eb = read_tree_block(root, block2, blocksize, 0);
2245 free_extent_buffer(eb);
2246 }
2247 }
2248 return ret;
2249} 2230}
2250 2231
2251 2232
@@ -2359,35 +2340,28 @@ read_block_for_search(struct btrfs_trans_handle *trans,
2359 tmp = btrfs_find_tree_block(root, blocknr, blocksize); 2340 tmp = btrfs_find_tree_block(root, blocknr, blocksize);
2360 if (tmp) { 2341 if (tmp) {
2361 /* first we do an atomic uptodate check */ 2342 /* first we do an atomic uptodate check */
2362 if (btrfs_buffer_uptodate(tmp, 0, 1) > 0) { 2343 if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
2363 if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) { 2344 *eb_ret = tmp;
2364 /* 2345 return 0;
2365 * we found an up to date block without 2346 }
2366 * sleeping, return
2367 * right away
2368 */
2369 *eb_ret = tmp;
2370 return 0;
2371 }
2372 /* the pages were up to date, but we failed
2373 * the generation number check. Do a full
2374 * read for the generation number that is correct.
2375 * We must do this without dropping locks so
2376 * we can trust our generation number
2377 */
2378 free_extent_buffer(tmp);
2379 btrfs_set_path_blocking(p);
2380 2347
2381 /* now we're allowed to do a blocking uptodate check */ 2348 /* the pages were up to date, but we failed
2382 tmp = read_tree_block(root, blocknr, blocksize, gen); 2349 * the generation number check. Do a full
2383 if (tmp && btrfs_buffer_uptodate(tmp, gen, 0) > 0) { 2350 * read for the generation number that is correct.
2384 *eb_ret = tmp; 2351 * We must do this without dropping locks so
2385 return 0; 2352 * we can trust our generation number
2386 } 2353 */
2387 free_extent_buffer(tmp); 2354 btrfs_set_path_blocking(p);
2388 btrfs_release_path(p); 2355
2389 return -EIO; 2356 /* now we're allowed to do a blocking uptodate check */
2357 ret = btrfs_read_buffer(tmp, gen);
2358 if (!ret) {
2359 *eb_ret = tmp;
2360 return 0;
2390 } 2361 }
2362 free_extent_buffer(tmp);
2363 btrfs_release_path(p);
2364 return -EIO;
2391 } 2365 }
2392 2366
2393 /* 2367 /*
@@ -2448,11 +2422,8 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
2448 goto again; 2422 goto again;
2449 } 2423 }
2450 2424
2451 sret = reada_for_balance(root, p, level);
2452 if (sret)
2453 goto again;
2454
2455 btrfs_set_path_blocking(p); 2425 btrfs_set_path_blocking(p);
2426 reada_for_balance(root, p, level);
2456 sret = split_node(trans, root, p, level); 2427 sret = split_node(trans, root, p, level);
2457 btrfs_clear_path_blocking(p, NULL, 0); 2428 btrfs_clear_path_blocking(p, NULL, 0);
2458 2429
@@ -2472,11 +2443,8 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
2472 goto again; 2443 goto again;
2473 } 2444 }
2474 2445
2475 sret = reada_for_balance(root, p, level);
2476 if (sret)
2477 goto again;
2478
2479 btrfs_set_path_blocking(p); 2446 btrfs_set_path_blocking(p);
2447 reada_for_balance(root, p, level);
2480 sret = balance_level(trans, root, p, level); 2448 sret = balance_level(trans, root, p, level);
2481 btrfs_clear_path_blocking(p, NULL, 0); 2449 btrfs_clear_path_blocking(p, NULL, 0);
2482 2450
@@ -3143,7 +3111,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
3143 */ 3111 */
3144static noinline int insert_new_root(struct btrfs_trans_handle *trans, 3112static noinline int insert_new_root(struct btrfs_trans_handle *trans,
3145 struct btrfs_root *root, 3113 struct btrfs_root *root,
3146 struct btrfs_path *path, int level, int log_removal) 3114 struct btrfs_path *path, int level)
3147{ 3115{
3148 u64 lower_gen; 3116 u64 lower_gen;
3149 struct extent_buffer *lower; 3117 struct extent_buffer *lower;
@@ -3194,7 +3162,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
3194 btrfs_mark_buffer_dirty(c); 3162 btrfs_mark_buffer_dirty(c);
3195 3163
3196 old = root->node; 3164 old = root->node;
3197 tree_mod_log_set_root_pointer(root, c, log_removal); 3165 tree_mod_log_set_root_pointer(root, c, 0);
3198 rcu_assign_pointer(root->node, c); 3166 rcu_assign_pointer(root->node, c);
3199 3167
3200 /* the super has an extra ref to root->node */ 3168 /* the super has an extra ref to root->node */
@@ -3278,14 +3246,14 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
3278 /* 3246 /*
3279 * trying to split the root, lets make a new one 3247 * trying to split the root, lets make a new one
3280 * 3248 *
3281 * tree mod log: We pass 0 as log_removal parameter to 3249 * tree mod log: We don't log_removal old root in
3282 * insert_new_root, because that root buffer will be kept as a 3250 * insert_new_root, because that root buffer will be kept as a
3283 * normal node. We are going to log removal of half of the 3251 * normal node. We are going to log removal of half of the
3284 * elements below with tree_mod_log_eb_copy. We're holding a 3252 * elements below with tree_mod_log_eb_copy. We're holding a
3285 * tree lock on the buffer, which is why we cannot race with 3253 * tree lock on the buffer, which is why we cannot race with
3286 * other tree_mod_log users. 3254 * other tree_mod_log users.
3287 */ 3255 */
3288 ret = insert_new_root(trans, root, path, level + 1, 0); 3256 ret = insert_new_root(trans, root, path, level + 1);
3289 if (ret) 3257 if (ret)
3290 return ret; 3258 return ret;
3291 } else { 3259 } else {
@@ -3986,7 +3954,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
3986 return -EOVERFLOW; 3954 return -EOVERFLOW;
3987 3955
3988 /* first try to make some room by pushing left and right */ 3956 /* first try to make some room by pushing left and right */
3989 if (data_size) { 3957 if (data_size && path->nodes[1]) {
3990 wret = push_leaf_right(trans, root, path, data_size, 3958 wret = push_leaf_right(trans, root, path, data_size,
3991 data_size, 0, 0); 3959 data_size, 0, 0);
3992 if (wret < 0) 3960 if (wret < 0)
@@ -4005,7 +3973,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
4005 } 3973 }
4006 3974
4007 if (!path->nodes[1]) { 3975 if (!path->nodes[1]) {
4008 ret = insert_new_root(trans, root, path, 1, 1); 3976 ret = insert_new_root(trans, root, path, 1);
4009 if (ret) 3977 if (ret)
4010 return ret; 3978 return ret;
4011 } 3979 }
@@ -4430,7 +4398,7 @@ void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
4430} 4398}
4431 4399
4432/* 4400/*
4433 * make the item pointed to by the path bigger, data_size is the new size. 4401 * make the item pointed to by the path bigger, data_size is the added size.
4434 */ 4402 */
4435void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path, 4403void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
4436 u32 data_size) 4404 u32 data_size)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index d6dd49b51ba8..e795bf135e80 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -961,8 +961,8 @@ struct btrfs_dev_replace_item {
961#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4) 961#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4)
962#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5) 962#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5)
963#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6) 963#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6)
964#define BTRFS_BLOCK_GROUP_RAID5 (1 << 7) 964#define BTRFS_BLOCK_GROUP_RAID5 (1ULL << 7)
965#define BTRFS_BLOCK_GROUP_RAID6 (1 << 8) 965#define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8)
966#define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE 966#define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE
967 967
968enum btrfs_raid_types { 968enum btrfs_raid_types {
@@ -1102,6 +1102,18 @@ struct btrfs_space_info {
1102 account */ 1102 account */
1103 1103
1104 /* 1104 /*
1105 * bytes_pinned is kept in line with what is actually pinned, as in
1106 * we've called update_block_group and dropped the bytes_used counter
1107 * and increased the bytes_pinned counter. However this means that
1108 * bytes_pinned does not reflect the bytes that will be pinned once the
1109 * delayed refs are flushed, so this counter is inc'ed everytime we call
1110 * btrfs_free_extent so it is a realtime count of what will be freed
1111 * once the transaction is committed. It will be zero'ed everytime the
1112 * transaction commits.
1113 */
1114 struct percpu_counter total_bytes_pinned;
1115
1116 /*
1105 * we bump reservation progress every time we decrement 1117 * we bump reservation progress every time we decrement
1106 * bytes_reserved. This way people waiting for reservations 1118 * bytes_reserved. This way people waiting for reservations
1107 * know something good has happened and they can check 1119 * know something good has happened and they can check
@@ -1437,25 +1449,22 @@ struct btrfs_fs_info {
1437 atomic_t open_ioctl_trans; 1449 atomic_t open_ioctl_trans;
1438 1450
1439 /* 1451 /*
1440 * this is used by the balancing code to wait for all the pending 1452 * this is used to protect the following list -- ordered_roots.
1441 * ordered extents
1442 */ 1453 */
1443 spinlock_t ordered_extent_lock; 1454 spinlock_t ordered_root_lock;
1444 1455
1445 /* 1456 /*
1446 * all of the data=ordered extents pending writeback 1457 * all fs/file tree roots in which there are data=ordered extents
1458 * pending writeback are added into this list.
1459 *
1447 * these can span multiple transactions and basically include 1460 * these can span multiple transactions and basically include
1448 * every dirty data page that isn't from nodatacow 1461 * every dirty data page that isn't from nodatacow
1449 */ 1462 */
1450 struct list_head ordered_extents; 1463 struct list_head ordered_roots;
1451 1464
1452 spinlock_t delalloc_lock; 1465 spinlock_t delalloc_root_lock;
1453 /* 1466 /* all fs/file tree roots that have delalloc inodes. */
1454 * all of the inodes that have delalloc bytes. It is possible for 1467 struct list_head delalloc_roots;
1455 * this list to be empty even when there is still dirty data=ordered
1456 * extents waiting to finish IO.
1457 */
1458 struct list_head delalloc_inodes;
1459 1468
1460 /* 1469 /*
1461 * there is a pool of worker threads for checksumming during writes 1470 * there is a pool of worker threads for checksumming during writes
@@ -1498,8 +1507,6 @@ struct btrfs_fs_info {
1498 int do_barriers; 1507 int do_barriers;
1499 int closing; 1508 int closing;
1500 int log_root_recovering; 1509 int log_root_recovering;
1501 int enospc_unlink;
1502 int trans_no_join;
1503 1510
1504 u64 total_pinned; 1511 u64 total_pinned;
1505 1512
@@ -1594,6 +1601,12 @@ struct btrfs_fs_info {
1594 struct rb_root qgroup_tree; 1601 struct rb_root qgroup_tree;
1595 spinlock_t qgroup_lock; 1602 spinlock_t qgroup_lock;
1596 1603
1604 /*
1605 * used to avoid frequently calling ulist_alloc()/ulist_free()
1606 * when doing qgroup accounting, it must be protected by qgroup_lock.
1607 */
1608 struct ulist *qgroup_ulist;
1609
1597 /* protect user change for quota operations */ 1610 /* protect user change for quota operations */
1598 struct mutex qgroup_ioctl_lock; 1611 struct mutex qgroup_ioctl_lock;
1599 1612
@@ -1607,6 +1620,8 @@ struct btrfs_fs_info {
1607 struct mutex qgroup_rescan_lock; /* protects the progress item */ 1620 struct mutex qgroup_rescan_lock; /* protects the progress item */
1608 struct btrfs_key qgroup_rescan_progress; 1621 struct btrfs_key qgroup_rescan_progress;
1609 struct btrfs_workers qgroup_rescan_workers; 1622 struct btrfs_workers qgroup_rescan_workers;
1623 struct completion qgroup_rescan_completion;
1624 struct btrfs_work qgroup_rescan_work;
1610 1625
1611 /* filesystem state */ 1626 /* filesystem state */
1612 unsigned long fs_state; 1627 unsigned long fs_state;
@@ -1739,6 +1754,31 @@ struct btrfs_root {
1739 int force_cow; 1754 int force_cow;
1740 1755
1741 spinlock_t root_item_lock; 1756 spinlock_t root_item_lock;
1757 atomic_t refs;
1758
1759 spinlock_t delalloc_lock;
1760 /*
1761 * all of the inodes that have delalloc bytes. It is possible for
1762 * this list to be empty even when there is still dirty data=ordered
1763 * extents waiting to finish IO.
1764 */
1765 struct list_head delalloc_inodes;
1766 struct list_head delalloc_root;
1767 u64 nr_delalloc_inodes;
1768 /*
1769 * this is used by the balancing code to wait for all the pending
1770 * ordered extents
1771 */
1772 spinlock_t ordered_extent_lock;
1773
1774 /*
1775 * all of the data=ordered extents pending writeback
1776 * these can span multiple transactions and basically include
1777 * every dirty data page that isn't from nodatacow
1778 */
1779 struct list_head ordered_extents;
1780 struct list_head ordered_root;
1781 u64 nr_ordered_extents;
1742}; 1782};
1743 1783
1744struct btrfs_ioctl_defrag_range_args { 1784struct btrfs_ioctl_defrag_range_args {
@@ -3028,6 +3068,8 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root,
3028 num_items; 3068 num_items;
3029} 3069}
3030 3070
3071int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
3072 struct btrfs_root *root);
3031void btrfs_put_block_group(struct btrfs_block_group_cache *cache); 3073void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
3032int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 3074int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
3033 struct btrfs_root *root, unsigned long count); 3075 struct btrfs_root *root, unsigned long count);
@@ -3039,6 +3081,8 @@ int btrfs_pin_extent(struct btrfs_root *root,
3039 u64 bytenr, u64 num, int reserved); 3081 u64 bytenr, u64 num, int reserved);
3040int btrfs_pin_extent_for_log_replay(struct btrfs_root *root, 3082int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
3041 u64 bytenr, u64 num_bytes); 3083 u64 bytenr, u64 num_bytes);
3084int btrfs_exclude_logged_extents(struct btrfs_root *root,
3085 struct extent_buffer *eb);
3042int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, 3086int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
3043 struct btrfs_root *root, 3087 struct btrfs_root *root,
3044 u64 objectid, u64 offset, u64 bytenr); 3088 u64 objectid, u64 offset, u64 bytenr);
@@ -3155,6 +3199,9 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
3155int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 3199int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
3156 struct btrfs_block_rsv *dst_rsv, 3200 struct btrfs_block_rsv *dst_rsv,
3157 u64 num_bytes); 3201 u64 num_bytes);
3202int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
3203 struct btrfs_block_rsv *dest, u64 num_bytes,
3204 int min_factor);
3158void btrfs_block_rsv_release(struct btrfs_root *root, 3205void btrfs_block_rsv_release(struct btrfs_root *root,
3159 struct btrfs_block_rsv *block_rsv, 3206 struct btrfs_block_rsv *block_rsv,
3160 u64 num_bytes); 3207 u64 num_bytes);
@@ -3311,6 +3358,18 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
3311 smp_mb(); 3358 smp_mb();
3312 return fs_info->closing; 3359 return fs_info->closing;
3313} 3360}
3361
3362/*
3363 * If we remount the fs to be R/O or umount the fs, the cleaner needn't do
3364 * anything except sleeping. This function is used to check the status of
3365 * the fs.
3366 */
3367static inline int btrfs_need_cleaner_sleep(struct btrfs_root *root)
3368{
3369 return (root->fs_info->sb->s_flags & MS_RDONLY ||
3370 btrfs_fs_closing(root->fs_info));
3371}
3372
3314static inline void free_fs_info(struct btrfs_fs_info *fs_info) 3373static inline void free_fs_info(struct btrfs_fs_info *fs_info)
3315{ 3374{
3316 kfree(fs_info->balance_ctl); 3375 kfree(fs_info->balance_ctl);
@@ -3357,9 +3416,9 @@ int __must_check btrfs_update_root(struct btrfs_trans_handle *trans,
3357 struct btrfs_root_item *item); 3416 struct btrfs_root_item *item);
3358void btrfs_read_root_item(struct extent_buffer *eb, int slot, 3417void btrfs_read_root_item(struct extent_buffer *eb, int slot,
3359 struct btrfs_root_item *item); 3418 struct btrfs_root_item *item);
3360int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct 3419int btrfs_find_root(struct btrfs_root *root, struct btrfs_key *search_key,
3361 btrfs_root_item *item, struct btrfs_key *key); 3420 struct btrfs_path *path, struct btrfs_root_item *root_item,
3362int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid); 3421 struct btrfs_key *root_key);
3363int btrfs_find_orphan_roots(struct btrfs_root *tree_root); 3422int btrfs_find_orphan_roots(struct btrfs_root *tree_root);
3364void btrfs_set_root_node(struct btrfs_root_item *item, 3423void btrfs_set_root_node(struct btrfs_root_item *item,
3365 struct extent_buffer *node); 3424 struct extent_buffer *node);
@@ -3493,6 +3552,10 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work);
3493struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page, 3552struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
3494 size_t pg_offset, u64 start, u64 len, 3553 size_t pg_offset, u64 start, u64 len,
3495 int create); 3554 int create);
3555noinline int can_nocow_extent(struct btrfs_trans_handle *trans,
3556 struct inode *inode, u64 offset, u64 *len,
3557 u64 *orig_start, u64 *orig_block_len,
3558 u64 *ram_bytes);
3496 3559
3497/* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */ 3560/* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */
3498#if defined(ClearPageFsMisc) && !defined(ClearPageChecked) 3561#if defined(ClearPageFsMisc) && !defined(ClearPageChecked)
@@ -3530,6 +3593,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
3530 u32 min_type); 3593 u32 min_type);
3531 3594
3532int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); 3595int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
3596int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info,
3597 int delay_iput);
3533int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, 3598int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
3534 struct extent_state **cached_state); 3599 struct extent_state **cached_state);
3535int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, 3600int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
@@ -3814,6 +3879,8 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans,
3814int btrfs_quota_disable(struct btrfs_trans_handle *trans, 3879int btrfs_quota_disable(struct btrfs_trans_handle *trans,
3815 struct btrfs_fs_info *fs_info); 3880 struct btrfs_fs_info *fs_info);
3816int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info); 3881int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info);
3882void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info);
3883int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info);
3817int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, 3884int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
3818 struct btrfs_fs_info *fs_info, u64 src, u64 dst); 3885 struct btrfs_fs_info *fs_info, u64 src, u64 dst);
3819int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, 3886int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index eb34438ddedb..375510913fe7 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -535,20 +535,6 @@ static struct btrfs_delayed_item *__btrfs_next_delayed_item(
535 return next; 535 return next;
536} 536}
537 537
538static inline struct btrfs_root *btrfs_get_fs_root(struct btrfs_root *root,
539 u64 root_id)
540{
541 struct btrfs_key root_key;
542
543 if (root->objectid == root_id)
544 return root;
545
546 root_key.objectid = root_id;
547 root_key.type = BTRFS_ROOT_ITEM_KEY;
548 root_key.offset = (u64)-1;
549 return btrfs_read_fs_root_no_name(root->fs_info, &root_key);
550}
551
552static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, 538static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
553 struct btrfs_root *root, 539 struct btrfs_root *root,
554 struct btrfs_delayed_item *item) 540 struct btrfs_delayed_item *item)
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 65241f32d3f8..4253ad580e39 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -400,7 +400,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
400 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; 400 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
401 btrfs_dev_replace_unlock(dev_replace); 401 btrfs_dev_replace_unlock(dev_replace);
402 402
403 btrfs_wait_ordered_extents(root, 0); 403 btrfs_wait_all_ordered_extents(root->fs_info, 0);
404 404
405 /* force writing the updated state information to disk */ 405 /* force writing the updated state information to disk */
406 trans = btrfs_start_transaction(root, 0); 406 trans = btrfs_start_transaction(root, 0);
@@ -470,12 +470,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
470 * flush all outstanding I/O and inode extent mappings before the 470 * flush all outstanding I/O and inode extent mappings before the
471 * copy operation is declared as being finished 471 * copy operation is declared as being finished
472 */ 472 */
473 ret = btrfs_start_delalloc_inodes(root, 0); 473 ret = btrfs_start_all_delalloc_inodes(root->fs_info, 0);
474 if (ret) { 474 if (ret) {
475 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 475 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
476 return ret; 476 return ret;
477 } 477 }
478 btrfs_wait_ordered_extents(root, 0); 478 btrfs_wait_all_ordered_extents(root->fs_info, 0);
479 479
480 trans = btrfs_start_transaction(root, 0); 480 trans = btrfs_start_transaction(root, 0);
481 if (IS_ERR(trans)) { 481 if (IS_ERR(trans)) {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b0292b3ead54..6b092a1c4e37 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1192,6 +1192,8 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1192 root->objectid = objectid; 1192 root->objectid = objectid;
1193 root->last_trans = 0; 1193 root->last_trans = 0;
1194 root->highest_objectid = 0; 1194 root->highest_objectid = 0;
1195 root->nr_delalloc_inodes = 0;
1196 root->nr_ordered_extents = 0;
1195 root->name = NULL; 1197 root->name = NULL;
1196 root->inode_tree = RB_ROOT; 1198 root->inode_tree = RB_ROOT;
1197 INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC); 1199 INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
@@ -1200,10 +1202,16 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1200 1202
1201 INIT_LIST_HEAD(&root->dirty_list); 1203 INIT_LIST_HEAD(&root->dirty_list);
1202 INIT_LIST_HEAD(&root->root_list); 1204 INIT_LIST_HEAD(&root->root_list);
1205 INIT_LIST_HEAD(&root->delalloc_inodes);
1206 INIT_LIST_HEAD(&root->delalloc_root);
1207 INIT_LIST_HEAD(&root->ordered_extents);
1208 INIT_LIST_HEAD(&root->ordered_root);
1203 INIT_LIST_HEAD(&root->logged_list[0]); 1209 INIT_LIST_HEAD(&root->logged_list[0]);
1204 INIT_LIST_HEAD(&root->logged_list[1]); 1210 INIT_LIST_HEAD(&root->logged_list[1]);
1205 spin_lock_init(&root->orphan_lock); 1211 spin_lock_init(&root->orphan_lock);
1206 spin_lock_init(&root->inode_lock); 1212 spin_lock_init(&root->inode_lock);
1213 spin_lock_init(&root->delalloc_lock);
1214 spin_lock_init(&root->ordered_extent_lock);
1207 spin_lock_init(&root->accounting_lock); 1215 spin_lock_init(&root->accounting_lock);
1208 spin_lock_init(&root->log_extents_lock[0]); 1216 spin_lock_init(&root->log_extents_lock[0]);
1209 spin_lock_init(&root->log_extents_lock[1]); 1217 spin_lock_init(&root->log_extents_lock[1]);
@@ -1217,6 +1225,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1217 atomic_set(&root->log_writers, 0); 1225 atomic_set(&root->log_writers, 0);
1218 atomic_set(&root->log_batch, 0); 1226 atomic_set(&root->log_batch, 0);
1219 atomic_set(&root->orphan_inodes, 0); 1227 atomic_set(&root->orphan_inodes, 0);
1228 atomic_set(&root->refs, 1);
1220 root->log_transid = 0; 1229 root->log_transid = 0;
1221 root->last_log_commit = 0; 1230 root->last_log_commit = 0;
1222 extent_io_tree_init(&root->dirty_log_pages, 1231 extent_io_tree_init(&root->dirty_log_pages,
@@ -1235,39 +1244,6 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1235 spin_lock_init(&root->root_item_lock); 1244 spin_lock_init(&root->root_item_lock);
1236} 1245}
1237 1246
1238static int __must_check find_and_setup_root(struct btrfs_root *tree_root,
1239 struct btrfs_fs_info *fs_info,
1240 u64 objectid,
1241 struct btrfs_root *root)
1242{
1243 int ret;
1244 u32 blocksize;
1245 u64 generation;
1246
1247 __setup_root(tree_root->nodesize, tree_root->leafsize,
1248 tree_root->sectorsize, tree_root->stripesize,
1249 root, fs_info, objectid);
1250 ret = btrfs_find_last_root(tree_root, objectid,
1251 &root->root_item, &root->root_key);
1252 if (ret > 0)
1253 return -ENOENT;
1254 else if (ret < 0)
1255 return ret;
1256
1257 generation = btrfs_root_generation(&root->root_item);
1258 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
1259 root->commit_root = NULL;
1260 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
1261 blocksize, generation);
1262 if (!root->node || !btrfs_buffer_uptodate(root->node, generation, 0)) {
1263 free_extent_buffer(root->node);
1264 root->node = NULL;
1265 return -EIO;
1266 }
1267 root->commit_root = btrfs_root_node(root);
1268 return 0;
1269}
1270
1271static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info) 1247static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info)
1272{ 1248{
1273 struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS); 1249 struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS);
@@ -1452,70 +1428,73 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
1452 return 0; 1428 return 0;
1453} 1429}
1454 1430
1455struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, 1431struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
1456 struct btrfs_key *location) 1432 struct btrfs_key *key)
1457{ 1433{
1458 struct btrfs_root *root; 1434 struct btrfs_root *root;
1459 struct btrfs_fs_info *fs_info = tree_root->fs_info; 1435 struct btrfs_fs_info *fs_info = tree_root->fs_info;
1460 struct btrfs_path *path; 1436 struct btrfs_path *path;
1461 struct extent_buffer *l;
1462 u64 generation; 1437 u64 generation;
1463 u32 blocksize; 1438 u32 blocksize;
1464 int ret = 0; 1439 int ret;
1465 int slot;
1466 1440
1467 root = btrfs_alloc_root(fs_info); 1441 path = btrfs_alloc_path();
1468 if (!root) 1442 if (!path)
1469 return ERR_PTR(-ENOMEM); 1443 return ERR_PTR(-ENOMEM);
1470 if (location->offset == (u64)-1) { 1444
1471 ret = find_and_setup_root(tree_root, fs_info, 1445 root = btrfs_alloc_root(fs_info);
1472 location->objectid, root); 1446 if (!root) {
1473 if (ret) { 1447 ret = -ENOMEM;
1474 kfree(root); 1448 goto alloc_fail;
1475 return ERR_PTR(ret);
1476 }
1477 goto out;
1478 } 1449 }
1479 1450
1480 __setup_root(tree_root->nodesize, tree_root->leafsize, 1451 __setup_root(tree_root->nodesize, tree_root->leafsize,
1481 tree_root->sectorsize, tree_root->stripesize, 1452 tree_root->sectorsize, tree_root->stripesize,
1482 root, fs_info, location->objectid); 1453 root, fs_info, key->objectid);
1483 1454
1484 path = btrfs_alloc_path(); 1455 ret = btrfs_find_root(tree_root, key, path,
1485 if (!path) { 1456 &root->root_item, &root->root_key);
1486 kfree(root);
1487 return ERR_PTR(-ENOMEM);
1488 }
1489 ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
1490 if (ret == 0) {
1491 l = path->nodes[0];
1492 slot = path->slots[0];
1493 btrfs_read_root_item(l, slot, &root->root_item);
1494 memcpy(&root->root_key, location, sizeof(*location));
1495 }
1496 btrfs_free_path(path);
1497 if (ret) { 1457 if (ret) {
1498 kfree(root);
1499 if (ret > 0) 1458 if (ret > 0)
1500 ret = -ENOENT; 1459 ret = -ENOENT;
1501 return ERR_PTR(ret); 1460 goto find_fail;
1502 } 1461 }
1503 1462
1504 generation = btrfs_root_generation(&root->root_item); 1463 generation = btrfs_root_generation(&root->root_item);
1505 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); 1464 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
1506 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), 1465 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
1507 blocksize, generation); 1466 blocksize, generation);
1508 if (!root->node || !extent_buffer_uptodate(root->node)) { 1467 if (!root->node) {
1509 ret = (!root->node) ? -ENOMEM : -EIO; 1468 ret = -ENOMEM;
1510 1469 goto find_fail;
1511 free_extent_buffer(root->node); 1470 } else if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
1512 kfree(root); 1471 ret = -EIO;
1513 return ERR_PTR(ret); 1472 goto read_fail;
1514 } 1473 }
1515
1516 root->commit_root = btrfs_root_node(root); 1474 root->commit_root = btrfs_root_node(root);
1517out: 1475out:
1518 if (location->objectid != BTRFS_TREE_LOG_OBJECTID) { 1476 btrfs_free_path(path);
1477 return root;
1478
1479read_fail:
1480 free_extent_buffer(root->node);
1481find_fail:
1482 kfree(root);
1483alloc_fail:
1484 root = ERR_PTR(ret);
1485 goto out;
1486}
1487
1488struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
1489 struct btrfs_key *location)
1490{
1491 struct btrfs_root *root;
1492
1493 root = btrfs_read_tree_root(tree_root, location);
1494 if (IS_ERR(root))
1495 return root;
1496
1497 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
1519 root->ref_cows = 1; 1498 root->ref_cows = 1;
1520 btrfs_check_and_init_root_item(&root->root_item); 1499 btrfs_check_and_init_root_item(&root->root_item);
1521 } 1500 }
@@ -1523,6 +1502,66 @@ out:
1523 return root; 1502 return root;
1524} 1503}
1525 1504
1505int btrfs_init_fs_root(struct btrfs_root *root)
1506{
1507 int ret;
1508
1509 root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
1510 root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
1511 GFP_NOFS);
1512 if (!root->free_ino_pinned || !root->free_ino_ctl) {
1513 ret = -ENOMEM;
1514 goto fail;
1515 }
1516
1517 btrfs_init_free_ino_ctl(root);
1518 mutex_init(&root->fs_commit_mutex);
1519 spin_lock_init(&root->cache_lock);
1520 init_waitqueue_head(&root->cache_wait);
1521
1522 ret = get_anon_bdev(&root->anon_dev);
1523 if (ret)
1524 goto fail;
1525 return 0;
1526fail:
1527 kfree(root->free_ino_ctl);
1528 kfree(root->free_ino_pinned);
1529 return ret;
1530}
1531
1532struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
1533 u64 root_id)
1534{
1535 struct btrfs_root *root;
1536
1537 spin_lock(&fs_info->fs_roots_radix_lock);
1538 root = radix_tree_lookup(&fs_info->fs_roots_radix,
1539 (unsigned long)root_id);
1540 spin_unlock(&fs_info->fs_roots_radix_lock);
1541 return root;
1542}
1543
1544int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
1545 struct btrfs_root *root)
1546{
1547 int ret;
1548
1549 ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
1550 if (ret)
1551 return ret;
1552
1553 spin_lock(&fs_info->fs_roots_radix_lock);
1554 ret = radix_tree_insert(&fs_info->fs_roots_radix,
1555 (unsigned long)root->root_key.objectid,
1556 root);
1557 if (ret == 0)
1558 root->in_radix = 1;
1559 spin_unlock(&fs_info->fs_roots_radix_lock);
1560 radix_tree_preload_end();
1561
1562 return ret;
1563}
1564
1526struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, 1565struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
1527 struct btrfs_key *location) 1566 struct btrfs_key *location)
1528{ 1567{
@@ -1543,58 +1582,30 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
1543 return fs_info->quota_root ? fs_info->quota_root : 1582 return fs_info->quota_root ? fs_info->quota_root :
1544 ERR_PTR(-ENOENT); 1583 ERR_PTR(-ENOENT);
1545again: 1584again:
1546 spin_lock(&fs_info->fs_roots_radix_lock); 1585 root = btrfs_lookup_fs_root(fs_info, location->objectid);
1547 root = radix_tree_lookup(&fs_info->fs_roots_radix,
1548 (unsigned long)location->objectid);
1549 spin_unlock(&fs_info->fs_roots_radix_lock);
1550 if (root) 1586 if (root)
1551 return root; 1587 return root;
1552 1588
1553 root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location); 1589 root = btrfs_read_fs_root(fs_info->tree_root, location);
1554 if (IS_ERR(root)) 1590 if (IS_ERR(root))
1555 return root; 1591 return root;
1556 1592
1557 root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS); 1593 if (btrfs_root_refs(&root->root_item) == 0) {
1558 root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned), 1594 ret = -ENOENT;
1559 GFP_NOFS);
1560 if (!root->free_ino_pinned || !root->free_ino_ctl) {
1561 ret = -ENOMEM;
1562 goto fail; 1595 goto fail;
1563 } 1596 }
1564 1597
1565 btrfs_init_free_ino_ctl(root); 1598 ret = btrfs_init_fs_root(root);
1566 mutex_init(&root->fs_commit_mutex);
1567 spin_lock_init(&root->cache_lock);
1568 init_waitqueue_head(&root->cache_wait);
1569
1570 ret = get_anon_bdev(&root->anon_dev);
1571 if (ret) 1599 if (ret)
1572 goto fail; 1600 goto fail;
1573 1601
1574 if (btrfs_root_refs(&root->root_item) == 0) {
1575 ret = -ENOENT;
1576 goto fail;
1577 }
1578
1579 ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid); 1602 ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
1580 if (ret < 0) 1603 if (ret < 0)
1581 goto fail; 1604 goto fail;
1582 if (ret == 0) 1605 if (ret == 0)
1583 root->orphan_item_inserted = 1; 1606 root->orphan_item_inserted = 1;
1584 1607
1585 ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); 1608 ret = btrfs_insert_fs_root(fs_info, root);
1586 if (ret)
1587 goto fail;
1588
1589 spin_lock(&fs_info->fs_roots_radix_lock);
1590 ret = radix_tree_insert(&fs_info->fs_roots_radix,
1591 (unsigned long)root->root_key.objectid,
1592 root);
1593 if (ret == 0)
1594 root->in_radix = 1;
1595
1596 spin_unlock(&fs_info->fs_roots_radix_lock);
1597 radix_tree_preload_end();
1598 if (ret) { 1609 if (ret) {
1599 if (ret == -EEXIST) { 1610 if (ret == -EEXIST) {
1600 free_fs_root(root); 1611 free_fs_root(root);
@@ -1602,10 +1613,6 @@ again:
1602 } 1613 }
1603 goto fail; 1614 goto fail;
1604 } 1615 }
1605
1606 ret = btrfs_find_dead_roots(fs_info->tree_root,
1607 root->root_key.objectid);
1608 WARN_ON(ret);
1609 return root; 1616 return root;
1610fail: 1617fail:
1611 free_fs_root(root); 1618 free_fs_root(root);
@@ -1677,21 +1684,37 @@ static void end_workqueue_fn(struct btrfs_work *work)
1677static int cleaner_kthread(void *arg) 1684static int cleaner_kthread(void *arg)
1678{ 1685{
1679 struct btrfs_root *root = arg; 1686 struct btrfs_root *root = arg;
1687 int again;
1680 1688
1681 do { 1689 do {
1682 int again = 0; 1690 again = 0;
1683 1691
1684 if (!(root->fs_info->sb->s_flags & MS_RDONLY) && 1692 /* Make the cleaner go to sleep early. */
1685 down_read_trylock(&root->fs_info->sb->s_umount)) { 1693 if (btrfs_need_cleaner_sleep(root))
1686 if (mutex_trylock(&root->fs_info->cleaner_mutex)) { 1694 goto sleep;
1687 btrfs_run_delayed_iputs(root); 1695
1688 again = btrfs_clean_one_deleted_snapshot(root); 1696 if (!mutex_trylock(&root->fs_info->cleaner_mutex))
1689 mutex_unlock(&root->fs_info->cleaner_mutex); 1697 goto sleep;
1690 } 1698
1691 btrfs_run_defrag_inodes(root->fs_info); 1699 /*
1692 up_read(&root->fs_info->sb->s_umount); 1700 * Avoid the problem that we change the status of the fs
1701 * during the above check and trylock.
1702 */
1703 if (btrfs_need_cleaner_sleep(root)) {
1704 mutex_unlock(&root->fs_info->cleaner_mutex);
1705 goto sleep;
1693 } 1706 }
1694 1707
1708 btrfs_run_delayed_iputs(root);
1709 again = btrfs_clean_one_deleted_snapshot(root);
1710 mutex_unlock(&root->fs_info->cleaner_mutex);
1711
1712 /*
1713 * The defragger has dealt with the R/O remount and umount,
1714 * needn't do anything special here.
1715 */
1716 btrfs_run_defrag_inodes(root->fs_info);
1717sleep:
1695 if (!try_to_freeze() && !again) { 1718 if (!try_to_freeze() && !again) {
1696 set_current_state(TASK_INTERRUPTIBLE); 1719 set_current_state(TASK_INTERRUPTIBLE);
1697 if (!kthread_should_stop()) 1720 if (!kthread_should_stop())
@@ -1725,7 +1748,7 @@ static int transaction_kthread(void *arg)
1725 } 1748 }
1726 1749
1727 now = get_seconds(); 1750 now = get_seconds();
1728 if (!cur->blocked && 1751 if (cur->state < TRANS_STATE_BLOCKED &&
1729 (now < cur->start_time || now - cur->start_time < 30)) { 1752 (now < cur->start_time || now - cur->start_time < 30)) {
1730 spin_unlock(&root->fs_info->trans_lock); 1753 spin_unlock(&root->fs_info->trans_lock);
1731 delay = HZ * 5; 1754 delay = HZ * 5;
@@ -2035,11 +2058,11 @@ static void del_fs_roots(struct btrfs_fs_info *fs_info)
2035 list_del(&gang[0]->root_list); 2058 list_del(&gang[0]->root_list);
2036 2059
2037 if (gang[0]->in_radix) { 2060 if (gang[0]->in_radix) {
2038 btrfs_free_fs_root(fs_info, gang[0]); 2061 btrfs_drop_and_free_fs_root(fs_info, gang[0]);
2039 } else { 2062 } else {
2040 free_extent_buffer(gang[0]->node); 2063 free_extent_buffer(gang[0]->node);
2041 free_extent_buffer(gang[0]->commit_root); 2064 free_extent_buffer(gang[0]->commit_root);
2042 kfree(gang[0]); 2065 btrfs_put_fs_root(gang[0]);
2043 } 2066 }
2044 } 2067 }
2045 2068
@@ -2050,7 +2073,7 @@ static void del_fs_roots(struct btrfs_fs_info *fs_info)
2050 if (!ret) 2073 if (!ret)
2051 break; 2074 break;
2052 for (i = 0; i < ret; i++) 2075 for (i = 0; i < ret; i++)
2053 btrfs_free_fs_root(fs_info, gang[i]); 2076 btrfs_drop_and_free_fs_root(fs_info, gang[i]);
2054 } 2077 }
2055} 2078}
2056 2079
@@ -2082,14 +2105,8 @@ int open_ctree(struct super_block *sb,
2082 int backup_index = 0; 2105 int backup_index = 0;
2083 2106
2084 tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info); 2107 tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info);
2085 extent_root = fs_info->extent_root = btrfs_alloc_root(fs_info);
2086 csum_root = fs_info->csum_root = btrfs_alloc_root(fs_info);
2087 chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info); 2108 chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info);
2088 dev_root = fs_info->dev_root = btrfs_alloc_root(fs_info); 2109 if (!tree_root || !chunk_root) {
2089 quota_root = fs_info->quota_root = btrfs_alloc_root(fs_info);
2090
2091 if (!tree_root || !extent_root || !csum_root ||
2092 !chunk_root || !dev_root || !quota_root) {
2093 err = -ENOMEM; 2110 err = -ENOMEM;
2094 goto fail; 2111 goto fail;
2095 } 2112 }
@@ -2132,9 +2149,9 @@ int open_ctree(struct super_block *sb,
2132 INIT_LIST_HEAD(&fs_info->trans_list); 2149 INIT_LIST_HEAD(&fs_info->trans_list);
2133 INIT_LIST_HEAD(&fs_info->dead_roots); 2150 INIT_LIST_HEAD(&fs_info->dead_roots);
2134 INIT_LIST_HEAD(&fs_info->delayed_iputs); 2151 INIT_LIST_HEAD(&fs_info->delayed_iputs);
2135 INIT_LIST_HEAD(&fs_info->delalloc_inodes); 2152 INIT_LIST_HEAD(&fs_info->delalloc_roots);
2136 INIT_LIST_HEAD(&fs_info->caching_block_groups); 2153 INIT_LIST_HEAD(&fs_info->caching_block_groups);
2137 spin_lock_init(&fs_info->delalloc_lock); 2154 spin_lock_init(&fs_info->delalloc_root_lock);
2138 spin_lock_init(&fs_info->trans_lock); 2155 spin_lock_init(&fs_info->trans_lock);
2139 spin_lock_init(&fs_info->fs_roots_radix_lock); 2156 spin_lock_init(&fs_info->fs_roots_radix_lock);
2140 spin_lock_init(&fs_info->delayed_iput_lock); 2157 spin_lock_init(&fs_info->delayed_iput_lock);
@@ -2170,7 +2187,6 @@ int open_ctree(struct super_block *sb,
2170 fs_info->max_inline = 8192 * 1024; 2187 fs_info->max_inline = 8192 * 1024;
2171 fs_info->metadata_ratio = 0; 2188 fs_info->metadata_ratio = 0;
2172 fs_info->defrag_inodes = RB_ROOT; 2189 fs_info->defrag_inodes = RB_ROOT;
2173 fs_info->trans_no_join = 0;
2174 fs_info->free_chunk_space = 0; 2190 fs_info->free_chunk_space = 0;
2175 fs_info->tree_mod_log = RB_ROOT; 2191 fs_info->tree_mod_log = RB_ROOT;
2176 2192
@@ -2181,8 +2197,8 @@ int open_ctree(struct super_block *sb,
2181 fs_info->thread_pool_size = min_t(unsigned long, 2197 fs_info->thread_pool_size = min_t(unsigned long,
2182 num_online_cpus() + 2, 8); 2198 num_online_cpus() + 2, 8);
2183 2199
2184 INIT_LIST_HEAD(&fs_info->ordered_extents); 2200 INIT_LIST_HEAD(&fs_info->ordered_roots);
2185 spin_lock_init(&fs_info->ordered_extent_lock); 2201 spin_lock_init(&fs_info->ordered_root_lock);
2186 fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root), 2202 fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
2187 GFP_NOFS); 2203 GFP_NOFS);
2188 if (!fs_info->delayed_root) { 2204 if (!fs_info->delayed_root) {
@@ -2275,6 +2291,7 @@ int open_ctree(struct super_block *sb,
2275 fs_info->qgroup_seq = 1; 2291 fs_info->qgroup_seq = 1;
2276 fs_info->quota_enabled = 0; 2292 fs_info->quota_enabled = 0;
2277 fs_info->pending_quota_state = 0; 2293 fs_info->pending_quota_state = 0;
2294 fs_info->qgroup_ulist = NULL;
2278 mutex_init(&fs_info->qgroup_rescan_lock); 2295 mutex_init(&fs_info->qgroup_rescan_lock);
2279 2296
2280 btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); 2297 btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
@@ -2639,33 +2656,44 @@ retry_root_backup:
2639 btrfs_set_root_node(&tree_root->root_item, tree_root->node); 2656 btrfs_set_root_node(&tree_root->root_item, tree_root->node);
2640 tree_root->commit_root = btrfs_root_node(tree_root); 2657 tree_root->commit_root = btrfs_root_node(tree_root);
2641 2658
2642 ret = find_and_setup_root(tree_root, fs_info, 2659 location.objectid = BTRFS_EXTENT_TREE_OBJECTID;
2643 BTRFS_EXTENT_TREE_OBJECTID, extent_root); 2660 location.type = BTRFS_ROOT_ITEM_KEY;
2644 if (ret) 2661 location.offset = 0;
2662
2663 extent_root = btrfs_read_tree_root(tree_root, &location);
2664 if (IS_ERR(extent_root)) {
2665 ret = PTR_ERR(extent_root);
2645 goto recovery_tree_root; 2666 goto recovery_tree_root;
2667 }
2646 extent_root->track_dirty = 1; 2668 extent_root->track_dirty = 1;
2669 fs_info->extent_root = extent_root;
2647 2670
2648 ret = find_and_setup_root(tree_root, fs_info, 2671 location.objectid = BTRFS_DEV_TREE_OBJECTID;
2649 BTRFS_DEV_TREE_OBJECTID, dev_root); 2672 dev_root = btrfs_read_tree_root(tree_root, &location);
2650 if (ret) 2673 if (IS_ERR(dev_root)) {
2674 ret = PTR_ERR(dev_root);
2651 goto recovery_tree_root; 2675 goto recovery_tree_root;
2676 }
2652 dev_root->track_dirty = 1; 2677 dev_root->track_dirty = 1;
2678 fs_info->dev_root = dev_root;
2679 btrfs_init_devices_late(fs_info);
2653 2680
2654 ret = find_and_setup_root(tree_root, fs_info, 2681 location.objectid = BTRFS_CSUM_TREE_OBJECTID;
2655 BTRFS_CSUM_TREE_OBJECTID, csum_root); 2682 csum_root = btrfs_read_tree_root(tree_root, &location);
2656 if (ret) 2683 if (IS_ERR(csum_root)) {
2684 ret = PTR_ERR(csum_root);
2657 goto recovery_tree_root; 2685 goto recovery_tree_root;
2686 }
2658 csum_root->track_dirty = 1; 2687 csum_root->track_dirty = 1;
2688 fs_info->csum_root = csum_root;
2659 2689
2660 ret = find_and_setup_root(tree_root, fs_info, 2690 location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
2661 BTRFS_QUOTA_TREE_OBJECTID, quota_root); 2691 quota_root = btrfs_read_tree_root(tree_root, &location);
2662 if (ret) { 2692 if (!IS_ERR(quota_root)) {
2663 kfree(quota_root);
2664 quota_root = fs_info->quota_root = NULL;
2665 } else {
2666 quota_root->track_dirty = 1; 2693 quota_root->track_dirty = 1;
2667 fs_info->quota_enabled = 1; 2694 fs_info->quota_enabled = 1;
2668 fs_info->pending_quota_state = 1; 2695 fs_info->pending_quota_state = 1;
2696 fs_info->quota_root = quota_root;
2669 } 2697 }
2670 2698
2671 fs_info->generation = generation; 2699 fs_info->generation = generation;
@@ -2818,11 +2846,9 @@ retry_root_backup:
2818 2846
2819 location.objectid = BTRFS_FS_TREE_OBJECTID; 2847 location.objectid = BTRFS_FS_TREE_OBJECTID;
2820 location.type = BTRFS_ROOT_ITEM_KEY; 2848 location.type = BTRFS_ROOT_ITEM_KEY;
2821 location.offset = (u64)-1; 2849 location.offset = 0;
2822 2850
2823 fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location); 2851 fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
2824 if (!fs_info->fs_root)
2825 goto fail_qgroup;
2826 if (IS_ERR(fs_info->fs_root)) { 2852 if (IS_ERR(fs_info->fs_root)) {
2827 err = PTR_ERR(fs_info->fs_root); 2853 err = PTR_ERR(fs_info->fs_root);
2828 goto fail_qgroup; 2854 goto fail_qgroup;
@@ -2854,6 +2880,8 @@ retry_root_backup:
2854 return ret; 2880 return ret;
2855 } 2881 }
2856 2882
2883 btrfs_qgroup_rescan_resume(fs_info);
2884
2857 return 0; 2885 return 0;
2858 2886
2859fail_qgroup: 2887fail_qgroup:
@@ -3259,7 +3287,7 @@ int btrfs_calc_num_tolerated_disk_barrier_failures(
3259 BTRFS_BLOCK_GROUP_RAID10)) { 3287 BTRFS_BLOCK_GROUP_RAID10)) {
3260 num_tolerated_disk_barrier_failures = 1; 3288 num_tolerated_disk_barrier_failures = 1;
3261 } else if (flags & 3289 } else if (flags &
3262 BTRFS_BLOCK_GROUP_RAID5) { 3290 BTRFS_BLOCK_GROUP_RAID6) {
3263 num_tolerated_disk_barrier_failures = 2; 3291 num_tolerated_disk_barrier_failures = 2;
3264 } 3292 }
3265 } 3293 }
@@ -3367,7 +3395,9 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
3367 return ret; 3395 return ret;
3368} 3396}
3369 3397
3370void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) 3398/* Drop a fs root from the radix tree and free it. */
3399void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
3400 struct btrfs_root *root)
3371{ 3401{
3372 spin_lock(&fs_info->fs_roots_radix_lock); 3402 spin_lock(&fs_info->fs_roots_radix_lock);
3373 radix_tree_delete(&fs_info->fs_roots_radix, 3403 radix_tree_delete(&fs_info->fs_roots_radix,
@@ -3398,7 +3428,12 @@ static void free_fs_root(struct btrfs_root *root)
3398 kfree(root->free_ino_ctl); 3428 kfree(root->free_ino_ctl);
3399 kfree(root->free_ino_pinned); 3429 kfree(root->free_ino_pinned);
3400 kfree(root->name); 3430 kfree(root->name);
3401 kfree(root); 3431 btrfs_put_fs_root(root);
3432}
3433
3434void btrfs_free_fs_root(struct btrfs_root *root)
3435{
3436 free_fs_root(root);
3402} 3437}
3403 3438
3404int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) 3439int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
@@ -3654,7 +3689,7 @@ static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
3654 INIT_LIST_HEAD(&splice); 3689 INIT_LIST_HEAD(&splice);
3655 3690
3656 mutex_lock(&root->fs_info->ordered_operations_mutex); 3691 mutex_lock(&root->fs_info->ordered_operations_mutex);
3657 spin_lock(&root->fs_info->ordered_extent_lock); 3692 spin_lock(&root->fs_info->ordered_root_lock);
3658 3693
3659 list_splice_init(&t->ordered_operations, &splice); 3694 list_splice_init(&t->ordered_operations, &splice);
3660 while (!list_empty(&splice)) { 3695 while (!list_empty(&splice)) {
@@ -3662,14 +3697,14 @@ static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
3662 ordered_operations); 3697 ordered_operations);
3663 3698
3664 list_del_init(&btrfs_inode->ordered_operations); 3699 list_del_init(&btrfs_inode->ordered_operations);
3665 spin_unlock(&root->fs_info->ordered_extent_lock); 3700 spin_unlock(&root->fs_info->ordered_root_lock);
3666 3701
3667 btrfs_invalidate_inodes(btrfs_inode->root); 3702 btrfs_invalidate_inodes(btrfs_inode->root);
3668 3703
3669 spin_lock(&root->fs_info->ordered_extent_lock); 3704 spin_lock(&root->fs_info->ordered_root_lock);
3670 } 3705 }
3671 3706
3672 spin_unlock(&root->fs_info->ordered_extent_lock); 3707 spin_unlock(&root->fs_info->ordered_root_lock);
3673 mutex_unlock(&root->fs_info->ordered_operations_mutex); 3708 mutex_unlock(&root->fs_info->ordered_operations_mutex);
3674} 3709}
3675 3710
@@ -3677,15 +3712,36 @@ static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
3677{ 3712{
3678 struct btrfs_ordered_extent *ordered; 3713 struct btrfs_ordered_extent *ordered;
3679 3714
3680 spin_lock(&root->fs_info->ordered_extent_lock); 3715 spin_lock(&root->ordered_extent_lock);
3681 /* 3716 /*
3682 * This will just short circuit the ordered completion stuff which will 3717 * This will just short circuit the ordered completion stuff which will
3683 * make sure the ordered extent gets properly cleaned up. 3718 * make sure the ordered extent gets properly cleaned up.
3684 */ 3719 */
3685 list_for_each_entry(ordered, &root->fs_info->ordered_extents, 3720 list_for_each_entry(ordered, &root->ordered_extents,
3686 root_extent_list) 3721 root_extent_list)
3687 set_bit(BTRFS_ORDERED_IOERR, &ordered->flags); 3722 set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
3688 spin_unlock(&root->fs_info->ordered_extent_lock); 3723 spin_unlock(&root->ordered_extent_lock);
3724}
3725
3726static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
3727{
3728 struct btrfs_root *root;
3729 struct list_head splice;
3730
3731 INIT_LIST_HEAD(&splice);
3732
3733 spin_lock(&fs_info->ordered_root_lock);
3734 list_splice_init(&fs_info->ordered_roots, &splice);
3735 while (!list_empty(&splice)) {
3736 root = list_first_entry(&splice, struct btrfs_root,
3737 ordered_root);
3738 list_del_init(&root->ordered_root);
3739
3740 btrfs_destroy_ordered_extents(root);
3741
3742 cond_resched_lock(&fs_info->ordered_root_lock);
3743 }
3744 spin_unlock(&fs_info->ordered_root_lock);
3689} 3745}
3690 3746
3691int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, 3747int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
@@ -3707,6 +3763,7 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
3707 3763
3708 while ((node = rb_first(&delayed_refs->root)) != NULL) { 3764 while ((node = rb_first(&delayed_refs->root)) != NULL) {
3709 struct btrfs_delayed_ref_head *head = NULL; 3765 struct btrfs_delayed_ref_head *head = NULL;
3766 bool pin_bytes = false;
3710 3767
3711 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); 3768 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
3712 atomic_set(&ref->refs, 1); 3769 atomic_set(&ref->refs, 1);
@@ -3727,8 +3784,7 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
3727 } 3784 }
3728 3785
3729 if (head->must_insert_reserved) 3786 if (head->must_insert_reserved)
3730 btrfs_pin_extent(root, ref->bytenr, 3787 pin_bytes = true;
3731 ref->num_bytes, 1);
3732 btrfs_free_delayed_extent_op(head->extent_op); 3788 btrfs_free_delayed_extent_op(head->extent_op);
3733 delayed_refs->num_heads--; 3789 delayed_refs->num_heads--;
3734 if (list_empty(&head->cluster)) 3790 if (list_empty(&head->cluster))
@@ -3739,9 +3795,13 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
3739 ref->in_tree = 0; 3795 ref->in_tree = 0;
3740 rb_erase(&ref->rb_node, &delayed_refs->root); 3796 rb_erase(&ref->rb_node, &delayed_refs->root);
3741 delayed_refs->num_entries--; 3797 delayed_refs->num_entries--;
3742 if (head)
3743 mutex_unlock(&head->mutex);
3744 spin_unlock(&delayed_refs->lock); 3798 spin_unlock(&delayed_refs->lock);
3799 if (head) {
3800 if (pin_bytes)
3801 btrfs_pin_extent(root, ref->bytenr,
3802 ref->num_bytes, 1);
3803 mutex_unlock(&head->mutex);
3804 }
3745 btrfs_put_delayed_ref(ref); 3805 btrfs_put_delayed_ref(ref);
3746 3806
3747 cond_resched(); 3807 cond_resched();
@@ -3778,24 +3838,49 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
3778 3838
3779 INIT_LIST_HEAD(&splice); 3839 INIT_LIST_HEAD(&splice);
3780 3840
3781 spin_lock(&root->fs_info->delalloc_lock); 3841 spin_lock(&root->delalloc_lock);
3782 list_splice_init(&root->fs_info->delalloc_inodes, &splice); 3842 list_splice_init(&root->delalloc_inodes, &splice);
3783 3843
3784 while (!list_empty(&splice)) { 3844 while (!list_empty(&splice)) {
3785 btrfs_inode = list_entry(splice.next, struct btrfs_inode, 3845 btrfs_inode = list_first_entry(&splice, struct btrfs_inode,
3786 delalloc_inodes); 3846 delalloc_inodes);
3787 3847
3788 list_del_init(&btrfs_inode->delalloc_inodes); 3848 list_del_init(&btrfs_inode->delalloc_inodes);
3789 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, 3849 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
3790 &btrfs_inode->runtime_flags); 3850 &btrfs_inode->runtime_flags);
3791 spin_unlock(&root->fs_info->delalloc_lock); 3851 spin_unlock(&root->delalloc_lock);
3792 3852
3793 btrfs_invalidate_inodes(btrfs_inode->root); 3853 btrfs_invalidate_inodes(btrfs_inode->root);
3794 3854
3795 spin_lock(&root->fs_info->delalloc_lock); 3855 spin_lock(&root->delalloc_lock);
3796 } 3856 }
3797 3857
3798 spin_unlock(&root->fs_info->delalloc_lock); 3858 spin_unlock(&root->delalloc_lock);
3859}
3860
3861static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info)
3862{
3863 struct btrfs_root *root;
3864 struct list_head splice;
3865
3866 INIT_LIST_HEAD(&splice);
3867
3868 spin_lock(&fs_info->delalloc_root_lock);
3869 list_splice_init(&fs_info->delalloc_roots, &splice);
3870 while (!list_empty(&splice)) {
3871 root = list_first_entry(&splice, struct btrfs_root,
3872 delalloc_root);
3873 list_del_init(&root->delalloc_root);
3874 root = btrfs_grab_fs_root(root);
3875 BUG_ON(!root);
3876 spin_unlock(&fs_info->delalloc_root_lock);
3877
3878 btrfs_destroy_delalloc_inodes(root);
3879 btrfs_put_fs_root(root);
3880
3881 spin_lock(&fs_info->delalloc_root_lock);
3882 }
3883 spin_unlock(&fs_info->delalloc_root_lock);
3799} 3884}
3800 3885
3801static int btrfs_destroy_marked_extents(struct btrfs_root *root, 3886static int btrfs_destroy_marked_extents(struct btrfs_root *root,
@@ -3879,19 +3964,14 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
3879 btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv, 3964 btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv,
3880 cur_trans->dirty_pages.dirty_bytes); 3965 cur_trans->dirty_pages.dirty_bytes);
3881 3966
3882 /* FIXME: cleanup wait for commit */ 3967 cur_trans->state = TRANS_STATE_COMMIT_START;
3883 cur_trans->in_commit = 1;
3884 cur_trans->blocked = 1;
3885 wake_up(&root->fs_info->transaction_blocked_wait); 3968 wake_up(&root->fs_info->transaction_blocked_wait);
3886 3969
3887 btrfs_evict_pending_snapshots(cur_trans); 3970 btrfs_evict_pending_snapshots(cur_trans);
3888 3971
3889 cur_trans->blocked = 0; 3972 cur_trans->state = TRANS_STATE_UNBLOCKED;
3890 wake_up(&root->fs_info->transaction_wait); 3973 wake_up(&root->fs_info->transaction_wait);
3891 3974
3892 cur_trans->commit_done = 1;
3893 wake_up(&cur_trans->commit_wait);
3894
3895 btrfs_destroy_delayed_inodes(root); 3975 btrfs_destroy_delayed_inodes(root);
3896 btrfs_assert_delayed_root_empty(root); 3976 btrfs_assert_delayed_root_empty(root);
3897 3977
@@ -3900,6 +3980,9 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
3900 btrfs_destroy_pinned_extent(root, 3980 btrfs_destroy_pinned_extent(root,
3901 root->fs_info->pinned_extents); 3981 root->fs_info->pinned_extents);
3902 3982
3983 cur_trans->state =TRANS_STATE_COMPLETED;
3984 wake_up(&cur_trans->commit_wait);
3985
3903 /* 3986 /*
3904 memset(cur_trans, 0, sizeof(*cur_trans)); 3987 memset(cur_trans, 0, sizeof(*cur_trans));
3905 kmem_cache_free(btrfs_transaction_cachep, cur_trans); 3988 kmem_cache_free(btrfs_transaction_cachep, cur_trans);
@@ -3915,7 +3998,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
3915 3998
3916 spin_lock(&root->fs_info->trans_lock); 3999 spin_lock(&root->fs_info->trans_lock);
3917 list_splice_init(&root->fs_info->trans_list, &list); 4000 list_splice_init(&root->fs_info->trans_list, &list);
3918 root->fs_info->trans_no_join = 1; 4001 root->fs_info->running_transaction = NULL;
3919 spin_unlock(&root->fs_info->trans_lock); 4002 spin_unlock(&root->fs_info->trans_lock);
3920 4003
3921 while (!list_empty(&list)) { 4004 while (!list_empty(&list)) {
@@ -3923,37 +4006,31 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
3923 4006
3924 btrfs_destroy_ordered_operations(t, root); 4007 btrfs_destroy_ordered_operations(t, root);
3925 4008
3926 btrfs_destroy_ordered_extents(root); 4009 btrfs_destroy_all_ordered_extents(root->fs_info);
3927 4010
3928 btrfs_destroy_delayed_refs(t, root); 4011 btrfs_destroy_delayed_refs(t, root);
3929 4012
3930 /* FIXME: cleanup wait for commit */ 4013 /*
3931 t->in_commit = 1; 4014 * FIXME: cleanup wait for commit
3932 t->blocked = 1; 4015 * We needn't acquire the lock here, because we are during
4016 * the umount, there is no other task which will change it.
4017 */
4018 t->state = TRANS_STATE_COMMIT_START;
3933 smp_mb(); 4019 smp_mb();
3934 if (waitqueue_active(&root->fs_info->transaction_blocked_wait)) 4020 if (waitqueue_active(&root->fs_info->transaction_blocked_wait))
3935 wake_up(&root->fs_info->transaction_blocked_wait); 4021 wake_up(&root->fs_info->transaction_blocked_wait);
3936 4022
3937 btrfs_evict_pending_snapshots(t); 4023 btrfs_evict_pending_snapshots(t);
3938 4024
3939 t->blocked = 0; 4025 t->state = TRANS_STATE_UNBLOCKED;
3940 smp_mb(); 4026 smp_mb();
3941 if (waitqueue_active(&root->fs_info->transaction_wait)) 4027 if (waitqueue_active(&root->fs_info->transaction_wait))
3942 wake_up(&root->fs_info->transaction_wait); 4028 wake_up(&root->fs_info->transaction_wait);
3943 4029
3944 t->commit_done = 1;
3945 smp_mb();
3946 if (waitqueue_active(&t->commit_wait))
3947 wake_up(&t->commit_wait);
3948
3949 btrfs_destroy_delayed_inodes(root); 4030 btrfs_destroy_delayed_inodes(root);
3950 btrfs_assert_delayed_root_empty(root); 4031 btrfs_assert_delayed_root_empty(root);
3951 4032
3952 btrfs_destroy_delalloc_inodes(root); 4033 btrfs_destroy_all_delalloc_inodes(root->fs_info);
3953
3954 spin_lock(&root->fs_info->trans_lock);
3955 root->fs_info->running_transaction = NULL;
3956 spin_unlock(&root->fs_info->trans_lock);
3957 4034
3958 btrfs_destroy_marked_extents(root, &t->dirty_pages, 4035 btrfs_destroy_marked_extents(root, &t->dirty_pages,
3959 EXTENT_DIRTY); 4036 EXTENT_DIRTY);
@@ -3961,15 +4038,17 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
3961 btrfs_destroy_pinned_extent(root, 4038 btrfs_destroy_pinned_extent(root,
3962 root->fs_info->pinned_extents); 4039 root->fs_info->pinned_extents);
3963 4040
4041 t->state = TRANS_STATE_COMPLETED;
4042 smp_mb();
4043 if (waitqueue_active(&t->commit_wait))
4044 wake_up(&t->commit_wait);
4045
3964 atomic_set(&t->use_count, 0); 4046 atomic_set(&t->use_count, 0);
3965 list_del_init(&t->list); 4047 list_del_init(&t->list);
3966 memset(t, 0, sizeof(*t)); 4048 memset(t, 0, sizeof(*t));
3967 kmem_cache_free(btrfs_transaction_cachep, t); 4049 kmem_cache_free(btrfs_transaction_cachep, t);
3968 } 4050 }
3969 4051
3970 spin_lock(&root->fs_info->trans_lock);
3971 root->fs_info->trans_no_join = 0;
3972 spin_unlock(&root->fs_info->trans_lock);
3973 mutex_unlock(&root->fs_info->transaction_kthread_mutex); 4052 mutex_unlock(&root->fs_info->transaction_kthread_mutex);
3974 4053
3975 return 0; 4054 return 0;
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index be69ce1b07a2..b71acd6e1e5b 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -63,14 +63,40 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
63int btrfs_commit_super(struct btrfs_root *root); 63int btrfs_commit_super(struct btrfs_root *root);
64struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, 64struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
65 u64 bytenr, u32 blocksize); 65 u64 bytenr, u32 blocksize);
66struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, 66struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
67 struct btrfs_key *location); 67 struct btrfs_key *location);
68int btrfs_init_fs_root(struct btrfs_root *root);
69int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
70 struct btrfs_root *root);
68struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, 71struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
69 struct btrfs_key *location); 72 struct btrfs_key *location);
70int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); 73int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
71void btrfs_btree_balance_dirty(struct btrfs_root *root); 74void btrfs_btree_balance_dirty(struct btrfs_root *root);
72void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root); 75void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root);
73void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); 76void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
77 struct btrfs_root *root);
78void btrfs_free_fs_root(struct btrfs_root *root);
79
80/*
81 * This function is used to grab the root, and avoid it is freed when we
82 * access it. But it doesn't ensure that the tree is not dropped.
83 *
84 * If you want to ensure the whole tree is safe, you should use
85 * fs_info->subvol_srcu
86 */
87static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root *root)
88{
89 if (atomic_inc_not_zero(&root->refs))
90 return root;
91 return NULL;
92}
93
94static inline void btrfs_put_fs_root(struct btrfs_root *root)
95{
96 if (atomic_dec_and_test(&root->refs))
97 kfree(root);
98}
99
74void btrfs_mark_buffer_dirty(struct extent_buffer *buf); 100void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
75int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, 101int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
76 int atomic); 102 int atomic);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 81ee29eeb7ca..4b8691607373 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -82,11 +82,6 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
82 goto fail; 82 goto fail;
83 } 83 }
84 84
85 if (btrfs_root_refs(&root->root_item) == 0) {
86 err = -ENOENT;
87 goto fail;
88 }
89
90 key.objectid = objectid; 85 key.objectid = objectid;
91 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 86 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
92 key.offset = 0; 87 key.offset = 0;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index df472ab1b5ac..1204c8ef6f32 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -24,6 +24,7 @@
24#include <linux/kthread.h> 24#include <linux/kthread.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/ratelimit.h> 26#include <linux/ratelimit.h>
27#include <linux/percpu_counter.h>
27#include "compat.h" 28#include "compat.h"
28#include "hash.h" 29#include "hash.h"
29#include "ctree.h" 30#include "ctree.h"
@@ -2526,6 +2527,51 @@ static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq,
2526 return 0; 2527 return 0;
2527} 2528}
2528 2529
2530static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
2531{
2532 u64 num_bytes;
2533
2534 num_bytes = heads * (sizeof(struct btrfs_extent_item) +
2535 sizeof(struct btrfs_extent_inline_ref));
2536 if (!btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
2537 num_bytes += heads * sizeof(struct btrfs_tree_block_info);
2538
2539 /*
2540 * We don't ever fill up leaves all the way so multiply by 2 just to be
2541 * closer to what we're really going to want to ouse.
2542 */
2543 return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
2544}
2545
2546int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
2547 struct btrfs_root *root)
2548{
2549 struct btrfs_block_rsv *global_rsv;
2550 u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
2551 u64 num_bytes;
2552 int ret = 0;
2553
2554 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
2555 num_heads = heads_to_leaves(root, num_heads);
2556 if (num_heads > 1)
2557 num_bytes += (num_heads - 1) * root->leafsize;
2558 num_bytes <<= 1;
2559 global_rsv = &root->fs_info->global_block_rsv;
2560
2561 /*
2562 * If we can't allocate any more chunks lets make sure we have _lots_ of
2563 * wiggle room since running delayed refs can create more delayed refs.
2564 */
2565 if (global_rsv->space_info->full)
2566 num_bytes <<= 1;
2567
2568 spin_lock(&global_rsv->lock);
2569 if (global_rsv->reserved <= num_bytes)
2570 ret = 1;
2571 spin_unlock(&global_rsv->lock);
2572 return ret;
2573}
2574
2529/* 2575/*
2530 * this starts processing the delayed reference count updates and 2576 * this starts processing the delayed reference count updates and
2531 * extent insertions we have queued up so far. count can be 2577 * extent insertions we have queued up so far. count can be
@@ -2573,7 +2619,8 @@ progress:
2573 old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1); 2619 old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
2574 if (old) { 2620 if (old) {
2575 DEFINE_WAIT(__wait); 2621 DEFINE_WAIT(__wait);
2576 if (delayed_refs->num_entries < 16348) 2622 if (delayed_refs->flushing ||
2623 !btrfs_should_throttle_delayed_refs(trans, root))
2577 return 0; 2624 return 0;
2578 2625
2579 prepare_to_wait(&delayed_refs->wait, &__wait, 2626 prepare_to_wait(&delayed_refs->wait, &__wait,
@@ -2608,7 +2655,7 @@ again:
2608 2655
2609 while (1) { 2656 while (1) {
2610 if (!(run_all || run_most) && 2657 if (!(run_all || run_most) &&
2611 delayed_refs->num_heads_ready < 64) 2658 !btrfs_should_throttle_delayed_refs(trans, root))
2612 break; 2659 break;
2613 2660
2614 /* 2661 /*
@@ -2629,6 +2676,7 @@ again:
2629 spin_unlock(&delayed_refs->lock); 2676 spin_unlock(&delayed_refs->lock);
2630 btrfs_abort_transaction(trans, root, ret); 2677 btrfs_abort_transaction(trans, root, ret);
2631 atomic_dec(&delayed_refs->procs_running_refs); 2678 atomic_dec(&delayed_refs->procs_running_refs);
2679 wake_up(&delayed_refs->wait);
2632 return ret; 2680 return ret;
2633 } 2681 }
2634 2682
@@ -3310,6 +3358,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3310 struct btrfs_space_info *found; 3358 struct btrfs_space_info *found;
3311 int i; 3359 int i;
3312 int factor; 3360 int factor;
3361 int ret;
3313 3362
3314 if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 3363 if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
3315 BTRFS_BLOCK_GROUP_RAID10)) 3364 BTRFS_BLOCK_GROUP_RAID10))
@@ -3333,6 +3382,12 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3333 if (!found) 3382 if (!found)
3334 return -ENOMEM; 3383 return -ENOMEM;
3335 3384
3385 ret = percpu_counter_init(&found->total_bytes_pinned, 0);
3386 if (ret) {
3387 kfree(found);
3388 return ret;
3389 }
3390
3336 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) 3391 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
3337 INIT_LIST_HEAD(&found->block_groups[i]); 3392 INIT_LIST_HEAD(&found->block_groups[i]);
3338 init_rwsem(&found->groups_sem); 3393 init_rwsem(&found->groups_sem);
@@ -3565,10 +3620,11 @@ alloc:
3565 } 3620 }
3566 3621
3567 /* 3622 /*
3568 * If we have less pinned bytes than we want to allocate then 3623 * If we don't have enough pinned space to deal with this
3569 * don't bother committing the transaction, it won't help us. 3624 * allocation don't bother committing the transaction.
3570 */ 3625 */
3571 if (data_sinfo->bytes_pinned < bytes) 3626 if (percpu_counter_compare(&data_sinfo->total_bytes_pinned,
3627 bytes) < 0)
3572 committed = 1; 3628 committed = 1;
3573 spin_unlock(&data_sinfo->lock); 3629 spin_unlock(&data_sinfo->lock);
3574 3630
@@ -3577,6 +3633,7 @@ commit_trans:
3577 if (!committed && 3633 if (!committed &&
3578 !atomic_read(&root->fs_info->open_ioctl_trans)) { 3634 !atomic_read(&root->fs_info->open_ioctl_trans)) {
3579 committed = 1; 3635 committed = 1;
3636
3580 trans = btrfs_join_transaction(root); 3637 trans = btrfs_join_transaction(root);
3581 if (IS_ERR(trans)) 3638 if (IS_ERR(trans))
3582 return PTR_ERR(trans); 3639 return PTR_ERR(trans);
@@ -3609,6 +3666,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3609 3666
3610 data_sinfo = root->fs_info->data_sinfo; 3667 data_sinfo = root->fs_info->data_sinfo;
3611 spin_lock(&data_sinfo->lock); 3668 spin_lock(&data_sinfo->lock);
3669 WARN_ON(data_sinfo->bytes_may_use < bytes);
3612 data_sinfo->bytes_may_use -= bytes; 3670 data_sinfo->bytes_may_use -= bytes;
3613 trace_btrfs_space_reservation(root->fs_info, "space_info", 3671 trace_btrfs_space_reservation(root->fs_info, "space_info",
3614 data_sinfo->flags, bytes, 0); 3672 data_sinfo->flags, bytes, 0);
@@ -3886,12 +3944,11 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
3886 unsigned long nr_pages) 3944 unsigned long nr_pages)
3887{ 3945{
3888 struct super_block *sb = root->fs_info->sb; 3946 struct super_block *sb = root->fs_info->sb;
3889 int started;
3890 3947
3891 /* If we can not start writeback, just sync all the delalloc file. */ 3948 if (down_read_trylock(&sb->s_umount)) {
3892 started = try_to_writeback_inodes_sb_nr(sb, nr_pages, 3949 writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
3893 WB_REASON_FS_FREE_SPACE); 3950 up_read(&sb->s_umount);
3894 if (!started) { 3951 } else {
3895 /* 3952 /*
3896 * We needn't worry the filesystem going from r/w to r/o though 3953 * We needn't worry the filesystem going from r/w to r/o though
3897 * we don't acquire ->s_umount mutex, because the filesystem 3954 * we don't acquire ->s_umount mutex, because the filesystem
@@ -3899,9 +3956,9 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
3899 * the filesystem is readonly(all dirty pages are written to 3956 * the filesystem is readonly(all dirty pages are written to
3900 * the disk). 3957 * the disk).
3901 */ 3958 */
3902 btrfs_start_delalloc_inodes(root, 0); 3959 btrfs_start_all_delalloc_inodes(root->fs_info, 0);
3903 if (!current->journal_info) 3960 if (!current->journal_info)
3904 btrfs_wait_ordered_extents(root, 0); 3961 btrfs_wait_all_ordered_extents(root->fs_info, 0);
3905 } 3962 }
3906} 3963}
3907 3964
@@ -3931,7 +3988,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3931 if (delalloc_bytes == 0) { 3988 if (delalloc_bytes == 0) {
3932 if (trans) 3989 if (trans)
3933 return; 3990 return;
3934 btrfs_wait_ordered_extents(root, 0); 3991 btrfs_wait_all_ordered_extents(root->fs_info, 0);
3935 return; 3992 return;
3936 } 3993 }
3937 3994
@@ -3959,7 +4016,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3959 4016
3960 loops++; 4017 loops++;
3961 if (wait_ordered && !trans) { 4018 if (wait_ordered && !trans) {
3962 btrfs_wait_ordered_extents(root, 0); 4019 btrfs_wait_all_ordered_extents(root->fs_info, 0);
3963 } else { 4020 } else {
3964 time_left = schedule_timeout_killable(1); 4021 time_left = schedule_timeout_killable(1);
3965 if (time_left) 4022 if (time_left)
@@ -3997,7 +4054,8 @@ static int may_commit_transaction(struct btrfs_root *root,
3997 4054
3998 /* See if there is enough pinned space to make this reservation */ 4055 /* See if there is enough pinned space to make this reservation */
3999 spin_lock(&space_info->lock); 4056 spin_lock(&space_info->lock);
4000 if (space_info->bytes_pinned >= bytes) { 4057 if (percpu_counter_compare(&space_info->total_bytes_pinned,
4058 bytes) >= 0) {
4001 spin_unlock(&space_info->lock); 4059 spin_unlock(&space_info->lock);
4002 goto commit; 4060 goto commit;
4003 } 4061 }
@@ -4012,7 +4070,8 @@ static int may_commit_transaction(struct btrfs_root *root,
4012 4070
4013 spin_lock(&space_info->lock); 4071 spin_lock(&space_info->lock);
4014 spin_lock(&delayed_rsv->lock); 4072 spin_lock(&delayed_rsv->lock);
4015 if (space_info->bytes_pinned + delayed_rsv->size < bytes) { 4073 if (percpu_counter_compare(&space_info->total_bytes_pinned,
4074 bytes - delayed_rsv->size) >= 0) {
4016 spin_unlock(&delayed_rsv->lock); 4075 spin_unlock(&delayed_rsv->lock);
4017 spin_unlock(&space_info->lock); 4076 spin_unlock(&space_info->lock);
4018 return -ENOSPC; 4077 return -ENOSPC;
@@ -4297,6 +4356,31 @@ static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
4297 spin_unlock(&block_rsv->lock); 4356 spin_unlock(&block_rsv->lock);
4298} 4357}
4299 4358
4359int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
4360 struct btrfs_block_rsv *dest, u64 num_bytes,
4361 int min_factor)
4362{
4363 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
4364 u64 min_bytes;
4365
4366 if (global_rsv->space_info != dest->space_info)
4367 return -ENOSPC;
4368
4369 spin_lock(&global_rsv->lock);
4370 min_bytes = div_factor(global_rsv->size, min_factor);
4371 if (global_rsv->reserved < min_bytes + num_bytes) {
4372 spin_unlock(&global_rsv->lock);
4373 return -ENOSPC;
4374 }
4375 global_rsv->reserved -= num_bytes;
4376 if (global_rsv->reserved < global_rsv->size)
4377 global_rsv->full = 0;
4378 spin_unlock(&global_rsv->lock);
4379
4380 block_rsv_add_bytes(dest, num_bytes, 1);
4381 return 0;
4382}
4383
4300static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, 4384static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
4301 struct btrfs_block_rsv *block_rsv, 4385 struct btrfs_block_rsv *block_rsv,
4302 struct btrfs_block_rsv *dest, u64 num_bytes) 4386 struct btrfs_block_rsv *dest, u64 num_bytes)
@@ -5030,14 +5114,14 @@ static int update_block_group(struct btrfs_root *root,
5030 int factor; 5114 int factor;
5031 5115
5032 /* block accounting for super block */ 5116 /* block accounting for super block */
5033 spin_lock(&info->delalloc_lock); 5117 spin_lock(&info->delalloc_root_lock);
5034 old_val = btrfs_super_bytes_used(info->super_copy); 5118 old_val = btrfs_super_bytes_used(info->super_copy);
5035 if (alloc) 5119 if (alloc)
5036 old_val += num_bytes; 5120 old_val += num_bytes;
5037 else 5121 else
5038 old_val -= num_bytes; 5122 old_val -= num_bytes;
5039 btrfs_set_super_bytes_used(info->super_copy, old_val); 5123 btrfs_set_super_bytes_used(info->super_copy, old_val);
5040 spin_unlock(&info->delalloc_lock); 5124 spin_unlock(&info->delalloc_root_lock);
5041 5125
5042 while (total) { 5126 while (total) {
5043 cache = btrfs_lookup_block_group(info, bytenr); 5127 cache = btrfs_lookup_block_group(info, bytenr);
@@ -5189,6 +5273,80 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
5189 return ret; 5273 return ret;
5190} 5274}
5191 5275
5276static int __exclude_logged_extent(struct btrfs_root *root, u64 start, u64 num_bytes)
5277{
5278 int ret;
5279 struct btrfs_block_group_cache *block_group;
5280 struct btrfs_caching_control *caching_ctl;
5281
5282 block_group = btrfs_lookup_block_group(root->fs_info, start);
5283 if (!block_group)
5284 return -EINVAL;
5285
5286 cache_block_group(block_group, 0);
5287 caching_ctl = get_caching_control(block_group);
5288
5289 if (!caching_ctl) {
5290 /* Logic error */
5291 BUG_ON(!block_group_cache_done(block_group));
5292 ret = btrfs_remove_free_space(block_group, start, num_bytes);
5293 } else {
5294 mutex_lock(&caching_ctl->mutex);
5295
5296 if (start >= caching_ctl->progress) {
5297 ret = add_excluded_extent(root, start, num_bytes);
5298 } else if (start + num_bytes <= caching_ctl->progress) {
5299 ret = btrfs_remove_free_space(block_group,
5300 start, num_bytes);
5301 } else {
5302 num_bytes = caching_ctl->progress - start;
5303 ret = btrfs_remove_free_space(block_group,
5304 start, num_bytes);
5305 if (ret)
5306 goto out_lock;
5307
5308 num_bytes = (start + num_bytes) -
5309 caching_ctl->progress;
5310 start = caching_ctl->progress;
5311 ret = add_excluded_extent(root, start, num_bytes);
5312 }
5313out_lock:
5314 mutex_unlock(&caching_ctl->mutex);
5315 put_caching_control(caching_ctl);
5316 }
5317 btrfs_put_block_group(block_group);
5318 return ret;
5319}
5320
5321int btrfs_exclude_logged_extents(struct btrfs_root *log,
5322 struct extent_buffer *eb)
5323{
5324 struct btrfs_file_extent_item *item;
5325 struct btrfs_key key;
5326 int found_type;
5327 int i;
5328
5329 if (!btrfs_fs_incompat(log->fs_info, MIXED_GROUPS))
5330 return 0;
5331
5332 for (i = 0; i < btrfs_header_nritems(eb); i++) {
5333 btrfs_item_key_to_cpu(eb, &key, i);
5334 if (key.type != BTRFS_EXTENT_DATA_KEY)
5335 continue;
5336 item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
5337 found_type = btrfs_file_extent_type(eb, item);
5338 if (found_type == BTRFS_FILE_EXTENT_INLINE)
5339 continue;
5340 if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
5341 continue;
5342 key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
5343 key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
5344 __exclude_logged_extent(log, key.objectid, key.offset);
5345 }
5346
5347 return 0;
5348}
5349
5192/** 5350/**
5193 * btrfs_update_reserved_bytes - update the block_group and space info counters 5351 * btrfs_update_reserved_bytes - update the block_group and space info counters
5194 * @cache: The cache we are manipulating 5352 * @cache: The cache we are manipulating
@@ -5251,6 +5409,7 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
5251 struct btrfs_caching_control *next; 5409 struct btrfs_caching_control *next;
5252 struct btrfs_caching_control *caching_ctl; 5410 struct btrfs_caching_control *caching_ctl;
5253 struct btrfs_block_group_cache *cache; 5411 struct btrfs_block_group_cache *cache;
5412 struct btrfs_space_info *space_info;
5254 5413
5255 down_write(&fs_info->extent_commit_sem); 5414 down_write(&fs_info->extent_commit_sem);
5256 5415
@@ -5273,6 +5432,9 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
5273 5432
5274 up_write(&fs_info->extent_commit_sem); 5433 up_write(&fs_info->extent_commit_sem);
5275 5434
5435 list_for_each_entry_rcu(space_info, &fs_info->space_info, list)
5436 percpu_counter_set(&space_info->total_bytes_pinned, 0);
5437
5276 update_global_block_rsv(fs_info); 5438 update_global_block_rsv(fs_info);
5277} 5439}
5278 5440
@@ -5370,6 +5532,27 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
5370 return 0; 5532 return 0;
5371} 5533}
5372 5534
5535static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes,
5536 u64 owner, u64 root_objectid)
5537{
5538 struct btrfs_space_info *space_info;
5539 u64 flags;
5540
5541 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
5542 if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
5543 flags = BTRFS_BLOCK_GROUP_SYSTEM;
5544 else
5545 flags = BTRFS_BLOCK_GROUP_METADATA;
5546 } else {
5547 flags = BTRFS_BLOCK_GROUP_DATA;
5548 }
5549
5550 space_info = __find_space_info(fs_info, flags);
5551 BUG_ON(!space_info); /* Logic bug */
5552 percpu_counter_add(&space_info->total_bytes_pinned, num_bytes);
5553}
5554
5555
5373static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 5556static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5374 struct btrfs_root *root, 5557 struct btrfs_root *root,
5375 u64 bytenr, u64 num_bytes, u64 parent, 5558 u64 bytenr, u64 num_bytes, u64 parent,
@@ -5590,6 +5773,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5590 goto out; 5773 goto out;
5591 } 5774 }
5592 } 5775 }
5776 add_pinned_bytes(root->fs_info, -num_bytes, owner_objectid,
5777 root_objectid);
5593 } else { 5778 } else {
5594 if (found_extent) { 5779 if (found_extent) {
5595 BUG_ON(is_data && refs_to_drop != 5780 BUG_ON(is_data && refs_to_drop !=
@@ -5713,6 +5898,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
5713 u64 parent, int last_ref) 5898 u64 parent, int last_ref)
5714{ 5899{
5715 struct btrfs_block_group_cache *cache = NULL; 5900 struct btrfs_block_group_cache *cache = NULL;
5901 int pin = 1;
5716 int ret; 5902 int ret;
5717 5903
5718 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 5904 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
@@ -5745,8 +5931,14 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
5745 5931
5746 btrfs_add_free_space(cache, buf->start, buf->len); 5932 btrfs_add_free_space(cache, buf->start, buf->len);
5747 btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE); 5933 btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
5934 pin = 0;
5748 } 5935 }
5749out: 5936out:
5937 if (pin)
5938 add_pinned_bytes(root->fs_info, buf->len,
5939 btrfs_header_level(buf),
5940 root->root_key.objectid);
5941
5750 /* 5942 /*
5751 * Deleting the buffer, clear the corrupt flag since it doesn't matter 5943 * Deleting the buffer, clear the corrupt flag since it doesn't matter
5752 * anymore. 5944 * anymore.
@@ -5763,6 +5955,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
5763 int ret; 5955 int ret;
5764 struct btrfs_fs_info *fs_info = root->fs_info; 5956 struct btrfs_fs_info *fs_info = root->fs_info;
5765 5957
5958 add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid);
5959
5766 /* 5960 /*
5767 * tree log blocks never actually go into the extent allocation 5961 * tree log blocks never actually go into the extent allocation
5768 * tree, just update pinning info and exit early. 5962 * tree, just update pinning info and exit early.
@@ -6560,52 +6754,26 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
6560{ 6754{
6561 int ret; 6755 int ret;
6562 struct btrfs_block_group_cache *block_group; 6756 struct btrfs_block_group_cache *block_group;
6563 struct btrfs_caching_control *caching_ctl;
6564 u64 start = ins->objectid;
6565 u64 num_bytes = ins->offset;
6566
6567 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
6568 cache_block_group(block_group, 0);
6569 caching_ctl = get_caching_control(block_group);
6570
6571 if (!caching_ctl) {
6572 BUG_ON(!block_group_cache_done(block_group));
6573 ret = btrfs_remove_free_space(block_group, start, num_bytes);
6574 if (ret)
6575 goto out;
6576 } else {
6577 mutex_lock(&caching_ctl->mutex);
6578
6579 if (start >= caching_ctl->progress) {
6580 ret = add_excluded_extent(root, start, num_bytes);
6581 } else if (start + num_bytes <= caching_ctl->progress) {
6582 ret = btrfs_remove_free_space(block_group,
6583 start, num_bytes);
6584 } else {
6585 num_bytes = caching_ctl->progress - start;
6586 ret = btrfs_remove_free_space(block_group,
6587 start, num_bytes);
6588 if (ret)
6589 goto out_lock;
6590 6757
6591 start = caching_ctl->progress; 6758 /*
6592 num_bytes = ins->objectid + ins->offset - 6759 * Mixed block groups will exclude before processing the log so we only
6593 caching_ctl->progress; 6760 * need to do the exlude dance if this fs isn't mixed.
6594 ret = add_excluded_extent(root, start, num_bytes); 6761 */
6595 } 6762 if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) {
6596out_lock: 6763 ret = __exclude_logged_extent(root, ins->objectid, ins->offset);
6597 mutex_unlock(&caching_ctl->mutex);
6598 put_caching_control(caching_ctl);
6599 if (ret) 6764 if (ret)
6600 goto out; 6765 return ret;
6601 } 6766 }
6602 6767
6768 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
6769 if (!block_group)
6770 return -EINVAL;
6771
6603 ret = btrfs_update_reserved_bytes(block_group, ins->offset, 6772 ret = btrfs_update_reserved_bytes(block_group, ins->offset,
6604 RESERVE_ALLOC_NO_ACCOUNT); 6773 RESERVE_ALLOC_NO_ACCOUNT);
6605 BUG_ON(ret); /* logic error */ 6774 BUG_ON(ret); /* logic error */
6606 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 6775 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
6607 0, owner, offset, ins, 1); 6776 0, owner, offset, ins, 1);
6608out:
6609 btrfs_put_block_group(block_group); 6777 btrfs_put_block_group(block_group);
6610 return ret; 6778 return ret;
6611} 6779}
@@ -7298,6 +7466,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
7298 int err = 0; 7466 int err = 0;
7299 int ret; 7467 int ret;
7300 int level; 7468 int level;
7469 bool root_dropped = false;
7301 7470
7302 path = btrfs_alloc_path(); 7471 path = btrfs_alloc_path();
7303 if (!path) { 7472 if (!path) {
@@ -7355,6 +7524,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
7355 while (1) { 7524 while (1) {
7356 btrfs_tree_lock(path->nodes[level]); 7525 btrfs_tree_lock(path->nodes[level]);
7357 btrfs_set_lock_blocking(path->nodes[level]); 7526 btrfs_set_lock_blocking(path->nodes[level]);
7527 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
7358 7528
7359 ret = btrfs_lookup_extent_info(trans, root, 7529 ret = btrfs_lookup_extent_info(trans, root,
7360 path->nodes[level]->start, 7530 path->nodes[level]->start,
@@ -7370,6 +7540,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
7370 break; 7540 break;
7371 7541
7372 btrfs_tree_unlock(path->nodes[level]); 7542 btrfs_tree_unlock(path->nodes[level]);
7543 path->locks[level] = 0;
7373 WARN_ON(wc->refs[level] != 1); 7544 WARN_ON(wc->refs[level] != 1);
7374 level--; 7545 level--;
7375 } 7546 }
@@ -7384,11 +7555,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
7384 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); 7555 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
7385 7556
7386 while (1) { 7557 while (1) {
7387 if (!for_reloc && btrfs_fs_closing(root->fs_info)) {
7388 pr_debug("btrfs: drop snapshot early exit\n");
7389 err = -EAGAIN;
7390 goto out_end_trans;
7391 }
7392 7558
7393 ret = walk_down_tree(trans, root, path, wc); 7559 ret = walk_down_tree(trans, root, path, wc);
7394 if (ret < 0) { 7560 if (ret < 0) {
@@ -7416,7 +7582,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
7416 } 7582 }
7417 7583
7418 BUG_ON(wc->level == 0); 7584 BUG_ON(wc->level == 0);
7419 if (btrfs_should_end_transaction(trans, tree_root)) { 7585 if (btrfs_should_end_transaction(trans, tree_root) ||
7586 (!for_reloc && btrfs_need_cleaner_sleep(root))) {
7420 ret = btrfs_update_root(trans, tree_root, 7587 ret = btrfs_update_root(trans, tree_root,
7421 &root->root_key, 7588 &root->root_key,
7422 root_item); 7589 root_item);
@@ -7427,6 +7594,12 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
7427 } 7594 }
7428 7595
7429 btrfs_end_transaction_throttle(trans, tree_root); 7596 btrfs_end_transaction_throttle(trans, tree_root);
7597 if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
7598 pr_debug("btrfs: drop snapshot early exit\n");
7599 err = -EAGAIN;
7600 goto out_free;
7601 }
7602
7430 trans = btrfs_start_transaction(tree_root, 0); 7603 trans = btrfs_start_transaction(tree_root, 0);
7431 if (IS_ERR(trans)) { 7604 if (IS_ERR(trans)) {
7432 err = PTR_ERR(trans); 7605 err = PTR_ERR(trans);
@@ -7447,8 +7620,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
7447 } 7620 }
7448 7621
7449 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { 7622 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
7450 ret = btrfs_find_last_root(tree_root, root->root_key.objectid, 7623 ret = btrfs_find_root(tree_root, &root->root_key, path,
7451 NULL, NULL); 7624 NULL, NULL);
7452 if (ret < 0) { 7625 if (ret < 0) {
7453 btrfs_abort_transaction(trans, tree_root, ret); 7626 btrfs_abort_transaction(trans, tree_root, ret);
7454 err = ret; 7627 err = ret;
@@ -7465,18 +7638,28 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
7465 } 7638 }
7466 7639
7467 if (root->in_radix) { 7640 if (root->in_radix) {
7468 btrfs_free_fs_root(tree_root->fs_info, root); 7641 btrfs_drop_and_free_fs_root(tree_root->fs_info, root);
7469 } else { 7642 } else {
7470 free_extent_buffer(root->node); 7643 free_extent_buffer(root->node);
7471 free_extent_buffer(root->commit_root); 7644 free_extent_buffer(root->commit_root);
7472 kfree(root); 7645 btrfs_put_fs_root(root);
7473 } 7646 }
7647 root_dropped = true;
7474out_end_trans: 7648out_end_trans:
7475 btrfs_end_transaction_throttle(trans, tree_root); 7649 btrfs_end_transaction_throttle(trans, tree_root);
7476out_free: 7650out_free:
7477 kfree(wc); 7651 kfree(wc);
7478 btrfs_free_path(path); 7652 btrfs_free_path(path);
7479out: 7653out:
7654 /*
7655 * So if we need to stop dropping the snapshot for whatever reason we
7656 * need to make sure to add it back to the dead root list so that we
7657 * keep trying to do the work later. This also cleans up roots if we
7658 * don't have it in the radix (like when we recover after a power fail
7659 * or unmount) so we don't leak memory.
7660 */
7661 if (root_dropped == false)
7662 btrfs_add_dead_root(root);
7480 if (err) 7663 if (err)
7481 btrfs_std_error(root->fs_info, err); 7664 btrfs_std_error(root->fs_info, err);
7482 return err; 7665 return err;
@@ -7782,6 +7965,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7782 struct btrfs_space_info *space_info; 7965 struct btrfs_space_info *space_info;
7783 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 7966 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
7784 struct btrfs_device *device; 7967 struct btrfs_device *device;
7968 struct btrfs_trans_handle *trans;
7785 u64 min_free; 7969 u64 min_free;
7786 u64 dev_min = 1; 7970 u64 dev_min = 1;
7787 u64 dev_nr = 0; 7971 u64 dev_nr = 0;
@@ -7868,6 +8052,13 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7868 do_div(min_free, dev_min); 8052 do_div(min_free, dev_min);
7869 } 8053 }
7870 8054
8055 /* We need to do this so that we can look at pending chunks */
8056 trans = btrfs_join_transaction(root);
8057 if (IS_ERR(trans)) {
8058 ret = PTR_ERR(trans);
8059 goto out;
8060 }
8061
7871 mutex_lock(&root->fs_info->chunk_mutex); 8062 mutex_lock(&root->fs_info->chunk_mutex);
7872 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 8063 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
7873 u64 dev_offset; 8064 u64 dev_offset;
@@ -7878,7 +8069,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7878 */ 8069 */
7879 if (device->total_bytes > device->bytes_used + min_free && 8070 if (device->total_bytes > device->bytes_used + min_free &&
7880 !device->is_tgtdev_for_dev_replace) { 8071 !device->is_tgtdev_for_dev_replace) {
7881 ret = find_free_dev_extent(device, min_free, 8072 ret = find_free_dev_extent(trans, device, min_free,
7882 &dev_offset, NULL); 8073 &dev_offset, NULL);
7883 if (!ret) 8074 if (!ret)
7884 dev_nr++; 8075 dev_nr++;
@@ -7890,6 +8081,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7890 } 8081 }
7891 } 8082 }
7892 mutex_unlock(&root->fs_info->chunk_mutex); 8083 mutex_unlock(&root->fs_info->chunk_mutex);
8084 btrfs_end_transaction(trans, root);
7893out: 8085out:
7894 btrfs_put_block_group(block_group); 8086 btrfs_put_block_group(block_group);
7895 return ret; 8087 return ret;
@@ -8032,6 +8224,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
8032 dump_space_info(space_info, 0, 0); 8224 dump_space_info(space_info, 0, 0);
8033 } 8225 }
8034 } 8226 }
8227 percpu_counter_destroy(&space_info->total_bytes_pinned);
8035 list_del(&space_info->list); 8228 list_del(&space_info->list);
8036 kfree(space_info); 8229 kfree(space_info);
8037 } 8230 }
@@ -8254,6 +8447,10 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
8254 sizeof(item)); 8447 sizeof(item));
8255 if (ret) 8448 if (ret)
8256 btrfs_abort_transaction(trans, extent_root, ret); 8449 btrfs_abort_transaction(trans, extent_root, ret);
8450 ret = btrfs_finish_chunk_alloc(trans, extent_root,
8451 key.objectid, key.offset);
8452 if (ret)
8453 btrfs_abort_transaction(trans, extent_root, ret);
8257 } 8454 }
8258} 8455}
8259 8456
@@ -8591,8 +8788,15 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
8591 if (end - start >= range->minlen) { 8788 if (end - start >= range->minlen) {
8592 if (!block_group_cache_done(cache)) { 8789 if (!block_group_cache_done(cache)) {
8593 ret = cache_block_group(cache, 0); 8790 ret = cache_block_group(cache, 0);
8594 if (!ret) 8791 if (ret) {
8595 wait_block_group_cache_done(cache); 8792 btrfs_put_block_group(cache);
8793 break;
8794 }
8795 ret = wait_block_group_cache_done(cache);
8796 if (ret) {
8797 btrfs_put_block_group(cache);
8798 break;
8799 }
8596 } 8800 }
8597 ret = btrfs_trim_block_group(cache, 8801 ret = btrfs_trim_block_group(cache,
8598 &group_trimmed, 8802 &group_trimmed,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 6bca9472f313..583d98bd065e 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -77,10 +77,29 @@ void btrfs_leak_debug_check(void)
77 kmem_cache_free(extent_buffer_cache, eb); 77 kmem_cache_free(extent_buffer_cache, eb);
78 } 78 }
79} 79}
80
81#define btrfs_debug_check_extent_io_range(inode, start, end) \
82 __btrfs_debug_check_extent_io_range(__func__, (inode), (start), (end))
83static inline void __btrfs_debug_check_extent_io_range(const char *caller,
84 struct inode *inode, u64 start, u64 end)
85{
86 u64 isize = i_size_read(inode);
87
88 if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
89 printk_ratelimited(KERN_DEBUG
90 "btrfs: %s: ino %llu isize %llu odd range [%llu,%llu]\n",
91 caller,
92 (unsigned long long)btrfs_ino(inode),
93 (unsigned long long)isize,
94 (unsigned long long)start,
95 (unsigned long long)end);
96 }
97}
80#else 98#else
81#define btrfs_leak_debug_add(new, head) do {} while (0) 99#define btrfs_leak_debug_add(new, head) do {} while (0)
82#define btrfs_leak_debug_del(entry) do {} while (0) 100#define btrfs_leak_debug_del(entry) do {} while (0)
83#define btrfs_leak_debug_check() do {} while (0) 101#define btrfs_leak_debug_check() do {} while (0)
102#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0)
84#endif 103#endif
85 104
86#define BUFFER_LRU_MAX 64 105#define BUFFER_LRU_MAX 64
@@ -522,6 +541,11 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
522 int err; 541 int err;
523 int clear = 0; 542 int clear = 0;
524 543
544 btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
545
546 if (bits & EXTENT_DELALLOC)
547 bits |= EXTENT_NORESERVE;
548
525 if (delete) 549 if (delete)
526 bits |= ~EXTENT_CTLBITS; 550 bits |= ~EXTENT_CTLBITS;
527 bits |= EXTENT_FIRST_DELALLOC; 551 bits |= EXTENT_FIRST_DELALLOC;
@@ -677,6 +701,8 @@ static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
677 struct extent_state *state; 701 struct extent_state *state;
678 struct rb_node *node; 702 struct rb_node *node;
679 703
704 btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
705
680 spin_lock(&tree->lock); 706 spin_lock(&tree->lock);
681again: 707again:
682 while (1) { 708 while (1) {
@@ -769,6 +795,8 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
769 u64 last_start; 795 u64 last_start;
770 u64 last_end; 796 u64 last_end;
771 797
798 btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
799
772 bits |= EXTENT_FIRST_DELALLOC; 800 bits |= EXTENT_FIRST_DELALLOC;
773again: 801again:
774 if (!prealloc && (mask & __GFP_WAIT)) { 802 if (!prealloc && (mask & __GFP_WAIT)) {
@@ -989,6 +1017,8 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
989 u64 last_start; 1017 u64 last_start;
990 u64 last_end; 1018 u64 last_end;
991 1019
1020 btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
1021
992again: 1022again:
993 if (!prealloc && (mask & __GFP_WAIT)) { 1023 if (!prealloc && (mask & __GFP_WAIT)) {
994 prealloc = alloc_extent_state(mask); 1024 prealloc = alloc_extent_state(mask);
@@ -2450,11 +2480,12 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
2450 struct extent_state *cached = NULL; 2480 struct extent_state *cached = NULL;
2451 struct extent_state *state; 2481 struct extent_state *state;
2452 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); 2482 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
2483 struct inode *inode = page->mapping->host;
2453 2484
2454 pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, " 2485 pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
2455 "mirror=%lu\n", (u64)bio->bi_sector, err, 2486 "mirror=%lu\n", (u64)bio->bi_sector, err,
2456 io_bio->mirror_num); 2487 io_bio->mirror_num);
2457 tree = &BTRFS_I(page->mapping->host)->io_tree; 2488 tree = &BTRFS_I(inode)->io_tree;
2458 2489
2459 /* We always issue full-page reads, but if some block 2490 /* We always issue full-page reads, but if some block
2460 * in a page fails to read, blk_update_request() will 2491 * in a page fails to read, blk_update_request() will
@@ -2528,6 +2559,14 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
2528 unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC); 2559 unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
2529 2560
2530 if (uptodate) { 2561 if (uptodate) {
2562 loff_t i_size = i_size_read(inode);
2563 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2564 unsigned offset;
2565
2566 /* Zero out the end if this page straddles i_size */
2567 offset = i_size & (PAGE_CACHE_SIZE-1);
2568 if (page->index == end_index && offset)
2569 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2531 SetPageUptodate(page); 2570 SetPageUptodate(page);
2532 } else { 2571 } else {
2533 ClearPageUptodate(page); 2572 ClearPageUptodate(page);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 41fb81e7ec53..3b8c4e26e1da 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -19,6 +19,7 @@
19#define EXTENT_FIRST_DELALLOC (1 << 12) 19#define EXTENT_FIRST_DELALLOC (1 << 12)
20#define EXTENT_NEED_WAIT (1 << 13) 20#define EXTENT_NEED_WAIT (1 << 13)
21#define EXTENT_DAMAGED (1 << 14) 21#define EXTENT_DAMAGED (1 << 14)
22#define EXTENT_NORESERVE (1 << 15)
22#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) 23#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
23#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) 24#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
24 25
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index b193bf324a41..a7bfc9541803 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -34,8 +34,7 @@
34 34
35#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \ 35#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \
36 sizeof(struct btrfs_ordered_sum)) / \ 36 sizeof(struct btrfs_ordered_sum)) / \
37 sizeof(struct btrfs_sector_sum) * \ 37 sizeof(u32) * (r)->sectorsize)
38 (r)->sectorsize - (r)->sectorsize)
39 38
40int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, 39int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
41 struct btrfs_root *root, 40 struct btrfs_root *root,
@@ -297,7 +296,6 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
297 struct btrfs_path *path; 296 struct btrfs_path *path;
298 struct extent_buffer *leaf; 297 struct extent_buffer *leaf;
299 struct btrfs_ordered_sum *sums; 298 struct btrfs_ordered_sum *sums;
300 struct btrfs_sector_sum *sector_sum;
301 struct btrfs_csum_item *item; 299 struct btrfs_csum_item *item;
302 LIST_HEAD(tmplist); 300 LIST_HEAD(tmplist);
303 unsigned long offset; 301 unsigned long offset;
@@ -368,34 +366,28 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
368 struct btrfs_csum_item); 366 struct btrfs_csum_item);
369 while (start < csum_end) { 367 while (start < csum_end) {
370 size = min_t(size_t, csum_end - start, 368 size = min_t(size_t, csum_end - start,
371 MAX_ORDERED_SUM_BYTES(root)); 369 MAX_ORDERED_SUM_BYTES(root));
372 sums = kzalloc(btrfs_ordered_sum_size(root, size), 370 sums = kzalloc(btrfs_ordered_sum_size(root, size),
373 GFP_NOFS); 371 GFP_NOFS);
374 if (!sums) { 372 if (!sums) {
375 ret = -ENOMEM; 373 ret = -ENOMEM;
376 goto fail; 374 goto fail;
377 } 375 }
378 376
379 sector_sum = sums->sums;
380 sums->bytenr = start; 377 sums->bytenr = start;
381 sums->len = size; 378 sums->len = (int)size;
382 379
383 offset = (start - key.offset) >> 380 offset = (start - key.offset) >>
384 root->fs_info->sb->s_blocksize_bits; 381 root->fs_info->sb->s_blocksize_bits;
385 offset *= csum_size; 382 offset *= csum_size;
383 size >>= root->fs_info->sb->s_blocksize_bits;
386 384
387 while (size > 0) { 385 read_extent_buffer(path->nodes[0],
388 read_extent_buffer(path->nodes[0], 386 sums->sums,
389 &sector_sum->sum, 387 ((unsigned long)item) + offset,
390 ((unsigned long)item) + 388 csum_size * size);
391 offset, csum_size); 389
392 sector_sum->bytenr = start; 390 start += root->sectorsize * size;
393
394 size -= root->sectorsize;
395 start += root->sectorsize;
396 offset += csum_size;
397 sector_sum++;
398 }
399 list_add_tail(&sums->list, &tmplist); 391 list_add_tail(&sums->list, &tmplist);
400 } 392 }
401 path->slots[0]++; 393 path->slots[0]++;
@@ -417,23 +409,20 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
417 struct bio *bio, u64 file_start, int contig) 409 struct bio *bio, u64 file_start, int contig)
418{ 410{
419 struct btrfs_ordered_sum *sums; 411 struct btrfs_ordered_sum *sums;
420 struct btrfs_sector_sum *sector_sum;
421 struct btrfs_ordered_extent *ordered; 412 struct btrfs_ordered_extent *ordered;
422 char *data; 413 char *data;
423 struct bio_vec *bvec = bio->bi_io_vec; 414 struct bio_vec *bvec = bio->bi_io_vec;
424 int bio_index = 0; 415 int bio_index = 0;
416 int index;
425 unsigned long total_bytes = 0; 417 unsigned long total_bytes = 0;
426 unsigned long this_sum_bytes = 0; 418 unsigned long this_sum_bytes = 0;
427 u64 offset; 419 u64 offset;
428 u64 disk_bytenr;
429 420
430 WARN_ON(bio->bi_vcnt <= 0); 421 WARN_ON(bio->bi_vcnt <= 0);
431 sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS); 422 sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS);
432 if (!sums) 423 if (!sums)
433 return -ENOMEM; 424 return -ENOMEM;
434 425
435 sector_sum = sums->sums;
436 disk_bytenr = (u64)bio->bi_sector << 9;
437 sums->len = bio->bi_size; 426 sums->len = bio->bi_size;
438 INIT_LIST_HEAD(&sums->list); 427 INIT_LIST_HEAD(&sums->list);
439 428
@@ -444,7 +433,8 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
444 433
445 ordered = btrfs_lookup_ordered_extent(inode, offset); 434 ordered = btrfs_lookup_ordered_extent(inode, offset);
446 BUG_ON(!ordered); /* Logic error */ 435 BUG_ON(!ordered); /* Logic error */
447 sums->bytenr = ordered->start; 436 sums->bytenr = (u64)bio->bi_sector << 9;
437 index = 0;
448 438
449 while (bio_index < bio->bi_vcnt) { 439 while (bio_index < bio->bi_vcnt) {
450 if (!contig) 440 if (!contig)
@@ -463,28 +453,27 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
463 sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left), 453 sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left),
464 GFP_NOFS); 454 GFP_NOFS);
465 BUG_ON(!sums); /* -ENOMEM */ 455 BUG_ON(!sums); /* -ENOMEM */
466 sector_sum = sums->sums;
467 sums->len = bytes_left; 456 sums->len = bytes_left;
468 ordered = btrfs_lookup_ordered_extent(inode, offset); 457 ordered = btrfs_lookup_ordered_extent(inode, offset);
469 BUG_ON(!ordered); /* Logic error */ 458 BUG_ON(!ordered); /* Logic error */
470 sums->bytenr = ordered->start; 459 sums->bytenr = ((u64)bio->bi_sector << 9) +
460 total_bytes;
461 index = 0;
471 } 462 }
472 463
473 data = kmap_atomic(bvec->bv_page); 464 data = kmap_atomic(bvec->bv_page);
474 sector_sum->sum = ~(u32)0; 465 sums->sums[index] = ~(u32)0;
475 sector_sum->sum = btrfs_csum_data(data + bvec->bv_offset, 466 sums->sums[index] = btrfs_csum_data(data + bvec->bv_offset,
476 sector_sum->sum, 467 sums->sums[index],
477 bvec->bv_len); 468 bvec->bv_len);
478 kunmap_atomic(data); 469 kunmap_atomic(data);
479 btrfs_csum_final(sector_sum->sum, 470 btrfs_csum_final(sums->sums[index],
480 (char *)&sector_sum->sum); 471 (char *)(sums->sums + index));
481 sector_sum->bytenr = disk_bytenr;
482 472
483 sector_sum++;
484 bio_index++; 473 bio_index++;
474 index++;
485 total_bytes += bvec->bv_len; 475 total_bytes += bvec->bv_len;
486 this_sum_bytes += bvec->bv_len; 476 this_sum_bytes += bvec->bv_len;
487 disk_bytenr += bvec->bv_len;
488 offset += bvec->bv_len; 477 offset += bvec->bv_len;
489 bvec++; 478 bvec++;
490 } 479 }
@@ -672,62 +661,46 @@ out:
672 return ret; 661 return ret;
673} 662}
674 663
675static u64 btrfs_sector_sum_left(struct btrfs_ordered_sum *sums,
676 struct btrfs_sector_sum *sector_sum,
677 u64 total_bytes, u64 sectorsize)
678{
679 u64 tmp = sectorsize;
680 u64 next_sector = sector_sum->bytenr;
681 struct btrfs_sector_sum *next = sector_sum + 1;
682
683 while ((tmp + total_bytes) < sums->len) {
684 if (next_sector + sectorsize != next->bytenr)
685 break;
686 tmp += sectorsize;
687 next_sector = next->bytenr;
688 next++;
689 }
690 return tmp;
691}
692
693int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, 664int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
694 struct btrfs_root *root, 665 struct btrfs_root *root,
695 struct btrfs_ordered_sum *sums) 666 struct btrfs_ordered_sum *sums)
696{ 667{
697 u64 bytenr;
698 int ret;
699 struct btrfs_key file_key; 668 struct btrfs_key file_key;
700 struct btrfs_key found_key; 669 struct btrfs_key found_key;
701 u64 next_offset;
702 u64 total_bytes = 0;
703 int found_next;
704 struct btrfs_path *path; 670 struct btrfs_path *path;
705 struct btrfs_csum_item *item; 671 struct btrfs_csum_item *item;
706 struct btrfs_csum_item *item_end; 672 struct btrfs_csum_item *item_end;
707 struct extent_buffer *leaf = NULL; 673 struct extent_buffer *leaf = NULL;
674 u64 next_offset;
675 u64 total_bytes = 0;
708 u64 csum_offset; 676 u64 csum_offset;
709 struct btrfs_sector_sum *sector_sum; 677 u64 bytenr;
710 u32 nritems; 678 u32 nritems;
711 u32 ins_size; 679 u32 ins_size;
680 int index = 0;
681 int found_next;
682 int ret;
712 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); 683 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
713 684
714 path = btrfs_alloc_path(); 685 path = btrfs_alloc_path();
715 if (!path) 686 if (!path)
716 return -ENOMEM; 687 return -ENOMEM;
717
718 sector_sum = sums->sums;
719again: 688again:
720 next_offset = (u64)-1; 689 next_offset = (u64)-1;
721 found_next = 0; 690 found_next = 0;
691 bytenr = sums->bytenr + total_bytes;
722 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; 692 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
723 file_key.offset = sector_sum->bytenr; 693 file_key.offset = bytenr;
724 bytenr = sector_sum->bytenr;
725 btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY); 694 btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY);
726 695
727 item = btrfs_lookup_csum(trans, root, path, sector_sum->bytenr, 1); 696 item = btrfs_lookup_csum(trans, root, path, bytenr, 1);
728 if (!IS_ERR(item)) { 697 if (!IS_ERR(item)) {
729 leaf = path->nodes[0];
730 ret = 0; 698 ret = 0;
699 leaf = path->nodes[0];
700 item_end = btrfs_item_ptr(leaf, path->slots[0],
701 struct btrfs_csum_item);
702 item_end = (struct btrfs_csum_item *)((char *)item_end +
703 btrfs_item_size_nr(leaf, path->slots[0]));
731 goto found; 704 goto found;
732 } 705 }
733 ret = PTR_ERR(item); 706 ret = PTR_ERR(item);
@@ -807,8 +780,7 @@ again:
807 780
808 free_space = btrfs_leaf_free_space(root, leaf) - 781 free_space = btrfs_leaf_free_space(root, leaf) -
809 sizeof(struct btrfs_item) - csum_size; 782 sizeof(struct btrfs_item) - csum_size;
810 tmp = btrfs_sector_sum_left(sums, sector_sum, total_bytes, 783 tmp = sums->len - total_bytes;
811 root->sectorsize);
812 tmp >>= root->fs_info->sb->s_blocksize_bits; 784 tmp >>= root->fs_info->sb->s_blocksize_bits;
813 WARN_ON(tmp < 1); 785 WARN_ON(tmp < 1);
814 786
@@ -822,6 +794,7 @@ again:
822 diff *= csum_size; 794 diff *= csum_size;
823 795
824 btrfs_extend_item(root, path, diff); 796 btrfs_extend_item(root, path, diff);
797 ret = 0;
825 goto csum; 798 goto csum;
826 } 799 }
827 800
@@ -831,8 +804,7 @@ insert:
831 if (found_next) { 804 if (found_next) {
832 u64 tmp; 805 u64 tmp;
833 806
834 tmp = btrfs_sector_sum_left(sums, sector_sum, total_bytes, 807 tmp = sums->len - total_bytes;
835 root->sectorsize);
836 tmp >>= root->fs_info->sb->s_blocksize_bits; 808 tmp >>= root->fs_info->sb->s_blocksize_bits;
837 tmp = min(tmp, (next_offset - file_key.offset) >> 809 tmp = min(tmp, (next_offset - file_key.offset) >>
838 root->fs_info->sb->s_blocksize_bits); 810 root->fs_info->sb->s_blocksize_bits);
@@ -853,31 +825,25 @@ insert:
853 WARN_ON(1); 825 WARN_ON(1);
854 goto fail_unlock; 826 goto fail_unlock;
855 } 827 }
856csum:
857 leaf = path->nodes[0]; 828 leaf = path->nodes[0];
829csum:
858 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); 830 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
859 ret = 0; 831 item_end = (struct btrfs_csum_item *)((unsigned char *)item +
832 btrfs_item_size_nr(leaf, path->slots[0]));
860 item = (struct btrfs_csum_item *)((unsigned char *)item + 833 item = (struct btrfs_csum_item *)((unsigned char *)item +
861 csum_offset * csum_size); 834 csum_offset * csum_size);
862found: 835found:
863 item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); 836 ins_size = (u32)(sums->len - total_bytes) >>
864 item_end = (struct btrfs_csum_item *)((unsigned char *)item_end + 837 root->fs_info->sb->s_blocksize_bits;
865 btrfs_item_size_nr(leaf, path->slots[0])); 838 ins_size *= csum_size;
866next_sector: 839 ins_size = min_t(u32, (unsigned long)item_end - (unsigned long)item,
867 840 ins_size);
868 write_extent_buffer(leaf, &sector_sum->sum, (unsigned long)item, csum_size); 841 write_extent_buffer(leaf, sums->sums + index, (unsigned long)item,
869 842 ins_size);
870 total_bytes += root->sectorsize; 843
871 sector_sum++; 844 ins_size /= csum_size;
872 if (total_bytes < sums->len) { 845 total_bytes += ins_size * root->sectorsize;
873 item = (struct btrfs_csum_item *)((char *)item + 846 index += ins_size;
874 csum_size);
875 if (item < item_end && bytenr + PAGE_CACHE_SIZE ==
876 sector_sum->bytenr) {
877 bytenr = sector_sum->bytenr;
878 goto next_sector;
879 }
880 }
881 847
882 btrfs_mark_buffer_dirty(path->nodes[0]); 848 btrfs_mark_buffer_dirty(path->nodes[0]);
883 if (total_bytes < sums->len) { 849 if (total_bytes < sums->len) {
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 89da56a58b63..a005fe2c072a 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -309,10 +309,6 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
309 ret = PTR_ERR(inode_root); 309 ret = PTR_ERR(inode_root);
310 goto cleanup; 310 goto cleanup;
311 } 311 }
312 if (btrfs_root_refs(&inode_root->root_item) == 0) {
313 ret = -ENOENT;
314 goto cleanup;
315 }
316 312
317 key.objectid = defrag->ino; 313 key.objectid = defrag->ino;
318 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 314 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
@@ -1317,6 +1313,56 @@ fail:
1317 1313
1318} 1314}
1319 1315
1316static noinline int check_can_nocow(struct inode *inode, loff_t pos,
1317 size_t *write_bytes)
1318{
1319 struct btrfs_trans_handle *trans;
1320 struct btrfs_root *root = BTRFS_I(inode)->root;
1321 struct btrfs_ordered_extent *ordered;
1322 u64 lockstart, lockend;
1323 u64 num_bytes;
1324 int ret;
1325
1326 lockstart = round_down(pos, root->sectorsize);
1327 lockend = lockstart + round_up(*write_bytes, root->sectorsize) - 1;
1328
1329 while (1) {
1330 lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
1331 ordered = btrfs_lookup_ordered_range(inode, lockstart,
1332 lockend - lockstart + 1);
1333 if (!ordered) {
1334 break;
1335 }
1336 unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
1337 btrfs_start_ordered_extent(inode, ordered, 1);
1338 btrfs_put_ordered_extent(ordered);
1339 }
1340
1341 trans = btrfs_join_transaction(root);
1342 if (IS_ERR(trans)) {
1343 unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
1344 return PTR_ERR(trans);
1345 }
1346
1347 num_bytes = lockend - lockstart + 1;
1348 ret = can_nocow_extent(trans, inode, lockstart, &num_bytes, NULL, NULL,
1349 NULL);
1350 btrfs_end_transaction(trans, root);
1351 if (ret <= 0) {
1352 ret = 0;
1353 } else {
1354 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
1355 EXTENT_DIRTY | EXTENT_DELALLOC |
1356 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
1357 NULL, GFP_NOFS);
1358 *write_bytes = min_t(size_t, *write_bytes, num_bytes);
1359 }
1360
1361 unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
1362
1363 return ret;
1364}
1365
1320static noinline ssize_t __btrfs_buffered_write(struct file *file, 1366static noinline ssize_t __btrfs_buffered_write(struct file *file,
1321 struct iov_iter *i, 1367 struct iov_iter *i,
1322 loff_t pos) 1368 loff_t pos)
@@ -1324,10 +1370,12 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1324 struct inode *inode = file_inode(file); 1370 struct inode *inode = file_inode(file);
1325 struct btrfs_root *root = BTRFS_I(inode)->root; 1371 struct btrfs_root *root = BTRFS_I(inode)->root;
1326 struct page **pages = NULL; 1372 struct page **pages = NULL;
1373 u64 release_bytes = 0;
1327 unsigned long first_index; 1374 unsigned long first_index;
1328 size_t num_written = 0; 1375 size_t num_written = 0;
1329 int nrptrs; 1376 int nrptrs;
1330 int ret = 0; 1377 int ret = 0;
1378 bool only_release_metadata = false;
1331 bool force_page_uptodate = false; 1379 bool force_page_uptodate = false;
1332 1380
1333 nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) / 1381 nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
@@ -1348,6 +1396,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1348 offset); 1396 offset);
1349 size_t num_pages = (write_bytes + offset + 1397 size_t num_pages = (write_bytes + offset +
1350 PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1398 PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1399 size_t reserve_bytes;
1351 size_t dirty_pages; 1400 size_t dirty_pages;
1352 size_t copied; 1401 size_t copied;
1353 1402
@@ -1362,11 +1411,41 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1362 break; 1411 break;
1363 } 1412 }
1364 1413
1365 ret = btrfs_delalloc_reserve_space(inode, 1414 reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
1366 num_pages << PAGE_CACHE_SHIFT); 1415 ret = btrfs_check_data_free_space(inode, reserve_bytes);
1416 if (ret == -ENOSPC &&
1417 (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
1418 BTRFS_INODE_PREALLOC))) {
1419 ret = check_can_nocow(inode, pos, &write_bytes);
1420 if (ret > 0) {
1421 only_release_metadata = true;
1422 /*
1423 * our prealloc extent may be smaller than
1424 * write_bytes, so scale down.
1425 */
1426 num_pages = (write_bytes + offset +
1427 PAGE_CACHE_SIZE - 1) >>
1428 PAGE_CACHE_SHIFT;
1429 reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
1430 ret = 0;
1431 } else {
1432 ret = -ENOSPC;
1433 }
1434 }
1435
1367 if (ret) 1436 if (ret)
1368 break; 1437 break;
1369 1438
1439 ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes);
1440 if (ret) {
1441 if (!only_release_metadata)
1442 btrfs_free_reserved_data_space(inode,
1443 reserve_bytes);
1444 break;
1445 }
1446
1447 release_bytes = reserve_bytes;
1448
1370 /* 1449 /*
1371 * This is going to setup the pages array with the number of 1450 * This is going to setup the pages array with the number of
1372 * pages we want, so we don't really need to worry about the 1451 * pages we want, so we don't really need to worry about the
@@ -1375,11 +1454,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1375 ret = prepare_pages(root, file, pages, num_pages, 1454 ret = prepare_pages(root, file, pages, num_pages,
1376 pos, first_index, write_bytes, 1455 pos, first_index, write_bytes,
1377 force_page_uptodate); 1456 force_page_uptodate);
1378 if (ret) { 1457 if (ret)
1379 btrfs_delalloc_release_space(inode,
1380 num_pages << PAGE_CACHE_SHIFT);
1381 break; 1458 break;
1382 }
1383 1459
1384 copied = btrfs_copy_from_user(pos, num_pages, 1460 copied = btrfs_copy_from_user(pos, num_pages,
1385 write_bytes, pages, i); 1461 write_bytes, pages, i);
@@ -1409,30 +1485,46 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1409 * managed to copy. 1485 * managed to copy.
1410 */ 1486 */
1411 if (num_pages > dirty_pages) { 1487 if (num_pages > dirty_pages) {
1488 release_bytes = (num_pages - dirty_pages) <<
1489 PAGE_CACHE_SHIFT;
1412 if (copied > 0) { 1490 if (copied > 0) {
1413 spin_lock(&BTRFS_I(inode)->lock); 1491 spin_lock(&BTRFS_I(inode)->lock);
1414 BTRFS_I(inode)->outstanding_extents++; 1492 BTRFS_I(inode)->outstanding_extents++;
1415 spin_unlock(&BTRFS_I(inode)->lock); 1493 spin_unlock(&BTRFS_I(inode)->lock);
1416 } 1494 }
1417 btrfs_delalloc_release_space(inode, 1495 if (only_release_metadata)
1418 (num_pages - dirty_pages) << 1496 btrfs_delalloc_release_metadata(inode,
1419 PAGE_CACHE_SHIFT); 1497 release_bytes);
1498 else
1499 btrfs_delalloc_release_space(inode,
1500 release_bytes);
1420 } 1501 }
1421 1502
1503 release_bytes = dirty_pages << PAGE_CACHE_SHIFT;
1422 if (copied > 0) { 1504 if (copied > 0) {
1423 ret = btrfs_dirty_pages(root, inode, pages, 1505 ret = btrfs_dirty_pages(root, inode, pages,
1424 dirty_pages, pos, copied, 1506 dirty_pages, pos, copied,
1425 NULL); 1507 NULL);
1426 if (ret) { 1508 if (ret) {
1427 btrfs_delalloc_release_space(inode,
1428 dirty_pages << PAGE_CACHE_SHIFT);
1429 btrfs_drop_pages(pages, num_pages); 1509 btrfs_drop_pages(pages, num_pages);
1430 break; 1510 break;
1431 } 1511 }
1432 } 1512 }
1433 1513
1514 release_bytes = 0;
1434 btrfs_drop_pages(pages, num_pages); 1515 btrfs_drop_pages(pages, num_pages);
1435 1516
1517 if (only_release_metadata && copied > 0) {
1518 u64 lockstart = round_down(pos, root->sectorsize);
1519 u64 lockend = lockstart +
1520 (dirty_pages << PAGE_CACHE_SHIFT) - 1;
1521
1522 set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
1523 lockend, EXTENT_NORESERVE, NULL,
1524 NULL, GFP_NOFS);
1525 only_release_metadata = false;
1526 }
1527
1436 cond_resched(); 1528 cond_resched();
1437 1529
1438 balance_dirty_pages_ratelimited(inode->i_mapping); 1530 balance_dirty_pages_ratelimited(inode->i_mapping);
@@ -1445,6 +1537,13 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1445 1537
1446 kfree(pages); 1538 kfree(pages);
1447 1539
1540 if (release_bytes) {
1541 if (only_release_metadata)
1542 btrfs_delalloc_release_metadata(inode, release_bytes);
1543 else
1544 btrfs_delalloc_release_space(inode, release_bytes);
1545 }
1546
1448 return num_written ? num_written : ret; 1547 return num_written ? num_written : ret;
1449} 1548}
1450 1549
@@ -2175,12 +2274,6 @@ static long btrfs_fallocate(struct file *file, int mode,
2175 goto out_reserve_fail; 2274 goto out_reserve_fail;
2176 } 2275 }
2177 2276
2178 /*
2179 * wait for ordered IO before we have any locks. We'll loop again
2180 * below with the locks held.
2181 */
2182 btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
2183
2184 mutex_lock(&inode->i_mutex); 2277 mutex_lock(&inode->i_mutex);
2185 ret = inode_newsize_ok(inode, alloc_end); 2278 ret = inode_newsize_ok(inode, alloc_end);
2186 if (ret) 2279 if (ret)
@@ -2191,8 +2284,23 @@ static long btrfs_fallocate(struct file *file, int mode,
2191 alloc_start); 2284 alloc_start);
2192 if (ret) 2285 if (ret)
2193 goto out; 2286 goto out;
2287 } else {
2288 /*
2289 * If we are fallocating from the end of the file onward we
2290 * need to zero out the end of the page if i_size lands in the
2291 * middle of a page.
2292 */
2293 ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);
2294 if (ret)
2295 goto out;
2194 } 2296 }
2195 2297
2298 /*
2299 * wait for ordered IO before we have any locks. We'll loop again
2300 * below with the locks held.
2301 */
2302 btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
2303
2196 locked_end = alloc_end - 1; 2304 locked_end = alloc_end - 1;
2197 while (1) { 2305 while (1) {
2198 struct btrfs_ordered_extent *ordered; 2306 struct btrfs_ordered_extent *ordered;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index e53009657f0e..b21a3cd667d8 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -213,7 +213,7 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_root *root,
213 else 213 else
214 ret = 0; 214 ret = 0;
215 spin_unlock(&rsv->lock); 215 spin_unlock(&rsv->lock);
216 return 0; 216 return ret;
217} 217}
218 218
219int btrfs_truncate_free_space_cache(struct btrfs_root *root, 219int btrfs_truncate_free_space_cache(struct btrfs_root *root,
@@ -3150,6 +3150,8 @@ again:
3150 return 0; 3150 return 0;
3151} 3151}
3152 3152
3153#define test_msg(fmt, ...) printk(KERN_INFO "btrfs: selftest: " fmt, ##__VA_ARGS__)
3154
3153/* 3155/*
3154 * This test just does basic sanity checking, making sure we can add an exten 3156 * This test just does basic sanity checking, making sure we can add an exten
3155 * entry and remove space from either end and the middle, and make sure we can 3157 * entry and remove space from either end and the middle, and make sure we can
@@ -3159,63 +3161,63 @@ static int test_extents(struct btrfs_block_group_cache *cache)
3159{ 3161{
3160 int ret = 0; 3162 int ret = 0;
3161 3163
3162 printk(KERN_ERR "Running extent only tests\n"); 3164 test_msg("Running extent only tests\n");
3163 3165
3164 /* First just make sure we can remove an entire entry */ 3166 /* First just make sure we can remove an entire entry */
3165 ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024); 3167 ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024);
3166 if (ret) { 3168 if (ret) {
3167 printk(KERN_ERR "Error adding initial extents %d\n", ret); 3169 test_msg("Error adding initial extents %d\n", ret);
3168 return ret; 3170 return ret;
3169 } 3171 }
3170 3172
3171 ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024); 3173 ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024);
3172 if (ret) { 3174 if (ret) {
3173 printk(KERN_ERR "Error removing extent %d\n", ret); 3175 test_msg("Error removing extent %d\n", ret);
3174 return ret; 3176 return ret;
3175 } 3177 }
3176 3178
3177 if (check_exists(cache, 0, 4 * 1024 * 1024)) { 3179 if (check_exists(cache, 0, 4 * 1024 * 1024)) {
3178 printk(KERN_ERR "Full remove left some lingering space\n"); 3180 test_msg("Full remove left some lingering space\n");
3179 return -1; 3181 return -1;
3180 } 3182 }
3181 3183
3182 /* Ok edge and middle cases now */ 3184 /* Ok edge and middle cases now */
3183 ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024); 3185 ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024);
3184 if (ret) { 3186 if (ret) {
3185 printk(KERN_ERR "Error adding half extent %d\n", ret); 3187 test_msg("Error adding half extent %d\n", ret);
3186 return ret; 3188 return ret;
3187 } 3189 }
3188 3190
3189 ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 1 * 1024 * 1024); 3191 ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 1 * 1024 * 1024);
3190 if (ret) { 3192 if (ret) {
3191 printk(KERN_ERR "Error removing tail end %d\n", ret); 3193 test_msg("Error removing tail end %d\n", ret);
3192 return ret; 3194 return ret;
3193 } 3195 }
3194 3196
3195 ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024); 3197 ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024);
3196 if (ret) { 3198 if (ret) {
3197 printk(KERN_ERR "Error removing front end %d\n", ret); 3199 test_msg("Error removing front end %d\n", ret);
3198 return ret; 3200 return ret;
3199 } 3201 }
3200 3202
3201 ret = btrfs_remove_free_space(cache, 2 * 1024 * 1024, 4096); 3203 ret = btrfs_remove_free_space(cache, 2 * 1024 * 1024, 4096);
3202 if (ret) { 3204 if (ret) {
3203 printk(KERN_ERR "Error removing middle peice %d\n", ret); 3205 test_msg("Error removing middle piece %d\n", ret);
3204 return ret; 3206 return ret;
3205 } 3207 }
3206 3208
3207 if (check_exists(cache, 0, 1 * 1024 * 1024)) { 3209 if (check_exists(cache, 0, 1 * 1024 * 1024)) {
3208 printk(KERN_ERR "Still have space at the front\n"); 3210 test_msg("Still have space at the front\n");
3209 return -1; 3211 return -1;
3210 } 3212 }
3211 3213
3212 if (check_exists(cache, 2 * 1024 * 1024, 4096)) { 3214 if (check_exists(cache, 2 * 1024 * 1024, 4096)) {
3213 printk(KERN_ERR "Still have space in the middle\n"); 3215 test_msg("Still have space in the middle\n");
3214 return -1; 3216 return -1;
3215 } 3217 }
3216 3218
3217 if (check_exists(cache, 3 * 1024 * 1024, 1 * 1024 * 1024)) { 3219 if (check_exists(cache, 3 * 1024 * 1024, 1 * 1024 * 1024)) {
3218 printk(KERN_ERR "Still have space at the end\n"); 3220 test_msg("Still have space at the end\n");
3219 return -1; 3221 return -1;
3220 } 3222 }
3221 3223
@@ -3230,34 +3232,34 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache)
3230 u64 next_bitmap_offset; 3232 u64 next_bitmap_offset;
3231 int ret; 3233 int ret;
3232 3234
3233 printk(KERN_ERR "Running bitmap only tests\n"); 3235 test_msg("Running bitmap only tests\n");
3234 3236
3235 ret = add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1); 3237 ret = add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1);
3236 if (ret) { 3238 if (ret) {
3237 printk(KERN_ERR "Couldn't create a bitmap entry %d\n", ret); 3239 test_msg("Couldn't create a bitmap entry %d\n", ret);
3238 return ret; 3240 return ret;
3239 } 3241 }
3240 3242
3241 ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024); 3243 ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024);
3242 if (ret) { 3244 if (ret) {
3243 printk(KERN_ERR "Error removing bitmap full range %d\n", ret); 3245 test_msg("Error removing bitmap full range %d\n", ret);
3244 return ret; 3246 return ret;
3245 } 3247 }
3246 3248
3247 if (check_exists(cache, 0, 4 * 1024 * 1024)) { 3249 if (check_exists(cache, 0, 4 * 1024 * 1024)) {
3248 printk(KERN_ERR "Left some space in bitmap\n"); 3250 test_msg("Left some space in bitmap\n");
3249 return -1; 3251 return -1;
3250 } 3252 }
3251 3253
3252 ret = add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1); 3254 ret = add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1);
3253 if (ret) { 3255 if (ret) {
3254 printk(KERN_ERR "Couldn't add to our bitmap entry %d\n", ret); 3256 test_msg("Couldn't add to our bitmap entry %d\n", ret);
3255 return ret; 3257 return ret;
3256 } 3258 }
3257 3259
3258 ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 2 * 1024 * 1024); 3260 ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 2 * 1024 * 1024);
3259 if (ret) { 3261 if (ret) {
3260 printk(KERN_ERR "Couldn't remove middle chunk %d\n", ret); 3262 test_msg("Couldn't remove middle chunk %d\n", ret);
3261 return ret; 3263 return ret;
3262 } 3264 }
3263 3265
@@ -3271,21 +3273,21 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache)
3271 ret = add_free_space_entry(cache, next_bitmap_offset - 3273 ret = add_free_space_entry(cache, next_bitmap_offset -
3272 (2 * 1024 * 1024), 4 * 1024 * 1024, 1); 3274 (2 * 1024 * 1024), 4 * 1024 * 1024, 1);
3273 if (ret) { 3275 if (ret) {
3274 printk(KERN_ERR "Couldn't add space that straddles two bitmaps" 3276 test_msg("Couldn't add space that straddles two bitmaps %d\n",
3275 " %d\n", ret); 3277 ret);
3276 return ret; 3278 return ret;
3277 } 3279 }
3278 3280
3279 ret = btrfs_remove_free_space(cache, next_bitmap_offset - 3281 ret = btrfs_remove_free_space(cache, next_bitmap_offset -
3280 (1 * 1024 * 1024), 2 * 1024 * 1024); 3282 (1 * 1024 * 1024), 2 * 1024 * 1024);
3281 if (ret) { 3283 if (ret) {
3282 printk(KERN_ERR "Couldn't remove overlapping space %d\n", ret); 3284 test_msg("Couldn't remove overlapping space %d\n", ret);
3283 return ret; 3285 return ret;
3284 } 3286 }
3285 3287
3286 if (check_exists(cache, next_bitmap_offset - (1 * 1024 * 1024), 3288 if (check_exists(cache, next_bitmap_offset - (1 * 1024 * 1024),
3287 2 * 1024 * 1024)) { 3289 2 * 1024 * 1024)) {
3288 printk(KERN_ERR "Left some space when removing overlapping\n"); 3290 test_msg("Left some space when removing overlapping\n");
3289 return -1; 3291 return -1;
3290 } 3292 }
3291 3293
@@ -3300,7 +3302,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
3300 u64 bitmap_offset = (u64)(BITS_PER_BITMAP * 4096); 3302 u64 bitmap_offset = (u64)(BITS_PER_BITMAP * 4096);
3301 int ret; 3303 int ret;
3302 3304
3303 printk(KERN_ERR "Running bitmap and extent tests\n"); 3305 test_msg("Running bitmap and extent tests\n");
3304 3306
3305 /* 3307 /*
3306 * First let's do something simple, an extent at the same offset as the 3308 * First let's do something simple, an extent at the same offset as the
@@ -3309,42 +3311,42 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
3309 */ 3311 */
3310 ret = add_free_space_entry(cache, 4 * 1024 * 1024, 1 * 1024 * 1024, 1); 3312 ret = add_free_space_entry(cache, 4 * 1024 * 1024, 1 * 1024 * 1024, 1);
3311 if (ret) { 3313 if (ret) {
3312 printk(KERN_ERR "Couldn't create bitmap entry %d\n", ret); 3314 test_msg("Couldn't create bitmap entry %d\n", ret);
3313 return ret; 3315 return ret;
3314 } 3316 }
3315 3317
3316 ret = add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0); 3318 ret = add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0);
3317 if (ret) { 3319 if (ret) {
3318 printk(KERN_ERR "Couldn't add extent entry %d\n", ret); 3320 test_msg("Couldn't add extent entry %d\n", ret);
3319 return ret; 3321 return ret;
3320 } 3322 }
3321 3323
3322 ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024); 3324 ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024);
3323 if (ret) { 3325 if (ret) {
3324 printk(KERN_ERR "Couldn't remove extent entry %d\n", ret); 3326 test_msg("Couldn't remove extent entry %d\n", ret);
3325 return ret; 3327 return ret;
3326 } 3328 }
3327 3329
3328 if (check_exists(cache, 0, 1 * 1024 * 1024)) { 3330 if (check_exists(cache, 0, 1 * 1024 * 1024)) {
3329 printk(KERN_ERR "Left remnants after our remove\n"); 3331 test_msg("Left remnants after our remove\n");
3330 return -1; 3332 return -1;
3331 } 3333 }
3332 3334
3333 /* Now to add back the extent entry and remove from the bitmap */ 3335 /* Now to add back the extent entry and remove from the bitmap */
3334 ret = add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0); 3336 ret = add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0);
3335 if (ret) { 3337 if (ret) {
3336 printk(KERN_ERR "Couldn't re-add extent entry %d\n", ret); 3338 test_msg("Couldn't re-add extent entry %d\n", ret);
3337 return ret; 3339 return ret;
3338 } 3340 }
3339 3341
3340 ret = btrfs_remove_free_space(cache, 4 * 1024 * 1024, 1 * 1024 * 1024); 3342 ret = btrfs_remove_free_space(cache, 4 * 1024 * 1024, 1 * 1024 * 1024);
3341 if (ret) { 3343 if (ret) {
3342 printk(KERN_ERR "Couldn't remove from bitmap %d\n", ret); 3344 test_msg("Couldn't remove from bitmap %d\n", ret);
3343 return ret; 3345 return ret;
3344 } 3346 }
3345 3347
3346 if (check_exists(cache, 4 * 1024 * 1024, 1 * 1024 * 1024)) { 3348 if (check_exists(cache, 4 * 1024 * 1024, 1 * 1024 * 1024)) {
3347 printk(KERN_ERR "Left remnants in the bitmap\n"); 3349 test_msg("Left remnants in the bitmap\n");
3348 return -1; 3350 return -1;
3349 } 3351 }
3350 3352
@@ -3354,19 +3356,18 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
3354 */ 3356 */
3355 ret = add_free_space_entry(cache, 1 * 1024 * 1024, 4 * 1024 * 1024, 1); 3357 ret = add_free_space_entry(cache, 1 * 1024 * 1024, 4 * 1024 * 1024, 1);
3356 if (ret) { 3358 if (ret) {
3357 printk(KERN_ERR "Couldn't add to a bitmap %d\n", ret); 3359 test_msg("Couldn't add to a bitmap %d\n", ret);
3358 return ret; 3360 return ret;
3359 } 3361 }
3360 3362
3361 ret = btrfs_remove_free_space(cache, 512 * 1024, 3 * 1024 * 1024); 3363 ret = btrfs_remove_free_space(cache, 512 * 1024, 3 * 1024 * 1024);
3362 if (ret) { 3364 if (ret) {
3363 printk(KERN_ERR "Couldn't remove overlapping space %d\n", ret); 3365 test_msg("Couldn't remove overlapping space %d\n", ret);
3364 return ret; 3366 return ret;
3365 } 3367 }
3366 3368
3367 if (check_exists(cache, 512 * 1024, 3 * 1024 * 1024)) { 3369 if (check_exists(cache, 512 * 1024, 3 * 1024 * 1024)) {
3368 printk(KERN_ERR "Left over peices after removing " 3370 test_msg("Left over peices after removing overlapping\n");
3369 "overlapping\n");
3370 return -1; 3371 return -1;
3371 } 3372 }
3372 3373
@@ -3375,24 +3376,24 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
3375 /* Now with the extent entry offset into the bitmap */ 3376 /* Now with the extent entry offset into the bitmap */
3376 ret = add_free_space_entry(cache, 4 * 1024 * 1024, 4 * 1024 * 1024, 1); 3377 ret = add_free_space_entry(cache, 4 * 1024 * 1024, 4 * 1024 * 1024, 1);
3377 if (ret) { 3378 if (ret) {
3378 printk(KERN_ERR "Couldn't add space to the bitmap %d\n", ret); 3379 test_msg("Couldn't add space to the bitmap %d\n", ret);
3379 return ret; 3380 return ret;
3380 } 3381 }
3381 3382
3382 ret = add_free_space_entry(cache, 2 * 1024 * 1024, 2 * 1024 * 1024, 0); 3383 ret = add_free_space_entry(cache, 2 * 1024 * 1024, 2 * 1024 * 1024, 0);
3383 if (ret) { 3384 if (ret) {
3384 printk(KERN_ERR "Couldn't add extent to the cache %d\n", ret); 3385 test_msg("Couldn't add extent to the cache %d\n", ret);
3385 return ret; 3386 return ret;
3386 } 3387 }
3387 3388
3388 ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 4 * 1024 * 1024); 3389 ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 4 * 1024 * 1024);
3389 if (ret) { 3390 if (ret) {
3390 printk(KERN_ERR "Problem removing overlapping space %d\n", ret); 3391 test_msg("Problem removing overlapping space %d\n", ret);
3391 return ret; 3392 return ret;
3392 } 3393 }
3393 3394
3394 if (check_exists(cache, 3 * 1024 * 1024, 4 * 1024 * 1024)) { 3395 if (check_exists(cache, 3 * 1024 * 1024, 4 * 1024 * 1024)) {
3395 printk(KERN_ERR "Left something behind when removing space"); 3396 test_msg("Left something behind when removing space");
3396 return -1; 3397 return -1;
3397 } 3398 }
3398 3399
@@ -3410,27 +3411,27 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
3410 ret = add_free_space_entry(cache, bitmap_offset + 4 * 1024 * 1024, 3411 ret = add_free_space_entry(cache, bitmap_offset + 4 * 1024 * 1024,
3411 4 * 1024 * 1024, 1); 3412 4 * 1024 * 1024, 1);
3412 if (ret) { 3413 if (ret) {
3413 printk(KERN_ERR "Couldn't add bitmap %d\n", ret); 3414 test_msg("Couldn't add bitmap %d\n", ret);
3414 return ret; 3415 return ret;
3415 } 3416 }
3416 3417
3417 ret = add_free_space_entry(cache, bitmap_offset - 1 * 1024 * 1024, 3418 ret = add_free_space_entry(cache, bitmap_offset - 1 * 1024 * 1024,
3418 5 * 1024 * 1024, 0); 3419 5 * 1024 * 1024, 0);
3419 if (ret) { 3420 if (ret) {
3420 printk(KERN_ERR "Couldn't add extent entry %d\n", ret); 3421 test_msg("Couldn't add extent entry %d\n", ret);
3421 return ret; 3422 return ret;
3422 } 3423 }
3423 3424
3424 ret = btrfs_remove_free_space(cache, bitmap_offset + 1 * 1024 * 1024, 3425 ret = btrfs_remove_free_space(cache, bitmap_offset + 1 * 1024 * 1024,
3425 5 * 1024 * 1024); 3426 5 * 1024 * 1024);
3426 if (ret) { 3427 if (ret) {
3427 printk(KERN_ERR "Failed to free our space %d\n", ret); 3428 test_msg("Failed to free our space %d\n", ret);
3428 return ret; 3429 return ret;
3429 } 3430 }
3430 3431
3431 if (check_exists(cache, bitmap_offset + 1 * 1024 * 1024, 3432 if (check_exists(cache, bitmap_offset + 1 * 1024 * 1024,
3432 5 * 1024 * 1024)) { 3433 5 * 1024 * 1024)) {
3433 printk(KERN_ERR "Left stuff over\n"); 3434 test_msg("Left stuff over\n");
3434 return -1; 3435 return -1;
3435 } 3436 }
3436 3437
@@ -3444,20 +3445,19 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
3444 */ 3445 */
3445 ret = add_free_space_entry(cache, 1 * 1024 * 1024, 2 * 1024 * 1024, 1); 3446 ret = add_free_space_entry(cache, 1 * 1024 * 1024, 2 * 1024 * 1024, 1);
3446 if (ret) { 3447 if (ret) {
3447 printk(KERN_ERR "Couldn't add bitmap entry %d\n", ret); 3448 test_msg("Couldn't add bitmap entry %d\n", ret);
3448 return ret; 3449 return ret;
3449 } 3450 }
3450 3451
3451 ret = add_free_space_entry(cache, 3 * 1024 * 1024, 1 * 1024 * 1024, 0); 3452 ret = add_free_space_entry(cache, 3 * 1024 * 1024, 1 * 1024 * 1024, 0);
3452 if (ret) { 3453 if (ret) {
3453 printk(KERN_ERR "Couldn't add extent entry %d\n", ret); 3454 test_msg("Couldn't add extent entry %d\n", ret);
3454 return ret; 3455 return ret;
3455 } 3456 }
3456 3457
3457 ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 3 * 1024 * 1024); 3458 ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 3 * 1024 * 1024);
3458 if (ret) { 3459 if (ret) {
3459 printk(KERN_ERR "Error removing bitmap and extent " 3460 test_msg("Error removing bitmap and extent overlapping %d\n", ret);
3460 "overlapping %d\n", ret);
3461 return ret; 3461 return ret;
3462 } 3462 }
3463 3463
@@ -3469,11 +3469,11 @@ void btrfs_test_free_space_cache(void)
3469{ 3469{
3470 struct btrfs_block_group_cache *cache; 3470 struct btrfs_block_group_cache *cache;
3471 3471
3472 printk(KERN_ERR "Running btrfs free space cache tests\n"); 3472 test_msg("Running btrfs free space cache tests\n");
3473 3473
3474 cache = init_test_block_group(); 3474 cache = init_test_block_group();
3475 if (!cache) { 3475 if (!cache) {
3476 printk(KERN_ERR "Couldn't run the tests\n"); 3476 test_msg("Couldn't run the tests\n");
3477 return; 3477 return;
3478 } 3478 }
3479 3479
@@ -3487,6 +3487,9 @@ out:
3487 __btrfs_remove_free_space_cache(cache->free_space_ctl); 3487 __btrfs_remove_free_space_cache(cache->free_space_ctl);
3488 kfree(cache->free_space_ctl); 3488 kfree(cache->free_space_ctl);
3489 kfree(cache); 3489 kfree(cache);
3490 printk(KERN_ERR "Free space cache tests finished\n"); 3490 test_msg("Free space cache tests finished\n");
3491} 3491}
3492#endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */ 3492#undef test_msg
3493#else /* !CONFIG_BTRFS_FS_RUN_SANITY_TESTS */
3494void btrfs_test_free_space_cache(void) {}
3495#endif /* !CONFIG_BTRFS_FS_RUN_SANITY_TESTS */
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 8b7f19f44961..894116b71304 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -113,8 +113,6 @@ int btrfs_return_cluster_to_free_space(
113int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, 113int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
114 u64 *trimmed, u64 start, u64 end, u64 minlen); 114 u64 *trimmed, u64 start, u64 end, u64 minlen);
115 115
116#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
117void btrfs_test_free_space_cache(void); 116void btrfs_test_free_space_cache(void);
118#endif
119 117
120#endif 118#endif
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 4f9d16b70d3d..6d1b93c8aafb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -42,6 +42,7 @@
42#include <linux/mount.h> 42#include <linux/mount.h>
43#include <linux/btrfs.h> 43#include <linux/btrfs.h>
44#include <linux/blkdev.h> 44#include <linux/blkdev.h>
45#include <linux/posix_acl_xattr.h>
45#include "compat.h" 46#include "compat.h"
46#include "ctree.h" 47#include "ctree.h"
47#include "disk-io.h" 48#include "disk-io.h"
@@ -57,6 +58,7 @@
57#include "free-space-cache.h" 58#include "free-space-cache.h"
58#include "inode-map.h" 59#include "inode-map.h"
59#include "backref.h" 60#include "backref.h"
61#include "hash.h"
60 62
61struct btrfs_iget_args { 63struct btrfs_iget_args {
62 u64 ino; 64 u64 ino;
@@ -701,8 +703,12 @@ retry:
701 async_extent->nr_pages = 0; 703 async_extent->nr_pages = 0;
702 async_extent->pages = NULL; 704 async_extent->pages = NULL;
703 705
704 if (ret == -ENOSPC) 706 if (ret == -ENOSPC) {
707 unlock_extent(io_tree, async_extent->start,
708 async_extent->start +
709 async_extent->ram_size - 1);
705 goto retry; 710 goto retry;
711 }
706 goto out_free; 712 goto out_free;
707 } 713 }
708 714
@@ -1529,6 +1535,46 @@ static void btrfs_merge_extent_hook(struct inode *inode,
1529 spin_unlock(&BTRFS_I(inode)->lock); 1535 spin_unlock(&BTRFS_I(inode)->lock);
1530} 1536}
1531 1537
1538static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
1539 struct inode *inode)
1540{
1541 spin_lock(&root->delalloc_lock);
1542 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1543 list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1544 &root->delalloc_inodes);
1545 set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1546 &BTRFS_I(inode)->runtime_flags);
1547 root->nr_delalloc_inodes++;
1548 if (root->nr_delalloc_inodes == 1) {
1549 spin_lock(&root->fs_info->delalloc_root_lock);
1550 BUG_ON(!list_empty(&root->delalloc_root));
1551 list_add_tail(&root->delalloc_root,
1552 &root->fs_info->delalloc_roots);
1553 spin_unlock(&root->fs_info->delalloc_root_lock);
1554 }
1555 }
1556 spin_unlock(&root->delalloc_lock);
1557}
1558
1559static void btrfs_del_delalloc_inode(struct btrfs_root *root,
1560 struct inode *inode)
1561{
1562 spin_lock(&root->delalloc_lock);
1563 if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1564 list_del_init(&BTRFS_I(inode)->delalloc_inodes);
1565 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1566 &BTRFS_I(inode)->runtime_flags);
1567 root->nr_delalloc_inodes--;
1568 if (!root->nr_delalloc_inodes) {
1569 spin_lock(&root->fs_info->delalloc_root_lock);
1570 BUG_ON(list_empty(&root->delalloc_root));
1571 list_del_init(&root->delalloc_root);
1572 spin_unlock(&root->fs_info->delalloc_root_lock);
1573 }
1574 }
1575 spin_unlock(&root->delalloc_lock);
1576}
1577
1532/* 1578/*
1533 * extent_io.c set_bit_hook, used to track delayed allocation 1579 * extent_io.c set_bit_hook, used to track delayed allocation
1534 * bytes in this file, and to maintain the list of inodes that 1580 * bytes in this file, and to maintain the list of inodes that
@@ -1561,16 +1607,8 @@ static void btrfs_set_bit_hook(struct inode *inode,
1561 spin_lock(&BTRFS_I(inode)->lock); 1607 spin_lock(&BTRFS_I(inode)->lock);
1562 BTRFS_I(inode)->delalloc_bytes += len; 1608 BTRFS_I(inode)->delalloc_bytes += len;
1563 if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, 1609 if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1564 &BTRFS_I(inode)->runtime_flags)) { 1610 &BTRFS_I(inode)->runtime_flags))
1565 spin_lock(&root->fs_info->delalloc_lock); 1611 btrfs_add_delalloc_inodes(root, inode);
1566 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1567 list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1568 &root->fs_info->delalloc_inodes);
1569 set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1570 &BTRFS_I(inode)->runtime_flags);
1571 }
1572 spin_unlock(&root->fs_info->delalloc_lock);
1573 }
1574 spin_unlock(&BTRFS_I(inode)->lock); 1612 spin_unlock(&BTRFS_I(inode)->lock);
1575 } 1613 }
1576} 1614}
@@ -1604,7 +1642,7 @@ static void btrfs_clear_bit_hook(struct inode *inode,
1604 btrfs_delalloc_release_metadata(inode, len); 1642 btrfs_delalloc_release_metadata(inode, len);
1605 1643
1606 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID 1644 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
1607 && do_list) 1645 && do_list && !(state->state & EXTENT_NORESERVE))
1608 btrfs_free_reserved_data_space(inode, len); 1646 btrfs_free_reserved_data_space(inode, len);
1609 1647
1610 __percpu_counter_add(&root->fs_info->delalloc_bytes, -len, 1648 __percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
@@ -1613,15 +1651,8 @@ static void btrfs_clear_bit_hook(struct inode *inode,
1613 BTRFS_I(inode)->delalloc_bytes -= len; 1651 BTRFS_I(inode)->delalloc_bytes -= len;
1614 if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 && 1652 if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
1615 test_bit(BTRFS_INODE_IN_DELALLOC_LIST, 1653 test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1616 &BTRFS_I(inode)->runtime_flags)) { 1654 &BTRFS_I(inode)->runtime_flags))
1617 spin_lock(&root->fs_info->delalloc_lock); 1655 btrfs_del_delalloc_inode(root, inode);
1618 if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1619 list_del_init(&BTRFS_I(inode)->delalloc_inodes);
1620 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1621 &BTRFS_I(inode)->runtime_flags);
1622 }
1623 spin_unlock(&root->fs_info->delalloc_lock);
1624 }
1625 spin_unlock(&BTRFS_I(inode)->lock); 1656 spin_unlock(&BTRFS_I(inode)->lock);
1626 } 1657 }
1627} 1658}
@@ -2263,11 +2294,6 @@ static noinline int relink_extent_backref(struct btrfs_path *path,
2263 return 0; 2294 return 0;
2264 return PTR_ERR(root); 2295 return PTR_ERR(root);
2265 } 2296 }
2266 if (btrfs_root_refs(&root->root_item) == 0) {
2267 srcu_read_unlock(&fs_info->subvol_srcu, index);
2268 /* parse ENOENT to 0 */
2269 return 0;
2270 }
2271 2297
2272 /* step 2: get inode */ 2298 /* step 2: get inode */
2273 key.objectid = backref->inum; 2299 key.objectid = backref->inum;
@@ -3215,13 +3241,16 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
3215 /* 1 for the orphan item deletion. */ 3241 /* 1 for the orphan item deletion. */
3216 trans = btrfs_start_transaction(root, 1); 3242 trans = btrfs_start_transaction(root, 1);
3217 if (IS_ERR(trans)) { 3243 if (IS_ERR(trans)) {
3244 iput(inode);
3218 ret = PTR_ERR(trans); 3245 ret = PTR_ERR(trans);
3219 goto out; 3246 goto out;
3220 } 3247 }
3221 ret = btrfs_orphan_add(trans, inode); 3248 ret = btrfs_orphan_add(trans, inode);
3222 btrfs_end_transaction(trans, root); 3249 btrfs_end_transaction(trans, root);
3223 if (ret) 3250 if (ret) {
3251 iput(inode);
3224 goto out; 3252 goto out;
3253 }
3225 3254
3226 ret = btrfs_truncate(inode); 3255 ret = btrfs_truncate(inode);
3227 if (ret) 3256 if (ret)
@@ -3274,8 +3303,17 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
3274{ 3303{
3275 u32 nritems = btrfs_header_nritems(leaf); 3304 u32 nritems = btrfs_header_nritems(leaf);
3276 struct btrfs_key found_key; 3305 struct btrfs_key found_key;
3306 static u64 xattr_access = 0;
3307 static u64 xattr_default = 0;
3277 int scanned = 0; 3308 int scanned = 0;
3278 3309
3310 if (!xattr_access) {
3311 xattr_access = btrfs_name_hash(POSIX_ACL_XATTR_ACCESS,
3312 strlen(POSIX_ACL_XATTR_ACCESS));
3313 xattr_default = btrfs_name_hash(POSIX_ACL_XATTR_DEFAULT,
3314 strlen(POSIX_ACL_XATTR_DEFAULT));
3315 }
3316
3279 slot++; 3317 slot++;
3280 while (slot < nritems) { 3318 while (slot < nritems) {
3281 btrfs_item_key_to_cpu(leaf, &found_key, slot); 3319 btrfs_item_key_to_cpu(leaf, &found_key, slot);
@@ -3285,8 +3323,11 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
3285 return 0; 3323 return 0;
3286 3324
3287 /* we found an xattr, assume we've got an acl */ 3325 /* we found an xattr, assume we've got an acl */
3288 if (found_key.type == BTRFS_XATTR_ITEM_KEY) 3326 if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
3289 return 1; 3327 if (found_key.offset == xattr_access ||
3328 found_key.offset == xattr_default)
3329 return 1;
3330 }
3290 3331
3291 /* 3332 /*
3292 * we found a key greater than an xattr key, there can't 3333 * we found a key greater than an xattr key, there can't
@@ -3660,53 +3701,20 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
3660 } 3701 }
3661 return ret; 3702 return ret;
3662} 3703}
3663
3664
3665/* helper to check if there is any shared block in the path */
3666static int check_path_shared(struct btrfs_root *root,
3667 struct btrfs_path *path)
3668{
3669 struct extent_buffer *eb;
3670 int level;
3671 u64 refs = 1;
3672
3673 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
3674 int ret;
3675
3676 if (!path->nodes[level])
3677 break;
3678 eb = path->nodes[level];
3679 if (!btrfs_block_can_be_shared(root, eb))
3680 continue;
3681 ret = btrfs_lookup_extent_info(NULL, root, eb->start, level, 1,
3682 &refs, NULL);
3683 if (refs > 1)
3684 return 1;
3685 }
3686 return 0;
3687}
3688 3704
3689/* 3705/*
3690 * helper to start transaction for unlink and rmdir. 3706 * helper to start transaction for unlink and rmdir.
3691 * 3707 *
3692 * unlink and rmdir are special in btrfs, they do not always free space. 3708 * unlink and rmdir are special in btrfs, they do not always free space, so
3693 * so in enospc case, we should make sure they will free space before 3709 * if we cannot make our reservations the normal way try and see if there is
3694 * allowing them to use the global metadata reservation. 3710 * plenty of slack room in the global reserve to migrate, otherwise we cannot
3711 * allow the unlink to occur.
3695 */ 3712 */
3696static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, 3713static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
3697 struct dentry *dentry)
3698{ 3714{
3699 struct btrfs_trans_handle *trans; 3715 struct btrfs_trans_handle *trans;
3700 struct btrfs_root *root = BTRFS_I(dir)->root; 3716 struct btrfs_root *root = BTRFS_I(dir)->root;
3701 struct btrfs_path *path;
3702 struct btrfs_dir_item *di;
3703 struct inode *inode = dentry->d_inode;
3704 u64 index;
3705 int check_link = 1;
3706 int err = -ENOSPC;
3707 int ret; 3717 int ret;
3708 u64 ino = btrfs_ino(inode);
3709 u64 dir_ino = btrfs_ino(dir);
3710 3718
3711 /* 3719 /*
3712 * 1 for the possible orphan item 3720 * 1 for the possible orphan item
@@ -3719,158 +3727,23 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
3719 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) 3727 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
3720 return trans; 3728 return trans;
3721 3729
3722 if (ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) 3730 if (PTR_ERR(trans) == -ENOSPC) {
3723 return ERR_PTR(-ENOSPC); 3731 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5);
3724
3725 /* check if there is someone else holds reference */
3726 if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1)
3727 return ERR_PTR(-ENOSPC);
3728
3729 if (atomic_read(&inode->i_count) > 2)
3730 return ERR_PTR(-ENOSPC);
3731
3732 if (xchg(&root->fs_info->enospc_unlink, 1))
3733 return ERR_PTR(-ENOSPC);
3734
3735 path = btrfs_alloc_path();
3736 if (!path) {
3737 root->fs_info->enospc_unlink = 0;
3738 return ERR_PTR(-ENOMEM);
3739 }
3740 3732
3741 /* 1 for the orphan item */ 3733 trans = btrfs_start_transaction(root, 0);
3742 trans = btrfs_start_transaction(root, 1); 3734 if (IS_ERR(trans))
3743 if (IS_ERR(trans)) { 3735 return trans;
3744 btrfs_free_path(path); 3736 ret = btrfs_cond_migrate_bytes(root->fs_info,
3745 root->fs_info->enospc_unlink = 0; 3737 &root->fs_info->trans_block_rsv,
3746 return trans; 3738 num_bytes, 5);
3747 } 3739 if (ret) {
3748 3740 btrfs_end_transaction(trans, root);
3749 path->skip_locking = 1; 3741 return ERR_PTR(ret);
3750 path->search_commit_root = 1;
3751
3752 ret = btrfs_lookup_inode(trans, root, path,
3753 &BTRFS_I(dir)->location, 0);
3754 if (ret < 0) {
3755 err = ret;
3756 goto out;
3757 }
3758 if (ret == 0) {
3759 if (check_path_shared(root, path))
3760 goto out;
3761 } else {
3762 check_link = 0;
3763 }
3764 btrfs_release_path(path);
3765
3766 ret = btrfs_lookup_inode(trans, root, path,
3767 &BTRFS_I(inode)->location, 0);
3768 if (ret < 0) {
3769 err = ret;
3770 goto out;
3771 }
3772 if (ret == 0) {
3773 if (check_path_shared(root, path))
3774 goto out;
3775 } else {
3776 check_link = 0;
3777 }
3778 btrfs_release_path(path);
3779
3780 if (ret == 0 && S_ISREG(inode->i_mode)) {
3781 ret = btrfs_lookup_file_extent(trans, root, path,
3782 ino, (u64)-1, 0);
3783 if (ret < 0) {
3784 err = ret;
3785 goto out;
3786 } 3742 }
3787 BUG_ON(ret == 0); /* Corruption */
3788 if (check_path_shared(root, path))
3789 goto out;
3790 btrfs_release_path(path);
3791 }
3792
3793 if (!check_link) {
3794 err = 0;
3795 goto out;
3796 }
3797
3798 di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
3799 dentry->d_name.name, dentry->d_name.len, 0);
3800 if (IS_ERR(di)) {
3801 err = PTR_ERR(di);
3802 goto out;
3803 }
3804 if (di) {
3805 if (check_path_shared(root, path))
3806 goto out;
3807 } else {
3808 err = 0;
3809 goto out;
3810 }
3811 btrfs_release_path(path);
3812
3813 ret = btrfs_get_inode_ref_index(trans, root, path, dentry->d_name.name,
3814 dentry->d_name.len, ino, dir_ino, 0,
3815 &index);
3816 if (ret) {
3817 err = ret;
3818 goto out;
3819 }
3820
3821 if (check_path_shared(root, path))
3822 goto out;
3823
3824 btrfs_release_path(path);
3825
3826 /*
3827 * This is a commit root search, if we can lookup inode item and other
3828 * relative items in the commit root, it means the transaction of
3829 * dir/file creation has been committed, and the dir index item that we
3830 * delay to insert has also been inserted into the commit root. So
3831 * we needn't worry about the delayed insertion of the dir index item
3832 * here.
3833 */
3834 di = btrfs_lookup_dir_index_item(trans, root, path, dir_ino, index,
3835 dentry->d_name.name, dentry->d_name.len, 0);
3836 if (IS_ERR(di)) {
3837 err = PTR_ERR(di);
3838 goto out;
3839 }
3840 BUG_ON(ret == -ENOENT);
3841 if (check_path_shared(root, path))
3842 goto out;
3843
3844 err = 0;
3845out:
3846 btrfs_free_path(path);
3847 /* Migrate the orphan reservation over */
3848 if (!err)
3849 err = btrfs_block_rsv_migrate(trans->block_rsv,
3850 &root->fs_info->global_block_rsv,
3851 trans->bytes_reserved);
3852
3853 if (err) {
3854 btrfs_end_transaction(trans, root);
3855 root->fs_info->enospc_unlink = 0;
3856 return ERR_PTR(err);
3857 }
3858
3859 trans->block_rsv = &root->fs_info->global_block_rsv;
3860 return trans;
3861}
3862
3863static void __unlink_end_trans(struct btrfs_trans_handle *trans,
3864 struct btrfs_root *root)
3865{
3866 if (trans->block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL) {
3867 btrfs_block_rsv_release(root, trans->block_rsv,
3868 trans->bytes_reserved);
3869 trans->block_rsv = &root->fs_info->trans_block_rsv; 3743 trans->block_rsv = &root->fs_info->trans_block_rsv;
3870 BUG_ON(!root->fs_info->enospc_unlink); 3744 trans->bytes_reserved = num_bytes;
3871 root->fs_info->enospc_unlink = 0;
3872 } 3745 }
3873 btrfs_end_transaction(trans, root); 3746 return trans;
3874} 3747}
3875 3748
3876static int btrfs_unlink(struct inode *dir, struct dentry *dentry) 3749static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
@@ -3880,7 +3753,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
3880 struct inode *inode = dentry->d_inode; 3753 struct inode *inode = dentry->d_inode;
3881 int ret; 3754 int ret;
3882 3755
3883 trans = __unlink_start_trans(dir, dentry); 3756 trans = __unlink_start_trans(dir);
3884 if (IS_ERR(trans)) 3757 if (IS_ERR(trans))
3885 return PTR_ERR(trans); 3758 return PTR_ERR(trans);
3886 3759
@@ -3898,7 +3771,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
3898 } 3771 }
3899 3772
3900out: 3773out:
3901 __unlink_end_trans(trans, root); 3774 btrfs_end_transaction(trans, root);
3902 btrfs_btree_balance_dirty(root); 3775 btrfs_btree_balance_dirty(root);
3903 return ret; 3776 return ret;
3904} 3777}
@@ -3995,7 +3868,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
3995 if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) 3868 if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
3996 return -EPERM; 3869 return -EPERM;
3997 3870
3998 trans = __unlink_start_trans(dir, dentry); 3871 trans = __unlink_start_trans(dir);
3999 if (IS_ERR(trans)) 3872 if (IS_ERR(trans))
4000 return PTR_ERR(trans); 3873 return PTR_ERR(trans);
4001 3874
@@ -4017,7 +3890,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
4017 if (!err) 3890 if (!err)
4018 btrfs_i_size_write(inode, 0); 3891 btrfs_i_size_write(inode, 0);
4019out: 3892out:
4020 __unlink_end_trans(trans, root); 3893 btrfs_end_transaction(trans, root);
4021 btrfs_btree_balance_dirty(root); 3894 btrfs_btree_balance_dirty(root);
4022 3895
4023 return err; 3896 return err;
@@ -4395,6 +4268,15 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
4395 u64 hole_size; 4268 u64 hole_size;
4396 int err = 0; 4269 int err = 0;
4397 4270
4271 /*
4272 * If our size started in the middle of a page we need to zero out the
4273 * rest of the page before we expand the i_size, otherwise we could
4274 * expose stale data.
4275 */
4276 err = btrfs_truncate_page(inode, oldsize, 0, 0);
4277 if (err)
4278 return err;
4279
4398 if (size <= hole_start) 4280 if (size <= hole_start)
4399 return 0; 4281 return 0;
4400 4282
@@ -4822,11 +4704,6 @@ static int fixup_tree_root_location(struct btrfs_root *root,
4822 goto out; 4704 goto out;
4823 } 4705 }
4824 4706
4825 if (btrfs_root_refs(&new_root->root_item) == 0) {
4826 err = -ENOENT;
4827 goto out;
4828 }
4829
4830 *sub_root = new_root; 4707 *sub_root = new_root;
4831 location->objectid = btrfs_root_dirid(&new_root->root_item); 4708 location->objectid = btrfs_root_dirid(&new_root->root_item);
4832 location->type = BTRFS_INODE_ITEM_KEY; 4709 location->type = BTRFS_INODE_ITEM_KEY;
@@ -5092,8 +4969,10 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
5092 if (!(inode->i_sb->s_flags & MS_RDONLY)) 4969 if (!(inode->i_sb->s_flags & MS_RDONLY))
5093 ret = btrfs_orphan_cleanup(sub_root); 4970 ret = btrfs_orphan_cleanup(sub_root);
5094 up_read(&root->fs_info->cleanup_work_sem); 4971 up_read(&root->fs_info->cleanup_work_sem);
5095 if (ret) 4972 if (ret) {
4973 iput(inode);
5096 inode = ERR_PTR(ret); 4974 inode = ERR_PTR(ret);
4975 }
5097 } 4976 }
5098 4977
5099 return inode; 4978 return inode;
@@ -6501,10 +6380,10 @@ out:
6501 * returns 1 when the nocow is safe, < 1 on error, 0 if the 6380 * returns 1 when the nocow is safe, < 1 on error, 0 if the
6502 * block must be cow'd 6381 * block must be cow'd
6503 */ 6382 */
6504static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans, 6383noinline int can_nocow_extent(struct btrfs_trans_handle *trans,
6505 struct inode *inode, u64 offset, u64 *len, 6384 struct inode *inode, u64 offset, u64 *len,
6506 u64 *orig_start, u64 *orig_block_len, 6385 u64 *orig_start, u64 *orig_block_len,
6507 u64 *ram_bytes) 6386 u64 *ram_bytes)
6508{ 6387{
6509 struct btrfs_path *path; 6388 struct btrfs_path *path;
6510 int ret; 6389 int ret;
@@ -6518,7 +6397,7 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
6518 u64 num_bytes; 6397 u64 num_bytes;
6519 int slot; 6398 int slot;
6520 int found_type; 6399 int found_type;
6521 6400 bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW);
6522 path = btrfs_alloc_path(); 6401 path = btrfs_alloc_path();
6523 if (!path) 6402 if (!path)
6524 return -ENOMEM; 6403 return -ENOMEM;
@@ -6558,18 +6437,28 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
6558 /* not a regular extent, must cow */ 6437 /* not a regular extent, must cow */
6559 goto out; 6438 goto out;
6560 } 6439 }
6440
6441 if (!nocow && found_type == BTRFS_FILE_EXTENT_REG)
6442 goto out;
6443
6561 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 6444 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
6445 if (disk_bytenr == 0)
6446 goto out;
6447
6448 if (btrfs_file_extent_compression(leaf, fi) ||
6449 btrfs_file_extent_encryption(leaf, fi) ||
6450 btrfs_file_extent_other_encoding(leaf, fi))
6451 goto out;
6452
6562 backref_offset = btrfs_file_extent_offset(leaf, fi); 6453 backref_offset = btrfs_file_extent_offset(leaf, fi);
6563 6454
6564 *orig_start = key.offset - backref_offset; 6455 if (orig_start) {
6565 *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi); 6456 *orig_start = key.offset - backref_offset;
6566 *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); 6457 *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
6458 *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
6459 }
6567 6460
6568 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); 6461 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
6569 if (extent_end < offset + *len) {
6570 /* extent doesn't include our full range, must cow */
6571 goto out;
6572 }
6573 6462
6574 if (btrfs_extent_readonly(root, disk_bytenr)) 6463 if (btrfs_extent_readonly(root, disk_bytenr))
6575 goto out; 6464 goto out;
@@ -6813,8 +6702,8 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
6813 if (IS_ERR(trans)) 6702 if (IS_ERR(trans))
6814 goto must_cow; 6703 goto must_cow;
6815 6704
6816 if (can_nocow_odirect(trans, inode, start, &len, &orig_start, 6705 if (can_nocow_extent(trans, inode, start, &len, &orig_start,
6817 &orig_block_len, &ram_bytes) == 1) { 6706 &orig_block_len, &ram_bytes) == 1) {
6818 if (type == BTRFS_ORDERED_PREALLOC) { 6707 if (type == BTRFS_ORDERED_PREALLOC) {
6819 free_extent_map(em); 6708 free_extent_map(em);
6820 em = create_pinned_em(inode, start, len, 6709 em = create_pinned_em(inode, start, len,
@@ -7243,7 +7132,6 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
7243{ 7132{
7244 struct btrfs_root *root = BTRFS_I(inode)->root; 7133 struct btrfs_root *root = BTRFS_I(inode)->root;
7245 struct btrfs_dio_private *dip; 7134 struct btrfs_dio_private *dip;
7246 struct bio_vec *bvec = dio_bio->bi_io_vec;
7247 struct bio *io_bio; 7135 struct bio *io_bio;
7248 int skip_sum; 7136 int skip_sum;
7249 int write = rw & REQ_WRITE; 7137 int write = rw & REQ_WRITE;
@@ -7265,16 +7153,9 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
7265 } 7153 }
7266 7154
7267 dip->private = dio_bio->bi_private; 7155 dip->private = dio_bio->bi_private;
7268 io_bio->bi_private = dio_bio->bi_private;
7269 dip->inode = inode; 7156 dip->inode = inode;
7270 dip->logical_offset = file_offset; 7157 dip->logical_offset = file_offset;
7271 7158 dip->bytes = dio_bio->bi_size;
7272 dip->bytes = 0;
7273 do {
7274 dip->bytes += bvec->bv_len;
7275 bvec++;
7276 } while (bvec <= (dio_bio->bi_io_vec + dio_bio->bi_vcnt - 1));
7277
7278 dip->disk_bytenr = (u64)dio_bio->bi_sector << 9; 7159 dip->disk_bytenr = (u64)dio_bio->bi_sector << 9;
7279 io_bio->bi_private = dip; 7160 io_bio->bi_private = dip;
7280 dip->errors = 0; 7161 dip->errors = 0;
@@ -7373,8 +7254,16 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
7373 atomic_inc(&inode->i_dio_count); 7254 atomic_inc(&inode->i_dio_count);
7374 smp_mb__after_atomic_inc(); 7255 smp_mb__after_atomic_inc();
7375 7256
7257 /*
7258 * The generic stuff only does filemap_write_and_wait_range, which isn't
7259 * enough if we've written compressed pages to this area, so we need to
7260 * call btrfs_wait_ordered_range to make absolutely sure that any
7261 * outstanding dirty pages are on disk.
7262 */
7263 count = iov_length(iov, nr_segs);
7264 btrfs_wait_ordered_range(inode, offset, count);
7265
7376 if (rw & WRITE) { 7266 if (rw & WRITE) {
7377 count = iov_length(iov, nr_segs);
7378 /* 7267 /*
7379 * If the write DIO is beyond the EOF, we need update 7268 * If the write DIO is beyond the EOF, we need update
7380 * the isize, but it is protected by i_mutex. So we can 7269 * the isize, but it is protected by i_mutex. So we can
@@ -7694,16 +7583,12 @@ static int btrfs_truncate(struct inode *inode)
7694{ 7583{
7695 struct btrfs_root *root = BTRFS_I(inode)->root; 7584 struct btrfs_root *root = BTRFS_I(inode)->root;
7696 struct btrfs_block_rsv *rsv; 7585 struct btrfs_block_rsv *rsv;
7697 int ret; 7586 int ret = 0;
7698 int err = 0; 7587 int err = 0;
7699 struct btrfs_trans_handle *trans; 7588 struct btrfs_trans_handle *trans;
7700 u64 mask = root->sectorsize - 1; 7589 u64 mask = root->sectorsize - 1;
7701 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); 7590 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
7702 7591
7703 ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);
7704 if (ret)
7705 return ret;
7706
7707 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); 7592 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
7708 btrfs_ordered_update_i_size(inode, inode->i_size, NULL); 7593 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
7709 7594
@@ -7961,9 +7846,9 @@ void btrfs_destroy_inode(struct inode *inode)
7961 */ 7846 */
7962 smp_mb(); 7847 smp_mb();
7963 if (!list_empty(&BTRFS_I(inode)->ordered_operations)) { 7848 if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
7964 spin_lock(&root->fs_info->ordered_extent_lock); 7849 spin_lock(&root->fs_info->ordered_root_lock);
7965 list_del_init(&BTRFS_I(inode)->ordered_operations); 7850 list_del_init(&BTRFS_I(inode)->ordered_operations);
7966 spin_unlock(&root->fs_info->ordered_extent_lock); 7851 spin_unlock(&root->fs_info->ordered_root_lock);
7967 } 7852 }
7968 7853
7969 if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 7854 if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
@@ -8333,7 +8218,7 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
8333 * some fairly slow code that needs optimization. This walks the list 8218 * some fairly slow code that needs optimization. This walks the list
8334 * of all the inodes with pending delalloc and forces them to disk. 8219 * of all the inodes with pending delalloc and forces them to disk.
8335 */ 8220 */
8336int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) 8221static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8337{ 8222{
8338 struct btrfs_inode *binode; 8223 struct btrfs_inode *binode;
8339 struct inode *inode; 8224 struct inode *inode;
@@ -8342,30 +8227,23 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8342 struct list_head splice; 8227 struct list_head splice;
8343 int ret = 0; 8228 int ret = 0;
8344 8229
8345 if (root->fs_info->sb->s_flags & MS_RDONLY)
8346 return -EROFS;
8347
8348 INIT_LIST_HEAD(&works); 8230 INIT_LIST_HEAD(&works);
8349 INIT_LIST_HEAD(&splice); 8231 INIT_LIST_HEAD(&splice);
8350 8232
8351 spin_lock(&root->fs_info->delalloc_lock); 8233 spin_lock(&root->delalloc_lock);
8352 list_splice_init(&root->fs_info->delalloc_inodes, &splice); 8234 list_splice_init(&root->delalloc_inodes, &splice);
8353 while (!list_empty(&splice)) { 8235 while (!list_empty(&splice)) {
8354 binode = list_entry(splice.next, struct btrfs_inode, 8236 binode = list_entry(splice.next, struct btrfs_inode,
8355 delalloc_inodes); 8237 delalloc_inodes);
8356 8238
8357 list_del_init(&binode->delalloc_inodes); 8239 list_move_tail(&binode->delalloc_inodes,
8358 8240 &root->delalloc_inodes);
8359 inode = igrab(&binode->vfs_inode); 8241 inode = igrab(&binode->vfs_inode);
8360 if (!inode) { 8242 if (!inode) {
8361 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, 8243 cond_resched_lock(&root->delalloc_lock);
8362 &binode->runtime_flags);
8363 continue; 8244 continue;
8364 } 8245 }
8365 8246 spin_unlock(&root->delalloc_lock);
8366 list_add_tail(&binode->delalloc_inodes,
8367 &root->fs_info->delalloc_inodes);
8368 spin_unlock(&root->fs_info->delalloc_lock);
8369 8247
8370 work = btrfs_alloc_delalloc_work(inode, 0, delay_iput); 8248 work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
8371 if (unlikely(!work)) { 8249 if (unlikely(!work)) {
@@ -8377,16 +8255,39 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8377 &work->work); 8255 &work->work);
8378 8256
8379 cond_resched(); 8257 cond_resched();
8380 spin_lock(&root->fs_info->delalloc_lock); 8258 spin_lock(&root->delalloc_lock);
8381 } 8259 }
8382 spin_unlock(&root->fs_info->delalloc_lock); 8260 spin_unlock(&root->delalloc_lock);
8383 8261
8384 list_for_each_entry_safe(work, next, &works, list) { 8262 list_for_each_entry_safe(work, next, &works, list) {
8385 list_del_init(&work->list); 8263 list_del_init(&work->list);
8386 btrfs_wait_and_free_delalloc_work(work); 8264 btrfs_wait_and_free_delalloc_work(work);
8387 } 8265 }
8266 return 0;
8267out:
8268 list_for_each_entry_safe(work, next, &works, list) {
8269 list_del_init(&work->list);
8270 btrfs_wait_and_free_delalloc_work(work);
8271 }
8272
8273 if (!list_empty_careful(&splice)) {
8274 spin_lock(&root->delalloc_lock);
8275 list_splice_tail(&splice, &root->delalloc_inodes);
8276 spin_unlock(&root->delalloc_lock);
8277 }
8278 return ret;
8279}
8280
8281int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8282{
8283 int ret;
8388 8284
8389 /* the filemap_flush will queue IO into the worker threads, but 8285 if (root->fs_info->sb->s_flags & MS_RDONLY)
8286 return -EROFS;
8287
8288 ret = __start_delalloc_inodes(root, delay_iput);
8289 /*
8290 * the filemap_flush will queue IO into the worker threads, but
8390 * we have to make sure the IO is actually started and that 8291 * we have to make sure the IO is actually started and that
8391 * ordered extents get created before we return 8292 * ordered extents get created before we return
8392 */ 8293 */
@@ -8398,17 +8299,55 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8398 atomic_read(&root->fs_info->async_delalloc_pages) == 0)); 8299 atomic_read(&root->fs_info->async_delalloc_pages) == 0));
8399 } 8300 }
8400 atomic_dec(&root->fs_info->async_submit_draining); 8301 atomic_dec(&root->fs_info->async_submit_draining);
8401 return 0; 8302 return ret;
8402out: 8303}
8403 list_for_each_entry_safe(work, next, &works, list) { 8304
8404 list_del_init(&work->list); 8305int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info,
8405 btrfs_wait_and_free_delalloc_work(work); 8306 int delay_iput)
8307{
8308 struct btrfs_root *root;
8309 struct list_head splice;
8310 int ret;
8311
8312 if (fs_info->sb->s_flags & MS_RDONLY)
8313 return -EROFS;
8314
8315 INIT_LIST_HEAD(&splice);
8316
8317 spin_lock(&fs_info->delalloc_root_lock);
8318 list_splice_init(&fs_info->delalloc_roots, &splice);
8319 while (!list_empty(&splice)) {
8320 root = list_first_entry(&splice, struct btrfs_root,
8321 delalloc_root);
8322 root = btrfs_grab_fs_root(root);
8323 BUG_ON(!root);
8324 list_move_tail(&root->delalloc_root,
8325 &fs_info->delalloc_roots);
8326 spin_unlock(&fs_info->delalloc_root_lock);
8327
8328 ret = __start_delalloc_inodes(root, delay_iput);
8329 btrfs_put_fs_root(root);
8330 if (ret)
8331 goto out;
8332
8333 spin_lock(&fs_info->delalloc_root_lock);
8406 } 8334 }
8335 spin_unlock(&fs_info->delalloc_root_lock);
8407 8336
8337 atomic_inc(&fs_info->async_submit_draining);
8338 while (atomic_read(&fs_info->nr_async_submits) ||
8339 atomic_read(&fs_info->async_delalloc_pages)) {
8340 wait_event(fs_info->async_submit_wait,
8341 (atomic_read(&fs_info->nr_async_submits) == 0 &&
8342 atomic_read(&fs_info->async_delalloc_pages) == 0));
8343 }
8344 atomic_dec(&fs_info->async_submit_draining);
8345 return 0;
8346out:
8408 if (!list_empty_careful(&splice)) { 8347 if (!list_empty_careful(&splice)) {
8409 spin_lock(&root->fs_info->delalloc_lock); 8348 spin_lock(&fs_info->delalloc_root_lock);
8410 list_splice_tail(&splice, &root->fs_info->delalloc_inodes); 8349 list_splice_tail(&splice, &fs_info->delalloc_roots);
8411 spin_unlock(&root->fs_info->delalloc_lock); 8350 spin_unlock(&fs_info->delalloc_root_lock);
8412 } 8351 }
8413 return ret; 8352 return ret;
8414} 8353}
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index cd7e96c73cb7..238a05545ee2 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -555,6 +555,12 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
555 if (!root->ref_cows) 555 if (!root->ref_cows)
556 return -EINVAL; 556 return -EINVAL;
557 557
558 ret = btrfs_start_delalloc_inodes(root, 0);
559 if (ret)
560 return ret;
561
562 btrfs_wait_ordered_extents(root, 0);
563
558 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); 564 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
559 if (!pending_snapshot) 565 if (!pending_snapshot)
560 return -ENOMEM; 566 return -ENOMEM;
@@ -2354,14 +2360,6 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
2354 if (ret) 2360 if (ret)
2355 return ret; 2361 return ret;
2356 2362
2357 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
2358 1)) {
2359 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
2360 mnt_drop_write_file(file);
2361 return -EINVAL;
2362 }
2363
2364 mutex_lock(&root->fs_info->volume_mutex);
2365 vol_args = memdup_user(arg, sizeof(*vol_args)); 2363 vol_args = memdup_user(arg, sizeof(*vol_args));
2366 if (IS_ERR(vol_args)) { 2364 if (IS_ERR(vol_args)) {
2367 ret = PTR_ERR(vol_args); 2365 ret = PTR_ERR(vol_args);
@@ -2369,12 +2367,20 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
2369 } 2367 }
2370 2368
2371 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 2369 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
2372 ret = btrfs_rm_device(root, vol_args->name);
2373 2370
2374 kfree(vol_args); 2371 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
2375out: 2372 1)) {
2373 ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
2374 goto out;
2375 }
2376
2377 mutex_lock(&root->fs_info->volume_mutex);
2378 ret = btrfs_rm_device(root, vol_args->name);
2376 mutex_unlock(&root->fs_info->volume_mutex); 2379 mutex_unlock(&root->fs_info->volume_mutex);
2377 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); 2380 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
2381
2382out:
2383 kfree(vol_args);
2378 mnt_drop_write_file(file); 2384 mnt_drop_write_file(file);
2379 return ret; 2385 return ret;
2380} 2386}
@@ -2480,6 +2486,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2480 int ret; 2486 int ret;
2481 u64 len = olen; 2487 u64 len = olen;
2482 u64 bs = root->fs_info->sb->s_blocksize; 2488 u64 bs = root->fs_info->sb->s_blocksize;
2489 int same_inode = 0;
2483 2490
2484 /* 2491 /*
2485 * TODO: 2492 * TODO:
@@ -2516,7 +2523,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2516 2523
2517 ret = -EINVAL; 2524 ret = -EINVAL;
2518 if (src == inode) 2525 if (src == inode)
2519 goto out_fput; 2526 same_inode = 1;
2520 2527
2521 /* the src must be open for reading */ 2528 /* the src must be open for reading */
2522 if (!(src_file.file->f_mode & FMODE_READ)) 2529 if (!(src_file.file->f_mode & FMODE_READ))
@@ -2547,12 +2554,16 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2547 } 2554 }
2548 path->reada = 2; 2555 path->reada = 2;
2549 2556
2550 if (inode < src) { 2557 if (!same_inode) {
2551 mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); 2558 if (inode < src) {
2552 mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD); 2559 mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
2560 mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD);
2561 } else {
2562 mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT);
2563 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
2564 }
2553 } else { 2565 } else {
2554 mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT); 2566 mutex_lock(&src->i_mutex);
2555 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
2556 } 2567 }
2557 2568
2558 /* determine range to clone */ 2569 /* determine range to clone */
@@ -2570,6 +2581,12 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2570 !IS_ALIGNED(destoff, bs)) 2581 !IS_ALIGNED(destoff, bs))
2571 goto out_unlock; 2582 goto out_unlock;
2572 2583
2584 /* verify if ranges are overlapped within the same file */
2585 if (same_inode) {
2586 if (destoff + len > off && destoff < off + len)
2587 goto out_unlock;
2588 }
2589
2573 if (destoff > inode->i_size) { 2590 if (destoff > inode->i_size) {
2574 ret = btrfs_cont_expand(inode, inode->i_size, destoff); 2591 ret = btrfs_cont_expand(inode, inode->i_size, destoff);
2575 if (ret) 2592 if (ret)
@@ -2846,7 +2863,8 @@ out:
2846 unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1); 2863 unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
2847out_unlock: 2864out_unlock:
2848 mutex_unlock(&src->i_mutex); 2865 mutex_unlock(&src->i_mutex);
2849 mutex_unlock(&inode->i_mutex); 2866 if (!same_inode)
2867 mutex_unlock(&inode->i_mutex);
2850 vfree(buf); 2868 vfree(buf);
2851 btrfs_free_path(path); 2869 btrfs_free_path(path);
2852out_fput: 2870out_fput:
@@ -2951,11 +2969,6 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2951 goto out; 2969 goto out;
2952 } 2970 }
2953 2971
2954 if (btrfs_root_refs(&new_root->root_item) == 0) {
2955 ret = -ENOENT;
2956 goto out;
2957 }
2958
2959 path = btrfs_alloc_path(); 2972 path = btrfs_alloc_path();
2960 if (!path) { 2973 if (!path) {
2961 ret = -ENOMEM; 2974 ret = -ENOMEM;
@@ -3719,9 +3732,6 @@ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
3719 break; 3732 break;
3720 } 3733 }
3721 3734
3722 if (copy_to_user(arg, sa, sizeof(*sa)))
3723 ret = -EFAULT;
3724
3725 err = btrfs_commit_transaction(trans, root->fs_info->tree_root); 3735 err = btrfs_commit_transaction(trans, root->fs_info->tree_root);
3726 if (err && !ret) 3736 if (err && !ret)
3727 ret = err; 3737 ret = err;
@@ -3937,6 +3947,16 @@ static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg)
3937 return ret; 3947 return ret;
3938} 3948}
3939 3949
3950static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg)
3951{
3952 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3953
3954 if (!capable(CAP_SYS_ADMIN))
3955 return -EPERM;
3956
3957 return btrfs_qgroup_wait_for_completion(root->fs_info);
3958}
3959
3940static long btrfs_ioctl_set_received_subvol(struct file *file, 3960static long btrfs_ioctl_set_received_subvol(struct file *file,
3941 void __user *arg) 3961 void __user *arg)
3942{ 3962{
@@ -4179,6 +4199,8 @@ long btrfs_ioctl(struct file *file, unsigned int
4179 return btrfs_ioctl_quota_rescan(file, argp); 4199 return btrfs_ioctl_quota_rescan(file, argp);
4180 case BTRFS_IOC_QUOTA_RESCAN_STATUS: 4200 case BTRFS_IOC_QUOTA_RESCAN_STATUS:
4181 return btrfs_ioctl_quota_rescan_status(file, argp); 4201 return btrfs_ioctl_quota_rescan_status(file, argp);
4202 case BTRFS_IOC_QUOTA_RESCAN_WAIT:
4203 return btrfs_ioctl_quota_rescan_wait(file, argp);
4182 case BTRFS_IOC_DEV_REPLACE: 4204 case BTRFS_IOC_DEV_REPLACE:
4183 return btrfs_ioctl_dev_replace(root, argp); 4205 return btrfs_ioctl_dev_replace(root, argp);
4184 case BTRFS_IOC_GET_FSLABEL: 4206 case BTRFS_IOC_GET_FSLABEL:
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 743b86fa4fcb..f93151a98886 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -31,8 +31,8 @@
31 31
32struct workspace { 32struct workspace {
33 void *mem; 33 void *mem;
34 void *buf; /* where compressed data goes */ 34 void *buf; /* where decompressed data goes */
35 void *cbuf; /* where decompressed data goes */ 35 void *cbuf; /* where compressed data goes */
36 struct list_head list; 36 struct list_head list;
37}; 37};
38 38
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 1ddd728541ee..81369827e514 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -24,6 +24,7 @@
24#include "transaction.h" 24#include "transaction.h"
25#include "btrfs_inode.h" 25#include "btrfs_inode.h"
26#include "extent_io.h" 26#include "extent_io.h"
27#include "disk-io.h"
27 28
28static struct kmem_cache *btrfs_ordered_extent_cache; 29static struct kmem_cache *btrfs_ordered_extent_cache;
29 30
@@ -184,6 +185,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
184 u64 start, u64 len, u64 disk_len, 185 u64 start, u64 len, u64 disk_len,
185 int type, int dio, int compress_type) 186 int type, int dio, int compress_type)
186{ 187{
188 struct btrfs_root *root = BTRFS_I(inode)->root;
187 struct btrfs_ordered_inode_tree *tree; 189 struct btrfs_ordered_inode_tree *tree;
188 struct rb_node *node; 190 struct rb_node *node;
189 struct btrfs_ordered_extent *entry; 191 struct btrfs_ordered_extent *entry;
@@ -227,10 +229,18 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
227 ordered_data_tree_panic(inode, -EEXIST, file_offset); 229 ordered_data_tree_panic(inode, -EEXIST, file_offset);
228 spin_unlock_irq(&tree->lock); 230 spin_unlock_irq(&tree->lock);
229 231
230 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 232 spin_lock(&root->ordered_extent_lock);
231 list_add_tail(&entry->root_extent_list, 233 list_add_tail(&entry->root_extent_list,
232 &BTRFS_I(inode)->root->fs_info->ordered_extents); 234 &root->ordered_extents);
233 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 235 root->nr_ordered_extents++;
236 if (root->nr_ordered_extents == 1) {
237 spin_lock(&root->fs_info->ordered_root_lock);
238 BUG_ON(!list_empty(&root->ordered_root));
239 list_add_tail(&root->ordered_root,
240 &root->fs_info->ordered_roots);
241 spin_unlock(&root->fs_info->ordered_root_lock);
242 }
243 spin_unlock(&root->ordered_extent_lock);
234 244
235 return 0; 245 return 0;
236} 246}
@@ -516,8 +526,9 @@ void btrfs_remove_ordered_extent(struct inode *inode,
516 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); 526 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
517 spin_unlock_irq(&tree->lock); 527 spin_unlock_irq(&tree->lock);
518 528
519 spin_lock(&root->fs_info->ordered_extent_lock); 529 spin_lock(&root->ordered_extent_lock);
520 list_del_init(&entry->root_extent_list); 530 list_del_init(&entry->root_extent_list);
531 root->nr_ordered_extents--;
521 532
522 trace_btrfs_ordered_extent_remove(inode, entry); 533 trace_btrfs_ordered_extent_remove(inode, entry);
523 534
@@ -530,7 +541,14 @@ void btrfs_remove_ordered_extent(struct inode *inode,
530 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) { 541 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
531 list_del_init(&BTRFS_I(inode)->ordered_operations); 542 list_del_init(&BTRFS_I(inode)->ordered_operations);
532 } 543 }
533 spin_unlock(&root->fs_info->ordered_extent_lock); 544
545 if (!root->nr_ordered_extents) {
546 spin_lock(&root->fs_info->ordered_root_lock);
547 BUG_ON(list_empty(&root->ordered_root));
548 list_del_init(&root->ordered_root);
549 spin_unlock(&root->fs_info->ordered_root_lock);
550 }
551 spin_unlock(&root->ordered_extent_lock);
534 wake_up(&entry->wait); 552 wake_up(&entry->wait);
535} 553}
536 554
@@ -550,7 +568,6 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
550void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) 568void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
551{ 569{
552 struct list_head splice, works; 570 struct list_head splice, works;
553 struct list_head *cur;
554 struct btrfs_ordered_extent *ordered, *next; 571 struct btrfs_ordered_extent *ordered, *next;
555 struct inode *inode; 572 struct inode *inode;
556 573
@@ -558,35 +575,34 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
558 INIT_LIST_HEAD(&works); 575 INIT_LIST_HEAD(&works);
559 576
560 mutex_lock(&root->fs_info->ordered_operations_mutex); 577 mutex_lock(&root->fs_info->ordered_operations_mutex);
561 spin_lock(&root->fs_info->ordered_extent_lock); 578 spin_lock(&root->ordered_extent_lock);
562 list_splice_init(&root->fs_info->ordered_extents, &splice); 579 list_splice_init(&root->ordered_extents, &splice);
563 while (!list_empty(&splice)) { 580 while (!list_empty(&splice)) {
564 cur = splice.next; 581 ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
565 ordered = list_entry(cur, struct btrfs_ordered_extent, 582 root_extent_list);
566 root_extent_list); 583 list_move_tail(&ordered->root_extent_list,
567 list_del_init(&ordered->root_extent_list); 584 &root->ordered_extents);
568 atomic_inc(&ordered->refs);
569
570 /* 585 /*
571 * the inode may be getting freed (in sys_unlink path). 586 * the inode may be getting freed (in sys_unlink path).
572 */ 587 */
573 inode = igrab(ordered->inode); 588 inode = igrab(ordered->inode);
589 if (!inode) {
590 cond_resched_lock(&root->ordered_extent_lock);
591 continue;
592 }
574 593
575 spin_unlock(&root->fs_info->ordered_extent_lock); 594 atomic_inc(&ordered->refs);
595 spin_unlock(&root->ordered_extent_lock);
576 596
577 if (inode) { 597 ordered->flush_work.func = btrfs_run_ordered_extent_work;
578 ordered->flush_work.func = btrfs_run_ordered_extent_work; 598 list_add_tail(&ordered->work_list, &works);
579 list_add_tail(&ordered->work_list, &works); 599 btrfs_queue_worker(&root->fs_info->flush_workers,
580 btrfs_queue_worker(&root->fs_info->flush_workers, 600 &ordered->flush_work);
581 &ordered->flush_work);
582 } else {
583 btrfs_put_ordered_extent(ordered);
584 }
585 601
586 cond_resched(); 602 cond_resched();
587 spin_lock(&root->fs_info->ordered_extent_lock); 603 spin_lock(&root->ordered_extent_lock);
588 } 604 }
589 spin_unlock(&root->fs_info->ordered_extent_lock); 605 spin_unlock(&root->ordered_extent_lock);
590 606
591 list_for_each_entry_safe(ordered, next, &works, work_list) { 607 list_for_each_entry_safe(ordered, next, &works, work_list) {
592 list_del_init(&ordered->work_list); 608 list_del_init(&ordered->work_list);
@@ -604,6 +620,33 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
604 mutex_unlock(&root->fs_info->ordered_operations_mutex); 620 mutex_unlock(&root->fs_info->ordered_operations_mutex);
605} 621}
606 622
623void btrfs_wait_all_ordered_extents(struct btrfs_fs_info *fs_info,
624 int delay_iput)
625{
626 struct btrfs_root *root;
627 struct list_head splice;
628
629 INIT_LIST_HEAD(&splice);
630
631 spin_lock(&fs_info->ordered_root_lock);
632 list_splice_init(&fs_info->ordered_roots, &splice);
633 while (!list_empty(&splice)) {
634 root = list_first_entry(&splice, struct btrfs_root,
635 ordered_root);
636 root = btrfs_grab_fs_root(root);
637 BUG_ON(!root);
638 list_move_tail(&root->ordered_root,
639 &fs_info->ordered_roots);
640 spin_unlock(&fs_info->ordered_root_lock);
641
642 btrfs_wait_ordered_extents(root, delay_iput);
643 btrfs_put_fs_root(root);
644
645 spin_lock(&fs_info->ordered_root_lock);
646 }
647 spin_unlock(&fs_info->ordered_root_lock);
648}
649
607/* 650/*
608 * this is used during transaction commit to write all the inodes 651 * this is used during transaction commit to write all the inodes
609 * added to the ordered operation list. These files must be fully on 652 * added to the ordered operation list. These files must be fully on
@@ -629,7 +672,7 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
629 INIT_LIST_HEAD(&works); 672 INIT_LIST_HEAD(&works);
630 673
631 mutex_lock(&root->fs_info->ordered_operations_mutex); 674 mutex_lock(&root->fs_info->ordered_operations_mutex);
632 spin_lock(&root->fs_info->ordered_extent_lock); 675 spin_lock(&root->fs_info->ordered_root_lock);
633 list_splice_init(&cur_trans->ordered_operations, &splice); 676 list_splice_init(&cur_trans->ordered_operations, &splice);
634 while (!list_empty(&splice)) { 677 while (!list_empty(&splice)) {
635 btrfs_inode = list_entry(splice.next, struct btrfs_inode, 678 btrfs_inode = list_entry(splice.next, struct btrfs_inode,
@@ -648,17 +691,17 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
648 if (!wait) 691 if (!wait)
649 list_add_tail(&BTRFS_I(inode)->ordered_operations, 692 list_add_tail(&BTRFS_I(inode)->ordered_operations,
650 &cur_trans->ordered_operations); 693 &cur_trans->ordered_operations);
651 spin_unlock(&root->fs_info->ordered_extent_lock); 694 spin_unlock(&root->fs_info->ordered_root_lock);
652 695
653 work = btrfs_alloc_delalloc_work(inode, wait, 1); 696 work = btrfs_alloc_delalloc_work(inode, wait, 1);
654 if (!work) { 697 if (!work) {
655 spin_lock(&root->fs_info->ordered_extent_lock); 698 spin_lock(&root->fs_info->ordered_root_lock);
656 if (list_empty(&BTRFS_I(inode)->ordered_operations)) 699 if (list_empty(&BTRFS_I(inode)->ordered_operations))
657 list_add_tail(&btrfs_inode->ordered_operations, 700 list_add_tail(&btrfs_inode->ordered_operations,
658 &splice); 701 &splice);
659 list_splice_tail(&splice, 702 list_splice_tail(&splice,
660 &cur_trans->ordered_operations); 703 &cur_trans->ordered_operations);
661 spin_unlock(&root->fs_info->ordered_extent_lock); 704 spin_unlock(&root->fs_info->ordered_root_lock);
662 ret = -ENOMEM; 705 ret = -ENOMEM;
663 goto out; 706 goto out;
664 } 707 }
@@ -667,9 +710,9 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
667 &work->work); 710 &work->work);
668 711
669 cond_resched(); 712 cond_resched();
670 spin_lock(&root->fs_info->ordered_extent_lock); 713 spin_lock(&root->fs_info->ordered_root_lock);
671 } 714 }
672 spin_unlock(&root->fs_info->ordered_extent_lock); 715 spin_unlock(&root->fs_info->ordered_root_lock);
673out: 716out:
674 list_for_each_entry_safe(work, next, &works, list) { 717 list_for_each_entry_safe(work, next, &works, list) {
675 list_del_init(&work->list); 718 list_del_init(&work->list);
@@ -989,7 +1032,6 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
989 u32 *sum, int len) 1032 u32 *sum, int len)
990{ 1033{
991 struct btrfs_ordered_sum *ordered_sum; 1034 struct btrfs_ordered_sum *ordered_sum;
992 struct btrfs_sector_sum *sector_sums;
993 struct btrfs_ordered_extent *ordered; 1035 struct btrfs_ordered_extent *ordered;
994 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; 1036 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
995 unsigned long num_sectors; 1037 unsigned long num_sectors;
@@ -1007,18 +1049,16 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
1007 disk_bytenr < ordered_sum->bytenr + ordered_sum->len) { 1049 disk_bytenr < ordered_sum->bytenr + ordered_sum->len) {
1008 i = (disk_bytenr - ordered_sum->bytenr) >> 1050 i = (disk_bytenr - ordered_sum->bytenr) >>
1009 inode->i_sb->s_blocksize_bits; 1051 inode->i_sb->s_blocksize_bits;
1010 sector_sums = ordered_sum->sums + i;
1011 num_sectors = ordered_sum->len >> 1052 num_sectors = ordered_sum->len >>
1012 inode->i_sb->s_blocksize_bits; 1053 inode->i_sb->s_blocksize_bits;
1013 for (; i < num_sectors; i++) { 1054 num_sectors = min_t(int, len - index, num_sectors - i);
1014 if (sector_sums[i].bytenr == disk_bytenr) { 1055 memcpy(sum + index, ordered_sum->sums + i,
1015 sum[index] = sector_sums[i].sum; 1056 num_sectors);
1016 index++; 1057
1017 if (index == len) 1058 index += (int)num_sectors;
1018 goto out; 1059 if (index == len)
1019 disk_bytenr += sectorsize; 1060 goto out;
1020 } 1061 disk_bytenr += num_sectors * sectorsize;
1021 }
1022 } 1062 }
1023 } 1063 }
1024out: 1064out:
@@ -1055,12 +1095,12 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
1055 if (last_mod < root->fs_info->last_trans_committed) 1095 if (last_mod < root->fs_info->last_trans_committed)
1056 return; 1096 return;
1057 1097
1058 spin_lock(&root->fs_info->ordered_extent_lock); 1098 spin_lock(&root->fs_info->ordered_root_lock);
1059 if (list_empty(&BTRFS_I(inode)->ordered_operations)) { 1099 if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
1060 list_add_tail(&BTRFS_I(inode)->ordered_operations, 1100 list_add_tail(&BTRFS_I(inode)->ordered_operations,
1061 &cur_trans->ordered_operations); 1101 &cur_trans->ordered_operations);
1062 } 1102 }
1063 spin_unlock(&root->fs_info->ordered_extent_lock); 1103 spin_unlock(&root->fs_info->ordered_root_lock);
1064} 1104}
1065 1105
1066int __init ordered_data_init(void) 1106int __init ordered_data_init(void)
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 58b0e3b0ebad..68844d59ee6f 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -26,18 +26,6 @@ struct btrfs_ordered_inode_tree {
26 struct rb_node *last; 26 struct rb_node *last;
27}; 27};
28 28
29/*
30 * these are used to collect checksums done just before bios submission.
31 * They are attached via a list into the ordered extent, and
32 * checksum items are inserted into the tree after all the blocks in
33 * the ordered extent are on disk
34 */
35struct btrfs_sector_sum {
36 /* bytenr on disk */
37 u64 bytenr;
38 u32 sum;
39};
40
41struct btrfs_ordered_sum { 29struct btrfs_ordered_sum {
42 /* bytenr is the start of this extent on disk */ 30 /* bytenr is the start of this extent on disk */
43 u64 bytenr; 31 u64 bytenr;
@@ -45,10 +33,10 @@ struct btrfs_ordered_sum {
45 /* 33 /*
46 * this is the length in bytes covered by the sums array below. 34 * this is the length in bytes covered by the sums array below.
47 */ 35 */
48 unsigned long len; 36 int len;
49 struct list_head list; 37 struct list_head list;
50 /* last field is a variable length array of btrfs_sector_sums */ 38 /* last field is a variable length array of csums */
51 struct btrfs_sector_sum sums[]; 39 u32 sums[];
52}; 40};
53 41
54/* 42/*
@@ -149,11 +137,8 @@ struct btrfs_ordered_extent {
149static inline int btrfs_ordered_sum_size(struct btrfs_root *root, 137static inline int btrfs_ordered_sum_size(struct btrfs_root *root,
150 unsigned long bytes) 138 unsigned long bytes)
151{ 139{
152 unsigned long num_sectors = (bytes + root->sectorsize - 1) / 140 int num_sectors = (int)DIV_ROUND_UP(bytes, root->sectorsize);
153 root->sectorsize; 141 return sizeof(struct btrfs_ordered_sum) + num_sectors * sizeof(u32);
154 num_sectors++;
155 return sizeof(struct btrfs_ordered_sum) +
156 num_sectors * sizeof(struct btrfs_sector_sum);
157} 142}
158 143
159static inline void 144static inline void
@@ -204,6 +189,8 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
204 struct btrfs_root *root, 189 struct btrfs_root *root,
205 struct inode *inode); 190 struct inode *inode);
206void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput); 191void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput);
192void btrfs_wait_all_ordered_extents(struct btrfs_fs_info *fs_info,
193 int delay_iput);
207void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode); 194void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode);
208void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid); 195void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid);
209void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid); 196void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 9d49c586995a..1280eff8af56 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -98,13 +98,10 @@ struct btrfs_qgroup_list {
98 struct btrfs_qgroup *member; 98 struct btrfs_qgroup *member;
99}; 99};
100 100
101struct qgroup_rescan { 101static int
102 struct btrfs_work work; 102qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
103 struct btrfs_fs_info *fs_info; 103 int init_flags);
104}; 104static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info);
105
106static void qgroup_rescan_start(struct btrfs_fs_info *fs_info,
107 struct qgroup_rescan *qscan);
108 105
109/* must be called with qgroup_ioctl_lock held */ 106/* must be called with qgroup_ioctl_lock held */
110static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info, 107static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
@@ -255,10 +252,17 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
255 int slot; 252 int slot;
256 int ret = 0; 253 int ret = 0;
257 u64 flags = 0; 254 u64 flags = 0;
255 u64 rescan_progress = 0;
258 256
259 if (!fs_info->quota_enabled) 257 if (!fs_info->quota_enabled)
260 return 0; 258 return 0;
261 259
260 fs_info->qgroup_ulist = ulist_alloc(GFP_NOFS);
261 if (!fs_info->qgroup_ulist) {
262 ret = -ENOMEM;
263 goto out;
264 }
265
262 path = btrfs_alloc_path(); 266 path = btrfs_alloc_path();
263 if (!path) { 267 if (!path) {
264 ret = -ENOMEM; 268 ret = -ENOMEM;
@@ -306,20 +310,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
306 } 310 }
307 fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, 311 fs_info->qgroup_flags = btrfs_qgroup_status_flags(l,
308 ptr); 312 ptr);
309 fs_info->qgroup_rescan_progress.objectid = 313 rescan_progress = btrfs_qgroup_status_rescan(l, ptr);
310 btrfs_qgroup_status_rescan(l, ptr);
311 if (fs_info->qgroup_flags &
312 BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
313 struct qgroup_rescan *qscan =
314 kmalloc(sizeof(*qscan), GFP_NOFS);
315 if (!qscan) {
316 ret = -ENOMEM;
317 goto out;
318 }
319 fs_info->qgroup_rescan_progress.type = 0;
320 fs_info->qgroup_rescan_progress.offset = 0;
321 qgroup_rescan_start(fs_info, qscan);
322 }
323 goto next1; 314 goto next1;
324 } 315 }
325 316
@@ -421,9 +412,18 @@ out:
421 if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) { 412 if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) {
422 fs_info->quota_enabled = 0; 413 fs_info->quota_enabled = 0;
423 fs_info->pending_quota_state = 0; 414 fs_info->pending_quota_state = 0;
415 } else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN &&
416 ret >= 0) {
417 ret = qgroup_rescan_init(fs_info, rescan_progress, 0);
424 } 418 }
425 btrfs_free_path(path); 419 btrfs_free_path(path);
426 420
421 if (ret < 0) {
422 ulist_free(fs_info->qgroup_ulist);
423 fs_info->qgroup_ulist = NULL;
424 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
425 }
426
427 return ret < 0 ? ret : 0; 427 return ret < 0 ? ret : 0;
428} 428}
429 429
@@ -460,6 +460,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
460 } 460 }
461 kfree(qgroup); 461 kfree(qgroup);
462 } 462 }
463 ulist_free(fs_info->qgroup_ulist);
463} 464}
464 465
465static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, 466static int add_qgroup_relation_item(struct btrfs_trans_handle *trans,
@@ -819,6 +820,12 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans,
819 goto out; 820 goto out;
820 } 821 }
821 822
823 fs_info->qgroup_ulist = ulist_alloc(GFP_NOFS);
824 if (!fs_info->qgroup_ulist) {
825 ret = -ENOMEM;
826 goto out;
827 }
828
822 /* 829 /*
823 * initially create the quota tree 830 * initially create the quota tree
824 */ 831 */
@@ -916,6 +923,10 @@ out_free_root:
916 kfree(quota_root); 923 kfree(quota_root);
917 } 924 }
918out: 925out:
926 if (ret) {
927 ulist_free(fs_info->qgroup_ulist);
928 fs_info->qgroup_ulist = NULL;
929 }
919 mutex_unlock(&fs_info->qgroup_ioctl_lock); 930 mutex_unlock(&fs_info->qgroup_ioctl_lock);
920 return ret; 931 return ret;
921} 932}
@@ -1355,7 +1366,6 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1355 u64 ref_root; 1366 u64 ref_root;
1356 struct btrfs_qgroup *qgroup; 1367 struct btrfs_qgroup *qgroup;
1357 struct ulist *roots = NULL; 1368 struct ulist *roots = NULL;
1358 struct ulist *tmp = NULL;
1359 u64 seq; 1369 u64 seq;
1360 int ret = 0; 1370 int ret = 0;
1361 int sgn; 1371 int sgn;
@@ -1428,14 +1438,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1428 if (ret < 0) 1438 if (ret < 0)
1429 return ret; 1439 return ret;
1430 1440
1431 mutex_lock(&fs_info->qgroup_rescan_lock);
1432 spin_lock(&fs_info->qgroup_lock); 1441 spin_lock(&fs_info->qgroup_lock);
1433 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
1434 if (fs_info->qgroup_rescan_progress.objectid <= node->bytenr) {
1435 ret = 0;
1436 goto unlock;
1437 }
1438 }
1439 1442
1440 quota_root = fs_info->quota_root; 1443 quota_root = fs_info->quota_root;
1441 if (!quota_root) 1444 if (!quota_root)
@@ -1448,39 +1451,34 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1448 /* 1451 /*
1449 * step 1: for each old ref, visit all nodes once and inc refcnt 1452 * step 1: for each old ref, visit all nodes once and inc refcnt
1450 */ 1453 */
1451 tmp = ulist_alloc(GFP_ATOMIC); 1454 ulist_reinit(fs_info->qgroup_ulist);
1452 if (!tmp) {
1453 ret = -ENOMEM;
1454 goto unlock;
1455 }
1456 seq = fs_info->qgroup_seq; 1455 seq = fs_info->qgroup_seq;
1457 fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */ 1456 fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */
1458 1457
1459 ret = qgroup_account_ref_step1(fs_info, roots, tmp, seq); 1458 ret = qgroup_account_ref_step1(fs_info, roots, fs_info->qgroup_ulist,
1459 seq);
1460 if (ret) 1460 if (ret)
1461 goto unlock; 1461 goto unlock;
1462 1462
1463 /* 1463 /*
1464 * step 2: walk from the new root 1464 * step 2: walk from the new root
1465 */ 1465 */
1466 ret = qgroup_account_ref_step2(fs_info, roots, tmp, seq, sgn, 1466 ret = qgroup_account_ref_step2(fs_info, roots, fs_info->qgroup_ulist,
1467 node->num_bytes, qgroup); 1467 seq, sgn, node->num_bytes, qgroup);
1468 if (ret) 1468 if (ret)
1469 goto unlock; 1469 goto unlock;
1470 1470
1471 /* 1471 /*
1472 * step 3: walk again from old refs 1472 * step 3: walk again from old refs
1473 */ 1473 */
1474 ret = qgroup_account_ref_step3(fs_info, roots, tmp, seq, sgn, 1474 ret = qgroup_account_ref_step3(fs_info, roots, fs_info->qgroup_ulist,
1475 node->num_bytes); 1475 seq, sgn, node->num_bytes);
1476 if (ret) 1476 if (ret)
1477 goto unlock; 1477 goto unlock;
1478 1478
1479unlock: 1479unlock:
1480 spin_unlock(&fs_info->qgroup_lock); 1480 spin_unlock(&fs_info->qgroup_lock);
1481 mutex_unlock(&fs_info->qgroup_rescan_lock);
1482 ulist_free(roots); 1481 ulist_free(roots);
1483 ulist_free(tmp);
1484 1482
1485 return ret; 1483 return ret;
1486} 1484}
@@ -1527,9 +1525,12 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
1527 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 1525 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
1528 1526
1529 if (!ret && start_rescan_worker) { 1527 if (!ret && start_rescan_worker) {
1530 ret = btrfs_qgroup_rescan(fs_info); 1528 ret = qgroup_rescan_init(fs_info, 0, 1);
1531 if (ret) 1529 if (!ret) {
1532 pr_err("btrfs: start rescan quota failed: %d\n", ret); 1530 qgroup_rescan_zero_tracking(fs_info);
1531 btrfs_queue_worker(&fs_info->qgroup_rescan_workers,
1532 &fs_info->qgroup_rescan_work);
1533 }
1533 ret = 0; 1534 ret = 0;
1534 } 1535 }
1535 1536
@@ -1720,7 +1721,6 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
1720 struct btrfs_fs_info *fs_info = root->fs_info; 1721 struct btrfs_fs_info *fs_info = root->fs_info;
1721 u64 ref_root = root->root_key.objectid; 1722 u64 ref_root = root->root_key.objectid;
1722 int ret = 0; 1723 int ret = 0;
1723 struct ulist *ulist = NULL;
1724 struct ulist_node *unode; 1724 struct ulist_node *unode;
1725 struct ulist_iterator uiter; 1725 struct ulist_iterator uiter;
1726 1726
@@ -1743,17 +1743,13 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
1743 * in a first step, we check all affected qgroups if any limits would 1743 * in a first step, we check all affected qgroups if any limits would
1744 * be exceeded 1744 * be exceeded
1745 */ 1745 */
1746 ulist = ulist_alloc(GFP_ATOMIC); 1746 ulist_reinit(fs_info->qgroup_ulist);
1747 if (!ulist) { 1747 ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
1748 ret = -ENOMEM;
1749 goto out;
1750 }
1751 ret = ulist_add(ulist, qgroup->qgroupid,
1752 (uintptr_t)qgroup, GFP_ATOMIC); 1748 (uintptr_t)qgroup, GFP_ATOMIC);
1753 if (ret < 0) 1749 if (ret < 0)
1754 goto out; 1750 goto out;
1755 ULIST_ITER_INIT(&uiter); 1751 ULIST_ITER_INIT(&uiter);
1756 while ((unode = ulist_next(ulist, &uiter))) { 1752 while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
1757 struct btrfs_qgroup *qg; 1753 struct btrfs_qgroup *qg;
1758 struct btrfs_qgroup_list *glist; 1754 struct btrfs_qgroup_list *glist;
1759 1755
@@ -1774,7 +1770,8 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
1774 } 1770 }
1775 1771
1776 list_for_each_entry(glist, &qg->groups, next_group) { 1772 list_for_each_entry(glist, &qg->groups, next_group) {
1777 ret = ulist_add(ulist, glist->group->qgroupid, 1773 ret = ulist_add(fs_info->qgroup_ulist,
1774 glist->group->qgroupid,
1778 (uintptr_t)glist->group, GFP_ATOMIC); 1775 (uintptr_t)glist->group, GFP_ATOMIC);
1779 if (ret < 0) 1776 if (ret < 0)
1780 goto out; 1777 goto out;
@@ -1785,7 +1782,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
1785 * no limits exceeded, now record the reservation into all qgroups 1782 * no limits exceeded, now record the reservation into all qgroups
1786 */ 1783 */
1787 ULIST_ITER_INIT(&uiter); 1784 ULIST_ITER_INIT(&uiter);
1788 while ((unode = ulist_next(ulist, &uiter))) { 1785 while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
1789 struct btrfs_qgroup *qg; 1786 struct btrfs_qgroup *qg;
1790 1787
1791 qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; 1788 qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
@@ -1795,8 +1792,6 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
1795 1792
1796out: 1793out:
1797 spin_unlock(&fs_info->qgroup_lock); 1794 spin_unlock(&fs_info->qgroup_lock);
1798 ulist_free(ulist);
1799
1800 return ret; 1795 return ret;
1801} 1796}
1802 1797
@@ -1805,7 +1800,6 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
1805 struct btrfs_root *quota_root; 1800 struct btrfs_root *quota_root;
1806 struct btrfs_qgroup *qgroup; 1801 struct btrfs_qgroup *qgroup;
1807 struct btrfs_fs_info *fs_info = root->fs_info; 1802 struct btrfs_fs_info *fs_info = root->fs_info;
1808 struct ulist *ulist = NULL;
1809 struct ulist_node *unode; 1803 struct ulist_node *unode;
1810 struct ulist_iterator uiter; 1804 struct ulist_iterator uiter;
1811 u64 ref_root = root->root_key.objectid; 1805 u64 ref_root = root->root_key.objectid;
@@ -1827,17 +1821,13 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
1827 if (!qgroup) 1821 if (!qgroup)
1828 goto out; 1822 goto out;
1829 1823
1830 ulist = ulist_alloc(GFP_ATOMIC); 1824 ulist_reinit(fs_info->qgroup_ulist);
1831 if (!ulist) { 1825 ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
1832 btrfs_std_error(fs_info, -ENOMEM);
1833 goto out;
1834 }
1835 ret = ulist_add(ulist, qgroup->qgroupid,
1836 (uintptr_t)qgroup, GFP_ATOMIC); 1826 (uintptr_t)qgroup, GFP_ATOMIC);
1837 if (ret < 0) 1827 if (ret < 0)
1838 goto out; 1828 goto out;
1839 ULIST_ITER_INIT(&uiter); 1829 ULIST_ITER_INIT(&uiter);
1840 while ((unode = ulist_next(ulist, &uiter))) { 1830 while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
1841 struct btrfs_qgroup *qg; 1831 struct btrfs_qgroup *qg;
1842 struct btrfs_qgroup_list *glist; 1832 struct btrfs_qgroup_list *glist;
1843 1833
@@ -1846,7 +1836,8 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
1846 qg->reserved -= num_bytes; 1836 qg->reserved -= num_bytes;
1847 1837
1848 list_for_each_entry(glist, &qg->groups, next_group) { 1838 list_for_each_entry(glist, &qg->groups, next_group) {
1849 ret = ulist_add(ulist, glist->group->qgroupid, 1839 ret = ulist_add(fs_info->qgroup_ulist,
1840 glist->group->qgroupid,
1850 (uintptr_t)glist->group, GFP_ATOMIC); 1841 (uintptr_t)glist->group, GFP_ATOMIC);
1851 if (ret < 0) 1842 if (ret < 0)
1852 goto out; 1843 goto out;
@@ -1855,7 +1846,6 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
1855 1846
1856out: 1847out:
1857 spin_unlock(&fs_info->qgroup_lock); 1848 spin_unlock(&fs_info->qgroup_lock);
1858 ulist_free(ulist);
1859} 1849}
1860 1850
1861void assert_qgroups_uptodate(struct btrfs_trans_handle *trans) 1851void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
@@ -1874,12 +1864,11 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
1874 * returns 1 when done, 2 when done and FLAG_INCONSISTENT was cleared. 1864 * returns 1 when done, 2 when done and FLAG_INCONSISTENT was cleared.
1875 */ 1865 */
1876static int 1866static int
1877qgroup_rescan_leaf(struct qgroup_rescan *qscan, struct btrfs_path *path, 1867qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
1878 struct btrfs_trans_handle *trans, struct ulist *tmp, 1868 struct btrfs_trans_handle *trans, struct ulist *tmp,
1879 struct extent_buffer *scratch_leaf) 1869 struct extent_buffer *scratch_leaf)
1880{ 1870{
1881 struct btrfs_key found; 1871 struct btrfs_key found;
1882 struct btrfs_fs_info *fs_info = qscan->fs_info;
1883 struct ulist *roots = NULL; 1872 struct ulist *roots = NULL;
1884 struct ulist_node *unode; 1873 struct ulist_node *unode;
1885 struct ulist_iterator uiter; 1874 struct ulist_iterator uiter;
@@ -2007,11 +1996,10 @@ out:
2007 1996
2008static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) 1997static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
2009{ 1998{
2010 struct qgroup_rescan *qscan = container_of(work, struct qgroup_rescan, 1999 struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info,
2011 work); 2000 qgroup_rescan_work);
2012 struct btrfs_path *path; 2001 struct btrfs_path *path;
2013 struct btrfs_trans_handle *trans = NULL; 2002 struct btrfs_trans_handle *trans = NULL;
2014 struct btrfs_fs_info *fs_info = qscan->fs_info;
2015 struct ulist *tmp = NULL; 2003 struct ulist *tmp = NULL;
2016 struct extent_buffer *scratch_leaf = NULL; 2004 struct extent_buffer *scratch_leaf = NULL;
2017 int err = -ENOMEM; 2005 int err = -ENOMEM;
@@ -2036,7 +2024,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
2036 if (!fs_info->quota_enabled) { 2024 if (!fs_info->quota_enabled) {
2037 err = -EINTR; 2025 err = -EINTR;
2038 } else { 2026 } else {
2039 err = qgroup_rescan_leaf(qscan, path, trans, 2027 err = qgroup_rescan_leaf(fs_info, path, trans,
2040 tmp, scratch_leaf); 2028 tmp, scratch_leaf);
2041 } 2029 }
2042 if (err > 0) 2030 if (err > 0)
@@ -2049,7 +2037,6 @@ out:
2049 kfree(scratch_leaf); 2037 kfree(scratch_leaf);
2050 ulist_free(tmp); 2038 ulist_free(tmp);
2051 btrfs_free_path(path); 2039 btrfs_free_path(path);
2052 kfree(qscan);
2053 2040
2054 mutex_lock(&fs_info->qgroup_rescan_lock); 2041 mutex_lock(&fs_info->qgroup_rescan_lock);
2055 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; 2042 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
@@ -2068,47 +2055,74 @@ out:
2068 } else { 2055 } else {
2069 pr_err("btrfs: qgroup scan failed with %d\n", err); 2056 pr_err("btrfs: qgroup scan failed with %d\n", err);
2070 } 2057 }
2071}
2072 2058
2073static void 2059 complete_all(&fs_info->qgroup_rescan_completion);
2074qgroup_rescan_start(struct btrfs_fs_info *fs_info, struct qgroup_rescan *qscan)
2075{
2076 memset(&qscan->work, 0, sizeof(qscan->work));
2077 qscan->work.func = btrfs_qgroup_rescan_worker;
2078 qscan->fs_info = fs_info;
2079
2080 pr_info("btrfs: qgroup scan started\n");
2081 btrfs_queue_worker(&fs_info->qgroup_rescan_workers, &qscan->work);
2082} 2060}
2083 2061
2084int 2062/*
2085btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info) 2063 * Checks that (a) no rescan is running and (b) quota is enabled. Allocates all
2064 * memory required for the rescan context.
2065 */
2066static int
2067qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
2068 int init_flags)
2086{ 2069{
2087 int ret = 0; 2070 int ret = 0;
2088 struct rb_node *n;
2089 struct btrfs_qgroup *qgroup;
2090 struct qgroup_rescan *qscan = kmalloc(sizeof(*qscan), GFP_NOFS);
2091 2071
2092 if (!qscan) 2072 if (!init_flags &&
2093 return -ENOMEM; 2073 (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) ||
2074 !(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))) {
2075 ret = -EINVAL;
2076 goto err;
2077 }
2094 2078
2095 mutex_lock(&fs_info->qgroup_rescan_lock); 2079 mutex_lock(&fs_info->qgroup_rescan_lock);
2096 spin_lock(&fs_info->qgroup_lock); 2080 spin_lock(&fs_info->qgroup_lock);
2097 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) 2081
2098 ret = -EINPROGRESS; 2082 if (init_flags) {
2099 else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) 2083 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
2100 ret = -EINVAL; 2084 ret = -EINPROGRESS;
2101 if (ret) { 2085 else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
2102 spin_unlock(&fs_info->qgroup_lock); 2086 ret = -EINVAL;
2103 mutex_unlock(&fs_info->qgroup_rescan_lock); 2087
2104 kfree(qscan); 2088 if (ret) {
2105 return ret; 2089 spin_unlock(&fs_info->qgroup_lock);
2090 mutex_unlock(&fs_info->qgroup_rescan_lock);
2091 goto err;
2092 }
2093
2094 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN;
2106 } 2095 }
2107 2096
2108 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN;
2109 memset(&fs_info->qgroup_rescan_progress, 0, 2097 memset(&fs_info->qgroup_rescan_progress, 0,
2110 sizeof(fs_info->qgroup_rescan_progress)); 2098 sizeof(fs_info->qgroup_rescan_progress));
2099 fs_info->qgroup_rescan_progress.objectid = progress_objectid;
2100
2101 spin_unlock(&fs_info->qgroup_lock);
2102 mutex_unlock(&fs_info->qgroup_rescan_lock);
2103
2104 init_completion(&fs_info->qgroup_rescan_completion);
2105
2106 memset(&fs_info->qgroup_rescan_work, 0,
2107 sizeof(fs_info->qgroup_rescan_work));
2108 fs_info->qgroup_rescan_work.func = btrfs_qgroup_rescan_worker;
2109
2110 if (ret) {
2111err:
2112 pr_info("btrfs: qgroup_rescan_init failed with %d\n", ret);
2113 return ret;
2114 }
2115
2116 return 0;
2117}
2118
2119static void
2120qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info)
2121{
2122 struct rb_node *n;
2123 struct btrfs_qgroup *qgroup;
2111 2124
2125 spin_lock(&fs_info->qgroup_lock);
2112 /* clear all current qgroup tracking information */ 2126 /* clear all current qgroup tracking information */
2113 for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) { 2127 for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) {
2114 qgroup = rb_entry(n, struct btrfs_qgroup, node); 2128 qgroup = rb_entry(n, struct btrfs_qgroup, node);
@@ -2118,9 +2132,74 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
2118 qgroup->excl_cmpr = 0; 2132 qgroup->excl_cmpr = 0;
2119 } 2133 }
2120 spin_unlock(&fs_info->qgroup_lock); 2134 spin_unlock(&fs_info->qgroup_lock);
2121 mutex_unlock(&fs_info->qgroup_rescan_lock); 2135}
2136
2137int
2138btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
2139{
2140 int ret = 0;
2141 struct btrfs_trans_handle *trans;
2122 2142
2123 qgroup_rescan_start(fs_info, qscan); 2143 ret = qgroup_rescan_init(fs_info, 0, 1);
2144 if (ret)
2145 return ret;
2146
2147 /*
2148 * We have set the rescan_progress to 0, which means no more
2149 * delayed refs will be accounted by btrfs_qgroup_account_ref.
2150 * However, btrfs_qgroup_account_ref may be right after its call
2151 * to btrfs_find_all_roots, in which case it would still do the
2152 * accounting.
2153 * To solve this, we're committing the transaction, which will
2154 * ensure we run all delayed refs and only after that, we are
2155 * going to clear all tracking information for a clean start.
2156 */
2157
2158 trans = btrfs_join_transaction(fs_info->fs_root);
2159 if (IS_ERR(trans)) {
2160 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
2161 return PTR_ERR(trans);
2162 }
2163 ret = btrfs_commit_transaction(trans, fs_info->fs_root);
2164 if (ret) {
2165 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
2166 return ret;
2167 }
2168
2169 qgroup_rescan_zero_tracking(fs_info);
2170
2171 btrfs_queue_worker(&fs_info->qgroup_rescan_workers,
2172 &fs_info->qgroup_rescan_work);
2124 2173
2125 return 0; 2174 return 0;
2126} 2175}
2176
2177int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info)
2178{
2179 int running;
2180 int ret = 0;
2181
2182 mutex_lock(&fs_info->qgroup_rescan_lock);
2183 spin_lock(&fs_info->qgroup_lock);
2184 running = fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN;
2185 spin_unlock(&fs_info->qgroup_lock);
2186 mutex_unlock(&fs_info->qgroup_rescan_lock);
2187
2188 if (running)
2189 ret = wait_for_completion_interruptible(
2190 &fs_info->qgroup_rescan_completion);
2191
2192 return ret;
2193}
2194
2195/*
2196 * this is only called from open_ctree where we're still single threaded, thus
2197 * locking is omitted here.
2198 */
2199void
2200btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
2201{
2202 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
2203 btrfs_queue_worker(&fs_info->qgroup_rescan_workers,
2204 &fs_info->qgroup_rescan_work);
2205}
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 4febca4fc2de..12096496cc99 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1305,6 +1305,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
1305 struct extent_buffer *eb; 1305 struct extent_buffer *eb;
1306 struct btrfs_root_item *root_item; 1306 struct btrfs_root_item *root_item;
1307 struct btrfs_key root_key; 1307 struct btrfs_key root_key;
1308 u64 last_snap = 0;
1308 int ret; 1309 int ret;
1309 1310
1310 root_item = kmalloc(sizeof(*root_item), GFP_NOFS); 1311 root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
@@ -1320,6 +1321,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
1320 BTRFS_TREE_RELOC_OBJECTID); 1321 BTRFS_TREE_RELOC_OBJECTID);
1321 BUG_ON(ret); 1322 BUG_ON(ret);
1322 1323
1324 last_snap = btrfs_root_last_snapshot(&root->root_item);
1323 btrfs_set_root_last_snapshot(&root->root_item, 1325 btrfs_set_root_last_snapshot(&root->root_item,
1324 trans->transid - 1); 1326 trans->transid - 1);
1325 } else { 1327 } else {
@@ -1345,6 +1347,12 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
1345 memset(&root_item->drop_progress, 0, 1347 memset(&root_item->drop_progress, 0,
1346 sizeof(struct btrfs_disk_key)); 1348 sizeof(struct btrfs_disk_key));
1347 root_item->drop_level = 0; 1349 root_item->drop_level = 0;
1350 /*
1351 * abuse rtransid, it is safe because it is impossible to
1352 * receive data into a relocation tree.
1353 */
1354 btrfs_set_root_rtransid(root_item, last_snap);
1355 btrfs_set_root_otransid(root_item, trans->transid);
1348 } 1356 }
1349 1357
1350 btrfs_tree_unlock(eb); 1358 btrfs_tree_unlock(eb);
@@ -1355,8 +1363,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
1355 BUG_ON(ret); 1363 BUG_ON(ret);
1356 kfree(root_item); 1364 kfree(root_item);
1357 1365
1358 reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root, 1366 reloc_root = btrfs_read_fs_root(root->fs_info->tree_root, &root_key);
1359 &root_key);
1360 BUG_ON(IS_ERR(reloc_root)); 1367 BUG_ON(IS_ERR(reloc_root));
1361 reloc_root->last_trans = trans->transid; 1368 reloc_root->last_trans = trans->transid;
1362 return reloc_root; 1369 return reloc_root;
@@ -2273,8 +2280,12 @@ void free_reloc_roots(struct list_head *list)
2273static noinline_for_stack 2280static noinline_for_stack
2274int merge_reloc_roots(struct reloc_control *rc) 2281int merge_reloc_roots(struct reloc_control *rc)
2275{ 2282{
2283 struct btrfs_trans_handle *trans;
2276 struct btrfs_root *root; 2284 struct btrfs_root *root;
2277 struct btrfs_root *reloc_root; 2285 struct btrfs_root *reloc_root;
2286 u64 last_snap;
2287 u64 otransid;
2288 u64 objectid;
2278 LIST_HEAD(reloc_roots); 2289 LIST_HEAD(reloc_roots);
2279 int found = 0; 2290 int found = 0;
2280 int ret = 0; 2291 int ret = 0;
@@ -2308,12 +2319,44 @@ again:
2308 } else { 2319 } else {
2309 list_del_init(&reloc_root->root_list); 2320 list_del_init(&reloc_root->root_list);
2310 } 2321 }
2322
2323 /*
 2324 * we keep the old last snapshot transid in rtransid when we
2325 * created the relocation tree.
2326 */
2327 last_snap = btrfs_root_rtransid(&reloc_root->root_item);
2328 otransid = btrfs_root_otransid(&reloc_root->root_item);
2329 objectid = reloc_root->root_key.offset;
2330
2311 ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1); 2331 ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1);
2312 if (ret < 0) { 2332 if (ret < 0) {
2313 if (list_empty(&reloc_root->root_list)) 2333 if (list_empty(&reloc_root->root_list))
2314 list_add_tail(&reloc_root->root_list, 2334 list_add_tail(&reloc_root->root_list,
2315 &reloc_roots); 2335 &reloc_roots);
2316 goto out; 2336 goto out;
2337 } else if (!ret) {
2338 /*
2339 * recover the last snapshot tranid to avoid
2340 * the space balance break NOCOW.
2341 */
2342 root = read_fs_root(rc->extent_root->fs_info,
2343 objectid);
2344 if (IS_ERR(root))
2345 continue;
2346
2347 if (btrfs_root_refs(&root->root_item) == 0)
2348 continue;
2349
2350 trans = btrfs_join_transaction(root);
2351 BUG_ON(IS_ERR(trans));
2352
 2353 /* Check if the fs/file tree was snapshotted or not. */
2354 if (btrfs_root_last_snapshot(&root->root_item) ==
2355 otransid - 1)
2356 btrfs_set_root_last_snapshot(&root->root_item,
2357 last_snap);
2358
2359 btrfs_end_transaction(trans, root);
2317 } 2360 }
2318 } 2361 }
2319 2362
@@ -3266,6 +3309,8 @@ static int __add_tree_block(struct reloc_control *rc,
3266 struct btrfs_path *path; 3309 struct btrfs_path *path;
3267 struct btrfs_key key; 3310 struct btrfs_key key;
3268 int ret; 3311 int ret;
3312 bool skinny = btrfs_fs_incompat(rc->extent_root->fs_info,
3313 SKINNY_METADATA);
3269 3314
3270 if (tree_block_processed(bytenr, blocksize, rc)) 3315 if (tree_block_processed(bytenr, blocksize, rc))
3271 return 0; 3316 return 0;
@@ -3276,10 +3321,15 @@ static int __add_tree_block(struct reloc_control *rc,
3276 path = btrfs_alloc_path(); 3321 path = btrfs_alloc_path();
3277 if (!path) 3322 if (!path)
3278 return -ENOMEM; 3323 return -ENOMEM;
3279 3324again:
3280 key.objectid = bytenr; 3325 key.objectid = bytenr;
3281 key.type = BTRFS_EXTENT_ITEM_KEY; 3326 if (skinny) {
3282 key.offset = blocksize; 3327 key.type = BTRFS_METADATA_ITEM_KEY;
3328 key.offset = (u64)-1;
3329 } else {
3330 key.type = BTRFS_EXTENT_ITEM_KEY;
3331 key.offset = blocksize;
3332 }
3283 3333
3284 path->search_commit_root = 1; 3334 path->search_commit_root = 1;
3285 path->skip_locking = 1; 3335 path->skip_locking = 1;
@@ -3287,11 +3337,23 @@ static int __add_tree_block(struct reloc_control *rc,
3287 if (ret < 0) 3337 if (ret < 0)
3288 goto out; 3338 goto out;
3289 3339
3290 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 3340 if (ret > 0 && skinny) {
3291 if (ret > 0) { 3341 if (path->slots[0]) {
3292 if (key.objectid == bytenr && 3342 path->slots[0]--;
3293 key.type == BTRFS_METADATA_ITEM_KEY) 3343 btrfs_item_key_to_cpu(path->nodes[0], &key,
3294 ret = 0; 3344 path->slots[0]);
3345 if (key.objectid == bytenr &&
3346 (key.type == BTRFS_METADATA_ITEM_KEY ||
3347 (key.type == BTRFS_EXTENT_ITEM_KEY &&
3348 key.offset == blocksize)))
3349 ret = 0;
3350 }
3351
3352 if (ret) {
3353 skinny = false;
3354 btrfs_release_path(path);
3355 goto again;
3356 }
3295 } 3357 }
3296 BUG_ON(ret); 3358 BUG_ON(ret);
3297 3359
@@ -4160,12 +4222,12 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
4160 (unsigned long long)rc->block_group->key.objectid, 4222 (unsigned long long)rc->block_group->key.objectid,
4161 (unsigned long long)rc->block_group->flags); 4223 (unsigned long long)rc->block_group->flags);
4162 4224
4163 ret = btrfs_start_delalloc_inodes(fs_info->tree_root, 0); 4225 ret = btrfs_start_all_delalloc_inodes(fs_info, 0);
4164 if (ret < 0) { 4226 if (ret < 0) {
4165 err = ret; 4227 err = ret;
4166 goto out; 4228 goto out;
4167 } 4229 }
4168 btrfs_wait_ordered_extents(fs_info->tree_root, 0); 4230 btrfs_wait_all_ordered_extents(fs_info, 0);
4169 4231
4170 while (1) { 4232 while (1) {
4171 mutex_lock(&fs_info->cleaner_mutex); 4233 mutex_lock(&fs_info->cleaner_mutex);
@@ -4277,7 +4339,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
4277 key.type != BTRFS_ROOT_ITEM_KEY) 4339 key.type != BTRFS_ROOT_ITEM_KEY)
4278 break; 4340 break;
4279 4341
4280 reloc_root = btrfs_read_fs_root_no_radix(root, &key); 4342 reloc_root = btrfs_read_fs_root(root, &key);
4281 if (IS_ERR(reloc_root)) { 4343 if (IS_ERR(reloc_root)) {
4282 err = PTR_ERR(reloc_root); 4344 err = PTR_ERR(reloc_root);
4283 goto out; 4345 goto out;
@@ -4396,10 +4458,8 @@ out:
4396int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) 4458int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
4397{ 4459{
4398 struct btrfs_ordered_sum *sums; 4460 struct btrfs_ordered_sum *sums;
4399 struct btrfs_sector_sum *sector_sum;
4400 struct btrfs_ordered_extent *ordered; 4461 struct btrfs_ordered_extent *ordered;
4401 struct btrfs_root *root = BTRFS_I(inode)->root; 4462 struct btrfs_root *root = BTRFS_I(inode)->root;
4402 size_t offset;
4403 int ret; 4463 int ret;
4404 u64 disk_bytenr; 4464 u64 disk_bytenr;
4405 LIST_HEAD(list); 4465 LIST_HEAD(list);
@@ -4413,19 +4473,13 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
4413 if (ret) 4473 if (ret)
4414 goto out; 4474 goto out;
4415 4475
4476 disk_bytenr = ordered->start;
4416 while (!list_empty(&list)) { 4477 while (!list_empty(&list)) {
4417 sums = list_entry(list.next, struct btrfs_ordered_sum, list); 4478 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
4418 list_del_init(&sums->list); 4479 list_del_init(&sums->list);
4419 4480
4420 sector_sum = sums->sums; 4481 sums->bytenr = disk_bytenr;
4421 sums->bytenr = ordered->start; 4482 disk_bytenr += sums->len;
4422
4423 offset = 0;
4424 while (offset < sums->len) {
4425 sector_sum->bytenr += ordered->start - disk_bytenr;
4426 sector_sum++;
4427 offset += root->sectorsize;
4428 }
4429 4483
4430 btrfs_add_ordered_sum(inode, ordered, sums); 4484 btrfs_add_ordered_sum(inode, ordered, sums);
4431 } 4485 }
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 5bf1ed57f178..ffb1036ef10d 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -64,52 +64,59 @@ void btrfs_read_root_item(struct extent_buffer *eb, int slot,
64} 64}
65 65
66/* 66/*
67 * lookup the root with the highest offset for a given objectid. The key we do 67 * btrfs_find_root - lookup the root by the key.
68 * find is copied into 'key'. If we find something return 0, otherwise 1, < 0 68 * root: the root of the root tree
69 * on error. 69 * search_key: the key to search
70 * path: the path we search
71 * root_item: the root item of the tree we look for
 72 * root_key: the real key of the tree we look for
73 *
 74 * If ->offset of 'search_key' is -1ULL, it means we are not sure the offset
75 * of the search key, just lookup the root with the highest offset for a
76 * given objectid.
77 *
78 * If we find something return 0, otherwise > 0, < 0 on error.
70 */ 79 */
71int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, 80int btrfs_find_root(struct btrfs_root *root, struct btrfs_key *search_key,
72 struct btrfs_root_item *item, struct btrfs_key *key) 81 struct btrfs_path *path, struct btrfs_root_item *root_item,
82 struct btrfs_key *root_key)
73{ 83{
74 struct btrfs_path *path;
75 struct btrfs_key search_key;
76 struct btrfs_key found_key; 84 struct btrfs_key found_key;
77 struct extent_buffer *l; 85 struct extent_buffer *l;
78 int ret; 86 int ret;
79 int slot; 87 int slot;
80 88
81 search_key.objectid = objectid; 89 ret = btrfs_search_slot(NULL, root, search_key, path, 0, 0);
82 search_key.type = BTRFS_ROOT_ITEM_KEY;
83 search_key.offset = (u64)-1;
84
85 path = btrfs_alloc_path();
86 if (!path)
87 return -ENOMEM;
88 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
89 if (ret < 0) 90 if (ret < 0)
90 goto out; 91 return ret;
91 92
92 BUG_ON(ret == 0); 93 if (search_key->offset != -1ULL) { /* the search key is exact */
93 if (path->slots[0] == 0) { 94 if (ret > 0)
94 ret = 1; 95 goto out;
95 goto out; 96 } else {
97 BUG_ON(ret == 0); /* Logical error */
98 if (path->slots[0] == 0)
99 goto out;
100 path->slots[0]--;
101 ret = 0;
96 } 102 }
103
97 l = path->nodes[0]; 104 l = path->nodes[0];
98 slot = path->slots[0] - 1; 105 slot = path->slots[0];
106
99 btrfs_item_key_to_cpu(l, &found_key, slot); 107 btrfs_item_key_to_cpu(l, &found_key, slot);
100 if (found_key.objectid != objectid || 108 if (found_key.objectid != search_key->objectid ||
101 found_key.type != BTRFS_ROOT_ITEM_KEY) { 109 found_key.type != BTRFS_ROOT_ITEM_KEY) {
102 ret = 1; 110 ret = 1;
103 goto out; 111 goto out;
104 } 112 }
105 if (item)
106 btrfs_read_root_item(l, slot, item);
107 if (key)
108 memcpy(key, &found_key, sizeof(found_key));
109 113
110 ret = 0; 114 if (root_item)
115 btrfs_read_root_item(l, slot, root_item);
116 if (root_key)
117 memcpy(root_key, &found_key, sizeof(found_key));
111out: 118out:
112 btrfs_free_path(path); 119 btrfs_release_path(path);
113 return ret; 120 return ret;
114} 121}
115 122
@@ -212,86 +219,6 @@ int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
212 return btrfs_insert_item(trans, root, key, item, sizeof(*item)); 219 return btrfs_insert_item(trans, root, key, item, sizeof(*item));
213} 220}
214 221
215/*
216 * at mount time we want to find all the old transaction snapshots that were in
217 * the process of being deleted if we crashed. This is any root item with an
218 * offset lower than the latest root. They need to be queued for deletion to
219 * finish what was happening when we crashed.
220 */
221int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid)
222{
223 struct btrfs_root *dead_root;
224 struct btrfs_root_item *ri;
225 struct btrfs_key key;
226 struct btrfs_key found_key;
227 struct btrfs_path *path;
228 int ret;
229 u32 nritems;
230 struct extent_buffer *leaf;
231 int slot;
232
233 key.objectid = objectid;
234 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
235 key.offset = 0;
236 path = btrfs_alloc_path();
237 if (!path)
238 return -ENOMEM;
239
240again:
241 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
242 if (ret < 0)
243 goto err;
244 while (1) {
245 leaf = path->nodes[0];
246 nritems = btrfs_header_nritems(leaf);
247 slot = path->slots[0];
248 if (slot >= nritems) {
249 ret = btrfs_next_leaf(root, path);
250 if (ret)
251 break;
252 leaf = path->nodes[0];
253 nritems = btrfs_header_nritems(leaf);
254 slot = path->slots[0];
255 }
256 btrfs_item_key_to_cpu(leaf, &key, slot);
257 if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY)
258 goto next;
259
260 if (key.objectid < objectid)
261 goto next;
262
263 if (key.objectid > objectid)
264 break;
265
266 ri = btrfs_item_ptr(leaf, slot, struct btrfs_root_item);
267 if (btrfs_disk_root_refs(leaf, ri) != 0)
268 goto next;
269
270 memcpy(&found_key, &key, sizeof(key));
271 key.offset++;
272 btrfs_release_path(path);
273 dead_root =
274 btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
275 &found_key);
276 if (IS_ERR(dead_root)) {
277 ret = PTR_ERR(dead_root);
278 goto err;
279 }
280
281 ret = btrfs_add_dead_root(dead_root);
282 if (ret)
283 goto err;
284 goto again;
285next:
286 slot++;
287 path->slots[0]++;
288 }
289 ret = 0;
290err:
291 btrfs_free_path(path);
292 return ret;
293}
294
295int btrfs_find_orphan_roots(struct btrfs_root *tree_root) 222int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
296{ 223{
297 struct extent_buffer *leaf; 224 struct extent_buffer *leaf;
@@ -301,6 +228,10 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
301 struct btrfs_root *root; 228 struct btrfs_root *root;
302 int err = 0; 229 int err = 0;
303 int ret; 230 int ret;
231 bool can_recover = true;
232
233 if (tree_root->fs_info->sb->s_flags & MS_RDONLY)
234 can_recover = false;
304 235
305 path = btrfs_alloc_path(); 236 path = btrfs_alloc_path();
306 if (!path) 237 if (!path)
@@ -340,20 +271,52 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
340 root_key.objectid = key.offset; 271 root_key.objectid = key.offset;
341 key.offset++; 272 key.offset++;
342 273
343 root = btrfs_read_fs_root_no_name(tree_root->fs_info, 274 root = btrfs_read_fs_root(tree_root, &root_key);
344 &root_key); 275 err = PTR_RET(root);
345 if (!IS_ERR(root)) 276 if (err && err != -ENOENT) {
277 break;
278 } else if (err == -ENOENT) {
279 struct btrfs_trans_handle *trans;
280
281 btrfs_release_path(path);
282
283 trans = btrfs_join_transaction(tree_root);
284 if (IS_ERR(trans)) {
285 err = PTR_ERR(trans);
286 btrfs_error(tree_root->fs_info, err,
287 "Failed to start trans to delete "
288 "orphan item");
289 break;
290 }
291 err = btrfs_del_orphan_item(trans, tree_root,
292 root_key.objectid);
293 btrfs_end_transaction(trans, tree_root);
294 if (err) {
295 btrfs_error(tree_root->fs_info, err,
296 "Failed to delete root orphan "
297 "item");
298 break;
299 }
346 continue; 300 continue;
301 }
347 302
348 ret = PTR_ERR(root); 303 if (btrfs_root_refs(&root->root_item) == 0) {
349 if (ret != -ENOENT) { 304 btrfs_add_dead_root(root);
350 err = ret; 305 continue;
306 }
307
308 err = btrfs_init_fs_root(root);
309 if (err) {
310 btrfs_free_fs_root(root);
351 break; 311 break;
352 } 312 }
353 313
354 ret = btrfs_find_dead_roots(tree_root, root_key.objectid); 314 root->orphan_item_inserted = 1;
355 if (ret) { 315
356 err = ret; 316 err = btrfs_insert_fs_root(root->fs_info, root);
317 if (err) {
318 BUG_ON(err == -EEXIST);
319 btrfs_free_fs_root(root);
357 break; 320 break;
358 } 321 }
359 } 322 }
@@ -368,8 +331,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
368{ 331{
369 struct btrfs_path *path; 332 struct btrfs_path *path;
370 int ret; 333 int ret;
371 struct btrfs_root_item *ri;
372 struct extent_buffer *leaf;
373 334
374 path = btrfs_alloc_path(); 335 path = btrfs_alloc_path();
375 if (!path) 336 if (!path)
@@ -379,8 +340,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
379 goto out; 340 goto out;
380 341
381 BUG_ON(ret != 0); 342 BUG_ON(ret != 0);
382 leaf = path->nodes[0];
383 ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item);
384 343
385 ret = btrfs_del_item(trans, root, path); 344 ret = btrfs_del_item(trans, root, path);
386out: 345out:
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 79bd479317cb..64a157becbe5 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2126,8 +2126,7 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
2126 u8 *csum) 2126 u8 *csum)
2127{ 2127{
2128 struct btrfs_ordered_sum *sum = NULL; 2128 struct btrfs_ordered_sum *sum = NULL;
2129 int ret = 0; 2129 unsigned long index;
2130 unsigned long i;
2131 unsigned long num_sectors; 2130 unsigned long num_sectors;
2132 2131
2133 while (!list_empty(&sctx->csum_list)) { 2132 while (!list_empty(&sctx->csum_list)) {
@@ -2146,19 +2145,14 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
2146 if (!sum) 2145 if (!sum)
2147 return 0; 2146 return 0;
2148 2147
2148 index = ((u32)(logical - sum->bytenr)) / sctx->sectorsize;
2149 num_sectors = sum->len / sctx->sectorsize; 2149 num_sectors = sum->len / sctx->sectorsize;
2150 for (i = 0; i < num_sectors; ++i) { 2150 memcpy(csum, sum->sums + index, sctx->csum_size);
2151 if (sum->sums[i].bytenr == logical) { 2151 if (index == num_sectors - 1) {
2152 memcpy(csum, &sum->sums[i].sum, sctx->csum_size);
2153 ret = 1;
2154 break;
2155 }
2156 }
2157 if (ret && i == num_sectors - 1) {
2158 list_del(&sum->list); 2152 list_del(&sum->list);
2159 kfree(sum); 2153 kfree(sum);
2160 } 2154 }
2161 return ret; 2155 return 1;
2162} 2156}
2163 2157
2164/* scrub extent tries to collect up to 64 kB for each bio */ 2158/* scrub extent tries to collect up to 64 kB for each bio */
@@ -2501,10 +2495,11 @@ again:
2501 ret = scrub_extent(sctx, extent_logical, extent_len, 2495 ret = scrub_extent(sctx, extent_logical, extent_len,
2502 extent_physical, extent_dev, flags, 2496 extent_physical, extent_dev, flags,
2503 generation, extent_mirror_num, 2497 generation, extent_mirror_num,
2504 extent_physical); 2498 extent_logical - logical + physical);
2505 if (ret) 2499 if (ret)
2506 goto out; 2500 goto out;
2507 2501
2502 scrub_free_csums(sctx);
2508 if (extent_logical + extent_len < 2503 if (extent_logical + extent_len <
2509 key.objectid + bytes) { 2504 key.objectid + bytes) {
2510 logical += increment; 2505 logical += increment;
@@ -3204,16 +3199,18 @@ out:
3204 3199
3205static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx) 3200static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
3206{ 3201{
3207 unsigned long index;
3208 struct scrub_copy_nocow_ctx *nocow_ctx = ctx; 3202 struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
3209 int ret = 0; 3203 struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
3210 struct btrfs_key key; 3204 struct btrfs_key key;
3211 struct inode *inode = NULL; 3205 struct inode *inode;
3206 struct page *page;
3212 struct btrfs_root *local_root; 3207 struct btrfs_root *local_root;
3213 u64 physical_for_dev_replace; 3208 u64 physical_for_dev_replace;
3214 u64 len; 3209 u64 len;
3215 struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info; 3210 unsigned long index;
3216 int srcu_index; 3211 int srcu_index;
3212 int ret;
3213 int err;
3217 3214
3218 key.objectid = root; 3215 key.objectid = root;
3219 key.type = BTRFS_ROOT_ITEM_KEY; 3216 key.type = BTRFS_ROOT_ITEM_KEY;
@@ -3227,6 +3224,11 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
3227 return PTR_ERR(local_root); 3224 return PTR_ERR(local_root);
3228 } 3225 }
3229 3226
3227 if (btrfs_root_refs(&local_root->root_item) == 0) {
3228 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
3229 return -ENOENT;
3230 }
3231
3230 key.type = BTRFS_INODE_ITEM_KEY; 3232 key.type = BTRFS_INODE_ITEM_KEY;
3231 key.objectid = inum; 3233 key.objectid = inum;
3232 key.offset = 0; 3234 key.offset = 0;
@@ -3235,19 +3237,21 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
3235 if (IS_ERR(inode)) 3237 if (IS_ERR(inode))
3236 return PTR_ERR(inode); 3238 return PTR_ERR(inode);
3237 3239
3240 /* Avoid truncate/dio/punch hole.. */
3241 mutex_lock(&inode->i_mutex);
3242 inode_dio_wait(inode);
3243
3244 ret = 0;
3238 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; 3245 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
3239 len = nocow_ctx->len; 3246 len = nocow_ctx->len;
3240 while (len >= PAGE_CACHE_SIZE) { 3247 while (len >= PAGE_CACHE_SIZE) {
3241 struct page *page = NULL;
3242 int ret_sub;
3243
3244 index = offset >> PAGE_CACHE_SHIFT; 3248 index = offset >> PAGE_CACHE_SHIFT;
3245 3249again:
3246 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); 3250 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
3247 if (!page) { 3251 if (!page) {
3248 pr_err("find_or_create_page() failed\n"); 3252 pr_err("find_or_create_page() failed\n");
3249 ret = -ENOMEM; 3253 ret = -ENOMEM;
3250 goto next_page; 3254 goto out;
3251 } 3255 }
3252 3256
3253 if (PageUptodate(page)) { 3257 if (PageUptodate(page)) {
@@ -3255,39 +3259,49 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
3255 goto next_page; 3259 goto next_page;
3256 } else { 3260 } else {
3257 ClearPageError(page); 3261 ClearPageError(page);
3258 ret_sub = extent_read_full_page(&BTRFS_I(inode)-> 3262 err = extent_read_full_page(&BTRFS_I(inode)->
3259 io_tree, 3263 io_tree,
3260 page, btrfs_get_extent, 3264 page, btrfs_get_extent,
3261 nocow_ctx->mirror_num); 3265 nocow_ctx->mirror_num);
3262 if (ret_sub) { 3266 if (err) {
3263 ret = ret_sub; 3267 ret = err;
3264 goto next_page; 3268 goto next_page;
3265 } 3269 }
3266 wait_on_page_locked(page); 3270
3271 lock_page(page);
3272 /*
 3273 * If the page has been removed from the page cache,
 3274 * the data on it is meaningless, because it may be
 3275 * an old one, the new data may be written into the new
3276 * page in the page cache.
3277 */
3278 if (page->mapping != inode->i_mapping) {
3279 page_cache_release(page);
3280 goto again;
3281 }
3267 if (!PageUptodate(page)) { 3282 if (!PageUptodate(page)) {
3268 ret = -EIO; 3283 ret = -EIO;
3269 goto next_page; 3284 goto next_page;
3270 } 3285 }
3271 } 3286 }
3272 ret_sub = write_page_nocow(nocow_ctx->sctx, 3287 err = write_page_nocow(nocow_ctx->sctx,
3273 physical_for_dev_replace, page); 3288 physical_for_dev_replace, page);
3274 if (ret_sub) { 3289 if (err)
3275 ret = ret_sub; 3290 ret = err;
3276 goto next_page;
3277 }
3278
3279next_page: 3291next_page:
3280 if (page) { 3292 unlock_page(page);
3281 unlock_page(page); 3293 page_cache_release(page);
3282 put_page(page); 3294
3283 } 3295 if (ret)
3296 break;
3297
3284 offset += PAGE_CACHE_SIZE; 3298 offset += PAGE_CACHE_SIZE;
3285 physical_for_dev_replace += PAGE_CACHE_SIZE; 3299 physical_for_dev_replace += PAGE_CACHE_SIZE;
3286 len -= PAGE_CACHE_SIZE; 3300 len -= PAGE_CACHE_SIZE;
3287 } 3301 }
3288 3302out:
3289 if (inode) 3303 mutex_unlock(&inode->i_mutex);
3290 iput(inode); 3304 iput(inode);
3291 return ret; 3305 return ret;
3292} 3306}
3293 3307
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index ff40f1c00ce3..d3f3b43cae0b 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -158,7 +158,7 @@ static void fs_path_reset(struct fs_path *p)
158 } 158 }
159} 159}
160 160
161static struct fs_path *fs_path_alloc(struct send_ctx *sctx) 161static struct fs_path *fs_path_alloc(void)
162{ 162{
163 struct fs_path *p; 163 struct fs_path *p;
164 164
@@ -173,11 +173,11 @@ static struct fs_path *fs_path_alloc(struct send_ctx *sctx)
173 return p; 173 return p;
174} 174}
175 175
176static struct fs_path *fs_path_alloc_reversed(struct send_ctx *sctx) 176static struct fs_path *fs_path_alloc_reversed(void)
177{ 177{
178 struct fs_path *p; 178 struct fs_path *p;
179 179
180 p = fs_path_alloc(sctx); 180 p = fs_path_alloc();
181 if (!p) 181 if (!p)
182 return NULL; 182 return NULL;
183 p->reversed = 1; 183 p->reversed = 1;
@@ -185,7 +185,7 @@ static struct fs_path *fs_path_alloc_reversed(struct send_ctx *sctx)
185 return p; 185 return p;
186} 186}
187 187
188static void fs_path_free(struct send_ctx *sctx, struct fs_path *p) 188static void fs_path_free(struct fs_path *p)
189{ 189{
190 if (!p) 190 if (!p)
191 return; 191 return;
@@ -753,8 +753,7 @@ typedef int (*iterate_inode_ref_t)(int num, u64 dir, int index,
753 * 753 *
754 * path must point to the INODE_REF or INODE_EXTREF when called. 754 * path must point to the INODE_REF or INODE_EXTREF when called.
755 */ 755 */
756static int iterate_inode_ref(struct send_ctx *sctx, 756static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
757 struct btrfs_root *root, struct btrfs_path *path,
758 struct btrfs_key *found_key, int resolve, 757 struct btrfs_key *found_key, int resolve,
759 iterate_inode_ref_t iterate, void *ctx) 758 iterate_inode_ref_t iterate, void *ctx)
760{ 759{
@@ -777,13 +776,13 @@ static int iterate_inode_ref(struct send_ctx *sctx,
777 unsigned long elem_size; 776 unsigned long elem_size;
778 unsigned long ptr; 777 unsigned long ptr;
779 778
780 p = fs_path_alloc_reversed(sctx); 779 p = fs_path_alloc_reversed();
781 if (!p) 780 if (!p)
782 return -ENOMEM; 781 return -ENOMEM;
783 782
784 tmp_path = alloc_path_for_send(); 783 tmp_path = alloc_path_for_send();
785 if (!tmp_path) { 784 if (!tmp_path) {
786 fs_path_free(sctx, p); 785 fs_path_free(p);
787 return -ENOMEM; 786 return -ENOMEM;
788 } 787 }
789 788
@@ -858,7 +857,7 @@ static int iterate_inode_ref(struct send_ctx *sctx,
858 857
859out: 858out:
860 btrfs_free_path(tmp_path); 859 btrfs_free_path(tmp_path);
861 fs_path_free(sctx, p); 860 fs_path_free(p);
862 return ret; 861 return ret;
863} 862}
864 863
@@ -874,8 +873,7 @@ typedef int (*iterate_dir_item_t)(int num, struct btrfs_key *di_key,
874 * 873 *
875 * path must point to the dir item when called. 874 * path must point to the dir item when called.
876 */ 875 */
877static int iterate_dir_item(struct send_ctx *sctx, 876static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
878 struct btrfs_root *root, struct btrfs_path *path,
879 struct btrfs_key *found_key, 877 struct btrfs_key *found_key,
880 iterate_dir_item_t iterate, void *ctx) 878 iterate_dir_item_t iterate, void *ctx)
881{ 879{
@@ -990,7 +988,7 @@ static int __copy_first_ref(int num, u64 dir, int index,
990 * Retrieve the first path of an inode. If an inode has more then one 988 * Retrieve the first path of an inode. If an inode has more then one
991 * ref/hardlink, this is ignored. 989 * ref/hardlink, this is ignored.
992 */ 990 */
993static int get_inode_path(struct send_ctx *sctx, struct btrfs_root *root, 991static int get_inode_path(struct btrfs_root *root,
994 u64 ino, struct fs_path *path) 992 u64 ino, struct fs_path *path)
995{ 993{
996 int ret; 994 int ret;
@@ -1022,8 +1020,8 @@ static int get_inode_path(struct send_ctx *sctx, struct btrfs_root *root,
1022 goto out; 1020 goto out;
1023 } 1021 }
1024 1022
1025 ret = iterate_inode_ref(sctx, root, p, &found_key, 1, 1023 ret = iterate_inode_ref(root, p, &found_key, 1,
1026 __copy_first_ref, path); 1024 __copy_first_ref, path);
1027 if (ret < 0) 1025 if (ret < 0)
1028 goto out; 1026 goto out;
1029 ret = 0; 1027 ret = 0;
@@ -1314,8 +1312,7 @@ out:
1314 return ret; 1312 return ret;
1315} 1313}
1316 1314
1317static int read_symlink(struct send_ctx *sctx, 1315static int read_symlink(struct btrfs_root *root,
1318 struct btrfs_root *root,
1319 u64 ino, 1316 u64 ino,
1320 struct fs_path *dest) 1317 struct fs_path *dest)
1321{ 1318{
@@ -1562,8 +1559,7 @@ out:
1562 * Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir, 1559 * Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir,
1563 * generation of the parent dir and the name of the dir entry. 1560 * generation of the parent dir and the name of the dir entry.
1564 */ 1561 */
1565static int get_first_ref(struct send_ctx *sctx, 1562static int get_first_ref(struct btrfs_root *root, u64 ino,
1566 struct btrfs_root *root, u64 ino,
1567 u64 *dir, u64 *dir_gen, struct fs_path *name) 1563 u64 *dir, u64 *dir_gen, struct fs_path *name)
1568{ 1564{
1569 int ret; 1565 int ret;
@@ -1628,8 +1624,7 @@ out:
1628 return ret; 1624 return ret;
1629} 1625}
1630 1626
1631static int is_first_ref(struct send_ctx *sctx, 1627static int is_first_ref(struct btrfs_root *root,
1632 struct btrfs_root *root,
1633 u64 ino, u64 dir, 1628 u64 ino, u64 dir,
1634 const char *name, int name_len) 1629 const char *name, int name_len)
1635{ 1630{
@@ -1638,11 +1633,11 @@ static int is_first_ref(struct send_ctx *sctx,
1638 u64 tmp_dir; 1633 u64 tmp_dir;
1639 u64 tmp_dir_gen; 1634 u64 tmp_dir_gen;
1640 1635
1641 tmp_name = fs_path_alloc(sctx); 1636 tmp_name = fs_path_alloc();
1642 if (!tmp_name) 1637 if (!tmp_name)
1643 return -ENOMEM; 1638 return -ENOMEM;
1644 1639
1645 ret = get_first_ref(sctx, root, ino, &tmp_dir, &tmp_dir_gen, tmp_name); 1640 ret = get_first_ref(root, ino, &tmp_dir, &tmp_dir_gen, tmp_name);
1646 if (ret < 0) 1641 if (ret < 0)
1647 goto out; 1642 goto out;
1648 1643
@@ -1654,7 +1649,7 @@ static int is_first_ref(struct send_ctx *sctx,
1654 ret = !memcmp(tmp_name->start, name, name_len); 1649 ret = !memcmp(tmp_name->start, name, name_len);
1655 1650
1656out: 1651out:
1657 fs_path_free(sctx, tmp_name); 1652 fs_path_free(tmp_name);
1658 return ret; 1653 return ret;
1659} 1654}
1660 1655
@@ -1783,11 +1778,11 @@ static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
1783 if (!sctx->parent_root) 1778 if (!sctx->parent_root)
1784 goto out; 1779 goto out;
1785 1780
1786 name = fs_path_alloc(sctx); 1781 name = fs_path_alloc();
1787 if (!name) 1782 if (!name)
1788 return -ENOMEM; 1783 return -ENOMEM;
1789 1784
1790 ret = get_first_ref(sctx, sctx->parent_root, ino, &dir, &dir_gen, name); 1785 ret = get_first_ref(sctx->parent_root, ino, &dir, &dir_gen, name);
1791 if (ret < 0) 1786 if (ret < 0)
1792 goto out; 1787 goto out;
1793 1788
@@ -1795,7 +1790,7 @@ static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
1795 name->start, fs_path_len(name)); 1790 name->start, fs_path_len(name));
1796 1791
1797out: 1792out:
1798 fs_path_free(sctx, name); 1793 fs_path_free(name);
1799 return ret; 1794 return ret;
1800} 1795}
1801 1796
@@ -1979,11 +1974,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
1979 * send_root or parent_root for ref lookup. 1974 * send_root or parent_root for ref lookup.
1980 */ 1975 */
1981 if (ino < sctx->send_progress) 1976 if (ino < sctx->send_progress)
1982 ret = get_first_ref(sctx, sctx->send_root, ino, 1977 ret = get_first_ref(sctx->send_root, ino,
1983 parent_ino, parent_gen, dest); 1978 parent_ino, parent_gen, dest);
1984 else 1979 else
1985 ret = get_first_ref(sctx, sctx->parent_root, ino, 1980 ret = get_first_ref(sctx->parent_root, ino,
1986 parent_ino, parent_gen, dest); 1981 parent_ino, parent_gen, dest);
1987 if (ret < 0) 1982 if (ret < 0)
1988 goto out; 1983 goto out;
1989 1984
@@ -2070,7 +2065,7 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
2070 u64 parent_gen = 0; 2065 u64 parent_gen = 0;
2071 int stop = 0; 2066 int stop = 0;
2072 2067
2073 name = fs_path_alloc(sctx); 2068 name = fs_path_alloc();
2074 if (!name) { 2069 if (!name) {
2075 ret = -ENOMEM; 2070 ret = -ENOMEM;
2076 goto out; 2071 goto out;
@@ -2098,7 +2093,7 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
2098 } 2093 }
2099 2094
2100out: 2095out:
2101 fs_path_free(sctx, name); 2096 fs_path_free(name);
2102 if (!ret) 2097 if (!ret)
2103 fs_path_unreverse(dest); 2098 fs_path_unreverse(dest);
2104 return ret; 2099 return ret;
@@ -2263,7 +2258,7 @@ static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size)
2263 2258
2264verbose_printk("btrfs: send_truncate %llu size=%llu\n", ino, size); 2259verbose_printk("btrfs: send_truncate %llu size=%llu\n", ino, size);
2265 2260
2266 p = fs_path_alloc(sctx); 2261 p = fs_path_alloc();
2267 if (!p) 2262 if (!p)
2268 return -ENOMEM; 2263 return -ENOMEM;
2269 2264
@@ -2281,7 +2276,7 @@ verbose_printk("btrfs: send_truncate %llu size=%llu\n", ino, size);
2281 2276
2282tlv_put_failure: 2277tlv_put_failure:
2283out: 2278out:
2284 fs_path_free(sctx, p); 2279 fs_path_free(p);
2285 return ret; 2280 return ret;
2286} 2281}
2287 2282
@@ -2292,7 +2287,7 @@ static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode)
2292 2287
2293verbose_printk("btrfs: send_chmod %llu mode=%llu\n", ino, mode); 2288verbose_printk("btrfs: send_chmod %llu mode=%llu\n", ino, mode);
2294 2289
2295 p = fs_path_alloc(sctx); 2290 p = fs_path_alloc();
2296 if (!p) 2291 if (!p)
2297 return -ENOMEM; 2292 return -ENOMEM;
2298 2293
@@ -2310,7 +2305,7 @@ verbose_printk("btrfs: send_chmod %llu mode=%llu\n", ino, mode);
2310 2305
2311tlv_put_failure: 2306tlv_put_failure:
2312out: 2307out:
2313 fs_path_free(sctx, p); 2308 fs_path_free(p);
2314 return ret; 2309 return ret;
2315} 2310}
2316 2311
@@ -2321,7 +2316,7 @@ static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid)
2321 2316
2322verbose_printk("btrfs: send_chown %llu uid=%llu, gid=%llu\n", ino, uid, gid); 2317verbose_printk("btrfs: send_chown %llu uid=%llu, gid=%llu\n", ino, uid, gid);
2323 2318
2324 p = fs_path_alloc(sctx); 2319 p = fs_path_alloc();
2325 if (!p) 2320 if (!p)
2326 return -ENOMEM; 2321 return -ENOMEM;
2327 2322
@@ -2340,7 +2335,7 @@ verbose_printk("btrfs: send_chown %llu uid=%llu, gid=%llu\n", ino, uid, gid);
2340 2335
2341tlv_put_failure: 2336tlv_put_failure:
2342out: 2337out:
2343 fs_path_free(sctx, p); 2338 fs_path_free(p);
2344 return ret; 2339 return ret;
2345} 2340}
2346 2341
@@ -2356,7 +2351,7 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
2356 2351
2357verbose_printk("btrfs: send_utimes %llu\n", ino); 2352verbose_printk("btrfs: send_utimes %llu\n", ino);
2358 2353
2359 p = fs_path_alloc(sctx); 2354 p = fs_path_alloc();
2360 if (!p) 2355 if (!p)
2361 return -ENOMEM; 2356 return -ENOMEM;
2362 2357
@@ -2397,7 +2392,7 @@ verbose_printk("btrfs: send_utimes %llu\n", ino);
2397 2392
2398tlv_put_failure: 2393tlv_put_failure:
2399out: 2394out:
2400 fs_path_free(sctx, p); 2395 fs_path_free(p);
2401 btrfs_free_path(path); 2396 btrfs_free_path(path);
2402 return ret; 2397 return ret;
2403} 2398}
@@ -2418,7 +2413,7 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino)
2418 2413
2419verbose_printk("btrfs: send_create_inode %llu\n", ino); 2414verbose_printk("btrfs: send_create_inode %llu\n", ino);
2420 2415
2421 p = fs_path_alloc(sctx); 2416 p = fs_path_alloc();
2422 if (!p) 2417 if (!p)
2423 return -ENOMEM; 2418 return -ENOMEM;
2424 2419
@@ -2459,7 +2454,7 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino);
2459 2454
2460 if (S_ISLNK(mode)) { 2455 if (S_ISLNK(mode)) {
2461 fs_path_reset(p); 2456 fs_path_reset(p);
2462 ret = read_symlink(sctx, sctx->send_root, ino, p); 2457 ret = read_symlink(sctx->send_root, ino, p);
2463 if (ret < 0) 2458 if (ret < 0)
2464 goto out; 2459 goto out;
2465 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p); 2460 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p);
@@ -2476,7 +2471,7 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino);
2476 2471
2477tlv_put_failure: 2472tlv_put_failure:
2478out: 2473out:
2479 fs_path_free(sctx, p); 2474 fs_path_free(p);
2480 return ret; 2475 return ret;
2481} 2476}
2482 2477
@@ -2615,13 +2610,13 @@ static int record_ref(struct list_head *head, u64 dir,
2615 return 0; 2610 return 0;
2616} 2611}
2617 2612
2618static void __free_recorded_refs(struct send_ctx *sctx, struct list_head *head) 2613static void __free_recorded_refs(struct list_head *head)
2619{ 2614{
2620 struct recorded_ref *cur; 2615 struct recorded_ref *cur;
2621 2616
2622 while (!list_empty(head)) { 2617 while (!list_empty(head)) {
2623 cur = list_entry(head->next, struct recorded_ref, list); 2618 cur = list_entry(head->next, struct recorded_ref, list);
2624 fs_path_free(sctx, cur->full_path); 2619 fs_path_free(cur->full_path);
2625 list_del(&cur->list); 2620 list_del(&cur->list);
2626 kfree(cur); 2621 kfree(cur);
2627 } 2622 }
@@ -2629,8 +2624,8 @@ static void __free_recorded_refs(struct send_ctx *sctx, struct list_head *head)
2629 2624
2630static void free_recorded_refs(struct send_ctx *sctx) 2625static void free_recorded_refs(struct send_ctx *sctx)
2631{ 2626{
2632 __free_recorded_refs(sctx, &sctx->new_refs); 2627 __free_recorded_refs(&sctx->new_refs);
2633 __free_recorded_refs(sctx, &sctx->deleted_refs); 2628 __free_recorded_refs(&sctx->deleted_refs);
2634} 2629}
2635 2630
2636/* 2631/*
@@ -2644,7 +2639,7 @@ static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen,
2644 int ret; 2639 int ret;
2645 struct fs_path *orphan; 2640 struct fs_path *orphan;
2646 2641
2647 orphan = fs_path_alloc(sctx); 2642 orphan = fs_path_alloc();
2648 if (!orphan) 2643 if (!orphan)
2649 return -ENOMEM; 2644 return -ENOMEM;
2650 2645
@@ -2655,7 +2650,7 @@ static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen,
2655 ret = send_rename(sctx, path, orphan); 2650 ret = send_rename(sctx, path, orphan);
2656 2651
2657out: 2652out:
2658 fs_path_free(sctx, orphan); 2653 fs_path_free(orphan);
2659 return ret; 2654 return ret;
2660} 2655}
2661 2656
@@ -2746,7 +2741,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
2746 */ 2741 */
2747 BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID); 2742 BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID);
2748 2743
2749 valid_path = fs_path_alloc(sctx); 2744 valid_path = fs_path_alloc();
2750 if (!valid_path) { 2745 if (!valid_path) {
2751 ret = -ENOMEM; 2746 ret = -ENOMEM;
2752 goto out; 2747 goto out;
@@ -2843,9 +2838,9 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
2843 if (ret < 0) 2838 if (ret < 0)
2844 goto out; 2839 goto out;
2845 if (ret) { 2840 if (ret) {
2846 ret = is_first_ref(sctx, sctx->parent_root, 2841 ret = is_first_ref(sctx->parent_root,
2847 ow_inode, cur->dir, cur->name, 2842 ow_inode, cur->dir, cur->name,
2848 cur->name_len); 2843 cur->name_len);
2849 if (ret < 0) 2844 if (ret < 0)
2850 goto out; 2845 goto out;
2851 if (ret) { 2846 if (ret) {
@@ -3024,7 +3019,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
3024out: 3019out:
3025 free_recorded_refs(sctx); 3020 free_recorded_refs(sctx);
3026 ulist_free(check_dirs); 3021 ulist_free(check_dirs);
3027 fs_path_free(sctx, valid_path); 3022 fs_path_free(valid_path);
3028 return ret; 3023 return ret;
3029} 3024}
3030 3025
@@ -3037,7 +3032,7 @@ static int __record_new_ref(int num, u64 dir, int index,
3037 struct fs_path *p; 3032 struct fs_path *p;
3038 u64 gen; 3033 u64 gen;
3039 3034
3040 p = fs_path_alloc(sctx); 3035 p = fs_path_alloc();
3041 if (!p) 3036 if (!p)
3042 return -ENOMEM; 3037 return -ENOMEM;
3043 3038
@@ -3057,7 +3052,7 @@ static int __record_new_ref(int num, u64 dir, int index,
3057 3052
3058out: 3053out:
3059 if (ret) 3054 if (ret)
3060 fs_path_free(sctx, p); 3055 fs_path_free(p);
3061 return ret; 3056 return ret;
3062} 3057}
3063 3058
@@ -3070,7 +3065,7 @@ static int __record_deleted_ref(int num, u64 dir, int index,
3070 struct fs_path *p; 3065 struct fs_path *p;
3071 u64 gen; 3066 u64 gen;
3072 3067
3073 p = fs_path_alloc(sctx); 3068 p = fs_path_alloc();
3074 if (!p) 3069 if (!p)
3075 return -ENOMEM; 3070 return -ENOMEM;
3076 3071
@@ -3090,7 +3085,7 @@ static int __record_deleted_ref(int num, u64 dir, int index,
3090 3085
3091out: 3086out:
3092 if (ret) 3087 if (ret)
3093 fs_path_free(sctx, p); 3088 fs_path_free(p);
3094 return ret; 3089 return ret;
3095} 3090}
3096 3091
@@ -3098,8 +3093,8 @@ static int record_new_ref(struct send_ctx *sctx)
3098{ 3093{
3099 int ret; 3094 int ret;
3100 3095
3101 ret = iterate_inode_ref(sctx, sctx->send_root, sctx->left_path, 3096 ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
3102 sctx->cmp_key, 0, __record_new_ref, sctx); 3097 sctx->cmp_key, 0, __record_new_ref, sctx);
3103 if (ret < 0) 3098 if (ret < 0)
3104 goto out; 3099 goto out;
3105 ret = 0; 3100 ret = 0;
@@ -3112,8 +3107,8 @@ static int record_deleted_ref(struct send_ctx *sctx)
3112{ 3107{
3113 int ret; 3108 int ret;
3114 3109
3115 ret = iterate_inode_ref(sctx, sctx->parent_root, sctx->right_path, 3110 ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
3116 sctx->cmp_key, 0, __record_deleted_ref, sctx); 3111 sctx->cmp_key, 0, __record_deleted_ref, sctx);
3117 if (ret < 0) 3112 if (ret < 0)
3118 goto out; 3113 goto out;
3119 ret = 0; 3114 ret = 0;
@@ -3142,8 +3137,7 @@ static int __find_iref(int num, u64 dir, int index,
3142 return 0; 3137 return 0;
3143} 3138}
3144 3139
3145static int find_iref(struct send_ctx *sctx, 3140static int find_iref(struct btrfs_root *root,
3146 struct btrfs_root *root,
3147 struct btrfs_path *path, 3141 struct btrfs_path *path,
3148 struct btrfs_key *key, 3142 struct btrfs_key *key,
3149 u64 dir, struct fs_path *name) 3143 u64 dir, struct fs_path *name)
@@ -3155,7 +3149,7 @@ static int find_iref(struct send_ctx *sctx,
3155 ctx.name = name; 3149 ctx.name = name;
3156 ctx.found_idx = -1; 3150 ctx.found_idx = -1;
3157 3151
3158 ret = iterate_inode_ref(sctx, root, path, key, 0, __find_iref, &ctx); 3152 ret = iterate_inode_ref(root, path, key, 0, __find_iref, &ctx);
3159 if (ret < 0) 3153 if (ret < 0)
3160 return ret; 3154 return ret;
3161 3155
@@ -3172,7 +3166,7 @@ static int __record_changed_new_ref(int num, u64 dir, int index,
3172 int ret; 3166 int ret;
3173 struct send_ctx *sctx = ctx; 3167 struct send_ctx *sctx = ctx;
3174 3168
3175 ret = find_iref(sctx, sctx->parent_root, sctx->right_path, 3169 ret = find_iref(sctx->parent_root, sctx->right_path,
3176 sctx->cmp_key, dir, name); 3170 sctx->cmp_key, dir, name);
3177 if (ret == -ENOENT) 3171 if (ret == -ENOENT)
3178 ret = __record_new_ref(num, dir, index, name, sctx); 3172 ret = __record_new_ref(num, dir, index, name, sctx);
@@ -3189,7 +3183,7 @@ static int __record_changed_deleted_ref(int num, u64 dir, int index,
3189 int ret; 3183 int ret;
3190 struct send_ctx *sctx = ctx; 3184 struct send_ctx *sctx = ctx;
3191 3185
3192 ret = find_iref(sctx, sctx->send_root, sctx->left_path, sctx->cmp_key, 3186 ret = find_iref(sctx->send_root, sctx->left_path, sctx->cmp_key,
3193 dir, name); 3187 dir, name);
3194 if (ret == -ENOENT) 3188 if (ret == -ENOENT)
3195 ret = __record_deleted_ref(num, dir, index, name, sctx); 3189 ret = __record_deleted_ref(num, dir, index, name, sctx);
@@ -3203,11 +3197,11 @@ static int record_changed_ref(struct send_ctx *sctx)
3203{ 3197{
3204 int ret = 0; 3198 int ret = 0;
3205 3199
3206 ret = iterate_inode_ref(sctx, sctx->send_root, sctx->left_path, 3200 ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
3207 sctx->cmp_key, 0, __record_changed_new_ref, sctx); 3201 sctx->cmp_key, 0, __record_changed_new_ref, sctx);
3208 if (ret < 0) 3202 if (ret < 0)
3209 goto out; 3203 goto out;
3210 ret = iterate_inode_ref(sctx, sctx->parent_root, sctx->right_path, 3204 ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
3211 sctx->cmp_key, 0, __record_changed_deleted_ref, sctx); 3205 sctx->cmp_key, 0, __record_changed_deleted_ref, sctx);
3212 if (ret < 0) 3206 if (ret < 0)
3213 goto out; 3207 goto out;
@@ -3266,8 +3260,7 @@ static int process_all_refs(struct send_ctx *sctx,
3266 found_key.type != BTRFS_INODE_EXTREF_KEY)) 3260 found_key.type != BTRFS_INODE_EXTREF_KEY))
3267 break; 3261 break;
3268 3262
3269 ret = iterate_inode_ref(sctx, root, path, &found_key, 0, cb, 3263 ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx);
3270 sctx);
3271 btrfs_release_path(path); 3264 btrfs_release_path(path);
3272 if (ret < 0) 3265 if (ret < 0)
3273 goto out; 3266 goto out;
@@ -3335,7 +3328,7 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key,
3335 struct fs_path *p; 3328 struct fs_path *p;
3336 posix_acl_xattr_header dummy_acl; 3329 posix_acl_xattr_header dummy_acl;
3337 3330
3338 p = fs_path_alloc(sctx); 3331 p = fs_path_alloc();
3339 if (!p) 3332 if (!p)
3340 return -ENOMEM; 3333 return -ENOMEM;
3341 3334
@@ -3362,7 +3355,7 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key,
3362 ret = send_set_xattr(sctx, p, name, name_len, data, data_len); 3355 ret = send_set_xattr(sctx, p, name, name_len, data, data_len);
3363 3356
3364out: 3357out:
3365 fs_path_free(sctx, p); 3358 fs_path_free(p);
3366 return ret; 3359 return ret;
3367} 3360}
3368 3361
@@ -3375,7 +3368,7 @@ static int __process_deleted_xattr(int num, struct btrfs_key *di_key,
3375 struct send_ctx *sctx = ctx; 3368 struct send_ctx *sctx = ctx;
3376 struct fs_path *p; 3369 struct fs_path *p;
3377 3370
3378 p = fs_path_alloc(sctx); 3371 p = fs_path_alloc();
3379 if (!p) 3372 if (!p)
3380 return -ENOMEM; 3373 return -ENOMEM;
3381 3374
@@ -3386,7 +3379,7 @@ static int __process_deleted_xattr(int num, struct btrfs_key *di_key,
3386 ret = send_remove_xattr(sctx, p, name, name_len); 3379 ret = send_remove_xattr(sctx, p, name, name_len);
3387 3380
3388out: 3381out:
3389 fs_path_free(sctx, p); 3382 fs_path_free(p);
3390 return ret; 3383 return ret;
3391} 3384}
3392 3385
@@ -3394,8 +3387,8 @@ static int process_new_xattr(struct send_ctx *sctx)
3394{ 3387{
3395 int ret = 0; 3388 int ret = 0;
3396 3389
3397 ret = iterate_dir_item(sctx, sctx->send_root, sctx->left_path, 3390 ret = iterate_dir_item(sctx->send_root, sctx->left_path,
3398 sctx->cmp_key, __process_new_xattr, sctx); 3391 sctx->cmp_key, __process_new_xattr, sctx);
3399 3392
3400 return ret; 3393 return ret;
3401} 3394}
@@ -3404,8 +3397,8 @@ static int process_deleted_xattr(struct send_ctx *sctx)
3404{ 3397{
3405 int ret; 3398 int ret;
3406 3399
3407 ret = iterate_dir_item(sctx, sctx->parent_root, sctx->right_path, 3400 ret = iterate_dir_item(sctx->parent_root, sctx->right_path,
3408 sctx->cmp_key, __process_deleted_xattr, sctx); 3401 sctx->cmp_key, __process_deleted_xattr, sctx);
3409 3402
3410 return ret; 3403 return ret;
3411} 3404}
@@ -3429,17 +3422,15 @@ static int __find_xattr(int num, struct btrfs_key *di_key,
3429 strncmp(name, ctx->name, name_len) == 0) { 3422 strncmp(name, ctx->name, name_len) == 0) {
3430 ctx->found_idx = num; 3423 ctx->found_idx = num;
3431 ctx->found_data_len = data_len; 3424 ctx->found_data_len = data_len;
3432 ctx->found_data = kmalloc(data_len, GFP_NOFS); 3425 ctx->found_data = kmemdup(data, data_len, GFP_NOFS);
3433 if (!ctx->found_data) 3426 if (!ctx->found_data)
3434 return -ENOMEM; 3427 return -ENOMEM;
3435 memcpy(ctx->found_data, data, data_len);
3436 return 1; 3428 return 1;
3437 } 3429 }
3438 return 0; 3430 return 0;
3439} 3431}
3440 3432
3441static int find_xattr(struct send_ctx *sctx, 3433static int find_xattr(struct btrfs_root *root,
3442 struct btrfs_root *root,
3443 struct btrfs_path *path, 3434 struct btrfs_path *path,
3444 struct btrfs_key *key, 3435 struct btrfs_key *key,
3445 const char *name, int name_len, 3436 const char *name, int name_len,
@@ -3454,7 +3445,7 @@ static int find_xattr(struct send_ctx *sctx,
3454 ctx.found_data = NULL; 3445 ctx.found_data = NULL;
3455 ctx.found_data_len = 0; 3446 ctx.found_data_len = 0;
3456 3447
3457 ret = iterate_dir_item(sctx, root, path, key, __find_xattr, &ctx); 3448 ret = iterate_dir_item(root, path, key, __find_xattr, &ctx);
3458 if (ret < 0) 3449 if (ret < 0)
3459 return ret; 3450 return ret;
3460 3451
@@ -3480,9 +3471,9 @@ static int __process_changed_new_xattr(int num, struct btrfs_key *di_key,
3480 char *found_data = NULL; 3471 char *found_data = NULL;
3481 int found_data_len = 0; 3472 int found_data_len = 0;
3482 3473
3483 ret = find_xattr(sctx, sctx->parent_root, sctx->right_path, 3474 ret = find_xattr(sctx->parent_root, sctx->right_path,
3484 sctx->cmp_key, name, name_len, &found_data, 3475 sctx->cmp_key, name, name_len, &found_data,
3485 &found_data_len); 3476 &found_data_len);
3486 if (ret == -ENOENT) { 3477 if (ret == -ENOENT) {
3487 ret = __process_new_xattr(num, di_key, name, name_len, data, 3478 ret = __process_new_xattr(num, di_key, name, name_len, data,
3488 data_len, type, ctx); 3479 data_len, type, ctx);
@@ -3508,8 +3499,8 @@ static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key,
3508 int ret; 3499 int ret;
3509 struct send_ctx *sctx = ctx; 3500 struct send_ctx *sctx = ctx;
3510 3501
3511 ret = find_xattr(sctx, sctx->send_root, sctx->left_path, sctx->cmp_key, 3502 ret = find_xattr(sctx->send_root, sctx->left_path, sctx->cmp_key,
3512 name, name_len, NULL, NULL); 3503 name, name_len, NULL, NULL);
3513 if (ret == -ENOENT) 3504 if (ret == -ENOENT)
3514 ret = __process_deleted_xattr(num, di_key, name, name_len, data, 3505 ret = __process_deleted_xattr(num, di_key, name, name_len, data,
3515 data_len, type, ctx); 3506 data_len, type, ctx);
@@ -3523,11 +3514,11 @@ static int process_changed_xattr(struct send_ctx *sctx)
3523{ 3514{
3524 int ret = 0; 3515 int ret = 0;
3525 3516
3526 ret = iterate_dir_item(sctx, sctx->send_root, sctx->left_path, 3517 ret = iterate_dir_item(sctx->send_root, sctx->left_path,
3527 sctx->cmp_key, __process_changed_new_xattr, sctx); 3518 sctx->cmp_key, __process_changed_new_xattr, sctx);
3528 if (ret < 0) 3519 if (ret < 0)
3529 goto out; 3520 goto out;
3530 ret = iterate_dir_item(sctx, sctx->parent_root, sctx->right_path, 3521 ret = iterate_dir_item(sctx->parent_root, sctx->right_path,
3531 sctx->cmp_key, __process_changed_deleted_xattr, sctx); 3522 sctx->cmp_key, __process_changed_deleted_xattr, sctx);
3532 3523
3533out: 3524out:
@@ -3572,8 +3563,8 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
3572 goto out; 3563 goto out;
3573 } 3564 }
3574 3565
3575 ret = iterate_dir_item(sctx, root, path, &found_key, 3566 ret = iterate_dir_item(root, path, &found_key,
3576 __process_new_xattr, sctx); 3567 __process_new_xattr, sctx);
3577 if (ret < 0) 3568 if (ret < 0)
3578 goto out; 3569 goto out;
3579 3570
@@ -3598,7 +3589,7 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
3598 int num_read = 0; 3589 int num_read = 0;
3599 mm_segment_t old_fs; 3590 mm_segment_t old_fs;
3600 3591
3601 p = fs_path_alloc(sctx); 3592 p = fs_path_alloc();
3602 if (!p) 3593 if (!p)
3603 return -ENOMEM; 3594 return -ENOMEM;
3604 3595
@@ -3640,7 +3631,7 @@ verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len);
3640 3631
3641tlv_put_failure: 3632tlv_put_failure:
3642out: 3633out:
3643 fs_path_free(sctx, p); 3634 fs_path_free(p);
3644 set_fs(old_fs); 3635 set_fs(old_fs);
3645 if (ret < 0) 3636 if (ret < 0)
3646 return ret; 3637 return ret;
@@ -3663,7 +3654,7 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
3663 clone_root->root->objectid, clone_root->ino, 3654 clone_root->root->objectid, clone_root->ino,
3664 clone_root->offset); 3655 clone_root->offset);
3665 3656
3666 p = fs_path_alloc(sctx); 3657 p = fs_path_alloc();
3667 if (!p) 3658 if (!p)
3668 return -ENOMEM; 3659 return -ENOMEM;
3669 3660
@@ -3686,8 +3677,7 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
3686 goto out; 3677 goto out;
3687 ret = get_cur_path(sctx, clone_root->ino, gen, p); 3678 ret = get_cur_path(sctx, clone_root->ino, gen, p);
3688 } else { 3679 } else {
3689 ret = get_inode_path(sctx, clone_root->root, 3680 ret = get_inode_path(clone_root->root, clone_root->ino, p);
3690 clone_root->ino, p);
3691 } 3681 }
3692 if (ret < 0) 3682 if (ret < 0)
3693 goto out; 3683 goto out;
@@ -3704,7 +3694,7 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
3704 3694
3705tlv_put_failure: 3695tlv_put_failure:
3706out: 3696out:
3707 fs_path_free(sctx, p); 3697 fs_path_free(p);
3708 return ret; 3698 return ret;
3709} 3699}
3710 3700
@@ -3717,7 +3707,7 @@ static int send_update_extent(struct send_ctx *sctx,
3717 int ret = 0; 3707 int ret = 0;
3718 struct fs_path *p; 3708 struct fs_path *p;
3719 3709
3720 p = fs_path_alloc(sctx); 3710 p = fs_path_alloc();
3721 if (!p) 3711 if (!p)
3722 return -ENOMEM; 3712 return -ENOMEM;
3723 3713
@@ -3737,7 +3727,7 @@ static int send_update_extent(struct send_ctx *sctx,
3737 3727
3738tlv_put_failure: 3728tlv_put_failure:
3739out: 3729out:
3740 fs_path_free(sctx, p); 3730 fs_path_free(p);
3741 return ret; 3731 return ret;
3742} 3732}
3743 3733
@@ -4579,6 +4569,41 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
4579 send_root = BTRFS_I(file_inode(mnt_file))->root; 4569 send_root = BTRFS_I(file_inode(mnt_file))->root;
4580 fs_info = send_root->fs_info; 4570 fs_info = send_root->fs_info;
4581 4571
4572 /*
4573 * This is done when we lookup the root, it should already be complete
4574 * by the time we get here.
4575 */
4576 WARN_ON(send_root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE);
4577
4578 /*
4579 * If we just created this root we need to make sure that the orphan
4580 * cleanup has been done and committed since we search the commit root,
4581 * so check its commit root transid with our otransid and if they match
4582 * commit the transaction to make sure everything is updated.
4583 */
4584 down_read(&send_root->fs_info->extent_commit_sem);
4585 if (btrfs_header_generation(send_root->commit_root) ==
4586 btrfs_root_otransid(&send_root->root_item)) {
4587 struct btrfs_trans_handle *trans;
4588
4589 up_read(&send_root->fs_info->extent_commit_sem);
4590
4591 trans = btrfs_attach_transaction_barrier(send_root);
4592 if (IS_ERR(trans)) {
4593 if (PTR_ERR(trans) != -ENOENT) {
4594 ret = PTR_ERR(trans);
4595 goto out;
4596 }
4597 /* ENOENT means theres no transaction */
4598 } else {
4599 ret = btrfs_commit_transaction(trans, send_root);
4600 if (ret)
4601 goto out;
4602 }
4603 } else {
4604 up_read(&send_root->fs_info->extent_commit_sem);
4605 }
4606
4582 arg = memdup_user(arg_, sizeof(*arg)); 4607 arg = memdup_user(arg_, sizeof(*arg));
4583 if (IS_ERR(arg)) { 4608 if (IS_ERR(arg)) {
4584 ret = PTR_ERR(arg); 4609 ret = PTR_ERR(arg);
@@ -4663,10 +4688,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
4663 key.type = BTRFS_ROOT_ITEM_KEY; 4688 key.type = BTRFS_ROOT_ITEM_KEY;
4664 key.offset = (u64)-1; 4689 key.offset = (u64)-1;
4665 clone_root = btrfs_read_fs_root_no_name(fs_info, &key); 4690 clone_root = btrfs_read_fs_root_no_name(fs_info, &key);
4666 if (!clone_root) {
4667 ret = -EINVAL;
4668 goto out;
4669 }
4670 if (IS_ERR(clone_root)) { 4691 if (IS_ERR(clone_root)) {
4671 ret = PTR_ERR(clone_root); 4692 ret = PTR_ERR(clone_root);
4672 goto out; 4693 goto out;
@@ -4682,8 +4703,8 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
4682 key.type = BTRFS_ROOT_ITEM_KEY; 4703 key.type = BTRFS_ROOT_ITEM_KEY;
4683 key.offset = (u64)-1; 4704 key.offset = (u64)-1;
4684 sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key); 4705 sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key);
4685 if (!sctx->parent_root) { 4706 if (IS_ERR(sctx->parent_root)) {
4686 ret = -EINVAL; 4707 ret = PTR_ERR(sctx->parent_root);
4687 goto out; 4708 goto out;
4688 } 4709 }
4689 } 4710 }
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f0857e092a3c..8eb6191d86da 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -51,7 +51,6 @@
51#include "print-tree.h" 51#include "print-tree.h"
52#include "xattr.h" 52#include "xattr.h"
53#include "volumes.h" 53#include "volumes.h"
54#include "version.h"
55#include "export.h" 54#include "export.h"
56#include "compression.h" 55#include "compression.h"
57#include "rcu-string.h" 56#include "rcu-string.h"
@@ -266,6 +265,9 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
266 return; 265 return;
267 } 266 }
268 ACCESS_ONCE(trans->transaction->aborted) = errno; 267 ACCESS_ONCE(trans->transaction->aborted) = errno;
268 /* Wake up anybody who may be waiting on this transaction */
269 wake_up(&root->fs_info->transaction_wait);
270 wake_up(&root->fs_info->transaction_blocked_wait);
269 __btrfs_std_error(root->fs_info, function, line, errno, NULL); 271 __btrfs_std_error(root->fs_info, function, line, errno, NULL);
270} 272}
271/* 273/*
@@ -776,9 +778,6 @@ find_root:
776 if (IS_ERR(new_root)) 778 if (IS_ERR(new_root))
777 return ERR_CAST(new_root); 779 return ERR_CAST(new_root);
778 780
779 if (btrfs_root_refs(&new_root->root_item) == 0)
780 return ERR_PTR(-ENOENT);
781
782 dir_id = btrfs_root_dirid(&new_root->root_item); 781 dir_id = btrfs_root_dirid(&new_root->root_item);
783setup_root: 782setup_root:
784 location.objectid = dir_id; 783 location.objectid = dir_id;
@@ -866,7 +865,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
866 return 0; 865 return 0;
867 } 866 }
868 867
869 btrfs_wait_ordered_extents(root, 1); 868 btrfs_wait_all_ordered_extents(fs_info, 1);
870 869
871 trans = btrfs_attach_transaction_barrier(root); 870 trans = btrfs_attach_transaction_barrier(root);
872 if (IS_ERR(trans)) { 871 if (IS_ERR(trans)) {
@@ -1685,6 +1684,18 @@ static void btrfs_interface_exit(void)
1685 printk(KERN_INFO "btrfs: misc_deregister failed for control device\n"); 1684 printk(KERN_INFO "btrfs: misc_deregister failed for control device\n");
1686} 1685}
1687 1686
1687static void btrfs_print_info(void)
1688{
1689 printk(KERN_INFO "Btrfs loaded"
1690#ifdef CONFIG_BTRFS_DEBUG
1691 ", debug=on"
1692#endif
1693#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
1694 ", integrity-checker=on"
1695#endif
1696 "\n");
1697}
1698
1688static int __init init_btrfs_fs(void) 1699static int __init init_btrfs_fs(void)
1689{ 1700{
1690 int err; 1701 int err;
@@ -1733,11 +1744,9 @@ static int __init init_btrfs_fs(void)
1733 1744
1734 btrfs_init_lockdep(); 1745 btrfs_init_lockdep();
1735 1746
1736#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 1747 btrfs_print_info();
1737 btrfs_test_free_space_cache(); 1748 btrfs_test_free_space_cache();
1738#endif
1739 1749
1740 printk(KERN_INFO "%s loaded\n", BTRFS_BUILD_VERSION);
1741 return 0; 1750 return 0;
1742 1751
1743unregister_ioctl: 1752unregister_ioctl:
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 0544587d74f4..d58cce77fc6c 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -34,12 +34,43 @@
34 34
35#define BTRFS_ROOT_TRANS_TAG 0 35#define BTRFS_ROOT_TRANS_TAG 0
36 36
37static unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
38 [TRANS_STATE_RUNNING] = 0U,
39 [TRANS_STATE_BLOCKED] = (__TRANS_USERSPACE |
40 __TRANS_START),
41 [TRANS_STATE_COMMIT_START] = (__TRANS_USERSPACE |
42 __TRANS_START |
43 __TRANS_ATTACH),
44 [TRANS_STATE_COMMIT_DOING] = (__TRANS_USERSPACE |
45 __TRANS_START |
46 __TRANS_ATTACH |
47 __TRANS_JOIN),
48 [TRANS_STATE_UNBLOCKED] = (__TRANS_USERSPACE |
49 __TRANS_START |
50 __TRANS_ATTACH |
51 __TRANS_JOIN |
52 __TRANS_JOIN_NOLOCK),
53 [TRANS_STATE_COMPLETED] = (__TRANS_USERSPACE |
54 __TRANS_START |
55 __TRANS_ATTACH |
56 __TRANS_JOIN |
57 __TRANS_JOIN_NOLOCK),
58};
59
37static void put_transaction(struct btrfs_transaction *transaction) 60static void put_transaction(struct btrfs_transaction *transaction)
38{ 61{
39 WARN_ON(atomic_read(&transaction->use_count) == 0); 62 WARN_ON(atomic_read(&transaction->use_count) == 0);
40 if (atomic_dec_and_test(&transaction->use_count)) { 63 if (atomic_dec_and_test(&transaction->use_count)) {
41 BUG_ON(!list_empty(&transaction->list)); 64 BUG_ON(!list_empty(&transaction->list));
42 WARN_ON(transaction->delayed_refs.root.rb_node); 65 WARN_ON(transaction->delayed_refs.root.rb_node);
66 while (!list_empty(&transaction->pending_chunks)) {
67 struct extent_map *em;
68
69 em = list_first_entry(&transaction->pending_chunks,
70 struct extent_map, list);
71 list_del_init(&em->list);
72 free_extent_map(em);
73 }
43 kmem_cache_free(btrfs_transaction_cachep, transaction); 74 kmem_cache_free(btrfs_transaction_cachep, transaction);
44 } 75 }
45} 76}
@@ -50,18 +81,35 @@ static noinline void switch_commit_root(struct btrfs_root *root)
50 root->commit_root = btrfs_root_node(root); 81 root->commit_root = btrfs_root_node(root);
51} 82}
52 83
53static inline int can_join_transaction(struct btrfs_transaction *trans, 84static inline void extwriter_counter_inc(struct btrfs_transaction *trans,
54 int type) 85 unsigned int type)
86{
87 if (type & TRANS_EXTWRITERS)
88 atomic_inc(&trans->num_extwriters);
89}
90
91static inline void extwriter_counter_dec(struct btrfs_transaction *trans,
92 unsigned int type)
93{
94 if (type & TRANS_EXTWRITERS)
95 atomic_dec(&trans->num_extwriters);
96}
97
98static inline void extwriter_counter_init(struct btrfs_transaction *trans,
99 unsigned int type)
100{
101 atomic_set(&trans->num_extwriters, ((type & TRANS_EXTWRITERS) ? 1 : 0));
102}
103
104static inline int extwriter_counter_read(struct btrfs_transaction *trans)
55{ 105{
56 return !(trans->in_commit && 106 return atomic_read(&trans->num_extwriters);
57 type != TRANS_JOIN &&
58 type != TRANS_JOIN_NOLOCK);
59} 107}
60 108
61/* 109/*
62 * either allocate a new transaction or hop into the existing one 110 * either allocate a new transaction or hop into the existing one
63 */ 111 */
64static noinline int join_transaction(struct btrfs_root *root, int type) 112static noinline int join_transaction(struct btrfs_root *root, unsigned int type)
65{ 113{
66 struct btrfs_transaction *cur_trans; 114 struct btrfs_transaction *cur_trans;
67 struct btrfs_fs_info *fs_info = root->fs_info; 115 struct btrfs_fs_info *fs_info = root->fs_info;
@@ -74,32 +122,19 @@ loop:
74 return -EROFS; 122 return -EROFS;
75 } 123 }
76 124
77 if (fs_info->trans_no_join) {
78 /*
79 * If we are JOIN_NOLOCK we're already committing a current
80 * transaction, we just need a handle to deal with something
81 * when committing the transaction, such as inode cache and
82 * space cache. It is a special case.
83 */
84 if (type != TRANS_JOIN_NOLOCK) {
85 spin_unlock(&fs_info->trans_lock);
86 return -EBUSY;
87 }
88 }
89
90 cur_trans = fs_info->running_transaction; 125 cur_trans = fs_info->running_transaction;
91 if (cur_trans) { 126 if (cur_trans) {
92 if (cur_trans->aborted) { 127 if (cur_trans->aborted) {
93 spin_unlock(&fs_info->trans_lock); 128 spin_unlock(&fs_info->trans_lock);
94 return cur_trans->aborted; 129 return cur_trans->aborted;
95 } 130 }
96 if (!can_join_transaction(cur_trans, type)) { 131 if (btrfs_blocked_trans_types[cur_trans->state] & type) {
97 spin_unlock(&fs_info->trans_lock); 132 spin_unlock(&fs_info->trans_lock);
98 return -EBUSY; 133 return -EBUSY;
99 } 134 }
100 atomic_inc(&cur_trans->use_count); 135 atomic_inc(&cur_trans->use_count);
101 atomic_inc(&cur_trans->num_writers); 136 atomic_inc(&cur_trans->num_writers);
102 cur_trans->num_joined++; 137 extwriter_counter_inc(cur_trans, type);
103 spin_unlock(&fs_info->trans_lock); 138 spin_unlock(&fs_info->trans_lock);
104 return 0; 139 return 0;
105 } 140 }
@@ -112,6 +147,12 @@ loop:
112 if (type == TRANS_ATTACH) 147 if (type == TRANS_ATTACH)
113 return -ENOENT; 148 return -ENOENT;
114 149
150 /*
151 * JOIN_NOLOCK only happens during the transaction commit, so
152 * it is impossible that ->running_transaction is NULL
153 */
154 BUG_ON(type == TRANS_JOIN_NOLOCK);
155
115 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); 156 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
116 if (!cur_trans) 157 if (!cur_trans)
117 return -ENOMEM; 158 return -ENOMEM;
@@ -120,7 +161,7 @@ loop:
120 if (fs_info->running_transaction) { 161 if (fs_info->running_transaction) {
121 /* 162 /*
122 * someone started a transaction after we unlocked. Make sure 163 * someone started a transaction after we unlocked. Make sure
123 * to redo the trans_no_join checks above 164 * to redo the checks above
124 */ 165 */
125 kmem_cache_free(btrfs_transaction_cachep, cur_trans); 166 kmem_cache_free(btrfs_transaction_cachep, cur_trans);
126 goto loop; 167 goto loop;
@@ -131,17 +172,15 @@ loop:
131 } 172 }
132 173
133 atomic_set(&cur_trans->num_writers, 1); 174 atomic_set(&cur_trans->num_writers, 1);
134 cur_trans->num_joined = 0; 175 extwriter_counter_init(cur_trans, type);
135 init_waitqueue_head(&cur_trans->writer_wait); 176 init_waitqueue_head(&cur_trans->writer_wait);
136 init_waitqueue_head(&cur_trans->commit_wait); 177 init_waitqueue_head(&cur_trans->commit_wait);
137 cur_trans->in_commit = 0; 178 cur_trans->state = TRANS_STATE_RUNNING;
138 cur_trans->blocked = 0;
139 /* 179 /*
140 * One for this trans handle, one so it will live on until we 180 * One for this trans handle, one so it will live on until we
141 * commit the transaction. 181 * commit the transaction.
142 */ 182 */
143 atomic_set(&cur_trans->use_count, 2); 183 atomic_set(&cur_trans->use_count, 2);
144 cur_trans->commit_done = 0;
145 cur_trans->start_time = get_seconds(); 184 cur_trans->start_time = get_seconds();
146 185
147 cur_trans->delayed_refs.root = RB_ROOT; 186 cur_trans->delayed_refs.root = RB_ROOT;
@@ -164,7 +203,6 @@ loop:
164 "creating a fresh transaction\n"); 203 "creating a fresh transaction\n");
165 atomic64_set(&fs_info->tree_mod_seq, 0); 204 atomic64_set(&fs_info->tree_mod_seq, 0);
166 205
167 spin_lock_init(&cur_trans->commit_lock);
168 spin_lock_init(&cur_trans->delayed_refs.lock); 206 spin_lock_init(&cur_trans->delayed_refs.lock);
169 atomic_set(&cur_trans->delayed_refs.procs_running_refs, 0); 207 atomic_set(&cur_trans->delayed_refs.procs_running_refs, 0);
170 atomic_set(&cur_trans->delayed_refs.ref_seq, 0); 208 atomic_set(&cur_trans->delayed_refs.ref_seq, 0);
@@ -172,6 +210,7 @@ loop:
172 210
173 INIT_LIST_HEAD(&cur_trans->pending_snapshots); 211 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
174 INIT_LIST_HEAD(&cur_trans->ordered_operations); 212 INIT_LIST_HEAD(&cur_trans->ordered_operations);
213 INIT_LIST_HEAD(&cur_trans->pending_chunks);
175 list_add_tail(&cur_trans->list, &fs_info->trans_list); 214 list_add_tail(&cur_trans->list, &fs_info->trans_list);
176 extent_io_tree_init(&cur_trans->dirty_pages, 215 extent_io_tree_init(&cur_trans->dirty_pages,
177 fs_info->btree_inode->i_mapping); 216 fs_info->btree_inode->i_mapping);
@@ -269,6 +308,13 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
269 return 0; 308 return 0;
270} 309}
271 310
311static inline int is_transaction_blocked(struct btrfs_transaction *trans)
312{
313 return (trans->state >= TRANS_STATE_BLOCKED &&
314 trans->state < TRANS_STATE_UNBLOCKED &&
315 !trans->aborted);
316}
317
272/* wait for commit against the current transaction to become unblocked 318/* wait for commit against the current transaction to become unblocked
273 * when this is done, it is safe to start a new transaction, but the current 319 * when this is done, it is safe to start a new transaction, but the current
274 * transaction might not be fully on disk. 320 * transaction might not be fully on disk.
@@ -279,12 +325,13 @@ static void wait_current_trans(struct btrfs_root *root)
279 325
280 spin_lock(&root->fs_info->trans_lock); 326 spin_lock(&root->fs_info->trans_lock);
281 cur_trans = root->fs_info->running_transaction; 327 cur_trans = root->fs_info->running_transaction;
282 if (cur_trans && cur_trans->blocked) { 328 if (cur_trans && is_transaction_blocked(cur_trans)) {
283 atomic_inc(&cur_trans->use_count); 329 atomic_inc(&cur_trans->use_count);
284 spin_unlock(&root->fs_info->trans_lock); 330 spin_unlock(&root->fs_info->trans_lock);
285 331
286 wait_event(root->fs_info->transaction_wait, 332 wait_event(root->fs_info->transaction_wait,
287 !cur_trans->blocked); 333 cur_trans->state >= TRANS_STATE_UNBLOCKED ||
334 cur_trans->aborted);
288 put_transaction(cur_trans); 335 put_transaction(cur_trans);
289 } else { 336 } else {
290 spin_unlock(&root->fs_info->trans_lock); 337 spin_unlock(&root->fs_info->trans_lock);
@@ -307,7 +354,7 @@ static int may_wait_transaction(struct btrfs_root *root, int type)
307} 354}
308 355
309static struct btrfs_trans_handle * 356static struct btrfs_trans_handle *
310start_transaction(struct btrfs_root *root, u64 num_items, int type, 357start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
311 enum btrfs_reserve_flush_enum flush) 358 enum btrfs_reserve_flush_enum flush)
312{ 359{
313 struct btrfs_trans_handle *h; 360 struct btrfs_trans_handle *h;
@@ -320,7 +367,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, int type,
320 return ERR_PTR(-EROFS); 367 return ERR_PTR(-EROFS);
321 368
322 if (current->journal_info) { 369 if (current->journal_info) {
323 WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK); 370 WARN_ON(type & TRANS_EXTWRITERS);
324 h = current->journal_info; 371 h = current->journal_info;
325 h->use_count++; 372 h->use_count++;
326 WARN_ON(h->use_count > 2); 373 WARN_ON(h->use_count > 2);
@@ -366,7 +413,7 @@ again:
366 * If we are ATTACH, it means we just want to catch the current 413 * If we are ATTACH, it means we just want to catch the current
367 * transaction and commit it, so we needn't do sb_start_intwrite(). 414 * transaction and commit it, so we needn't do sb_start_intwrite().
368 */ 415 */
369 if (type < TRANS_JOIN_NOLOCK) 416 if (type & __TRANS_FREEZABLE)
370 sb_start_intwrite(root->fs_info->sb); 417 sb_start_intwrite(root->fs_info->sb);
371 418
372 if (may_wait_transaction(root, type)) 419 if (may_wait_transaction(root, type))
@@ -408,7 +455,8 @@ again:
408 INIT_LIST_HEAD(&h->new_bgs); 455 INIT_LIST_HEAD(&h->new_bgs);
409 456
410 smp_mb(); 457 smp_mb();
411 if (cur_trans->blocked && may_wait_transaction(root, type)) { 458 if (cur_trans->state >= TRANS_STATE_BLOCKED &&
459 may_wait_transaction(root, type)) {
412 btrfs_commit_transaction(h, root); 460 btrfs_commit_transaction(h, root);
413 goto again; 461 goto again;
414 } 462 }
@@ -429,7 +477,7 @@ got_it:
429 return h; 477 return h;
430 478
431join_fail: 479join_fail:
432 if (type < TRANS_JOIN_NOLOCK) 480 if (type & __TRANS_FREEZABLE)
433 sb_end_intwrite(root->fs_info->sb); 481 sb_end_intwrite(root->fs_info->sb);
434 kmem_cache_free(btrfs_trans_handle_cachep, h); 482 kmem_cache_free(btrfs_trans_handle_cachep, h);
435alloc_fail: 483alloc_fail:
@@ -490,7 +538,7 @@ struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
490} 538}
491 539
492/* 540/*
493 * btrfs_attach_transaction() - catch the running transaction 541 * btrfs_attach_transaction_barrier() - catch the running transaction
494 * 542 *
495 * It is similar to the above function, the differentia is this one 543 * It is similar to the above function, the differentia is this one
496 * will wait for all the inactive transactions until they fully 544 * will wait for all the inactive transactions until they fully
@@ -512,7 +560,7 @@ btrfs_attach_transaction_barrier(struct btrfs_root *root)
512static noinline void wait_for_commit(struct btrfs_root *root, 560static noinline void wait_for_commit(struct btrfs_root *root,
513 struct btrfs_transaction *commit) 561 struct btrfs_transaction *commit)
514{ 562{
515 wait_event(commit->commit_wait, commit->commit_done); 563 wait_event(commit->commit_wait, commit->state == TRANS_STATE_COMPLETED);
516} 564}
517 565
518int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) 566int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
@@ -548,8 +596,8 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
548 spin_lock(&root->fs_info->trans_lock); 596 spin_lock(&root->fs_info->trans_lock);
549 list_for_each_entry_reverse(t, &root->fs_info->trans_list, 597 list_for_each_entry_reverse(t, &root->fs_info->trans_list,
550 list) { 598 list) {
551 if (t->in_commit) { 599 if (t->state >= TRANS_STATE_COMMIT_START) {
552 if (t->commit_done) 600 if (t->state == TRANS_STATE_COMPLETED)
553 break; 601 break;
554 cur_trans = t; 602 cur_trans = t;
555 atomic_inc(&cur_trans->use_count); 603 atomic_inc(&cur_trans->use_count);
@@ -576,10 +624,11 @@ void btrfs_throttle(struct btrfs_root *root)
576static int should_end_transaction(struct btrfs_trans_handle *trans, 624static int should_end_transaction(struct btrfs_trans_handle *trans,
577 struct btrfs_root *root) 625 struct btrfs_root *root)
578{ 626{
579 int ret; 627 if (root->fs_info->global_block_rsv.space_info->full &&
628 btrfs_should_throttle_delayed_refs(trans, root))
629 return 1;
580 630
581 ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5); 631 return !!btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);
582 return ret ? 1 : 0;
583} 632}
584 633
585int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, 634int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
@@ -590,7 +639,8 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
590 int err; 639 int err;
591 640
592 smp_mb(); 641 smp_mb();
593 if (cur_trans->blocked || cur_trans->delayed_refs.flushing) 642 if (cur_trans->state >= TRANS_STATE_BLOCKED ||
643 cur_trans->delayed_refs.flushing)
594 return 1; 644 return 1;
595 645
596 updates = trans->delayed_ref_updates; 646 updates = trans->delayed_ref_updates;
@@ -609,7 +659,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
609{ 659{
610 struct btrfs_transaction *cur_trans = trans->transaction; 660 struct btrfs_transaction *cur_trans = trans->transaction;
611 struct btrfs_fs_info *info = root->fs_info; 661 struct btrfs_fs_info *info = root->fs_info;
612 int count = 0; 662 unsigned long cur = trans->delayed_ref_updates;
613 int lock = (trans->type != TRANS_JOIN_NOLOCK); 663 int lock = (trans->type != TRANS_JOIN_NOLOCK);
614 int err = 0; 664 int err = 0;
615 665
@@ -638,17 +688,11 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
638 if (!list_empty(&trans->new_bgs)) 688 if (!list_empty(&trans->new_bgs))
639 btrfs_create_pending_block_groups(trans, root); 689 btrfs_create_pending_block_groups(trans, root);
640 690
641 while (count < 1) { 691 trans->delayed_ref_updates = 0;
642 unsigned long cur = trans->delayed_ref_updates; 692 if (btrfs_should_throttle_delayed_refs(trans, root)) {
693 cur = max_t(unsigned long, cur, 1);
643 trans->delayed_ref_updates = 0; 694 trans->delayed_ref_updates = 0;
644 if (cur && 695 btrfs_run_delayed_refs(trans, root, cur);
645 trans->transaction->delayed_refs.num_heads_ready > 64) {
646 trans->delayed_ref_updates = 0;
647 btrfs_run_delayed_refs(trans, root, cur);
648 } else {
649 break;
650 }
651 count++;
652 } 696 }
653 697
654 btrfs_trans_release_metadata(trans, root); 698 btrfs_trans_release_metadata(trans, root);
@@ -658,12 +702,15 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
658 btrfs_create_pending_block_groups(trans, root); 702 btrfs_create_pending_block_groups(trans, root);
659 703
660 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && 704 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
661 should_end_transaction(trans, root)) { 705 should_end_transaction(trans, root) &&
662 trans->transaction->blocked = 1; 706 ACCESS_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) {
663 smp_wmb(); 707 spin_lock(&info->trans_lock);
708 if (cur_trans->state == TRANS_STATE_RUNNING)
709 cur_trans->state = TRANS_STATE_BLOCKED;
710 spin_unlock(&info->trans_lock);
664 } 711 }
665 712
666 if (lock && cur_trans->blocked && !cur_trans->in_commit) { 713 if (lock && ACCESS_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) {
667 if (throttle) { 714 if (throttle) {
668 /* 715 /*
669 * We may race with somebody else here so end up having 716 * We may race with somebody else here so end up having
@@ -677,12 +724,13 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
677 } 724 }
678 } 725 }
679 726
680 if (trans->type < TRANS_JOIN_NOLOCK) 727 if (trans->type & __TRANS_FREEZABLE)
681 sb_end_intwrite(root->fs_info->sb); 728 sb_end_intwrite(root->fs_info->sb);
682 729
683 WARN_ON(cur_trans != info->running_transaction); 730 WARN_ON(cur_trans != info->running_transaction);
684 WARN_ON(atomic_read(&cur_trans->num_writers) < 1); 731 WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
685 atomic_dec(&cur_trans->num_writers); 732 atomic_dec(&cur_trans->num_writers);
733 extwriter_counter_dec(cur_trans, trans->type);
686 734
687 smp_mb(); 735 smp_mb();
688 if (waitqueue_active(&cur_trans->writer_wait)) 736 if (waitqueue_active(&cur_trans->writer_wait))
@@ -736,9 +784,7 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
736 struct extent_state *cached_state = NULL; 784 struct extent_state *cached_state = NULL;
737 u64 start = 0; 785 u64 start = 0;
738 u64 end; 786 u64 end;
739 struct blk_plug plug;
740 787
741 blk_start_plug(&plug);
742 while (!find_first_extent_bit(dirty_pages, start, &start, &end, 788 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
743 mark, &cached_state)) { 789 mark, &cached_state)) {
744 convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, 790 convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
@@ -752,7 +798,6 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
752 } 798 }
753 if (err) 799 if (err)
754 werr = err; 800 werr = err;
755 blk_finish_plug(&plug);
756 return werr; 801 return werr;
757} 802}
758 803
@@ -797,8 +842,11 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
797{ 842{
798 int ret; 843 int ret;
799 int ret2; 844 int ret2;
845 struct blk_plug plug;
800 846
847 blk_start_plug(&plug);
801 ret = btrfs_write_marked_extents(root, dirty_pages, mark); 848 ret = btrfs_write_marked_extents(root, dirty_pages, mark);
849 blk_finish_plug(&plug);
802 ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark); 850 ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark);
803 851
804 if (ret) 852 if (ret)
@@ -1318,20 +1366,26 @@ static void update_super_roots(struct btrfs_root *root)
1318 1366
1319int btrfs_transaction_in_commit(struct btrfs_fs_info *info) 1367int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
1320{ 1368{
1369 struct btrfs_transaction *trans;
1321 int ret = 0; 1370 int ret = 0;
1371
1322 spin_lock(&info->trans_lock); 1372 spin_lock(&info->trans_lock);
1323 if (info->running_transaction) 1373 trans = info->running_transaction;
1324 ret = info->running_transaction->in_commit; 1374 if (trans)
1375 ret = (trans->state >= TRANS_STATE_COMMIT_START);
1325 spin_unlock(&info->trans_lock); 1376 spin_unlock(&info->trans_lock);
1326 return ret; 1377 return ret;
1327} 1378}
1328 1379
1329int btrfs_transaction_blocked(struct btrfs_fs_info *info) 1380int btrfs_transaction_blocked(struct btrfs_fs_info *info)
1330{ 1381{
1382 struct btrfs_transaction *trans;
1331 int ret = 0; 1383 int ret = 0;
1384
1332 spin_lock(&info->trans_lock); 1385 spin_lock(&info->trans_lock);
1333 if (info->running_transaction) 1386 trans = info->running_transaction;
1334 ret = info->running_transaction->blocked; 1387 if (trans)
1388 ret = is_transaction_blocked(trans);
1335 spin_unlock(&info->trans_lock); 1389 spin_unlock(&info->trans_lock);
1336 return ret; 1390 return ret;
1337} 1391}
@@ -1343,7 +1397,9 @@ int btrfs_transaction_blocked(struct btrfs_fs_info *info)
1343static void wait_current_trans_commit_start(struct btrfs_root *root, 1397static void wait_current_trans_commit_start(struct btrfs_root *root,
1344 struct btrfs_transaction *trans) 1398 struct btrfs_transaction *trans)
1345{ 1399{
1346 wait_event(root->fs_info->transaction_blocked_wait, trans->in_commit); 1400 wait_event(root->fs_info->transaction_blocked_wait,
1401 trans->state >= TRANS_STATE_COMMIT_START ||
1402 trans->aborted);
1347} 1403}
1348 1404
1349/* 1405/*
@@ -1354,7 +1410,8 @@ static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
1354 struct btrfs_transaction *trans) 1410 struct btrfs_transaction *trans)
1355{ 1411{
1356 wait_event(root->fs_info->transaction_wait, 1412 wait_event(root->fs_info->transaction_wait,
1357 trans->commit_done || (trans->in_commit && !trans->blocked)); 1413 trans->state >= TRANS_STATE_UNBLOCKED ||
1414 trans->aborted);
1358} 1415}
1359 1416
1360/* 1417/*
@@ -1450,26 +1507,31 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
1450 1507
1451 spin_lock(&root->fs_info->trans_lock); 1508 spin_lock(&root->fs_info->trans_lock);
1452 1509
1453 if (list_empty(&cur_trans->list)) { 1510 /*
1454 spin_unlock(&root->fs_info->trans_lock); 1511 * If the transaction is removed from the list, it means this
1455 btrfs_end_transaction(trans, root); 1512 * transaction has been committed successfully, so it is impossible
1456 return; 1513 * to call the cleanup function.
1457 } 1514 */
1515 BUG_ON(list_empty(&cur_trans->list));
1458 1516
1459 list_del_init(&cur_trans->list); 1517 list_del_init(&cur_trans->list);
1460 if (cur_trans == root->fs_info->running_transaction) { 1518 if (cur_trans == root->fs_info->running_transaction) {
1461 root->fs_info->trans_no_join = 1; 1519 cur_trans->state = TRANS_STATE_COMMIT_DOING;
1462 spin_unlock(&root->fs_info->trans_lock); 1520 spin_unlock(&root->fs_info->trans_lock);
1463 wait_event(cur_trans->writer_wait, 1521 wait_event(cur_trans->writer_wait,
1464 atomic_read(&cur_trans->num_writers) == 1); 1522 atomic_read(&cur_trans->num_writers) == 1);
1465 1523
1466 spin_lock(&root->fs_info->trans_lock); 1524 spin_lock(&root->fs_info->trans_lock);
1467 root->fs_info->running_transaction = NULL;
1468 } 1525 }
1469 spin_unlock(&root->fs_info->trans_lock); 1526 spin_unlock(&root->fs_info->trans_lock);
1470 1527
1471 btrfs_cleanup_one_transaction(trans->transaction, root); 1528 btrfs_cleanup_one_transaction(trans->transaction, root);
1472 1529
1530 spin_lock(&root->fs_info->trans_lock);
1531 if (cur_trans == root->fs_info->running_transaction)
1532 root->fs_info->running_transaction = NULL;
1533 spin_unlock(&root->fs_info->trans_lock);
1534
1473 put_transaction(cur_trans); 1535 put_transaction(cur_trans);
1474 put_transaction(cur_trans); 1536 put_transaction(cur_trans);
1475 1537
@@ -1481,33 +1543,13 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
1481 current->journal_info = NULL; 1543 current->journal_info = NULL;
1482 1544
1483 kmem_cache_free(btrfs_trans_handle_cachep, trans); 1545 kmem_cache_free(btrfs_trans_handle_cachep, trans);
1484
1485 spin_lock(&root->fs_info->trans_lock);
1486 root->fs_info->trans_no_join = 0;
1487 spin_unlock(&root->fs_info->trans_lock);
1488} 1546}
1489 1547
1490static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, 1548static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
1491 struct btrfs_root *root) 1549 struct btrfs_root *root)
1492{ 1550{
1493 int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
1494 int snap_pending = 0;
1495 int ret; 1551 int ret;
1496 1552
1497 if (!flush_on_commit) {
1498 spin_lock(&root->fs_info->trans_lock);
1499 if (!list_empty(&trans->transaction->pending_snapshots))
1500 snap_pending = 1;
1501 spin_unlock(&root->fs_info->trans_lock);
1502 }
1503
1504 if (flush_on_commit || snap_pending) {
1505 ret = btrfs_start_delalloc_inodes(root, 1);
1506 if (ret)
1507 return ret;
1508 btrfs_wait_ordered_extents(root, 1);
1509 }
1510
1511 ret = btrfs_run_delayed_items(trans, root); 1553 ret = btrfs_run_delayed_items(trans, root);
1512 if (ret) 1554 if (ret)
1513 return ret; 1555 return ret;
@@ -1531,23 +1573,25 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
1531 return ret; 1573 return ret;
1532} 1574}
1533 1575
1534/* 1576static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
1535 * btrfs_transaction state sequence: 1577{
1536 * in_commit = 0, blocked = 0 (initial) 1578 if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
1537 * in_commit = 1, blocked = 1 1579 return btrfs_start_all_delalloc_inodes(fs_info, 1);
1538 * blocked = 0 1580 return 0;
1539 * commit_done = 1 1581}
1540 */ 1582
1583static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
1584{
1585 if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
1586 btrfs_wait_all_ordered_extents(fs_info, 1);
1587}
1588
1541int btrfs_commit_transaction(struct btrfs_trans_handle *trans, 1589int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1542 struct btrfs_root *root) 1590 struct btrfs_root *root)
1543{ 1591{
1544 unsigned long joined = 0;
1545 struct btrfs_transaction *cur_trans = trans->transaction; 1592 struct btrfs_transaction *cur_trans = trans->transaction;
1546 struct btrfs_transaction *prev_trans = NULL; 1593 struct btrfs_transaction *prev_trans = NULL;
1547 DEFINE_WAIT(wait);
1548 int ret; 1594 int ret;
1549 int should_grow = 0;
1550 unsigned long now = get_seconds();
1551 1595
1552 ret = btrfs_run_ordered_operations(trans, root, 0); 1596 ret = btrfs_run_ordered_operations(trans, root, 0);
1553 if (ret) { 1597 if (ret) {
@@ -1586,6 +1630,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1586 * start sending their work down. 1630 * start sending their work down.
1587 */ 1631 */
1588 cur_trans->delayed_refs.flushing = 1; 1632 cur_trans->delayed_refs.flushing = 1;
1633 smp_wmb();
1589 1634
1590 if (!list_empty(&trans->new_bgs)) 1635 if (!list_empty(&trans->new_bgs))
1591 btrfs_create_pending_block_groups(trans, root); 1636 btrfs_create_pending_block_groups(trans, root);
@@ -1596,9 +1641,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1596 return ret; 1641 return ret;
1597 } 1642 }
1598 1643
1599 spin_lock(&cur_trans->commit_lock); 1644 spin_lock(&root->fs_info->trans_lock);
1600 if (cur_trans->in_commit) { 1645 if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
1601 spin_unlock(&cur_trans->commit_lock); 1646 spin_unlock(&root->fs_info->trans_lock);
1602 atomic_inc(&cur_trans->use_count); 1647 atomic_inc(&cur_trans->use_count);
1603 ret = btrfs_end_transaction(trans, root); 1648 ret = btrfs_end_transaction(trans, root);
1604 1649
@@ -1609,16 +1654,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1609 return ret; 1654 return ret;
1610 } 1655 }
1611 1656
1612 trans->transaction->in_commit = 1; 1657 cur_trans->state = TRANS_STATE_COMMIT_START;
1613 trans->transaction->blocked = 1;
1614 spin_unlock(&cur_trans->commit_lock);
1615 wake_up(&root->fs_info->transaction_blocked_wait); 1658 wake_up(&root->fs_info->transaction_blocked_wait);
1616 1659
1617 spin_lock(&root->fs_info->trans_lock);
1618 if (cur_trans->list.prev != &root->fs_info->trans_list) { 1660 if (cur_trans->list.prev != &root->fs_info->trans_list) {
1619 prev_trans = list_entry(cur_trans->list.prev, 1661 prev_trans = list_entry(cur_trans->list.prev,
1620 struct btrfs_transaction, list); 1662 struct btrfs_transaction, list);
1621 if (!prev_trans->commit_done) { 1663 if (prev_trans->state != TRANS_STATE_COMPLETED) {
1622 atomic_inc(&prev_trans->use_count); 1664 atomic_inc(&prev_trans->use_count);
1623 spin_unlock(&root->fs_info->trans_lock); 1665 spin_unlock(&root->fs_info->trans_lock);
1624 1666
@@ -1632,42 +1674,32 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1632 spin_unlock(&root->fs_info->trans_lock); 1674 spin_unlock(&root->fs_info->trans_lock);
1633 } 1675 }
1634 1676
1635 if (!btrfs_test_opt(root, SSD) && 1677 extwriter_counter_dec(cur_trans, trans->type);
1636 (now < cur_trans->start_time || now - cur_trans->start_time < 1))
1637 should_grow = 1;
1638
1639 do {
1640 joined = cur_trans->num_joined;
1641
1642 WARN_ON(cur_trans != trans->transaction);
1643
1644 ret = btrfs_flush_all_pending_stuffs(trans, root);
1645 if (ret)
1646 goto cleanup_transaction;
1647 1678
1648 prepare_to_wait(&cur_trans->writer_wait, &wait, 1679 ret = btrfs_start_delalloc_flush(root->fs_info);
1649 TASK_UNINTERRUPTIBLE); 1680 if (ret)
1681 goto cleanup_transaction;
1650 1682
1651 if (atomic_read(&cur_trans->num_writers) > 1) 1683 ret = btrfs_flush_all_pending_stuffs(trans, root);
1652 schedule_timeout(MAX_SCHEDULE_TIMEOUT); 1684 if (ret)
1653 else if (should_grow) 1685 goto cleanup_transaction;
1654 schedule_timeout(1);
1655 1686
1656 finish_wait(&cur_trans->writer_wait, &wait); 1687 wait_event(cur_trans->writer_wait,
1657 } while (atomic_read(&cur_trans->num_writers) > 1 || 1688 extwriter_counter_read(cur_trans) == 0);
1658 (should_grow && cur_trans->num_joined != joined));
1659 1689
1690 /* some pending stuffs might be added after the previous flush. */
1660 ret = btrfs_flush_all_pending_stuffs(trans, root); 1691 ret = btrfs_flush_all_pending_stuffs(trans, root);
1661 if (ret) 1692 if (ret)
1662 goto cleanup_transaction; 1693 goto cleanup_transaction;
1663 1694
1695 btrfs_wait_delalloc_flush(root->fs_info);
1664 /* 1696 /*
1665 * Ok now we need to make sure to block out any other joins while we 1697 * Ok now we need to make sure to block out any other joins while we
1666 * commit the transaction. We could have started a join before setting 1698 * commit the transaction. We could have started a join before setting
1667 * no_join so make sure to wait for num_writers to == 1 again. 1699 * COMMIT_DOING so make sure to wait for num_writers to == 1 again.
1668 */ 1700 */
1669 spin_lock(&root->fs_info->trans_lock); 1701 spin_lock(&root->fs_info->trans_lock);
1670 root->fs_info->trans_no_join = 1; 1702 cur_trans->state = TRANS_STATE_COMMIT_DOING;
1671 spin_unlock(&root->fs_info->trans_lock); 1703 spin_unlock(&root->fs_info->trans_lock);
1672 wait_event(cur_trans->writer_wait, 1704 wait_event(cur_trans->writer_wait,
1673 atomic_read(&cur_trans->num_writers) == 1); 1705 atomic_read(&cur_trans->num_writers) == 1);
@@ -1794,10 +1826,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1794 memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy, 1826 memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy,
1795 sizeof(*root->fs_info->super_copy)); 1827 sizeof(*root->fs_info->super_copy));
1796 1828
1797 trans->transaction->blocked = 0;
1798 spin_lock(&root->fs_info->trans_lock); 1829 spin_lock(&root->fs_info->trans_lock);
1830 cur_trans->state = TRANS_STATE_UNBLOCKED;
1799 root->fs_info->running_transaction = NULL; 1831 root->fs_info->running_transaction = NULL;
1800 root->fs_info->trans_no_join = 0;
1801 spin_unlock(&root->fs_info->trans_lock); 1832 spin_unlock(&root->fs_info->trans_lock);
1802 mutex_unlock(&root->fs_info->reloc_mutex); 1833 mutex_unlock(&root->fs_info->reloc_mutex);
1803 1834
@@ -1825,10 +1856,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1825 1856
1826 btrfs_finish_extent_commit(trans, root); 1857 btrfs_finish_extent_commit(trans, root);
1827 1858
1828 cur_trans->commit_done = 1;
1829
1830 root->fs_info->last_trans_committed = cur_trans->transid; 1859 root->fs_info->last_trans_committed = cur_trans->transid;
1831 1860 /*
1861 * We needn't acquire the lock here because there is no other task
1862 * which can change it.
1863 */
1864 cur_trans->state = TRANS_STATE_COMPLETED;
1832 wake_up(&cur_trans->commit_wait); 1865 wake_up(&cur_trans->commit_wait);
1833 1866
1834 spin_lock(&root->fs_info->trans_lock); 1867 spin_lock(&root->fs_info->trans_lock);
@@ -1838,7 +1871,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1838 put_transaction(cur_trans); 1871 put_transaction(cur_trans);
1839 put_transaction(cur_trans); 1872 put_transaction(cur_trans);
1840 1873
1841 if (trans->type < TRANS_JOIN_NOLOCK) 1874 if (trans->type & __TRANS_FREEZABLE)
1842 sb_end_intwrite(root->fs_info->sb); 1875 sb_end_intwrite(root->fs_info->sb);
1843 1876
1844 trace_btrfs_transaction_commit(root); 1877 trace_btrfs_transaction_commit(root);
@@ -1885,11 +1918,6 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
1885 int ret; 1918 int ret;
1886 struct btrfs_fs_info *fs_info = root->fs_info; 1919 struct btrfs_fs_info *fs_info = root->fs_info;
1887 1920
1888 if (fs_info->sb->s_flags & MS_RDONLY) {
1889 pr_debug("btrfs: cleaner called for RO fs!\n");
1890 return 0;
1891 }
1892
1893 spin_lock(&fs_info->trans_lock); 1921 spin_lock(&fs_info->trans_lock);
1894 if (list_empty(&fs_info->dead_roots)) { 1922 if (list_empty(&fs_info->dead_roots)) {
1895 spin_unlock(&fs_info->trans_lock); 1923 spin_unlock(&fs_info->trans_lock);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 24c97335a59f..005b0375d18c 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -22,21 +22,33 @@
22#include "delayed-ref.h" 22#include "delayed-ref.h"
23#include "ctree.h" 23#include "ctree.h"
24 24
25enum btrfs_trans_state {
26 TRANS_STATE_RUNNING = 0,
27 TRANS_STATE_BLOCKED = 1,
28 TRANS_STATE_COMMIT_START = 2,
29 TRANS_STATE_COMMIT_DOING = 3,
30 TRANS_STATE_UNBLOCKED = 4,
31 TRANS_STATE_COMPLETED = 5,
32 TRANS_STATE_MAX = 6,
33};
34
25struct btrfs_transaction { 35struct btrfs_transaction {
26 u64 transid; 36 u64 transid;
27 /* 37 /*
38 * total external writers(USERSPACE/START/ATTACH) in this
39 * transaction, it must be zero before the transaction is
40 * being committed
41 */
42 atomic_t num_extwriters;
43 /*
28 * total writers in this transaction, it must be zero before the 44 * total writers in this transaction, it must be zero before the
29 * transaction can end 45 * transaction can end
30 */ 46 */
31 atomic_t num_writers; 47 atomic_t num_writers;
32 atomic_t use_count; 48 atomic_t use_count;
33 49
34 unsigned long num_joined; 50 /* Be protected by fs_info->trans_lock when we want to change it. */
35 51 enum btrfs_trans_state state;
36 spinlock_t commit_lock;
37 int in_commit;
38 int commit_done;
39 int blocked;
40 struct list_head list; 52 struct list_head list;
41 struct extent_io_tree dirty_pages; 53 struct extent_io_tree dirty_pages;
42 unsigned long start_time; 54 unsigned long start_time;
@@ -44,17 +56,27 @@ struct btrfs_transaction {
44 wait_queue_head_t commit_wait; 56 wait_queue_head_t commit_wait;
45 struct list_head pending_snapshots; 57 struct list_head pending_snapshots;
46 struct list_head ordered_operations; 58 struct list_head ordered_operations;
59 struct list_head pending_chunks;
47 struct btrfs_delayed_ref_root delayed_refs; 60 struct btrfs_delayed_ref_root delayed_refs;
48 int aborted; 61 int aborted;
49}; 62};
50 63
51enum btrfs_trans_type { 64#define __TRANS_FREEZABLE (1U << 0)
52 TRANS_START, 65
53 TRANS_JOIN, 66#define __TRANS_USERSPACE (1U << 8)
54 TRANS_USERSPACE, 67#define __TRANS_START (1U << 9)
55 TRANS_JOIN_NOLOCK, 68#define __TRANS_ATTACH (1U << 10)
56 TRANS_ATTACH, 69#define __TRANS_JOIN (1U << 11)
57}; 70#define __TRANS_JOIN_NOLOCK (1U << 12)
71
72#define TRANS_USERSPACE (__TRANS_USERSPACE | __TRANS_FREEZABLE)
73#define TRANS_START (__TRANS_START | __TRANS_FREEZABLE)
74#define TRANS_ATTACH (__TRANS_ATTACH)
75#define TRANS_JOIN (__TRANS_JOIN | __TRANS_FREEZABLE)
76#define TRANS_JOIN_NOLOCK (__TRANS_JOIN_NOLOCK)
77
78#define TRANS_EXTWRITERS (__TRANS_USERSPACE | __TRANS_START | \
79 __TRANS_ATTACH)
58 80
59struct btrfs_trans_handle { 81struct btrfs_trans_handle {
60 u64 transid; 82 u64 transid;
@@ -70,7 +92,7 @@ struct btrfs_trans_handle {
70 short aborted; 92 short aborted;
71 short adding_csums; 93 short adding_csums;
72 bool allocating_chunk; 94 bool allocating_chunk;
73 enum btrfs_trans_type type; 95 unsigned int type;
74 /* 96 /*
75 * this root is only needed to validate that the root passed to 97 * this root is only needed to validate that the root passed to
76 * start_transaction is the same as the one passed to end_transaction. 98 * start_transaction is the same as the one passed to end_transaction.
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index c276ac9a0ec3..2c6791493637 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -18,6 +18,7 @@
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h> 20#include <linux/slab.h>
21#include <linux/blkdev.h>
21#include <linux/list_sort.h> 22#include <linux/list_sort.h>
22#include "ctree.h" 23#include "ctree.h"
23#include "transaction.h" 24#include "transaction.h"
@@ -279,11 +280,23 @@ static int process_one_buffer(struct btrfs_root *log,
279{ 280{
280 int ret = 0; 281 int ret = 0;
281 282
283 /*
284 * If this fs is mixed then we need to be able to process the leaves to
285 * pin down any logged extents, so we have to read the block.
286 */
287 if (btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) {
288 ret = btrfs_read_buffer(eb, gen);
289 if (ret)
290 return ret;
291 }
292
282 if (wc->pin) 293 if (wc->pin)
283 ret = btrfs_pin_extent_for_log_replay(log->fs_info->extent_root, 294 ret = btrfs_pin_extent_for_log_replay(log->fs_info->extent_root,
284 eb->start, eb->len); 295 eb->start, eb->len);
285 296
286 if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) { 297 if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
298 if (wc->pin && btrfs_header_level(eb) == 0)
299 ret = btrfs_exclude_logged_extents(log, eb);
287 if (wc->write) 300 if (wc->write)
288 btrfs_write_tree_block(eb); 301 btrfs_write_tree_block(eb);
289 if (wc->wait) 302 if (wc->wait)
@@ -2016,13 +2029,8 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
2016 eb, i, &key); 2029 eb, i, &key);
2017 if (ret) 2030 if (ret)
2018 break; 2031 break;
2019 } else if (key.type == BTRFS_INODE_REF_KEY) { 2032 } else if (key.type == BTRFS_INODE_REF_KEY ||
2020 ret = add_inode_ref(wc->trans, root, log, path, 2033 key.type == BTRFS_INODE_EXTREF_KEY) {
2021 eb, i, &key);
2022 if (ret && ret != -ENOENT)
2023 break;
2024 ret = 0;
2025 } else if (key.type == BTRFS_INODE_EXTREF_KEY) {
2026 ret = add_inode_ref(wc->trans, root, log, path, 2034 ret = add_inode_ref(wc->trans, root, log, path,
2027 eb, i, &key); 2035 eb, i, &key);
2028 if (ret && ret != -ENOENT) 2036 if (ret && ret != -ENOENT)
@@ -2358,6 +2366,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2358 struct btrfs_root *log = root->log_root; 2366 struct btrfs_root *log = root->log_root;
2359 struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; 2367 struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
2360 unsigned long log_transid = 0; 2368 unsigned long log_transid = 0;
2369 struct blk_plug plug;
2361 2370
2362 mutex_lock(&root->log_mutex); 2371 mutex_lock(&root->log_mutex);
2363 log_transid = root->log_transid; 2372 log_transid = root->log_transid;
@@ -2401,8 +2410,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2401 /* we start IO on all the marked extents here, but we don't actually 2410 /* we start IO on all the marked extents here, but we don't actually
2402 * wait for them until later. 2411 * wait for them until later.
2403 */ 2412 */
2413 blk_start_plug(&plug);
2404 ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark); 2414 ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark);
2405 if (ret) { 2415 if (ret) {
2416 blk_finish_plug(&plug);
2406 btrfs_abort_transaction(trans, root, ret); 2417 btrfs_abort_transaction(trans, root, ret);
2407 btrfs_free_logged_extents(log, log_transid); 2418 btrfs_free_logged_extents(log, log_transid);
2408 mutex_unlock(&root->log_mutex); 2419 mutex_unlock(&root->log_mutex);
@@ -2437,6 +2448,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2437 } 2448 }
2438 2449
2439 if (ret) { 2450 if (ret) {
2451 blk_finish_plug(&plug);
2440 if (ret != -ENOSPC) { 2452 if (ret != -ENOSPC) {
2441 btrfs_abort_transaction(trans, root, ret); 2453 btrfs_abort_transaction(trans, root, ret);
2442 mutex_unlock(&log_root_tree->log_mutex); 2454 mutex_unlock(&log_root_tree->log_mutex);
@@ -2452,6 +2464,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2452 2464
2453 index2 = log_root_tree->log_transid % 2; 2465 index2 = log_root_tree->log_transid % 2;
2454 if (atomic_read(&log_root_tree->log_commit[index2])) { 2466 if (atomic_read(&log_root_tree->log_commit[index2])) {
2467 blk_finish_plug(&plug);
2455 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2468 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2456 wait_log_commit(trans, log_root_tree, 2469 wait_log_commit(trans, log_root_tree,
2457 log_root_tree->log_transid); 2470 log_root_tree->log_transid);
@@ -2474,6 +2487,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2474 * check the full commit flag again 2487 * check the full commit flag again
2475 */ 2488 */
2476 if (root->fs_info->last_trans_log_full_commit == trans->transid) { 2489 if (root->fs_info->last_trans_log_full_commit == trans->transid) {
2490 blk_finish_plug(&plug);
2477 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2491 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2478 btrfs_free_logged_extents(log, log_transid); 2492 btrfs_free_logged_extents(log, log_transid);
2479 mutex_unlock(&log_root_tree->log_mutex); 2493 mutex_unlock(&log_root_tree->log_mutex);
@@ -2481,9 +2495,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2481 goto out_wake_log_root; 2495 goto out_wake_log_root;
2482 } 2496 }
2483 2497
2484 ret = btrfs_write_and_wait_marked_extents(log_root_tree, 2498 ret = btrfs_write_marked_extents(log_root_tree,
2485 &log_root_tree->dirty_log_pages, 2499 &log_root_tree->dirty_log_pages,
2486 EXTENT_DIRTY | EXTENT_NEW); 2500 EXTENT_DIRTY | EXTENT_NEW);
2501 blk_finish_plug(&plug);
2487 if (ret) { 2502 if (ret) {
2488 btrfs_abort_transaction(trans, root, ret); 2503 btrfs_abort_transaction(trans, root, ret);
2489 btrfs_free_logged_extents(log, log_transid); 2504 btrfs_free_logged_extents(log, log_transid);
@@ -2491,6 +2506,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2491 goto out_wake_log_root; 2506 goto out_wake_log_root;
2492 } 2507 }
2493 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2508 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2509 btrfs_wait_marked_extents(log_root_tree,
2510 &log_root_tree->dirty_log_pages,
2511 EXTENT_NEW | EXTENT_DIRTY);
2494 btrfs_wait_logged_extents(log, log_transid); 2512 btrfs_wait_logged_extents(log, log_transid);
2495 2513
2496 btrfs_set_super_log_root(root->fs_info->super_for_commit, 2514 btrfs_set_super_log_root(root->fs_info->super_for_commit,
@@ -4016,8 +4034,7 @@ again:
4016 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) 4034 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
4017 break; 4035 break;
4018 4036
4019 log = btrfs_read_fs_root_no_radix(log_root_tree, 4037 log = btrfs_read_fs_root(log_root_tree, &found_key);
4020 &found_key);
4021 if (IS_ERR(log)) { 4038 if (IS_ERR(log)) {
4022 ret = PTR_ERR(log); 4039 ret = PTR_ERR(log);
4023 btrfs_error(fs_info, ret, 4040 btrfs_error(fs_info, ret,
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index 7b417e20efe2..b0a523b2c60e 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -205,6 +205,10 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
205 u64 new_alloced = ulist->nodes_alloced + 128; 205 u64 new_alloced = ulist->nodes_alloced + 128;
206 struct ulist_node *new_nodes; 206 struct ulist_node *new_nodes;
207 void *old = NULL; 207 void *old = NULL;
208 int i;
209
210 for (i = 0; i < ulist->nnodes; i++)
211 rb_erase(&ulist->nodes[i].rb_node, &ulist->root);
208 212
209 /* 213 /*
210 * if nodes_alloced == ULIST_SIZE no memory has been allocated 214 * if nodes_alloced == ULIST_SIZE no memory has been allocated
@@ -224,6 +228,17 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
224 228
225 ulist->nodes = new_nodes; 229 ulist->nodes = new_nodes;
226 ulist->nodes_alloced = new_alloced; 230 ulist->nodes_alloced = new_alloced;
231
232 /*
233 * krealloc actually uses memcpy, which does not copy rb_node
234 * pointers, so we have to do it ourselves. Otherwise we may
235 * be bitten by crashes.
236 */
237 for (i = 0; i < ulist->nnodes; i++) {
238 ret = ulist_rbtree_insert(ulist, &ulist->nodes[i]);
239 if (ret < 0)
240 return ret;
241 }
227 } 242 }
228 ulist->nodes[ulist->nnodes].val = val; 243 ulist->nodes[ulist->nnodes].val = val;
229 ulist->nodes[ulist->nnodes].aux = aux; 244 ulist->nodes[ulist->nnodes].aux = aux;
diff --git a/fs/btrfs/version.h b/fs/btrfs/version.h
deleted file mode 100644
index 9bf3946d5ef2..000000000000
--- a/fs/btrfs/version.h
+++ /dev/null
@@ -1,4 +0,0 @@
1#ifndef __BTRFS_VERSION_H
2#define __BTRFS_VERSION_H
3#define BTRFS_BUILD_VERSION "Btrfs"
4#endif
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 8bffb9174afb..78b871753cb6 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -982,6 +982,35 @@ out:
982 return ret; 982 return ret;
983} 983}
984 984
985static int contains_pending_extent(struct btrfs_trans_handle *trans,
986 struct btrfs_device *device,
987 u64 *start, u64 len)
988{
989 struct extent_map *em;
990 int ret = 0;
991
992 list_for_each_entry(em, &trans->transaction->pending_chunks, list) {
993 struct map_lookup *map;
994 int i;
995
996 map = (struct map_lookup *)em->bdev;
997 for (i = 0; i < map->num_stripes; i++) {
998 if (map->stripes[i].dev != device)
999 continue;
1000 if (map->stripes[i].physical >= *start + len ||
1001 map->stripes[i].physical + em->orig_block_len <=
1002 *start)
1003 continue;
1004 *start = map->stripes[i].physical +
1005 em->orig_block_len;
1006 ret = 1;
1007 }
1008 }
1009
1010 return ret;
1011}
1012
1013
985/* 1014/*
986 * find_free_dev_extent - find free space in the specified device 1015 * find_free_dev_extent - find free space in the specified device
987 * @device: the device which we search the free space in 1016 * @device: the device which we search the free space in
@@ -1002,7 +1031,8 @@ out:
1002 * But if we don't find suitable free space, it is used to store the size of 1031 * But if we don't find suitable free space, it is used to store the size of
1003 * the max free space. 1032 * the max free space.
1004 */ 1033 */
1005int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, 1034int find_free_dev_extent(struct btrfs_trans_handle *trans,
1035 struct btrfs_device *device, u64 num_bytes,
1006 u64 *start, u64 *len) 1036 u64 *start, u64 *len)
1007{ 1037{
1008 struct btrfs_key key; 1038 struct btrfs_key key;
@@ -1026,21 +1056,22 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
1026 */ 1056 */
1027 search_start = max(root->fs_info->alloc_start, 1024ull * 1024); 1057 search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
1028 1058
1059 path = btrfs_alloc_path();
1060 if (!path)
1061 return -ENOMEM;
1062again:
1029 max_hole_start = search_start; 1063 max_hole_start = search_start;
1030 max_hole_size = 0; 1064 max_hole_size = 0;
1031 hole_size = 0; 1065 hole_size = 0;
1032 1066
1033 if (search_start >= search_end || device->is_tgtdev_for_dev_replace) { 1067 if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
1034 ret = -ENOSPC; 1068 ret = -ENOSPC;
1035 goto error; 1069 goto out;
1036 } 1070 }
1037 1071
1038 path = btrfs_alloc_path();
1039 if (!path) {
1040 ret = -ENOMEM;
1041 goto error;
1042 }
1043 path->reada = 2; 1072 path->reada = 2;
1073 path->search_commit_root = 1;
1074 path->skip_locking = 1;
1044 1075
1045 key.objectid = device->devid; 1076 key.objectid = device->devid;
1046 key.offset = search_start; 1077 key.offset = search_start;
@@ -1081,6 +1112,15 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
1081 if (key.offset > search_start) { 1112 if (key.offset > search_start) {
1082 hole_size = key.offset - search_start; 1113 hole_size = key.offset - search_start;
1083 1114
1115 /*
1116 * Have to check before we set max_hole_start, otherwise
1117 * we could end up sending back this offset anyway.
1118 */
1119 if (contains_pending_extent(trans, device,
1120 &search_start,
1121 hole_size))
1122 hole_size = 0;
1123
1084 if (hole_size > max_hole_size) { 1124 if (hole_size > max_hole_size) {
1085 max_hole_start = search_start; 1125 max_hole_start = search_start;
1086 max_hole_size = hole_size; 1126 max_hole_size = hole_size;
@@ -1124,6 +1164,11 @@ next:
1124 max_hole_size = hole_size; 1164 max_hole_size = hole_size;
1125 } 1165 }
1126 1166
1167 if (contains_pending_extent(trans, device, &search_start, hole_size)) {
1168 btrfs_release_path(path);
1169 goto again;
1170 }
1171
1127 /* See above. */ 1172 /* See above. */
1128 if (hole_size < num_bytes) 1173 if (hole_size < num_bytes)
1129 ret = -ENOSPC; 1174 ret = -ENOSPC;
@@ -1132,7 +1177,6 @@ next:
1132 1177
1133out: 1178out:
1134 btrfs_free_path(path); 1179 btrfs_free_path(path);
1135error:
1136 *start = max_hole_start; 1180 *start = max_hole_start;
1137 if (len) 1181 if (len)
1138 *len = max_hole_size; 1182 *len = max_hole_size;
@@ -1244,47 +1288,22 @@ out:
1244 return ret; 1288 return ret;
1245} 1289}
1246 1290
1247static noinline int find_next_chunk(struct btrfs_root *root, 1291static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
1248 u64 objectid, u64 *offset)
1249{ 1292{
1250 struct btrfs_path *path; 1293 struct extent_map_tree *em_tree;
1251 int ret; 1294 struct extent_map *em;
1252 struct btrfs_key key; 1295 struct rb_node *n;
1253 struct btrfs_chunk *chunk; 1296 u64 ret = 0;
1254 struct btrfs_key found_key;
1255
1256 path = btrfs_alloc_path();
1257 if (!path)
1258 return -ENOMEM;
1259
1260 key.objectid = objectid;
1261 key.offset = (u64)-1;
1262 key.type = BTRFS_CHUNK_ITEM_KEY;
1263
1264 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1265 if (ret < 0)
1266 goto error;
1267
1268 BUG_ON(ret == 0); /* Corruption */
1269 1297
1270 ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY); 1298 em_tree = &fs_info->mapping_tree.map_tree;
1271 if (ret) { 1299 read_lock(&em_tree->lock);
1272 *offset = 0; 1300 n = rb_last(&em_tree->map);
1273 } else { 1301 if (n) {
1274 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 1302 em = rb_entry(n, struct extent_map, rb_node);
1275 path->slots[0]); 1303 ret = em->start + em->len;
1276 if (found_key.objectid != objectid)
1277 *offset = 0;
1278 else {
1279 chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
1280 struct btrfs_chunk);
1281 *offset = found_key.offset +
1282 btrfs_chunk_length(path->nodes[0], chunk);
1283 }
1284 } 1304 }
1285 ret = 0; 1305 read_unlock(&em_tree->lock);
1286error: 1306
1287 btrfs_free_path(path);
1288 return ret; 1307 return ret;
1289} 1308}
1290 1309
@@ -1462,31 +1481,23 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1462 btrfs_dev_replace_unlock(&root->fs_info->dev_replace); 1481 btrfs_dev_replace_unlock(&root->fs_info->dev_replace);
1463 1482
1464 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) { 1483 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
1465 printk(KERN_ERR "btrfs: unable to go below four devices " 1484 ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET;
1466 "on raid10\n");
1467 ret = -EINVAL;
1468 goto out; 1485 goto out;
1469 } 1486 }
1470 1487
1471 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) { 1488 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) {
1472 printk(KERN_ERR "btrfs: unable to go below two " 1489 ret = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET;
1473 "devices on raid1\n");
1474 ret = -EINVAL;
1475 goto out; 1490 goto out;
1476 } 1491 }
1477 1492
1478 if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) && 1493 if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) &&
1479 root->fs_info->fs_devices->rw_devices <= 2) { 1494 root->fs_info->fs_devices->rw_devices <= 2) {
1480 printk(KERN_ERR "btrfs: unable to go below two " 1495 ret = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET;
1481 "devices on raid5\n");
1482 ret = -EINVAL;
1483 goto out; 1496 goto out;
1484 } 1497 }
1485 if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) && 1498 if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) &&
1486 root->fs_info->fs_devices->rw_devices <= 3) { 1499 root->fs_info->fs_devices->rw_devices <= 3) {
1487 printk(KERN_ERR "btrfs: unable to go below three " 1500 ret = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET;
1488 "devices on raid6\n");
1489 ret = -EINVAL;
1490 goto out; 1501 goto out;
1491 } 1502 }
1492 1503
@@ -1512,8 +1523,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1512 bh = NULL; 1523 bh = NULL;
1513 disk_super = NULL; 1524 disk_super = NULL;
1514 if (!device) { 1525 if (!device) {
1515 printk(KERN_ERR "btrfs: no missing devices found to " 1526 ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
1516 "remove\n");
1517 goto out; 1527 goto out;
1518 } 1528 }
1519 } else { 1529 } else {
@@ -1535,15 +1545,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1535 } 1545 }
1536 1546
1537 if (device->is_tgtdev_for_dev_replace) { 1547 if (device->is_tgtdev_for_dev_replace) {
1538 pr_err("btrfs: unable to remove the dev_replace target dev\n"); 1548 ret = BTRFS_ERROR_DEV_TGT_REPLACE;
1539 ret = -EINVAL;
1540 goto error_brelse; 1549 goto error_brelse;
1541 } 1550 }
1542 1551
1543 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { 1552 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
1544 printk(KERN_ERR "btrfs: unable to remove the only writeable " 1553 ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
1545 "device\n");
1546 ret = -EINVAL;
1547 goto error_brelse; 1554 goto error_brelse;
1548 } 1555 }
1549 1556
@@ -3295,10 +3302,7 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
3295 } 3302 }
3296 3303
3297 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); 3304 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
3298 if (IS_ERR(tsk)) 3305 return PTR_RET(tsk);
3299 return PTR_ERR(tsk);
3300
3301 return 0;
3302} 3306}
3303 3307
3304int btrfs_recover_balance(struct btrfs_fs_info *fs_info) 3308int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
@@ -3681,10 +3685,8 @@ static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
3681} 3685}
3682 3686
3683static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 3687static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3684 struct btrfs_root *extent_root, 3688 struct btrfs_root *extent_root, u64 start,
3685 struct map_lookup **map_ret, 3689 u64 type)
3686 u64 *num_bytes_out, u64 *stripe_size_out,
3687 u64 start, u64 type)
3688{ 3690{
3689 struct btrfs_fs_info *info = extent_root->fs_info; 3691 struct btrfs_fs_info *info = extent_root->fs_info;
3690 struct btrfs_fs_devices *fs_devices = info->fs_devices; 3692 struct btrfs_fs_devices *fs_devices = info->fs_devices;
@@ -3791,7 +3793,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3791 if (total_avail == 0) 3793 if (total_avail == 0)
3792 continue; 3794 continue;
3793 3795
3794 ret = find_free_dev_extent(device, 3796 ret = find_free_dev_extent(trans, device,
3795 max_stripe_size * dev_stripes, 3797 max_stripe_size * dev_stripes,
3796 &dev_offset, &max_avail); 3798 &dev_offset, &max_avail);
3797 if (ret && ret != -ENOSPC) 3799 if (ret && ret != -ENOSPC)
@@ -3903,12 +3905,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3903 map->type = type; 3905 map->type = type;
3904 map->sub_stripes = sub_stripes; 3906 map->sub_stripes = sub_stripes;
3905 3907
3906 *map_ret = map;
3907 num_bytes = stripe_size * data_stripes; 3908 num_bytes = stripe_size * data_stripes;
3908 3909
3909 *stripe_size_out = stripe_size;
3910 *num_bytes_out = num_bytes;
3911
3912 trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes); 3910 trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes);
3913 3911
3914 em = alloc_extent_map(); 3912 em = alloc_extent_map();
@@ -3921,38 +3919,26 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3921 em->len = num_bytes; 3919 em->len = num_bytes;
3922 em->block_start = 0; 3920 em->block_start = 0;
3923 em->block_len = em->len; 3921 em->block_len = em->len;
3922 em->orig_block_len = stripe_size;
3924 3923
3925 em_tree = &extent_root->fs_info->mapping_tree.map_tree; 3924 em_tree = &extent_root->fs_info->mapping_tree.map_tree;
3926 write_lock(&em_tree->lock); 3925 write_lock(&em_tree->lock);
3927 ret = add_extent_mapping(em_tree, em, 0); 3926 ret = add_extent_mapping(em_tree, em, 0);
3927 if (!ret) {
3928 list_add_tail(&em->list, &trans->transaction->pending_chunks);
3929 atomic_inc(&em->refs);
3930 }
3928 write_unlock(&em_tree->lock); 3931 write_unlock(&em_tree->lock);
3929 if (ret) { 3932 if (ret) {
3930 free_extent_map(em); 3933 free_extent_map(em);
3931 goto error; 3934 goto error;
3932 } 3935 }
3933 3936
3934 for (i = 0; i < map->num_stripes; ++i) {
3935 struct btrfs_device *device;
3936 u64 dev_offset;
3937
3938 device = map->stripes[i].dev;
3939 dev_offset = map->stripes[i].physical;
3940
3941 ret = btrfs_alloc_dev_extent(trans, device,
3942 info->chunk_root->root_key.objectid,
3943 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
3944 start, dev_offset, stripe_size);
3945 if (ret)
3946 goto error_dev_extent;
3947 }
3948
3949 ret = btrfs_make_block_group(trans, extent_root, 0, type, 3937 ret = btrfs_make_block_group(trans, extent_root, 0, type,
3950 BTRFS_FIRST_CHUNK_TREE_OBJECTID, 3938 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
3951 start, num_bytes); 3939 start, num_bytes);
3952 if (ret) { 3940 if (ret)
3953 i = map->num_stripes - 1; 3941 goto error_del_extent;
3954 goto error_dev_extent;
3955 }
3956 3942
3957 free_extent_map(em); 3943 free_extent_map(em);
3958 check_raid56_incompat_flag(extent_root->fs_info, type); 3944 check_raid56_incompat_flag(extent_root->fs_info, type);
@@ -3960,18 +3946,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3960 kfree(devices_info); 3946 kfree(devices_info);
3961 return 0; 3947 return 0;
3962 3948
3963error_dev_extent: 3949error_del_extent:
3964 for (; i >= 0; i--) {
3965 struct btrfs_device *device;
3966 int err;
3967
3968 device = map->stripes[i].dev;
3969 err = btrfs_free_dev_extent(trans, device, start);
3970 if (err) {
3971 btrfs_abort_transaction(trans, extent_root, err);
3972 break;
3973 }
3974 }
3975 write_lock(&em_tree->lock); 3950 write_lock(&em_tree->lock);
3976 remove_extent_mapping(em_tree, em); 3951 remove_extent_mapping(em_tree, em);
3977 write_unlock(&em_tree->lock); 3952 write_unlock(&em_tree->lock);
@@ -3986,33 +3961,68 @@ error:
3986 return ret; 3961 return ret;
3987} 3962}
3988 3963
3989static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, 3964int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
3990 struct btrfs_root *extent_root, 3965 struct btrfs_root *extent_root,
3991 struct map_lookup *map, u64 chunk_offset, 3966 u64 chunk_offset, u64 chunk_size)
3992 u64 chunk_size, u64 stripe_size)
3993{ 3967{
3994 u64 dev_offset;
3995 struct btrfs_key key; 3968 struct btrfs_key key;
3996 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; 3969 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
3997 struct btrfs_device *device; 3970 struct btrfs_device *device;
3998 struct btrfs_chunk *chunk; 3971 struct btrfs_chunk *chunk;
3999 struct btrfs_stripe *stripe; 3972 struct btrfs_stripe *stripe;
4000 size_t item_size = btrfs_chunk_item_size(map->num_stripes); 3973 struct extent_map_tree *em_tree;
4001 int index = 0; 3974 struct extent_map *em;
3975 struct map_lookup *map;
3976 size_t item_size;
3977 u64 dev_offset;
3978 u64 stripe_size;
3979 int i = 0;
4002 int ret; 3980 int ret;
4003 3981
3982 em_tree = &extent_root->fs_info->mapping_tree.map_tree;
3983 read_lock(&em_tree->lock);
3984 em = lookup_extent_mapping(em_tree, chunk_offset, chunk_size);
3985 read_unlock(&em_tree->lock);
3986
3987 if (!em) {
3988 btrfs_crit(extent_root->fs_info, "unable to find logical "
3989 "%Lu len %Lu", chunk_offset, chunk_size);
3990 return -EINVAL;
3991 }
3992
3993 if (em->start != chunk_offset || em->len != chunk_size) {
3994 btrfs_crit(extent_root->fs_info, "found a bad mapping, wanted"
3995 " %Lu-%Lu, found %Lu-%Lu\n", chunk_offset,
3996 chunk_size, em->start, em->len);
3997 free_extent_map(em);
3998 return -EINVAL;
3999 }
4000
4001 map = (struct map_lookup *)em->bdev;
4002 item_size = btrfs_chunk_item_size(map->num_stripes);
4003 stripe_size = em->orig_block_len;
4004
4004 chunk = kzalloc(item_size, GFP_NOFS); 4005 chunk = kzalloc(item_size, GFP_NOFS);
4005 if (!chunk) 4006 if (!chunk) {
4006 return -ENOMEM; 4007 ret = -ENOMEM;
4008 goto out;
4009 }
4010
4011 for (i = 0; i < map->num_stripes; i++) {
4012 device = map->stripes[i].dev;
4013 dev_offset = map->stripes[i].physical;
4007 4014
4008 index = 0;
4009 while (index < map->num_stripes) {
4010 device = map->stripes[index].dev;
4011 device->bytes_used += stripe_size; 4015 device->bytes_used += stripe_size;
4012 ret = btrfs_update_device(trans, device); 4016 ret = btrfs_update_device(trans, device);
4013 if (ret) 4017 if (ret)
4014 goto out_free; 4018 goto out;
4015 index++; 4019 ret = btrfs_alloc_dev_extent(trans, device,
4020 chunk_root->root_key.objectid,
4021 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
4022 chunk_offset, dev_offset,
4023 stripe_size);
4024 if (ret)
4025 goto out;
4016 } 4026 }
4017 4027
4018 spin_lock(&extent_root->fs_info->free_chunk_lock); 4028 spin_lock(&extent_root->fs_info->free_chunk_lock);
@@ -4020,17 +4030,15 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
4020 map->num_stripes); 4030 map->num_stripes);
4021 spin_unlock(&extent_root->fs_info->free_chunk_lock); 4031 spin_unlock(&extent_root->fs_info->free_chunk_lock);
4022 4032
4023 index = 0;
4024 stripe = &chunk->stripe; 4033 stripe = &chunk->stripe;
4025 while (index < map->num_stripes) { 4034 for (i = 0; i < map->num_stripes; i++) {
4026 device = map->stripes[index].dev; 4035 device = map->stripes[i].dev;
4027 dev_offset = map->stripes[index].physical; 4036 dev_offset = map->stripes[i].physical;
4028 4037
4029 btrfs_set_stack_stripe_devid(stripe, device->devid); 4038 btrfs_set_stack_stripe_devid(stripe, device->devid);
4030 btrfs_set_stack_stripe_offset(stripe, dev_offset); 4039 btrfs_set_stack_stripe_offset(stripe, dev_offset);
4031 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); 4040 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
4032 stripe++; 4041 stripe++;
4033 index++;
4034 } 4042 }
4035 4043
4036 btrfs_set_stack_chunk_length(chunk, chunk_size); 4044 btrfs_set_stack_chunk_length(chunk, chunk_size);
@@ -4048,7 +4056,6 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
4048 key.offset = chunk_offset; 4056 key.offset = chunk_offset;
4049 4057
4050 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); 4058 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
4051
4052 if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 4059 if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
4053 /* 4060 /*
4054 * TODO: Cleanup of inserted chunk root in case of 4061 * TODO: Cleanup of inserted chunk root in case of
@@ -4058,8 +4065,9 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
4058 item_size); 4065 item_size);
4059 } 4066 }
4060 4067
4061out_free: 4068out:
4062 kfree(chunk); 4069 kfree(chunk);
4070 free_extent_map(em);
4063 return ret; 4071 return ret;
4064} 4072}
4065 4073
@@ -4074,27 +4082,9 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
4074 struct btrfs_root *extent_root, u64 type) 4082 struct btrfs_root *extent_root, u64 type)
4075{ 4083{
4076 u64 chunk_offset; 4084 u64 chunk_offset;
4077 u64 chunk_size;
4078 u64 stripe_size;
4079 struct map_lookup *map;
4080 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
4081 int ret;
4082
4083 ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
4084 &chunk_offset);
4085 if (ret)
4086 return ret;
4087 4085
4088 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, 4086 chunk_offset = find_next_chunk(extent_root->fs_info);
4089 &stripe_size, chunk_offset, type); 4087 return __btrfs_alloc_chunk(trans, extent_root, chunk_offset, type);
4090 if (ret)
4091 return ret;
4092
4093 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
4094 chunk_size, stripe_size);
4095 if (ret)
4096 return ret;
4097 return 0;
4098} 4088}
4099 4089
4100static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, 4090static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
@@ -4103,66 +4093,31 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
4103{ 4093{
4104 u64 chunk_offset; 4094 u64 chunk_offset;
4105 u64 sys_chunk_offset; 4095 u64 sys_chunk_offset;
4106 u64 chunk_size;
4107 u64 sys_chunk_size;
4108 u64 stripe_size;
4109 u64 sys_stripe_size;
4110 u64 alloc_profile; 4096 u64 alloc_profile;
4111 struct map_lookup *map;
4112 struct map_lookup *sys_map;
4113 struct btrfs_fs_info *fs_info = root->fs_info; 4097 struct btrfs_fs_info *fs_info = root->fs_info;
4114 struct btrfs_root *extent_root = fs_info->extent_root; 4098 struct btrfs_root *extent_root = fs_info->extent_root;
4115 int ret; 4099 int ret;
4116 4100
4117 ret = find_next_chunk(fs_info->chunk_root, 4101 chunk_offset = find_next_chunk(fs_info);
4118 BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
4119 if (ret)
4120 return ret;
4121
4122 alloc_profile = btrfs_get_alloc_profile(extent_root, 0); 4102 alloc_profile = btrfs_get_alloc_profile(extent_root, 0);
4123 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, 4103 ret = __btrfs_alloc_chunk(trans, extent_root, chunk_offset,
4124 &stripe_size, chunk_offset, alloc_profile); 4104 alloc_profile);
4125 if (ret) 4105 if (ret)
4126 return ret; 4106 return ret;
4127 4107
4128 sys_chunk_offset = chunk_offset + chunk_size; 4108 sys_chunk_offset = find_next_chunk(root->fs_info);
4129
4130 alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0); 4109 alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0);
4131 ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, 4110 ret = __btrfs_alloc_chunk(trans, extent_root, sys_chunk_offset,
4132 &sys_chunk_size, &sys_stripe_size, 4111 alloc_profile);
4133 sys_chunk_offset, alloc_profile);
4134 if (ret) { 4112 if (ret) {
4135 btrfs_abort_transaction(trans, root, ret); 4113 btrfs_abort_transaction(trans, root, ret);
4136 goto out; 4114 goto out;
4137 } 4115 }
4138 4116
4139 ret = btrfs_add_device(trans, fs_info->chunk_root, device); 4117 ret = btrfs_add_device(trans, fs_info->chunk_root, device);
4140 if (ret) {
4141 btrfs_abort_transaction(trans, root, ret);
4142 goto out;
4143 }
4144
4145 /*
4146 * Modifying chunk tree needs allocating new blocks from both
4147 * system block group and metadata block group. So we only can
4148 * do operations require modifying the chunk tree after both
4149 * block groups were created.
4150 */
4151 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
4152 chunk_size, stripe_size);
4153 if (ret) {
4154 btrfs_abort_transaction(trans, root, ret);
4155 goto out;
4156 }
4157
4158 ret = __finish_chunk_alloc(trans, extent_root, sys_map,
4159 sys_chunk_offset, sys_chunk_size,
4160 sys_stripe_size);
4161 if (ret) 4118 if (ret)
4162 btrfs_abort_transaction(trans, root, ret); 4119 btrfs_abort_transaction(trans, root, ret);
4163
4164out: 4120out:
4165
4166 return ret; 4121 return ret;
4167} 4122}
4168 4123
@@ -4435,9 +4390,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4435 map = (struct map_lookup *)em->bdev; 4390 map = (struct map_lookup *)em->bdev;
4436 offset = logical - em->start; 4391 offset = logical - em->start;
4437 4392
4438 if (mirror_num > map->num_stripes)
4439 mirror_num = 0;
4440
4441 stripe_len = map->stripe_len; 4393 stripe_len = map->stripe_len;
4442 stripe_nr = offset; 4394 stripe_nr = offset;
4443 /* 4395 /*
@@ -5367,7 +5319,6 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
5367 return NULL; 5319 return NULL;
5368 list_add(&device->dev_list, 5320 list_add(&device->dev_list,
5369 &fs_devices->devices); 5321 &fs_devices->devices);
5370 device->dev_root = root->fs_info->dev_root;
5371 device->devid = devid; 5322 device->devid = devid;
5372 device->work.func = pending_bios_fn; 5323 device->work.func = pending_bios_fn;
5373 device->fs_devices = fs_devices; 5324 device->fs_devices = fs_devices;
@@ -5593,7 +5544,6 @@ static int read_one_dev(struct btrfs_root *root,
5593 } 5544 }
5594 5545
5595 fill_device_from_item(leaf, dev_item, device); 5546 fill_device_from_item(leaf, dev_item, device);
5596 device->dev_root = root->fs_info->dev_root;
5597 device->in_fs_metadata = 1; 5547 device->in_fs_metadata = 1;
5598 if (device->writeable && !device->is_tgtdev_for_dev_replace) { 5548 if (device->writeable && !device->is_tgtdev_for_dev_replace) {
5599 device->fs_devices->total_rw_bytes += device->total_bytes; 5549 device->fs_devices->total_rw_bytes += device->total_bytes;
@@ -5751,6 +5701,17 @@ error:
5751 return ret; 5701 return ret;
5752} 5702}
5753 5703
5704void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
5705{
5706 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
5707 struct btrfs_device *device;
5708
5709 mutex_lock(&fs_devices->device_list_mutex);
5710 list_for_each_entry(device, &fs_devices->devices, dev_list)
5711 device->dev_root = fs_info->dev_root;
5712 mutex_unlock(&fs_devices->device_list_mutex);
5713}
5714
5754static void __btrfs_reset_dev_stats(struct btrfs_device *dev) 5715static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
5755{ 5716{
5756 int i; 5717 int i;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index f6247e2a47f7..86705583480d 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -316,11 +316,13 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info);
316int btrfs_pause_balance(struct btrfs_fs_info *fs_info); 316int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
317int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); 317int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
318int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); 318int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
319int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, 319int find_free_dev_extent(struct btrfs_trans_handle *trans,
320 struct btrfs_device *device, u64 num_bytes,
320 u64 *start, u64 *max_avail); 321 u64 *start, u64 *max_avail);
321void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index); 322void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index);
322int btrfs_get_dev_stats(struct btrfs_root *root, 323int btrfs_get_dev_stats(struct btrfs_root *root,
323 struct btrfs_ioctl_get_dev_stats *stats); 324 struct btrfs_ioctl_get_dev_stats *stats);
325void btrfs_init_devices_late(struct btrfs_fs_info *fs_info);
324int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); 326int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
325int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, 327int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
326 struct btrfs_fs_info *fs_info); 328 struct btrfs_fs_info *fs_info);
@@ -336,6 +338,9 @@ int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
336unsigned long btrfs_full_stripe_len(struct btrfs_root *root, 338unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
337 struct btrfs_mapping_tree *map_tree, 339 struct btrfs_mapping_tree *map_tree,
338 u64 logical); 340 u64 logical);
341int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
342 struct btrfs_root *extent_root,
343 u64 chunk_offset, u64 chunk_size);
339static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, 344static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
340 int index) 345 int index)
341{ 346{
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 38b5c1bc6776..5318a3b704f6 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -439,13 +439,12 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
439 struct ceph_inode_info *ci; 439 struct ceph_inode_info *ci;
440 struct ceph_fs_client *fsc; 440 struct ceph_fs_client *fsc;
441 struct ceph_osd_client *osdc; 441 struct ceph_osd_client *osdc;
442 loff_t page_off = page_offset(page);
443 int len = PAGE_CACHE_SIZE;
444 loff_t i_size;
445 int err = 0;
446 struct ceph_snap_context *snapc, *oldest; 442 struct ceph_snap_context *snapc, *oldest;
447 u64 snap_size = 0; 443 loff_t page_off = page_offset(page);
448 long writeback_stat; 444 long writeback_stat;
445 u64 truncate_size, snap_size = 0;
446 u32 truncate_seq;
447 int err = 0, len = PAGE_CACHE_SIZE;
449 448
450 dout("writepage %p idx %lu\n", page, page->index); 449 dout("writepage %p idx %lu\n", page, page->index);
451 450
@@ -475,13 +474,20 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
475 } 474 }
476 ceph_put_snap_context(oldest); 475 ceph_put_snap_context(oldest);
477 476
477 spin_lock(&ci->i_ceph_lock);
478 truncate_seq = ci->i_truncate_seq;
479 truncate_size = ci->i_truncate_size;
480 if (!snap_size)
481 snap_size = i_size_read(inode);
482 spin_unlock(&ci->i_ceph_lock);
483
478 /* is this a partial page at end of file? */ 484 /* is this a partial page at end of file? */
479 if (snap_size) 485 if (page_off >= snap_size) {
480 i_size = snap_size; 486 dout("%p page eof %llu\n", page, snap_size);
481 else 487 goto out;
482 i_size = i_size_read(inode); 488 }
483 if (i_size < page_off + len) 489 if (snap_size < page_off + len)
484 len = i_size - page_off; 490 len = snap_size - page_off;
485 491
486 dout("writepage %p page %p index %lu on %llu~%u snapc %p\n", 492 dout("writepage %p page %p index %lu on %llu~%u snapc %p\n",
487 inode, page, page->index, page_off, len, snapc); 493 inode, page, page->index, page_off, len, snapc);
@@ -495,7 +501,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
495 err = ceph_osdc_writepages(osdc, ceph_vino(inode), 501 err = ceph_osdc_writepages(osdc, ceph_vino(inode),
496 &ci->i_layout, snapc, 502 &ci->i_layout, snapc,
497 page_off, len, 503 page_off, len,
498 ci->i_truncate_seq, ci->i_truncate_size, 504 truncate_seq, truncate_size,
499 &inode->i_mtime, &page, 1); 505 &inode->i_mtime, &page, 1);
500 if (err < 0) { 506 if (err < 0) {
501 dout("writepage setting page/mapping error %d %p\n", err, page); 507 dout("writepage setting page/mapping error %d %p\n", err, page);
@@ -632,25 +638,6 @@ static void writepages_finish(struct ceph_osd_request *req,
632 ceph_osdc_put_request(req); 638 ceph_osdc_put_request(req);
633} 639}
634 640
635static struct ceph_osd_request *
636ceph_writepages_osd_request(struct inode *inode, u64 offset, u64 *len,
637 struct ceph_snap_context *snapc, int num_ops)
638{
639 struct ceph_fs_client *fsc;
640 struct ceph_inode_info *ci;
641 struct ceph_vino vino;
642
643 fsc = ceph_inode_to_client(inode);
644 ci = ceph_inode(inode);
645 vino = ceph_vino(inode);
646 /* BUG_ON(vino.snap != CEPH_NOSNAP); */
647
648 return ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
649 vino, offset, len, num_ops, CEPH_OSD_OP_WRITE,
650 CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK,
651 snapc, ci->i_truncate_seq, ci->i_truncate_size, true);
652}
653
654/* 641/*
655 * initiate async writeback 642 * initiate async writeback
656 */ 643 */
@@ -659,7 +646,8 @@ static int ceph_writepages_start(struct address_space *mapping,
659{ 646{
660 struct inode *inode = mapping->host; 647 struct inode *inode = mapping->host;
661 struct ceph_inode_info *ci = ceph_inode(inode); 648 struct ceph_inode_info *ci = ceph_inode(inode);
662 struct ceph_fs_client *fsc; 649 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
650 struct ceph_vino vino = ceph_vino(inode);
663 pgoff_t index, start, end; 651 pgoff_t index, start, end;
664 int range_whole = 0; 652 int range_whole = 0;
665 int should_loop = 1; 653 int should_loop = 1;
@@ -671,22 +659,22 @@ static int ceph_writepages_start(struct address_space *mapping,
671 unsigned wsize = 1 << inode->i_blkbits; 659 unsigned wsize = 1 << inode->i_blkbits;
672 struct ceph_osd_request *req = NULL; 660 struct ceph_osd_request *req = NULL;
673 int do_sync; 661 int do_sync;
674 u64 snap_size; 662 u64 truncate_size, snap_size;
663 u32 truncate_seq;
675 664
676 /* 665 /*
677 * Include a 'sync' in the OSD request if this is a data 666 * Include a 'sync' in the OSD request if this is a data
678 * integrity write (e.g., O_SYNC write or fsync()), or if our 667 * integrity write (e.g., O_SYNC write or fsync()), or if our
679 * cap is being revoked. 668 * cap is being revoked.
680 */ 669 */
681 do_sync = wbc->sync_mode == WB_SYNC_ALL; 670 if ((wbc->sync_mode == WB_SYNC_ALL) ||
682 if (ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER)) 671 ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER))
683 do_sync = 1; 672 do_sync = 1;
684 dout("writepages_start %p dosync=%d (mode=%s)\n", 673 dout("writepages_start %p dosync=%d (mode=%s)\n",
685 inode, do_sync, 674 inode, do_sync,
686 wbc->sync_mode == WB_SYNC_NONE ? "NONE" : 675 wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
687 (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); 676 (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
688 677
689 fsc = ceph_inode_to_client(inode);
690 if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) { 678 if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {
691 pr_warning("writepage_start %p on forced umount\n", inode); 679 pr_warning("writepage_start %p on forced umount\n", inode);
692 return -EIO; /* we're in a forced umount, don't write! */ 680 return -EIO; /* we're in a forced umount, don't write! */
@@ -729,6 +717,14 @@ retry:
729 snap_size = i_size_read(inode); 717 snap_size = i_size_read(inode);
730 dout(" oldest snapc is %p seq %lld (%d snaps)\n", 718 dout(" oldest snapc is %p seq %lld (%d snaps)\n",
731 snapc, snapc->seq, snapc->num_snaps); 719 snapc, snapc->seq, snapc->num_snaps);
720
721 spin_lock(&ci->i_ceph_lock);
722 truncate_seq = ci->i_truncate_seq;
723 truncate_size = ci->i_truncate_size;
724 if (!snap_size)
725 snap_size = i_size_read(inode);
726 spin_unlock(&ci->i_ceph_lock);
727
732 if (last_snapc && snapc != last_snapc) { 728 if (last_snapc && snapc != last_snapc) {
733 /* if we switched to a newer snapc, restart our scan at the 729 /* if we switched to a newer snapc, restart our scan at the
734 * start of the original file range. */ 730 * start of the original file range. */
@@ -740,7 +736,6 @@ retry:
740 736
741 while (!done && index <= end) { 737 while (!done && index <= end) {
742 int num_ops = do_sync ? 2 : 1; 738 int num_ops = do_sync ? 2 : 1;
743 struct ceph_vino vino;
744 unsigned i; 739 unsigned i;
745 int first; 740 int first;
746 pgoff_t next; 741 pgoff_t next;
@@ -834,17 +829,18 @@ get_more_pages:
834 * that it will use. 829 * that it will use.
835 */ 830 */
836 if (locked_pages == 0) { 831 if (locked_pages == 0) {
837 size_t size;
838
839 BUG_ON(pages); 832 BUG_ON(pages);
840
841 /* prepare async write request */ 833 /* prepare async write request */
842 offset = (u64)page_offset(page); 834 offset = (u64)page_offset(page);
843 len = wsize; 835 len = wsize;
844 req = ceph_writepages_osd_request(inode, 836 req = ceph_osdc_new_request(&fsc->client->osdc,
845 offset, &len, snapc, 837 &ci->i_layout, vino,
846 num_ops); 838 offset, &len, num_ops,
847 839 CEPH_OSD_OP_WRITE,
840 CEPH_OSD_FLAG_WRITE |
841 CEPH_OSD_FLAG_ONDISK,
842 snapc, truncate_seq,
843 truncate_size, true);
848 if (IS_ERR(req)) { 844 if (IS_ERR(req)) {
849 rc = PTR_ERR(req); 845 rc = PTR_ERR(req);
850 unlock_page(page); 846 unlock_page(page);
@@ -855,8 +851,8 @@ get_more_pages:
855 req->r_inode = inode; 851 req->r_inode = inode;
856 852
857 max_pages = calc_pages_for(0, (u64)len); 853 max_pages = calc_pages_for(0, (u64)len);
858 size = max_pages * sizeof (*pages); 854 pages = kmalloc(max_pages * sizeof (*pages),
859 pages = kmalloc(size, GFP_NOFS); 855 GFP_NOFS);
860 if (!pages) { 856 if (!pages) {
861 pool = fsc->wb_pagevec_pool; 857 pool = fsc->wb_pagevec_pool;
862 pages = mempool_alloc(pool, GFP_NOFS); 858 pages = mempool_alloc(pool, GFP_NOFS);
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index da0f9b8a3bcb..25442b40c25a 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -147,7 +147,7 @@ void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta)
147 spin_unlock(&mdsc->caps_list_lock); 147 spin_unlock(&mdsc->caps_list_lock);
148} 148}
149 149
150int ceph_reserve_caps(struct ceph_mds_client *mdsc, 150void ceph_reserve_caps(struct ceph_mds_client *mdsc,
151 struct ceph_cap_reservation *ctx, int need) 151 struct ceph_cap_reservation *ctx, int need)
152{ 152{
153 int i; 153 int i;
@@ -155,7 +155,6 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,
155 int have; 155 int have;
156 int alloc = 0; 156 int alloc = 0;
157 LIST_HEAD(newcaps); 157 LIST_HEAD(newcaps);
158 int ret = 0;
159 158
160 dout("reserve caps ctx=%p need=%d\n", ctx, need); 159 dout("reserve caps ctx=%p need=%d\n", ctx, need);
161 160
@@ -174,14 +173,15 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,
174 173
175 for (i = have; i < need; i++) { 174 for (i = have; i < need; i++) {
176 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); 175 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
177 if (!cap) { 176 if (!cap)
178 ret = -ENOMEM; 177 break;
179 goto out_alloc_count;
180 }
181 list_add(&cap->caps_item, &newcaps); 178 list_add(&cap->caps_item, &newcaps);
182 alloc++; 179 alloc++;
183 } 180 }
184 BUG_ON(have + alloc != need); 181 /* we didn't manage to reserve as much as we needed */
182 if (have + alloc != need)
183 pr_warn("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
184 ctx, need, have + alloc);
185 185
186 spin_lock(&mdsc->caps_list_lock); 186 spin_lock(&mdsc->caps_list_lock);
187 mdsc->caps_total_count += alloc; 187 mdsc->caps_total_count += alloc;
@@ -197,13 +197,6 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,
197 dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n", 197 dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
198 ctx, mdsc->caps_total_count, mdsc->caps_use_count, 198 ctx, mdsc->caps_total_count, mdsc->caps_use_count,
199 mdsc->caps_reserve_count, mdsc->caps_avail_count); 199 mdsc->caps_reserve_count, mdsc->caps_avail_count);
200 return 0;
201
202out_alloc_count:
203 /* we didn't manage to reserve as much as we needed */
204 pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
205 ctx, need, have);
206 return ret;
207} 200}
208 201
209int ceph_unreserve_caps(struct ceph_mds_client *mdsc, 202int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
@@ -612,9 +605,11 @@ retry:
612 __cap_delay_requeue(mdsc, ci); 605 __cap_delay_requeue(mdsc, ci);
613 } 606 }
614 607
615 if (flags & CEPH_CAP_FLAG_AUTH) 608 if (flags & CEPH_CAP_FLAG_AUTH) {
616 ci->i_auth_cap = cap; 609 if (ci->i_auth_cap == NULL ||
617 else if (ci->i_auth_cap == cap) { 610 ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0)
611 ci->i_auth_cap = cap;
612 } else if (ci->i_auth_cap == cap) {
618 ci->i_auth_cap = NULL; 613 ci->i_auth_cap = NULL;
619 spin_lock(&mdsc->cap_dirty_lock); 614 spin_lock(&mdsc->cap_dirty_lock);
620 if (!list_empty(&ci->i_dirty_item)) { 615 if (!list_empty(&ci->i_dirty_item)) {
@@ -695,6 +690,15 @@ int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
695 if (implemented) 690 if (implemented)
696 *implemented |= cap->implemented; 691 *implemented |= cap->implemented;
697 } 692 }
693 /*
694 * exclude caps issued by non-auth MDS, but are been revoking
695 * by the auth MDS. The non-auth MDS should be revoking/exporting
696 * these caps, but the message is delayed.
697 */
698 if (ci->i_auth_cap) {
699 cap = ci->i_auth_cap;
700 have &= ~cap->implemented | cap->issued;
701 }
698 return have; 702 return have;
699} 703}
700 704
@@ -802,22 +806,28 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
802/* 806/*
803 * Return true if mask caps are currently being revoked by an MDS. 807 * Return true if mask caps are currently being revoked by an MDS.
804 */ 808 */
805int ceph_caps_revoking(struct ceph_inode_info *ci, int mask) 809int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
810 struct ceph_cap *ocap, int mask)
806{ 811{
807 struct inode *inode = &ci->vfs_inode;
808 struct ceph_cap *cap; 812 struct ceph_cap *cap;
809 struct rb_node *p; 813 struct rb_node *p;
810 int ret = 0;
811 814
812 spin_lock(&ci->i_ceph_lock);
813 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { 815 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
814 cap = rb_entry(p, struct ceph_cap, ci_node); 816 cap = rb_entry(p, struct ceph_cap, ci_node);
815 if (__cap_is_valid(cap) && 817 if (cap != ocap && __cap_is_valid(cap) &&
816 (cap->implemented & ~cap->issued & mask)) { 818 (cap->implemented & ~cap->issued & mask))
817 ret = 1; 819 return 1;
818 break;
819 }
820 } 820 }
821 return 0;
822}
823
824int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
825{
826 struct inode *inode = &ci->vfs_inode;
827 int ret;
828
829 spin_lock(&ci->i_ceph_lock);
830 ret = __ceph_caps_revoking_other(ci, NULL, mask);
821 spin_unlock(&ci->i_ceph_lock); 831 spin_unlock(&ci->i_ceph_lock);
822 dout("ceph_caps_revoking %p %s = %d\n", inode, 832 dout("ceph_caps_revoking %p %s = %d\n", inode,
823 ceph_cap_string(mask), ret); 833 ceph_cap_string(mask), ret);
@@ -1980,8 +1990,15 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
1980 cap = ci->i_auth_cap; 1990 cap = ci->i_auth_cap;
1981 dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode, 1991 dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode,
1982 ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq); 1992 ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq);
1993
1983 __ceph_flush_snaps(ci, &session, 1); 1994 __ceph_flush_snaps(ci, &session, 1);
1995
1984 if (ci->i_flushing_caps) { 1996 if (ci->i_flushing_caps) {
1997 spin_lock(&mdsc->cap_dirty_lock);
1998 list_move_tail(&ci->i_flushing_item,
1999 &cap->session->s_cap_flushing);
2000 spin_unlock(&mdsc->cap_dirty_lock);
2001
1985 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, 2002 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
1986 __ceph_caps_used(ci), 2003 __ceph_caps_used(ci),
1987 __ceph_caps_wanted(ci), 2004 __ceph_caps_wanted(ci),
@@ -2055,7 +2072,11 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
2055 /* finish pending truncate */ 2072 /* finish pending truncate */
2056 while (ci->i_truncate_pending) { 2073 while (ci->i_truncate_pending) {
2057 spin_unlock(&ci->i_ceph_lock); 2074 spin_unlock(&ci->i_ceph_lock);
2058 __ceph_do_pending_vmtruncate(inode, !(need & CEPH_CAP_FILE_WR)); 2075 if (!(need & CEPH_CAP_FILE_WR))
2076 mutex_lock(&inode->i_mutex);
2077 __ceph_do_pending_vmtruncate(inode);
2078 if (!(need & CEPH_CAP_FILE_WR))
2079 mutex_unlock(&inode->i_mutex);
2059 spin_lock(&ci->i_ceph_lock); 2080 spin_lock(&ci->i_ceph_lock);
2060 } 2081 }
2061 2082
@@ -2473,6 +2494,11 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2473 } else { 2494 } else {
2474 dout("grant: %s -> %s\n", ceph_cap_string(cap->issued), 2495 dout("grant: %s -> %s\n", ceph_cap_string(cap->issued),
2475 ceph_cap_string(newcaps)); 2496 ceph_cap_string(newcaps));
2497 /* non-auth MDS is revoking the newly grant caps ? */
2498 if (cap == ci->i_auth_cap &&
2499 __ceph_caps_revoking_other(ci, cap, newcaps))
2500 check_caps = 2;
2501
2476 cap->issued = newcaps; 2502 cap->issued = newcaps;
2477 cap->implemented |= newcaps; /* add bits only, to 2503 cap->implemented |= newcaps; /* add bits only, to
2478 * avoid stepping on a 2504 * avoid stepping on a
@@ -3042,21 +3068,19 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
3042 (cap->issued & unless) == 0)) { 3068 (cap->issued & unless) == 0)) {
3043 if ((cap->issued & drop) && 3069 if ((cap->issued & drop) &&
3044 (cap->issued & unless) == 0) { 3070 (cap->issued & unless) == 0) {
3045 dout("encode_inode_release %p cap %p %s -> " 3071 int wanted = __ceph_caps_wanted(ci);
3046 "%s\n", inode, cap, 3072 if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0)
3073 wanted |= cap->mds_wanted;
3074 dout("encode_inode_release %p cap %p "
3075 "%s -> %s, wanted %s -> %s\n", inode, cap,
3047 ceph_cap_string(cap->issued), 3076 ceph_cap_string(cap->issued),
3048 ceph_cap_string(cap->issued & ~drop)); 3077 ceph_cap_string(cap->issued & ~drop),
3078 ceph_cap_string(cap->mds_wanted),
3079 ceph_cap_string(wanted));
3080
3049 cap->issued &= ~drop; 3081 cap->issued &= ~drop;
3050 cap->implemented &= ~drop; 3082 cap->implemented &= ~drop;
3051 if (ci->i_ceph_flags & CEPH_I_NODELAY) { 3083 cap->mds_wanted = wanted;
3052 int wanted = __ceph_caps_wanted(ci);
3053 dout(" wanted %s -> %s (act %s)\n",
3054 ceph_cap_string(cap->mds_wanted),
3055 ceph_cap_string(cap->mds_wanted &
3056 ~wanted),
3057 ceph_cap_string(wanted));
3058 cap->mds_wanted &= wanted;
3059 }
3060 } else { 3084 } else {
3061 dout("encode_inode_release %p cap %p %s" 3085 dout("encode_inode_release %p cap %p %s"
3062 " (force)\n", inode, cap, 3086 " (force)\n", inode, cap,
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 16c989d3e23c..2ddf061c1c4a 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -716,7 +716,6 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
716 if (ceph_snap(inode) != CEPH_NOSNAP) 716 if (ceph_snap(inode) != CEPH_NOSNAP)
717 return -EROFS; 717 return -EROFS;
718 718
719 sb_start_write(inode->i_sb);
720 mutex_lock(&inode->i_mutex); 719 mutex_lock(&inode->i_mutex);
721 hold_mutex = true; 720 hold_mutex = true;
722 721
@@ -809,7 +808,6 @@ retry_snap:
809out: 808out:
810 if (hold_mutex) 809 if (hold_mutex)
811 mutex_unlock(&inode->i_mutex); 810 mutex_unlock(&inode->i_mutex);
812 sb_end_write(inode->i_sb);
813 current->backing_dev_info = NULL; 811 current->backing_dev_info = NULL;
814 812
815 return written ? written : err; 813 return written ? written : err;
@@ -824,7 +822,7 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
824 int ret; 822 int ret;
825 823
826 mutex_lock(&inode->i_mutex); 824 mutex_lock(&inode->i_mutex);
827 __ceph_do_pending_vmtruncate(inode, false); 825 __ceph_do_pending_vmtruncate(inode);
828 826
829 if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) { 827 if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) {
830 ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); 828 ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index be0f7e20d62e..f3a2abf28a77 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -903,8 +903,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
903 } else if (realdn) { 903 } else if (realdn) {
904 dout("dn %p (%d) spliced with %p (%d) " 904 dout("dn %p (%d) spliced with %p (%d) "
905 "inode %p ino %llx.%llx\n", 905 "inode %p ino %llx.%llx\n",
906 dn, dn->d_count, 906 dn, d_count(dn),
907 realdn, realdn->d_count, 907 realdn, d_count(realdn),
908 realdn->d_inode, ceph_vinop(realdn->d_inode)); 908 realdn->d_inode, ceph_vinop(realdn->d_inode));
909 dput(dn); 909 dput(dn);
910 dn = realdn; 910 dn = realdn;
@@ -1465,7 +1465,9 @@ static void ceph_vmtruncate_work(struct work_struct *work)
1465 struct inode *inode = &ci->vfs_inode; 1465 struct inode *inode = &ci->vfs_inode;
1466 1466
1467 dout("vmtruncate_work %p\n", inode); 1467 dout("vmtruncate_work %p\n", inode);
1468 __ceph_do_pending_vmtruncate(inode, true); 1468 mutex_lock(&inode->i_mutex);
1469 __ceph_do_pending_vmtruncate(inode);
1470 mutex_unlock(&inode->i_mutex);
1469 iput(inode); 1471 iput(inode);
1470} 1472}
1471 1473
@@ -1492,7 +1494,7 @@ void ceph_queue_vmtruncate(struct inode *inode)
1492 * Make sure any pending truncation is applied before doing anything 1494 * Make sure any pending truncation is applied before doing anything
1493 * that may depend on it. 1495 * that may depend on it.
1494 */ 1496 */
1495void __ceph_do_pending_vmtruncate(struct inode *inode, bool needlock) 1497void __ceph_do_pending_vmtruncate(struct inode *inode)
1496{ 1498{
1497 struct ceph_inode_info *ci = ceph_inode(inode); 1499 struct ceph_inode_info *ci = ceph_inode(inode);
1498 u64 to; 1500 u64 to;
@@ -1525,11 +1527,7 @@ retry:
1525 ci->i_truncate_pending, to); 1527 ci->i_truncate_pending, to);
1526 spin_unlock(&ci->i_ceph_lock); 1528 spin_unlock(&ci->i_ceph_lock);
1527 1529
1528 if (needlock)
1529 mutex_lock(&inode->i_mutex);
1530 truncate_inode_pages(inode->i_mapping, to); 1530 truncate_inode_pages(inode->i_mapping, to);
1531 if (needlock)
1532 mutex_unlock(&inode->i_mutex);
1533 1531
1534 spin_lock(&ci->i_ceph_lock); 1532 spin_lock(&ci->i_ceph_lock);
1535 if (to == ci->i_truncate_size) { 1533 if (to == ci->i_truncate_size) {
@@ -1588,7 +1586,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
1588 if (ceph_snap(inode) != CEPH_NOSNAP) 1586 if (ceph_snap(inode) != CEPH_NOSNAP)
1589 return -EROFS; 1587 return -EROFS;
1590 1588
1591 __ceph_do_pending_vmtruncate(inode, false); 1589 __ceph_do_pending_vmtruncate(inode);
1592 1590
1593 err = inode_change_ok(inode, attr); 1591 err = inode_change_ok(inode, attr);
1594 if (err != 0) 1592 if (err != 0)
@@ -1770,7 +1768,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
1770 ceph_cap_string(dirtied), mask); 1768 ceph_cap_string(dirtied), mask);
1771 1769
1772 ceph_mdsc_put_request(req); 1770 ceph_mdsc_put_request(req);
1773 __ceph_do_pending_vmtruncate(inode, false); 1771 __ceph_do_pending_vmtruncate(inode);
1774 return err; 1772 return err;
1775out: 1773out:
1776 spin_unlock(&ci->i_ceph_lock); 1774 spin_unlock(&ci->i_ceph_lock);
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index 690f73f42425..ae6d14e82b0f 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -169,7 +169,7 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
169} 169}
170 170
171/** 171/**
172 * Must be called with BKL already held. Fills in the passed 172 * Must be called with lock_flocks() already held. Fills in the passed
173 * counter variables, so you can prepare pagelist metadata before calling 173 * counter variables, so you can prepare pagelist metadata before calling
174 * ceph_encode_locks. 174 * ceph_encode_locks.
175 */ 175 */
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 74fd2898b2ab..187bf214444d 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1391,6 +1391,7 @@ static void discard_cap_releases(struct ceph_mds_client *mdsc,
1391 num = le32_to_cpu(head->num); 1391 num = le32_to_cpu(head->num);
1392 dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num); 1392 dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num);
1393 head->num = cpu_to_le32(0); 1393 head->num = cpu_to_le32(0);
1394 msg->front.iov_len = sizeof(*head);
1394 session->s_num_cap_releases += num; 1395 session->s_num_cap_releases += num;
1395 1396
1396 /* requeue completed messages */ 1397 /* requeue completed messages */
@@ -1553,7 +1554,7 @@ retry:
1553 *base = ceph_ino(temp->d_inode); 1554 *base = ceph_ino(temp->d_inode);
1554 *plen = len; 1555 *plen = len;
1555 dout("build_path on %p %d built %llx '%.*s'\n", 1556 dout("build_path on %p %d built %llx '%.*s'\n",
1556 dentry, dentry->d_count, *base, len, path); 1557 dentry, d_count(dentry), *base, len, path);
1557 return path; 1558 return path;
1558} 1559}
1559 1560
@@ -2454,6 +2455,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
2454 spin_lock(&ci->i_ceph_lock); 2455 spin_lock(&ci->i_ceph_lock);
2455 cap->seq = 0; /* reset cap seq */ 2456 cap->seq = 0; /* reset cap seq */
2456 cap->issue_seq = 0; /* and issue_seq */ 2457 cap->issue_seq = 0; /* and issue_seq */
2458 cap->mseq = 0; /* and migrate_seq */
2457 2459
2458 if (recon_state->flock) { 2460 if (recon_state->flock) {
2459 rec.v2.cap_id = cpu_to_le64(cap->cap_id); 2461 rec.v2.cap_id = cpu_to_le64(cap->cap_id);
@@ -3040,8 +3042,10 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
3040 fsc->mdsc = mdsc; 3042 fsc->mdsc = mdsc;
3041 mutex_init(&mdsc->mutex); 3043 mutex_init(&mdsc->mutex);
3042 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); 3044 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
3043 if (mdsc->mdsmap == NULL) 3045 if (mdsc->mdsmap == NULL) {
3046 kfree(mdsc);
3044 return -ENOMEM; 3047 return -ENOMEM;
3048 }
3045 3049
3046 init_completion(&mdsc->safe_umount_waiters); 3050 init_completion(&mdsc->safe_umount_waiters);
3047 init_waitqueue_head(&mdsc->session_close_wq); 3051 init_waitqueue_head(&mdsc->session_close_wq);
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 9278dec9e940..132b64eeecd4 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -92,6 +92,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
92 u32 num_export_targets; 92 u32 num_export_targets;
93 void *pexport_targets = NULL; 93 void *pexport_targets = NULL;
94 struct ceph_timespec laggy_since; 94 struct ceph_timespec laggy_since;
95 struct ceph_mds_info *info;
95 96
96 ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad); 97 ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad);
97 global_id = ceph_decode_64(p); 98 global_id = ceph_decode_64(p);
@@ -126,24 +127,27 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
126 i+1, n, global_id, mds, inc, 127 i+1, n, global_id, mds, inc,
127 ceph_pr_addr(&addr.in_addr), 128 ceph_pr_addr(&addr.in_addr),
128 ceph_mds_state_name(state)); 129 ceph_mds_state_name(state));
129 if (mds >= 0 && mds < m->m_max_mds && state > 0) { 130
130 m->m_info[mds].global_id = global_id; 131 if (mds < 0 || mds >= m->m_max_mds || state <= 0)
131 m->m_info[mds].state = state; 132 continue;
132 m->m_info[mds].addr = addr; 133
133 m->m_info[mds].laggy = 134 info = &m->m_info[mds];
134 (laggy_since.tv_sec != 0 || 135 info->global_id = global_id;
135 laggy_since.tv_nsec != 0); 136 info->state = state;
136 m->m_info[mds].num_export_targets = num_export_targets; 137 info->addr = addr;
137 if (num_export_targets) { 138 info->laggy = (laggy_since.tv_sec != 0 ||
138 m->m_info[mds].export_targets = 139 laggy_since.tv_nsec != 0);
139 kcalloc(num_export_targets, sizeof(u32), 140 info->num_export_targets = num_export_targets;
140 GFP_NOFS); 141 if (num_export_targets) {
141 for (j = 0; j < num_export_targets; j++) 142 info->export_targets = kcalloc(num_export_targets,
142 m->m_info[mds].export_targets[j] = 143 sizeof(u32), GFP_NOFS);
143 ceph_decode_32(&pexport_targets); 144 if (info->export_targets == NULL)
144 } else { 145 goto badmem;
145 m->m_info[mds].export_targets = NULL; 146 for (j = 0; j < num_export_targets; j++)
146 } 147 info->export_targets[j] =
148 ceph_decode_32(&pexport_targets);
149 } else {
150 info->export_targets = NULL;
147 } 151 }
148 } 152 }
149 153
@@ -170,7 +174,7 @@ bad:
170 DUMP_PREFIX_OFFSET, 16, 1, 174 DUMP_PREFIX_OFFSET, 16, 1,
171 start, end - start, true); 175 start, end - start, true);
172 ceph_mdsmap_destroy(m); 176 ceph_mdsmap_destroy(m);
173 return ERR_PTR(-EINVAL); 177 return ERR_PTR(err);
174} 178}
175 179
176void ceph_mdsmap_destroy(struct ceph_mdsmap *m) 180void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 7d377c9a5e35..6627b26a800c 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -357,7 +357,7 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
357 } 357 }
358 err = -EINVAL; 358 err = -EINVAL;
359 dev_name_end--; /* back up to ':' separator */ 359 dev_name_end--; /* back up to ':' separator */
360 if (*dev_name_end != ':') { 360 if (dev_name_end < dev_name || *dev_name_end != ':') {
361 pr_err("device name is missing path (no : separator in %s)\n", 361 pr_err("device name is missing path (no : separator in %s)\n",
362 dev_name); 362 dev_name);
363 goto out; 363 goto out;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 7ccfdb4aea2e..cbded572345e 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -534,7 +534,7 @@ extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci);
534extern void ceph_caps_init(struct ceph_mds_client *mdsc); 534extern void ceph_caps_init(struct ceph_mds_client *mdsc);
535extern void ceph_caps_finalize(struct ceph_mds_client *mdsc); 535extern void ceph_caps_finalize(struct ceph_mds_client *mdsc);
536extern void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta); 536extern void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta);
537extern int ceph_reserve_caps(struct ceph_mds_client *mdsc, 537extern void ceph_reserve_caps(struct ceph_mds_client *mdsc,
538 struct ceph_cap_reservation *ctx, int need); 538 struct ceph_cap_reservation *ctx, int need);
539extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc, 539extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
540 struct ceph_cap_reservation *ctx); 540 struct ceph_cap_reservation *ctx);
@@ -692,7 +692,7 @@ extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
692extern int ceph_inode_holds_cap(struct inode *inode, int mask); 692extern int ceph_inode_holds_cap(struct inode *inode, int mask);
693 693
694extern int ceph_inode_set_size(struct inode *inode, loff_t size); 694extern int ceph_inode_set_size(struct inode *inode, loff_t size);
695extern void __ceph_do_pending_vmtruncate(struct inode *inode, bool needlock); 695extern void __ceph_do_pending_vmtruncate(struct inode *inode);
696extern void ceph_queue_vmtruncate(struct inode *inode); 696extern void ceph_queue_vmtruncate(struct inode *inode);
697 697
698extern void ceph_queue_invalidate(struct inode *inode); 698extern void ceph_queue_invalidate(struct inode *inode);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 9b6b2b6dd164..be661d8f532a 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -675,17 +675,18 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
675 if (!ceph_is_valid_xattr(name)) 675 if (!ceph_is_valid_xattr(name))
676 return -ENODATA; 676 return -ENODATA;
677 677
678 spin_lock(&ci->i_ceph_lock);
679 dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
680 ci->i_xattrs.version, ci->i_xattrs.index_version);
681 678
682 /* let's see if a virtual xattr was requested */ 679 /* let's see if a virtual xattr was requested */
683 vxattr = ceph_match_vxattr(inode, name); 680 vxattr = ceph_match_vxattr(inode, name);
684 if (vxattr && !(vxattr->exists_cb && !vxattr->exists_cb(ci))) { 681 if (vxattr && !(vxattr->exists_cb && !vxattr->exists_cb(ci))) {
685 err = vxattr->getxattr_cb(ci, value, size); 682 err = vxattr->getxattr_cb(ci, value, size);
686 goto out; 683 return err;
687 } 684 }
688 685
686 spin_lock(&ci->i_ceph_lock);
687 dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
688 ci->i_xattrs.version, ci->i_xattrs.index_version);
689
689 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) && 690 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
690 (ci->i_xattrs.index_version >= ci->i_xattrs.version)) { 691 (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
691 goto get_xattr; 692 goto get_xattr;
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 3d8bf941d126..45e57cc38200 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * fs/cifs/cifsencrypt.c 2 * fs/cifs/cifsencrypt.c
3 * 3 *
4 * Copyright (C) International Business Machines Corp., 2005,2006 4 * Copyright (C) International Business Machines Corp., 2005,2013
5 * Author(s): Steve French (sfrench@us.ibm.com) 5 * Author(s): Steve French (sfrench@us.ibm.com)
6 * 6 *
7 * This library is free software; you can redistribute it and/or modify 7 * This library is free software; you can redistribute it and/or modify
@@ -31,6 +31,36 @@
31#include <linux/random.h> 31#include <linux/random.h>
32#include <linux/highmem.h> 32#include <linux/highmem.h>
33 33
34static int
35cifs_crypto_shash_md5_allocate(struct TCP_Server_Info *server)
36{
37 int rc;
38 unsigned int size;
39
40 if (server->secmech.sdescmd5 != NULL)
41 return 0; /* already allocated */
42
43 server->secmech.md5 = crypto_alloc_shash("md5", 0, 0);
44 if (IS_ERR(server->secmech.md5)) {
45 cifs_dbg(VFS, "could not allocate crypto md5\n");
46 return PTR_ERR(server->secmech.md5);
47 }
48
49 size = sizeof(struct shash_desc) +
50 crypto_shash_descsize(server->secmech.md5);
51 server->secmech.sdescmd5 = kmalloc(size, GFP_KERNEL);
52 if (!server->secmech.sdescmd5) {
53 rc = -ENOMEM;
54 crypto_free_shash(server->secmech.md5);
55 server->secmech.md5 = NULL;
56 return rc;
57 }
58 server->secmech.sdescmd5->shash.tfm = server->secmech.md5;
59 server->secmech.sdescmd5->shash.flags = 0x0;
60
61 return 0;
62}
63
34/* 64/*
35 * Calculate and return the CIFS signature based on the mac key and SMB PDU. 65 * Calculate and return the CIFS signature based on the mac key and SMB PDU.
36 * The 16 byte signature must be allocated by the caller. Note we only use the 66 * The 16 byte signature must be allocated by the caller. Note we only use the
@@ -50,8 +80,11 @@ static int cifs_calc_signature(struct smb_rqst *rqst,
50 return -EINVAL; 80 return -EINVAL;
51 81
52 if (!server->secmech.sdescmd5) { 82 if (!server->secmech.sdescmd5) {
53 cifs_dbg(VFS, "%s: Can't generate signature\n", __func__); 83 rc = cifs_crypto_shash_md5_allocate(server);
54 return -1; 84 if (rc) {
85 cifs_dbg(VFS, "%s: Can't alloc md5 crypto\n", __func__);
86 return -1;
87 }
55 } 88 }
56 89
57 rc = crypto_shash_init(&server->secmech.sdescmd5->shash); 90 rc = crypto_shash_init(&server->secmech.sdescmd5->shash);
@@ -556,6 +589,33 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash)
556 return rc; 589 return rc;
557} 590}
558 591
592static int crypto_hmacmd5_alloc(struct TCP_Server_Info *server)
593{
594 unsigned int size;
595
596 /* check if already allocated */
597 if (server->secmech.sdeschmacmd5)
598 return 0;
599
600 server->secmech.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0);
601 if (IS_ERR(server->secmech.hmacmd5)) {
602 cifs_dbg(VFS, "could not allocate crypto hmacmd5\n");
603 return PTR_ERR(server->secmech.hmacmd5);
604 }
605
606 size = sizeof(struct shash_desc) +
607 crypto_shash_descsize(server->secmech.hmacmd5);
608 server->secmech.sdeschmacmd5 = kmalloc(size, GFP_KERNEL);
609 if (!server->secmech.sdeschmacmd5) {
610 crypto_free_shash(server->secmech.hmacmd5);
611 server->secmech.hmacmd5 = NULL;
612 return -ENOMEM;
613 }
614 server->secmech.sdeschmacmd5->shash.tfm = server->secmech.hmacmd5;
615 server->secmech.sdeschmacmd5->shash.flags = 0x0;
616
617 return 0;
618}
559 619
560int 620int
561setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) 621setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
@@ -606,6 +666,12 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
606 666
607 memcpy(ses->auth_key.response + baselen, tiblob, tilen); 667 memcpy(ses->auth_key.response + baselen, tiblob, tilen);
608 668
669 rc = crypto_hmacmd5_alloc(ses->server);
670 if (rc) {
671 cifs_dbg(VFS, "could not crypto alloc hmacmd5 rc %d\n", rc);
672 goto setup_ntlmv2_rsp_ret;
673 }
674
609 /* calculate ntlmv2_hash */ 675 /* calculate ntlmv2_hash */
610 rc = calc_ntlmv2_hash(ses, ntlmv2_hash, nls_cp); 676 rc = calc_ntlmv2_hash(ses, ntlmv2_hash, nls_cp);
611 if (rc) { 677 if (rc) {
@@ -705,123 +771,32 @@ calc_seckey(struct cifs_ses *ses)
705void 771void
706cifs_crypto_shash_release(struct TCP_Server_Info *server) 772cifs_crypto_shash_release(struct TCP_Server_Info *server)
707{ 773{
708 if (server->secmech.cmacaes) 774 if (server->secmech.cmacaes) {
709 crypto_free_shash(server->secmech.cmacaes); 775 crypto_free_shash(server->secmech.cmacaes);
776 server->secmech.cmacaes = NULL;
777 }
710 778
711 if (server->secmech.hmacsha256) 779 if (server->secmech.hmacsha256) {
712 crypto_free_shash(server->secmech.hmacsha256); 780 crypto_free_shash(server->secmech.hmacsha256);
781 server->secmech.hmacsha256 = NULL;
782 }
713 783
714 if (server->secmech.md5) 784 if (server->secmech.md5) {
715 crypto_free_shash(server->secmech.md5); 785 crypto_free_shash(server->secmech.md5);
786 server->secmech.md5 = NULL;
787 }
716 788
717 if (server->secmech.hmacmd5) 789 if (server->secmech.hmacmd5) {
718 crypto_free_shash(server->secmech.hmacmd5); 790 crypto_free_shash(server->secmech.hmacmd5);
791 server->secmech.hmacmd5 = NULL;
792 }
719 793
720 kfree(server->secmech.sdesccmacaes); 794 kfree(server->secmech.sdesccmacaes);
721 795 server->secmech.sdesccmacaes = NULL;
722 kfree(server->secmech.sdeschmacsha256); 796 kfree(server->secmech.sdeschmacsha256);
723 797 server->secmech.sdeschmacsha256 = NULL;
724 kfree(server->secmech.sdeschmacmd5); 798 kfree(server->secmech.sdeschmacmd5);
725 799 server->secmech.sdeschmacmd5 = NULL;
726 kfree(server->secmech.sdescmd5); 800 kfree(server->secmech.sdescmd5);
727} 801 server->secmech.sdescmd5 = NULL;
728
729int
730cifs_crypto_shash_allocate(struct TCP_Server_Info *server)
731{
732 int rc;
733 unsigned int size;
734
735 server->secmech.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0);
736 if (IS_ERR(server->secmech.hmacmd5)) {
737 cifs_dbg(VFS, "could not allocate crypto hmacmd5\n");
738 return PTR_ERR(server->secmech.hmacmd5);
739 }
740
741 server->secmech.md5 = crypto_alloc_shash("md5", 0, 0);
742 if (IS_ERR(server->secmech.md5)) {
743 cifs_dbg(VFS, "could not allocate crypto md5\n");
744 rc = PTR_ERR(server->secmech.md5);
745 goto crypto_allocate_md5_fail;
746 }
747
748 server->secmech.hmacsha256 = crypto_alloc_shash("hmac(sha256)", 0, 0);
749 if (IS_ERR(server->secmech.hmacsha256)) {
750 cifs_dbg(VFS, "could not allocate crypto hmacsha256\n");
751 rc = PTR_ERR(server->secmech.hmacsha256);
752 goto crypto_allocate_hmacsha256_fail;
753 }
754
755 server->secmech.cmacaes = crypto_alloc_shash("cmac(aes)", 0, 0);
756 if (IS_ERR(server->secmech.cmacaes)) {
757 cifs_dbg(VFS, "could not allocate crypto cmac-aes");
758 rc = PTR_ERR(server->secmech.cmacaes);
759 goto crypto_allocate_cmacaes_fail;
760 }
761
762 size = sizeof(struct shash_desc) +
763 crypto_shash_descsize(server->secmech.hmacmd5);
764 server->secmech.sdeschmacmd5 = kmalloc(size, GFP_KERNEL);
765 if (!server->secmech.sdeschmacmd5) {
766 rc = -ENOMEM;
767 goto crypto_allocate_hmacmd5_sdesc_fail;
768 }
769 server->secmech.sdeschmacmd5->shash.tfm = server->secmech.hmacmd5;
770 server->secmech.sdeschmacmd5->shash.flags = 0x0;
771
772 size = sizeof(struct shash_desc) +
773 crypto_shash_descsize(server->secmech.md5);
774 server->secmech.sdescmd5 = kmalloc(size, GFP_KERNEL);
775 if (!server->secmech.sdescmd5) {
776 rc = -ENOMEM;
777 goto crypto_allocate_md5_sdesc_fail;
778 }
779 server->secmech.sdescmd5->shash.tfm = server->secmech.md5;
780 server->secmech.sdescmd5->shash.flags = 0x0;
781
782 size = sizeof(struct shash_desc) +
783 crypto_shash_descsize(server->secmech.hmacsha256);
784 server->secmech.sdeschmacsha256 = kmalloc(size, GFP_KERNEL);
785 if (!server->secmech.sdeschmacsha256) {
786 rc = -ENOMEM;
787 goto crypto_allocate_hmacsha256_sdesc_fail;
788 }
789 server->secmech.sdeschmacsha256->shash.tfm = server->secmech.hmacsha256;
790 server->secmech.sdeschmacsha256->shash.flags = 0x0;
791
792 size = sizeof(struct shash_desc) +
793 crypto_shash_descsize(server->secmech.cmacaes);
794 server->secmech.sdesccmacaes = kmalloc(size, GFP_KERNEL);
795 if (!server->secmech.sdesccmacaes) {
796 cifs_dbg(VFS, "%s: Can't alloc cmacaes\n", __func__);
797 rc = -ENOMEM;
798 goto crypto_allocate_cmacaes_sdesc_fail;
799 }
800 server->secmech.sdesccmacaes->shash.tfm = server->secmech.cmacaes;
801 server->secmech.sdesccmacaes->shash.flags = 0x0;
802
803 return 0;
804
805crypto_allocate_cmacaes_sdesc_fail:
806 kfree(server->secmech.sdeschmacsha256);
807
808crypto_allocate_hmacsha256_sdesc_fail:
809 kfree(server->secmech.sdescmd5);
810
811crypto_allocate_md5_sdesc_fail:
812 kfree(server->secmech.sdeschmacmd5);
813
814crypto_allocate_hmacmd5_sdesc_fail:
815 crypto_free_shash(server->secmech.cmacaes);
816
817crypto_allocate_cmacaes_fail:
818 crypto_free_shash(server->secmech.hmacsha256);
819
820crypto_allocate_hmacsha256_fail:
821 crypto_free_shash(server->secmech.md5);
822
823crypto_allocate_md5_fail:
824 crypto_free_shash(server->secmech.hmacmd5);
825
826 return rc;
827} 802}
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index e66b08882548..1fdc37041057 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -194,6 +194,7 @@ struct cifs_writedata;
194struct cifs_io_parms; 194struct cifs_io_parms;
195struct cifs_search_info; 195struct cifs_search_info;
196struct cifsInodeInfo; 196struct cifsInodeInfo;
197struct cifs_open_parms;
197 198
198struct smb_version_operations { 199struct smb_version_operations {
199 int (*send_cancel)(struct TCP_Server_Info *, void *, 200 int (*send_cancel)(struct TCP_Server_Info *, void *,
@@ -307,9 +308,8 @@ struct smb_version_operations {
307 const char *, const char *, 308 const char *, const char *,
308 struct cifs_sb_info *); 309 struct cifs_sb_info *);
309 /* open a file for non-posix mounts */ 310 /* open a file for non-posix mounts */
310 int (*open)(const unsigned int, struct cifs_tcon *, const char *, int, 311 int (*open)(const unsigned int, struct cifs_open_parms *,
311 int, int, struct cifs_fid *, __u32 *, FILE_ALL_INFO *, 312 __u32 *, FILE_ALL_INFO *);
312 struct cifs_sb_info *);
313 /* set fid protocol-specific info */ 313 /* set fid protocol-specific info */
314 void (*set_fid)(struct cifsFileInfo *, struct cifs_fid *, __u32); 314 void (*set_fid)(struct cifsFileInfo *, struct cifs_fid *, __u32);
315 /* close a file */ 315 /* close a file */
@@ -912,6 +912,17 @@ struct cifs_search_info {
912 bool smallBuf:1; /* so we know which buf_release function to call */ 912 bool smallBuf:1; /* so we know which buf_release function to call */
913}; 913};
914 914
915struct cifs_open_parms {
916 struct cifs_tcon *tcon;
917 struct cifs_sb_info *cifs_sb;
918 int disposition;
919 int desired_access;
920 int create_options;
921 const char *path;
922 struct cifs_fid *fid;
923 bool reconnect:1;
924};
925
915struct cifs_fid { 926struct cifs_fid {
916 __u16 netfid; 927 __u16 netfid;
917#ifdef CONFIG_CIFS_SMB2 928#ifdef CONFIG_CIFS_SMB2
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index c8ff018fae68..f7e584d047e2 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -433,7 +433,6 @@ extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *,
433 const struct nls_table *); 433 const struct nls_table *);
434extern int setup_ntlm_response(struct cifs_ses *, const struct nls_table *); 434extern int setup_ntlm_response(struct cifs_ses *, const struct nls_table *);
435extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *); 435extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *);
436extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *);
437extern void cifs_crypto_shash_release(struct TCP_Server_Info *); 436extern void cifs_crypto_shash_release(struct TCP_Server_Info *);
438extern int calc_seckey(struct cifs_ses *); 437extern int calc_seckey(struct cifs_ses *);
439extern void generate_smb3signingkey(struct TCP_Server_Info *); 438extern void generate_smb3signingkey(struct TCP_Server_Info *);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index afcb8a1a33b7..fa68813396b5 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2108,12 +2108,6 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
2108 goto out_err; 2108 goto out_err;
2109 } 2109 }
2110 2110
2111 rc = cifs_crypto_shash_allocate(tcp_ses);
2112 if (rc) {
2113 cifs_dbg(VFS, "could not setup hash structures rc %d\n", rc);
2114 goto out_err;
2115 }
2116
2117 tcp_ses->ops = volume_info->ops; 2111 tcp_ses->ops = volume_info->ops;
2118 tcp_ses->vals = volume_info->vals; 2112 tcp_ses->vals = volume_info->vals;
2119 cifs_set_net_ns(tcp_ses, get_net(current->nsproxy->net_ns)); 2113 cifs_set_net_ns(tcp_ses, get_net(current->nsproxy->net_ns));
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 5175aebf6737..d62ce0d48141 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -204,6 +204,7 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
204 struct inode *newinode = NULL; 204 struct inode *newinode = NULL;
205 int disposition; 205 int disposition;
206 struct TCP_Server_Info *server = tcon->ses->server; 206 struct TCP_Server_Info *server = tcon->ses->server;
207 struct cifs_open_parms oparms;
207 208
208 *oplock = 0; 209 *oplock = 0;
209 if (tcon->ses->server->oplocks) 210 if (tcon->ses->server->oplocks)
@@ -319,9 +320,16 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
319 if (backup_cred(cifs_sb)) 320 if (backup_cred(cifs_sb))
320 create_options |= CREATE_OPEN_BACKUP_INTENT; 321 create_options |= CREATE_OPEN_BACKUP_INTENT;
321 322
322 rc = server->ops->open(xid, tcon, full_path, disposition, 323 oparms.tcon = tcon;
323 desired_access, create_options, fid, oplock, 324 oparms.cifs_sb = cifs_sb;
324 buf, cifs_sb); 325 oparms.desired_access = desired_access;
326 oparms.create_options = create_options;
327 oparms.disposition = disposition;
328 oparms.path = full_path;
329 oparms.fid = fid;
330 oparms.reconnect = false;
331
332 rc = server->ops->open(xid, &oparms, oplock, buf);
325 if (rc) { 333 if (rc) {
326 cifs_dbg(FYI, "cifs_create returned 0x%x\n", rc); 334 cifs_dbg(FYI, "cifs_create returned 0x%x\n", rc);
327 goto out; 335 goto out;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 91d8629e69a2..1e57f36ea1b2 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -183,6 +183,7 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb,
183 int create_options = CREATE_NOT_DIR; 183 int create_options = CREATE_NOT_DIR;
184 FILE_ALL_INFO *buf; 184 FILE_ALL_INFO *buf;
185 struct TCP_Server_Info *server = tcon->ses->server; 185 struct TCP_Server_Info *server = tcon->ses->server;
186 struct cifs_open_parms oparms;
186 187
187 if (!server->ops->open) 188 if (!server->ops->open)
188 return -ENOSYS; 189 return -ENOSYS;
@@ -224,9 +225,16 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb,
224 if (backup_cred(cifs_sb)) 225 if (backup_cred(cifs_sb))
225 create_options |= CREATE_OPEN_BACKUP_INTENT; 226 create_options |= CREATE_OPEN_BACKUP_INTENT;
226 227
227 rc = server->ops->open(xid, tcon, full_path, disposition, 228 oparms.tcon = tcon;
228 desired_access, create_options, fid, oplock, buf, 229 oparms.cifs_sb = cifs_sb;
229 cifs_sb); 230 oparms.desired_access = desired_access;
231 oparms.create_options = create_options;
232 oparms.disposition = disposition;
233 oparms.path = full_path;
234 oparms.fid = fid;
235 oparms.reconnect = false;
236
237 rc = server->ops->open(xid, &oparms, oplock, buf);
230 238
231 if (rc) 239 if (rc)
232 goto out; 240 goto out;
@@ -553,11 +561,10 @@ cifs_relock_file(struct cifsFileInfo *cfile)
553 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); 561 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
554 int rc = 0; 562 int rc = 0;
555 563
556 /* we are going to update can_cache_brlcks here - need a write access */ 564 down_read(&cinode->lock_sem);
557 down_write(&cinode->lock_sem);
558 if (cinode->can_cache_brlcks) { 565 if (cinode->can_cache_brlcks) {
559 /* can cache locks - no need to push them */ 566 /* can cache locks - no need to relock */
560 up_write(&cinode->lock_sem); 567 up_read(&cinode->lock_sem);
561 return rc; 568 return rc;
562 } 569 }
563 570
@@ -568,7 +575,7 @@ cifs_relock_file(struct cifsFileInfo *cfile)
568 else 575 else
569 rc = tcon->ses->server->ops->push_mand_locks(cfile); 576 rc = tcon->ses->server->ops->push_mand_locks(cfile);
570 577
571 up_write(&cinode->lock_sem); 578 up_read(&cinode->lock_sem);
572 return rc; 579 return rc;
573} 580}
574 581
@@ -587,7 +594,7 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
587 int desired_access; 594 int desired_access;
588 int disposition = FILE_OPEN; 595 int disposition = FILE_OPEN;
589 int create_options = CREATE_NOT_DIR; 596 int create_options = CREATE_NOT_DIR;
590 struct cifs_fid fid; 597 struct cifs_open_parms oparms;
591 598
592 xid = get_xid(); 599 xid = get_xid();
593 mutex_lock(&cfile->fh_mutex); 600 mutex_lock(&cfile->fh_mutex);
@@ -637,7 +644,7 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
637 644
638 rc = cifs_posix_open(full_path, NULL, inode->i_sb, 645 rc = cifs_posix_open(full_path, NULL, inode->i_sb,
639 cifs_sb->mnt_file_mode /* ignored */, 646 cifs_sb->mnt_file_mode /* ignored */,
640 oflags, &oplock, &fid.netfid, xid); 647 oflags, &oplock, &cfile->fid.netfid, xid);
641 if (rc == 0) { 648 if (rc == 0) {
642 cifs_dbg(FYI, "posix reopen succeeded\n"); 649 cifs_dbg(FYI, "posix reopen succeeded\n");
643 goto reopen_success; 650 goto reopen_success;
@@ -654,7 +661,16 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
654 create_options |= CREATE_OPEN_BACKUP_INTENT; 661 create_options |= CREATE_OPEN_BACKUP_INTENT;
655 662
656 if (server->ops->get_lease_key) 663 if (server->ops->get_lease_key)
657 server->ops->get_lease_key(inode, &fid); 664 server->ops->get_lease_key(inode, &cfile->fid);
665
666 oparms.tcon = tcon;
667 oparms.cifs_sb = cifs_sb;
668 oparms.desired_access = desired_access;
669 oparms.create_options = create_options;
670 oparms.disposition = disposition;
671 oparms.path = full_path;
672 oparms.fid = &cfile->fid;
673 oparms.reconnect = true;
658 674
659 /* 675 /*
660 * Can not refresh inode by passing in file_info buf to be returned by 676 * Can not refresh inode by passing in file_info buf to be returned by
@@ -663,9 +679,14 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
663 * version of file size can be stale. If we knew for sure that inode was 679 * version of file size can be stale. If we knew for sure that inode was
664 * not dirty locally we could do this. 680 * not dirty locally we could do this.
665 */ 681 */
666 rc = server->ops->open(xid, tcon, full_path, disposition, 682 rc = server->ops->open(xid, &oparms, &oplock, NULL);
667 desired_access, create_options, &fid, &oplock, 683 if (rc == -ENOENT && oparms.reconnect == false) {
668 NULL, cifs_sb); 684 /* durable handle timeout is expired - open the file again */
685 rc = server->ops->open(xid, &oparms, &oplock, NULL);
686 /* indicate that we need to relock the file */
687 oparms.reconnect = true;
688 }
689
669 if (rc) { 690 if (rc) {
670 mutex_unlock(&cfile->fh_mutex); 691 mutex_unlock(&cfile->fh_mutex);
671 cifs_dbg(FYI, "cifs_reopen returned 0x%x\n", rc); 692 cifs_dbg(FYI, "cifs_reopen returned 0x%x\n", rc);
@@ -696,8 +717,9 @@ reopen_success:
696 * to the server to get the new inode info. 717 * to the server to get the new inode info.
697 */ 718 */
698 719
699 server->ops->set_fid(cfile, &fid, oplock); 720 server->ops->set_fid(cfile, &cfile->fid, oplock);
700 cifs_relock_file(cfile); 721 if (oparms.reconnect)
722 cifs_relock_file(cfile);
701 723
702reopen_error_exit: 724reopen_error_exit:
703 kfree(full_path); 725 kfree(full_path);
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 20efd81266c6..449b6cf09b09 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -558,6 +558,11 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
558 fattr->cf_mode &= ~(S_IWUGO); 558 fattr->cf_mode &= ~(S_IWUGO);
559 559
560 fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks); 560 fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks);
561 if (fattr->cf_nlink < 1) {
562 cifs_dbg(1, "replacing bogus file nlink value %u\n",
563 fattr->cf_nlink);
564 fattr->cf_nlink = 1;
565 }
561 } 566 }
562 567
563 fattr->cf_uid = cifs_sb->mnt_uid; 568 fattr->cf_uid = cifs_sb->mnt_uid;
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index e813f04511d8..6457690731a2 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -674,20 +674,23 @@ cifs_mkdir_setinfo(struct inode *inode, const char *full_path,
674} 674}
675 675
676static int 676static int
677cifs_open_file(const unsigned int xid, struct cifs_tcon *tcon, const char *path, 677cifs_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
678 int disposition, int desired_access, int create_options, 678 __u32 *oplock, FILE_ALL_INFO *buf)
679 struct cifs_fid *fid, __u32 *oplock, FILE_ALL_INFO *buf, 679{
680 struct cifs_sb_info *cifs_sb) 680 if (!(oparms->tcon->ses->capabilities & CAP_NT_SMBS))
681{ 681 return SMBLegacyOpen(xid, oparms->tcon, oparms->path,
682 if (!(tcon->ses->capabilities & CAP_NT_SMBS)) 682 oparms->disposition,
683 return SMBLegacyOpen(xid, tcon, path, disposition, 683 oparms->desired_access,
684 desired_access, create_options, 684 oparms->create_options,
685 &fid->netfid, oplock, buf, 685 &oparms->fid->netfid, oplock, buf,
686 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags 686 oparms->cifs_sb->local_nls,
687 oparms->cifs_sb->mnt_cifs_flags
687 & CIFS_MOUNT_MAP_SPECIAL_CHR); 688 & CIFS_MOUNT_MAP_SPECIAL_CHR);
688 return CIFSSMBOpen(xid, tcon, path, disposition, desired_access, 689 return CIFSSMBOpen(xid, oparms->tcon, oparms->path,
689 create_options, &fid->netfid, oplock, buf, 690 oparms->disposition, oparms->desired_access,
690 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & 691 oparms->create_options, &oparms->fid->netfid, oplock,
692 buf, oparms->cifs_sb->local_nls,
693 oparms->cifs_sb->mnt_cifs_flags &
691 CIFS_MOUNT_MAP_SPECIAL_CHR); 694 CIFS_MOUNT_MAP_SPECIAL_CHR);
692} 695}
693 696
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index 5da1b55a2258..04a81a4142c3 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -40,7 +40,8 @@ smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock)
40 oplock &= 0xFF; 40 oplock &= 0xFF;
41 if (oplock == SMB2_OPLOCK_LEVEL_NOCHANGE) 41 if (oplock == SMB2_OPLOCK_LEVEL_NOCHANGE)
42 return; 42 return;
43 if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE) { 43 if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE ||
44 oplock == SMB2_OPLOCK_LEVEL_BATCH) {
44 cinode->clientCanCacheAll = true; 45 cinode->clientCanCacheAll = true;
45 cinode->clientCanCacheRead = true; 46 cinode->clientCanCacheRead = true;
46 cifs_dbg(FYI, "Exclusive Oplock granted on inode %p\n", 47 cifs_dbg(FYI, "Exclusive Oplock granted on inode %p\n",
@@ -57,17 +58,16 @@ smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock)
57} 58}
58 59
59int 60int
60smb2_open_file(const unsigned int xid, struct cifs_tcon *tcon, const char *path, 61smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
61 int disposition, int desired_access, int create_options, 62 __u32 *oplock, FILE_ALL_INFO *buf)
62 struct cifs_fid *fid, __u32 *oplock, FILE_ALL_INFO *buf,
63 struct cifs_sb_info *cifs_sb)
64{ 63{
65 int rc; 64 int rc;
66 __le16 *smb2_path; 65 __le16 *smb2_path;
67 struct smb2_file_all_info *smb2_data = NULL; 66 struct smb2_file_all_info *smb2_data = NULL;
68 __u8 smb2_oplock[17]; 67 __u8 smb2_oplock[17];
68 struct cifs_fid *fid = oparms->fid;
69 69
70 smb2_path = cifs_convert_path_to_utf16(path, cifs_sb); 70 smb2_path = cifs_convert_path_to_utf16(oparms->path, oparms->cifs_sb);
71 if (smb2_path == NULL) { 71 if (smb2_path == NULL) {
72 rc = -ENOMEM; 72 rc = -ENOMEM;
73 goto out; 73 goto out;
@@ -80,21 +80,19 @@ smb2_open_file(const unsigned int xid, struct cifs_tcon *tcon, const char *path,
80 goto out; 80 goto out;
81 } 81 }
82 82
83 desired_access |= FILE_READ_ATTRIBUTES; 83 oparms->desired_access |= FILE_READ_ATTRIBUTES;
84 *smb2_oplock = SMB2_OPLOCK_LEVEL_EXCLUSIVE; 84 *smb2_oplock = SMB2_OPLOCK_LEVEL_BATCH;
85 85
86 if (tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LEASING) 86 if (oparms->tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LEASING)
87 memcpy(smb2_oplock + 1, fid->lease_key, SMB2_LEASE_KEY_SIZE); 87 memcpy(smb2_oplock + 1, fid->lease_key, SMB2_LEASE_KEY_SIZE);
88 88
89 rc = SMB2_open(xid, tcon, smb2_path, &fid->persistent_fid, 89 rc = SMB2_open(xid, oparms, smb2_path, smb2_oplock, smb2_data);
90 &fid->volatile_fid, desired_access, disposition,
91 0, 0, smb2_oplock, smb2_data);
92 if (rc) 90 if (rc)
93 goto out; 91 goto out;
94 92
95 if (buf) { 93 if (buf) {
96 /* open response does not have IndexNumber field - get it */ 94 /* open response does not have IndexNumber field - get it */
97 rc = SMB2_get_srv_num(xid, tcon, fid->persistent_fid, 95 rc = SMB2_get_srv_num(xid, oparms->tcon, fid->persistent_fid,
98 fid->volatile_fid, 96 fid->volatile_fid,
99 &smb2_data->IndexNumber); 97 &smb2_data->IndexNumber);
100 if (rc) { 98 if (rc) {
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index fff6dfba6204..c6ec1633309a 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -41,21 +41,26 @@ static int
41smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon, 41smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon,
42 struct cifs_sb_info *cifs_sb, const char *full_path, 42 struct cifs_sb_info *cifs_sb, const char *full_path,
43 __u32 desired_access, __u32 create_disposition, 43 __u32 desired_access, __u32 create_disposition,
44 __u32 file_attributes, __u32 create_options, 44 __u32 create_options, void *data, int command)
45 void *data, int command)
46{ 45{
47 int rc, tmprc = 0; 46 int rc, tmprc = 0;
48 u64 persistent_fid, volatile_fid;
49 __le16 *utf16_path; 47 __le16 *utf16_path;
50 __u8 oplock = SMB2_OPLOCK_LEVEL_NONE; 48 __u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
49 struct cifs_open_parms oparms;
50 struct cifs_fid fid;
51 51
52 utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb); 52 utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb);
53 if (!utf16_path) 53 if (!utf16_path)
54 return -ENOMEM; 54 return -ENOMEM;
55 55
56 rc = SMB2_open(xid, tcon, utf16_path, &persistent_fid, &volatile_fid, 56 oparms.tcon = tcon;
57 desired_access, create_disposition, file_attributes, 57 oparms.desired_access = desired_access;
58 create_options, &oplock, NULL); 58 oparms.disposition = create_disposition;
59 oparms.create_options = create_options;
60 oparms.fid = &fid;
61 oparms.reconnect = false;
62
63 rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL);
59 if (rc) { 64 if (rc) {
60 kfree(utf16_path); 65 kfree(utf16_path);
61 return rc; 66 return rc;
@@ -65,8 +70,8 @@ smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon,
65 case SMB2_OP_DELETE: 70 case SMB2_OP_DELETE:
66 break; 71 break;
67 case SMB2_OP_QUERY_INFO: 72 case SMB2_OP_QUERY_INFO:
68 tmprc = SMB2_query_info(xid, tcon, persistent_fid, 73 tmprc = SMB2_query_info(xid, tcon, fid.persistent_fid,
69 volatile_fid, 74 fid.volatile_fid,
70 (struct smb2_file_all_info *)data); 75 (struct smb2_file_all_info *)data);
71 break; 76 break;
72 case SMB2_OP_MKDIR: 77 case SMB2_OP_MKDIR:
@@ -76,19 +81,21 @@ smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon,
76 */ 81 */
77 break; 82 break;
78 case SMB2_OP_RENAME: 83 case SMB2_OP_RENAME:
79 tmprc = SMB2_rename(xid, tcon, persistent_fid, volatile_fid, 84 tmprc = SMB2_rename(xid, tcon, fid.persistent_fid,
80 (__le16 *)data); 85 fid.volatile_fid, (__le16 *)data);
81 break; 86 break;
82 case SMB2_OP_HARDLINK: 87 case SMB2_OP_HARDLINK:
83 tmprc = SMB2_set_hardlink(xid, tcon, persistent_fid, 88 tmprc = SMB2_set_hardlink(xid, tcon, fid.persistent_fid,
84 volatile_fid, (__le16 *)data); 89 fid.volatile_fid, (__le16 *)data);
85 break; 90 break;
86 case SMB2_OP_SET_EOF: 91 case SMB2_OP_SET_EOF:
87 tmprc = SMB2_set_eof(xid, tcon, persistent_fid, volatile_fid, 92 tmprc = SMB2_set_eof(xid, tcon, fid.persistent_fid,
88 current->tgid, (__le64 *)data); 93 fid.volatile_fid, current->tgid,
94 (__le64 *)data);
89 break; 95 break;
90 case SMB2_OP_SET_INFO: 96 case SMB2_OP_SET_INFO:
91 tmprc = SMB2_set_info(xid, tcon, persistent_fid, volatile_fid, 97 tmprc = SMB2_set_info(xid, tcon, fid.persistent_fid,
98 fid.volatile_fid,
92 (FILE_BASIC_INFO *)data); 99 (FILE_BASIC_INFO *)data);
93 break; 100 break;
94 default: 101 default:
@@ -96,7 +103,7 @@ smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon,
96 break; 103 break;
97 } 104 }
98 105
99 rc = SMB2_close(xid, tcon, persistent_fid, volatile_fid); 106 rc = SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
100 if (tmprc) 107 if (tmprc)
101 rc = tmprc; 108 rc = tmprc;
102 kfree(utf16_path); 109 kfree(utf16_path);
@@ -129,8 +136,8 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
129 return -ENOMEM; 136 return -ENOMEM;
130 137
131 rc = smb2_open_op_close(xid, tcon, cifs_sb, full_path, 138 rc = smb2_open_op_close(xid, tcon, cifs_sb, full_path,
132 FILE_READ_ATTRIBUTES, FILE_OPEN, 0, 0, 139 FILE_READ_ATTRIBUTES, FILE_OPEN, 0, smb2_data,
133 smb2_data, SMB2_OP_QUERY_INFO); 140 SMB2_OP_QUERY_INFO);
134 if (rc) 141 if (rc)
135 goto out; 142 goto out;
136 143
@@ -145,7 +152,7 @@ smb2_mkdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
145 struct cifs_sb_info *cifs_sb) 152 struct cifs_sb_info *cifs_sb)
146{ 153{
147 return smb2_open_op_close(xid, tcon, cifs_sb, name, 154 return smb2_open_op_close(xid, tcon, cifs_sb, name,
148 FILE_WRITE_ATTRIBUTES, FILE_CREATE, 0, 155 FILE_WRITE_ATTRIBUTES, FILE_CREATE,
149 CREATE_NOT_FILE, NULL, SMB2_OP_MKDIR); 156 CREATE_NOT_FILE, NULL, SMB2_OP_MKDIR);
150} 157}
151 158
@@ -164,7 +171,7 @@ smb2_mkdir_setinfo(struct inode *inode, const char *name,
164 dosattrs = cifs_i->cifsAttrs | ATTR_READONLY; 171 dosattrs = cifs_i->cifsAttrs | ATTR_READONLY;
165 data.Attributes = cpu_to_le32(dosattrs); 172 data.Attributes = cpu_to_le32(dosattrs);
166 tmprc = smb2_open_op_close(xid, tcon, cifs_sb, name, 173 tmprc = smb2_open_op_close(xid, tcon, cifs_sb, name,
167 FILE_WRITE_ATTRIBUTES, FILE_CREATE, 0, 174 FILE_WRITE_ATTRIBUTES, FILE_CREATE,
168 CREATE_NOT_FILE, &data, SMB2_OP_SET_INFO); 175 CREATE_NOT_FILE, &data, SMB2_OP_SET_INFO);
169 if (tmprc == 0) 176 if (tmprc == 0)
170 cifs_i->cifsAttrs = dosattrs; 177 cifs_i->cifsAttrs = dosattrs;
@@ -175,7 +182,7 @@ smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
175 struct cifs_sb_info *cifs_sb) 182 struct cifs_sb_info *cifs_sb)
176{ 183{
177 return smb2_open_op_close(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, 184 return smb2_open_op_close(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN,
178 0, CREATE_NOT_FILE | CREATE_DELETE_ON_CLOSE, 185 CREATE_NOT_FILE | CREATE_DELETE_ON_CLOSE,
179 NULL, SMB2_OP_DELETE); 186 NULL, SMB2_OP_DELETE);
180} 187}
181 188
@@ -184,7 +191,7 @@ smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
184 struct cifs_sb_info *cifs_sb) 191 struct cifs_sb_info *cifs_sb)
185{ 192{
186 return smb2_open_op_close(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, 193 return smb2_open_op_close(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN,
187 0, CREATE_DELETE_ON_CLOSE, NULL, 194 CREATE_DELETE_ON_CLOSE, NULL,
188 SMB2_OP_DELETE); 195 SMB2_OP_DELETE);
189} 196}
190 197
@@ -203,7 +210,7 @@ smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon,
203 } 210 }
204 211
205 rc = smb2_open_op_close(xid, tcon, cifs_sb, from_name, access, 212 rc = smb2_open_op_close(xid, tcon, cifs_sb, from_name, access,
206 FILE_OPEN, 0, 0, smb2_to_name, command); 213 FILE_OPEN, 0, smb2_to_name, command);
207smb2_rename_path: 214smb2_rename_path:
208 kfree(smb2_to_name); 215 kfree(smb2_to_name);
209 return rc; 216 return rc;
@@ -234,7 +241,7 @@ smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon,
234{ 241{
235 __le64 eof = cpu_to_le64(size); 242 __le64 eof = cpu_to_le64(size);
236 return smb2_open_op_close(xid, tcon, cifs_sb, full_path, 243 return smb2_open_op_close(xid, tcon, cifs_sb, full_path,
237 FILE_WRITE_DATA, FILE_OPEN, 0, 0, &eof, 244 FILE_WRITE_DATA, FILE_OPEN, 0, &eof,
238 SMB2_OP_SET_EOF); 245 SMB2_OP_SET_EOF);
239} 246}
240 247
@@ -250,7 +257,7 @@ smb2_set_file_info(struct inode *inode, const char *full_path,
250 if (IS_ERR(tlink)) 257 if (IS_ERR(tlink))
251 return PTR_ERR(tlink); 258 return PTR_ERR(tlink);
252 rc = smb2_open_op_close(xid, tlink_tcon(tlink), cifs_sb, full_path, 259 rc = smb2_open_op_close(xid, tlink_tcon(tlink), cifs_sb, full_path,
253 FILE_WRITE_ATTRIBUTES, FILE_OPEN, 0, 0, buf, 260 FILE_WRITE_ATTRIBUTES, FILE_OPEN, 0, buf,
254 SMB2_OP_SET_INFO); 261 SMB2_OP_SET_INFO);
255 cifs_put_tlink(tlink); 262 cifs_put_tlink(tlink);
256 return rc; 263 return rc;
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 6d15cab95b99..f259e6cc8357 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -213,22 +213,29 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon,
213 struct cifs_sb_info *cifs_sb, const char *full_path) 213 struct cifs_sb_info *cifs_sb, const char *full_path)
214{ 214{
215 int rc; 215 int rc;
216 __u64 persistent_fid, volatile_fid;
217 __le16 *utf16_path; 216 __le16 *utf16_path;
218 __u8 oplock = SMB2_OPLOCK_LEVEL_NONE; 217 __u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
218 struct cifs_open_parms oparms;
219 struct cifs_fid fid;
219 220
220 utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb); 221 utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb);
221 if (!utf16_path) 222 if (!utf16_path)
222 return -ENOMEM; 223 return -ENOMEM;
223 224
224 rc = SMB2_open(xid, tcon, utf16_path, &persistent_fid, &volatile_fid, 225 oparms.tcon = tcon;
225 FILE_READ_ATTRIBUTES, FILE_OPEN, 0, 0, &oplock, NULL); 226 oparms.desired_access = FILE_READ_ATTRIBUTES;
227 oparms.disposition = FILE_OPEN;
228 oparms.create_options = 0;
229 oparms.fid = &fid;
230 oparms.reconnect = false;
231
232 rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL);
226 if (rc) { 233 if (rc) {
227 kfree(utf16_path); 234 kfree(utf16_path);
228 return rc; 235 return rc;
229 } 236 }
230 237
231 rc = SMB2_close(xid, tcon, persistent_fid, volatile_fid); 238 rc = SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
232 kfree(utf16_path); 239 kfree(utf16_path);
233 return rc; 240 return rc;
234} 241}
@@ -443,15 +450,20 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
443 __le16 *utf16_path; 450 __le16 *utf16_path;
444 int rc; 451 int rc;
445 __u8 oplock = SMB2_OPLOCK_LEVEL_NONE; 452 __u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
446 __u64 persistent_fid, volatile_fid; 453 struct cifs_open_parms oparms;
447 454
448 utf16_path = cifs_convert_path_to_utf16(path, cifs_sb); 455 utf16_path = cifs_convert_path_to_utf16(path, cifs_sb);
449 if (!utf16_path) 456 if (!utf16_path)
450 return -ENOMEM; 457 return -ENOMEM;
451 458
452 rc = SMB2_open(xid, tcon, utf16_path, &persistent_fid, &volatile_fid, 459 oparms.tcon = tcon;
453 FILE_READ_ATTRIBUTES | FILE_READ_DATA, FILE_OPEN, 0, 0, 460 oparms.desired_access = FILE_READ_ATTRIBUTES | FILE_READ_DATA;
454 &oplock, NULL); 461 oparms.disposition = FILE_OPEN;
462 oparms.create_options = 0;
463 oparms.fid = fid;
464 oparms.reconnect = false;
465
466 rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL);
455 kfree(utf16_path); 467 kfree(utf16_path);
456 if (rc) { 468 if (rc) {
457 cifs_dbg(VFS, "open dir failed\n"); 469 cifs_dbg(VFS, "open dir failed\n");
@@ -460,14 +472,12 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
460 472
461 srch_inf->entries_in_buffer = 0; 473 srch_inf->entries_in_buffer = 0;
462 srch_inf->index_of_last_entry = 0; 474 srch_inf->index_of_last_entry = 0;
463 fid->persistent_fid = persistent_fid;
464 fid->volatile_fid = volatile_fid;
465 475
466 rc = SMB2_query_directory(xid, tcon, persistent_fid, volatile_fid, 0, 476 rc = SMB2_query_directory(xid, tcon, fid->persistent_fid,
467 srch_inf); 477 fid->volatile_fid, 0, srch_inf);
468 if (rc) { 478 if (rc) {
469 cifs_dbg(VFS, "query directory failed\n"); 479 cifs_dbg(VFS, "query directory failed\n");
470 SMB2_close(xid, tcon, persistent_fid, volatile_fid); 480 SMB2_close(xid, tcon, fid->persistent_fid, fid->volatile_fid);
471 } 481 }
472 return rc; 482 return rc;
473} 483}
@@ -528,17 +538,25 @@ smb2_queryfs(const unsigned int xid, struct cifs_tcon *tcon,
528 struct kstatfs *buf) 538 struct kstatfs *buf)
529{ 539{
530 int rc; 540 int rc;
531 u64 persistent_fid, volatile_fid;
532 __le16 srch_path = 0; /* Null - open root of share */ 541 __le16 srch_path = 0; /* Null - open root of share */
533 u8 oplock = SMB2_OPLOCK_LEVEL_NONE; 542 u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
543 struct cifs_open_parms oparms;
544 struct cifs_fid fid;
545
546 oparms.tcon = tcon;
547 oparms.desired_access = FILE_READ_ATTRIBUTES;
548 oparms.disposition = FILE_OPEN;
549 oparms.create_options = 0;
550 oparms.fid = &fid;
551 oparms.reconnect = false;
534 552
535 rc = SMB2_open(xid, tcon, &srch_path, &persistent_fid, &volatile_fid, 553 rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL);
536 FILE_READ_ATTRIBUTES, FILE_OPEN, 0, 0, &oplock, NULL);
537 if (rc) 554 if (rc)
538 return rc; 555 return rc;
539 buf->f_type = SMB2_MAGIC_NUMBER; 556 buf->f_type = SMB2_MAGIC_NUMBER;
540 rc = SMB2_QFS_info(xid, tcon, persistent_fid, volatile_fid, buf); 557 rc = SMB2_QFS_info(xid, tcon, fid.persistent_fid, fid.volatile_fid,
541 SMB2_close(xid, tcon, persistent_fid, volatile_fid); 558 buf);
559 SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
542 return rc; 560 return rc;
543} 561}
544 562
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 2b312e4eeaa6..abc9c2809b51 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -847,29 +847,76 @@ create_lease_buf(u8 *lease_key, u8 oplock)
847 return buf; 847 return buf;
848} 848}
849 849
850static struct create_durable *
851create_durable_buf(void)
852{
853 struct create_durable *buf;
854
855 buf = kzalloc(sizeof(struct create_durable), GFP_KERNEL);
856 if (!buf)
857 return NULL;
858
859 buf->ccontext.DataOffset = cpu_to_le16(offsetof
860 (struct create_durable, Data));
861 buf->ccontext.DataLength = cpu_to_le32(16);
862 buf->ccontext.NameOffset = cpu_to_le16(offsetof
863 (struct create_durable, Name));
864 buf->ccontext.NameLength = cpu_to_le16(4);
865 buf->Name[0] = 'D';
866 buf->Name[1] = 'H';
867 buf->Name[2] = 'n';
868 buf->Name[3] = 'Q';
869 return buf;
870}
871
872static struct create_durable *
873create_reconnect_durable_buf(struct cifs_fid *fid)
874{
875 struct create_durable *buf;
876
877 buf = kzalloc(sizeof(struct create_durable), GFP_KERNEL);
878 if (!buf)
879 return NULL;
880
881 buf->ccontext.DataOffset = cpu_to_le16(offsetof
882 (struct create_durable, Data));
883 buf->ccontext.DataLength = cpu_to_le32(16);
884 buf->ccontext.NameOffset = cpu_to_le16(offsetof
885 (struct create_durable, Name));
886 buf->ccontext.NameLength = cpu_to_le16(4);
887 buf->Data.Fid.PersistentFileId = fid->persistent_fid;
888 buf->Data.Fid.VolatileFileId = fid->volatile_fid;
889 buf->Name[0] = 'D';
890 buf->Name[1] = 'H';
891 buf->Name[2] = 'n';
892 buf->Name[3] = 'C';
893 return buf;
894}
895
850static __u8 896static __u8
851parse_lease_state(struct smb2_create_rsp *rsp) 897parse_lease_state(struct smb2_create_rsp *rsp)
852{ 898{
853 char *data_offset; 899 char *data_offset;
854 struct create_lease *lc; 900 struct create_lease *lc;
855 bool found = false; 901 bool found = false;
902 unsigned int next = 0;
903 char *name;
856 904
857 data_offset = (char *)rsp; 905 data_offset = (char *)rsp + 4 + le32_to_cpu(rsp->CreateContextsOffset);
858 data_offset += 4 + le32_to_cpu(rsp->CreateContextsOffset);
859 lc = (struct create_lease *)data_offset; 906 lc = (struct create_lease *)data_offset;
860 do { 907 do {
861 char *name = le16_to_cpu(lc->ccontext.NameOffset) + (char *)lc; 908 lc = (struct create_lease *)((char *)lc + next);
909 name = le16_to_cpu(lc->ccontext.NameOffset) + (char *)lc;
862 if (le16_to_cpu(lc->ccontext.NameLength) != 4 || 910 if (le16_to_cpu(lc->ccontext.NameLength) != 4 ||
863 strncmp(name, "RqLs", 4)) { 911 strncmp(name, "RqLs", 4)) {
864 lc = (struct create_lease *)((char *)lc 912 next = le32_to_cpu(lc->ccontext.Next);
865 + le32_to_cpu(lc->ccontext.Next));
866 continue; 913 continue;
867 } 914 }
868 if (lc->lcontext.LeaseFlags & SMB2_LEASE_FLAG_BREAK_IN_PROGRESS) 915 if (lc->lcontext.LeaseFlags & SMB2_LEASE_FLAG_BREAK_IN_PROGRESS)
869 return SMB2_OPLOCK_LEVEL_NOCHANGE; 916 return SMB2_OPLOCK_LEVEL_NOCHANGE;
870 found = true; 917 found = true;
871 break; 918 break;
872 } while (le32_to_cpu(lc->ccontext.Next) != 0); 919 } while (next != 0);
873 920
874 if (!found) 921 if (!found)
875 return 0; 922 return 0;
@@ -877,23 +924,74 @@ parse_lease_state(struct smb2_create_rsp *rsp)
877 return smb2_map_lease_to_oplock(lc->lcontext.LeaseState); 924 return smb2_map_lease_to_oplock(lc->lcontext.LeaseState);
878} 925}
879 926
927static int
928add_lease_context(struct kvec *iov, unsigned int *num_iovec, __u8 *oplock)
929{
930 struct smb2_create_req *req = iov[0].iov_base;
931 unsigned int num = *num_iovec;
932
933 iov[num].iov_base = create_lease_buf(oplock+1, *oplock);
934 if (iov[num].iov_base == NULL)
935 return -ENOMEM;
936 iov[num].iov_len = sizeof(struct create_lease);
937 req->RequestedOplockLevel = SMB2_OPLOCK_LEVEL_LEASE;
938 if (!req->CreateContextsOffset)
939 req->CreateContextsOffset = cpu_to_le32(
940 sizeof(struct smb2_create_req) - 4 +
941 iov[num - 1].iov_len);
942 req->CreateContextsLength = cpu_to_le32(
943 le32_to_cpu(req->CreateContextsLength) +
944 sizeof(struct create_lease));
945 inc_rfc1001_len(&req->hdr, sizeof(struct create_lease));
946 *num_iovec = num + 1;
947 return 0;
948}
949
950static int
951add_durable_context(struct kvec *iov, unsigned int *num_iovec,
952 struct cifs_open_parms *oparms)
953{
954 struct smb2_create_req *req = iov[0].iov_base;
955 unsigned int num = *num_iovec;
956
957 if (oparms->reconnect) {
958 iov[num].iov_base = create_reconnect_durable_buf(oparms->fid);
959 /* indicate that we don't need to relock the file */
960 oparms->reconnect = false;
961 } else
962 iov[num].iov_base = create_durable_buf();
963 if (iov[num].iov_base == NULL)
964 return -ENOMEM;
965 iov[num].iov_len = sizeof(struct create_durable);
966 if (!req->CreateContextsOffset)
967 req->CreateContextsOffset =
968 cpu_to_le32(sizeof(struct smb2_create_req) - 4 +
969 iov[1].iov_len);
970 req->CreateContextsLength =
971 cpu_to_le32(le32_to_cpu(req->CreateContextsLength) +
972 sizeof(struct create_durable));
973 inc_rfc1001_len(&req->hdr, sizeof(struct create_durable));
974 *num_iovec = num + 1;
975 return 0;
976}
977
880int 978int
881SMB2_open(const unsigned int xid, struct cifs_tcon *tcon, __le16 *path, 979SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
882 u64 *persistent_fid, u64 *volatile_fid, __u32 desired_access,
883 __u32 create_disposition, __u32 file_attributes, __u32 create_options,
884 __u8 *oplock, struct smb2_file_all_info *buf) 980 __u8 *oplock, struct smb2_file_all_info *buf)
885{ 981{
886 struct smb2_create_req *req; 982 struct smb2_create_req *req;
887 struct smb2_create_rsp *rsp; 983 struct smb2_create_rsp *rsp;
888 struct TCP_Server_Info *server; 984 struct TCP_Server_Info *server;
985 struct cifs_tcon *tcon = oparms->tcon;
889 struct cifs_ses *ses = tcon->ses; 986 struct cifs_ses *ses = tcon->ses;
890 struct kvec iov[3]; 987 struct kvec iov[4];
891 int resp_buftype; 988 int resp_buftype;
892 int uni_path_len; 989 int uni_path_len;
893 __le16 *copy_path = NULL; 990 __le16 *copy_path = NULL;
894 int copy_size; 991 int copy_size;
895 int rc = 0; 992 int rc = 0;
896 int num_iovecs = 2; 993 unsigned int num_iovecs = 2;
994 __u32 file_attributes = 0;
897 995
898 cifs_dbg(FYI, "create/open\n"); 996 cifs_dbg(FYI, "create/open\n");
899 997
@@ -906,55 +1004,47 @@ SMB2_open(const unsigned int xid, struct cifs_tcon *tcon, __le16 *path,
906 if (rc) 1004 if (rc)
907 return rc; 1005 return rc;
908 1006
1007 if (oparms->create_options & CREATE_OPTION_READONLY)
1008 file_attributes |= ATTR_READONLY;
1009
909 req->ImpersonationLevel = IL_IMPERSONATION; 1010 req->ImpersonationLevel = IL_IMPERSONATION;
910 req->DesiredAccess = cpu_to_le32(desired_access); 1011 req->DesiredAccess = cpu_to_le32(oparms->desired_access);
911 /* File attributes ignored on open (used in create though) */ 1012 /* File attributes ignored on open (used in create though) */
912 req->FileAttributes = cpu_to_le32(file_attributes); 1013 req->FileAttributes = cpu_to_le32(file_attributes);
913 req->ShareAccess = FILE_SHARE_ALL_LE; 1014 req->ShareAccess = FILE_SHARE_ALL_LE;
914 req->CreateDisposition = cpu_to_le32(create_disposition); 1015 req->CreateDisposition = cpu_to_le32(oparms->disposition);
915 req->CreateOptions = cpu_to_le32(create_options); 1016 req->CreateOptions = cpu_to_le32(oparms->create_options & CREATE_OPTIONS_MASK);
916 uni_path_len = (2 * UniStrnlen((wchar_t *)path, PATH_MAX)) + 2; 1017 uni_path_len = (2 * UniStrnlen((wchar_t *)path, PATH_MAX)) + 2;
917 req->NameOffset = cpu_to_le16(sizeof(struct smb2_create_req) 1018 /* do not count rfc1001 len field */
918 - 8 /* pad */ - 4 /* do not count rfc1001 len field */); 1019 req->NameOffset = cpu_to_le16(sizeof(struct smb2_create_req) - 4);
919 1020
920 iov[0].iov_base = (char *)req; 1021 iov[0].iov_base = (char *)req;
921 /* 4 for rfc1002 length field */ 1022 /* 4 for rfc1002 length field */
922 iov[0].iov_len = get_rfc1002_length(req) + 4; 1023 iov[0].iov_len = get_rfc1002_length(req) + 4;
923 1024
924 /* MUST set path len (NameLength) to 0 opening root of share */ 1025 /* MUST set path len (NameLength) to 0 opening root of share */
925 if (uni_path_len >= 4) { 1026 req->NameLength = cpu_to_le16(uni_path_len - 2);
926 req->NameLength = cpu_to_le16(uni_path_len - 2); 1027 /* -1 since last byte is buf[0] which is sent below (path) */
927 /* -1 since last byte is buf[0] which is sent below (path) */ 1028 iov[0].iov_len--;
928 iov[0].iov_len--; 1029 if (uni_path_len % 8 != 0) {
929 if (uni_path_len % 8 != 0) { 1030 copy_size = uni_path_len / 8 * 8;
930 copy_size = uni_path_len / 8 * 8; 1031 if (copy_size < uni_path_len)
931 if (copy_size < uni_path_len) 1032 copy_size += 8;
932 copy_size += 8; 1033
933 1034 copy_path = kzalloc(copy_size, GFP_KERNEL);
934 copy_path = kzalloc(copy_size, GFP_KERNEL); 1035 if (!copy_path)
935 if (!copy_path) 1036 return -ENOMEM;
936 return -ENOMEM; 1037 memcpy((char *)copy_path, (const char *)path,
937 memcpy((char *)copy_path, (const char *)path, 1038 uni_path_len);
938 uni_path_len); 1039 uni_path_len = copy_size;
939 uni_path_len = copy_size; 1040 path = copy_path;
940 path = copy_path;
941 }
942
943 iov[1].iov_len = uni_path_len;
944 iov[1].iov_base = path;
945 /*
946 * -1 since last byte is buf[0] which was counted in
947 * smb2_buf_len.
948 */
949 inc_rfc1001_len(req, uni_path_len - 1);
950 } else {
951 iov[0].iov_len += 7;
952 req->hdr.smb2_buf_length = cpu_to_be32(be32_to_cpu(
953 req->hdr.smb2_buf_length) + 8 - 1);
954 num_iovecs = 1;
955 req->NameLength = 0;
956 } 1041 }
957 1042
1043 iov[1].iov_len = uni_path_len;
1044 iov[1].iov_base = path;
1045 /* -1 since last byte is buf[0] which was counted in smb2_buf_len */
1046 inc_rfc1001_len(req, uni_path_len - 1);
1047
958 if (!server->oplocks) 1048 if (!server->oplocks)
959 *oplock = SMB2_OPLOCK_LEVEL_NONE; 1049 *oplock = SMB2_OPLOCK_LEVEL_NONE;
960 1050
@@ -962,21 +1052,29 @@ SMB2_open(const unsigned int xid, struct cifs_tcon *tcon, __le16 *path,
962 *oplock == SMB2_OPLOCK_LEVEL_NONE) 1052 *oplock == SMB2_OPLOCK_LEVEL_NONE)
963 req->RequestedOplockLevel = *oplock; 1053 req->RequestedOplockLevel = *oplock;
964 else { 1054 else {
965 iov[num_iovecs].iov_base = create_lease_buf(oplock+1, *oplock); 1055 rc = add_lease_context(iov, &num_iovecs, oplock);
966 if (iov[num_iovecs].iov_base == NULL) { 1056 if (rc) {
967 cifs_small_buf_release(req); 1057 cifs_small_buf_release(req);
968 kfree(copy_path); 1058 kfree(copy_path);
969 return -ENOMEM; 1059 return rc;
1060 }
1061 }
1062
1063 if (*oplock == SMB2_OPLOCK_LEVEL_BATCH) {
1064 /* need to set Next field of lease context if we request it */
1065 if (tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LEASING) {
1066 struct create_context *ccontext =
1067 (struct create_context *)iov[num_iovecs-1].iov_base;
1068 ccontext->Next =
1069 cpu_to_le32(sizeof(struct create_lease));
1070 }
1071 rc = add_durable_context(iov, &num_iovecs, oparms);
1072 if (rc) {
1073 cifs_small_buf_release(req);
1074 kfree(copy_path);
1075 kfree(iov[num_iovecs-1].iov_base);
1076 return rc;
970 } 1077 }
971 iov[num_iovecs].iov_len = sizeof(struct create_lease);
972 req->RequestedOplockLevel = SMB2_OPLOCK_LEVEL_LEASE;
973 req->CreateContextsOffset = cpu_to_le32(
974 sizeof(struct smb2_create_req) - 4 - 8 +
975 iov[num_iovecs-1].iov_len);
976 req->CreateContextsLength = cpu_to_le32(
977 sizeof(struct create_lease));
978 inc_rfc1001_len(&req->hdr, sizeof(struct create_lease));
979 num_iovecs++;
980 } 1078 }
981 1079
982 rc = SendReceive2(xid, ses, iov, num_iovecs, &resp_buftype, 0); 1080 rc = SendReceive2(xid, ses, iov, num_iovecs, &resp_buftype, 0);
@@ -987,8 +1085,8 @@ SMB2_open(const unsigned int xid, struct cifs_tcon *tcon, __le16 *path,
987 goto creat_exit; 1085 goto creat_exit;
988 } 1086 }
989 1087
990 *persistent_fid = rsp->PersistentFileId; 1088 oparms->fid->persistent_fid = rsp->PersistentFileId;
991 *volatile_fid = rsp->VolatileFileId; 1089 oparms->fid->volatile_fid = rsp->VolatileFileId;
992 1090
993 if (buf) { 1091 if (buf) {
994 memcpy(buf, &rsp->CreationTime, 32); 1092 memcpy(buf, &rsp->CreationTime, 32);
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index f31043b26bd3..36b0d37ea69b 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -428,7 +428,7 @@ struct smb2_create_req {
428 __le16 NameLength; 428 __le16 NameLength;
429 __le32 CreateContextsOffset; 429 __le32 CreateContextsOffset;
430 __le32 CreateContextsLength; 430 __le32 CreateContextsLength;
431 __u8 Buffer[8]; 431 __u8 Buffer[0];
432} __packed; 432} __packed;
433 433
434struct smb2_create_rsp { 434struct smb2_create_rsp {
@@ -485,6 +485,18 @@ struct create_lease {
485 struct lease_context lcontext; 485 struct lease_context lcontext;
486} __packed; 486} __packed;
487 487
488struct create_durable {
489 struct create_context ccontext;
490 __u8 Name[8];
491 union {
492 __u8 Reserved[16];
493 struct {
494 __u64 PersistentFileId;
495 __u64 VolatileFileId;
496 } Fid;
497 } Data;
498} __packed;
499
488/* this goes in the ioctl buffer when doing a copychunk request */ 500/* this goes in the ioctl buffer when doing a copychunk request */
489struct copychunk_ioctl { 501struct copychunk_ioctl {
490 char SourceKey[24]; 502 char SourceKey[24];
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index d4e1eb807457..1a5ecbed40ed 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -84,11 +84,9 @@ extern int smb2_create_hardlink(const unsigned int xid, struct cifs_tcon *tcon,
84 const char *from_name, const char *to_name, 84 const char *from_name, const char *to_name,
85 struct cifs_sb_info *cifs_sb); 85 struct cifs_sb_info *cifs_sb);
86 86
87extern int smb2_open_file(const unsigned int xid, struct cifs_tcon *tcon, 87extern int smb2_open_file(const unsigned int xid,
88 const char *full_path, int disposition, 88 struct cifs_open_parms *oparms,
89 int desired_access, int create_options, 89 __u32 *oplock, FILE_ALL_INFO *buf);
90 struct cifs_fid *fid, __u32 *oplock,
91 FILE_ALL_INFO *buf, struct cifs_sb_info *cifs_sb);
92extern void smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock); 90extern void smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock);
93extern int smb2_unlock_range(struct cifsFileInfo *cfile, 91extern int smb2_unlock_range(struct cifsFileInfo *cfile,
94 struct file_lock *flock, const unsigned int xid); 92 struct file_lock *flock, const unsigned int xid);
@@ -106,11 +104,9 @@ extern int SMB2_tcon(const unsigned int xid, struct cifs_ses *ses,
106 const char *tree, struct cifs_tcon *tcon, 104 const char *tree, struct cifs_tcon *tcon,
107 const struct nls_table *); 105 const struct nls_table *);
108extern int SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon); 106extern int SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon);
109extern int SMB2_open(const unsigned int xid, struct cifs_tcon *tcon, 107extern int SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms,
110 __le16 *path, u64 *persistent_fid, u64 *volatile_fid, 108 __le16 *path, __u8 *oplock,
111 __u32 desired_access, __u32 create_disposition, 109 struct smb2_file_all_info *buf);
112 __u32 file_attributes, __u32 create_options,
113 __u8 *oplock, struct smb2_file_all_info *buf);
114extern int SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, 110extern int SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon,
115 u64 persistent_fid, u64 volatile_fid, u32 opcode, 111 u64 persistent_fid, u64 volatile_fid, u32 opcode,
116 bool is_fsctl, char *in_data, u32 indatalen, 112 bool is_fsctl, char *in_data, u32 indatalen,
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 09b4fbaadeb6..301b191270b9 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -39,6 +39,77 @@
39#include "smb2status.h" 39#include "smb2status.h"
40#include "smb2glob.h" 40#include "smb2glob.h"
41 41
42static int
43smb2_crypto_shash_allocate(struct TCP_Server_Info *server)
44{
45 unsigned int size;
46
47 if (server->secmech.sdeschmacsha256 != NULL)
48 return 0; /* already allocated */
49
50 server->secmech.hmacsha256 = crypto_alloc_shash("hmac(sha256)", 0, 0);
51 if (IS_ERR(server->secmech.hmacsha256)) {
52 cifs_dbg(VFS, "could not allocate crypto hmacsha256\n");
53 return PTR_ERR(server->secmech.hmacsha256);
54 }
55
56 size = sizeof(struct shash_desc) +
57 crypto_shash_descsize(server->secmech.hmacsha256);
58 server->secmech.sdeschmacsha256 = kmalloc(size, GFP_KERNEL);
59 if (!server->secmech.sdeschmacsha256) {
60 crypto_free_shash(server->secmech.hmacsha256);
61 server->secmech.hmacsha256 = NULL;
62 return -ENOMEM;
63 }
64 server->secmech.sdeschmacsha256->shash.tfm = server->secmech.hmacsha256;
65 server->secmech.sdeschmacsha256->shash.flags = 0x0;
66
67 return 0;
68}
69
70static int
71smb3_crypto_shash_allocate(struct TCP_Server_Info *server)
72{
73 unsigned int size;
74 int rc;
75
76 if (server->secmech.sdesccmacaes != NULL)
77 return 0; /* already allocated */
78
79 rc = smb2_crypto_shash_allocate(server);
80 if (rc)
81 return rc;
82
83 server->secmech.cmacaes = crypto_alloc_shash("cmac(aes)", 0, 0);
84 if (IS_ERR(server->secmech.cmacaes)) {
85 cifs_dbg(VFS, "could not allocate crypto cmac-aes");
86 kfree(server->secmech.sdeschmacsha256);
87 server->secmech.sdeschmacsha256 = NULL;
88 crypto_free_shash(server->secmech.hmacsha256);
89 server->secmech.hmacsha256 = NULL;
90 return PTR_ERR(server->secmech.cmacaes);
91 }
92
93 size = sizeof(struct shash_desc) +
94 crypto_shash_descsize(server->secmech.cmacaes);
95 server->secmech.sdesccmacaes = kmalloc(size, GFP_KERNEL);
96 if (!server->secmech.sdesccmacaes) {
97 cifs_dbg(VFS, "%s: Can't alloc cmacaes\n", __func__);
98 kfree(server->secmech.sdeschmacsha256);
99 server->secmech.sdeschmacsha256 = NULL;
100 crypto_free_shash(server->secmech.hmacsha256);
101 crypto_free_shash(server->secmech.cmacaes);
102 server->secmech.hmacsha256 = NULL;
103 server->secmech.cmacaes = NULL;
104 return -ENOMEM;
105 }
106 server->secmech.sdesccmacaes->shash.tfm = server->secmech.cmacaes;
107 server->secmech.sdesccmacaes->shash.flags = 0x0;
108
109 return 0;
110}
111
112
42int 113int
43smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) 114smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
44{ 115{
@@ -52,6 +123,12 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
52 memset(smb2_signature, 0x0, SMB2_HMACSHA256_SIZE); 123 memset(smb2_signature, 0x0, SMB2_HMACSHA256_SIZE);
53 memset(smb2_pdu->Signature, 0x0, SMB2_SIGNATURE_SIZE); 124 memset(smb2_pdu->Signature, 0x0, SMB2_SIGNATURE_SIZE);
54 125
126 rc = smb2_crypto_shash_allocate(server);
127 if (rc) {
128 cifs_dbg(VFS, "%s: shah256 alloc failed\n", __func__);
129 return rc;
130 }
131
55 rc = crypto_shash_setkey(server->secmech.hmacsha256, 132 rc = crypto_shash_setkey(server->secmech.hmacsha256,
56 server->session_key.response, SMB2_NTLMV2_SESSKEY_SIZE); 133 server->session_key.response, SMB2_NTLMV2_SESSKEY_SIZE);
57 if (rc) { 134 if (rc) {
@@ -61,7 +138,7 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
61 138
62 rc = crypto_shash_init(&server->secmech.sdeschmacsha256->shash); 139 rc = crypto_shash_init(&server->secmech.sdeschmacsha256->shash);
63 if (rc) { 140 if (rc) {
64 cifs_dbg(VFS, "%s: Could not init md5\n", __func__); 141 cifs_dbg(VFS, "%s: Could not init sha256", __func__);
65 return rc; 142 return rc;
66 } 143 }
67 144
@@ -129,6 +206,12 @@ generate_smb3signingkey(struct TCP_Server_Info *server)
129 memset(prfhash, 0x0, SMB2_HMACSHA256_SIZE); 206 memset(prfhash, 0x0, SMB2_HMACSHA256_SIZE);
130 memset(server->smb3signingkey, 0x0, SMB3_SIGNKEY_SIZE); 207 memset(server->smb3signingkey, 0x0, SMB3_SIGNKEY_SIZE);
131 208
209 rc = smb3_crypto_shash_allocate(server);
210 if (rc) {
211 cifs_dbg(VFS, "%s: crypto alloc failed\n", __func__);
212 goto smb3signkey_ret;
213 }
214
132 rc = crypto_shash_setkey(server->secmech.hmacsha256, 215 rc = crypto_shash_setkey(server->secmech.hmacsha256,
133 server->session_key.response, SMB2_NTLMV2_SESSKEY_SIZE); 216 server->session_key.response, SMB2_NTLMV2_SESSKEY_SIZE);
134 if (rc) { 217 if (rc) {
@@ -210,6 +293,11 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
210 return rc; 293 return rc;
211 } 294 }
212 295
296 /*
297 * we already allocate sdesccmacaes when we init smb3 signing key,
298 * so unlike smb2 case we do not have to check here if secmech are
299 * initialized
300 */
213 rc = crypto_shash_init(&server->secmech.sdesccmacaes->shash); 301 rc = crypto_shash_init(&server->secmech.sdesccmacaes->shash);
214 if (rc) { 302 if (rc) {
215 cifs_dbg(VFS, "%s: Could not init cmac aes\n", __func__); 303 cifs_dbg(VFS, "%s: Could not init cmac aes\n", __func__);
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 14a14808320c..190effc6a6fa 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -526,7 +526,7 @@ static int coda_dentry_revalidate(struct dentry *de, unsigned int flags)
526 if (cii->c_flags & C_FLUSH) 526 if (cii->c_flags & C_FLUSH)
527 coda_flag_inode_children(inode, C_FLUSH); 527 coda_flag_inode_children(inode, C_FLUSH);
528 528
529 if (de->d_count > 1) 529 if (d_count(de) > 1)
530 /* pretend it's valid, but don't change the flags */ 530 /* pretend it's valid, but don't change the flags */
531 goto out; 531 goto out;
532 532
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 64e5323cbbb0..277bd1be21fd 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -387,7 +387,7 @@ static void remove_dir(struct dentry * d)
387 if (d->d_inode) 387 if (d->d_inode)
388 simple_rmdir(parent->d_inode,d); 388 simple_rmdir(parent->d_inode,d);
389 389
390 pr_debug(" o %s removing done (%d)\n",d->d_name.name, d->d_count); 390 pr_debug(" o %s removing done (%d)\n",d->d_name.name, d_count(d));
391 391
392 dput(parent); 392 dput(parent);
393} 393}
@@ -660,19 +660,15 @@ static int create_default_group(struct config_group *parent_group,
660 struct config_group *group) 660 struct config_group *group)
661{ 661{
662 int ret; 662 int ret;
663 struct qstr name;
664 struct configfs_dirent *sd; 663 struct configfs_dirent *sd;
665 /* We trust the caller holds a reference to parent */ 664 /* We trust the caller holds a reference to parent */
666 struct dentry *child, *parent = parent_group->cg_item.ci_dentry; 665 struct dentry *child, *parent = parent_group->cg_item.ci_dentry;
667 666
668 if (!group->cg_item.ci_name) 667 if (!group->cg_item.ci_name)
669 group->cg_item.ci_name = group->cg_item.ci_namebuf; 668 group->cg_item.ci_name = group->cg_item.ci_namebuf;
670 name.name = group->cg_item.ci_name;
671 name.len = strlen(name.name);
672 name.hash = full_name_hash(name.name, name.len);
673 669
674 ret = -ENOMEM; 670 ret = -ENOMEM;
675 child = d_alloc(parent, &name); 671 child = d_alloc_name(parent, group->cg_item.ci_name);
676 if (child) { 672 if (child) {
677 d_add(child, NULL); 673 d_add(child, NULL);
678 674
@@ -1650,7 +1646,6 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
1650{ 1646{
1651 int err; 1647 int err;
1652 struct config_group *group = &subsys->su_group; 1648 struct config_group *group = &subsys->su_group;
1653 struct qstr name;
1654 struct dentry *dentry; 1649 struct dentry *dentry;
1655 struct dentry *root; 1650 struct dentry *root;
1656 struct configfs_dirent *sd; 1651 struct configfs_dirent *sd;
@@ -1667,12 +1662,8 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
1667 1662
1668 mutex_lock_nested(&root->d_inode->i_mutex, I_MUTEX_PARENT); 1663 mutex_lock_nested(&root->d_inode->i_mutex, I_MUTEX_PARENT);
1669 1664
1670 name.name = group->cg_item.ci_name;
1671 name.len = strlen(name.name);
1672 name.hash = full_name_hash(name.name, name.len);
1673
1674 err = -ENOMEM; 1665 err = -ENOMEM;
1675 dentry = d_alloc(root, &name); 1666 dentry = d_alloc_name(root, group->cg_item.ci_name);
1676 if (dentry) { 1667 if (dentry) {
1677 d_add(dentry, NULL); 1668 d_add(dentry, NULL);
1678 1669
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index cfa109a4d5a2..d10757635b9c 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -37,16 +37,8 @@
37#include <asm/unaligned.h> 37#include <asm/unaligned.h>
38#include "ecryptfs_kernel.h" 38#include "ecryptfs_kernel.h"
39 39
40static int 40#define DECRYPT 0
41ecryptfs_decrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat, 41#define ENCRYPT 1
42 struct page *dst_page, int dst_offset,
43 struct page *src_page, int src_offset, int size,
44 unsigned char *iv);
45static int
46ecryptfs_encrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat,
47 struct page *dst_page, int dst_offset,
48 struct page *src_page, int src_offset, int size,
49 unsigned char *iv);
50 42
51/** 43/**
52 * ecryptfs_to_hex 44 * ecryptfs_to_hex
@@ -336,19 +328,20 @@ static void extent_crypt_complete(struct crypto_async_request *req, int rc)
336} 328}
337 329
338/** 330/**
339 * encrypt_scatterlist 331 * crypt_scatterlist
340 * @crypt_stat: Pointer to the crypt_stat struct to initialize. 332 * @crypt_stat: Pointer to the crypt_stat struct to initialize.
341 * @dest_sg: Destination of encrypted data 333 * @dst_sg: Destination of the data after performing the crypto operation
342 * @src_sg: Data to be encrypted 334 * @src_sg: Data to be encrypted or decrypted
343 * @size: Length of data to be encrypted 335 * @size: Length of data
344 * @iv: iv to use during encryption 336 * @iv: IV to use
337 * @op: ENCRYPT or DECRYPT to indicate the desired operation
345 * 338 *
346 * Returns the number of bytes encrypted; negative value on error 339 * Returns the number of bytes encrypted or decrypted; negative value on error
347 */ 340 */
348static int encrypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat, 341static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
349 struct scatterlist *dest_sg, 342 struct scatterlist *dst_sg,
350 struct scatterlist *src_sg, int size, 343 struct scatterlist *src_sg, int size,
351 unsigned char *iv) 344 unsigned char *iv, int op)
352{ 345{
353 struct ablkcipher_request *req = NULL; 346 struct ablkcipher_request *req = NULL;
354 struct extent_crypt_result ecr; 347 struct extent_crypt_result ecr;
@@ -391,9 +384,9 @@ static int encrypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
391 crypt_stat->flags |= ECRYPTFS_KEY_SET; 384 crypt_stat->flags |= ECRYPTFS_KEY_SET;
392 } 385 }
393 mutex_unlock(&crypt_stat->cs_tfm_mutex); 386 mutex_unlock(&crypt_stat->cs_tfm_mutex);
394 ecryptfs_printk(KERN_DEBUG, "Encrypting [%d] bytes.\n", size); 387 ablkcipher_request_set_crypt(req, src_sg, dst_sg, size, iv);
395 ablkcipher_request_set_crypt(req, src_sg, dest_sg, size, iv); 388 rc = op == ENCRYPT ? crypto_ablkcipher_encrypt(req) :
396 rc = crypto_ablkcipher_encrypt(req); 389 crypto_ablkcipher_decrypt(req);
397 if (rc == -EINPROGRESS || rc == -EBUSY) { 390 if (rc == -EINPROGRESS || rc == -EBUSY) {
398 struct extent_crypt_result *ecr = req->base.data; 391 struct extent_crypt_result *ecr = req->base.data;
399 392
@@ -407,41 +400,43 @@ out:
407} 400}
408 401
409/** 402/**
410 * ecryptfs_lower_offset_for_extent 403 * lower_offset_for_page
411 * 404 *
412 * Convert an eCryptfs page index into a lower byte offset 405 * Convert an eCryptfs page index into a lower byte offset
413 */ 406 */
414static void ecryptfs_lower_offset_for_extent(loff_t *offset, loff_t extent_num, 407static loff_t lower_offset_for_page(struct ecryptfs_crypt_stat *crypt_stat,
415 struct ecryptfs_crypt_stat *crypt_stat) 408 struct page *page)
416{ 409{
417 (*offset) = ecryptfs_lower_header_size(crypt_stat) 410 return ecryptfs_lower_header_size(crypt_stat) +
418 + (crypt_stat->extent_size * extent_num); 411 (page->index << PAGE_CACHE_SHIFT);
419} 412}
420 413
421/** 414/**
422 * ecryptfs_encrypt_extent 415 * crypt_extent
423 * @enc_extent_page: Allocated page into which to encrypt the data in
424 * @page
425 * @crypt_stat: crypt_stat containing cryptographic context for the 416 * @crypt_stat: crypt_stat containing cryptographic context for the
426 * encryption operation 417 * encryption operation
427 * @page: Page containing plaintext data extent to encrypt 418 * @dst_page: The page to write the result into
419 * @src_page: The page to read from
428 * @extent_offset: Page extent offset for use in generating IV 420 * @extent_offset: Page extent offset for use in generating IV
421 * @op: ENCRYPT or DECRYPT to indicate the desired operation
429 * 422 *
430 * Encrypts one extent of data. 423 * Encrypts or decrypts one extent of data.
431 * 424 *
432 * Return zero on success; non-zero otherwise 425 * Return zero on success; non-zero otherwise
433 */ 426 */
434static int ecryptfs_encrypt_extent(struct page *enc_extent_page, 427static int crypt_extent(struct ecryptfs_crypt_stat *crypt_stat,
435 struct ecryptfs_crypt_stat *crypt_stat, 428 struct page *dst_page,
436 struct page *page, 429 struct page *src_page,
437 unsigned long extent_offset) 430 unsigned long extent_offset, int op)
438{ 431{
432 pgoff_t page_index = op == ENCRYPT ? src_page->index : dst_page->index;
439 loff_t extent_base; 433 loff_t extent_base;
440 char extent_iv[ECRYPTFS_MAX_IV_BYTES]; 434 char extent_iv[ECRYPTFS_MAX_IV_BYTES];
435 struct scatterlist src_sg, dst_sg;
436 size_t extent_size = crypt_stat->extent_size;
441 int rc; 437 int rc;
442 438
443 extent_base = (((loff_t)page->index) 439 extent_base = (((loff_t)page_index) * (PAGE_CACHE_SIZE / extent_size));
444 * (PAGE_CACHE_SIZE / crypt_stat->extent_size));
445 rc = ecryptfs_derive_iv(extent_iv, crypt_stat, 440 rc = ecryptfs_derive_iv(extent_iv, crypt_stat,
446 (extent_base + extent_offset)); 441 (extent_base + extent_offset));
447 if (rc) { 442 if (rc) {
@@ -450,15 +445,21 @@ static int ecryptfs_encrypt_extent(struct page *enc_extent_page,
450 (unsigned long long)(extent_base + extent_offset), rc); 445 (unsigned long long)(extent_base + extent_offset), rc);
451 goto out; 446 goto out;
452 } 447 }
453 rc = ecryptfs_encrypt_page_offset(crypt_stat, enc_extent_page, 0, 448
454 page, (extent_offset 449 sg_init_table(&src_sg, 1);
455 * crypt_stat->extent_size), 450 sg_init_table(&dst_sg, 1);
456 crypt_stat->extent_size, extent_iv); 451
452 sg_set_page(&src_sg, src_page, extent_size,
453 extent_offset * extent_size);
454 sg_set_page(&dst_sg, dst_page, extent_size,
455 extent_offset * extent_size);
456
457 rc = crypt_scatterlist(crypt_stat, &dst_sg, &src_sg, extent_size,
458 extent_iv, op);
457 if (rc < 0) { 459 if (rc < 0) {
458 printk(KERN_ERR "%s: Error attempting to encrypt page with " 460 printk(KERN_ERR "%s: Error attempting to crypt page with "
459 "page->index = [%ld], extent_offset = [%ld]; " 461 "page_index = [%ld], extent_offset = [%ld]; "
460 "rc = [%d]\n", __func__, page->index, extent_offset, 462 "rc = [%d]\n", __func__, page_index, extent_offset, rc);
461 rc);
462 goto out; 463 goto out;
463 } 464 }
464 rc = 0; 465 rc = 0;
@@ -489,6 +490,7 @@ int ecryptfs_encrypt_page(struct page *page)
489 char *enc_extent_virt; 490 char *enc_extent_virt;
490 struct page *enc_extent_page = NULL; 491 struct page *enc_extent_page = NULL;
491 loff_t extent_offset; 492 loff_t extent_offset;
493 loff_t lower_offset;
492 int rc = 0; 494 int rc = 0;
493 495
494 ecryptfs_inode = page->mapping->host; 496 ecryptfs_inode = page->mapping->host;
@@ -502,75 +504,35 @@ int ecryptfs_encrypt_page(struct page *page)
502 "encrypted extent\n"); 504 "encrypted extent\n");
503 goto out; 505 goto out;
504 } 506 }
505 enc_extent_virt = kmap(enc_extent_page); 507
506 for (extent_offset = 0; 508 for (extent_offset = 0;
507 extent_offset < (PAGE_CACHE_SIZE / crypt_stat->extent_size); 509 extent_offset < (PAGE_CACHE_SIZE / crypt_stat->extent_size);
508 extent_offset++) { 510 extent_offset++) {
509 loff_t offset; 511 rc = crypt_extent(crypt_stat, enc_extent_page, page,
510 512 extent_offset, ENCRYPT);
511 rc = ecryptfs_encrypt_extent(enc_extent_page, crypt_stat, page,
512 extent_offset);
513 if (rc) { 513 if (rc) {
514 printk(KERN_ERR "%s: Error encrypting extent; " 514 printk(KERN_ERR "%s: Error encrypting extent; "
515 "rc = [%d]\n", __func__, rc); 515 "rc = [%d]\n", __func__, rc);
516 goto out; 516 goto out;
517 } 517 }
518 ecryptfs_lower_offset_for_extent(
519 &offset, ((((loff_t)page->index)
520 * (PAGE_CACHE_SIZE
521 / crypt_stat->extent_size))
522 + extent_offset), crypt_stat);
523 rc = ecryptfs_write_lower(ecryptfs_inode, enc_extent_virt,
524 offset, crypt_stat->extent_size);
525 if (rc < 0) {
526 ecryptfs_printk(KERN_ERR, "Error attempting "
527 "to write lower page; rc = [%d]"
528 "\n", rc);
529 goto out;
530 }
531 }
532 rc = 0;
533out:
534 if (enc_extent_page) {
535 kunmap(enc_extent_page);
536 __free_page(enc_extent_page);
537 } 518 }
538 return rc;
539}
540 519
541static int ecryptfs_decrypt_extent(struct page *page, 520 lower_offset = lower_offset_for_page(crypt_stat, page);
542 struct ecryptfs_crypt_stat *crypt_stat, 521 enc_extent_virt = kmap(enc_extent_page);
543 struct page *enc_extent_page, 522 rc = ecryptfs_write_lower(ecryptfs_inode, enc_extent_virt, lower_offset,
544 unsigned long extent_offset) 523 PAGE_CACHE_SIZE);
545{ 524 kunmap(enc_extent_page);
546 loff_t extent_base;
547 char extent_iv[ECRYPTFS_MAX_IV_BYTES];
548 int rc;
549
550 extent_base = (((loff_t)page->index)
551 * (PAGE_CACHE_SIZE / crypt_stat->extent_size));
552 rc = ecryptfs_derive_iv(extent_iv, crypt_stat,
553 (extent_base + extent_offset));
554 if (rc) {
555 ecryptfs_printk(KERN_ERR, "Error attempting to derive IV for "
556 "extent [0x%.16llx]; rc = [%d]\n",
557 (unsigned long long)(extent_base + extent_offset), rc);
558 goto out;
559 }
560 rc = ecryptfs_decrypt_page_offset(crypt_stat, page,
561 (extent_offset
562 * crypt_stat->extent_size),
563 enc_extent_page, 0,
564 crypt_stat->extent_size, extent_iv);
565 if (rc < 0) { 525 if (rc < 0) {
566 printk(KERN_ERR "%s: Error attempting to decrypt to page with " 526 ecryptfs_printk(KERN_ERR,
567 "page->index = [%ld], extent_offset = [%ld]; " 527 "Error attempting to write lower page; rc = [%d]\n",
568 "rc = [%d]\n", __func__, page->index, extent_offset, 528 rc);
569 rc);
570 goto out; 529 goto out;
571 } 530 }
572 rc = 0; 531 rc = 0;
573out: 532out:
533 if (enc_extent_page) {
534 __free_page(enc_extent_page);
535 }
574 return rc; 536 return rc;
575} 537}
576 538
@@ -594,43 +556,33 @@ int ecryptfs_decrypt_page(struct page *page)
594{ 556{
595 struct inode *ecryptfs_inode; 557 struct inode *ecryptfs_inode;
596 struct ecryptfs_crypt_stat *crypt_stat; 558 struct ecryptfs_crypt_stat *crypt_stat;
597 char *enc_extent_virt; 559 char *page_virt;
598 struct page *enc_extent_page = NULL;
599 unsigned long extent_offset; 560 unsigned long extent_offset;
561 loff_t lower_offset;
600 int rc = 0; 562 int rc = 0;
601 563
602 ecryptfs_inode = page->mapping->host; 564 ecryptfs_inode = page->mapping->host;
603 crypt_stat = 565 crypt_stat =
604 &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat); 566 &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat);
605 BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)); 567 BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED));
606 enc_extent_page = alloc_page(GFP_USER); 568
607 if (!enc_extent_page) { 569 lower_offset = lower_offset_for_page(crypt_stat, page);
608 rc = -ENOMEM; 570 page_virt = kmap(page);
609 ecryptfs_printk(KERN_ERR, "Error allocating memory for " 571 rc = ecryptfs_read_lower(page_virt, lower_offset, PAGE_CACHE_SIZE,
610 "encrypted extent\n"); 572 ecryptfs_inode);
573 kunmap(page);
574 if (rc < 0) {
575 ecryptfs_printk(KERN_ERR,
576 "Error attempting to read lower page; rc = [%d]\n",
577 rc);
611 goto out; 578 goto out;
612 } 579 }
613 enc_extent_virt = kmap(enc_extent_page); 580
614 for (extent_offset = 0; 581 for (extent_offset = 0;
615 extent_offset < (PAGE_CACHE_SIZE / crypt_stat->extent_size); 582 extent_offset < (PAGE_CACHE_SIZE / crypt_stat->extent_size);
616 extent_offset++) { 583 extent_offset++) {
617 loff_t offset; 584 rc = crypt_extent(crypt_stat, page, page,
618 585 extent_offset, DECRYPT);
619 ecryptfs_lower_offset_for_extent(
620 &offset, ((page->index * (PAGE_CACHE_SIZE
621 / crypt_stat->extent_size))
622 + extent_offset), crypt_stat);
623 rc = ecryptfs_read_lower(enc_extent_virt, offset,
624 crypt_stat->extent_size,
625 ecryptfs_inode);
626 if (rc < 0) {
627 ecryptfs_printk(KERN_ERR, "Error attempting "
628 "to read lower page; rc = [%d]"
629 "\n", rc);
630 goto out;
631 }
632 rc = ecryptfs_decrypt_extent(page, crypt_stat, enc_extent_page,
633 extent_offset);
634 if (rc) { 586 if (rc) {
635 printk(KERN_ERR "%s: Error encrypting extent; " 587 printk(KERN_ERR "%s: Error encrypting extent; "
636 "rc = [%d]\n", __func__, rc); 588 "rc = [%d]\n", __func__, rc);
@@ -638,140 +590,7 @@ int ecryptfs_decrypt_page(struct page *page)
638 } 590 }
639 } 591 }
640out: 592out:
641 if (enc_extent_page) {
642 kunmap(enc_extent_page);
643 __free_page(enc_extent_page);
644 }
645 return rc;
646}
647
648/**
649 * decrypt_scatterlist
650 * @crypt_stat: Cryptographic context
651 * @dest_sg: The destination scatterlist to decrypt into
652 * @src_sg: The source scatterlist to decrypt from
653 * @size: The number of bytes to decrypt
654 * @iv: The initialization vector to use for the decryption
655 *
656 * Returns the number of bytes decrypted; negative value on error
657 */
658static int decrypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
659 struct scatterlist *dest_sg,
660 struct scatterlist *src_sg, int size,
661 unsigned char *iv)
662{
663 struct ablkcipher_request *req = NULL;
664 struct extent_crypt_result ecr;
665 int rc = 0;
666
667 BUG_ON(!crypt_stat || !crypt_stat->tfm
668 || !(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED));
669 if (unlikely(ecryptfs_verbosity > 0)) {
670 ecryptfs_printk(KERN_DEBUG, "Key size [%zd]; key:\n",
671 crypt_stat->key_size);
672 ecryptfs_dump_hex(crypt_stat->key,
673 crypt_stat->key_size);
674 }
675
676 init_completion(&ecr.completion);
677
678 mutex_lock(&crypt_stat->cs_tfm_mutex);
679 req = ablkcipher_request_alloc(crypt_stat->tfm, GFP_NOFS);
680 if (!req) {
681 mutex_unlock(&crypt_stat->cs_tfm_mutex);
682 rc = -ENOMEM;
683 goto out;
684 }
685
686 ablkcipher_request_set_callback(req,
687 CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
688 extent_crypt_complete, &ecr);
689 /* Consider doing this once, when the file is opened */
690 if (!(crypt_stat->flags & ECRYPTFS_KEY_SET)) {
691 rc = crypto_ablkcipher_setkey(crypt_stat->tfm, crypt_stat->key,
692 crypt_stat->key_size);
693 if (rc) {
694 ecryptfs_printk(KERN_ERR,
695 "Error setting key; rc = [%d]\n",
696 rc);
697 mutex_unlock(&crypt_stat->cs_tfm_mutex);
698 rc = -EINVAL;
699 goto out;
700 }
701 crypt_stat->flags |= ECRYPTFS_KEY_SET;
702 }
703 mutex_unlock(&crypt_stat->cs_tfm_mutex);
704 ecryptfs_printk(KERN_DEBUG, "Decrypting [%d] bytes.\n", size);
705 ablkcipher_request_set_crypt(req, src_sg, dest_sg, size, iv);
706 rc = crypto_ablkcipher_decrypt(req);
707 if (rc == -EINPROGRESS || rc == -EBUSY) {
708 struct extent_crypt_result *ecr = req->base.data;
709
710 wait_for_completion(&ecr->completion);
711 rc = ecr->rc;
712 INIT_COMPLETION(ecr->completion);
713 }
714out:
715 ablkcipher_request_free(req);
716 return rc; 593 return rc;
717
718}
719
720/**
721 * ecryptfs_encrypt_page_offset
722 * @crypt_stat: The cryptographic context
723 * @dst_page: The page to encrypt into
724 * @dst_offset: The offset in the page to encrypt into
725 * @src_page: The page to encrypt from
726 * @src_offset: The offset in the page to encrypt from
727 * @size: The number of bytes to encrypt
728 * @iv: The initialization vector to use for the encryption
729 *
730 * Returns the number of bytes encrypted
731 */
732static int
733ecryptfs_encrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat,
734 struct page *dst_page, int dst_offset,
735 struct page *src_page, int src_offset, int size,
736 unsigned char *iv)
737{
738 struct scatterlist src_sg, dst_sg;
739
740 sg_init_table(&src_sg, 1);
741 sg_init_table(&dst_sg, 1);
742
743 sg_set_page(&src_sg, src_page, size, src_offset);
744 sg_set_page(&dst_sg, dst_page, size, dst_offset);
745 return encrypt_scatterlist(crypt_stat, &dst_sg, &src_sg, size, iv);
746}
747
748/**
749 * ecryptfs_decrypt_page_offset
750 * @crypt_stat: The cryptographic context
751 * @dst_page: The page to decrypt into
752 * @dst_offset: The offset in the page to decrypt into
753 * @src_page: The page to decrypt from
754 * @src_offset: The offset in the page to decrypt from
755 * @size: The number of bytes to decrypt
756 * @iv: The initialization vector to use for the decryption
757 *
758 * Returns the number of bytes decrypted
759 */
760static int
761ecryptfs_decrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat,
762 struct page *dst_page, int dst_offset,
763 struct page *src_page, int src_offset, int size,
764 unsigned char *iv)
765{
766 struct scatterlist src_sg, dst_sg;
767
768 sg_init_table(&src_sg, 1);
769 sg_set_page(&src_sg, src_page, size, src_offset);
770
771 sg_init_table(&dst_sg, 1);
772 sg_set_page(&dst_sg, dst_page, size, dst_offset);
773
774 return decrypt_scatterlist(crypt_stat, &dst_sg, &src_sg, size, iv);
775} 594}
776 595
777#define ECRYPTFS_MAX_SCATTERLIST_LEN 4 596#define ECRYPTFS_MAX_SCATTERLIST_LEN 4
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 24f1105fda3a..992cf95830b5 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -49,7 +49,7 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb,
49 unsigned long nr_segs, loff_t pos) 49 unsigned long nr_segs, loff_t pos)
50{ 50{
51 ssize_t rc; 51 ssize_t rc;
52 struct path lower; 52 struct path *path;
53 struct file *file = iocb->ki_filp; 53 struct file *file = iocb->ki_filp;
54 54
55 rc = generic_file_aio_read(iocb, iov, nr_segs, pos); 55 rc = generic_file_aio_read(iocb, iov, nr_segs, pos);
@@ -60,9 +60,8 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb,
60 if (-EIOCBQUEUED == rc) 60 if (-EIOCBQUEUED == rc)
61 rc = wait_on_sync_kiocb(iocb); 61 rc = wait_on_sync_kiocb(iocb);
62 if (rc >= 0) { 62 if (rc >= 0) {
63 lower.dentry = ecryptfs_dentry_to_lower(file->f_path.dentry); 63 path = ecryptfs_dentry_to_lower_path(file->f_path.dentry);
64 lower.mnt = ecryptfs_dentry_to_lower_mnt(file->f_path.dentry); 64 touch_atime(path);
65 touch_atime(&lower);
66 } 65 }
67 return rc; 66 return rc;
68} 67}
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index a2f2bb2c256d..67e9b6339691 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -358,7 +358,7 @@ static int ecryptfs_lookup_interpose(struct dentry *dentry,
358 358
359 lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent)); 359 lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent));
360 fsstack_copy_attr_atime(dir_inode, lower_dentry->d_parent->d_inode); 360 fsstack_copy_attr_atime(dir_inode, lower_dentry->d_parent->d_inode);
361 BUG_ON(!lower_dentry->d_count); 361 BUG_ON(!d_count(lower_dentry));
362 362
363 ecryptfs_set_dentry_private(dentry, dentry_info); 363 ecryptfs_set_dentry_private(dentry, dentry_info);
364 ecryptfs_set_dentry_lower(dentry, lower_dentry); 364 ecryptfs_set_dentry_lower(dentry, lower_dentry);
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index e924cf45aad9..eb1c5979ecaf 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -120,16 +120,15 @@ static int ecryptfs_init_lower_file(struct dentry *dentry,
120 struct file **lower_file) 120 struct file **lower_file)
121{ 121{
122 const struct cred *cred = current_cred(); 122 const struct cred *cred = current_cred();
123 struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); 123 struct path *path = ecryptfs_dentry_to_lower_path(dentry);
124 struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry);
125 int rc; 124 int rc;
126 125
127 rc = ecryptfs_privileged_open(lower_file, lower_dentry, lower_mnt, 126 rc = ecryptfs_privileged_open(lower_file, path->dentry, path->mnt,
128 cred); 127 cred);
129 if (rc) { 128 if (rc) {
130 printk(KERN_ERR "Error opening lower file " 129 printk(KERN_ERR "Error opening lower file "
131 "for lower_dentry [0x%p] and lower_mnt [0x%p]; " 130 "for lower_dentry [0x%p] and lower_mnt [0x%p]; "
132 "rc = [%d]\n", lower_dentry, lower_mnt, rc); 131 "rc = [%d]\n", path->dentry, path->mnt, rc);
133 (*lower_file) = NULL; 132 (*lower_file) = NULL;
134 } 133 }
135 return rc; 134 return rc;
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index 49ff8ea08f1c..e57380e5f6bd 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -247,14 +247,13 @@ int ecryptfs_process_response(struct ecryptfs_daemon *daemon,
247 goto unlock; 247 goto unlock;
248 } 248 }
249 msg_size = (sizeof(*msg) + msg->data_len); 249 msg_size = (sizeof(*msg) + msg->data_len);
250 msg_ctx->msg = kmalloc(msg_size, GFP_KERNEL); 250 msg_ctx->msg = kmemdup(msg, msg_size, GFP_KERNEL);
251 if (!msg_ctx->msg) { 251 if (!msg_ctx->msg) {
252 rc = -ENOMEM; 252 rc = -ENOMEM;
253 printk(KERN_ERR "%s: Failed to allocate [%zd] bytes of " 253 printk(KERN_ERR "%s: Failed to allocate [%zd] bytes of "
254 "GFP_KERNEL memory\n", __func__, msg_size); 254 "GFP_KERNEL memory\n", __func__, msg_size);
255 goto unlock; 255 goto unlock;
256 } 256 }
257 memcpy(msg_ctx->msg, msg, msg_size);
258 msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_DONE; 257 msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_DONE;
259 wake_up_process(msg_ctx->task); 258 wake_up_process(msg_ctx->task);
260 rc = 0; 259 rc = 0;
diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c
index 7e787fb90293..07ab49745e31 100644
--- a/fs/efivarfs/inode.c
+++ b/fs/efivarfs/inode.c
@@ -155,20 +155,8 @@ static int efivarfs_unlink(struct inode *dir, struct dentry *dentry)
155 return 0; 155 return 0;
156}; 156};
157 157
158/*
159 * Handle negative dentry.
160 */
161static struct dentry *efivarfs_lookup(struct inode *dir, struct dentry *dentry,
162 unsigned int flags)
163{
164 if (dentry->d_name.len > NAME_MAX)
165 return ERR_PTR(-ENAMETOOLONG);
166 d_add(dentry, NULL);
167 return NULL;
168}
169
170const struct inode_operations efivarfs_dir_inode_operations = { 158const struct inode_operations efivarfs_dir_inode_operations = {
171 .lookup = efivarfs_lookup, 159 .lookup = simple_lookup,
172 .unlink = efivarfs_unlink, 160 .unlink = efivarfs_unlink,
173 .create = efivarfs_create, 161 .create = efivarfs_create,
174}; 162};
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index b31dbd4c46ad..1cb9c7e10c6f 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -48,9 +48,13 @@ int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
48 48
49 trace_ext3_sync_file_enter(file, datasync); 49 trace_ext3_sync_file_enter(file, datasync);
50 50
51 if (inode->i_sb->s_flags & MS_RDONLY) 51 if (inode->i_sb->s_flags & MS_RDONLY) {
52 /* Make sure that we read updated state */
53 smp_rmb();
54 if (EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)
55 return -EROFS;
52 return 0; 56 return 0;
53 57 }
54 ret = filemap_write_and_wait_range(inode->i_mapping, start, end); 58 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
55 if (ret) 59 if (ret)
56 goto out; 60 goto out;
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 998ea111e537..1194b1f0f839 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1780,11 +1780,11 @@ retry:
1780 inode->i_op = &ext3_file_inode_operations; 1780 inode->i_op = &ext3_file_inode_operations;
1781 inode->i_fop = &ext3_file_operations; 1781 inode->i_fop = &ext3_file_operations;
1782 ext3_set_aops(inode); 1782 ext3_set_aops(inode);
1783 d_tmpfile(dentry, inode);
1783 err = ext3_orphan_add(handle, inode); 1784 err = ext3_orphan_add(handle, inode);
1784 if (err) 1785 if (err)
1785 goto err_drop_inode; 1786 goto err_drop_inode;
1786 mark_inode_dirty(inode); 1787 mark_inode_dirty(inode);
1787 d_tmpfile(dentry, inode);
1788 unlock_new_inode(inode); 1788 unlock_new_inode(inode);
1789 } 1789 }
1790 ext3_journal_stop(handle); 1790 ext3_journal_stop(handle);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 6356665a74bb..c47f14750722 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -174,6 +174,11 @@ static void ext3_handle_error(struct super_block *sb)
174 if (test_opt (sb, ERRORS_RO)) { 174 if (test_opt (sb, ERRORS_RO)) {
175 ext3_msg(sb, KERN_CRIT, 175 ext3_msg(sb, KERN_CRIT,
176 "error: remounting filesystem read-only"); 176 "error: remounting filesystem read-only");
177 /*
178 * Make sure updated value of ->s_mount_state will be visible
179 * before ->s_flags update.
180 */
181 smp_wmb();
177 sb->s_flags |= MS_RDONLY; 182 sb->s_flags |= MS_RDONLY;
178 } 183 }
179 ext3_commit_super(sb, es, 1); 184 ext3_commit_super(sb, es, 1);
@@ -291,8 +296,14 @@ void ext3_abort(struct super_block *sb, const char *function,
291 ext3_msg(sb, KERN_CRIT, 296 ext3_msg(sb, KERN_CRIT,
292 "error: remounting filesystem read-only"); 297 "error: remounting filesystem read-only");
293 EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS; 298 EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
294 sb->s_flags |= MS_RDONLY;
295 set_opt(EXT3_SB(sb)->s_mount_opt, ABORT); 299 set_opt(EXT3_SB(sb)->s_mount_opt, ABORT);
300 /*
301 * Make sure updated value of ->s_mount_state will be visible
302 * before ->s_flags update.
303 */
304 smp_wmb();
305 sb->s_flags |= MS_RDONLY;
306
296 if (EXT3_SB(sb)->s_journal) 307 if (EXT3_SB(sb)->s_journal)
297 journal_abort(EXT3_SB(sb)->s_journal, -EIO); 308 journal_abort(EXT3_SB(sb)->s_journal, -EIO);
298} 309}
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 58339393fa6e..ddd715e42a5c 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -38,8 +38,8 @@ ext4_group_t ext4_get_group_number(struct super_block *sb,
38 ext4_group_t group; 38 ext4_group_t group;
39 39
40 if (test_opt2(sb, STD_GROUP_SIZE)) 40 if (test_opt2(sb, STD_GROUP_SIZE))
41 group = (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) + 41 group = (block -
42 block) >> 42 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) >>
43 (EXT4_BLOCK_SIZE_BITS(sb) + EXT4_CLUSTER_BITS(sb) + 3); 43 (EXT4_BLOCK_SIZE_BITS(sb) + EXT4_CLUSTER_BITS(sb) + 3);
44 else 44 else
45 ext4_get_group_no_and_offset(sb, block, &group, NULL); 45 ext4_get_group_no_and_offset(sb, block, &group, NULL);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 7097b0f680e6..a61873808f76 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2835,6 +2835,9 @@ again:
2835 err = -EIO; 2835 err = -EIO;
2836 break; 2836 break;
2837 } 2837 }
2838 /* Yield here to deal with large extent trees.
2839 * Should be a no-op if we did IO above. */
2840 cond_resched();
2838 if (WARN_ON(i + 1 > depth)) { 2841 if (WARN_ON(i + 1 > depth)) {
2839 err = -EIO; 2842 err = -EIO;
2840 break; 2843 break;
@@ -4261,8 +4264,8 @@ got_allocated_blocks:
4261 /* not a good idea to call discard here directly, 4264 /* not a good idea to call discard here directly,
4262 * but otherwise we'd need to call it every free() */ 4265 * but otherwise we'd need to call it every free() */
4263 ext4_discard_preallocations(inode); 4266 ext4_discard_preallocations(inode);
4264 ext4_free_blocks(handle, inode, NULL, ext4_ext_pblock(&newex), 4267 ext4_free_blocks(handle, inode, NULL, newblock,
4265 ext4_ext_get_actual_len(&newex), fb_flags); 4268 EXT4_C2B(sbi, allocated_clusters), fb_flags);
4266 goto out2; 4269 goto out2;
4267 } 4270 }
4268 4271
@@ -4382,8 +4385,9 @@ out2:
4382 } 4385 }
4383 4386
4384out3: 4387out3:
4385 trace_ext4_ext_map_blocks_exit(inode, flags, map, err ? err : allocated); 4388 trace_ext4_ext_map_blocks_exit(inode, flags, map,
4386 4389 err ? err : allocated);
4390 ext4_es_lru_add(inode);
4387 return err ? err : allocated; 4391 return err ? err : allocated;
4388} 4392}
4389 4393
@@ -4405,9 +4409,20 @@ void ext4_ext_truncate(handle_t *handle, struct inode *inode)
4405 4409
4406 last_block = (inode->i_size + sb->s_blocksize - 1) 4410 last_block = (inode->i_size + sb->s_blocksize - 1)
4407 >> EXT4_BLOCK_SIZE_BITS(sb); 4411 >> EXT4_BLOCK_SIZE_BITS(sb);
4412retry:
4408 err = ext4_es_remove_extent(inode, last_block, 4413 err = ext4_es_remove_extent(inode, last_block,
4409 EXT_MAX_BLOCKS - last_block); 4414 EXT_MAX_BLOCKS - last_block);
4415 if (err == ENOMEM) {
4416 cond_resched();
4417 congestion_wait(BLK_RW_ASYNC, HZ/50);
4418 goto retry;
4419 }
4420 if (err) {
4421 ext4_std_error(inode->i_sb, err);
4422 return;
4423 }
4410 err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); 4424 err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
4425 ext4_std_error(inode->i_sb, err);
4411} 4426}
4412 4427
4413static void ext4_falloc_update_inode(struct inode *inode, 4428static void ext4_falloc_update_inode(struct inode *inode,
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index ee018d5f397e..91cb110da1b4 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -148,6 +148,8 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
148 ext4_lblk_t end); 148 ext4_lblk_t end);
149static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, 149static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
150 int nr_to_scan); 150 int nr_to_scan);
151static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
152 struct ext4_inode_info *locked_ei);
151 153
152int __init ext4_init_es(void) 154int __init ext4_init_es(void)
153{ 155{
@@ -439,7 +441,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
439 */ 441 */
440 if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) { 442 if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) {
441 if (in_range(es->es_lblk, ee_block, ee_len)) { 443 if (in_range(es->es_lblk, ee_block, ee_len)) {
442 pr_warn("ES insert assertation failed for " 444 pr_warn("ES insert assertion failed for "
443 "inode: %lu we can find an extent " 445 "inode: %lu we can find an extent "
444 "at block [%d/%d/%llu/%c], but we " 446 "at block [%d/%d/%llu/%c], but we "
445 "want to add an delayed/hole extent " 447 "want to add an delayed/hole extent "
@@ -458,7 +460,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
458 */ 460 */
459 if (es->es_lblk < ee_block || 461 if (es->es_lblk < ee_block ||
460 ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) { 462 ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) {
461 pr_warn("ES insert assertation failed for inode: %lu " 463 pr_warn("ES insert assertion failed for inode: %lu "
462 "ex_status [%d/%d/%llu/%c] != " 464 "ex_status [%d/%d/%llu/%c] != "
463 "es_status [%d/%d/%llu/%c]\n", inode->i_ino, 465 "es_status [%d/%d/%llu/%c]\n", inode->i_ino,
464 ee_block, ee_len, ee_start, 466 ee_block, ee_len, ee_start,
@@ -468,7 +470,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
468 } 470 }
469 471
470 if (ee_status ^ es_status) { 472 if (ee_status ^ es_status) {
471 pr_warn("ES insert assertation failed for inode: %lu " 473 pr_warn("ES insert assertion failed for inode: %lu "
472 "ex_status [%d/%d/%llu/%c] != " 474 "ex_status [%d/%d/%llu/%c] != "
473 "es_status [%d/%d/%llu/%c]\n", inode->i_ino, 475 "es_status [%d/%d/%llu/%c]\n", inode->i_ino,
474 ee_block, ee_len, ee_start, 476 ee_block, ee_len, ee_start,
@@ -481,7 +483,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
481 * that we don't want to add an written/unwritten extent. 483 * that we don't want to add an written/unwritten extent.
482 */ 484 */
483 if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) { 485 if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) {
484 pr_warn("ES insert assertation failed for inode: %lu " 486 pr_warn("ES insert assertion failed for inode: %lu "
485 "can't find an extent at block %d but we want " 487 "can't find an extent at block %d but we want "
486 "to add an written/unwritten extent " 488 "to add an written/unwritten extent "
487 "[%d/%d/%llu/%llx]\n", inode->i_ino, 489 "[%d/%d/%llu/%llx]\n", inode->i_ino,
@@ -519,7 +521,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode,
519 * We want to add a delayed/hole extent but this 521 * We want to add a delayed/hole extent but this
520 * block has been allocated. 522 * block has been allocated.
521 */ 523 */
522 pr_warn("ES insert assertation failed for inode: %lu " 524 pr_warn("ES insert assertion failed for inode: %lu "
523 "We can find blocks but we want to add a " 525 "We can find blocks but we want to add a "
524 "delayed/hole extent [%d/%d/%llu/%llx]\n", 526 "delayed/hole extent [%d/%d/%llu/%llx]\n",
525 inode->i_ino, es->es_lblk, es->es_len, 527 inode->i_ino, es->es_lblk, es->es_len,
@@ -527,13 +529,13 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode,
527 return; 529 return;
528 } else if (ext4_es_is_written(es)) { 530 } else if (ext4_es_is_written(es)) {
529 if (retval != es->es_len) { 531 if (retval != es->es_len) {
530 pr_warn("ES insert assertation failed for " 532 pr_warn("ES insert assertion failed for "
531 "inode: %lu retval %d != es_len %d\n", 533 "inode: %lu retval %d != es_len %d\n",
532 inode->i_ino, retval, es->es_len); 534 inode->i_ino, retval, es->es_len);
533 return; 535 return;
534 } 536 }
535 if (map.m_pblk != ext4_es_pblock(es)) { 537 if (map.m_pblk != ext4_es_pblock(es)) {
536 pr_warn("ES insert assertation failed for " 538 pr_warn("ES insert assertion failed for "
537 "inode: %lu m_pblk %llu != " 539 "inode: %lu m_pblk %llu != "
538 "es_pblk %llu\n", 540 "es_pblk %llu\n",
539 inode->i_ino, map.m_pblk, 541 inode->i_ino, map.m_pblk,
@@ -549,7 +551,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode,
549 } 551 }
550 } else if (retval == 0) { 552 } else if (retval == 0) {
551 if (ext4_es_is_written(es)) { 553 if (ext4_es_is_written(es)) {
552 pr_warn("ES insert assertation failed for inode: %lu " 554 pr_warn("ES insert assertion failed for inode: %lu "
553 "We can't find the block but we want to add " 555 "We can't find the block but we want to add "
554 "an written extent [%d/%d/%llu/%llx]\n", 556 "an written extent [%d/%d/%llu/%llx]\n",
555 inode->i_ino, es->es_lblk, es->es_len, 557 inode->i_ino, es->es_lblk, es->es_len,
@@ -632,10 +634,8 @@ out:
632} 634}
633 635
634/* 636/*
635 * ext4_es_insert_extent() adds a space to a extent status tree. 637 * ext4_es_insert_extent() adds information to an inode's extent
636 * 638 * status tree.
637 * ext4_es_insert_extent is called by ext4_da_write_begin and
638 * ext4_es_remove_extent.
639 * 639 *
640 * Return 0 on success, error code on failure. 640 * Return 0 on success, error code on failure.
641 */ 641 */
@@ -667,7 +667,13 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
667 err = __es_remove_extent(inode, lblk, end); 667 err = __es_remove_extent(inode, lblk, end);
668 if (err != 0) 668 if (err != 0)
669 goto error; 669 goto error;
670retry:
670 err = __es_insert_extent(inode, &newes); 671 err = __es_insert_extent(inode, &newes);
672 if (err == -ENOMEM && __ext4_es_shrink(EXT4_SB(inode->i_sb), 1,
673 EXT4_I(inode)))
674 goto retry;
675 if (err == -ENOMEM && !ext4_es_is_delayed(&newes))
676 err = 0;
671 677
672error: 678error:
673 write_unlock(&EXT4_I(inode)->i_es_lock); 679 write_unlock(&EXT4_I(inode)->i_es_lock);
@@ -746,8 +752,10 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
746 struct extent_status orig_es; 752 struct extent_status orig_es;
747 ext4_lblk_t len1, len2; 753 ext4_lblk_t len1, len2;
748 ext4_fsblk_t block; 754 ext4_fsblk_t block;
749 int err = 0; 755 int err;
750 756
757retry:
758 err = 0;
751 es = __es_tree_search(&tree->root, lblk); 759 es = __es_tree_search(&tree->root, lblk);
752 if (!es) 760 if (!es)
753 goto out; 761 goto out;
@@ -782,6 +790,10 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
782 if (err) { 790 if (err) {
783 es->es_lblk = orig_es.es_lblk; 791 es->es_lblk = orig_es.es_lblk;
784 es->es_len = orig_es.es_len; 792 es->es_len = orig_es.es_len;
793 if ((err == -ENOMEM) &&
794 __ext4_es_shrink(EXT4_SB(inode->i_sb), 1,
795 EXT4_I(inode)))
796 goto retry;
785 goto out; 797 goto out;
786 } 798 }
787 } else { 799 } else {
@@ -891,22 +903,14 @@ static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a,
891 return -1; 903 return -1;
892} 904}
893 905
894static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) 906static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
907 struct ext4_inode_info *locked_ei)
895{ 908{
896 struct ext4_sb_info *sbi = container_of(shrink,
897 struct ext4_sb_info, s_es_shrinker);
898 struct ext4_inode_info *ei; 909 struct ext4_inode_info *ei;
899 struct list_head *cur, *tmp; 910 struct list_head *cur, *tmp;
900 LIST_HEAD(skiped); 911 LIST_HEAD(skiped);
901 int nr_to_scan = sc->nr_to_scan;
902 int ret, nr_shrunk = 0; 912 int ret, nr_shrunk = 0;
903 913
904 ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
905 trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret);
906
907 if (!nr_to_scan)
908 return ret;
909
910 spin_lock(&sbi->s_es_lru_lock); 914 spin_lock(&sbi->s_es_lru_lock);
911 915
912 /* 916 /*
@@ -935,7 +939,7 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
935 continue; 939 continue;
936 } 940 }
937 941
938 if (ei->i_es_lru_nr == 0) 942 if (ei->i_es_lru_nr == 0 || ei == locked_ei)
939 continue; 943 continue;
940 944
941 write_lock(&ei->i_es_lock); 945 write_lock(&ei->i_es_lock);
@@ -954,6 +958,27 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
954 list_splice_tail(&skiped, &sbi->s_es_lru); 958 list_splice_tail(&skiped, &sbi->s_es_lru);
955 spin_unlock(&sbi->s_es_lru_lock); 959 spin_unlock(&sbi->s_es_lru_lock);
956 960
961 if (locked_ei && nr_shrunk == 0)
962 nr_shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan);
963
964 return nr_shrunk;
965}
966
967static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
968{
969 struct ext4_sb_info *sbi = container_of(shrink,
970 struct ext4_sb_info, s_es_shrinker);
971 int nr_to_scan = sc->nr_to_scan;
972 int ret, nr_shrunk;
973
974 ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
975 trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret);
976
977 if (!nr_to_scan)
978 return ret;
979
980 nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL);
981
957 ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); 982 ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
958 trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret); 983 trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret);
959 return ret; 984 return ret;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 0188e65e1f58..ba33c67d6e48 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -465,7 +465,7 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
465 if (es_map->m_lblk != map->m_lblk || 465 if (es_map->m_lblk != map->m_lblk ||
466 es_map->m_flags != map->m_flags || 466 es_map->m_flags != map->m_flags ||
467 es_map->m_pblk != map->m_pblk) { 467 es_map->m_pblk != map->m_pblk) {
468 printk("ES cache assertation failed for inode: %lu " 468 printk("ES cache assertion failed for inode: %lu "
469 "es_cached ex [%d/%d/%llu/%x] != " 469 "es_cached ex [%d/%d/%llu/%x] != "
470 "found ex [%d/%d/%llu/%x] retval %d flags %x\n", 470 "found ex [%d/%d/%llu/%x] retval %d flags %x\n",
471 inode->i_ino, es_map->m_lblk, es_map->m_len, 471 inode->i_ino, es_map->m_lblk, es_map->m_len,
@@ -514,10 +514,9 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
514 "logical block %lu\n", inode->i_ino, flags, map->m_len, 514 "logical block %lu\n", inode->i_ino, flags, map->m_len,
515 (unsigned long) map->m_lblk); 515 (unsigned long) map->m_lblk);
516 516
517 ext4_es_lru_add(inode);
518
519 /* Lookup extent status tree firstly */ 517 /* Lookup extent status tree firstly */
520 if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { 518 if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
519 ext4_es_lru_add(inode);
521 if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { 520 if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
522 map->m_pblk = ext4_es_pblock(&es) + 521 map->m_pblk = ext4_es_pblock(&es) +
523 map->m_lblk - es.es_lblk; 522 map->m_lblk - es.es_lblk;
@@ -558,7 +557,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
558 557
559#ifdef ES_AGGRESSIVE_TEST 558#ifdef ES_AGGRESSIVE_TEST
560 if (retval != map->m_len) { 559 if (retval != map->m_len) {
561 printk("ES len assertation failed for inode: %lu " 560 printk("ES len assertion failed for inode: %lu "
562 "retval %d != map->m_len %d " 561 "retval %d != map->m_len %d "
563 "in %s (lookup)\n", inode->i_ino, retval, 562 "in %s (lookup)\n", inode->i_ino, retval,
564 map->m_len, __func__); 563 map->m_len, __func__);
@@ -659,7 +658,7 @@ found:
659 658
660#ifdef ES_AGGRESSIVE_TEST 659#ifdef ES_AGGRESSIVE_TEST
661 if (retval != map->m_len) { 660 if (retval != map->m_len) {
662 printk("ES len assertation failed for inode: %lu " 661 printk("ES len assertion failed for inode: %lu "
663 "retval %d != map->m_len %d " 662 "retval %d != map->m_len %d "
664 "in %s (allocation)\n", inode->i_ino, retval, 663 "in %s (allocation)\n", inode->i_ino, retval,
665 map->m_len, __func__); 664 map->m_len, __func__);
@@ -1529,11 +1528,9 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
1529 "logical block %lu\n", inode->i_ino, map->m_len, 1528 "logical block %lu\n", inode->i_ino, map->m_len,
1530 (unsigned long) map->m_lblk); 1529 (unsigned long) map->m_lblk);
1531 1530
1532 ext4_es_lru_add(inode);
1533
1534 /* Lookup extent status tree firstly */ 1531 /* Lookup extent status tree firstly */
1535 if (ext4_es_lookup_extent(inode, iblock, &es)) { 1532 if (ext4_es_lookup_extent(inode, iblock, &es)) {
1536 1533 ext4_es_lru_add(inode);
1537 if (ext4_es_is_hole(&es)) { 1534 if (ext4_es_is_hole(&es)) {
1538 retval = 0; 1535 retval = 0;
1539 down_read((&EXT4_I(inode)->i_data_sem)); 1536 down_read((&EXT4_I(inode)->i_data_sem));
@@ -1642,7 +1639,7 @@ add_delayed:
1642 1639
1643#ifdef ES_AGGRESSIVE_TEST 1640#ifdef ES_AGGRESSIVE_TEST
1644 if (retval != map->m_len) { 1641 if (retval != map->m_len) {
1645 printk("ES len assertation failed for inode: %lu " 1642 printk("ES len assertion failed for inode: %lu "
1646 "retval %d != map->m_len %d " 1643 "retval %d != map->m_len %d "
1647 "in %s (lookup)\n", inode->i_ino, retval, 1644 "in %s (lookup)\n", inode->i_ino, retval,
1648 map->m_len, __func__); 1645 map->m_len, __func__);
@@ -2163,7 +2160,7 @@ static int mpage_map_and_submit_extent(handle_t *handle,
2163 2160
2164 mpd->io_submit.io_end->offset = 2161 mpd->io_submit.io_end->offset =
2165 ((loff_t)map->m_lblk) << inode->i_blkbits; 2162 ((loff_t)map->m_lblk) << inode->i_blkbits;
2166 while (map->m_len) { 2163 do {
2167 err = mpage_map_one_extent(handle, mpd); 2164 err = mpage_map_one_extent(handle, mpd);
2168 if (err < 0) { 2165 if (err < 0) {
2169 struct super_block *sb = inode->i_sb; 2166 struct super_block *sb = inode->i_sb;
@@ -2201,7 +2198,7 @@ static int mpage_map_and_submit_extent(handle_t *handle,
2201 err = mpage_map_and_submit_buffers(mpd); 2198 err = mpage_map_and_submit_buffers(mpd);
2202 if (err < 0) 2199 if (err < 0)
2203 return err; 2200 return err;
2204 } 2201 } while (map->m_len);
2205 2202
2206 /* Update on-disk size after IO is submitted */ 2203 /* Update on-disk size after IO is submitted */
2207 disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT; 2204 disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index a9ff5e5137ca..4bbbf13bd743 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4740,11 +4740,16 @@ do_more:
4740 * blocks being freed are metadata. these blocks shouldn't 4740 * blocks being freed are metadata. these blocks shouldn't
4741 * be used until this transaction is committed 4741 * be used until this transaction is committed
4742 */ 4742 */
4743 retry:
4743 new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS); 4744 new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS);
4744 if (!new_entry) { 4745 if (!new_entry) {
4745 ext4_mb_unload_buddy(&e4b); 4746 /*
4746 err = -ENOMEM; 4747 * We use a retry loop because
4747 goto error_return; 4748 * ext4_free_blocks() is not allowed to fail.
4749 */
4750 cond_resched();
4751 congestion_wait(BLK_RW_ASYNC, HZ/50);
4752 goto retry;
4748 } 4753 }
4749 new_entry->efd_start_cluster = bit; 4754 new_entry->efd_start_cluster = bit;
4750 new_entry->efd_group = block_group; 4755 new_entry->efd_group = block_group;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 234b834d5a97..35f55a0dbc4b 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2316,11 +2316,11 @@ retry:
2316 inode->i_op = &ext4_file_inode_operations; 2316 inode->i_op = &ext4_file_inode_operations;
2317 inode->i_fop = &ext4_file_operations; 2317 inode->i_fop = &ext4_file_operations;
2318 ext4_set_aops(inode); 2318 ext4_set_aops(inode);
2319 d_tmpfile(dentry, inode);
2319 err = ext4_orphan_add(handle, inode); 2320 err = ext4_orphan_add(handle, inode);
2320 if (err) 2321 if (err)
2321 goto err_drop_inode; 2322 goto err_drop_inode;
2322 mark_inode_dirty(inode); 2323 mark_inode_dirty(inode);
2323 d_tmpfile(dentry, inode);
2324 unlock_new_inode(inode); 2324 unlock_new_inode(inode);
2325 } 2325 }
2326 if (handle) 2326 if (handle)
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 48786cdb5e6c..6625d210fb45 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -25,6 +25,7 @@
25#include <linux/kernel.h> 25#include <linux/kernel.h>
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/mm.h> 27#include <linux/mm.h>
28#include <linux/ratelimit.h>
28 29
29#include "ext4_jbd2.h" 30#include "ext4_jbd2.h"
30#include "xattr.h" 31#include "xattr.h"
@@ -55,7 +56,7 @@ void ext4_exit_pageio(void)
55static void buffer_io_error(struct buffer_head *bh) 56static void buffer_io_error(struct buffer_head *bh)
56{ 57{
57 char b[BDEVNAME_SIZE]; 58 char b[BDEVNAME_SIZE];
58 printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n", 59 printk_ratelimited(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n",
59 bdevname(bh->b_bdev, b), 60 bdevname(bh->b_bdev, b),
60 (unsigned long long)bh->b_blocknr); 61 (unsigned long long)bh->b_blocknr);
61} 62}
@@ -308,6 +309,7 @@ ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)
308 return io_end; 309 return io_end;
309} 310}
310 311
312/* BIO completion function for page writeback */
311static void ext4_end_bio(struct bio *bio, int error) 313static void ext4_end_bio(struct bio *bio, int error)
312{ 314{
313 ext4_io_end_t *io_end = bio->bi_private; 315 ext4_io_end_t *io_end = bio->bi_private;
@@ -318,18 +320,6 @@ static void ext4_end_bio(struct bio *bio, int error)
318 if (test_bit(BIO_UPTODATE, &bio->bi_flags)) 320 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
319 error = 0; 321 error = 0;
320 322
321 if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
322 /*
323 * Link bio into list hanging from io_end. We have to do it
324 * atomically as bio completions can be racing against each
325 * other.
326 */
327 bio->bi_private = xchg(&io_end->bio, bio);
328 } else {
329 ext4_finish_bio(bio);
330 bio_put(bio);
331 }
332
333 if (error) { 323 if (error) {
334 struct inode *inode = io_end->inode; 324 struct inode *inode = io_end->inode;
335 325
@@ -341,7 +331,24 @@ static void ext4_end_bio(struct bio *bio, int error)
341 (unsigned long long) 331 (unsigned long long)
342 bi_sector >> (inode->i_blkbits - 9)); 332 bi_sector >> (inode->i_blkbits - 9));
343 } 333 }
344 ext4_put_io_end_defer(io_end); 334
335 if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
336 /*
337 * Link bio into list hanging from io_end. We have to do it
338 * atomically as bio completions can be racing against each
339 * other.
340 */
341 bio->bi_private = xchg(&io_end->bio, bio);
342 ext4_put_io_end_defer(io_end);
343 } else {
344 /*
345 * Drop io_end reference early. Inode can get freed once
346 * we finish the bio.
347 */
348 ext4_put_io_end_defer(io_end);
349 ext4_finish_bio(bio);
350 bio_put(bio);
351 }
345} 352}
346 353
347void ext4_io_submit(struct ext4_io_submit *io) 354void ext4_io_submit(struct ext4_io_submit *io)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 85b3dd60169b..bca26f34edf4 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1702,12 +1702,6 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
1702 1702
1703 if (sbi->s_qf_names[GRPQUOTA]) 1703 if (sbi->s_qf_names[GRPQUOTA])
1704 seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); 1704 seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
1705
1706 if (test_opt(sb, USRQUOTA))
1707 seq_puts(seq, ",usrquota");
1708
1709 if (test_opt(sb, GRPQUOTA))
1710 seq_puts(seq, ",grpquota");
1711#endif 1705#endif
1712} 1706}
1713 1707
@@ -3624,10 +3618,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3624 sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); 3618 sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
3625 sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); 3619 sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));
3626 3620
3627 /* Do we have standard group size of blocksize * 8 blocks ? */
3628 if (sbi->s_blocks_per_group == blocksize << 3)
3629 set_opt2(sb, STD_GROUP_SIZE);
3630
3631 for (i = 0; i < 4; i++) 3621 for (i = 0; i < 4; i++)
3632 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); 3622 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
3633 sbi->s_def_hash_version = es->s_def_hash_version; 3623 sbi->s_def_hash_version = es->s_def_hash_version;
@@ -3697,6 +3687,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3697 goto failed_mount; 3687 goto failed_mount;
3698 } 3688 }
3699 3689
3690 /* Do we have standard group size of clustersize * 8 blocks ? */
3691 if (sbi->s_blocks_per_group == clustersize << 3)
3692 set_opt2(sb, STD_GROUP_SIZE);
3693
3700 /* 3694 /*
3701 * Test whether we have more sectors than will fit in sector_t, 3695 * Test whether we have more sectors than will fit in sector_t,
3702 * and whether the max offset is addressable by the page cache. 3696 * and whether the max offset is addressable by the page cache.
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 9d1cd423450d..62f0d5977c64 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -610,13 +610,12 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
610{ 610{
611 struct inode *inode = file_inode(file); 611 struct inode *inode = file_inode(file);
612 unsigned long npages = dir_blocks(inode); 612 unsigned long npages = dir_blocks(inode);
613 unsigned int bit_pos = 0, start_bit_pos = 0; 613 unsigned int bit_pos = 0;
614 struct f2fs_dentry_block *dentry_blk = NULL; 614 struct f2fs_dentry_block *dentry_blk = NULL;
615 struct f2fs_dir_entry *de = NULL; 615 struct f2fs_dir_entry *de = NULL;
616 struct page *dentry_page = NULL; 616 struct page *dentry_page = NULL;
617 unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK); 617 unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK);
618 unsigned char d_type = DT_UNKNOWN; 618 unsigned char d_type = DT_UNKNOWN;
619 int slots;
620 619
621 bit_pos = ((unsigned long)ctx->pos % NR_DENTRY_IN_BLOCK); 620 bit_pos = ((unsigned long)ctx->pos % NR_DENTRY_IN_BLOCK);
622 621
@@ -625,7 +624,6 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
625 if (IS_ERR(dentry_page)) 624 if (IS_ERR(dentry_page))
626 continue; 625 continue;
627 626
628 start_bit_pos = bit_pos;
629 dentry_blk = kmap(dentry_page); 627 dentry_blk = kmap(dentry_page);
630 while (bit_pos < NR_DENTRY_IN_BLOCK) { 628 while (bit_pos < NR_DENTRY_IN_BLOCK) {
631 bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, 629 bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
@@ -634,19 +632,19 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
634 if (bit_pos >= NR_DENTRY_IN_BLOCK) 632 if (bit_pos >= NR_DENTRY_IN_BLOCK)
635 break; 633 break;
636 634
637 ctx->pos += bit_pos - start_bit_pos;
638 de = &dentry_blk->dentry[bit_pos]; 635 de = &dentry_blk->dentry[bit_pos];
639 if (de->file_type < F2FS_FT_MAX) 636 if (de->file_type < F2FS_FT_MAX)
640 d_type = f2fs_filetype_table[de->file_type]; 637 d_type = f2fs_filetype_table[de->file_type];
641 else 638 else
642 d_type = DT_UNKNOWN; 639 d_type = DT_UNKNOWN;
643 if (!dir_emit(ctx, 640 if (!dir_emit(ctx,
644 dentry_blk->filename[bit_pos], 641 dentry_blk->filename[bit_pos],
645 le16_to_cpu(de->name_len), 642 le16_to_cpu(de->name_len),
646 le32_to_cpu(de->ino), d_type)) 643 le32_to_cpu(de->ino), d_type))
647 goto success; 644 goto stop;
648 slots = GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); 645
649 bit_pos += slots; 646 bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
647 ctx->pos = n * NR_DENTRY_IN_BLOCK + bit_pos;
650 } 648 }
651 bit_pos = 0; 649 bit_pos = 0;
652 ctx->pos = (n + 1) * NR_DENTRY_IN_BLOCK; 650 ctx->pos = (n + 1) * NR_DENTRY_IN_BLOCK;
@@ -654,7 +652,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
654 f2fs_put_page(dentry_page, 1); 652 f2fs_put_page(dentry_page, 1);
655 dentry_page = NULL; 653 dentry_page = NULL;
656 } 654 }
657success: 655stop:
658 if (dentry_page && !IS_ERR(dentry_page)) { 656 if (dentry_page && !IS_ERR(dentry_page)) {
659 kunmap(dentry_page); 657 kunmap(dentry_page);
660 f2fs_put_page(dentry_page, 1); 658 f2fs_put_page(dentry_page, 1);
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 21664fcf3616..4241e6f39e86 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -86,6 +86,7 @@ struct msdos_sb_info {
86 const void *dir_ops; /* Opaque; default directory operations */ 86 const void *dir_ops; /* Opaque; default directory operations */
87 int dir_per_block; /* dir entries per block */ 87 int dir_per_block; /* dir entries per block */
88 int dir_per_block_bits; /* log2(dir_per_block) */ 88 int dir_per_block_bits; /* log2(dir_per_block) */
89 unsigned int vol_id; /*volume ID*/
89 90
90 int fatent_shift; 91 int fatent_shift;
91 struct fatent_operations *fatent_ops; 92 struct fatent_operations *fatent_ops;
diff --git a/fs/fat/file.c b/fs/fat/file.c
index b0b632e50ddb..9b104f543056 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -114,6 +114,12 @@ out:
114 return err; 114 return err;
115} 115}
116 116
117static int fat_ioctl_get_volume_id(struct inode *inode, u32 __user *user_attr)
118{
119 struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
120 return put_user(sbi->vol_id, user_attr);
121}
122
117long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 123long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
118{ 124{
119 struct inode *inode = file_inode(filp); 125 struct inode *inode = file_inode(filp);
@@ -124,6 +130,8 @@ long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
124 return fat_ioctl_get_attributes(inode, user_attr); 130 return fat_ioctl_get_attributes(inode, user_attr);
125 case FAT_IOCTL_SET_ATTRIBUTES: 131 case FAT_IOCTL_SET_ATTRIBUTES:
126 return fat_ioctl_set_attributes(filp, user_attr); 132 return fat_ioctl_set_attributes(filp, user_attr);
133 case FAT_IOCTL_GET_VOLUME_ID:
134 return fat_ioctl_get_volume_id(inode, user_attr);
127 default: 135 default:
128 return -ENOTTY; /* Inappropriate ioctl for device */ 136 return -ENOTTY; /* Inappropriate ioctl for device */
129 } 137 }
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 5d4513cb1b3c..11b51bb55b42 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -1415,6 +1415,18 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
1415 brelse(fsinfo_bh); 1415 brelse(fsinfo_bh);
1416 } 1416 }
1417 1417
1418 /* interpret volume ID as a little endian 32 bit integer */
1419 if (sbi->fat_bits == 32)
1420 sbi->vol_id = (((u32)b->fat32.vol_id[0]) |
1421 ((u32)b->fat32.vol_id[1] << 8) |
1422 ((u32)b->fat32.vol_id[2] << 16) |
1423 ((u32)b->fat32.vol_id[3] << 24));
1424 else /* fat 16 or 12 */
1425 sbi->vol_id = (((u32)b->fat16.vol_id[0]) |
1426 ((u32)b->fat16.vol_id[1] << 8) |
1427 ((u32)b->fat16.vol_id[2] << 16) |
1428 ((u32)b->fat16.vol_id[3] << 24));
1429
1418 sbi->dir_per_block = sb->s_blocksize / sizeof(struct msdos_dir_entry); 1430 sbi->dir_per_block = sb->s_blocksize / sizeof(struct msdos_dir_entry);
1419 sbi->dir_per_block_bits = ffs(sbi->dir_per_block) - 1; 1431 sbi->dir_per_block_bits = ffs(sbi->dir_per_block) - 1;
1420 1432
diff --git a/fs/file_table.c b/fs/file_table.c
index 08e719b884ca..b44e4c559786 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -265,18 +265,15 @@ static void __fput(struct file *file)
265 mntput(mnt); 265 mntput(mnt);
266} 266}
267 267
268static DEFINE_SPINLOCK(delayed_fput_lock); 268static LLIST_HEAD(delayed_fput_list);
269static LIST_HEAD(delayed_fput_list);
270static void delayed_fput(struct work_struct *unused) 269static void delayed_fput(struct work_struct *unused)
271{ 270{
272 LIST_HEAD(head); 271 struct llist_node *node = llist_del_all(&delayed_fput_list);
273 spin_lock_irq(&delayed_fput_lock); 272 struct llist_node *next;
274 list_splice_init(&delayed_fput_list, &head); 273
275 spin_unlock_irq(&delayed_fput_lock); 274 for (; node; node = next) {
276 while (!list_empty(&head)) { 275 next = llist_next(node);
277 struct file *f = list_first_entry(&head, struct file, f_u.fu_list); 276 __fput(llist_entry(node, struct file, f_u.fu_llist));
278 list_del_init(&f->f_u.fu_list);
279 __fput(f);
280 } 277 }
281} 278}
282 279
@@ -306,18 +303,22 @@ void fput(struct file *file)
306{ 303{
307 if (atomic_long_dec_and_test(&file->f_count)) { 304 if (atomic_long_dec_and_test(&file->f_count)) {
308 struct task_struct *task = current; 305 struct task_struct *task = current;
309 unsigned long flags;
310 306
311 file_sb_list_del(file); 307 file_sb_list_del(file);
312 if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) { 308 if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
313 init_task_work(&file->f_u.fu_rcuhead, ____fput); 309 init_task_work(&file->f_u.fu_rcuhead, ____fput);
314 if (!task_work_add(task, &file->f_u.fu_rcuhead, true)) 310 if (!task_work_add(task, &file->f_u.fu_rcuhead, true))
315 return; 311 return;
312 /*
313 * After this task has run exit_task_work(),
314 * task_work_add() will fail. free_ipc_ns()->
315 * shm_destroy() can do this. Fall through to delayed
316 * fput to avoid leaking *file.
317 */
316 } 318 }
317 spin_lock_irqsave(&delayed_fput_lock, flags); 319
318 list_add(&file->f_u.fu_list, &delayed_fput_list); 320 if (llist_add(&file->f_u.fu_llist, &delayed_fput_list))
319 schedule_work(&delayed_fput_work); 321 schedule_work(&delayed_fput_work);
320 spin_unlock_irqrestore(&delayed_fput_lock, flags);
321 } 322 }
322} 323}
323 324
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index a85ac4e33436..68851ff2fd41 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -963,7 +963,7 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
963/* 963/*
964 * Retrieve work items and do the writeback they describe 964 * Retrieve work items and do the writeback they describe
965 */ 965 */
966long wb_do_writeback(struct bdi_writeback *wb, int force_wait) 966static long wb_do_writeback(struct bdi_writeback *wb)
967{ 967{
968 struct backing_dev_info *bdi = wb->bdi; 968 struct backing_dev_info *bdi = wb->bdi;
969 struct wb_writeback_work *work; 969 struct wb_writeback_work *work;
@@ -971,12 +971,6 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
971 971
972 set_bit(BDI_writeback_running, &wb->bdi->state); 972 set_bit(BDI_writeback_running, &wb->bdi->state);
973 while ((work = get_next_work_item(bdi)) != NULL) { 973 while ((work = get_next_work_item(bdi)) != NULL) {
974 /*
975 * Override sync mode, in case we must wait for completion
976 * because this thread is exiting now.
977 */
978 if (force_wait)
979 work->sync_mode = WB_SYNC_ALL;
980 974
981 trace_writeback_exec(bdi, work); 975 trace_writeback_exec(bdi, work);
982 976
@@ -1025,7 +1019,7 @@ void bdi_writeback_workfn(struct work_struct *work)
1025 * rescuer as work_list needs to be drained. 1019 * rescuer as work_list needs to be drained.
1026 */ 1020 */
1027 do { 1021 do {
1028 pages_written = wb_do_writeback(wb, 0); 1022 pages_written = wb_do_writeback(wb);
1029 trace_writeback_pages_written(pages_written); 1023 trace_writeback_pages_written(pages_written);
1030 } while (!list_empty(&bdi->work_list)); 1024 } while (!list_empty(&bdi->work_list));
1031 } else { 1025 } else {
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 0eda52738ec4..72a5d5b04494 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1223,30 +1223,46 @@ static int fuse_direntplus_link(struct file *file,
1223 if (name.name[1] == '.' && name.len == 2) 1223 if (name.name[1] == '.' && name.len == 2)
1224 return 0; 1224 return 0;
1225 } 1225 }
1226
1227 if (invalid_nodeid(o->nodeid))
1228 return -EIO;
1229 if (!fuse_valid_type(o->attr.mode))
1230 return -EIO;
1231
1226 fc = get_fuse_conn(dir); 1232 fc = get_fuse_conn(dir);
1227 1233
1228 name.hash = full_name_hash(name.name, name.len); 1234 name.hash = full_name_hash(name.name, name.len);
1229 dentry = d_lookup(parent, &name); 1235 dentry = d_lookup(parent, &name);
1230 if (dentry && dentry->d_inode) { 1236 if (dentry) {
1231 inode = dentry->d_inode; 1237 inode = dentry->d_inode;
1232 if (get_node_id(inode) == o->nodeid) { 1238 if (!inode) {
1239 d_drop(dentry);
1240 } else if (get_node_id(inode) != o->nodeid ||
1241 ((o->attr.mode ^ inode->i_mode) & S_IFMT)) {
1242 err = d_invalidate(dentry);
1243 if (err)
1244 goto out;
1245 } else if (is_bad_inode(inode)) {
1246 err = -EIO;
1247 goto out;
1248 } else {
1233 struct fuse_inode *fi; 1249 struct fuse_inode *fi;
1234 fi = get_fuse_inode(inode); 1250 fi = get_fuse_inode(inode);
1235 spin_lock(&fc->lock); 1251 spin_lock(&fc->lock);
1236 fi->nlookup++; 1252 fi->nlookup++;
1237 spin_unlock(&fc->lock); 1253 spin_unlock(&fc->lock);
1238 1254
1255 fuse_change_attributes(inode, &o->attr,
1256 entry_attr_timeout(o),
1257 attr_version);
1258
1239 /* 1259 /*
1240 * The other branch to 'found' comes via fuse_iget() 1260 * The other branch to 'found' comes via fuse_iget()
1241 * which bumps nlookup inside 1261 * which bumps nlookup inside
1242 */ 1262 */
1243 goto found; 1263 goto found;
1244 } 1264 }
1245 err = d_invalidate(dentry);
1246 if (err)
1247 goto out;
1248 dput(dentry); 1265 dput(dentry);
1249 dentry = NULL;
1250 } 1266 }
1251 1267
1252 dentry = d_alloc(parent, &name); 1268 dentry = d_alloc(parent, &name);
@@ -1259,25 +1275,30 @@ static int fuse_direntplus_link(struct file *file,
1259 if (!inode) 1275 if (!inode)
1260 goto out; 1276 goto out;
1261 1277
1262 alias = d_materialise_unique(dentry, inode); 1278 if (S_ISDIR(inode->i_mode)) {
1263 err = PTR_ERR(alias); 1279 mutex_lock(&fc->inst_mutex);
1264 if (IS_ERR(alias)) 1280 alias = fuse_d_add_directory(dentry, inode);
1265 goto out; 1281 mutex_unlock(&fc->inst_mutex);
1282 err = PTR_ERR(alias);
1283 if (IS_ERR(alias)) {
1284 iput(inode);
1285 goto out;
1286 }
1287 } else {
1288 alias = d_splice_alias(inode, dentry);
1289 }
1290
1266 if (alias) { 1291 if (alias) {
1267 dput(dentry); 1292 dput(dentry);
1268 dentry = alias; 1293 dentry = alias;
1269 } 1294 }
1270 1295
1271found: 1296found:
1272 fuse_change_attributes(inode, &o->attr, entry_attr_timeout(o),
1273 attr_version);
1274
1275 fuse_change_entry_timeout(dentry, o); 1297 fuse_change_entry_timeout(dentry, o);
1276 1298
1277 err = 0; 1299 err = 0;
1278out: 1300out:
1279 if (dentry) 1301 dput(dentry);
1280 dput(dentry);
1281 return err; 1302 return err;
1282} 1303}
1283 1304
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 9a55f53be5ff..370d7b6c5942 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -346,8 +346,7 @@ int dbFree(struct inode *ip, s64 blkno, s64 nblocks)
346 printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n", 346 printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n",
347 (unsigned long long) blkno, 347 (unsigned long long) blkno,
348 (unsigned long long) nblocks); 348 (unsigned long long) nblocks);
349 jfs_error(ip->i_sb, 349 jfs_error(ip->i_sb, "block to be freed is outside the map\n");
350 "dbFree: block to be freed is outside the map");
351 return -EIO; 350 return -EIO;
352 } 351 }
353 352
@@ -384,7 +383,7 @@ int dbFree(struct inode *ip, s64 blkno, s64 nblocks)
384 383
385 /* free the blocks. */ 384 /* free the blocks. */
386 if ((rc = dbFreeDmap(bmp, dp, blkno, nb))) { 385 if ((rc = dbFreeDmap(bmp, dp, blkno, nb))) {
387 jfs_error(ip->i_sb, "dbFree: error in block map\n"); 386 jfs_error(ip->i_sb, "error in block map\n");
388 release_metapage(mp); 387 release_metapage(mp);
389 IREAD_UNLOCK(ipbmap); 388 IREAD_UNLOCK(ipbmap);
390 return (rc); 389 return (rc);
@@ -441,8 +440,7 @@ dbUpdatePMap(struct inode *ipbmap,
441 printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n", 440 printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n",
442 (unsigned long long) blkno, 441 (unsigned long long) blkno,
443 (unsigned long long) nblocks); 442 (unsigned long long) nblocks);
444 jfs_error(ipbmap->i_sb, 443 jfs_error(ipbmap->i_sb, "blocks are outside the map\n");
445 "dbUpdatePMap: blocks are outside the map");
446 return -EIO; 444 return -EIO;
447 } 445 }
448 446
@@ -726,7 +724,7 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
726 724
727 /* the hint should be within the map */ 725 /* the hint should be within the map */
728 if (hint >= mapSize) { 726 if (hint >= mapSize) {
729 jfs_error(ip->i_sb, "dbAlloc: the hint is outside the map"); 727 jfs_error(ip->i_sb, "the hint is outside the map\n");
730 return -EIO; 728 return -EIO;
731 } 729 }
732 730
@@ -1057,8 +1055,7 @@ static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks)
1057 bmp = sbi->bmap; 1055 bmp = sbi->bmap;
1058 if (lastblkno < 0 || lastblkno >= bmp->db_mapsize) { 1056 if (lastblkno < 0 || lastblkno >= bmp->db_mapsize) {
1059 IREAD_UNLOCK(ipbmap); 1057 IREAD_UNLOCK(ipbmap);
1060 jfs_error(ip->i_sb, 1058 jfs_error(ip->i_sb, "the block is outside the filesystem\n");
1061 "dbExtend: the block is outside the filesystem");
1062 return -EIO; 1059 return -EIO;
1063 } 1060 }
1064 1061
@@ -1134,8 +1131,7 @@ static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno,
1134 u32 mask; 1131 u32 mask;
1135 1132
1136 if (dp->tree.leafidx != cpu_to_le32(LEAFIND)) { 1133 if (dp->tree.leafidx != cpu_to_le32(LEAFIND)) {
1137 jfs_error(bmp->db_ipbmap->i_sb, 1134 jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmap page\n");
1138 "dbAllocNext: Corrupt dmap page");
1139 return -EIO; 1135 return -EIO;
1140 } 1136 }
1141 1137
@@ -1265,8 +1261,7 @@ dbAllocNear(struct bmap * bmp,
1265 s8 *leaf; 1261 s8 *leaf;
1266 1262
1267 if (dp->tree.leafidx != cpu_to_le32(LEAFIND)) { 1263 if (dp->tree.leafidx != cpu_to_le32(LEAFIND)) {
1268 jfs_error(bmp->db_ipbmap->i_sb, 1264 jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmap page\n");
1269 "dbAllocNear: Corrupt dmap page");
1270 return -EIO; 1265 return -EIO;
1271 } 1266 }
1272 1267
@@ -1381,8 +1376,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
1381 */ 1376 */
1382 if (l2nb > bmp->db_agl2size) { 1377 if (l2nb > bmp->db_agl2size) {
1383 jfs_error(bmp->db_ipbmap->i_sb, 1378 jfs_error(bmp->db_ipbmap->i_sb,
1384 "dbAllocAG: allocation request is larger than the " 1379 "allocation request is larger than the allocation group size\n");
1385 "allocation group size");
1386 return -EIO; 1380 return -EIO;
1387 } 1381 }
1388 1382
@@ -1417,7 +1411,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
1417 (unsigned long long) blkno, 1411 (unsigned long long) blkno,
1418 (unsigned long long) nblocks); 1412 (unsigned long long) nblocks);
1419 jfs_error(bmp->db_ipbmap->i_sb, 1413 jfs_error(bmp->db_ipbmap->i_sb,
1420 "dbAllocAG: dbAllocCtl failed in free AG"); 1414 "dbAllocCtl failed in free AG\n");
1421 } 1415 }
1422 return (rc); 1416 return (rc);
1423 } 1417 }
@@ -1433,8 +1427,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
1433 budmin = dcp->budmin; 1427 budmin = dcp->budmin;
1434 1428
1435 if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) { 1429 if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) {
1436 jfs_error(bmp->db_ipbmap->i_sb, 1430 jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmapctl page\n");
1437 "dbAllocAG: Corrupt dmapctl page");
1438 release_metapage(mp); 1431 release_metapage(mp);
1439 return -EIO; 1432 return -EIO;
1440 } 1433 }
@@ -1475,7 +1468,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
1475 } 1468 }
1476 if (n == 4) { 1469 if (n == 4) {
1477 jfs_error(bmp->db_ipbmap->i_sb, 1470 jfs_error(bmp->db_ipbmap->i_sb,
1478 "dbAllocAG: failed descending stree"); 1471 "failed descending stree\n");
1479 release_metapage(mp); 1472 release_metapage(mp);
1480 return -EIO; 1473 return -EIO;
1481 } 1474 }
@@ -1515,8 +1508,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
1515 &blkno))) { 1508 &blkno))) {
1516 if (rc == -ENOSPC) { 1509 if (rc == -ENOSPC) {
1517 jfs_error(bmp->db_ipbmap->i_sb, 1510 jfs_error(bmp->db_ipbmap->i_sb,
1518 "dbAllocAG: control page " 1511 "control page inconsistent\n");
1519 "inconsistent");
1520 return -EIO; 1512 return -EIO;
1521 } 1513 }
1522 return (rc); 1514 return (rc);
@@ -1528,7 +1520,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
1528 rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results); 1520 rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results);
1529 if (rc == -ENOSPC) { 1521 if (rc == -ENOSPC) {
1530 jfs_error(bmp->db_ipbmap->i_sb, 1522 jfs_error(bmp->db_ipbmap->i_sb,
1531 "dbAllocAG: unable to allocate blocks"); 1523 "unable to allocate blocks\n");
1532 rc = -EIO; 1524 rc = -EIO;
1533 } 1525 }
1534 return (rc); 1526 return (rc);
@@ -1587,8 +1579,7 @@ static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results)
1587 */ 1579 */
1588 rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results); 1580 rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results);
1589 if (rc == -ENOSPC) { 1581 if (rc == -ENOSPC) {
1590 jfs_error(bmp->db_ipbmap->i_sb, 1582 jfs_error(bmp->db_ipbmap->i_sb, "unable to allocate blocks\n");
1591 "dbAllocAny: unable to allocate blocks");
1592 return -EIO; 1583 return -EIO;
1593 } 1584 }
1594 return (rc); 1585 return (rc);
@@ -1652,8 +1643,7 @@ s64 dbDiscardAG(struct inode *ip, int agno, s64 minlen)
1652 range_cnt = min_t(u64, max_ranges + 1, 32 * 1024); 1643 range_cnt = min_t(u64, max_ranges + 1, 32 * 1024);
1653 totrim = kmalloc(sizeof(struct range2trim) * range_cnt, GFP_NOFS); 1644 totrim = kmalloc(sizeof(struct range2trim) * range_cnt, GFP_NOFS);
1654 if (totrim == NULL) { 1645 if (totrim == NULL) {
1655 jfs_error(bmp->db_ipbmap->i_sb, 1646 jfs_error(bmp->db_ipbmap->i_sb, "no memory for trim array\n");
1656 "dbDiscardAG: no memory for trim array");
1657 IWRITE_UNLOCK(ipbmap); 1647 IWRITE_UNLOCK(ipbmap);
1658 return 0; 1648 return 0;
1659 } 1649 }
@@ -1682,8 +1672,7 @@ s64 dbDiscardAG(struct inode *ip, int agno, s64 minlen)
1682 nblocks = 1 << l2nb; 1672 nblocks = 1 << l2nb;
1683 } else { 1673 } else {
1684 /* Trim any already allocated blocks */ 1674 /* Trim any already allocated blocks */
1685 jfs_error(bmp->db_ipbmap->i_sb, 1675 jfs_error(bmp->db_ipbmap->i_sb, "-EIO\n");
1686 "dbDiscardAG: -EIO");
1687 break; 1676 break;
1688 } 1677 }
1689 1678
@@ -1761,7 +1750,7 @@ static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno)
1761 1750
1762 if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) { 1751 if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) {
1763 jfs_error(bmp->db_ipbmap->i_sb, 1752 jfs_error(bmp->db_ipbmap->i_sb,
1764 "dbFindCtl: Corrupt dmapctl page"); 1753 "Corrupt dmapctl page\n");
1765 release_metapage(mp); 1754 release_metapage(mp);
1766 return -EIO; 1755 return -EIO;
1767 } 1756 }
@@ -1782,7 +1771,7 @@ static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno)
1782 if (rc) { 1771 if (rc) {
1783 if (lev != level) { 1772 if (lev != level) {
1784 jfs_error(bmp->db_ipbmap->i_sb, 1773 jfs_error(bmp->db_ipbmap->i_sb,
1785 "dbFindCtl: dmap inconsistent"); 1774 "dmap inconsistent\n");
1786 return -EIO; 1775 return -EIO;
1787 } 1776 }
1788 return -ENOSPC; 1777 return -ENOSPC;
@@ -1906,7 +1895,7 @@ dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results)
1906 if (dp->tree.stree[ROOT] != L2BPERDMAP) { 1895 if (dp->tree.stree[ROOT] != L2BPERDMAP) {
1907 release_metapage(mp); 1896 release_metapage(mp);
1908 jfs_error(bmp->db_ipbmap->i_sb, 1897 jfs_error(bmp->db_ipbmap->i_sb,
1909 "dbAllocCtl: the dmap is not all free"); 1898 "the dmap is not all free\n");
1910 rc = -EIO; 1899 rc = -EIO;
1911 goto backout; 1900 goto backout;
1912 } 1901 }
@@ -1953,7 +1942,7 @@ dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results)
1953 * to indicate that we have leaked blocks. 1942 * to indicate that we have leaked blocks.
1954 */ 1943 */
1955 jfs_error(bmp->db_ipbmap->i_sb, 1944 jfs_error(bmp->db_ipbmap->i_sb,
1956 "dbAllocCtl: I/O Error: Block Leakage."); 1945 "I/O Error: Block Leakage\n");
1957 continue; 1946 continue;
1958 } 1947 }
1959 dp = (struct dmap *) mp->data; 1948 dp = (struct dmap *) mp->data;
@@ -1965,8 +1954,7 @@ dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results)
1965 * to indicate that we have leaked blocks. 1954 * to indicate that we have leaked blocks.
1966 */ 1955 */
1967 release_metapage(mp); 1956 release_metapage(mp);
1968 jfs_error(bmp->db_ipbmap->i_sb, 1957 jfs_error(bmp->db_ipbmap->i_sb, "Block Leakage\n");
1969 "dbAllocCtl: Block Leakage.");
1970 continue; 1958 continue;
1971 } 1959 }
1972 1960
@@ -2263,8 +2251,7 @@ static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
2263 for (; nwords > 0; nwords -= nw) { 2251 for (; nwords > 0; nwords -= nw) {
2264 if (leaf[word] < BUDMIN) { 2252 if (leaf[word] < BUDMIN) {
2265 jfs_error(bmp->db_ipbmap->i_sb, 2253 jfs_error(bmp->db_ipbmap->i_sb,
2266 "dbAllocBits: leaf page " 2254 "leaf page corrupt\n");
2267 "corrupt");
2268 break; 2255 break;
2269 } 2256 }
2270 2257
@@ -2536,8 +2523,7 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level)
2536 dcp = (struct dmapctl *) mp->data; 2523 dcp = (struct dmapctl *) mp->data;
2537 2524
2538 if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) { 2525 if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) {
2539 jfs_error(bmp->db_ipbmap->i_sb, 2526 jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmapctl page\n");
2540 "dbAdjCtl: Corrupt dmapctl page");
2541 release_metapage(mp); 2527 release_metapage(mp);
2542 return -EIO; 2528 return -EIO;
2543 } 2529 }
@@ -2638,8 +2624,7 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level)
2638 assert(level == bmp->db_maxlevel); 2624 assert(level == bmp->db_maxlevel);
2639 if (bmp->db_maxfreebud != oldroot) { 2625 if (bmp->db_maxfreebud != oldroot) {
2640 jfs_error(bmp->db_ipbmap->i_sb, 2626 jfs_error(bmp->db_ipbmap->i_sb,
2641 "dbAdjCtl: the maximum free buddy is " 2627 "the maximum free buddy is not the old root\n");
2642 "not the old root");
2643 } 2628 }
2644 bmp->db_maxfreebud = dcp->stree[ROOT]; 2629 bmp->db_maxfreebud = dcp->stree[ROOT];
2645 } 2630 }
@@ -3481,7 +3466,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks)
3481 p = BMAPBLKNO + nbperpage; /* L2 page */ 3466 p = BMAPBLKNO + nbperpage; /* L2 page */
3482 l2mp = read_metapage(ipbmap, p, PSIZE, 0); 3467 l2mp = read_metapage(ipbmap, p, PSIZE, 0);
3483 if (!l2mp) { 3468 if (!l2mp) {
3484 jfs_error(ipbmap->i_sb, "dbExtendFS: L2 page could not be read"); 3469 jfs_error(ipbmap->i_sb, "L2 page could not be read\n");
3485 return -EIO; 3470 return -EIO;
3486 } 3471 }
3487 l2dcp = (struct dmapctl *) l2mp->data; 3472 l2dcp = (struct dmapctl *) l2mp->data;
@@ -3646,8 +3631,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks)
3646 } 3631 }
3647 } /* for each L1 in a L2 */ 3632 } /* for each L1 in a L2 */
3648 3633
3649 jfs_error(ipbmap->i_sb, 3634 jfs_error(ipbmap->i_sb, "function has not returned as expected\n");
3650 "dbExtendFS: function has not returned as expected");
3651errout: 3635errout:
3652 if (l0mp) 3636 if (l0mp)
3653 release_metapage(l0mp); 3637 release_metapage(l0mp);
@@ -3717,7 +3701,7 @@ void dbFinalizeBmap(struct inode *ipbmap)
3717 } 3701 }
3718 if (bmp->db_agpref >= bmp->db_numag) { 3702 if (bmp->db_agpref >= bmp->db_numag) {
3719 jfs_error(ipbmap->i_sb, 3703 jfs_error(ipbmap->i_sb,
3720 "cannot find ag with average freespace"); 3704 "cannot find ag with average freespace\n");
3721 } 3705 }
3722 } 3706 }
3723 3707
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index 9f4ed13d9f15..8743ba9c6742 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -124,21 +124,21 @@ struct dtsplit {
124#define DT_PAGE(IP, MP) BT_PAGE(IP, MP, dtpage_t, i_dtroot) 124#define DT_PAGE(IP, MP) BT_PAGE(IP, MP, dtpage_t, i_dtroot)
125 125
126/* get page buffer for specified block address */ 126/* get page buffer for specified block address */
127#define DT_GETPAGE(IP, BN, MP, SIZE, P, RC)\ 127#define DT_GETPAGE(IP, BN, MP, SIZE, P, RC) \
128{\ 128do { \
129 BT_GETPAGE(IP, BN, MP, dtpage_t, SIZE, P, RC, i_dtroot)\ 129 BT_GETPAGE(IP, BN, MP, dtpage_t, SIZE, P, RC, i_dtroot); \
130 if (!(RC))\ 130 if (!(RC)) { \
131 {\ 131 if (((P)->header.nextindex > \
132 if (((P)->header.nextindex > (((BN)==0)?DTROOTMAXSLOT:(P)->header.maxslot)) ||\ 132 (((BN) == 0) ? DTROOTMAXSLOT : (P)->header.maxslot)) || \
133 ((BN) && ((P)->header.maxslot > DTPAGEMAXSLOT)))\ 133 ((BN) && ((P)->header.maxslot > DTPAGEMAXSLOT))) { \
134 {\ 134 BT_PUTPAGE(MP); \
135 BT_PUTPAGE(MP);\ 135 jfs_error((IP)->i_sb, \
136 jfs_error((IP)->i_sb, "DT_GETPAGE: dtree page corrupt");\ 136 "DT_GETPAGE: dtree page corrupt\n"); \
137 MP = NULL;\ 137 MP = NULL; \
138 RC = -EIO;\ 138 RC = -EIO; \
139 }\ 139 } \
140 }\ 140 } \
141} 141} while (0)
142 142
143/* for consistency */ 143/* for consistency */
144#define DT_PUTPAGE(MP) BT_PUTPAGE(MP) 144#define DT_PUTPAGE(MP) BT_PUTPAGE(MP)
@@ -776,7 +776,7 @@ int dtSearch(struct inode *ip, struct component_name * key, ino_t * data,
776 /* Something's corrupted, mark filesystem dirty so 776 /* Something's corrupted, mark filesystem dirty so
777 * chkdsk will fix it. 777 * chkdsk will fix it.
778 */ 778 */
779 jfs_error(sb, "stack overrun in dtSearch!"); 779 jfs_error(sb, "stack overrun!\n");
780 BT_STACK_DUMP(btstack); 780 BT_STACK_DUMP(btstack);
781 rc = -EIO; 781 rc = -EIO;
782 goto out; 782 goto out;
@@ -3247,8 +3247,7 @@ int jfs_readdir(struct file *file, struct dir_context *ctx)
3247 /* Sanity Check */ 3247 /* Sanity Check */
3248 if (d_namleft == 0) { 3248 if (d_namleft == 0) {
3249 jfs_error(ip->i_sb, 3249 jfs_error(ip->i_sb,
3250 "JFS:Dtree error: ino = " 3250 "JFS:Dtree error: ino = %ld, bn=%lld, index = %d\n",
3251 "%ld, bn=%Ld, index = %d",
3252 (long)ip->i_ino, 3251 (long)ip->i_ino,
3253 (long long)bn, 3252 (long long)bn,
3254 i); 3253 i);
@@ -3368,7 +3367,7 @@ static int dtReadFirst(struct inode *ip, struct btstack * btstack)
3368 */ 3367 */
3369 if (BT_STACK_FULL(btstack)) { 3368 if (BT_STACK_FULL(btstack)) {
3370 DT_PUTPAGE(mp); 3369 DT_PUTPAGE(mp);
3371 jfs_error(ip->i_sb, "dtReadFirst: btstack overrun"); 3370 jfs_error(ip->i_sb, "btstack overrun\n");
3372 BT_STACK_DUMP(btstack); 3371 BT_STACK_DUMP(btstack);
3373 return -EIO; 3372 return -EIO;
3374 } 3373 }
diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c
index e5fe8506ed16..2ae7d59ab10a 100644
--- a/fs/jfs/jfs_extent.c
+++ b/fs/jfs/jfs_extent.c
@@ -388,7 +388,7 @@ int extHint(struct inode *ip, s64 offset, xad_t * xp)
388 388
389 if ((rc == 0) && xlen) { 389 if ((rc == 0) && xlen) {
390 if (xlen != nbperpage) { 390 if (xlen != nbperpage) {
391 jfs_error(ip->i_sb, "extHint: corrupt xtree"); 391 jfs_error(ip->i_sb, "corrupt xtree\n");
392 rc = -EIO; 392 rc = -EIO;
393 } 393 }
394 XADaddress(xp, xaddr); 394 XADaddress(xp, xaddr);
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index f7e042b63ddb..f321986e73d2 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -386,7 +386,7 @@ int diRead(struct inode *ip)
386 dp += rel_inode; 386 dp += rel_inode;
387 387
388 if (ip->i_ino != le32_to_cpu(dp->di_number)) { 388 if (ip->i_ino != le32_to_cpu(dp->di_number)) {
389 jfs_error(ip->i_sb, "diRead: i_ino != di_number"); 389 jfs_error(ip->i_sb, "i_ino != di_number\n");
390 rc = -EIO; 390 rc = -EIO;
391 } else if (le32_to_cpu(dp->di_nlink) == 0) 391 } else if (le32_to_cpu(dp->di_nlink) == 0)
392 rc = -ESTALE; 392 rc = -ESTALE;
@@ -625,7 +625,7 @@ int diWrite(tid_t tid, struct inode *ip)
625 if (!addressPXD(&(jfs_ip->ixpxd)) || 625 if (!addressPXD(&(jfs_ip->ixpxd)) ||
626 (lengthPXD(&(jfs_ip->ixpxd)) != 626 (lengthPXD(&(jfs_ip->ixpxd)) !=
627 JFS_IP(ipimap)->i_imap->im_nbperiext)) { 627 JFS_IP(ipimap)->i_imap->im_nbperiext)) {
628 jfs_error(ip->i_sb, "diWrite: ixpxd invalid"); 628 jfs_error(ip->i_sb, "ixpxd invalid\n");
629 return -EIO; 629 return -EIO;
630 } 630 }
631 631
@@ -893,8 +893,7 @@ int diFree(struct inode *ip)
893 if (iagno >= imap->im_nextiag) { 893 if (iagno >= imap->im_nextiag) {
894 print_hex_dump(KERN_ERR, "imap: ", DUMP_PREFIX_ADDRESS, 16, 4, 894 print_hex_dump(KERN_ERR, "imap: ", DUMP_PREFIX_ADDRESS, 16, 4,
895 imap, 32, 0); 895 imap, 32, 0);
896 jfs_error(ip->i_sb, 896 jfs_error(ip->i_sb, "inum = %d, iagno = %d, nextiag = %d\n",
897 "diFree: inum = %d, iagno = %d, nextiag = %d",
898 (uint) inum, iagno, imap->im_nextiag); 897 (uint) inum, iagno, imap->im_nextiag);
899 return -EIO; 898 return -EIO;
900 } 899 }
@@ -930,15 +929,14 @@ int diFree(struct inode *ip)
930 mask = HIGHORDER >> bitno; 929 mask = HIGHORDER >> bitno;
931 930
932 if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) { 931 if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) {
933 jfs_error(ip->i_sb, 932 jfs_error(ip->i_sb, "wmap shows inode already free\n");
934 "diFree: wmap shows inode already free");
935 } 933 }
936 934
937 if (!addressPXD(&iagp->inoext[extno])) { 935 if (!addressPXD(&iagp->inoext[extno])) {
938 release_metapage(mp); 936 release_metapage(mp);
939 IREAD_UNLOCK(ipimap); 937 IREAD_UNLOCK(ipimap);
940 AG_UNLOCK(imap, agno); 938 AG_UNLOCK(imap, agno);
941 jfs_error(ip->i_sb, "diFree: invalid inoext"); 939 jfs_error(ip->i_sb, "invalid inoext\n");
942 return -EIO; 940 return -EIO;
943 } 941 }
944 942
@@ -950,7 +948,7 @@ int diFree(struct inode *ip)
950 release_metapage(mp); 948 release_metapage(mp);
951 IREAD_UNLOCK(ipimap); 949 IREAD_UNLOCK(ipimap);
952 AG_UNLOCK(imap, agno); 950 AG_UNLOCK(imap, agno);
953 jfs_error(ip->i_sb, "diFree: numfree > numinos"); 951 jfs_error(ip->i_sb, "numfree > numinos\n");
954 return -EIO; 952 return -EIO;
955 } 953 }
956 /* 954 /*
@@ -1199,7 +1197,7 @@ int diFree(struct inode *ip)
1199 * for the inode being freed. 1197 * for the inode being freed.
1200 */ 1198 */
1201 if (iagp->pmap[extno] != 0) { 1199 if (iagp->pmap[extno] != 0) {
1202 jfs_error(ip->i_sb, "diFree: the pmap does not show inode free"); 1200 jfs_error(ip->i_sb, "the pmap does not show inode free\n");
1203 } 1201 }
1204 iagp->wmap[extno] = 0; 1202 iagp->wmap[extno] = 0;
1205 PXDlength(&iagp->inoext[extno], 0); 1203 PXDlength(&iagp->inoext[extno], 0);
@@ -1518,8 +1516,7 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip)
1518 release_metapage(mp); 1516 release_metapage(mp);
1519 AG_UNLOCK(imap, agno); 1517 AG_UNLOCK(imap, agno);
1520 jfs_error(ip->i_sb, 1518 jfs_error(ip->i_sb,
1521 "diAlloc: can't find free bit " 1519 "can't find free bit in wmap\n");
1522 "in wmap");
1523 return -EIO; 1520 return -EIO;
1524 } 1521 }
1525 1522
@@ -1660,7 +1657,7 @@ diAllocAG(struct inomap * imap, int agno, bool dir, struct inode *ip)
1660 numinos = imap->im_agctl[agno].numinos; 1657 numinos = imap->im_agctl[agno].numinos;
1661 1658
1662 if (numfree > numinos) { 1659 if (numfree > numinos) {
1663 jfs_error(ip->i_sb, "diAllocAG: numfree > numinos"); 1660 jfs_error(ip->i_sb, "numfree > numinos\n");
1664 return -EIO; 1661 return -EIO;
1665 } 1662 }
1666 1663
@@ -1811,8 +1808,7 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip)
1811 if (!iagp->nfreeinos) { 1808 if (!iagp->nfreeinos) {
1812 IREAD_UNLOCK(imap->im_ipimap); 1809 IREAD_UNLOCK(imap->im_ipimap);
1813 release_metapage(mp); 1810 release_metapage(mp);
1814 jfs_error(ip->i_sb, 1811 jfs_error(ip->i_sb, "nfreeinos = 0, but iag on freelist\n");
1815 "diAllocIno: nfreeinos = 0, but iag on freelist");
1816 return -EIO; 1812 return -EIO;
1817 } 1813 }
1818 1814
@@ -1824,7 +1820,7 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip)
1824 IREAD_UNLOCK(imap->im_ipimap); 1820 IREAD_UNLOCK(imap->im_ipimap);
1825 release_metapage(mp); 1821 release_metapage(mp);
1826 jfs_error(ip->i_sb, 1822 jfs_error(ip->i_sb,
1827 "diAllocIno: free inode not found in summary map"); 1823 "free inode not found in summary map\n");
1828 return -EIO; 1824 return -EIO;
1829 } 1825 }
1830 1826
@@ -1839,7 +1835,7 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip)
1839 if (rem >= EXTSPERSUM) { 1835 if (rem >= EXTSPERSUM) {
1840 IREAD_UNLOCK(imap->im_ipimap); 1836 IREAD_UNLOCK(imap->im_ipimap);
1841 release_metapage(mp); 1837 release_metapage(mp);
1842 jfs_error(ip->i_sb, "diAllocIno: no free extent found"); 1838 jfs_error(ip->i_sb, "no free extent found\n");
1843 return -EIO; 1839 return -EIO;
1844 } 1840 }
1845 extno = (sword << L2EXTSPERSUM) + rem; 1841 extno = (sword << L2EXTSPERSUM) + rem;
@@ -1850,7 +1846,7 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip)
1850 if (rem >= INOSPEREXT) { 1846 if (rem >= INOSPEREXT) {
1851 IREAD_UNLOCK(imap->im_ipimap); 1847 IREAD_UNLOCK(imap->im_ipimap);
1852 release_metapage(mp); 1848 release_metapage(mp);
1853 jfs_error(ip->i_sb, "diAllocIno: free inode not found"); 1849 jfs_error(ip->i_sb, "free inode not found\n");
1854 return -EIO; 1850 return -EIO;
1855 } 1851 }
1856 1852
@@ -1936,7 +1932,7 @@ static int diAllocExt(struct inomap * imap, int agno, struct inode *ip)
1936 IREAD_LOCK(imap->im_ipimap, RDWRLOCK_IMAP); 1932 IREAD_LOCK(imap->im_ipimap, RDWRLOCK_IMAP);
1937 if ((rc = diIAGRead(imap, iagno, &mp))) { 1933 if ((rc = diIAGRead(imap, iagno, &mp))) {
1938 IREAD_UNLOCK(imap->im_ipimap); 1934 IREAD_UNLOCK(imap->im_ipimap);
1939 jfs_error(ip->i_sb, "diAllocExt: error reading iag"); 1935 jfs_error(ip->i_sb, "error reading iag\n");
1940 return rc; 1936 return rc;
1941 } 1937 }
1942 iagp = (struct iag *) mp->data; 1938 iagp = (struct iag *) mp->data;
@@ -1948,8 +1944,7 @@ static int diAllocExt(struct inomap * imap, int agno, struct inode *ip)
1948 if (sword >= SMAPSZ) { 1944 if (sword >= SMAPSZ) {
1949 release_metapage(mp); 1945 release_metapage(mp);
1950 IREAD_UNLOCK(imap->im_ipimap); 1946 IREAD_UNLOCK(imap->im_ipimap);
1951 jfs_error(ip->i_sb, 1947 jfs_error(ip->i_sb, "free ext summary map not found\n");
1952 "diAllocExt: free ext summary map not found");
1953 return -EIO; 1948 return -EIO;
1954 } 1949 }
1955 if (~iagp->extsmap[sword]) 1950 if (~iagp->extsmap[sword])
@@ -1962,7 +1957,7 @@ static int diAllocExt(struct inomap * imap, int agno, struct inode *ip)
1962 if (rem >= EXTSPERSUM) { 1957 if (rem >= EXTSPERSUM) {
1963 release_metapage(mp); 1958 release_metapage(mp);
1964 IREAD_UNLOCK(imap->im_ipimap); 1959 IREAD_UNLOCK(imap->im_ipimap);
1965 jfs_error(ip->i_sb, "diAllocExt: free extent not found"); 1960 jfs_error(ip->i_sb, "free extent not found\n");
1966 return -EIO; 1961 return -EIO;
1967 } 1962 }
1968 extno = (sword << L2EXTSPERSUM) + rem; 1963 extno = (sword << L2EXTSPERSUM) + rem;
@@ -2081,8 +2076,7 @@ static int diAllocBit(struct inomap * imap, struct iag * iagp, int ino)
2081 if (bmp) 2076 if (bmp)
2082 release_metapage(bmp); 2077 release_metapage(bmp);
2083 2078
2084 jfs_error(imap->im_ipimap->i_sb, 2079 jfs_error(imap->im_ipimap->i_sb, "iag inconsistent\n");
2085 "diAllocBit: iag inconsistent");
2086 return -EIO; 2080 return -EIO;
2087 } 2081 }
2088 2082
@@ -2189,7 +2183,7 @@ static int diNewExt(struct inomap * imap, struct iag * iagp, int extno)
2189 /* better have free extents. 2183 /* better have free extents.
2190 */ 2184 */
2191 if (!iagp->nfreeexts) { 2185 if (!iagp->nfreeexts) {
2192 jfs_error(imap->im_ipimap->i_sb, "diNewExt: no free extents"); 2186 jfs_error(imap->im_ipimap->i_sb, "no free extents\n");
2193 return -EIO; 2187 return -EIO;
2194 } 2188 }
2195 2189
@@ -2261,7 +2255,7 @@ static int diNewExt(struct inomap * imap, struct iag * iagp, int extno)
2261 } 2255 }
2262 if (ciagp == NULL) { 2256 if (ciagp == NULL) {
2263 jfs_error(imap->im_ipimap->i_sb, 2257 jfs_error(imap->im_ipimap->i_sb,
2264 "diNewExt: ciagp == NULL"); 2258 "ciagp == NULL\n");
2265 rc = -EIO; 2259 rc = -EIO;
2266 goto error_out; 2260 goto error_out;
2267 } 2261 }
@@ -2498,7 +2492,7 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp)
2498 IWRITE_UNLOCK(ipimap); 2492 IWRITE_UNLOCK(ipimap);
2499 IAGFREE_UNLOCK(imap); 2493 IAGFREE_UNLOCK(imap);
2500 jfs_error(imap->im_ipimap->i_sb, 2494 jfs_error(imap->im_ipimap->i_sb,
2501 "diNewIAG: ipimap->i_size is wrong"); 2495 "ipimap->i_size is wrong\n");
2502 return -EIO; 2496 return -EIO;
2503 } 2497 }
2504 2498
@@ -2758,8 +2752,7 @@ diUpdatePMap(struct inode *ipimap,
2758 iagno = INOTOIAG(inum); 2752 iagno = INOTOIAG(inum);
2759 /* make sure that the iag is contained within the map */ 2753 /* make sure that the iag is contained within the map */
2760 if (iagno >= imap->im_nextiag) { 2754 if (iagno >= imap->im_nextiag) {
2761 jfs_error(ipimap->i_sb, 2755 jfs_error(ipimap->i_sb, "the iag is outside the map\n");
2762 "diUpdatePMap: the iag is outside the map");
2763 return -EIO; 2756 return -EIO;
2764 } 2757 }
2765 /* read the iag */ 2758 /* read the iag */
@@ -2788,13 +2781,13 @@ diUpdatePMap(struct inode *ipimap,
2788 */ 2781 */
2789 if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) { 2782 if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) {
2790 jfs_error(ipimap->i_sb, 2783 jfs_error(ipimap->i_sb,
2791 "diUpdatePMap: inode %ld not marked as " 2784 "inode %ld not marked as allocated in wmap!\n",
2792 "allocated in wmap!", inum); 2785 inum);
2793 } 2786 }
2794 if (!(le32_to_cpu(iagp->pmap[extno]) & mask)) { 2787 if (!(le32_to_cpu(iagp->pmap[extno]) & mask)) {
2795 jfs_error(ipimap->i_sb, 2788 jfs_error(ipimap->i_sb,
2796 "diUpdatePMap: inode %ld not marked as " 2789 "inode %ld not marked as allocated in pmap!\n",
2797 "allocated in pmap!", inum); 2790 inum);
2798 } 2791 }
2799 /* update the bitmap for the extent of the freed inode */ 2792 /* update the bitmap for the extent of the freed inode */
2800 iagp->pmap[extno] &= cpu_to_le32(~mask); 2793 iagp->pmap[extno] &= cpu_to_le32(~mask);
@@ -2809,15 +2802,13 @@ diUpdatePMap(struct inode *ipimap,
2809 if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) { 2802 if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) {
2810 release_metapage(mp); 2803 release_metapage(mp);
2811 jfs_error(ipimap->i_sb, 2804 jfs_error(ipimap->i_sb,
2812 "diUpdatePMap: the inode is not allocated in " 2805 "the inode is not allocated in the working map\n");
2813 "the working map");
2814 return -EIO; 2806 return -EIO;
2815 } 2807 }
2816 if ((le32_to_cpu(iagp->pmap[extno]) & mask) != 0) { 2808 if ((le32_to_cpu(iagp->pmap[extno]) & mask) != 0) {
2817 release_metapage(mp); 2809 release_metapage(mp);
2818 jfs_error(ipimap->i_sb, 2810 jfs_error(ipimap->i_sb,
2819 "diUpdatePMap: the inode is not free in the " 2811 "the inode is not free in the persistent map\n");
2820 "persistent map");
2821 return -EIO; 2812 return -EIO;
2822 } 2813 }
2823 /* update the bitmap for the extent of the allocated inode */ 2814 /* update the bitmap for the extent of the allocated inode */
@@ -2909,8 +2900,7 @@ int diExtendFS(struct inode *ipimap, struct inode *ipbmap)
2909 iagp = (struct iag *) bp->data; 2900 iagp = (struct iag *) bp->data;
2910 if (le32_to_cpu(iagp->iagnum) != i) { 2901 if (le32_to_cpu(iagp->iagnum) != i) {
2911 release_metapage(bp); 2902 release_metapage(bp);
2912 jfs_error(ipimap->i_sb, 2903 jfs_error(ipimap->i_sb, "unexpected value of iagnum\n");
2913 "diExtendFs: unexpected value of iagnum");
2914 return -EIO; 2904 return -EIO;
2915 } 2905 }
2916 2906
@@ -2986,8 +2976,7 @@ int diExtendFS(struct inode *ipimap, struct inode *ipbmap)
2986 2976
2987 if (xnuminos != atomic_read(&imap->im_numinos) || 2977 if (xnuminos != atomic_read(&imap->im_numinos) ||
2988 xnumfree != atomic_read(&imap->im_numfree)) { 2978 xnumfree != atomic_read(&imap->im_numfree)) {
2989 jfs_error(ipimap->i_sb, 2979 jfs_error(ipimap->i_sb, "numinos or numfree incorrect\n");
2990 "diExtendFs: numinos or numfree incorrect");
2991 return -EIO; 2980 return -EIO;
2992 } 2981 }
2993 2982
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 9e3aaff11f89..d165cde0c68d 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -647,7 +647,7 @@ struct metapage *__get_metapage(struct inode *inode, unsigned long lblock,
647 if (mp) { 647 if (mp) {
648 if (mp->logical_size != size) { 648 if (mp->logical_size != size) {
649 jfs_error(inode->i_sb, 649 jfs_error(inode->i_sb,
650 "__get_metapage: mp->logical_size != size"); 650 "get_mp->logical_size != size\n");
651 jfs_err("logical_size = %d, size = %d", 651 jfs_err("logical_size = %d, size = %d",
652 mp->logical_size, size); 652 mp->logical_size, size);
653 dump_stack(); 653 dump_stack();
@@ -658,8 +658,7 @@ struct metapage *__get_metapage(struct inode *inode, unsigned long lblock,
658 if (test_bit(META_discard, &mp->flag)) { 658 if (test_bit(META_discard, &mp->flag)) {
659 if (!new) { 659 if (!new) {
660 jfs_error(inode->i_sb, 660 jfs_error(inode->i_sb,
661 "__get_metapage: using a " 661 "using a discarded metapage\n");
662 "discarded metapage");
663 discard_metapage(mp); 662 discard_metapage(mp);
664 goto unlock; 663 goto unlock;
665 } 664 }
diff --git a/fs/jfs/jfs_superblock.h b/fs/jfs/jfs_superblock.h
index 884fc21ab8ee..04847b8d3070 100644
--- a/fs/jfs/jfs_superblock.h
+++ b/fs/jfs/jfs_superblock.h
@@ -108,6 +108,7 @@ struct jfs_superblock {
108 108
109extern int readSuper(struct super_block *, struct buffer_head **); 109extern int readSuper(struct super_block *, struct buffer_head **);
110extern int updateSuper(struct super_block *, uint); 110extern int updateSuper(struct super_block *, uint);
111__printf(2, 3)
111extern void jfs_error(struct super_block *, const char *, ...); 112extern void jfs_error(struct super_block *, const char *, ...);
112extern int jfs_mount(struct super_block *); 113extern int jfs_mount(struct super_block *);
113extern int jfs_mount_rw(struct super_block *, int); 114extern int jfs_mount_rw(struct super_block *, int);
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index 5fcc02eaa64c..564c4f279ac6 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -2684,7 +2684,7 @@ void txAbort(tid_t tid, int dirty)
2684 * mark filesystem dirty 2684 * mark filesystem dirty
2685 */ 2685 */
2686 if (dirty) 2686 if (dirty)
2687 jfs_error(tblk->sb, "txAbort"); 2687 jfs_error(tblk->sb, "\n");
2688 2688
2689 return; 2689 return;
2690} 2690}
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c
index 6c50871e6220..5ad7748860ce 100644
--- a/fs/jfs/jfs_xtree.c
+++ b/fs/jfs/jfs_xtree.c
@@ -64,22 +64,23 @@
64 64
65/* get page buffer for specified block address */ 65/* get page buffer for specified block address */
66/* ToDo: Replace this ugly macro with a function */ 66/* ToDo: Replace this ugly macro with a function */
67#define XT_GETPAGE(IP, BN, MP, SIZE, P, RC)\ 67#define XT_GETPAGE(IP, BN, MP, SIZE, P, RC) \
68{\ 68do { \
69 BT_GETPAGE(IP, BN, MP, xtpage_t, SIZE, P, RC, i_xtroot)\ 69 BT_GETPAGE(IP, BN, MP, xtpage_t, SIZE, P, RC, i_xtroot); \
70 if (!(RC))\ 70 if (!(RC)) { \
71 {\ 71 if ((le16_to_cpu((P)->header.nextindex) < XTENTRYSTART) || \
72 if ((le16_to_cpu((P)->header.nextindex) < XTENTRYSTART) ||\ 72 (le16_to_cpu((P)->header.nextindex) > \
73 (le16_to_cpu((P)->header.nextindex) > le16_to_cpu((P)->header.maxentry)) ||\ 73 le16_to_cpu((P)->header.maxentry)) || \
74 (le16_to_cpu((P)->header.maxentry) > (((BN)==0)?XTROOTMAXSLOT:PSIZE>>L2XTSLOTSIZE)))\ 74 (le16_to_cpu((P)->header.maxentry) > \
75 {\ 75 (((BN) == 0) ? XTROOTMAXSLOT : PSIZE >> L2XTSLOTSIZE))) { \
76 jfs_error((IP)->i_sb, "XT_GETPAGE: xtree page corrupt");\ 76 jfs_error((IP)->i_sb, \
77 BT_PUTPAGE(MP);\ 77 "XT_GETPAGE: xtree page corrupt\n"); \
78 MP = NULL;\ 78 BT_PUTPAGE(MP); \
79 RC = -EIO;\ 79 MP = NULL; \
80 }\ 80 RC = -EIO; \
81 }\ 81 } \
82} 82 } \
83} while (0)
83 84
84/* for consistency */ 85/* for consistency */
85#define XT_PUTPAGE(MP) BT_PUTPAGE(MP) 86#define XT_PUTPAGE(MP) BT_PUTPAGE(MP)
@@ -499,7 +500,7 @@ static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp,
499 500
500 /* push (bn, index) of the parent page/entry */ 501 /* push (bn, index) of the parent page/entry */
501 if (BT_STACK_FULL(btstack)) { 502 if (BT_STACK_FULL(btstack)) {
502 jfs_error(ip->i_sb, "stack overrun in xtSearch!"); 503 jfs_error(ip->i_sb, "stack overrun!\n");
503 XT_PUTPAGE(mp); 504 XT_PUTPAGE(mp);
504 return -EIO; 505 return -EIO;
505 } 506 }
@@ -1385,7 +1386,7 @@ int xtExtend(tid_t tid, /* transaction id */
1385 1386
1386 if (cmp != 0) { 1387 if (cmp != 0) {
1387 XT_PUTPAGE(mp); 1388 XT_PUTPAGE(mp);
1388 jfs_error(ip->i_sb, "xtExtend: xtSearch did not find extent"); 1389 jfs_error(ip->i_sb, "xtSearch did not find extent\n");
1389 return -EIO; 1390 return -EIO;
1390 } 1391 }
1391 1392
@@ -1393,7 +1394,7 @@ int xtExtend(tid_t tid, /* transaction id */
1393 xad = &p->xad[index]; 1394 xad = &p->xad[index];
1394 if ((offsetXAD(xad) + lengthXAD(xad)) != xoff) { 1395 if ((offsetXAD(xad) + lengthXAD(xad)) != xoff) {
1395 XT_PUTPAGE(mp); 1396 XT_PUTPAGE(mp);
1396 jfs_error(ip->i_sb, "xtExtend: extension is not contiguous"); 1397 jfs_error(ip->i_sb, "extension is not contiguous\n");
1397 return -EIO; 1398 return -EIO;
1398 } 1399 }
1399 1400
@@ -1552,7 +1553,7 @@ printf("xtTailgate: nxoff:0x%lx nxlen:0x%x nxaddr:0x%lx\n",
1552 1553
1553 if (cmp != 0) { 1554 if (cmp != 0) {
1554 XT_PUTPAGE(mp); 1555 XT_PUTPAGE(mp);
1555 jfs_error(ip->i_sb, "xtTailgate: couldn't find extent"); 1556 jfs_error(ip->i_sb, "couldn't find extent\n");
1556 return -EIO; 1557 return -EIO;
1557 } 1558 }
1558 1559
@@ -1560,8 +1561,7 @@ printf("xtTailgate: nxoff:0x%lx nxlen:0x%x nxaddr:0x%lx\n",
1560 nextindex = le16_to_cpu(p->header.nextindex); 1561 nextindex = le16_to_cpu(p->header.nextindex);
1561 if (index != nextindex - 1) { 1562 if (index != nextindex - 1) {
1562 XT_PUTPAGE(mp); 1563 XT_PUTPAGE(mp);
1563 jfs_error(ip->i_sb, 1564 jfs_error(ip->i_sb, "the entry found is not the last entry\n");
1564 "xtTailgate: the entry found is not the last entry");
1565 return -EIO; 1565 return -EIO;
1566 } 1566 }
1567 1567
@@ -1734,7 +1734,7 @@ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad)
1734 1734
1735 if (cmp != 0) { 1735 if (cmp != 0) {
1736 XT_PUTPAGE(mp); 1736 XT_PUTPAGE(mp);
1737 jfs_error(ip->i_sb, "xtUpdate: Could not find extent"); 1737 jfs_error(ip->i_sb, "Could not find extent\n");
1738 return -EIO; 1738 return -EIO;
1739 } 1739 }
1740 1740
@@ -1758,7 +1758,7 @@ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad)
1758 (nxoff + nxlen > xoff + xlen)) { 1758 (nxoff + nxlen > xoff + xlen)) {
1759 XT_PUTPAGE(mp); 1759 XT_PUTPAGE(mp);
1760 jfs_error(ip->i_sb, 1760 jfs_error(ip->i_sb,
1761 "xtUpdate: nXAD in not completely contained within XAD"); 1761 "nXAD in not completely contained within XAD\n");
1762 return -EIO; 1762 return -EIO;
1763 } 1763 }
1764 1764
@@ -1907,7 +1907,7 @@ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad)
1907 1907
1908 if (xoff >= nxoff) { 1908 if (xoff >= nxoff) {
1909 XT_PUTPAGE(mp); 1909 XT_PUTPAGE(mp);
1910 jfs_error(ip->i_sb, "xtUpdate: xoff >= nxoff"); 1910 jfs_error(ip->i_sb, "xoff >= nxoff\n");
1911 return -EIO; 1911 return -EIO;
1912 } 1912 }
1913/* #endif _JFS_WIP_COALESCE */ 1913/* #endif _JFS_WIP_COALESCE */
@@ -2048,14 +2048,13 @@ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad)
2048 2048
2049 if (cmp != 0) { 2049 if (cmp != 0) {
2050 XT_PUTPAGE(mp); 2050 XT_PUTPAGE(mp);
2051 jfs_error(ip->i_sb, "xtUpdate: xtSearch failed"); 2051 jfs_error(ip->i_sb, "xtSearch failed\n");
2052 return -EIO; 2052 return -EIO;
2053 } 2053 }
2054 2054
2055 if (index0 != index) { 2055 if (index0 != index) {
2056 XT_PUTPAGE(mp); 2056 XT_PUTPAGE(mp);
2057 jfs_error(ip->i_sb, 2057 jfs_error(ip->i_sb, "unexpected value of index\n");
2058 "xtUpdate: unexpected value of index");
2059 return -EIO; 2058 return -EIO;
2060 } 2059 }
2061 } 2060 }
@@ -3650,7 +3649,7 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag)
3650 getChild: 3649 getChild:
3651 /* save current parent entry for the child page */ 3650 /* save current parent entry for the child page */
3652 if (BT_STACK_FULL(&btstack)) { 3651 if (BT_STACK_FULL(&btstack)) {
3653 jfs_error(ip->i_sb, "stack overrun in xtTruncate!"); 3652 jfs_error(ip->i_sb, "stack overrun!\n");
3654 XT_PUTPAGE(mp); 3653 XT_PUTPAGE(mp);
3655 return -EIO; 3654 return -EIO;
3656 } 3655 }
@@ -3751,8 +3750,7 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size)
3751 3750
3752 if (cmp != 0) { 3751 if (cmp != 0) {
3753 XT_PUTPAGE(mp); 3752 XT_PUTPAGE(mp);
3754 jfs_error(ip->i_sb, 3753 jfs_error(ip->i_sb, "did not find extent\n");
3755 "xtTruncate_pmap: did not find extent");
3756 return -EIO; 3754 return -EIO;
3757 } 3755 }
3758 } else { 3756 } else {
@@ -3851,7 +3849,7 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size)
3851 getChild: 3849 getChild:
3852 /* save current parent entry for the child page */ 3850 /* save current parent entry for the child page */
3853 if (BT_STACK_FULL(&btstack)) { 3851 if (BT_STACK_FULL(&btstack)) {
3854 jfs_error(ip->i_sb, "stack overrun in xtTruncate_pmap!"); 3852 jfs_error(ip->i_sb, "stack overrun!\n");
3855 XT_PUTPAGE(mp); 3853 XT_PUTPAGE(mp);
3856 return -EIO; 3854 return -EIO;
3857 } 3855 }
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 8b19027291d6..aa8a3370631b 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1176,7 +1176,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1176 if (!S_ISDIR(old_ip->i_mode) && new_ip) 1176 if (!S_ISDIR(old_ip->i_mode) && new_ip)
1177 IWRITE_UNLOCK(new_ip); 1177 IWRITE_UNLOCK(new_ip);
1178 jfs_error(new_ip->i_sb, 1178 jfs_error(new_ip->i_sb,
1179 "jfs_rename: new_ip->i_nlink != 0"); 1179 "new_ip->i_nlink != 0\n");
1180 return -EIO; 1180 return -EIO;
1181 } 1181 }
1182 tblk = tid_to_tblock(tid); 1182 tblk = tid_to_tblock(tid);
diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c
index 8d0c1c7c0820..90b3bc21e9b0 100644
--- a/fs/jfs/resize.c
+++ b/fs/jfs/resize.c
@@ -530,7 +530,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
530 goto resume; 530 goto resume;
531 531
532 error_out: 532 error_out:
533 jfs_error(sb, "jfs_extendfs"); 533 jfs_error(sb, "\n");
534 534
535 resume: 535 resume:
536 /* 536 /*
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 788e0a9c1fb0..6669aa2042c3 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -92,16 +92,20 @@ static void jfs_handle_error(struct super_block *sb)
92 /* nothing is done for continue beyond marking the superblock dirty */ 92 /* nothing is done for continue beyond marking the superblock dirty */
93} 93}
94 94
95void jfs_error(struct super_block *sb, const char * function, ...) 95void jfs_error(struct super_block *sb, const char *fmt, ...)
96{ 96{
97 static char error_buf[256]; 97 struct va_format vaf;
98 va_list args; 98 va_list args;
99 99
100 va_start(args, function); 100 va_start(args, fmt);
101 vsnprintf(error_buf, sizeof(error_buf), function, args); 101
102 va_end(args); 102 vaf.fmt = fmt;
103 vaf.va = &args;
103 104
104 pr_err("ERROR: (device %s): %s\n", sb->s_id, error_buf); 105 pr_err("ERROR: (device %s): %pf: %pV\n",
106 sb->s_id, __builtin_return_address(0), &vaf);
107
108 va_end(args);
105 109
106 jfs_handle_error(sb); 110 jfs_handle_error(sb);
107} 111}
@@ -617,7 +621,7 @@ static int jfs_freeze(struct super_block *sb)
617 txQuiesce(sb); 621 txQuiesce(sb);
618 rc = lmLogShutdown(log); 622 rc = lmLogShutdown(log);
619 if (rc) { 623 if (rc) {
620 jfs_error(sb, "jfs_freeze: lmLogShutdown failed"); 624 jfs_error(sb, "lmLogShutdown failed\n");
621 625
622 /* let operations fail rather than hang */ 626 /* let operations fail rather than hang */
623 txResume(sb); 627 txResume(sb);
@@ -646,12 +650,12 @@ static int jfs_unfreeze(struct super_block *sb)
646 if (!(sb->s_flags & MS_RDONLY)) { 650 if (!(sb->s_flags & MS_RDONLY)) {
647 rc = updateSuper(sb, FM_MOUNT); 651 rc = updateSuper(sb, FM_MOUNT);
648 if (rc) { 652 if (rc) {
649 jfs_error(sb, "jfs_unfreeze: updateSuper failed"); 653 jfs_error(sb, "updateSuper failed\n");
650 goto out; 654 goto out;
651 } 655 }
652 rc = lmLogInit(log); 656 rc = lmLogInit(log);
653 if (rc) 657 if (rc)
654 jfs_error(sb, "jfs_unfreeze: lmLogInit failed"); 658 jfs_error(sb, "lmLogInit failed\n");
655out: 659out:
656 txResume(sb); 660 txResume(sb);
657 } 661 }
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 42d67f9757bf..d3472f4cd530 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -382,7 +382,7 @@ static int ea_read(struct inode *ip, struct jfs_ea_list *ealist)
382 382
383 nbytes = sizeDXD(&ji->ea); 383 nbytes = sizeDXD(&ji->ea);
384 if (!nbytes) { 384 if (!nbytes) {
385 jfs_error(sb, "ea_read: nbytes is 0"); 385 jfs_error(sb, "nbytes is 0\n");
386 return -EIO; 386 return -EIO;
387 } 387 }
388 388
@@ -482,7 +482,7 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size)
482 current_blocks = 0; 482 current_blocks = 0;
483 } else { 483 } else {
484 if (!(ji->ea.flag & DXD_EXTENT)) { 484 if (!(ji->ea.flag & DXD_EXTENT)) {
485 jfs_error(sb, "ea_get: invalid ea.flag)"); 485 jfs_error(sb, "invalid ea.flag\n");
486 return -EIO; 486 return -EIO;
487 } 487 }
488 current_blocks = (ea_size + sb->s_blocksize - 1) >> 488 current_blocks = (ea_size + sb->s_blocksize - 1) >>
@@ -1089,8 +1089,8 @@ int jfs_removexattr(struct dentry *dentry, const char *name)
1089} 1089}
1090 1090
1091#ifdef CONFIG_JFS_SECURITY 1091#ifdef CONFIG_JFS_SECURITY
1092int jfs_initxattrs(struct inode *inode, const struct xattr *xattr_array, 1092static int jfs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
1093 void *fs_info) 1093 void *fs_info)
1094{ 1094{
1095 const struct xattr *xattr; 1095 const struct xattr *xattr;
1096 tid_t *tid = fs_info; 1096 tid_t *tid = fs_info;
diff --git a/fs/libfs.c b/fs/libfs.c
index c3a0837fb861..3a3a9b53bf5a 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -61,7 +61,8 @@ struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, unsigned
61 61
62 if (dentry->d_name.len > NAME_MAX) 62 if (dentry->d_name.len > NAME_MAX)
63 return ERR_PTR(-ENAMETOOLONG); 63 return ERR_PTR(-ENAMETOOLONG);
64 d_set_d_op(dentry, &simple_dentry_operations); 64 if (!dentry->d_sb->s_d_op)
65 d_set_d_op(dentry, &simple_dentry_operations);
65 d_add(dentry, NULL); 66 d_add(dentry, NULL);
66 return NULL; 67 return NULL;
67} 68}
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 067778b0ccc9..e066a3902973 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -951,6 +951,7 @@ nlmsvc_retry_blocked(void)
951 unsigned long timeout = MAX_SCHEDULE_TIMEOUT; 951 unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
952 struct nlm_block *block; 952 struct nlm_block *block;
953 953
954 spin_lock(&nlm_blocked_lock);
954 while (!list_empty(&nlm_blocked) && !kthread_should_stop()) { 955 while (!list_empty(&nlm_blocked) && !kthread_should_stop()) {
955 block = list_entry(nlm_blocked.next, struct nlm_block, b_list); 956 block = list_entry(nlm_blocked.next, struct nlm_block, b_list);
956 957
@@ -960,6 +961,7 @@ nlmsvc_retry_blocked(void)
960 timeout = block->b_when - jiffies; 961 timeout = block->b_when - jiffies;
961 break; 962 break;
962 } 963 }
964 spin_unlock(&nlm_blocked_lock);
963 965
964 dprintk("nlmsvc_retry_blocked(%p, when=%ld)\n", 966 dprintk("nlmsvc_retry_blocked(%p, when=%ld)\n",
965 block, block->b_when); 967 block, block->b_when);
@@ -969,7 +971,9 @@ nlmsvc_retry_blocked(void)
969 retry_deferred_block(block); 971 retry_deferred_block(block);
970 } else 972 } else
971 nlmsvc_grant_blocked(block); 973 nlmsvc_grant_blocked(block);
974 spin_lock(&nlm_blocked_lock);
972 } 975 }
976 spin_unlock(&nlm_blocked_lock);
973 977
974 return timeout; 978 return timeout;
975} 979}
diff --git a/fs/locks.c b/fs/locks.c
index 04e2c1fdb157..b27a3005d78d 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -127,6 +127,8 @@
127#include <linux/rcupdate.h> 127#include <linux/rcupdate.h>
128#include <linux/pid_namespace.h> 128#include <linux/pid_namespace.h>
129#include <linux/hashtable.h> 129#include <linux/hashtable.h>
130#include <linux/percpu.h>
131#include <linux/lglock.h>
130 132
131#include <asm/uaccess.h> 133#include <asm/uaccess.h>
132 134
@@ -155,11 +157,13 @@ int lease_break_time = 45;
155 for (lockp = &inode->i_flock; *lockp != NULL; lockp = &(*lockp)->fl_next) 157 for (lockp = &inode->i_flock; *lockp != NULL; lockp = &(*lockp)->fl_next)
156 158
157/* 159/*
158 * The global file_lock_list is only used for displaying /proc/locks. Protected 160 * The global file_lock_list is only used for displaying /proc/locks, so we
159 * by the file_lock_lock. 161 * keep a list on each CPU, with each list protected by its own spinlock via
162 * the file_lock_lglock. Note that alterations to the list also require that
163 * the relevant i_lock is held.
160 */ 164 */
161static HLIST_HEAD(file_lock_list); 165DEFINE_STATIC_LGLOCK(file_lock_lglock);
162static DEFINE_SPINLOCK(file_lock_lock); 166static DEFINE_PER_CPU(struct hlist_head, file_lock_list);
163 167
164/* 168/*
165 * The blocked_hash is used to find POSIX lock loops for deadlock detection. 169 * The blocked_hash is used to find POSIX lock loops for deadlock detection.
@@ -506,20 +510,30 @@ static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2)
506 return fl1->fl_owner == fl2->fl_owner; 510 return fl1->fl_owner == fl2->fl_owner;
507} 511}
508 512
513/* Must be called with the i_lock held! */
509static inline void 514static inline void
510locks_insert_global_locks(struct file_lock *fl) 515locks_insert_global_locks(struct file_lock *fl)
511{ 516{
512 spin_lock(&file_lock_lock); 517 lg_local_lock(&file_lock_lglock);
513 hlist_add_head(&fl->fl_link, &file_lock_list); 518 fl->fl_link_cpu = smp_processor_id();
514 spin_unlock(&file_lock_lock); 519 hlist_add_head(&fl->fl_link, this_cpu_ptr(&file_lock_list));
520 lg_local_unlock(&file_lock_lglock);
515} 521}
516 522
523/* Must be called with the i_lock held! */
517static inline void 524static inline void
518locks_delete_global_locks(struct file_lock *fl) 525locks_delete_global_locks(struct file_lock *fl)
519{ 526{
520 spin_lock(&file_lock_lock); 527 /*
528 * Avoid taking lock if already unhashed. This is safe since this check
529 * is done while holding the i_lock, and new insertions into the list
530 * also require that it be held.
531 */
532 if (hlist_unhashed(&fl->fl_link))
533 return;
534 lg_local_lock_cpu(&file_lock_lglock, fl->fl_link_cpu);
521 hlist_del_init(&fl->fl_link); 535 hlist_del_init(&fl->fl_link);
522 spin_unlock(&file_lock_lock); 536 lg_local_unlock_cpu(&file_lock_lglock, fl->fl_link_cpu);
523} 537}
524 538
525static unsigned long 539static unsigned long
@@ -1454,7 +1468,7 @@ static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp
1454 if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) 1468 if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
1455 goto out; 1469 goto out;
1456 if ((arg == F_WRLCK) 1470 if ((arg == F_WRLCK)
1457 && ((dentry->d_count > 1) 1471 && ((d_count(dentry) > 1)
1458 || (atomic_read(&inode->i_count) > 1))) 1472 || (atomic_read(&inode->i_count) > 1)))
1459 goto out; 1473 goto out;
1460 1474
@@ -2243,6 +2257,11 @@ EXPORT_SYMBOL_GPL(vfs_cancel_lock);
2243#include <linux/proc_fs.h> 2257#include <linux/proc_fs.h>
2244#include <linux/seq_file.h> 2258#include <linux/seq_file.h>
2245 2259
2260struct locks_iterator {
2261 int li_cpu;
2262 loff_t li_pos;
2263};
2264
2246static void lock_get_status(struct seq_file *f, struct file_lock *fl, 2265static void lock_get_status(struct seq_file *f, struct file_lock *fl,
2247 loff_t id, char *pfx) 2266 loff_t id, char *pfx)
2248{ 2267{
@@ -2316,39 +2335,41 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
2316 2335
2317static int locks_show(struct seq_file *f, void *v) 2336static int locks_show(struct seq_file *f, void *v)
2318{ 2337{
2338 struct locks_iterator *iter = f->private;
2319 struct file_lock *fl, *bfl; 2339 struct file_lock *fl, *bfl;
2320 2340
2321 fl = hlist_entry(v, struct file_lock, fl_link); 2341 fl = hlist_entry(v, struct file_lock, fl_link);
2322 2342
2323 lock_get_status(f, fl, *((loff_t *)f->private), ""); 2343 lock_get_status(f, fl, iter->li_pos, "");
2324 2344
2325 list_for_each_entry(bfl, &fl->fl_block, fl_block) 2345 list_for_each_entry(bfl, &fl->fl_block, fl_block)
2326 lock_get_status(f, bfl, *((loff_t *)f->private), " ->"); 2346 lock_get_status(f, bfl, iter->li_pos, " ->");
2327 2347
2328 return 0; 2348 return 0;
2329} 2349}
2330 2350
2331static void *locks_start(struct seq_file *f, loff_t *pos) 2351static void *locks_start(struct seq_file *f, loff_t *pos)
2332{ 2352{
2333 loff_t *p = f->private; 2353 struct locks_iterator *iter = f->private;
2334 2354
2335 spin_lock(&file_lock_lock); 2355 iter->li_pos = *pos + 1;
2356 lg_global_lock(&file_lock_lglock);
2336 spin_lock(&blocked_lock_lock); 2357 spin_lock(&blocked_lock_lock);
2337 *p = (*pos + 1); 2358 return seq_hlist_start_percpu(&file_lock_list, &iter->li_cpu, *pos);
2338 return seq_hlist_start(&file_lock_list, *pos);
2339} 2359}
2340 2360
2341static void *locks_next(struct seq_file *f, void *v, loff_t *pos) 2361static void *locks_next(struct seq_file *f, void *v, loff_t *pos)
2342{ 2362{
2343 loff_t *p = f->private; 2363 struct locks_iterator *iter = f->private;
2344 ++*p; 2364
2345 return seq_hlist_next(v, &file_lock_list, pos); 2365 ++iter->li_pos;
2366 return seq_hlist_next_percpu(v, &file_lock_list, &iter->li_cpu, pos);
2346} 2367}
2347 2368
2348static void locks_stop(struct seq_file *f, void *v) 2369static void locks_stop(struct seq_file *f, void *v)
2349{ 2370{
2350 spin_unlock(&blocked_lock_lock); 2371 spin_unlock(&blocked_lock_lock);
2351 spin_unlock(&file_lock_lock); 2372 lg_global_unlock(&file_lock_lglock);
2352} 2373}
2353 2374
2354static const struct seq_operations locks_seq_operations = { 2375static const struct seq_operations locks_seq_operations = {
@@ -2360,7 +2381,8 @@ static const struct seq_operations locks_seq_operations = {
2360 2381
2361static int locks_open(struct inode *inode, struct file *filp) 2382static int locks_open(struct inode *inode, struct file *filp)
2362{ 2383{
2363 return seq_open_private(filp, &locks_seq_operations, sizeof(loff_t)); 2384 return seq_open_private(filp, &locks_seq_operations,
2385 sizeof(struct locks_iterator));
2364} 2386}
2365 2387
2366static const struct file_operations proc_locks_operations = { 2388static const struct file_operations proc_locks_operations = {
@@ -2460,9 +2482,16 @@ EXPORT_SYMBOL(lock_may_write);
2460 2482
2461static int __init filelock_init(void) 2483static int __init filelock_init(void)
2462{ 2484{
2485 int i;
2486
2463 filelock_cache = kmem_cache_create("file_lock_cache", 2487 filelock_cache = kmem_cache_create("file_lock_cache",
2464 sizeof(struct file_lock), 0, SLAB_PANIC, NULL); 2488 sizeof(struct file_lock), 0, SLAB_PANIC, NULL);
2465 2489
2490 lg_lock_init(&file_lock_lglock, "file_lock_lglock");
2491
2492 for_each_possible_cpu(i)
2493 INIT_HLIST_HEAD(per_cpu_ptr(&file_lock_list, i));
2494
2466 return 0; 2495 return 0;
2467} 2496}
2468 2497
diff --git a/fs/namei.c b/fs/namei.c
index b2beee7a733f..8b61d103a8a7 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2977,7 +2977,7 @@ static struct file *path_openat(int dfd, struct filename *pathname,
2977 2977
2978 file->f_flags = op->open_flag; 2978 file->f_flags = op->open_flag;
2979 2979
2980 if (unlikely(file->f_flags & O_TMPFILE)) { 2980 if (unlikely(file->f_flags & __O_TMPFILE)) {
2981 error = do_tmpfile(dfd, pathname, nd, flags, op, file, &opened); 2981 error = do_tmpfile(dfd, pathname, nd, flags, op, file, &opened);
2982 goto out; 2982 goto out;
2983 } 2983 }
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 0765ad12c382..4659da67e7f6 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -403,18 +403,24 @@ static int ncp_parse_options(struct ncp_mount_data_kernel *data, char *options)
403 switch (optval) { 403 switch (optval) {
404 case 'u': 404 case 'u':
405 data->uid = make_kuid(current_user_ns(), optint); 405 data->uid = make_kuid(current_user_ns(), optint);
406 if (!uid_valid(data->uid)) 406 if (!uid_valid(data->uid)) {
407 ret = -EINVAL;
407 goto err; 408 goto err;
409 }
408 break; 410 break;
409 case 'g': 411 case 'g':
410 data->gid = make_kgid(current_user_ns(), optint); 412 data->gid = make_kgid(current_user_ns(), optint);
411 if (!gid_valid(data->gid)) 413 if (!gid_valid(data->gid)) {
414 ret = -EINVAL;
412 goto err; 415 goto err;
416 }
413 break; 417 break;
414 case 'o': 418 case 'o':
415 data->mounted_uid = make_kuid(current_user_ns(), optint); 419 data->mounted_uid = make_kuid(current_user_ns(), optint);
416 if (!uid_valid(data->mounted_uid)) 420 if (!uid_valid(data->mounted_uid)) {
421 ret = -EINVAL;
417 goto err; 422 goto err;
423 }
418 break; 424 break;
419 case 'm': 425 case 'm':
420 data->file_mode = optint; 426 data->file_mode = optint;
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 13ca196385f5..b5e80b0af315 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -104,6 +104,15 @@ config NFS_V4_1
104 104
105 If unsure, say N. 105 If unsure, say N.
106 106
107config NFS_V4_2
108 bool "NFS client support for NFSv4.2"
109 depends on NFS_V4_1
110 help
111 This option enables support for minor version 2 of the NFSv4 protocol
112 in the kernel's NFS client.
113
114 If unsure, say N.
115
107config PNFS_FILE_LAYOUT 116config PNFS_FILE_LAYOUT
108 tristate 117 tristate
109 depends on NFS_V4_1 118 depends on NFS_V4_1
@@ -131,6 +140,11 @@ config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN
131 If the NFS client is unchanged from the upstream kernel, this 140 If the NFS client is unchanged from the upstream kernel, this
132 option should be set to the default "kernel.org". 141 option should be set to the default "kernel.org".
133 142
143config NFS_V4_SECURITY_LABEL
144 bool
145 depends on NFS_V4_2 && SECURITY
146 default y
147
134config ROOT_NFS 148config ROOT_NFS
135 bool "Root file system on NFS" 149 bool "Root file system on NFS"
136 depends on NFS_FS=y && IP_PNP 150 depends on NFS_FS=y && IP_PNP
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index cce2c057bd2d..e0bb048e9576 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -6,8 +6,7 @@ obj-$(CONFIG_NFS_FS) += nfs.o
6 6
7nfs-y := client.o dir.o file.o getroot.o inode.o super.o \ 7nfs-y := client.o dir.o file.o getroot.o inode.o super.o \
8 direct.o pagelist.o read.o symlink.o unlink.o \ 8 direct.o pagelist.o read.o symlink.o unlink.o \
9 write.o namespace.o mount_clnt.o \ 9 write.o namespace.o mount_clnt.o
10 dns_resolve.o cache_lib.o
11nfs-$(CONFIG_ROOT_NFS) += nfsroot.o 10nfs-$(CONFIG_ROOT_NFS) += nfsroot.o
12nfs-$(CONFIG_SYSCTL) += sysctl.o 11nfs-$(CONFIG_SYSCTL) += sysctl.o
13nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o 12nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o
@@ -22,7 +21,8 @@ nfsv3-$(CONFIG_NFS_V3_ACL) += nfs3acl.o
22obj-$(CONFIG_NFS_V4) += nfsv4.o 21obj-$(CONFIG_NFS_V4) += nfsv4.o
23nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o \ 22nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o \
24 delegation.o idmap.o callback.o callback_xdr.o callback_proc.o \ 23 delegation.o idmap.o callback.o callback_xdr.o callback_proc.o \
25 nfs4namespace.o nfs4getroot.o nfs4client.o 24 nfs4namespace.o nfs4getroot.o nfs4client.o dns_resolve.o
25nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o
26nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o 26nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o
27nfsv4-$(CONFIG_NFS_V4_1) += nfs4session.o pnfs.o pnfs_dev.o 27nfsv4-$(CONFIG_NFS_V4_1) += nfs4session.o pnfs.o pnfs_dev.o
28 28
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 434b93ec0970..e242bbf72972 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -1089,9 +1089,10 @@ nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
1089 dev->pgbase = 0; 1089 dev->pgbase = 0;
1090 dev->pglen = PAGE_SIZE * max_pages; 1090 dev->pglen = PAGE_SIZE * max_pages;
1091 dev->mincount = 0; 1091 dev->mincount = 0;
1092 dev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
1092 1093
1093 dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data); 1094 dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data);
1094 rc = nfs4_proc_getdeviceinfo(server, dev); 1095 rc = nfs4_proc_getdeviceinfo(server, dev, NULL);
1095 dprintk("%s getdevice info returns %d\n", __func__, rc); 1096 dprintk("%s getdevice info returns %d\n", __func__, rc);
1096 if (rc) { 1097 if (rc) {
1097 rv = ERR_PTR(rc); 1098 rv = ERR_PTR(rc);
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index da6a43d19aa3..67cd73213168 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -281,6 +281,7 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, struct n
281 ret = nfs4_callback_up_net(serv, net); 281 ret = nfs4_callback_up_net(serv, net);
282 break; 282 break;
283 case 1: 283 case 1:
284 case 2:
284 ret = nfs41_callback_up_net(serv, net); 285 ret = nfs41_callback_up_net(serv, net);
285 break; 286 break;
286 default: 287 default:
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index efd54f0a4c46..84326e9fb47a 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -32,6 +32,8 @@ enum nfs4_callback_opnum {
32 OP_CB_WANTS_CANCELLED = 12, 32 OP_CB_WANTS_CANCELLED = 12,
33 OP_CB_NOTIFY_LOCK = 13, 33 OP_CB_NOTIFY_LOCK = 13,
34 OP_CB_NOTIFY_DEVICEID = 14, 34 OP_CB_NOTIFY_DEVICEID = 14,
35/* Callback operations new to NFSv4.2 */
36 OP_CB_OFFLOAD = 15,
35 OP_CB_ILLEGAL = 10044, 37 OP_CB_ILLEGAL = 10044,
36}; 38};
37 39
@@ -39,6 +41,7 @@ struct cb_process_state {
39 __be32 drc_status; 41 __be32 drc_status;
40 struct nfs_client *clp; 42 struct nfs_client *clp;
41 u32 slotid; 43 u32 slotid;
44 u32 minorversion;
42 struct net *net; 45 struct net *net;
43}; 46};
44 47
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 0bc27684ebfa..e6ebc4c38c81 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -406,7 +406,8 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
406 int i; 406 int i;
407 __be32 status = htonl(NFS4ERR_BADSESSION); 407 __be32 status = htonl(NFS4ERR_BADSESSION);
408 408
409 clp = nfs4_find_client_sessionid(cps->net, args->csa_addr, &args->csa_sessionid); 409 clp = nfs4_find_client_sessionid(cps->net, args->csa_addr,
410 &args->csa_sessionid, cps->minorversion);
410 if (clp == NULL) 411 if (clp == NULL)
411 goto out; 412 goto out;
412 413
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index a35582c9d444..f4ccfe6521ec 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -166,9 +166,9 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound
166 if (unlikely(p == NULL)) 166 if (unlikely(p == NULL))
167 return htonl(NFS4ERR_RESOURCE); 167 return htonl(NFS4ERR_RESOURCE);
168 hdr->minorversion = ntohl(*p++); 168 hdr->minorversion = ntohl(*p++);
169 /* Check minor version is zero or one. */ 169 /* Check for minor version support */
170 if (hdr->minorversion <= 1) { 170 if (hdr->minorversion <= NFS4_MAX_MINOR_VERSION) {
171 hdr->cb_ident = ntohl(*p++); /* ignored by v4.1 */ 171 hdr->cb_ident = ntohl(*p++); /* ignored by v4.1 and v4.2 */
172 } else { 172 } else {
173 pr_warn_ratelimited("NFS: %s: NFSv4 server callback with " 173 pr_warn_ratelimited("NFS: %s: NFSv4 server callback with "
174 "illegal minor version %u!\n", 174 "illegal minor version %u!\n",
@@ -786,6 +786,26 @@ static void nfs4_cb_free_slot(struct cb_process_state *cps)
786} 786}
787#endif /* CONFIG_NFS_V4_1 */ 787#endif /* CONFIG_NFS_V4_1 */
788 788
789#ifdef CONFIG_NFS_V4_2
790static __be32
791preprocess_nfs42_op(int nop, unsigned int op_nr, struct callback_op **op)
792{
793 __be32 status = preprocess_nfs41_op(nop, op_nr, op);
794 if (status != htonl(NFS4ERR_OP_ILLEGAL))
795 return status;
796
797 if (op_nr == OP_CB_OFFLOAD)
798 return htonl(NFS4ERR_NOTSUPP);
799 return htonl(NFS4ERR_OP_ILLEGAL);
800}
801#else /* CONFIG_NFS_V4_2 */
802static __be32
803preprocess_nfs42_op(int nop, unsigned int op_nr, struct callback_op **op)
804{
805 return htonl(NFS4ERR_MINOR_VERS_MISMATCH);
806}
807#endif /* CONFIG_NFS_V4_2 */
808
789static __be32 809static __be32
790preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op) 810preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op)
791{ 811{
@@ -801,8 +821,7 @@ preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op)
801 return htonl(NFS_OK); 821 return htonl(NFS_OK);
802} 822}
803 823
804static __be32 process_op(uint32_t minorversion, int nop, 824static __be32 process_op(int nop, struct svc_rqst *rqstp,
805 struct svc_rqst *rqstp,
806 struct xdr_stream *xdr_in, void *argp, 825 struct xdr_stream *xdr_in, void *argp,
807 struct xdr_stream *xdr_out, void *resp, 826 struct xdr_stream *xdr_out, void *resp,
808 struct cb_process_state *cps) 827 struct cb_process_state *cps)
@@ -819,10 +838,22 @@ static __be32 process_op(uint32_t minorversion, int nop,
819 return status; 838 return status;
820 839
821 dprintk("%s: minorversion=%d nop=%d op_nr=%u\n", 840 dprintk("%s: minorversion=%d nop=%d op_nr=%u\n",
822 __func__, minorversion, nop, op_nr); 841 __func__, cps->minorversion, nop, op_nr);
842
843 switch (cps->minorversion) {
844 case 0:
845 status = preprocess_nfs4_op(op_nr, &op);
846 break;
847 case 1:
848 status = preprocess_nfs41_op(nop, op_nr, &op);
849 break;
850 case 2:
851 status = preprocess_nfs42_op(nop, op_nr, &op);
852 break;
853 default:
854 status = htonl(NFS4ERR_MINOR_VERS_MISMATCH);
855 }
823 856
824 status = minorversion ? preprocess_nfs41_op(nop, op_nr, &op) :
825 preprocess_nfs4_op(op_nr, &op);
826 if (status == htonl(NFS4ERR_OP_ILLEGAL)) 857 if (status == htonl(NFS4ERR_OP_ILLEGAL))
827 op_nr = OP_CB_ILLEGAL; 858 op_nr = OP_CB_ILLEGAL;
828 if (status) 859 if (status)
@@ -885,14 +916,15 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
885 return rpc_drop_reply; 916 return rpc_drop_reply;
886 } 917 }
887 918
919 cps.minorversion = hdr_arg.minorversion;
888 hdr_res.taglen = hdr_arg.taglen; 920 hdr_res.taglen = hdr_arg.taglen;
889 hdr_res.tag = hdr_arg.tag; 921 hdr_res.tag = hdr_arg.tag;
890 if (encode_compound_hdr_res(&xdr_out, &hdr_res) != 0) 922 if (encode_compound_hdr_res(&xdr_out, &hdr_res) != 0)
891 return rpc_system_err; 923 return rpc_system_err;
892 924
893 while (status == 0 && nops != hdr_arg.nops) { 925 while (status == 0 && nops != hdr_arg.nops) {
894 status = process_op(hdr_arg.minorversion, nops, rqstp, 926 status = process_op(nops, rqstp, &xdr_in,
895 &xdr_in, argp, &xdr_out, resp, &cps); 927 argp, &xdr_out, resp, &cps);
896 nops++; 928 nops++;
897 } 929 }
898 930
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index c513b0cc835f..340b1eff0267 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -753,8 +753,6 @@ static int nfs_init_server(struct nfs_server *server,
753 data->timeo, data->retrans); 753 data->timeo, data->retrans);
754 if (data->flags & NFS_MOUNT_NORESVPORT) 754 if (data->flags & NFS_MOUNT_NORESVPORT)
755 set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); 755 set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
756 if (server->options & NFS_OPTION_MIGRATION)
757 set_bit(NFS_CS_MIGRATION, &cl_init.init_flags);
758 756
759 /* Allocate or find a client reference we can use */ 757 /* Allocate or find a client reference we can use */
760 clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX); 758 clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX);
@@ -1076,7 +1074,7 @@ struct nfs_server *nfs_create_server(struct nfs_mount_info *mount_info,
1076 } 1074 }
1077 1075
1078 if (!(fattr->valid & NFS_ATTR_FATTR)) { 1076 if (!(fattr->valid & NFS_ATTR_FATTR)) {
1079 error = nfs_mod->rpc_ops->getattr(server, mount_info->mntfh, fattr); 1077 error = nfs_mod->rpc_ops->getattr(server, mount_info->mntfh, fattr, NULL);
1080 if (error < 0) { 1078 if (error < 0) {
1081 dprintk("nfs_create_server: getattr error = %d\n", -error); 1079 dprintk("nfs_create_server: getattr error = %d\n", -error);
1082 goto error; 1080 goto error;
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index d7ed697133f0..e474ca2b2bfe 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -437,6 +437,7 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
437 struct dentry *alias; 437 struct dentry *alias;
438 struct inode *dir = parent->d_inode; 438 struct inode *dir = parent->d_inode;
439 struct inode *inode; 439 struct inode *inode;
440 int status;
440 441
441 if (filename.name[0] == '.') { 442 if (filename.name[0] == '.') {
442 if (filename.len == 1) 443 if (filename.len == 1)
@@ -449,7 +450,10 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
449 dentry = d_lookup(parent, &filename); 450 dentry = d_lookup(parent, &filename);
450 if (dentry != NULL) { 451 if (dentry != NULL) {
451 if (nfs_same_file(dentry, entry)) { 452 if (nfs_same_file(dentry, entry)) {
452 nfs_refresh_inode(dentry->d_inode, entry->fattr); 453 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
454 status = nfs_refresh_inode(dentry->d_inode, entry->fattr);
455 if (!status)
456 nfs_setsecurity(dentry->d_inode, entry->fattr, entry->label);
453 goto out; 457 goto out;
454 } else { 458 } else {
455 if (d_invalidate(dentry) != 0) 459 if (d_invalidate(dentry) != 0)
@@ -462,7 +466,7 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
462 if (dentry == NULL) 466 if (dentry == NULL)
463 return; 467 return;
464 468
465 inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr); 469 inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr, entry->label);
466 if (IS_ERR(inode)) 470 if (IS_ERR(inode))
467 goto out; 471 goto out;
468 472
@@ -587,10 +591,16 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
587 if (entry.fh == NULL || entry.fattr == NULL) 591 if (entry.fh == NULL || entry.fattr == NULL)
588 goto out; 592 goto out;
589 593
594 entry.label = nfs4_label_alloc(NFS_SERVER(inode), GFP_NOWAIT);
595 if (IS_ERR(entry.label)) {
596 status = PTR_ERR(entry.label);
597 goto out;
598 }
599
590 array = nfs_readdir_get_array(page); 600 array = nfs_readdir_get_array(page);
591 if (IS_ERR(array)) { 601 if (IS_ERR(array)) {
592 status = PTR_ERR(array); 602 status = PTR_ERR(array);
593 goto out; 603 goto out_label_free;
594 } 604 }
595 memset(array, 0, sizeof(struct nfs_cache_array)); 605 memset(array, 0, sizeof(struct nfs_cache_array));
596 array->eof_index = -1; 606 array->eof_index = -1;
@@ -616,6 +626,8 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
616 nfs_readdir_free_large_page(pages_ptr, pages, array_size); 626 nfs_readdir_free_large_page(pages_ptr, pages, array_size);
617out_release_array: 627out_release_array:
618 nfs_readdir_release_array(page); 628 nfs_readdir_release_array(page);
629out_label_free:
630 nfs4_label_free(entry.label);
619out: 631out:
620 nfs_free_fattr(entry.fattr); 632 nfs_free_fattr(entry.fattr);
621 nfs_free_fhandle(entry.fh); 633 nfs_free_fhandle(entry.fh);
@@ -806,7 +818,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
806 nfs_readdir_descriptor_t my_desc, 818 nfs_readdir_descriptor_t my_desc,
807 *desc = &my_desc; 819 *desc = &my_desc;
808 struct nfs_open_dir_context *dir_ctx = file->private_data; 820 struct nfs_open_dir_context *dir_ctx = file->private_data;
809 int res; 821 int res = 0;
810 822
811 dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n", 823 dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n",
812 dentry->d_parent->d_name.name, dentry->d_name.name, 824 dentry->d_parent->d_name.name, dentry->d_name.name,
@@ -828,7 +840,8 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
828 desc->plus = nfs_use_readdirplus(inode, ctx) ? 1 : 0; 840 desc->plus = nfs_use_readdirplus(inode, ctx) ? 1 : 0;
829 841
830 nfs_block_sillyrename(dentry); 842 nfs_block_sillyrename(dentry);
831 res = nfs_revalidate_mapping(inode, file->f_mapping); 843 if (ctx->pos == 0 || nfs_attribute_cache_expired(inode))
844 res = nfs_revalidate_mapping(inode, file->f_mapping);
832 if (res < 0) 845 if (res < 0)
833 goto out; 846 goto out;
834 847
@@ -1040,6 +1053,7 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
1040 struct dentry *parent; 1053 struct dentry *parent;
1041 struct nfs_fh *fhandle = NULL; 1054 struct nfs_fh *fhandle = NULL;
1042 struct nfs_fattr *fattr = NULL; 1055 struct nfs_fattr *fattr = NULL;
1056 struct nfs4_label *label = NULL;
1043 int error; 1057 int error;
1044 1058
1045 if (flags & LOOKUP_RCU) 1059 if (flags & LOOKUP_RCU)
@@ -1082,7 +1096,11 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
1082 if (fhandle == NULL || fattr == NULL) 1096 if (fhandle == NULL || fattr == NULL)
1083 goto out_error; 1097 goto out_error;
1084 1098
1085 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); 1099 label = nfs4_label_alloc(NFS_SERVER(inode), GFP_NOWAIT);
1100 if (IS_ERR(label))
1101 goto out_error;
1102
1103 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label);
1086 if (error) 1104 if (error)
1087 goto out_bad; 1105 goto out_bad;
1088 if (nfs_compare_fh(NFS_FH(inode), fhandle)) 1106 if (nfs_compare_fh(NFS_FH(inode), fhandle))
@@ -1090,8 +1108,12 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
1090 if ((error = nfs_refresh_inode(inode, fattr)) != 0) 1108 if ((error = nfs_refresh_inode(inode, fattr)) != 0)
1091 goto out_bad; 1109 goto out_bad;
1092 1110
1111 nfs_setsecurity(inode, fattr, label);
1112
1093 nfs_free_fattr(fattr); 1113 nfs_free_fattr(fattr);
1094 nfs_free_fhandle(fhandle); 1114 nfs_free_fhandle(fhandle);
1115 nfs4_label_free(label);
1116
1095out_set_verifier: 1117out_set_verifier:
1096 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 1118 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1097 out_valid: 1119 out_valid:
@@ -1108,6 +1130,7 @@ out_zap_parent:
1108 out_bad: 1130 out_bad:
1109 nfs_free_fattr(fattr); 1131 nfs_free_fattr(fattr);
1110 nfs_free_fhandle(fhandle); 1132 nfs_free_fhandle(fhandle);
1133 nfs4_label_free(label);
1111 nfs_mark_for_revalidate(dir); 1134 nfs_mark_for_revalidate(dir);
1112 if (inode && S_ISDIR(inode->i_mode)) { 1135 if (inode && S_ISDIR(inode->i_mode)) {
1113 /* Purge readdir caches. */ 1136 /* Purge readdir caches. */
@@ -1128,6 +1151,7 @@ out_zap_parent:
1128out_error: 1151out_error:
1129 nfs_free_fattr(fattr); 1152 nfs_free_fattr(fattr);
1130 nfs_free_fhandle(fhandle); 1153 nfs_free_fhandle(fhandle);
1154 nfs4_label_free(label);
1131 dput(parent); 1155 dput(parent);
1132 dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) lookup returned error %d\n", 1156 dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) lookup returned error %d\n",
1133 __func__, dentry->d_parent->d_name.name, 1157 __func__, dentry->d_parent->d_name.name,
@@ -1256,6 +1280,7 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in
1256 struct inode *inode = NULL; 1280 struct inode *inode = NULL;
1257 struct nfs_fh *fhandle = NULL; 1281 struct nfs_fh *fhandle = NULL;
1258 struct nfs_fattr *fattr = NULL; 1282 struct nfs_fattr *fattr = NULL;
1283 struct nfs4_label *label = NULL;
1259 int error; 1284 int error;
1260 1285
1261 dfprintk(VFS, "NFS: lookup(%s/%s)\n", 1286 dfprintk(VFS, "NFS: lookup(%s/%s)\n",
@@ -1282,17 +1307,21 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in
1282 if (fhandle == NULL || fattr == NULL) 1307 if (fhandle == NULL || fattr == NULL)
1283 goto out; 1308 goto out;
1284 1309
1310 label = nfs4_label_alloc(NFS_SERVER(dir), GFP_NOWAIT);
1311 if (IS_ERR(label))
1312 goto out;
1313
1285 parent = dentry->d_parent; 1314 parent = dentry->d_parent;
1286 /* Protect against concurrent sillydeletes */ 1315 /* Protect against concurrent sillydeletes */
1287 nfs_block_sillyrename(parent); 1316 nfs_block_sillyrename(parent);
1288 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); 1317 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label);
1289 if (error == -ENOENT) 1318 if (error == -ENOENT)
1290 goto no_entry; 1319 goto no_entry;
1291 if (error < 0) { 1320 if (error < 0) {
1292 res = ERR_PTR(error); 1321 res = ERR_PTR(error);
1293 goto out_unblock_sillyrename; 1322 goto out_unblock_sillyrename;
1294 } 1323 }
1295 inode = nfs_fhget(dentry->d_sb, fhandle, fattr); 1324 inode = nfs_fhget(dentry->d_sb, fhandle, fattr, label);
1296 res = ERR_CAST(inode); 1325 res = ERR_CAST(inode);
1297 if (IS_ERR(res)) 1326 if (IS_ERR(res))
1298 goto out_unblock_sillyrename; 1327 goto out_unblock_sillyrename;
@@ -1310,6 +1339,7 @@ no_entry:
1310 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 1339 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1311out_unblock_sillyrename: 1340out_unblock_sillyrename:
1312 nfs_unblock_sillyrename(parent); 1341 nfs_unblock_sillyrename(parent);
1342 nfs4_label_free(label);
1313out: 1343out:
1314 nfs_free_fattr(fattr); 1344 nfs_free_fattr(fattr);
1315 nfs_free_fhandle(fhandle); 1345 nfs_free_fhandle(fhandle);
@@ -1357,18 +1387,6 @@ static int nfs_finish_open(struct nfs_open_context *ctx,
1357{ 1387{
1358 int err; 1388 int err;
1359 1389
1360 if (ctx->dentry != dentry) {
1361 dput(ctx->dentry);
1362 ctx->dentry = dget(dentry);
1363 }
1364
1365 /* If the open_intent is for execute, we have an extra check to make */
1366 if (ctx->mode & FMODE_EXEC) {
1367 err = nfs_may_open(dentry->d_inode, ctx->cred, open_flags);
1368 if (err < 0)
1369 goto out;
1370 }
1371
1372 err = finish_open(file, dentry, do_open, opened); 1390 err = finish_open(file, dentry, do_open, opened);
1373 if (err) 1391 if (err)
1374 goto out; 1392 goto out;
@@ -1427,13 +1445,13 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
1427 1445
1428 nfs_block_sillyrename(dentry->d_parent); 1446 nfs_block_sillyrename(dentry->d_parent);
1429 inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr); 1447 inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr);
1430 d_drop(dentry); 1448 nfs_unblock_sillyrename(dentry->d_parent);
1431 if (IS_ERR(inode)) { 1449 if (IS_ERR(inode)) {
1432 nfs_unblock_sillyrename(dentry->d_parent);
1433 put_nfs_open_context(ctx); 1450 put_nfs_open_context(ctx);
1434 err = PTR_ERR(inode); 1451 err = PTR_ERR(inode);
1435 switch (err) { 1452 switch (err) {
1436 case -ENOENT: 1453 case -ENOENT:
1454 d_drop(dentry);
1437 d_add(dentry, NULL); 1455 d_add(dentry, NULL);
1438 break; 1456 break;
1439 case -EISDIR: 1457 case -EISDIR:
@@ -1449,16 +1467,8 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
1449 } 1467 }
1450 goto out; 1468 goto out;
1451 } 1469 }
1452 res = d_add_unique(dentry, inode);
1453 if (res != NULL)
1454 dentry = res;
1455
1456 nfs_unblock_sillyrename(dentry->d_parent);
1457 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1458
1459 err = nfs_finish_open(ctx, dentry, file, open_flags, opened);
1460 1470
1461 dput(res); 1471 err = nfs_finish_open(ctx, ctx->dentry, file, open_flags, opened);
1462out: 1472out:
1463 return err; 1473 return err;
1464 1474
@@ -1528,7 +1538,8 @@ no_open:
1528 * Code common to create, mkdir, and mknod. 1538 * Code common to create, mkdir, and mknod.
1529 */ 1539 */
1530int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle, 1540int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
1531 struct nfs_fattr *fattr) 1541 struct nfs_fattr *fattr,
1542 struct nfs4_label *label)
1532{ 1543{
1533 struct dentry *parent = dget_parent(dentry); 1544 struct dentry *parent = dget_parent(dentry);
1534 struct inode *dir = parent->d_inode; 1545 struct inode *dir = parent->d_inode;
@@ -1541,18 +1552,18 @@ int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
1541 if (dentry->d_inode) 1552 if (dentry->d_inode)
1542 goto out; 1553 goto out;
1543 if (fhandle->size == 0) { 1554 if (fhandle->size == 0) {
1544 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); 1555 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, NULL);
1545 if (error) 1556 if (error)
1546 goto out_error; 1557 goto out_error;
1547 } 1558 }
1548 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 1559 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1549 if (!(fattr->valid & NFS_ATTR_FATTR)) { 1560 if (!(fattr->valid & NFS_ATTR_FATTR)) {
1550 struct nfs_server *server = NFS_SB(dentry->d_sb); 1561 struct nfs_server *server = NFS_SB(dentry->d_sb);
1551 error = server->nfs_client->rpc_ops->getattr(server, fhandle, fattr); 1562 error = server->nfs_client->rpc_ops->getattr(server, fhandle, fattr, NULL);
1552 if (error < 0) 1563 if (error < 0)
1553 goto out_error; 1564 goto out_error;
1554 } 1565 }
1555 inode = nfs_fhget(dentry->d_sb, fhandle, fattr); 1566 inode = nfs_fhget(dentry->d_sb, fhandle, fattr, label);
1556 error = PTR_ERR(inode); 1567 error = PTR_ERR(inode);
1557 if (IS_ERR(inode)) 1568 if (IS_ERR(inode))
1558 goto out_error; 1569 goto out_error;
@@ -1721,7 +1732,7 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry)
1721 dir->i_ino, dentry->d_name.name); 1732 dir->i_ino, dentry->d_name.name);
1722 1733
1723 spin_lock(&dentry->d_lock); 1734 spin_lock(&dentry->d_lock);
1724 if (dentry->d_count > 1) { 1735 if (d_count(dentry) > 1) {
1725 spin_unlock(&dentry->d_lock); 1736 spin_unlock(&dentry->d_lock);
1726 /* Start asynchronous writeout of the inode */ 1737 /* Start asynchronous writeout of the inode */
1727 write_inode_now(dentry->d_inode, 0); 1738 write_inode_now(dentry->d_inode, 0);
@@ -1866,7 +1877,7 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1866 dfprintk(VFS, "NFS: rename(%s/%s -> %s/%s, ct=%d)\n", 1877 dfprintk(VFS, "NFS: rename(%s/%s -> %s/%s, ct=%d)\n",
1867 old_dentry->d_parent->d_name.name, old_dentry->d_name.name, 1878 old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
1868 new_dentry->d_parent->d_name.name, new_dentry->d_name.name, 1879 new_dentry->d_parent->d_name.name, new_dentry->d_name.name,
1869 new_dentry->d_count); 1880 d_count(new_dentry));
1870 1881
1871 /* 1882 /*
1872 * For non-directories, check whether the target is busy and if so, 1883 * For non-directories, check whether the target is busy and if so,
@@ -1884,7 +1895,7 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1884 rehash = new_dentry; 1895 rehash = new_dentry;
1885 } 1896 }
1886 1897
1887 if (new_dentry->d_count > 2) { 1898 if (d_count(new_dentry) > 2) {
1888 int err; 1899 int err;
1889 1900
1890 /* copy the target dentry's name */ 1901 /* copy the target dentry's name */
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index 945527092295..fc0f95ec7358 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -29,7 +29,6 @@ ssize_t nfs_dns_resolve_name(struct net *net, char *name, size_t namelen,
29 kfree(ip_addr); 29 kfree(ip_addr);
30 return ret; 30 return ret;
31} 31}
32EXPORT_SYMBOL_GPL(nfs_dns_resolve_name);
33 32
34#else 33#else
35 34
@@ -351,7 +350,6 @@ ssize_t nfs_dns_resolve_name(struct net *net, char *name,
351 ret = -ESRCH; 350 ret = -ESRCH;
352 return ret; 351 return ret;
353} 352}
354EXPORT_SYMBOL_GPL(nfs_dns_resolve_name);
355 353
356static struct cache_detail nfs_dns_resolve_template = { 354static struct cache_detail nfs_dns_resolve_template = {
357 .owner = THIS_MODULE, 355 .owner = THIS_MODULE,
@@ -396,6 +394,21 @@ void nfs_dns_resolver_cache_destroy(struct net *net)
396 cache_destroy_net(nn->nfs_dns_resolve, net); 394 cache_destroy_net(nn->nfs_dns_resolve, net);
397} 395}
398 396
397static int nfs4_dns_net_init(struct net *net)
398{
399 return nfs_dns_resolver_cache_init(net);
400}
401
402static void nfs4_dns_net_exit(struct net *net)
403{
404 nfs_dns_resolver_cache_destroy(net);
405}
406
407static struct pernet_operations nfs4_dns_resolver_ops = {
408 .init = nfs4_dns_net_init,
409 .exit = nfs4_dns_net_exit,
410};
411
399static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event, 412static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
400 void *ptr) 413 void *ptr)
401{ 414{
@@ -432,11 +445,24 @@ static struct notifier_block nfs_dns_resolver_block = {
432 445
433int nfs_dns_resolver_init(void) 446int nfs_dns_resolver_init(void)
434{ 447{
435 return rpc_pipefs_notifier_register(&nfs_dns_resolver_block); 448 int err;
449
450 err = register_pernet_subsys(&nfs4_dns_resolver_ops);
451 if (err < 0)
452 goto out;
453 err = rpc_pipefs_notifier_register(&nfs_dns_resolver_block);
454 if (err < 0)
455 goto out1;
456 return 0;
457out1:
458 unregister_pernet_subsys(&nfs4_dns_resolver_ops);
459out:
460 return err;
436} 461}
437 462
438void nfs_dns_resolver_destroy(void) 463void nfs_dns_resolver_destroy(void)
439{ 464{
440 rpc_pipefs_notifier_unregister(&nfs_dns_resolver_block); 465 rpc_pipefs_notifier_unregister(&nfs_dns_resolver_block);
466 unregister_pernet_subsys(&nfs4_dns_resolver_ops);
441} 467}
442#endif 468#endif
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index 44efaa8c5f78..66984a9aafaa 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -95,7 +95,7 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh,
95 goto out; 95 goto out;
96 } 96 }
97 97
98 inode = nfs_fhget(sb, mntfh, fsinfo.fattr); 98 inode = nfs_fhget(sb, mntfh, fsinfo.fattr, NULL);
99 if (IS_ERR(inode)) { 99 if (IS_ERR(inode)) {
100 dprintk("nfs_get_root: get root inode failed\n"); 100 dprintk("nfs_get_root: get root inode failed\n");
101 ret = ERR_CAST(inode); 101 ret = ERR_CAST(inode);
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index c516da5873fd..c2c4163d5683 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -262,29 +262,42 @@ static ssize_t nfs_idmap_get_desc(const char *name, size_t namelen,
262 return desclen; 262 return desclen;
263} 263}
264 264
265static ssize_t nfs_idmap_request_key(struct key_type *key_type, 265static struct key *nfs_idmap_request_key(const char *name, size_t namelen,
266 const char *name, size_t namelen, 266 const char *type, struct idmap *idmap)
267 const char *type, void *data,
268 size_t data_size, struct idmap *idmap)
269{ 267{
270 const struct cred *saved_cred;
271 struct key *rkey;
272 char *desc; 268 char *desc;
273 struct user_key_payload *payload; 269 struct key *rkey;
274 ssize_t ret; 270 ssize_t ret;
275 271
276 ret = nfs_idmap_get_desc(name, namelen, type, strlen(type), &desc); 272 ret = nfs_idmap_get_desc(name, namelen, type, strlen(type), &desc);
277 if (ret <= 0) 273 if (ret <= 0)
278 goto out; 274 return ERR_PTR(ret);
275
276 rkey = request_key(&key_type_id_resolver, desc, "");
277 if (IS_ERR(rkey)) {
278 mutex_lock(&idmap->idmap_mutex);
279 rkey = request_key_with_auxdata(&key_type_id_resolver_legacy,
280 desc, "", 0, idmap);
281 mutex_unlock(&idmap->idmap_mutex);
282 }
283
284 kfree(desc);
285 return rkey;
286}
287
288static ssize_t nfs_idmap_get_key(const char *name, size_t namelen,
289 const char *type, void *data,
290 size_t data_size, struct idmap *idmap)
291{
292 const struct cred *saved_cred;
293 struct key *rkey;
294 struct user_key_payload *payload;
295 ssize_t ret;
279 296
280 saved_cred = override_creds(id_resolver_cache); 297 saved_cred = override_creds(id_resolver_cache);
281 if (idmap) 298 rkey = nfs_idmap_request_key(name, namelen, type, idmap);
282 rkey = request_key_with_auxdata(key_type, desc, "", 0, idmap);
283 else
284 rkey = request_key(&key_type_id_resolver, desc, "");
285 revert_creds(saved_cred); 299 revert_creds(saved_cred);
286 300
287 kfree(desc);
288 if (IS_ERR(rkey)) { 301 if (IS_ERR(rkey)) {
289 ret = PTR_ERR(rkey); 302 ret = PTR_ERR(rkey);
290 goto out; 303 goto out;
@@ -316,23 +329,6 @@ out:
316 return ret; 329 return ret;
317} 330}
318 331
319static ssize_t nfs_idmap_get_key(const char *name, size_t namelen,
320 const char *type, void *data,
321 size_t data_size, struct idmap *idmap)
322{
323 ssize_t ret = nfs_idmap_request_key(&key_type_id_resolver,
324 name, namelen, type, data,
325 data_size, NULL);
326 if (ret < 0) {
327 mutex_lock(&idmap->idmap_mutex);
328 ret = nfs_idmap_request_key(&key_type_id_resolver_legacy,
329 name, namelen, type, data,
330 data_size, idmap);
331 mutex_unlock(&idmap->idmap_mutex);
332 }
333 return ret;
334}
335
336/* ID -> Name */ 332/* ID -> Name */
337static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, char *buf, 333static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, char *buf,
338 size_t buflen, struct idmap *idmap) 334 size_t buflen, struct idmap *idmap)
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index ce727047ee87..af6e806044d7 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -48,7 +48,6 @@
48#include "iostat.h" 48#include "iostat.h"
49#include "internal.h" 49#include "internal.h"
50#include "fscache.h" 50#include "fscache.h"
51#include "dns_resolve.h"
52#include "pnfs.h" 51#include "pnfs.h"
53#include "nfs.h" 52#include "nfs.h"
54#include "netns.h" 53#include "netns.h"
@@ -162,11 +161,19 @@ static void nfs_zap_caches_locked(struct inode *inode)
162 161
163 memset(NFS_I(inode)->cookieverf, 0, sizeof(NFS_I(inode)->cookieverf)); 162 memset(NFS_I(inode)->cookieverf, 0, sizeof(NFS_I(inode)->cookieverf));
164 if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) { 163 if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) {
165 nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE;
166 nfs_fscache_invalidate(inode); 164 nfs_fscache_invalidate(inode);
167 } else { 165 nfsi->cache_validity |= NFS_INO_INVALID_ATTR
168 nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE; 166 | NFS_INO_INVALID_LABEL
169 } 167 | NFS_INO_INVALID_DATA
168 | NFS_INO_INVALID_ACCESS
169 | NFS_INO_INVALID_ACL
170 | NFS_INO_REVAL_PAGECACHE;
171 } else
172 nfsi->cache_validity |= NFS_INO_INVALID_ATTR
173 | NFS_INO_INVALID_LABEL
174 | NFS_INO_INVALID_ACCESS
175 | NFS_INO_INVALID_ACL
176 | NFS_INO_REVAL_PAGECACHE;
170} 177}
171 178
172void nfs_zap_caches(struct inode *inode) 179void nfs_zap_caches(struct inode *inode)
@@ -257,12 +264,72 @@ nfs_init_locked(struct inode *inode, void *opaque)
257 return 0; 264 return 0;
258} 265}
259 266
267#ifdef CONFIG_NFS_V4_SECURITY_LABEL
268void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr,
269 struct nfs4_label *label)
270{
271 int error;
272
273 if (label == NULL)
274 return;
275
276 if (nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL) == 0)
277 return;
278
279 if (NFS_SERVER(inode)->nfs_client->cl_minorversion < 2)
280 return;
281
282 if ((fattr->valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL) && inode->i_security) {
283 error = security_inode_notifysecctx(inode, label->label,
284 label->len);
285 if (error)
286 printk(KERN_ERR "%s() %s %d "
287 "security_inode_notifysecctx() %d\n",
288 __func__,
289 (char *)label->label,
290 label->len, error);
291 }
292}
293
294struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags)
295{
296 struct nfs4_label *label = NULL;
297 int minor_version = server->nfs_client->cl_minorversion;
298
299 if (minor_version < 2)
300 return label;
301
302 if (!(server->caps & NFS_CAP_SECURITY_LABEL))
303 return label;
304
305 label = kzalloc(sizeof(struct nfs4_label), flags);
306 if (label == NULL)
307 return ERR_PTR(-ENOMEM);
308
309 label->label = kzalloc(NFS4_MAXLABELLEN, flags);
310 if (label->label == NULL) {
311 kfree(label);
312 return ERR_PTR(-ENOMEM);
313 }
314 label->len = NFS4_MAXLABELLEN;
315
316 return label;
317}
318EXPORT_SYMBOL_GPL(nfs4_label_alloc);
319#else
320void inline nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr,
321 struct nfs4_label *label)
322{
323}
324#endif
325EXPORT_SYMBOL_GPL(nfs_setsecurity);
326
260/* 327/*
261 * This is our front-end to iget that looks up inodes by file handle 328 * This is our front-end to iget that looks up inodes by file handle
262 * instead of inode number. 329 * instead of inode number.
263 */ 330 */
264struct inode * 331struct inode *
265nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) 332nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, struct nfs4_label *label)
266{ 333{
267 struct nfs_find_desc desc = { 334 struct nfs_find_desc desc = {
268 .fh = fh, 335 .fh = fh,
@@ -384,6 +451,9 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
384 */ 451 */
385 inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); 452 inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
386 } 453 }
454
455 nfs_setsecurity(inode, fattr, label);
456
387 nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); 457 nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
388 nfsi->attrtimeo_timestamp = now; 458 nfsi->attrtimeo_timestamp = now;
389 nfsi->access_cache = RB_ROOT; 459 nfsi->access_cache = RB_ROOT;
@@ -393,6 +463,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
393 unlock_new_inode(inode); 463 unlock_new_inode(inode);
394 } else 464 } else
395 nfs_refresh_inode(inode, fattr); 465 nfs_refresh_inode(inode, fattr);
466 nfs_setsecurity(inode, fattr, label);
396 dprintk("NFS: nfs_fhget(%s/%Ld fh_crc=0x%08x ct=%d)\n", 467 dprintk("NFS: nfs_fhget(%s/%Ld fh_crc=0x%08x ct=%d)\n",
397 inode->i_sb->s_id, 468 inode->i_sb->s_id,
398 (long long)NFS_FILEID(inode), 469 (long long)NFS_FILEID(inode),
@@ -449,7 +520,7 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
449 NFS_PROTO(inode)->return_delegation(inode); 520 NFS_PROTO(inode)->return_delegation(inode);
450 error = NFS_PROTO(inode)->setattr(dentry, fattr, attr); 521 error = NFS_PROTO(inode)->setattr(dentry, fattr, attr);
451 if (error == 0) 522 if (error == 0)
452 nfs_refresh_inode(inode, fattr); 523 error = nfs_refresh_inode(inode, fattr);
453 nfs_free_fattr(fattr); 524 nfs_free_fattr(fattr);
454out: 525out:
455 return error; 526 return error;
@@ -713,16 +784,23 @@ EXPORT_SYMBOL_GPL(put_nfs_open_context);
713 * Ensure that mmap has a recent RPC credential for use when writing out 784 * Ensure that mmap has a recent RPC credential for use when writing out
714 * shared pages 785 * shared pages
715 */ 786 */
716void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx) 787void nfs_inode_attach_open_context(struct nfs_open_context *ctx)
717{ 788{
718 struct inode *inode = file_inode(filp); 789 struct inode *inode = ctx->dentry->d_inode;
719 struct nfs_inode *nfsi = NFS_I(inode); 790 struct nfs_inode *nfsi = NFS_I(inode);
720 791
721 filp->private_data = get_nfs_open_context(ctx);
722 spin_lock(&inode->i_lock); 792 spin_lock(&inode->i_lock);
723 list_add(&ctx->list, &nfsi->open_files); 793 list_add(&ctx->list, &nfsi->open_files);
724 spin_unlock(&inode->i_lock); 794 spin_unlock(&inode->i_lock);
725} 795}
796EXPORT_SYMBOL_GPL(nfs_inode_attach_open_context);
797
798void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx)
799{
800 filp->private_data = get_nfs_open_context(ctx);
801 if (list_empty(&ctx->list))
802 nfs_inode_attach_open_context(ctx);
803}
726EXPORT_SYMBOL_GPL(nfs_file_set_open_context); 804EXPORT_SYMBOL_GPL(nfs_file_set_open_context);
727 805
728/* 806/*
@@ -748,10 +826,11 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_c
748 826
749static void nfs_file_clear_open_context(struct file *filp) 827static void nfs_file_clear_open_context(struct file *filp)
750{ 828{
751 struct inode *inode = file_inode(filp);
752 struct nfs_open_context *ctx = nfs_file_open_context(filp); 829 struct nfs_open_context *ctx = nfs_file_open_context(filp);
753 830
754 if (ctx) { 831 if (ctx) {
832 struct inode *inode = ctx->dentry->d_inode;
833
755 filp->private_data = NULL; 834 filp->private_data = NULL;
756 spin_lock(&inode->i_lock); 835 spin_lock(&inode->i_lock);
757 list_move_tail(&ctx->list, &NFS_I(inode)->open_files); 836 list_move_tail(&ctx->list, &NFS_I(inode)->open_files);
@@ -790,6 +869,7 @@ int
790__nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) 869__nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
791{ 870{
792 int status = -ESTALE; 871 int status = -ESTALE;
872 struct nfs4_label *label = NULL;
793 struct nfs_fattr *fattr = NULL; 873 struct nfs_fattr *fattr = NULL;
794 struct nfs_inode *nfsi = NFS_I(inode); 874 struct nfs_inode *nfsi = NFS_I(inode);
795 875
@@ -807,7 +887,14 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
807 goto out; 887 goto out;
808 888
809 nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); 889 nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
810 status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr); 890
891 label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL);
892 if (IS_ERR(label)) {
893 status = PTR_ERR(label);
894 goto out;
895 }
896
897 status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr, label);
811 if (status != 0) { 898 if (status != 0) {
812 dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n", 899 dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n",
813 inode->i_sb->s_id, 900 inode->i_sb->s_id,
@@ -817,7 +904,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
817 if (!S_ISDIR(inode->i_mode)) 904 if (!S_ISDIR(inode->i_mode))
818 set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); 905 set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
819 } 906 }
820 goto out; 907 goto err_out;
821 } 908 }
822 909
823 status = nfs_refresh_inode(inode, fattr); 910 status = nfs_refresh_inode(inode, fattr);
@@ -825,7 +912,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
825 dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n", 912 dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n",
826 inode->i_sb->s_id, 913 inode->i_sb->s_id,
827 (long long)NFS_FILEID(inode), status); 914 (long long)NFS_FILEID(inode), status);
828 goto out; 915 goto err_out;
829 } 916 }
830 917
831 if (nfsi->cache_validity & NFS_INO_INVALID_ACL) 918 if (nfsi->cache_validity & NFS_INO_INVALID_ACL)
@@ -835,7 +922,9 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
835 inode->i_sb->s_id, 922 inode->i_sb->s_id,
836 (long long)NFS_FILEID(inode)); 923 (long long)NFS_FILEID(inode));
837 924
838 out: 925err_out:
926 nfs4_label_free(label);
927out:
839 nfs_free_fattr(fattr); 928 nfs_free_fattr(fattr);
840 return status; 929 return status;
841} 930}
@@ -847,7 +936,7 @@ int nfs_attribute_timeout(struct inode *inode)
847 return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo); 936 return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo);
848} 937}
849 938
850static int nfs_attribute_cache_expired(struct inode *inode) 939int nfs_attribute_cache_expired(struct inode *inode)
851{ 940{
852 if (nfs_have_delegated_attributes(inode)) 941 if (nfs_have_delegated_attributes(inode))
853 return 0; 942 return 0;
@@ -863,7 +952,8 @@ static int nfs_attribute_cache_expired(struct inode *inode)
863 */ 952 */
864int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) 953int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
865{ 954{
866 if (!(NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATTR) 955 if (!(NFS_I(inode)->cache_validity &
956 (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL))
867 && !nfs_attribute_cache_expired(inode)) 957 && !nfs_attribute_cache_expired(inode))
868 return NFS_STALE(inode) ? -ESTALE : 0; 958 return NFS_STALE(inode) ? -ESTALE : 0;
869 return __nfs_revalidate_inode(server, inode); 959 return __nfs_revalidate_inode(server, inode);
@@ -1243,6 +1333,7 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1243 spin_lock(&inode->i_lock); 1333 spin_lock(&inode->i_lock);
1244 status = nfs_post_op_update_inode_locked(inode, fattr); 1334 status = nfs_post_op_update_inode_locked(inode, fattr);
1245 spin_unlock(&inode->i_lock); 1335 spin_unlock(&inode->i_lock);
1336
1246 return status; 1337 return status;
1247} 1338}
1248EXPORT_SYMBOL_GPL(nfs_post_op_update_inode); 1339EXPORT_SYMBOL_GPL(nfs_post_op_update_inode);
@@ -1483,7 +1574,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1483 inode->i_blocks = fattr->du.nfs2.blocks; 1574 inode->i_blocks = fattr->du.nfs2.blocks;
1484 1575
1485 /* Update attrtimeo value if we're out of the unstable period */ 1576 /* Update attrtimeo value if we're out of the unstable period */
1486 if (invalid & NFS_INO_INVALID_ATTR) { 1577 if (invalid & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL)) {
1487 nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE); 1578 nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE);
1488 nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); 1579 nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
1489 nfsi->attrtimeo_timestamp = now; 1580 nfsi->attrtimeo_timestamp = now;
@@ -1496,6 +1587,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1496 } 1587 }
1497 } 1588 }
1498 invalid &= ~NFS_INO_INVALID_ATTR; 1589 invalid &= ~NFS_INO_INVALID_ATTR;
1590 invalid &= ~NFS_INO_INVALID_LABEL;
1499 /* Don't invalidate the data if we were to blame */ 1591 /* Don't invalidate the data if we were to blame */
1500 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) 1592 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
1501 || S_ISLNK(inode->i_mode))) 1593 || S_ISLNK(inode->i_mode)))
@@ -1638,12 +1730,11 @@ EXPORT_SYMBOL_GPL(nfs_net_id);
1638static int nfs_net_init(struct net *net) 1730static int nfs_net_init(struct net *net)
1639{ 1731{
1640 nfs_clients_init(net); 1732 nfs_clients_init(net);
1641 return nfs_dns_resolver_cache_init(net); 1733 return 0;
1642} 1734}
1643 1735
1644static void nfs_net_exit(struct net *net) 1736static void nfs_net_exit(struct net *net)
1645{ 1737{
1646 nfs_dns_resolver_cache_destroy(net);
1647 nfs_cleanup_cb_ident_idr(net); 1738 nfs_cleanup_cb_ident_idr(net);
1648} 1739}
1649 1740
@@ -1661,10 +1752,6 @@ static int __init init_nfs_fs(void)
1661{ 1752{
1662 int err; 1753 int err;
1663 1754
1664 err = nfs_dns_resolver_init();
1665 if (err < 0)
1666 goto out10;;
1667
1668 err = register_pernet_subsys(&nfs_net_ops); 1755 err = register_pernet_subsys(&nfs_net_ops);
1669 if (err < 0) 1756 if (err < 0)
1670 goto out9; 1757 goto out9;
@@ -1730,8 +1817,6 @@ out7:
1730out8: 1817out8:
1731 unregister_pernet_subsys(&nfs_net_ops); 1818 unregister_pernet_subsys(&nfs_net_ops);
1732out9: 1819out9:
1733 nfs_dns_resolver_destroy();
1734out10:
1735 return err; 1820 return err;
1736} 1821}
1737 1822
@@ -1744,7 +1829,6 @@ static void __exit exit_nfs_fs(void)
1744 nfs_destroy_nfspagecache(); 1829 nfs_destroy_nfspagecache();
1745 nfs_fscache_unregister(); 1830 nfs_fscache_unregister();
1746 unregister_pernet_subsys(&nfs_net_ops); 1831 unregister_pernet_subsys(&nfs_net_ops);
1747 nfs_dns_resolver_destroy();
1748#ifdef CONFIG_PROC_FS 1832#ifdef CONFIG_PROC_FS
1749 rpc_proc_unregister(&init_net, "nfs"); 1833 rpc_proc_unregister(&init_net, "nfs");
1750#endif 1834#endif
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 91e59a39fc08..3c8373f90ab3 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -165,7 +165,7 @@ extern void nfs_free_client(struct nfs_client *);
165extern struct nfs_client *nfs4_find_client_ident(struct net *, int); 165extern struct nfs_client *nfs4_find_client_ident(struct net *, int);
166extern struct nfs_client * 166extern struct nfs_client *
167nfs4_find_client_sessionid(struct net *, const struct sockaddr *, 167nfs4_find_client_sessionid(struct net *, const struct sockaddr *,
168 struct nfs4_sessionid *); 168 struct nfs4_sessionid *, u32);
169extern struct nfs_server *nfs_create_server(struct nfs_mount_info *, 169extern struct nfs_server *nfs_create_server(struct nfs_mount_info *,
170 struct nfs_subversion *); 170 struct nfs_subversion *);
171extern struct nfs_server *nfs4_create_server( 171extern struct nfs_server *nfs4_create_server(
@@ -255,6 +255,7 @@ extern int nfs4_decode_dirent(struct xdr_stream *,
255#ifdef CONFIG_NFS_V4_1 255#ifdef CONFIG_NFS_V4_1
256extern const u32 nfs41_maxread_overhead; 256extern const u32 nfs41_maxread_overhead;
257extern const u32 nfs41_maxwrite_overhead; 257extern const u32 nfs41_maxwrite_overhead;
258extern const u32 nfs41_maxgetdevinfo_overhead;
258#endif 259#endif
259 260
260/* nfs4proc.c */ 261/* nfs4proc.c */
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 91a6faf811ac..99a45283b9ee 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -139,7 +139,10 @@ struct mnt_fhstatus {
139 * nfs_mount - Obtain an NFS file handle for the given host and path 139 * nfs_mount - Obtain an NFS file handle for the given host and path
140 * @info: pointer to mount request arguments 140 * @info: pointer to mount request arguments
141 * 141 *
142 * Uses default timeout parameters specified by underlying transport. 142 * Uses default timeout parameters specified by underlying transport. On
143 * successful return, the auth_flavs list and auth_flav_len will be populated
144 * with the list from the server or a faked-up list if the server didn't
145 * provide one.
143 */ 146 */
144int nfs_mount(struct nfs_mount_request *info) 147int nfs_mount(struct nfs_mount_request *info)
145{ 148{
@@ -195,6 +198,15 @@ int nfs_mount(struct nfs_mount_request *info)
195 dprintk("NFS: MNT request succeeded\n"); 198 dprintk("NFS: MNT request succeeded\n");
196 status = 0; 199 status = 0;
197 200
201 /*
202 * If the server didn't provide a flavor list, allow the
203 * client to try any flavor.
204 */
205 if (info->version != NFS_MNT3_VERSION || *info->auth_flav_len == 0) {
206 dprintk("NFS: Faking up auth_flavs list\n");
207 info->auth_flavs[0] = RPC_AUTH_NULL;
208 *info->auth_flav_len = 1;
209 }
198out: 210out:
199 return status; 211 return status;
200 212
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index fc8dc20fdeb9..348b535cd786 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -280,7 +280,7 @@ struct vfsmount *nfs_submount(struct nfs_server *server, struct dentry *dentry,
280 struct dentry *parent = dget_parent(dentry); 280 struct dentry *parent = dget_parent(dentry);
281 281
282 /* Look it up again to get its attributes */ 282 /* Look it up again to get its attributes */
283 err = server->nfs_client->rpc_ops->lookup(parent->d_inode, &dentry->d_name, fh, fattr); 283 err = server->nfs_client->rpc_ops->lookup(parent->d_inode, &dentry->d_name, fh, fattr, NULL);
284 dput(parent); 284 dput(parent);
285 if (err != 0) 285 if (err != 0)
286 return ERR_PTR(err); 286 return ERR_PTR(err);
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index ce90eb4775c2..f5c84c3efbca 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -98,7 +98,7 @@ nfs3_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
98 */ 98 */
99static int 99static int
100nfs3_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, 100nfs3_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
101 struct nfs_fattr *fattr) 101 struct nfs_fattr *fattr, struct nfs4_label *label)
102{ 102{
103 struct rpc_message msg = { 103 struct rpc_message msg = {
104 .rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR], 104 .rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR],
@@ -143,7 +143,8 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
143 143
144static int 144static int
145nfs3_proc_lookup(struct inode *dir, struct qstr *name, 145nfs3_proc_lookup(struct inode *dir, struct qstr *name,
146 struct nfs_fh *fhandle, struct nfs_fattr *fattr) 146 struct nfs_fh *fhandle, struct nfs_fattr *fattr,
147 struct nfs4_label *label)
147{ 148{
148 struct nfs3_diropargs arg = { 149 struct nfs3_diropargs arg = {
149 .fh = NFS_FH(dir), 150 .fh = NFS_FH(dir),
@@ -300,7 +301,7 @@ static int nfs3_do_create(struct inode *dir, struct dentry *dentry, struct nfs3_
300 status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0); 301 status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0);
301 nfs_post_op_update_inode(dir, data->res.dir_attr); 302 nfs_post_op_update_inode(dir, data->res.dir_attr);
302 if (status == 0) 303 if (status == 0)
303 status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); 304 status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);
304 return status; 305 return status;
305} 306}
306 307
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index a1dd768d0a35..ee81e354bce7 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -194,7 +194,7 @@ struct nfs4_state_recovery_ops {
194 int (*recover_lock)(struct nfs4_state *, struct file_lock *); 194 int (*recover_lock)(struct nfs4_state *, struct file_lock *);
195 int (*establish_clid)(struct nfs_client *, struct rpc_cred *); 195 int (*establish_clid)(struct nfs_client *, struct rpc_cred *);
196 struct rpc_cred * (*get_clid_cred)(struct nfs_client *); 196 struct rpc_cred * (*get_clid_cred)(struct nfs_client *);
197 int (*reclaim_complete)(struct nfs_client *); 197 int (*reclaim_complete)(struct nfs_client *, struct rpc_cred *);
198 int (*detect_trunking)(struct nfs_client *, struct nfs_client **, 198 int (*detect_trunking)(struct nfs_client *, struct nfs_client **,
199 struct rpc_cred *); 199 struct rpc_cred *);
200}; 200};
@@ -303,10 +303,10 @@ is_ds_client(struct nfs_client *clp)
303extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[]; 303extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[];
304 304
305extern const u32 nfs4_fattr_bitmap[3]; 305extern const u32 nfs4_fattr_bitmap[3];
306extern const u32 nfs4_statfs_bitmap[2]; 306extern const u32 nfs4_statfs_bitmap[3];
307extern const u32 nfs4_pathconf_bitmap[2]; 307extern const u32 nfs4_pathconf_bitmap[3];
308extern const u32 nfs4_fsinfo_bitmap[3]; 308extern const u32 nfs4_fsinfo_bitmap[3];
309extern const u32 nfs4_fs_locations_bitmap[2]; 309extern const u32 nfs4_fs_locations_bitmap[3];
310 310
311void nfs4_free_client(struct nfs_client *); 311void nfs4_free_client(struct nfs_client *);
312 312
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 4cbad5d6b276..90dce91dd5b5 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -66,6 +66,11 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init)
66 if (err) 66 if (err)
67 goto error; 67 goto error;
68 68
69 if (cl_init->minorversion > NFS4_MAX_MINOR_VERSION) {
70 err = -EINVAL;
71 goto error;
72 }
73
69 spin_lock_init(&clp->cl_lock); 74 spin_lock_init(&clp->cl_lock);
70 INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state); 75 INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state);
71 rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client"); 76 rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client");
@@ -562,14 +567,14 @@ static bool nfs4_cb_match_client(const struct sockaddr *addr,
562 */ 567 */
563struct nfs_client * 568struct nfs_client *
564nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr, 569nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr,
565 struct nfs4_sessionid *sid) 570 struct nfs4_sessionid *sid, u32 minorversion)
566{ 571{
567 struct nfs_client *clp; 572 struct nfs_client *clp;
568 struct nfs_net *nn = net_generic(net, nfs_net_id); 573 struct nfs_net *nn = net_generic(net, nfs_net_id);
569 574
570 spin_lock(&nn->nfs_client_lock); 575 spin_lock(&nn->nfs_client_lock);
571 list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) { 576 list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) {
572 if (nfs4_cb_match_client(addr, clp, 1) == false) 577 if (nfs4_cb_match_client(addr, clp, minorversion) == false)
573 continue; 578 continue;
574 579
575 if (!nfs4_has_session(clp)) 580 if (!nfs4_has_session(clp))
@@ -592,7 +597,7 @@ nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr,
592 597
593struct nfs_client * 598struct nfs_client *
594nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr, 599nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr,
595 struct nfs4_sessionid *sid) 600 struct nfs4_sessionid *sid, u32 minorversion)
596{ 601{
597 return NULL; 602 return NULL;
598} 603}
@@ -626,6 +631,8 @@ static int nfs4_set_client(struct nfs_server *server,
626 631
627 if (server->flags & NFS_MOUNT_NORESVPORT) 632 if (server->flags & NFS_MOUNT_NORESVPORT)
628 set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); 633 set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
634 if (server->options & NFS_OPTION_MIGRATION)
635 set_bit(NFS_CS_MIGRATION, &cl_init.init_flags);
629 636
630 /* Allocate or find a client reference we can use */ 637 /* Allocate or find a client reference we can use */
631 clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour); 638 clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour);
@@ -730,7 +737,7 @@ static int nfs4_server_common_setup(struct nfs_server *server,
730 return -ENOMEM; 737 return -ENOMEM;
731 738
732 /* We must ensure the session is initialised first */ 739 /* We must ensure the session is initialised first */
733 error = nfs4_init_session(server); 740 error = nfs4_init_session(server->nfs_client);
734 if (error < 0) 741 if (error < 0)
735 goto out; 742 goto out;
736 743
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 13e6bb3e3fe5..e5b804dd944c 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -69,7 +69,6 @@ nfs4_file_open(struct inode *inode, struct file *filp)
69 goto out_drop; 69 goto out_drop;
70 } 70 }
71 } 71 }
72 iput(inode);
73 if (inode != dentry->d_inode) 72 if (inode != dentry->d_inode)
74 goto out_drop; 73 goto out_drop;
75 74
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 22d10623f5ee..17ed87ef9de8 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -643,7 +643,8 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
643 d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->pnfs_curr_ld, 643 d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->pnfs_curr_ld,
644 NFS_SERVER(lo->plh_inode)->nfs_client, id); 644 NFS_SERVER(lo->plh_inode)->nfs_client, id);
645 if (d == NULL) { 645 if (d == NULL) {
646 dsaddr = filelayout_get_device_info(lo->plh_inode, id, gfp_flags); 646 dsaddr = filelayout_get_device_info(lo->plh_inode, id,
647 lo->plh_lc_cred, gfp_flags);
647 if (dsaddr == NULL) 648 if (dsaddr == NULL)
648 goto out; 649 goto out;
649 } else 650 } else
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index 235ff952d3c8..cebd20e7e923 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -150,6 +150,7 @@ struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
150extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); 150extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
151extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); 151extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
152struct nfs4_file_layout_dsaddr * 152struct nfs4_file_layout_dsaddr *
153filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags); 153filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id,
154 struct rpc_cred *cred, gfp_t gfp_flags);
154 155
155#endif /* FS_NFS_NFS4FILELAYOUT_H */ 156#endif /* FS_NFS_NFS4FILELAYOUT_H */
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index 661a0f611215..95604f64cab8 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -668,7 +668,10 @@ decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_fl
668 * of available devices, and return it. 668 * of available devices, and return it.
669 */ 669 */
670struct nfs4_file_layout_dsaddr * 670struct nfs4_file_layout_dsaddr *
671filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags) 671filelayout_get_device_info(struct inode *inode,
672 struct nfs4_deviceid *dev_id,
673 struct rpc_cred *cred,
674 gfp_t gfp_flags)
672{ 675{
673 struct pnfs_device *pdev = NULL; 676 struct pnfs_device *pdev = NULL;
674 u32 max_resp_sz; 677 u32 max_resp_sz;
@@ -708,8 +711,9 @@ filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gf
708 pdev->pgbase = 0; 711 pdev->pgbase = 0;
709 pdev->pglen = max_resp_sz; 712 pdev->pglen = max_resp_sz;
710 pdev->mincount = 0; 713 pdev->mincount = 0;
714 pdev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
711 715
712 rc = nfs4_proc_getdeviceinfo(server, pdev); 716 rc = nfs4_proc_getdeviceinfo(server, pdev, cred);
713 dprintk("%s getdevice info returns %d\n", __func__, rc); 717 dprintk("%s getdevice info returns %d\n", __func__, rc);
714 if (rc) 718 if (rc)
715 goto out_free; 719 goto out_free;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 28241a42f363..cf11799297c4 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -77,15 +77,68 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
77static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); 77static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
78static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); 78static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *);
79static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr); 79static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr);
80static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *); 80static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, struct nfs4_label *label);
81static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr); 81static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label);
82static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, 82static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
83 struct nfs_fattr *fattr, struct iattr *sattr, 83 struct nfs_fattr *fattr, struct iattr *sattr,
84 struct nfs4_state *state); 84 struct nfs4_state *state, struct nfs4_label *ilabel,
85 struct nfs4_label *olabel);
85#ifdef CONFIG_NFS_V4_1 86#ifdef CONFIG_NFS_V4_1
86static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *); 87static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *,
87static int nfs41_free_stateid(struct nfs_server *, nfs4_stateid *); 88 struct rpc_cred *);
89static int nfs41_free_stateid(struct nfs_server *, nfs4_stateid *,
90 struct rpc_cred *);
88#endif 91#endif
92
93#ifdef CONFIG_NFS_V4_SECURITY_LABEL
94static inline struct nfs4_label *
95nfs4_label_init_security(struct inode *dir, struct dentry *dentry,
96 struct iattr *sattr, struct nfs4_label *label)
97{
98 int err;
99
100 if (label == NULL)
101 return NULL;
102
103 if (nfs_server_capable(dir, NFS_CAP_SECURITY_LABEL) == 0)
104 return NULL;
105
106 if (NFS_SERVER(dir)->nfs_client->cl_minorversion < 2)
107 return NULL;
108
109 err = security_dentry_init_security(dentry, sattr->ia_mode,
110 &dentry->d_name, (void **)&label->label, &label->len);
111 if (err == 0)
112 return label;
113
114 return NULL;
115}
116static inline void
117nfs4_label_release_security(struct nfs4_label *label)
118{
119 if (label)
120 security_release_secctx(label->label, label->len);
121}
122static inline u32 *nfs4_bitmask(struct nfs_server *server, struct nfs4_label *label)
123{
124 if (label)
125 return server->attr_bitmask;
126
127 return server->attr_bitmask_nl;
128}
129#else
130static inline struct nfs4_label *
131nfs4_label_init_security(struct inode *dir, struct dentry *dentry,
132 struct iattr *sattr, struct nfs4_label *l)
133{ return NULL; }
134static inline void
135nfs4_label_release_security(struct nfs4_label *label)
136{ return; }
137static inline u32 *
138nfs4_bitmask(struct nfs_server *server, struct nfs4_label *label)
139{ return server->attr_bitmask; }
140#endif
141
89/* Prevent leaks of NFSv4 errors into userland */ 142/* Prevent leaks of NFSv4 errors into userland */
90static int nfs4_map_errors(int err) 143static int nfs4_map_errors(int err)
91{ 144{
@@ -134,7 +187,10 @@ const u32 nfs4_fattr_bitmap[3] = {
134 | FATTR4_WORD1_SPACE_USED 187 | FATTR4_WORD1_SPACE_USED
135 | FATTR4_WORD1_TIME_ACCESS 188 | FATTR4_WORD1_TIME_ACCESS
136 | FATTR4_WORD1_TIME_METADATA 189 | FATTR4_WORD1_TIME_METADATA
137 | FATTR4_WORD1_TIME_MODIFY 190 | FATTR4_WORD1_TIME_MODIFY,
191#ifdef CONFIG_NFS_V4_SECURITY_LABEL
192 FATTR4_WORD2_SECURITY_LABEL
193#endif
138}; 194};
139 195
140static const u32 nfs4_pnfs_open_bitmap[3] = { 196static const u32 nfs4_pnfs_open_bitmap[3] = {
@@ -161,7 +217,7 @@ static const u32 nfs4_open_noattr_bitmap[3] = {
161 | FATTR4_WORD0_FILEID, 217 | FATTR4_WORD0_FILEID,
162}; 218};
163 219
164const u32 nfs4_statfs_bitmap[2] = { 220const u32 nfs4_statfs_bitmap[3] = {
165 FATTR4_WORD0_FILES_AVAIL 221 FATTR4_WORD0_FILES_AVAIL
166 | FATTR4_WORD0_FILES_FREE 222 | FATTR4_WORD0_FILES_FREE
167 | FATTR4_WORD0_FILES_TOTAL, 223 | FATTR4_WORD0_FILES_TOTAL,
@@ -170,7 +226,7 @@ const u32 nfs4_statfs_bitmap[2] = {
170 | FATTR4_WORD1_SPACE_TOTAL 226 | FATTR4_WORD1_SPACE_TOTAL
171}; 227};
172 228
173const u32 nfs4_pathconf_bitmap[2] = { 229const u32 nfs4_pathconf_bitmap[3] = {
174 FATTR4_WORD0_MAXLINK 230 FATTR4_WORD0_MAXLINK
175 | FATTR4_WORD0_MAXNAME, 231 | FATTR4_WORD0_MAXNAME,
176 0 232 0
@@ -185,7 +241,7 @@ const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE
185 FATTR4_WORD2_LAYOUT_BLKSIZE 241 FATTR4_WORD2_LAYOUT_BLKSIZE
186}; 242};
187 243
188const u32 nfs4_fs_locations_bitmap[2] = { 244const u32 nfs4_fs_locations_bitmap[3] = {
189 FATTR4_WORD0_TYPE 245 FATTR4_WORD0_TYPE
190 | FATTR4_WORD0_CHANGE 246 | FATTR4_WORD0_CHANGE
191 | FATTR4_WORD0_SIZE 247 | FATTR4_WORD0_SIZE
@@ -201,7 +257,7 @@ const u32 nfs4_fs_locations_bitmap[2] = {
201 | FATTR4_WORD1_TIME_ACCESS 257 | FATTR4_WORD1_TIME_ACCESS
202 | FATTR4_WORD1_TIME_METADATA 258 | FATTR4_WORD1_TIME_METADATA
203 | FATTR4_WORD1_TIME_MODIFY 259 | FATTR4_WORD1_TIME_MODIFY
204 | FATTR4_WORD1_MOUNTED_ON_FILEID 260 | FATTR4_WORD1_MOUNTED_ON_FILEID,
205}; 261};
206 262
207static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dentry, 263static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dentry,
@@ -762,6 +818,7 @@ struct nfs4_opendata {
762 struct nfs4_string owner_name; 818 struct nfs4_string owner_name;
763 struct nfs4_string group_name; 819 struct nfs4_string group_name;
764 struct nfs_fattr f_attr; 820 struct nfs_fattr f_attr;
821 struct nfs4_label *f_label;
765 struct dentry *dir; 822 struct dentry *dir;
766 struct dentry *dentry; 823 struct dentry *dentry;
767 struct nfs4_state_owner *owner; 824 struct nfs4_state_owner *owner;
@@ -807,6 +864,7 @@ nfs4_map_atomic_open_claim(struct nfs_server *server,
807static void nfs4_init_opendata_res(struct nfs4_opendata *p) 864static void nfs4_init_opendata_res(struct nfs4_opendata *p)
808{ 865{
809 p->o_res.f_attr = &p->f_attr; 866 p->o_res.f_attr = &p->f_attr;
867 p->o_res.f_label = p->f_label;
810 p->o_res.seqid = p->o_arg.seqid; 868 p->o_res.seqid = p->o_arg.seqid;
811 p->c_res.seqid = p->c_arg.seqid; 869 p->c_res.seqid = p->c_arg.seqid;
812 p->o_res.server = p->o_arg.server; 870 p->o_res.server = p->o_arg.server;
@@ -818,6 +876,7 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p)
818static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, 876static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
819 struct nfs4_state_owner *sp, fmode_t fmode, int flags, 877 struct nfs4_state_owner *sp, fmode_t fmode, int flags,
820 const struct iattr *attrs, 878 const struct iattr *attrs,
879 struct nfs4_label *label,
821 enum open_claim_type4 claim, 880 enum open_claim_type4 claim,
822 gfp_t gfp_mask) 881 gfp_t gfp_mask)
823{ 882{
@@ -829,9 +888,14 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
829 p = kzalloc(sizeof(*p), gfp_mask); 888 p = kzalloc(sizeof(*p), gfp_mask);
830 if (p == NULL) 889 if (p == NULL)
831 goto err; 890 goto err;
891
892 p->f_label = nfs4_label_alloc(server, gfp_mask);
893 if (IS_ERR(p->f_label))
894 goto err_free_p;
895
832 p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid, gfp_mask); 896 p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid, gfp_mask);
833 if (p->o_arg.seqid == NULL) 897 if (p->o_arg.seqid == NULL)
834 goto err_free; 898 goto err_free_label;
835 nfs_sb_active(dentry->d_sb); 899 nfs_sb_active(dentry->d_sb);
836 p->dentry = dget(dentry); 900 p->dentry = dget(dentry);
837 p->dir = parent; 901 p->dir = parent;
@@ -852,8 +916,9 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
852 p->o_arg.id.uniquifier = sp->so_seqid.owner_id; 916 p->o_arg.id.uniquifier = sp->so_seqid.owner_id;
853 p->o_arg.name = &dentry->d_name; 917 p->o_arg.name = &dentry->d_name;
854 p->o_arg.server = server; 918 p->o_arg.server = server;
855 p->o_arg.bitmask = server->attr_bitmask; 919 p->o_arg.bitmask = nfs4_bitmask(server, label);
856 p->o_arg.open_bitmap = &nfs4_fattr_bitmap[0]; 920 p->o_arg.open_bitmap = &nfs4_fattr_bitmap[0];
921 p->o_arg.label = label;
857 p->o_arg.claim = nfs4_map_atomic_open_claim(server, claim); 922 p->o_arg.claim = nfs4_map_atomic_open_claim(server, claim);
858 switch (p->o_arg.claim) { 923 switch (p->o_arg.claim) {
859 case NFS4_OPEN_CLAIM_NULL: 924 case NFS4_OPEN_CLAIM_NULL:
@@ -884,7 +949,10 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
884 nfs4_init_opendata_res(p); 949 nfs4_init_opendata_res(p);
885 kref_init(&p->kref); 950 kref_init(&p->kref);
886 return p; 951 return p;
887err_free: 952
953err_free_label:
954 nfs4_label_free(p->f_label);
955err_free_p:
888 kfree(p); 956 kfree(p);
889err: 957err:
890 dput(parent); 958 dput(parent);
@@ -901,6 +969,9 @@ static void nfs4_opendata_free(struct kref *kref)
901 if (p->state != NULL) 969 if (p->state != NULL)
902 nfs4_put_open_state(p->state); 970 nfs4_put_open_state(p->state);
903 nfs4_put_state_owner(p->owner); 971 nfs4_put_state_owner(p->owner);
972
973 nfs4_label_free(p->f_label);
974
904 dput(p->dir); 975 dput(p->dir);
905 dput(p->dentry); 976 dput(p->dentry);
906 nfs_sb_deactive(sb); 977 nfs_sb_deactive(sb);
@@ -1179,6 +1250,8 @@ _nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data)
1179 if (ret) 1250 if (ret)
1180 goto err; 1251 goto err;
1181 1252
1253 nfs_setsecurity(inode, &data->f_attr, data->f_label);
1254
1182 if (data->o_res.delegation_type != 0) 1255 if (data->o_res.delegation_type != 0)
1183 nfs4_opendata_check_deleg(data, state); 1256 nfs4_opendata_check_deleg(data, state);
1184 update_open_stateid(state, &data->o_res.stateid, NULL, 1257 update_open_stateid(state, &data->o_res.stateid, NULL,
@@ -1205,7 +1278,7 @@ _nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
1205 ret = -EAGAIN; 1278 ret = -EAGAIN;
1206 if (!(data->f_attr.valid & NFS_ATTR_FATTR)) 1279 if (!(data->f_attr.valid & NFS_ATTR_FATTR))
1207 goto err; 1280 goto err;
1208 inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh, &data->f_attr); 1281 inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh, &data->f_attr, data->f_label);
1209 ret = PTR_ERR(inode); 1282 ret = PTR_ERR(inode);
1210 if (IS_ERR(inode)) 1283 if (IS_ERR(inode))
1211 goto err; 1284 goto err;
@@ -1258,7 +1331,7 @@ static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context
1258 struct nfs4_opendata *opendata; 1331 struct nfs4_opendata *opendata;
1259 1332
1260 opendata = nfs4_opendata_alloc(ctx->dentry, state->owner, 0, 0, 1333 opendata = nfs4_opendata_alloc(ctx->dentry, state->owner, 0, 0,
1261 NULL, claim, GFP_NOFS); 1334 NULL, NULL, claim, GFP_NOFS);
1262 if (opendata == NULL) 1335 if (opendata == NULL)
1263 return ERR_PTR(-ENOMEM); 1336 return ERR_PTR(-ENOMEM);
1264 opendata->state = state; 1337 opendata->state = state;
@@ -1784,7 +1857,7 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
1784 return status; 1857 return status;
1785 } 1858 }
1786 if (!(o_res->f_attr->valid & NFS_ATTR_FATTR)) 1859 if (!(o_res->f_attr->valid & NFS_ATTR_FATTR))
1787 _nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr); 1860 _nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr, o_res->f_label);
1788 return 0; 1861 return 0;
1789} 1862}
1790 1863
@@ -1855,18 +1928,30 @@ static void nfs41_clear_delegation_stateid(struct nfs4_state *state)
1855{ 1928{
1856 struct nfs_server *server = NFS_SERVER(state->inode); 1929 struct nfs_server *server = NFS_SERVER(state->inode);
1857 nfs4_stateid *stateid = &state->stateid; 1930 nfs4_stateid *stateid = &state->stateid;
1858 int status; 1931 struct nfs_delegation *delegation;
1932 struct rpc_cred *cred = NULL;
1933 int status = -NFS4ERR_BAD_STATEID;
1859 1934
1860 /* If a state reset has been done, test_stateid is unneeded */ 1935 /* If a state reset has been done, test_stateid is unneeded */
1861 if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) 1936 if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
1862 return; 1937 return;
1863 1938
1864 status = nfs41_test_stateid(server, stateid); 1939 /* Get the delegation credential for use by test/free_stateid */
1940 rcu_read_lock();
1941 delegation = rcu_dereference(NFS_I(state->inode)->delegation);
1942 if (delegation != NULL &&
1943 nfs4_stateid_match(&delegation->stateid, stateid)) {
1944 cred = get_rpccred(delegation->cred);
1945 rcu_read_unlock();
1946 status = nfs41_test_stateid(server, stateid, cred);
1947 } else
1948 rcu_read_unlock();
1949
1865 if (status != NFS_OK) { 1950 if (status != NFS_OK) {
1866 /* Free the stateid unless the server explicitly 1951 /* Free the stateid unless the server explicitly
1867 * informs us the stateid is unrecognized. */ 1952 * informs us the stateid is unrecognized. */
1868 if (status != -NFS4ERR_BAD_STATEID) 1953 if (status != -NFS4ERR_BAD_STATEID)
1869 nfs41_free_stateid(server, stateid); 1954 nfs41_free_stateid(server, stateid, cred);
1870 nfs_remove_bad_delegation(state->inode); 1955 nfs_remove_bad_delegation(state->inode);
1871 1956
1872 write_seqlock(&state->seqlock); 1957 write_seqlock(&state->seqlock);
@@ -1874,6 +1959,9 @@ static void nfs41_clear_delegation_stateid(struct nfs4_state *state)
1874 write_sequnlock(&state->seqlock); 1959 write_sequnlock(&state->seqlock);
1875 clear_bit(NFS_DELEGATED_STATE, &state->flags); 1960 clear_bit(NFS_DELEGATED_STATE, &state->flags);
1876 } 1961 }
1962
1963 if (cred != NULL)
1964 put_rpccred(cred);
1877} 1965}
1878 1966
1879/** 1967/**
@@ -1888,6 +1976,7 @@ static int nfs41_check_open_stateid(struct nfs4_state *state)
1888{ 1976{
1889 struct nfs_server *server = NFS_SERVER(state->inode); 1977 struct nfs_server *server = NFS_SERVER(state->inode);
1890 nfs4_stateid *stateid = &state->open_stateid; 1978 nfs4_stateid *stateid = &state->open_stateid;
1979 struct rpc_cred *cred = state->owner->so_cred;
1891 int status; 1980 int status;
1892 1981
1893 /* If a state reset has been done, test_stateid is unneeded */ 1982 /* If a state reset has been done, test_stateid is unneeded */
@@ -1896,12 +1985,12 @@ static int nfs41_check_open_stateid(struct nfs4_state *state)
1896 (test_bit(NFS_O_RDWR_STATE, &state->flags) == 0)) 1985 (test_bit(NFS_O_RDWR_STATE, &state->flags) == 0))
1897 return -NFS4ERR_BAD_STATEID; 1986 return -NFS4ERR_BAD_STATEID;
1898 1987
1899 status = nfs41_test_stateid(server, stateid); 1988 status = nfs41_test_stateid(server, stateid, cred);
1900 if (status != NFS_OK) { 1989 if (status != NFS_OK) {
1901 /* Free the stateid unless the server explicitly 1990 /* Free the stateid unless the server explicitly
1902 * informs us the stateid is unrecognized. */ 1991 * informs us the stateid is unrecognized. */
1903 if (status != -NFS4ERR_BAD_STATEID) 1992 if (status != -NFS4ERR_BAD_STATEID)
1904 nfs41_free_stateid(server, stateid); 1993 nfs41_free_stateid(server, stateid, cred);
1905 1994
1906 clear_bit(NFS_O_RDONLY_STATE, &state->flags); 1995 clear_bit(NFS_O_RDONLY_STATE, &state->flags);
1907 clear_bit(NFS_O_WRONLY_STATE, &state->flags); 1996 clear_bit(NFS_O_WRONLY_STATE, &state->flags);
@@ -1942,10 +2031,11 @@ static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, struct
1942static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, 2031static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
1943 fmode_t fmode, 2032 fmode_t fmode,
1944 int flags, 2033 int flags,
1945 struct nfs4_state **res) 2034 struct nfs_open_context *ctx)
1946{ 2035{
1947 struct nfs4_state_owner *sp = opendata->owner; 2036 struct nfs4_state_owner *sp = opendata->owner;
1948 struct nfs_server *server = sp->so_server; 2037 struct nfs_server *server = sp->so_server;
2038 struct dentry *dentry;
1949 struct nfs4_state *state; 2039 struct nfs4_state *state;
1950 unsigned int seq; 2040 unsigned int seq;
1951 int ret; 2041 int ret;
@@ -1963,13 +2053,31 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
1963 if (server->caps & NFS_CAP_POSIX_LOCK) 2053 if (server->caps & NFS_CAP_POSIX_LOCK)
1964 set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); 2054 set_bit(NFS_STATE_POSIX_LOCKS, &state->flags);
1965 2055
2056 dentry = opendata->dentry;
2057 if (dentry->d_inode == NULL) {
2058 /* FIXME: Is this d_drop() ever needed? */
2059 d_drop(dentry);
2060 dentry = d_add_unique(dentry, igrab(state->inode));
2061 if (dentry == NULL) {
2062 dentry = opendata->dentry;
2063 } else if (dentry != ctx->dentry) {
2064 dput(ctx->dentry);
2065 ctx->dentry = dget(dentry);
2066 }
2067 nfs_set_verifier(dentry,
2068 nfs_save_change_attribute(opendata->dir->d_inode));
2069 }
2070
1966 ret = nfs4_opendata_access(sp->so_cred, opendata, state, fmode, flags); 2071 ret = nfs4_opendata_access(sp->so_cred, opendata, state, fmode, flags);
1967 if (ret != 0) 2072 if (ret != 0)
1968 goto out; 2073 goto out;
1969 2074
1970 if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) 2075 ctx->state = state;
1971 nfs4_schedule_stateid_recovery(server, state); 2076 if (dentry->d_inode == state->inode) {
1972 *res = state; 2077 nfs_inode_attach_open_context(ctx);
2078 if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
2079 nfs4_schedule_stateid_recovery(server, state);
2080 }
1973out: 2081out:
1974 return ret; 2082 return ret;
1975} 2083}
@@ -1978,19 +2086,21 @@ out:
1978 * Returns a referenced nfs4_state 2086 * Returns a referenced nfs4_state
1979 */ 2087 */
1980static int _nfs4_do_open(struct inode *dir, 2088static int _nfs4_do_open(struct inode *dir,
1981 struct dentry *dentry, 2089 struct nfs_open_context *ctx,
1982 fmode_t fmode,
1983 int flags, 2090 int flags,
1984 struct iattr *sattr, 2091 struct iattr *sattr,
1985 struct rpc_cred *cred, 2092 struct nfs4_label *label)
1986 struct nfs4_state **res,
1987 struct nfs4_threshold **ctx_th)
1988{ 2093{
1989 struct nfs4_state_owner *sp; 2094 struct nfs4_state_owner *sp;
1990 struct nfs4_state *state = NULL; 2095 struct nfs4_state *state = NULL;
1991 struct nfs_server *server = NFS_SERVER(dir); 2096 struct nfs_server *server = NFS_SERVER(dir);
1992 struct nfs4_opendata *opendata; 2097 struct nfs4_opendata *opendata;
2098 struct dentry *dentry = ctx->dentry;
2099 struct rpc_cred *cred = ctx->cred;
2100 struct nfs4_threshold **ctx_th = &ctx->mdsthreshold;
2101 fmode_t fmode = ctx->mode & (FMODE_READ|FMODE_WRITE|FMODE_EXEC);
1993 enum open_claim_type4 claim = NFS4_OPEN_CLAIM_NULL; 2102 enum open_claim_type4 claim = NFS4_OPEN_CLAIM_NULL;
2103 struct nfs4_label *olabel = NULL;
1994 int status; 2104 int status;
1995 2105
1996 /* Protect against reboot recovery conflicts */ 2106 /* Protect against reboot recovery conflicts */
@@ -2009,22 +2119,31 @@ static int _nfs4_do_open(struct inode *dir,
2009 if (dentry->d_inode) 2119 if (dentry->d_inode)
2010 claim = NFS4_OPEN_CLAIM_FH; 2120 claim = NFS4_OPEN_CLAIM_FH;
2011 opendata = nfs4_opendata_alloc(dentry, sp, fmode, flags, sattr, 2121 opendata = nfs4_opendata_alloc(dentry, sp, fmode, flags, sattr,
2012 claim, GFP_KERNEL); 2122 label, claim, GFP_KERNEL);
2013 if (opendata == NULL) 2123 if (opendata == NULL)
2014 goto err_put_state_owner; 2124 goto err_put_state_owner;
2015 2125
2126 if (label) {
2127 olabel = nfs4_label_alloc(server, GFP_KERNEL);
2128 if (IS_ERR(olabel)) {
2129 status = PTR_ERR(olabel);
2130 goto err_opendata_put;
2131 }
2132 }
2133
2016 if (ctx_th && server->attr_bitmask[2] & FATTR4_WORD2_MDSTHRESHOLD) { 2134 if (ctx_th && server->attr_bitmask[2] & FATTR4_WORD2_MDSTHRESHOLD) {
2017 opendata->f_attr.mdsthreshold = pnfs_mdsthreshold_alloc(); 2135 opendata->f_attr.mdsthreshold = pnfs_mdsthreshold_alloc();
2018 if (!opendata->f_attr.mdsthreshold) 2136 if (!opendata->f_attr.mdsthreshold)
2019 goto err_opendata_put; 2137 goto err_free_label;
2020 opendata->o_arg.open_bitmap = &nfs4_pnfs_open_bitmap[0]; 2138 opendata->o_arg.open_bitmap = &nfs4_pnfs_open_bitmap[0];
2021 } 2139 }
2022 if (dentry->d_inode != NULL) 2140 if (dentry->d_inode != NULL)
2023 opendata->state = nfs4_get_open_state(dentry->d_inode, sp); 2141 opendata->state = nfs4_get_open_state(dentry->d_inode, sp);
2024 2142
2025 status = _nfs4_open_and_get_state(opendata, fmode, flags, &state); 2143 status = _nfs4_open_and_get_state(opendata, fmode, flags, ctx);
2026 if (status != 0) 2144 if (status != 0)
2027 goto err_opendata_put; 2145 goto err_free_label;
2146 state = ctx->state;
2028 2147
2029 if ((opendata->o_arg.open_flags & O_EXCL) && 2148 if ((opendata->o_arg.open_flags & O_EXCL) &&
2030 (opendata->o_arg.createmode != NFS4_CREATE_GUARDED)) { 2149 (opendata->o_arg.createmode != NFS4_CREATE_GUARDED)) {
@@ -2033,10 +2152,12 @@ static int _nfs4_do_open(struct inode *dir,
2033 nfs_fattr_init(opendata->o_res.f_attr); 2152 nfs_fattr_init(opendata->o_res.f_attr);
2034 status = nfs4_do_setattr(state->inode, cred, 2153 status = nfs4_do_setattr(state->inode, cred,
2035 opendata->o_res.f_attr, sattr, 2154 opendata->o_res.f_attr, sattr,
2036 state); 2155 state, label, olabel);
2037 if (status == 0) 2156 if (status == 0) {
2038 nfs_setattr_update_inode(state->inode, sattr); 2157 nfs_setattr_update_inode(state->inode, sattr);
2039 nfs_post_op_update_inode(state->inode, opendata->o_res.f_attr); 2158 nfs_post_op_update_inode(state->inode, opendata->o_res.f_attr);
2159 nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel);
2160 }
2040 } 2161 }
2041 2162
2042 if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server)) 2163 if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server))
@@ -2045,38 +2166,37 @@ static int _nfs4_do_open(struct inode *dir,
2045 kfree(opendata->f_attr.mdsthreshold); 2166 kfree(opendata->f_attr.mdsthreshold);
2046 opendata->f_attr.mdsthreshold = NULL; 2167 opendata->f_attr.mdsthreshold = NULL;
2047 2168
2169 nfs4_label_free(olabel);
2170
2048 nfs4_opendata_put(opendata); 2171 nfs4_opendata_put(opendata);
2049 nfs4_put_state_owner(sp); 2172 nfs4_put_state_owner(sp);
2050 *res = state;
2051 return 0; 2173 return 0;
2174err_free_label:
2175 nfs4_label_free(olabel);
2052err_opendata_put: 2176err_opendata_put:
2053 kfree(opendata->f_attr.mdsthreshold); 2177 kfree(opendata->f_attr.mdsthreshold);
2054 nfs4_opendata_put(opendata); 2178 nfs4_opendata_put(opendata);
2055err_put_state_owner: 2179err_put_state_owner:
2056 nfs4_put_state_owner(sp); 2180 nfs4_put_state_owner(sp);
2057out_err: 2181out_err:
2058 *res = NULL;
2059 return status; 2182 return status;
2060} 2183}
2061 2184
2062 2185
2063static struct nfs4_state *nfs4_do_open(struct inode *dir, 2186static struct nfs4_state *nfs4_do_open(struct inode *dir,
2064 struct dentry *dentry, 2187 struct nfs_open_context *ctx,
2065 fmode_t fmode,
2066 int flags, 2188 int flags,
2067 struct iattr *sattr, 2189 struct iattr *sattr,
2068 struct rpc_cred *cred, 2190 struct nfs4_label *label)
2069 struct nfs4_threshold **ctx_th)
2070{ 2191{
2071 struct nfs_server *server = NFS_SERVER(dir); 2192 struct nfs_server *server = NFS_SERVER(dir);
2072 struct nfs4_exception exception = { }; 2193 struct nfs4_exception exception = { };
2073 struct nfs4_state *res; 2194 struct nfs4_state *res;
2074 int status; 2195 int status;
2075 2196
2076 fmode &= FMODE_READ|FMODE_WRITE|FMODE_EXEC;
2077 do { 2197 do {
2078 status = _nfs4_do_open(dir, dentry, fmode, flags, sattr, cred, 2198 status = _nfs4_do_open(dir, ctx, flags, sattr, label);
2079 &res, ctx_th); 2199 res = ctx->state;
2080 if (status == 0) 2200 if (status == 0)
2081 break; 2201 break;
2082 /* NOTE: BAD_SEQID means the server and client disagree about the 2202 /* NOTE: BAD_SEQID means the server and client disagree about the
@@ -2122,7 +2242,8 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir,
2122 2242
2123static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, 2243static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
2124 struct nfs_fattr *fattr, struct iattr *sattr, 2244 struct nfs_fattr *fattr, struct iattr *sattr,
2125 struct nfs4_state *state) 2245 struct nfs4_state *state, struct nfs4_label *ilabel,
2246 struct nfs4_label *olabel)
2126{ 2247{
2127 struct nfs_server *server = NFS_SERVER(inode); 2248 struct nfs_server *server = NFS_SERVER(inode);
2128 struct nfs_setattrargs arg = { 2249 struct nfs_setattrargs arg = {
@@ -2130,9 +2251,11 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
2130 .iap = sattr, 2251 .iap = sattr,
2131 .server = server, 2252 .server = server,
2132 .bitmask = server->attr_bitmask, 2253 .bitmask = server->attr_bitmask,
2254 .label = ilabel,
2133 }; 2255 };
2134 struct nfs_setattrres res = { 2256 struct nfs_setattrres res = {
2135 .fattr = fattr, 2257 .fattr = fattr,
2258 .label = olabel,
2136 .server = server, 2259 .server = server,
2137 }; 2260 };
2138 struct rpc_message msg = { 2261 struct rpc_message msg = {
@@ -2146,6 +2269,10 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
2146 bool truncate; 2269 bool truncate;
2147 int status; 2270 int status;
2148 2271
2272 arg.bitmask = nfs4_bitmask(server, ilabel);
2273 if (ilabel)
2274 arg.bitmask = nfs4_bitmask(server, olabel);
2275
2149 nfs_fattr_init(fattr); 2276 nfs_fattr_init(fattr);
2150 2277
2151 /* Servers should only apply open mode checks for file size changes */ 2278 /* Servers should only apply open mode checks for file size changes */
@@ -2172,7 +2299,8 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
2172 2299
2173static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, 2300static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
2174 struct nfs_fattr *fattr, struct iattr *sattr, 2301 struct nfs_fattr *fattr, struct iattr *sattr,
2175 struct nfs4_state *state) 2302 struct nfs4_state *state, struct nfs4_label *ilabel,
2303 struct nfs4_label *olabel)
2176{ 2304{
2177 struct nfs_server *server = NFS_SERVER(inode); 2305 struct nfs_server *server = NFS_SERVER(inode);
2178 struct nfs4_exception exception = { 2306 struct nfs4_exception exception = {
@@ -2181,7 +2309,7 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
2181 }; 2309 };
2182 int err; 2310 int err;
2183 do { 2311 do {
2184 err = _nfs4_do_setattr(inode, cred, fattr, sattr, state); 2312 err = _nfs4_do_setattr(inode, cred, fattr, sattr, state, ilabel, olabel);
2185 switch (err) { 2313 switch (err) {
2186 case -NFS4ERR_OPENMODE: 2314 case -NFS4ERR_OPENMODE:
2187 if (!(sattr->ia_valid & ATTR_SIZE)) { 2315 if (!(sattr->ia_valid & ATTR_SIZE)) {
@@ -2426,14 +2554,18 @@ static struct inode *
2426nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags, struct iattr *attr) 2554nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags, struct iattr *attr)
2427{ 2555{
2428 struct nfs4_state *state; 2556 struct nfs4_state *state;
2557 struct nfs4_label l = {0, 0, 0, NULL}, *label = NULL;
2558
2559 label = nfs4_label_init_security(dir, ctx->dentry, attr, &l);
2429 2560
2430 /* Protect against concurrent sillydeletes */ 2561 /* Protect against concurrent sillydeletes */
2431 state = nfs4_do_open(dir, ctx->dentry, ctx->mode, open_flags, attr, 2562 state = nfs4_do_open(dir, ctx, open_flags, attr, label);
2432 ctx->cred, &ctx->mdsthreshold); 2563
2564 nfs4_label_release_security(label);
2565
2433 if (IS_ERR(state)) 2566 if (IS_ERR(state))
2434 return ERR_CAST(state); 2567 return ERR_CAST(state);
2435 ctx->state = state; 2568 return state->inode;
2436 return igrab(state->inode);
2437} 2569}
2438 2570
2439static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync) 2571static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
@@ -2489,7 +2621,17 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
2489 server->caps |= NFS_CAP_CTIME; 2621 server->caps |= NFS_CAP_CTIME;
2490 if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY) 2622 if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY)
2491 server->caps |= NFS_CAP_MTIME; 2623 server->caps |= NFS_CAP_MTIME;
2624#ifdef CONFIG_NFS_V4_SECURITY_LABEL
2625 if (res.attr_bitmask[2] & FATTR4_WORD2_SECURITY_LABEL)
2626 server->caps |= NFS_CAP_SECURITY_LABEL;
2627#endif
2628 memcpy(server->attr_bitmask_nl, res.attr_bitmask,
2629 sizeof(server->attr_bitmask));
2492 2630
2631 if (server->caps & NFS_CAP_SECURITY_LABEL) {
2632 server->attr_bitmask_nl[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
2633 res.attr_bitmask[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
2634 }
2493 memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask)); 2635 memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask));
2494 server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE; 2636 server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE;
2495 server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; 2637 server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
@@ -2515,8 +2657,9 @@ int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
2515static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, 2657static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
2516 struct nfs_fsinfo *info) 2658 struct nfs_fsinfo *info)
2517{ 2659{
2660 u32 bitmask[3];
2518 struct nfs4_lookup_root_arg args = { 2661 struct nfs4_lookup_root_arg args = {
2519 .bitmask = nfs4_fattr_bitmap, 2662 .bitmask = bitmask,
2520 }; 2663 };
2521 struct nfs4_lookup_res res = { 2664 struct nfs4_lookup_res res = {
2522 .server = server, 2665 .server = server,
@@ -2529,6 +2672,13 @@ static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
2529 .rpc_resp = &res, 2672 .rpc_resp = &res,
2530 }; 2673 };
2531 2674
2675 bitmask[0] = nfs4_fattr_bitmap[0];
2676 bitmask[1] = nfs4_fattr_bitmap[1];
2677 /*
2678 * Process the label in the upcoming getfattr
2679 */
2680 bitmask[2] = nfs4_fattr_bitmap[2] & ~FATTR4_WORD2_SECURITY_LABEL;
2681
2532 nfs_fattr_init(info->fattr); 2682 nfs_fattr_init(info->fattr);
2533 return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); 2683 return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
2534} 2684}
@@ -2648,6 +2798,7 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *mntfh,
2648{ 2798{
2649 int error; 2799 int error;
2650 struct nfs_fattr *fattr = info->fattr; 2800 struct nfs_fattr *fattr = info->fattr;
2801 struct nfs4_label *label = NULL;
2651 2802
2652 error = nfs4_server_capabilities(server, mntfh); 2803 error = nfs4_server_capabilities(server, mntfh);
2653 if (error < 0) { 2804 if (error < 0) {
@@ -2655,16 +2806,23 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *mntfh,
2655 return error; 2806 return error;
2656 } 2807 }
2657 2808
2658 error = nfs4_proc_getattr(server, mntfh, fattr); 2809 label = nfs4_label_alloc(server, GFP_KERNEL);
2810 if (IS_ERR(label))
2811 return PTR_ERR(label);
2812
2813 error = nfs4_proc_getattr(server, mntfh, fattr, label);
2659 if (error < 0) { 2814 if (error < 0) {
2660 dprintk("nfs4_get_root: getattr error = %d\n", -error); 2815 dprintk("nfs4_get_root: getattr error = %d\n", -error);
2661 return error; 2816 goto err_free_label;
2662 } 2817 }
2663 2818
2664 if (fattr->valid & NFS_ATTR_FATTR_FSID && 2819 if (fattr->valid & NFS_ATTR_FATTR_FSID &&
2665 !nfs_fsid_equal(&server->fsid, &fattr->fsid)) 2820 !nfs_fsid_equal(&server->fsid, &fattr->fsid))
2666 memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid)); 2821 memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid));
2667 2822
2823err_free_label:
2824 nfs4_label_free(label);
2825
2668 return error; 2826 return error;
2669} 2827}
2670 2828
@@ -2711,7 +2869,8 @@ out:
2711 return status; 2869 return status;
2712} 2870}
2713 2871
2714static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr) 2872static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
2873 struct nfs_fattr *fattr, struct nfs4_label *label)
2715{ 2874{
2716 struct nfs4_getattr_arg args = { 2875 struct nfs4_getattr_arg args = {
2717 .fh = fhandle, 2876 .fh = fhandle,
@@ -2719,6 +2878,7 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
2719 }; 2878 };
2720 struct nfs4_getattr_res res = { 2879 struct nfs4_getattr_res res = {
2721 .fattr = fattr, 2880 .fattr = fattr,
2881 .label = label,
2722 .server = server, 2882 .server = server,
2723 }; 2883 };
2724 struct rpc_message msg = { 2884 struct rpc_message msg = {
@@ -2726,18 +2886,21 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
2726 .rpc_argp = &args, 2886 .rpc_argp = &args,
2727 .rpc_resp = &res, 2887 .rpc_resp = &res,
2728 }; 2888 };
2729 2889
2890 args.bitmask = nfs4_bitmask(server, label);
2891
2730 nfs_fattr_init(fattr); 2892 nfs_fattr_init(fattr);
2731 return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); 2893 return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
2732} 2894}
2733 2895
2734static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr) 2896static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
2897 struct nfs_fattr *fattr, struct nfs4_label *label)
2735{ 2898{
2736 struct nfs4_exception exception = { }; 2899 struct nfs4_exception exception = { };
2737 int err; 2900 int err;
2738 do { 2901 do {
2739 err = nfs4_handle_exception(server, 2902 err = nfs4_handle_exception(server,
2740 _nfs4_proc_getattr(server, fhandle, fattr), 2903 _nfs4_proc_getattr(server, fhandle, fattr, label),
2741 &exception); 2904 &exception);
2742 } while (exception.retry); 2905 } while (exception.retry);
2743 return err; 2906 return err;
@@ -2767,6 +2930,7 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
2767 struct inode *inode = dentry->d_inode; 2930 struct inode *inode = dentry->d_inode;
2768 struct rpc_cred *cred = NULL; 2931 struct rpc_cred *cred = NULL;
2769 struct nfs4_state *state = NULL; 2932 struct nfs4_state *state = NULL;
2933 struct nfs4_label *label = NULL;
2770 int status; 2934 int status;
2771 2935
2772 if (pnfs_ld_layoutret_on_setattr(inode)) 2936 if (pnfs_ld_layoutret_on_setattr(inode))
@@ -2793,15 +2957,22 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
2793 } 2957 }
2794 } 2958 }
2795 2959
2796 status = nfs4_do_setattr(inode, cred, fattr, sattr, state); 2960 label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL);
2797 if (status == 0) 2961 if (IS_ERR(label))
2962 return PTR_ERR(label);
2963
2964 status = nfs4_do_setattr(inode, cred, fattr, sattr, state, NULL, label);
2965 if (status == 0) {
2798 nfs_setattr_update_inode(inode, sattr); 2966 nfs_setattr_update_inode(inode, sattr);
2967 nfs_setsecurity(inode, fattr, label);
2968 }
2969 nfs4_label_free(label);
2799 return status; 2970 return status;
2800} 2971}
2801 2972
2802static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, 2973static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir,
2803 const struct qstr *name, struct nfs_fh *fhandle, 2974 const struct qstr *name, struct nfs_fh *fhandle,
2804 struct nfs_fattr *fattr) 2975 struct nfs_fattr *fattr, struct nfs4_label *label)
2805{ 2976{
2806 struct nfs_server *server = NFS_SERVER(dir); 2977 struct nfs_server *server = NFS_SERVER(dir);
2807 int status; 2978 int status;
@@ -2813,6 +2984,7 @@ static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir,
2813 struct nfs4_lookup_res res = { 2984 struct nfs4_lookup_res res = {
2814 .server = server, 2985 .server = server,
2815 .fattr = fattr, 2986 .fattr = fattr,
2987 .label = label,
2816 .fh = fhandle, 2988 .fh = fhandle,
2817 }; 2989 };
2818 struct rpc_message msg = { 2990 struct rpc_message msg = {
@@ -2821,6 +2993,8 @@ static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir,
2821 .rpc_resp = &res, 2993 .rpc_resp = &res,
2822 }; 2994 };
2823 2995
2996 args.bitmask = nfs4_bitmask(server, label);
2997
2824 nfs_fattr_init(fattr); 2998 nfs_fattr_init(fattr);
2825 2999
2826 dprintk("NFS call lookup %s\n", name->name); 3000 dprintk("NFS call lookup %s\n", name->name);
@@ -2839,13 +3013,13 @@ static void nfs_fixup_secinfo_attributes(struct nfs_fattr *fattr)
2839 3013
2840static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir, 3014static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir,
2841 struct qstr *name, struct nfs_fh *fhandle, 3015 struct qstr *name, struct nfs_fh *fhandle,
2842 struct nfs_fattr *fattr) 3016 struct nfs_fattr *fattr, struct nfs4_label *label)
2843{ 3017{
2844 struct nfs4_exception exception = { }; 3018 struct nfs4_exception exception = { };
2845 struct rpc_clnt *client = *clnt; 3019 struct rpc_clnt *client = *clnt;
2846 int err; 3020 int err;
2847 do { 3021 do {
2848 err = _nfs4_proc_lookup(client, dir, name, fhandle, fattr); 3022 err = _nfs4_proc_lookup(client, dir, name, fhandle, fattr, label);
2849 switch (err) { 3023 switch (err) {
2850 case -NFS4ERR_BADNAME: 3024 case -NFS4ERR_BADNAME:
2851 err = -ENOENT; 3025 err = -ENOENT;
@@ -2879,12 +3053,13 @@ out:
2879} 3053}
2880 3054
2881static int nfs4_proc_lookup(struct inode *dir, struct qstr *name, 3055static int nfs4_proc_lookup(struct inode *dir, struct qstr *name,
2882 struct nfs_fh *fhandle, struct nfs_fattr *fattr) 3056 struct nfs_fh *fhandle, struct nfs_fattr *fattr,
3057 struct nfs4_label *label)
2883{ 3058{
2884 int status; 3059 int status;
2885 struct rpc_clnt *client = NFS_CLIENT(dir); 3060 struct rpc_clnt *client = NFS_CLIENT(dir);
2886 3061
2887 status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr); 3062 status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr, label);
2888 if (client != NFS_CLIENT(dir)) { 3063 if (client != NFS_CLIENT(dir)) {
2889 rpc_shutdown_client(client); 3064 rpc_shutdown_client(client);
2890 nfs_fixup_secinfo_attributes(fattr); 3065 nfs_fixup_secinfo_attributes(fattr);
@@ -2899,7 +3074,7 @@ nfs4_proc_lookup_mountpoint(struct inode *dir, struct qstr *name,
2899 int status; 3074 int status;
2900 struct rpc_clnt *client = rpc_clone_client(NFS_CLIENT(dir)); 3075 struct rpc_clnt *client = rpc_clone_client(NFS_CLIENT(dir));
2901 3076
2902 status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr); 3077 status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr, NULL);
2903 if (status < 0) { 3078 if (status < 0) {
2904 rpc_shutdown_client(client); 3079 rpc_shutdown_client(client);
2905 return ERR_PTR(status); 3080 return ERR_PTR(status);
@@ -2924,7 +3099,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry
2924 .rpc_cred = entry->cred, 3099 .rpc_cred = entry->cred,
2925 }; 3100 };
2926 int mode = entry->mask; 3101 int mode = entry->mask;
2927 int status; 3102 int status = 0;
2928 3103
2929 /* 3104 /*
2930 * Determine which access bits we want to ask for... 3105 * Determine which access bits we want to ask for...
@@ -3029,6 +3204,7 @@ static int
3029nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, 3204nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
3030 int flags) 3205 int flags)
3031{ 3206{
3207 struct nfs4_label l, *ilabel = NULL;
3032 struct nfs_open_context *ctx; 3208 struct nfs_open_context *ctx;
3033 struct nfs4_state *state; 3209 struct nfs4_state *state;
3034 int status = 0; 3210 int status = 0;
@@ -3037,19 +3213,16 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
3037 if (IS_ERR(ctx)) 3213 if (IS_ERR(ctx))
3038 return PTR_ERR(ctx); 3214 return PTR_ERR(ctx);
3039 3215
3216 ilabel = nfs4_label_init_security(dir, dentry, sattr, &l);
3217
3040 sattr->ia_mode &= ~current_umask(); 3218 sattr->ia_mode &= ~current_umask();
3041 state = nfs4_do_open(dir, dentry, ctx->mode, 3219 state = nfs4_do_open(dir, ctx, flags, sattr, ilabel);
3042 flags, sattr, ctx->cred,
3043 &ctx->mdsthreshold);
3044 d_drop(dentry);
3045 if (IS_ERR(state)) { 3220 if (IS_ERR(state)) {
3046 status = PTR_ERR(state); 3221 status = PTR_ERR(state);
3047 goto out; 3222 goto out;
3048 } 3223 }
3049 d_add(dentry, igrab(state->inode));
3050 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
3051 ctx->state = state;
3052out: 3224out:
3225 nfs4_label_release_security(ilabel);
3053 put_nfs_open_context(ctx); 3226 put_nfs_open_context(ctx);
3054 return status; 3227 return status;
3055} 3228}
@@ -3098,6 +3271,8 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
3098 res->server = server; 3271 res->server = server;
3099 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE]; 3272 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE];
3100 nfs41_init_sequence(&args->seq_args, &res->seq_res, 1); 3273 nfs41_init_sequence(&args->seq_args, &res->seq_res, 1);
3274
3275 nfs_fattr_init(res->dir_attr);
3101} 3276}
3102 3277
3103static void nfs4_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data) 3278static void nfs4_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data)
@@ -3173,7 +3348,7 @@ static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
3173 .rpc_resp = &res, 3348 .rpc_resp = &res,
3174 }; 3349 };
3175 int status = -ENOMEM; 3350 int status = -ENOMEM;
3176 3351
3177 status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); 3352 status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
3178 if (!status) { 3353 if (!status) {
3179 update_changeattr(old_dir, &res.old_cinfo); 3354 update_changeattr(old_dir, &res.old_cinfo);
@@ -3207,6 +3382,7 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *
3207 }; 3382 };
3208 struct nfs4_link_res res = { 3383 struct nfs4_link_res res = {
3209 .server = server, 3384 .server = server,
3385 .label = NULL,
3210 }; 3386 };
3211 struct rpc_message msg = { 3387 struct rpc_message msg = {
3212 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LINK], 3388 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LINK],
@@ -3219,11 +3395,24 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *
3219 if (res.fattr == NULL) 3395 if (res.fattr == NULL)
3220 goto out; 3396 goto out;
3221 3397
3398 res.label = nfs4_label_alloc(server, GFP_KERNEL);
3399 if (IS_ERR(res.label)) {
3400 status = PTR_ERR(res.label);
3401 goto out;
3402 }
3403 arg.bitmask = nfs4_bitmask(server, res.label);
3404
3222 status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); 3405 status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
3223 if (!status) { 3406 if (!status) {
3224 update_changeattr(dir, &res.cinfo); 3407 update_changeattr(dir, &res.cinfo);
3225 nfs_post_op_update_inode(inode, res.fattr); 3408 status = nfs_post_op_update_inode(inode, res.fattr);
3409 if (!status)
3410 nfs_setsecurity(inode, res.fattr, res.label);
3226 } 3411 }
3412
3413
3414 nfs4_label_free(res.label);
3415
3227out: 3416out:
3228 nfs_free_fattr(res.fattr); 3417 nfs_free_fattr(res.fattr);
3229 return status; 3418 return status;
@@ -3247,6 +3436,7 @@ struct nfs4_createdata {
3247 struct nfs4_create_res res; 3436 struct nfs4_create_res res;
3248 struct nfs_fh fh; 3437 struct nfs_fh fh;
3249 struct nfs_fattr fattr; 3438 struct nfs_fattr fattr;
3439 struct nfs4_label *label;
3250}; 3440};
3251 3441
3252static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir, 3442static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
@@ -3258,6 +3448,10 @@ static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
3258 if (data != NULL) { 3448 if (data != NULL) {
3259 struct nfs_server *server = NFS_SERVER(dir); 3449 struct nfs_server *server = NFS_SERVER(dir);
3260 3450
3451 data->label = nfs4_label_alloc(server, GFP_KERNEL);
3452 if (IS_ERR(data->label))
3453 goto out_free;
3454
3261 data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE]; 3455 data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE];
3262 data->msg.rpc_argp = &data->arg; 3456 data->msg.rpc_argp = &data->arg;
3263 data->msg.rpc_resp = &data->res; 3457 data->msg.rpc_resp = &data->res;
@@ -3266,13 +3460,17 @@ static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
3266 data->arg.name = name; 3460 data->arg.name = name;
3267 data->arg.attrs = sattr; 3461 data->arg.attrs = sattr;
3268 data->arg.ftype = ftype; 3462 data->arg.ftype = ftype;
3269 data->arg.bitmask = server->attr_bitmask; 3463 data->arg.bitmask = nfs4_bitmask(server, data->label);
3270 data->res.server = server; 3464 data->res.server = server;
3271 data->res.fh = &data->fh; 3465 data->res.fh = &data->fh;
3272 data->res.fattr = &data->fattr; 3466 data->res.fattr = &data->fattr;
3467 data->res.label = data->label;
3273 nfs_fattr_init(data->res.fattr); 3468 nfs_fattr_init(data->res.fattr);
3274 } 3469 }
3275 return data; 3470 return data;
3471out_free:
3472 kfree(data);
3473 return NULL;
3276} 3474}
3277 3475
3278static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_createdata *data) 3476static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_createdata *data)
@@ -3281,18 +3479,20 @@ static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_
3281 &data->arg.seq_args, &data->res.seq_res, 1); 3479 &data->arg.seq_args, &data->res.seq_res, 1);
3282 if (status == 0) { 3480 if (status == 0) {
3283 update_changeattr(dir, &data->res.dir_cinfo); 3481 update_changeattr(dir, &data->res.dir_cinfo);
3284 status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); 3482 status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, data->res.label);
3285 } 3483 }
3286 return status; 3484 return status;
3287} 3485}
3288 3486
3289static void nfs4_free_createdata(struct nfs4_createdata *data) 3487static void nfs4_free_createdata(struct nfs4_createdata *data)
3290{ 3488{
3489 nfs4_label_free(data->label);
3291 kfree(data); 3490 kfree(data);
3292} 3491}
3293 3492
3294static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry, 3493static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
3295 struct page *page, unsigned int len, struct iattr *sattr) 3494 struct page *page, unsigned int len, struct iattr *sattr,
3495 struct nfs4_label *label)
3296{ 3496{
3297 struct nfs4_createdata *data; 3497 struct nfs4_createdata *data;
3298 int status = -ENAMETOOLONG; 3498 int status = -ENAMETOOLONG;
@@ -3308,6 +3508,7 @@ static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
3308 data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SYMLINK]; 3508 data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SYMLINK];
3309 data->arg.u.symlink.pages = &page; 3509 data->arg.u.symlink.pages = &page;
3310 data->arg.u.symlink.len = len; 3510 data->arg.u.symlink.len = len;
3511 data->arg.label = label;
3311 3512
3312 status = nfs4_do_create(dir, dentry, data); 3513 status = nfs4_do_create(dir, dentry, data);
3313 3514
@@ -3320,18 +3521,24 @@ static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
3320 struct page *page, unsigned int len, struct iattr *sattr) 3521 struct page *page, unsigned int len, struct iattr *sattr)
3321{ 3522{
3322 struct nfs4_exception exception = { }; 3523 struct nfs4_exception exception = { };
3524 struct nfs4_label l, *label = NULL;
3323 int err; 3525 int err;
3526
3527 label = nfs4_label_init_security(dir, dentry, sattr, &l);
3528
3324 do { 3529 do {
3325 err = nfs4_handle_exception(NFS_SERVER(dir), 3530 err = nfs4_handle_exception(NFS_SERVER(dir),
3326 _nfs4_proc_symlink(dir, dentry, page, 3531 _nfs4_proc_symlink(dir, dentry, page,
3327 len, sattr), 3532 len, sattr, label),
3328 &exception); 3533 &exception);
3329 } while (exception.retry); 3534 } while (exception.retry);
3535
3536 nfs4_label_release_security(label);
3330 return err; 3537 return err;
3331} 3538}
3332 3539
3333static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, 3540static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
3334 struct iattr *sattr) 3541 struct iattr *sattr, struct nfs4_label *label)
3335{ 3542{
3336 struct nfs4_createdata *data; 3543 struct nfs4_createdata *data;
3337 int status = -ENOMEM; 3544 int status = -ENOMEM;
@@ -3340,6 +3547,7 @@ static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
3340 if (data == NULL) 3547 if (data == NULL)
3341 goto out; 3548 goto out;
3342 3549
3550 data->arg.label = label;
3343 status = nfs4_do_create(dir, dentry, data); 3551 status = nfs4_do_create(dir, dentry, data);
3344 3552
3345 nfs4_free_createdata(data); 3553 nfs4_free_createdata(data);
@@ -3351,14 +3559,19 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
3351 struct iattr *sattr) 3559 struct iattr *sattr)
3352{ 3560{
3353 struct nfs4_exception exception = { }; 3561 struct nfs4_exception exception = { };
3562 struct nfs4_label l, *label = NULL;
3354 int err; 3563 int err;
3355 3564
3565 label = nfs4_label_init_security(dir, dentry, sattr, &l);
3566
3356 sattr->ia_mode &= ~current_umask(); 3567 sattr->ia_mode &= ~current_umask();
3357 do { 3568 do {
3358 err = nfs4_handle_exception(NFS_SERVER(dir), 3569 err = nfs4_handle_exception(NFS_SERVER(dir),
3359 _nfs4_proc_mkdir(dir, dentry, sattr), 3570 _nfs4_proc_mkdir(dir, dentry, sattr, label),
3360 &exception); 3571 &exception);
3361 } while (exception.retry); 3572 } while (exception.retry);
3573 nfs4_label_release_security(label);
3574
3362 return err; 3575 return err;
3363} 3576}
3364 3577
@@ -3416,7 +3629,7 @@ static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
3416} 3629}
3417 3630
3418static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry, 3631static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
3419 struct iattr *sattr, dev_t rdev) 3632 struct iattr *sattr, struct nfs4_label *label, dev_t rdev)
3420{ 3633{
3421 struct nfs4_createdata *data; 3634 struct nfs4_createdata *data;
3422 int mode = sattr->ia_mode; 3635 int mode = sattr->ia_mode;
@@ -3441,7 +3654,8 @@ static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
3441 status = -EINVAL; 3654 status = -EINVAL;
3442 goto out_free; 3655 goto out_free;
3443 } 3656 }
3444 3657
3658 data->arg.label = label;
3445 status = nfs4_do_create(dir, dentry, data); 3659 status = nfs4_do_create(dir, dentry, data);
3446out_free: 3660out_free:
3447 nfs4_free_createdata(data); 3661 nfs4_free_createdata(data);
@@ -3453,14 +3667,20 @@ static int nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
3453 struct iattr *sattr, dev_t rdev) 3667 struct iattr *sattr, dev_t rdev)
3454{ 3668{
3455 struct nfs4_exception exception = { }; 3669 struct nfs4_exception exception = { };
3670 struct nfs4_label l, *label = NULL;
3456 int err; 3671 int err;
3457 3672
3673 label = nfs4_label_init_security(dir, dentry, sattr, &l);
3674
3458 sattr->ia_mode &= ~current_umask(); 3675 sattr->ia_mode &= ~current_umask();
3459 do { 3676 do {
3460 err = nfs4_handle_exception(NFS_SERVER(dir), 3677 err = nfs4_handle_exception(NFS_SERVER(dir),
3461 _nfs4_proc_mknod(dir, dentry, sattr, rdev), 3678 _nfs4_proc_mknod(dir, dentry, sattr, label, rdev),
3462 &exception); 3679 &exception);
3463 } while (exception.retry); 3680 } while (exception.retry);
3681
3682 nfs4_label_release_security(label);
3683
3464 return err; 3684 return err;
3465} 3685}
3466 3686
@@ -4187,6 +4407,155 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen
4187 return err; 4407 return err;
4188} 4408}
4189 4409
4410#ifdef CONFIG_NFS_V4_SECURITY_LABEL
4411static int _nfs4_get_security_label(struct inode *inode, void *buf,
4412 size_t buflen)
4413{
4414 struct nfs_server *server = NFS_SERVER(inode);
4415 struct nfs_fattr fattr;
4416 struct nfs4_label label = {0, 0, buflen, buf};
4417
4418 u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL };
4419 struct nfs4_getattr_arg args = {
4420 .fh = NFS_FH(inode),
4421 .bitmask = bitmask,
4422 };
4423 struct nfs4_getattr_res res = {
4424 .fattr = &fattr,
4425 .label = &label,
4426 .server = server,
4427 };
4428 struct rpc_message msg = {
4429 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETATTR],
4430 .rpc_argp = &args,
4431 .rpc_resp = &res,
4432 };
4433 int ret;
4434
4435 nfs_fattr_init(&fattr);
4436
4437 ret = rpc_call_sync(server->client, &msg, 0);
4438 if (ret)
4439 return ret;
4440 if (!(fattr.valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL))
4441 return -ENOENT;
4442 if (buflen < label.len)
4443 return -ERANGE;
4444 return 0;
4445}
4446
4447static int nfs4_get_security_label(struct inode *inode, void *buf,
4448 size_t buflen)
4449{
4450 struct nfs4_exception exception = { };
4451 int err;
4452
4453 if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL))
4454 return -EOPNOTSUPP;
4455
4456 do {
4457 err = nfs4_handle_exception(NFS_SERVER(inode),
4458 _nfs4_get_security_label(inode, buf, buflen),
4459 &exception);
4460 } while (exception.retry);
4461 return err;
4462}
4463
4464static int _nfs4_do_set_security_label(struct inode *inode,
4465 struct nfs4_label *ilabel,
4466 struct nfs_fattr *fattr,
4467 struct nfs4_label *olabel)
4468{
4469
4470 struct iattr sattr = {0};
4471 struct nfs_server *server = NFS_SERVER(inode);
4472 const u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL };
4473 struct nfs_setattrargs args = {
4474 .fh = NFS_FH(inode),
4475 .iap = &sattr,
4476 .server = server,
4477 .bitmask = bitmask,
4478 .label = ilabel,
4479 };
4480 struct nfs_setattrres res = {
4481 .fattr = fattr,
4482 .label = olabel,
4483 .server = server,
4484 };
4485 struct rpc_message msg = {
4486 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR],
4487 .rpc_argp = &args,
4488 .rpc_resp = &res,
4489 };
4490 int status;
4491
4492 nfs4_stateid_copy(&args.stateid, &zero_stateid);
4493
4494 status = rpc_call_sync(server->client, &msg, 0);
4495 if (status)
4496 dprintk("%s failed: %d\n", __func__, status);
4497
4498 return status;
4499}
4500
4501static int nfs4_do_set_security_label(struct inode *inode,
4502 struct nfs4_label *ilabel,
4503 struct nfs_fattr *fattr,
4504 struct nfs4_label *olabel)
4505{
4506 struct nfs4_exception exception = { };
4507 int err;
4508
4509 do {
4510 err = nfs4_handle_exception(NFS_SERVER(inode),
4511 _nfs4_do_set_security_label(inode, ilabel,
4512 fattr, olabel),
4513 &exception);
4514 } while (exception.retry);
4515 return err;
4516}
4517
4518static int
4519nfs4_set_security_label(struct dentry *dentry, const void *buf, size_t buflen)
4520{
4521 struct nfs4_label ilabel, *olabel = NULL;
4522 struct nfs_fattr fattr;
4523 struct rpc_cred *cred;
4524 struct inode *inode = dentry->d_inode;
4525 int status;
4526
4527 if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL))
4528 return -EOPNOTSUPP;
4529
4530 nfs_fattr_init(&fattr);
4531
4532 ilabel.pi = 0;
4533 ilabel.lfs = 0;
4534 ilabel.label = (char *)buf;
4535 ilabel.len = buflen;
4536
4537 cred = rpc_lookup_cred();
4538 if (IS_ERR(cred))
4539 return PTR_ERR(cred);
4540
4541 olabel = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL);
4542 if (IS_ERR(olabel)) {
4543 status = -PTR_ERR(olabel);
4544 goto out;
4545 }
4546
4547 status = nfs4_do_set_security_label(inode, &ilabel, &fattr, olabel);
4548 if (status == 0)
4549 nfs_setsecurity(inode, &fattr, olabel);
4550
4551 nfs4_label_free(olabel);
4552out:
4553 put_rpccred(cred);
4554 return status;
4555}
4556#endif /* CONFIG_NFS_V4_SECURITY_LABEL */
4557
4558
4190static int 4559static int
4191nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state) 4560nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state)
4192{ 4561{
@@ -4345,7 +4714,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
4345 /* cb_client4 */ 4714 /* cb_client4 */
4346 rcu_read_lock(); 4715 rcu_read_lock();
4347 setclientid.sc_netid_len = scnprintf(setclientid.sc_netid, 4716 setclientid.sc_netid_len = scnprintf(setclientid.sc_netid,
4348 sizeof(setclientid.sc_netid), 4717 sizeof(setclientid.sc_netid), "%s",
4349 rpc_peeraddr2str(clp->cl_rpcclient, 4718 rpc_peeraddr2str(clp->cl_rpcclient,
4350 RPC_DISPLAY_NETID)); 4719 RPC_DISPLAY_NETID));
4351 rcu_read_unlock(); 4720 rcu_read_unlock();
@@ -5056,13 +5425,18 @@ static int nfs41_check_expired_locks(struct nfs4_state *state)
5056 5425
5057 list_for_each_entry(lsp, &state->lock_states, ls_locks) { 5426 list_for_each_entry(lsp, &state->lock_states, ls_locks) {
5058 if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) { 5427 if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) {
5059 status = nfs41_test_stateid(server, &lsp->ls_stateid); 5428 struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
5429
5430 status = nfs41_test_stateid(server,
5431 &lsp->ls_stateid,
5432 cred);
5060 if (status != NFS_OK) { 5433 if (status != NFS_OK) {
5061 /* Free the stateid unless the server 5434 /* Free the stateid unless the server
5062 * informs us the stateid is unrecognized. */ 5435 * informs us the stateid is unrecognized. */
5063 if (status != -NFS4ERR_BAD_STATEID) 5436 if (status != -NFS4ERR_BAD_STATEID)
5064 nfs41_free_stateid(server, 5437 nfs41_free_stateid(server,
5065 &lsp->ls_stateid); 5438 &lsp->ls_stateid,
5439 cred);
5066 clear_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags); 5440 clear_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags);
5067 ret = status; 5441 ret = status;
5068 } 5442 }
@@ -5295,6 +5669,53 @@ static size_t nfs4_xattr_list_nfs4_acl(struct dentry *dentry, char *list,
5295 return len; 5669 return len;
5296} 5670}
5297 5671
5672#ifdef CONFIG_NFS_V4_SECURITY_LABEL
5673static inline int nfs4_server_supports_labels(struct nfs_server *server)
5674{
5675 return server->caps & NFS_CAP_SECURITY_LABEL;
5676}
5677
5678static int nfs4_xattr_set_nfs4_label(struct dentry *dentry, const char *key,
5679 const void *buf, size_t buflen,
5680 int flags, int type)
5681{
5682 if (security_ismaclabel(key))
5683 return nfs4_set_security_label(dentry, buf, buflen);
5684
5685 return -EOPNOTSUPP;
5686}
5687
5688static int nfs4_xattr_get_nfs4_label(struct dentry *dentry, const char *key,
5689 void *buf, size_t buflen, int type)
5690{
5691 if (security_ismaclabel(key))
5692 return nfs4_get_security_label(dentry->d_inode, buf, buflen);
5693 return -EOPNOTSUPP;
5694}
5695
5696static size_t nfs4_xattr_list_nfs4_label(struct dentry *dentry, char *list,
5697 size_t list_len, const char *name,
5698 size_t name_len, int type)
5699{
5700 size_t len = 0;
5701
5702 if (nfs_server_capable(dentry->d_inode, NFS_CAP_SECURITY_LABEL)) {
5703 len = security_inode_listsecurity(dentry->d_inode, NULL, 0);
5704 if (list && len <= list_len)
5705 security_inode_listsecurity(dentry->d_inode, list, len);
5706 }
5707 return len;
5708}
5709
5710static const struct xattr_handler nfs4_xattr_nfs4_label_handler = {
5711 .prefix = XATTR_SECURITY_PREFIX,
5712 .list = nfs4_xattr_list_nfs4_label,
5713 .get = nfs4_xattr_get_nfs4_label,
5714 .set = nfs4_xattr_set_nfs4_label,
5715};
5716#endif
5717
5718
5298/* 5719/*
5299 * nfs_fhget will use either the mounted_on_fileid or the fileid 5720 * nfs_fhget will use either the mounted_on_fileid or the fileid
5300 */ 5721 */
@@ -5318,7 +5739,7 @@ static int _nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir,
5318 struct page *page) 5739 struct page *page)
5319{ 5740{
5320 struct nfs_server *server = NFS_SERVER(dir); 5741 struct nfs_server *server = NFS_SERVER(dir);
5321 u32 bitmask[2] = { 5742 u32 bitmask[3] = {
5322 [0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS, 5743 [0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS,
5323 }; 5744 };
5324 struct nfs4_fs_locations_arg args = { 5745 struct nfs4_fs_locations_arg args = {
@@ -5505,7 +5926,8 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
5505 struct nfs41_exchange_id_args args = { 5926 struct nfs41_exchange_id_args args = {
5506 .verifier = &verifier, 5927 .verifier = &verifier,
5507 .client = clp, 5928 .client = clp,
5508 .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER, 5929 .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER |
5930 EXCHGID4_FLAG_BIND_PRINC_STATEID,
5509 }; 5931 };
5510 struct nfs41_exchange_id_res res = { 5932 struct nfs41_exchange_id_res res = {
5511 0 5933 0
@@ -5762,17 +6184,14 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
5762 */ 6184 */
5763static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args) 6185static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
5764{ 6186{
5765 struct nfs4_session *session = args->client->cl_session; 6187 unsigned int max_rqst_sz, max_resp_sz;
5766 unsigned int mxrqst_sz = session->fc_target_max_rqst_sz, 6188
5767 mxresp_sz = session->fc_target_max_resp_sz; 6189 max_rqst_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxwrite_overhead;
6190 max_resp_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxread_overhead;
5768 6191
5769 if (mxrqst_sz == 0)
5770 mxrqst_sz = NFS_MAX_FILE_IO_SIZE;
5771 if (mxresp_sz == 0)
5772 mxresp_sz = NFS_MAX_FILE_IO_SIZE;
5773 /* Fore channel attributes */ 6192 /* Fore channel attributes */
5774 args->fc_attrs.max_rqst_sz = mxrqst_sz; 6193 args->fc_attrs.max_rqst_sz = max_rqst_sz;
5775 args->fc_attrs.max_resp_sz = mxresp_sz; 6194 args->fc_attrs.max_resp_sz = max_resp_sz;
5776 args->fc_attrs.max_ops = NFS4_MAX_OPS; 6195 args->fc_attrs.max_ops = NFS4_MAX_OPS;
5777 args->fc_attrs.max_reqs = max_session_slots; 6196 args->fc_attrs.max_reqs = max_session_slots;
5778 6197
@@ -6159,12 +6578,14 @@ static const struct rpc_call_ops nfs4_reclaim_complete_call_ops = {
6159/* 6578/*
6160 * Issue a global reclaim complete. 6579 * Issue a global reclaim complete.
6161 */ 6580 */
6162static int nfs41_proc_reclaim_complete(struct nfs_client *clp) 6581static int nfs41_proc_reclaim_complete(struct nfs_client *clp,
6582 struct rpc_cred *cred)
6163{ 6583{
6164 struct nfs4_reclaim_complete_data *calldata; 6584 struct nfs4_reclaim_complete_data *calldata;
6165 struct rpc_task *task; 6585 struct rpc_task *task;
6166 struct rpc_message msg = { 6586 struct rpc_message msg = {
6167 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RECLAIM_COMPLETE], 6587 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RECLAIM_COMPLETE],
6588 .rpc_cred = cred,
6168 }; 6589 };
6169 struct rpc_task_setup task_setup_data = { 6590 struct rpc_task_setup task_setup_data = {
6170 .rpc_client = clp->cl_rpcclient, 6591 .rpc_client = clp->cl_rpcclient,
@@ -6348,6 +6769,7 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
6348 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTGET], 6769 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTGET],
6349 .rpc_argp = &lgp->args, 6770 .rpc_argp = &lgp->args,
6350 .rpc_resp = &lgp->res, 6771 .rpc_resp = &lgp->res,
6772 .rpc_cred = lgp->cred,
6351 }; 6773 };
6352 struct rpc_task_setup task_setup_data = { 6774 struct rpc_task_setup task_setup_data = {
6353 .rpc_client = server->client, 6775 .rpc_client = server->client,
@@ -6451,6 +6873,7 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)
6451 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTRETURN], 6873 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTRETURN],
6452 .rpc_argp = &lrp->args, 6874 .rpc_argp = &lrp->args,
6453 .rpc_resp = &lrp->res, 6875 .rpc_resp = &lrp->res,
6876 .rpc_cred = lrp->cred,
6454 }; 6877 };
6455 struct rpc_task_setup task_setup_data = { 6878 struct rpc_task_setup task_setup_data = {
6456 .rpc_client = lrp->clp->cl_rpcclient, 6879 .rpc_client = lrp->clp->cl_rpcclient,
@@ -6520,7 +6943,9 @@ int nfs4_proc_getdevicelist(struct nfs_server *server,
6520EXPORT_SYMBOL_GPL(nfs4_proc_getdevicelist); 6943EXPORT_SYMBOL_GPL(nfs4_proc_getdevicelist);
6521 6944
6522static int 6945static int
6523_nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) 6946_nfs4_proc_getdeviceinfo(struct nfs_server *server,
6947 struct pnfs_device *pdev,
6948 struct rpc_cred *cred)
6524{ 6949{
6525 struct nfs4_getdeviceinfo_args args = { 6950 struct nfs4_getdeviceinfo_args args = {
6526 .pdev = pdev, 6951 .pdev = pdev,
@@ -6532,6 +6957,7 @@ _nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
6532 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICEINFO], 6957 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICEINFO],
6533 .rpc_argp = &args, 6958 .rpc_argp = &args,
6534 .rpc_resp = &res, 6959 .rpc_resp = &res,
6960 .rpc_cred = cred,
6535 }; 6961 };
6536 int status; 6962 int status;
6537 6963
@@ -6542,14 +6968,16 @@ _nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
6542 return status; 6968 return status;
6543} 6969}
6544 6970
6545int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) 6971int nfs4_proc_getdeviceinfo(struct nfs_server *server,
6972 struct pnfs_device *pdev,
6973 struct rpc_cred *cred)
6546{ 6974{
6547 struct nfs4_exception exception = { }; 6975 struct nfs4_exception exception = { };
6548 int err; 6976 int err;
6549 6977
6550 do { 6978 do {
6551 err = nfs4_handle_exception(server, 6979 err = nfs4_handle_exception(server,
6552 _nfs4_proc_getdeviceinfo(server, pdev), 6980 _nfs4_proc_getdeviceinfo(server, pdev, cred),
6553 &exception); 6981 &exception);
6554 } while (exception.retry); 6982 } while (exception.retry);
6555 return err; 6983 return err;
@@ -6733,7 +7161,9 @@ out:
6733 return err; 7161 return err;
6734} 7162}
6735 7163
6736static int _nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid) 7164static int _nfs41_test_stateid(struct nfs_server *server,
7165 nfs4_stateid *stateid,
7166 struct rpc_cred *cred)
6737{ 7167{
6738 int status; 7168 int status;
6739 struct nfs41_test_stateid_args args = { 7169 struct nfs41_test_stateid_args args = {
@@ -6744,6 +7174,7 @@ static int _nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid)
6744 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_TEST_STATEID], 7174 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_TEST_STATEID],
6745 .rpc_argp = &args, 7175 .rpc_argp = &args,
6746 .rpc_resp = &res, 7176 .rpc_resp = &res,
7177 .rpc_cred = cred,
6747 }; 7178 };
6748 7179
6749 dprintk("NFS call test_stateid %p\n", stateid); 7180 dprintk("NFS call test_stateid %p\n", stateid);
@@ -6764,17 +7195,20 @@ static int _nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid)
6764 * 7195 *
6765 * @server: server / transport on which to perform the operation 7196 * @server: server / transport on which to perform the operation
6766 * @stateid: state ID to test 7197 * @stateid: state ID to test
7198 * @cred: credential
6767 * 7199 *
6768 * Returns NFS_OK if the server recognizes that "stateid" is valid. 7200 * Returns NFS_OK if the server recognizes that "stateid" is valid.
6769 * Otherwise a negative NFS4ERR value is returned if the operation 7201 * Otherwise a negative NFS4ERR value is returned if the operation
6770 * failed or the state ID is not currently valid. 7202 * failed or the state ID is not currently valid.
6771 */ 7203 */
6772static int nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid) 7204static int nfs41_test_stateid(struct nfs_server *server,
7205 nfs4_stateid *stateid,
7206 struct rpc_cred *cred)
6773{ 7207{
6774 struct nfs4_exception exception = { }; 7208 struct nfs4_exception exception = { };
6775 int err; 7209 int err;
6776 do { 7210 do {
6777 err = _nfs41_test_stateid(server, stateid); 7211 err = _nfs41_test_stateid(server, stateid, cred);
6778 if (err != -NFS4ERR_DELAY) 7212 if (err != -NFS4ERR_DELAY)
6779 break; 7213 break;
6780 nfs4_handle_exception(server, err, &exception); 7214 nfs4_handle_exception(server, err, &exception);
@@ -6823,10 +7257,12 @@ const struct rpc_call_ops nfs41_free_stateid_ops = {
6823 7257
6824static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server, 7258static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server,
6825 nfs4_stateid *stateid, 7259 nfs4_stateid *stateid,
7260 struct rpc_cred *cred,
6826 bool privileged) 7261 bool privileged)
6827{ 7262{
6828 struct rpc_message msg = { 7263 struct rpc_message msg = {
6829 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FREE_STATEID], 7264 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FREE_STATEID],
7265 .rpc_cred = cred,
6830 }; 7266 };
6831 struct rpc_task_setup task_setup = { 7267 struct rpc_task_setup task_setup = {
6832 .rpc_client = server->client, 7268 .rpc_client = server->client,
@@ -6859,16 +7295,19 @@ static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server,
6859 * 7295 *
6860 * @server: server / transport on which to perform the operation 7296 * @server: server / transport on which to perform the operation
6861 * @stateid: state ID to release 7297 * @stateid: state ID to release
7298 * @cred: credential
6862 * 7299 *
6863 * Returns NFS_OK if the server freed "stateid". Otherwise a 7300 * Returns NFS_OK if the server freed "stateid". Otherwise a
6864 * negative NFS4ERR value is returned. 7301 * negative NFS4ERR value is returned.
6865 */ 7302 */
6866static int nfs41_free_stateid(struct nfs_server *server, nfs4_stateid *stateid) 7303static int nfs41_free_stateid(struct nfs_server *server,
7304 nfs4_stateid *stateid,
7305 struct rpc_cred *cred)
6867{ 7306{
6868 struct rpc_task *task; 7307 struct rpc_task *task;
6869 int ret; 7308 int ret;
6870 7309
6871 task = _nfs41_free_stateid(server, stateid, true); 7310 task = _nfs41_free_stateid(server, stateid, cred, true);
6872 if (IS_ERR(task)) 7311 if (IS_ERR(task))
6873 return PTR_ERR(task); 7312 return PTR_ERR(task);
6874 ret = rpc_wait_for_completion_task(task); 7313 ret = rpc_wait_for_completion_task(task);
@@ -6881,8 +7320,9 @@ static int nfs41_free_stateid(struct nfs_server *server, nfs4_stateid *stateid)
6881static int nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp) 7320static int nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
6882{ 7321{
6883 struct rpc_task *task; 7322 struct rpc_task *task;
7323 struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
6884 7324
6885 task = _nfs41_free_stateid(server, &lsp->ls_stateid, false); 7325 task = _nfs41_free_stateid(server, &lsp->ls_stateid, cred, false);
6886 nfs4_free_lock_state(server, lsp); 7326 nfs4_free_lock_state(server, lsp);
6887 if (IS_ERR(task)) 7327 if (IS_ERR(task))
6888 return PTR_ERR(task); 7328 return PTR_ERR(task);
@@ -7004,11 +7444,33 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
7004}; 7444};
7005#endif 7445#endif
7006 7446
7447#if defined(CONFIG_NFS_V4_2)
7448static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
7449 .minor_version = 2,
7450 .init_caps = NFS_CAP_READDIRPLUS
7451 | NFS_CAP_ATOMIC_OPEN
7452 | NFS_CAP_CHANGE_ATTR
7453 | NFS_CAP_POSIX_LOCK
7454 | NFS_CAP_STATEID_NFSV41
7455 | NFS_CAP_ATOMIC_OPEN_V1,
7456 .call_sync = nfs4_call_sync_sequence,
7457 .match_stateid = nfs41_match_stateid,
7458 .find_root_sec = nfs41_find_root_sec,
7459 .free_lock_state = nfs41_free_lock_state,
7460 .reboot_recovery_ops = &nfs41_reboot_recovery_ops,
7461 .nograce_recovery_ops = &nfs41_nograce_recovery_ops,
7462 .state_renewal_ops = &nfs41_state_renewal_ops,
7463};
7464#endif
7465
7007const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = { 7466const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = {
7008 [0] = &nfs_v4_0_minor_ops, 7467 [0] = &nfs_v4_0_minor_ops,
7009#if defined(CONFIG_NFS_V4_1) 7468#if defined(CONFIG_NFS_V4_1)
7010 [1] = &nfs_v4_1_minor_ops, 7469 [1] = &nfs_v4_1_minor_ops,
7011#endif 7470#endif
7471#if defined(CONFIG_NFS_V4_2)
7472 [2] = &nfs_v4_2_minor_ops,
7473#endif
7012}; 7474};
7013 7475
7014const struct inode_operations nfs4_dir_inode_operations = { 7476const struct inode_operations nfs4_dir_inode_operations = {
@@ -7108,6 +7570,9 @@ static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = {
7108 7570
7109const struct xattr_handler *nfs4_xattr_handlers[] = { 7571const struct xattr_handler *nfs4_xattr_handlers[] = {
7110 &nfs4_xattr_nfs4_acl_handler, 7572 &nfs4_xattr_nfs4_acl_handler,
7573#ifdef CONFIG_NFS_V4_SECURITY_LABEL
7574 &nfs4_xattr_nfs4_label_handler,
7575#endif
7111 NULL 7576 NULL
7112}; 7577};
7113 7578
diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c
index c4e225e4a9af..36e21cb29d65 100644
--- a/fs/nfs/nfs4session.c
+++ b/fs/nfs/nfs4session.c
@@ -478,48 +478,12 @@ static int nfs41_check_session_ready(struct nfs_client *clp)
478 return 0; 478 return 0;
479} 479}
480 480
481int nfs4_init_session(struct nfs_server *server) 481int nfs4_init_session(struct nfs_client *clp)
482{ 482{
483 struct nfs_client *clp = server->nfs_client;
484 struct nfs4_session *session;
485 unsigned int target_max_rqst_sz = NFS_MAX_FILE_IO_SIZE;
486 unsigned int target_max_resp_sz = NFS_MAX_FILE_IO_SIZE;
487
488 if (!nfs4_has_session(clp)) 483 if (!nfs4_has_session(clp))
489 return 0; 484 return 0;
490 485
491 if (server->rsize != 0) 486 clear_bit(NFS4_SESSION_INITING, &clp->cl_session->session_state);
492 target_max_resp_sz = server->rsize;
493 target_max_resp_sz += nfs41_maxread_overhead;
494
495 if (server->wsize != 0)
496 target_max_rqst_sz = server->wsize;
497 target_max_rqst_sz += nfs41_maxwrite_overhead;
498
499 session = clp->cl_session;
500 spin_lock(&clp->cl_lock);
501 if (test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) {
502 /* Initialise targets and channel attributes */
503 session->fc_target_max_rqst_sz = target_max_rqst_sz;
504 session->fc_attrs.max_rqst_sz = target_max_rqst_sz;
505 session->fc_target_max_resp_sz = target_max_resp_sz;
506 session->fc_attrs.max_resp_sz = target_max_resp_sz;
507 } else {
508 /* Just adjust the targets */
509 if (target_max_rqst_sz > session->fc_target_max_rqst_sz) {
510 session->fc_target_max_rqst_sz = target_max_rqst_sz;
511 set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
512 }
513 if (target_max_resp_sz > session->fc_target_max_resp_sz) {
514 session->fc_target_max_resp_sz = target_max_resp_sz;
515 set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
516 }
517 }
518 spin_unlock(&clp->cl_lock);
519
520 if (test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state))
521 nfs4_schedule_lease_recovery(clp);
522
523 return nfs41_check_session_ready(clp); 487 return nfs41_check_session_ready(clp);
524} 488}
525 489
diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h
index ff7d9f0f8a65..3a153d82b90c 100644
--- a/fs/nfs/nfs4session.h
+++ b/fs/nfs/nfs4session.h
@@ -66,9 +66,6 @@ struct nfs4_session {
66 struct nfs4_channel_attrs bc_attrs; 66 struct nfs4_channel_attrs bc_attrs;
67 struct nfs4_slot_table bc_slot_table; 67 struct nfs4_slot_table bc_slot_table;
68 struct nfs_client *clp; 68 struct nfs_client *clp;
69 /* Create session arguments */
70 unsigned int fc_target_max_rqst_sz;
71 unsigned int fc_target_max_resp_sz;
72}; 69};
73 70
74enum nfs4_session_state { 71enum nfs4_session_state {
@@ -89,7 +86,7 @@ extern int nfs4_setup_session_slot_tables(struct nfs4_session *ses);
89 86
90extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp); 87extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp);
91extern void nfs4_destroy_session(struct nfs4_session *session); 88extern void nfs4_destroy_session(struct nfs4_session *session);
92extern int nfs4_init_session(struct nfs_server *server); 89extern int nfs4_init_session(struct nfs_client *clp);
93extern int nfs4_init_ds_session(struct nfs_client *, unsigned long); 90extern int nfs4_init_ds_session(struct nfs_client *, unsigned long);
94 91
95extern void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl); 92extern void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl);
@@ -122,7 +119,7 @@ static inline int nfs4_has_persistent_session(const struct nfs_client *clp)
122 119
123#else /* defined(CONFIG_NFS_V4_1) */ 120#else /* defined(CONFIG_NFS_V4_1) */
124 121
125static inline int nfs4_init_session(struct nfs_server *server) 122static inline int nfs4_init_session(struct nfs_client *clp)
126{ 123{
127 return 0; 124 return 0;
128} 125}
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 55418811a55a..e22862f13564 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -228,19 +228,8 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp)
228 return status; 228 return status;
229} 229}
230 230
231/* 231static void nfs4_end_drain_slot_table(struct nfs4_slot_table *tbl)
232 * Back channel returns NFS4ERR_DELAY for new requests when
233 * NFS4_SESSION_DRAINING is set so there is no work to be done when draining
234 * is ended.
235 */
236static void nfs4_end_drain_session(struct nfs_client *clp)
237{ 232{
238 struct nfs4_session *ses = clp->cl_session;
239 struct nfs4_slot_table *tbl;
240
241 if (ses == NULL)
242 return;
243 tbl = &ses->fc_slot_table;
244 if (test_and_clear_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) { 233 if (test_and_clear_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) {
245 spin_lock(&tbl->slot_tbl_lock); 234 spin_lock(&tbl->slot_tbl_lock);
246 nfs41_wake_slot_table(tbl); 235 nfs41_wake_slot_table(tbl);
@@ -248,6 +237,16 @@ static void nfs4_end_drain_session(struct nfs_client *clp)
248 } 237 }
249} 238}
250 239
240static void nfs4_end_drain_session(struct nfs_client *clp)
241{
242 struct nfs4_session *ses = clp->cl_session;
243
244 if (ses != NULL) {
245 nfs4_end_drain_slot_table(&ses->bc_slot_table);
246 nfs4_end_drain_slot_table(&ses->fc_slot_table);
247 }
248}
249
251/* 250/*
252 * Signal state manager thread if session fore channel is drained 251 * Signal state manager thread if session fore channel is drained
253 */ 252 */
@@ -1563,11 +1562,12 @@ static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp)
1563} 1562}
1564 1563
1565static void nfs4_reclaim_complete(struct nfs_client *clp, 1564static void nfs4_reclaim_complete(struct nfs_client *clp,
1566 const struct nfs4_state_recovery_ops *ops) 1565 const struct nfs4_state_recovery_ops *ops,
1566 struct rpc_cred *cred)
1567{ 1567{
1568 /* Notify the server we're done reclaiming our state */ 1568 /* Notify the server we're done reclaiming our state */
1569 if (ops->reclaim_complete) 1569 if (ops->reclaim_complete)
1570 (void)ops->reclaim_complete(clp); 1570 (void)ops->reclaim_complete(clp, cred);
1571} 1571}
1572 1572
1573static void nfs4_clear_reclaim_server(struct nfs_server *server) 1573static void nfs4_clear_reclaim_server(struct nfs_server *server)
@@ -1612,9 +1612,15 @@ static int nfs4_state_clear_reclaim_reboot(struct nfs_client *clp)
1612 1612
1613static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp) 1613static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
1614{ 1614{
1615 const struct nfs4_state_recovery_ops *ops;
1616 struct rpc_cred *cred;
1617
1615 if (!nfs4_state_clear_reclaim_reboot(clp)) 1618 if (!nfs4_state_clear_reclaim_reboot(clp))
1616 return; 1619 return;
1617 nfs4_reclaim_complete(clp, clp->cl_mvops->reboot_recovery_ops); 1620 ops = clp->cl_mvops->reboot_recovery_ops;
1621 cred = ops->get_clid_cred(clp);
1622 nfs4_reclaim_complete(clp, ops, cred);
1623 put_rpccred(cred);
1618} 1624}
1619 1625
1620static void nfs_delegation_clear_all(struct nfs_client *clp) 1626static void nfs_delegation_clear_all(struct nfs_client *clp)
diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c
index a5e1a3026d48..5dbe2d269210 100644
--- a/fs/nfs/nfs4super.c
+++ b/fs/nfs/nfs4super.c
@@ -9,6 +9,7 @@
9#include "delegation.h" 9#include "delegation.h"
10#include "internal.h" 10#include "internal.h"
11#include "nfs4_fs.h" 11#include "nfs4_fs.h"
12#include "dns_resolve.h"
12#include "pnfs.h" 13#include "pnfs.h"
13#include "nfs.h" 14#include "nfs.h"
14 15
@@ -331,18 +332,24 @@ static int __init init_nfs_v4(void)
331{ 332{
332 int err; 333 int err;
333 334
334 err = nfs_idmap_init(); 335 err = nfs_dns_resolver_init();
335 if (err) 336 if (err)
336 goto out; 337 goto out;
337 338
338 err = nfs4_register_sysctl(); 339 err = nfs_idmap_init();
339 if (err) 340 if (err)
340 goto out1; 341 goto out1;
341 342
343 err = nfs4_register_sysctl();
344 if (err)
345 goto out2;
346
342 register_nfs_version(&nfs_v4); 347 register_nfs_version(&nfs_v4);
343 return 0; 348 return 0;
344out1: 349out2:
345 nfs_idmap_quit(); 350 nfs_idmap_quit();
351out1:
352 nfs_dns_resolver_destroy();
346out: 353out:
347 return err; 354 return err;
348} 355}
@@ -352,6 +359,7 @@ static void __exit exit_nfs_v4(void)
352 unregister_nfs_version(&nfs_v4); 359 unregister_nfs_version(&nfs_v4);
353 nfs4_unregister_sysctl(); 360 nfs4_unregister_sysctl();
354 nfs_idmap_quit(); 361 nfs_idmap_quit();
362 nfs_dns_resolver_destroy();
355} 363}
356 364
357MODULE_LICENSE("GPL"); 365MODULE_LICENSE("GPL");
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 4be8d135ed61..3850b018815f 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -102,12 +102,23 @@ static int nfs4_stat_to_errno(int);
102#define nfs4_path_maxsz (1 + ((3 + NFS4_MAXPATHLEN) >> 2)) 102#define nfs4_path_maxsz (1 + ((3 + NFS4_MAXPATHLEN) >> 2))
103#define nfs4_owner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) 103#define nfs4_owner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ))
104#define nfs4_group_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) 104#define nfs4_group_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ))
105#ifdef CONFIG_NFS_V4_SECURITY_LABEL
106/* PI(4 bytes) + LFS(4 bytes) + 1(for null terminator?) + MAXLABELLEN */
107#define nfs4_label_maxsz (4 + 4 + 1 + XDR_QUADLEN(NFS4_MAXLABELLEN))
108#define encode_readdir_space 24
109#define encode_readdir_bitmask_sz 3
110#else
111#define nfs4_label_maxsz 0
112#define encode_readdir_space 20
113#define encode_readdir_bitmask_sz 2
114#endif
105/* We support only one layout type per file system */ 115/* We support only one layout type per file system */
106#define decode_mdsthreshold_maxsz (1 + 1 + nfs4_fattr_bitmap_maxsz + 1 + 8) 116#define decode_mdsthreshold_maxsz (1 + 1 + nfs4_fattr_bitmap_maxsz + 1 + 8)
107/* This is based on getfattr, which uses the most attributes: */ 117/* This is based on getfattr, which uses the most attributes: */
108#define nfs4_fattr_value_maxsz (1 + (1 + 2 + 2 + 4 + 2 + 1 + 1 + 2 + 2 + \ 118#define nfs4_fattr_value_maxsz (1 + (1 + 2 + 2 + 4 + 2 + 1 + 1 + 2 + 2 + \
109 3 + 3 + 3 + nfs4_owner_maxsz + \ 119 3 + 3 + 3 + nfs4_owner_maxsz + \
110 nfs4_group_maxsz + decode_mdsthreshold_maxsz)) 120 nfs4_group_maxsz + nfs4_label_maxsz + \
121 decode_mdsthreshold_maxsz))
111#define nfs4_fattr_maxsz (nfs4_fattr_bitmap_maxsz + \ 122#define nfs4_fattr_maxsz (nfs4_fattr_bitmap_maxsz + \
112 nfs4_fattr_value_maxsz) 123 nfs4_fattr_value_maxsz)
113#define decode_getattr_maxsz (op_decode_hdr_maxsz + nfs4_fattr_maxsz) 124#define decode_getattr_maxsz (op_decode_hdr_maxsz + nfs4_fattr_maxsz)
@@ -115,6 +126,7 @@ static int nfs4_stat_to_errno(int);
115 1 + 2 + 1 + \ 126 1 + 2 + 1 + \
116 nfs4_owner_maxsz + \ 127 nfs4_owner_maxsz + \
117 nfs4_group_maxsz + \ 128 nfs4_group_maxsz + \
129 nfs4_label_maxsz + \
118 4 + 4) 130 4 + 4)
119#define encode_savefh_maxsz (op_encode_hdr_maxsz) 131#define encode_savefh_maxsz (op_encode_hdr_maxsz)
120#define decode_savefh_maxsz (op_decode_hdr_maxsz) 132#define decode_savefh_maxsz (op_decode_hdr_maxsz)
@@ -192,9 +204,11 @@ static int nfs4_stat_to_errno(int);
192 encode_stateid_maxsz + 3) 204 encode_stateid_maxsz + 3)
193#define decode_read_maxsz (op_decode_hdr_maxsz + 2) 205#define decode_read_maxsz (op_decode_hdr_maxsz + 2)
194#define encode_readdir_maxsz (op_encode_hdr_maxsz + \ 206#define encode_readdir_maxsz (op_encode_hdr_maxsz + \
195 2 + encode_verifier_maxsz + 5) 207 2 + encode_verifier_maxsz + 5 + \
208 nfs4_label_maxsz)
196#define decode_readdir_maxsz (op_decode_hdr_maxsz + \ 209#define decode_readdir_maxsz (op_decode_hdr_maxsz + \
197 decode_verifier_maxsz) 210 decode_verifier_maxsz + \
211 nfs4_label_maxsz + nfs4_fattr_maxsz)
198#define encode_readlink_maxsz (op_encode_hdr_maxsz) 212#define encode_readlink_maxsz (op_encode_hdr_maxsz)
199#define decode_readlink_maxsz (op_decode_hdr_maxsz + 1) 213#define decode_readlink_maxsz (op_decode_hdr_maxsz + 1)
200#define encode_write_maxsz (op_encode_hdr_maxsz + \ 214#define encode_write_maxsz (op_encode_hdr_maxsz + \
@@ -853,6 +867,12 @@ const u32 nfs41_maxread_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
853 decode_sequence_maxsz + 867 decode_sequence_maxsz +
854 decode_putfh_maxsz) * 868 decode_putfh_maxsz) *
855 XDR_UNIT); 869 XDR_UNIT);
870
871const u32 nfs41_maxgetdevinfo_overhead = ((RPC_MAX_REPHEADER_WITH_AUTH +
872 compound_decode_hdr_maxsz +
873 decode_sequence_maxsz) *
874 XDR_UNIT);
875EXPORT_SYMBOL_GPL(nfs41_maxgetdevinfo_overhead);
856#endif /* CONFIG_NFS_V4_1 */ 876#endif /* CONFIG_NFS_V4_1 */
857 877
858static const umode_t nfs_type2fmt[] = { 878static const umode_t nfs_type2fmt[] = {
@@ -968,7 +988,9 @@ static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *ve
968 encode_opaque_fixed(xdr, verf->data, NFS4_VERIFIER_SIZE); 988 encode_opaque_fixed(xdr, verf->data, NFS4_VERIFIER_SIZE);
969} 989}
970 990
971static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const struct nfs_server *server) 991static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
992 const struct nfs4_label *label,
993 const struct nfs_server *server)
972{ 994{
973 char owner_name[IDMAP_NAMESZ]; 995 char owner_name[IDMAP_NAMESZ];
974 char owner_group[IDMAP_NAMESZ]; 996 char owner_group[IDMAP_NAMESZ];
@@ -977,17 +999,19 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
977 __be32 *p; 999 __be32 *p;
978 __be32 *q; 1000 __be32 *q;
979 int len; 1001 int len;
1002 uint32_t bmval_len = 2;
980 uint32_t bmval0 = 0; 1003 uint32_t bmval0 = 0;
981 uint32_t bmval1 = 0; 1004 uint32_t bmval1 = 0;
1005 uint32_t bmval2 = 0;
982 1006
983 /* 1007 /*
984 * We reserve enough space to write the entire attribute buffer at once. 1008 * We reserve enough space to write the entire attribute buffer at once.
985 * In the worst-case, this would be 1009 * In the worst-case, this would be
986 * 12(bitmap) + 4(attrlen) + 8(size) + 4(mode) + 4(atime) + 4(mtime) 1010 * 16(bitmap) + 4(attrlen) + 8(size) + 4(mode) + 4(atime) + 4(mtime)
987 * = 36 bytes, plus any contribution from variable-length fields 1011 * = 40 bytes, plus any contribution from variable-length fields
988 * such as owner/group. 1012 * such as owner/group.
989 */ 1013 */
990 len = 16; 1014 len = 8;
991 1015
992 /* Sigh */ 1016 /* Sigh */
993 if (iap->ia_valid & ATTR_SIZE) 1017 if (iap->ia_valid & ATTR_SIZE)
@@ -1025,15 +1049,22 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
1025 len += 16; 1049 len += 16;
1026 else if (iap->ia_valid & ATTR_MTIME) 1050 else if (iap->ia_valid & ATTR_MTIME)
1027 len += 4; 1051 len += 4;
1052 if (label) {
1053 len += 4 + 4 + 4 + (XDR_QUADLEN(label->len) << 2);
1054 bmval_len = 3;
1055 }
1056
1057 len += bmval_len << 2;
1028 p = reserve_space(xdr, len); 1058 p = reserve_space(xdr, len);
1029 1059
1030 /* 1060 /*
1031 * We write the bitmap length now, but leave the bitmap and the attribute 1061 * We write the bitmap length now, but leave the bitmap and the attribute
1032 * buffer length to be backfilled at the end of this routine. 1062 * buffer length to be backfilled at the end of this routine.
1033 */ 1063 */
1034 *p++ = cpu_to_be32(2); 1064 *p++ = cpu_to_be32(bmval_len);
1035 q = p; 1065 q = p;
1036 p += 3; 1066 /* Skip bitmap entries + attrlen */
1067 p += bmval_len + 1;
1037 1068
1038 if (iap->ia_valid & ATTR_SIZE) { 1069 if (iap->ia_valid & ATTR_SIZE) {
1039 bmval0 |= FATTR4_WORD0_SIZE; 1070 bmval0 |= FATTR4_WORD0_SIZE;
@@ -1071,6 +1102,13 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
1071 bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET; 1102 bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET;
1072 *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME); 1103 *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME);
1073 } 1104 }
1105 if (label) {
1106 bmval2 |= FATTR4_WORD2_SECURITY_LABEL;
1107 *p++ = cpu_to_be32(label->lfs);
1108 *p++ = cpu_to_be32(label->pi);
1109 *p++ = cpu_to_be32(label->len);
1110 p = xdr_encode_opaque_fixed(p, label->label, label->len);
1111 }
1074 1112
1075 /* 1113 /*
1076 * Now we backfill the bitmap and the attribute buffer length. 1114 * Now we backfill the bitmap and the attribute buffer length.
@@ -1080,9 +1118,11 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
1080 len, ((char *)p - (char *)q) + 4); 1118 len, ((char *)p - (char *)q) + 4);
1081 BUG(); 1119 BUG();
1082 } 1120 }
1083 len = (char *)p - (char *)q - 12;
1084 *q++ = htonl(bmval0); 1121 *q++ = htonl(bmval0);
1085 *q++ = htonl(bmval1); 1122 *q++ = htonl(bmval1);
1123 if (bmval_len == 3)
1124 *q++ = htonl(bmval2);
1125 len = (char *)p - (char *)(q + 1);
1086 *q = htonl(len); 1126 *q = htonl(len);
1087 1127
1088/* out: */ 1128/* out: */
@@ -1136,7 +1176,7 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
1136 } 1176 }
1137 1177
1138 encode_string(xdr, create->name->len, create->name->name); 1178 encode_string(xdr, create->name->len, create->name->name);
1139 encode_attrs(xdr, create->attrs, create->server); 1179 encode_attrs(xdr, create->attrs, create->label, create->server);
1140} 1180}
1141 1181
1142static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct compound_hdr *hdr) 1182static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct compound_hdr *hdr)
@@ -1188,8 +1228,10 @@ encode_getattr_three(struct xdr_stream *xdr,
1188 1228
1189static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) 1229static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
1190{ 1230{
1191 encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0], 1231 encode_getattr_three(xdr, bitmask[0] & nfs4_fattr_bitmap[0],
1192 bitmask[1] & nfs4_fattr_bitmap[1], hdr); 1232 bitmask[1] & nfs4_fattr_bitmap[1],
1233 bitmask[2] & nfs4_fattr_bitmap[2],
1234 hdr);
1193} 1235}
1194 1236
1195static void encode_getfattr_open(struct xdr_stream *xdr, const u32 *bitmask, 1237static void encode_getfattr_open(struct xdr_stream *xdr, const u32 *bitmask,
@@ -1367,11 +1409,11 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
1367 switch(arg->createmode) { 1409 switch(arg->createmode) {
1368 case NFS4_CREATE_UNCHECKED: 1410 case NFS4_CREATE_UNCHECKED:
1369 *p = cpu_to_be32(NFS4_CREATE_UNCHECKED); 1411 *p = cpu_to_be32(NFS4_CREATE_UNCHECKED);
1370 encode_attrs(xdr, arg->u.attrs, arg->server); 1412 encode_attrs(xdr, arg->u.attrs, arg->label, arg->server);
1371 break; 1413 break;
1372 case NFS4_CREATE_GUARDED: 1414 case NFS4_CREATE_GUARDED:
1373 *p = cpu_to_be32(NFS4_CREATE_GUARDED); 1415 *p = cpu_to_be32(NFS4_CREATE_GUARDED);
1374 encode_attrs(xdr, arg->u.attrs, arg->server); 1416 encode_attrs(xdr, arg->u.attrs, arg->label, arg->server);
1375 break; 1417 break;
1376 case NFS4_CREATE_EXCLUSIVE: 1418 case NFS4_CREATE_EXCLUSIVE:
1377 *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE); 1419 *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE);
@@ -1381,7 +1423,7 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
1381 *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1); 1423 *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1);
1382 encode_nfs4_verifier(xdr, &arg->u.verifier); 1424 encode_nfs4_verifier(xdr, &arg->u.verifier);
1383 dummy.ia_valid = 0; 1425 dummy.ia_valid = 0;
1384 encode_attrs(xdr, &dummy, arg->server); 1426 encode_attrs(xdr, &dummy, arg->label, arg->server);
1385 } 1427 }
1386} 1428}
1387 1429
@@ -1532,7 +1574,7 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args,
1532 1574
1533static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr) 1575static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr)
1534{ 1576{
1535 uint32_t attrs[2] = { 1577 uint32_t attrs[3] = {
1536 FATTR4_WORD0_RDATTR_ERROR, 1578 FATTR4_WORD0_RDATTR_ERROR,
1537 FATTR4_WORD1_MOUNTED_ON_FILEID, 1579 FATTR4_WORD1_MOUNTED_ON_FILEID,
1538 }; 1580 };
@@ -1555,20 +1597,26 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
1555 encode_op_hdr(xdr, OP_READDIR, decode_readdir_maxsz, hdr); 1597 encode_op_hdr(xdr, OP_READDIR, decode_readdir_maxsz, hdr);
1556 encode_uint64(xdr, readdir->cookie); 1598 encode_uint64(xdr, readdir->cookie);
1557 encode_nfs4_verifier(xdr, &readdir->verifier); 1599 encode_nfs4_verifier(xdr, &readdir->verifier);
1558 p = reserve_space(xdr, 20); 1600 p = reserve_space(xdr, encode_readdir_space);
1559 *p++ = cpu_to_be32(dircount); 1601 *p++ = cpu_to_be32(dircount);
1560 *p++ = cpu_to_be32(readdir->count); 1602 *p++ = cpu_to_be32(readdir->count);
1561 *p++ = cpu_to_be32(2); 1603 *p++ = cpu_to_be32(encode_readdir_bitmask_sz);
1562
1563 *p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]); 1604 *p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]);
1564 *p = cpu_to_be32(attrs[1] & readdir->bitmask[1]); 1605 *p = cpu_to_be32(attrs[1] & readdir->bitmask[1]);
1606 if (encode_readdir_bitmask_sz > 2) {
1607 if (hdr->minorversion > 1)
1608 attrs[2] |= FATTR4_WORD2_SECURITY_LABEL;
1609 p++, *p++ = cpu_to_be32(attrs[2] & readdir->bitmask[2]);
1610 }
1565 memcpy(verf, readdir->verifier.data, sizeof(verf)); 1611 memcpy(verf, readdir->verifier.data, sizeof(verf));
1566 dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n", 1612
1613 dprintk("%s: cookie = %llu, verifier = %08x:%08x, bitmap = %08x:%08x:%08x\n",
1567 __func__, 1614 __func__,
1568 (unsigned long long)readdir->cookie, 1615 (unsigned long long)readdir->cookie,
1569 verf[0], verf[1], 1616 verf[0], verf[1],
1570 attrs[0] & readdir->bitmask[0], 1617 attrs[0] & readdir->bitmask[0],
1571 attrs[1] & readdir->bitmask[1]); 1618 attrs[1] & readdir->bitmask[1],
1619 attrs[2] & readdir->bitmask[2]);
1572} 1620}
1573 1621
1574static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *readlink, struct rpc_rqst *req, struct compound_hdr *hdr) 1622static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *readlink, struct rpc_rqst *req, struct compound_hdr *hdr)
@@ -1627,7 +1675,7 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs
1627{ 1675{
1628 encode_op_hdr(xdr, OP_SETATTR, decode_setattr_maxsz, hdr); 1676 encode_op_hdr(xdr, OP_SETATTR, decode_setattr_maxsz, hdr);
1629 encode_nfs4_stateid(xdr, &arg->stateid); 1677 encode_nfs4_stateid(xdr, &arg->stateid);
1630 encode_attrs(xdr, arg->iap, server); 1678 encode_attrs(xdr, arg->iap, arg->label, server);
1631} 1679}
1632 1680
1633static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid, struct compound_hdr *hdr) 1681static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid, struct compound_hdr *hdr)
@@ -1889,7 +1937,7 @@ encode_getdeviceinfo(struct xdr_stream *xdr,
1889 p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data, 1937 p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data,
1890 NFS4_DEVICEID4_SIZE); 1938 NFS4_DEVICEID4_SIZE);
1891 *p++ = cpu_to_be32(args->pdev->layout_type); 1939 *p++ = cpu_to_be32(args->pdev->layout_type);
1892 *p++ = cpu_to_be32(args->pdev->pglen); /* gdia_maxcount */ 1940 *p++ = cpu_to_be32(args->pdev->maxcount); /* gdia_maxcount */
1893 *p++ = cpu_to_be32(0); /* bitmap length 0 */ 1941 *p++ = cpu_to_be32(0); /* bitmap length 0 */
1894} 1942}
1895 1943
@@ -4038,6 +4086,56 @@ static int decode_attr_time_delta(struct xdr_stream *xdr, uint32_t *bitmap,
4038 return status; 4086 return status;
4039} 4087}
4040 4088
4089static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap,
4090 struct nfs4_label *label)
4091{
4092 uint32_t pi = 0;
4093 uint32_t lfs = 0;
4094 __u32 len;
4095 __be32 *p;
4096 int status = 0;
4097
4098 if (unlikely(bitmap[2] & (FATTR4_WORD2_SECURITY_LABEL - 1U)))
4099 return -EIO;
4100 if (likely(bitmap[2] & FATTR4_WORD2_SECURITY_LABEL)) {
4101 p = xdr_inline_decode(xdr, 4);
4102 if (unlikely(!p))
4103 goto out_overflow;
4104 lfs = be32_to_cpup(p++);
4105 p = xdr_inline_decode(xdr, 4);
4106 if (unlikely(!p))
4107 goto out_overflow;
4108 pi = be32_to_cpup(p++);
4109 p = xdr_inline_decode(xdr, 4);
4110 if (unlikely(!p))
4111 goto out_overflow;
4112 len = be32_to_cpup(p++);
4113 p = xdr_inline_decode(xdr, len);
4114 if (unlikely(!p))
4115 goto out_overflow;
4116 if (len < NFS4_MAXLABELLEN) {
4117 if (label) {
4118 memcpy(label->label, p, len);
4119 label->len = len;
4120 label->pi = pi;
4121 label->lfs = lfs;
4122 status = NFS_ATTR_FATTR_V4_SECURITY_LABEL;
4123 }
4124 bitmap[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
4125 } else
4126 printk(KERN_WARNING "%s: label too long (%u)!\n",
4127 __func__, len);
4128 }
4129 if (label && label->label)
4130 dprintk("%s: label=%s, len=%d, PI=%d, LFS=%d\n", __func__,
4131 (char *)label->label, label->len, label->pi, label->lfs);
4132 return status;
4133
4134out_overflow:
4135 print_overflow_msg(__func__, xdr);
4136 return -EIO;
4137}
4138
4041static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time) 4139static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time)
4042{ 4140{
4043 int status = 0; 4141 int status = 0;
@@ -4380,7 +4478,7 @@ out_overflow:
4380 4478
4381static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap, 4479static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
4382 struct nfs_fattr *fattr, struct nfs_fh *fh, 4480 struct nfs_fattr *fattr, struct nfs_fh *fh,
4383 struct nfs4_fs_locations *fs_loc, 4481 struct nfs4_fs_locations *fs_loc, struct nfs4_label *label,
4384 const struct nfs_server *server) 4482 const struct nfs_server *server)
4385{ 4483{
4386 int status; 4484 int status;
@@ -4488,6 +4586,13 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
4488 if (status < 0) 4586 if (status < 0)
4489 goto xdr_error; 4587 goto xdr_error;
4490 4588
4589 if (label) {
4590 status = decode_attr_security_label(xdr, bitmap, label);
4591 if (status < 0)
4592 goto xdr_error;
4593 fattr->valid |= status;
4594 }
4595
4491xdr_error: 4596xdr_error:
4492 dprintk("%s: xdr returned %d\n", __func__, -status); 4597 dprintk("%s: xdr returned %d\n", __func__, -status);
4493 return status; 4598 return status;
@@ -4495,7 +4600,7 @@ xdr_error:
4495 4600
4496static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fattr, 4601static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fattr,
4497 struct nfs_fh *fh, struct nfs4_fs_locations *fs_loc, 4602 struct nfs_fh *fh, struct nfs4_fs_locations *fs_loc,
4498 const struct nfs_server *server) 4603 struct nfs4_label *label, const struct nfs_server *server)
4499{ 4604{
4500 unsigned int savep; 4605 unsigned int savep;
4501 uint32_t attrlen, 4606 uint32_t attrlen,
@@ -4514,7 +4619,8 @@ static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fat
4514 if (status < 0) 4619 if (status < 0)
4515 goto xdr_error; 4620 goto xdr_error;
4516 4621
4517 status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, fs_loc, server); 4622 status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, fs_loc,
4623 label, server);
4518 if (status < 0) 4624 if (status < 0)
4519 goto xdr_error; 4625 goto xdr_error;
4520 4626
@@ -4524,10 +4630,16 @@ xdr_error:
4524 return status; 4630 return status;
4525} 4631}
4526 4632
4633static int decode_getfattr_label(struct xdr_stream *xdr, struct nfs_fattr *fattr,
4634 struct nfs4_label *label, const struct nfs_server *server)
4635{
4636 return decode_getfattr_generic(xdr, fattr, NULL, NULL, label, server);
4637}
4638
4527static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, 4639static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
4528 const struct nfs_server *server) 4640 const struct nfs_server *server)
4529{ 4641{
4530 return decode_getfattr_generic(xdr, fattr, NULL, NULL, server); 4642 return decode_getfattr_generic(xdr, fattr, NULL, NULL, NULL, server);
4531} 4643}
4532 4644
4533/* 4645/*
@@ -5919,7 +6031,7 @@ static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5919 status = decode_getfh(xdr, res->fh); 6031 status = decode_getfh(xdr, res->fh);
5920 if (status) 6032 if (status)
5921 goto out; 6033 goto out;
5922 status = decode_getfattr(xdr, res->fattr, res->server); 6034 status = decode_getfattr_label(xdr, res->fattr, res->label, res->server);
5923out: 6035out:
5924 return status; 6036 return status;
5925} 6037}
@@ -5945,7 +6057,8 @@ static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp,
5945 goto out; 6057 goto out;
5946 status = decode_getfh(xdr, res->fh); 6058 status = decode_getfh(xdr, res->fh);
5947 if (status == 0) 6059 if (status == 0)
5948 status = decode_getfattr(xdr, res->fattr, res->server); 6060 status = decode_getfattr_label(xdr, res->fattr,
6061 res->label, res->server);
5949out: 6062out:
5950 return status; 6063 return status;
5951} 6064}
@@ -6036,7 +6149,7 @@ static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
6036 status = decode_restorefh(xdr); 6149 status = decode_restorefh(xdr);
6037 if (status) 6150 if (status)
6038 goto out; 6151 goto out;
6039 decode_getfattr(xdr, res->fattr, res->server); 6152 decode_getfattr_label(xdr, res->fattr, res->label, res->server);
6040out: 6153out:
6041 return status; 6154 return status;
6042} 6155}
@@ -6065,7 +6178,7 @@ static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
6065 status = decode_getfh(xdr, res->fh); 6178 status = decode_getfh(xdr, res->fh);
6066 if (status) 6179 if (status)
6067 goto out; 6180 goto out;
6068 decode_getfattr(xdr, res->fattr, res->server); 6181 decode_getfattr_label(xdr, res->fattr, res->label, res->server);
6069out: 6182out:
6070 return status; 6183 return status;
6071} 6184}
@@ -6097,7 +6210,7 @@ static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
6097 status = decode_putfh(xdr); 6210 status = decode_putfh(xdr);
6098 if (status) 6211 if (status)
6099 goto out; 6212 goto out;
6100 status = decode_getfattr(xdr, res->fattr, res->server); 6213 status = decode_getfattr_label(xdr, res->fattr, res->label, res->server);
6101out: 6214out:
6102 return status; 6215 return status;
6103} 6216}
@@ -6230,7 +6343,7 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
6230 goto out; 6343 goto out;
6231 if (res->access_request) 6344 if (res->access_request)
6232 decode_access(xdr, &res->access_supported, &res->access_result); 6345 decode_access(xdr, &res->access_supported, &res->access_result);
6233 decode_getfattr(xdr, res->f_attr, res->server); 6346 decode_getfattr_label(xdr, res->f_attr, res->f_label, res->server);
6234out: 6347out:
6235 return status; 6348 return status;
6236} 6349}
@@ -6307,7 +6420,7 @@ static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp,
6307 status = decode_setattr(xdr); 6420 status = decode_setattr(xdr);
6308 if (status) 6421 if (status)
6309 goto out; 6422 goto out;
6310 decode_getfattr(xdr, res->fattr, res->server); 6423 decode_getfattr_label(xdr, res->fattr, res->label, res->server);
6311out: 6424out:
6312 return status; 6425 return status;
6313} 6426}
@@ -6696,7 +6809,7 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req,
6696 xdr_enter_page(xdr, PAGE_SIZE); 6809 xdr_enter_page(xdr, PAGE_SIZE);
6697 status = decode_getfattr_generic(xdr, &res->fs_locations->fattr, 6810 status = decode_getfattr_generic(xdr, &res->fs_locations->fattr,
6698 NULL, res->fs_locations, 6811 NULL, res->fs_locations,
6699 res->fs_locations->server); 6812 NULL, res->fs_locations->server);
6700out: 6813out:
6701 return status; 6814 return status;
6702} 6815}
@@ -7109,7 +7222,7 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
7109 goto out_overflow; 7222 goto out_overflow;
7110 7223
7111 if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh, 7224 if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh,
7112 NULL, entry->server) < 0) 7225 NULL, entry->label, entry->server) < 0)
7113 goto out_overflow; 7226 goto out_overflow;
7114 if (entry->fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) 7227 if (entry->fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID)
7115 entry->ino = entry->fattr->mounted_on_fileid; 7228 entry->ino = entry->fattr->mounted_on_fileid;
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index a9ebd817278b..e4f9cbfec67b 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -613,8 +613,10 @@ int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
613 pd.pgbase = 0; 613 pd.pgbase = 0;
614 pd.pglen = PAGE_SIZE; 614 pd.pglen = PAGE_SIZE;
615 pd.mincount = 0; 615 pd.mincount = 0;
616 pd.maxcount = PAGE_SIZE;
616 617
617 err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd); 618 err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd,
619 pnfslay->plh_lc_cred);
618 dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err); 620 dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err);
619 if (err) 621 if (err)
620 goto err_out; 622 goto err_out;
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index c5bd758e5637..3a3a79d6bf15 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -360,7 +360,7 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg)
360} 360}
361EXPORT_SYMBOL_GPL(pnfs_put_lseg); 361EXPORT_SYMBOL_GPL(pnfs_put_lseg);
362 362
363static inline u64 363static u64
364end_offset(u64 start, u64 len) 364end_offset(u64 start, u64 len)
365{ 365{
366 u64 end; 366 u64 end;
@@ -376,9 +376,9 @@ end_offset(u64 start, u64 len)
376 * start2 end2 376 * start2 end2
377 * [----------------) 377 * [----------------)
378 */ 378 */
379static inline int 379static bool
380lo_seg_contained(struct pnfs_layout_range *l1, 380pnfs_lseg_range_contained(const struct pnfs_layout_range *l1,
381 struct pnfs_layout_range *l2) 381 const struct pnfs_layout_range *l2)
382{ 382{
383 u64 start1 = l1->offset; 383 u64 start1 = l1->offset;
384 u64 end1 = end_offset(start1, l1->length); 384 u64 end1 = end_offset(start1, l1->length);
@@ -395,9 +395,9 @@ lo_seg_contained(struct pnfs_layout_range *l1,
395 * start2 end2 395 * start2 end2
396 * [----------------) 396 * [----------------)
397 */ 397 */
398static inline int 398static bool
399lo_seg_intersecting(struct pnfs_layout_range *l1, 399pnfs_lseg_range_intersecting(const struct pnfs_layout_range *l1,
400 struct pnfs_layout_range *l2) 400 const struct pnfs_layout_range *l2)
401{ 401{
402 u64 start1 = l1->offset; 402 u64 start1 = l1->offset;
403 u64 end1 = end_offset(start1, l1->length); 403 u64 end1 = end_offset(start1, l1->length);
@@ -409,12 +409,12 @@ lo_seg_intersecting(struct pnfs_layout_range *l1,
409} 409}
410 410
411static bool 411static bool
412should_free_lseg(struct pnfs_layout_range *lseg_range, 412should_free_lseg(const struct pnfs_layout_range *lseg_range,
413 struct pnfs_layout_range *recall_range) 413 const struct pnfs_layout_range *recall_range)
414{ 414{
415 return (recall_range->iomode == IOMODE_ANY || 415 return (recall_range->iomode == IOMODE_ANY ||
416 lseg_range->iomode == recall_range->iomode) && 416 lseg_range->iomode == recall_range->iomode) &&
417 lo_seg_intersecting(lseg_range, recall_range); 417 pnfs_lseg_range_intersecting(lseg_range, recall_range);
418} 418}
419 419
420static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg, 420static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg,
@@ -766,6 +766,7 @@ send_layoutget(struct pnfs_layout_hdr *lo,
766 lgp->args.inode = ino; 766 lgp->args.inode = ino;
767 lgp->args.ctx = get_nfs_open_context(ctx); 767 lgp->args.ctx = get_nfs_open_context(ctx);
768 lgp->gfp_flags = gfp_flags; 768 lgp->gfp_flags = gfp_flags;
769 lgp->cred = lo->plh_lc_cred;
769 770
770 /* Synchronously retrieve layout information from server and 771 /* Synchronously retrieve layout information from server and
771 * store in lseg. 772 * store in lseg.
@@ -860,6 +861,7 @@ _pnfs_return_layout(struct inode *ino)
860 lrp->args.inode = ino; 861 lrp->args.inode = ino;
861 lrp->args.layout = lo; 862 lrp->args.layout = lo;
862 lrp->clp = NFS_SERVER(ino)->nfs_client; 863 lrp->clp = NFS_SERVER(ino)->nfs_client;
864 lrp->cred = lo->plh_lc_cred;
863 865
864 status = nfs4_proc_layoutreturn(lrp); 866 status = nfs4_proc_layoutreturn(lrp);
865out: 867out:
@@ -984,8 +986,8 @@ out:
984 * are seen first. 986 * are seen first.
985 */ 987 */
986static s64 988static s64
987cmp_layout(struct pnfs_layout_range *l1, 989pnfs_lseg_range_cmp(const struct pnfs_layout_range *l1,
988 struct pnfs_layout_range *l2) 990 const struct pnfs_layout_range *l2)
989{ 991{
990 s64 d; 992 s64 d;
991 993
@@ -1012,7 +1014,7 @@ pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo,
1012 dprintk("%s:Begin\n", __func__); 1014 dprintk("%s:Begin\n", __func__);
1013 1015
1014 list_for_each_entry(lp, &lo->plh_segs, pls_list) { 1016 list_for_each_entry(lp, &lo->plh_segs, pls_list) {
1015 if (cmp_layout(&lseg->pls_range, &lp->pls_range) > 0) 1017 if (pnfs_lseg_range_cmp(&lseg->pls_range, &lp->pls_range) > 0)
1016 continue; 1018 continue;
1017 list_add_tail(&lseg->pls_list, &lp->pls_list); 1019 list_add_tail(&lseg->pls_list, &lp->pls_list);
1018 dprintk("%s: inserted lseg %p " 1020 dprintk("%s: inserted lseg %p "
@@ -1050,7 +1052,7 @@ alloc_init_layout_hdr(struct inode *ino,
1050 INIT_LIST_HEAD(&lo->plh_segs); 1052 INIT_LIST_HEAD(&lo->plh_segs);
1051 INIT_LIST_HEAD(&lo->plh_bulk_destroy); 1053 INIT_LIST_HEAD(&lo->plh_bulk_destroy);
1052 lo->plh_inode = ino; 1054 lo->plh_inode = ino;
1053 lo->plh_lc_cred = get_rpccred(ctx->state->owner->so_cred); 1055 lo->plh_lc_cred = get_rpccred(ctx->cred);
1054 return lo; 1056 return lo;
1055} 1057}
1056 1058
@@ -1091,21 +1093,21 @@ out_existing:
1091 * READ READ true 1093 * READ READ true
1092 * READ RW true 1094 * READ RW true
1093 */ 1095 */
1094static int 1096static bool
1095is_matching_lseg(struct pnfs_layout_range *ls_range, 1097pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range,
1096 struct pnfs_layout_range *range) 1098 const struct pnfs_layout_range *range)
1097{ 1099{
1098 struct pnfs_layout_range range1; 1100 struct pnfs_layout_range range1;
1099 1101
1100 if ((range->iomode == IOMODE_RW && 1102 if ((range->iomode == IOMODE_RW &&
1101 ls_range->iomode != IOMODE_RW) || 1103 ls_range->iomode != IOMODE_RW) ||
1102 !lo_seg_intersecting(ls_range, range)) 1104 !pnfs_lseg_range_intersecting(ls_range, range))
1103 return 0; 1105 return 0;
1104 1106
1105 /* range1 covers only the first byte in the range */ 1107 /* range1 covers only the first byte in the range */
1106 range1 = *range; 1108 range1 = *range;
1107 range1.length = 1; 1109 range1.length = 1;
1108 return lo_seg_contained(ls_range, &range1); 1110 return pnfs_lseg_range_contained(ls_range, &range1);
1109} 1111}
1110 1112
1111/* 1113/*
@@ -1121,7 +1123,7 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo,
1121 1123
1122 list_for_each_entry(lseg, &lo->plh_segs, pls_list) { 1124 list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
1123 if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) && 1125 if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
1124 is_matching_lseg(&lseg->pls_range, range)) { 1126 pnfs_lseg_range_match(&lseg->pls_range, range)) {
1125 ret = pnfs_get_lseg(lseg); 1127 ret = pnfs_get_lseg(lseg);
1126 break; 1128 break;
1127 } 1129 }
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index f5f8a470a647..a4f41810a7f4 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -149,9 +149,10 @@ struct pnfs_device {
149 struct nfs4_deviceid dev_id; 149 struct nfs4_deviceid dev_id;
150 unsigned int layout_type; 150 unsigned int layout_type;
151 unsigned int mincount; 151 unsigned int mincount;
152 unsigned int maxcount; /* gdia_maxcount */
152 struct page **pages; 153 struct page **pages;
153 unsigned int pgbase; 154 unsigned int pgbase;
154 unsigned int pglen; 155 unsigned int pglen; /* reply buffer length */
155}; 156};
156 157
157#define NFS4_PNFS_GETDEVLIST_MAXNUM 16 158#define NFS4_PNFS_GETDEVLIST_MAXNUM 16
@@ -170,7 +171,8 @@ extern int nfs4_proc_getdevicelist(struct nfs_server *server,
170 const struct nfs_fh *fh, 171 const struct nfs_fh *fh,
171 struct pnfs_devicelist *devlist); 172 struct pnfs_devicelist *devlist);
172extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, 173extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
173 struct pnfs_device *dev); 174 struct pnfs_device *dev,
175 struct rpc_cred *cred);
174extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags); 176extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags);
175extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp); 177extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp);
176 178
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index fc8de9016acf..c041c41f7a52 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -98,7 +98,7 @@ nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
98 */ 98 */
99static int 99static int
100nfs_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, 100nfs_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
101 struct nfs_fattr *fattr) 101 struct nfs_fattr *fattr, struct nfs4_label *label)
102{ 102{
103 struct rpc_message msg = { 103 struct rpc_message msg = {
104 .rpc_proc = &nfs_procedures[NFSPROC_GETATTR], 104 .rpc_proc = &nfs_procedures[NFSPROC_GETATTR],
@@ -146,7 +146,8 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
146 146
147static int 147static int
148nfs_proc_lookup(struct inode *dir, struct qstr *name, 148nfs_proc_lookup(struct inode *dir, struct qstr *name,
149 struct nfs_fh *fhandle, struct nfs_fattr *fattr) 149 struct nfs_fh *fhandle, struct nfs_fattr *fattr,
150 struct nfs4_label *label)
150{ 151{
151 struct nfs_diropargs arg = { 152 struct nfs_diropargs arg = {
152 .fh = NFS_FH(dir), 153 .fh = NFS_FH(dir),
@@ -243,7 +244,7 @@ nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
243 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 244 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
244 nfs_mark_for_revalidate(dir); 245 nfs_mark_for_revalidate(dir);
245 if (status == 0) 246 if (status == 0)
246 status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); 247 status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);
247 nfs_free_createdata(data); 248 nfs_free_createdata(data);
248out: 249out:
249 dprintk("NFS reply create: %d\n", status); 250 dprintk("NFS reply create: %d\n", status);
@@ -290,7 +291,7 @@ nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
290 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 291 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
291 } 292 }
292 if (status == 0) 293 if (status == 0)
293 status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); 294 status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);
294 nfs_free_createdata(data); 295 nfs_free_createdata(data);
295out: 296out:
296 dprintk("NFS reply mknod: %d\n", status); 297 dprintk("NFS reply mknod: %d\n", status);
@@ -442,7 +443,7 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
442 * should fill in the data with a LOOKUP call on the wire. 443 * should fill in the data with a LOOKUP call on the wire.
443 */ 444 */
444 if (status == 0) 445 if (status == 0)
445 status = nfs_instantiate(dentry, fh, fattr); 446 status = nfs_instantiate(dentry, fh, fattr, NULL);
446 447
447out_free: 448out_free:
448 nfs_free_fattr(fattr); 449 nfs_free_fattr(fattr);
@@ -471,7 +472,7 @@ nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
471 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 472 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
472 nfs_mark_for_revalidate(dir); 473 nfs_mark_for_revalidate(dir);
473 if (status == 0) 474 if (status == 0)
474 status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); 475 status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);
475 nfs_free_createdata(data); 476 nfs_free_createdata(data);
476out: 477out:
477 dprintk("NFS reply mkdir: %d\n", status); 478 dprintk("NFS reply mkdir: %d\n", status);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 2d7525fbcf25..71fdc0dfa0d2 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -269,7 +269,7 @@ static match_table_t nfs_local_lock_tokens = {
269 269
270enum { 270enum {
271 Opt_vers_2, Opt_vers_3, Opt_vers_4, Opt_vers_4_0, 271 Opt_vers_2, Opt_vers_3, Opt_vers_4, Opt_vers_4_0,
272 Opt_vers_4_1, 272 Opt_vers_4_1, Opt_vers_4_2,
273 273
274 Opt_vers_err 274 Opt_vers_err
275}; 275};
@@ -280,6 +280,7 @@ static match_table_t nfs_vers_tokens = {
280 { Opt_vers_4, "4" }, 280 { Opt_vers_4, "4" },
281 { Opt_vers_4_0, "4.0" }, 281 { Opt_vers_4_0, "4.0" },
282 { Opt_vers_4_1, "4.1" }, 282 { Opt_vers_4_1, "4.1" },
283 { Opt_vers_4_2, "4.2" },
283 284
284 { Opt_vers_err, NULL } 285 { Opt_vers_err, NULL }
285}; 286};
@@ -832,6 +833,7 @@ int nfs_show_stats(struct seq_file *m, struct dentry *root)
832 seq_printf(m, "\n\tnfsv4:\t"); 833 seq_printf(m, "\n\tnfsv4:\t");
833 seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]); 834 seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]);
834 seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]); 835 seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]);
836 seq_printf(m, ",bm2=0x%x", nfss->attr_bitmask[2]);
835 seq_printf(m, ",acl=0x%x", nfss->acl_bitmask); 837 seq_printf(m, ",acl=0x%x", nfss->acl_bitmask);
836 show_sessions(m, nfss); 838 show_sessions(m, nfss);
837 show_pnfs(m, nfss); 839 show_pnfs(m, nfss);
@@ -1097,6 +1099,10 @@ static int nfs_parse_version_string(char *string,
1097 mnt->version = 4; 1099 mnt->version = 4;
1098 mnt->minorversion = 1; 1100 mnt->minorversion = 1;
1099 break; 1101 break;
1102 case Opt_vers_4_2:
1103 mnt->version = 4;
1104 mnt->minorversion = 2;
1105 break;
1100 default: 1106 default:
1101 return 0; 1107 return 0;
1102 } 1108 }
@@ -1608,29 +1614,13 @@ out_security_failure:
1608} 1614}
1609 1615
1610/* 1616/*
1611 * Select a security flavor for this mount. The selected flavor 1617 * Ensure that the specified authtype in args->auth_flavors[0] is supported by
1612 * is planted in args->auth_flavors[0]. 1618 * the server. Returns 0 if it's ok, and -EACCES if not.
1613 *
1614 * Returns 0 on success, -EACCES on failure.
1615 */ 1619 */
1616static int nfs_select_flavor(struct nfs_parsed_mount_data *args, 1620static int nfs_verify_authflavor(struct nfs_parsed_mount_data *args,
1617 struct nfs_mount_request *request) 1621 rpc_authflavor_t *server_authlist, unsigned int count)
1618{ 1622{
1619 unsigned int i, count = *(request->auth_flav_len); 1623 unsigned int i;
1620 rpc_authflavor_t flavor;
1621
1622 /*
1623 * The NFSv2 MNT operation does not return a flavor list.
1624 */
1625 if (args->mount_server.version != NFS_MNT3_VERSION)
1626 goto out_default;
1627
1628 /*
1629 * Certain releases of Linux's mountd return an empty
1630 * flavor list in some cases.
1631 */
1632 if (count == 0)
1633 goto out_default;
1634 1624
1635 /* 1625 /*
1636 * If the sec= mount option is used, the specified flavor or AUTH_NULL 1626 * If the sec= mount option is used, the specified flavor or AUTH_NULL
@@ -1640,60 +1630,19 @@ static int nfs_select_flavor(struct nfs_parsed_mount_data *args,
1640 * means that the server will ignore the rpc creds, so any flavor 1630 * means that the server will ignore the rpc creds, so any flavor
1641 * can be used. 1631 * can be used.
1642 */ 1632 */
1643 if (args->auth_flavors[0] != RPC_AUTH_MAXFLAVOR) {
1644 for (i = 0; i < count; i++) {
1645 if (args->auth_flavors[0] == request->auth_flavs[i] ||
1646 request->auth_flavs[i] == RPC_AUTH_NULL)
1647 goto out;
1648 }
1649 dfprintk(MOUNT, "NFS: auth flavor %d not supported by server\n",
1650 args->auth_flavors[0]);
1651 goto out_err;
1652 }
1653
1654 /*
1655 * RFC 2623, section 2.7 suggests we SHOULD prefer the
1656 * flavor listed first. However, some servers list
1657 * AUTH_NULL first. Avoid ever choosing AUTH_NULL.
1658 */
1659 for (i = 0; i < count; i++) { 1633 for (i = 0; i < count; i++) {
1660 struct rpcsec_gss_info info; 1634 if (args->auth_flavors[0] == server_authlist[i] ||
1661 1635 server_authlist[i] == RPC_AUTH_NULL)
1662 flavor = request->auth_flavs[i]; 1636 goto out;
1663 switch (flavor) {
1664 case RPC_AUTH_UNIX:
1665 goto out_set;
1666 case RPC_AUTH_NULL:
1667 continue;
1668 default:
1669 if (rpcauth_get_gssinfo(flavor, &info) == 0)
1670 goto out_set;
1671 }
1672 } 1637 }
1673 1638
1674 /* 1639 dfprintk(MOUNT, "NFS: auth flavor %u not supported by server\n",
1675 * As a last chance, see if the server list contains AUTH_NULL - 1640 args->auth_flavors[0]);
1676 * if it does, use the default flavor. 1641 return -EACCES;
1677 */
1678 for (i = 0; i < count; i++) {
1679 if (request->auth_flavs[i] == RPC_AUTH_NULL)
1680 goto out_default;
1681 }
1682
1683 dfprintk(MOUNT, "NFS: no auth flavors in common with server\n");
1684 goto out_err;
1685 1642
1686out_default:
1687 /* use default if flavor not already set */
1688 flavor = (args->auth_flavors[0] == RPC_AUTH_MAXFLAVOR) ?
1689 RPC_AUTH_UNIX : args->auth_flavors[0];
1690out_set:
1691 args->auth_flavors[0] = flavor;
1692out: 1643out:
1693 dfprintk(MOUNT, "NFS: using auth flavor %d\n", args->auth_flavors[0]); 1644 dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->auth_flavors[0]);
1694 return 0; 1645 return 0;
1695out_err:
1696 return -EACCES;
1697} 1646}
1698 1647
1699/* 1648/*
@@ -1701,10 +1650,10 @@ out_err:
1701 * corresponding to the provided path. 1650 * corresponding to the provided path.
1702 */ 1651 */
1703static int nfs_request_mount(struct nfs_parsed_mount_data *args, 1652static int nfs_request_mount(struct nfs_parsed_mount_data *args,
1704 struct nfs_fh *root_fh) 1653 struct nfs_fh *root_fh,
1654 rpc_authflavor_t *server_authlist,
1655 unsigned int *server_authlist_len)
1705{ 1656{
1706 rpc_authflavor_t server_authlist[NFS_MAX_SECFLAVORS];
1707 unsigned int server_authlist_len = ARRAY_SIZE(server_authlist);
1708 struct nfs_mount_request request = { 1657 struct nfs_mount_request request = {
1709 .sap = (struct sockaddr *) 1658 .sap = (struct sockaddr *)
1710 &args->mount_server.address, 1659 &args->mount_server.address,
@@ -1712,7 +1661,7 @@ static int nfs_request_mount(struct nfs_parsed_mount_data *args,
1712 .protocol = args->mount_server.protocol, 1661 .protocol = args->mount_server.protocol,
1713 .fh = root_fh, 1662 .fh = root_fh,
1714 .noresvport = args->flags & NFS_MOUNT_NORESVPORT, 1663 .noresvport = args->flags & NFS_MOUNT_NORESVPORT,
1715 .auth_flav_len = &server_authlist_len, 1664 .auth_flav_len = server_authlist_len,
1716 .auth_flavs = server_authlist, 1665 .auth_flavs = server_authlist,
1717 .net = args->net, 1666 .net = args->net,
1718 }; 1667 };
@@ -1756,24 +1705,92 @@ static int nfs_request_mount(struct nfs_parsed_mount_data *args,
1756 return status; 1705 return status;
1757 } 1706 }
1758 1707
1759 return nfs_select_flavor(args, &request); 1708 return 0;
1760} 1709}
1761 1710
1762struct dentry *nfs_try_mount(int flags, const char *dev_name, 1711static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_info,
1763 struct nfs_mount_info *mount_info, 1712 struct nfs_subversion *nfs_mod)
1764 struct nfs_subversion *nfs_mod)
1765{ 1713{
1766 int status; 1714 int status;
1767 struct nfs_server *server; 1715 unsigned int i;
1716 bool tried_auth_unix = false;
1717 bool auth_null_in_list = false;
1718 struct nfs_server *server = ERR_PTR(-EACCES);
1719 struct nfs_parsed_mount_data *args = mount_info->parsed;
1720 rpc_authflavor_t authlist[NFS_MAX_SECFLAVORS];
1721 unsigned int authlist_len = ARRAY_SIZE(authlist);
1722
1723 status = nfs_request_mount(args, mount_info->mntfh, authlist,
1724 &authlist_len);
1725 if (status)
1726 return ERR_PTR(status);
1768 1727
1769 if (mount_info->parsed->need_mount) { 1728 /*
1770 status = nfs_request_mount(mount_info->parsed, mount_info->mntfh); 1729 * Was a sec= authflavor specified in the options? First, verify
1730 * whether the server supports it, and then just try to use it if so.
1731 */
1732 if (args->auth_flavors[0] != RPC_AUTH_MAXFLAVOR) {
1733 status = nfs_verify_authflavor(args, authlist, authlist_len);
1734 dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->auth_flavors[0]);
1771 if (status) 1735 if (status)
1772 return ERR_PTR(status); 1736 return ERR_PTR(status);
1737 return nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
1738 }
1739
1740 /*
1741 * No sec= option was provided. RFC 2623, section 2.7 suggests we
1742 * SHOULD prefer the flavor listed first. However, some servers list
1743 * AUTH_NULL first. Avoid ever choosing AUTH_NULL.
1744 */
1745 for (i = 0; i < authlist_len; ++i) {
1746 rpc_authflavor_t flavor;
1747 struct rpcsec_gss_info info;
1748
1749 flavor = authlist[i];
1750 switch (flavor) {
1751 case RPC_AUTH_UNIX:
1752 tried_auth_unix = true;
1753 break;
1754 case RPC_AUTH_NULL:
1755 auth_null_in_list = true;
1756 continue;
1757 default:
1758 if (rpcauth_get_gssinfo(flavor, &info) != 0)
1759 continue;
1760 /* Fallthrough */
1761 }
1762 dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", flavor);
1763 args->auth_flavors[0] = flavor;
1764 server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
1765 if (!IS_ERR(server))
1766 return server;
1773 } 1767 }
1774 1768
1775 /* Get a volume representation */ 1769 /*
1776 server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod); 1770 * Nothing we tried so far worked. At this point, give up if we've
1771 * already tried AUTH_UNIX or if the server's list doesn't contain
1772 * AUTH_NULL
1773 */
1774 if (tried_auth_unix || !auth_null_in_list)
1775 return server;
1776
1777 /* Last chance! Try AUTH_UNIX */
1778 dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", RPC_AUTH_UNIX);
1779 args->auth_flavors[0] = RPC_AUTH_UNIX;
1780 return nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
1781}
1782
1783struct dentry *nfs_try_mount(int flags, const char *dev_name,
1784 struct nfs_mount_info *mount_info,
1785 struct nfs_subversion *nfs_mod)
1786{
1787 struct nfs_server *server;
1788
1789 if (mount_info->parsed->need_mount)
1790 server = nfs_try_mount_request(mount_info, nfs_mod);
1791 else
1792 server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
1793
1777 if (IS_ERR(server)) 1794 if (IS_ERR(server))
1778 return ERR_CAST(server); 1795 return ERR_CAST(server);
1779 1796
@@ -2412,7 +2429,21 @@ static int nfs_bdi_register(struct nfs_server *server)
2412int nfs_set_sb_security(struct super_block *s, struct dentry *mntroot, 2429int nfs_set_sb_security(struct super_block *s, struct dentry *mntroot,
2413 struct nfs_mount_info *mount_info) 2430 struct nfs_mount_info *mount_info)
2414{ 2431{
2415 return security_sb_set_mnt_opts(s, &mount_info->parsed->lsm_opts); 2432 int error;
2433 unsigned long kflags = 0, kflags_out = 0;
2434 if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL)
2435 kflags |= SECURITY_LSM_NATIVE_LABELS;
2436
2437 error = security_sb_set_mnt_opts(s, &mount_info->parsed->lsm_opts,
2438 kflags, &kflags_out);
2439 if (error)
2440 goto err;
2441
2442 if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL &&
2443 !(kflags_out & SECURITY_LSM_NATIVE_LABELS))
2444 NFS_SB(s)->caps &= ~NFS_CAP_SECURITY_LABEL;
2445err:
2446 return error;
2416} 2447}
2417EXPORT_SYMBOL_GPL(nfs_set_sb_security); 2448EXPORT_SYMBOL_GPL(nfs_set_sb_security);
2418 2449
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 1f1f38f0c5d5..60395ad3a2e4 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -479,7 +479,7 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)
479 479
480 dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n", 480 dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n",
481 dentry->d_parent->d_name.name, dentry->d_name.name, 481 dentry->d_parent->d_name.name, dentry->d_name.name,
482 dentry->d_count); 482 d_count(dentry));
483 nfs_inc_stats(dir, NFSIOS_SILLYRENAME); 483 nfs_inc_stats(dir, NFSIOS_SILLYRENAME);
484 484
485 /* 485 /*
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index a2c7c28049d5..f1bdb7254776 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -888,6 +888,28 @@ out:
888 return PageUptodate(page) != 0; 888 return PageUptodate(page) != 0;
889} 889}
890 890
891/* If we know the page is up to date, and we're not using byte range locks (or
892 * if we have the whole file locked for writing), it may be more efficient to
893 * extend the write to cover the entire page in order to avoid fragmentation
894 * inefficiencies.
895 *
896 * If the file is opened for synchronous writes or if we have a write delegation
897 * from the server then we can just skip the rest of the checks.
898 */
899static int nfs_can_extend_write(struct file *file, struct page *page, struct inode *inode)
900{
901 if (file->f_flags & O_DSYNC)
902 return 0;
903 if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE))
904 return 1;
905 if (nfs_write_pageuptodate(page, inode) && (inode->i_flock == NULL ||
906 (inode->i_flock->fl_start == 0 &&
907 inode->i_flock->fl_end == OFFSET_MAX &&
908 inode->i_flock->fl_type != F_RDLCK)))
909 return 1;
910 return 0;
911}
912
891/* 913/*
892 * Update and possibly write a cached page of an NFS file. 914 * Update and possibly write a cached page of an NFS file.
893 * 915 *
@@ -908,14 +930,7 @@ int nfs_updatepage(struct file *file, struct page *page,
908 file->f_path.dentry->d_name.name, count, 930 file->f_path.dentry->d_name.name, count,
909 (long long)(page_file_offset(page) + offset)); 931 (long long)(page_file_offset(page) + offset));
910 932
911 /* If we're not using byte range locks, and we know the page 933 if (nfs_can_extend_write(file, page, inode)) {
912 * is up to date, it may be more efficient to extend the write
913 * to cover the entire page in order to avoid fragmentation
914 * inefficiencies.
915 */
916 if (nfs_write_pageuptodate(page, inode) &&
917 inode->i_flock == NULL &&
918 !(file->f_flags & O_DSYNC)) {
919 count = max(count + offset, nfs_page_length(page)); 934 count = max(count + offset, nfs_page_length(page));
920 offset = 0; 935 offset = 0;
921 } 936 }
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 430b6872806f..dc8f1ef665ce 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -81,6 +81,22 @@ config NFSD_V4
81 81
82 If unsure, say N. 82 If unsure, say N.
83 83
84config NFSD_V4_SECURITY_LABEL
85 bool "Provide Security Label support for NFSv4 server"
86 depends on NFSD_V4 && SECURITY
87 help
88
89 Say Y here if you want enable fine-grained security label attribute
90 support for NFS version 4. Security labels allow security modules like
91 SELinux and Smack to label files to facilitate enforcement of their policies.
92 Without this an NFSv4 mount will have the same label on each file.
93
94 If you do not wish to enable fine-grained security labels SELinux or
95 Smack policies on NFSv4 files, say N.
96
97 WARNING: there is still a chance of backwards-incompatible protocol changes.
98 For now we recommend "Y" only for developers and testers."
99
84config NFSD_FAULT_INJECTION 100config NFSD_FAULT_INJECTION
85 bool "NFS server manual fault injection" 101 bool "NFS server manual fault injection"
86 depends on NFSD_V4 && DEBUG_KERNEL 102 depends on NFSD_V4 && DEBUG_KERNEL
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 27d74a294515..0d4c410e4589 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -42,6 +42,36 @@
42#include "current_stateid.h" 42#include "current_stateid.h"
43#include "netns.h" 43#include "netns.h"
44 44
45#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
46#include <linux/security.h>
47
48static inline void
49nfsd4_security_inode_setsecctx(struct svc_fh *resfh, struct xdr_netobj *label, u32 *bmval)
50{
51 struct inode *inode = resfh->fh_dentry->d_inode;
52 int status;
53
54 mutex_lock(&inode->i_mutex);
55 status = security_inode_setsecctx(resfh->fh_dentry,
56 label->data, label->len);
57 mutex_unlock(&inode->i_mutex);
58
59 if (status)
60 /*
61 * XXX: We should really fail the whole open, but we may
62 * already have created a new file, so it may be too
63 * late. For now this seems the least of evils:
64 */
65 bmval[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
66
67 return;
68}
69#else
70static inline void
71nfsd4_security_inode_setsecctx(struct svc_fh *resfh, struct xdr_netobj *label, u32 *bmval)
72{ }
73#endif
74
45#define NFSDDBG_FACILITY NFSDDBG_PROC 75#define NFSDDBG_FACILITY NFSDDBG_PROC
46 76
47static u32 nfsd_attrmask[] = { 77static u32 nfsd_attrmask[] = {
@@ -239,6 +269,9 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru
239 (u32 *)open->op_verf.data, 269 (u32 *)open->op_verf.data,
240 &open->op_truncate, &open->op_created); 270 &open->op_truncate, &open->op_created);
241 271
272 if (!status && open->op_label.len)
273 nfsd4_security_inode_setsecctx(resfh, &open->op_label, open->op_bmval);
274
242 /* 275 /*
243 * Following rfc 3530 14.2.16, use the returned bitmask 276 * Following rfc 3530 14.2.16, use the returned bitmask
244 * to indicate which attributes we used to store the 277 * to indicate which attributes we used to store the
@@ -263,7 +296,8 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru
263 296
264 nfsd4_set_open_owner_reply_cache(cstate, open, resfh); 297 nfsd4_set_open_owner_reply_cache(cstate, open, resfh);
265 accmode = NFSD_MAY_NOP; 298 accmode = NFSD_MAY_NOP;
266 if (open->op_created) 299 if (open->op_created ||
300 open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR)
267 accmode |= NFSD_MAY_OWNER_OVERRIDE; 301 accmode |= NFSD_MAY_OWNER_OVERRIDE;
268 status = do_open_permission(rqstp, resfh, open, accmode); 302 status = do_open_permission(rqstp, resfh, open, accmode);
269 set_change_info(&open->op_cinfo, current_fh); 303 set_change_info(&open->op_cinfo, current_fh);
@@ -637,6 +671,9 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
637 if (status) 671 if (status)
638 goto out; 672 goto out;
639 673
674 if (create->cr_label.len)
675 nfsd4_security_inode_setsecctx(&resfh, &create->cr_label, create->cr_bmval);
676
640 if (create->cr_acl != NULL) 677 if (create->cr_acl != NULL)
641 do_set_nfs4_acl(rqstp, &resfh, create->cr_acl, 678 do_set_nfs4_acl(rqstp, &resfh, create->cr_acl,
642 create->cr_bmval); 679 create->cr_bmval);
@@ -916,6 +953,11 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
916 setattr->sa_acl); 953 setattr->sa_acl);
917 if (status) 954 if (status)
918 goto out; 955 goto out;
956 if (setattr->sa_label.len)
957 status = nfsd4_set_nfs4_label(rqstp, &cstate->current_fh,
958 &setattr->sa_label);
959 if (status)
960 goto out;
919 status = nfsd_setattr(rqstp, &cstate->current_fh, &setattr->sa_iattr, 961 status = nfsd_setattr(rqstp, &cstate->current_fh, &setattr->sa_iattr,
920 0, (time_t)0); 962 0, (time_t)0);
921out: 963out:
@@ -1251,7 +1293,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
1251 * According to RFC3010, this takes precedence over all other errors. 1293 * According to RFC3010, this takes precedence over all other errors.
1252 */ 1294 */
1253 status = nfserr_minor_vers_mismatch; 1295 status = nfserr_minor_vers_mismatch;
1254 if (args->minorversion > nfsd_supported_minorversion) 1296 if (nfsd_minorversion(args->minorversion, NFSD_TEST) <= 0)
1255 goto out; 1297 goto out;
1256 1298
1257 status = nfs41_check_op_ordering(args); 1299 status = nfs41_check_op_ordering(args);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index f17051838b41..280acef6f0dc 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -97,19 +97,20 @@ nfs4_lock_state(void)
97 97
98static void free_session(struct nfsd4_session *); 98static void free_session(struct nfsd4_session *);
99 99
100void nfsd4_put_session(struct nfsd4_session *ses) 100static bool is_session_dead(struct nfsd4_session *ses)
101{ 101{
102 atomic_dec(&ses->se_ref); 102 return ses->se_flags & NFS4_SESSION_DEAD;
103} 103}
104 104
105static bool is_session_dead(struct nfsd4_session *ses) 105void nfsd4_put_session(struct nfsd4_session *ses)
106{ 106{
107 return ses->se_flags & NFS4_SESSION_DEAD; 107 if (atomic_dec_and_test(&ses->se_ref) && is_session_dead(ses))
108 free_session(ses);
108} 109}
109 110
110static __be32 mark_session_dead_locked(struct nfsd4_session *ses) 111static __be32 mark_session_dead_locked(struct nfsd4_session *ses, int ref_held_by_me)
111{ 112{
112 if (atomic_read(&ses->se_ref)) 113 if (atomic_read(&ses->se_ref) > ref_held_by_me)
113 return nfserr_jukebox; 114 return nfserr_jukebox;
114 ses->se_flags |= NFS4_SESSION_DEAD; 115 ses->se_flags |= NFS4_SESSION_DEAD;
115 return nfs_ok; 116 return nfs_ok;
@@ -364,19 +365,12 @@ static struct nfs4_ol_stateid * nfs4_alloc_stateid(struct nfs4_client *clp)
364} 365}
365 366
366static struct nfs4_delegation * 367static struct nfs4_delegation *
367alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct svc_fh *current_fh, u32 type) 368alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct svc_fh *current_fh)
368{ 369{
369 struct nfs4_delegation *dp; 370 struct nfs4_delegation *dp;
370 struct nfs4_file *fp = stp->st_file; 371 struct nfs4_file *fp = stp->st_file;
371 372
372 dprintk("NFSD alloc_init_deleg\n"); 373 dprintk("NFSD alloc_init_deleg\n");
373 /*
374 * Major work on the lease subsystem (for example, to support
375 * calbacks on stat) will be required before we can support
376 * write delegations properly.
377 */
378 if (type != NFS4_OPEN_DELEGATE_READ)
379 return NULL;
380 if (fp->fi_had_conflict) 374 if (fp->fi_had_conflict)
381 return NULL; 375 return NULL;
382 if (num_delegations > max_delegations) 376 if (num_delegations > max_delegations)
@@ -397,7 +391,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct sv
397 INIT_LIST_HEAD(&dp->dl_recall_lru); 391 INIT_LIST_HEAD(&dp->dl_recall_lru);
398 get_nfs4_file(fp); 392 get_nfs4_file(fp);
399 dp->dl_file = fp; 393 dp->dl_file = fp;
400 dp->dl_type = type; 394 dp->dl_type = NFS4_OPEN_DELEGATE_READ;
401 fh_copy_shallow(&dp->dl_fh, &current_fh->fh_handle); 395 fh_copy_shallow(&dp->dl_fh, &current_fh->fh_handle);
402 dp->dl_time = 0; 396 dp->dl_time = 0;
403 atomic_set(&dp->dl_count, 1); 397 atomic_set(&dp->dl_count, 1);
@@ -1188,6 +1182,9 @@ static int copy_cred(struct svc_cred *target, struct svc_cred *source)
1188 target->cr_gid = source->cr_gid; 1182 target->cr_gid = source->cr_gid;
1189 target->cr_group_info = source->cr_group_info; 1183 target->cr_group_info = source->cr_group_info;
1190 get_group_info(target->cr_group_info); 1184 get_group_info(target->cr_group_info);
1185 target->cr_gss_mech = source->cr_gss_mech;
1186 if (source->cr_gss_mech)
1187 gss_mech_get(source->cr_gss_mech);
1191 return 0; 1188 return 0;
1192} 1189}
1193 1190
@@ -1262,6 +1259,31 @@ same_creds(struct svc_cred *cr1, struct svc_cred *cr2)
1262 return 0 == strcmp(cr1->cr_principal, cr2->cr_principal); 1259 return 0 == strcmp(cr1->cr_principal, cr2->cr_principal);
1263} 1260}
1264 1261
1262static bool svc_rqst_integrity_protected(struct svc_rqst *rqstp)
1263{
1264 struct svc_cred *cr = &rqstp->rq_cred;
1265 u32 service;
1266
1267 service = gss_pseudoflavor_to_service(cr->cr_gss_mech, cr->cr_flavor);
1268 return service == RPC_GSS_SVC_INTEGRITY ||
1269 service == RPC_GSS_SVC_PRIVACY;
1270}
1271
1272static bool mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp)
1273{
1274 struct svc_cred *cr = &rqstp->rq_cred;
1275
1276 if (!cl->cl_mach_cred)
1277 return true;
1278 if (cl->cl_cred.cr_gss_mech != cr->cr_gss_mech)
1279 return false;
1280 if (!svc_rqst_integrity_protected(rqstp))
1281 return false;
1282 if (!cr->cr_principal)
1283 return false;
1284 return 0 == strcmp(cl->cl_cred.cr_principal, cr->cr_principal);
1285}
1286
1265static void gen_clid(struct nfs4_client *clp, struct nfsd_net *nn) 1287static void gen_clid(struct nfs4_client *clp, struct nfsd_net *nn)
1266{ 1288{
1267 static u32 current_clientid = 1; 1289 static u32 current_clientid = 1;
@@ -1639,16 +1661,16 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
1639 if (exid->flags & ~EXCHGID4_FLAG_MASK_A) 1661 if (exid->flags & ~EXCHGID4_FLAG_MASK_A)
1640 return nfserr_inval; 1662 return nfserr_inval;
1641 1663
1642 /* Currently only support SP4_NONE */
1643 switch (exid->spa_how) { 1664 switch (exid->spa_how) {
1665 case SP4_MACH_CRED:
1666 if (!svc_rqst_integrity_protected(rqstp))
1667 return nfserr_inval;
1644 case SP4_NONE: 1668 case SP4_NONE:
1645 break; 1669 break;
1646 default: /* checked by xdr code */ 1670 default: /* checked by xdr code */
1647 WARN_ON_ONCE(1); 1671 WARN_ON_ONCE(1);
1648 case SP4_SSV: 1672 case SP4_SSV:
1649 return nfserr_encr_alg_unsupp; 1673 return nfserr_encr_alg_unsupp;
1650 case SP4_MACH_CRED:
1651 return nfserr_serverfault; /* no excuse :-/ */
1652 } 1674 }
1653 1675
1654 /* Cases below refer to rfc 5661 section 18.35.4: */ 1676 /* Cases below refer to rfc 5661 section 18.35.4: */
@@ -1663,6 +1685,10 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
1663 status = nfserr_inval; 1685 status = nfserr_inval;
1664 goto out; 1686 goto out;
1665 } 1687 }
1688 if (!mach_creds_match(conf, rqstp)) {
1689 status = nfserr_wrong_cred;
1690 goto out;
1691 }
1666 if (!creds_match) { /* case 9 */ 1692 if (!creds_match) { /* case 9 */
1667 status = nfserr_perm; 1693 status = nfserr_perm;
1668 goto out; 1694 goto out;
@@ -1709,7 +1735,8 @@ out_new:
1709 status = nfserr_jukebox; 1735 status = nfserr_jukebox;
1710 goto out; 1736 goto out;
1711 } 1737 }
1712 new->cl_minorversion = 1; 1738 new->cl_minorversion = cstate->minorversion;
1739 new->cl_mach_cred = (exid->spa_how == SP4_MACH_CRED);
1713 1740
1714 gen_clid(new, nn); 1741 gen_clid(new, nn);
1715 add_to_unconfirmed(new); 1742 add_to_unconfirmed(new);
@@ -1839,6 +1866,24 @@ static __be32 check_backchannel_attrs(struct nfsd4_channel_attrs *ca)
1839 return nfs_ok; 1866 return nfs_ok;
1840} 1867}
1841 1868
1869static __be32 nfsd4_check_cb_sec(struct nfsd4_cb_sec *cbs)
1870{
1871 switch (cbs->flavor) {
1872 case RPC_AUTH_NULL:
1873 case RPC_AUTH_UNIX:
1874 return nfs_ok;
1875 default:
1876 /*
1877 * GSS case: the spec doesn't allow us to return this
1878 * error. But it also doesn't allow us not to support
1879 * GSS.
1880 * I'd rather this fail hard than return some error the
1881 * client might think it can already handle:
1882 */
1883 return nfserr_encr_alg_unsupp;
1884 }
1885}
1886
1842__be32 1887__be32
1843nfsd4_create_session(struct svc_rqst *rqstp, 1888nfsd4_create_session(struct svc_rqst *rqstp,
1844 struct nfsd4_compound_state *cstate, 1889 struct nfsd4_compound_state *cstate,
@@ -1854,6 +1899,9 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1854 1899
1855 if (cr_ses->flags & ~SESSION4_FLAG_MASK_A) 1900 if (cr_ses->flags & ~SESSION4_FLAG_MASK_A)
1856 return nfserr_inval; 1901 return nfserr_inval;
1902 status = nfsd4_check_cb_sec(&cr_ses->cb_sec);
1903 if (status)
1904 return status;
1857 status = check_forechannel_attrs(&cr_ses->fore_channel, nn); 1905 status = check_forechannel_attrs(&cr_ses->fore_channel, nn);
1858 if (status) 1906 if (status)
1859 return status; 1907 return status;
@@ -1874,6 +1922,9 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1874 WARN_ON_ONCE(conf && unconf); 1922 WARN_ON_ONCE(conf && unconf);
1875 1923
1876 if (conf) { 1924 if (conf) {
1925 status = nfserr_wrong_cred;
1926 if (!mach_creds_match(conf, rqstp))
1927 goto out_free_conn;
1877 cs_slot = &conf->cl_cs_slot; 1928 cs_slot = &conf->cl_cs_slot;
1878 status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); 1929 status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
1879 if (status == nfserr_replay_cache) { 1930 if (status == nfserr_replay_cache) {
@@ -1890,6 +1941,9 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1890 status = nfserr_clid_inuse; 1941 status = nfserr_clid_inuse;
1891 goto out_free_conn; 1942 goto out_free_conn;
1892 } 1943 }
1944 status = nfserr_wrong_cred;
1945 if (!mach_creds_match(unconf, rqstp))
1946 goto out_free_conn;
1893 cs_slot = &unconf->cl_cs_slot; 1947 cs_slot = &unconf->cl_cs_slot;
1894 status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); 1948 status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
1895 if (status) { 1949 if (status) {
@@ -1957,7 +2011,11 @@ __be32 nfsd4_backchannel_ctl(struct svc_rqst *rqstp, struct nfsd4_compound_state
1957{ 2011{
1958 struct nfsd4_session *session = cstate->session; 2012 struct nfsd4_session *session = cstate->session;
1959 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); 2013 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
2014 __be32 status;
1960 2015
2016 status = nfsd4_check_cb_sec(&bc->bc_cb_sec);
2017 if (status)
2018 return status;
1961 spin_lock(&nn->client_lock); 2019 spin_lock(&nn->client_lock);
1962 session->se_cb_prog = bc->bc_cb_program; 2020 session->se_cb_prog = bc->bc_cb_program;
1963 session->se_cb_sec = bc->bc_cb_sec; 2021 session->se_cb_sec = bc->bc_cb_sec;
@@ -1986,6 +2044,9 @@ __be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp,
1986 status = nfserr_badsession; 2044 status = nfserr_badsession;
1987 if (!session) 2045 if (!session)
1988 goto out; 2046 goto out;
2047 status = nfserr_wrong_cred;
2048 if (!mach_creds_match(session->se_client, rqstp))
2049 goto out;
1989 status = nfsd4_map_bcts_dir(&bcts->dir); 2050 status = nfsd4_map_bcts_dir(&bcts->dir);
1990 if (status) 2051 if (status)
1991 goto out; 2052 goto out;
@@ -2014,6 +2075,7 @@ nfsd4_destroy_session(struct svc_rqst *r,
2014{ 2075{
2015 struct nfsd4_session *ses; 2076 struct nfsd4_session *ses;
2016 __be32 status; 2077 __be32 status;
2078 int ref_held_by_me = 0;
2017 struct nfsd_net *nn = net_generic(SVC_NET(r), nfsd_net_id); 2079 struct nfsd_net *nn = net_generic(SVC_NET(r), nfsd_net_id);
2018 2080
2019 nfs4_lock_state(); 2081 nfs4_lock_state();
@@ -2021,6 +2083,7 @@ nfsd4_destroy_session(struct svc_rqst *r,
2021 if (nfsd4_compound_in_session(cstate->session, &sessionid->sessionid)) { 2083 if (nfsd4_compound_in_session(cstate->session, &sessionid->sessionid)) {
2022 if (!nfsd4_last_compound_op(r)) 2084 if (!nfsd4_last_compound_op(r))
2023 goto out; 2085 goto out;
2086 ref_held_by_me++;
2024 } 2087 }
2025 dump_sessionid(__func__, &sessionid->sessionid); 2088 dump_sessionid(__func__, &sessionid->sessionid);
2026 spin_lock(&nn->client_lock); 2089 spin_lock(&nn->client_lock);
@@ -2028,17 +2091,22 @@ nfsd4_destroy_session(struct svc_rqst *r,
2028 status = nfserr_badsession; 2091 status = nfserr_badsession;
2029 if (!ses) 2092 if (!ses)
2030 goto out_client_lock; 2093 goto out_client_lock;
2031 status = mark_session_dead_locked(ses); 2094 status = nfserr_wrong_cred;
2032 if (status) 2095 if (!mach_creds_match(ses->se_client, r))
2033 goto out_client_lock; 2096 goto out_client_lock;
2097 nfsd4_get_session_locked(ses);
2098 status = mark_session_dead_locked(ses, 1 + ref_held_by_me);
2099 if (status)
2100 goto out_put_session;
2034 unhash_session(ses); 2101 unhash_session(ses);
2035 spin_unlock(&nn->client_lock); 2102 spin_unlock(&nn->client_lock);
2036 2103
2037 nfsd4_probe_callback_sync(ses->se_client); 2104 nfsd4_probe_callback_sync(ses->se_client);
2038 2105
2039 spin_lock(&nn->client_lock); 2106 spin_lock(&nn->client_lock);
2040 free_session(ses);
2041 status = nfs_ok; 2107 status = nfs_ok;
2108out_put_session:
2109 nfsd4_put_session(ses);
2042out_client_lock: 2110out_client_lock:
2043 spin_unlock(&nn->client_lock); 2111 spin_unlock(&nn->client_lock);
2044out: 2112out:
@@ -2058,26 +2126,31 @@ static struct nfsd4_conn *__nfsd4_find_conn(struct svc_xprt *xpt, struct nfsd4_s
2058 return NULL; 2126 return NULL;
2059} 2127}
2060 2128
2061static void nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_session *ses) 2129static __be32 nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_session *ses)
2062{ 2130{
2063 struct nfs4_client *clp = ses->se_client; 2131 struct nfs4_client *clp = ses->se_client;
2064 struct nfsd4_conn *c; 2132 struct nfsd4_conn *c;
2133 __be32 status = nfs_ok;
2065 int ret; 2134 int ret;
2066 2135
2067 spin_lock(&clp->cl_lock); 2136 spin_lock(&clp->cl_lock);
2068 c = __nfsd4_find_conn(new->cn_xprt, ses); 2137 c = __nfsd4_find_conn(new->cn_xprt, ses);
2069 if (c) { 2138 if (c)
2070 spin_unlock(&clp->cl_lock); 2139 goto out_free;
2071 free_conn(new); 2140 status = nfserr_conn_not_bound_to_session;
2072 return; 2141 if (clp->cl_mach_cred)
2073 } 2142 goto out_free;
2074 __nfsd4_hash_conn(new, ses); 2143 __nfsd4_hash_conn(new, ses);
2075 spin_unlock(&clp->cl_lock); 2144 spin_unlock(&clp->cl_lock);
2076 ret = nfsd4_register_conn(new); 2145 ret = nfsd4_register_conn(new);
2077 if (ret) 2146 if (ret)
2078 /* oops; xprt is already down: */ 2147 /* oops; xprt is already down: */
2079 nfsd4_conn_lost(&new->cn_xpt_user); 2148 nfsd4_conn_lost(&new->cn_xpt_user);
2080 return; 2149 return nfs_ok;
2150out_free:
2151 spin_unlock(&clp->cl_lock);
2152 free_conn(new);
2153 return status;
2081} 2154}
2082 2155
2083static bool nfsd4_session_too_many_ops(struct svc_rqst *rqstp, struct nfsd4_session *session) 2156static bool nfsd4_session_too_many_ops(struct svc_rqst *rqstp, struct nfsd4_session *session)
@@ -2169,8 +2242,10 @@ nfsd4_sequence(struct svc_rqst *rqstp,
2169 if (status) 2242 if (status)
2170 goto out_put_session; 2243 goto out_put_session;
2171 2244
2172 nfsd4_sequence_check_conn(conn, session); 2245 status = nfsd4_sequence_check_conn(conn, session);
2173 conn = NULL; 2246 conn = NULL;
2247 if (status)
2248 goto out_put_session;
2174 2249
2175 /* Success! bump slot seqid */ 2250 /* Success! bump slot seqid */
2176 slot->sl_seqid = seq->seqid; 2251 slot->sl_seqid = seq->seqid;
@@ -2232,7 +2307,10 @@ nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
2232 status = nfserr_stale_clientid; 2307 status = nfserr_stale_clientid;
2233 goto out; 2308 goto out;
2234 } 2309 }
2235 2310 if (!mach_creds_match(clp, rqstp)) {
2311 status = nfserr_wrong_cred;
2312 goto out;
2313 }
2236 expire_client(clp); 2314 expire_client(clp);
2237out: 2315out:
2238 nfs4_unlock_state(); 2316 nfs4_unlock_state();
@@ -2940,13 +3018,13 @@ static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp, int f
2940 return fl; 3018 return fl;
2941} 3019}
2942 3020
2943static int nfs4_setlease(struct nfs4_delegation *dp, int flag) 3021static int nfs4_setlease(struct nfs4_delegation *dp)
2944{ 3022{
2945 struct nfs4_file *fp = dp->dl_file; 3023 struct nfs4_file *fp = dp->dl_file;
2946 struct file_lock *fl; 3024 struct file_lock *fl;
2947 int status; 3025 int status;
2948 3026
2949 fl = nfs4_alloc_init_lease(dp, flag); 3027 fl = nfs4_alloc_init_lease(dp, NFS4_OPEN_DELEGATE_READ);
2950 if (!fl) 3028 if (!fl)
2951 return -ENOMEM; 3029 return -ENOMEM;
2952 fl->fl_file = find_readable_file(fp); 3030 fl->fl_file = find_readable_file(fp);
@@ -2964,12 +3042,12 @@ static int nfs4_setlease(struct nfs4_delegation *dp, int flag)
2964 return 0; 3042 return 0;
2965} 3043}
2966 3044
2967static int nfs4_set_delegation(struct nfs4_delegation *dp, int flag) 3045static int nfs4_set_delegation(struct nfs4_delegation *dp)
2968{ 3046{
2969 struct nfs4_file *fp = dp->dl_file; 3047 struct nfs4_file *fp = dp->dl_file;
2970 3048
2971 if (!fp->fi_lease) 3049 if (!fp->fi_lease)
2972 return nfs4_setlease(dp, flag); 3050 return nfs4_setlease(dp);
2973 spin_lock(&recall_lock); 3051 spin_lock(&recall_lock);
2974 if (fp->fi_had_conflict) { 3052 if (fp->fi_had_conflict) {
2975 spin_unlock(&recall_lock); 3053 spin_unlock(&recall_lock);
@@ -3005,6 +3083,9 @@ static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status)
3005 3083
3006/* 3084/*
3007 * Attempt to hand out a delegation. 3085 * Attempt to hand out a delegation.
3086 *
3087 * Note we don't support write delegations, and won't until the vfs has
3088 * proper support for them.
3008 */ 3089 */
3009static void 3090static void
3010nfs4_open_delegation(struct net *net, struct svc_fh *fh, 3091nfs4_open_delegation(struct net *net, struct svc_fh *fh,
@@ -3013,39 +3094,45 @@ nfs4_open_delegation(struct net *net, struct svc_fh *fh,
3013 struct nfs4_delegation *dp; 3094 struct nfs4_delegation *dp;
3014 struct nfs4_openowner *oo = container_of(stp->st_stateowner, struct nfs4_openowner, oo_owner); 3095 struct nfs4_openowner *oo = container_of(stp->st_stateowner, struct nfs4_openowner, oo_owner);
3015 int cb_up; 3096 int cb_up;
3016 int status = 0, flag = 0; 3097 int status = 0;
3017 3098
3018 cb_up = nfsd4_cb_channel_good(oo->oo_owner.so_client); 3099 cb_up = nfsd4_cb_channel_good(oo->oo_owner.so_client);
3019 flag = NFS4_OPEN_DELEGATE_NONE;
3020 open->op_recall = 0; 3100 open->op_recall = 0;
3021 switch (open->op_claim_type) { 3101 switch (open->op_claim_type) {
3022 case NFS4_OPEN_CLAIM_PREVIOUS: 3102 case NFS4_OPEN_CLAIM_PREVIOUS:
3023 if (!cb_up) 3103 if (!cb_up)
3024 open->op_recall = 1; 3104 open->op_recall = 1;
3025 flag = open->op_delegate_type; 3105 if (open->op_delegate_type != NFS4_OPEN_DELEGATE_READ)
3026 if (flag == NFS4_OPEN_DELEGATE_NONE) 3106 goto out_no_deleg;
3027 goto out;
3028 break; 3107 break;
3029 case NFS4_OPEN_CLAIM_NULL: 3108 case NFS4_OPEN_CLAIM_NULL:
3030 /* Let's not give out any delegations till everyone's 3109 /*
3031 * had the chance to reclaim theirs.... */ 3110 * Let's not give out any delegations till everyone's
3111 * had the chance to reclaim theirs....
3112 */
3032 if (locks_in_grace(net)) 3113 if (locks_in_grace(net))
3033 goto out; 3114 goto out_no_deleg;
3034 if (!cb_up || !(oo->oo_flags & NFS4_OO_CONFIRMED)) 3115 if (!cb_up || !(oo->oo_flags & NFS4_OO_CONFIRMED))
3035 goto out; 3116 goto out_no_deleg;
3117 /*
3118 * Also, if the file was opened for write or
3119 * create, there's a good chance the client's
3120 * about to write to it, resulting in an
3121 * immediate recall (since we don't support
3122 * write delegations):
3123 */
3036 if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) 3124 if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
3037 flag = NFS4_OPEN_DELEGATE_WRITE; 3125 goto out_no_deleg;
3038 else 3126 if (open->op_create == NFS4_OPEN_CREATE)
3039 flag = NFS4_OPEN_DELEGATE_READ; 3127 goto out_no_deleg;
3040 break; 3128 break;
3041 default: 3129 default:
3042 goto out; 3130 goto out_no_deleg;
3043 } 3131 }
3044 3132 dp = alloc_init_deleg(oo->oo_owner.so_client, stp, fh);
3045 dp = alloc_init_deleg(oo->oo_owner.so_client, stp, fh, flag);
3046 if (dp == NULL) 3133 if (dp == NULL)
3047 goto out_no_deleg; 3134 goto out_no_deleg;
3048 status = nfs4_set_delegation(dp, flag); 3135 status = nfs4_set_delegation(dp);
3049 if (status) 3136 if (status)
3050 goto out_free; 3137 goto out_free;
3051 3138
@@ -3053,24 +3140,23 @@ nfs4_open_delegation(struct net *net, struct svc_fh *fh,
3053 3140
3054 dprintk("NFSD: delegation stateid=" STATEID_FMT "\n", 3141 dprintk("NFSD: delegation stateid=" STATEID_FMT "\n",
3055 STATEID_VAL(&dp->dl_stid.sc_stateid)); 3142 STATEID_VAL(&dp->dl_stid.sc_stateid));
3056out: 3143 open->op_delegate_type = NFS4_OPEN_DELEGATE_READ;
3057 open->op_delegate_type = flag;
3058 if (flag == NFS4_OPEN_DELEGATE_NONE) {
3059 if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS &&
3060 open->op_delegate_type != NFS4_OPEN_DELEGATE_NONE)
3061 dprintk("NFSD: WARNING: refusing delegation reclaim\n");
3062
3063 /* 4.1 client asking for a delegation? */
3064 if (open->op_deleg_want)
3065 nfsd4_open_deleg_none_ext(open, status);
3066 }
3067 return; 3144 return;
3068out_free: 3145out_free:
3069 unhash_stid(&dp->dl_stid); 3146 unhash_stid(&dp->dl_stid);
3070 nfs4_put_delegation(dp); 3147 nfs4_put_delegation(dp);
3071out_no_deleg: 3148out_no_deleg:
3072 flag = NFS4_OPEN_DELEGATE_NONE; 3149 open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE;
3073 goto out; 3150 if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS &&
3151 open->op_delegate_type != NFS4_OPEN_DELEGATE_NONE) {
3152 dprintk("NFSD: WARNING: refusing delegation reclaim\n");
3153 open->op_recall = 1;
3154 }
3155
3156 /* 4.1 client asking for a delegation? */
3157 if (open->op_deleg_want)
3158 nfsd4_open_deleg_none_ext(open, status);
3159 return;
3074} 3160}
3075 3161
3076static void nfsd4_deleg_xgrade_none_ext(struct nfsd4_open *open, 3162static void nfsd4_deleg_xgrade_none_ext(struct nfsd4_open *open,
@@ -3427,7 +3513,7 @@ grace_disallows_io(struct net *net, struct inode *inode)
3427/* Returns true iff a is later than b: */ 3513/* Returns true iff a is later than b: */
3428static bool stateid_generation_after(stateid_t *a, stateid_t *b) 3514static bool stateid_generation_after(stateid_t *a, stateid_t *b)
3429{ 3515{
3430 return (s32)a->si_generation - (s32)b->si_generation > 0; 3516 return (s32)(a->si_generation - b->si_generation) > 0;
3431} 3517}
3432 3518
3433static __be32 check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_session) 3519static __be32 check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_session)
@@ -4435,7 +4521,6 @@ __be32
4435nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 4521nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4436 struct nfsd4_locku *locku) 4522 struct nfsd4_locku *locku)
4437{ 4523{
4438 struct nfs4_lockowner *lo;
4439 struct nfs4_ol_stateid *stp; 4524 struct nfs4_ol_stateid *stp;
4440 struct file *filp = NULL; 4525 struct file *filp = NULL;
4441 struct file_lock *file_lock = NULL; 4526 struct file_lock *file_lock = NULL;
@@ -4468,10 +4553,9 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4468 status = nfserr_jukebox; 4553 status = nfserr_jukebox;
4469 goto out; 4554 goto out;
4470 } 4555 }
4471 lo = lockowner(stp->st_stateowner);
4472 locks_init_lock(file_lock); 4556 locks_init_lock(file_lock);
4473 file_lock->fl_type = F_UNLCK; 4557 file_lock->fl_type = F_UNLCK;
4474 file_lock->fl_owner = (fl_owner_t)lo; 4558 file_lock->fl_owner = (fl_owner_t)lockowner(stp->st_stateowner);
4475 file_lock->fl_pid = current->tgid; 4559 file_lock->fl_pid = current->tgid;
4476 file_lock->fl_file = filp; 4560 file_lock->fl_file = filp;
4477 file_lock->fl_flags = FL_POSIX; 4561 file_lock->fl_flags = FL_POSIX;
@@ -4490,11 +4574,6 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4490 update_stateid(&stp->st_stid.sc_stateid); 4574 update_stateid(&stp->st_stid.sc_stateid);
4491 memcpy(&locku->lu_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); 4575 memcpy(&locku->lu_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
4492 4576
4493 if (nfsd4_has_session(cstate) && !check_for_locks(stp->st_file, lo)) {
4494 WARN_ON_ONCE(cstate->replay_owner);
4495 release_lockowner(lo);
4496 }
4497
4498out: 4577out:
4499 nfsd4_bump_seqid(cstate, status); 4578 nfsd4_bump_seqid(cstate, status);
4500 if (!cstate->replay_owner) 4579 if (!cstate->replay_owner)
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 6cd86e0fe450..0c0f3ea90de5 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -55,6 +55,11 @@
55#include "cache.h" 55#include "cache.h"
56#include "netns.h" 56#include "netns.h"
57 57
58#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
59#include <linux/security.h>
60#endif
61
62
58#define NFSDDBG_FACILITY NFSDDBG_XDR 63#define NFSDDBG_FACILITY NFSDDBG_XDR
59 64
60/* 65/*
@@ -134,6 +139,19 @@ xdr_error: \
134 } \ 139 } \
135} while (0) 140} while (0)
136 141
142static void next_decode_page(struct nfsd4_compoundargs *argp)
143{
144 argp->pagelist++;
145 argp->p = page_address(argp->pagelist[0]);
146 if (argp->pagelen < PAGE_SIZE) {
147 argp->end = argp->p + (argp->pagelen>>2);
148 argp->pagelen = 0;
149 } else {
150 argp->end = argp->p + (PAGE_SIZE>>2);
151 argp->pagelen -= PAGE_SIZE;
152 }
153}
154
137static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes) 155static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes)
138{ 156{
139 /* We want more bytes than seem to be available. 157 /* We want more bytes than seem to be available.
@@ -161,16 +179,7 @@ static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes)
161 * guarantee p points to at least nbytes bytes. 179 * guarantee p points to at least nbytes bytes.
162 */ 180 */
163 memcpy(p, argp->p, avail); 181 memcpy(p, argp->p, avail);
164 /* step to next page */ 182 next_decode_page(argp);
165 argp->p = page_address(argp->pagelist[0]);
166 argp->pagelist++;
167 if (argp->pagelen < PAGE_SIZE) {
168 argp->end = argp->p + (argp->pagelen>>2);
169 argp->pagelen = 0;
170 } else {
171 argp->end = argp->p + (PAGE_SIZE>>2);
172 argp->pagelen -= PAGE_SIZE;
173 }
174 memcpy(((char*)p)+avail, argp->p, (nbytes - avail)); 183 memcpy(((char*)p)+avail, argp->p, (nbytes - avail));
175 argp->p += XDR_QUADLEN(nbytes - avail); 184 argp->p += XDR_QUADLEN(nbytes - avail);
176 return p; 185 return p;
@@ -242,7 +251,8 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
242 251
243static __be32 252static __be32
244nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, 253nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
245 struct iattr *iattr, struct nfs4_acl **acl) 254 struct iattr *iattr, struct nfs4_acl **acl,
255 struct xdr_netobj *label)
246{ 256{
247 int expected_len, len = 0; 257 int expected_len, len = 0;
248 u32 dummy32; 258 u32 dummy32;
@@ -380,6 +390,32 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
380 goto xdr_error; 390 goto xdr_error;
381 } 391 }
382 } 392 }
393
394 label->len = 0;
395#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
396 if (bmval[2] & FATTR4_WORD2_SECURITY_LABEL) {
397 READ_BUF(4);
398 len += 4;
399 READ32(dummy32); /* lfs: we don't use it */
400 READ_BUF(4);
401 len += 4;
402 READ32(dummy32); /* pi: we don't use it either */
403 READ_BUF(4);
404 len += 4;
405 READ32(dummy32);
406 READ_BUF(dummy32);
407 if (dummy32 > NFSD4_MAX_SEC_LABEL_LEN)
408 return nfserr_badlabel;
409 len += (XDR_QUADLEN(dummy32) << 2);
410 READMEM(buf, dummy32);
411 label->data = kzalloc(dummy32 + 1, GFP_KERNEL);
412 if (!label->data)
413 return nfserr_jukebox;
414 defer_free(argp, kfree, label->data);
415 memcpy(label->data, buf, dummy32);
416 }
417#endif
418
383 if (bmval[0] & ~NFSD_WRITEABLE_ATTRS_WORD0 419 if (bmval[0] & ~NFSD_WRITEABLE_ATTRS_WORD0
384 || bmval[1] & ~NFSD_WRITEABLE_ATTRS_WORD1 420 || bmval[1] & ~NFSD_WRITEABLE_ATTRS_WORD1
385 || bmval[2] & ~NFSD_WRITEABLE_ATTRS_WORD2) 421 || bmval[2] & ~NFSD_WRITEABLE_ATTRS_WORD2)
@@ -428,7 +464,11 @@ static __be32 nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_
428 /* callback_sec_params4 */ 464 /* callback_sec_params4 */
429 READ_BUF(4); 465 READ_BUF(4);
430 READ32(nr_secflavs); 466 READ32(nr_secflavs);
431 cbs->flavor = (u32)(-1); 467 if (nr_secflavs)
468 cbs->flavor = (u32)(-1);
469 else
470 /* Is this legal? Be generous, take it to mean AUTH_NONE: */
471 cbs->flavor = 0;
432 for (i = 0; i < nr_secflavs; ++i) { 472 for (i = 0; i < nr_secflavs; ++i) {
433 READ_BUF(4); 473 READ_BUF(4);
434 READ32(dummy); 474 READ32(dummy);
@@ -576,7 +616,7 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
576 return status; 616 return status;
577 617
578 status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr, 618 status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr,
579 &create->cr_acl); 619 &create->cr_acl, &create->cr_label);
580 if (status) 620 if (status)
581 goto out; 621 goto out;
582 622
@@ -827,7 +867,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
827 case NFS4_CREATE_UNCHECKED: 867 case NFS4_CREATE_UNCHECKED:
828 case NFS4_CREATE_GUARDED: 868 case NFS4_CREATE_GUARDED:
829 status = nfsd4_decode_fattr(argp, open->op_bmval, 869 status = nfsd4_decode_fattr(argp, open->op_bmval,
830 &open->op_iattr, &open->op_acl); 870 &open->op_iattr, &open->op_acl, &open->op_label);
831 if (status) 871 if (status)
832 goto out; 872 goto out;
833 break; 873 break;
@@ -841,7 +881,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
841 READ_BUF(NFS4_VERIFIER_SIZE); 881 READ_BUF(NFS4_VERIFIER_SIZE);
842 COPYMEM(open->op_verf.data, NFS4_VERIFIER_SIZE); 882 COPYMEM(open->op_verf.data, NFS4_VERIFIER_SIZE);
843 status = nfsd4_decode_fattr(argp, open->op_bmval, 883 status = nfsd4_decode_fattr(argp, open->op_bmval,
844 &open->op_iattr, &open->op_acl); 884 &open->op_iattr, &open->op_acl, &open->op_label);
845 if (status) 885 if (status)
846 goto out; 886 goto out;
847 break; 887 break;
@@ -1063,7 +1103,7 @@ nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *seta
1063 if (status) 1103 if (status)
1064 return status; 1104 return status;
1065 return nfsd4_decode_fattr(argp, setattr->sa_bmval, &setattr->sa_iattr, 1105 return nfsd4_decode_fattr(argp, setattr->sa_bmval, &setattr->sa_iattr,
1066 &setattr->sa_acl); 1106 &setattr->sa_acl, &setattr->sa_label);
1067} 1107}
1068 1108
1069static __be32 1109static __be32
@@ -1567,6 +1607,7 @@ struct nfsd4_minorversion_ops {
1567static struct nfsd4_minorversion_ops nfsd4_minorversion[] = { 1607static struct nfsd4_minorversion_ops nfsd4_minorversion[] = {
1568 [0] = { nfsd4_dec_ops, ARRAY_SIZE(nfsd4_dec_ops) }, 1608 [0] = { nfsd4_dec_ops, ARRAY_SIZE(nfsd4_dec_ops) },
1569 [1] = { nfsd41_dec_ops, ARRAY_SIZE(nfsd41_dec_ops) }, 1609 [1] = { nfsd41_dec_ops, ARRAY_SIZE(nfsd41_dec_ops) },
1610 [2] = { nfsd41_dec_ops, ARRAY_SIZE(nfsd41_dec_ops) },
1570}; 1611};
1571 1612
1572static __be32 1613static __be32
@@ -1953,6 +1994,36 @@ nfsd4_encode_aclname(struct svc_rqst *rqstp, struct nfs4_ace *ace,
1953 FATTR4_WORD0_RDATTR_ERROR) 1994 FATTR4_WORD0_RDATTR_ERROR)
1954#define WORD1_ABSENT_FS_ATTRS FATTR4_WORD1_MOUNTED_ON_FILEID 1995#define WORD1_ABSENT_FS_ATTRS FATTR4_WORD1_MOUNTED_ON_FILEID
1955 1996
1997#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
1998static inline __be32
1999nfsd4_encode_security_label(struct svc_rqst *rqstp, void *context, int len, __be32 **pp, int *buflen)
2000{
2001 __be32 *p = *pp;
2002
2003 if (*buflen < ((XDR_QUADLEN(len) << 2) + 4 + 4 + 4))
2004 return nfserr_resource;
2005
2006 /*
2007 * For now we use a 0 here to indicate the null translation; in
2008 * the future we may place a call to translation code here.
2009 */
2010 if ((*buflen -= 8) < 0)
2011 return nfserr_resource;
2012
2013 WRITE32(0); /* lfs */
2014 WRITE32(0); /* pi */
2015 p = xdr_encode_opaque(p, context, len);
2016 *buflen -= (XDR_QUADLEN(len) << 2) + 4;
2017
2018 *pp = p;
2019 return 0;
2020}
2021#else
2022static inline __be32
2023nfsd4_encode_security_label(struct svc_rqst *rqstp, void *context, int len, __be32 **pp, int *buflen)
2024{ return 0; }
2025#endif
2026
1956static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *rdattr_err) 2027static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *rdattr_err)
1957{ 2028{
1958 /* As per referral draft: */ 2029 /* As per referral draft: */
@@ -2012,6 +2083,9 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
2012 int err; 2083 int err;
2013 int aclsupport = 0; 2084 int aclsupport = 0;
2014 struct nfs4_acl *acl = NULL; 2085 struct nfs4_acl *acl = NULL;
2086 void *context = NULL;
2087 int contextlen;
2088 bool contextsupport = false;
2015 struct nfsd4_compoundres *resp = rqstp->rq_resp; 2089 struct nfsd4_compoundres *resp = rqstp->rq_resp;
2016 u32 minorversion = resp->cstate.minorversion; 2090 u32 minorversion = resp->cstate.minorversion;
2017 struct path path = { 2091 struct path path = {
@@ -2065,6 +2139,21 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
2065 } 2139 }
2066 } 2140 }
2067 2141
2142#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
2143 if ((bmval[2] & FATTR4_WORD2_SECURITY_LABEL) ||
2144 bmval[0] & FATTR4_WORD0_SUPPORTED_ATTRS) {
2145 err = security_inode_getsecctx(dentry->d_inode,
2146 &context, &contextlen);
2147 contextsupport = (err == 0);
2148 if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) {
2149 if (err == -EOPNOTSUPP)
2150 bmval2 &= ~FATTR4_WORD2_SECURITY_LABEL;
2151 else if (err)
2152 goto out_nfserr;
2153 }
2154 }
2155#endif /* CONFIG_NFSD_V4_SECURITY_LABEL */
2156
2068 if (bmval2) { 2157 if (bmval2) {
2069 if ((buflen -= 16) < 0) 2158 if ((buflen -= 16) < 0)
2070 goto out_resource; 2159 goto out_resource;
@@ -2093,6 +2182,8 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
2093 2182
2094 if (!aclsupport) 2183 if (!aclsupport)
2095 word0 &= ~FATTR4_WORD0_ACL; 2184 word0 &= ~FATTR4_WORD0_ACL;
2185 if (!contextsupport)
2186 word2 &= ~FATTR4_WORD2_SECURITY_LABEL;
2096 if (!word2) { 2187 if (!word2) {
2097 if ((buflen -= 12) < 0) 2188 if ((buflen -= 12) < 0)
2098 goto out_resource; 2189 goto out_resource;
@@ -2400,6 +2491,12 @@ out_acl:
2400 get_parent_attributes(exp, &stat); 2491 get_parent_attributes(exp, &stat);
2401 WRITE64(stat.ino); 2492 WRITE64(stat.ino);
2402 } 2493 }
2494 if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) {
2495 status = nfsd4_encode_security_label(rqstp, context,
2496 contextlen, &p, &buflen);
2497 if (status)
2498 goto out;
2499 }
2403 if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) { 2500 if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) {
2404 WRITE32(3); 2501 WRITE32(3);
2405 WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0); 2502 WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0);
@@ -2412,6 +2509,10 @@ out_acl:
2412 status = nfs_ok; 2509 status = nfs_ok;
2413 2510
2414out: 2511out:
2512#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
2513 if (context)
2514 security_release_secctx(context, contextlen);
2515#endif /* CONFIG_NFSD_V4_SECURITY_LABEL */
2415 kfree(acl); 2516 kfree(acl);
2416 if (fhp == &tempfh) 2517 if (fhp == &tempfh)
2417 fh_put(&tempfh); 2518 fh_put(&tempfh);
@@ -3176,16 +3277,18 @@ nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
3176{ 3277{
3177 __be32 *p; 3278 __be32 *p;
3178 3279
3179 RESERVE_SPACE(12); 3280 RESERVE_SPACE(16);
3180 if (nfserr) { 3281 if (nfserr) {
3181 WRITE32(2); 3282 WRITE32(3);
3283 WRITE32(0);
3182 WRITE32(0); 3284 WRITE32(0);
3183 WRITE32(0); 3285 WRITE32(0);
3184 } 3286 }
3185 else { 3287 else {
3186 WRITE32(2); 3288 WRITE32(3);
3187 WRITE32(setattr->sa_bmval[0]); 3289 WRITE32(setattr->sa_bmval[0]);
3188 WRITE32(setattr->sa_bmval[1]); 3290 WRITE32(setattr->sa_bmval[1]);
3291 WRITE32(setattr->sa_bmval[2]);
3189 } 3292 }
3190 ADJUST_ARGS(); 3293 ADJUST_ARGS();
3191 return nfserr; 3294 return nfserr;
@@ -3226,6 +3329,14 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_w
3226 return nfserr; 3329 return nfserr;
3227} 3330}
3228 3331
3332static const u32 nfs4_minimal_spo_must_enforce[2] = {
3333 [1] = 1 << (OP_BIND_CONN_TO_SESSION - 32) |
3334 1 << (OP_EXCHANGE_ID - 32) |
3335 1 << (OP_CREATE_SESSION - 32) |
3336 1 << (OP_DESTROY_SESSION - 32) |
3337 1 << (OP_DESTROY_CLIENTID - 32)
3338};
3339
3229static __be32 3340static __be32
3230nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, 3341nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
3231 struct nfsd4_exchange_id *exid) 3342 struct nfsd4_exchange_id *exid)
@@ -3264,6 +3375,20 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
3264 /* state_protect4_r. Currently only support SP4_NONE */ 3375 /* state_protect4_r. Currently only support SP4_NONE */
3265 BUG_ON(exid->spa_how != SP4_NONE); 3376 BUG_ON(exid->spa_how != SP4_NONE);
3266 WRITE32(exid->spa_how); 3377 WRITE32(exid->spa_how);
3378 switch (exid->spa_how) {
3379 case SP4_NONE:
3380 break;
3381 case SP4_MACH_CRED:
3382 /* spo_must_enforce bitmap: */
3383 WRITE32(2);
3384 WRITE32(nfs4_minimal_spo_must_enforce[0]);
3385 WRITE32(nfs4_minimal_spo_must_enforce[1]);
3386 /* empty spo_must_allow bitmap: */
3387 WRITE32(0);
3388 break;
3389 default:
3390 WARN_ON_ONCE(1);
3391 }
3267 3392
3268 /* The server_owner struct */ 3393 /* The server_owner struct */
3269 WRITE64(minor_id); /* Minor id */ 3394 WRITE64(minor_id); /* Minor id */
@@ -3635,13 +3760,17 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
3635 iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base; 3760 iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base;
3636 BUG_ON(iov->iov_len > PAGE_SIZE); 3761 BUG_ON(iov->iov_len > PAGE_SIZE);
3637 if (nfsd4_has_session(cs)) { 3762 if (nfsd4_has_session(cs)) {
3763 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
3764 struct nfs4_client *clp = cs->session->se_client;
3638 if (cs->status != nfserr_replay_cache) { 3765 if (cs->status != nfserr_replay_cache) {
3639 nfsd4_store_cache_entry(resp); 3766 nfsd4_store_cache_entry(resp);
3640 cs->slot->sl_flags &= ~NFSD4_SLOT_INUSE; 3767 cs->slot->sl_flags &= ~NFSD4_SLOT_INUSE;
3641 } 3768 }
3642 /* Renew the clientid on success and on replay */ 3769 /* Renew the clientid on success and on replay */
3643 put_client_renew(cs->session->se_client); 3770 spin_lock(&nn->client_lock);
3644 nfsd4_put_session(cs->session); 3771 nfsd4_put_session(cs->session);
3772 spin_unlock(&nn->client_lock);
3773 put_client_renew(clp);
3645 } 3774 }
3646 return 1; 3775 return 1;
3647} 3776}
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 07a473fd49bc..30f34ab02137 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -24,7 +24,7 @@
24/* 24/*
25 * nfsd version 25 * nfsd version
26 */ 26 */
27#define NFSD_SUPPORTED_MINOR_VERSION 1 27#define NFSD_SUPPORTED_MINOR_VERSION 2
28/* 28/*
29 * Maximum blocksizes supported by daemon under various circumstances. 29 * Maximum blocksizes supported by daemon under various circumstances.
30 */ 30 */
@@ -53,7 +53,6 @@ struct readdir_cd {
53extern struct svc_program nfsd_program; 53extern struct svc_program nfsd_program;
54extern struct svc_version nfsd_version2, nfsd_version3, 54extern struct svc_version nfsd_version2, nfsd_version3,
55 nfsd_version4; 55 nfsd_version4;
56extern u32 nfsd_supported_minorversion;
57extern struct mutex nfsd_mutex; 56extern struct mutex nfsd_mutex;
58extern spinlock_t nfsd_drc_lock; 57extern spinlock_t nfsd_drc_lock;
59extern unsigned long nfsd_drc_max_mem; 58extern unsigned long nfsd_drc_max_mem;
@@ -243,6 +242,12 @@ void nfsd_lockd_shutdown(void);
243#define nfserr_reject_deleg cpu_to_be32(NFS4ERR_REJECT_DELEG) 242#define nfserr_reject_deleg cpu_to_be32(NFS4ERR_REJECT_DELEG)
244#define nfserr_returnconflict cpu_to_be32(NFS4ERR_RETURNCONFLICT) 243#define nfserr_returnconflict cpu_to_be32(NFS4ERR_RETURNCONFLICT)
245#define nfserr_deleg_revoked cpu_to_be32(NFS4ERR_DELEG_REVOKED) 244#define nfserr_deleg_revoked cpu_to_be32(NFS4ERR_DELEG_REVOKED)
245#define nfserr_partner_notsupp cpu_to_be32(NFS4ERR_PARTNER_NOTSUPP)
246#define nfserr_partner_no_auth cpu_to_be32(NFS4ERR_PARTNER_NO_AUTH)
247#define nfserr_metadata_notsupp cpu_to_be32(NFS4ERR_METADATA_NOTSUPP)
248#define nfserr_offload_denied cpu_to_be32(NFS4ERR_OFFLOAD_DENIED)
249#define nfserr_wrong_lfs cpu_to_be32(NFS4ERR_WRONG_LFS)
250#define nfserr_badlabel cpu_to_be32(NFS4ERR_BADLABEL)
246 251
247/* error codes for internal use */ 252/* error codes for internal use */
248/* if a request fails due to kmalloc failure, it gets dropped. 253/* if a request fails due to kmalloc failure, it gets dropped.
@@ -322,6 +327,13 @@ void nfsd_lockd_shutdown(void);
322#define NFSD4_1_SUPPORTED_ATTRS_WORD2 \ 327#define NFSD4_1_SUPPORTED_ATTRS_WORD2 \
323 (NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT) 328 (NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT)
324 329
330#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
331#define NFSD4_2_SUPPORTED_ATTRS_WORD2 \
332 (NFSD4_1_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SECURITY_LABEL)
333#else
334#define NFSD4_2_SUPPORTED_ATTRS_WORD2 0
335#endif
336
325static inline u32 nfsd_suppattrs0(u32 minorversion) 337static inline u32 nfsd_suppattrs0(u32 minorversion)
326{ 338{
327 return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD0 339 return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD0
@@ -336,8 +348,11 @@ static inline u32 nfsd_suppattrs1(u32 minorversion)
336 348
337static inline u32 nfsd_suppattrs2(u32 minorversion) 349static inline u32 nfsd_suppattrs2(u32 minorversion)
338{ 350{
339 return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD2 351 switch (minorversion) {
340 : NFSD4_SUPPORTED_ATTRS_WORD2; 352 default: return NFSD4_2_SUPPORTED_ATTRS_WORD2;
353 case 1: return NFSD4_1_SUPPORTED_ATTRS_WORD2;
354 case 0: return NFSD4_SUPPORTED_ATTRS_WORD2;
355 }
341} 356}
342 357
343/* These will return ERR_INVAL if specified in GETATTR or READDIR. */ 358/* These will return ERR_INVAL if specified in GETATTR or READDIR. */
@@ -350,7 +365,11 @@ static inline u32 nfsd_suppattrs2(u32 minorversion)
350#define NFSD_WRITEABLE_ATTRS_WORD1 \ 365#define NFSD_WRITEABLE_ATTRS_WORD1 \
351 (FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \ 366 (FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \
352 | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET) 367 | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET)
368#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
369#define NFSD_WRITEABLE_ATTRS_WORD2 FATTR4_WORD2_SECURITY_LABEL
370#else
353#define NFSD_WRITEABLE_ATTRS_WORD2 0 371#define NFSD_WRITEABLE_ATTRS_WORD2 0
372#endif
354 373
355#define NFSD_SUPPATTR_EXCLCREAT_WORD0 \ 374#define NFSD_SUPPATTR_EXCLCREAT_WORD0 \
356 NFSD_WRITEABLE_ATTRS_WORD0 375 NFSD_WRITEABLE_ATTRS_WORD0
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 262df5ccbf59..760c85a6f534 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -116,7 +116,10 @@ struct svc_program nfsd_program = {
116 116
117}; 117};
118 118
119u32 nfsd_supported_minorversion; 119static bool nfsd_supported_minorversions[NFSD_SUPPORTED_MINOR_VERSION + 1] = {
120 [0] = 1,
121 [1] = 1,
122};
120 123
121int nfsd_vers(int vers, enum vers_op change) 124int nfsd_vers(int vers, enum vers_op change)
122{ 125{
@@ -151,15 +154,13 @@ int nfsd_minorversion(u32 minorversion, enum vers_op change)
151 return -1; 154 return -1;
152 switch(change) { 155 switch(change) {
153 case NFSD_SET: 156 case NFSD_SET:
154 nfsd_supported_minorversion = minorversion; 157 nfsd_supported_minorversions[minorversion] = true;
155 break; 158 break;
156 case NFSD_CLEAR: 159 case NFSD_CLEAR:
157 if (minorversion == 0) 160 nfsd_supported_minorversions[minorversion] = false;
158 return -1;
159 nfsd_supported_minorversion = minorversion - 1;
160 break; 161 break;
161 case NFSD_TEST: 162 case NFSD_TEST:
162 return minorversion <= nfsd_supported_minorversion; 163 return nfsd_supported_minorversions[minorversion];
163 case NFSD_AVAIL: 164 case NFSD_AVAIL:
164 return minorversion <= NFSD_SUPPORTED_MINOR_VERSION; 165 return minorversion <= NFSD_SUPPORTED_MINOR_VERSION;
165 } 166 }
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 274e2a114e05..424d8f5f2317 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -246,6 +246,7 @@ struct nfs4_client {
246 nfs4_verifier cl_verifier; /* generated by client */ 246 nfs4_verifier cl_verifier; /* generated by client */
247 time_t cl_time; /* time of last lease renewal */ 247 time_t cl_time; /* time of last lease renewal */
248 struct sockaddr_storage cl_addr; /* client ipaddress */ 248 struct sockaddr_storage cl_addr; /* client ipaddress */
249 bool cl_mach_cred; /* SP4_MACH_CRED in force */
249 struct svc_cred cl_cred; /* setclientid principal */ 250 struct svc_cred cl_cred; /* setclientid principal */
250 clientid_t cl_clientid; /* generated by server */ 251 clientid_t cl_clientid; /* generated by server */
251 nfs4_verifier cl_confirm; /* generated by server */ 252 nfs4_verifier cl_confirm; /* generated by server */
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index a6bc8a7423db..c827acb0e943 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -28,6 +28,7 @@
28#include <asm/uaccess.h> 28#include <asm/uaccess.h>
29#include <linux/exportfs.h> 29#include <linux/exportfs.h>
30#include <linux/writeback.h> 30#include <linux/writeback.h>
31#include <linux/security.h>
31 32
32#ifdef CONFIG_NFSD_V3 33#ifdef CONFIG_NFSD_V3
33#include "xdr3.h" 34#include "xdr3.h"
@@ -621,6 +622,33 @@ int nfsd4_is_junction(struct dentry *dentry)
621 return 0; 622 return 0;
622 return 1; 623 return 1;
623} 624}
625#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
626__be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp,
627 struct xdr_netobj *label)
628{
629 __be32 error;
630 int host_error;
631 struct dentry *dentry;
632
633 error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, NFSD_MAY_SATTR);
634 if (error)
635 return error;
636
637 dentry = fhp->fh_dentry;
638
639 mutex_lock(&dentry->d_inode->i_mutex);
640 host_error = security_inode_setsecctx(dentry, label->data, label->len);
641 mutex_unlock(&dentry->d_inode->i_mutex);
642 return nfserrno(host_error);
643}
644#else
645__be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp,
646 struct xdr_netobj *label)
647{
648 return nfserr_notsupp;
649}
650#endif
651
624#endif /* defined(CONFIG_NFSD_V4) */ 652#endif /* defined(CONFIG_NFSD_V4) */
625 653
626#ifdef CONFIG_NFSD_V3 654#ifdef CONFIG_NFSD_V3
@@ -802,9 +830,10 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
802 flags = O_WRONLY|O_LARGEFILE; 830 flags = O_WRONLY|O_LARGEFILE;
803 } 831 }
804 *filp = dentry_open(&path, flags, current_cred()); 832 *filp = dentry_open(&path, flags, current_cred());
805 if (IS_ERR(*filp)) 833 if (IS_ERR(*filp)) {
806 host_err = PTR_ERR(*filp); 834 host_err = PTR_ERR(*filp);
807 else { 835 *filp = NULL;
836 } else {
808 host_err = ima_file_check(*filp, may_flags); 837 host_err = ima_file_check(*filp, may_flags);
809 838
810 if (may_flags & NFSD_MAY_64BIT_COOKIE) 839 if (may_flags & NFSD_MAY_64BIT_COOKIE)
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 5b5894159f22..a4be2e389670 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -39,7 +39,6 @@
39typedef int (*nfsd_dirop_t)(struct inode *, struct dentry *, int, int); 39typedef int (*nfsd_dirop_t)(struct inode *, struct dentry *, int, int);
40 40
41/* nfsd/vfs.c */ 41/* nfsd/vfs.c */
42int fh_lock_parent(struct svc_fh *, struct dentry *);
43int nfsd_racache_init(int); 42int nfsd_racache_init(int);
44void nfsd_racache_shutdown(void); 43void nfsd_racache_shutdown(void);
45int nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, 44int nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
@@ -56,6 +55,8 @@ int nfsd_mountpoint(struct dentry *, struct svc_export *);
56__be32 nfsd4_set_nfs4_acl(struct svc_rqst *, struct svc_fh *, 55__be32 nfsd4_set_nfs4_acl(struct svc_rqst *, struct svc_fh *,
57 struct nfs4_acl *); 56 struct nfs4_acl *);
58int nfsd4_get_nfs4_acl(struct svc_rqst *, struct dentry *, struct nfs4_acl **); 57int nfsd4_get_nfs4_acl(struct svc_rqst *, struct dentry *, struct nfs4_acl **);
58__be32 nfsd4_set_nfs4_label(struct svc_rqst *, struct svc_fh *,
59 struct xdr_netobj *);
59#endif /* CONFIG_NFSD_V4 */ 60#endif /* CONFIG_NFSD_V4 */
60__be32 nfsd_create(struct svc_rqst *, struct svc_fh *, 61__be32 nfsd_create(struct svc_rqst *, struct svc_fh *,
61 char *name, int len, struct iattr *attrs, 62 char *name, int len, struct iattr *attrs,
@@ -92,17 +93,13 @@ __be32 nfsd_remove(struct svc_rqst *,
92 struct svc_fh *, char *, int); 93 struct svc_fh *, char *, int);
93__be32 nfsd_unlink(struct svc_rqst *, struct svc_fh *, int type, 94__be32 nfsd_unlink(struct svc_rqst *, struct svc_fh *, int type,
94 char *name, int len); 95 char *name, int len);
95int nfsd_truncate(struct svc_rqst *, struct svc_fh *,
96 unsigned long size);
97__be32 nfsd_readdir(struct svc_rqst *, struct svc_fh *, 96__be32 nfsd_readdir(struct svc_rqst *, struct svc_fh *,
98 loff_t *, struct readdir_cd *, filldir_t); 97 loff_t *, struct readdir_cd *, filldir_t);
99__be32 nfsd_statfs(struct svc_rqst *, struct svc_fh *, 98__be32 nfsd_statfs(struct svc_rqst *, struct svc_fh *,
100 struct kstatfs *, int access); 99 struct kstatfs *, int access);
101 100
102int nfsd_notify_change(struct inode *, struct iattr *);
103__be32 nfsd_permission(struct svc_rqst *, struct svc_export *, 101__be32 nfsd_permission(struct svc_rqst *, struct svc_export *,
104 struct dentry *, int); 102 struct dentry *, int);
105int nfsd_sync_dir(struct dentry *dp);
106 103
107#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) 104#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
108struct posix_acl *nfsd_get_posix_acl(struct svc_fh *, int); 105struct posix_acl *nfsd_get_posix_acl(struct svc_fh *, int);
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 3b271d2092b6..b3ed6446ed8e 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -40,6 +40,7 @@
40#include "state.h" 40#include "state.h"
41#include "nfsd.h" 41#include "nfsd.h"
42 42
43#define NFSD4_MAX_SEC_LABEL_LEN 2048
43#define NFSD4_MAX_TAGLEN 128 44#define NFSD4_MAX_TAGLEN 128
44#define XDR_LEN(n) (((n) + 3) & ~3) 45#define XDR_LEN(n) (((n) + 3) & ~3)
45 46
@@ -118,6 +119,7 @@ struct nfsd4_create {
118 struct iattr cr_iattr; /* request */ 119 struct iattr cr_iattr; /* request */
119 struct nfsd4_change_info cr_cinfo; /* response */ 120 struct nfsd4_change_info cr_cinfo; /* response */
120 struct nfs4_acl *cr_acl; 121 struct nfs4_acl *cr_acl;
122 struct xdr_netobj cr_label;
121}; 123};
122#define cr_linklen u.link.namelen 124#define cr_linklen u.link.namelen
123#define cr_linkname u.link.name 125#define cr_linkname u.link.name
@@ -246,6 +248,7 @@ struct nfsd4_open {
246 struct nfs4_file *op_file; /* used during processing */ 248 struct nfs4_file *op_file; /* used during processing */
247 struct nfs4_ol_stateid *op_stp; /* used during processing */ 249 struct nfs4_ol_stateid *op_stp; /* used during processing */
248 struct nfs4_acl *op_acl; 250 struct nfs4_acl *op_acl;
251 struct xdr_netobj op_label;
249}; 252};
250#define op_iattr iattr 253#define op_iattr iattr
251 254
@@ -330,6 +333,7 @@ struct nfsd4_setattr {
330 u32 sa_bmval[3]; /* request */ 333 u32 sa_bmval[3]; /* request */
331 struct iattr sa_iattr; /* request */ 334 struct iattr sa_iattr; /* request */
332 struct nfs4_acl *sa_acl; 335 struct nfs4_acl *sa_acl;
336 struct xdr_netobj sa_label;
333}; 337};
334 338
335struct nfsd4_setclientid { 339struct nfsd4_setclientid {
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 1427de5ebf4d..af3ba0478cdf 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -996,7 +996,7 @@ static int nilfs_attach_snapshot(struct super_block *s, __u64 cno,
996 996
997static int nilfs_tree_was_touched(struct dentry *root_dentry) 997static int nilfs_tree_was_touched(struct dentry *root_dentry)
998{ 998{
999 return root_dentry->d_count > 1; 999 return d_count(root_dentry) > 1;
1000} 1000}
1001 1001
1002/** 1002/**
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 2bfe6dc413a0..1fedd5f7ccc4 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -31,7 +31,6 @@ int dir_notify_enable __read_mostly = 1;
31static struct kmem_cache *dnotify_struct_cache __read_mostly; 31static struct kmem_cache *dnotify_struct_cache __read_mostly;
32static struct kmem_cache *dnotify_mark_cache __read_mostly; 32static struct kmem_cache *dnotify_mark_cache __read_mostly;
33static struct fsnotify_group *dnotify_group __read_mostly; 33static struct fsnotify_group *dnotify_group __read_mostly;
34static DEFINE_MUTEX(dnotify_mark_mutex);
35 34
36/* 35/*
37 * dnotify will attach one of these to each inode (i_fsnotify_marks) which 36 * dnotify will attach one of these to each inode (i_fsnotify_marks) which
@@ -183,7 +182,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
183 return; 182 return;
184 dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark); 183 dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
185 184
186 mutex_lock(&dnotify_mark_mutex); 185 mutex_lock(&dnotify_group->mark_mutex);
187 186
188 spin_lock(&fsn_mark->lock); 187 spin_lock(&fsn_mark->lock);
189 prev = &dn_mark->dn; 188 prev = &dn_mark->dn;
@@ -199,11 +198,12 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
199 198
200 spin_unlock(&fsn_mark->lock); 199 spin_unlock(&fsn_mark->lock);
201 200
202 /* nothing else could have found us thanks to the dnotify_mark_mutex */ 201 /* nothing else could have found us thanks to the dnotify_groups
202 mark_mutex */
203 if (dn_mark->dn == NULL) 203 if (dn_mark->dn == NULL)
204 fsnotify_destroy_mark(fsn_mark, dnotify_group); 204 fsnotify_destroy_mark_locked(fsn_mark, dnotify_group);
205 205
206 mutex_unlock(&dnotify_mark_mutex); 206 mutex_unlock(&dnotify_group->mark_mutex);
207 207
208 fsnotify_put_mark(fsn_mark); 208 fsnotify_put_mark(fsn_mark);
209} 209}
@@ -326,7 +326,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
326 new_dn_mark->dn = NULL; 326 new_dn_mark->dn = NULL;
327 327
328 /* this is needed to prevent the fcntl/close race described below */ 328 /* this is needed to prevent the fcntl/close race described below */
329 mutex_lock(&dnotify_mark_mutex); 329 mutex_lock(&dnotify_group->mark_mutex);
330 330
331 /* add the new_fsn_mark or find an old one. */ 331 /* add the new_fsn_mark or find an old one. */
332 fsn_mark = fsnotify_find_inode_mark(dnotify_group, inode); 332 fsn_mark = fsnotify_find_inode_mark(dnotify_group, inode);
@@ -334,7 +334,8 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
334 dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark); 334 dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
335 spin_lock(&fsn_mark->lock); 335 spin_lock(&fsn_mark->lock);
336 } else { 336 } else {
337 fsnotify_add_mark(new_fsn_mark, dnotify_group, inode, NULL, 0); 337 fsnotify_add_mark_locked(new_fsn_mark, dnotify_group, inode,
338 NULL, 0);
338 spin_lock(&new_fsn_mark->lock); 339 spin_lock(&new_fsn_mark->lock);
339 fsn_mark = new_fsn_mark; 340 fsn_mark = new_fsn_mark;
340 dn_mark = new_dn_mark; 341 dn_mark = new_dn_mark;
@@ -348,9 +349,9 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
348 349
349 /* if (f != filp) means that we lost a race and another task/thread 350 /* if (f != filp) means that we lost a race and another task/thread
350 * actually closed the fd we are still playing with before we grabbed 351 * actually closed the fd we are still playing with before we grabbed
351 * the dnotify_mark_mutex and fsn_mark->lock. Since closing the fd is the 352 * the dnotify_groups mark_mutex and fsn_mark->lock. Since closing the
352 * only time we clean up the marks we need to get our mark off 353 * fd is the only time we clean up the marks we need to get our mark
353 * the list. */ 354 * off the list. */
354 if (f != filp) { 355 if (f != filp) {
355 /* if we added ourselves, shoot ourselves, it's possible that 356 /* if we added ourselves, shoot ourselves, it's possible that
356 * the flush actually did shoot this fsn_mark. That's fine too 357 * the flush actually did shoot this fsn_mark. That's fine too
@@ -385,9 +386,9 @@ out:
385 spin_unlock(&fsn_mark->lock); 386 spin_unlock(&fsn_mark->lock);
386 387
387 if (destroy) 388 if (destroy)
388 fsnotify_destroy_mark(fsn_mark, dnotify_group); 389 fsnotify_destroy_mark_locked(fsn_mark, dnotify_group);
389 390
390 mutex_unlock(&dnotify_mark_mutex); 391 mutex_unlock(&dnotify_group->mark_mutex);
391 fsnotify_put_mark(fsn_mark); 392 fsnotify_put_mark(fsn_mark);
392out_err: 393out_err:
393 if (new_fsn_mark) 394 if (new_fsn_mark)
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 1ea52f7c031f..e44cb6427df3 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -122,6 +122,7 @@ static int fill_event_metadata(struct fsnotify_group *group,
122 metadata->event_len = FAN_EVENT_METADATA_LEN; 122 metadata->event_len = FAN_EVENT_METADATA_LEN;
123 metadata->metadata_len = FAN_EVENT_METADATA_LEN; 123 metadata->metadata_len = FAN_EVENT_METADATA_LEN;
124 metadata->vers = FANOTIFY_METADATA_VERSION; 124 metadata->vers = FANOTIFY_METADATA_VERSION;
125 metadata->reserved = 0;
125 metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS; 126 metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS;
126 metadata->pid = pid_vnr(event->tgid); 127 metadata->pid = pid_vnr(event->tgid);
127 if (unlikely(event->mask & FAN_Q_OVERFLOW)) 128 if (unlikely(event->mask & FAN_Q_OVERFLOW))
@@ -523,14 +524,18 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
523 __u32 removed; 524 __u32 removed;
524 int destroy_mark; 525 int destroy_mark;
525 526
527 mutex_lock(&group->mark_mutex);
526 fsn_mark = fsnotify_find_vfsmount_mark(group, mnt); 528 fsn_mark = fsnotify_find_vfsmount_mark(group, mnt);
527 if (!fsn_mark) 529 if (!fsn_mark) {
530 mutex_unlock(&group->mark_mutex);
528 return -ENOENT; 531 return -ENOENT;
532 }
529 533
530 removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags, 534 removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
531 &destroy_mark); 535 &destroy_mark);
532 if (destroy_mark) 536 if (destroy_mark)
533 fsnotify_destroy_mark(fsn_mark, group); 537 fsnotify_destroy_mark_locked(fsn_mark, group);
538 mutex_unlock(&group->mark_mutex);
534 539
535 fsnotify_put_mark(fsn_mark); 540 fsnotify_put_mark(fsn_mark);
536 if (removed & real_mount(mnt)->mnt_fsnotify_mask) 541 if (removed & real_mount(mnt)->mnt_fsnotify_mask)
@@ -547,14 +552,19 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group,
547 __u32 removed; 552 __u32 removed;
548 int destroy_mark; 553 int destroy_mark;
549 554
555 mutex_lock(&group->mark_mutex);
550 fsn_mark = fsnotify_find_inode_mark(group, inode); 556 fsn_mark = fsnotify_find_inode_mark(group, inode);
551 if (!fsn_mark) 557 if (!fsn_mark) {
558 mutex_unlock(&group->mark_mutex);
552 return -ENOENT; 559 return -ENOENT;
560 }
553 561
554 removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags, 562 removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
555 &destroy_mark); 563 &destroy_mark);
556 if (destroy_mark) 564 if (destroy_mark)
557 fsnotify_destroy_mark(fsn_mark, group); 565 fsnotify_destroy_mark_locked(fsn_mark, group);
566 mutex_unlock(&group->mark_mutex);
567
558 /* matches the fsnotify_find_inode_mark() */ 568 /* matches the fsnotify_find_inode_mark() */
559 fsnotify_put_mark(fsn_mark); 569 fsnotify_put_mark(fsn_mark);
560 if (removed & inode->i_fsnotify_mask) 570 if (removed & inode->i_fsnotify_mask)
@@ -590,35 +600,55 @@ static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
590 return mask & ~oldmask; 600 return mask & ~oldmask;
591} 601}
592 602
603static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
604 struct inode *inode,
605 struct vfsmount *mnt)
606{
607 struct fsnotify_mark *mark;
608 int ret;
609
610 if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
611 return ERR_PTR(-ENOSPC);
612
613 mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
614 if (!mark)
615 return ERR_PTR(-ENOMEM);
616
617 fsnotify_init_mark(mark, fanotify_free_mark);
618 ret = fsnotify_add_mark_locked(mark, group, inode, mnt, 0);
619 if (ret) {
620 fsnotify_put_mark(mark);
621 return ERR_PTR(ret);
622 }
623
624 return mark;
625}
626
627
593static int fanotify_add_vfsmount_mark(struct fsnotify_group *group, 628static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
594 struct vfsmount *mnt, __u32 mask, 629 struct vfsmount *mnt, __u32 mask,
595 unsigned int flags) 630 unsigned int flags)
596{ 631{
597 struct fsnotify_mark *fsn_mark; 632 struct fsnotify_mark *fsn_mark;
598 __u32 added; 633 __u32 added;
599 int ret = 0;
600 634
635 mutex_lock(&group->mark_mutex);
601 fsn_mark = fsnotify_find_vfsmount_mark(group, mnt); 636 fsn_mark = fsnotify_find_vfsmount_mark(group, mnt);
602 if (!fsn_mark) { 637 if (!fsn_mark) {
603 if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks) 638 fsn_mark = fanotify_add_new_mark(group, NULL, mnt);
604 return -ENOSPC; 639 if (IS_ERR(fsn_mark)) {
605 640 mutex_unlock(&group->mark_mutex);
606 fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); 641 return PTR_ERR(fsn_mark);
607 if (!fsn_mark) 642 }
608 return -ENOMEM;
609
610 fsnotify_init_mark(fsn_mark, fanotify_free_mark);
611 ret = fsnotify_add_mark(fsn_mark, group, NULL, mnt, 0);
612 if (ret)
613 goto err;
614 } 643 }
615 added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); 644 added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
645 mutex_unlock(&group->mark_mutex);
616 646
617 if (added & ~real_mount(mnt)->mnt_fsnotify_mask) 647 if (added & ~real_mount(mnt)->mnt_fsnotify_mask)
618 fsnotify_recalc_vfsmount_mask(mnt); 648 fsnotify_recalc_vfsmount_mask(mnt);
619err: 649
620 fsnotify_put_mark(fsn_mark); 650 fsnotify_put_mark(fsn_mark);
621 return ret; 651 return 0;
622} 652}
623 653
624static int fanotify_add_inode_mark(struct fsnotify_group *group, 654static int fanotify_add_inode_mark(struct fsnotify_group *group,
@@ -627,7 +657,6 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
627{ 657{
628 struct fsnotify_mark *fsn_mark; 658 struct fsnotify_mark *fsn_mark;
629 __u32 added; 659 __u32 added;
630 int ret = 0;
631 660
632 pr_debug("%s: group=%p inode=%p\n", __func__, group, inode); 661 pr_debug("%s: group=%p inode=%p\n", __func__, group, inode);
633 662
@@ -641,27 +670,23 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
641 (atomic_read(&inode->i_writecount) > 0)) 670 (atomic_read(&inode->i_writecount) > 0))
642 return 0; 671 return 0;
643 672
673 mutex_lock(&group->mark_mutex);
644 fsn_mark = fsnotify_find_inode_mark(group, inode); 674 fsn_mark = fsnotify_find_inode_mark(group, inode);
645 if (!fsn_mark) { 675 if (!fsn_mark) {
646 if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks) 676 fsn_mark = fanotify_add_new_mark(group, inode, NULL);
647 return -ENOSPC; 677 if (IS_ERR(fsn_mark)) {
648 678 mutex_unlock(&group->mark_mutex);
649 fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); 679 return PTR_ERR(fsn_mark);
650 if (!fsn_mark) 680 }
651 return -ENOMEM;
652
653 fsnotify_init_mark(fsn_mark, fanotify_free_mark);
654 ret = fsnotify_add_mark(fsn_mark, group, inode, NULL, 0);
655 if (ret)
656 goto err;
657 } 681 }
658 added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); 682 added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
683 mutex_unlock(&group->mark_mutex);
659 684
660 if (added & ~inode->i_fsnotify_mask) 685 if (added & ~inode->i_fsnotify_mask)
661 fsnotify_recalc_inode_mask(inode); 686 fsnotify_recalc_inode_mask(inode);
662err: 687
663 fsnotify_put_mark(fsn_mark); 688 fsnotify_put_mark(fsn_mark);
664 return ret; 689 return 0;
665} 690}
666 691
667/* fanotify syscalls */ 692/* fanotify syscalls */
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 959815c1e017..60f954a891ab 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -636,7 +636,8 @@ static int inotify_new_watch(struct fsnotify_group *group,
636 goto out_err; 636 goto out_err;
637 637
638 /* we are on the idr, now get on the inode */ 638 /* we are on the idr, now get on the inode */
639 ret = fsnotify_add_mark(&tmp_i_mark->fsn_mark, group, inode, NULL, 0); 639 ret = fsnotify_add_mark_locked(&tmp_i_mark->fsn_mark, group, inode,
640 NULL, 0);
640 if (ret) { 641 if (ret) {
641 /* we failed to get on the inode, get off the idr */ 642 /* we failed to get on the inode, get off the idr */
642 inotify_remove_from_idr(group, tmp_i_mark); 643 inotify_remove_from_idr(group, tmp_i_mark);
@@ -660,19 +661,13 @@ static int inotify_update_watch(struct fsnotify_group *group, struct inode *inod
660{ 661{
661 int ret = 0; 662 int ret = 0;
662 663
663retry: 664 mutex_lock(&group->mark_mutex);
664 /* try to update and existing watch with the new arg */ 665 /* try to update and existing watch with the new arg */
665 ret = inotify_update_existing_watch(group, inode, arg); 666 ret = inotify_update_existing_watch(group, inode, arg);
666 /* no mark present, try to add a new one */ 667 /* no mark present, try to add a new one */
667 if (ret == -ENOENT) 668 if (ret == -ENOENT)
668 ret = inotify_new_watch(group, inode, arg); 669 ret = inotify_new_watch(group, inode, arg);
669 /* 670 mutex_unlock(&group->mark_mutex);
670 * inotify_new_watch could race with another thread which did an
671 * inotify_new_watch between the update_existing and the add watch
672 * here, go back and try to update an existing mark again.
673 */
674 if (ret == -EEXIST)
675 goto retry;
676 671
677 return ret; 672 return ret;
678} 673}
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index fc6b49bf7360..923fe4a5f503 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -20,28 +20,29 @@
20 * fsnotify inode mark locking/lifetime/and refcnting 20 * fsnotify inode mark locking/lifetime/and refcnting
21 * 21 *
22 * REFCNT: 22 * REFCNT:
23 * The mark->refcnt tells how many "things" in the kernel currently are 23 * The group->recnt and mark->refcnt tell how many "things" in the kernel
24 * referencing this object. The object typically will live inside the kernel 24 * currently are referencing the objects. Both kind of objects typically will
25 * with a refcnt of 2, one for each list it is on (i_list, g_list). Any task 25 * live inside the kernel with a refcnt of 2, one for its creation and one for
26 * which can find this object holding the appropriete locks, can take a reference 26 * the reference a group and a mark hold to each other.
27 * and the object itself is guaranteed to survive until the reference is dropped. 27 * If you are holding the appropriate locks, you can take a reference and the
28 * object itself is guaranteed to survive until the reference is dropped.
28 * 29 *
29 * LOCKING: 30 * LOCKING:
30 * There are 3 spinlocks involved with fsnotify inode marks and they MUST 31 * There are 3 locks involved with fsnotify inode marks and they MUST be taken
31 * be taken in order as follows: 32 * in order as follows:
32 * 33 *
34 * group->mark_mutex
33 * mark->lock 35 * mark->lock
34 * group->mark_lock
35 * inode->i_lock 36 * inode->i_lock
36 * 37 *
37 * mark->lock protects 2 things, mark->group and mark->inode. You must hold 38 * group->mark_mutex protects the marks_list anchored inside a given group and
38 * that lock to dereference either of these things (they could be NULL even with 39 * each mark is hooked via the g_list. It also protects the groups private
39 * the lock) 40 * data (i.e group limits).
40 * 41
41 * group->mark_lock protects the marks_list anchored inside a given group 42 * mark->lock protects the marks attributes like its masks and flags.
42 * and each mark is hooked via the g_list. It also sorta protects the 43 * Furthermore it protects the access to a reference of the group that the mark
43 * free_g_list, which when used is anchored by a private list on the stack of the 44 * is assigned to as well as the access to a reference of the inode/vfsmount
44 * task which held the group->mark_lock. 45 * that is being watched by the mark.
45 * 46 *
46 * inode->i_lock protects the i_fsnotify_marks list anchored inside a 47 * inode->i_lock protects the i_fsnotify_marks list anchored inside a
47 * given inode and each mark is hooked via the i_list. (and sorta the 48 * given inode and each mark is hooked via the i_list. (and sorta the
@@ -64,18 +65,11 @@
64 * inode. We take i_lock and walk the i_fsnotify_marks safely. For each 65 * inode. We take i_lock and walk the i_fsnotify_marks safely. For each
65 * mark on the list we take a reference (so the mark can't disappear under us). 66 * mark on the list we take a reference (so the mark can't disappear under us).
66 * We remove that mark form the inode's list of marks and we add this mark to a 67 * We remove that mark form the inode's list of marks and we add this mark to a
67 * private list anchored on the stack using i_free_list; At this point we no 68 * private list anchored on the stack using i_free_list; we walk i_free_list
68 * longer fear anything finding the mark using the inode's list of marks. 69 * and before we destroy the mark we make sure that we dont race with a
69 * 70 * concurrent destroy_group by getting a ref to the marks group and taking the
70 * We can safely and locklessly run the private list on the stack of everything 71 * groups mutex.
71 * we just unattached from the original inode. For each mark on the private list 72
72 * we grab the mark-> and can thus dereference mark->group and mark->inode. If
73 * we see the group and inode are not NULL we take those locks. Now holding all
74 * 3 locks we can completely remove the mark from other tasks finding it in the
75 * future. Remember, 10 things might already be referencing this mark, but they
76 * better be holding a ref. We drop our reference we took before we unhooked it
77 * from the inode. When the ref hits 0 we can free the mark.
78 *
79 * Very similarly for freeing by group, except we use free_g_list. 73 * Very similarly for freeing by group, except we use free_g_list.
80 * 74 *
81 * This has the very interesting property of being able to run concurrently with 75 * This has the very interesting property of being able to run concurrently with
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 998b17eda09d..9f6b96a09615 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2965,6 +2965,11 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2965 to = map_end & (PAGE_CACHE_SIZE - 1); 2965 to = map_end & (PAGE_CACHE_SIZE - 1);
2966 2966
2967 page = find_or_create_page(mapping, page_index, GFP_NOFS); 2967 page = find_or_create_page(mapping, page_index, GFP_NOFS);
2968 if (!page) {
2969 ret = -ENOMEM;
2970 mlog_errno(ret);
2971 break;
2972 }
2968 2973
2969 /* 2974 /*
2970 * In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, This page 2975 * In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, This page
diff --git a/fs/open.c b/fs/open.c
index fca72c4d3f17..d53e29895082 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -840,10 +840,12 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o
840 if (flags & __O_SYNC) 840 if (flags & __O_SYNC)
841 flags |= O_DSYNC; 841 flags |= O_DSYNC;
842 842
843 if (flags & O_TMPFILE) { 843 if (flags & __O_TMPFILE) {
844 if (!(flags & O_CREAT)) 844 if ((flags & O_TMPFILE_MASK) != O_TMPFILE)
845 return -EINVAL; 845 return -EINVAL;
846 acc_mode = MAY_OPEN | ACC_MODE(flags); 846 acc_mode = MAY_OPEN | ACC_MODE(flags);
847 if (!(acc_mode & MAY_WRITE))
848 return -EINVAL;
847 } else if (flags & O_PATH) { 849 } else if (flags & O_PATH) {
848 /* 850 /*
849 * If we have O_PATH in the open flag. Then we 851 * If we have O_PATH in the open flag. Then we
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 28503172f2e4..a1a16eb97c7b 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -223,7 +223,7 @@ static inline char *alloc_elfnotes_buf(size_t notes_sz)
223 * regions in the 1st kernel pointed to by PT_LOAD entries) into 223 * regions in the 1st kernel pointed to by PT_LOAD entries) into
224 * virtually contiguous user-space in ELF layout. 224 * virtually contiguous user-space in ELF layout.
225 */ 225 */
226#ifdef CONFIG_MMU 226#if defined(CONFIG_MMU) && !defined(CONFIG_S390)
227static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) 227static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
228{ 228{
229 size_t size = vma->vm_end - vma->vm_start; 229 size_t size = vma->vm_end - vma->vm_start;
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 3e64169ef527..fbad622841f9 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2585,7 +2585,7 @@ static int do_proc_dqstats(struct ctl_table *table, int write,
2585 return proc_dointvec(table, write, buffer, lenp, ppos); 2585 return proc_dointvec(table, write, buffer, lenp, ppos);
2586} 2586}
2587 2587
2588static ctl_table fs_dqstats_table[] = { 2588static struct ctl_table fs_dqstats_table[] = {
2589 { 2589 {
2590 .procname = "lookups", 2590 .procname = "lookups",
2591 .data = &dqstats.stat[DQST_LOOKUPS], 2591 .data = &dqstats.stat[DQST_LOOKUPS],
@@ -2654,7 +2654,7 @@ static ctl_table fs_dqstats_table[] = {
2654 { }, 2654 { },
2655}; 2655};
2656 2656
2657static ctl_table fs_table[] = { 2657static struct ctl_table fs_table[] = {
2658 { 2658 {
2659 .procname = "quota", 2659 .procname = "quota",
2660 .mode = 0555, 2660 .mode = 0555,
@@ -2663,7 +2663,7 @@ static ctl_table fs_table[] = {
2663 { }, 2663 { },
2664}; 2664};
2665 2665
2666static ctl_table sys_table[] = { 2666static struct ctl_table sys_table[] = {
2667 { 2667 {
2668 .procname = "fs", 2668 .procname = "fs",
2669 .mode = 0555, 2669 .mode = 0555,
diff --git a/fs/select.c b/fs/select.c
index 6b14dc7df3a4..35d4adc749d9 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -28,6 +28,7 @@
28#include <linux/hrtimer.h> 28#include <linux/hrtimer.h>
29#include <linux/sched/rt.h> 29#include <linux/sched/rt.h>
30#include <linux/freezer.h> 30#include <linux/freezer.h>
31#include <net/busy_poll.h>
31 32
32#include <asm/uaccess.h> 33#include <asm/uaccess.h>
33 34
@@ -386,9 +387,10 @@ get_max:
386#define POLLEX_SET (POLLPRI) 387#define POLLEX_SET (POLLPRI)
387 388
388static inline void wait_key_set(poll_table *wait, unsigned long in, 389static inline void wait_key_set(poll_table *wait, unsigned long in,
389 unsigned long out, unsigned long bit) 390 unsigned long out, unsigned long bit,
391 unsigned int ll_flag)
390{ 392{
391 wait->_key = POLLEX_SET; 393 wait->_key = POLLEX_SET | ll_flag;
392 if (in & bit) 394 if (in & bit)
393 wait->_key |= POLLIN_SET; 395 wait->_key |= POLLIN_SET;
394 if (out & bit) 396 if (out & bit)
@@ -402,6 +404,8 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
402 poll_table *wait; 404 poll_table *wait;
403 int retval, i, timed_out = 0; 405 int retval, i, timed_out = 0;
404 unsigned long slack = 0; 406 unsigned long slack = 0;
407 unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
408 unsigned long busy_end = 0;
405 409
406 rcu_read_lock(); 410 rcu_read_lock();
407 retval = max_select_fd(n, fds); 411 retval = max_select_fd(n, fds);
@@ -424,6 +428,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
424 retval = 0; 428 retval = 0;
425 for (;;) { 429 for (;;) {
426 unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp; 430 unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
431 bool can_busy_loop = false;
427 432
428 inp = fds->in; outp = fds->out; exp = fds->ex; 433 inp = fds->in; outp = fds->out; exp = fds->ex;
429 rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex; 434 rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;
@@ -451,7 +456,8 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
451 f_op = f.file->f_op; 456 f_op = f.file->f_op;
452 mask = DEFAULT_POLLMASK; 457 mask = DEFAULT_POLLMASK;
453 if (f_op && f_op->poll) { 458 if (f_op && f_op->poll) {
454 wait_key_set(wait, in, out, bit); 459 wait_key_set(wait, in, out,
460 bit, busy_flag);
455 mask = (*f_op->poll)(f.file, wait); 461 mask = (*f_op->poll)(f.file, wait);
456 } 462 }
457 fdput(f); 463 fdput(f);
@@ -470,6 +476,18 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
470 retval++; 476 retval++;
471 wait->_qproc = NULL; 477 wait->_qproc = NULL;
472 } 478 }
479 /* got something, stop busy polling */
480 if (retval) {
481 can_busy_loop = false;
482 busy_flag = 0;
483
484 /*
485 * only remember a returned
486 * POLL_BUSY_LOOP if we asked for it
487 */
488 } else if (busy_flag & mask)
489 can_busy_loop = true;
490
473 } 491 }
474 } 492 }
475 if (res_in) 493 if (res_in)
@@ -488,6 +506,17 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
488 break; 506 break;
489 } 507 }
490 508
509 /* only if found POLL_BUSY_LOOP sockets && not out of time */
510 if (can_busy_loop && !need_resched()) {
511 if (!busy_end) {
512 busy_end = busy_loop_end_time();
513 continue;
514 }
515 if (!busy_loop_timeout(busy_end))
516 continue;
517 }
518 busy_flag = 0;
519
491 /* 520 /*
492 * If this is the first loop and we have a timeout 521 * If this is the first loop and we have a timeout
493 * given, then we convert to ktime_t and set the to 522 * given, then we convert to ktime_t and set the to
@@ -719,7 +748,9 @@ struct poll_list {
719 * pwait poll_table will be used by the fd-provided poll handler for waiting, 748 * pwait poll_table will be used by the fd-provided poll handler for waiting,
720 * if pwait->_qproc is non-NULL. 749 * if pwait->_qproc is non-NULL.
721 */ 750 */
722static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait) 751static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait,
752 bool *can_busy_poll,
753 unsigned int busy_flag)
723{ 754{
724 unsigned int mask; 755 unsigned int mask;
725 int fd; 756 int fd;
@@ -733,7 +764,10 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
733 mask = DEFAULT_POLLMASK; 764 mask = DEFAULT_POLLMASK;
734 if (f.file->f_op && f.file->f_op->poll) { 765 if (f.file->f_op && f.file->f_op->poll) {
735 pwait->_key = pollfd->events|POLLERR|POLLHUP; 766 pwait->_key = pollfd->events|POLLERR|POLLHUP;
767 pwait->_key |= busy_flag;
736 mask = f.file->f_op->poll(f.file, pwait); 768 mask = f.file->f_op->poll(f.file, pwait);
769 if (mask & busy_flag)
770 *can_busy_poll = true;
737 } 771 }
738 /* Mask out unneeded events. */ 772 /* Mask out unneeded events. */
739 mask &= pollfd->events | POLLERR | POLLHUP; 773 mask &= pollfd->events | POLLERR | POLLHUP;
@@ -752,6 +786,8 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
752 ktime_t expire, *to = NULL; 786 ktime_t expire, *to = NULL;
753 int timed_out = 0, count = 0; 787 int timed_out = 0, count = 0;
754 unsigned long slack = 0; 788 unsigned long slack = 0;
789 unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
790 unsigned long busy_end = 0;
755 791
756 /* Optimise the no-wait case */ 792 /* Optimise the no-wait case */
757 if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { 793 if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
@@ -764,6 +800,7 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
764 800
765 for (;;) { 801 for (;;) {
766 struct poll_list *walk; 802 struct poll_list *walk;
803 bool can_busy_loop = false;
767 804
768 for (walk = list; walk != NULL; walk = walk->next) { 805 for (walk = list; walk != NULL; walk = walk->next) {
769 struct pollfd * pfd, * pfd_end; 806 struct pollfd * pfd, * pfd_end;
@@ -778,9 +815,13 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
778 * this. They'll get immediately deregistered 815 * this. They'll get immediately deregistered
779 * when we break out and return. 816 * when we break out and return.
780 */ 817 */
781 if (do_pollfd(pfd, pt)) { 818 if (do_pollfd(pfd, pt, &can_busy_loop,
819 busy_flag)) {
782 count++; 820 count++;
783 pt->_qproc = NULL; 821 pt->_qproc = NULL;
822 /* found something, stop busy polling */
823 busy_flag = 0;
824 can_busy_loop = false;
784 } 825 }
785 } 826 }
786 } 827 }
@@ -797,6 +838,17 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
797 if (count || timed_out) 838 if (count || timed_out)
798 break; 839 break;
799 840
841 /* only if found POLL_BUSY_LOOP sockets && not out of time */
842 if (can_busy_loop && !need_resched()) {
843 if (!busy_end) {
844 busy_end = busy_loop_end_time();
845 continue;
846 }
847 if (!busy_loop_timeout(busy_end))
848 continue;
849 }
850 busy_flag = 0;
851
800 /* 852 /*
801 * If this is the first loop and we have a timeout 853 * If this is the first loop and we have a timeout
802 * given, then we convert to ktime_t and set the to 854 * given, then we convert to ktime_t and set the to
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 774c1eb7f1c9..3135c2525c76 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -921,3 +921,57 @@ struct hlist_node *seq_hlist_next_rcu(void *v,
921 return rcu_dereference(node->next); 921 return rcu_dereference(node->next);
922} 922}
923EXPORT_SYMBOL(seq_hlist_next_rcu); 923EXPORT_SYMBOL(seq_hlist_next_rcu);
924
925/**
926 * seq_hlist_start_precpu - start an iteration of a percpu hlist array
927 * @head: pointer to percpu array of struct hlist_heads
928 * @cpu: pointer to cpu "cursor"
929 * @pos: start position of sequence
930 *
931 * Called at seq_file->op->start().
932 */
933struct hlist_node *
934seq_hlist_start_percpu(struct hlist_head __percpu *head, int *cpu, loff_t pos)
935{
936 struct hlist_node *node;
937
938 for_each_possible_cpu(*cpu) {
939 hlist_for_each(node, per_cpu_ptr(head, *cpu)) {
940 if (pos-- == 0)
941 return node;
942 }
943 }
944 return NULL;
945}
946EXPORT_SYMBOL(seq_hlist_start_percpu);
947
948/**
949 * seq_hlist_next_percpu - move to the next position of the percpu hlist array
950 * @v: pointer to current hlist_node
951 * @head: pointer to percpu array of struct hlist_heads
952 * @cpu: pointer to cpu "cursor"
953 * @pos: start position of sequence
954 *
955 * Called at seq_file->op->next().
956 */
957struct hlist_node *
958seq_hlist_next_percpu(void *v, struct hlist_head __percpu *head,
959 int *cpu, loff_t *pos)
960{
961 struct hlist_node *node = v;
962
963 ++*pos;
964
965 if (node->next)
966 return node->next;
967
968 for (*cpu = cpumask_next(*cpu, cpu_possible_mask); *cpu < nr_cpu_ids;
969 *cpu = cpumask_next(*cpu, cpu_possible_mask)) {
970 struct hlist_head *bucket = per_cpu_ptr(head, *cpu);
971
972 if (!hlist_empty(bucket))
973 return bucket->first;
974 }
975 return NULL;
976}
977EXPORT_SYMBOL(seq_hlist_next_percpu);
diff --git a/fs/super.c b/fs/super.c
index 7465d4364208..68307c029228 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -336,19 +336,19 @@ EXPORT_SYMBOL(deactivate_super);
336 * and want to turn it into a full-blown active reference. grab_super() 336 * and want to turn it into a full-blown active reference. grab_super()
337 * is called with sb_lock held and drops it. Returns 1 in case of 337 * is called with sb_lock held and drops it. Returns 1 in case of
338 * success, 0 if we had failed (superblock contents was already dead or 338 * success, 0 if we had failed (superblock contents was already dead or
339 * dying when grab_super() had been called). 339 * dying when grab_super() had been called). Note that this is only
340 * called for superblocks not in rundown mode (== ones still on ->fs_supers
341 * of their type), so increment of ->s_count is OK here.
340 */ 342 */
341static int grab_super(struct super_block *s) __releases(sb_lock) 343static int grab_super(struct super_block *s) __releases(sb_lock)
342{ 344{
343 if (atomic_inc_not_zero(&s->s_active)) {
344 spin_unlock(&sb_lock);
345 return 1;
346 }
347 /* it's going away */
348 s->s_count++; 345 s->s_count++;
349 spin_unlock(&sb_lock); 346 spin_unlock(&sb_lock);
350 /* wait for it to die */
351 down_write(&s->s_umount); 347 down_write(&s->s_umount);
348 if ((s->s_flags & MS_BORN) && atomic_inc_not_zero(&s->s_active)) {
349 put_super(s);
350 return 1;
351 }
352 up_write(&s->s_umount); 352 up_write(&s->s_umount);
353 put_super(s); 353 put_super(s);
354 return 0; 354 return 0;
@@ -463,11 +463,6 @@ retry:
463 destroy_super(s); 463 destroy_super(s);
464 s = NULL; 464 s = NULL;
465 } 465 }
466 down_write(&old->s_umount);
467 if (unlikely(!(old->s_flags & MS_BORN))) {
468 deactivate_locked_super(old);
469 goto retry;
470 }
471 return old; 466 return old;
472 } 467 }
473 } 468 }
@@ -660,10 +655,10 @@ restart:
660 if (hlist_unhashed(&sb->s_instances)) 655 if (hlist_unhashed(&sb->s_instances))
661 continue; 656 continue;
662 if (sb->s_bdev == bdev) { 657 if (sb->s_bdev == bdev) {
663 if (grab_super(sb)) /* drops sb_lock */ 658 if (!grab_super(sb))
664 return sb;
665 else
666 goto restart; 659 goto restart;
660 up_write(&sb->s_umount);
661 return sb;
667 } 662 }
668 } 663 }
669 spin_unlock(&sb_lock); 664 spin_unlock(&sb_lock);
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index aec3d5c98c94..09a1a25cd145 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -20,38 +20,64 @@ static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
20 const struct attribute_group *grp) 20 const struct attribute_group *grp)
21{ 21{
22 struct attribute *const* attr; 22 struct attribute *const* attr;
23 int i; 23 struct bin_attribute *const* bin_attr;
24 24
25 for (i = 0, attr = grp->attrs; *attr; i++, attr++) 25 if (grp->attrs)
26 sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name); 26 for (attr = grp->attrs; *attr; attr++)
27 sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
28 if (grp->bin_attrs)
29 for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++)
30 sysfs_remove_bin_file(kobj, *bin_attr);
27} 31}
28 32
29static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, 33static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
30 const struct attribute_group *grp, int update) 34 const struct attribute_group *grp, int update)
31{ 35{
32 struct attribute *const* attr; 36 struct attribute *const* attr;
37 struct bin_attribute *const* bin_attr;
33 int error = 0, i; 38 int error = 0, i;
34 39
35 for (i = 0, attr = grp->attrs; *attr && !error; i++, attr++) { 40 if (grp->attrs) {
36 umode_t mode = 0; 41 for (i = 0, attr = grp->attrs; *attr && !error; i++, attr++) {
42 umode_t mode = 0;
43
44 /*
45 * In update mode, we're changing the permissions or
46 * visibility. Do this by first removing then
47 * re-adding (if required) the file.
48 */
49 if (update)
50 sysfs_hash_and_remove(dir_sd, NULL,
51 (*attr)->name);
52 if (grp->is_visible) {
53 mode = grp->is_visible(kobj, *attr, i);
54 if (!mode)
55 continue;
56 }
57 error = sysfs_add_file_mode(dir_sd, *attr,
58 SYSFS_KOBJ_ATTR,
59 (*attr)->mode | mode);
60 if (unlikely(error))
61 break;
62 }
63 if (error) {
64 remove_files(dir_sd, kobj, grp);
65 goto exit;
66 }
67 }
37 68
38 /* in update mode, we're changing the permissions or 69 if (grp->bin_attrs) {
39 * visibility. Do this by first removing then 70 for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) {
40 * re-adding (if required) the file */ 71 if (update)
41 if (update) 72 sysfs_remove_bin_file(kobj, *bin_attr);
42 sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name); 73 error = sysfs_create_bin_file(kobj, *bin_attr);
43 if (grp->is_visible) { 74 if (error)
44 mode = grp->is_visible(kobj, *attr, i); 75 break;
45 if (!mode)
46 continue;
47 } 76 }
48 error = sysfs_add_file_mode(dir_sd, *attr, SYSFS_KOBJ_ATTR, 77 if (error)
49 (*attr)->mode | mode); 78 remove_files(dir_sd, kobj, grp);
50 if (unlikely(error))
51 break;
52 } 79 }
53 if (error) 80exit:
54 remove_files(dir_sd, kobj, grp);
55 return error; 81 return error;
56} 82}
57 83
@@ -67,8 +93,8 @@ static int internal_create_group(struct kobject *kobj, int update,
67 /* Updates may happen before the object has been instantiated */ 93 /* Updates may happen before the object has been instantiated */
68 if (unlikely(update && !kobj->sd)) 94 if (unlikely(update && !kobj->sd))
69 return -EINVAL; 95 return -EINVAL;
70 if (!grp->attrs) { 96 if (!grp->attrs && !grp->bin_attrs) {
71 WARN(1, "sysfs: attrs not set by subsystem for group: %s/%s\n", 97 WARN(1, "sysfs: (bin_)attrs not set by subsystem for group: %s/%s\n",
72 kobj->name, grp->name ? "" : grp->name); 98 kobj->name, grp->name ? "" : grp->name);
73 return -EINVAL; 99 return -EINVAL;
74 } 100 }
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 32b644f03690..929312180dd0 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -8,6 +8,7 @@
8 * 8 *
9 */ 9 */
10 10
11#include <linux/alarmtimer.h>
11#include <linux/file.h> 12#include <linux/file.h>
12#include <linux/poll.h> 13#include <linux/poll.h>
13#include <linux/init.h> 14#include <linux/init.h>
@@ -26,7 +27,10 @@
26#include <linux/rcupdate.h> 27#include <linux/rcupdate.h>
27 28
28struct timerfd_ctx { 29struct timerfd_ctx {
29 struct hrtimer tmr; 30 union {
31 struct hrtimer tmr;
32 struct alarm alarm;
33 } t;
30 ktime_t tintv; 34 ktime_t tintv;
31 ktime_t moffs; 35 ktime_t moffs;
32 wait_queue_head_t wqh; 36 wait_queue_head_t wqh;
@@ -41,14 +45,19 @@ struct timerfd_ctx {
41static LIST_HEAD(cancel_list); 45static LIST_HEAD(cancel_list);
42static DEFINE_SPINLOCK(cancel_lock); 46static DEFINE_SPINLOCK(cancel_lock);
43 47
48static inline bool isalarm(struct timerfd_ctx *ctx)
49{
50 return ctx->clockid == CLOCK_REALTIME_ALARM ||
51 ctx->clockid == CLOCK_BOOTTIME_ALARM;
52}
53
44/* 54/*
45 * This gets called when the timer event triggers. We set the "expired" 55 * This gets called when the timer event triggers. We set the "expired"
46 * flag, but we do not re-arm the timer (in case it's necessary, 56 * flag, but we do not re-arm the timer (in case it's necessary,
47 * tintv.tv64 != 0) until the timer is accessed. 57 * tintv.tv64 != 0) until the timer is accessed.
48 */ 58 */
49static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr) 59static void timerfd_triggered(struct timerfd_ctx *ctx)
50{ 60{
51 struct timerfd_ctx *ctx = container_of(htmr, struct timerfd_ctx, tmr);
52 unsigned long flags; 61 unsigned long flags;
53 62
54 spin_lock_irqsave(&ctx->wqh.lock, flags); 63 spin_lock_irqsave(&ctx->wqh.lock, flags);
@@ -56,10 +65,25 @@ static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr)
56 ctx->ticks++; 65 ctx->ticks++;
57 wake_up_locked(&ctx->wqh); 66 wake_up_locked(&ctx->wqh);
58 spin_unlock_irqrestore(&ctx->wqh.lock, flags); 67 spin_unlock_irqrestore(&ctx->wqh.lock, flags);
68}
59 69
70static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr)
71{
72 struct timerfd_ctx *ctx = container_of(htmr, struct timerfd_ctx,
73 t.tmr);
74 timerfd_triggered(ctx);
60 return HRTIMER_NORESTART; 75 return HRTIMER_NORESTART;
61} 76}
62 77
78static enum alarmtimer_restart timerfd_alarmproc(struct alarm *alarm,
79 ktime_t now)
80{
81 struct timerfd_ctx *ctx = container_of(alarm, struct timerfd_ctx,
82 t.alarm);
83 timerfd_triggered(ctx);
84 return ALARMTIMER_NORESTART;
85}
86
63/* 87/*
64 * Called when the clock was set to cancel the timers in the cancel 88 * Called when the clock was set to cancel the timers in the cancel
65 * list. This will wake up processes waiting on these timers. The 89 * list. This will wake up processes waiting on these timers. The
@@ -107,8 +131,9 @@ static bool timerfd_canceled(struct timerfd_ctx *ctx)
107 131
108static void timerfd_setup_cancel(struct timerfd_ctx *ctx, int flags) 132static void timerfd_setup_cancel(struct timerfd_ctx *ctx, int flags)
109{ 133{
110 if (ctx->clockid == CLOCK_REALTIME && (flags & TFD_TIMER_ABSTIME) && 134 if ((ctx->clockid == CLOCK_REALTIME ||
111 (flags & TFD_TIMER_CANCEL_ON_SET)) { 135 ctx->clockid == CLOCK_REALTIME_ALARM) &&
136 (flags & TFD_TIMER_ABSTIME) && (flags & TFD_TIMER_CANCEL_ON_SET)) {
112 if (!ctx->might_cancel) { 137 if (!ctx->might_cancel) {
113 ctx->might_cancel = true; 138 ctx->might_cancel = true;
114 spin_lock(&cancel_lock); 139 spin_lock(&cancel_lock);
@@ -124,7 +149,11 @@ static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx)
124{ 149{
125 ktime_t remaining; 150 ktime_t remaining;
126 151
127 remaining = hrtimer_expires_remaining(&ctx->tmr); 152 if (isalarm(ctx))
153 remaining = alarm_expires_remaining(&ctx->t.alarm);
154 else
155 remaining = hrtimer_expires_remaining(&ctx->t.tmr);
156
128 return remaining.tv64 < 0 ? ktime_set(0, 0): remaining; 157 return remaining.tv64 < 0 ? ktime_set(0, 0): remaining;
129} 158}
130 159
@@ -142,11 +171,28 @@ static int timerfd_setup(struct timerfd_ctx *ctx, int flags,
142 ctx->expired = 0; 171 ctx->expired = 0;
143 ctx->ticks = 0; 172 ctx->ticks = 0;
144 ctx->tintv = timespec_to_ktime(ktmr->it_interval); 173 ctx->tintv = timespec_to_ktime(ktmr->it_interval);
145 hrtimer_init(&ctx->tmr, clockid, htmode); 174
146 hrtimer_set_expires(&ctx->tmr, texp); 175 if (isalarm(ctx)) {
147 ctx->tmr.function = timerfd_tmrproc; 176 alarm_init(&ctx->t.alarm,
177 ctx->clockid == CLOCK_REALTIME_ALARM ?
178 ALARM_REALTIME : ALARM_BOOTTIME,
179 timerfd_alarmproc);
180 } else {
181 hrtimer_init(&ctx->t.tmr, clockid, htmode);
182 hrtimer_set_expires(&ctx->t.tmr, texp);
183 ctx->t.tmr.function = timerfd_tmrproc;
184 }
185
148 if (texp.tv64 != 0) { 186 if (texp.tv64 != 0) {
149 hrtimer_start(&ctx->tmr, texp, htmode); 187 if (isalarm(ctx)) {
188 if (flags & TFD_TIMER_ABSTIME)
189 alarm_start(&ctx->t.alarm, texp);
190 else
191 alarm_start_relative(&ctx->t.alarm, texp);
192 } else {
193 hrtimer_start(&ctx->t.tmr, texp, htmode);
194 }
195
150 if (timerfd_canceled(ctx)) 196 if (timerfd_canceled(ctx))
151 return -ECANCELED; 197 return -ECANCELED;
152 } 198 }
@@ -158,7 +204,11 @@ static int timerfd_release(struct inode *inode, struct file *file)
158 struct timerfd_ctx *ctx = file->private_data; 204 struct timerfd_ctx *ctx = file->private_data;
159 205
160 timerfd_remove_cancel(ctx); 206 timerfd_remove_cancel(ctx);
161 hrtimer_cancel(&ctx->tmr); 207
208 if (isalarm(ctx))
209 alarm_cancel(&ctx->t.alarm);
210 else
211 hrtimer_cancel(&ctx->t.tmr);
162 kfree_rcu(ctx, rcu); 212 kfree_rcu(ctx, rcu);
163 return 0; 213 return 0;
164} 214}
@@ -215,9 +265,15 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count,
215 * callback to avoid DoS attacks specifying a very 265 * callback to avoid DoS attacks specifying a very
216 * short timer period. 266 * short timer period.
217 */ 267 */
218 ticks += hrtimer_forward_now(&ctx->tmr, 268 if (isalarm(ctx)) {
219 ctx->tintv) - 1; 269 ticks += alarm_forward_now(
220 hrtimer_restart(&ctx->tmr); 270 &ctx->t.alarm, ctx->tintv) - 1;
271 alarm_restart(&ctx->t.alarm);
272 } else {
273 ticks += hrtimer_forward_now(&ctx->t.tmr,
274 ctx->tintv) - 1;
275 hrtimer_restart(&ctx->t.tmr);
276 }
221 } 277 }
222 ctx->expired = 0; 278 ctx->expired = 0;
223 ctx->ticks = 0; 279 ctx->ticks = 0;
@@ -259,7 +315,9 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
259 315
260 if ((flags & ~TFD_CREATE_FLAGS) || 316 if ((flags & ~TFD_CREATE_FLAGS) ||
261 (clockid != CLOCK_MONOTONIC && 317 (clockid != CLOCK_MONOTONIC &&
262 clockid != CLOCK_REALTIME)) 318 clockid != CLOCK_REALTIME &&
319 clockid != CLOCK_REALTIME_ALARM &&
320 clockid != CLOCK_BOOTTIME_ALARM))
263 return -EINVAL; 321 return -EINVAL;
264 322
265 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); 323 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
@@ -268,7 +326,15 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
268 326
269 init_waitqueue_head(&ctx->wqh); 327 init_waitqueue_head(&ctx->wqh);
270 ctx->clockid = clockid; 328 ctx->clockid = clockid;
271 hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS); 329
330 if (isalarm(ctx))
331 alarm_init(&ctx->t.alarm,
332 ctx->clockid == CLOCK_REALTIME_ALARM ?
333 ALARM_REALTIME : ALARM_BOOTTIME,
334 timerfd_alarmproc);
335 else
336 hrtimer_init(&ctx->t.tmr, clockid, HRTIMER_MODE_ABS);
337
272 ctx->moffs = ktime_get_monotonic_offset(); 338 ctx->moffs = ktime_get_monotonic_offset();
273 339
274 ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx, 340 ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx,
@@ -305,8 +371,14 @@ static int do_timerfd_settime(int ufd, int flags,
305 */ 371 */
306 for (;;) { 372 for (;;) {
307 spin_lock_irq(&ctx->wqh.lock); 373 spin_lock_irq(&ctx->wqh.lock);
308 if (hrtimer_try_to_cancel(&ctx->tmr) >= 0) 374
309 break; 375 if (isalarm(ctx)) {
376 if (alarm_try_to_cancel(&ctx->t.alarm) >= 0)
377 break;
378 } else {
379 if (hrtimer_try_to_cancel(&ctx->t.tmr) >= 0)
380 break;
381 }
310 spin_unlock_irq(&ctx->wqh.lock); 382 spin_unlock_irq(&ctx->wqh.lock);
311 cpu_relax(); 383 cpu_relax();
312 } 384 }
@@ -317,8 +389,12 @@ static int do_timerfd_settime(int ufd, int flags,
317 * We do not update "ticks" and "expired" since the timer will be 389 * We do not update "ticks" and "expired" since the timer will be
318 * re-programmed again in the following timerfd_setup() call. 390 * re-programmed again in the following timerfd_setup() call.
319 */ 391 */
320 if (ctx->expired && ctx->tintv.tv64) 392 if (ctx->expired && ctx->tintv.tv64) {
321 hrtimer_forward_now(&ctx->tmr, ctx->tintv); 393 if (isalarm(ctx))
394 alarm_forward_now(&ctx->t.alarm, ctx->tintv);
395 else
396 hrtimer_forward_now(&ctx->t.tmr, ctx->tintv);
397 }
322 398
323 old->it_value = ktime_to_timespec(timerfd_get_remaining(ctx)); 399 old->it_value = ktime_to_timespec(timerfd_get_remaining(ctx));
324 old->it_interval = ktime_to_timespec(ctx->tintv); 400 old->it_interval = ktime_to_timespec(ctx->tintv);
@@ -345,9 +421,18 @@ static int do_timerfd_gettime(int ufd, struct itimerspec *t)
345 spin_lock_irq(&ctx->wqh.lock); 421 spin_lock_irq(&ctx->wqh.lock);
346 if (ctx->expired && ctx->tintv.tv64) { 422 if (ctx->expired && ctx->tintv.tv64) {
347 ctx->expired = 0; 423 ctx->expired = 0;
348 ctx->ticks += 424
349 hrtimer_forward_now(&ctx->tmr, ctx->tintv) - 1; 425 if (isalarm(ctx)) {
350 hrtimer_restart(&ctx->tmr); 426 ctx->ticks +=
427 alarm_forward_now(
428 &ctx->t.alarm, ctx->tintv) - 1;
429 alarm_restart(&ctx->t.alarm);
430 } else {
431 ctx->ticks +=
432 hrtimer_forward_now(&ctx->t.tmr, ctx->tintv)
433 - 1;
434 hrtimer_restart(&ctx->t.tmr);
435 }
351 } 436 }
352 t->it_value = ktime_to_timespec(timerfd_get_remaining(ctx)); 437 t->it_value = ktime_to_timespec(timerfd_get_remaining(ctx));
353 t->it_interval = ktime_to_timespec(ctx->tintv); 438 t->it_interval = ktime_to_timespec(ctx->tintv);
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index f21acf0ef01f..879b9976c12b 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1412,7 +1412,7 @@ static int mount_ubifs(struct ubifs_info *c)
1412 1412
1413 ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"%s", 1413 ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"%s",
1414 c->vi.ubi_num, c->vi.vol_id, c->vi.name, 1414 c->vi.ubi_num, c->vi.vol_id, c->vi.name,
1415 c->ro_mount ? ", R/O mode" : NULL); 1415 c->ro_mount ? ", R/O mode" : "");
1416 x = (long long)c->main_lebs * c->leb_size; 1416 x = (long long)c->main_lebs * c->leb_size;
1417 y = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes; 1417 y = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes;
1418 ubifs_msg("LEB size: %d bytes (%d KiB), min./max. I/O unit sizes: %d bytes/%d bytes", 1418 ubifs_msg("LEB size: %d bytes (%d KiB), min./max. I/O unit sizes: %d bytes/%d bytes",
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 6313b69b6644..4a4508023a3c 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -71,6 +71,7 @@ xfs-y += xfs_alloc.o \
71 xfs_dir2_sf.o \ 71 xfs_dir2_sf.o \
72 xfs_ialloc.o \ 72 xfs_ialloc.o \
73 xfs_ialloc_btree.o \ 73 xfs_ialloc_btree.o \
74 xfs_icreate_item.o \
74 xfs_inode.o \ 75 xfs_inode.o \
75 xfs_log_recover.o \ 76 xfs_log_recover.o \
76 xfs_mount.o \ 77 xfs_mount.o \
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 5673bcfda2f0..71596e57283a 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -175,6 +175,7 @@ xfs_alloc_compute_diff(
175 xfs_agblock_t wantbno, /* target starting block */ 175 xfs_agblock_t wantbno, /* target starting block */
176 xfs_extlen_t wantlen, /* target length */ 176 xfs_extlen_t wantlen, /* target length */
177 xfs_extlen_t alignment, /* target alignment */ 177 xfs_extlen_t alignment, /* target alignment */
178 char userdata, /* are we allocating data? */
178 xfs_agblock_t freebno, /* freespace's starting block */ 179 xfs_agblock_t freebno, /* freespace's starting block */
179 xfs_extlen_t freelen, /* freespace's length */ 180 xfs_extlen_t freelen, /* freespace's length */
180 xfs_agblock_t *newbnop) /* result: best start block from free */ 181 xfs_agblock_t *newbnop) /* result: best start block from free */
@@ -189,7 +190,14 @@ xfs_alloc_compute_diff(
189 ASSERT(freelen >= wantlen); 190 ASSERT(freelen >= wantlen);
190 freeend = freebno + freelen; 191 freeend = freebno + freelen;
191 wantend = wantbno + wantlen; 192 wantend = wantbno + wantlen;
192 if (freebno >= wantbno) { 193 /*
194 * We want to allocate from the start of a free extent if it is past
195 * the desired block or if we are allocating user data and the free
196 * extent is before desired block. The second case is there to allow
197 * for contiguous allocation from the remaining free space if the file
198 * grows in the short term.
199 */
200 if (freebno >= wantbno || (userdata && freeend < wantend)) {
193 if ((newbno1 = roundup(freebno, alignment)) >= freeend) 201 if ((newbno1 = roundup(freebno, alignment)) >= freeend)
194 newbno1 = NULLAGBLOCK; 202 newbno1 = NULLAGBLOCK;
195 } else if (freeend >= wantend && alignment > 1) { 203 } else if (freeend >= wantend && alignment > 1) {
@@ -805,7 +813,8 @@ xfs_alloc_find_best_extent(
805 xfs_alloc_fix_len(args); 813 xfs_alloc_fix_len(args);
806 814
807 sdiff = xfs_alloc_compute_diff(args->agbno, args->len, 815 sdiff = xfs_alloc_compute_diff(args->agbno, args->len,
808 args->alignment, *sbnoa, 816 args->alignment,
817 args->userdata, *sbnoa,
809 *slena, &new); 818 *slena, &new);
810 819
811 /* 820 /*
@@ -976,7 +985,8 @@ restart:
976 if (args->len < blen) 985 if (args->len < blen)
977 continue; 986 continue;
978 ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, 987 ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
979 args->alignment, ltbnoa, ltlena, &ltnew); 988 args->alignment, args->userdata, ltbnoa,
989 ltlena, &ltnew);
980 if (ltnew != NULLAGBLOCK && 990 if (ltnew != NULLAGBLOCK &&
981 (args->len > blen || ltdiff < bdiff)) { 991 (args->len > blen || ltdiff < bdiff)) {
982 bdiff = ltdiff; 992 bdiff = ltdiff;
@@ -1128,7 +1138,8 @@ restart:
1128 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); 1138 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
1129 xfs_alloc_fix_len(args); 1139 xfs_alloc_fix_len(args);
1130 ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, 1140 ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
1131 args->alignment, ltbnoa, ltlena, &ltnew); 1141 args->alignment, args->userdata, ltbnoa,
1142 ltlena, &ltnew);
1132 1143
1133 error = xfs_alloc_find_best_extent(args, 1144 error = xfs_alloc_find_best_extent(args,
1134 &bno_cur_lt, &bno_cur_gt, 1145 &bno_cur_lt, &bno_cur_gt,
@@ -1144,7 +1155,8 @@ restart:
1144 args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen); 1155 args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
1145 xfs_alloc_fix_len(args); 1156 xfs_alloc_fix_len(args);
1146 gtdiff = xfs_alloc_compute_diff(args->agbno, args->len, 1157 gtdiff = xfs_alloc_compute_diff(args->agbno, args->len,
1147 args->alignment, gtbnoa, gtlena, &gtnew); 1158 args->alignment, args->userdata, gtbnoa,
1159 gtlena, &gtnew);
1148 1160
1149 error = xfs_alloc_find_best_extent(args, 1161 error = xfs_alloc_find_best_extent(args,
1150 &bno_cur_gt, &bno_cur_lt, 1162 &bno_cur_gt, &bno_cur_lt,
@@ -1203,7 +1215,7 @@ restart:
1203 } 1215 }
1204 rlen = args->len; 1216 rlen = args->len;
1205 (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, 1217 (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment,
1206 ltbnoa, ltlena, &ltnew); 1218 args->userdata, ltbnoa, ltlena, &ltnew);
1207 ASSERT(ltnew >= ltbno); 1219 ASSERT(ltnew >= ltbno);
1208 ASSERT(ltnew + rlen <= ltbnoa + ltlena); 1220 ASSERT(ltnew + rlen <= ltbnoa + ltlena);
1209 ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); 1221 ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 31d3cd129269..b800fbcafc7f 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -690,6 +690,8 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
690 sf = (xfs_attr_shortform_t *)tmpbuffer; 690 sf = (xfs_attr_shortform_t *)tmpbuffer;
691 691
692 xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); 692 xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
693 xfs_bmap_local_to_extents_empty(dp, XFS_ATTR_FORK);
694
693 bp = NULL; 695 bp = NULL;
694 error = xfs_da_grow_inode(args, &blkno); 696 error = xfs_da_grow_inode(args, &blkno);
695 if (error) { 697 if (error) {
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 89042848f9ec..05c698ccb238 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -1161,6 +1161,24 @@ xfs_bmap_extents_to_btree(
1161 * since the file data needs to get logged so things will stay consistent. 1161 * since the file data needs to get logged so things will stay consistent.
1162 * (The bmap-level manipulations are ok, though). 1162 * (The bmap-level manipulations are ok, though).
1163 */ 1163 */
1164void
1165xfs_bmap_local_to_extents_empty(
1166 struct xfs_inode *ip,
1167 int whichfork)
1168{
1169 struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
1170
1171 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
1172 ASSERT(ifp->if_bytes == 0);
1173 ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0);
1174
1175 xfs_bmap_forkoff_reset(ip->i_mount, ip, whichfork);
1176 ifp->if_flags &= ~XFS_IFINLINE;
1177 ifp->if_flags |= XFS_IFEXTENTS;
1178 XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
1179}
1180
1181
1164STATIC int /* error */ 1182STATIC int /* error */
1165xfs_bmap_local_to_extents( 1183xfs_bmap_local_to_extents(
1166 xfs_trans_t *tp, /* transaction pointer */ 1184 xfs_trans_t *tp, /* transaction pointer */
@@ -1174,9 +1192,12 @@ xfs_bmap_local_to_extents(
1174 struct xfs_inode *ip, 1192 struct xfs_inode *ip,
1175 struct xfs_ifork *ifp)) 1193 struct xfs_ifork *ifp))
1176{ 1194{
1177 int error; /* error return value */ 1195 int error = 0;
1178 int flags; /* logging flags returned */ 1196 int flags; /* logging flags returned */
1179 xfs_ifork_t *ifp; /* inode fork pointer */ 1197 xfs_ifork_t *ifp; /* inode fork pointer */
1198 xfs_alloc_arg_t args; /* allocation arguments */
1199 xfs_buf_t *bp; /* buffer for extent block */
1200 xfs_bmbt_rec_host_t *ep; /* extent record pointer */
1180 1201
1181 /* 1202 /*
1182 * We don't want to deal with the case of keeping inode data inline yet. 1203 * We don't want to deal with the case of keeping inode data inline yet.
@@ -1185,68 +1206,65 @@ xfs_bmap_local_to_extents(
1185 ASSERT(!(S_ISREG(ip->i_d.di_mode) && whichfork == XFS_DATA_FORK)); 1206 ASSERT(!(S_ISREG(ip->i_d.di_mode) && whichfork == XFS_DATA_FORK));
1186 ifp = XFS_IFORK_PTR(ip, whichfork); 1207 ifp = XFS_IFORK_PTR(ip, whichfork);
1187 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); 1208 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
1209
1210 if (!ifp->if_bytes) {
1211 xfs_bmap_local_to_extents_empty(ip, whichfork);
1212 flags = XFS_ILOG_CORE;
1213 goto done;
1214 }
1215
1188 flags = 0; 1216 flags = 0;
1189 error = 0; 1217 error = 0;
1190 if (ifp->if_bytes) { 1218 ASSERT((ifp->if_flags & (XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) ==
1191 xfs_alloc_arg_t args; /* allocation arguments */ 1219 XFS_IFINLINE);
1192 xfs_buf_t *bp; /* buffer for extent block */ 1220 memset(&args, 0, sizeof(args));
1193 xfs_bmbt_rec_host_t *ep;/* extent record pointer */ 1221 args.tp = tp;
1194 1222 args.mp = ip->i_mount;
1195 ASSERT((ifp->if_flags & 1223 args.firstblock = *firstblock;
1196 (XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) == XFS_IFINLINE); 1224 /*
1197 memset(&args, 0, sizeof(args)); 1225 * Allocate a block. We know we need only one, since the
1198 args.tp = tp; 1226 * file currently fits in an inode.
1199 args.mp = ip->i_mount; 1227 */
1200 args.firstblock = *firstblock; 1228 if (*firstblock == NULLFSBLOCK) {
1201 /* 1229 args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino);
1202 * Allocate a block. We know we need only one, since the 1230 args.type = XFS_ALLOCTYPE_START_BNO;
1203 * file currently fits in an inode.
1204 */
1205 if (*firstblock == NULLFSBLOCK) {
1206 args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino);
1207 args.type = XFS_ALLOCTYPE_START_BNO;
1208 } else {
1209 args.fsbno = *firstblock;
1210 args.type = XFS_ALLOCTYPE_NEAR_BNO;
1211 }
1212 args.total = total;
1213 args.minlen = args.maxlen = args.prod = 1;
1214 error = xfs_alloc_vextent(&args);
1215 if (error)
1216 goto done;
1217
1218 /* Can't fail, the space was reserved. */
1219 ASSERT(args.fsbno != NULLFSBLOCK);
1220 ASSERT(args.len == 1);
1221 *firstblock = args.fsbno;
1222 bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
1223
1224 /* initialise the block and copy the data */
1225 init_fn(tp, bp, ip, ifp);
1226
1227 /* account for the change in fork size and log everything */
1228 xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
1229 xfs_bmap_forkoff_reset(args.mp, ip, whichfork);
1230 xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
1231 xfs_iext_add(ifp, 0, 1);
1232 ep = xfs_iext_get_ext(ifp, 0);
1233 xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM);
1234 trace_xfs_bmap_post_update(ip, 0,
1235 whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0,
1236 _THIS_IP_);
1237 XFS_IFORK_NEXT_SET(ip, whichfork, 1);
1238 ip->i_d.di_nblocks = 1;
1239 xfs_trans_mod_dquot_byino(tp, ip,
1240 XFS_TRANS_DQ_BCOUNT, 1L);
1241 flags |= xfs_ilog_fext(whichfork);
1242 } else { 1231 } else {
1243 ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0); 1232 args.fsbno = *firstblock;
1244 xfs_bmap_forkoff_reset(ip->i_mount, ip, whichfork); 1233 args.type = XFS_ALLOCTYPE_NEAR_BNO;
1245 } 1234 }
1246 ifp->if_flags &= ~XFS_IFINLINE; 1235 args.total = total;
1247 ifp->if_flags |= XFS_IFEXTENTS; 1236 args.minlen = args.maxlen = args.prod = 1;
1248 XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); 1237 error = xfs_alloc_vextent(&args);
1238 if (error)
1239 goto done;
1240
1241 /* Can't fail, the space was reserved. */
1242 ASSERT(args.fsbno != NULLFSBLOCK);
1243 ASSERT(args.len == 1);
1244 *firstblock = args.fsbno;
1245 bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
1246
1247 /* initialise the block and copy the data */
1248 init_fn(tp, bp, ip, ifp);
1249
1250 /* account for the change in fork size and log everything */
1251 xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
1252 xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
1253 xfs_bmap_local_to_extents_empty(ip, whichfork);
1249 flags |= XFS_ILOG_CORE; 1254 flags |= XFS_ILOG_CORE;
1255
1256 xfs_iext_add(ifp, 0, 1);
1257 ep = xfs_iext_get_ext(ifp, 0);
1258 xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM);
1259 trace_xfs_bmap_post_update(ip, 0,
1260 whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0,
1261 _THIS_IP_);
1262 XFS_IFORK_NEXT_SET(ip, whichfork, 1);
1263 ip->i_d.di_nblocks = 1;
1264 xfs_trans_mod_dquot_byino(tp, ip,
1265 XFS_TRANS_DQ_BCOUNT, 1L);
1266 flags |= xfs_ilog_fext(whichfork);
1267
1250done: 1268done:
1251 *logflagsp = flags; 1269 *logflagsp = flags;
1252 return error; 1270 return error;
@@ -1323,25 +1341,6 @@ xfs_bmap_add_attrfork_extents(
1323} 1341}
1324 1342
1325/* 1343/*
1326 * Block initialisation function for local to extent format conversion.
1327 *
1328 * This shouldn't actually be called by anyone, so make sure debug kernels cause
1329 * a noticable failure.
1330 */
1331STATIC void
1332xfs_bmap_local_to_extents_init_fn(
1333 struct xfs_trans *tp,
1334 struct xfs_buf *bp,
1335 struct xfs_inode *ip,
1336 struct xfs_ifork *ifp)
1337{
1338 ASSERT(0);
1339 bp->b_ops = &xfs_bmbt_buf_ops;
1340 memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
1341 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF);
1342}
1343
1344/*
1345 * Called from xfs_bmap_add_attrfork to handle local format files. Each 1344 * Called from xfs_bmap_add_attrfork to handle local format files. Each
1346 * different data fork content type needs a different callout to do the 1345 * different data fork content type needs a different callout to do the
1347 * conversion. Some are basic and only require special block initialisation 1346 * conversion. Some are basic and only require special block initialisation
@@ -1381,9 +1380,9 @@ xfs_bmap_add_attrfork_local(
1381 flags, XFS_DATA_FORK, 1380 flags, XFS_DATA_FORK,
1382 xfs_symlink_local_to_remote); 1381 xfs_symlink_local_to_remote);
1383 1382
1384 return xfs_bmap_local_to_extents(tp, ip, firstblock, 1, flags, 1383 /* should only be called for types that support local format data */
1385 XFS_DATA_FORK, 1384 ASSERT(0);
1386 xfs_bmap_local_to_extents_init_fn); 1385 return EFSCORRUPTED;
1387} 1386}
1388 1387
1389/* 1388/*
@@ -4907,20 +4906,19 @@ xfs_bmapi_write(
4907 orig_mval = mval; 4906 orig_mval = mval;
4908 orig_nmap = *nmap; 4907 orig_nmap = *nmap;
4909#endif 4908#endif
4909 whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
4910 XFS_ATTR_FORK : XFS_DATA_FORK;
4910 4911
4911 ASSERT(*nmap >= 1); 4912 ASSERT(*nmap >= 1);
4912 ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); 4913 ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
4913 ASSERT(!(flags & XFS_BMAPI_IGSTATE)); 4914 ASSERT(!(flags & XFS_BMAPI_IGSTATE));
4914 ASSERT(tp != NULL); 4915 ASSERT(tp != NULL);
4915 ASSERT(len > 0); 4916 ASSERT(len > 0);
4916 4917 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
4917 whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
4918 XFS_ATTR_FORK : XFS_DATA_FORK;
4919 4918
4920 if (unlikely(XFS_TEST_ERROR( 4919 if (unlikely(XFS_TEST_ERROR(
4921 (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && 4920 (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
4922 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && 4921 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
4923 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL),
4924 mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { 4922 mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
4925 XFS_ERROR_REPORT("xfs_bmapi_write", XFS_ERRLEVEL_LOW, mp); 4923 XFS_ERROR_REPORT("xfs_bmapi_write", XFS_ERRLEVEL_LOW, mp);
4926 return XFS_ERROR(EFSCORRUPTED); 4924 return XFS_ERROR(EFSCORRUPTED);
@@ -4933,37 +4931,6 @@ xfs_bmapi_write(
4933 4931
4934 XFS_STATS_INC(xs_blk_mapw); 4932 XFS_STATS_INC(xs_blk_mapw);
4935 4933
4936 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
4937 /*
4938 * XXX (dgc): This assumes we are only called for inodes that
4939 * contain content neutral data in local format. Anything that
4940 * contains caller-specific data in local format that needs
4941 * transformation to move to a block format needs to do the
4942 * conversion to extent format itself.
4943 *
4944 * Directory data forks and attribute forks handle this
4945 * themselves, but with the addition of metadata verifiers every
4946 * data fork in local format now contains caller specific data
4947 * and as such conversion through this function is likely to be
4948 * broken.
4949 *
4950 * The only likely user of this branch is for remote symlinks,
4951 * but we cannot overwrite the data fork contents of the symlink
4952 * (EEXIST occurs higher up the stack) and so it will never go
4953 * from local format to extent format here. Hence I don't think
4954 * this branch is ever executed intentionally and we should
4955 * consider removing it and asserting that xfs_bmapi_write()
4956 * cannot be called directly on local format forks. i.e. callers
4957 * are completely responsible for local to extent format
4958 * conversion, not xfs_bmapi_write().
4959 */
4960 error = xfs_bmap_local_to_extents(tp, ip, firstblock, total,
4961 &bma.logflags, whichfork,
4962 xfs_bmap_local_to_extents_init_fn);
4963 if (error)
4964 goto error0;
4965 }
4966
4967 if (*firstblock == NULLFSBLOCK) { 4934 if (*firstblock == NULLFSBLOCK) {
4968 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE) 4935 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE)
4969 bma.minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1; 4936 bma.minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1;
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 5f469c3516eb..1cf1292d29b7 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -172,6 +172,7 @@ void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt,
172#endif 172#endif
173 173
174int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); 174int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
175void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
175void xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len, 176void xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len,
176 struct xfs_bmap_free *flist, struct xfs_mount *mp); 177 struct xfs_bmap_free *flist, struct xfs_mount *mp);
177void xfs_bmap_cancel(struct xfs_bmap_free *flist); 178void xfs_bmap_cancel(struct xfs_bmap_free *flist);
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index 70c43d9f72c1..1b726d626941 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -196,6 +196,8 @@ typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t;
196#define XFS_BMDR_SPACE_CALC(nrecs) \ 196#define XFS_BMDR_SPACE_CALC(nrecs) \
197 (int)(sizeof(xfs_bmdr_block_t) + \ 197 (int)(sizeof(xfs_bmdr_block_t) + \
198 ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)))) 198 ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t))))
199#define XFS_BMAP_BMDR_SPACE(bb) \
200 (XFS_BMDR_SPACE_CALC(be16_to_cpu((bb)->bb_numrecs)))
199 201
200/* 202/*
201 * Maximum number of bmap btree levels. 203 * Maximum number of bmap btree levels.
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 4ec431777048..bfc4e0c26fd3 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -140,6 +140,16 @@ xfs_buf_item_size(
140 140
141 ASSERT(bip->bli_flags & XFS_BLI_LOGGED); 141 ASSERT(bip->bli_flags & XFS_BLI_LOGGED);
142 142
143 if (bip->bli_flags & XFS_BLI_ORDERED) {
144 /*
145 * The buffer has been logged just to order it.
146 * It is not being included in the transaction
147 * commit, so no vectors are used at all.
148 */
149 trace_xfs_buf_item_size_ordered(bip);
150 return XFS_LOG_VEC_ORDERED;
151 }
152
143 /* 153 /*
144 * the vector count is based on the number of buffer vectors we have 154 * the vector count is based on the number of buffer vectors we have
145 * dirty bits in. This will only be greater than one when we have a 155 * dirty bits in. This will only be greater than one when we have a
@@ -212,6 +222,7 @@ xfs_buf_item_format_segment(
212 goto out; 222 goto out;
213 } 223 }
214 224
225
215 /* 226 /*
216 * Fill in an iovec for each set of contiguous chunks. 227 * Fill in an iovec for each set of contiguous chunks.
217 */ 228 */
@@ -299,18 +310,36 @@ xfs_buf_item_format(
299 310
300 /* 311 /*
301 * If it is an inode buffer, transfer the in-memory state to the 312 * If it is an inode buffer, transfer the in-memory state to the
302 * format flags and clear the in-memory state. We do not transfer 313 * format flags and clear the in-memory state.
314 *
315 * For buffer based inode allocation, we do not transfer
303 * this state if the inode buffer allocation has not yet been committed 316 * this state if the inode buffer allocation has not yet been committed
304 * to the log as setting the XFS_BLI_INODE_BUF flag will prevent 317 * to the log as setting the XFS_BLI_INODE_BUF flag will prevent
305 * correct replay of the inode allocation. 318 * correct replay of the inode allocation.
319 *
320 * For icreate item based inode allocation, the buffers aren't written
321 * to the journal during allocation, and hence we should always tag the
322 * buffer as an inode buffer so that the correct unlinked list replay
323 * occurs during recovery.
306 */ 324 */
307 if (bip->bli_flags & XFS_BLI_INODE_BUF) { 325 if (bip->bli_flags & XFS_BLI_INODE_BUF) {
308 if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && 326 if (xfs_sb_version_hascrc(&lip->li_mountp->m_sb) ||
327 !((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
309 xfs_log_item_in_current_chkpt(lip))) 328 xfs_log_item_in_current_chkpt(lip)))
310 bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF; 329 bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF;
311 bip->bli_flags &= ~XFS_BLI_INODE_BUF; 330 bip->bli_flags &= ~XFS_BLI_INODE_BUF;
312 } 331 }
313 332
333 if ((bip->bli_flags & (XFS_BLI_ORDERED|XFS_BLI_STALE)) ==
334 XFS_BLI_ORDERED) {
335 /*
336 * The buffer has been logged just to order it. It is not being
337 * included in the transaction commit, so don't format it.
338 */
339 trace_xfs_buf_item_format_ordered(bip);
340 return;
341 }
342
314 for (i = 0; i < bip->bli_format_count; i++) { 343 for (i = 0; i < bip->bli_format_count; i++) {
315 vecp = xfs_buf_item_format_segment(bip, vecp, offset, 344 vecp = xfs_buf_item_format_segment(bip, vecp, offset,
316 &bip->bli_formats[i]); 345 &bip->bli_formats[i]);
@@ -340,6 +369,7 @@ xfs_buf_item_pin(
340 369
341 ASSERT(atomic_read(&bip->bli_refcount) > 0); 370 ASSERT(atomic_read(&bip->bli_refcount) > 0);
342 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || 371 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
372 (bip->bli_flags & XFS_BLI_ORDERED) ||
343 (bip->bli_flags & XFS_BLI_STALE)); 373 (bip->bli_flags & XFS_BLI_STALE));
344 374
345 trace_xfs_buf_item_pin(bip); 375 trace_xfs_buf_item_pin(bip);
@@ -512,8 +542,9 @@ xfs_buf_item_unlock(
512{ 542{
513 struct xfs_buf_log_item *bip = BUF_ITEM(lip); 543 struct xfs_buf_log_item *bip = BUF_ITEM(lip);
514 struct xfs_buf *bp = bip->bli_buf; 544 struct xfs_buf *bp = bip->bli_buf;
515 int aborted, clean, i; 545 bool clean;
516 uint hold; 546 bool aborted;
547 int flags;
517 548
518 /* Clear the buffer's association with this transaction. */ 549 /* Clear the buffer's association with this transaction. */
519 bp->b_transp = NULL; 550 bp->b_transp = NULL;
@@ -524,23 +555,21 @@ xfs_buf_item_unlock(
524 * (cancelled) buffers at unpin time, but we'll never go through the 555 * (cancelled) buffers at unpin time, but we'll never go through the
525 * pin/unpin cycle if we abort inside commit. 556 * pin/unpin cycle if we abort inside commit.
526 */ 557 */
527 aborted = (lip->li_flags & XFS_LI_ABORTED) != 0; 558 aborted = (lip->li_flags & XFS_LI_ABORTED) ? true : false;
528
529 /* 559 /*
530 * Before possibly freeing the buf item, determine if we should 560 * Before possibly freeing the buf item, copy the per-transaction state
531 * release the buffer at the end of this routine. 561 * so we can reference it safely later after clearing it from the
562 * buffer log item.
532 */ 563 */
533 hold = bip->bli_flags & XFS_BLI_HOLD; 564 flags = bip->bli_flags;
534 565 bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED);
535 /* Clear the per transaction state. */
536 bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD);
537 566
538 /* 567 /*
539 * If the buf item is marked stale, then don't do anything. We'll 568 * If the buf item is marked stale, then don't do anything. We'll
540 * unlock the buffer and free the buf item when the buffer is unpinned 569 * unlock the buffer and free the buf item when the buffer is unpinned
541 * for the last time. 570 * for the last time.
542 */ 571 */
543 if (bip->bli_flags & XFS_BLI_STALE) { 572 if (flags & XFS_BLI_STALE) {
544 trace_xfs_buf_item_unlock_stale(bip); 573 trace_xfs_buf_item_unlock_stale(bip);
545 ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); 574 ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
546 if (!aborted) { 575 if (!aborted) {
@@ -557,13 +586,19 @@ xfs_buf_item_unlock(
557 * be the only reference to the buf item, so we free it anyway 586 * be the only reference to the buf item, so we free it anyway
558 * regardless of whether it is dirty or not. A dirty abort implies a 587 * regardless of whether it is dirty or not. A dirty abort implies a
559 * shutdown, anyway. 588 * shutdown, anyway.
589 *
590 * Ordered buffers are dirty but may have no recorded changes, so ensure
591 * we only release clean items here.
560 */ 592 */
561 clean = 1; 593 clean = (flags & XFS_BLI_DIRTY) ? false : true;
562 for (i = 0; i < bip->bli_format_count; i++) { 594 if (clean) {
563 if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map, 595 int i;
564 bip->bli_formats[i].blf_map_size)) { 596 for (i = 0; i < bip->bli_format_count; i++) {
565 clean = 0; 597 if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
566 break; 598 bip->bli_formats[i].blf_map_size)) {
599 clean = false;
600 break;
601 }
567 } 602 }
568 } 603 }
569 if (clean) 604 if (clean)
@@ -576,7 +611,7 @@ xfs_buf_item_unlock(
576 } else 611 } else
577 atomic_dec(&bip->bli_refcount); 612 atomic_dec(&bip->bli_refcount);
578 613
579 if (!hold) 614 if (!(flags & XFS_BLI_HOLD))
580 xfs_buf_relse(bp); 615 xfs_buf_relse(bp);
581} 616}
582 617
@@ -842,12 +877,6 @@ xfs_buf_item_log(
842 struct xfs_buf *bp = bip->bli_buf; 877 struct xfs_buf *bp = bip->bli_buf;
843 878
844 /* 879 /*
845 * Mark the item as having some dirty data for
846 * quick reference in xfs_buf_item_dirty.
847 */
848 bip->bli_flags |= XFS_BLI_DIRTY;
849
850 /*
851 * walk each buffer segment and mark them dirty appropriately. 880 * walk each buffer segment and mark them dirty appropriately.
852 */ 881 */
853 start = 0; 882 start = 0;
@@ -873,7 +902,7 @@ xfs_buf_item_log(
873 902
874 903
875/* 904/*
876 * Return 1 if the buffer has some data that has been logged (at any 905 * Return 1 if the buffer has been logged or ordered in a transaction (at any
877 * point, not just the current transaction) and 0 if not. 906 * point, not just the current transaction) and 0 if not.
878 */ 907 */
879uint 908uint
@@ -907,11 +936,11 @@ void
907xfs_buf_item_relse( 936xfs_buf_item_relse(
908 xfs_buf_t *bp) 937 xfs_buf_t *bp)
909{ 938{
910 xfs_buf_log_item_t *bip; 939 xfs_buf_log_item_t *bip = bp->b_fspriv;
911 940
912 trace_xfs_buf_item_relse(bp, _RET_IP_); 941 trace_xfs_buf_item_relse(bp, _RET_IP_);
942 ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL));
913 943
914 bip = bp->b_fspriv;
915 bp->b_fspriv = bip->bli_item.li_bio_list; 944 bp->b_fspriv = bip->bli_item.li_bio_list;
916 if (bp->b_fspriv == NULL) 945 if (bp->b_fspriv == NULL)
917 bp->b_iodone = NULL; 946 bp->b_iodone = NULL;
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 2573d2a75fc8..0f1c247dc680 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -120,6 +120,7 @@ xfs_blft_from_flags(struct xfs_buf_log_format *blf)
120#define XFS_BLI_INODE_ALLOC_BUF 0x10 120#define XFS_BLI_INODE_ALLOC_BUF 0x10
121#define XFS_BLI_STALE_INODE 0x20 121#define XFS_BLI_STALE_INODE 0x20
122#define XFS_BLI_INODE_BUF 0x40 122#define XFS_BLI_INODE_BUF 0x40
123#define XFS_BLI_ORDERED 0x80
123 124
124#define XFS_BLI_FLAGS \ 125#define XFS_BLI_FLAGS \
125 { XFS_BLI_HOLD, "HOLD" }, \ 126 { XFS_BLI_HOLD, "HOLD" }, \
@@ -128,7 +129,8 @@ xfs_blft_from_flags(struct xfs_buf_log_format *blf)
128 { XFS_BLI_LOGGED, "LOGGED" }, \ 129 { XFS_BLI_LOGGED, "LOGGED" }, \
129 { XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \ 130 { XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \
130 { XFS_BLI_STALE_INODE, "STALE_INODE" }, \ 131 { XFS_BLI_STALE_INODE, "STALE_INODE" }, \
131 { XFS_BLI_INODE_BUF, "INODE_BUF" } 132 { XFS_BLI_INODE_BUF, "INODE_BUF" }, \
133 { XFS_BLI_ORDERED, "ORDERED" }
132 134
133 135
134#ifdef __KERNEL__ 136#ifdef __KERNEL__
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index c407e1ccff43..e36445ceaf80 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -24,6 +24,9 @@
24#include "xfs_ag.h" 24#include "xfs_ag.h"
25#include "xfs_mount.h" 25#include "xfs_mount.h"
26#include "xfs_bmap_btree.h" 26#include "xfs_bmap_btree.h"
27#include "xfs_alloc_btree.h"
28#include "xfs_ialloc_btree.h"
29#include "xfs_btree.h"
27#include "xfs_dinode.h" 30#include "xfs_dinode.h"
28#include "xfs_inode.h" 31#include "xfs_inode.h"
29#include "xfs_inode_item.h" 32#include "xfs_inode_item.h"
@@ -182,7 +185,7 @@ xfs_swap_extents_check_format(
182 */ 185 */
183 if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) { 186 if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
184 if (XFS_IFORK_BOFF(ip) && 187 if (XFS_IFORK_BOFF(ip) &&
185 tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) 188 XFS_BMAP_BMDR_SPACE(tip->i_df.if_broot) > XFS_IFORK_BOFF(ip))
186 return EINVAL; 189 return EINVAL;
187 if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= 190 if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
188 XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) 191 XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
@@ -192,9 +195,8 @@ xfs_swap_extents_check_format(
192 /* Reciprocal target->temp btree format checks */ 195 /* Reciprocal target->temp btree format checks */
193 if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { 196 if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
194 if (XFS_IFORK_BOFF(tip) && 197 if (XFS_IFORK_BOFF(tip) &&
195 ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) 198 XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip))
196 return EINVAL; 199 return EINVAL;
197
198 if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= 200 if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <=
199 XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) 201 XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
200 return EINVAL; 202 return EINVAL;
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h
index f7a0e95d197a..e5869b50dc41 100644
--- a/fs/xfs/xfs_dinode.h
+++ b/fs/xfs/xfs_dinode.h
@@ -39,6 +39,9 @@ typedef struct xfs_timestamp {
39 * There is a very similar struct icdinode in xfs_inode which matches the 39 * There is a very similar struct icdinode in xfs_inode which matches the
40 * layout of the first 96 bytes of this structure, but is kept in native 40 * layout of the first 96 bytes of this structure, but is kept in native
41 * format instead of big endian. 41 * format instead of big endian.
42 *
43 * Note: di_flushiter is only used by v1/2 inodes - it's effectively a zeroed
44 * padding field for v3 inodes.
42 */ 45 */
43typedef struct xfs_dinode { 46typedef struct xfs_dinode {
44 __be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */ 47 __be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */
@@ -132,9 +135,6 @@ typedef enum xfs_dinode_fmt {
132#define XFS_LITINO(mp, version) \ 135#define XFS_LITINO(mp, version) \
133 ((int)(((mp)->m_sb.sb_inodesize) - xfs_dinode_size(version))) 136 ((int)(((mp)->m_sb.sb_inodesize) - xfs_dinode_size(version)))
134 137
135#define XFS_BROOT_SIZE_ADJ(ip) \
136 (XFS_BMBT_BLOCK_LEN((ip)->i_mount) - sizeof(xfs_bmdr_block_t))
137
138/* 138/*
139 * Inode data & attribute fork sizes, per inode. 139 * Inode data & attribute fork sizes, per inode.
140 */ 140 */
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index 09aea0247d96..5e7fbd72cf52 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -29,6 +29,7 @@
29#include "xfs_dinode.h" 29#include "xfs_dinode.h"
30#include "xfs_inode.h" 30#include "xfs_inode.h"
31#include "xfs_inode_item.h" 31#include "xfs_inode_item.h"
32#include "xfs_bmap.h"
32#include "xfs_buf_item.h" 33#include "xfs_buf_item.h"
33#include "xfs_dir2.h" 34#include "xfs_dir2.h"
34#include "xfs_dir2_format.h" 35#include "xfs_dir2_format.h"
@@ -1164,13 +1165,15 @@ xfs_dir2_sf_to_block(
1164 __be16 *tagp; /* end of data entry */ 1165 __be16 *tagp; /* end of data entry */
1165 xfs_trans_t *tp; /* transaction pointer */ 1166 xfs_trans_t *tp; /* transaction pointer */
1166 struct xfs_name name; 1167 struct xfs_name name;
1168 struct xfs_ifork *ifp;
1167 1169
1168 trace_xfs_dir2_sf_to_block(args); 1170 trace_xfs_dir2_sf_to_block(args);
1169 1171
1170 dp = args->dp; 1172 dp = args->dp;
1171 tp = args->trans; 1173 tp = args->trans;
1172 mp = dp->i_mount; 1174 mp = dp->i_mount;
1173 ASSERT(dp->i_df.if_flags & XFS_IFINLINE); 1175 ifp = XFS_IFORK_PTR(dp, XFS_DATA_FORK);
1176 ASSERT(ifp->if_flags & XFS_IFINLINE);
1174 /* 1177 /*
1175 * Bomb out if the shortform directory is way too short. 1178 * Bomb out if the shortform directory is way too short.
1176 */ 1179 */
@@ -1179,22 +1182,23 @@ xfs_dir2_sf_to_block(
1179 return XFS_ERROR(EIO); 1182 return XFS_ERROR(EIO);
1180 } 1183 }
1181 1184
1182 oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; 1185 oldsfp = (xfs_dir2_sf_hdr_t *)ifp->if_u1.if_data;
1183 1186
1184 ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); 1187 ASSERT(ifp->if_bytes == dp->i_d.di_size);
1185 ASSERT(dp->i_df.if_u1.if_data != NULL); 1188 ASSERT(ifp->if_u1.if_data != NULL);
1186 ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(oldsfp->i8count)); 1189 ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(oldsfp->i8count));
1190 ASSERT(dp->i_d.di_nextents == 0);
1187 1191
1188 /* 1192 /*
1189 * Copy the directory into a temporary buffer. 1193 * Copy the directory into a temporary buffer.
1190 * Then pitch the incore inode data so we can make extents. 1194 * Then pitch the incore inode data so we can make extents.
1191 */ 1195 */
1192 sfp = kmem_alloc(dp->i_df.if_bytes, KM_SLEEP); 1196 sfp = kmem_alloc(ifp->if_bytes, KM_SLEEP);
1193 memcpy(sfp, oldsfp, dp->i_df.if_bytes); 1197 memcpy(sfp, oldsfp, ifp->if_bytes);
1194 1198
1195 xfs_idata_realloc(dp, -dp->i_df.if_bytes, XFS_DATA_FORK); 1199 xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK);
1200 xfs_bmap_local_to_extents_empty(dp, XFS_DATA_FORK);
1196 dp->i_d.di_size = 0; 1201 dp->i_d.di_size = 0;
1197 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
1198 1202
1199 /* 1203 /*
1200 * Add block 0 to the inode. 1204 * Add block 0 to the inode.
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index e0cc1243a8aa..2aed25cae04d 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -1108,6 +1108,7 @@ xfs_dir2_leaf_readbuf(
1108 struct xfs_mount *mp = dp->i_mount; 1108 struct xfs_mount *mp = dp->i_mount;
1109 struct xfs_buf *bp = *bpp; 1109 struct xfs_buf *bp = *bpp;
1110 struct xfs_bmbt_irec *map = mip->map; 1110 struct xfs_bmbt_irec *map = mip->map;
1111 struct blk_plug plug;
1111 int error = 0; 1112 int error = 0;
1112 int length; 1113 int length;
1113 int i; 1114 int i;
@@ -1236,6 +1237,7 @@ xfs_dir2_leaf_readbuf(
1236 /* 1237 /*
1237 * Do we need more readahead? 1238 * Do we need more readahead?
1238 */ 1239 */
1240 blk_start_plug(&plug);
1239 for (mip->ra_index = mip->ra_offset = i = 0; 1241 for (mip->ra_index = mip->ra_offset = i = 0;
1240 mip->ra_want > mip->ra_current && i < mip->map_blocks; 1242 mip->ra_want > mip->ra_current && i < mip->map_blocks;
1241 i += mp->m_dirblkfsbs) { 1243 i += mp->m_dirblkfsbs) {
@@ -1287,6 +1289,7 @@ xfs_dir2_leaf_readbuf(
1287 } 1289 }
1288 } 1290 }
1289 } 1291 }
1292 blk_finish_plug(&plug);
1290 1293
1291out: 1294out:
1292 *bpp = bp; 1295 *bpp = bp;
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 044e97a33c8d..0adf27ecf3f1 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -570,13 +570,13 @@ xfs_qm_dqtobp(
570 xfs_buf_t **O_bpp, 570 xfs_buf_t **O_bpp,
571 uint flags) 571 uint flags)
572{ 572{
573 xfs_bmbt_irec_t map; 573 struct xfs_bmbt_irec map;
574 int nmaps = 1, error; 574 int nmaps = 1, error;
575 xfs_buf_t *bp; 575 struct xfs_buf *bp;
576 xfs_inode_t *quotip = XFS_DQ_TO_QIP(dqp); 576 struct xfs_inode *quotip = xfs_dq_to_quota_inode(dqp);
577 xfs_mount_t *mp = dqp->q_mount; 577 struct xfs_mount *mp = dqp->q_mount;
578 xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id); 578 xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id);
579 xfs_trans_t *tp = (tpp ? *tpp : NULL); 579 struct xfs_trans *tp = (tpp ? *tpp : NULL);
580 580
581 dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk; 581 dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk;
582 582
@@ -804,7 +804,7 @@ xfs_qm_dqget(
804 xfs_dquot_t **O_dqpp) /* OUT : locked incore dquot */ 804 xfs_dquot_t **O_dqpp) /* OUT : locked incore dquot */
805{ 805{
806 struct xfs_quotainfo *qi = mp->m_quotainfo; 806 struct xfs_quotainfo *qi = mp->m_quotainfo;
807 struct radix_tree_root *tree = XFS_DQUOT_TREE(qi, type); 807 struct radix_tree_root *tree = xfs_dquot_tree(qi, type);
808 struct xfs_dquot *dqp; 808 struct xfs_dquot *dqp;
809 int error; 809 int error;
810 810
@@ -936,6 +936,7 @@ xfs_qm_dqput_final(
936{ 936{
937 struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo; 937 struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo;
938 struct xfs_dquot *gdqp; 938 struct xfs_dquot *gdqp;
939 struct xfs_dquot *pdqp;
939 940
940 trace_xfs_dqput_free(dqp); 941 trace_xfs_dqput_free(dqp);
941 942
@@ -949,21 +950,29 @@ xfs_qm_dqput_final(
949 950
950 /* 951 /*
951 * If we just added a udquot to the freelist, then we want to release 952 * If we just added a udquot to the freelist, then we want to release
952 * the gdquot reference that it (probably) has. Otherwise it'll keep 953 * the gdquot/pdquot reference that it (probably) has. Otherwise it'll
953 * the gdquot from getting reclaimed. 954 * keep the gdquot/pdquot from getting reclaimed.
954 */ 955 */
955 gdqp = dqp->q_gdquot; 956 gdqp = dqp->q_gdquot;
956 if (gdqp) { 957 if (gdqp) {
957 xfs_dqlock(gdqp); 958 xfs_dqlock(gdqp);
958 dqp->q_gdquot = NULL; 959 dqp->q_gdquot = NULL;
959 } 960 }
961
962 pdqp = dqp->q_pdquot;
963 if (pdqp) {
964 xfs_dqlock(pdqp);
965 dqp->q_pdquot = NULL;
966 }
960 xfs_dqunlock(dqp); 967 xfs_dqunlock(dqp);
961 968
962 /* 969 /*
963 * If we had a group quota hint, release it now. 970 * If we had a group/project quota hint, release it now.
964 */ 971 */
965 if (gdqp) 972 if (gdqp)
966 xfs_qm_dqput(gdqp); 973 xfs_qm_dqput(gdqp);
974 if (pdqp)
975 xfs_qm_dqput(pdqp);
967} 976}
968 977
969/* 978/*
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 4f0ebfc43cc9..55abbca2883d 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -53,6 +53,7 @@ typedef struct xfs_dquot {
53 xfs_fileoff_t q_fileoffset; /* offset in quotas file */ 53 xfs_fileoff_t q_fileoffset; /* offset in quotas file */
54 54
55 struct xfs_dquot*q_gdquot; /* group dquot, hint only */ 55 struct xfs_dquot*q_gdquot; /* group dquot, hint only */
56 struct xfs_dquot*q_pdquot; /* project dquot, hint only */
56 xfs_disk_dquot_t q_core; /* actual usage & quotas */ 57 xfs_disk_dquot_t q_core; /* actual usage & quotas */
57 xfs_dq_logitem_t q_logitem; /* dquot log item */ 58 xfs_dq_logitem_t q_logitem; /* dquot log item */
58 xfs_qcnt_t q_res_bcount; /* total regular nblks used+reserved */ 59 xfs_qcnt_t q_res_bcount; /* total regular nblks used+reserved */
@@ -118,8 +119,9 @@ static inline int xfs_this_quota_on(struct xfs_mount *mp, int type)
118 case XFS_DQ_USER: 119 case XFS_DQ_USER:
119 return XFS_IS_UQUOTA_ON(mp); 120 return XFS_IS_UQUOTA_ON(mp);
120 case XFS_DQ_GROUP: 121 case XFS_DQ_GROUP:
122 return XFS_IS_GQUOTA_ON(mp);
121 case XFS_DQ_PROJ: 123 case XFS_DQ_PROJ:
122 return XFS_IS_OQUOTA_ON(mp); 124 return XFS_IS_PQUOTA_ON(mp);
123 default: 125 default:
124 return 0; 126 return 0;
125 } 127 }
@@ -131,8 +133,9 @@ static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type)
131 case XFS_DQ_USER: 133 case XFS_DQ_USER:
132 return ip->i_udquot; 134 return ip->i_udquot;
133 case XFS_DQ_GROUP: 135 case XFS_DQ_GROUP:
134 case XFS_DQ_PROJ:
135 return ip->i_gdquot; 136 return ip->i_gdquot;
137 case XFS_DQ_PROJ:
138 return ip->i_pdquot;
136 default: 139 default:
137 return NULL; 140 return NULL;
138 } 141 }
@@ -143,10 +146,6 @@ static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type)
143#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER) 146#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER)
144#define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ) 147#define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ)
145#define XFS_QM_ISGDQ(dqp) ((dqp)->dq_flags & XFS_DQ_GROUP) 148#define XFS_QM_ISGDQ(dqp) ((dqp)->dq_flags & XFS_DQ_GROUP)
146#define XFS_DQ_TO_QINF(dqp) ((dqp)->q_mount->m_quotainfo)
147#define XFS_DQ_TO_QIP(dqp) (XFS_QM_ISUDQ(dqp) ? \
148 XFS_DQ_TO_QINF(dqp)->qi_uquotaip : \
149 XFS_DQ_TO_QINF(dqp)->qi_gquotaip)
150 149
151extern int xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint, 150extern int xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint,
152 uint, struct xfs_dquot **); 151 uint, struct xfs_dquot **);
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 3c3644ea825b..614eb0cc3608 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -176,7 +176,7 @@ xfs_growfs_data_private(
176 if (!bp) 176 if (!bp)
177 return EIO; 177 return EIO;
178 if (bp->b_error) { 178 if (bp->b_error) {
179 int error = bp->b_error; 179 error = bp->b_error;
180 xfs_buf_relse(bp); 180 xfs_buf_relse(bp);
181 return error; 181 return error;
182 } 182 }
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index c8f5ae1debf2..7a0c17d7ec09 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -38,6 +38,7 @@
38#include "xfs_bmap.h" 38#include "xfs_bmap.h"
39#include "xfs_cksum.h" 39#include "xfs_cksum.h"
40#include "xfs_buf_item.h" 40#include "xfs_buf_item.h"
41#include "xfs_icreate_item.h"
41 42
42 43
43/* 44/*
@@ -150,12 +151,16 @@ xfs_check_agi_freecount(
150#endif 151#endif
151 152
152/* 153/*
153 * Initialise a new set of inodes. 154 * Initialise a new set of inodes. When called without a transaction context
155 * (e.g. from recovery) we initiate a delayed write of the inode buffers rather
156 * than logging them (which in a transaction context puts them into the AIL
157 * for writeback rather than the xfsbufd queue).
154 */ 158 */
155STATIC int 159int
156xfs_ialloc_inode_init( 160xfs_ialloc_inode_init(
157 struct xfs_mount *mp, 161 struct xfs_mount *mp,
158 struct xfs_trans *tp, 162 struct xfs_trans *tp,
163 struct list_head *buffer_list,
159 xfs_agnumber_t agno, 164 xfs_agnumber_t agno,
160 xfs_agblock_t agbno, 165 xfs_agblock_t agbno,
161 xfs_agblock_t length, 166 xfs_agblock_t length,
@@ -208,6 +213,18 @@ xfs_ialloc_inode_init(
208 version = 3; 213 version = 3;
209 ino = XFS_AGINO_TO_INO(mp, agno, 214 ino = XFS_AGINO_TO_INO(mp, agno,
210 XFS_OFFBNO_TO_AGINO(mp, agbno, 0)); 215 XFS_OFFBNO_TO_AGINO(mp, agbno, 0));
216
217 /*
218	 * log the initialisation that is about to take place as a
219 * logical operation. This means the transaction does not
220 * need to log the physical changes to the inode buffers as log
221 * recovery will know what initialisation is actually needed.
222 * Hence we only need to log the buffers as "ordered" buffers so
223 * they track in the AIL as if they were physically logged.
224 */
225 if (tp)
226 xfs_icreate_log(tp, agno, agbno, XFS_IALLOC_INODES(mp),
227 mp->m_sb.sb_inodesize, length, gen);
211 } else if (xfs_sb_version_hasnlink(&mp->m_sb)) 228 } else if (xfs_sb_version_hasnlink(&mp->m_sb))
212 version = 2; 229 version = 2;
213 else 230 else
@@ -223,13 +240,8 @@ xfs_ialloc_inode_init(
223 XBF_UNMAPPED); 240 XBF_UNMAPPED);
224 if (!fbuf) 241 if (!fbuf)
225 return ENOMEM; 242 return ENOMEM;
226 /* 243
227 * Initialize all inodes in this buffer and then log them. 244 /* Initialize the inode buffers and log them appropriately. */
228 *
229 * XXX: It would be much better if we had just one transaction
230 * to log a whole cluster of inodes instead of all the
231 * individual transactions causing a lot of log traffic.
232 */
233 fbuf->b_ops = &xfs_inode_buf_ops; 245 fbuf->b_ops = &xfs_inode_buf_ops;
234 xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length)); 246 xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length));
235 for (i = 0; i < ninodes; i++) { 247 for (i = 0; i < ninodes; i++) {
@@ -247,18 +259,39 @@ xfs_ialloc_inode_init(
247 ino++; 259 ino++;
248 uuid_copy(&free->di_uuid, &mp->m_sb.sb_uuid); 260 uuid_copy(&free->di_uuid, &mp->m_sb.sb_uuid);
249 xfs_dinode_calc_crc(mp, free); 261 xfs_dinode_calc_crc(mp, free);
250 } else { 262 } else if (tp) {
251 /* just log the inode core */ 263 /* just log the inode core */
252 xfs_trans_log_buf(tp, fbuf, ioffset, 264 xfs_trans_log_buf(tp, fbuf, ioffset,
253 ioffset + isize - 1); 265 ioffset + isize - 1);
254 } 266 }
255 } 267 }
256 if (version == 3) { 268
257 /* need to log the entire buffer */ 269 if (tp) {
258 xfs_trans_log_buf(tp, fbuf, 0, 270 /*
259 BBTOB(fbuf->b_length) - 1); 271 * Mark the buffer as an inode allocation buffer so it
272 * sticks in AIL at the point of this allocation
273	 * transaction. This ensures they are on disk before
274 * the tail of the log can be moved past this
275 * transaction (i.e. by preventing relogging from moving
276 * it forward in the log).
277 */
278 xfs_trans_inode_alloc_buf(tp, fbuf);
279 if (version == 3) {
280 /*
281 * Mark the buffer as ordered so that they are
282 * not physically logged in the transaction but
283 * still tracked in the AIL as part of the
284 * transaction and pin the log appropriately.
285 */
286 xfs_trans_ordered_buf(tp, fbuf);
287 xfs_trans_log_buf(tp, fbuf, 0,
288 BBTOB(fbuf->b_length) - 1);
289 }
290 } else {
291 fbuf->b_flags |= XBF_DONE;
292 xfs_buf_delwri_queue(fbuf, buffer_list);
293 xfs_buf_relse(fbuf);
260 } 294 }
261 xfs_trans_inode_alloc_buf(tp, fbuf);
262 } 295 }
263 return 0; 296 return 0;
264} 297}
@@ -303,7 +336,7 @@ xfs_ialloc_ag_alloc(
303 * First try to allocate inodes contiguous with the last-allocated 336 * First try to allocate inodes contiguous with the last-allocated
304 * chunk of inodes. If the filesystem is striped, this will fill 337 * chunk of inodes. If the filesystem is striped, this will fill
305 * an entire stripe unit with inodes. 338 * an entire stripe unit with inodes.
306 */ 339 */
307 agi = XFS_BUF_TO_AGI(agbp); 340 agi = XFS_BUF_TO_AGI(agbp);
308 newino = be32_to_cpu(agi->agi_newino); 341 newino = be32_to_cpu(agi->agi_newino);
309 agno = be32_to_cpu(agi->agi_seqno); 342 agno = be32_to_cpu(agi->agi_seqno);
@@ -402,7 +435,7 @@ xfs_ialloc_ag_alloc(
402 * rather than a linear progression to prevent the next generation 435 * rather than a linear progression to prevent the next generation
403 * number from being easily guessable. 436 * number from being easily guessable.
404 */ 437 */
405 error = xfs_ialloc_inode_init(args.mp, tp, agno, args.agbno, 438 error = xfs_ialloc_inode_init(args.mp, tp, NULL, agno, args.agbno,
406 args.len, prandom_u32()); 439 args.len, prandom_u32());
407 440
408 if (error) 441 if (error)
@@ -615,8 +648,7 @@ xfs_ialloc_get_rec(
615 struct xfs_btree_cur *cur, 648 struct xfs_btree_cur *cur,
616 xfs_agino_t agino, 649 xfs_agino_t agino,
617 xfs_inobt_rec_incore_t *rec, 650 xfs_inobt_rec_incore_t *rec,
618 int *done, 651 int *done)
619 int left)
620{ 652{
621 int error; 653 int error;
622 int i; 654 int i;
@@ -724,12 +756,12 @@ xfs_dialloc_ag(
724 pag->pagl_leftrec != NULLAGINO && 756 pag->pagl_leftrec != NULLAGINO &&
725 pag->pagl_rightrec != NULLAGINO) { 757 pag->pagl_rightrec != NULLAGINO) {
726 error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec, 758 error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec,
727 &trec, &doneleft, 1); 759 &trec, &doneleft);
728 if (error) 760 if (error)
729 goto error1; 761 goto error1;
730 762
731 error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec, 763 error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec,
732 &rec, &doneright, 0); 764 &rec, &doneright);
733 if (error) 765 if (error)
734 goto error1; 766 goto error1;
735 } else { 767 } else {
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
index c8da3df271e6..68c07320f096 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/xfs_ialloc.h
@@ -150,6 +150,14 @@ int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino,
150int xfs_inobt_get_rec(struct xfs_btree_cur *cur, 150int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
151 xfs_inobt_rec_incore_t *rec, int *stat); 151 xfs_inobt_rec_incore_t *rec, int *stat);
152 152
153/*
154 * Inode chunk initialisation routine
155 */
156int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp,
157 struct list_head *buffer_list,
158 xfs_agnumber_t agno, xfs_agblock_t agbno,
159 xfs_agblock_t length, unsigned int gen);
160
153extern const struct xfs_buf_ops xfs_agi_buf_ops; 161extern const struct xfs_buf_ops xfs_agi_buf_ops;
154 162
155#endif /* __XFS_IALLOC_H__ */ 163#endif /* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 96e344e3e927..3f90e1ceb8d6 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -335,7 +335,9 @@ xfs_iget_cache_miss(
335 iflags = XFS_INEW; 335 iflags = XFS_INEW;
336 if (flags & XFS_IGET_DONTCACHE) 336 if (flags & XFS_IGET_DONTCACHE)
337 iflags |= XFS_IDONTCACHE; 337 iflags |= XFS_IDONTCACHE;
338 ip->i_udquot = ip->i_gdquot = NULL; 338 ip->i_udquot = NULL;
339 ip->i_gdquot = NULL;
340 ip->i_pdquot = NULL;
339 xfs_iflags_set(ip, iflags); 341 xfs_iflags_set(ip, iflags);
340 342
341 /* insert the new inode */ 343 /* insert the new inode */
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index e0f138c70a2f..a01afbb3909a 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -40,7 +40,6 @@ void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip);
40int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *); 40int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *);
41void xfs_eofblocks_worker(struct work_struct *); 41void xfs_eofblocks_worker(struct work_struct *);
42 42
43int xfs_sync_inode_grab(struct xfs_inode *ip);
44int xfs_inode_ag_iterator(struct xfs_mount *mp, 43int xfs_inode_ag_iterator(struct xfs_mount *mp,
45 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, 44 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag,
46 int flags, void *args), 45 int flags, void *args),
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
new file mode 100644
index 000000000000..7716a4e7375e
--- /dev/null
+++ b/fs/xfs/xfs_icreate_item.c
@@ -0,0 +1,195 @@
1/*
2 * Copyright (c) 2008-2010, 2013 Dave Chinner
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_bit.h"
22#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h"
25#include "xfs_buf_item.h"
26#include "xfs_sb.h"
27#include "xfs_ag.h"
28#include "xfs_dir2.h"
29#include "xfs_mount.h"
30#include "xfs_trans_priv.h"
31#include "xfs_bmap_btree.h"
32#include "xfs_alloc_btree.h"
33#include "xfs_ialloc_btree.h"
34#include "xfs_attr_sf.h"
35#include "xfs_dinode.h"
36#include "xfs_inode.h"
37#include "xfs_inode_item.h"
38#include "xfs_btree.h"
39#include "xfs_ialloc.h"
40#include "xfs_error.h"
41#include "xfs_icreate_item.h"
42
43kmem_zone_t *xfs_icreate_zone; /* inode create item zone */
44
45static inline struct xfs_icreate_item *ICR_ITEM(struct xfs_log_item *lip)
46{
47 return container_of(lip, struct xfs_icreate_item, ic_item);
48}
49
50/*
51 * This returns the number of iovecs needed to log the given inode item.
52 *
53 * We only need one iovec for the icreate log structure.
54 */
55STATIC uint
56xfs_icreate_item_size(
57 struct xfs_log_item *lip)
58{
59 return 1;
60}
61
62/*
63 * This is called to fill in the vector of log iovecs for the
64 * given inode create log item.
65 */
66STATIC void
67xfs_icreate_item_format(
68 struct xfs_log_item *lip,
69 struct xfs_log_iovec *log_vector)
70{
71 struct xfs_icreate_item *icp = ICR_ITEM(lip);
72
73 log_vector->i_addr = (xfs_caddr_t)&icp->ic_format;
74 log_vector->i_len = sizeof(struct xfs_icreate_log);
75 log_vector->i_type = XLOG_REG_TYPE_ICREATE;
76}
77
78
79/* Pinning has no meaning for the create item, so just return. */
80STATIC void
81xfs_icreate_item_pin(
82 struct xfs_log_item *lip)
83{
84}
85
86
87/* pinning has no meaning for the create item, so just return. */
88STATIC void
89xfs_icreate_item_unpin(
90 struct xfs_log_item *lip,
91 int remove)
92{
93}
94
95STATIC void
96xfs_icreate_item_unlock(
97 struct xfs_log_item *lip)
98{
99 struct xfs_icreate_item *icp = ICR_ITEM(lip);
100
101 if (icp->ic_item.li_flags & XFS_LI_ABORTED)
102 kmem_zone_free(xfs_icreate_zone, icp);
103 return;
104}
105
106/*
107 * Because we have ordered buffers being tracked in the AIL for the inode
108 * creation, we don't need the create item after this. Hence we can free
109 * the log item and return -1 to tell the caller we're done with the item.
110 */
111STATIC xfs_lsn_t
112xfs_icreate_item_committed(
113 struct xfs_log_item *lip,
114 xfs_lsn_t lsn)
115{
116 struct xfs_icreate_item *icp = ICR_ITEM(lip);
117
118 kmem_zone_free(xfs_icreate_zone, icp);
119 return (xfs_lsn_t)-1;
120}
121
122/* item can never get into the AIL */
123STATIC uint
124xfs_icreate_item_push(
125 struct xfs_log_item *lip,
126 struct list_head *buffer_list)
127{
128 ASSERT(0);
129 return XFS_ITEM_SUCCESS;
130}
131
132/* Ordered buffers do the dependency tracking here, so this does nothing. */
133STATIC void
134xfs_icreate_item_committing(
135 struct xfs_log_item *lip,
136 xfs_lsn_t lsn)
137{
138}
139
140/*
141 * This is the ops vector shared by all buf log items.
142 */
143static struct xfs_item_ops xfs_icreate_item_ops = {
144 .iop_size = xfs_icreate_item_size,
145 .iop_format = xfs_icreate_item_format,
146 .iop_pin = xfs_icreate_item_pin,
147 .iop_unpin = xfs_icreate_item_unpin,
148 .iop_push = xfs_icreate_item_push,
149 .iop_unlock = xfs_icreate_item_unlock,
150 .iop_committed = xfs_icreate_item_committed,
151 .iop_committing = xfs_icreate_item_committing,
152};
153
154
155/*
156 * Initialize the inode log item for a newly allocated (in-core) inode.
157 *
158 * Inode extents can only reside within an AG. Hence specify the starting
159 * block for the inode chunk by offset within an AG as well as the
160 * length of the allocated extent.
161 *
162 * This joins the item to the transaction and marks it dirty so
163 * that we don't need a separate call to do this, nor does the
164 * caller need to know anything about the icreate item.
165 */
166void
167xfs_icreate_log(
168 struct xfs_trans *tp,
169 xfs_agnumber_t agno,
170 xfs_agblock_t agbno,
171 unsigned int count,
172 unsigned int inode_size,
173 xfs_agblock_t length,
174 unsigned int generation)
175{
176 struct xfs_icreate_item *icp;
177
178 icp = kmem_zone_zalloc(xfs_icreate_zone, KM_SLEEP);
179
180 xfs_log_item_init(tp->t_mountp, &icp->ic_item, XFS_LI_ICREATE,
181 &xfs_icreate_item_ops);
182
183 icp->ic_format.icl_type = XFS_LI_ICREATE;
184 icp->ic_format.icl_size = 1; /* single vector */
185 icp->ic_format.icl_ag = cpu_to_be32(agno);
186 icp->ic_format.icl_agbno = cpu_to_be32(agbno);
187 icp->ic_format.icl_count = cpu_to_be32(count);
188 icp->ic_format.icl_isize = cpu_to_be32(inode_size);
189 icp->ic_format.icl_length = cpu_to_be32(length);
190 icp->ic_format.icl_gen = cpu_to_be32(generation);
191
192 xfs_trans_add_item(tp, &icp->ic_item);
193 tp->t_flags |= XFS_TRANS_DIRTY;
194 icp->ic_item.li_desc->lid_flags |= XFS_LID_DIRTY;
195}
diff --git a/fs/xfs/xfs_icreate_item.h b/fs/xfs/xfs_icreate_item.h
new file mode 100644
index 000000000000..88ba8aa0bc41
--- /dev/null
+++ b/fs/xfs/xfs_icreate_item.h
@@ -0,0 +1,52 @@
1/*
2 * Copyright (c) 2008-2010, Dave Chinner
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef XFS_ICREATE_ITEM_H
19#define XFS_ICREATE_ITEM_H 1
20
21/*
22 * on disk log item structure
23 *
24 * Log recovery assumes the first two entries are the type and size and they fit
25 * in 32 bits. Also in host order (ugh) so they have to be 32 bit aligned so
26 * decoding can be done correctly.
27 */
28struct xfs_icreate_log {
29 __uint16_t icl_type; /* type of log format structure */
30 __uint16_t icl_size; /* size of log format structure */
31 __be32 icl_ag; /* ag being allocated in */
32 __be32 icl_agbno; /* start block of inode range */
33 __be32 icl_count; /* number of inodes to initialise */
34 __be32 icl_isize; /* size of inodes */
35 __be32 icl_length; /* length of extent to initialise */
36 __be32 icl_gen; /* inode generation number to use */
37};
38
39/* in memory log item structure */
40struct xfs_icreate_item {
41 struct xfs_log_item ic_item;
42 struct xfs_icreate_log ic_format;
43};
44
45extern kmem_zone_t *xfs_icreate_zone; /* inode create item zone */
46
47void xfs_icreate_log(struct xfs_trans *tp, xfs_agnumber_t agno,
48 xfs_agblock_t agbno, unsigned int count,
49 unsigned int inode_size, xfs_agblock_t length,
50 unsigned int generation);
51
52#endif /* XFS_ICREATE_ITEM_H */
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 7f7be5f98f52..bb262c25c8de 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -896,7 +896,6 @@ xfs_dinode_to_disk(
896 to->di_projid_lo = cpu_to_be16(from->di_projid_lo); 896 to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
897 to->di_projid_hi = cpu_to_be16(from->di_projid_hi); 897 to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
898 memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); 898 memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
899 to->di_flushiter = cpu_to_be16(from->di_flushiter);
900 to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec); 899 to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
901 to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec); 900 to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec);
902 to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec); 901 to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec);
@@ -924,6 +923,9 @@ xfs_dinode_to_disk(
924 to->di_lsn = cpu_to_be64(from->di_lsn); 923 to->di_lsn = cpu_to_be64(from->di_lsn);
925 memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); 924 memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
926 uuid_copy(&to->di_uuid, &from->di_uuid); 925 uuid_copy(&to->di_uuid, &from->di_uuid);
926 to->di_flushiter = 0;
927 } else {
928 to->di_flushiter = cpu_to_be16(from->di_flushiter);
927 } 929 }
928} 930}
929 931
@@ -1028,6 +1030,15 @@ xfs_dinode_calc_crc(
1028 1030
1029/* 1031/*
1030 * Read the disk inode attributes into the in-core inode structure. 1032 * Read the disk inode attributes into the in-core inode structure.
1033 *
1034 * For version 5 superblocks, if we are initialising a new inode and we are not
1035 * utilising the XFS_MOUNT_IKEEP inode cluster mode, we can simple build the new
1036 * inode core with a random generation number. If we are keeping inodes around,
1037 * we need to read the inode cluster to get the existing generation number off
1038 * disk. Further, if we are using version 4 superblocks (i.e. v1/v2 inode
1039 * format) then log recovery is dependent on the di_flushiter field being
1040 * initialised from the current on-disk value and hence we must also read the
1041 * inode off disk.
1031 */ 1042 */
1032int 1043int
1033xfs_iread( 1044xfs_iread(
@@ -1047,6 +1058,23 @@ xfs_iread(
1047 if (error) 1058 if (error)
1048 return error; 1059 return error;
1049 1060
1061 /* shortcut IO on inode allocation if possible */
1062 if ((iget_flags & XFS_IGET_CREATE) &&
1063 xfs_sb_version_hascrc(&mp->m_sb) &&
1064 !(mp->m_flags & XFS_MOUNT_IKEEP)) {
1065 /* initialise the on-disk inode core */
1066 memset(&ip->i_d, 0, sizeof(ip->i_d));
1067 ip->i_d.di_magic = XFS_DINODE_MAGIC;
1068 ip->i_d.di_gen = prandom_u32();
1069 if (xfs_sb_version_hascrc(&mp->m_sb)) {
1070 ip->i_d.di_version = 3;
1071 ip->i_d.di_ino = ip->i_ino;
1072 uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid);
1073 } else
1074 ip->i_d.di_version = 2;
1075 return 0;
1076 }
1077
1050 /* 1078 /*
1051 * Get pointers to the on-disk inode and the buffer containing it. 1079 * Get pointers to the on-disk inode and the buffer containing it.
1052 */ 1080 */
@@ -1133,17 +1161,16 @@ xfs_iread(
1133 xfs_buf_set_ref(bp, XFS_INO_REF); 1161 xfs_buf_set_ref(bp, XFS_INO_REF);
1134 1162
1135 /* 1163 /*
1136 * Use xfs_trans_brelse() to release the buffer containing the 1164 * Use xfs_trans_brelse() to release the buffer containing the on-disk
1137 * on-disk inode, because it was acquired with xfs_trans_read_buf() 1165 * inode, because it was acquired with xfs_trans_read_buf() in
1138 * in xfs_imap_to_bp() above. If tp is NULL, this is just a normal 1166 * xfs_imap_to_bp() above. If tp is NULL, this is just a normal
1139 * brelse(). If we're within a transaction, then xfs_trans_brelse() 1167 * brelse(). If we're within a transaction, then xfs_trans_brelse()
1140 * will only release the buffer if it is not dirty within the 1168 * will only release the buffer if it is not dirty within the
1141 * transaction. It will be OK to release the buffer in this case, 1169 * transaction. It will be OK to release the buffer in this case,
1142 * because inodes on disk are never destroyed and we will be 1170 * because inodes on disk are never destroyed and we will be locking the
1143 * locking the new in-core inode before putting it in the hash 1171 * new in-core inode before putting it in the cache where other
1144 * table where other processes can find it. Thus we don't have 1172 * processes can find it. Thus we don't have to worry about the inode
1145 * to worry about the inode being changed just because we released 1173 * being changed just because we released the buffer.
1146 * the buffer.
1147 */ 1174 */
1148 out_brelse: 1175 out_brelse:
1149 xfs_trans_brelse(tp, bp); 1176 xfs_trans_brelse(tp, bp);
@@ -2028,8 +2055,6 @@ xfs_ifree(
2028 int error; 2055 int error;
2029 int delete; 2056 int delete;
2030 xfs_ino_t first_ino; 2057 xfs_ino_t first_ino;
2031 xfs_dinode_t *dip;
2032 xfs_buf_t *ibp;
2033 2058
2034 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 2059 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
2035 ASSERT(ip->i_d.di_nlink == 0); 2060 ASSERT(ip->i_d.di_nlink == 0);
@@ -2042,14 +2067,13 @@ xfs_ifree(
2042 * Pull the on-disk inode from the AGI unlinked list. 2067 * Pull the on-disk inode from the AGI unlinked list.
2043 */ 2068 */
2044 error = xfs_iunlink_remove(tp, ip); 2069 error = xfs_iunlink_remove(tp, ip);
2045 if (error != 0) { 2070 if (error)
2046 return error; 2071 return error;
2047 }
2048 2072
2049 error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino); 2073 error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino);
2050 if (error != 0) { 2074 if (error)
2051 return error; 2075 return error;
2052 } 2076
2053 ip->i_d.di_mode = 0; /* mark incore inode as free */ 2077 ip->i_d.di_mode = 0; /* mark incore inode as free */
2054 ip->i_d.di_flags = 0; 2078 ip->i_d.di_flags = 0;
2055 ip->i_d.di_dmevmask = 0; 2079 ip->i_d.di_dmevmask = 0;
@@ -2061,31 +2085,10 @@ xfs_ifree(
2061 * by reincarnations of this inode. 2085 * by reincarnations of this inode.
2062 */ 2086 */
2063 ip->i_d.di_gen++; 2087 ip->i_d.di_gen++;
2064
2065 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 2088 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2066 2089
2067 error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, &dip, &ibp, 2090 if (delete)
2068 0, 0);
2069 if (error)
2070 return error;
2071
2072 /*
2073 * Clear the on-disk di_mode. This is to prevent xfs_bulkstat
2074 * from picking up this inode when it is reclaimed (its incore state
2075 * initialzed but not flushed to disk yet). The in-core di_mode is
2076 * already cleared and a corresponding transaction logged.
2077 * The hack here just synchronizes the in-core to on-disk
2078 * di_mode value in advance before the actual inode sync to disk.
2079 * This is OK because the inode is already unlinked and would never
2080 * change its di_mode again for this inode generation.
2081 * This is a temporary hack that would require a proper fix
2082 * in the future.
2083 */
2084 dip->di_mode = 0;
2085
2086 if (delete) {
2087 error = xfs_ifree_cluster(ip, tp, first_ino); 2091 error = xfs_ifree_cluster(ip, tp, first_ino);
2088 }
2089 2092
2090 return error; 2093 return error;
2091} 2094}
@@ -2160,8 +2163,8 @@ xfs_iroot_realloc(
2160 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, 2163 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
2161 (int)new_size); 2164 (int)new_size);
2162 ifp->if_broot_bytes = (int)new_size; 2165 ifp->if_broot_bytes = (int)new_size;
2163 ASSERT(ifp->if_broot_bytes <= 2166 ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
2164 XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ(ip)); 2167 XFS_IFORK_SIZE(ip, whichfork));
2165 memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t)); 2168 memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t));
2166 return; 2169 return;
2167 } 2170 }
@@ -2214,8 +2217,9 @@ xfs_iroot_realloc(
2214 kmem_free(ifp->if_broot); 2217 kmem_free(ifp->if_broot);
2215 ifp->if_broot = new_broot; 2218 ifp->if_broot = new_broot;
2216 ifp->if_broot_bytes = (int)new_size; 2219 ifp->if_broot_bytes = (int)new_size;
2217 ASSERT(ifp->if_broot_bytes <= 2220 if (ifp->if_broot)
2218 XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ(ip)); 2221 ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
2222 XFS_IFORK_SIZE(ip, whichfork));
2219 return; 2223 return;
2220} 2224}
2221 2225
@@ -2526,9 +2530,8 @@ xfs_iflush_fork(
2526 if ((iip->ili_fields & brootflag[whichfork]) && 2530 if ((iip->ili_fields & brootflag[whichfork]) &&
2527 (ifp->if_broot_bytes > 0)) { 2531 (ifp->if_broot_bytes > 0)) {
2528 ASSERT(ifp->if_broot != NULL); 2532 ASSERT(ifp->if_broot != NULL);
2529 ASSERT(ifp->if_broot_bytes <= 2533 ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
2530 (XFS_IFORK_SIZE(ip, whichfork) + 2534 XFS_IFORK_SIZE(ip, whichfork));
2531 XFS_BROOT_SIZE_ADJ(ip)));
2532 xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes, 2535 xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes,
2533 (xfs_bmdr_block_t *)cp, 2536 (xfs_bmdr_block_t *)cp,
2534 XFS_DFORK_SIZE(dip, mp, whichfork)); 2537 XFS_DFORK_SIZE(dip, mp, whichfork));
@@ -2886,12 +2889,18 @@ xfs_iflush_int(
2886 __func__, ip->i_ino, ip->i_d.di_forkoff, ip); 2889 __func__, ip->i_ino, ip->i_d.di_forkoff, ip);
2887 goto corrupt_out; 2890 goto corrupt_out;
2888 } 2891 }
2892
2889 /* 2893 /*
2890 * bump the flush iteration count, used to detect flushes which 2894 * Inode item log recovery for v1/v2 inodes are dependent on the
2891 * postdate a log record during recovery. This is redundant as we now 2895 * di_flushiter count for correct sequencing. We bump the flush
2892 * log every change and hence this can't happen. Still, it doesn't hurt. 2896 * iteration count so we can detect flushes which postdate a log record
2897 * during recovery. This is redundant as we now log every change and
2898 * hence this can't happen but we need to still do it to ensure
2899 * backwards compatibility with old kernels that predate logging all
2900 * inode changes.
2893 */ 2901 */
2894 ip->i_d.di_flushiter++; 2902 if (ip->i_d.di_version < 3)
2903 ip->i_d.di_flushiter++;
2895 2904
2896 /* 2905 /*
2897 * Copy the dirty parts of the inode into the on-disk 2906 * Copy the dirty parts of the inode into the on-disk
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 91129794aaec..b55fd347ab5b 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -250,6 +250,7 @@ typedef struct xfs_inode {
250 struct xfs_mount *i_mount; /* fs mount struct ptr */ 250 struct xfs_mount *i_mount; /* fs mount struct ptr */
251 struct xfs_dquot *i_udquot; /* user dquot */ 251 struct xfs_dquot *i_udquot; /* user dquot */
252 struct xfs_dquot *i_gdquot; /* group dquot */ 252 struct xfs_dquot *i_gdquot; /* group dquot */
253 struct xfs_dquot *i_pdquot; /* project dquot */
253 254
254 /* Inode location stuff */ 255 /* Inode location stuff */
255 xfs_ino_t i_ino; /* inode number (agno/agino)*/ 256 xfs_ino_t i_ino; /* inode number (agno/agino)*/
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 5e999680094a..6e2bca5d44d6 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -248,7 +248,7 @@ xfs_open_by_handle(
248 goto out_dput; 248 goto out_dput;
249 } 249 }
250 250
251 fd = get_unused_fd(); 251 fd = get_unused_fd_flags(0);
252 if (fd < 0) { 252 if (fd < 0) {
253 error = fd; 253 error = fd;
254 goto out_dput; 254 goto out_dput;
@@ -928,7 +928,7 @@ xfs_ioctl_setattr(
928 struct xfs_trans *tp; 928 struct xfs_trans *tp;
929 unsigned int lock_flags = 0; 929 unsigned int lock_flags = 0;
930 struct xfs_dquot *udqp = NULL; 930 struct xfs_dquot *udqp = NULL;
931 struct xfs_dquot *gdqp = NULL; 931 struct xfs_dquot *pdqp = NULL;
932 struct xfs_dquot *olddquot = NULL; 932 struct xfs_dquot *olddquot = NULL;
933 int code; 933 int code;
934 934
@@ -957,7 +957,7 @@ xfs_ioctl_setattr(
957 if (XFS_IS_QUOTA_ON(mp) && (mask & FSX_PROJID)) { 957 if (XFS_IS_QUOTA_ON(mp) && (mask & FSX_PROJID)) {
958 code = xfs_qm_vop_dqalloc(ip, ip->i_d.di_uid, 958 code = xfs_qm_vop_dqalloc(ip, ip->i_d.di_uid,
959 ip->i_d.di_gid, fa->fsx_projid, 959 ip->i_d.di_gid, fa->fsx_projid,
960 XFS_QMOPT_PQUOTA, &udqp, &gdqp); 960 XFS_QMOPT_PQUOTA, &udqp, NULL, &pdqp);
961 if (code) 961 if (code)
962 return code; 962 return code;
963 } 963 }
@@ -994,8 +994,8 @@ xfs_ioctl_setattr(
994 XFS_IS_PQUOTA_ON(mp) && 994 XFS_IS_PQUOTA_ON(mp) &&
995 xfs_get_projid(ip) != fa->fsx_projid) { 995 xfs_get_projid(ip) != fa->fsx_projid) {
996 ASSERT(tp); 996 ASSERT(tp);
997 code = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp, 997 code = xfs_qm_vop_chown_reserve(tp, ip, udqp, NULL,
998 capable(CAP_FOWNER) ? 998 pdqp, capable(CAP_FOWNER) ?
999 XFS_QMOPT_FORCE_RES : 0); 999 XFS_QMOPT_FORCE_RES : 0);
1000 if (code) /* out of quota */ 1000 if (code) /* out of quota */
1001 goto error_return; 1001 goto error_return;
@@ -1113,7 +1113,7 @@ xfs_ioctl_setattr(
1113 if (xfs_get_projid(ip) != fa->fsx_projid) { 1113 if (xfs_get_projid(ip) != fa->fsx_projid) {
1114 if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) { 1114 if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) {
1115 olddquot = xfs_qm_vop_chown(tp, ip, 1115 olddquot = xfs_qm_vop_chown(tp, ip,
1116 &ip->i_gdquot, gdqp); 1116 &ip->i_pdquot, pdqp);
1117 } 1117 }
1118 xfs_set_projid(ip, fa->fsx_projid); 1118 xfs_set_projid(ip, fa->fsx_projid);
1119 1119
@@ -1160,13 +1160,13 @@ xfs_ioctl_setattr(
1160 */ 1160 */
1161 xfs_qm_dqrele(olddquot); 1161 xfs_qm_dqrele(olddquot);
1162 xfs_qm_dqrele(udqp); 1162 xfs_qm_dqrele(udqp);
1163 xfs_qm_dqrele(gdqp); 1163 xfs_qm_dqrele(pdqp);
1164 1164
1165 return code; 1165 return code;
1166 1166
1167 error_return: 1167 error_return:
1168 xfs_qm_dqrele(udqp); 1168 xfs_qm_dqrele(udqp);
1169 xfs_qm_dqrele(gdqp); 1169 xfs_qm_dqrele(pdqp);
1170 xfs_trans_cancel(tp, 0); 1170 xfs_trans_cancel(tp, 0);
1171 if (lock_flags) 1171 if (lock_flags)
1172 xfs_iunlock(ip, lock_flags); 1172 xfs_iunlock(ip, lock_flags);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 8f8aaee7f379..6a7096422295 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -284,6 +284,15 @@ xfs_iomap_eof_want_preallocate(
284 return 0; 284 return 0;
285 285
286 /* 286 /*
287 * If the file is smaller than the minimum prealloc and we are using
288 * dynamic preallocation, don't do any preallocation at all as it is
289 * likely this is the only write to the file that is going to be done.
290 */
291 if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) &&
292 XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_writeio_blocks))
293 return 0;
294
295 /*
287 * If there are any real blocks past eof, then don't 296 * If there are any real blocks past eof, then don't
288 * do any speculative allocation. 297 * do any speculative allocation.
289 */ 298 */
@@ -345,6 +354,10 @@ xfs_iomap_eof_prealloc_initial_size(
345 if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) 354 if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)
346 return 0; 355 return 0;
347 356
357 /* If the file is small, then use the minimum prealloc */
358 if (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign))
359 return 0;
360
348 /* 361 /*
349 * As we write multiple pages, the offset will always align to the 362 * As we write multiple pages, the offset will always align to the
350 * start of a page and hence point to a hole at EOF. i.e. if the size is 363 * start of a page and hence point to a hole at EOF. i.e. if the size is
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index ca9ecaa81112..96dda62d497b 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -467,9 +467,6 @@ xfs_setattr_mode(
467 ASSERT(tp); 467 ASSERT(tp);
468 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 468 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
469 469
470 if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
471 mode &= ~S_ISGID;
472
473 ip->i_d.di_mode &= S_IFMT; 470 ip->i_d.di_mode &= S_IFMT;
474 ip->i_d.di_mode |= mode & ~S_IFMT; 471 ip->i_d.di_mode |= mode & ~S_IFMT;
475 472
@@ -495,15 +492,18 @@ xfs_setattr_nonsize(
495 492
496 trace_xfs_setattr(ip); 493 trace_xfs_setattr(ip);
497 494
498 if (mp->m_flags & XFS_MOUNT_RDONLY) 495 /* If acls are being inherited, we already have this checked */
499 return XFS_ERROR(EROFS); 496 if (!(flags & XFS_ATTR_NOACL)) {
497 if (mp->m_flags & XFS_MOUNT_RDONLY)
498 return XFS_ERROR(EROFS);
500 499
501 if (XFS_FORCED_SHUTDOWN(mp)) 500 if (XFS_FORCED_SHUTDOWN(mp))
502 return XFS_ERROR(EIO); 501 return XFS_ERROR(EIO);
503 502
504 error = -inode_change_ok(inode, iattr); 503 error = -inode_change_ok(inode, iattr);
505 if (error) 504 if (error)
506 return XFS_ERROR(error); 505 return XFS_ERROR(error);
506 }
507 507
508 ASSERT((mask & ATTR_SIZE) == 0); 508 ASSERT((mask & ATTR_SIZE) == 0);
509 509
@@ -539,7 +539,7 @@ xfs_setattr_nonsize(
539 ASSERT(udqp == NULL); 539 ASSERT(udqp == NULL);
540 ASSERT(gdqp == NULL); 540 ASSERT(gdqp == NULL);
541 error = xfs_qm_vop_dqalloc(ip, uid, gid, xfs_get_projid(ip), 541 error = xfs_qm_vop_dqalloc(ip, uid, gid, xfs_get_projid(ip),
542 qflags, &udqp, &gdqp); 542 qflags, &udqp, &gdqp, NULL);
543 if (error) 543 if (error)
544 return error; 544 return error;
545 } 545 }
@@ -575,7 +575,7 @@ xfs_setattr_nonsize(
575 (XFS_IS_GQUOTA_ON(mp) && igid != gid))) { 575 (XFS_IS_GQUOTA_ON(mp) && igid != gid))) {
576 ASSERT(tp); 576 ASSERT(tp);
577 error = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp, 577 error = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp,
578 capable(CAP_FOWNER) ? 578 NULL, capable(CAP_FOWNER) ?
579 XFS_QMOPT_FORCE_RES : 0); 579 XFS_QMOPT_FORCE_RES : 0);
580 if (error) /* out of quota */ 580 if (error) /* out of quota */
581 goto out_trans_cancel; 581 goto out_trans_cancel;
@@ -987,7 +987,8 @@ xfs_fiemap_format(
987 if (bmv->bmv_oflags & BMV_OF_PREALLOC) 987 if (bmv->bmv_oflags & BMV_OF_PREALLOC)
988 fiemap_flags |= FIEMAP_EXTENT_UNWRITTEN; 988 fiemap_flags |= FIEMAP_EXTENT_UNWRITTEN;
989 else if (bmv->bmv_oflags & BMV_OF_DELALLOC) { 989 else if (bmv->bmv_oflags & BMV_OF_DELALLOC) {
990 fiemap_flags |= FIEMAP_EXTENT_DELALLOC; 990 fiemap_flags |= (FIEMAP_EXTENT_DELALLOC |
991 FIEMAP_EXTENT_UNKNOWN);
991 physical = 0; /* no block yet */ 992 physical = 0; /* no block yet */
992 } 993 }
993 if (bmv->bmv_oflags & BMV_OF_LAST) 994 if (bmv->bmv_oflags & BMV_OF_LAST)
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 2ea7d402188d..b93e14b86754 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -43,7 +43,7 @@ xfs_internal_inum(
43{ 43{
44 return (ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino || 44 return (ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino ||
45 (xfs_sb_version_hasquota(&mp->m_sb) && 45 (xfs_sb_version_hasquota(&mp->m_sb) &&
46 (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino))); 46 xfs_is_quota_inode(&mp->m_sb, ino)));
47} 47}
48 48
49/* 49/*
@@ -221,7 +221,6 @@ xfs_bulkstat(
221 char __user *ubufp; /* pointer into user's buffer */ 221 char __user *ubufp; /* pointer into user's buffer */
222 int ubelem; /* spaces used in user's buffer */ 222 int ubelem; /* spaces used in user's buffer */
223 int ubused; /* bytes used by formatter */ 223 int ubused; /* bytes used by formatter */
224 xfs_buf_t *bp; /* ptr to on-disk inode cluster buf */
225 224
226 /* 225 /*
227 * Get the last inode value, see if there's nothing to do. 226 * Get the last inode value, see if there's nothing to do.
@@ -263,7 +262,6 @@ xfs_bulkstat(
263 rval = 0; 262 rval = 0;
264 while (XFS_BULKSTAT_UBLEFT(ubleft) && agno < mp->m_sb.sb_agcount) { 263 while (XFS_BULKSTAT_UBLEFT(ubleft) && agno < mp->m_sb.sb_agcount) {
265 cond_resched(); 264 cond_resched();
266 bp = NULL;
267 error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); 265 error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
268 if (error) { 266 if (error) {
269 /* 267 /*
@@ -383,11 +381,13 @@ xfs_bulkstat(
383 * Also start read-ahead now for this chunk. 381 * Also start read-ahead now for this chunk.
384 */ 382 */
385 if (r.ir_freecount < XFS_INODES_PER_CHUNK) { 383 if (r.ir_freecount < XFS_INODES_PER_CHUNK) {
384 struct blk_plug plug;
386 /* 385 /*
387 * Loop over all clusters in the next chunk. 386 * Loop over all clusters in the next chunk.
388 * Do a readahead if there are any allocated 387 * Do a readahead if there are any allocated
389 * inodes in that cluster. 388 * inodes in that cluster.
390 */ 389 */
390 blk_start_plug(&plug);
391 agbno = XFS_AGINO_TO_AGBNO(mp, r.ir_startino); 391 agbno = XFS_AGINO_TO_AGBNO(mp, r.ir_startino);
392 for (chunkidx = 0; 392 for (chunkidx = 0;
393 chunkidx < XFS_INODES_PER_CHUNK; 393 chunkidx < XFS_INODES_PER_CHUNK;
@@ -399,6 +399,7 @@ xfs_bulkstat(
399 agbno, nbcluster, 399 agbno, nbcluster,
400 &xfs_inode_buf_ops); 400 &xfs_inode_buf_ops);
401 } 401 }
402 blk_finish_plug(&plug);
402 irbp->ir_startino = r.ir_startino; 403 irbp->ir_startino = r.ir_startino;
403 irbp->ir_freecount = r.ir_freecount; 404 irbp->ir_freecount = r.ir_freecount;
404 irbp->ir_free = r.ir_free; 405 irbp->ir_free = r.ir_free;
@@ -433,27 +434,7 @@ xfs_bulkstat(
433 irbp->ir_freecount < XFS_INODES_PER_CHUNK; 434 irbp->ir_freecount < XFS_INODES_PER_CHUNK;
434 chunkidx++, clustidx++, agino++) { 435 chunkidx++, clustidx++, agino++) {
435 ASSERT(chunkidx < XFS_INODES_PER_CHUNK); 436 ASSERT(chunkidx < XFS_INODES_PER_CHUNK);
436 /* 437
437 * Recompute agbno if this is the
438 * first inode of the cluster.
439 *
440 * Careful with clustidx. There can be
441 * multiple clusters per chunk, a single
442 * cluster per chunk or a cluster that has
443 * inodes represented from several different
444 * chunks (if blocksize is large).
445 *
446 * Because of this, the starting clustidx is
447 * initialized to zero in this loop but must
448 * later be reset after reading in the cluster
449 * buffer.
450 */
451 if ((chunkidx & (nicluster - 1)) == 0) {
452 agbno = XFS_AGINO_TO_AGBNO(mp,
453 irbp->ir_startino) +
454 ((chunkidx & nimask) >>
455 mp->m_sb.sb_inopblog);
456 }
457 ino = XFS_AGINO_TO_INO(mp, agno, agino); 438 ino = XFS_AGINO_TO_INO(mp, agno, agino);
458 /* 439 /*
459 * Skip if this inode is free. 440 * Skip if this inode is free.
@@ -499,10 +480,6 @@ xfs_bulkstat(
499 480
500 cond_resched(); 481 cond_resched();
501 } 482 }
502
503 if (bp)
504 xfs_buf_relse(bp);
505
506 /* 483 /*
507 * Set up for the next loop iteration. 484 * Set up for the next loop iteration.
508 */ 485 */
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index b345a7c85153..d852a2b3e1fd 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1963,6 +1963,10 @@ xlog_write_calc_vec_length(
1963 headers++; 1963 headers++;
1964 1964
1965 for (lv = log_vector; lv; lv = lv->lv_next) { 1965 for (lv = log_vector; lv; lv = lv->lv_next) {
1966 /* we don't write ordered log vectors */
1967 if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED)
1968 continue;
1969
1966 headers += lv->lv_niovecs; 1970 headers += lv->lv_niovecs;
1967 1971
1968 for (i = 0; i < lv->lv_niovecs; i++) { 1972 for (i = 0; i < lv->lv_niovecs; i++) {
@@ -2216,7 +2220,7 @@ xlog_write(
2216 index = 0; 2220 index = 0;
2217 lv = log_vector; 2221 lv = log_vector;
2218 vecp = lv->lv_iovecp; 2222 vecp = lv->lv_iovecp;
2219 while (lv && index < lv->lv_niovecs) { 2223 while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
2220 void *ptr; 2224 void *ptr;
2221 int log_offset; 2225 int log_offset;
2222 2226
@@ -2236,13 +2240,22 @@ xlog_write(
2236 * This loop writes out as many regions as can fit in the amount 2240 * This loop writes out as many regions as can fit in the amount
2237 * of space which was allocated by xlog_state_get_iclog_space(). 2241 * of space which was allocated by xlog_state_get_iclog_space().
2238 */ 2242 */
2239 while (lv && index < lv->lv_niovecs) { 2243 while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
2240 struct xfs_log_iovec *reg = &vecp[index]; 2244 struct xfs_log_iovec *reg;
2241 struct xlog_op_header *ophdr; 2245 struct xlog_op_header *ophdr;
2242 int start_rec_copy; 2246 int start_rec_copy;
2243 int copy_len; 2247 int copy_len;
2244 int copy_off; 2248 int copy_off;
2249 bool ordered = false;
2250
2251 /* ordered log vectors have no regions to write */
2252 if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) {
2253 ASSERT(lv->lv_niovecs == 0);
2254 ordered = true;
2255 goto next_lv;
2256 }
2245 2257
2258 reg = &vecp[index];
2246 ASSERT(reg->i_len % sizeof(__int32_t) == 0); 2259 ASSERT(reg->i_len % sizeof(__int32_t) == 0);
2247 ASSERT((unsigned long)ptr % sizeof(__int32_t) == 0); 2260 ASSERT((unsigned long)ptr % sizeof(__int32_t) == 0);
2248 2261
@@ -2302,12 +2315,13 @@ xlog_write(
2302 break; 2315 break;
2303 2316
2304 if (++index == lv->lv_niovecs) { 2317 if (++index == lv->lv_niovecs) {
2318next_lv:
2305 lv = lv->lv_next; 2319 lv = lv->lv_next;
2306 index = 0; 2320 index = 0;
2307 if (lv) 2321 if (lv)
2308 vecp = lv->lv_iovecp; 2322 vecp = lv->lv_iovecp;
2309 } 2323 }
2310 if (record_cnt == 0) { 2324 if (record_cnt == 0 && ordered == false) {
2311 if (!lv) 2325 if (!lv)
2312 return 0; 2326 return 0;
2313 break; 2327 break;
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 5caee96059df..fb630e496c12 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -88,7 +88,8 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
88#define XLOG_REG_TYPE_UNMOUNT 17 88#define XLOG_REG_TYPE_UNMOUNT 17
89#define XLOG_REG_TYPE_COMMIT 18 89#define XLOG_REG_TYPE_COMMIT 18
90#define XLOG_REG_TYPE_TRANSHDR 19 90#define XLOG_REG_TYPE_TRANSHDR 19
91#define XLOG_REG_TYPE_MAX 19 91#define XLOG_REG_TYPE_ICREATE 20
92#define XLOG_REG_TYPE_MAX 20
92 93
93typedef struct xfs_log_iovec { 94typedef struct xfs_log_iovec {
94 void *i_addr; /* beginning address of region */ 95 void *i_addr; /* beginning address of region */
@@ -105,6 +106,8 @@ struct xfs_log_vec {
105 int lv_buf_len; /* size of formatted buffer */ 106 int lv_buf_len; /* size of formatted buffer */
106}; 107};
107 108
109#define XFS_LOG_VEC_ORDERED (-1)
110
108/* 111/*
109 * Structure used to pass callback function and the function's argument 112 * Structure used to pass callback function and the function's argument
110 * to the log manager. 113 * to the log manager.
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index d0833b54e55d..02b9cf3f8252 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -127,6 +127,7 @@ xlog_cil_prepare_log_vecs(
127 int index; 127 int index;
128 int len = 0; 128 int len = 0;
129 uint niovecs; 129 uint niovecs;
130 bool ordered = false;
130 131
131 /* Skip items which aren't dirty in this transaction. */ 132 /* Skip items which aren't dirty in this transaction. */
132 if (!(lidp->lid_flags & XFS_LID_DIRTY)) 133 if (!(lidp->lid_flags & XFS_LID_DIRTY))
@@ -137,14 +138,30 @@ xlog_cil_prepare_log_vecs(
137 if (!niovecs) 138 if (!niovecs)
138 continue; 139 continue;
139 140
141 /*
142 * Ordered items need to be tracked but we do not wish to write
143 * them. We need a logvec to track the object, but we do not
144 * need an iovec or buffer to be allocated for copying data.
145 */
146 if (niovecs == XFS_LOG_VEC_ORDERED) {
147 ordered = true;
148 niovecs = 0;
149 }
150
140 new_lv = kmem_zalloc(sizeof(*new_lv) + 151 new_lv = kmem_zalloc(sizeof(*new_lv) +
141 niovecs * sizeof(struct xfs_log_iovec), 152 niovecs * sizeof(struct xfs_log_iovec),
142 KM_SLEEP|KM_NOFS); 153 KM_SLEEP|KM_NOFS);
143 154
155 new_lv->lv_item = lidp->lid_item;
156 new_lv->lv_niovecs = niovecs;
157 if (ordered) {
158 /* track as an ordered logvec */
159 new_lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
160 goto next;
161 }
162
144 /* The allocated iovec region lies beyond the log vector. */ 163 /* The allocated iovec region lies beyond the log vector. */
145 new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1]; 164 new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1];
146 new_lv->lv_niovecs = niovecs;
147 new_lv->lv_item = lidp->lid_item;
148 165
149 /* build the vector array and calculate it's length */ 166 /* build the vector array and calculate it's length */
150 IOP_FORMAT(new_lv->lv_item, new_lv->lv_iovecp); 167 IOP_FORMAT(new_lv->lv_item, new_lv->lv_iovecp);
@@ -165,6 +182,7 @@ xlog_cil_prepare_log_vecs(
165 } 182 }
166 ASSERT(ptr == new_lv->lv_buf + new_lv->lv_buf_len); 183 ASSERT(ptr == new_lv->lv_buf + new_lv->lv_buf_len);
167 184
185next:
168 if (!ret_lv) 186 if (!ret_lv)
169 ret_lv = new_lv; 187 ret_lv = new_lv;
170 else 188 else
@@ -191,8 +209,18 @@ xfs_cil_prepare_item(
191 209
192 if (old) { 210 if (old) {
193 /* existing lv on log item, space used is a delta */ 211 /* existing lv on log item, space used is a delta */
194 ASSERT(!list_empty(&lv->lv_item->li_cil)); 212 ASSERT((old->lv_buf && old->lv_buf_len && old->lv_niovecs) ||
195 ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs); 213 old->lv_buf_len == XFS_LOG_VEC_ORDERED);
214
215 /*
216 * If the new item is ordered, keep the old one that is already
217 * tracking dirty or ordered regions
218 */
219 if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) {
220 ASSERT(!lv->lv_buf);
221 kmem_free(lv);
222 return;
223 }
196 224
197 *len += lv->lv_buf_len - old->lv_buf_len; 225 *len += lv->lv_buf_len - old->lv_buf_len;
198 *diff_iovecs += lv->lv_niovecs - old->lv_niovecs; 226 *diff_iovecs += lv->lv_niovecs - old->lv_niovecs;
@@ -201,10 +229,11 @@ xfs_cil_prepare_item(
201 } else { 229 } else {
202 /* new lv, must pin the log item */ 230 /* new lv, must pin the log item */
203 ASSERT(!lv->lv_item->li_lv); 231 ASSERT(!lv->lv_item->li_lv);
204 ASSERT(list_empty(&lv->lv_item->li_cil));
205 232
206 *len += lv->lv_buf_len; 233 if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) {
207 *diff_iovecs += lv->lv_niovecs; 234 *len += lv->lv_buf_len;
235 *diff_iovecs += lv->lv_niovecs;
236 }
208 IOP_PIN(lv->lv_item); 237 IOP_PIN(lv->lv_item);
209 238
210 } 239 }
@@ -259,18 +288,24 @@ xlog_cil_insert_items(
259 * We can do this safely because the context can't checkpoint until we 288 * We can do this safely because the context can't checkpoint until we
260 * are done so it doesn't matter exactly how we update the CIL. 289 * are done so it doesn't matter exactly how we update the CIL.
261 */ 290 */
262 for (lv = log_vector; lv; lv = lv->lv_next)
263 xfs_cil_prepare_item(log, lv, &len, &diff_iovecs);
264
265 /* account for space used by new iovec headers */
266 len += diff_iovecs * sizeof(xlog_op_header_t);
267
268 spin_lock(&cil->xc_cil_lock); 291 spin_lock(&cil->xc_cil_lock);
292 for (lv = log_vector; lv; ) {
293 struct xfs_log_vec *next = lv->lv_next;
269 294
270 /* move the items to the tail of the CIL */ 295 ASSERT(lv->lv_item->li_lv || list_empty(&lv->lv_item->li_cil));
271 for (lv = log_vector; lv; lv = lv->lv_next) 296 lv->lv_next = NULL;
297
298 /*
299 * xfs_cil_prepare_item() may free the lv, so move the item on
300 * the CIL first.
301 */
272 list_move_tail(&lv->lv_item->li_cil, &cil->xc_cil); 302 list_move_tail(&lv->lv_item->li_cil, &cil->xc_cil);
303 xfs_cil_prepare_item(log, lv, &len, &diff_iovecs);
304 lv = next;
305 }
273 306
307 /* account for space used by new iovec headers */
308 len += diff_iovecs * sizeof(xlog_op_header_t);
274 ctx->nvecs += diff_iovecs; 309 ctx->nvecs += diff_iovecs;
275 310
276 /* 311 /*
@@ -381,9 +416,7 @@ xlog_cil_push(
381 struct xfs_cil_ctx *new_ctx; 416 struct xfs_cil_ctx *new_ctx;
382 struct xlog_in_core *commit_iclog; 417 struct xlog_in_core *commit_iclog;
383 struct xlog_ticket *tic; 418 struct xlog_ticket *tic;
384 int num_lv;
385 int num_iovecs; 419 int num_iovecs;
386 int len;
387 int error = 0; 420 int error = 0;
388 struct xfs_trans_header thdr; 421 struct xfs_trans_header thdr;
389 struct xfs_log_iovec lhdr; 422 struct xfs_log_iovec lhdr;
@@ -428,12 +461,9 @@ xlog_cil_push(
428 * side which is currently locked out by the flush lock. 461 * side which is currently locked out by the flush lock.
429 */ 462 */
430 lv = NULL; 463 lv = NULL;
431 num_lv = 0;
432 num_iovecs = 0; 464 num_iovecs = 0;
433 len = 0;
434 while (!list_empty(&cil->xc_cil)) { 465 while (!list_empty(&cil->xc_cil)) {
435 struct xfs_log_item *item; 466 struct xfs_log_item *item;
436 int i;
437 467
438 item = list_first_entry(&cil->xc_cil, 468 item = list_first_entry(&cil->xc_cil,
439 struct xfs_log_item, li_cil); 469 struct xfs_log_item, li_cil);
@@ -444,11 +474,7 @@ xlog_cil_push(
444 lv->lv_next = item->li_lv; 474 lv->lv_next = item->li_lv;
445 lv = item->li_lv; 475 lv = item->li_lv;
446 item->li_lv = NULL; 476 item->li_lv = NULL;
447
448 num_lv++;
449 num_iovecs += lv->lv_niovecs; 477 num_iovecs += lv->lv_niovecs;
450 for (i = 0; i < lv->lv_niovecs; i++)
451 len += lv->lv_iovecp[i].i_len;
452 } 478 }
453 479
454 /* 480 /*
@@ -701,6 +727,7 @@ xfs_log_commit_cil(
701 if (commit_lsn) 727 if (commit_lsn)
702 *commit_lsn = log->l_cilp->xc_ctx->sequence; 728 *commit_lsn = log->l_cilp->xc_ctx->sequence;
703 729
730 /* xlog_cil_insert_items() destroys log_vector list */
704 xlog_cil_insert_items(log, log_vector, tp->t_ticket); 731 xlog_cil_insert_items(log, log_vector, tp->t_ticket);
705 732
706 /* check we didn't blow the reservation */ 733 /* check we didn't blow the reservation */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 7cf5e4eafe28..7681b19aa5dc 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -45,6 +45,7 @@
45#include "xfs_cksum.h" 45#include "xfs_cksum.h"
46#include "xfs_trace.h" 46#include "xfs_trace.h"
47#include "xfs_icache.h" 47#include "xfs_icache.h"
48#include "xfs_icreate_item.h"
48 49
49/* Need all the magic numbers and buffer ops structures from these headers */ 50/* Need all the magic numbers and buffer ops structures from these headers */
50#include "xfs_symlink.h" 51#include "xfs_symlink.h"
@@ -1617,7 +1618,10 @@ xlog_recover_add_to_trans(
1617 * form the cancelled buffer table. Hence they have tobe done last. 1618 * form the cancelled buffer table. Hence they have tobe done last.
1618 * 1619 *
1619 * 3. Inode allocation buffers must be replayed before inode items that 1620 * 3. Inode allocation buffers must be replayed before inode items that
1620 * read the buffer and replay changes into it. 1621 * read the buffer and replay changes into it. For filesystems using the
1622 * ICREATE transactions, this means XFS_LI_ICREATE objects need to get
1623 * treated the same as inode allocation buffers as they create and
1624 * initialise the buffers directly.
1621 * 1625 *
1622 * 4. Inode unlink buffers must be replayed after inode items are replayed. 1626 * 4. Inode unlink buffers must be replayed after inode items are replayed.
1623 * This ensures that inodes are completely flushed to the inode buffer 1627 * This ensures that inodes are completely flushed to the inode buffer
@@ -1632,10 +1636,17 @@ xlog_recover_add_to_trans(
1632 * from all the other buffers and move them to last. 1636 * from all the other buffers and move them to last.
1633 * 1637 *
1634 * Hence, 4 lists, in order from head to tail: 1638 * Hence, 4 lists, in order from head to tail:
1635 * - buffer_list for all buffers except cancelled/inode unlink buffers 1639 * - buffer_list for all buffers except cancelled/inode unlink buffers
1636 * - item_list for all non-buffer items 1640 * - item_list for all non-buffer items
1637 * - inode_buffer_list for inode unlink buffers 1641 * - inode_buffer_list for inode unlink buffers
1638 * - cancel_list for the cancelled buffers 1642 * - cancel_list for the cancelled buffers
1643 *
1644 * Note that we add objects to the tail of the lists so that first-to-last
1645 * ordering is preserved within the lists. Adding objects to the head of the
1646 * list means when we traverse from the head we walk them in last-to-first
1647 * order. For cancelled buffers and inode unlink buffers this doesn't matter,
1648 * but for all other items there may be specific ordering that we need to
1649 * preserve.
1639 */ 1650 */
1640STATIC int 1651STATIC int
1641xlog_recover_reorder_trans( 1652xlog_recover_reorder_trans(
@@ -1655,6 +1666,9 @@ xlog_recover_reorder_trans(
1655 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; 1666 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
1656 1667
1657 switch (ITEM_TYPE(item)) { 1668 switch (ITEM_TYPE(item)) {
1669 case XFS_LI_ICREATE:
1670 list_move_tail(&item->ri_list, &buffer_list);
1671 break;
1658 case XFS_LI_BUF: 1672 case XFS_LI_BUF:
1659 if (buf_f->blf_flags & XFS_BLF_CANCEL) { 1673 if (buf_f->blf_flags & XFS_BLF_CANCEL) {
1660 trace_xfs_log_recover_item_reorder_head(log, 1674 trace_xfs_log_recover_item_reorder_head(log,
@@ -2578,8 +2592,16 @@ xlog_recover_inode_pass2(
2578 goto error; 2592 goto error;
2579 } 2593 }
2580 2594
2581 /* Skip replay when the on disk inode is newer than the log one */ 2595 /*
2582 if (dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) { 2596 * di_flushiter is only valid for v1/2 inodes. All changes for v3 inodes
2597 * are transactional and if ordering is necessary we can determine that
2598 * more accurately by the LSN field in the V3 inode core. Don't trust
2599 * the inode versions we might be changing them here - use the
2600 * superblock flag to determine whether we need to look at di_flushiter
2601 * to skip replay when the on disk inode is newer than the log one
2602 */
2603 if (!xfs_sb_version_hascrc(&mp->m_sb) &&
2604 dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
2583 /* 2605 /*
2584 * Deal with the wrap case, DI_MAX_FLUSH is less 2606 * Deal with the wrap case, DI_MAX_FLUSH is less
2585 * than smaller numbers 2607 * than smaller numbers
@@ -2594,6 +2616,7 @@ xlog_recover_inode_pass2(
2594 goto error; 2616 goto error;
2595 } 2617 }
2596 } 2618 }
2619
2597 /* Take the opportunity to reset the flush iteration count */ 2620 /* Take the opportunity to reset the flush iteration count */
2598 dicp->di_flushiter = 0; 2621 dicp->di_flushiter = 0;
2599 2622
@@ -2982,6 +3005,93 @@ xlog_recover_efd_pass2(
2982} 3005}
2983 3006
2984/* 3007/*
3008 * This routine is called when an inode create format structure is found in a
3009 * committed transaction in the log. It's purpose is to initialise the inodes
3010 * being allocated on disk. This requires us to get inode cluster buffers that
3011 * match the range to be intialised, stamped with inode templates and written
3012 * by delayed write so that subsequent modifications will hit the cached buffer
3013 * and only need writing out at the end of recovery.
3014 */
3015STATIC int
3016xlog_recover_do_icreate_pass2(
3017 struct xlog *log,
3018 struct list_head *buffer_list,
3019 xlog_recover_item_t *item)
3020{
3021 struct xfs_mount *mp = log->l_mp;
3022 struct xfs_icreate_log *icl;
3023 xfs_agnumber_t agno;
3024 xfs_agblock_t agbno;
3025 unsigned int count;
3026 unsigned int isize;
3027 xfs_agblock_t length;
3028
3029 icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr;
3030 if (icl->icl_type != XFS_LI_ICREATE) {
3031 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type");
3032 return EINVAL;
3033 }
3034
3035 if (icl->icl_size != 1) {
3036 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size");
3037 return EINVAL;
3038 }
3039
3040 agno = be32_to_cpu(icl->icl_ag);
3041 if (agno >= mp->m_sb.sb_agcount) {
3042 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno");
3043 return EINVAL;
3044 }
3045 agbno = be32_to_cpu(icl->icl_agbno);
3046 if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) {
3047 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno");
3048 return EINVAL;
3049 }
3050 isize = be32_to_cpu(icl->icl_isize);
3051 if (isize != mp->m_sb.sb_inodesize) {
3052 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize");
3053 return EINVAL;
3054 }
3055 count = be32_to_cpu(icl->icl_count);
3056 if (!count) {
3057 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count");
3058 return EINVAL;
3059 }
3060 length = be32_to_cpu(icl->icl_length);
3061 if (!length || length >= mp->m_sb.sb_agblocks) {
3062 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length");
3063 return EINVAL;
3064 }
3065
3066 /* existing allocation is fixed value */
3067 ASSERT(count == XFS_IALLOC_INODES(mp));
3068 ASSERT(length == XFS_IALLOC_BLOCKS(mp));
3069 if (count != XFS_IALLOC_INODES(mp) ||
3070 length != XFS_IALLOC_BLOCKS(mp)) {
3071 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2");
3072 return EINVAL;
3073 }
3074
3075 /*
3076 * Inode buffers can be freed. Do not replay the inode initialisation as
3077 * we could be overwriting something written after this inode buffer was
3078 * cancelled.
3079 *
3080 * XXX: we need to iterate all buffers and only init those that are not
3081 * cancelled. I think that a more fine grained factoring of
3082 * xfs_ialloc_inode_init may be appropriate here to enable this to be
3083 * done easily.
3084 */
3085 if (xlog_check_buffer_cancelled(log,
3086 XFS_AGB_TO_DADDR(mp, agno, agbno), length, 0))
3087 return 0;
3088
3089 xfs_ialloc_inode_init(mp, NULL, buffer_list, agno, agbno, length,
3090 be32_to_cpu(icl->icl_gen));
3091 return 0;
3092}
3093
3094/*
2985 * Free up any resources allocated by the transaction 3095 * Free up any resources allocated by the transaction
2986 * 3096 *
2987 * Remember that EFIs, EFDs, and IUNLINKs are handled later. 3097 * Remember that EFIs, EFDs, and IUNLINKs are handled later.
@@ -3023,6 +3133,7 @@ xlog_recover_commit_pass1(
3023 case XFS_LI_EFI: 3133 case XFS_LI_EFI:
3024 case XFS_LI_EFD: 3134 case XFS_LI_EFD:
3025 case XFS_LI_DQUOT: 3135 case XFS_LI_DQUOT:
3136 case XFS_LI_ICREATE:
3026 /* nothing to do in pass 1 */ 3137 /* nothing to do in pass 1 */
3027 return 0; 3138 return 0;
3028 default: 3139 default:
@@ -3053,6 +3164,8 @@ xlog_recover_commit_pass2(
3053 return xlog_recover_efd_pass2(log, item); 3164 return xlog_recover_efd_pass2(log, item);
3054 case XFS_LI_DQUOT: 3165 case XFS_LI_DQUOT:
3055 return xlog_recover_dquot_pass2(log, buffer_list, item); 3166 return xlog_recover_dquot_pass2(log, buffer_list, item);
3167 case XFS_LI_ICREATE:
3168 return xlog_recover_do_icreate_pass2(log, buffer_list, item);
3056 case XFS_LI_QUOTAOFF: 3169 case XFS_LI_QUOTAOFF:
3057 /* nothing to do in pass2 */ 3170 /* nothing to do in pass2 */
3058 return 0; 3171 return 0;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index e8e310c05097..2b0ba3581656 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -336,6 +336,14 @@ xfs_mount_validate_sb(
336 return XFS_ERROR(EWRONGFS); 336 return XFS_ERROR(EWRONGFS);
337 } 337 }
338 338
339 if ((sbp->sb_qflags & (XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD)) &&
340 (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD |
341 XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD))) {
342 xfs_notice(mp,
343"Super block has XFS_OQUOTA bits along with XFS_PQUOTA and/or XFS_GQUOTA bits.\n");
344 return XFS_ERROR(EFSCORRUPTED);
345 }
346
339 /* 347 /*
340 * Version 5 superblock feature mask validation. Reject combinations the 348 * Version 5 superblock feature mask validation. Reject combinations the
341 * kernel cannot support up front before checking anything else. For 349 * kernel cannot support up front before checking anything else. For
@@ -561,6 +569,18 @@ out_unwind:
561 return error; 569 return error;
562} 570}
563 571
572static void
573xfs_sb_quota_from_disk(struct xfs_sb *sbp)
574{
575 if (sbp->sb_qflags & XFS_OQUOTA_ENFD)
576 sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ?
577 XFS_PQUOTA_ENFD : XFS_GQUOTA_ENFD;
578 if (sbp->sb_qflags & XFS_OQUOTA_CHKD)
579 sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ?
580 XFS_PQUOTA_CHKD : XFS_GQUOTA_CHKD;
581 sbp->sb_qflags &= ~(XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD);
582}
583
564void 584void
565xfs_sb_from_disk( 585xfs_sb_from_disk(
566 struct xfs_sb *to, 586 struct xfs_sb *to,
@@ -622,6 +642,35 @@ xfs_sb_from_disk(
622 to->sb_lsn = be64_to_cpu(from->sb_lsn); 642 to->sb_lsn = be64_to_cpu(from->sb_lsn);
623} 643}
624 644
645static inline void
646xfs_sb_quota_to_disk(
647 xfs_dsb_t *to,
648 xfs_sb_t *from,
649 __int64_t *fields)
650{
651 __uint16_t qflags = from->sb_qflags;
652
653 if (*fields & XFS_SB_QFLAGS) {
654 /*
655 * The in-core version of sb_qflags do not have
656 * XFS_OQUOTA_* flags, whereas the on-disk version
657 * does. So, convert incore XFS_{PG}QUOTA_* flags
658 * to on-disk XFS_OQUOTA_* flags.
659 */
660 qflags &= ~(XFS_PQUOTA_ENFD | XFS_PQUOTA_CHKD |
661 XFS_GQUOTA_ENFD | XFS_GQUOTA_CHKD);
662
663 if (from->sb_qflags &
664 (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD))
665 qflags |= XFS_OQUOTA_ENFD;
666 if (from->sb_qflags &
667 (XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD))
668 qflags |= XFS_OQUOTA_CHKD;
669 to->sb_qflags = cpu_to_be16(qflags);
670 *fields &= ~XFS_SB_QFLAGS;
671 }
672}
673
625/* 674/*
626 * Copy in core superblock to ondisk one. 675 * Copy in core superblock to ondisk one.
627 * 676 *
@@ -643,6 +692,7 @@ xfs_sb_to_disk(
643 if (!fields) 692 if (!fields)
644 return; 693 return;
645 694
695 xfs_sb_quota_to_disk(to, from, &fields);
646 while (fields) { 696 while (fields) {
647 f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields); 697 f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
648 first = xfs_sb_info[f].offset; 698 first = xfs_sb_info[f].offset;
@@ -835,6 +885,7 @@ reread:
835 */ 885 */
836 xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp)); 886 xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp));
837 887
888 xfs_sb_quota_from_disk(&mp->m_sb);
838 /* 889 /*
839 * We must be able to do sector-sized and sector-aligned IO. 890 * We must be able to do sector-sized and sector-aligned IO.
840 */ 891 */
@@ -987,42 +1038,27 @@ xfs_update_alignment(xfs_mount_t *mp)
987 */ 1038 */
988 if ((BBTOB(mp->m_dalign) & mp->m_blockmask) || 1039 if ((BBTOB(mp->m_dalign) & mp->m_blockmask) ||
989 (BBTOB(mp->m_swidth) & mp->m_blockmask)) { 1040 (BBTOB(mp->m_swidth) & mp->m_blockmask)) {
990 if (mp->m_flags & XFS_MOUNT_RETERR) { 1041 xfs_warn(mp,
991 xfs_warn(mp, "alignment check failed: " 1042 "alignment check failed: sunit/swidth vs. blocksize(%d)",
992 "(sunit/swidth vs. blocksize)"); 1043 sbp->sb_blocksize);
993 return XFS_ERROR(EINVAL); 1044 return XFS_ERROR(EINVAL);
994 }
995 mp->m_dalign = mp->m_swidth = 0;
996 } else { 1045 } else {
997 /* 1046 /*
998 * Convert the stripe unit and width to FSBs. 1047 * Convert the stripe unit and width to FSBs.
999 */ 1048 */
1000 mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign); 1049 mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign);
1001 if (mp->m_dalign && (sbp->sb_agblocks % mp->m_dalign)) { 1050 if (mp->m_dalign && (sbp->sb_agblocks % mp->m_dalign)) {
1002 if (mp->m_flags & XFS_MOUNT_RETERR) {
1003 xfs_warn(mp, "alignment check failed: "
1004 "(sunit/swidth vs. ag size)");
1005 return XFS_ERROR(EINVAL);
1006 }
1007 xfs_warn(mp, 1051 xfs_warn(mp,
1008 "stripe alignment turned off: sunit(%d)/swidth(%d) " 1052 "alignment check failed: sunit/swidth vs. agsize(%d)",
1009 "incompatible with agsize(%d)", 1053 sbp->sb_agblocks);
1010 mp->m_dalign, mp->m_swidth, 1054 return XFS_ERROR(EINVAL);
1011 sbp->sb_agblocks);
1012
1013 mp->m_dalign = 0;
1014 mp->m_swidth = 0;
1015 } else if (mp->m_dalign) { 1055 } else if (mp->m_dalign) {
1016 mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth); 1056 mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
1017 } else { 1057 } else {
1018 if (mp->m_flags & XFS_MOUNT_RETERR) { 1058 xfs_warn(mp,
1019 xfs_warn(mp, "alignment check failed: " 1059 "alignment check failed: sunit(%d) less than bsize(%d)",
1020 "sunit(%d) less than bsize(%d)", 1060 mp->m_dalign, sbp->sb_blocksize);
1021 mp->m_dalign, 1061 return XFS_ERROR(EINVAL);
1022 mp->m_blockmask +1);
1023 return XFS_ERROR(EINVAL);
1024 }
1025 mp->m_swidth = 0;
1026 } 1062 }
1027 } 1063 }
1028 1064
@@ -1039,6 +1075,10 @@ xfs_update_alignment(xfs_mount_t *mp)
1039 sbp->sb_width = mp->m_swidth; 1075 sbp->sb_width = mp->m_swidth;
1040 mp->m_update_flags |= XFS_SB_WIDTH; 1076 mp->m_update_flags |= XFS_SB_WIDTH;
1041 } 1077 }
1078 } else {
1079 xfs_warn(mp,
1080 "cannot change alignment: superblock does not support data alignment");
1081 return XFS_ERROR(EINVAL);
1042 } 1082 }
1043 } else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN && 1083 } else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN &&
1044 xfs_sb_version_hasdalign(&mp->m_sb)) { 1084 xfs_sb_version_hasdalign(&mp->m_sb)) {
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index b004cecdfb04..4e374d4a9189 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -192,8 +192,6 @@ typedef struct xfs_mount {
192 xfs_dablk_t m_dirleafblk; /* blockno of dir non-data v2 */ 192 xfs_dablk_t m_dirleafblk; /* blockno of dir non-data v2 */
193 xfs_dablk_t m_dirfreeblk; /* blockno of dirfreeindex v2 */ 193 xfs_dablk_t m_dirfreeblk; /* blockno of dirfreeindex v2 */
194 uint m_chsize; /* size of next field */ 194 uint m_chsize; /* size of next field */
195 struct xfs_chash *m_chash; /* fs private inode per-cluster
196 * hash table */
197 atomic_t m_active_trans; /* number trans frozen */ 195 atomic_t m_active_trans; /* number trans frozen */
198#ifdef HAVE_PERCPU_SB 196#ifdef HAVE_PERCPU_SB
199 xfs_icsb_cnts_t __percpu *m_sb_cnts; /* per-cpu superblock counters */ 197 xfs_icsb_cnts_t __percpu *m_sb_cnts; /* per-cpu superblock counters */
@@ -229,8 +227,6 @@ typedef struct xfs_mount {
229 operations, typically for 227 operations, typically for
230 disk errors in metadata */ 228 disk errors in metadata */
231#define XFS_MOUNT_DISCARD (1ULL << 5) /* discard unused blocks */ 229#define XFS_MOUNT_DISCARD (1ULL << 5) /* discard unused blocks */
232#define XFS_MOUNT_RETERR (1ULL << 6) /* return alignment errors to
233 user */
234#define XFS_MOUNT_NOALIGN (1ULL << 7) /* turn off stripe alignment 230#define XFS_MOUNT_NOALIGN (1ULL << 7) /* turn off stripe alignment
235 allocations */ 231 allocations */
236#define XFS_MOUNT_ATTR2 (1ULL << 8) /* allow use of attr2 format */ 232#define XFS_MOUNT_ATTR2 (1ULL << 8) /* allow use of attr2 format */
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index b75c9bb6e71e..d320794d03ce 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -70,7 +70,7 @@ xfs_qm_dquot_walk(
70 void *data) 70 void *data)
71{ 71{
72 struct xfs_quotainfo *qi = mp->m_quotainfo; 72 struct xfs_quotainfo *qi = mp->m_quotainfo;
73 struct radix_tree_root *tree = XFS_DQUOT_TREE(qi, type); 73 struct radix_tree_root *tree = xfs_dquot_tree(qi, type);
74 uint32_t next_index; 74 uint32_t next_index;
75 int last_error = 0; 75 int last_error = 0;
76 int skipped; 76 int skipped;
@@ -137,6 +137,7 @@ xfs_qm_dqpurge(
137 struct xfs_mount *mp = dqp->q_mount; 137 struct xfs_mount *mp = dqp->q_mount;
138 struct xfs_quotainfo *qi = mp->m_quotainfo; 138 struct xfs_quotainfo *qi = mp->m_quotainfo;
139 struct xfs_dquot *gdqp = NULL; 139 struct xfs_dquot *gdqp = NULL;
140 struct xfs_dquot *pdqp = NULL;
140 141
141 xfs_dqlock(dqp); 142 xfs_dqlock(dqp);
142 if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) { 143 if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) {
@@ -145,8 +146,7 @@ xfs_qm_dqpurge(
145 } 146 }
146 147
147 /* 148 /*
148 * If this quota has a group hint attached, prepare for releasing it 149 * If this quota has a hint attached, prepare for releasing it now.
149 * now.
150 */ 150 */
151 gdqp = dqp->q_gdquot; 151 gdqp = dqp->q_gdquot;
152 if (gdqp) { 152 if (gdqp) {
@@ -154,6 +154,12 @@ xfs_qm_dqpurge(
154 dqp->q_gdquot = NULL; 154 dqp->q_gdquot = NULL;
155 } 155 }
156 156
157 pdqp = dqp->q_pdquot;
158 if (pdqp) {
159 xfs_dqlock(pdqp);
160 dqp->q_pdquot = NULL;
161 }
162
157 dqp->dq_flags |= XFS_DQ_FREEING; 163 dqp->dq_flags |= XFS_DQ_FREEING;
158 164
159 xfs_dqflock(dqp); 165 xfs_dqflock(dqp);
@@ -189,7 +195,7 @@ xfs_qm_dqpurge(
189 xfs_dqfunlock(dqp); 195 xfs_dqfunlock(dqp);
190 xfs_dqunlock(dqp); 196 xfs_dqunlock(dqp);
191 197
192 radix_tree_delete(XFS_DQUOT_TREE(qi, dqp->q_core.d_flags), 198 radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags),
193 be32_to_cpu(dqp->q_core.d_id)); 199 be32_to_cpu(dqp->q_core.d_id));
194 qi->qi_dquots--; 200 qi->qi_dquots--;
195 201
@@ -208,6 +214,8 @@ xfs_qm_dqpurge(
208 214
209 if (gdqp) 215 if (gdqp)
210 xfs_qm_dqput(gdqp); 216 xfs_qm_dqput(gdqp);
217 if (pdqp)
218 xfs_qm_dqput(pdqp);
211 return 0; 219 return 0;
212} 220}
213 221
@@ -299,8 +307,10 @@ xfs_qm_mount_quotas(
299 */ 307 */
300 if (!XFS_IS_UQUOTA_ON(mp)) 308 if (!XFS_IS_UQUOTA_ON(mp))
301 mp->m_qflags &= ~XFS_UQUOTA_CHKD; 309 mp->m_qflags &= ~XFS_UQUOTA_CHKD;
302 if (!(XFS_IS_GQUOTA_ON(mp) || XFS_IS_PQUOTA_ON(mp))) 310 if (!XFS_IS_GQUOTA_ON(mp))
303 mp->m_qflags &= ~XFS_OQUOTA_CHKD; 311 mp->m_qflags &= ~XFS_GQUOTA_CHKD;
312 if (!XFS_IS_PQUOTA_ON(mp))
313 mp->m_qflags &= ~XFS_PQUOTA_CHKD;
304 314
305 write_changes: 315 write_changes:
306 /* 316 /*
@@ -362,6 +372,10 @@ xfs_qm_unmount_quotas(
362 IRELE(mp->m_quotainfo->qi_gquotaip); 372 IRELE(mp->m_quotainfo->qi_gquotaip);
363 mp->m_quotainfo->qi_gquotaip = NULL; 373 mp->m_quotainfo->qi_gquotaip = NULL;
364 } 374 }
375 if (mp->m_quotainfo->qi_pquotaip) {
376 IRELE(mp->m_quotainfo->qi_pquotaip);
377 mp->m_quotainfo->qi_pquotaip = NULL;
378 }
365 } 379 }
366} 380}
367 381
@@ -408,7 +422,10 @@ xfs_qm_dqattach_one(
408 * be reclaimed as long as we have a ref from inode and we 422 * be reclaimed as long as we have a ref from inode and we
409 * hold the ilock. 423 * hold the ilock.
410 */ 424 */
411 dqp = udqhint->q_gdquot; 425 if (type == XFS_DQ_GROUP)
426 dqp = udqhint->q_gdquot;
427 else
428 dqp = udqhint->q_pdquot;
412 if (dqp && be32_to_cpu(dqp->q_core.d_id) == id) { 429 if (dqp && be32_to_cpu(dqp->q_core.d_id) == id) {
413 ASSERT(*IO_idqpp == NULL); 430 ASSERT(*IO_idqpp == NULL);
414 431
@@ -451,28 +468,42 @@ xfs_qm_dqattach_one(
451 468
452 469
453/* 470/*
454 * Given a udquot and gdquot, attach a ptr to the group dquot in the 471 * Given a udquot and group/project type, attach the group/project
455 * udquot as a hint for future lookups. 472 * dquot pointer to the udquot as a hint for future lookups.
456 */ 473 */
457STATIC void 474STATIC void
458xfs_qm_dqattach_grouphint( 475xfs_qm_dqattach_hint(
459 xfs_dquot_t *udq, 476 struct xfs_inode *ip,
460 xfs_dquot_t *gdq) 477 int type)
461{ 478{
462 xfs_dquot_t *tmp; 479 struct xfs_dquot **dqhintp;
480 struct xfs_dquot *dqp;
481 struct xfs_dquot *udq = ip->i_udquot;
482
483 ASSERT(type == XFS_DQ_GROUP || type == XFS_DQ_PROJ);
463 484
464 xfs_dqlock(udq); 485 xfs_dqlock(udq);
465 486
466 tmp = udq->q_gdquot; 487 if (type == XFS_DQ_GROUP) {
467 if (tmp) { 488 dqp = ip->i_gdquot;
468 if (tmp == gdq) 489 dqhintp = &udq->q_gdquot;
490 } else {
491 dqp = ip->i_pdquot;
492 dqhintp = &udq->q_pdquot;
493 }
494
495 if (*dqhintp) {
496 struct xfs_dquot *tmp;
497
498 if (*dqhintp == dqp)
469 goto done; 499 goto done;
470 500
471 udq->q_gdquot = NULL; 501 tmp = *dqhintp;
502 *dqhintp = NULL;
472 xfs_qm_dqrele(tmp); 503 xfs_qm_dqrele(tmp);
473 } 504 }
474 505
475 udq->q_gdquot = xfs_qm_dqhold(gdq); 506 *dqhintp = xfs_qm_dqhold(dqp);
476done: 507done:
477 xfs_dqunlock(udq); 508 xfs_dqunlock(udq);
478} 509}
@@ -489,8 +520,7 @@ xfs_qm_need_dqattach(
489 return false; 520 return false;
490 if (!XFS_NOT_DQATTACHED(mp, ip)) 521 if (!XFS_NOT_DQATTACHED(mp, ip))
491 return false; 522 return false;
492 if (ip->i_ino == mp->m_sb.sb_uquotino || 523 if (xfs_is_quota_inode(&mp->m_sb, ip->i_ino))
493 ip->i_ino == mp->m_sb.sb_gquotino)
494 return false; 524 return false;
495 return true; 525 return true;
496} 526}
@@ -526,12 +556,8 @@ xfs_qm_dqattach_locked(
526 } 556 }
527 557
528 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 558 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
529 if (XFS_IS_OQUOTA_ON(mp)) { 559 if (XFS_IS_GQUOTA_ON(mp)) {
530 error = XFS_IS_GQUOTA_ON(mp) ? 560 error = xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP,
531 xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP,
532 flags & XFS_QMOPT_DQALLOC,
533 ip->i_udquot, &ip->i_gdquot) :
534 xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ,
535 flags & XFS_QMOPT_DQALLOC, 561 flags & XFS_QMOPT_DQALLOC,
536 ip->i_udquot, &ip->i_gdquot); 562 ip->i_udquot, &ip->i_gdquot);
537 /* 563 /*
@@ -543,14 +569,28 @@ xfs_qm_dqattach_locked(
543 nquotas++; 569 nquotas++;
544 } 570 }
545 571
572 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
573 if (XFS_IS_PQUOTA_ON(mp)) {
574 error = xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ,
575 flags & XFS_QMOPT_DQALLOC,
576 ip->i_udquot, &ip->i_pdquot);
577 /*
578 * Don't worry about the udquot that we may have
579 * attached above. It'll get detached, if not already.
580 */
581 if (error)
582 goto done;
583 nquotas++;
584 }
585
546 /* 586 /*
547 * Attach this group quota to the user quota as a hint. 587 * Attach this group/project quota to the user quota as a hint.
548 * This WON'T, in general, result in a thrash. 588 * This WON'T, in general, result in a thrash.
549 */ 589 */
550 if (nquotas == 2) { 590 if (nquotas > 1 && ip->i_udquot) {
551 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 591 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
552 ASSERT(ip->i_udquot); 592 ASSERT(ip->i_gdquot || !XFS_IS_GQUOTA_ON(mp));
553 ASSERT(ip->i_gdquot); 593 ASSERT(ip->i_pdquot || !XFS_IS_PQUOTA_ON(mp));
554 594
555 /* 595 /*
556 * We do not have i_udquot locked at this point, but this check 596 * We do not have i_udquot locked at this point, but this check
@@ -559,7 +599,10 @@ xfs_qm_dqattach_locked(
559 * succeed in general. 599 * succeed in general.
560 */ 600 */
561 if (ip->i_udquot->q_gdquot != ip->i_gdquot) 601 if (ip->i_udquot->q_gdquot != ip->i_gdquot)
562 xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot); 602 xfs_qm_dqattach_hint(ip, XFS_DQ_GROUP);
603
604 if (ip->i_udquot->q_pdquot != ip->i_pdquot)
605 xfs_qm_dqattach_hint(ip, XFS_DQ_PROJ);
563 } 606 }
564 607
565 done: 608 done:
@@ -567,8 +610,10 @@ xfs_qm_dqattach_locked(
567 if (!error) { 610 if (!error) {
568 if (XFS_IS_UQUOTA_ON(mp)) 611 if (XFS_IS_UQUOTA_ON(mp))
569 ASSERT(ip->i_udquot); 612 ASSERT(ip->i_udquot);
570 if (XFS_IS_OQUOTA_ON(mp)) 613 if (XFS_IS_GQUOTA_ON(mp))
571 ASSERT(ip->i_gdquot); 614 ASSERT(ip->i_gdquot);
615 if (XFS_IS_PQUOTA_ON(mp))
616 ASSERT(ip->i_pdquot);
572 } 617 }
573 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 618 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
574#endif 619#endif
@@ -601,13 +646,12 @@ void
601xfs_qm_dqdetach( 646xfs_qm_dqdetach(
602 xfs_inode_t *ip) 647 xfs_inode_t *ip)
603{ 648{
604 if (!(ip->i_udquot || ip->i_gdquot)) 649 if (!(ip->i_udquot || ip->i_gdquot || ip->i_pdquot))
605 return; 650 return;
606 651
607 trace_xfs_dquot_dqdetach(ip); 652 trace_xfs_dquot_dqdetach(ip);
608 653
609 ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_uquotino); 654 ASSERT(!xfs_is_quota_inode(&ip->i_mount->m_sb, ip->i_ino));
610 ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_gquotino);
611 if (ip->i_udquot) { 655 if (ip->i_udquot) {
612 xfs_qm_dqrele(ip->i_udquot); 656 xfs_qm_dqrele(ip->i_udquot);
613 ip->i_udquot = NULL; 657 ip->i_udquot = NULL;
@@ -616,6 +660,10 @@ xfs_qm_dqdetach(
616 xfs_qm_dqrele(ip->i_gdquot); 660 xfs_qm_dqrele(ip->i_gdquot);
617 ip->i_gdquot = NULL; 661 ip->i_gdquot = NULL;
618 } 662 }
663 if (ip->i_pdquot) {
664 xfs_qm_dqrele(ip->i_pdquot);
665 ip->i_pdquot = NULL;
666 }
619} 667}
620 668
621int 669int
@@ -660,6 +708,7 @@ xfs_qm_init_quotainfo(
660 708
661 INIT_RADIX_TREE(&qinf->qi_uquota_tree, GFP_NOFS); 709 INIT_RADIX_TREE(&qinf->qi_uquota_tree, GFP_NOFS);
662 INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_NOFS); 710 INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_NOFS);
711 INIT_RADIX_TREE(&qinf->qi_pquota_tree, GFP_NOFS);
663 mutex_init(&qinf->qi_tree_lock); 712 mutex_init(&qinf->qi_tree_lock);
664 713
665 INIT_LIST_HEAD(&qinf->qi_lru_list); 714 INIT_LIST_HEAD(&qinf->qi_lru_list);
@@ -761,6 +810,10 @@ xfs_qm_destroy_quotainfo(
761 IRELE(qi->qi_gquotaip); 810 IRELE(qi->qi_gquotaip);
762 qi->qi_gquotaip = NULL; 811 qi->qi_gquotaip = NULL;
763 } 812 }
813 if (qi->qi_pquotaip) {
814 IRELE(qi->qi_pquotaip);
815 qi->qi_pquotaip = NULL;
816 }
764 mutex_destroy(&qi->qi_quotaofflock); 817 mutex_destroy(&qi->qi_quotaofflock);
765 kmem_free(qi); 818 kmem_free(qi);
766 mp->m_quotainfo = NULL; 819 mp->m_quotainfo = NULL;
@@ -1152,7 +1205,7 @@ xfs_qm_dqusage_adjust(
1152 * rootino must have its resources accounted for, not so with the quota 1205 * rootino must have its resources accounted for, not so with the quota
1153 * inodes. 1206 * inodes.
1154 */ 1207 */
1155 if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) { 1208 if (xfs_is_quota_inode(&mp->m_sb, ino)) {
1156 *res = BULKSTAT_RV_NOTHING; 1209 *res = BULKSTAT_RV_NOTHING;
1157 return XFS_ERROR(EINVAL); 1210 return XFS_ERROR(EINVAL);
1158 } 1211 }
@@ -1262,19 +1315,21 @@ int
1262xfs_qm_quotacheck( 1315xfs_qm_quotacheck(
1263 xfs_mount_t *mp) 1316 xfs_mount_t *mp)
1264{ 1317{
1265 int done, count, error, error2; 1318 int done, count, error, error2;
1266 xfs_ino_t lastino; 1319 xfs_ino_t lastino;
1267 size_t structsz; 1320 size_t structsz;
1268 xfs_inode_t *uip, *gip; 1321 uint flags;
1269 uint flags; 1322 LIST_HEAD (buffer_list);
1270 LIST_HEAD (buffer_list); 1323 struct xfs_inode *uip = mp->m_quotainfo->qi_uquotaip;
1324 struct xfs_inode *gip = mp->m_quotainfo->qi_gquotaip;
1325 struct xfs_inode *pip = mp->m_quotainfo->qi_pquotaip;
1271 1326
1272 count = INT_MAX; 1327 count = INT_MAX;
1273 structsz = 1; 1328 structsz = 1;
1274 lastino = 0; 1329 lastino = 0;
1275 flags = 0; 1330 flags = 0;
1276 1331
1277 ASSERT(mp->m_quotainfo->qi_uquotaip || mp->m_quotainfo->qi_gquotaip); 1332 ASSERT(uip || gip || pip);
1278 ASSERT(XFS_IS_QUOTA_RUNNING(mp)); 1333 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
1279 1334
1280 xfs_notice(mp, "Quotacheck needed: Please wait."); 1335 xfs_notice(mp, "Quotacheck needed: Please wait.");
@@ -1284,7 +1339,6 @@ xfs_qm_quotacheck(
1284 * their counters to zero. We need a clean slate. 1339 * their counters to zero. We need a clean slate.
1285 * We don't log our changes till later. 1340 * We don't log our changes till later.
1286 */ 1341 */
1287 uip = mp->m_quotainfo->qi_uquotaip;
1288 if (uip) { 1342 if (uip) {
1289 error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA, 1343 error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA,
1290 &buffer_list); 1344 &buffer_list);
@@ -1293,14 +1347,20 @@ xfs_qm_quotacheck(
1293 flags |= XFS_UQUOTA_CHKD; 1347 flags |= XFS_UQUOTA_CHKD;
1294 } 1348 }
1295 1349
1296 gip = mp->m_quotainfo->qi_gquotaip;
1297 if (gip) { 1350 if (gip) {
1298 error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ? 1351 error = xfs_qm_dqiterate(mp, gip, XFS_QMOPT_GQUOTA,
1299 XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA,
1300 &buffer_list); 1352 &buffer_list);
1301 if (error) 1353 if (error)
1302 goto error_return; 1354 goto error_return;
1303 flags |= XFS_OQUOTA_CHKD; 1355 flags |= XFS_GQUOTA_CHKD;
1356 }
1357
1358 if (pip) {
1359 error = xfs_qm_dqiterate(mp, pip, XFS_QMOPT_PQUOTA,
1360 &buffer_list);
1361 if (error)
1362 goto error_return;
1363 flags |= XFS_PQUOTA_CHKD;
1304 } 1364 }
1305 1365
1306 do { 1366 do {
@@ -1395,15 +1455,14 @@ STATIC int
1395xfs_qm_init_quotainos( 1455xfs_qm_init_quotainos(
1396 xfs_mount_t *mp) 1456 xfs_mount_t *mp)
1397{ 1457{
1398 xfs_inode_t *uip, *gip; 1458 struct xfs_inode *uip = NULL;
1399 int error; 1459 struct xfs_inode *gip = NULL;
1400 __int64_t sbflags; 1460 struct xfs_inode *pip = NULL;
1401 uint flags; 1461 int error;
1462 __int64_t sbflags = 0;
1463 uint flags = 0;
1402 1464
1403 ASSERT(mp->m_quotainfo); 1465 ASSERT(mp->m_quotainfo);
1404 uip = gip = NULL;
1405 sbflags = 0;
1406 flags = 0;
1407 1466
1408 /* 1467 /*
1409 * Get the uquota and gquota inodes 1468 * Get the uquota and gquota inodes
@@ -1412,19 +1471,27 @@ xfs_qm_init_quotainos(
1412 if (XFS_IS_UQUOTA_ON(mp) && 1471 if (XFS_IS_UQUOTA_ON(mp) &&
1413 mp->m_sb.sb_uquotino != NULLFSINO) { 1472 mp->m_sb.sb_uquotino != NULLFSINO) {
1414 ASSERT(mp->m_sb.sb_uquotino > 0); 1473 ASSERT(mp->m_sb.sb_uquotino > 0);
1415 if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 1474 error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
1416 0, 0, &uip))) 1475 0, 0, &uip);
1476 if (error)
1417 return XFS_ERROR(error); 1477 return XFS_ERROR(error);
1418 } 1478 }
1419 if (XFS_IS_OQUOTA_ON(mp) && 1479 if (XFS_IS_GQUOTA_ON(mp) &&
1420 mp->m_sb.sb_gquotino != NULLFSINO) { 1480 mp->m_sb.sb_gquotino != NULLFSINO) {
1421 ASSERT(mp->m_sb.sb_gquotino > 0); 1481 ASSERT(mp->m_sb.sb_gquotino > 0);
1422 if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 1482 error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
1423 0, 0, &gip))) { 1483 0, 0, &gip);
1424 if (uip) 1484 if (error)
1425 IRELE(uip); 1485 goto error_rele;
1426 return XFS_ERROR(error); 1486 }
1427 } 1487 /* XXX: Use gquotino for now */
1488 if (XFS_IS_PQUOTA_ON(mp) &&
1489 mp->m_sb.sb_gquotino != NULLFSINO) {
1490 ASSERT(mp->m_sb.sb_gquotino > 0);
1491 error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
1492 0, 0, &pip);
1493 if (error)
1494 goto error_rele;
1428 } 1495 }
1429 } else { 1496 } else {
1430 flags |= XFS_QMOPT_SBVERSION; 1497 flags |= XFS_QMOPT_SBVERSION;
@@ -1433,36 +1500,52 @@ xfs_qm_init_quotainos(
1433 } 1500 }
1434 1501
1435 /* 1502 /*
1436 * Create the two inodes, if they don't exist already. The changes 1503 * Create the three inodes, if they don't exist already. The changes
1437 * made above will get added to a transaction and logged in one of 1504 * made above will get added to a transaction and logged in one of
1438 * the qino_alloc calls below. If the device is readonly, 1505 * the qino_alloc calls below. If the device is readonly,
1439 * temporarily switch to read-write to do this. 1506 * temporarily switch to read-write to do this.
1440 */ 1507 */
1441 if (XFS_IS_UQUOTA_ON(mp) && uip == NULL) { 1508 if (XFS_IS_UQUOTA_ON(mp) && uip == NULL) {
1442 if ((error = xfs_qm_qino_alloc(mp, &uip, 1509 error = xfs_qm_qino_alloc(mp, &uip,
1443 sbflags | XFS_SB_UQUOTINO, 1510 sbflags | XFS_SB_UQUOTINO,
1444 flags | XFS_QMOPT_UQUOTA))) 1511 flags | XFS_QMOPT_UQUOTA);
1445 return XFS_ERROR(error); 1512 if (error)
1513 goto error_rele;
1446 1514
1447 flags &= ~XFS_QMOPT_SBVERSION; 1515 flags &= ~XFS_QMOPT_SBVERSION;
1448 } 1516 }
1449 if (XFS_IS_OQUOTA_ON(mp) && gip == NULL) { 1517 if (XFS_IS_GQUOTA_ON(mp) && gip == NULL) {
1450 flags |= (XFS_IS_GQUOTA_ON(mp) ?
1451 XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA);
1452 error = xfs_qm_qino_alloc(mp, &gip, 1518 error = xfs_qm_qino_alloc(mp, &gip,
1453 sbflags | XFS_SB_GQUOTINO, flags); 1519 sbflags | XFS_SB_GQUOTINO,
1454 if (error) { 1520 flags | XFS_QMOPT_GQUOTA);
1455 if (uip) 1521 if (error)
1456 IRELE(uip); 1522 goto error_rele;
1457 1523
1458 return XFS_ERROR(error); 1524 flags &= ~XFS_QMOPT_SBVERSION;
1459 } 1525 }
1526 if (XFS_IS_PQUOTA_ON(mp) && pip == NULL) {
1527 /* XXX: Use XFS_SB_GQUOTINO for now */
1528 error = xfs_qm_qino_alloc(mp, &pip,
1529 sbflags | XFS_SB_GQUOTINO,
1530 flags | XFS_QMOPT_PQUOTA);
1531 if (error)
1532 goto error_rele;
1460 } 1533 }
1461 1534
1462 mp->m_quotainfo->qi_uquotaip = uip; 1535 mp->m_quotainfo->qi_uquotaip = uip;
1463 mp->m_quotainfo->qi_gquotaip = gip; 1536 mp->m_quotainfo->qi_gquotaip = gip;
1537 mp->m_quotainfo->qi_pquotaip = pip;
1464 1538
1465 return 0; 1539 return 0;
1540
1541error_rele:
1542 if (uip)
1543 IRELE(uip);
1544 if (gip)
1545 IRELE(gip);
1546 if (pip)
1547 IRELE(pip);
1548 return XFS_ERROR(error);
1466} 1549}
1467 1550
1468STATIC void 1551STATIC void
@@ -1473,7 +1556,7 @@ xfs_qm_dqfree_one(
1473 struct xfs_quotainfo *qi = mp->m_quotainfo; 1556 struct xfs_quotainfo *qi = mp->m_quotainfo;
1474 1557
1475 mutex_lock(&qi->qi_tree_lock); 1558 mutex_lock(&qi->qi_tree_lock);
1476 radix_tree_delete(XFS_DQUOT_TREE(qi, dqp->q_core.d_flags), 1559 radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags),
1477 be32_to_cpu(dqp->q_core.d_id)); 1560 be32_to_cpu(dqp->q_core.d_id));
1478 1561
1479 qi->qi_dquots--; 1562 qi->qi_dquots--;
@@ -1656,10 +1739,13 @@ xfs_qm_vop_dqalloc(
1656 prid_t prid, 1739 prid_t prid,
1657 uint flags, 1740 uint flags,
1658 struct xfs_dquot **O_udqpp, 1741 struct xfs_dquot **O_udqpp,
1659 struct xfs_dquot **O_gdqpp) 1742 struct xfs_dquot **O_gdqpp,
1743 struct xfs_dquot **O_pdqpp)
1660{ 1744{
1661 struct xfs_mount *mp = ip->i_mount; 1745 struct xfs_mount *mp = ip->i_mount;
1662 struct xfs_dquot *uq, *gq; 1746 struct xfs_dquot *uq = NULL;
1747 struct xfs_dquot *gq = NULL;
1748 struct xfs_dquot *pq = NULL;
1663 int error; 1749 int error;
1664 uint lockflags; 1750 uint lockflags;
1665 1751
@@ -1684,7 +1770,6 @@ xfs_qm_vop_dqalloc(
1684 } 1770 }
1685 } 1771 }
1686 1772
1687 uq = gq = NULL;
1688 if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) { 1773 if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) {
1689 if (ip->i_d.di_uid != uid) { 1774 if (ip->i_d.di_uid != uid) {
1690 /* 1775 /*
@@ -1697,11 +1782,12 @@ xfs_qm_vop_dqalloc(
1697 * holding ilock. 1782 * holding ilock.
1698 */ 1783 */
1699 xfs_iunlock(ip, lockflags); 1784 xfs_iunlock(ip, lockflags);
1700 if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t) uid, 1785 error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t) uid,
1701 XFS_DQ_USER, 1786 XFS_DQ_USER,
1702 XFS_QMOPT_DQALLOC | 1787 XFS_QMOPT_DQALLOC |
1703 XFS_QMOPT_DOWARN, 1788 XFS_QMOPT_DOWARN,
1704 &uq))) { 1789 &uq);
1790 if (error) {
1705 ASSERT(error != ENOENT); 1791 ASSERT(error != ENOENT);
1706 return error; 1792 return error;
1707 } 1793 }
@@ -1723,15 +1809,14 @@ xfs_qm_vop_dqalloc(
1723 if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) { 1809 if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) {
1724 if (ip->i_d.di_gid != gid) { 1810 if (ip->i_d.di_gid != gid) {
1725 xfs_iunlock(ip, lockflags); 1811 xfs_iunlock(ip, lockflags);
1726 if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)gid, 1812 error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)gid,
1727 XFS_DQ_GROUP, 1813 XFS_DQ_GROUP,
1728 XFS_QMOPT_DQALLOC | 1814 XFS_QMOPT_DQALLOC |
1729 XFS_QMOPT_DOWARN, 1815 XFS_QMOPT_DOWARN,
1730 &gq))) { 1816 &gq);
1731 if (uq) 1817 if (error) {
1732 xfs_qm_dqrele(uq);
1733 ASSERT(error != ENOENT); 1818 ASSERT(error != ENOENT);
1734 return error; 1819 goto error_rele;
1735 } 1820 }
1736 xfs_dqunlock(gq); 1821 xfs_dqunlock(gq);
1737 lockflags = XFS_ILOCK_SHARED; 1822 lockflags = XFS_ILOCK_SHARED;
@@ -1740,25 +1825,25 @@ xfs_qm_vop_dqalloc(
1740 ASSERT(ip->i_gdquot); 1825 ASSERT(ip->i_gdquot);
1741 gq = xfs_qm_dqhold(ip->i_gdquot); 1826 gq = xfs_qm_dqhold(ip->i_gdquot);
1742 } 1827 }
1743 } else if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) { 1828 }
1829 if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) {
1744 if (xfs_get_projid(ip) != prid) { 1830 if (xfs_get_projid(ip) != prid) {
1745 xfs_iunlock(ip, lockflags); 1831 xfs_iunlock(ip, lockflags);
1746 if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid, 1832 error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid,
1747 XFS_DQ_PROJ, 1833 XFS_DQ_PROJ,
1748 XFS_QMOPT_DQALLOC | 1834 XFS_QMOPT_DQALLOC |
1749 XFS_QMOPT_DOWARN, 1835 XFS_QMOPT_DOWARN,
1750 &gq))) { 1836 &pq);
1751 if (uq) 1837 if (error) {
1752 xfs_qm_dqrele(uq);
1753 ASSERT(error != ENOENT); 1838 ASSERT(error != ENOENT);
1754 return (error); 1839 goto error_rele;
1755 } 1840 }
1756 xfs_dqunlock(gq); 1841 xfs_dqunlock(pq);
1757 lockflags = XFS_ILOCK_SHARED; 1842 lockflags = XFS_ILOCK_SHARED;
1758 xfs_ilock(ip, lockflags); 1843 xfs_ilock(ip, lockflags);
1759 } else { 1844 } else {
1760 ASSERT(ip->i_gdquot); 1845 ASSERT(ip->i_pdquot);
1761 gq = xfs_qm_dqhold(ip->i_gdquot); 1846 pq = xfs_qm_dqhold(ip->i_pdquot);
1762 } 1847 }
1763 } 1848 }
1764 if (uq) 1849 if (uq)
@@ -1773,7 +1858,18 @@ xfs_qm_vop_dqalloc(
1773 *O_gdqpp = gq; 1858 *O_gdqpp = gq;
1774 else if (gq) 1859 else if (gq)
1775 xfs_qm_dqrele(gq); 1860 xfs_qm_dqrele(gq);
1861 if (O_pdqpp)
1862 *O_pdqpp = pq;
1863 else if (pq)
1864 xfs_qm_dqrele(pq);
1776 return 0; 1865 return 0;
1866
1867error_rele:
1868 if (gq)
1869 xfs_qm_dqrele(gq);
1870 if (uq)
1871 xfs_qm_dqrele(uq);
1872 return error;
1777} 1873}
1778 1874
1779/* 1875/*
@@ -1821,29 +1917,34 @@ xfs_qm_vop_chown(
1821 */ 1917 */
1822int 1918int
1823xfs_qm_vop_chown_reserve( 1919xfs_qm_vop_chown_reserve(
1824 xfs_trans_t *tp, 1920 struct xfs_trans *tp,
1825 xfs_inode_t *ip, 1921 struct xfs_inode *ip,
1826 xfs_dquot_t *udqp, 1922 struct xfs_dquot *udqp,
1827 xfs_dquot_t *gdqp, 1923 struct xfs_dquot *gdqp,
1828 uint flags) 1924 struct xfs_dquot *pdqp,
1925 uint flags)
1829{ 1926{
1830 xfs_mount_t *mp = ip->i_mount; 1927 struct xfs_mount *mp = ip->i_mount;
1831 uint delblks, blkflags, prjflags = 0; 1928 uint delblks, blkflags, prjflags = 0;
1832 xfs_dquot_t *unresudq, *unresgdq, *delblksudq, *delblksgdq; 1929 struct xfs_dquot *udq_unres = NULL;
1833 int error; 1930 struct xfs_dquot *gdq_unres = NULL;
1931 struct xfs_dquot *pdq_unres = NULL;
1932 struct xfs_dquot *udq_delblks = NULL;
1933 struct xfs_dquot *gdq_delblks = NULL;
1934 struct xfs_dquot *pdq_delblks = NULL;
1935 int error;
1834 1936
1835 1937
1836 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 1938 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
1837 ASSERT(XFS_IS_QUOTA_RUNNING(mp)); 1939 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
1838 1940
1839 delblks = ip->i_delayed_blks; 1941 delblks = ip->i_delayed_blks;
1840 delblksudq = delblksgdq = unresudq = unresgdq = NULL;
1841 blkflags = XFS_IS_REALTIME_INODE(ip) ? 1942 blkflags = XFS_IS_REALTIME_INODE(ip) ?
1842 XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS; 1943 XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS;
1843 1944
1844 if (XFS_IS_UQUOTA_ON(mp) && udqp && 1945 if (XFS_IS_UQUOTA_ON(mp) && udqp &&
1845 ip->i_d.di_uid != (uid_t)be32_to_cpu(udqp->q_core.d_id)) { 1946 ip->i_d.di_uid != (uid_t)be32_to_cpu(udqp->q_core.d_id)) {
1846 delblksudq = udqp; 1947 udq_delblks = udqp;
1847 /* 1948 /*
1848 * If there are delayed allocation blocks, then we have to 1949 * If there are delayed allocation blocks, then we have to
1849 * unreserve those from the old dquot, and add them to the 1950 * unreserve those from the old dquot, and add them to the
@@ -1851,29 +1952,34 @@ xfs_qm_vop_chown_reserve(
1851 */ 1952 */
1852 if (delblks) { 1953 if (delblks) {
1853 ASSERT(ip->i_udquot); 1954 ASSERT(ip->i_udquot);
1854 unresudq = ip->i_udquot; 1955 udq_unres = ip->i_udquot;
1855 } 1956 }
1856 } 1957 }
1857 if (XFS_IS_OQUOTA_ON(ip->i_mount) && gdqp) { 1958 if (XFS_IS_GQUOTA_ON(ip->i_mount) && gdqp &&
1858 if (XFS_IS_PQUOTA_ON(ip->i_mount) && 1959 ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id)) {
1859 xfs_get_projid(ip) != be32_to_cpu(gdqp->q_core.d_id)) 1960 gdq_delblks = gdqp;
1860 prjflags = XFS_QMOPT_ENOSPC; 1961 if (delblks) {
1861 1962 ASSERT(ip->i_gdquot);
1862 if (prjflags || 1963 gdq_unres = ip->i_gdquot;
1863 (XFS_IS_GQUOTA_ON(ip->i_mount) &&
1864 ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id))) {
1865 delblksgdq = gdqp;
1866 if (delblks) {
1867 ASSERT(ip->i_gdquot);
1868 unresgdq = ip->i_gdquot;
1869 }
1870 } 1964 }
1871 } 1965 }
1872 1966
1873 if ((error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount, 1967 if (XFS_IS_PQUOTA_ON(ip->i_mount) && pdqp &&
1874 delblksudq, delblksgdq, ip->i_d.di_nblocks, 1, 1968 xfs_get_projid(ip) != be32_to_cpu(pdqp->q_core.d_id)) {
1875 flags | blkflags | prjflags))) 1969 prjflags = XFS_QMOPT_ENOSPC;
1876 return (error); 1970 pdq_delblks = pdqp;
1971 if (delblks) {
1972 ASSERT(ip->i_pdquot);
1973 pdq_unres = ip->i_pdquot;
1974 }
1975 }
1976
1977 error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount,
1978 udq_delblks, gdq_delblks, pdq_delblks,
1979 ip->i_d.di_nblocks, 1,
1980 flags | blkflags | prjflags);
1981 if (error)
1982 return error;
1877 1983
1878 /* 1984 /*
1879 * Do the delayed blks reservations/unreservations now. Since, these 1985 * Do the delayed blks reservations/unreservations now. Since, these
@@ -1885,15 +1991,17 @@ xfs_qm_vop_chown_reserve(
1885 /* 1991 /*
1886 * Do the reservations first. Unreservation can't fail. 1992 * Do the reservations first. Unreservation can't fail.
1887 */ 1993 */
1888 ASSERT(delblksudq || delblksgdq); 1994 ASSERT(udq_delblks || gdq_delblks || pdq_delblks);
1889 ASSERT(unresudq || unresgdq); 1995 ASSERT(udq_unres || gdq_unres || pdq_unres);
1890 if ((error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, 1996 error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
1891 delblksudq, delblksgdq, (xfs_qcnt_t)delblks, 0, 1997 udq_delblks, gdq_delblks, pdq_delblks,
1892 flags | blkflags | prjflags))) 1998 (xfs_qcnt_t)delblks, 0,
1893 return (error); 1999 flags | blkflags | prjflags);
2000 if (error)
2001 return error;
1894 xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, 2002 xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
1895 unresudq, unresgdq, -((xfs_qcnt_t)delblks), 0, 2003 udq_unres, gdq_unres, pdq_unres,
1896 blkflags); 2004 -((xfs_qcnt_t)delblks), 0, blkflags);
1897 } 2005 }
1898 2006
1899 return (0); 2007 return (0);
@@ -1932,7 +2040,8 @@ xfs_qm_vop_create_dqattach(
1932 struct xfs_trans *tp, 2040 struct xfs_trans *tp,
1933 struct xfs_inode *ip, 2041 struct xfs_inode *ip,
1934 struct xfs_dquot *udqp, 2042 struct xfs_dquot *udqp,
1935 struct xfs_dquot *gdqp) 2043 struct xfs_dquot *gdqp,
2044 struct xfs_dquot *pdqp)
1936{ 2045{
1937 struct xfs_mount *mp = tp->t_mountp; 2046 struct xfs_mount *mp = tp->t_mountp;
1938 2047
@@ -1952,13 +2061,18 @@ xfs_qm_vop_create_dqattach(
1952 } 2061 }
1953 if (gdqp) { 2062 if (gdqp) {
1954 ASSERT(ip->i_gdquot == NULL); 2063 ASSERT(ip->i_gdquot == NULL);
1955 ASSERT(XFS_IS_OQUOTA_ON(mp)); 2064 ASSERT(XFS_IS_GQUOTA_ON(mp));
1956 ASSERT((XFS_IS_GQUOTA_ON(mp) ? 2065 ASSERT(ip->i_d.di_gid == be32_to_cpu(gdqp->q_core.d_id));
1957 ip->i_d.di_gid : xfs_get_projid(ip)) ==
1958 be32_to_cpu(gdqp->q_core.d_id));
1959
1960 ip->i_gdquot = xfs_qm_dqhold(gdqp); 2066 ip->i_gdquot = xfs_qm_dqhold(gdqp);
1961 xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1); 2067 xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1);
1962 } 2068 }
2069 if (pdqp) {
2070 ASSERT(ip->i_pdquot == NULL);
2071 ASSERT(XFS_IS_PQUOTA_ON(mp));
2072 ASSERT(xfs_get_projid(ip) == be32_to_cpu(pdqp->q_core.d_id));
2073
2074 ip->i_pdquot = xfs_qm_dqhold(pdqp);
2075 xfs_trans_mod_dquot(tp, pdqp, XFS_TRANS_DQ_ICOUNT, 1);
2076 }
1963} 2077}
1964 2078
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 5d16a6e6900f..579d6a02a5b6 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -44,9 +44,11 @@ extern struct kmem_zone *xfs_qm_dqtrxzone;
44typedef struct xfs_quotainfo { 44typedef struct xfs_quotainfo {
45 struct radix_tree_root qi_uquota_tree; 45 struct radix_tree_root qi_uquota_tree;
46 struct radix_tree_root qi_gquota_tree; 46 struct radix_tree_root qi_gquota_tree;
47 struct radix_tree_root qi_pquota_tree;
47 struct mutex qi_tree_lock; 48 struct mutex qi_tree_lock;
48 xfs_inode_t *qi_uquotaip; /* user quota inode */ 49 struct xfs_inode *qi_uquotaip; /* user quota inode */
49 xfs_inode_t *qi_gquotaip; /* group quota inode */ 50 struct xfs_inode *qi_gquotaip; /* group quota inode */
51 struct xfs_inode *qi_pquotaip; /* project quota inode */
50 struct list_head qi_lru_list; 52 struct list_head qi_lru_list;
51 struct mutex qi_lru_lock; 53 struct mutex qi_lru_lock;
52 int qi_lru_count; 54 int qi_lru_count;
@@ -69,30 +71,66 @@ typedef struct xfs_quotainfo {
69 struct shrinker qi_shrinker; 71 struct shrinker qi_shrinker;
70} xfs_quotainfo_t; 72} xfs_quotainfo_t;
71 73
72#define XFS_DQUOT_TREE(qi, type) \ 74static inline struct radix_tree_root *
73 ((type & XFS_DQ_USER) ? \ 75xfs_dquot_tree(
74 &((qi)->qi_uquota_tree) : \ 76 struct xfs_quotainfo *qi,
75 &((qi)->qi_gquota_tree)) 77 int type)
78{
79 switch (type) {
80 case XFS_DQ_USER:
81 return &qi->qi_uquota_tree;
82 case XFS_DQ_GROUP:
83 return &qi->qi_gquota_tree;
84 case XFS_DQ_PROJ:
85 return &qi->qi_pquota_tree;
86 default:
87 ASSERT(0);
88 }
89 return NULL;
90}
76 91
92static inline struct xfs_inode *
93xfs_dq_to_quota_inode(struct xfs_dquot *dqp)
94{
95 switch (dqp->dq_flags & XFS_DQ_ALLTYPES) {
96 case XFS_DQ_USER:
97 return dqp->q_mount->m_quotainfo->qi_uquotaip;
98 case XFS_DQ_GROUP:
99 return dqp->q_mount->m_quotainfo->qi_gquotaip;
100 case XFS_DQ_PROJ:
101 return dqp->q_mount->m_quotainfo->qi_pquotaip;
102 default:
103 ASSERT(0);
104 }
105 return NULL;
106}
77 107
78extern int xfs_qm_calc_dquots_per_chunk(struct xfs_mount *mp, 108extern int xfs_qm_calc_dquots_per_chunk(struct xfs_mount *mp,
79 unsigned int nbblks); 109 unsigned int nbblks);
80extern void xfs_trans_mod_dquot(xfs_trans_t *, xfs_dquot_t *, uint, long); 110extern void xfs_trans_mod_dquot(struct xfs_trans *,
81extern int xfs_trans_reserve_quota_bydquots(xfs_trans_t *, xfs_mount_t *, 111 struct xfs_dquot *, uint, long);
82 xfs_dquot_t *, xfs_dquot_t *, long, long, uint); 112extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
83extern void xfs_trans_dqjoin(xfs_trans_t *, xfs_dquot_t *); 113 struct xfs_mount *, struct xfs_dquot *,
84extern void xfs_trans_log_dquot(xfs_trans_t *, xfs_dquot_t *); 114 struct xfs_dquot *, struct xfs_dquot *,
115 long, long, uint);
116extern void xfs_trans_dqjoin(struct xfs_trans *, struct xfs_dquot *);
117extern void xfs_trans_log_dquot(struct xfs_trans *, struct xfs_dquot *);
85 118
86/* 119/*
87 * We keep the usr and grp dquots separately so that locking will be easier 120 * We keep the usr, grp, and prj dquots separately so that locking will be
88 * to do at commit time. All transactions that we know of at this point 121 * easier to do at commit time. All transactions that we know of at this point
89 * affect no more than two dquots of one type. Hence, the TRANS_MAXDQS value. 122 * affect no more than two dquots of one type. Hence, the TRANS_MAXDQS value.
90 */ 123 */
124enum {
125 XFS_QM_TRANS_USR = 0,
126 XFS_QM_TRANS_GRP,
127 XFS_QM_TRANS_PRJ,
128 XFS_QM_TRANS_DQTYPES
129};
91#define XFS_QM_TRANS_MAXDQS 2 130#define XFS_QM_TRANS_MAXDQS 2
92typedef struct xfs_dquot_acct { 131struct xfs_dquot_acct {
93 xfs_dqtrx_t dqa_usrdquots[XFS_QM_TRANS_MAXDQS]; 132 struct xfs_dqtrx dqs[XFS_QM_TRANS_DQTYPES][XFS_QM_TRANS_MAXDQS];
94 xfs_dqtrx_t dqa_grpdquots[XFS_QM_TRANS_MAXDQS]; 133};
95} xfs_dquot_acct_t;
96 134
97/* 135/*
98 * Users are allowed to have a usage exceeding their softlimit for 136 * Users are allowed to have a usage exceeding their softlimit for
@@ -106,22 +144,23 @@ typedef struct xfs_dquot_acct {
106#define XFS_QM_IWARNLIMIT 5 144#define XFS_QM_IWARNLIMIT 5
107#define XFS_QM_RTBWARNLIMIT 5 145#define XFS_QM_RTBWARNLIMIT 5
108 146
109extern void xfs_qm_destroy_quotainfo(xfs_mount_t *); 147extern void xfs_qm_destroy_quotainfo(struct xfs_mount *);
110extern int xfs_qm_quotacheck(xfs_mount_t *); 148extern int xfs_qm_quotacheck(struct xfs_mount *);
111extern int xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t); 149extern int xfs_qm_write_sb_changes(struct xfs_mount *, __int64_t);
112 150
113/* dquot stuff */ 151/* dquot stuff */
114extern void xfs_qm_dqpurge_all(xfs_mount_t *, uint); 152extern void xfs_qm_dqpurge_all(struct xfs_mount *, uint);
115extern void xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint); 153extern void xfs_qm_dqrele_all_inodes(struct xfs_mount *, uint);
116 154
117/* quota ops */ 155/* quota ops */
118extern int xfs_qm_scall_trunc_qfiles(xfs_mount_t *, uint); 156extern int xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint);
119extern int xfs_qm_scall_getquota(xfs_mount_t *, xfs_dqid_t, uint, 157extern int xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t,
120 fs_disk_quota_t *); 158 uint, struct fs_disk_quota *);
121extern int xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint, 159extern int xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint,
122 fs_disk_quota_t *); 160 struct fs_disk_quota *);
123extern int xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *); 161extern int xfs_qm_scall_getqstat(struct xfs_mount *,
124extern int xfs_qm_scall_quotaon(xfs_mount_t *, uint); 162 struct fs_quota_stat *);
125extern int xfs_qm_scall_quotaoff(xfs_mount_t *, uint); 163extern int xfs_qm_scall_quotaon(struct xfs_mount *, uint);
164extern int xfs_qm_scall_quotaoff(struct xfs_mount *, uint);
126 165
127#endif /* __XFS_QM_H__ */ 166#endif /* __XFS_QM_H__ */
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index 2d02eac1c9a8..437a52d91f6d 100644
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -112,16 +112,16 @@ xfs_qm_newmount(
112 112
113 if (((uquotaondisk && !XFS_IS_UQUOTA_ON(mp)) || 113 if (((uquotaondisk && !XFS_IS_UQUOTA_ON(mp)) ||
114 (!uquotaondisk && XFS_IS_UQUOTA_ON(mp)) || 114 (!uquotaondisk && XFS_IS_UQUOTA_ON(mp)) ||
115 (pquotaondisk && !XFS_IS_PQUOTA_ON(mp)) ||
116 (!pquotaondisk && XFS_IS_PQUOTA_ON(mp)) ||
117 (gquotaondisk && !XFS_IS_GQUOTA_ON(mp)) || 115 (gquotaondisk && !XFS_IS_GQUOTA_ON(mp)) ||
118 (!gquotaondisk && XFS_IS_OQUOTA_ON(mp))) && 116 (!gquotaondisk && XFS_IS_GQUOTA_ON(mp)) ||
117 (pquotaondisk && !XFS_IS_PQUOTA_ON(mp)) ||
118 (!pquotaondisk && XFS_IS_PQUOTA_ON(mp))) &&
119 xfs_dev_is_read_only(mp, "changing quota state")) { 119 xfs_dev_is_read_only(mp, "changing quota state")) {
120 xfs_warn(mp, "please mount with%s%s%s%s.", 120 xfs_warn(mp, "please mount with%s%s%s%s.",
121 (!quotaondisk ? "out quota" : ""), 121 (!quotaondisk ? "out quota" : ""),
122 (uquotaondisk ? " usrquota" : ""), 122 (uquotaondisk ? " usrquota" : ""),
123 (pquotaondisk ? " prjquota" : ""), 123 (gquotaondisk ? " grpquota" : ""),
124 (gquotaondisk ? " grpquota" : "")); 124 (pquotaondisk ? " prjquota" : ""));
125 return XFS_ERROR(EPERM); 125 return XFS_ERROR(EPERM);
126 } 126 }
127 127
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 6cdf6ffc36a1..e4f8b2d6f38b 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -117,11 +117,12 @@ xfs_qm_scall_quotaoff(
117 } 117 }
118 if (flags & XFS_GQUOTA_ACCT) { 118 if (flags & XFS_GQUOTA_ACCT) {
119 dqtype |= XFS_QMOPT_GQUOTA; 119 dqtype |= XFS_QMOPT_GQUOTA;
120 flags |= (XFS_OQUOTA_CHKD | XFS_OQUOTA_ENFD); 120 flags |= (XFS_GQUOTA_CHKD | XFS_GQUOTA_ENFD);
121 inactivate_flags |= XFS_GQUOTA_ACTIVE; 121 inactivate_flags |= XFS_GQUOTA_ACTIVE;
122 } else if (flags & XFS_PQUOTA_ACCT) { 122 }
123 if (flags & XFS_PQUOTA_ACCT) {
123 dqtype |= XFS_QMOPT_PQUOTA; 124 dqtype |= XFS_QMOPT_PQUOTA;
124 flags |= (XFS_OQUOTA_CHKD | XFS_OQUOTA_ENFD); 125 flags |= (XFS_PQUOTA_CHKD | XFS_PQUOTA_ENFD);
125 inactivate_flags |= XFS_PQUOTA_ACTIVE; 126 inactivate_flags |= XFS_PQUOTA_ACTIVE;
126 } 127 }
127 128
@@ -198,10 +199,9 @@ xfs_qm_scall_quotaoff(
198 } 199 }
199 200
200 /* 201 /*
201 * If quotas is completely disabled, close shop. 202 * If all quotas are completely turned off, close shop.
202 */ 203 */
203 if (((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET1) || 204 if (mp->m_qflags == 0) {
204 ((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET2)) {
205 mutex_unlock(&q->qi_quotaofflock); 205 mutex_unlock(&q->qi_quotaofflock);
206 xfs_qm_destroy_quotainfo(mp); 206 xfs_qm_destroy_quotainfo(mp);
207 return (0); 207 return (0);
@@ -214,10 +214,14 @@ xfs_qm_scall_quotaoff(
214 IRELE(q->qi_uquotaip); 214 IRELE(q->qi_uquotaip);
215 q->qi_uquotaip = NULL; 215 q->qi_uquotaip = NULL;
216 } 216 }
217 if ((dqtype & (XFS_QMOPT_GQUOTA|XFS_QMOPT_PQUOTA)) && q->qi_gquotaip) { 217 if ((dqtype & XFS_QMOPT_GQUOTA) && q->qi_gquotaip) {
218 IRELE(q->qi_gquotaip); 218 IRELE(q->qi_gquotaip);
219 q->qi_gquotaip = NULL; 219 q->qi_gquotaip = NULL;
220 } 220 }
221 if ((dqtype & XFS_QMOPT_PQUOTA) && q->qi_pquotaip) {
222 IRELE(q->qi_pquotaip);
223 q->qi_pquotaip = NULL;
224 }
221 225
222out_unlock: 226out_unlock:
223 mutex_unlock(&q->qi_quotaofflock); 227 mutex_unlock(&q->qi_quotaofflock);
@@ -335,14 +339,14 @@ xfs_qm_scall_quotaon(
335 * quota acct on ondisk without m_qflags' knowing. 339 * quota acct on ondisk without m_qflags' knowing.
336 */ 340 */
337 if (((flags & XFS_UQUOTA_ACCT) == 0 && 341 if (((flags & XFS_UQUOTA_ACCT) == 0 &&
338 (mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 && 342 (mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 &&
339 (flags & XFS_UQUOTA_ENFD)) 343 (flags & XFS_UQUOTA_ENFD)) ||
340 || 344 ((flags & XFS_GQUOTA_ACCT) == 0 &&
345 (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 &&
346 (flags & XFS_GQUOTA_ENFD)) ||
341 ((flags & XFS_PQUOTA_ACCT) == 0 && 347 ((flags & XFS_PQUOTA_ACCT) == 0 &&
342 (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 && 348 (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 &&
343 (flags & XFS_GQUOTA_ACCT) == 0 && 349 (flags & XFS_PQUOTA_ENFD))) {
344 (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 &&
345 (flags & XFS_OQUOTA_ENFD))) {
346 xfs_debug(mp, 350 xfs_debug(mp,
347 "%s: Can't enforce without acct, flags=%x sbflags=%x\n", 351 "%s: Can't enforce without acct, flags=%x sbflags=%x\n",
348 __func__, flags, mp->m_sb.sb_qflags); 352 __func__, flags, mp->m_sb.sb_qflags);
@@ -407,11 +411,11 @@ xfs_qm_scall_getqstat(
407 struct fs_quota_stat *out) 411 struct fs_quota_stat *out)
408{ 412{
409 struct xfs_quotainfo *q = mp->m_quotainfo; 413 struct xfs_quotainfo *q = mp->m_quotainfo;
410 struct xfs_inode *uip, *gip; 414 struct xfs_inode *uip = NULL;
411 bool tempuqip, tempgqip; 415 struct xfs_inode *gip = NULL;
416 bool tempuqip = false;
417 bool tempgqip = false;
412 418
413 uip = gip = NULL;
414 tempuqip = tempgqip = false;
415 memset(out, 0, sizeof(fs_quota_stat_t)); 419 memset(out, 0, sizeof(fs_quota_stat_t));
416 420
417 out->qs_version = FS_QSTAT_VERSION; 421 out->qs_version = FS_QSTAT_VERSION;
@@ -776,9 +780,12 @@ xfs_qm_scall_getquota(
776 * gets turned off. No need to confuse the user level code, 780 * gets turned off. No need to confuse the user level code,
777 * so return zeroes in that case. 781 * so return zeroes in that case.
778 */ 782 */
779 if ((!XFS_IS_UQUOTA_ENFORCED(mp) && dqp->q_core.d_flags == XFS_DQ_USER) || 783 if ((!XFS_IS_UQUOTA_ENFORCED(mp) &&
780 (!XFS_IS_OQUOTA_ENFORCED(mp) && 784 dqp->q_core.d_flags == XFS_DQ_USER) ||
781 (dqp->q_core.d_flags & (XFS_DQ_PROJ | XFS_DQ_GROUP)))) { 785 (!XFS_IS_GQUOTA_ENFORCED(mp) &&
786 dqp->q_core.d_flags == XFS_DQ_GROUP) ||
787 (!XFS_IS_PQUOTA_ENFORCED(mp) &&
788 dqp->q_core.d_flags == XFS_DQ_PROJ)) {
782 dst->d_btimer = 0; 789 dst->d_btimer = 0;
783 dst->d_itimer = 0; 790 dst->d_itimer = 0;
784 dst->d_rtbtimer = 0; 791 dst->d_rtbtimer = 0;
@@ -786,8 +793,8 @@ xfs_qm_scall_getquota(
786 793
787#ifdef DEBUG 794#ifdef DEBUG
788 if (((XFS_IS_UQUOTA_ENFORCED(mp) && dst->d_flags == FS_USER_QUOTA) || 795 if (((XFS_IS_UQUOTA_ENFORCED(mp) && dst->d_flags == FS_USER_QUOTA) ||
789 (XFS_IS_OQUOTA_ENFORCED(mp) && 796 (XFS_IS_GQUOTA_ENFORCED(mp) && dst->d_flags == FS_GROUP_QUOTA) ||
790 (dst->d_flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)))) && 797 (XFS_IS_PQUOTA_ENFORCED(mp) && dst->d_flags == FS_PROJ_QUOTA)) &&
791 dst->d_id != 0) { 798 dst->d_id != 0) {
792 if ((dst->d_bcount > dst->d_blk_softlimit) && 799 if ((dst->d_bcount > dst->d_blk_softlimit) &&
793 (dst->d_blk_softlimit > 0)) { 800 (dst->d_blk_softlimit > 0)) {
@@ -833,16 +840,16 @@ xfs_qm_export_flags(
833 uflags = 0; 840 uflags = 0;
834 if (flags & XFS_UQUOTA_ACCT) 841 if (flags & XFS_UQUOTA_ACCT)
835 uflags |= FS_QUOTA_UDQ_ACCT; 842 uflags |= FS_QUOTA_UDQ_ACCT;
836 if (flags & XFS_PQUOTA_ACCT)
837 uflags |= FS_QUOTA_PDQ_ACCT;
838 if (flags & XFS_GQUOTA_ACCT) 843 if (flags & XFS_GQUOTA_ACCT)
839 uflags |= FS_QUOTA_GDQ_ACCT; 844 uflags |= FS_QUOTA_GDQ_ACCT;
845 if (flags & XFS_PQUOTA_ACCT)
846 uflags |= FS_QUOTA_PDQ_ACCT;
840 if (flags & XFS_UQUOTA_ENFD) 847 if (flags & XFS_UQUOTA_ENFD)
841 uflags |= FS_QUOTA_UDQ_ENFD; 848 uflags |= FS_QUOTA_UDQ_ENFD;
842 if (flags & (XFS_OQUOTA_ENFD)) { 849 if (flags & XFS_GQUOTA_ENFD)
843 uflags |= (flags & XFS_GQUOTA_ACCT) ? 850 uflags |= FS_QUOTA_GDQ_ENFD;
844 FS_QUOTA_GDQ_ENFD : FS_QUOTA_PDQ_ENFD; 851 if (flags & XFS_PQUOTA_ENFD)
845 } 852 uflags |= FS_QUOTA_PDQ_ENFD;
846 return (uflags); 853 return (uflags);
847} 854}
848 855
@@ -856,9 +863,11 @@ xfs_dqrele_inode(
856{ 863{
857 /* skip quota inodes */ 864 /* skip quota inodes */
858 if (ip == ip->i_mount->m_quotainfo->qi_uquotaip || 865 if (ip == ip->i_mount->m_quotainfo->qi_uquotaip ||
859 ip == ip->i_mount->m_quotainfo->qi_gquotaip) { 866 ip == ip->i_mount->m_quotainfo->qi_gquotaip ||
867 ip == ip->i_mount->m_quotainfo->qi_pquotaip) {
860 ASSERT(ip->i_udquot == NULL); 868 ASSERT(ip->i_udquot == NULL);
861 ASSERT(ip->i_gdquot == NULL); 869 ASSERT(ip->i_gdquot == NULL);
870 ASSERT(ip->i_pdquot == NULL);
862 return 0; 871 return 0;
863 } 872 }
864 873
@@ -867,10 +876,14 @@ xfs_dqrele_inode(
867 xfs_qm_dqrele(ip->i_udquot); 876 xfs_qm_dqrele(ip->i_udquot);
868 ip->i_udquot = NULL; 877 ip->i_udquot = NULL;
869 } 878 }
870 if (flags & (XFS_PQUOTA_ACCT|XFS_GQUOTA_ACCT) && ip->i_gdquot) { 879 if ((flags & XFS_GQUOTA_ACCT) && ip->i_gdquot) {
871 xfs_qm_dqrele(ip->i_gdquot); 880 xfs_qm_dqrele(ip->i_gdquot);
872 ip->i_gdquot = NULL; 881 ip->i_gdquot = NULL;
873 } 882 }
883 if ((flags & XFS_PQUOTA_ACCT) && ip->i_pdquot) {
884 xfs_qm_dqrele(ip->i_pdquot);
885 ip->i_pdquot = NULL;
886 }
874 xfs_iunlock(ip, XFS_ILOCK_EXCL); 887 xfs_iunlock(ip, XFS_ILOCK_EXCL);
875 return 0; 888 return 0;
876} 889}
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index c38068f26c55..b14f42c714b6 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -108,11 +108,28 @@ typedef struct xfs_dqblk {
108 { XFS_DQ_FREEING, "FREEING" } 108 { XFS_DQ_FREEING, "FREEING" }
109 109
110/* 110/*
111 * In the worst case, when both user and group quotas are on, 111 * We have the possibility of all three quota types being active at once, and
112 * we can have a max of three dquots changing in a single transaction. 112 * hence free space modification requires modification of all three current
113 * dquots in a single transaction. For this case we need to have a reservation
114 * of at least 3 dquots.
115 *
116 * However, a chmod operation can change both UID and GID in a single
117 * transaction, resulting in requiring {old, new} x {uid, gid} dquots to be
118 * modified. Hence for this case we need to reserve space for at least 4 dquots.
119 *
120 * And in the worst case, there's a rename operation that can be modifying up to
121 * 4 inodes with dquots attached to them. In reality, the only inodes that can
122 * have their dquots modified are the source and destination directory inodes
123 * due to directory name creation and removal. That can require space allocation
124 * and/or freeing on both directory inodes, and hence all three dquots on each
125 * inode can be modified. And if the directories are world writeable, all the
126 * dquots can be unique and so 6 dquots can be modified....
127 *
128 * And, of course, we also need to take into account the dquot log format item
129 * used to describe each dquot.
113 */ 130 */
114#define XFS_DQUOT_LOGRES(mp) (sizeof(xfs_disk_dquot_t) * 3) 131#define XFS_DQUOT_LOGRES(mp) \
115 132 ((sizeof(struct xfs_dq_logformat) + sizeof(struct xfs_disk_dquot)) * 6)
116 133
117/* 134/*
118 * These are the structures used to lay out dquots and quotaoff 135 * These are the structures used to lay out dquots and quotaoff
@@ -161,30 +178,42 @@ typedef struct xfs_qoff_logformat {
161#define XFS_GQUOTA_ACCT 0x0040 /* group quota accounting ON */ 178#define XFS_GQUOTA_ACCT 0x0040 /* group quota accounting ON */
162 179
163/* 180/*
181 * Conversion to and from the combined OQUOTA flag (if necessary)
182 * is done only in xfs_sb_qflags_to_disk() and xfs_sb_qflags_from_disk()
183 */
184#define XFS_GQUOTA_ENFD 0x0080 /* group quota limits enforced */
185#define XFS_GQUOTA_CHKD 0x0100 /* quotacheck run on group quotas */
186#define XFS_PQUOTA_ENFD 0x0200 /* project quota limits enforced */
187#define XFS_PQUOTA_CHKD 0x0400 /* quotacheck run on project quotas */
188
189/*
164 * Quota Accounting/Enforcement flags 190 * Quota Accounting/Enforcement flags
165 */ 191 */
166#define XFS_ALL_QUOTA_ACCT \ 192#define XFS_ALL_QUOTA_ACCT \
167 (XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT | XFS_PQUOTA_ACCT) 193 (XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT | XFS_PQUOTA_ACCT)
168#define XFS_ALL_QUOTA_ENFD (XFS_UQUOTA_ENFD | XFS_OQUOTA_ENFD) 194#define XFS_ALL_QUOTA_ENFD \
169#define XFS_ALL_QUOTA_CHKD (XFS_UQUOTA_CHKD | XFS_OQUOTA_CHKD) 195 (XFS_UQUOTA_ENFD | XFS_GQUOTA_ENFD | XFS_PQUOTA_ENFD)
196#define XFS_ALL_QUOTA_CHKD \
197 (XFS_UQUOTA_CHKD | XFS_GQUOTA_CHKD | XFS_PQUOTA_CHKD)
170 198
171#define XFS_IS_QUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT) 199#define XFS_IS_QUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT)
172#define XFS_IS_UQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT) 200#define XFS_IS_UQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT)
173#define XFS_IS_PQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_PQUOTA_ACCT) 201#define XFS_IS_PQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_PQUOTA_ACCT)
174#define XFS_IS_GQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT) 202#define XFS_IS_GQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT)
175#define XFS_IS_UQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_UQUOTA_ENFD) 203#define XFS_IS_UQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_UQUOTA_ENFD)
176#define XFS_IS_OQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_OQUOTA_ENFD) 204#define XFS_IS_GQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_GQUOTA_ENFD)
205#define XFS_IS_PQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_PQUOTA_ENFD)
177 206
178/* 207/*
179 * Incore only flags for quotaoff - these bits get cleared when quota(s) 208 * Incore only flags for quotaoff - these bits get cleared when quota(s)
180 * are in the process of getting turned off. These flags are in m_qflags but 209 * are in the process of getting turned off. These flags are in m_qflags but
181 * never in sb_qflags. 210 * never in sb_qflags.
182 */ 211 */
183#define XFS_UQUOTA_ACTIVE 0x0100 /* uquotas are being turned off */ 212#define XFS_UQUOTA_ACTIVE 0x1000 /* uquotas are being turned off */
184#define XFS_PQUOTA_ACTIVE 0x0200 /* pquotas are being turned off */ 213#define XFS_GQUOTA_ACTIVE 0x2000 /* gquotas are being turned off */
185#define XFS_GQUOTA_ACTIVE 0x0400 /* gquotas are being turned off */ 214#define XFS_PQUOTA_ACTIVE 0x4000 /* pquotas are being turned off */
186#define XFS_ALL_QUOTA_ACTIVE \ 215#define XFS_ALL_QUOTA_ACTIVE \
187 (XFS_UQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE) 216 (XFS_UQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE)
188 217
189/* 218/*
190 * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees 219 * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees
@@ -259,33 +288,24 @@ typedef struct xfs_qoff_logformat {
259 * we didn't have the inode locked, the appropriate dquot(s) will be 288 * we didn't have the inode locked, the appropriate dquot(s) will be
260 * attached atomically. 289 * attached atomically.
261 */ 290 */
262#define XFS_NOT_DQATTACHED(mp, ip) ((XFS_IS_UQUOTA_ON(mp) &&\ 291#define XFS_NOT_DQATTACHED(mp, ip) \
263 (ip)->i_udquot == NULL) || \ 292 ((XFS_IS_UQUOTA_ON(mp) && (ip)->i_udquot == NULL) || \
264 (XFS_IS_OQUOTA_ON(mp) && \ 293 (XFS_IS_GQUOTA_ON(mp) && (ip)->i_gdquot == NULL) || \
265 (ip)->i_gdquot == NULL)) 294 (XFS_IS_PQUOTA_ON(mp) && (ip)->i_pdquot == NULL))
266 295
267#define XFS_QM_NEED_QUOTACHECK(mp) \ 296#define XFS_QM_NEED_QUOTACHECK(mp) \
268 ((XFS_IS_UQUOTA_ON(mp) && \ 297 ((XFS_IS_UQUOTA_ON(mp) && \
269 (mp->m_sb.sb_qflags & XFS_UQUOTA_CHKD) == 0) || \ 298 (mp->m_sb.sb_qflags & XFS_UQUOTA_CHKD) == 0) || \
270 (XFS_IS_GQUOTA_ON(mp) && \ 299 (XFS_IS_GQUOTA_ON(mp) && \
271 ((mp->m_sb.sb_qflags & XFS_OQUOTA_CHKD) == 0 || \ 300 (mp->m_sb.sb_qflags & XFS_GQUOTA_CHKD) == 0) || \
272 (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT))) || \
273 (XFS_IS_PQUOTA_ON(mp) && \ 301 (XFS_IS_PQUOTA_ON(mp) && \
274 ((mp->m_sb.sb_qflags & XFS_OQUOTA_CHKD) == 0 || \ 302 (mp->m_sb.sb_qflags & XFS_PQUOTA_CHKD) == 0))
275 (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT))))
276
277#define XFS_MOUNT_QUOTA_SET1 (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
278 XFS_UQUOTA_CHKD|XFS_PQUOTA_ACCT|\
279 XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD)
280
281#define XFS_MOUNT_QUOTA_SET2 (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
282 XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\
283 XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD)
284 303
285#define XFS_MOUNT_QUOTA_ALL (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\ 304#define XFS_MOUNT_QUOTA_ALL (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
286 XFS_UQUOTA_CHKD|XFS_PQUOTA_ACCT|\ 305 XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\
287 XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD|\ 306 XFS_GQUOTA_ENFD|XFS_GQUOTA_CHKD|\
288 XFS_GQUOTA_ACCT) 307 XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD|\
308 XFS_PQUOTA_CHKD)
289 309
290 310
291/* 311/*
@@ -318,17 +338,18 @@ extern int xfs_trans_reserve_quota_nblks(struct xfs_trans *,
318 struct xfs_inode *, long, long, uint); 338 struct xfs_inode *, long, long, uint);
319extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *, 339extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
320 struct xfs_mount *, struct xfs_dquot *, 340 struct xfs_mount *, struct xfs_dquot *,
321 struct xfs_dquot *, long, long, uint); 341 struct xfs_dquot *, struct xfs_dquot *, long, long, uint);
322 342
323extern int xfs_qm_vop_dqalloc(struct xfs_inode *, uid_t, gid_t, prid_t, uint, 343extern int xfs_qm_vop_dqalloc(struct xfs_inode *, uid_t, gid_t, prid_t, uint,
324 struct xfs_dquot **, struct xfs_dquot **); 344 struct xfs_dquot **, struct xfs_dquot **, struct xfs_dquot **);
325extern void xfs_qm_vop_create_dqattach(struct xfs_trans *, struct xfs_inode *, 345extern void xfs_qm_vop_create_dqattach(struct xfs_trans *, struct xfs_inode *,
326 struct xfs_dquot *, struct xfs_dquot *); 346 struct xfs_dquot *, struct xfs_dquot *, struct xfs_dquot *);
327extern int xfs_qm_vop_rename_dqattach(struct xfs_inode **); 347extern int xfs_qm_vop_rename_dqattach(struct xfs_inode **);
328extern struct xfs_dquot *xfs_qm_vop_chown(struct xfs_trans *, 348extern struct xfs_dquot *xfs_qm_vop_chown(struct xfs_trans *,
329 struct xfs_inode *, struct xfs_dquot **, struct xfs_dquot *); 349 struct xfs_inode *, struct xfs_dquot **, struct xfs_dquot *);
330extern int xfs_qm_vop_chown_reserve(struct xfs_trans *, struct xfs_inode *, 350extern int xfs_qm_vop_chown_reserve(struct xfs_trans *, struct xfs_inode *,
331 struct xfs_dquot *, struct xfs_dquot *, uint); 351 struct xfs_dquot *, struct xfs_dquot *,
352 struct xfs_dquot *, uint);
332extern int xfs_qm_dqattach(struct xfs_inode *, uint); 353extern int xfs_qm_dqattach(struct xfs_inode *, uint);
333extern int xfs_qm_dqattach_locked(struct xfs_inode *, uint); 354extern int xfs_qm_dqattach_locked(struct xfs_inode *, uint);
334extern void xfs_qm_dqdetach(struct xfs_inode *); 355extern void xfs_qm_dqdetach(struct xfs_inode *);
@@ -342,10 +363,12 @@ extern void xfs_qm_unmount_quotas(struct xfs_mount *);
342#else 363#else
343static inline int 364static inline int
344xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid, 365xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid,
345 uint flags, struct xfs_dquot **udqp, struct xfs_dquot **gdqp) 366 uint flags, struct xfs_dquot **udqp, struct xfs_dquot **gdqp,
367 struct xfs_dquot **pdqp)
346{ 368{
347 *udqp = NULL; 369 *udqp = NULL;
348 *gdqp = NULL; 370 *gdqp = NULL;
371 *pdqp = NULL;
349 return 0; 372 return 0;
350} 373}
351#define xfs_trans_dup_dqinfo(tp, tp2) 374#define xfs_trans_dup_dqinfo(tp, tp2)
@@ -360,14 +383,15 @@ static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp,
360} 383}
361static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp, 384static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp,
362 struct xfs_mount *mp, struct xfs_dquot *udqp, 385 struct xfs_mount *mp, struct xfs_dquot *udqp,
363 struct xfs_dquot *gdqp, long nblks, long nions, uint flags) 386 struct xfs_dquot *gdqp, struct xfs_dquot *pdqp,
387 long nblks, long nions, uint flags)
364{ 388{
365 return 0; 389 return 0;
366} 390}
367#define xfs_qm_vop_create_dqattach(tp, ip, u, g) 391#define xfs_qm_vop_create_dqattach(tp, ip, u, g, p)
368#define xfs_qm_vop_rename_dqattach(it) (0) 392#define xfs_qm_vop_rename_dqattach(it) (0)
369#define xfs_qm_vop_chown(tp, ip, old, new) (NULL) 393#define xfs_qm_vop_chown(tp, ip, old, new) (NULL)
370#define xfs_qm_vop_chown_reserve(tp, ip, u, g, fl) (0) 394#define xfs_qm_vop_chown_reserve(tp, ip, u, g, p, fl) (0)
371#define xfs_qm_dqattach(ip, fl) (0) 395#define xfs_qm_dqattach(ip, fl) (0)
372#define xfs_qm_dqattach_locked(ip, fl) (0) 396#define xfs_qm_dqattach_locked(ip, fl) (0)
373#define xfs_qm_dqdetach(ip) 397#define xfs_qm_dqdetach(ip)
@@ -381,8 +405,8 @@ static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp,
381 405
382#define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \ 406#define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \
383 xfs_trans_reserve_quota_nblks(tp, ip, -(nblks), -(ninos), flags) 407 xfs_trans_reserve_quota_nblks(tp, ip, -(nblks), -(ninos), flags)
384#define xfs_trans_reserve_quota(tp, mp, ud, gd, nb, ni, f) \ 408#define xfs_trans_reserve_quota(tp, mp, ud, gd, pd, nb, ni, f) \
385 xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, nb, ni, \ 409 xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, pd, nb, ni, \
386 f | XFS_QMOPT_RES_REGBLKS) 410 f | XFS_QMOPT_RES_REGBLKS)
387 411
388extern int xfs_qm_dqcheck(struct xfs_mount *, xfs_disk_dquot_t *, 412extern int xfs_qm_dqcheck(struct xfs_mount *, xfs_disk_dquot_t *,
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index 71926d630527..20e30f93b0c7 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -75,8 +75,10 @@ xfs_fs_set_xstate(
75 flags |= XFS_GQUOTA_ACCT; 75 flags |= XFS_GQUOTA_ACCT;
76 if (uflags & FS_QUOTA_UDQ_ENFD) 76 if (uflags & FS_QUOTA_UDQ_ENFD)
77 flags |= XFS_UQUOTA_ENFD; 77 flags |= XFS_UQUOTA_ENFD;
78 if (uflags & (FS_QUOTA_PDQ_ENFD|FS_QUOTA_GDQ_ENFD)) 78 if (uflags & FS_QUOTA_GDQ_ENFD)
79 flags |= XFS_OQUOTA_ENFD; 79 flags |= XFS_GQUOTA_ENFD;
80 if (uflags & FS_QUOTA_PDQ_ENFD)
81 flags |= XFS_PQUOTA_ENFD;
80 82
81 switch (op) { 83 switch (op) {
82 case Q_XQUOTAON: 84 case Q_XQUOTAON:
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index 2de58a85833c..78f9e70b80c7 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -618,6 +618,12 @@ xfs_sb_has_incompat_log_feature(
618 return (sbp->sb_features_log_incompat & feature) != 0; 618 return (sbp->sb_features_log_incompat & feature) != 0;
619} 619}
620 620
621static inline bool
622xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino)
623{
624 return (ino == sbp->sb_uquotino || ino == sbp->sb_gquotino);
625}
626
621/* 627/*
622 * end of superblock version macros 628 * end of superblock version macros
623 */ 629 */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 3033ba5e9762..1d68ffcdeaa7 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -51,6 +51,7 @@
51#include "xfs_inode_item.h" 51#include "xfs_inode_item.h"
52#include "xfs_icache.h" 52#include "xfs_icache.h"
53#include "xfs_trace.h" 53#include "xfs_trace.h"
54#include "xfs_icreate_item.h"
54 55
55#include <linux/namei.h> 56#include <linux/namei.h>
56#include <linux/init.h> 57#include <linux/init.h>
@@ -359,17 +360,17 @@ xfs_parseargs(
359 } else if (!strcmp(this_char, MNTOPT_PQUOTA) || 360 } else if (!strcmp(this_char, MNTOPT_PQUOTA) ||
360 !strcmp(this_char, MNTOPT_PRJQUOTA)) { 361 !strcmp(this_char, MNTOPT_PRJQUOTA)) {
361 mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE | 362 mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE |
362 XFS_OQUOTA_ENFD); 363 XFS_PQUOTA_ENFD);
363 } else if (!strcmp(this_char, MNTOPT_PQUOTANOENF)) { 364 } else if (!strcmp(this_char, MNTOPT_PQUOTANOENF)) {
364 mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE); 365 mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE);
365 mp->m_qflags &= ~XFS_OQUOTA_ENFD; 366 mp->m_qflags &= ~XFS_PQUOTA_ENFD;
366 } else if (!strcmp(this_char, MNTOPT_GQUOTA) || 367 } else if (!strcmp(this_char, MNTOPT_GQUOTA) ||
367 !strcmp(this_char, MNTOPT_GRPQUOTA)) { 368 !strcmp(this_char, MNTOPT_GRPQUOTA)) {
368 mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE | 369 mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE |
369 XFS_OQUOTA_ENFD); 370 XFS_GQUOTA_ENFD);
370 } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) { 371 } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) {
371 mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE); 372 mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
372 mp->m_qflags &= ~XFS_OQUOTA_ENFD; 373 mp->m_qflags &= ~XFS_GQUOTA_ENFD;
373 } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) { 374 } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) {
374 xfs_warn(mp, 375 xfs_warn(mp,
375 "delaylog is the default now, option is deprecated."); 376 "delaylog is the default now, option is deprecated.");
@@ -439,20 +440,15 @@ xfs_parseargs(
439 } 440 }
440 441
441done: 442done:
442 if (!(mp->m_flags & XFS_MOUNT_NOALIGN)) { 443 if (dsunit && !(mp->m_flags & XFS_MOUNT_NOALIGN)) {
443 /* 444 /*
444 * At this point the superblock has not been read 445 * At this point the superblock has not been read
445 * in, therefore we do not know the block size. 446 * in, therefore we do not know the block size.
446 * Before the mount call ends we will convert 447 * Before the mount call ends we will convert
447 * these to FSBs. 448 * these to FSBs.
448 */ 449 */
449 if (dsunit) { 450 mp->m_dalign = dsunit;
450 mp->m_dalign = dsunit; 451 mp->m_swidth = dswidth;
451 mp->m_flags |= XFS_MOUNT_RETERR;
452 }
453
454 if (dswidth)
455 mp->m_swidth = dswidth;
456 } 452 }
457 453
458 if (mp->m_logbufs != -1 && 454 if (mp->m_logbufs != -1 &&
@@ -563,12 +559,12 @@ xfs_showargs(
563 /* Either project or group quotas can be active, not both */ 559 /* Either project or group quotas can be active, not both */
564 560
565 if (mp->m_qflags & XFS_PQUOTA_ACCT) { 561 if (mp->m_qflags & XFS_PQUOTA_ACCT) {
566 if (mp->m_qflags & XFS_OQUOTA_ENFD) 562 if (mp->m_qflags & XFS_PQUOTA_ENFD)
567 seq_puts(m, "," MNTOPT_PRJQUOTA); 563 seq_puts(m, "," MNTOPT_PRJQUOTA);
568 else 564 else
569 seq_puts(m, "," MNTOPT_PQUOTANOENF); 565 seq_puts(m, "," MNTOPT_PQUOTANOENF);
570 } else if (mp->m_qflags & XFS_GQUOTA_ACCT) { 566 } else if (mp->m_qflags & XFS_GQUOTA_ACCT) {
571 if (mp->m_qflags & XFS_OQUOTA_ENFD) 567 if (mp->m_qflags & XFS_GQUOTA_ENFD)
572 seq_puts(m, "," MNTOPT_GRPQUOTA); 568 seq_puts(m, "," MNTOPT_GRPQUOTA);
573 else 569 else
574 seq_puts(m, "," MNTOPT_GQUOTANOENF); 570 seq_puts(m, "," MNTOPT_GQUOTANOENF);
@@ -1136,8 +1132,8 @@ xfs_fs_statfs(
1136 spin_unlock(&mp->m_sb_lock); 1132 spin_unlock(&mp->m_sb_lock);
1137 1133
1138 if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && 1134 if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
1139 ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))) == 1135 ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))) ==
1140 (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD)) 1136 (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))
1141 xfs_qm_statvfs(ip, statp); 1137 xfs_qm_statvfs(ip, statp);
1142 return 0; 1138 return 0;
1143} 1139}
@@ -1481,6 +1477,10 @@ xfs_fs_fill_super(
1481 sb->s_time_gran = 1; 1477 sb->s_time_gran = 1;
1482 set_posix_acl_flag(sb); 1478 set_posix_acl_flag(sb);
1483 1479
1480 /* version 5 superblocks support inode version counters. */
1481 if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5)
1482 sb->s_flags |= MS_I_VERSION;
1483
1484 error = xfs_mountfs(mp); 1484 error = xfs_mountfs(mp);
1485 if (error) 1485 if (error)
1486 goto out_filestream_unmount; 1486 goto out_filestream_unmount;
@@ -1655,9 +1655,15 @@ xfs_init_zones(void)
1655 KM_ZONE_SPREAD, NULL); 1655 KM_ZONE_SPREAD, NULL);
1656 if (!xfs_ili_zone) 1656 if (!xfs_ili_zone)
1657 goto out_destroy_inode_zone; 1657 goto out_destroy_inode_zone;
1658 xfs_icreate_zone = kmem_zone_init(sizeof(struct xfs_icreate_item),
1659 "xfs_icr");
1660 if (!xfs_icreate_zone)
1661 goto out_destroy_ili_zone;
1658 1662
1659 return 0; 1663 return 0;
1660 1664
1665 out_destroy_ili_zone:
1666 kmem_zone_destroy(xfs_ili_zone);
1661 out_destroy_inode_zone: 1667 out_destroy_inode_zone:
1662 kmem_zone_destroy(xfs_inode_zone); 1668 kmem_zone_destroy(xfs_inode_zone);
1663 out_destroy_efi_zone: 1669 out_destroy_efi_zone:
@@ -1696,6 +1702,7 @@ xfs_destroy_zones(void)
1696 * destroy caches. 1702 * destroy caches.
1697 */ 1703 */
1698 rcu_barrier(); 1704 rcu_barrier();
1705 kmem_zone_destroy(xfs_icreate_zone);
1699 kmem_zone_destroy(xfs_ili_zone); 1706 kmem_zone_destroy(xfs_ili_zone);
1700 kmem_zone_destroy(xfs_inode_zone); 1707 kmem_zone_destroy(xfs_inode_zone);
1701 kmem_zone_destroy(xfs_efi_zone); 1708 kmem_zone_destroy(xfs_efi_zone);
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 195a403e1522..f4895b662fcb 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -358,7 +358,9 @@ xfs_symlink(
358 int n; 358 int n;
359 xfs_buf_t *bp; 359 xfs_buf_t *bp;
360 prid_t prid; 360 prid_t prid;
361 struct xfs_dquot *udqp, *gdqp; 361 struct xfs_dquot *udqp = NULL;
362 struct xfs_dquot *gdqp = NULL;
363 struct xfs_dquot *pdqp = NULL;
362 uint resblks; 364 uint resblks;
363 365
364 *ipp = NULL; 366 *ipp = NULL;
@@ -385,7 +387,7 @@ xfs_symlink(
385 * Make sure that we have allocated dquot(s) on disk. 387 * Make sure that we have allocated dquot(s) on disk.
386 */ 388 */
387 error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid, 389 error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
388 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp); 390 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp);
389 if (error) 391 if (error)
390 goto std_return; 392 goto std_return;
391 393
@@ -426,7 +428,8 @@ xfs_symlink(
426 /* 428 /*
427 * Reserve disk quota : blocks and inode. 429 * Reserve disk quota : blocks and inode.
428 */ 430 */
429 error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, resblks, 1, 0); 431 error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
432 pdqp, resblks, 1, 0);
430 if (error) 433 if (error)
431 goto error_return; 434 goto error_return;
432 435
@@ -464,7 +467,7 @@ xfs_symlink(
464 /* 467 /*
465 * Also attach the dquot(s) to it, if applicable. 468 * Also attach the dquot(s) to it, if applicable.
466 */ 469 */
467 xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp); 470 xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
468 471
469 if (resblks) 472 if (resblks)
470 resblks -= XFS_IALLOC_SPACE_RES(mp); 473 resblks -= XFS_IALLOC_SPACE_RES(mp);
@@ -562,6 +565,7 @@ xfs_symlink(
562 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 565 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
563 xfs_qm_dqrele(udqp); 566 xfs_qm_dqrele(udqp);
564 xfs_qm_dqrele(gdqp); 567 xfs_qm_dqrele(gdqp);
568 xfs_qm_dqrele(pdqp);
565 569
566 *ipp = ip; 570 *ipp = ip;
567 return 0; 571 return 0;
@@ -575,6 +579,7 @@ xfs_symlink(
575 xfs_trans_cancel(tp, cancel_flags); 579 xfs_trans_cancel(tp, cancel_flags);
576 xfs_qm_dqrele(udqp); 580 xfs_qm_dqrele(udqp);
577 xfs_qm_dqrele(gdqp); 581 xfs_qm_dqrele(gdqp);
582 xfs_qm_dqrele(pdqp);
578 583
579 if (unlock_dp_on_error) 584 if (unlock_dp_on_error)
580 xfs_iunlock(dp, XFS_ILOCK_EXCL); 585 xfs_iunlock(dp, XFS_ILOCK_EXCL);
@@ -585,7 +590,7 @@ xfs_symlink(
585/* 590/*
586 * Free a symlink that has blocks associated with it. 591 * Free a symlink that has blocks associated with it.
587 */ 592 */
588int 593STATIC int
589xfs_inactive_symlink_rmt( 594xfs_inactive_symlink_rmt(
590 xfs_inode_t *ip, 595 xfs_inode_t *ip,
591 xfs_trans_t **tpp) 596 xfs_trans_t **tpp)
@@ -606,7 +611,7 @@ xfs_inactive_symlink_rmt(
606 611
607 tp = *tpp; 612 tp = *tpp;
608 mp = ip->i_mount; 613 mp = ip->i_mount;
609 ASSERT(ip->i_d.di_size > XFS_IFORK_DSIZE(ip)); 614 ASSERT(ip->i_df.if_flags & XFS_IFEXTENTS);
610 /* 615 /*
611 * We're freeing a symlink that has some 616 * We're freeing a symlink that has some
612 * blocks allocated to it. Free the 617 * blocks allocated to it. Free the
@@ -720,3 +725,47 @@ xfs_inactive_symlink_rmt(
720 error0: 725 error0:
721 return error; 726 return error;
722} 727}
728
729/*
730 * xfs_inactive_symlink - free a symlink
731 */
732int
733xfs_inactive_symlink(
734 struct xfs_inode *ip,
735 struct xfs_trans **tp)
736{
737 struct xfs_mount *mp = ip->i_mount;
738 int pathlen;
739
740 trace_xfs_inactive_symlink(ip);
741
742 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
743
744 if (XFS_FORCED_SHUTDOWN(mp))
745 return XFS_ERROR(EIO);
746
747 /*
748 * Zero length symlinks _can_ exist.
749 */
750 pathlen = (int)ip->i_d.di_size;
751 if (!pathlen)
752 return 0;
753
754 if (pathlen < 0 || pathlen > MAXPATHLEN) {
755 xfs_alert(mp, "%s: inode (0x%llx) bad symlink length (%d)",
756 __func__, (unsigned long long)ip->i_ino, pathlen);
757 ASSERT(0);
758 return XFS_ERROR(EFSCORRUPTED);
759 }
760
761 if (ip->i_df.if_flags & XFS_IFINLINE) {
762 if (ip->i_df.if_bytes > 0)
763 xfs_idata_realloc(ip, -(ip->i_df.if_bytes),
764 XFS_DATA_FORK);
765 ASSERT(ip->i_df.if_bytes == 0);
766 return 0;
767 }
768
769 /* remove the remote symlink */
770 return xfs_inactive_symlink_rmt(ip, tp);
771}
diff --git a/fs/xfs/xfs_symlink.h b/fs/xfs/xfs_symlink.h
index b39398d2097c..374394880c01 100644
--- a/fs/xfs/xfs_symlink.h
+++ b/fs/xfs/xfs_symlink.h
@@ -60,7 +60,7 @@ extern const struct xfs_buf_ops xfs_symlink_buf_ops;
60int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name, 60int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
61 const char *target_path, umode_t mode, struct xfs_inode **ipp); 61 const char *target_path, umode_t mode, struct xfs_inode **ipp);
62int xfs_readlink(struct xfs_inode *ip, char *link); 62int xfs_readlink(struct xfs_inode *ip, char *link);
63int xfs_inactive_symlink_rmt(struct xfs_inode *ip, struct xfs_trans **tpp); 63int xfs_inactive_symlink(struct xfs_inode *ip, struct xfs_trans **tpp);
64 64
65#endif /* __KERNEL__ */ 65#endif /* __KERNEL__ */
66#endif /* __XFS_SYMLINK_H */ 66#endif /* __XFS_SYMLINK_H */
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index 2801b5ce6cdb..1743b9f8e23d 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -25,11 +25,11 @@ static struct ctl_table_header *xfs_table_header;
25#ifdef CONFIG_PROC_FS 25#ifdef CONFIG_PROC_FS
26STATIC int 26STATIC int
27xfs_stats_clear_proc_handler( 27xfs_stats_clear_proc_handler(
28 ctl_table *ctl, 28 struct ctl_table *ctl,
29 int write, 29 int write,
30 void __user *buffer, 30 void __user *buffer,
31 size_t *lenp, 31 size_t *lenp,
32 loff_t *ppos) 32 loff_t *ppos)
33{ 33{
34 int c, ret, *valp = ctl->data; 34 int c, ret, *valp = ctl->data;
35 __uint32_t vn_active; 35 __uint32_t vn_active;
@@ -55,11 +55,11 @@ xfs_stats_clear_proc_handler(
55 55
56STATIC int 56STATIC int
57xfs_panic_mask_proc_handler( 57xfs_panic_mask_proc_handler(
58 ctl_table *ctl, 58 struct ctl_table *ctl,
59 int write, 59 int write,
60 void __user *buffer, 60 void __user *buffer,
61 size_t *lenp, 61 size_t *lenp,
62 loff_t *ppos) 62 loff_t *ppos)
63{ 63{
64 int ret, *valp = ctl->data; 64 int ret, *valp = ctl->data;
65 65
@@ -74,7 +74,7 @@ xfs_panic_mask_proc_handler(
74} 74}
75#endif /* CONFIG_PROC_FS */ 75#endif /* CONFIG_PROC_FS */
76 76
77static ctl_table xfs_table[] = { 77static struct ctl_table xfs_table[] = {
78 { 78 {
79 .procname = "irix_sgid_inherit", 79 .procname = "irix_sgid_inherit",
80 .data = &xfs_params.sgid_inherit.val, 80 .data = &xfs_params.sgid_inherit.val,
@@ -227,7 +227,7 @@ static ctl_table xfs_table[] = {
227 {} 227 {}
228}; 228};
229 229
230static ctl_table xfs_dir_table[] = { 230static struct ctl_table xfs_dir_table[] = {
231 { 231 {
232 .procname = "xfs", 232 .procname = "xfs",
233 .mode = 0555, 233 .mode = 0555,
@@ -236,7 +236,7 @@ static ctl_table xfs_dir_table[] = {
236 {} 236 {}
237}; 237};
238 238
239static ctl_table xfs_root_table[] = { 239static struct ctl_table xfs_root_table[] = {
240 { 240 {
241 .procname = "fs", 241 .procname = "fs",
242 .mode = 0555, 242 .mode = 0555,
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index a04701de6bbd..47910e638c18 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -486,9 +486,12 @@ DEFINE_EVENT(xfs_buf_item_class, name, \
486 TP_PROTO(struct xfs_buf_log_item *bip), \ 486 TP_PROTO(struct xfs_buf_log_item *bip), \
487 TP_ARGS(bip)) 487 TP_ARGS(bip))
488DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size); 488DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size);
489DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_ordered);
489DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale); 490DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale);
490DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format); 491DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format);
492DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_ordered);
491DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale); 493DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale);
494DEFINE_BUF_ITEM_EVENT(xfs_buf_item_ordered);
492DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin); 495DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin);
493DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin); 496DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin);
494DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale); 497DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale);
@@ -508,6 +511,7 @@ DEFINE_BUF_ITEM_EVENT(xfs_trans_bjoin);
508DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold); 511DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold);
509DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release); 512DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release);
510DEFINE_BUF_ITEM_EVENT(xfs_trans_binval); 513DEFINE_BUF_ITEM_EVENT(xfs_trans_binval);
514DEFINE_BUF_ITEM_EVENT(xfs_trans_buf_ordered);
511 515
512DECLARE_EVENT_CLASS(xfs_lock_class, 516DECLARE_EVENT_CLASS(xfs_lock_class,
513 TP_PROTO(struct xfs_inode *ip, unsigned lock_flags, 517 TP_PROTO(struct xfs_inode *ip, unsigned lock_flags,
@@ -571,6 +575,7 @@ DEFINE_INODE_EVENT(xfs_iget_miss);
571DEFINE_INODE_EVENT(xfs_getattr); 575DEFINE_INODE_EVENT(xfs_getattr);
572DEFINE_INODE_EVENT(xfs_setattr); 576DEFINE_INODE_EVENT(xfs_setattr);
573DEFINE_INODE_EVENT(xfs_readlink); 577DEFINE_INODE_EVENT(xfs_readlink);
578DEFINE_INODE_EVENT(xfs_inactive_symlink);
574DEFINE_INODE_EVENT(xfs_alloc_file_space); 579DEFINE_INODE_EVENT(xfs_alloc_file_space);
575DEFINE_INODE_EVENT(xfs_free_file_space); 580DEFINE_INODE_EVENT(xfs_free_file_space);
576DEFINE_INODE_EVENT(xfs_readdir); 581DEFINE_INODE_EVENT(xfs_readdir);
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 2fd7c1ff1d21..35a229981354 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -234,71 +234,93 @@ xfs_calc_remove_reservation(
234} 234}
235 235
236/* 236/*
237 * For symlink we can modify: 237 * For create, break it in to the two cases that the transaction
238 * covers. We start with the modify case - allocation done by modification
239 * of the state of existing inodes - and the allocation case.
240 */
241
242/*
243 * For create we can modify:
238 * the parent directory inode: inode size 244 * the parent directory inode: inode size
239 * the new inode: inode size 245 * the new inode: inode size
240 * the inode btree entry: 1 block 246 * the inode btree entry: block size
247 * the superblock for the nlink flag: sector size
241 * the directory btree: (max depth + v2) * dir block size 248 * the directory btree: (max depth + v2) * dir block size
242 * the directory inode's bmap btree: (max depth + v2) * block size 249 * the directory inode's bmap btree: (max depth + v2) * block size
243 * the blocks for the symlink: 1 kB 250 */
244 * Or in the first xact we allocate some inodes giving: 251STATIC uint
252xfs_calc_create_resv_modify(
253 struct xfs_mount *mp)
254{
255 return xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) +
256 xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
257 (uint)XFS_FSB_TO_B(mp, 1) +
258 xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1));
259}
260
261/*
262 * For create we can allocate some inodes giving:
245 * the agi and agf of the ag getting the new inodes: 2 * sectorsize 263 * the agi and agf of the ag getting the new inodes: 2 * sectorsize
264 * the superblock for the nlink flag: sector size
246 * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize 265 * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
247 * the inode btree: max depth * blocksize 266 * the inode btree: max depth * blocksize
248 * the allocation btrees: 2 trees * (2 * max depth - 1) * block size 267 * the allocation btrees: 2 trees * (max depth - 1) * block size
249 */ 268 */
250STATIC uint 269STATIC uint
251xfs_calc_symlink_reservation( 270xfs_calc_create_resv_alloc(
271 struct xfs_mount *mp)
272{
273 return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
274 mp->m_sb.sb_sectsize +
275 xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp), XFS_FSB_TO_B(mp, 1)) +
276 xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
277 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
278 XFS_FSB_TO_B(mp, 1));
279}
280
281STATIC uint
282__xfs_calc_create_reservation(
252 struct xfs_mount *mp) 283 struct xfs_mount *mp)
253{ 284{
254 return XFS_DQUOT_LOGRES(mp) + 285 return XFS_DQUOT_LOGRES(mp) +
255 MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) + 286 MAX(xfs_calc_create_resv_alloc(mp),
256 xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) + 287 xfs_calc_create_resv_modify(mp));
257 xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
258 XFS_FSB_TO_B(mp, 1)) +
259 xfs_calc_buf_res(1, 1024)),
260 (xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
261 xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp),
262 XFS_FSB_TO_B(mp, 1)) +
263 xfs_calc_buf_res(mp->m_in_maxlevels,
264 XFS_FSB_TO_B(mp, 1)) +
265 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
266 XFS_FSB_TO_B(mp, 1))));
267} 288}
268 289
269/* 290/*
270 * For create we can modify: 291 * For icreate we can allocate some inodes giving:
271 * the parent directory inode: inode size
272 * the new inode: inode size
273 * the inode btree entry: block size
274 * the superblock for the nlink flag: sector size
275 * the directory btree: (max depth + v2) * dir block size
276 * the directory inode's bmap btree: (max depth + v2) * block size
277 * Or in the first xact we allocate some inodes giving:
278 * the agi and agf of the ag getting the new inodes: 2 * sectorsize 292 * the agi and agf of the ag getting the new inodes: 2 * sectorsize
279 * the superblock for the nlink flag: sector size 293 * the superblock for the nlink flag: sector size
280 * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
281 * the inode btree: max depth * blocksize 294 * the inode btree: max depth * blocksize
282 * the allocation btrees: 2 trees * (max depth - 1) * block size 295 * the allocation btrees: 2 trees * (max depth - 1) * block size
283 */ 296 */
284STATIC uint 297STATIC uint
285xfs_calc_create_reservation( 298xfs_calc_icreate_resv_alloc(
286 struct xfs_mount *mp) 299 struct xfs_mount *mp)
287{ 300{
301 return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
302 mp->m_sb.sb_sectsize +
303 xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
304 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
305 XFS_FSB_TO_B(mp, 1));
306}
307
308STATIC uint
309xfs_calc_icreate_reservation(xfs_mount_t *mp)
310{
288 return XFS_DQUOT_LOGRES(mp) + 311 return XFS_DQUOT_LOGRES(mp) +
289 MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) + 312 MAX(xfs_calc_icreate_resv_alloc(mp),
290 xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + 313 xfs_calc_create_resv_modify(mp));
291 (uint)XFS_FSB_TO_B(mp, 1) + 314}
292 xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), 315
293 XFS_FSB_TO_B(mp, 1))), 316STATIC uint
294 (xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + 317xfs_calc_create_reservation(
295 mp->m_sb.sb_sectsize + 318 struct xfs_mount *mp)
296 xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp), 319{
297 XFS_FSB_TO_B(mp, 1)) + 320 if (xfs_sb_version_hascrc(&mp->m_sb))
298 xfs_calc_buf_res(mp->m_in_maxlevels, 321 return xfs_calc_icreate_reservation(mp);
299 XFS_FSB_TO_B(mp, 1)) + 322 return __xfs_calc_create_reservation(mp);
300 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), 323
301 XFS_FSB_TO_B(mp, 1))));
302} 324}
303 325
304/* 326/*
@@ -311,6 +333,20 @@ xfs_calc_mkdir_reservation(
311 return xfs_calc_create_reservation(mp); 333 return xfs_calc_create_reservation(mp);
312} 334}
313 335
336
337/*
338 * Making a new symplink is the same as creating a new file, but
339 * with the added blocks for remote symlink data which can be up to 1kB in
340 * length (MAXPATHLEN).
341 */
342STATIC uint
343xfs_calc_symlink_reservation(
344 struct xfs_mount *mp)
345{
346 return xfs_calc_create_reservation(mp) +
347 xfs_calc_buf_res(1, MAXPATHLEN);
348}
349
314/* 350/*
315 * In freeing an inode we can modify: 351 * In freeing an inode we can modify:
316 * the inode being freed: inode size 352 * the inode being freed: inode size
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index a44dba5b2cdb..2b4946393e30 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -48,6 +48,7 @@ typedef struct xfs_trans_header {
48#define XFS_LI_BUF 0x123c /* v2 bufs, variable sized inode bufs */ 48#define XFS_LI_BUF 0x123c /* v2 bufs, variable sized inode bufs */
49#define XFS_LI_DQUOT 0x123d 49#define XFS_LI_DQUOT 0x123d
50#define XFS_LI_QUOTAOFF 0x123e 50#define XFS_LI_QUOTAOFF 0x123e
51#define XFS_LI_ICREATE 0x123f
51 52
52#define XFS_LI_TYPE_DESC \ 53#define XFS_LI_TYPE_DESC \
53 { XFS_LI_EFI, "XFS_LI_EFI" }, \ 54 { XFS_LI_EFI, "XFS_LI_EFI" }, \
@@ -107,7 +108,8 @@ typedef struct xfs_trans_header {
107#define XFS_TRANS_SWAPEXT 40 108#define XFS_TRANS_SWAPEXT 40
108#define XFS_TRANS_SB_COUNT 41 109#define XFS_TRANS_SB_COUNT 41
109#define XFS_TRANS_CHECKPOINT 42 110#define XFS_TRANS_CHECKPOINT 42
110#define XFS_TRANS_TYPE_MAX 42 111#define XFS_TRANS_ICREATE 43
112#define XFS_TRANS_TYPE_MAX 43
111/* new transaction types need to be reflected in xfs_logprint(8) */ 113/* new transaction types need to be reflected in xfs_logprint(8) */
112 114
113#define XFS_TRANS_TYPES \ 115#define XFS_TRANS_TYPES \
@@ -210,23 +212,18 @@ struct xfs_log_item_desc {
210/* 212/*
211 * Per-extent log reservation for the allocation btree changes 213 * Per-extent log reservation for the allocation btree changes
212 * involved in freeing or allocating an extent. 214 * involved in freeing or allocating an extent.
213 * 2 trees * (2 blocks/level * max depth - 1) * block size 215 * 2 trees * (2 blocks/level * max depth - 1)
214 */ 216 */
215#define XFS_ALLOCFREE_LOG_RES(mp,nx) \
216 ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * XFS_AG_MAXLEVELS(mp) - 1)))
217#define XFS_ALLOCFREE_LOG_COUNT(mp,nx) \ 217#define XFS_ALLOCFREE_LOG_COUNT(mp,nx) \
218 ((nx) * (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1))) 218 ((nx) * (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1)))
219 219
220/* 220/*
221 * Per-directory log reservation for any directory change. 221 * Per-directory log reservation for any directory change.
222 * dir blocks: (1 btree block per level + data block + free block) * dblock size 222 * dir blocks: (1 btree block per level + data block + free block)
223 * bmap btree: (levels + 2) * max depth * block size 223 * bmap btree: (levels + 2) * max depth
224 * v2 directory blocks can be fragmented below the dirblksize down to the fsb 224 * v2 directory blocks can be fragmented below the dirblksize down to the fsb
225 * size, so account for that in the DAENTER macros. 225 * size, so account for that in the DAENTER macros.
226 */ 226 */
227#define XFS_DIROP_LOG_RES(mp) \
228 (XFS_FSB_TO_B(mp, XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK)) + \
229 (XFS_FSB_TO_B(mp, XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)))
230#define XFS_DIROP_LOG_COUNT(mp) \ 227#define XFS_DIROP_LOG_COUNT(mp) \
231 (XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \ 228 (XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \
232 XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1) 229 XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)
@@ -503,6 +500,7 @@ void xfs_trans_bhold_release(xfs_trans_t *, struct xfs_buf *);
503void xfs_trans_binval(xfs_trans_t *, struct xfs_buf *); 500void xfs_trans_binval(xfs_trans_t *, struct xfs_buf *);
504void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *); 501void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *);
505void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *); 502void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *);
503void xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *);
506void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint); 504void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
507void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); 505void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
508void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int); 506void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int);
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 73a5fa457e16..aa5a04b844d6 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -397,7 +397,6 @@ shutdown_abort:
397 return XFS_ERROR(EIO); 397 return XFS_ERROR(EIO);
398} 398}
399 399
400
401/* 400/*
402 * Release the buffer bp which was previously acquired with one of the 401 * Release the buffer bp which was previously acquired with one of the
403 * xfs_trans_... buffer allocation routines if the buffer has not 402 * xfs_trans_... buffer allocation routines if the buffer has not
@@ -603,8 +602,14 @@ xfs_trans_log_buf(xfs_trans_t *tp,
603 602
604 tp->t_flags |= XFS_TRANS_DIRTY; 603 tp->t_flags |= XFS_TRANS_DIRTY;
605 bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY; 604 bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
606 bip->bli_flags |= XFS_BLI_LOGGED; 605
607 xfs_buf_item_log(bip, first, last); 606 /*
607 * If we have an ordered buffer we are not logging any dirty range but
608 * it still needs to be marked dirty and that it has been logged.
609 */
610 bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED;
611 if (!(bip->bli_flags & XFS_BLI_ORDERED))
612 xfs_buf_item_log(bip, first, last);
608} 613}
609 614
610 615
@@ -757,6 +762,29 @@ xfs_trans_inode_alloc_buf(
757} 762}
758 763
759/* 764/*
765 * Mark the buffer as ordered for this transaction. This means
766 * that the contents of the buffer are not recorded in the transaction
767 * but it is tracked in the AIL as though it was. This allows us
768 * to record logical changes in transactions rather than the physical
769 * changes we make to the buffer without changing writeback ordering
770 * constraints of metadata buffers.
771 */
772void
773xfs_trans_ordered_buf(
774 struct xfs_trans *tp,
775 struct xfs_buf *bp)
776{
777 struct xfs_buf_log_item *bip = bp->b_fspriv;
778
779 ASSERT(bp->b_transp == tp);
780 ASSERT(bip != NULL);
781 ASSERT(atomic_read(&bip->bli_refcount) > 0);
782
783 bip->bli_flags |= XFS_BLI_ORDERED;
784 trace_xfs_buf_item_ordered(bip);
785}
786
787/*
760 * Set the type of the buffer for log recovery so that it can correctly identify 788 * Set the type of the buffer for log recovery so that it can correctly identify
761 * and hence attach the correct buffer ops to the buffer after replay. 789 * and hence attach the correct buffer ops to the buffer after replay.
762 */ 790 */
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index fec75d023703..61407a847b86 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -103,8 +103,6 @@ xfs_trans_dup_dqinfo(
103 return; 103 return;
104 104
105 xfs_trans_alloc_dqinfo(ntp); 105 xfs_trans_alloc_dqinfo(ntp);
106 oqa = otp->t_dqinfo->dqa_usrdquots;
107 nqa = ntp->t_dqinfo->dqa_usrdquots;
108 106
109 /* 107 /*
110 * Because the quota blk reservation is carried forward, 108 * Because the quota blk reservation is carried forward,
@@ -113,7 +111,9 @@ xfs_trans_dup_dqinfo(
113 if(otp->t_flags & XFS_TRANS_DQ_DIRTY) 111 if(otp->t_flags & XFS_TRANS_DQ_DIRTY)
114 ntp->t_flags |= XFS_TRANS_DQ_DIRTY; 112 ntp->t_flags |= XFS_TRANS_DQ_DIRTY;
115 113
116 for (j = 0; j < 2; j++) { 114 for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) {
115 oqa = otp->t_dqinfo->dqs[j];
116 nqa = ntp->t_dqinfo->dqs[j];
117 for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { 117 for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
118 if (oqa[i].qt_dquot == NULL) 118 if (oqa[i].qt_dquot == NULL)
119 break; 119 break;
@@ -138,8 +138,6 @@ xfs_trans_dup_dqinfo(
138 oq->qt_ino_res = oq->qt_ino_res_used; 138 oq->qt_ino_res = oq->qt_ino_res_used;
139 139
140 } 140 }
141 oqa = otp->t_dqinfo->dqa_grpdquots;
142 nqa = ntp->t_dqinfo->dqa_grpdquots;
143 } 141 }
144} 142}
145 143
@@ -157,8 +155,7 @@ xfs_trans_mod_dquot_byino(
157 155
158 if (!XFS_IS_QUOTA_RUNNING(mp) || 156 if (!XFS_IS_QUOTA_RUNNING(mp) ||
159 !XFS_IS_QUOTA_ON(mp) || 157 !XFS_IS_QUOTA_ON(mp) ||
160 ip->i_ino == mp->m_sb.sb_uquotino || 158 xfs_is_quota_inode(&mp->m_sb, ip->i_ino))
161 ip->i_ino == mp->m_sb.sb_gquotino)
162 return; 159 return;
163 160
164 if (tp->t_dqinfo == NULL) 161 if (tp->t_dqinfo == NULL)
@@ -166,20 +163,28 @@ xfs_trans_mod_dquot_byino(
166 163
167 if (XFS_IS_UQUOTA_ON(mp) && ip->i_udquot) 164 if (XFS_IS_UQUOTA_ON(mp) && ip->i_udquot)
168 (void) xfs_trans_mod_dquot(tp, ip->i_udquot, field, delta); 165 (void) xfs_trans_mod_dquot(tp, ip->i_udquot, field, delta);
169 if (XFS_IS_OQUOTA_ON(mp) && ip->i_gdquot) 166 if (XFS_IS_GQUOTA_ON(mp) && ip->i_gdquot)
170 (void) xfs_trans_mod_dquot(tp, ip->i_gdquot, field, delta); 167 (void) xfs_trans_mod_dquot(tp, ip->i_gdquot, field, delta);
168 if (XFS_IS_PQUOTA_ON(mp) && ip->i_pdquot)
169 (void) xfs_trans_mod_dquot(tp, ip->i_pdquot, field, delta);
171} 170}
172 171
173STATIC xfs_dqtrx_t * 172STATIC struct xfs_dqtrx *
174xfs_trans_get_dqtrx( 173xfs_trans_get_dqtrx(
175 xfs_trans_t *tp, 174 struct xfs_trans *tp,
176 xfs_dquot_t *dqp) 175 struct xfs_dquot *dqp)
177{ 176{
178 int i; 177 int i;
179 xfs_dqtrx_t *qa; 178 struct xfs_dqtrx *qa;
180 179
181 qa = XFS_QM_ISUDQ(dqp) ? 180 if (XFS_QM_ISUDQ(dqp))
182 tp->t_dqinfo->dqa_usrdquots : tp->t_dqinfo->dqa_grpdquots; 181 qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_USR];
182 else if (XFS_QM_ISGDQ(dqp))
183 qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_GRP];
184 else if (XFS_QM_ISPDQ(dqp))
185 qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_PRJ];
186 else
187 return NULL;
183 188
184 for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { 189 for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
185 if (qa[i].qt_dquot == NULL || 190 if (qa[i].qt_dquot == NULL ||
@@ -292,11 +297,10 @@ xfs_trans_mod_dquot(
292 297
293 298
294/* 299/*
295 * Given an array of dqtrx structures, lock all the dquots associated 300 * Given an array of dqtrx structures, lock all the dquots associated and join
296 * and join them to the transaction, provided they have been modified. 301 * them to the transaction, provided they have been modified. We know that the
297 * We know that the highest number of dquots (of one type - usr OR grp), 302 * highest number of dquots of one type - usr, grp OR prj - involved in a
298 * involved in a transaction is 2 and that both usr and grp combined - 3. 303 * transaction is 2 so we don't need to make this very generic.
299 * So, we don't attempt to make this very generic.
300 */ 304 */
301STATIC void 305STATIC void
302xfs_trans_dqlockedjoin( 306xfs_trans_dqlockedjoin(
@@ -339,12 +343,10 @@ xfs_trans_apply_dquot_deltas(
339 return; 343 return;
340 344
341 ASSERT(tp->t_dqinfo); 345 ASSERT(tp->t_dqinfo);
342 qa = tp->t_dqinfo->dqa_usrdquots; 346 for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) {
343 for (j = 0; j < 2; j++) { 347 qa = tp->t_dqinfo->dqs[j];
344 if (qa[0].qt_dquot == NULL) { 348 if (qa[0].qt_dquot == NULL)
345 qa = tp->t_dqinfo->dqa_grpdquots;
346 continue; 349 continue;
347 }
348 350
349 /* 351 /*
350 * Lock all of the dquots and join them to the transaction. 352 * Lock all of the dquots and join them to the transaction.
@@ -495,10 +497,6 @@ xfs_trans_apply_dquot_deltas(
495 ASSERT(dqp->q_res_rtbcount >= 497 ASSERT(dqp->q_res_rtbcount >=
496 be64_to_cpu(dqp->q_core.d_rtbcount)); 498 be64_to_cpu(dqp->q_core.d_rtbcount));
497 } 499 }
498 /*
499 * Do the group quotas next
500 */
501 qa = tp->t_dqinfo->dqa_grpdquots;
502 } 500 }
503} 501}
504 502
@@ -521,9 +519,9 @@ xfs_trans_unreserve_and_mod_dquots(
521 if (!tp->t_dqinfo || !(tp->t_flags & XFS_TRANS_DQ_DIRTY)) 519 if (!tp->t_dqinfo || !(tp->t_flags & XFS_TRANS_DQ_DIRTY))
522 return; 520 return;
523 521
524 qa = tp->t_dqinfo->dqa_usrdquots; 522 for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) {
523 qa = tp->t_dqinfo->dqs[j];
525 524
526 for (j = 0; j < 2; j++) {
527 for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { 525 for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
528 qtrx = &qa[i]; 526 qtrx = &qa[i];
529 /* 527 /*
@@ -565,7 +563,6 @@ xfs_trans_unreserve_and_mod_dquots(
565 xfs_dqunlock(dqp); 563 xfs_dqunlock(dqp);
566 564
567 } 565 }
568 qa = tp->t_dqinfo->dqa_grpdquots;
569 } 566 }
570} 567}
571 568
@@ -640,8 +637,8 @@ xfs_trans_dqresv(
640 if ((flags & XFS_QMOPT_FORCE_RES) == 0 && 637 if ((flags & XFS_QMOPT_FORCE_RES) == 0 &&
641 dqp->q_core.d_id && 638 dqp->q_core.d_id &&
642 ((XFS_IS_UQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISUDQ(dqp)) || 639 ((XFS_IS_UQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISUDQ(dqp)) ||
643 (XFS_IS_OQUOTA_ENFORCED(dqp->q_mount) && 640 (XFS_IS_GQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISGDQ(dqp)) ||
644 (XFS_QM_ISPDQ(dqp) || XFS_QM_ISGDQ(dqp))))) { 641 (XFS_IS_PQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISPDQ(dqp)))) {
645 if (nblks > 0) { 642 if (nblks > 0) {
646 /* 643 /*
647 * dquot is locked already. See if we'd go over the 644 * dquot is locked already. See if we'd go over the
@@ -736,8 +733,8 @@ error_return:
736 733
737/* 734/*
738 * Given dquot(s), make disk block and/or inode reservations against them. 735 * Given dquot(s), make disk block and/or inode reservations against them.
739 * The fact that this does the reservation against both the usr and 736 * The fact that this does the reservation against user, group and
740 * grp/prj quotas is important, because this follows a both-or-nothing 737 * project quotas is important, because this follows a all-or-nothing
741 * approach. 738 * approach.
742 * 739 *
743 * flags = XFS_QMOPT_FORCE_RES evades limit enforcement. Used by chown. 740 * flags = XFS_QMOPT_FORCE_RES evades limit enforcement. Used by chown.
@@ -748,15 +745,16 @@ error_return:
748 */ 745 */
749int 746int
750xfs_trans_reserve_quota_bydquots( 747xfs_trans_reserve_quota_bydquots(
751 xfs_trans_t *tp, 748 struct xfs_trans *tp,
752 xfs_mount_t *mp, 749 struct xfs_mount *mp,
753 xfs_dquot_t *udqp, 750 struct xfs_dquot *udqp,
754 xfs_dquot_t *gdqp, 751 struct xfs_dquot *gdqp,
755 long nblks, 752 struct xfs_dquot *pdqp,
756 long ninos, 753 long nblks,
757 uint flags) 754 long ninos,
755 uint flags)
758{ 756{
759 int resvd = 0, error; 757 int error;
760 758
761 if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) 759 if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
762 return 0; 760 return 0;
@@ -771,28 +769,34 @@ xfs_trans_reserve_quota_bydquots(
771 (flags & ~XFS_QMOPT_ENOSPC)); 769 (flags & ~XFS_QMOPT_ENOSPC));
772 if (error) 770 if (error)
773 return error; 771 return error;
774 resvd = 1;
775 } 772 }
776 773
777 if (gdqp) { 774 if (gdqp) {
778 error = xfs_trans_dqresv(tp, mp, gdqp, nblks, ninos, flags); 775 error = xfs_trans_dqresv(tp, mp, gdqp, nblks, ninos, flags);
779 if (error) { 776 if (error)
780 /* 777 goto unwind_usr;
781 * can't do it, so backout previous reservation 778 }
782 */ 779
783 if (resvd) { 780 if (pdqp) {
784 flags |= XFS_QMOPT_FORCE_RES; 781 error = xfs_trans_dqresv(tp, mp, pdqp, nblks, ninos, flags);
785 xfs_trans_dqresv(tp, mp, udqp, 782 if (error)
786 -nblks, -ninos, flags); 783 goto unwind_grp;
787 }
788 return error;
789 }
790 } 784 }
791 785
792 /* 786 /*
793 * Didn't change anything critical, so, no need to log 787 * Didn't change anything critical, so, no need to log
794 */ 788 */
795 return 0; 789 return 0;
790
791unwind_grp:
792 flags |= XFS_QMOPT_FORCE_RES;
793 if (gdqp)
794 xfs_trans_dqresv(tp, mp, gdqp, -nblks, -ninos, flags);
795unwind_usr:
796 flags |= XFS_QMOPT_FORCE_RES;
797 if (udqp)
798 xfs_trans_dqresv(tp, mp, udqp, -nblks, -ninos, flags);
799 return error;
796} 800}
797 801
798 802
@@ -816,8 +820,7 @@ xfs_trans_reserve_quota_nblks(
816 if (XFS_IS_PQUOTA_ON(mp)) 820 if (XFS_IS_PQUOTA_ON(mp))
817 flags |= XFS_QMOPT_ENOSPC; 821 flags |= XFS_QMOPT_ENOSPC;
818 822
819 ASSERT(ip->i_ino != mp->m_sb.sb_uquotino); 823 ASSERT(!xfs_is_quota_inode(&mp->m_sb, ip->i_ino));
820 ASSERT(ip->i_ino != mp->m_sb.sb_gquotino);
821 824
822 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 825 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
823 ASSERT((flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) == 826 ASSERT((flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) ==
@@ -830,6 +833,7 @@ xfs_trans_reserve_quota_nblks(
830 */ 833 */
831 return xfs_trans_reserve_quota_bydquots(tp, mp, 834 return xfs_trans_reserve_quota_bydquots(tp, mp,
832 ip->i_udquot, ip->i_gdquot, 835 ip->i_udquot, ip->i_gdquot,
836 ip->i_pdquot,
833 nblks, ninos, flags); 837 nblks, ninos, flags);
834} 838}
835 839
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index ac6d567704db..53dfe46f3680 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -112,6 +112,17 @@ xfs_trans_log_inode(
112 ASSERT(ip->i_itemp != NULL); 112 ASSERT(ip->i_itemp != NULL);
113 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 113 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
114 114
115 /*
116 * First time we log the inode in a transaction, bump the inode change
117 * counter if it is configured for this to occur.
118 */
119 if (!(ip->i_itemp->ili_item.li_desc->lid_flags & XFS_LID_DIRTY) &&
120 IS_I_VERSION(VFS_I(ip))) {
121 inode_inc_iversion(VFS_I(ip));
122 ip->i_d.di_changecount = VFS_I(ip)->i_version;
123 flags |= XFS_ILOG_CORE;
124 }
125
115 tp->t_flags |= XFS_TRANS_DIRTY; 126 tp->t_flags |= XFS_TRANS_DIRTY;
116 ip->i_itemp->ili_item.li_desc->lid_flags |= XFS_LID_DIRTY; 127 ip->i_itemp->ili_item.li_desc->lid_flags |= XFS_LID_DIRTY;
117 128
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 0176bb21f09a..dc730ac272be 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -322,18 +322,9 @@ xfs_inactive(
322 xfs_trans_ijoin(tp, ip, 0); 322 xfs_trans_ijoin(tp, ip, 0);
323 323
324 if (S_ISLNK(ip->i_d.di_mode)) { 324 if (S_ISLNK(ip->i_d.di_mode)) {
325 /* 325 error = xfs_inactive_symlink(ip, &tp);
326 * Zero length symlinks _can_ exist. 326 if (error)
327 */ 327 goto out_cancel;
328 if (ip->i_d.di_size > XFS_IFORK_DSIZE(ip)) {
329 error = xfs_inactive_symlink_rmt(ip, &tp);
330 if (error)
331 goto out_cancel;
332 } else if (ip->i_df.if_bytes > 0) {
333 xfs_idata_realloc(ip, -(ip->i_df.if_bytes),
334 XFS_DATA_FORK);
335 ASSERT(ip->i_df.if_bytes == 0);
336 }
337 } else if (truncate) { 328 } else if (truncate) {
338 ip->i_d.di_size = 0; 329 ip->i_d.di_size = 0;
339 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 330 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
@@ -498,6 +489,7 @@ xfs_create(
498 prid_t prid; 489 prid_t prid;
499 struct xfs_dquot *udqp = NULL; 490 struct xfs_dquot *udqp = NULL;
500 struct xfs_dquot *gdqp = NULL; 491 struct xfs_dquot *gdqp = NULL;
492 struct xfs_dquot *pdqp = NULL;
501 uint resblks; 493 uint resblks;
502 uint log_res; 494 uint log_res;
503 uint log_count; 495 uint log_count;
@@ -516,7 +508,8 @@ xfs_create(
516 * Make sure that we have allocated dquot(s) on disk. 508 * Make sure that we have allocated dquot(s) on disk.
517 */ 509 */
518 error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid, 510 error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
519 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp); 511 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
512 &udqp, &gdqp, &pdqp);
520 if (error) 513 if (error)
521 return error; 514 return error;
522 515
@@ -568,7 +561,8 @@ xfs_create(
568 /* 561 /*
569 * Reserve disk quota and the inode. 562 * Reserve disk quota and the inode.
570 */ 563 */
571 error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, resblks, 1, 0); 564 error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
565 pdqp, resblks, 1, 0);
572 if (error) 566 if (error)
573 goto out_trans_cancel; 567 goto out_trans_cancel;
574 568
@@ -632,7 +626,7 @@ xfs_create(
632 * These ids of the inode couldn't have changed since the new 626 * These ids of the inode couldn't have changed since the new
633 * inode has been locked ever since it was created. 627 * inode has been locked ever since it was created.
634 */ 628 */
635 xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp); 629 xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
636 630
637 error = xfs_bmap_finish(&tp, &free_list, &committed); 631 error = xfs_bmap_finish(&tp, &free_list, &committed);
638 if (error) 632 if (error)
@@ -644,6 +638,7 @@ xfs_create(
644 638
645 xfs_qm_dqrele(udqp); 639 xfs_qm_dqrele(udqp);
646 xfs_qm_dqrele(gdqp); 640 xfs_qm_dqrele(gdqp);
641 xfs_qm_dqrele(pdqp);
647 642
648 *ipp = ip; 643 *ipp = ip;
649 return 0; 644 return 0;
@@ -665,6 +660,7 @@ xfs_create(
665 660
666 xfs_qm_dqrele(udqp); 661 xfs_qm_dqrele(udqp);
667 xfs_qm_dqrele(gdqp); 662 xfs_qm_dqrele(gdqp);
663 xfs_qm_dqrele(pdqp);
668 664
669 if (unlock_dp_on_error) 665 if (unlock_dp_on_error)
670 xfs_iunlock(dp, XFS_ILOCK_EXCL); 666 xfs_iunlock(dp, XFS_ILOCK_EXCL);
@@ -1577,7 +1573,7 @@ xfs_free_file_space(
1577 } 1573 }
1578 xfs_ilock(ip, XFS_ILOCK_EXCL); 1574 xfs_ilock(ip, XFS_ILOCK_EXCL);
1579 error = xfs_trans_reserve_quota(tp, mp, 1575 error = xfs_trans_reserve_quota(tp, mp,
1580 ip->i_udquot, ip->i_gdquot, 1576 ip->i_udquot, ip->i_gdquot, ip->i_pdquot,
1581 resblks, 0, XFS_QMOPT_RES_REGBLKS); 1577 resblks, 0, XFS_QMOPT_RES_REGBLKS);
1582 if (error) 1578 if (error)
1583 goto error1; 1579 goto error1;