Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/vfs_super.c | 2
-rw-r--r--  fs/aio.c | 2
-rw-r--r--  fs/attr.c | 2
-rw-r--r--  fs/bad_inode.c | 2
-rw-r--r--  fs/binfmt_elf.c | 30
-rw-r--r--  fs/binfmt_flat.c | 2
-rw-r--r--  fs/binfmt_misc.c | 3
-rw-r--r--  fs/bio.c | 2
-rw-r--r--  fs/block_dev.c | 3
-rw-r--r--  fs/buffer.c | 2
-rw-r--r--  fs/ceph/inode.c | 11
-rw-r--r--  fs/ceph/mds_client.c | 7
-rw-r--r--  fs/ceph/snap.c | 2
-rw-r--r--  fs/ceph/super.c | 19
-rw-r--r--  fs/ceph/super.h | 4
-rw-r--r--  fs/ceph/xattr.c | 202
-rw-r--r--  fs/cifs/README | 6
-rw-r--r--  fs/cifs/cifs_debug.c | 3
-rw-r--r--  fs/cifs/cifsfs.c | 25
-rw-r--r--  fs/cifs/cifsglob.h | 47
-rw-r--r--  fs/cifs/cifsproto.h | 9
-rw-r--r--  fs/cifs/cifssmb.c | 18
-rw-r--r--  fs/cifs/connect.c | 44
-rw-r--r--  fs/cifs/dir.c | 6
-rw-r--r--  fs/cifs/file.c | 23
-rw-r--r--  fs/cifs/misc.c | 19
-rw-r--r--  fs/cifs/transport.c | 78
-rw-r--r--  fs/compat.c | 1
-rw-r--r--  fs/compat_ioctl.c | 1
-rw-r--r--  fs/dcache.c | 5
-rw-r--r--  fs/dcookies.c | 2
-rw-r--r--  fs/eventfd.c | 2
-rw-r--r--  fs/eventpoll.c | 45
-rw-r--r--  fs/ext3/balloc.c | 84
-rw-r--r--  fs/ext3/inode.c | 9
-rw-r--r--  fs/ext4/balloc.c | 63
-rw-r--r--  fs/ext4/dir.c | 13
-rw-r--r--  fs/ext4/ext4.h | 34
-rw-r--r--  fs/ext4/ext4_extents.h | 4
-rw-r--r--  fs/ext4/ext4_jbd2.h | 128
-rw-r--r--  fs/ext4/extents.c | 330
-rw-r--r--  fs/ext4/fsync.c | 2
-rw-r--r--  fs/ext4/ialloc.c | 260
-rw-r--r--  fs/ext4/inode.c | 95
-rw-r--r--  fs/ext4/mballoc.c | 342
-rw-r--r--  fs/ext4/mballoc.h | 20
-rw-r--r--  fs/ext4/migrate.c | 2
-rw-r--r--  fs/ext4/mmp.c | 4
-rw-r--r--  fs/ext4/namei.c | 2
-rw-r--r--  fs/ext4/page-io.c | 18
-rw-r--r--  fs/ext4/resize.c | 37
-rw-r--r--  fs/ext4/super.c | 1075
-rw-r--r--  fs/ext4/xattr.c | 25
-rw-r--r--  fs/fat/namei_vfat.c | 83
-rw-r--r--  fs/file.c | 2
-rw-r--r--  fs/fs-writeback.c | 24
-rw-r--r--  fs/fs_struct.c | 2
-rw-r--r--  fs/hostfs/hostfs.h | 3
-rw-r--r--  fs/hostfs/hostfs_kern.c | 5
-rw-r--r--  fs/hostfs/hostfs_user.c | 4
-rw-r--r--  fs/ioctl.c | 2
-rw-r--r--  fs/jbd2/checkpoint.c | 140
-rw-r--r--  fs/jbd2/commit.c | 47
-rw-r--r--  fs/jbd2/journal.c | 361
-rw-r--r--  fs/jbd2/recovery.c | 5
-rw-r--r--  fs/jbd2/revoke.c | 12
-rw-r--r--  fs/jbd2/transaction.c | 48
-rw-r--r--  fs/libfs.c | 2
-rw-r--r--  fs/lockd/clnt4xdr.c | 2
-rw-r--r--  fs/lockd/clntlock.c | 3
-rw-r--r--  fs/lockd/clntxdr.c | 8
-rw-r--r--  fs/lockd/host.c | 42
-rw-r--r--  fs/lockd/mon.c | 21
-rw-r--r--  fs/lockd/netns.h | 12
-rw-r--r--  fs/lockd/svc.c | 117
-rw-r--r--  fs/lockd/svclock.c | 59
-rw-r--r--  fs/mpage.c | 2
-rw-r--r--  fs/namei.c | 2
-rw-r--r--  fs/nfs/Kconfig | 29
-rw-r--r--  fs/nfs/blocklayout/blocklayout.c | 161
-rw-r--r--  fs/nfs/blocklayout/blocklayout.h | 11
-rw-r--r--  fs/nfs/blocklayout/blocklayoutdev.c | 46
-rw-r--r--  fs/nfs/blocklayout/blocklayoutdm.c | 33
-rw-r--r--  fs/nfs/blocklayout/extents.c | 2
-rw-r--r--  fs/nfs/cache_lib.c | 61
-rw-r--r--  fs/nfs/cache_lib.h | 10
-rw-r--r--  fs/nfs/callback.c | 19
-rw-r--r--  fs/nfs/callback.h | 3
-rw-r--r--  fs/nfs/callback_proc.c | 99
-rw-r--r--  fs/nfs/callback_xdr.c | 21
-rw-r--r--  fs/nfs/client.c | 246
-rw-r--r--  fs/nfs/delegation.c | 68
-rw-r--r--  fs/nfs/delegation.h | 4
-rw-r--r--  fs/nfs/dir.c | 27
-rw-r--r--  fs/nfs/direct.c | 6
-rw-r--r--  fs/nfs/dns_resolve.c | 130
-rw-r--r--  fs/nfs/dns_resolve.h | 14
-rw-r--r--  fs/nfs/file.c | 2
-rw-r--r--  fs/nfs/fscache.c | 2
-rw-r--r--  fs/nfs/idmap.c | 733
-rw-r--r--  fs/nfs/inode.c | 119
-rw-r--r--  fs/nfs/internal.h | 15
-rw-r--r--  fs/nfs/mount_clnt.c | 16
-rw-r--r--  fs/nfs/namespace.c | 5
-rw-r--r--  fs/nfs/netns.h | 27
-rw-r--r--  fs/nfs/nfs2xdr.c | 2
-rw-r--r--  fs/nfs/nfs3acl.c | 2
-rw-r--r--  fs/nfs/nfs3proc.c | 24
-rw-r--r--  fs/nfs/nfs3xdr.c | 4
-rw-r--r--  fs/nfs/nfs4_fs.h | 58
-rw-r--r--  fs/nfs/nfs4filelayout.c | 272
-rw-r--r--  fs/nfs/nfs4filelayout.h | 7
-rw-r--r--  fs/nfs/nfs4filelayoutdev.c | 90
-rw-r--r--  fs/nfs/nfs4namespace.c | 10
-rw-r--r--  fs/nfs/nfs4proc.c | 559
-rw-r--r--  fs/nfs/nfs4state.c | 355
-rw-r--r--  fs/nfs/nfs4xdr.c | 697
-rw-r--r--  fs/nfs/nfsroot.c | 2
-rw-r--r--  fs/nfs/objlayout/objio_osd.c | 54
-rw-r--r--  fs/nfs/objlayout/objlayout.c | 142
-rw-r--r--  fs/nfs/objlayout/objlayout.h | 2
-rw-r--r--  fs/nfs/pagelist.c | 92
-rw-r--r--  fs/nfs/pnfs.c | 46
-rw-r--r--  fs/nfs/pnfs.h | 98
-rw-r--r--  fs/nfs/pnfs_dev.c | 4
-rw-r--r--  fs/nfs/proc.c | 24
-rw-r--r--  fs/nfs/read.c | 14
-rw-r--r--  fs/nfs/super.c | 167
-rw-r--r--  fs/nfs/sysctl.c | 2
-rw-r--r--  fs/nfs/unlink.c | 45
-rw-r--r--  fs/nfs/write.c | 213
-rw-r--r--  fs/nfsd/nfs4callback.c | 8
-rw-r--r--  fs/nfsd/nfs4state.c | 2
-rw-r--r--  fs/nfsd/nfsctl.c | 6
-rw-r--r--  fs/nfsd/nfssvc.c | 4
-rw-r--r--  fs/nfsd/stats.c | 5
-rw-r--r--  fs/notify/notification.c | 3
-rw-r--r--  fs/pipe.c | 1
-rw-r--r--  fs/posix_acl.c | 2
-rw-r--r--  fs/proc/array.c | 119
-rw-r--r--  fs/proc/internal.h | 3
-rw-r--r--  fs/proc/kcore.c | 6
-rw-r--r--  fs/proc/namespaces.c | 2
-rw-r--r--  fs/proc/proc_sysctl.c | 1274
-rw-r--r--  fs/proc/stat.c | 62
-rw-r--r--  fs/pstore/platform.c | 30
-rw-r--r--  fs/quota/dquot.c | 189
-rw-r--r--  fs/quota/quota.c | 3
-rw-r--r--  fs/read_write.c | 2
-rw-r--r--  fs/readdir.c | 2
-rw-r--r--  fs/reiserfs/reiserfs.h | 1
-rw-r--r--  fs/select.c | 42
-rw-r--r--  fs/seq_file.c | 86
-rw-r--r--  fs/splice.c | 2
-rw-r--r--  fs/stack.c | 2
-rw-r--r--  fs/stat.c | 2
-rw-r--r--  fs/statfs.c | 2
-rw-r--r--  fs/super.c | 2
-rw-r--r--  fs/sync.c | 2
-rw-r--r--  fs/ubifs/debug.c | 410
-rw-r--r--  fs/ubifs/debug.h | 3
-rw-r--r--  fs/ubifs/dir.c | 18
-rw-r--r--  fs/ubifs/recovery.c | 3
-rw-r--r--  fs/ubifs/sb.c | 19
-rw-r--r--  fs/ubifs/ubifs.h | 11
-rw-r--r--  fs/udf/balloc.c | 84
-rw-r--r--  fs/udf/ialloc.c | 1
-rw-r--r--  fs/udf/inode.c | 20
-rw-r--r--  fs/udf/super.c | 5
-rw-r--r--  fs/udf/udf_i.h | 1
-rw-r--r--  fs/xattr.c | 2
-rw-r--r--  fs/xattr_acl.c | 2
-rw-r--r--  fs/xfs/Makefile | 3
-rw-r--r--  fs/xfs/xfs_alloc.c | 36
-rw-r--r--  fs/xfs/xfs_alloc.h | 12
-rw-r--r--  fs/xfs/xfs_aops.c | 183
-rw-r--r--  fs/xfs/xfs_aops.h | 4
-rw-r--r--  fs/xfs/xfs_attr.c | 16
-rw-r--r--  fs/xfs/xfs_attr_leaf.c | 40
-rw-r--r--  fs/xfs/xfs_bmap.c | 22
-rw-r--r--  fs/xfs/xfs_buf.c | 17
-rw-r--r--  fs/xfs/xfs_da_btree.c | 32
-rw-r--r--  fs/xfs/xfs_dfrag.c | 24
-rw-r--r--  fs/xfs/xfs_dir2_block.c | 1
-rw-r--r--  fs/xfs/xfs_discard.c | 61
-rw-r--r--  fs/xfs/xfs_dquot.c | 418
-rw-r--r--  fs/xfs/xfs_dquot.h | 49
-rw-r--r--  fs/xfs/xfs_file.c | 84
-rw-r--r--  fs/xfs/xfs_iget.c | 47
-rw-r--r--  fs/xfs/xfs_inode.c | 94
-rw-r--r--  fs/xfs/xfs_inode.h | 27
-rw-r--r--  fs/xfs/xfs_inode_item.c | 297
-rw-r--r--  fs/xfs/xfs_inode_item.h | 16
-rw-r--r--  fs/xfs/xfs_ioctl.c | 28
-rw-r--r--  fs/xfs/xfs_ioctl32.c | 2
-rw-r--r--  fs/xfs/xfs_iomap.c | 19
-rw-r--r--  fs/xfs/xfs_iops.c | 71
-rw-r--r--  fs/xfs/xfs_itable.c | 24
-rw-r--r--  fs/xfs/xfs_log.c | 615
-rw-r--r--  fs/xfs/xfs_log.h | 16
-rw-r--r--  fs/xfs/xfs_log_priv.h | 28
-rw-r--r--  fs/xfs/xfs_log_recover.c | 39
-rw-r--r--  fs/xfs/xfs_mount.c | 8
-rw-r--r--  fs/xfs/xfs_mount.h | 5
-rw-r--r--  fs/xfs/xfs_qm.c | 628
-rw-r--r--  fs/xfs/xfs_qm.h | 49
-rw-r--r--  fs/xfs/xfs_qm_bhv.c | 42
-rw-r--r--  fs/xfs/xfs_qm_stats.c | 105
-rw-r--r--  fs/xfs/xfs_qm_stats.h | 53
-rw-r--r--  fs/xfs/xfs_qm_syscalls.c | 130
-rw-r--r--  fs/xfs/xfs_quota.h | 2
-rw-r--r--  fs/xfs/xfs_quota_priv.h | 11
-rw-r--r--  fs/xfs/xfs_rtalloc.c | 9
-rw-r--r--  fs/xfs/xfs_sb.h | 1
-rw-r--r--  fs/xfs/xfs_stats.c | 99
-rw-r--r--  fs/xfs/xfs_stats.h | 10
-rw-r--r--  fs/xfs/xfs_super.c | 197
-rw-r--r--  fs/xfs/xfs_super.h | 8
-rw-r--r--  fs/xfs/xfs_sync.c | 46
-rw-r--r--  fs/xfs/xfs_sync.h | 2
-rw-r--r--  fs/xfs/xfs_trace.h | 106
-rw-r--r--  fs/xfs/xfs_trans.c | 31
-rw-r--r--  fs/xfs/xfs_trans_ail.c | 83
-rw-r--r--  fs/xfs/xfs_trans_buf.c | 25
-rw-r--r--  fs/xfs/xfs_trans_dquot.c | 21
-rw-r--r--  fs/xfs/xfs_trans_inode.c | 8
-rw-r--r--  fs/xfs/xfs_trans_priv.h | 3
-rw-r--r--  fs/xfs/xfs_vnode.h | 1
-rw-r--r--  fs/xfs/xfs_vnodeops.h | 3
229 files changed, 8641 insertions, 6891 deletions
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 10b7d3c9dba8..8c92a9ba8330 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -259,7 +259,7 @@ static int v9fs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	if (v9fs_proto_dotl(v9ses)) {
 		res = p9_client_statfs(fid, &rs);
 		if (res == 0) {
-			buf->f_type = V9FS_MAGIC;
+			buf->f_type = rs.type;
 			buf->f_bsize = rs.bsize;
 			buf->f_blocks = rs.blocks;
 			buf->f_bfree = rs.bfree;
diff --git a/fs/aio.c b/fs/aio.c
index c7acaf3167aa..4f71627264fd 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -13,7 +13,7 @@
 #include <linux/errno.h>
 #include <linux/time.h>
 #include <linux/aio_abi.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/syscalls.h>
 #include <linux/backing-dev.h>
 #include <linux/uio.h>
diff --git a/fs/attr.c b/fs/attr.c
index 95053ad8abcc..73f69a6ce9ed 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -5,7 +5,7 @@
  * changes by Thomas Schoebel-Theuer
  */

-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/time.h>
 #include <linux/mm.h>
 #include <linux/string.h>
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 22e9a78872ff..37268c5bb98b 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -9,7 +9,7 @@
  */

 #include <linux/fs.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/stat.h>
 #include <linux/time.h>
 #include <linux/namei.h>
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 18276531f7c6..7d7ff206cdcb 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1094,6 +1094,29 @@ out:
  */

 /*
+ * The purpose of always_dump_vma() is to make sure that special kernel mappings
+ * that are useful for post-mortem analysis are included in every core dump.
+ * In that way we ensure that the core dump is fully interpretable later
+ * without matching up the same kernel and hardware config to see what PC values
+ * meant. These special mappings include - vDSO, vsyscall, and other
+ * architecture specific mappings
+ */
+static bool always_dump_vma(struct vm_area_struct *vma)
+{
+	/* Any vsyscall mappings? */
+	if (vma == get_gate_vma(vma->vm_mm))
+		return true;
+	/*
+	 * arch_vma_name() returns non-NULL for special architecture mappings,
+	 * such as vDSO sections.
+	 */
+	if (arch_vma_name(vma))
+		return true;
+
+	return false;
+}
+
+/*
  * Decide what to dump of a segment, part, all or none.
  */
 static unsigned long vma_dump_size(struct vm_area_struct *vma,
@@ -1101,10 +1124,13 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma,
 {
 #define FILTER(type)	(mm_flags & (1UL << MMF_DUMP_##type))

-	/* The vma can be set up to tell us the answer directly. */
-	if (vma->vm_flags & VM_ALWAYSDUMP)
+	/* always dump the vdso and vsyscall sections */
+	if (always_dump_vma(vma))
 		goto whole;

+	if (vma->vm_flags & VM_NODUMP)
+		return 0;
+
 	/* Hugetlb memory check */
 	if (vma->vm_flags & VM_HUGETLB) {
 		if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED))
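The FILTER() macro in the context above derives a mask bit from a token-pasted constant, so each dump-policy check stays one readable line. A minimal userspace sketch of the same pattern (the DUMP_* names here are hypothetical stand-ins for the kernel's MMF_DUMP_* bits):

    #include <stdio.h>

    /* Hypothetical dump-policy bits, mirroring the MMF_DUMP_* layout. */
    enum { DUMP_ANON_PRIVATE, DUMP_ANON_SHARED, DUMP_HUGETLB_SHARED };

    /* FILTER(ANON_PRIVATE) expands to a test of bit DUMP_ANON_PRIVATE. */
    #define FILTER(type) (mm_flags & (1UL << DUMP_##type))

    int main(void)
    {
        unsigned long mm_flags = (1UL << DUMP_ANON_PRIVATE) |
                                 (1UL << DUMP_HUGETLB_SHARED);

        printf("anon private: %d\n", FILTER(ANON_PRIVATE) != 0); /* 1 */
        printf("anon shared:  %d\n", FILTER(ANON_SHARED) != 0);  /* 0 */
        return 0;
    }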
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 4e4017c08887..024d20ee3ca3 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -15,7 +15,7 @@
  *	JAN/99 -- coded full program relocation (gerg@snapgear.com)
  */

-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
 #include <linux/mm.h>
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 1ffb60355cae..613aa0618235 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -19,6 +19,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/sched.h>
+#include <linux/magic.h>
 #include <linux/binfmts.h>
 #include <linux/slab.h>
 #include <linux/ctype.h>
@@ -699,7 +700,7 @@ static int bm_fill_super(struct super_block * sb, void * data, int silent)
 		[3] = {"register", &bm_register_operations, S_IWUSR},
 		/* last one */ {""}
 	};
-	int err = simple_fill_super(sb, 0x42494e4d, bm_files);
+	int err = simple_fill_super(sb, BINFMTFS_MAGIC, bm_files);
 	if (!err)
 		sb->s_op = &s_ops;
 	return err;
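The literal being replaced here is not arbitrary: 0x42494e4d is simply the ASCII bytes 'B' 'I' 'N' 'M', and <linux/magic.h> gives it the symbolic name BINFMTFS_MAGIC so the value lives in one place. A quick demonstration of the encoding:

    #include <stdio.h>

    int main(void)
    {
        unsigned int magic = 0x42494e4d; /* BINFMTFS_MAGIC */

        /* 0x42 'B', 0x49 'I', 0x4e 'N', 0x4d 'M' */
        printf("%c%c%c%c\n", (magic >> 24) & 0xff, (magic >> 16) & 0xff,
               (magic >> 8) & 0xff, magic & 0xff);
        return 0;
    }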
diff --git a/fs/bio.c b/fs/bio.c
index b980ecde026a..e453924036e9 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -22,7 +22,7 @@
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/mempool.h>
 #include <linux/workqueue.h>
 #include <scsi/sg.h>		/* for struct sg_iovec */
diff --git a/fs/block_dev.c b/fs/block_dev.c
index a9ff3000b83d..e08f6a20a5bb 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -16,6 +16,7 @@
 #include <linux/blkdev.h>
 #include <linux/module.h>
 #include <linux/blkpg.h>
+#include <linux/magic.h>
 #include <linux/buffer_head.h>
 #include <linux/swap.h>
 #include <linux/pagevec.h>
@@ -506,7 +507,7 @@ static const struct super_operations bdev_sops = {
 static struct dentry *bd_mount(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data)
 {
-	return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, 0x62646576);
+	return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC);
 }

 static struct file_system_type bd_type = {
diff --git a/fs/buffer.c b/fs/buffer.c
index 1a30db77af32..70e2017edd70 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -29,7 +29,7 @@
 #include <linux/file.h>
 #include <linux/quotaops.h>
 #include <linux/highmem.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/writeback.h>
 #include <linux/hash.h>
 #include <linux/suspend.h>
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 2c489378b4cd..9fff9f3b17e4 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -677,18 +677,19 @@ static int fill_inode(struct inode *inode,
 	case S_IFLNK:
 		inode->i_op = &ceph_symlink_iops;
 		if (!ci->i_symlink) {
-			int symlen = iinfo->symlink_len;
+			u32 symlen = iinfo->symlink_len;
 			char *sym;

-			BUG_ON(symlen != inode->i_size);
 			spin_unlock(&ci->i_ceph_lock);

+			err = -EINVAL;
+			if (WARN_ON(symlen != inode->i_size))
+				goto out;
+
 			err = -ENOMEM;
-			sym = kmalloc(symlen+1, GFP_NOFS);
+			sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS);
 			if (!sym)
 				goto out;
-			memcpy(sym, iinfo->symlink, symlen);
-			sym[symlen] = 0;

 			spin_lock(&ci->i_ceph_lock);
 			if (!ci->i_symlink)
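kstrndup() bundles the three steps the removed code did by hand: allocate length + 1 bytes, copy, and NUL-terminate, returning NULL on allocation failure. A simplified userspace analogue of that contract (the real kernel helper also bounds the copy with strnlen() and takes a GFP flags argument):

    #include <stdlib.h>
    #include <string.h>

    static char *strndup_sketch(const char *s, size_t len)
    {
        char *p = malloc(len + 1);

        if (!p)
            return NULL;
        memcpy(p, s, len);
        p[len] = '\0';
        return p;
    }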
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 866e8d7ca37d..89971e137aab 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -402,7 +402,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,

 	spin_lock_init(&s->s_gen_ttl_lock);
 	s->s_cap_gen = 0;
-	s->s_cap_ttl = 0;
+	s->s_cap_ttl = jiffies - 1;

 	spin_lock_init(&s->s_cap_lock);
 	s->s_renew_requested = 0;
@@ -1083,8 +1083,7 @@ static void renewed_caps(struct ceph_mds_client *mdsc,
 	int wake = 0;

 	spin_lock(&session->s_cap_lock);
-	was_stale = is_renew && (session->s_cap_ttl == 0 ||
-				 time_after_eq(jiffies, session->s_cap_ttl));
+	was_stale = is_renew && time_after_eq(jiffies, session->s_cap_ttl);

 	session->s_cap_ttl = session->s_renew_requested +
 		mdsc->mdsmap->m_session_timeout*HZ;
@@ -2332,7 +2331,7 @@ static void handle_session(struct ceph_mds_session *session,
 			session->s_mds);
 		spin_lock(&session->s_gen_ttl_lock);
 		session->s_cap_gen++;
-		session->s_cap_ttl = 0;
+		session->s_cap_ttl = jiffies - 1;
 		spin_unlock(&session->s_gen_ttl_lock);
 		send_renew_caps(mdsc, session);
 		break;
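Both hunks in this file replace the sentinel value 0 for s_cap_ttl with jiffies - 1, i.e. one tick in the past. Zero is a value the jiffies counter can legitimately pass through, so the 0 sentinel needed the extra `s_cap_ttl == 0` test removed above; a timestamp that is already expired needs no special case, because the wraparound-safe comparison handles it uniformly. The comparison is plain signed-difference arithmetic, as this self-contained sketch shows:

    #include <limits.h>
    #include <stdio.h>

    /* Same idea as the kernel's time_after_eq(): compare via signed delta. */
    #define time_after_eq(a, b) ((long)((a) - (b)) >= 0)

    int main(void)
    {
        unsigned long jiffies = ULONG_MAX - 1;  /* about to wrap */
        unsigned long ttl = jiffies - 1;        /* "already expired" */

        printf("expired: %d\n", time_after_eq(jiffies, ttl));       /* 1 */
        jiffies += 4;                           /* counter wraps to 2 */
        printf("still expired: %d\n", time_after_eq(jiffies, ttl)); /* 1 */
        return 0;
    }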
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index a559c80f127a..f04c0961f993 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -331,7 +331,7 @@ static int build_snap_context(struct ceph_snap_realm *realm)

 	/* alloc new snap context */
 	err = -ENOMEM;
-	if (num > ULONG_MAX / sizeof(u64) - sizeof(*snapc))
+	if (num > (ULONG_MAX - sizeof(*snapc)) / sizeof(u64))
 		goto fail;
 	snapc = kzalloc(sizeof(*snapc) + num*sizeof(u64), GFP_NOFS);
 	if (!snapc)
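The guard protects the kzalloc(sizeof(*snapc) + num*sizeof(u64)) below it. The exact no-overflow condition for hdr + n*elem is n <= (MAX - hdr) / elem; the old expression divided first and then subtracted sizeof(*snapc) from a count of u64 elements, mixing byte and element units (overly strict rather than unsafe, but still wrong). In isolation:

    #include <stdint.h>
    #include <stddef.h>

    /* Exact guard: hdr + n * elem fits in size_t iff this returns nonzero. */
    static int alloc_size_fits(size_t hdr, size_t n, size_t elem)
    {
        return n <= (SIZE_MAX - hdr) / elem;
    }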
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 256f85221926..1e67dd7305a4 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -130,10 +130,12 @@ enum {
 	Opt_nodirstat,
 	Opt_rbytes,
 	Opt_norbytes,
+	Opt_asyncreaddir,
 	Opt_noasyncreaddir,
 	Opt_dcache,
 	Opt_nodcache,
 	Opt_ino32,
+	Opt_noino32,
 };

 static match_table_t fsopt_tokens = {
@@ -153,10 +155,12 @@ static match_table_t fsopt_tokens = {
 	{Opt_nodirstat, "nodirstat"},
 	{Opt_rbytes, "rbytes"},
 	{Opt_norbytes, "norbytes"},
+	{Opt_asyncreaddir, "asyncreaddir"},
 	{Opt_noasyncreaddir, "noasyncreaddir"},
 	{Opt_dcache, "dcache"},
 	{Opt_nodcache, "nodcache"},
 	{Opt_ino32, "ino32"},
+	{Opt_noino32, "noino32"},
 	{-1, NULL}
 };

@@ -232,6 +236,9 @@ static int parse_fsopt_token(char *c, void *private)
 	case Opt_norbytes:
 		fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES;
 		break;
+	case Opt_asyncreaddir:
+		fsopt->flags &= ~CEPH_MOUNT_OPT_NOASYNCREADDIR;
+		break;
 	case Opt_noasyncreaddir:
 		fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR;
 		break;
@@ -244,6 +251,9 @@ static int parse_fsopt_token(char *c, void *private)
 	case Opt_ino32:
 		fsopt->flags |= CEPH_MOUNT_OPT_INO32;
 		break;
+	case Opt_noino32:
+		fsopt->flags &= ~CEPH_MOUNT_OPT_INO32;
+		break;
 	default:
 		BUG_ON(token);
 	}
@@ -334,10 +344,12 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
 		*path += 2;
 	dout("server path '%s'\n", *path);

-	err = ceph_parse_options(popt, options, dev_name, dev_name_end,
-				 parse_fsopt_token, (void *)fsopt);
-	if (err)
+	*popt = ceph_parse_options(options, dev_name, dev_name_end,
+				   parse_fsopt_token, (void *)fsopt);
+	if (IS_ERR(*popt)) {
+		err = PTR_ERR(*popt);
 		goto out;
+	}

 	/* success */
 	*pfsopt = fsopt;
@@ -926,6 +938,7 @@ static int __init init_ceph(void)
 	if (ret)
 		goto out;

+	ceph_xattr_init();
 	ret = register_filesystem(&ceph_fs_type);
 	if (ret)
 		goto out_icache;
@@ -935,6 +948,7 @@ static int __init init_ceph(void)
 	return 0;

 out_icache:
+	ceph_xattr_exit();
 	destroy_caches();
 out:
 	return ret;
@@ -944,6 +958,7 @@ static void __exit exit_ceph(void)
 {
 	dout("exit_ceph\n");
 	unregister_filesystem(&ceph_fs_type);
+	ceph_xattr_exit();
 	destroy_caches();
 }

diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 1421f3d875a2..fc35036d258d 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -367,7 +367,7 @@ static inline u32 ceph_ino_to_ino32(__u64 vino)
 	u32 ino = vino & 0xffffffff;
 	ino ^= vino >> 32;
 	if (!ino)
-		ino = 1;
+		ino = 2;
 	return ino;
 }

@@ -733,6 +733,8 @@ extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
 extern int ceph_removexattr(struct dentry *, const char *);
 extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci);
 extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci);
+extern void __init ceph_xattr_init(void);
+extern void ceph_xattr_exit(void);

 /* caps.c */
 extern const char *ceph_cap_string(int c);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index a76f697303d9..35b86331d8a5 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -8,9 +8,12 @@
 #include <linux/xattr.h>
 #include <linux/slab.h>

+#define XATTR_CEPH_PREFIX "ceph."
+#define XATTR_CEPH_PREFIX_LEN (sizeof (XATTR_CEPH_PREFIX) - 1)
+
 static bool ceph_is_valid_xattr(const char *name)
 {
-	return !strncmp(name, "ceph.", 5) ||
+	return !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) ||
 	       !strncmp(name, XATTR_SECURITY_PREFIX,
 			XATTR_SECURITY_PREFIX_LEN) ||
 	       !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
@@ -21,79 +24,91 @@ static bool ceph_is_valid_xattr(const char *name)
  * These define virtual xattrs exposing the recursive directory
  * statistics and layout metadata.
  */
-struct ceph_vxattr_cb {
-	bool readonly;
+struct ceph_vxattr {
 	char *name;
+	size_t name_size;	/* strlen(name) + 1 (for '\0') */
 	size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val,
 			      size_t size);
+	bool readonly;
 };

 /* directories */

-static size_t ceph_vxattrcb_entries(struct ceph_inode_info *ci, char *val,
-				    size_t size)
+static size_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val,
+					size_t size)
 {
 	return snprintf(val, size, "%lld", ci->i_files + ci->i_subdirs);
 }

-static size_t ceph_vxattrcb_files(struct ceph_inode_info *ci, char *val,
-				  size_t size)
+static size_t ceph_vxattrcb_dir_files(struct ceph_inode_info *ci, char *val,
+				      size_t size)
 {
 	return snprintf(val, size, "%lld", ci->i_files);
 }

-static size_t ceph_vxattrcb_subdirs(struct ceph_inode_info *ci, char *val,
-				    size_t size)
+static size_t ceph_vxattrcb_dir_subdirs(struct ceph_inode_info *ci, char *val,
+					size_t size)
 {
 	return snprintf(val, size, "%lld", ci->i_subdirs);
 }

-static size_t ceph_vxattrcb_rentries(struct ceph_inode_info *ci, char *val,
-				     size_t size)
+static size_t ceph_vxattrcb_dir_rentries(struct ceph_inode_info *ci, char *val,
+					 size_t size)
 {
 	return snprintf(val, size, "%lld", ci->i_rfiles + ci->i_rsubdirs);
 }

-static size_t ceph_vxattrcb_rfiles(struct ceph_inode_info *ci, char *val,
-				   size_t size)
+static size_t ceph_vxattrcb_dir_rfiles(struct ceph_inode_info *ci, char *val,
+				       size_t size)
 {
 	return snprintf(val, size, "%lld", ci->i_rfiles);
 }

-static size_t ceph_vxattrcb_rsubdirs(struct ceph_inode_info *ci, char *val,
-				     size_t size)
+static size_t ceph_vxattrcb_dir_rsubdirs(struct ceph_inode_info *ci, char *val,
+					 size_t size)
 {
 	return snprintf(val, size, "%lld", ci->i_rsubdirs);
 }

-static size_t ceph_vxattrcb_rbytes(struct ceph_inode_info *ci, char *val,
-				   size_t size)
+static size_t ceph_vxattrcb_dir_rbytes(struct ceph_inode_info *ci, char *val,
+				       size_t size)
 {
 	return snprintf(val, size, "%lld", ci->i_rbytes);
 }

-static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val,
-				   size_t size)
+static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val,
+				       size_t size)
 {
-	return snprintf(val, size, "%ld.%ld", (long)ci->i_rctime.tv_sec,
+	return snprintf(val, size, "%ld.09%ld", (long)ci->i_rctime.tv_sec,
 			(long)ci->i_rctime.tv_nsec);
 }

-static struct ceph_vxattr_cb ceph_dir_vxattrs[] = {
-	{ true, "ceph.dir.entries", ceph_vxattrcb_entries},
-	{ true, "ceph.dir.files", ceph_vxattrcb_files},
-	{ true, "ceph.dir.subdirs", ceph_vxattrcb_subdirs},
-	{ true, "ceph.dir.rentries", ceph_vxattrcb_rentries},
-	{ true, "ceph.dir.rfiles", ceph_vxattrcb_rfiles},
-	{ true, "ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs},
-	{ true, "ceph.dir.rbytes", ceph_vxattrcb_rbytes},
-	{ true, "ceph.dir.rctime", ceph_vxattrcb_rctime},
-	{ true, NULL, NULL }
+#define CEPH_XATTR_NAME(_type, _name)	XATTR_CEPH_PREFIX #_type "." #_name
+
+#define XATTR_NAME_CEPH(_type, _name) \
+	{ \
+		.name = CEPH_XATTR_NAME(_type, _name), \
+		.name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \
+		.getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \
+		.readonly = true, \
+	}
+
+static struct ceph_vxattr ceph_dir_vxattrs[] = {
+	XATTR_NAME_CEPH(dir, entries),
+	XATTR_NAME_CEPH(dir, files),
+	XATTR_NAME_CEPH(dir, subdirs),
+	XATTR_NAME_CEPH(dir, rentries),
+	XATTR_NAME_CEPH(dir, rfiles),
+	XATTR_NAME_CEPH(dir, rsubdirs),
+	XATTR_NAME_CEPH(dir, rbytes),
+	XATTR_NAME_CEPH(dir, rctime),
+	{ 0 }	/* Required table terminator */
 };
+static size_t ceph_dir_vxattrs_name_size;	/* total size of all names */

 /* files */

-static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
-				   size_t size)
+static size_t ceph_vxattrcb_file_layout(struct ceph_inode_info *ci, char *val,
+					size_t size)
 {
 	int ret;
@@ -103,21 +118,32 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
 			(unsigned long long)ceph_file_layout_su(ci->i_layout),
 			(unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
 			(unsigned long long)ceph_file_layout_object_size(ci->i_layout));
-	if (ceph_file_layout_pg_preferred(ci->i_layout))
-		ret += snprintf(val + ret, size, "preferred_osd=%lld\n",
+
+	if (ceph_file_layout_pg_preferred(ci->i_layout) >= 0) {
+		val += ret;
+		size -= ret;
+		ret += snprintf(val, size, "preferred_osd=%lld\n",
 			(unsigned long long)ceph_file_layout_pg_preferred(
 			ci->i_layout));
+	}
+
 	return ret;
 }

-static struct ceph_vxattr_cb ceph_file_vxattrs[] = {
-	{ true, "ceph.file.layout", ceph_vxattrcb_layout},
+static struct ceph_vxattr ceph_file_vxattrs[] = {
+	XATTR_NAME_CEPH(file, layout),
 	/* The following extended attribute name is deprecated */
-	{ true, "ceph.layout", ceph_vxattrcb_layout},
-	{ true, NULL, NULL }
+	{
+		.name = XATTR_CEPH_PREFIX "layout",
+		.name_size = sizeof (XATTR_CEPH_PREFIX "layout"),
+		.getxattr_cb = ceph_vxattrcb_file_layout,
+		.readonly = true,
+	},
+	{ 0 }	/* Required table terminator */
 };
+static size_t ceph_file_vxattrs_name_size;	/* total size of all names */

-static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode)
+static struct ceph_vxattr *ceph_inode_vxattrs(struct inode *inode)
 {
 	if (S_ISDIR(inode->i_mode))
 		return ceph_dir_vxattrs;
@@ -126,14 +152,59 @@ static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode)
 	return NULL;
 }

-static struct ceph_vxattr_cb *ceph_match_vxattr(struct ceph_vxattr_cb *vxattr,
+static size_t ceph_vxattrs_name_size(struct ceph_vxattr *vxattrs)
+{
+	if (vxattrs == ceph_dir_vxattrs)
+		return ceph_dir_vxattrs_name_size;
+	if (vxattrs == ceph_file_vxattrs)
+		return ceph_file_vxattrs_name_size;
+	BUG();
+
+	return 0;
+}
+
+/*
+ * Compute the aggregate size (including terminating '\0') of all
+ * virtual extended attribute names in the given vxattr table.
+ */
+static size_t __init vxattrs_name_size(struct ceph_vxattr *vxattrs)
+{
+	struct ceph_vxattr *vxattr;
+	size_t size = 0;
+
+	for (vxattr = vxattrs; vxattr->name; vxattr++)
+		size += vxattr->name_size;
+
+	return size;
+}
+
+/* Routines called at initialization and exit time */
+
+void __init ceph_xattr_init(void)
+{
+	ceph_dir_vxattrs_name_size = vxattrs_name_size(ceph_dir_vxattrs);
+	ceph_file_vxattrs_name_size = vxattrs_name_size(ceph_file_vxattrs);
+}
+
+void ceph_xattr_exit(void)
+{
+	ceph_dir_vxattrs_name_size = 0;
+	ceph_file_vxattrs_name_size = 0;
+}
+
+static struct ceph_vxattr *ceph_match_vxattr(struct inode *inode,
 					     const char *name)
 {
-	do {
-		if (strcmp(vxattr->name, name) == 0)
-			return vxattr;
-		vxattr++;
-	} while (vxattr->name);
+	struct ceph_vxattr *vxattr = ceph_inode_vxattrs(inode);
+
+	if (vxattr) {
+		while (vxattr->name) {
+			if (!strcmp(vxattr->name, name))
+				return vxattr;
+			vxattr++;
+		}
+	}
+
 	return NULL;
 }

@@ -502,17 +573,15 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
 {
 	struct inode *inode = dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
 	int err;
 	struct ceph_inode_xattr *xattr;
-	struct ceph_vxattr_cb *vxattr = NULL;
+	struct ceph_vxattr *vxattr = NULL;

 	if (!ceph_is_valid_xattr(name))
 		return -ENODATA;

 	/* let's see if a virtual xattr was requested */
-	if (vxattrs)
-		vxattr = ceph_match_vxattr(vxattrs, name);
+	vxattr = ceph_match_vxattr(inode, name);

 	spin_lock(&ci->i_ceph_lock);
 	dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
@@ -568,7 +637,7 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
 {
 	struct inode *inode = dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
+	struct ceph_vxattr *vxattrs = ceph_inode_vxattrs(inode);
 	u32 vir_namelen = 0;
 	u32 namelen;
 	int err;
@@ -596,11 +665,12 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
 		goto out;

 list_xattr:
-	vir_namelen = 0;
-	/* include virtual dir xattrs */
-	if (vxattrs)
-		for (i = 0; vxattrs[i].name; i++)
-			vir_namelen += strlen(vxattrs[i].name) + 1;
+	/*
+	 * Start with virtual dir xattr names (if any) (including
+	 * terminating '\0' characters for each).
+	 */
+	vir_namelen = ceph_vxattrs_name_size(vxattrs);
+
 	/* adding 1 byte per each variable due to the null termination */
 	namelen = vir_namelen + ci->i_xattrs.names_size + ci->i_xattrs.count;
 	err = -ERANGE;
@@ -698,17 +768,17 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
 		  const void *value, size_t size, int flags)
 {
 	struct inode *inode = dentry->d_inode;
+	struct ceph_vxattr *vxattr;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
+	int issued;
 	int err;
+	int dirty;
 	int name_len = strlen(name);
 	int val_len = size;
 	char *newname = NULL;
 	char *newval = NULL;
 	struct ceph_inode_xattr *xattr = NULL;
-	int issued;
 	int required_blob_size;
-	int dirty;

 	if (ceph_snap(inode) != CEPH_NOSNAP)
 		return -EROFS;
@@ -716,12 +786,9 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
 	if (!ceph_is_valid_xattr(name))
 		return -EOPNOTSUPP;

-	if (vxattrs) {
-		struct ceph_vxattr_cb *vxattr =
-			ceph_match_vxattr(vxattrs, name);
-		if (vxattr && vxattr->readonly)
-			return -EOPNOTSUPP;
-	}
+	vxattr = ceph_match_vxattr(inode, name);
+	if (vxattr && vxattr->readonly)
+		return -EOPNOTSUPP;

 	/* preallocate memory for xattr name, value, index node */
 	err = -ENOMEM;
@@ -730,11 +797,9 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
 		goto out;

 	if (val_len) {
-		newval = kmalloc(val_len + 1, GFP_NOFS);
+		newval = kmemdup(value, val_len, GFP_NOFS);
 		if (!newval)
 			goto out;
-		memcpy(newval, value, val_len);
-		newval[val_len] = '\0';
 	}

 	xattr = kmalloc(sizeof(struct ceph_inode_xattr), GFP_NOFS);
@@ -744,6 +809,7 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
 	spin_lock(&ci->i_ceph_lock);
 retry:
 	issued = __ceph_caps_issued(ci, NULL);
+	dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));
 	if (!(issued & CEPH_CAP_XATTR_EXCL))
 		goto do_sync;
 	__build_xattrs(inode);
@@ -752,7 +818,7 @@ retry:

 	if (!ci->i_xattrs.prealloc_blob ||
 	    required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
-		struct ceph_buffer *blob = NULL;
+		struct ceph_buffer *blob;

 		spin_unlock(&ci->i_ceph_lock);
 		dout(" preaallocating new blob size=%d\n", required_blob_size);
@@ -766,12 +832,13 @@ retry:
 		goto retry;
 	}

-	dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));
 	err = __set_xattr(ci, newname, name_len, newval,
 			  val_len, 1, 1, 1, &xattr);
+
 	dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
 	ci->i_xattrs.dirty = true;
 	inode->i_ctime = CURRENT_TIME;
+
 	spin_unlock(&ci->i_ceph_lock);
 	if (dirty)
 		__mark_inode_dirty(inode, dirty);
@@ -816,8 +883,8 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name)
 int ceph_removexattr(struct dentry *dentry, const char *name)
 {
 	struct inode *inode = dentry->d_inode;
+	struct ceph_vxattr *vxattr;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
 	int issued;
 	int err;
 	int required_blob_size;
@@ -829,22 +896,19 @@ int ceph_removexattr(struct dentry *dentry, const char *name)
 	if (!ceph_is_valid_xattr(name))
 		return -EOPNOTSUPP;

-	if (vxattrs) {
-		struct ceph_vxattr_cb *vxattr =
-			ceph_match_vxattr(vxattrs, name);
-		if (vxattr && vxattr->readonly)
-			return -EOPNOTSUPP;
-	}
+	vxattr = ceph_match_vxattr(inode, name);
+	if (vxattr && vxattr->readonly)
+		return -EOPNOTSUPP;

 	err = -ENOMEM;
 	spin_lock(&ci->i_ceph_lock);
-	__build_xattrs(inode);
 retry:
 	issued = __ceph_caps_issued(ci, NULL);
 	dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued));

 	if (!(issued & CEPH_CAP_XATTR_EXCL))
 		goto do_sync;
+	__build_xattrs(inode);

 	required_blob_size = __get_required_blob_size(ci, 0, 0);

@@ -865,10 +929,10 @@ retry:
 	}

 	err = __remove_xattr_by_name(ceph_inode(inode), name);
+
 	dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
 	ci->i_xattrs.dirty = true;
 	inode->i_ctime = CURRENT_TIME;
-
 	spin_unlock(&ci->i_ceph_lock);
 	if (dirty)
 		__mark_inode_dirty(inode, dirty);
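The table rewrite in this file leans on two preprocessor facts: adjacent string literals concatenate (XATTR_CEPH_PREFIX #_type "." #_name becomes one string), and sizeof applied to a string literal counts the terminating NUL, which is exactly the strlen(name) + 1 that the new name_size field documents. A compilable miniature of the same pattern (the PREFIX/ENTRY names are illustrative, not the kernel's):

    #include <stddef.h>
    #include <stdio.h>

    #define PREFIX "ceph."
    #define NAME(_type, _name) PREFIX #_type "." #_name

    struct entry {
        const char *name;
        size_t name_size; /* strlen(name) + 1: sizeof counts the NUL */
    };

    #define ENTRY(_type, _name) \
        { .name = NAME(_type, _name), .name_size = sizeof(NAME(_type, _name)) }

    static struct entry table[] = {
        ENTRY(dir, entries),
        ENTRY(dir, rbytes),
        { 0 } /* terminator, as in the patch */
    };

    int main(void)
    {
        struct entry *e;

        for (e = table; e->name; e++)
            printf("%s -> %zu\n", e->name, e->name_size);
        return 0; /* prints ceph.dir.entries -> 17, ceph.dir.rbytes -> 16 */
    }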
diff --git a/fs/cifs/README b/fs/cifs/README
index 895da1dc1550..b7d782bab797 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -753,10 +753,6 @@ module loading or during the runtime by using the interface

 i.e. echo "value" > /sys/module/cifs/parameters/<param>

-1. echo_retries - The number of echo attempts before giving up and
-	   reconnecting to the server. The default is 5. The value 0
-	   means never reconnect.
-
-2. enable_oplocks - Enable or disable oplocks. Oplocks are enabled by default.
+1. enable_oplocks - Enable or disable oplocks. Oplocks are enabled by default.
 	   [Y/y/1]. To disable use any of [N/n/0].

diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 24b3dfc05282..573b899b5a5d 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -171,8 +171,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
 		seq_printf(m, "TCP status: %d\n\tLocal Users To "
 			   "Server: %d SecMode: 0x%x Req On Wire: %d",
 			   server->tcpStatus, server->srv_count,
-			   server->sec_mode,
-			   atomic_read(&server->inFlight));
+			   server->sec_mode, in_flight(server));

 #ifdef CONFIG_CIFS_STATS2
 	seq_printf(m, " In Send: %d In MaxReq Wait: %d",
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 418fc42fb8b2..eee522c56ef0 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -76,12 +76,7 @@ MODULE_PARM_DESC(cifs_min_small, "Small network buffers in pool. Default: 30 "
 unsigned int cifs_max_pending = CIFS_MAX_REQ;
 module_param(cifs_max_pending, int, 0444);
 MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. "
-				   "Default: 50 Range: 2 to 256");
-unsigned short echo_retries = 5;
-module_param(echo_retries, ushort, 0644);
-MODULE_PARM_DESC(echo_retries, "Number of echo attempts before giving up and "
-			       "reconnecting server. Default: 5. 0 means "
-			       "never reconnect.");
+				   "Default: 32767 Range: 2 to 32767.");
 module_param(enable_oplocks, bool, 0644);
 MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks (bool). Default:"
 				 "y/Y/1");
@@ -1111,9 +1106,9 @@ init_cifs(void)
 	if (cifs_max_pending < 2) {
 		cifs_max_pending = 2;
 		cFYI(1, "cifs_max_pending set to min of 2");
-	} else if (cifs_max_pending > 256) {
-		cifs_max_pending = 256;
-		cFYI(1, "cifs_max_pending set to max of 256");
+	} else if (cifs_max_pending > CIFS_MAX_REQ) {
+		cifs_max_pending = CIFS_MAX_REQ;
+		cFYI(1, "cifs_max_pending set to max of %u", CIFS_MAX_REQ);
 	}

 	rc = cifs_fscache_register();
@@ -1175,11 +1170,8 @@ static void __exit
 exit_cifs(void)
 {
 	cFYI(DBG2, "exit_cifs");
-	cifs_proc_clean();
-	cifs_fscache_unregister();
-#ifdef CONFIG_CIFS_DFS_UPCALL
+	unregister_filesystem(&cifs_fs_type);
 	cifs_dfs_release_automount_timer();
-#endif
 #ifdef CONFIG_CIFS_ACL
 	cifs_destroy_idmaptrees();
 	exit_cifs_idmap();
@@ -1187,10 +1179,11 @@ exit_cifs(void)
 #ifdef CONFIG_CIFS_UPCALL
 	unregister_key_type(&cifs_spnego_key_type);
 #endif
-	unregister_filesystem(&cifs_fs_type);
-	cifs_destroy_inodecache();
-	cifs_destroy_mids();
 	cifs_destroy_request_bufs();
+	cifs_destroy_mids();
+	cifs_destroy_inodecache();
+	cifs_fscache_unregister();
+	cifs_proc_clean();
 }

 MODULE_AUTHOR("Steve French <sfrench@us.ibm.com>");
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 76e7d8b6da17..339ebe3ebc0d 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -55,14 +55,9 @@

 /*
  * MAX_REQ is the maximum number of requests that WE will send
- * on one socket concurrently. It also matches the most common
- * value of max multiplex returned by servers. We may
- * eventually want to use the negotiated value (in case
- * future servers can handle more) when we are more confident that
- * we will not have problems oveloading the socket with pending
- * write data.
+ * on one socket concurrently.
  */
-#define CIFS_MAX_REQ 50
+#define CIFS_MAX_REQ 32767

 #define RFC1001_NAME_LEN 15
 #define RFC1001_NAME_LEN_WITH_NULL (RFC1001_NAME_LEN + 1)
@@ -255,7 +250,9 @@ struct TCP_Server_Info {
 	bool noblocksnd;		/* use blocking sendmsg */
 	bool noautotune;		/* do not autotune send buf sizes */
 	bool tcp_nodelay;
-	atomic_t inFlight;  /* number of requests on the wire to server */
+	int credits;  /* send no more requests at once */
+	unsigned int in_flight;  /* number of requests on the wire to server */
+	spinlock_t req_lock;  /* protect the two values above */
 	struct mutex srv_mutex;
 	struct task_struct *tsk;
 	char server_GUID[16];
@@ -263,6 +260,7 @@ struct TCP_Server_Info {
 	bool session_estab; /* mark when very first sess is established */
 	u16 dialect; /* dialect index that server chose */
 	enum securityEnum secType;
+	bool oplocks:1; /* enable oplocks */
 	unsigned int maxReq;	/* Clients should submit no more */
 	/* than maxReq distinct unanswered SMBs to the server when using  */
 	/* multiplexed reads or writes */
@@ -307,6 +305,36 @@ struct TCP_Server_Info {
 #endif
 };

+static inline unsigned int
+in_flight(struct TCP_Server_Info *server)
+{
+	unsigned int num;
+	spin_lock(&server->req_lock);
+	num = server->in_flight;
+	spin_unlock(&server->req_lock);
+	return num;
+}
+
+static inline int*
+get_credits_field(struct TCP_Server_Info *server)
+{
+	/*
+	 * This will change to switch statement when we reserve slots for echos
+	 * and oplock breaks.
+	 */
+	return &server->credits;
+}
+
+static inline bool
+has_credits(struct TCP_Server_Info *server, int *credits)
+{
+	int num;
+	spin_lock(&server->req_lock);
+	num = *credits;
+	spin_unlock(&server->req_lock);
+	return num > 0;
+}
+
 /*
  * Macros to allow the TCP_Server_Info->net field and related code to drop out
  * when CONFIG_NET_NS isn't set.
@@ -1010,9 +1038,6 @@ GLOBAL_EXTERN unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */
 GLOBAL_EXTERN unsigned int cifs_min_small;  /* min size of small buf pool */
 GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/

-/* reconnect after this many failed echo attempts */
-GLOBAL_EXTERN unsigned short echo_retries;
-
 #ifdef CONFIG_CIFS_ACL
 GLOBAL_EXTERN struct rb_root uidtree;
 GLOBAL_EXTERN struct rb_root gidtree;
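The new helpers replace a bare atomic_t with a spinlock-protected pair so credits and in_flight always change together under req_lock. The shape of the scheme, sketched in portable C with pthreads (names hypothetical; the kernel of course uses its own spinlocks and wait queues):

    #include <pthread.h>
    #include <stdbool.h>

    struct server {
        pthread_mutex_t req_lock; /* protects the two fields below */
        int credits;              /* how many more requests we may send */
        unsigned int in_flight;   /* requests currently on the wire */
    };

    /* Consume one credit before sending; fails when none are available. */
    static bool take_credit(struct server *s)
    {
        bool ok = false;

        pthread_mutex_lock(&s->req_lock);
        if (s->credits > 0) {
            s->credits--;
            s->in_flight++;
            ok = true;
        }
        pthread_mutex_unlock(&s->req_lock);
        return ok;
    }

    /* Completion path returns the credit, cf. cifs_add_credits(server, 1). */
    static void return_credit(struct server *s)
    {
        pthread_mutex_lock(&s->req_lock);
        s->credits++;
        s->in_flight--;
        pthread_mutex_unlock(&s->req_lock);
    }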
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 6f4e243e0f62..503e73d8bdb7 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -88,6 +88,9 @@ extern int SendReceiveBlockingLock(const unsigned int xid,
 			struct smb_hdr *in_buf ,
 			struct smb_hdr *out_buf,
 			int *bytes_returned);
+extern void cifs_add_credits(struct TCP_Server_Info *server,
+			     const unsigned int add);
+extern void cifs_set_credits(struct TCP_Server_Info *server, const int val);
 extern int checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length);
 extern bool is_valid_oplock_break(struct smb_hdr *smb,
 				  struct TCP_Server_Info *);
@@ -168,7 +171,13 @@ extern struct smb_vol *cifs_get_volume_info(char *mount_data,
 				 const char *devname);
 extern int cifs_mount(struct cifs_sb_info *, struct smb_vol *);
 extern void cifs_umount(struct cifs_sb_info *);
+
+#if IS_ENABLED(CONFIG_CIFS_DFS_UPCALL)
 extern void cifs_dfs_release_automount_timer(void);
+#else /* ! IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) */
+#define cifs_dfs_release_automount_timer() do { } while (0)
+#endif /* ! IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) */
+
 void cifs_proc_init(void);
 void cifs_proc_clean(void);

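The hunk above is the standard stub-macro idiom: when the option is enabled the real declaration is visible, and when it is not, the call compiles away to an empty statement, so callers (such as exit_cifs() earlier in this series) can drop their own #ifdef. IS_ENABLED() additionally evaluates true for both built-in (=y) and modular (=m) configurations. Generically, under an assumed config symbol:

    /* HAVE_FEATURE stands in for a real config symbol. */
    #ifdef HAVE_FEATURE
    void feature_timer_release(void) { /* real work */ }
    #else
    /* do { } while (0) keeps the stub a single statement, safe in if/else. */
    #define feature_timer_release() do { } while (0)
    #endif

    static void teardown(void)
    {
        feature_timer_release(); /* compiles either way */
    }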
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 8b7794c31591..70aac35c398f 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -458,7 +458,10 @@ CIFSSMBNegotiate(unsigned int xid, struct cifs_ses *ses)
 		goto neg_err_exit;
 	}
 	server->sec_mode = (__u8)le16_to_cpu(rsp->SecurityMode);
-	server->maxReq = le16_to_cpu(rsp->MaxMpxCount);
+	server->maxReq = min_t(unsigned int,
+			       le16_to_cpu(rsp->MaxMpxCount),
+			       cifs_max_pending);
+	cifs_set_credits(server, server->maxReq);
 	server->maxBuf = le16_to_cpu(rsp->MaxBufSize);
 	server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs);
 	/* even though we do not use raw we might as well set this
@@ -564,7 +567,9 @@ CIFSSMBNegotiate(unsigned int xid, struct cifs_ses *ses)

 	/* one byte, so no need to convert this or EncryptionKeyLen from
 	   little endian */
-	server->maxReq = le16_to_cpu(pSMBr->MaxMpxCount);
+	server->maxReq = min_t(unsigned int, le16_to_cpu(pSMBr->MaxMpxCount),
+			       cifs_max_pending);
+	cifs_set_credits(server, server->maxReq);
 	/* probably no need to store and check maxvcs */
 	server->maxBuf = le32_to_cpu(pSMBr->MaxBufferSize);
 	server->max_rw = le32_to_cpu(pSMBr->MaxRawSize);
@@ -716,8 +721,7 @@ cifs_echo_callback(struct mid_q_entry *mid)
 	struct TCP_Server_Info *server = mid->callback_data;

 	DeleteMidQEntry(mid);
-	atomic_dec(&server->inFlight);
-	wake_up(&server->request_q);
+	cifs_add_credits(server, 1);
 }

 int
@@ -1669,8 +1673,7 @@ cifs_readv_callback(struct mid_q_entry *mid)

 	queue_work(system_nrt_wq, &rdata->work);
 	DeleteMidQEntry(mid);
-	atomic_dec(&server->inFlight);
-	wake_up(&server->request_q);
+	cifs_add_credits(server, 1);
 }

 /* cifs_async_readv - send an async write, and set up mid to handle result */
@@ -2110,8 +2113,7 @@ cifs_writev_callback(struct mid_q_entry *mid)

 	queue_work(system_nrt_wq, &wdata->work);
 	DeleteMidQEntry(mid);
-	atomic_dec(&tcon->ses->server->inFlight);
-	wake_up(&tcon->ses->server->request_q);
+	cifs_add_credits(tcon->ses->server, 1);
 }

 /* cifs_async_writev - send an async write, and set up mid to handle result */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 602f77c304c9..5560e1d5e54b 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -373,12 +373,22 @@ allocate_buffers(struct TCP_Server_Info *server)
 static bool
 server_unresponsive(struct TCP_Server_Info *server)
 {
-	if (echo_retries > 0 && server->tcpStatus == CifsGood &&
-	    time_after(jiffies, server->lstrp +
-				(echo_retries * SMB_ECHO_INTERVAL))) {
+	/*
+	 * We need to wait 2 echo intervals to make sure we handle such
+	 * situations right:
+	 * 1s  client sends a normal SMB request
+	 * 2s  client gets a response
+	 * 30s echo workqueue job pops, and decides we got a response recently
+	 *     and don't need to send another
+	 * ...
+	 * 65s kernel_recvmsg times out, and we see that we haven't gotten
+	 *     a response in >60s.
+	 */
+	if (server->tcpStatus == CifsGood &&
+	    time_after(jiffies, server->lstrp + 2 * SMB_ECHO_INTERVAL)) {
 		cERROR(1, "Server %s has not responded in %d seconds. "
 			  "Reconnecting...", server->hostname,
-			  (echo_retries * SMB_ECHO_INTERVAL / HZ));
+			  (2 * SMB_ECHO_INTERVAL) / HZ);
 		cifs_reconnect(server);
 		wake_up(&server->response_q);
 		return true;
@@ -642,19 +652,11 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server)
 	spin_unlock(&GlobalMid_Lock);
 	wake_up_all(&server->response_q);

-	/*
-	 * Check if we have blocked requests that need to free. Note that
-	 * cifs_max_pending is normally 50, but can be set at module install
-	 * time to as little as two.
-	 */
-	spin_lock(&GlobalMid_Lock);
-	if (atomic_read(&server->inFlight) >= cifs_max_pending)
-		atomic_set(&server->inFlight, cifs_max_pending - 1);
-	/*
-	 * We do not want to set the max_pending too low or we could end up
-	 * with the counter going negative.
-	 */
-	spin_unlock(&GlobalMid_Lock);
+	/* check if we have blocked requests that need to free */
+	spin_lock(&server->req_lock);
+	if (server->credits <= 0)
+		server->credits = 1;
+	spin_unlock(&server->req_lock);
 	/*
 	 * Although there should not be any requests blocked on this queue it
 	 * can not hurt to be paranoid and try to wake up requests that may
@@ -1909,7 +1911,8 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1909 tcp_ses->noblocksnd = volume_info->noblocksnd; 1911 tcp_ses->noblocksnd = volume_info->noblocksnd;
1910 tcp_ses->noautotune = volume_info->noautotune; 1912 tcp_ses->noautotune = volume_info->noautotune;
1911 tcp_ses->tcp_nodelay = volume_info->sockopt_tcp_nodelay; 1913 tcp_ses->tcp_nodelay = volume_info->sockopt_tcp_nodelay;
1912 atomic_set(&tcp_ses->inFlight, 0); 1914 tcp_ses->in_flight = 0;
1915 tcp_ses->credits = 1;
1913 init_waitqueue_head(&tcp_ses->response_q); 1916 init_waitqueue_head(&tcp_ses->response_q);
1914 init_waitqueue_head(&tcp_ses->request_q); 1917 init_waitqueue_head(&tcp_ses->request_q);
1915 INIT_LIST_HEAD(&tcp_ses->pending_mid_q); 1918 INIT_LIST_HEAD(&tcp_ses->pending_mid_q);
@@ -3371,7 +3374,7 @@ cifs_ra_pages(struct cifs_sb_info *cifs_sb)
3371int 3374int
3372cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info) 3375cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info)
3373{ 3376{
3374 int rc = 0; 3377 int rc;
3375 int xid; 3378 int xid;
3376 struct cifs_ses *pSesInfo; 3379 struct cifs_ses *pSesInfo;
3377 struct cifs_tcon *tcon; 3380 struct cifs_tcon *tcon;
@@ -3398,6 +3401,7 @@ try_mount_again:
3398 FreeXid(xid); 3401 FreeXid(xid);
3399 } 3402 }
3400#endif 3403#endif
3404 rc = 0;
3401 tcon = NULL; 3405 tcon = NULL;
3402 pSesInfo = NULL; 3406 pSesInfo = NULL;
3403 srvTcp = NULL; 3407 srvTcp = NULL;
@@ -3759,9 +3763,11 @@ int cifs_negotiate_protocol(unsigned int xid, struct cifs_ses *ses)
3759 if (server->maxBuf != 0) 3763 if (server->maxBuf != 0)
3760 return 0; 3764 return 0;
3761 3765
3766 cifs_set_credits(server, 1);
3762 rc = CIFSSMBNegotiate(xid, ses); 3767 rc = CIFSSMBNegotiate(xid, ses);
3763 if (rc == -EAGAIN) { 3768 if (rc == -EAGAIN) {
3764 /* retry only once on 1st time connection */ 3769 /* retry only once on 1st time connection */
3770 cifs_set_credits(server, 1);
3765 rc = CIFSSMBNegotiate(xid, ses); 3771 rc = CIFSSMBNegotiate(xid, ses);
3766 if (rc == -EAGAIN) 3772 if (rc == -EAGAIN)
3767 rc = -EHOSTDOWN; 3773 rc = -EHOSTDOWN;
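
A quick way to sanity-check the new timeout rule in server_unresponsive() is to model it in plain C. The sketch below is not kernel code: HZ, SMB_ECHO_INTERVAL, jiffies and lstrp are stand-ins with assumed values, and time_after() is re-derived from its kernel definition.

#include <stdbool.h>
#include <stdio.h>

#define HZ                1000UL
#define SMB_ECHO_INTERVAL (60 * HZ)	/* assumed 60s, as in cifsglob.h */

/* time_after(a, b): true if a is later than b, safe across wraparound */
static bool time_after(unsigned long a, unsigned long b)
{
	return (long)(b - a) < 0;
}

static bool server_unresponsive(unsigned long jiffies, unsigned long lstrp)
{
	/* two echo intervals with no response at all: declare it dead */
	return time_after(jiffies, lstrp + 2 * SMB_ECHO_INTERVAL);
}

int main(void)
{
	printf("%d\n", server_unresponsive(65 * HZ, 0));  /* 0: keep waiting */
	printf("%d\n", server_unresponsive(121 * HZ, 0)); /* 1: reconnect */
	return 0;
}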
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index bc7e24420ac0..d172c8ed9017 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -171,7 +171,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, umode_t mode,
 	}
 	tcon = tlink_tcon(tlink);
 
-	if (enable_oplocks)
+	if (tcon->ses->server->oplocks)
 		oplock = REQ_OPLOCK;
 
 	if (nd)
@@ -492,7 +492,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 {
 	int xid;
 	int rc = 0; /* to get around spurious gcc warning, set to zero here */
-	__u32 oplock = enable_oplocks ? REQ_OPLOCK : 0;
+	__u32 oplock;
 	__u16 fileHandle = 0;
 	bool posix_open = false;
 	struct cifs_sb_info *cifs_sb;
@@ -518,6 +518,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 	}
 	pTcon = tlink_tcon(tlink);
 
+	oplock = pTcon->ses->server->oplocks ? REQ_OPLOCK : 0;
+
 	/*
	 * Don't allow the separator character in a path component.
	 * The VFS will not allow "/", but "\" is allowed by posix.
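
The oplock gating above now reads per-server state rather than the global enable_oplocks module parameter, because cifs_set_credits() (added in fs/cifs/misc.c below) turns oplocks off whenever a server is down to a single credit. A minimal sketch of that decision, with an illustrative REQ_OPLOCK value and a trimmed-down struct:

#include <stdbool.h>

#define REQ_OPLOCK 0x2	/* illustrative value only */

struct tcp_server_info {
	bool oplocks;	/* maintained by cifs_set_credits(): credits > 1 */
};

unsigned int oplock_request(const struct tcp_server_info *server)
{
	/* was: enable_oplocks ? REQ_OPLOCK : 0 (a module-global switch) */
	return server->oplocks ? REQ_OPLOCK : 0;
}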
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 5e64748a2917..159fcc56dc2d 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -380,7 +380,7 @@ int cifs_open(struct inode *inode, struct file *file)
 	cFYI(1, "inode = 0x%p file flags are 0x%x for %s",
 	     inode, file->f_flags, full_path);
 
-	if (enable_oplocks)
+	if (tcon->ses->server->oplocks)
 		oplock = REQ_OPLOCK;
 	else
 		oplock = 0;
@@ -505,7 +505,7 @@ static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush)
 	cFYI(1, "inode = 0x%p file flags 0x%x for %s",
 	     inode, pCifsFile->f_flags, full_path);
 
-	if (enable_oplocks)
+	if (tcon->ses->server->oplocks)
 		oplock = REQ_OPLOCK;
 	else
 		oplock = 0;
@@ -960,9 +960,9 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
 	INIT_LIST_HEAD(&locks_to_send);
 
 	/*
-	 * Allocating count locks is enough because no locks can be added to
-	 * the list while we are holding cinode->lock_mutex that protects
-	 * locking operations of this inode.
+	 * Allocating count locks is enough because no FL_POSIX locks can be
+	 * added to the list while we are holding cinode->lock_mutex that
+	 * protects locking operations of this inode.
 	 */
 	for (; i < count; i++) {
 		lck = kmalloc(sizeof(struct lock_to_push), GFP_KERNEL);
@@ -973,18 +973,20 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
 		list_add_tail(&lck->llist, &locks_to_send);
 	}
 
-	i = 0;
 	el = locks_to_send.next;
 	lock_flocks();
 	cifs_for_each_lock(cfile->dentry->d_inode, before) {
+		flock = *before;
+		if ((flock->fl_flags & FL_POSIX) == 0)
+			continue;
 		if (el == &locks_to_send) {
-			/* something is really wrong */
+			/*
+			 * The list ended. We don't have enough allocated
+			 * structures - something is really wrong.
+			 */
 			cERROR(1, "Can't push all brlocks!");
 			break;
 		}
-		flock = *before;
-		if ((flock->fl_flags & FL_POSIX) == 0)
-			continue;
 		length = 1 + flock->fl_end - flock->fl_start;
 		if (flock->fl_type == F_RDLCK || flock->fl_type == F_SHLCK)
 			type = CIFS_RDLCK;
@@ -996,7 +998,6 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
 		lck->length = length;
 		lck->type = type;
 		lck->offset = flock->fl_start;
-		i++;
 		el = el->next;
 	}
 	unlock_flocks();
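
The reordering in cifs_push_posix_locks() matters because the lock_to_push structures are preallocated only for POSIX locks: the old code checked for slot exhaustion before filtering, so a trailing flock-style lock could trip the "Can't push all brlocks!" error spuriously. A standalone model of the corrected loop, using an array in place of the kernel lists:

#include <stdio.h>

#define FL_POSIX 1
#define FL_FLOCK 2

struct lk { int flags; };

static int push_locks(const struct lk *locks, int n, int slots)
{
	int used = 0;

	for (int i = 0; i < n; i++) {
		if (!(locks[i].flags & FL_POSIX))
			continue;	/* filter first (the fix) */
		if (used == slots) {
			fprintf(stderr, "Can't push all brlocks!\n");
			return -1;
		}
		used++;			/* consume a preallocated slot */
	}
	return 0;
}

int main(void)
{
	/* two POSIX locks fill both slots; the trailing flock-style lock
	 * must be skipped, not treated as slot exhaustion (the old order
	 * checked slots first and errored out here) */
	struct lk locks[] = { {FL_POSIX}, {FL_POSIX}, {FL_FLOCK} };

	return push_locks(locks, 3, 2) ? 1 : 0;	/* succeeds after the fix */
}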
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 703ef5c6fdb1..c273c12de98e 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -690,3 +690,22 @@ backup_cred(struct cifs_sb_info *cifs_sb)
 
 	return false;
 }
+
+void
+cifs_add_credits(struct TCP_Server_Info *server, const unsigned int add)
+{
+	spin_lock(&server->req_lock);
+	server->credits += add;
+	server->in_flight--;
+	spin_unlock(&server->req_lock);
+	wake_up(&server->request_q);
+}
+
+void
+cifs_set_credits(struct TCP_Server_Info *server, const int val)
+{
+	spin_lock(&server->req_lock);
+	server->credits = val;
+	server->oplocks = val > 1 ? enable_oplocks : false;
+	spin_unlock(&server->req_lock);
+}
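
The two helpers above are the heart of the inFlight-to-credits conversion. A compact userspace model of their semantics, assuming a pthread mutex in place of server->req_lock and a condition variable in place of the request_q wait queue; it is a sketch, not the kernel code:

#include <pthread.h>
#include <stdbool.h>

struct server {
	pthread_mutex_t req_lock;
	pthread_cond_t	request_q;	/* models wake_up(&server->request_q) */
	int		credits;
	int		in_flight;
	bool		oplocks;
};

static const bool enable_oplocks = true;	/* module parameter stand-in */

void add_credits(struct server *s, unsigned int add)
{
	pthread_mutex_lock(&s->req_lock);
	s->credits += add;	/* a completed request returns its credit */
	s->in_flight--;
	pthread_mutex_unlock(&s->req_lock);
	pthread_cond_broadcast(&s->request_q);
}

void set_credits(struct server *s, int val)
{
	pthread_mutex_lock(&s->req_lock);
	s->credits = val;
	/* a server holding a single credit cannot grant oplock breaks */
	s->oplocks = val > 1 ? enable_oplocks : false;
	pthread_mutex_unlock(&s->req_lock);
}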
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 0cc9584f5889..310918b6fcb4 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -254,44 +254,60 @@ smb_send(struct TCP_Server_Info *server, struct smb_hdr *smb_buffer,
 	return smb_sendv(server, &iov, 1);
 }
 
-static int wait_for_free_request(struct TCP_Server_Info *server,
-				 const int long_op)
+static int
+wait_for_free_credits(struct TCP_Server_Info *server, const int optype,
+		      int *credits)
 {
-	if (long_op == CIFS_ASYNC_OP) {
+	int rc;
+
+	spin_lock(&server->req_lock);
+	if (optype == CIFS_ASYNC_OP) {
 		/* oplock breaks must not be held up */
-		atomic_inc(&server->inFlight);
+		server->in_flight++;
+		*credits -= 1;
+		spin_unlock(&server->req_lock);
 		return 0;
 	}
 
-	spin_lock(&GlobalMid_Lock);
 	while (1) {
-		if (atomic_read(&server->inFlight) >= cifs_max_pending) {
-			spin_unlock(&GlobalMid_Lock);
+		if (*credits <= 0) {
+			spin_unlock(&server->req_lock);
 			cifs_num_waiters_inc(server);
-			wait_event(server->request_q,
-				   atomic_read(&server->inFlight)
-				     < cifs_max_pending);
+			rc = wait_event_killable(server->request_q,
+						 has_credits(server, credits));
 			cifs_num_waiters_dec(server);
-			spin_lock(&GlobalMid_Lock);
+			if (rc)
+				return rc;
+			spin_lock(&server->req_lock);
 		} else {
 			if (server->tcpStatus == CifsExiting) {
-				spin_unlock(&GlobalMid_Lock);
+				spin_unlock(&server->req_lock);
 				return -ENOENT;
 			}
 
-			/* can not count locking commands against total
-			   as they are allowed to block on server */
+			/*
			 * Can not count locking commands against total
			 * as they are allowed to block on server.
			 */
 
 			/* update # of requests on the wire to server */
-			if (long_op != CIFS_BLOCKING_OP)
-				atomic_inc(&server->inFlight);
-			spin_unlock(&GlobalMid_Lock);
+			if (optype != CIFS_BLOCKING_OP) {
+				*credits -= 1;
+				server->in_flight++;
+			}
+			spin_unlock(&server->req_lock);
 			break;
 		}
 	}
 	return 0;
 }
 
+static int
+wait_for_free_request(struct TCP_Server_Info *server, const int optype)
+{
+	return wait_for_free_credits(server, optype, get_credits_field(server));
+}
+
 static int allocate_mid(struct cifs_ses *ses, struct smb_hdr *in_buf,
 			struct mid_q_entry **ppmidQ)
 {
@@ -359,7 +375,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov,
 	mid = AllocMidQEntry(hdr, server);
 	if (mid == NULL) {
 		mutex_unlock(&server->srv_mutex);
-		atomic_dec(&server->inFlight);
+		cifs_add_credits(server, 1);
 		wake_up(&server->request_q);
 		return -ENOMEM;
 	}
@@ -392,7 +408,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov,
 	return rc;
 out_err:
 	delete_mid(mid);
-	atomic_dec(&server->inFlight);
+	cifs_add_credits(server, 1);
 	wake_up(&server->request_q);
 	return rc;
 }
@@ -564,8 +580,7 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses,
 		mutex_unlock(&ses->server->srv_mutex);
 		cifs_small_buf_release(in_buf);
 		/* Update # of requests on wire to server */
-		atomic_dec(&ses->server->inFlight);
-		wake_up(&ses->server->request_q);
+		cifs_add_credits(ses->server, 1);
 		return rc;
 	}
 	rc = cifs_sign_smb2(iov, n_vec, ses->server, &midQ->sequence_number);
@@ -601,8 +616,7 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses,
 		midQ->callback = DeleteMidQEntry;
 		spin_unlock(&GlobalMid_Lock);
 		cifs_small_buf_release(in_buf);
-		atomic_dec(&ses->server->inFlight);
-		wake_up(&ses->server->request_q);
+		cifs_add_credits(ses->server, 1);
 		return rc;
 	}
 	spin_unlock(&GlobalMid_Lock);
@@ -612,8 +626,7 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses,
 
 	rc = cifs_sync_mid_result(midQ, ses->server);
 	if (rc != 0) {
-		atomic_dec(&ses->server->inFlight);
-		wake_up(&ses->server->request_q);
+		cifs_add_credits(ses->server, 1);
 		return rc;
 	}
 
@@ -637,8 +650,7 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses,
 	midQ->resp_buf = NULL;
out:
 	delete_mid(midQ);
-	atomic_dec(&ses->server->inFlight);
-	wake_up(&ses->server->request_q);
+	cifs_add_credits(ses->server, 1);
 
 	return rc;
 }
@@ -688,8 +700,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
 	if (rc) {
 		mutex_unlock(&ses->server->srv_mutex);
 		/* Update # of requests on wire to server */
-		atomic_dec(&ses->server->inFlight);
-		wake_up(&ses->server->request_q);
+		cifs_add_credits(ses->server, 1);
 		return rc;
 	}
 
@@ -721,8 +732,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
 		/* no longer considered to be "in-flight" */
 		midQ->callback = DeleteMidQEntry;
 		spin_unlock(&GlobalMid_Lock);
-		atomic_dec(&ses->server->inFlight);
-		wake_up(&ses->server->request_q);
+		cifs_add_credits(ses->server, 1);
 		return rc;
 	}
 	spin_unlock(&GlobalMid_Lock);
@@ -730,8 +740,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
 
 	rc = cifs_sync_mid_result(midQ, ses->server);
 	if (rc != 0) {
-		atomic_dec(&ses->server->inFlight);
-		wake_up(&ses->server->request_q);
+		cifs_add_credits(ses->server, 1);
 		return rc;
 	}
 
@@ -747,8 +756,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
 	rc = cifs_check_receive(midQ, ses->server, 0);
out:
 	delete_mid(midQ);
-	atomic_dec(&ses->server->inFlight);
-	wake_up(&ses->server->request_q);
+	cifs_add_credits(ses->server, 1);
 
 	return rc;
 }
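
The consumer half of the same model, approximating wait_for_free_credits(): take a credit or sleep until a completion path returns one. The CIFS_ASYNC_OP fast path may drive credits negative by design, since oplock breaks must not be held up; wait_event_killable() is approximated with a condition-variable loop. Again a sketch, not the kernel code:

#include <pthread.h>
#include <stdbool.h>

struct server {
	pthread_mutex_t req_lock;
	pthread_cond_t	request_q;
	int		credits;
	int		in_flight;
};

int wait_for_free_credits(struct server *s, bool async)
{
	pthread_mutex_lock(&s->req_lock);
	if (async) {		/* oplock breaks must not be held up */
		s->credits--;
		s->in_flight++;
		pthread_mutex_unlock(&s->req_lock);
		return 0;
	}
	while (s->credits <= 0)	/* models wait_event_killable() */
		pthread_cond_wait(&s->request_q, &s->req_lock);
	s->credits--;
	s->in_flight++;
	pthread_mutex_unlock(&s->req_lock);
	return 0;
}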
diff --git a/fs/compat.c b/fs/compat.c
index 07880bae28a9..14483a715bbb 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -33,7 +33,6 @@
 #include <linux/nfs4_mount.h>
 #include <linux/syscalls.h>
 #include <linux/ctype.h>
-#include <linux/module.h>
 #include <linux/dirent.h>
 #include <linux/fsnotify.h>
 #include <linux/highuid.h>
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 10d8cd90ca6f..debdfe0fc809 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -49,7 +49,6 @@
 #include <linux/elevator.h>
 #include <linux/rtc.h>
 #include <linux/pci.h>
-#include <linux/module.h>
 #include <linux/serial.h>
 #include <linux/if_tun.h>
 #include <linux/ctype.h>
diff --git a/fs/dcache.c b/fs/dcache.c
index 2b55bd0c1061..b60ddc41d783 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -23,7 +23,7 @@
 #include <linux/init.h>
 #include <linux/hash.h>
 #include <linux/cache.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/mount.h>
 #include <linux/file.h>
 #include <asm/uaccess.h>
@@ -2404,6 +2404,7 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
 		if (d_ancestor(alias, dentry)) {
 			/* Check for loops */
 			actual = ERR_PTR(-ELOOP);
+			spin_unlock(&inode->i_lock);
 		} else if (IS_ROOT(alias)) {
 			/* Is this an anonymous mountpoint that we
 			 * could splice into our tree? */
@@ -2413,7 +2414,7 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
 			goto found;
 		} else {
 			/* Nope, but we must(!) avoid directory
-			 * aliasing */
+			 * aliasing. This drops inode->i_lock */
 			actual = __d_unalias(inode, dentry, alias);
 		}
 		write_sequnlock(&rename_lock);
diff --git a/fs/dcookies.c b/fs/dcookies.c
index dda0dc702d1b..17c779967828 100644
--- a/fs/dcookies.c
+++ b/fs/dcookies.c
@@ -13,7 +13,7 @@
  */
 
 #include <linux/syscalls.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/slab.h>
 #include <linux/list.h>
 #include <linux/mount.h>
diff --git a/fs/eventfd.c b/fs/eventfd.c
index d9a591773919..dba15fecf23e 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -16,7 +16,7 @@
 #include <linux/spinlock.h>
 #include <linux/anon_inodes.h>
 #include <linux/syscalls.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/kref.h>
 #include <linux/eventfd.h>
 
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 2a7dcd6ddc09..739b0985b398 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -426,6 +426,31 @@ out_unlock:
 	return error;
 }
 
+/*
+ * As described in commit 0ccf831cb ("lockdep: annotate epoll"), the use of
+ * wait queues used by epoll is done in a very controlled manner. Wake ups
+ * can nest inside each other, but are never done with the same locking.
+ * For example:
+ *
+ *   dfd = socket(...);
+ *   efd1 = epoll_create();
+ *   efd2 = epoll_create();
+ *   epoll_ctl(efd1, EPOLL_CTL_ADD, dfd, ...);
+ *   epoll_ctl(efd2, EPOLL_CTL_ADD, efd1, ...);
+ *
+ * When a packet arrives on the device underneath "dfd", the net code will
+ * issue a wake_up() on its poll wake list. Epoll (efd1) has installed a
+ * callback wakeup entry on that queue, and the wake_up() performed by the
+ * "dfd" net code will end up in ep_poll_callback(). At this point epoll
+ * (efd1) notices that it may have some event ready, so it needs to wake up
+ * the waiters on its poll wait list (efd2). So it calls ep_poll_safewake()
+ * that ends up in another wake_up(), after having checked the recursion
+ * constraints: no more than EP_MAX_POLLWAKE_NESTS nested wakeups, to avoid
+ * stack blasting.
+ *
+ * When CONFIG_DEBUG_LOCK_ALLOC is enabled, make sure lockdep can handle
+ * this special case of epoll.
+ */
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
 				     unsigned long events, int subclass)
@@ -698,9 +723,12 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
 			       void *priv)
 {
 	struct epitem *epi, *tmp;
+	poll_table pt;
 
+	init_poll_funcptr(&pt, NULL);
 	list_for_each_entry_safe(epi, tmp, head, rdllink) {
-		if (epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
+		pt._key = epi->event.events;
+		if (epi->ffd.file->f_op->poll(epi->ffd.file, &pt) &
 		    epi->event.events)
 			return POLLIN | POLLRDNORM;
 		else {
@@ -1048,13 +1076,11 @@ static int reverse_path_check_proc(void *priv, void *cookie, int call_nests)
  */
static int reverse_path_check(void)
{
-	int length = 0;
 	int error = 0;
 	struct file *current_file;
 
 	/* let's call this for all tfiles */
 	list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) {
-		length++;
 		path_count_init();
 		error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
 					reverse_path_check_proc, current_file,
@@ -1096,6 +1122,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 	/* Initialize the poll table using the queue callback */
 	epq.epi = epi;
 	init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
+	epq.pt._key = event->events;
 
 	/*
	 * Attach the item to the poll hooks and get current event bits.
@@ -1190,6 +1217,9 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
{
 	int pwake = 0;
 	unsigned int revents;
+	poll_table pt;
+
+	init_poll_funcptr(&pt, NULL);
 
 	/*
	 * Set the new event interest mask before calling f_op->poll();
@@ -1197,13 +1227,14 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
	 * f_op->poll() call and the new event set registering.
	 */
 	epi->event.events = event->events;
+	pt._key = event->events;
 	epi->event.data = event->data; /* protected by mtx */
 
 	/*
	 * Get current event bits. We can safely use the file* here because
	 * its usage count has been increased by the caller of this function.
	 */
-	revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);
+	revents = epi->ffd.file->f_op->poll(epi->ffd.file, &pt);
 
 	/*
	 * If the item is "hot" and it is not registered inside the ready
@@ -1238,6 +1269,9 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
 	unsigned int revents;
 	struct epitem *epi;
 	struct epoll_event __user *uevent;
+	poll_table pt;
+
+	init_poll_funcptr(&pt, NULL);
 
 	/*
	 * We can loop without lock because we are passed a task private list.
@@ -1250,7 +1284,8 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
 
 		list_del_init(&epi->rdllink);
 
-		revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
+		pt._key = epi->event.events;
+		revents = epi->ffd.file->f_op->poll(epi->ffd.file, &pt) &
 			epi->event.events;
 
 		/*
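
The recurring pattern in these eventpoll hunks is passing a poll_table whose _key carries the events the caller cares about, so f_op->poll() implementations can skip work for events nobody is waiting on. A toy illustration of what a poll routine can do with that mask, with simplified types that are not the kernel's:

#include <stdio.h>

#define POLLIN	0x001
#define POLLOUT	0x004

typedef struct poll_table { unsigned long _key; } poll_table;

struct sock { int rx_ready, tx_room; };

static unsigned int sock_poll(struct sock *sk, poll_table *pt)
{
	unsigned int mask = 0;

	/* only compute readiness the caller asked about via _key */
	if ((pt->_key & POLLIN) && sk->rx_ready)
		mask |= POLLIN;
	if ((pt->_key & POLLOUT) && sk->tx_room)
		mask |= POLLOUT;
	return mask;
}

int main(void)
{
	struct sock sk = { .rx_ready = 1, .tx_room = 1 };
	poll_table pt = { ._key = POLLIN };	/* item wants POLLIN only */

	printf("events: 0x%x\n", sock_poll(&sk, &pt));	/* 0x1, no POLLOUT */
	return 0;
}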
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index a2038928f9a3..1e036b79384c 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -1743,8 +1743,11 @@ allocated:
 
 	*errp = 0;
 	brelse(bitmap_bh);
-	dquot_free_block(inode, *count-num);
-	*count = num;
+
+	if (num < *count) {
+		dquot_free_block(inode, *count-num);
+		*count = num;
+	}
 
 	trace_ext3_allocate_blocks(inode, goal, num,
 				   (unsigned long long)ret_block);
@@ -1970,7 +1973,7 @@ static ext3_grpblk_t ext3_trim_all_free(struct super_block *sb,
 	sbi = EXT3_SB(sb);
 
 	/* Walk through the whole group */
-	while (start < max) {
+	while (start <= max) {
 		start = bitmap_search_next_usable_block(start, bitmap_bh, max);
 		if (start < 0)
 			break;
@@ -1980,7 +1983,7 @@ static ext3_grpblk_t ext3_trim_all_free(struct super_block *sb,
		 * Allocate contiguous free extents by setting bits in the
		 * block bitmap
		 */
-		while (next < max
+		while (next <= max
 		       && claim_block(sb_bgl_lock(sbi, group),
 				      next, bitmap_bh)) {
 			next++;
@@ -2091,73 +2094,74 @@ err_out:
  */
int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range)
{
-	ext3_grpblk_t last_block, first_block, free_blocks;
-	unsigned long first_group, last_group;
-	unsigned long group, ngroups;
+	ext3_grpblk_t last_block, first_block;
+	unsigned long group, first_group, last_group;
 	struct ext3_group_desc *gdp;
 	struct ext3_super_block *es = EXT3_SB(sb)->s_es;
-	uint64_t start, len, minlen, trimmed;
+	uint64_t start, minlen, end, trimmed = 0;
+	ext3_fsblk_t first_data_blk =
+			le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block);
 	ext3_fsblk_t max_blks = le32_to_cpu(es->s_blocks_count);
 	int ret = 0;
 
-	start = (range->start >> sb->s_blocksize_bits) +
-		le32_to_cpu(es->s_first_data_block);
-	len = range->len >> sb->s_blocksize_bits;
+	start = range->start >> sb->s_blocksize_bits;
+	end = start + (range->len >> sb->s_blocksize_bits) - 1;
 	minlen = range->minlen >> sb->s_blocksize_bits;
-	trimmed = 0;
 
-	if (unlikely(minlen > EXT3_BLOCKS_PER_GROUP(sb)))
+	if (unlikely(minlen > EXT3_BLOCKS_PER_GROUP(sb)) ||
+	    unlikely(start >= max_blks))
 		return -EINVAL;
-	if (start >= max_blks)
-		return -EINVAL;
-	if (start + len > max_blks)
-		len = max_blks - start;
+	if (end >= max_blks)
+		end = max_blks - 1;
+	if (end <= first_data_blk)
+		goto out;
+	if (start < first_data_blk)
+		start = first_data_blk;
 
-	ngroups = EXT3_SB(sb)->s_groups_count;
 	smp_rmb();
 
 	/* Determine first and last group to examine based on start and len */
 	ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) start,
 				     &first_group, &first_block);
-	ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) (start + len),
+	ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) end,
 				     &last_group, &last_block);
-	last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group;
-	last_block = EXT3_BLOCKS_PER_GROUP(sb);
 
-	if (first_group > last_group)
-		return -EINVAL;
+	/* end now represents the last block to discard in this group */
+	end = EXT3_BLOCKS_PER_GROUP(sb) - 1;
 
 	for (group = first_group; group <= last_group; group++) {
 		gdp = ext3_get_group_desc(sb, group, NULL);
 		if (!gdp)
 			break;
 
-		free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
-		if (free_blocks < minlen)
-			continue;
-
 		/*
		 * For all the groups except the last one, last block will
-		 * always be EXT3_BLOCKS_PER_GROUP(sb), so we only need to
-		 * change it for the last group in which case first_block +
-		 * len < EXT3_BLOCKS_PER_GROUP(sb).
+		 * always be EXT3_BLOCKS_PER_GROUP(sb)-1, so we only need to
+		 * change it for the last group, note that last_block is
+		 * already computed earlier by ext3_get_group_no_and_offset()
		 */
-		if (first_block + len < EXT3_BLOCKS_PER_GROUP(sb))
-			last_block = first_block + len;
-		len -= last_block - first_block;
+		if (group == last_group)
+			end = last_block;
 
-		ret = ext3_trim_all_free(sb, group, first_block,
-					last_block, minlen);
-		if (ret < 0)
-			break;
+		if (le16_to_cpu(gdp->bg_free_blocks_count) >= minlen) {
+			ret = ext3_trim_all_free(sb, group, first_block,
+						 end, minlen);
+			if (ret < 0)
+				break;
+			trimmed += ret;
+		}
 
-		trimmed += ret;
+		/*
+		 * For every group except the first one, we are sure
+		 * that the first block to discard will be block #0.
+		 */
 		first_block = 0;
 	}
 
-	if (ret >= 0)
+	if (ret > 0)
 		ret = 0;
-	range->len = trimmed * sb->s_blocksize;
 
+out:
+	range->len = trimmed * sb->s_blocksize;
 	return ret;
}
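
The rewritten ext3_trim_fs() treats end as an inclusive last block and clamps the range against the filesystem size and the first data block before walking groups. The standalone helper below reproduces just that range setup, under assumed block-size and filesystem parameters:

#include <stdint.h>
#include <stdio.h>

struct trim_range { uint64_t start, len, minlen; };

static int setup_trim(const struct trim_range *r, unsigned bits,
		      uint64_t max_blks, uint64_t first_data_blk,
		      uint64_t *start, uint64_t *end)
{
	*start = r->start >> bits;
	*end = *start + (r->len >> bits) - 1;	/* inclusive last block */

	if (*start >= max_blks)
		return -1;			/* -EINVAL */
	if (*end >= max_blks)
		*end = max_blks - 1;
	if (*end <= first_data_blk)
		return 1;			/* nothing to trim */
	if (*start < first_data_blk)
		*start = first_data_blk;
	return 0;
}

int main(void)
{
	uint64_t s, e;
	struct trim_range r = { .start = 0, .len = 1ULL << 40, .minlen = 0 };

	/* 4KiB blocks, 1M-block fs whose first data block is 1 */
	if (setup_trim(&r, 12, 1 << 20, 1, &s, &e) == 0)
		printf("trim blocks %llu..%llu\n",
		       (unsigned long long)s, (unsigned long long)e);
	return 0;
}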
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 2d0afeca0b47..6d3418662b54 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -756,6 +756,7 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode,
 	struct ext3_block_alloc_info *block_i;
 	ext3_fsblk_t current_block;
 	struct ext3_inode_info *ei = EXT3_I(inode);
+	struct timespec now;
 
 	block_i = ei->i_block_alloc_info;
 	/*
@@ -795,9 +796,11 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode,
 	}
 
 	/* We are done with atomic stuff, now do the rest of housekeeping */
-
-	inode->i_ctime = CURRENT_TIME_SEC;
-	ext3_mark_inode_dirty(handle, inode);
+	now = CURRENT_TIME_SEC;
+	if (!timespec_equal(&inode->i_ctime, &now) || !where->bh) {
+		inode->i_ctime = now;
+		ext3_mark_inode_dirty(handle, inode);
+	}
 	/* ext3_mark_inode_dirty already updated i_sync_tid */
 	atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid);
 
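
The idea in the ext3_splice_branch() hunk is to skip redirtying the inode when i_ctime would not actually change, except when the block pointer landed in the inode itself (the !where->bh case), which must always be journaled. A userspace rendering of that test, with an in_inode flag standing in for !where->bh:

#include <stdbool.h>
#include <time.h>

struct inode { struct timespec i_ctime; bool dirty; };

static bool timespec_equal(const struct timespec *a, const struct timespec *b)
{
	return a->tv_sec == b->tv_sec && a->tv_nsec == b->tv_nsec;
}

/* in_inode stands in for the kernel's !where->bh case */
void update_ctime(struct inode *inode, bool in_inode)
{
	struct timespec now;

	clock_gettime(CLOCK_REALTIME, &now);
	now.tv_nsec = 0;		/* CURRENT_TIME_SEC granularity */
	if (!timespec_equal(&inode->i_ctime, &now) || in_inode) {
		inode->i_ctime = now;
		inode->dirty = true;	/* ext3_mark_inode_dirty() */
	}
}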
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index f9e2cd8cf711..4bbd07a6fa18 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -336,10 +336,10 @@ err_out:
 * Return buffer_head on success or NULL in case of failure.
 */
struct buffer_head *
-ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
+ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
{
 	struct ext4_group_desc *desc;
-	struct buffer_head *bh = NULL;
+	struct buffer_head *bh;
 	ext4_fsblk_t bitmap_blk;
 
 	desc = ext4_get_group_desc(sb, block_group, NULL);
@@ -348,9 +348,9 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 	bitmap_blk = ext4_block_bitmap(sb, desc);
 	bh = sb_getblk(sb, bitmap_blk);
 	if (unlikely(!bh)) {
-		ext4_error(sb, "Cannot read block bitmap - "
+		ext4_error(sb, "Cannot get buffer for block bitmap - "
 			    "block_group = %u, block_bitmap = %llu",
 			    block_group, bitmap_blk);
 		return NULL;
 	}
 
@@ -382,25 +382,50 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 		return bh;
 	}
 	/*
-	 * submit the buffer_head for read. We can
-	 * safely mark the bitmap as uptodate now.
-	 * We do it here so the bitmap uptodate bit
-	 * get set with buffer lock held.
+	 * submit the buffer_head for reading
	 */
+	set_buffer_new(bh);
 	trace_ext4_read_block_bitmap_load(sb, block_group);
-	set_bitmap_uptodate(bh);
-	if (bh_submit_read(bh) < 0) {
-		put_bh(bh);
+	bh->b_end_io = ext4_end_bitmap_read;
+	get_bh(bh);
+	submit_bh(READ, bh);
+	return bh;
+}
+
+/* Returns 0 on success, 1 on error */
+int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
+			   struct buffer_head *bh)
+{
+	struct ext4_group_desc *desc;
+
+	if (!buffer_new(bh))
+		return 0;
+	desc = ext4_get_group_desc(sb, block_group, NULL);
+	if (!desc)
+		return 1;
+	wait_on_buffer(bh);
+	if (!buffer_uptodate(bh)) {
 		ext4_error(sb, "Cannot read block bitmap - "
 			   "block_group = %u, block_bitmap = %llu",
-			   block_group, bitmap_blk);
-		return NULL;
+			   block_group, (unsigned long long) bh->b_blocknr);
+		return 1;
 	}
+	clear_buffer_new(bh);
+	/* Panic or remount fs read-only if block bitmap is invalid */
 	ext4_valid_block_bitmap(sb, desc, block_group, bh);
-	/*
-	 * file system mounted not to panic on error,
-	 * continue with corrupt bitmap
-	 */
+	return 0;
+}
+
+struct buffer_head *
+ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
+{
+	struct buffer_head *bh;
+
+	bh = ext4_read_block_bitmap_nowait(sb, block_group);
+	if (ext4_wait_block_bitmap(sb, block_group, bh)) {
+		put_bh(bh);
+		return NULL;
+	}
 	return bh;
}
 
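
Splitting ext4_read_block_bitmap() into a _nowait submit and a separate wait step lets callers batch bitmap reads and overlap them with other work. A loose userspace analogy, using threads in place of the buffer_head end_io completion machinery:

#include <pthread.h>
#include <stdio.h>

struct bitmap_read {
	pthread_t tid;
	int group;
	int result;
};

static void *do_read(void *arg)
{
	struct bitmap_read *br = arg;
	br->result = 0;			/* pretend the I/O succeeded */
	return NULL;
}

/* submit without waiting, like ext4_read_block_bitmap_nowait() */
static void read_bitmap_nowait(struct bitmap_read *br, int group)
{
	br->group = group;
	pthread_create(&br->tid, NULL, do_read, br);
}

/* collect the result, like ext4_wait_block_bitmap() */
static int wait_bitmap(struct bitmap_read *br)
{
	pthread_join(br->tid, NULL);
	return br->result;
}

int main(void)
{
	struct bitmap_read br[4];

	for (int g = 0; g < 4; g++)	/* batch the submissions... */
		read_bitmap_nowait(&br[g], g);
	for (int g = 0; g < 4; g++)	/* ...then wait as each is needed */
		if (wait_bitmap(&br[g]))
			fprintf(stderr, "group %d failed\n", g);
	return 0;
}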
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 164c56092e58..ad56866d729a 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -91,17 +91,17 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
 		return 0;
 
 	if (filp)
-		ext4_error_file(filp, function, line, bh ? bh->b_blocknr : 0,
+		ext4_error_file(filp, function, line, bh->b_blocknr,
 				"bad entry in directory: %s - offset=%u(%u), "
 				"inode=%u, rec_len=%d, name_len=%d",
-				error_msg, (unsigned) (offset%bh->b_size),
+				error_msg, (unsigned) (offset % bh->b_size),
 				offset, le32_to_cpu(de->inode),
 				rlen, de->name_len);
 	else
-		ext4_error_inode(dir, function, line, bh ? bh->b_blocknr : 0,
+		ext4_error_inode(dir, function, line, bh->b_blocknr,
 				"bad entry in directory: %s - offset=%u(%u), "
 				"inode=%u, rec_len=%d, name_len=%d",
-				error_msg, (unsigned) (offset%bh->b_size),
+				error_msg, (unsigned) (offset % bh->b_size),
 				offset, le32_to_cpu(de->inode),
 				rlen, de->name_len);
 
@@ -425,8 +425,9 @@ static int call_filldir(struct file *filp, void *dirent,
 	sb = inode->i_sb;
 
 	if (!fname) {
-		printk(KERN_ERR "EXT4-fs: call_filldir: called with "
-		       "null fname?!?\n");
+		ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: "
+			 "called with null fname?!?", __func__, __LINE__,
+			 inode->i_ino, current->comm);
 		return 0;
 	}
 	curr_pos = hash2pos(fname->hash, fname->minor_hash);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 513004fc3d84..ded731ac8a32 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -53,7 +53,7 @@
 		printk(KERN_DEBUG f, ## a);	\
 	} while (0)
 #else
-#define ext4_debug(f, a...)	do {} while (0)
+#define ext4_debug(fmt, ...)	no_printk(fmt, ##__VA_ARGS__)
 #endif
 
 #define EXT4_ERROR_INODE(inode, fmt, a...) \
@@ -184,6 +184,8 @@ struct mpage_da_data {
 #define EXT4_IO_END_UNWRITTEN	0x0001
 #define EXT4_IO_END_ERROR	0x0002
 #define EXT4_IO_END_QUEUED	0x0004
+#define EXT4_IO_END_DIRECT	0x0008
+#define EXT4_IO_END_IN_FSYNC	0x0010
 
 struct ext4_io_page {
 	struct page	*p_page;
@@ -192,18 +194,25 @@ struct ext4_io_page {
 
 #define MAX_IO_PAGES 128
 
+/*
+ * For converting uninitialized extents on a work queue.
+ *
+ * 'page' is only used from the writepage() path; 'pages' is only used for
+ * buffered writes; they are used to keep page references until conversion
+ * takes place. For AIO/DIO, neither field is filled in.
+ */
 typedef struct ext4_io_end {
 	struct list_head	list;		/* per-file finished IO list */
 	struct inode		*inode;		/* file being written to */
 	unsigned int		flag;		/* unwritten or not */
-	struct page		*page;		/* page struct for buffer write */
+	struct page		*page;		/* for writepage() path */
 	loff_t			offset;		/* offset in the file */
 	ssize_t			size;		/* size of the extent */
 	struct work_struct	work;		/* data work queue */
 	struct kiocb		*iocb;		/* iocb struct for AIO */
 	int			result;		/* error value for AIO */
-	int			num_io_pages;
-	struct ext4_io_page	*pages[MAX_IO_PAGES];
+	int			num_io_pages;	/* for writepages() */
+	struct ext4_io_page	*pages[MAX_IO_PAGES]; /* for writepages() */
} ext4_io_end_t;
 
 struct ext4_io_submit {
@@ -923,6 +932,7 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_ERRORS_CONT		0x00010	/* Continue on errors */
 #define EXT4_MOUNT_ERRORS_RO		0x00020	/* Remount fs ro on errors */
 #define EXT4_MOUNT_ERRORS_PANIC		0x00040	/* Panic on errors */
+#define EXT4_MOUNT_ERRORS_MASK		0x00070
 #define EXT4_MOUNT_MINIX_DF		0x00080	/* Mimics the Minix statfs */
 #define EXT4_MOUNT_NOLOAD		0x00100	/* Don't use existing journal*/
 #define EXT4_MOUNT_DATA_FLAGS		0x00C00	/* Mode for data writes: */
@@ -941,7 +951,6 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_DIOREAD_NOLOCK	0x400000 /* Enable support for dio read nolocking */
 #define EXT4_MOUNT_JOURNAL_CHECKSUM	0x800000 /* Journal checksums */
 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT	0x1000000 /* Journal Async Commit */
-#define EXT4_MOUNT_I_VERSION		0x2000000 /* i_version support */
 #define EXT4_MOUNT_MBLK_IO_SUBMIT	0x4000000 /* multi-block io submits */
 #define EXT4_MOUNT_DELALLOC		0x8000000 /* Delalloc support */
 #define EXT4_MOUNT_DATA_ERR_ABORT	0x10000000 /* Abort on file data write */
@@ -1142,6 +1151,7 @@ struct ext4_sb_info {
 	unsigned int s_mount_opt;
 	unsigned int s_mount_opt2;
 	unsigned int s_mount_flags;
+	unsigned int s_def_mount_opt;
 	ext4_fsblk_t s_sb_block;
 	uid_t s_resuid;
 	gid_t s_resgid;
@@ -1420,8 +1430,9 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
 #define EXT4_FEATURE_INCOMPAT_FLEX_BG		0x0200
 #define EXT4_FEATURE_INCOMPAT_EA_INODE		0x0400 /* EA in inode */
 #define EXT4_FEATURE_INCOMPAT_DIRDATA		0x1000 /* data in dirent */
-#define EXT4_FEATURE_INCOMPAT_INLINEDATA	0x2000 /* data in inode */
+#define EXT4_FEATURE_INCOMPAT_BG_USE_META_CSUM	0x2000 /* use crc32c for bg */
 #define EXT4_FEATURE_INCOMPAT_LARGEDIR		0x4000 /* >2GB or 3-lvl htree */
+#define EXT4_FEATURE_INCOMPAT_INLINEDATA	0x8000 /* data in inode */
 
 #define EXT2_FEATURE_COMPAT_SUPP	EXT4_FEATURE_COMPAT_EXT_ATTR
 #define EXT2_FEATURE_INCOMPAT_SUPP	(EXT4_FEATURE_INCOMPAT_FILETYPE| \
@@ -1794,8 +1805,14 @@ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
					    ext4_group_t block_group,
					    struct buffer_head ** bh);
 extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
-struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
-				      ext4_group_t block_group);
+
+extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb,
+						ext4_group_t block_group);
+extern int ext4_wait_block_bitmap(struct super_block *sb,
+				  ext4_group_t block_group,
+				  struct buffer_head *bh);
+extern struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
+						  ext4_group_t block_group);
 extern void ext4_init_block_bitmap(struct super_block *sb,
				   struct buffer_head *bh,
				   ext4_group_t group,
@@ -1841,6 +1858,7 @@ extern void ext4_check_inodes_bitmap(struct super_block *);
 extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
 extern int ext4_init_inode_table(struct super_block *sb,
				 ext4_group_t group, int barrier);
+extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate);
 
 /* mballoc.c */
 extern long ext4_mb_stats;
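
EXT4_MOUNT_ERRORS_MASK groups the three mutually exclusive errors= bits so that remount code can clear the old policy in one mask operation before setting the new one. A self-contained illustration:

#include <stdio.h>

#define EXT4_MOUNT_ERRORS_CONT	0x00010
#define EXT4_MOUNT_ERRORS_RO	0x00020
#define EXT4_MOUNT_ERRORS_PANIC	0x00040
#define EXT4_MOUNT_ERRORS_MASK	0x00070

int main(void)
{
	unsigned int s_mount_opt = EXT4_MOUNT_ERRORS_PANIC;

	/* switch policy: clear the whole group, then set the new bit */
	s_mount_opt = (s_mount_opt & ~EXT4_MOUNT_ERRORS_MASK) |
		      EXT4_MOUNT_ERRORS_RO;

	printf("errors=remount-ro set: %s\n",
	       (s_mount_opt & EXT4_MOUNT_ERRORS_MASK) ==
			EXT4_MOUNT_ERRORS_RO ? "yes" : "no");
	return 0;
}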
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index a52db3a69a30..0f58b86e3a02 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -47,9 +47,9 @@
 */
#define EXT_DEBUG__
#ifdef EXT_DEBUG
-#define ext_debug(a...)		printk(a)
+#define ext_debug(fmt, ...)	printk(fmt, ##__VA_ARGS__)
#else
-#define ext_debug(a...)
+#define ext_debug(fmt, ...)	no_printk(fmt, ##__VA_ARGS__)
#endif
 
/*
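
Both ext4_debug() and ext_debug() now expand to no_printk() when disabled. Unlike an empty macro, the arguments are still type-checked against the format string and "set but unused" warnings disappear, while the compiler optimizes the call away. A userspace rendering of the same trick (GNU ##__VA_ARGS__ assumed):

#include <stdio.h>

static inline int no_printk(const char *fmt, ...)
	__attribute__((format(printf, 1, 2)));
static inline int no_printk(const char *fmt, ...) { (void)fmt; return 0; }

#ifdef EXT_DEBUG
#define ext_debug(fmt, ...)	printf(fmt, ##__VA_ARGS__)
#else
#define ext_debug(fmt, ...)	no_printk(fmt, ##__VA_ARGS__)
#endif

int main(void)
{
	int blocks = 42;

	/* with an empty macro, a debug-only use of "blocks" would warn;
	 * a format-string mismatch here is caught even when disabled */
	ext_debug("freeing %d blocks\n", blocks);
	return 0;
}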
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 5802fa1dab18..83b20fcf9400 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -104,6 +104,78 @@
 #define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
 #define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))
 
+/**
+ * struct ext4_journal_cb_entry - Base structure for callback information.
+ *
+ * This struct is a 'seed' structure for use with your own callback
+ * structs. If you are using callbacks you must allocate one of these
+ * or another struct of your own definition which has this struct
+ * as its first element and pass it to ext4_journal_callback_add().
+ */
+struct ext4_journal_cb_entry {
+	/* list information for other callbacks attached to the same handle */
+	struct list_head jce_list;
+
+	/*  Function to call with this callback structure */
+	void (*jce_func)(struct super_block *sb,
+			 struct ext4_journal_cb_entry *jce, int error);
+
+	/* user data goes here */
+};
+
+/**
+ * ext4_journal_callback_add: add a function to call after transaction commit
+ * @handle: active journal transaction handle to register callback on
+ * @func: callback function to call after the transaction has committed:
+ *        @sb: superblock of current filesystem for transaction
+ *        @jce: returned journal callback data
+ *        @rc: journal state at commit (0 = transaction committed properly)
+ * @jce: journal callback data (internal and function private data struct)
+ *
+ * The registered function will be called in the context of the journal thread
+ * after the transaction for which the handle was created has completed.
+ *
+ * No locks are held when the callback function is called, so it is safe to
+ * call blocking functions from within the callback, but the callback should
+ * not block or run for too long, or the filesystem will be blocked waiting
+ * for the next transaction to commit. No journaling functions can be used,
+ * or there is a risk of deadlock.
+ *
+ * There is no guaranteed calling order of multiple registered callbacks on
+ * the same transaction.
+ */
+static inline void ext4_journal_callback_add(handle_t *handle,
+			void (*func)(struct super_block *sb,
+				     struct ext4_journal_cb_entry *jce,
+				     int rc),
+			struct ext4_journal_cb_entry *jce)
+{
+	struct ext4_sb_info *sbi =
+			EXT4_SB(handle->h_transaction->t_journal->j_private);
+
+	/* Add the jce to transaction's private list */
+	jce->jce_func = func;
+	spin_lock(&sbi->s_md_lock);
+	list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list);
+	spin_unlock(&sbi->s_md_lock);
+}
+
+/**
+ * ext4_journal_callback_del: delete a registered callback
+ * @handle: active journal transaction handle on which callback was registered
+ * @jce: registered journal callback entry to unregister
+ */
+static inline void ext4_journal_callback_del(handle_t *handle,
+					     struct ext4_journal_cb_entry *jce)
+{
+	struct ext4_sb_info *sbi =
+			EXT4_SB(handle->h_transaction->t_journal->j_private);
+
+	spin_lock(&sbi->s_md_lock);
+	list_del_init(&jce->jce_list);
+	spin_unlock(&sbi->s_md_lock);
+}
+
 int
 ext4_mark_iloc_dirty(handle_t *handle,
 		     struct inode *inode,
@@ -261,43 +333,45 @@ static inline void ext4_update_inode_fsync_trans(handle_t *handle,
 /* super.c */
 int ext4_force_commit(struct super_block *sb);
 
-static inline int ext4_should_journal_data(struct inode *inode)
+/*
+ * Ext4 inode journal modes
+ */
+#define EXT4_INODE_JOURNAL_DATA_MODE	0x01 /* journal data mode */
+#define EXT4_INODE_ORDERED_DATA_MODE	0x02 /* ordered data mode */
+#define EXT4_INODE_WRITEBACK_DATA_MODE	0x04 /* writeback data mode */
+
+static inline int ext4_inode_journal_mode(struct inode *inode)
 {
 	if (EXT4_JOURNAL(inode) == NULL)
-		return 0;
-	if (!S_ISREG(inode->i_mode))
-		return 1;
-	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
-		return 1;
-	if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
-		return 1;
-	return 0;
+		return EXT4_INODE_WRITEBACK_DATA_MODE;	/* writeback */
+	/* We do not support data journalling with delayed allocation */
+	if (!S_ISREG(inode->i_mode) ||
+	    test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
+		return EXT4_INODE_JOURNAL_DATA_MODE;	/* journal data */
+	if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
+	    !test_opt(inode->i_sb, DELALLOC))
+		return EXT4_INODE_JOURNAL_DATA_MODE;	/* journal data */
+	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
+		return EXT4_INODE_ORDERED_DATA_MODE;	/* ordered */
+	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
+		return EXT4_INODE_WRITEBACK_DATA_MODE;	/* writeback */
+	else
+		BUG();
+}
+
+static inline int ext4_should_journal_data(struct inode *inode)
+{
+	return ext4_inode_journal_mode(inode) & EXT4_INODE_JOURNAL_DATA_MODE;
 }
 
 static inline int ext4_should_order_data(struct inode *inode)
 {
-	if (EXT4_JOURNAL(inode) == NULL)
-		return 0;
-	if (!S_ISREG(inode->i_mode))
-		return 0;
-	if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
-		return 0;
-	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
-		return 1;
-	return 0;
+	return ext4_inode_journal_mode(inode) & EXT4_INODE_ORDERED_DATA_MODE;
 }
 
 static inline int ext4_should_writeback_data(struct inode *inode)
 {
-	if (EXT4_JOURNAL(inode) == NULL)
-		return 1;
-	if (!S_ISREG(inode->i_mode))
-		return 0;
-	if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
-		return 0;
-	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
-		return 1;
-	return 0;
+	return ext4_inode_journal_mode(inode) & EXT4_INODE_WRITEBACK_DATA_MODE;
 }
 
 /*
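
The kernel-doc above spells out the contract: embed the ext4_journal_cb_entry as the first member of your own struct, register it with ext4_journal_callback_add(), and recover your container in the callback. The sketch below models that in plain C with a stubbed list and super_block; my_commit_info and my_commit_done are hypothetical names, not part of the patch:

#include <stddef.h>
#include <stdio.h>

struct super_block;

struct journal_cb_entry {
	struct journal_cb_entry *next;	/* stands in for jce_list */
	void (*func)(struct super_block *sb,
		     struct journal_cb_entry *jce, int error);
};

struct my_commit_info {
	struct journal_cb_entry jce;	/* must be the first member */
	unsigned long blocks_freed;
};

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

static void my_commit_done(struct super_block *sb,
			   struct journal_cb_entry *jce, int error)
{
	struct my_commit_info *ci =
		container_of(jce, struct my_commit_info, jce);

	printf("commit %s, %lu blocks freed\n",
	       error ? "failed" : "done", ci->blocks_freed);
}

int main(void)
{
	struct my_commit_info ci = { .jce.func = my_commit_done,
				     .blocks_freed = 128 };

	/* the journal thread would do this after the transaction commits */
	ci.jce.func(NULL, &ci.jce, 0);
	return 0;
}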
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 74f23c292e1b..1421938e6792 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -44,6 +44,14 @@
44 44
45#include <trace/events/ext4.h> 45#include <trace/events/ext4.h>
46 46
47/*
48 * used by extent splitting.
49 */
50#define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \
51 due to ENOSPC */
52#define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */
53#define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */
54
47static int ext4_split_extent(handle_t *handle, 55static int ext4_split_extent(handle_t *handle,
48 struct inode *inode, 56 struct inode *inode,
49 struct ext4_ext_path *path, 57 struct ext4_ext_path *path,
@@ -51,6 +59,13 @@ static int ext4_split_extent(handle_t *handle,
51 int split_flag, 59 int split_flag,
52 int flags); 60 int flags);
53 61
62static int ext4_split_extent_at(handle_t *handle,
63 struct inode *inode,
64 struct ext4_ext_path *path,
65 ext4_lblk_t split,
66 int split_flag,
67 int flags);
68
54static int ext4_ext_truncate_extend_restart(handle_t *handle, 69static int ext4_ext_truncate_extend_restart(handle_t *handle,
55 struct inode *inode, 70 struct inode *inode,
56 int needed) 71 int needed)
@@ -300,6 +315,8 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
300 ext4_fsblk_t block = ext4_ext_pblock(ext); 315 ext4_fsblk_t block = ext4_ext_pblock(ext);
301 int len = ext4_ext_get_actual_len(ext); 316 int len = ext4_ext_get_actual_len(ext);
302 317
318 if (len == 0)
319 return 0;
303 return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len); 320 return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
304} 321}
305 322
@@ -2308,7 +2325,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2308 struct ext4_extent *ex; 2325 struct ext4_extent *ex;
2309 2326
2310 /* the header must be checked already in ext4_ext_remove_space() */ 2327 /* the header must be checked already in ext4_ext_remove_space() */
2311 ext_debug("truncate since %u in leaf\n", start); 2328 ext_debug("truncate since %u in leaf to %u\n", start, end);
2312 if (!path[depth].p_hdr) 2329 if (!path[depth].p_hdr)
2313 path[depth].p_hdr = ext_block_hdr(path[depth].p_bh); 2330 path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
2314 eh = path[depth].p_hdr; 2331 eh = path[depth].p_hdr;
@@ -2343,14 +2360,17 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2343 ext_debug(" border %u:%u\n", a, b); 2360 ext_debug(" border %u:%u\n", a, b);
2344 2361
2345 /* If this extent is beyond the end of the hole, skip it */ 2362 /* If this extent is beyond the end of the hole, skip it */
2346 if (end <= ex_ee_block) { 2363 if (end < ex_ee_block) {
2347 ex--; 2364 ex--;
2348 ex_ee_block = le32_to_cpu(ex->ee_block); 2365 ex_ee_block = le32_to_cpu(ex->ee_block);
2349 ex_ee_len = ext4_ext_get_actual_len(ex); 2366 ex_ee_len = ext4_ext_get_actual_len(ex);
2350 continue; 2367 continue;
2351 } else if (b != ex_ee_block + ex_ee_len - 1) { 2368 } else if (b != ex_ee_block + ex_ee_len - 1) {
2352 EXT4_ERROR_INODE(inode," bad truncate %u:%u\n", 2369 EXT4_ERROR_INODE(inode,
2353 start, end); 2370 "can not handle truncate %u:%u "
2371 "on extent %u:%u",
2372 start, end, ex_ee_block,
2373 ex_ee_block + ex_ee_len - 1);
2354 err = -EIO; 2374 err = -EIO;
2355 goto out; 2375 goto out;
2356 } else if (a != ex_ee_block) { 2376 } else if (a != ex_ee_block) {
@@ -2482,7 +2502,8 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path)
2482 return 1; 2502 return 1;
2483} 2503}
2484 2504
2485static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) 2505static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
2506 ext4_lblk_t end)
2486{ 2507{
2487 struct super_block *sb = inode->i_sb; 2508 struct super_block *sb = inode->i_sb;
2488 int depth = ext_depth(inode); 2509 int depth = ext_depth(inode);
@@ -2491,7 +2512,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2491 handle_t *handle; 2512 handle_t *handle;
2492 int i, err; 2513 int i, err;
2493 2514
2494 ext_debug("truncate since %u\n", start); 2515 ext_debug("truncate since %u to %u\n", start, end);
2495 2516
2496 /* probably first extent we're gonna free will be last in block */ 2517 /* probably first extent we're gonna free will be last in block */
2497 handle = ext4_journal_start(inode, depth + 1); 2518 handle = ext4_journal_start(inode, depth + 1);
@@ -2504,6 +2525,61 @@ again:
2504 trace_ext4_ext_remove_space(inode, start, depth); 2525 trace_ext4_ext_remove_space(inode, start, depth);
2505 2526
2506 /* 2527 /*
 2528	 * Check if we are removing extents inside the extent tree. If that
 2529	 * is the case, we are punching a hole inside the extent tree, so we
 2530	 * have to check whether the extent covering the last block to be
 2531	 * removed needs to be split, so that ext4_ext_rm_leaf() can easily
 2532	 * remove just its leading part.
2533 */
2534 if (end < EXT_MAX_BLOCKS - 1) {
2535 struct ext4_extent *ex;
2536 ext4_lblk_t ee_block;
2537
2538 /* find extent for this block */
2539 path = ext4_ext_find_extent(inode, end, NULL);
2540 if (IS_ERR(path)) {
2541 ext4_journal_stop(handle);
2542 return PTR_ERR(path);
2543 }
2544 depth = ext_depth(inode);
2545 ex = path[depth].p_ext;
2546 if (!ex)
2547 goto cont;
2548
2549 ee_block = le32_to_cpu(ex->ee_block);
2550
2551 /*
 2552	 * See if the last block is inside the extent; if so, split
 2553	 * the extent at the 'end' block so we can easily remove the
2554 * tail of the first part of the split extent in
2555 * ext4_ext_rm_leaf().
2556 */
2557 if (end >= ee_block &&
2558 end < ee_block + ext4_ext_get_actual_len(ex) - 1) {
2559 int split_flag = 0;
2560
2561 if (ext4_ext_is_uninitialized(ex))
2562 split_flag = EXT4_EXT_MARK_UNINIT1 |
2563 EXT4_EXT_MARK_UNINIT2;
2564
2565 /*
2566 * Split the extent in two so that 'end' is the last
2567 * block in the first new extent
2568 */
2569 err = ext4_split_extent_at(handle, inode, path,
2570 end + 1, split_flag,
2571 EXT4_GET_BLOCKS_PRE_IO |
2572 EXT4_GET_BLOCKS_PUNCH_OUT_EXT);
2573
2574 if (err < 0)
2575 goto out;
2576 }
2577 ext4_ext_drop_refs(path);
2578 kfree(path);
2579 }
2580cont:
2581
2582 /*
2507 * We start scanning from right side, freeing all the blocks 2583 * We start scanning from right side, freeing all the blocks
2508 * after i_size and walking into the tree depth-wise. 2584 * after i_size and walking into the tree depth-wise.
2509 */ 2585 */
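
The hunk above only splits the extent covering 'end'; the actual removal
still happens in ext4_ext_rm_leaf(). Since the boundary arithmetic is easy
to get wrong, here is a minimal user-space model of the split-at-end+1
step (extent_t and split_at() are hypothetical stand-ins, not kernel APIs):

#include <stdio.h>

typedef struct { unsigned start, len; } extent_t;

/* Split 'e' at logical block 'at' so that 'at' begins the second half. */
static int split_at(extent_t *e, unsigned at, extent_t *tail)
{
	if (at <= e->start || at >= e->start + e->len)
		return 0;			/* nothing to split */
	tail->start = at;
	tail->len   = e->start + e->len - at;
	e->len      = at - e->start;		/* first half ends at at-1 */
	return 1;
}

int main(void)
{
	extent_t ex = { .start = 100, .len = 50 };	/* covers 100..149 */
	extent_t tail;
	unsigned end = 119;			/* last block to punch out */

	/* Mirror of the kernel check: split at end+1 when end is inside. */
	if (end >= ex.start && end < ex.start + ex.len - 1 &&
	    split_at(&ex, end + 1, &tail))
		printf("split into %u..%u and %u..%u\n",
		       ex.start, ex.start + ex.len - 1,
		       tail.start, tail.start + tail.len - 1);
	return 0;
}

With the values above this prints "split into 100..119 and 120..149",
i.e. the punched range now ends exactly on an extent boundary.
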
@@ -2515,6 +2591,7 @@ again:
2515 } 2591 }
2516 path[0].p_depth = depth; 2592 path[0].p_depth = depth;
2517 path[0].p_hdr = ext_inode_hdr(inode); 2593 path[0].p_hdr = ext_inode_hdr(inode);
2594
2518 if (ext4_ext_check(inode, path[0].p_hdr, depth)) { 2595 if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
2519 err = -EIO; 2596 err = -EIO;
2520 goto out; 2597 goto out;
@@ -2526,7 +2603,7 @@ again:
2526 /* this is leaf block */ 2603 /* this is leaf block */
2527 err = ext4_ext_rm_leaf(handle, inode, path, 2604 err = ext4_ext_rm_leaf(handle, inode, path,
2528 &partial_cluster, start, 2605 &partial_cluster, start,
2529 EXT_MAX_BLOCKS - 1); 2606 end);
2530 /* root level has p_bh == NULL, brelse() eats this */ 2607 /* root level has p_bh == NULL, brelse() eats this */
2531 brelse(path[i].p_bh); 2608 brelse(path[i].p_bh);
2532 path[i].p_bh = NULL; 2609 path[i].p_bh = NULL;
@@ -2651,17 +2728,17 @@ void ext4_ext_init(struct super_block *sb)
2651 2728
2652 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { 2729 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
2653#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS) 2730#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
2654 printk(KERN_INFO "EXT4-fs: file extents enabled"); 2731 printk(KERN_INFO "EXT4-fs: file extents enabled"
2655#ifdef AGGRESSIVE_TEST 2732#ifdef AGGRESSIVE_TEST
2656 printk(", aggressive tests"); 2733 ", aggressive tests"
2657#endif 2734#endif
2658#ifdef CHECK_BINSEARCH 2735#ifdef CHECK_BINSEARCH
2659 printk(", check binsearch"); 2736 ", check binsearch"
2660#endif 2737#endif
2661#ifdef EXTENTS_STATS 2738#ifdef EXTENTS_STATS
2662 printk(", stats"); 2739 ", stats"
2663#endif 2740#endif
2664 printk("\n"); 2741 "\n");
2665#endif 2742#endif
2666#ifdef EXTENTS_STATS 2743#ifdef EXTENTS_STATS
2667 spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock); 2744 spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
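
The rewrite above relies on adjacent C string literals concatenating into
a single argument across the #ifdef blocks, so the feature list is emitted
by one printk() call and cannot be interleaved with other console messages
the way the old per-fragment continuation printk()s could be. A minimal
sketch of the same trick:

#include <stdio.h>

int main(void)
{
	/* Literals separated only by preprocessor lines fuse into one
	 * string; compile with -DAGGRESSIVE_TEST etc. to see it grow. */
	printf("EXT4-fs: file extents enabled"
#ifdef AGGRESSIVE_TEST
	       ", aggressive tests"
#endif
#ifdef CHECK_BINSEARCH
	       ", check binsearch"
#endif
	       "\n");
	return 0;
}
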
@@ -2709,14 +2786,6 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2709} 2786}
2710 2787
2711/* 2788/*
2712 * used by extent splitting.
2713 */
2714#define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \
2715 due to ENOSPC */
2716#define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */
2717#define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */
2718
2719/*
2720 * ext4_split_extent_at() splits an extent at given block. 2789 * ext4_split_extent_at() splits an extent at given block.
2721 * 2790 *
2722 * @handle: the journal handle 2791 * @handle: the journal handle
@@ -3224,11 +3293,13 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
3224 depth = ext_depth(inode); 3293 depth = ext_depth(inode);
3225 eh = path[depth].p_hdr; 3294 eh = path[depth].p_hdr;
3226 3295
3227 if (unlikely(!eh->eh_entries)) { 3296 /*
 3228		EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and "	3297	 * We're going to remove EOFBLOCKS_FL entirely in the future, so
 3229				 "EOFBLOCKS_FL set");			3298	 * we do not care about this case anymore. Simply remove the flag
3230 return -EIO; 3299 * if there are no extents.
3231 } 3300 */
3301 if (unlikely(!eh->eh_entries))
3302 goto out;
3232 last_ex = EXT_LAST_EXTENT(eh); 3303 last_ex = EXT_LAST_EXTENT(eh);
3233 /* 3304 /*
3234 * We should clear the EOFBLOCKS_FL flag if we are writing the 3305 * We should clear the EOFBLOCKS_FL flag if we are writing the
@@ -3252,6 +3323,7 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
3252 for (i = depth-1; i >= 0; i--) 3323 for (i = depth-1; i >= 0; i--)
3253 if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr)) 3324 if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
3254 return 0; 3325 return 0;
3326out:
3255 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); 3327 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3256 return ext4_mark_inode_dirty(handle, inode); 3328 return ext4_mark_inode_dirty(handle, inode);
3257} 3329}
@@ -3710,8 +3782,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3710 int free_on_err = 0, err = 0, depth, ret; 3782 int free_on_err = 0, err = 0, depth, ret;
3711 unsigned int allocated = 0, offset = 0; 3783 unsigned int allocated = 0, offset = 0;
3712 unsigned int allocated_clusters = 0; 3784 unsigned int allocated_clusters = 0;
3713 unsigned int punched_out = 0;
3714 unsigned int result = 0;
3715 struct ext4_allocation_request ar; 3785 struct ext4_allocation_request ar;
3716 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; 3786 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3717 ext4_lblk_t cluster_offset; 3787 ext4_lblk_t cluster_offset;
@@ -3721,8 +3791,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3721 trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); 3791 trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
3722 3792
3723 /* check in cache */ 3793 /* check in cache */
3724 if (!(flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) && 3794 if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
3725 ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
3726 if (!newex.ee_start_lo && !newex.ee_start_hi) { 3795 if (!newex.ee_start_lo && !newex.ee_start_hi) {
3727 if ((sbi->s_cluster_ratio > 1) && 3796 if ((sbi->s_cluster_ratio > 1) &&
3728 ext4_find_delalloc_cluster(inode, map->m_lblk, 0)) 3797 ext4_find_delalloc_cluster(inode, map->m_lblk, 0))
@@ -3790,113 +3859,25 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3790 3859
3791 /* if found extent covers block, simply return it */ 3860 /* if found extent covers block, simply return it */
3792 if (in_range(map->m_lblk, ee_block, ee_len)) { 3861 if (in_range(map->m_lblk, ee_block, ee_len)) {
3793 struct ext4_map_blocks punch_map;
3794 ext4_fsblk_t partial_cluster = 0;
3795
3796 newblock = map->m_lblk - ee_block + ee_start; 3862 newblock = map->m_lblk - ee_block + ee_start;
3797 /* number of remaining blocks in the extent */ 3863 /* number of remaining blocks in the extent */
3798 allocated = ee_len - (map->m_lblk - ee_block); 3864 allocated = ee_len - (map->m_lblk - ee_block);
3799 ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk, 3865 ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
3800 ee_block, ee_len, newblock); 3866 ee_block, ee_len, newblock);
3801 3867
3802 if ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0) {
3803 /*
3804 * Do not put uninitialized extent
3805 * in the cache
3806 */
3807 if (!ext4_ext_is_uninitialized(ex)) {
3808 ext4_ext_put_in_cache(inode, ee_block,
3809 ee_len, ee_start);
3810 goto out;
3811 }
3812 ret = ext4_ext_handle_uninitialized_extents(
3813 handle, inode, map, path, flags,
3814 allocated, newblock);
3815 return ret;
3816 }
3817
3818 /*
3819 * Punch out the map length, but only to the
3820 * end of the extent
3821 */
3822 punched_out = allocated < map->m_len ?
3823 allocated : map->m_len;
3824
3825 /* 3868 /*
3826 * Sense extents need to be converted to 3869 * Do not put uninitialized extent
3827 * uninitialized, they must fit in an 3870 * in the cache
3828 * uninitialized extent
3829 */ 3871 */
3830 if (punched_out > EXT_UNINIT_MAX_LEN) 3872 if (!ext4_ext_is_uninitialized(ex)) {
3831 punched_out = EXT_UNINIT_MAX_LEN; 3873 ext4_ext_put_in_cache(inode, ee_block,
3832 3874 ee_len, ee_start);
3833 punch_map.m_lblk = map->m_lblk; 3875 goto out;
3834 punch_map.m_pblk = newblock;
3835 punch_map.m_len = punched_out;
3836 punch_map.m_flags = 0;
3837
3838 /* Check to see if the extent needs to be split */
3839 if (punch_map.m_len != ee_len ||
3840 punch_map.m_lblk != ee_block) {
3841
3842 ret = ext4_split_extent(handle, inode,
3843 path, &punch_map, 0,
3844 EXT4_GET_BLOCKS_PUNCH_OUT_EXT |
3845 EXT4_GET_BLOCKS_PRE_IO);
3846
3847 if (ret < 0) {
3848 err = ret;
3849 goto out2;
3850 }
3851 /*
3852 * find extent for the block at
3853 * the start of the hole
3854 */
3855 ext4_ext_drop_refs(path);
3856 kfree(path);
3857
3858 path = ext4_ext_find_extent(inode,
3859 map->m_lblk, NULL);
3860 if (IS_ERR(path)) {
3861 err = PTR_ERR(path);
3862 path = NULL;
3863 goto out2;
3864 }
3865
3866 depth = ext_depth(inode);
3867 ex = path[depth].p_ext;
3868 ee_len = ext4_ext_get_actual_len(ex);
3869 ee_block = le32_to_cpu(ex->ee_block);
3870 ee_start = ext4_ext_pblock(ex);
3871
3872 }
3873
3874 ext4_ext_mark_uninitialized(ex);
3875
3876 ext4_ext_invalidate_cache(inode);
3877
3878 err = ext4_ext_rm_leaf(handle, inode, path,
3879 &partial_cluster, map->m_lblk,
3880 map->m_lblk + punched_out);
3881
3882 if (!err && path->p_hdr->eh_entries == 0) {
3883 /*
3884 * Punch hole freed all of this sub tree,
3885 * so we need to correct eh_depth
3886 */
3887 err = ext4_ext_get_access(handle, inode, path);
3888 if (err == 0) {
3889 ext_inode_hdr(inode)->eh_depth = 0;
3890 ext_inode_hdr(inode)->eh_max =
3891 cpu_to_le16(ext4_ext_space_root(
3892 inode, 0));
3893
3894 err = ext4_ext_dirty(
3895 handle, inode, path);
3896 }
3897 } 3876 }
3898 3877 ret = ext4_ext_handle_uninitialized_extents(
3899 goto out2; 3878 handle, inode, map, path, flags,
3879 allocated, newblock);
3880 return ret;
3900 } 3881 }
3901 } 3882 }
3902 3883
@@ -4165,13 +4146,11 @@ out2:
4165 ext4_ext_drop_refs(path); 4146 ext4_ext_drop_refs(path);
4166 kfree(path); 4147 kfree(path);
4167 } 4148 }
4168 result = (flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) ?
4169 punched_out : allocated;
4170 4149
4171 trace_ext4_ext_map_blocks_exit(inode, map->m_lblk, 4150 trace_ext4_ext_map_blocks_exit(inode, map->m_lblk,
4172 newblock, map->m_len, err ? err : result); 4151 newblock, map->m_len, err ? err : allocated);
4173 4152
4174 return err ? err : result; 4153 return err ? err : allocated;
4175} 4154}
4176 4155
4177void ext4_ext_truncate(struct inode *inode) 4156void ext4_ext_truncate(struct inode *inode)
@@ -4228,7 +4207,7 @@ void ext4_ext_truncate(struct inode *inode)
4228 4207
4229 last_block = (inode->i_size + sb->s_blocksize - 1) 4208 last_block = (inode->i_size + sb->s_blocksize - 1)
4230 >> EXT4_BLOCK_SIZE_BITS(sb); 4209 >> EXT4_BLOCK_SIZE_BITS(sb);
4231 err = ext4_ext_remove_space(inode, last_block); 4210 err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
4232 4211
4233 /* In a multi-transaction truncate, we only make the final 4212 /* In a multi-transaction truncate, we only make the final
4234 * transaction synchronous. 4213 * transaction synchronous.
@@ -4436,10 +4415,11 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
4436 EXT4_GET_BLOCKS_IO_CONVERT_EXT); 4415 EXT4_GET_BLOCKS_IO_CONVERT_EXT);
4437 if (ret <= 0) { 4416 if (ret <= 0) {
4438 WARN_ON(ret <= 0); 4417 WARN_ON(ret <= 0);
4439 printk(KERN_ERR "%s: ext4_ext_map_blocks " 4418 ext4_msg(inode->i_sb, KERN_ERR,
4440 "returned error inode#%lu, block=%u, " 4419 "%s:%d: inode #%lu: block %u: len %u: "
4441 "max_blocks=%u", __func__, 4420 "ext4_ext_map_blocks returned %d",
4442 inode->i_ino, map.m_lblk, map.m_len); 4421 __func__, __LINE__, inode->i_ino, map.m_lblk,
4422 map.m_len, ret);
4443 } 4423 }
4444 ext4_mark_inode_dirty(handle, inode); 4424 ext4_mark_inode_dirty(handle, inode);
4445 ret2 = ext4_journal_stop(handle); 4425 ret2 = ext4_journal_stop(handle);
@@ -4705,14 +4685,12 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
4705{ 4685{
4706 struct inode *inode = file->f_path.dentry->d_inode; 4686 struct inode *inode = file->f_path.dentry->d_inode;
4707 struct super_block *sb = inode->i_sb; 4687 struct super_block *sb = inode->i_sb;
4708 struct ext4_ext_cache cache_ex; 4688 ext4_lblk_t first_block, stop_block;
4709 ext4_lblk_t first_block, last_block, num_blocks, iblock, max_blocks;
4710 struct address_space *mapping = inode->i_mapping; 4689 struct address_space *mapping = inode->i_mapping;
4711 struct ext4_map_blocks map;
4712 handle_t *handle; 4690 handle_t *handle;
4713 loff_t first_page, last_page, page_len; 4691 loff_t first_page, last_page, page_len;
4714 loff_t first_page_offset, last_page_offset; 4692 loff_t first_page_offset, last_page_offset;
4715 int ret, credits, blocks_released, err = 0; 4693 int credits, err = 0;
4716 4694
4717 /* No need to punch hole beyond i_size */ 4695 /* No need to punch hole beyond i_size */
4718 if (offset >= inode->i_size) 4696 if (offset >= inode->i_size)
@@ -4728,10 +4706,6 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
4728 offset; 4706 offset;
4729 } 4707 }
4730 4708
4731 first_block = (offset + sb->s_blocksize - 1) >>
4732 EXT4_BLOCK_SIZE_BITS(sb);
4733 last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
4734
4735 first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 4709 first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
4736 last_page = (offset + length) >> PAGE_CACHE_SHIFT; 4710 last_page = (offset + length) >> PAGE_CACHE_SHIFT;
4737 4711
@@ -4810,7 +4784,6 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
4810 } 4784 }
4811 } 4785 }
4812 4786
4813
4814 /* 4787 /*
4815 * If i_size is contained in the last page, we need to 4788 * If i_size is contained in the last page, we need to
4816 * unmap and zero the partial page after i_size 4789 * unmap and zero the partial page after i_size
@@ -4830,73 +4803,22 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
4830 } 4803 }
4831 } 4804 }
4832 4805
4806 first_block = (offset + sb->s_blocksize - 1) >>
4807 EXT4_BLOCK_SIZE_BITS(sb);
4808 stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
4809
4833 /* If there are no blocks to remove, return now */ 4810 /* If there are no blocks to remove, return now */
4834 if (first_block >= last_block) 4811 if (first_block >= stop_block)
4835 goto out; 4812 goto out;
4836 4813
4837 down_write(&EXT4_I(inode)->i_data_sem); 4814 down_write(&EXT4_I(inode)->i_data_sem);
4838 ext4_ext_invalidate_cache(inode); 4815 ext4_ext_invalidate_cache(inode);
4839 ext4_discard_preallocations(inode); 4816 ext4_discard_preallocations(inode);
4840 4817
4841 /* 4818 err = ext4_ext_remove_space(inode, first_block, stop_block - 1);
4842 * Loop over all the blocks and identify blocks
4843 * that need to be punched out
4844 */
4845 iblock = first_block;
4846 blocks_released = 0;
4847 while (iblock < last_block) {
4848 max_blocks = last_block - iblock;
4849 num_blocks = 1;
4850 memset(&map, 0, sizeof(map));
4851 map.m_lblk = iblock;
4852 map.m_len = max_blocks;
4853 ret = ext4_ext_map_blocks(handle, inode, &map,
4854 EXT4_GET_BLOCKS_PUNCH_OUT_EXT);
4855
4856 if (ret > 0) {
4857 blocks_released += ret;
4858 num_blocks = ret;
4859 } else if (ret == 0) {
4860 /*
4861 * If map blocks could not find the block,
4862 * then it is in a hole. If the hole was
4863 * not already cached, then map blocks should
4864 * put it in the cache. So we can get the hole
4865 * out of the cache
4866 */
4867 memset(&cache_ex, 0, sizeof(cache_ex));
4868 if ((ext4_ext_check_cache(inode, iblock, &cache_ex)) &&
4869 !cache_ex.ec_start) {
4870
4871 /* The hole is cached */
4872 num_blocks = cache_ex.ec_block +
4873 cache_ex.ec_len - iblock;
4874
4875 } else {
4876 /* The block could not be identified */
4877 err = -EIO;
4878 break;
4879 }
4880 } else {
4881 /* Map blocks error */
4882 err = ret;
4883 break;
4884 }
4885
4886 if (num_blocks == 0) {
4887 /* This condition should never happen */
4888 ext_debug("Block lookup failed");
4889 err = -EIO;
4890 break;
4891 }
4892
4893 iblock += num_blocks;
4894 }
4895 4819
4896 if (blocks_released > 0) { 4820 ext4_ext_invalidate_cache(inode);
4897 ext4_ext_invalidate_cache(inode); 4821 ext4_discard_preallocations(inode);
4898 ext4_discard_preallocations(inode);
4899 }
4900 4822
4901 if (IS_SYNC(inode)) 4823 if (IS_SYNC(inode))
4902 ext4_handle_sync(handle); 4824 ext4_handle_sync(handle);
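
The first_block/stop_block arithmetic above rounds the start of the hole
up to a block boundary and the end down, so only blocks lying entirely
inside [offset, offset+length) are removed; ext4_ext_remove_space() is
then handed stop_block - 1 as its inclusive last block. A quick
user-space check of that math, with illustrative values only:

#include <stdio.h>

int main(void)
{
	unsigned long long blocksize = 4096, bits = 12;
	unsigned long long offset = 5000, length = 20000;

	/* Same expressions as the hunk above. */
	unsigned long long first_block = (offset + blocksize - 1) >> bits;
	unsigned long long stop_block  = (offset + length) >> bits;

	/* first_block = 2 (byte 8192), stop_block = 6 (25000 >> 12) */
	printf("remove blocks [%llu, %llu)\n", first_block, stop_block);
	return 0;
}

Blocks 2..5 (bytes 8192..24575) fall wholly inside the requested byte
range, so nothing outside the hole is ever freed.
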
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 00a2cb753efd..bb6c7d811313 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -89,6 +89,7 @@ int ext4_flush_completed_IO(struct inode *inode)
89 io = list_entry(ei->i_completed_io_list.next, 89 io = list_entry(ei->i_completed_io_list.next,
90 ext4_io_end_t, list); 90 ext4_io_end_t, list);
91 list_del_init(&io->list); 91 list_del_init(&io->list);
92 io->flag |= EXT4_IO_END_IN_FSYNC;
92 /* 93 /*
93 * Calling ext4_end_io_nolock() to convert completed 94 * Calling ext4_end_io_nolock() to convert completed
94 * IO to written. 95 * IO to written.
@@ -108,6 +109,7 @@ int ext4_flush_completed_IO(struct inode *inode)
108 if (ret < 0) 109 if (ret < 0)
109 ret2 = ret; 110 ret2 = ret;
110 spin_lock_irqsave(&ei->i_completed_io_lock, flags); 111 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
112 io->flag &= ~EXT4_IO_END_IN_FSYNC;
111 } 113 }
112 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 114 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
113 return (ret2 < 0) ? ret2 : 0; 115 return (ret2 < 0) ? ret2 : 0;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 25d8c9781ad9..409c2ee7750a 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -92,6 +92,16 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb,
92 return EXT4_INODES_PER_GROUP(sb); 92 return EXT4_INODES_PER_GROUP(sb);
93} 93}
94 94
95void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate)
96{
97 if (uptodate) {
98 set_buffer_uptodate(bh);
99 set_bitmap_uptodate(bh);
100 }
101 unlock_buffer(bh);
102 put_bh(bh);
103}
104
95/* 105/*
96 * Read the inode allocation bitmap for a given block_group, reading 106 * Read the inode allocation bitmap for a given block_group, reading
97 * into the specified slot in the superblock's bitmap cache. 107 * into the specified slot in the superblock's bitmap cache.
@@ -147,18 +157,18 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
147 return bh; 157 return bh;
148 } 158 }
149 /* 159 /*
150 * submit the buffer_head for read. We can 160 * submit the buffer_head for reading
151 * safely mark the bitmap as uptodate now.
152 * We do it here so the bitmap uptodate bit
153 * get set with buffer lock held.
154 */ 161 */
155 trace_ext4_load_inode_bitmap(sb, block_group); 162 trace_ext4_load_inode_bitmap(sb, block_group);
156 set_bitmap_uptodate(bh); 163 bh->b_end_io = ext4_end_bitmap_read;
157 if (bh_submit_read(bh) < 0) { 164 get_bh(bh);
165 submit_bh(READ, bh);
166 wait_on_buffer(bh);
167 if (!buffer_uptodate(bh)) {
158 put_bh(bh); 168 put_bh(bh);
159 ext4_error(sb, "Cannot read inode bitmap - " 169 ext4_error(sb, "Cannot read inode bitmap - "
160 "block_group = %u, inode_bitmap = %llu", 170 "block_group = %u, inode_bitmap = %llu",
161 block_group, bitmap_blk); 171 block_group, bitmap_blk);
162 return NULL; 172 return NULL;
163 } 173 }
164 return bh; 174 return bh;
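
The read path above replaces bh_submit_read(), which marked the bitmap
uptodate before submission, with an explicit completion handler that sets
the uptodate bits only once the read has actually succeeded. A rough
user-space analogue of that pattern, with a worker thread standing in for
the block layer (fake_bh and everything around it is hypothetical):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_bh {
	pthread_mutex_t lock;
	pthread_cond_t  done;
	bool io_done, uptodate, bitmap_uptodate;
	void (*end_io)(struct fake_bh *, int);
};

static void end_bitmap_read(struct fake_bh *bh, int ok)
{
	pthread_mutex_lock(&bh->lock);
	if (ok) {			/* mark uptodate only on success */
		bh->uptodate = true;
		bh->bitmap_uptodate = true;
	}
	bh->io_done = true;
	pthread_cond_signal(&bh->done);
	pthread_mutex_unlock(&bh->lock);
}

static void *fake_block_layer(void *arg)	/* "completes" the read */
{
	struct fake_bh *bh = arg;
	bh->end_io(bh, 1);
	return NULL;
}

int main(void)
{
	struct fake_bh bh = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.done = PTHREAD_COND_INITIALIZER,
		.end_io = end_bitmap_read,
	};
	pthread_t t;

	pthread_create(&t, NULL, fake_block_layer, &bh);	/* submit */
	pthread_mutex_lock(&bh.lock);
	while (!bh.io_done)					/* wait   */
		pthread_cond_wait(&bh.done, &bh.lock);
	pthread_mutex_unlock(&bh.lock);
	pthread_join(t, NULL);

	printf("uptodate=%d bitmap_uptodate=%d\n",
	       bh.uptodate, bh.bitmap_uptodate);
	return 0;
}

If the "I/O" fails, neither flag gets set and the caller sees the buffer
as not uptodate, exactly the error branch in the hunk above.
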
@@ -194,19 +204,20 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
194 struct ext4_sb_info *sbi; 204 struct ext4_sb_info *sbi;
195 int fatal = 0, err, count, cleared; 205 int fatal = 0, err, count, cleared;
196 206
197 if (atomic_read(&inode->i_count) > 1) { 207 if (!sb) {
198 printk(KERN_ERR "ext4_free_inode: inode has count=%d\n", 208 printk(KERN_ERR "EXT4-fs: %s:%d: inode on "
199 atomic_read(&inode->i_count)); 209 "nonexistent device\n", __func__, __LINE__);
200 return; 210 return;
201 } 211 }
202 if (inode->i_nlink) { 212 if (atomic_read(&inode->i_count) > 1) {
203 printk(KERN_ERR "ext4_free_inode: inode has nlink=%d\n", 213 ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: count=%d",
204 inode->i_nlink); 214 __func__, __LINE__, inode->i_ino,
215 atomic_read(&inode->i_count));
205 return; 216 return;
206 } 217 }
207 if (!sb) { 218 if (inode->i_nlink) {
 208		printk(KERN_ERR "ext4_free_inode: inode on "	 219		ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: nlink=%d",
209 "nonexistent device\n"); 220 __func__, __LINE__, inode->i_ino, inode->i_nlink);
210 return; 221 return;
211 } 222 }
212 sbi = EXT4_SB(sb); 223 sbi = EXT4_SB(sb);
@@ -593,94 +604,6 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
593} 604}
594 605
595/* 606/*
596 * claim the inode from the inode bitmap. If the group
597 * is uninit we need to take the groups's ext4_group_lock
598 * and clear the uninit flag. The inode bitmap update
599 * and group desc uninit flag clear should be done
600 * after holding ext4_group_lock so that ext4_read_inode_bitmap
601 * doesn't race with the ext4_claim_inode
602 */
603static int ext4_claim_inode(struct super_block *sb,
604 struct buffer_head *inode_bitmap_bh,
605 unsigned long ino, ext4_group_t group, umode_t mode)
606{
607 int free = 0, retval = 0, count;
608 struct ext4_sb_info *sbi = EXT4_SB(sb);
609 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
610 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
611
612 /*
613 * We have to be sure that new inode allocation does not race with
614 * inode table initialization, because otherwise we may end up
615 * allocating and writing new inode right before sb_issue_zeroout
616 * takes place and overwriting our new inode with zeroes. So we
617 * take alloc_sem to prevent it.
618 */
619 down_read(&grp->alloc_sem);
620 ext4_lock_group(sb, group);
621 if (ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data)) {
622 /* not a free inode */
623 retval = 1;
624 goto err_ret;
625 }
626 ino++;
627 if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
628 ino > EXT4_INODES_PER_GROUP(sb)) {
629 ext4_unlock_group(sb, group);
630 up_read(&grp->alloc_sem);
631 ext4_error(sb, "reserved inode or inode > inodes count - "
632 "block_group = %u, inode=%lu", group,
633 ino + group * EXT4_INODES_PER_GROUP(sb));
634 return 1;
635 }
636 /* If we didn't allocate from within the initialized part of the inode
637 * table then we need to initialize up to this inode. */
638 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
639
640 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
641 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
642 /* When marking the block group with
643 * ~EXT4_BG_INODE_UNINIT we don't want to depend
644 * on the value of bg_itable_unused even though
645 * mke2fs could have initialized the same for us.
646 * Instead we calculated the value below
647 */
648
649 free = 0;
650 } else {
651 free = EXT4_INODES_PER_GROUP(sb) -
652 ext4_itable_unused_count(sb, gdp);
653 }
654
655 /*
656 * Check the relative inode number against the last used
657 * relative inode number in this group. if it is greater
658 * we need to update the bg_itable_unused count
659 *
660 */
661 if (ino > free)
662 ext4_itable_unused_set(sb, gdp,
663 (EXT4_INODES_PER_GROUP(sb) - ino));
664 }
665 count = ext4_free_inodes_count(sb, gdp) - 1;
666 ext4_free_inodes_set(sb, gdp, count);
667 if (S_ISDIR(mode)) {
668 count = ext4_used_dirs_count(sb, gdp) + 1;
669 ext4_used_dirs_set(sb, gdp, count);
670 if (sbi->s_log_groups_per_flex) {
671 ext4_group_t f = ext4_flex_group(sbi, group);
672
673 atomic_inc(&sbi->s_flex_groups[f].used_dirs);
674 }
675 }
676 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
677err_ret:
678 ext4_unlock_group(sb, group);
679 up_read(&grp->alloc_sem);
680 return retval;
681}
682
683/*
684 * There are two policies for allocating an inode. If the new inode is 607 * There are two policies for allocating an inode. If the new inode is
685 * a directory, then a forward search is made for a block group with both 608 * a directory, then a forward search is made for a block group with both
686 * free space and a low directory-to-inode ratio; if that fails, then of 609 * free space and a low directory-to-inode ratio; if that fails, then of
@@ -741,6 +664,11 @@ got_group:
741 if (ret2 == -1) 664 if (ret2 == -1)
742 goto out; 665 goto out;
743 666
667 /*
668 * Normally we will only go through one pass of this loop,
669 * unless we get unlucky and it turns out the group we selected
670 * had its last inode grabbed by someone else.
671 */
744 for (i = 0; i < ngroups; i++, ino = 0) { 672 for (i = 0; i < ngroups; i++, ino = 0) {
745 err = -EIO; 673 err = -EIO;
746 674
@@ -757,51 +685,24 @@ repeat_in_this_group:
757 ino = ext4_find_next_zero_bit((unsigned long *) 685 ino = ext4_find_next_zero_bit((unsigned long *)
758 inode_bitmap_bh->b_data, 686 inode_bitmap_bh->b_data,
759 EXT4_INODES_PER_GROUP(sb), ino); 687 EXT4_INODES_PER_GROUP(sb), ino);
760 688 if (ino >= EXT4_INODES_PER_GROUP(sb)) {
761 if (ino < EXT4_INODES_PER_GROUP(sb)) { 689 if (++group == ngroups)
762 690 group = 0;
763 BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); 691 continue;
764 err = ext4_journal_get_write_access(handle,
765 inode_bitmap_bh);
766 if (err)
767 goto fail;
768
769 BUFFER_TRACE(group_desc_bh, "get_write_access");
770 err = ext4_journal_get_write_access(handle,
771 group_desc_bh);
772 if (err)
773 goto fail;
774 if (!ext4_claim_inode(sb, inode_bitmap_bh,
775 ino, group, mode)) {
776 /* we won it */
777 BUFFER_TRACE(inode_bitmap_bh,
778 "call ext4_handle_dirty_metadata");
779 err = ext4_handle_dirty_metadata(handle,
780 NULL,
781 inode_bitmap_bh);
782 if (err)
783 goto fail;
784 /* zero bit is inode number 1*/
785 ino++;
786 goto got;
787 }
788 /* we lost it */
789 ext4_handle_release_buffer(handle, inode_bitmap_bh);
790 ext4_handle_release_buffer(handle, group_desc_bh);
791
792 if (++ino < EXT4_INODES_PER_GROUP(sb))
793 goto repeat_in_this_group;
794 } 692 }
795 693 if (group == 0 && (ino+1) < EXT4_FIRST_INO(sb)) {
796 /* 694 ext4_error(sb, "reserved inode found cleared - "
797 * This case is possible in concurrent environment. It is very 695 "inode=%lu", ino + 1);
798 * rare. We cannot repeat the find_group_xxx() call because 696 continue;
799 * that will simply return the same blockgroup, because the 697 }
800 * group descriptor metadata has not yet been updated. 698 ext4_lock_group(sb, group);
801 * So we just go onto the next blockgroup. 699 ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data);
802 */ 700 ext4_unlock_group(sb, group);
803 if (++group == ngroups) 701 ino++; /* the inode bitmap is zero-based */
804 group = 0; 702 if (!ret2)
703 goto got; /* we grabbed the inode! */
704 if (ino < EXT4_INODES_PER_GROUP(sb))
705 goto repeat_in_this_group;
805 } 706 }
806 err = -ENOSPC; 707 err = -ENOSPC;
807 goto out; 708 goto out;
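
The loop above claims an inode with ext4_test_and_set_bit() under
ext4_lock_group() before taking any journal write access, replacing the
heavyweight ext4_claim_inode() path; buffer dirtying and descriptor
updates move to the winner after 'got', as the next hunk shows. A toy
single-group version of the claim step using C11 atomics (the kernel's
group lock of course serializes more state than just this one bit):

#include <stdatomic.h>
#include <stdio.h>

#define INODES_PER_GROUP 32

static atomic_ulong bitmap;	/* one group's inode bitmap, bit = used */

/* Returns the claimed 1-based inode number, or 0 if the group is full. */
static unsigned long claim_inode(void)
{
	for (unsigned long ino = 0; ino < INODES_PER_GROUP; ino++) {
		unsigned long bit = 1UL << ino;

		if (atomic_load(&bitmap) & bit)
			continue;		/* looks busy, keep scanning */
		if (!(atomic_fetch_or(&bitmap, bit) & bit))
			return ino + 1;		/* we grabbed the inode */
		/* lost the race for this bit; retry within the group */
	}
	return 0;				/* -ENOSPC in the kernel */
}

int main(void)
{
	printf("claimed inode %lu\n", claim_inode());	/* 1 */
	printf("claimed inode %lu\n", claim_inode());	/* 2 */
	return 0;
}
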
@@ -838,6 +739,59 @@ got:
838 if (err) 739 if (err)
839 goto fail; 740 goto fail;
840 } 741 }
742
743 BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
744 err = ext4_journal_get_write_access(handle, inode_bitmap_bh);
745 if (err)
746 goto fail;
747
748 BUFFER_TRACE(group_desc_bh, "get_write_access");
749 err = ext4_journal_get_write_access(handle, group_desc_bh);
750 if (err)
751 goto fail;
752
753 /* Update the relevant bg descriptor fields */
754 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
755 int free;
756 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
757
758 down_read(&grp->alloc_sem); /* protect vs itable lazyinit */
759 ext4_lock_group(sb, group); /* while we modify the bg desc */
760 free = EXT4_INODES_PER_GROUP(sb) -
761 ext4_itable_unused_count(sb, gdp);
762 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
763 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
764 free = 0;
765 }
766 /*
767 * Check the relative inode number against the last used
 768	 * relative inode number in this group. If it is greater,
 769	 * we need to update the bg_itable_unused count.
770 */
771 if (ino > free)
772 ext4_itable_unused_set(sb, gdp,
773 (EXT4_INODES_PER_GROUP(sb) - ino));
774 up_read(&grp->alloc_sem);
775 }
776 ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1);
777 if (S_ISDIR(mode)) {
778 ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1);
779 if (sbi->s_log_groups_per_flex) {
780 ext4_group_t f = ext4_flex_group(sbi, group);
781
782 atomic_inc(&sbi->s_flex_groups[f].used_dirs);
783 }
784 }
785 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
786 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
787 ext4_unlock_group(sb, group);
788 }
789
790 BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
791 err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh);
792 if (err)
793 goto fail;
794
841 BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata"); 795 BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata");
842 err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh); 796 err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh);
843 if (err) 797 if (err)
@@ -1101,7 +1055,7 @@ unsigned long ext4_count_dirs(struct super_block * sb)
1101 * where it is called from on active part of filesystem is ext4lazyinit 1055 * where it is called from on active part of filesystem is ext4lazyinit
1102 * thread, so we do not need any special locks, however we have to prevent 1056 * thread, so we do not need any special locks, however we have to prevent
1103 * inode allocation from the current group, so we take alloc_sem lock, to 1057 * inode allocation from the current group, so we take alloc_sem lock, to
1104 * block ext4_claim_inode until we are finished. 1058 * block ext4_new_inode() until we are finished.
1105 */ 1059 */
1106int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, 1060int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
1107 int barrier) 1061 int barrier)
@@ -1149,9 +1103,9 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
1149 sbi->s_inodes_per_block); 1103 sbi->s_inodes_per_block);
1150 1104
1151 if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group)) { 1105 if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group)) {
1152 ext4_error(sb, "Something is wrong with group %u\n" 1106 ext4_error(sb, "Something is wrong with group %u: "
1153 "Used itable blocks: %d" 1107 "used itable blocks: %d; "
1154 "itable unused count: %u\n", 1108 "itable unused count: %u",
1155 group, used_blks, 1109 group, used_blks,
1156 ext4_itable_unused_count(sb, gdp)); 1110 ext4_itable_unused_count(sb, gdp));
1157 ret = 1; 1111 ret = 1;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index feaa82fe629d..c77b0bd2c711 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -272,7 +272,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
272 trace_ext4_da_update_reserve_space(inode, used, quota_claim); 272 trace_ext4_da_update_reserve_space(inode, used, quota_claim);
273 if (unlikely(used > ei->i_reserved_data_blocks)) { 273 if (unlikely(used > ei->i_reserved_data_blocks)) {
274 ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d " 274 ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
275 "with only %d reserved data blocks\n", 275 "with only %d reserved data blocks",
276 __func__, inode->i_ino, used, 276 __func__, inode->i_ino, used,
277 ei->i_reserved_data_blocks); 277 ei->i_reserved_data_blocks);
278 WARN_ON(1); 278 WARN_ON(1);
@@ -1165,7 +1165,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
1165 */ 1165 */
1166 ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: " 1166 ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: "
1167 "ino %lu, to_free %d with only %d reserved " 1167 "ino %lu, to_free %d with only %d reserved "
1168 "data blocks\n", inode->i_ino, to_free, 1168 "data blocks", inode->i_ino, to_free,
1169 ei->i_reserved_data_blocks); 1169 ei->i_reserved_data_blocks);
1170 WARN_ON(1); 1170 WARN_ON(1);
1171 to_free = ei->i_reserved_data_blocks; 1171 to_free = ei->i_reserved_data_blocks;
@@ -1428,20 +1428,22 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
1428static void ext4_print_free_blocks(struct inode *inode) 1428static void ext4_print_free_blocks(struct inode *inode)
1429{ 1429{
1430 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1430 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1431 printk(KERN_CRIT "Total free blocks count %lld\n", 1431 struct super_block *sb = inode->i_sb;
1432
1433 ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld",
1432 EXT4_C2B(EXT4_SB(inode->i_sb), 1434 EXT4_C2B(EXT4_SB(inode->i_sb),
1433 ext4_count_free_clusters(inode->i_sb))); 1435 ext4_count_free_clusters(inode->i_sb)));
1434 printk(KERN_CRIT "Free/Dirty block details\n"); 1436 ext4_msg(sb, KERN_CRIT, "Free/Dirty block details");
1435 printk(KERN_CRIT "free_blocks=%lld\n", 1437 ext4_msg(sb, KERN_CRIT, "free_blocks=%lld",
1436 (long long) EXT4_C2B(EXT4_SB(inode->i_sb), 1438 (long long) EXT4_C2B(EXT4_SB(inode->i_sb),
1437 percpu_counter_sum(&sbi->s_freeclusters_counter))); 1439 percpu_counter_sum(&sbi->s_freeclusters_counter)));
1438 printk(KERN_CRIT "dirty_blocks=%lld\n", 1440 ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld",
1439 (long long) EXT4_C2B(EXT4_SB(inode->i_sb), 1441 (long long) EXT4_C2B(EXT4_SB(inode->i_sb),
1440 percpu_counter_sum(&sbi->s_dirtyclusters_counter))); 1442 percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
1441 printk(KERN_CRIT "Block reservation details\n"); 1443 ext4_msg(sb, KERN_CRIT, "Block reservation details");
1442 printk(KERN_CRIT "i_reserved_data_blocks=%u\n", 1444 ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u",
1443 EXT4_I(inode)->i_reserved_data_blocks); 1445 EXT4_I(inode)->i_reserved_data_blocks);
1444 printk(KERN_CRIT "i_reserved_meta_blocks=%u\n", 1446 ext4_msg(sb, KERN_CRIT, "i_reserved_meta_blocks=%u",
1445 EXT4_I(inode)->i_reserved_meta_blocks); 1447 EXT4_I(inode)->i_reserved_meta_blocks);
1446 return; 1448 return;
1447} 1449}
@@ -2482,13 +2484,14 @@ static int ext4_da_write_end(struct file *file,
2482 int write_mode = (int)(unsigned long)fsdata; 2484 int write_mode = (int)(unsigned long)fsdata;
2483 2485
2484 if (write_mode == FALL_BACK_TO_NONDELALLOC) { 2486 if (write_mode == FALL_BACK_TO_NONDELALLOC) {
2485 if (ext4_should_order_data(inode)) { 2487 switch (ext4_inode_journal_mode(inode)) {
2488 case EXT4_INODE_ORDERED_DATA_MODE:
2486 return ext4_ordered_write_end(file, mapping, pos, 2489 return ext4_ordered_write_end(file, mapping, pos,
2487 len, copied, page, fsdata); 2490 len, copied, page, fsdata);
2488 } else if (ext4_should_writeback_data(inode)) { 2491 case EXT4_INODE_WRITEBACK_DATA_MODE:
2489 return ext4_writeback_write_end(file, mapping, pos, 2492 return ext4_writeback_write_end(file, mapping, pos,
2490 len, copied, page, fsdata); 2493 len, copied, page, fsdata);
2491 } else { 2494 default:
2492 BUG(); 2495 BUG();
2493 } 2496 }
2494 } 2497 }
@@ -2763,7 +2766,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
2763 goto out; 2766 goto out;
2764 2767
2765 ext_debug("ext4_end_io_dio(): io_end 0x%p " 2768 ext_debug("ext4_end_io_dio(): io_end 0x%p "
2766 "for inode %lu, iocb 0x%p, offset %llu, size %llu\n", 2769 "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
2767 iocb->private, io_end->inode->i_ino, iocb, offset, 2770 iocb->private, io_end->inode->i_ino, iocb, offset,
2768 size); 2771 size);
2769 2772
@@ -2795,9 +2798,6 @@ out:
2795 2798
2796 /* queue the work to convert unwritten extents to written */ 2799 /* queue the work to convert unwritten extents to written */
2797 queue_work(wq, &io_end->work); 2800 queue_work(wq, &io_end->work);
2798
2799 /* XXX: probably should move into the real I/O completion handler */
2800 inode_dio_done(inode);
2801} 2801}
2802 2802
2803static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) 2803static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
@@ -2811,8 +2811,9 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
2811 goto out; 2811 goto out;
2812 2812
2813 if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) { 2813 if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) {
2814 printk("sb umounted, discard end_io request for inode %lu\n", 2814 ext4_msg(io_end->inode->i_sb, KERN_INFO,
2815 io_end->inode->i_ino); 2815 "sb umounted, discard end_io request for inode %lu",
2816 io_end->inode->i_ino);
2816 ext4_free_io_end(io_end); 2817 ext4_free_io_end(io_end);
2817 goto out; 2818 goto out;
2818 } 2819 }
@@ -2921,9 +2922,12 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
2921 iocb->private = NULL; 2922 iocb->private = NULL;
2922 EXT4_I(inode)->cur_aio_dio = NULL; 2923 EXT4_I(inode)->cur_aio_dio = NULL;
2923 if (!is_sync_kiocb(iocb)) { 2924 if (!is_sync_kiocb(iocb)) {
2924 iocb->private = ext4_init_io_end(inode, GFP_NOFS); 2925 ext4_io_end_t *io_end =
2925 if (!iocb->private) 2926 ext4_init_io_end(inode, GFP_NOFS);
2927 if (!io_end)
2926 return -ENOMEM; 2928 return -ENOMEM;
2929 io_end->flag |= EXT4_IO_END_DIRECT;
2930 iocb->private = io_end;
2927 /* 2931 /*
2928 * we save the io structure for current async 2932 * we save the io structure for current async
2929 * direct IO, so that later ext4_map_blocks() 2933 * direct IO, so that later ext4_map_blocks()
@@ -2940,7 +2944,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
2940 ext4_get_block_write, 2944 ext4_get_block_write,
2941 ext4_end_io_dio, 2945 ext4_end_io_dio,
2942 NULL, 2946 NULL,
2943 DIO_LOCKING | DIO_SKIP_HOLES); 2947 DIO_LOCKING);
2944 if (iocb->private) 2948 if (iocb->private)
2945 EXT4_I(inode)->cur_aio_dio = NULL; 2949 EXT4_I(inode)->cur_aio_dio = NULL;
2946 /* 2950 /*
@@ -3086,18 +3090,25 @@ static const struct address_space_operations ext4_da_aops = {
3086 3090
3087void ext4_set_aops(struct inode *inode) 3091void ext4_set_aops(struct inode *inode)
3088{ 3092{
3089 if (ext4_should_order_data(inode) && 3093 switch (ext4_inode_journal_mode(inode)) {
3090 test_opt(inode->i_sb, DELALLOC)) 3094 case EXT4_INODE_ORDERED_DATA_MODE:
3091 inode->i_mapping->a_ops = &ext4_da_aops; 3095 if (test_opt(inode->i_sb, DELALLOC))
3092 else if (ext4_should_order_data(inode)) 3096 inode->i_mapping->a_ops = &ext4_da_aops;
3093 inode->i_mapping->a_ops = &ext4_ordered_aops; 3097 else
3094 else if (ext4_should_writeback_data(inode) && 3098 inode->i_mapping->a_ops = &ext4_ordered_aops;
3095 test_opt(inode->i_sb, DELALLOC)) 3099 break;
3096 inode->i_mapping->a_ops = &ext4_da_aops; 3100 case EXT4_INODE_WRITEBACK_DATA_MODE:
3097 else if (ext4_should_writeback_data(inode)) 3101 if (test_opt(inode->i_sb, DELALLOC))
3098 inode->i_mapping->a_ops = &ext4_writeback_aops; 3102 inode->i_mapping->a_ops = &ext4_da_aops;
3099 else 3103 else
3104 inode->i_mapping->a_ops = &ext4_writeback_aops;
3105 break;
3106 case EXT4_INODE_JOURNAL_DATA_MODE:
3100 inode->i_mapping->a_ops = &ext4_journalled_aops; 3107 inode->i_mapping->a_ops = &ext4_journalled_aops;
3108 break;
3109 default:
3110 BUG();
3111 }
3101} 3112}
3102 3113
3103 3114
@@ -3329,16 +3340,16 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
3329{ 3340{
3330 struct inode *inode = file->f_path.dentry->d_inode; 3341 struct inode *inode = file->f_path.dentry->d_inode;
3331 if (!S_ISREG(inode->i_mode)) 3342 if (!S_ISREG(inode->i_mode))
3332 return -ENOTSUPP; 3343 return -EOPNOTSUPP;
3333 3344
3334 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 3345 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
3335 /* TODO: Add support for non extent hole punching */ 3346 /* TODO: Add support for non extent hole punching */
3336 return -ENOTSUPP; 3347 return -EOPNOTSUPP;
3337 } 3348 }
3338 3349
3339 if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) { 3350 if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) {
3340 /* TODO: Add support for bigalloc file systems */ 3351 /* TODO: Add support for bigalloc file systems */
3341 return -ENOTSUPP; 3352 return -EOPNOTSUPP;
3342 } 3353 }
3343 3354
3344 return ext4_ext_punch_hole(file, offset, length); 3355 return ext4_ext_punch_hole(file, offset, length);
@@ -3924,10 +3935,8 @@ static int ext4_do_update_inode(handle_t *handle,
3924 ext4_update_dynamic_rev(sb); 3935 ext4_update_dynamic_rev(sb);
3925 EXT4_SET_RO_COMPAT_FEATURE(sb, 3936 EXT4_SET_RO_COMPAT_FEATURE(sb,
3926 EXT4_FEATURE_RO_COMPAT_LARGE_FILE); 3937 EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
3927 sb->s_dirt = 1;
3928 ext4_handle_sync(handle); 3938 ext4_handle_sync(handle);
3929 err = ext4_handle_dirty_metadata(handle, NULL, 3939 err = ext4_handle_dirty_super(handle, sb);
3930 EXT4_SB(sb)->s_sbh);
3931 } 3940 }
3932 } 3941 }
3933 raw_inode->i_generation = cpu_to_le32(inode->i_generation); 3942 raw_inode->i_generation = cpu_to_le32(inode->i_generation);
@@ -4152,11 +4161,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
4152 } 4161 }
4153 4162
4154 if (attr->ia_valid & ATTR_SIZE) { 4163 if (attr->ia_valid & ATTR_SIZE) {
4155 if (attr->ia_size != i_size_read(inode)) { 4164 if (attr->ia_size != i_size_read(inode))
4156 truncate_setsize(inode, attr->ia_size); 4165 truncate_setsize(inode, attr->ia_size);
4157 ext4_truncate(inode); 4166 ext4_truncate(inode);
4158 } else if (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
4159 ext4_truncate(inode);
4160 } 4167 }
4161 4168
4162 if (!rc) { 4169 if (!rc) {
@@ -4314,7 +4321,7 @@ int ext4_mark_iloc_dirty(handle_t *handle,
4314{ 4321{
4315 int err = 0; 4322 int err = 0;
4316 4323
4317 if (test_opt(inode->i_sb, I_VERSION)) 4324 if (IS_I_VERSION(inode))
4318 inode_inc_iversion(inode); 4325 inode_inc_iversion(inode);
4319 4326
4320 /* the do_update_inode consumes one bh->b_count */ 4327 /* the do_update_inode consumes one bh->b_count */
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index cb990b21c698..99ab428bcfa0 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -21,6 +21,7 @@
21 * mballoc.c contains the multiblocks allocation routines 21 * mballoc.c contains the multiblocks allocation routines
22 */ 22 */
23 23
24#include "ext4_jbd2.h"
24#include "mballoc.h" 25#include "mballoc.h"
25#include <linux/debugfs.h> 26#include <linux/debugfs.h>
26#include <linux/slab.h> 27#include <linux/slab.h>
@@ -339,7 +340,7 @@
339 */ 340 */
340static struct kmem_cache *ext4_pspace_cachep; 341static struct kmem_cache *ext4_pspace_cachep;
341static struct kmem_cache *ext4_ac_cachep; 342static struct kmem_cache *ext4_ac_cachep;
342static struct kmem_cache *ext4_free_ext_cachep; 343static struct kmem_cache *ext4_free_data_cachep;
343 344
344/* We create slab caches for groupinfo data structures based on the 345/* We create slab caches for groupinfo data structures based on the
345 * superblock block size. There will be one per mounted filesystem for 346 * superblock block size. There will be one per mounted filesystem for
@@ -357,7 +358,8 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
357 ext4_group_t group); 358 ext4_group_t group);
358static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, 359static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
359 ext4_group_t group); 360 ext4_group_t group);
360static void release_blocks_on_commit(journal_t *journal, transaction_t *txn); 361static void ext4_free_data_callback(struct super_block *sb,
362 struct ext4_journal_cb_entry *jce, int rc);
361 363
362static inline void *mb_correct_addr_and_bit(int *bit, void *addr) 364static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
363{ 365{
@@ -425,7 +427,7 @@ static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
425{ 427{
426 char *bb; 428 char *bb;
427 429
428 BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b)); 430 BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
429 BUG_ON(max == NULL); 431 BUG_ON(max == NULL);
430 432
431 if (order > e4b->bd_blkbits + 1) { 433 if (order > e4b->bd_blkbits + 1) {
@@ -436,10 +438,10 @@ static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
436 /* at order 0 we see each particular block */ 438 /* at order 0 we see each particular block */
437 if (order == 0) { 439 if (order == 0) {
438 *max = 1 << (e4b->bd_blkbits + 3); 440 *max = 1 << (e4b->bd_blkbits + 3);
439 return EXT4_MB_BITMAP(e4b); 441 return e4b->bd_bitmap;
440 } 442 }
441 443
442 bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; 444 bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
443 *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order]; 445 *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
444 446
445 return bb; 447 return bb;
@@ -588,7 +590,7 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
588 for (j = 0; j < (1 << order); j++) { 590 for (j = 0; j < (1 << order); j++) {
589 k = (i * (1 << order)) + j; 591 k = (i * (1 << order)) + j;
590 MB_CHECK_ASSERT( 592 MB_CHECK_ASSERT(
591 !mb_test_bit(k, EXT4_MB_BITMAP(e4b))); 593 !mb_test_bit(k, e4b->bd_bitmap));
592 } 594 }
593 count++; 595 count++;
594 } 596 }
@@ -782,7 +784,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
782 int groups_per_page; 784 int groups_per_page;
783 int err = 0; 785 int err = 0;
784 int i; 786 int i;
785 ext4_group_t first_group; 787 ext4_group_t first_group, group;
786 int first_block; 788 int first_block;
787 struct super_block *sb; 789 struct super_block *sb;
788 struct buffer_head *bhs; 790 struct buffer_head *bhs;
@@ -806,24 +808,23 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
806 808
807 /* allocate buffer_heads to read bitmaps */ 809 /* allocate buffer_heads to read bitmaps */
808 if (groups_per_page > 1) { 810 if (groups_per_page > 1) {
809 err = -ENOMEM;
810 i = sizeof(struct buffer_head *) * groups_per_page; 811 i = sizeof(struct buffer_head *) * groups_per_page;
811 bh = kzalloc(i, GFP_NOFS); 812 bh = kzalloc(i, GFP_NOFS);
812 if (bh == NULL) 813 if (bh == NULL) {
814 err = -ENOMEM;
813 goto out; 815 goto out;
816 }
814 } else 817 } else
815 bh = &bhs; 818 bh = &bhs;
816 819
817 first_group = page->index * blocks_per_page / 2; 820 first_group = page->index * blocks_per_page / 2;
818 821
819 /* read all groups the page covers into the cache */ 822 /* read all groups the page covers into the cache */
820 for (i = 0; i < groups_per_page; i++) { 823 for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
821 struct ext4_group_desc *desc; 824 if (group >= ngroups)
822
823 if (first_group + i >= ngroups)
824 break; 825 break;
825 826
826 grinfo = ext4_get_group_info(sb, first_group + i); 827 grinfo = ext4_get_group_info(sb, group);
827 /* 828 /*
828 * If page is uptodate then we came here after online resize 829 * If page is uptodate then we came here after online resize
829 * which added some new uninitialized group info structs, so 830 * which added some new uninitialized group info structs, so
@@ -834,69 +835,21 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
834 bh[i] = NULL; 835 bh[i] = NULL;
835 continue; 836 continue;
836 } 837 }
837 838 if (!(bh[i] = ext4_read_block_bitmap_nowait(sb, group))) {
838 err = -EIO; 839 err = -ENOMEM;
839 desc = ext4_get_group_desc(sb, first_group + i, NULL);
840 if (desc == NULL)
841 goto out;
842
843 err = -ENOMEM;
844 bh[i] = sb_getblk(sb, ext4_block_bitmap(sb, desc));
845 if (bh[i] == NULL)
846 goto out; 840 goto out;
847
848 if (bitmap_uptodate(bh[i]))
849 continue;
850
851 lock_buffer(bh[i]);
852 if (bitmap_uptodate(bh[i])) {
853 unlock_buffer(bh[i]);
854 continue;
855 }
856 ext4_lock_group(sb, first_group + i);
857 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
858 ext4_init_block_bitmap(sb, bh[i],
859 first_group + i, desc);
860 set_bitmap_uptodate(bh[i]);
861 set_buffer_uptodate(bh[i]);
862 ext4_unlock_group(sb, first_group + i);
863 unlock_buffer(bh[i]);
864 continue;
865 } 841 }
866 ext4_unlock_group(sb, first_group + i); 842 mb_debug(1, "read bitmap for group %u\n", group);
867 if (buffer_uptodate(bh[i])) {
868 /*
869 * if not uninit if bh is uptodate,
870 * bitmap is also uptodate
871 */
872 set_bitmap_uptodate(bh[i]);
873 unlock_buffer(bh[i]);
874 continue;
875 }
876 get_bh(bh[i]);
877 /*
878 * submit the buffer_head for read. We can
879 * safely mark the bitmap as uptodate now.
880 * We do it here so the bitmap uptodate bit
881 * get set with buffer lock held.
882 */
883 set_bitmap_uptodate(bh[i]);
884 bh[i]->b_end_io = end_buffer_read_sync;
885 submit_bh(READ, bh[i]);
886 mb_debug(1, "read bitmap for group %u\n", first_group + i);
887 } 843 }
888 844
889 /* wait for I/O completion */ 845 /* wait for I/O completion */
890 for (i = 0; i < groups_per_page; i++) 846 for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
891 if (bh[i]) 847 if (bh[i] && ext4_wait_block_bitmap(sb, group, bh[i])) {
892 wait_on_buffer(bh[i]); 848 err = -EIO;
893
894 err = -EIO;
895 for (i = 0; i < groups_per_page; i++)
896 if (bh[i] && !buffer_uptodate(bh[i]))
897 goto out; 849 goto out;
850 }
851 }
898 852
899 err = 0;
900 first_block = page->index * blocks_per_page; 853 first_block = page->index * blocks_per_page;
901 for (i = 0; i < blocks_per_page; i++) { 854 for (i = 0; i < blocks_per_page; i++) {
902 int group; 855 int group;
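
The rewritten loop splits the old lock-and-submit body into two passes:
ext4_read_block_bitmap_nowait() fires off every bitmap read the page
needs, and ext4_wait_block_bitmap() then collects them, so all the reads
are in flight at once instead of being handled one group at a time. A
schematic of the two-pass shape (read_nowait()/wait_done() are
hypothetical stand-ins for those two helpers):

#include <stdio.h>

#define GROUPS 4

static int read_nowait(int group) { printf("submit %d\n", group); return 0; }
static int wait_done(int group)   { printf("wait   %d\n", group); return 0; }

int main(void)
{
	int err = 0, i;

	for (i = 0; i < GROUPS; i++)		/* pass 1: fire off all I/O */
		if (read_nowait(i))
			return -1;
	for (i = 0; i < GROUPS; i++)		/* pass 2: collect results  */
		if (wait_done(i)) {
			err = -1;		/* -EIO in the kernel */
			break;
		}
	return err;
}
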
@@ -1250,10 +1203,10 @@ static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
1250 int order = 1; 1203 int order = 1;
1251 void *bb; 1204 void *bb;
1252 1205
1253 BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b)); 1206 BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
1254 BUG_ON(block >= (1 << (e4b->bd_blkbits + 3))); 1207 BUG_ON(block >= (1 << (e4b->bd_blkbits + 3)));
1255 1208
1256 bb = EXT4_MB_BUDDY(e4b); 1209 bb = e4b->bd_buddy;
1257 while (order <= e4b->bd_blkbits + 1) { 1210 while (order <= e4b->bd_blkbits + 1) {
1258 block = block >> 1; 1211 block = block >> 1;
1259 if (!mb_test_bit(block, bb)) { 1212 if (!mb_test_bit(block, bb)) {
@@ -1323,9 +1276,9 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1323 1276
1324 /* let's maintain fragments counter */ 1277 /* let's maintain fragments counter */
1325 if (first != 0) 1278 if (first != 0)
1326 block = !mb_test_bit(first - 1, EXT4_MB_BITMAP(e4b)); 1279 block = !mb_test_bit(first - 1, e4b->bd_bitmap);
1327 if (first + count < EXT4_SB(sb)->s_mb_maxs[0]) 1280 if (first + count < EXT4_SB(sb)->s_mb_maxs[0])
1328 max = !mb_test_bit(first + count, EXT4_MB_BITMAP(e4b)); 1281 max = !mb_test_bit(first + count, e4b->bd_bitmap);
1329 if (block && max) 1282 if (block && max)
1330 e4b->bd_info->bb_fragments--; 1283 e4b->bd_info->bb_fragments--;
1331 else if (!block && !max) 1284 else if (!block && !max)
@@ -1336,7 +1289,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1336 block = first++; 1289 block = first++;
1337 order = 0; 1290 order = 0;
1338 1291
1339 if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) { 1292 if (!mb_test_bit(block, e4b->bd_bitmap)) {
1340 ext4_fsblk_t blocknr; 1293 ext4_fsblk_t blocknr;
1341 1294
1342 blocknr = ext4_group_first_block_no(sb, e4b->bd_group); 1295 blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
@@ -1347,7 +1300,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1347 "freeing already freed block " 1300 "freeing already freed block "
1348 "(bit %u)", block); 1301 "(bit %u)", block);
1349 } 1302 }
1350 mb_clear_bit(block, EXT4_MB_BITMAP(e4b)); 1303 mb_clear_bit(block, e4b->bd_bitmap);
1351 e4b->bd_info->bb_counters[order]++; 1304 e4b->bd_info->bb_counters[order]++;
1352 1305
1353 /* start of the buddy */ 1306 /* start of the buddy */
@@ -1429,7 +1382,7 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
1429 break; 1382 break;
1430 1383
1431 next = (block + 1) * (1 << order); 1384 next = (block + 1) * (1 << order);
1432 if (mb_test_bit(next, EXT4_MB_BITMAP(e4b))) 1385 if (mb_test_bit(next, e4b->bd_bitmap))
1433 break; 1386 break;
1434 1387
1435 order = mb_find_order_for_block(e4b, next); 1388 order = mb_find_order_for_block(e4b, next);
@@ -1466,9 +1419,9 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
1466 1419
1467 /* let's maintain fragments counter */ 1420 /* let's maintain fragments counter */
1468 if (start != 0) 1421 if (start != 0)
1469 mlen = !mb_test_bit(start - 1, EXT4_MB_BITMAP(e4b)); 1422 mlen = !mb_test_bit(start - 1, e4b->bd_bitmap);
1470 if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0]) 1423 if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0])
1471 max = !mb_test_bit(start + len, EXT4_MB_BITMAP(e4b)); 1424 max = !mb_test_bit(start + len, e4b->bd_bitmap);
1472 if (mlen && max) 1425 if (mlen && max)
1473 e4b->bd_info->bb_fragments++; 1426 e4b->bd_info->bb_fragments++;
1474 else if (!mlen && !max) 1427 else if (!mlen && !max)
@@ -1511,7 +1464,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
1511 } 1464 }
1512 mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); 1465 mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
1513 1466
1514 ext4_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0); 1467 ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0);
1515 mb_check_buddy(e4b); 1468 mb_check_buddy(e4b);
1516 1469
1517 return ret; 1470 return ret;
@@ -1810,7 +1763,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
1810 struct ext4_buddy *e4b) 1763 struct ext4_buddy *e4b)
1811{ 1764{
1812 struct super_block *sb = ac->ac_sb; 1765 struct super_block *sb = ac->ac_sb;
1813 void *bitmap = EXT4_MB_BITMAP(e4b); 1766 void *bitmap = e4b->bd_bitmap;
1814 struct ext4_free_extent ex; 1767 struct ext4_free_extent ex;
1815 int i; 1768 int i;
1816 int free; 1769 int free;
@@ -1870,7 +1823,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
1870{ 1823{
1871 struct super_block *sb = ac->ac_sb; 1824 struct super_block *sb = ac->ac_sb;
1872 struct ext4_sb_info *sbi = EXT4_SB(sb); 1825 struct ext4_sb_info *sbi = EXT4_SB(sb);
1873 void *bitmap = EXT4_MB_BITMAP(e4b); 1826 void *bitmap = e4b->bd_bitmap;
1874 struct ext4_free_extent ex; 1827 struct ext4_free_extent ex;
1875 ext4_fsblk_t first_group_block; 1828 ext4_fsblk_t first_group_block;
1876 ext4_fsblk_t a; 1829 ext4_fsblk_t a;
@@ -2224,7 +2177,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2224 EXT4_DESC_PER_BLOCK_BITS(sb); 2177 EXT4_DESC_PER_BLOCK_BITS(sb);
2225 meta_group_info = kmalloc(metalen, GFP_KERNEL); 2178 meta_group_info = kmalloc(metalen, GFP_KERNEL);
2226 if (meta_group_info == NULL) { 2179 if (meta_group_info == NULL) {
2227 ext4_msg(sb, KERN_ERR, "EXT4-fs: can't allocate mem " 2180 ext4_msg(sb, KERN_ERR, "can't allocate mem "
2228 "for a buddy group"); 2181 "for a buddy group");
2229 goto exit_meta_group_info; 2182 goto exit_meta_group_info;
2230 } 2183 }
@@ -2238,7 +2191,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2238 2191
2239 meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL); 2192 meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL);
2240 if (meta_group_info[i] == NULL) { 2193 if (meta_group_info[i] == NULL) {
2241 ext4_msg(sb, KERN_ERR, "EXT4-fs: can't allocate buddy mem"); 2194 ext4_msg(sb, KERN_ERR, "can't allocate buddy mem");
2242 goto exit_group_info; 2195 goto exit_group_info;
2243 } 2196 }
2244 memset(meta_group_info[i], 0, kmem_cache_size(cachep)); 2197 memset(meta_group_info[i], 0, kmem_cache_size(cachep));
@@ -2522,9 +2475,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2522 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, 2475 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
2523 &ext4_mb_seq_groups_fops, sb); 2476 &ext4_mb_seq_groups_fops, sb);
2524 2477
2525 if (sbi->s_journal)
2526 sbi->s_journal->j_commit_callback = release_blocks_on_commit;
2527
2528 return 0; 2478 return 0;
2529 2479
2530out_free_locality_groups: 2480out_free_locality_groups:
@@ -2637,58 +2587,55 @@ static inline int ext4_issue_discard(struct super_block *sb,
2637 * This function is called by the jbd2 layer once the commit has finished, 2587 * This function is called by the jbd2 layer once the commit has finished,
2638 * so we know we can free the blocks that were released with that commit. 2588 * so we know we can free the blocks that were released with that commit.
2639 */ 2589 */
2640static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) 2590static void ext4_free_data_callback(struct super_block *sb,
2591 struct ext4_journal_cb_entry *jce,
2592 int rc)
2641{ 2593{
2642 struct super_block *sb = journal->j_private; 2594 struct ext4_free_data *entry = (struct ext4_free_data *)jce;
2643 struct ext4_buddy e4b; 2595 struct ext4_buddy e4b;
2644 struct ext4_group_info *db; 2596 struct ext4_group_info *db;
2645 int err, count = 0, count2 = 0; 2597 int err, count = 0, count2 = 0;
2646 struct ext4_free_data *entry;
2647 struct list_head *l, *ltmp;
2648 2598
2649 list_for_each_safe(l, ltmp, &txn->t_private_list) { 2599 mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
2650 entry = list_entry(l, struct ext4_free_data, list); 2600 entry->efd_count, entry->efd_group, entry);
2651 2601
2652 mb_debug(1, "gonna free %u blocks in group %u (0x%p):", 2602 if (test_opt(sb, DISCARD))
2653 entry->count, entry->group, entry); 2603 ext4_issue_discard(sb, entry->efd_group,
2604 entry->efd_start_cluster, entry->efd_count);
2654 2605
2655 if (test_opt(sb, DISCARD)) 2606 err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
2656 ext4_issue_discard(sb, entry->group, 2607 /* we expect to find existing buddy because it's pinned */
2657 entry->start_cluster, entry->count); 2608 BUG_ON(err != 0);
2658 2609
2659 err = ext4_mb_load_buddy(sb, entry->group, &e4b);
2660 /* we expect to find existing buddy because it's pinned */
2661 BUG_ON(err != 0);
2662 2610
2663 db = e4b.bd_info; 2611 db = e4b.bd_info;
2664 /* there are blocks to put in buddy to make them really free */ 2612 /* there are blocks to put in buddy to make them really free */
2665 count += entry->count; 2613 count += entry->efd_count;
2666 count2++; 2614 count2++;
2667 ext4_lock_group(sb, entry->group); 2615 ext4_lock_group(sb, entry->efd_group);
2668 /* Take it out of per group rb tree */ 2616 /* Take it out of per group rb tree */
2669 rb_erase(&entry->node, &(db->bb_free_root)); 2617 rb_erase(&entry->efd_node, &(db->bb_free_root));
2670 mb_free_blocks(NULL, &e4b, entry->start_cluster, entry->count); 2618 mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count);
2671 2619
2672 /* 2620 /*
2673 * Clear the trimmed flag for the group so that the next 2621 * Clear the trimmed flag for the group so that the next
2674 * ext4_trim_fs can trim it. 2622 * ext4_trim_fs can trim it.
2675 * If the volume is mounted with -o discard, online discard 2623 * If the volume is mounted with -o discard, online discard
2676 * is supported and the free blocks will be trimmed online. 2624 * is supported and the free blocks will be trimmed online.
2677 */ 2625 */
2678 if (!test_opt(sb, DISCARD)) 2626 if (!test_opt(sb, DISCARD))
2679 EXT4_MB_GRP_CLEAR_TRIMMED(db); 2627 EXT4_MB_GRP_CLEAR_TRIMMED(db);
2680 2628
2681 if (!db->bb_free_root.rb_node) { 2629 if (!db->bb_free_root.rb_node) {
2682 /* No more items in the per group rb tree 2630 /* No more items in the per group rb tree
2683 * balance refcounts from ext4_mb_free_metadata() 2631 * balance refcounts from ext4_mb_free_metadata()
2684 */ 2632 */
2685 page_cache_release(e4b.bd_buddy_page); 2633 page_cache_release(e4b.bd_buddy_page);
2686 page_cache_release(e4b.bd_bitmap_page); 2634 page_cache_release(e4b.bd_bitmap_page);
2687 }
2688 ext4_unlock_group(sb, entry->group);
2689 kmem_cache_free(ext4_free_ext_cachep, entry);
2690 ext4_mb_unload_buddy(&e4b);
2691 } 2635 }
2636 ext4_unlock_group(sb, entry->efd_group);
2637 kmem_cache_free(ext4_free_data_cachep, entry);
2638 ext4_mb_unload_buddy(&e4b);
2692 2639
2693 mb_debug(1, "freed %u blocks in %u structures\n", count, count2); 2640 mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
2694} 2641}
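
For context on what this conversion preserves: extents freed in a running transaction must stay unusable until the commit that records the free reaches disk, and only then go back into the buddy. A minimal user-space model of that deferred-free protocol, assuming a singly linked per-transaction list (struct pending_free, queue_free() and run_commit_callbacks() are invented names for illustration):

#include <stdio.h>
#include <stdlib.h>

struct pending_free;
typedef void (*commit_cb)(void *fs, struct pending_free *pf, int rc);

struct pending_free {
	struct pending_free *next;
	commit_cb cb;
	unsigned group, start, count;
};

/* free path: queue the extent while the transaction is still running */
static void queue_free(struct pending_free **txn_list,
		       struct pending_free *pf, commit_cb cb)
{
	pf->cb = cb;
	pf->next = *txn_list;
	*txn_list = pf;
}

/* commit path: the journaling layer invokes each entry's callback */
static void run_commit_callbacks(void *fs, struct pending_free *list, int rc)
{
	while (list) {
		struct pending_free *next = list->next;	/* cb frees list */

		list->cb(fs, list, rc);
		list = next;
	}
}

static void free_extent_cb(void *fs, struct pending_free *pf, int rc)
{
	printf("returning %u clusters at %u/%u to the buddy (rc=%d)\n",
	       pf->count, pf->group, pf->start, rc);
	free(pf);
	(void)fs;
}

int main(void)
{
	struct pending_free *txn = NULL;
	struct pending_free *pf = malloc(sizeof(*pf));

	*pf = (struct pending_free){ .group = 3, .start = 100, .count = 8 };
	queue_free(&txn, pf, free_extent_cb);
	run_commit_callbacks(NULL, txn, 0);	/* "commit finished" */
	return 0;
}
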
@@ -2741,9 +2688,9 @@ int __init ext4_init_mballoc(void)
2741 return -ENOMEM; 2688 return -ENOMEM;
2742 } 2689 }
2743 2690
2744 ext4_free_ext_cachep = KMEM_CACHE(ext4_free_data, 2691 ext4_free_data_cachep = KMEM_CACHE(ext4_free_data,
2745 SLAB_RECLAIM_ACCOUNT); 2692 SLAB_RECLAIM_ACCOUNT);
2746 if (ext4_free_ext_cachep == NULL) { 2693 if (ext4_free_data_cachep == NULL) {
2747 kmem_cache_destroy(ext4_pspace_cachep); 2694 kmem_cache_destroy(ext4_pspace_cachep);
2748 kmem_cache_destroy(ext4_ac_cachep); 2695 kmem_cache_destroy(ext4_ac_cachep);
2749 return -ENOMEM; 2696 return -ENOMEM;
@@ -2761,7 +2708,7 @@ void ext4_exit_mballoc(void)
2761 rcu_barrier(); 2708 rcu_barrier();
2762 kmem_cache_destroy(ext4_pspace_cachep); 2709 kmem_cache_destroy(ext4_pspace_cachep);
2763 kmem_cache_destroy(ext4_ac_cachep); 2710 kmem_cache_destroy(ext4_ac_cachep);
2764 kmem_cache_destroy(ext4_free_ext_cachep); 2711 kmem_cache_destroy(ext4_free_data_cachep);
2765 ext4_groupinfo_destroy_slabs(); 2712 ext4_groupinfo_destroy_slabs();
2766 ext4_remove_debugfs_entry(); 2713 ext4_remove_debugfs_entry();
2767} 2714}
@@ -2815,7 +2762,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2815 len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len); 2762 len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
2816 if (!ext4_data_block_valid(sbi, block, len)) { 2763 if (!ext4_data_block_valid(sbi, block, len)) {
2817 ext4_error(sb, "Allocating blocks %llu-%llu which overlap " 2764 ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
2818 "fs metadata\n", block, block+len); 2765 "fs metadata", block, block+len);
2819 /* File system mounted not to panic on error 2766 /* File system mounted not to panic on error
2820 * Fix the bitmap and repeat the block allocation 2767 * Fix the bitmap and repeat the block allocation
2821 * We leak some of the blocks here. 2768 * We leak some of the blocks here.
@@ -2911,7 +2858,8 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
2911 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 2858 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
2912 int bsbits, max; 2859 int bsbits, max;
2913 ext4_lblk_t end; 2860 ext4_lblk_t end;
2914 loff_t size, orig_size, start_off; 2861 loff_t size, start_off;
2862 loff_t orig_size __maybe_unused;
2915 ext4_lblk_t start; 2863 ext4_lblk_t start;
2916 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); 2864 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
2917 struct ext4_prealloc_space *pa; 2865 struct ext4_prealloc_space *pa;
@@ -3321,8 +3269,8 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
3321 n = rb_first(&(grp->bb_free_root)); 3269 n = rb_first(&(grp->bb_free_root));
3322 3270
3323 while (n) { 3271 while (n) {
3324 entry = rb_entry(n, struct ext4_free_data, node); 3272 entry = rb_entry(n, struct ext4_free_data, efd_node);
3325 ext4_set_bits(bitmap, entry->start_cluster, entry->count); 3273 ext4_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count);
3326 n = rb_next(n); 3274 n = rb_next(n);
3327 } 3275 }
3328 return; 3276 return;
@@ -3916,11 +3864,11 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
3916 (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) 3864 (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED))
3917 return; 3865 return;
3918 3866
3919 ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: Can't allocate:" 3867 ext4_msg(ac->ac_sb, KERN_ERR, "Can't allocate:"
3920 " Allocation context details:"); 3868 " Allocation context details:");
3921 ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: status %d flags %d", 3869 ext4_msg(ac->ac_sb, KERN_ERR, "status %d flags %d",
3922 ac->ac_status, ac->ac_flags); 3870 ac->ac_status, ac->ac_flags);
3923 ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: orig %lu/%lu/%lu@%lu, " 3871 ext4_msg(ac->ac_sb, KERN_ERR, "orig %lu/%lu/%lu@%lu, "
3924 "goal %lu/%lu/%lu@%lu, " 3872 "goal %lu/%lu/%lu@%lu, "
3925 "best %lu/%lu/%lu@%lu cr %d", 3873 "best %lu/%lu/%lu@%lu cr %d",
3926 (unsigned long)ac->ac_o_ex.fe_group, 3874 (unsigned long)ac->ac_o_ex.fe_group,
@@ -3936,9 +3884,9 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
3936 (unsigned long)ac->ac_b_ex.fe_len, 3884 (unsigned long)ac->ac_b_ex.fe_len,
3937 (unsigned long)ac->ac_b_ex.fe_logical, 3885 (unsigned long)ac->ac_b_ex.fe_logical,
3938 (int)ac->ac_criteria); 3886 (int)ac->ac_criteria);
3939 ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: %lu scanned, %d found", 3887 ext4_msg(ac->ac_sb, KERN_ERR, "%lu scanned, %d found",
3940 ac->ac_ex_scanned, ac->ac_found); 3888 ac->ac_ex_scanned, ac->ac_found);
3941 ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: groups: "); 3889 ext4_msg(ac->ac_sb, KERN_ERR, "groups: ");
3942 ngroups = ext4_get_groups_count(sb); 3890 ngroups = ext4_get_groups_count(sb);
3943 for (i = 0; i < ngroups; i++) { 3891 for (i = 0; i < ngroups; i++) {
3944 struct ext4_group_info *grp = ext4_get_group_info(sb, i); 3892 struct ext4_group_info *grp = ext4_get_group_info(sb, i);
@@ -4428,9 +4376,9 @@ out:
4428static int can_merge(struct ext4_free_data *entry1, 4376static int can_merge(struct ext4_free_data *entry1,
4429 struct ext4_free_data *entry2) 4377 struct ext4_free_data *entry2)
4430{ 4378{
4431 if ((entry1->t_tid == entry2->t_tid) && 4379 if ((entry1->efd_tid == entry2->efd_tid) &&
4432 (entry1->group == entry2->group) && 4380 (entry1->efd_group == entry2->efd_group) &&
4433 ((entry1->start_cluster + entry1->count) == entry2->start_cluster)) 4381 ((entry1->efd_start_cluster + entry1->efd_count) == entry2->efd_start_cluster))
4434 return 1; 4382 return 1;
4435 return 0; 4383 return 0;
4436} 4384}
@@ -4452,8 +4400,8 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4452 BUG_ON(e4b->bd_bitmap_page == NULL); 4400 BUG_ON(e4b->bd_bitmap_page == NULL);
4453 BUG_ON(e4b->bd_buddy_page == NULL); 4401 BUG_ON(e4b->bd_buddy_page == NULL);
4454 4402
4455 new_node = &new_entry->node; 4403 new_node = &new_entry->efd_node;
4456 cluster = new_entry->start_cluster; 4404 cluster = new_entry->efd_start_cluster;
4457 4405
4458 if (!*n) { 4406 if (!*n) {
4459 /* first free block extent. We need to 4407
@@ -4466,10 +4414,10 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4466 } 4414 }
4467 while (*n) { 4415 while (*n) {
4468 parent = *n; 4416 parent = *n;
4469 entry = rb_entry(parent, struct ext4_free_data, node); 4417 entry = rb_entry(parent, struct ext4_free_data, efd_node);
4470 if (cluster < entry->start_cluster) 4418 if (cluster < entry->efd_start_cluster)
4471 n = &(*n)->rb_left; 4419 n = &(*n)->rb_left;
4472 else if (cluster >= (entry->start_cluster + entry->count)) 4420 else if (cluster >= (entry->efd_start_cluster + entry->efd_count))
4473 n = &(*n)->rb_right; 4421 n = &(*n)->rb_right;
4474 else { 4422 else {
4475 ext4_grp_locked_error(sb, group, 0, 4423 ext4_grp_locked_error(sb, group, 0,
@@ -4486,34 +4434,29 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4486 /* Now try to see the extent can be merged to left and right */ 4434 /* Now try to see the extent can be merged to left and right */
4487 node = rb_prev(new_node); 4435 node = rb_prev(new_node);
4488 if (node) { 4436 if (node) {
4489 entry = rb_entry(node, struct ext4_free_data, node); 4437 entry = rb_entry(node, struct ext4_free_data, efd_node);
4490 if (can_merge(entry, new_entry)) { 4438 if (can_merge(entry, new_entry)) {
4491 new_entry->start_cluster = entry->start_cluster; 4439 new_entry->efd_start_cluster = entry->efd_start_cluster;
4492 new_entry->count += entry->count; 4440 new_entry->efd_count += entry->efd_count;
4493 rb_erase(node, &(db->bb_free_root)); 4441 rb_erase(node, &(db->bb_free_root));
4494 spin_lock(&sbi->s_md_lock); 4442 ext4_journal_callback_del(handle, &entry->efd_jce);
4495 list_del(&entry->list); 4443 kmem_cache_free(ext4_free_data_cachep, entry);
4496 spin_unlock(&sbi->s_md_lock);
4497 kmem_cache_free(ext4_free_ext_cachep, entry);
4498 } 4444 }
4499 } 4445 }
4500 4446
4501 node = rb_next(new_node); 4447 node = rb_next(new_node);
4502 if (node) { 4448 if (node) {
4503 entry = rb_entry(node, struct ext4_free_data, node); 4449 entry = rb_entry(node, struct ext4_free_data, efd_node);
4504 if (can_merge(new_entry, entry)) { 4450 if (can_merge(new_entry, entry)) {
4505 new_entry->count += entry->count; 4451 new_entry->efd_count += entry->efd_count;
4506 rb_erase(node, &(db->bb_free_root)); 4452 rb_erase(node, &(db->bb_free_root));
4507 spin_lock(&sbi->s_md_lock); 4453 ext4_journal_callback_del(handle, &entry->efd_jce);
4508 list_del(&entry->list); 4454 kmem_cache_free(ext4_free_data_cachep, entry);
4509 spin_unlock(&sbi->s_md_lock);
4510 kmem_cache_free(ext4_free_ext_cachep, entry);
4511 } 4455 }
4512 } 4456 }
4513 /* Add the extent to transaction's private list */ 4457 /* Add the extent to transaction's private list */
4514 spin_lock(&sbi->s_md_lock); 4458 ext4_journal_callback_add(handle, ext4_free_data_callback,
4515 list_add(&new_entry->list, &handle->h_transaction->t_private_list); 4459 &new_entry->efd_jce);
4516 spin_unlock(&sbi->s_md_lock);
4517 return 0; 4460 return 0;
4518} 4461}
4519 4462
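
The insertion-plus-merge logic above keeps the per-group tree holding maximal extents: a new extent is folded into its predecessor or absorbs its successor when they touch and share a transaction. A simplified sketch of the same merge rule, using a sorted singly linked list in place of the rb-tree purely to keep the example short (mergeable() mirrors can_merge()):

#include <stdio.h>
#include <stdlib.h>

struct extent {
	struct extent *next;		/* list kept sorted by start */
	unsigned tid, start, count;
};

/* mirror of can_merge(): adjacent, and freed in the same transaction */
static int mergeable(const struct extent *a, const struct extent *b)
{
	return a->tid == b->tid && a->start + a->count == b->start;
}

static void insert_extent(struct extent **head, struct extent *e)
{
	struct extent *prev = NULL, *cur = *head;

	while (cur && cur->start < e->start) {
		prev = cur;
		cur = cur->next;
	}
	e->next = cur;
	if (prev)
		prev->next = e;
	else
		*head = e;

	if (cur && mergeable(e, cur)) {		/* absorb the successor */
		e->count += cur->count;
		e->next = cur->next;
		free(cur);
	}
	if (prev && mergeable(prev, e)) {	/* fold into predecessor */
		prev->count += e->count;
		prev->next = e->next;
		free(e);
	}
}

static struct extent *mk(unsigned tid, unsigned start, unsigned count)
{
	struct extent *e = calloc(1, sizeof(*e));

	e->tid = tid;
	e->start = start;
	e->count = count;
	return e;
}

int main(void)
{
	struct extent *head = NULL;

	insert_extent(&head, mk(1, 10, 5));
	insert_extent(&head, mk(1, 15, 5));	/* merges into [10, 20) */
	for (struct extent *e = head; e; e = e->next)
		printf("[%u, +%u) tid=%u\n", e->start, e->count, e->tid);
	return 0;
}
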
@@ -4691,15 +4634,15 @@ do_more:
4691 * blocks being freed are metadata. these blocks shouldn't 4634 * blocks being freed are metadata. these blocks shouldn't
4692 * be used until this transaction is committed 4635 * be used until this transaction is committed
4693 */ 4636 */
4694 new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS); 4637 new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS);
4695 if (!new_entry) { 4638 if (!new_entry) {
4696 err = -ENOMEM; 4639 err = -ENOMEM;
4697 goto error_return; 4640 goto error_return;
4698 } 4641 }
4699 new_entry->start_cluster = bit; 4642 new_entry->efd_start_cluster = bit;
4700 new_entry->group = block_group; 4643 new_entry->efd_group = block_group;
4701 new_entry->count = count_clusters; 4644 new_entry->efd_count = count_clusters;
4702 new_entry->t_tid = handle->h_transaction->t_tid; 4645 new_entry->efd_tid = handle->h_transaction->t_tid;
4703 4646
4704 ext4_lock_group(sb, block_group); 4647 ext4_lock_group(sb, block_group);
4705 mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); 4648 mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
@@ -4971,11 +4914,11 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
4971 start = (e4b.bd_info->bb_first_free > start) ? 4914 start = (e4b.bd_info->bb_first_free > start) ?
4972 e4b.bd_info->bb_first_free : start; 4915 e4b.bd_info->bb_first_free : start;
4973 4916
4974 while (start < max) { 4917 while (start <= max) {
4975 start = mb_find_next_zero_bit(bitmap, max, start); 4918 start = mb_find_next_zero_bit(bitmap, max + 1, start);
4976 if (start >= max) 4919 if (start > max)
4977 break; 4920 break;
4978 next = mb_find_next_bit(bitmap, max, start); 4921 next = mb_find_next_bit(bitmap, max + 1, start);
4979 4922
4980 if ((next - start) >= minblocks) { 4923 if ((next - start) >= minblocks) {
4981 ext4_trim_extent(sb, start, 4924 ext4_trim_extent(sb, start,
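
The loop rewrite makes max an inclusive bound: the bit-search helpers take an exclusive size argument, hence the "max + 1", and the exit test becomes "start > max" so the final cluster is no longer skipped. A self-contained sketch with simplified stand-ins for mb_find_next_zero_bit()/mb_find_next_bit():

#include <stdio.h>

/* stand-ins for the mb_find_next_* helpers; like the real ones, the
 * 'size' argument is exclusive */
static unsigned find_next_zero_bit(const unsigned char *bm, unsigned size,
				   unsigned off)
{
	while (off < size && (bm[off >> 3] & (1u << (off & 7))))
		off++;
	return off;
}

static unsigned find_next_bit(const unsigned char *bm, unsigned size,
			      unsigned off)
{
	while (off < size && !(bm[off >> 3] & (1u << (off & 7))))
		off++;
	return off;
}

static void scan_free_runs(const unsigned char *bm, unsigned start,
			   unsigned max, unsigned minlen)
{
	while (start <= max) {			/* max is now inclusive */
		unsigned next;

		start = find_next_zero_bit(bm, max + 1, start);
		if (start > max)
			break;
		next = find_next_bit(bm, max + 1, start);
		if (next - start >= minlen)
			printf("trim [%u, %u)\n", start, next);
		start = next + 1;
	}
}

int main(void)
{
	unsigned char bm[2] = { 0x0f, 0xf0 };	/* bits 4..11 are free */

	scan_free_runs(bm, 0, 15, 2);		/* prints "trim [4, 12)" */
	return 0;
}
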
@@ -5027,37 +4970,36 @@ out:
5027int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) 4970int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
5028{ 4971{
5029 struct ext4_group_info *grp; 4972 struct ext4_group_info *grp;
5030 ext4_group_t first_group, last_group; 4973 ext4_group_t group, first_group, last_group;
5031 ext4_group_t group, ngroups = ext4_get_groups_count(sb);
5032 ext4_grpblk_t cnt = 0, first_cluster, last_cluster; 4974 ext4_grpblk_t cnt = 0, first_cluster, last_cluster;
5033 uint64_t start, len, minlen, trimmed = 0; 4975 uint64_t start, end, minlen, trimmed = 0;
5034 ext4_fsblk_t first_data_blk = 4976 ext4_fsblk_t first_data_blk =
5035 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 4977 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
4978 ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es);
5036 int ret = 0; 4979 int ret = 0;
5037 4980
5038 start = range->start >> sb->s_blocksize_bits; 4981 start = range->start >> sb->s_blocksize_bits;
5039 len = range->len >> sb->s_blocksize_bits; 4982 end = start + (range->len >> sb->s_blocksize_bits) - 1;
5040 minlen = range->minlen >> sb->s_blocksize_bits; 4983 minlen = range->minlen >> sb->s_blocksize_bits;
5041 4984
5042 if (unlikely(minlen > EXT4_CLUSTERS_PER_GROUP(sb))) 4985 if (unlikely(minlen > EXT4_CLUSTERS_PER_GROUP(sb)) ||
4986 unlikely(start >= max_blks))
5043 return -EINVAL; 4987 return -EINVAL;
5044 if (start + len <= first_data_blk) 4988 if (end >= max_blks)
4989 end = max_blks - 1;
4990 if (end <= first_data_blk)
5045 goto out; 4991 goto out;
5046 if (start < first_data_blk) { 4992 if (start < first_data_blk)
5047 len -= first_data_blk - start;
5048 start = first_data_blk; 4993 start = first_data_blk;
5049 }
5050 4994
5051 /* Determine first and last group to examine based on start and len */ 4995 /* Determine first and last group to examine based on start and end */
5052 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start, 4996 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
5053 &first_group, &first_cluster); 4997 &first_group, &first_cluster);
5054 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) (start + len), 4998 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end,
5055 &last_group, &last_cluster); 4999 &last_group, &last_cluster);
5056 last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group;
5057 last_cluster = EXT4_CLUSTERS_PER_GROUP(sb);
5058 5000
5059 if (first_group > last_group) 5001 /* end now represents the last cluster to discard in this group */
5060 return -EINVAL; 5002 end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
5061 5003
5062 for (group = first_group; group <= last_group; group++) { 5004 for (group = first_group; group <= last_group; group++) {
5063 grp = ext4_get_group_info(sb, group); 5005 grp = ext4_get_group_info(sb, group);
@@ -5069,31 +5011,35 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
5069 } 5011 }
5070 5012
5071 /* 5013 /*
5072 * For all the groups except the last one, last block will 5014 * For all the groups except the last one, last cluster will
5073 * always be EXT4_BLOCKS_PER_GROUP(sb), so we only need to 5015 * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to
5074 * change it for the last group in which case start + 5016 * change it for the last group, note that last_cluster is
5075 * len < EXT4_BLOCKS_PER_GROUP(sb). 5017 * already computed earlier by ext4_get_group_no_and_offset()
5076 */ 5018 */
5077 if (first_cluster + len < EXT4_CLUSTERS_PER_GROUP(sb)) 5019 if (group == last_group)
5078 last_cluster = first_cluster + len; 5020 end = last_cluster;
5079 len -= last_cluster - first_cluster;
5080 5021
5081 if (grp->bb_free >= minlen) { 5022 if (grp->bb_free >= minlen) {
5082 cnt = ext4_trim_all_free(sb, group, first_cluster, 5023 cnt = ext4_trim_all_free(sb, group, first_cluster,
5083 last_cluster, minlen); 5024 end, minlen);
5084 if (cnt < 0) { 5025 if (cnt < 0) {
5085 ret = cnt; 5026 ret = cnt;
5086 break; 5027 break;
5087 } 5028 }
5029 trimmed += cnt;
5088 } 5030 }
5089 trimmed += cnt; 5031
5032 /*
5033 * For every group except the first one, we are sure
5034 * that the first cluster to discard will be cluster #0.
5035 */
5090 first_cluster = 0; 5036 first_cluster = 0;
5091 } 5037 }
5092 range->len = trimmed * sb->s_blocksize;
5093 5038
5094 if (!ret) 5039 if (!ret)
5095 atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen); 5040 atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen);
5096 5041
5097out: 5042out:
5043 range->len = trimmed * sb->s_blocksize;
5098 return ret; 5044 return ret;
5099} 5045}
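
The rewritten ext4_trim_fs() carves the inclusive [start, end] block range into per-group cluster ranges: only the first group may start past cluster 0, and only the last group may stop before the end of the group. A worked example with hypothetical sizes, assuming 32768 clusters per group and no flex_bg remapping (the real code derives group/offset via ext4_get_group_no_and_offset()):

#include <stdio.h>

#define CLUSTERS_PER_GROUP 32768ULL	/* assumed group size */

int main(void)
{
	unsigned long long start = 40000, end = 130000;	/* inclusive */
	unsigned long long first_group = start / CLUSTERS_PER_GROUP;
	unsigned long long last_group  = end / CLUSTERS_PER_GROUP;
	unsigned long long first_cluster = start % CLUSTERS_PER_GROUP;
	unsigned long long last_cluster  = end % CLUSTERS_PER_GROUP;

	for (unsigned long long g = first_group; g <= last_group; g++) {
		unsigned long long lo = (g == first_group) ? first_cluster : 0;
		unsigned long long hi = (g == last_group) ? last_cluster
						: CLUSTERS_PER_GROUP - 1;

		printf("group %llu: trim clusters %llu..%llu\n", g, lo, hi);
	}
	return 0;
}
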
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 47705f3285e3..c070618c21ce 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -96,21 +96,23 @@ extern u8 mb_enable_debug;
96 96
97 97
98struct ext4_free_data { 98struct ext4_free_data {
99 /* this links the free block information from group_info */ 99 /* MUST be the first member */
100 struct rb_node node; 100 struct ext4_journal_cb_entry efd_jce;
101
102 /* ext4_free_data private data starts from here */
101 103
102 /* this links the free block information from ext4_sb_info */ 104 /* this links the free block information from group_info */
103 struct list_head list; 105 struct rb_node efd_node;
104 106
105 /* group which free block extent belongs */ 107 /* group which free block extent belongs */
106 ext4_group_t group; 108 ext4_group_t efd_group;
107 109
108 /* free block extent */ 110 /* free block extent */
109 ext4_grpblk_t start_cluster; 111 ext4_grpblk_t efd_start_cluster;
110 ext4_grpblk_t count; 112 ext4_grpblk_t efd_count;
111 113
112 /* transaction which freed this extent */ 114 /* transaction which freed this extent */
113 tid_t t_tid; 115 tid_t efd_tid;
114}; 116};
115 117
116struct ext4_prealloc_space { 118struct ext4_prealloc_space {
@@ -210,8 +212,6 @@ struct ext4_buddy {
210 __u16 bd_blkbits; 212 __u16 bd_blkbits;
211 ext4_group_t bd_group; 213 ext4_group_t bd_group;
212}; 214};
213#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
214#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
215 215
216static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, 216static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
217 struct ext4_free_extent *fex) 217 struct ext4_free_extent *fex)
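
The "MUST be the first member" rule on efd_jce above exists because ext4_free_data_callback() recovers the outer structure from the embedded callback entry with a plain cast, which is only equivalent to container_of() when the member sits at offset zero. A standalone model of that contract (all names here are illustrative):

#include <assert.h>
#include <stddef.h>

struct cb_entry {
	void (*func)(struct cb_entry *jce);
};

struct free_data {
	struct cb_entry jce;	/* must stay first for the cast below */
	unsigned group, start, count;
};

static void callback(struct cb_entry *jce)
{
	/* safe only because jce is the first member of free_data */
	struct free_data *fd = (struct free_data *)jce;

	(void)fd;
}

int main(void)
{
	struct free_data fd = { .jce = { callback }, .group = 7 };

	assert(offsetof(struct free_data, jce) == 0);
	assert((void *)&fd == (void *)&fd.jce);
	fd.jce.func(&fd.jce);
	return 0;
}
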
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index e7d6bb0acfa6..f39f80f8f2c5 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -471,7 +471,7 @@ int ext4_ext_migrate(struct inode *inode)
471 tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, 471 tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode,
472 S_IFREG, NULL, goal, owner); 472 S_IFREG, NULL, goal, owner);
473 if (IS_ERR(tmp_inode)) { 473 if (IS_ERR(tmp_inode)) {
474 retval = PTR_ERR(inode); 474 retval = PTR_ERR(tmp_inode);
475 ext4_journal_stop(handle); 475 ext4_journal_stop(handle);
476 return retval; 476 return retval;
477 } 477 }
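
The one-line fix above is the classic ERR_PTR slip: the error code has to be decoded from the pointer that actually carries it, tmp_inode, not from the still-valid inode. A user-space model of the ERR_PTR/IS_ERR/PTR_ERR convention (make_inode() is a stand-in; -12 plays the role of -ENOMEM):

#include <stdio.h>

#define MAX_ERRNO 4095

static void *ERR_PTR(long err)	{ return (void *)err; }
static long PTR_ERR(const void *p) { return (long)p; }
static int IS_ERR(const void *p)
{
	return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
}

static void *make_inode(int fail)
{
	static int dummy;

	return fail ? ERR_PTR(-12) : (void *)&dummy;
}

int main(void)
{
	void *tmp_inode = make_inode(1);

	if (IS_ERR(tmp_inode))
		/* decode tmp_inode itself, not some other live pointer */
		printf("error %ld\n", PTR_ERR(tmp_inode));
	return 0;
}
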
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 7ea4ba4eff2a..ed6548d89165 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -257,8 +257,8 @@ int ext4_multi_mount_protect(struct super_block *sb,
257 * If check_interval in MMP block is larger, use that instead of 257 * If check_interval in MMP block is larger, use that instead of
258 * update_interval from the superblock. 258 * update_interval from the superblock.
259 */ 259 */
260 if (mmp->mmp_check_interval > mmp_check_interval) 260 if (le16_to_cpu(mmp->mmp_check_interval) > mmp_check_interval)
261 mmp_check_interval = mmp->mmp_check_interval; 261 mmp_check_interval = le16_to_cpu(mmp->mmp_check_interval);
262 262
263 seq = le32_to_cpu(mmp->mmp_seq); 263 seq = le32_to_cpu(mmp->mmp_seq);
264 if (seq == EXT4_MMP_SEQ_CLEAN) 264 if (seq == EXT4_MMP_SEQ_CLEAN)
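
mmp_check_interval is stored on disk in little-endian byte order, so using it without le16_to_cpu() is only accidentally correct on little-endian hosts; the hunk above adds the missing conversion. A host-side model of what the conversion does:

#include <stdint.h>
#include <stdio.h>

static uint16_t le16_to_cpu_model(const uint8_t b[2])
{
	return (uint16_t)(b[0] | (b[1] << 8));	/* byte 0 least significant */
}

int main(void)
{
	uint8_t on_disk[2] = { 0x2c, 0x01 };	/* 300, little-endian */

	/* reading the raw bytes as a native u16 matches 300 only on LE
	 * machines; the explicit conversion is correct everywhere */
	printf("%u\n", le16_to_cpu_model(on_disk));
	return 0;
}
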
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 2043f482375d..349d7b3671c8 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -468,7 +468,7 @@ fail2:
468fail: 468fail:
469 if (*err == ERR_BAD_DX_DIR) 469 if (*err == ERR_BAD_DX_DIR)
470 ext4_warning(dir->i_sb, 470 ext4_warning(dir->i_sb,
471 "Corrupt dir inode %ld, running e2fsck is " 471 "Corrupt dir inode %lu, running e2fsck is "
472 "recommended.", dir->i_ino); 472 "recommended.", dir->i_ino);
473 return NULL; 473 return NULL;
474} 474}
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 475851896518..74cd1f7f1f88 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -60,7 +60,6 @@ void ext4_ioend_wait(struct inode *inode)
60static void put_io_page(struct ext4_io_page *io_page) 60static void put_io_page(struct ext4_io_page *io_page)
61{ 61{
62 if (atomic_dec_and_test(&io_page->p_count)) { 62 if (atomic_dec_and_test(&io_page->p_count)) {
63 end_page_writeback(io_page->p_page);
64 put_page(io_page->p_page); 63 put_page(io_page->p_page);
65 kmem_cache_free(io_page_cachep, io_page); 64 kmem_cache_free(io_page_cachep, io_page);
66 } 65 }
@@ -110,6 +109,8 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
110 if (io->iocb) 109 if (io->iocb)
111 aio_complete(io->iocb, io->result, 0); 110 aio_complete(io->iocb, io->result, 0);
112 111
112 if (io->flag & EXT4_IO_END_DIRECT)
113 inode_dio_done(inode);
113 /* Wake up anyone waiting on unwritten extent conversion */ 114 /* Wake up anyone waiting on unwritten extent conversion */
114 if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten)) 115 if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten))
115 wake_up_all(ext4_ioend_wq(io->inode)); 116 wake_up_all(ext4_ioend_wq(io->inode));
@@ -127,12 +128,18 @@ static void ext4_end_io_work(struct work_struct *work)
127 unsigned long flags; 128 unsigned long flags;
128 129
129 spin_lock_irqsave(&ei->i_completed_io_lock, flags); 130 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
131 if (io->flag & EXT4_IO_END_IN_FSYNC)
132 goto requeue;
130 if (list_empty(&io->list)) { 133 if (list_empty(&io->list)) {
131 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 134 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
132 goto free; 135 goto free;
133 } 136 }
134 137
135 if (!mutex_trylock(&inode->i_mutex)) { 138 if (!mutex_trylock(&inode->i_mutex)) {
139 bool was_queued;
140requeue:
141 was_queued = !!(io->flag & EXT4_IO_END_QUEUED);
142 io->flag |= EXT4_IO_END_QUEUED;
136 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 143 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
137 /* 144 /*
138 * Requeue the work instead of waiting so that the work 145 * Requeue the work instead of waiting so that the work
@@ -145,9 +152,8 @@ static void ext4_end_io_work(struct work_struct *work)
145 * yield the cpu if it sees an end_io request that has already 152 * yield the cpu if it sees an end_io request that has already
146 * been requeued. 153 * been requeued.
147 */ 154 */
148 if (io->flag & EXT4_IO_END_QUEUED) 155 if (was_queued)
149 yield(); 156 yield();
150 io->flag |= EXT4_IO_END_QUEUED;
151 return; 157 return;
152 } 158 }
153 list_del_init(&io->list); 159 list_del_init(&io->list);
@@ -227,9 +233,9 @@ static void ext4_end_bio(struct bio *bio, int error)
227 } while (bh != head); 233 } while (bh != head);
228 } 234 }
229 235
230 put_io_page(io_end->pages[i]); 236 if (atomic_read(&io_end->pages[i]->p_count) == 1)
237 end_page_writeback(io_end->pages[i]->p_page);
231 } 238 }
232 io_end->num_io_pages = 0;
233 inode = io_end->inode; 239 inode = io_end->inode;
234 240
235 if (error) { 241 if (error) {
@@ -421,6 +427,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
421 * PageWriteback bit from the page to prevent the system from 427 * PageWriteback bit from the page to prevent the system from
422 * wedging later on. 428 * wedging later on.
423 */ 429 */
430 if (atomic_read(&io_page->p_count) == 1)
431 end_page_writeback(page);
424 put_io_page(io_page); 432 put_io_page(io_page);
425 return ret; 433 return ret;
426} 434}
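
The thread through these page-io hunks: end_page_writeback() no longer fires unconditionally in put_io_page(); instead each completion path ends writeback only when it observes itself holding the last reference (p_count == 1) before dropping it. A small model of that rule using C11 atomics (the printfs stand in for the real completion work):

#include <stdatomic.h>
#include <stdio.h>

struct io_page {
	atomic_int p_count;
};

static void end_writeback(struct io_page *p)
{
	(void)p;
	printf("writeback done\n");
}

static void put_io_page(struct io_page *p)
{
	if (atomic_fetch_sub(&p->p_count, 1) == 1)
		printf("freed\n");	/* last reference gone */
}

static void io_complete(struct io_page *p)
{
	if (atomic_load(&p->p_count) == 1)	/* we hold the last ref */
		end_writeback(p);
	put_io_page(p);
}

int main(void)
{
	struct io_page p;

	atomic_init(&p.p_count, 2);
	io_complete(&p);	/* not last ref: writeback stays pending */
	io_complete(&p);	/* last ref: end writeback, then free */
	return 0;
}
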
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index f9d948f0eb86..59fa0be27251 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1163,8 +1163,11 @@ static void ext4_update_super(struct super_block *sb,
1163 do_div(reserved_blocks, 100); 1163 do_div(reserved_blocks, 100);
1164 1164
1165 ext4_blocks_count_set(es, ext4_blocks_count(es) + blocks_count); 1165 ext4_blocks_count_set(es, ext4_blocks_count(es) + blocks_count);
1166 ext4_free_blocks_count_set(es, ext4_free_blocks_count(es) + free_blocks);
1166 le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb) * 1167 le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb) *
1167 flex_gd->count); 1168 flex_gd->count);
1169 le32_add_cpu(&es->s_free_inodes_count, EXT4_INODES_PER_GROUP(sb) *
1170 flex_gd->count);
1168 1171
1169 /* 1172 /*
1170 * We need to protect s_groups_count against other CPUs seeing 1173 * We need to protect s_groups_count against other CPUs seeing
@@ -1465,6 +1468,7 @@ static int ext4_group_extend_no_check(struct super_block *sb,
1465 } 1468 }
1466 1469
1467 ext4_blocks_count_set(es, o_blocks_count + add); 1470 ext4_blocks_count_set(es, o_blocks_count + add);
1471 ext4_free_blocks_count_set(es, ext4_free_blocks_count(es) + add);
1468 ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, 1472 ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
1469 o_blocks_count + add); 1473 o_blocks_count + add);
1470 /* We add the blocks to the bitmap and set the group need init bit */ 1474 /* We add the blocks to the bitmap and set the group need init bit */
@@ -1512,16 +1516,17 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1512 o_blocks_count = ext4_blocks_count(es); 1516 o_blocks_count = ext4_blocks_count(es);
1513 1517
1514 if (test_opt(sb, DEBUG)) 1518 if (test_opt(sb, DEBUG))
1515 printk(KERN_DEBUG "EXT4-fs: extending last group from %llu to %llu blocks\n", 1519 ext4_msg(sb, KERN_DEBUG,
1516 o_blocks_count, n_blocks_count); 1520 "extending last group from %llu to %llu blocks",
1521 o_blocks_count, n_blocks_count);
1517 1522
1518 if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) 1523 if (n_blocks_count == 0 || n_blocks_count == o_blocks_count)
1519 return 0; 1524 return 0;
1520 1525
1521 if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { 1526 if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
1522 printk(KERN_ERR "EXT4-fs: filesystem on %s:" 1527 ext4_msg(sb, KERN_ERR,
1523 " too large to resize to %llu blocks safely\n", 1528 "filesystem too large to resize to %llu blocks safely",
1524 sb->s_id, n_blocks_count); 1529 n_blocks_count);
1525 if (sizeof(sector_t) < 8) 1530 if (sizeof(sector_t) < 8)
1526 ext4_warning(sb, "CONFIG_LBDAF not enabled"); 1531 ext4_warning(sb, "CONFIG_LBDAF not enabled");
1527 return -EINVAL; 1532 return -EINVAL;
@@ -1582,7 +1587,7 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
1582 ext4_fsblk_t o_blocks_count; 1587 ext4_fsblk_t o_blocks_count;
1583 ext4_group_t o_group; 1588 ext4_group_t o_group;
1584 ext4_group_t n_group; 1589 ext4_group_t n_group;
1585 ext4_grpblk_t offset; 1590 ext4_grpblk_t offset, add;
1586 unsigned long n_desc_blocks; 1591 unsigned long n_desc_blocks;
1587 unsigned long o_desc_blocks; 1592 unsigned long o_desc_blocks;
1588 unsigned long desc_blocks; 1593 unsigned long desc_blocks;
@@ -1591,8 +1596,8 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
1591 o_blocks_count = ext4_blocks_count(es); 1596 o_blocks_count = ext4_blocks_count(es);
1592 1597
1593 if (test_opt(sb, DEBUG)) 1598 if (test_opt(sb, DEBUG))
1594 printk(KERN_DEBUG "EXT4-fs: resizing filesystem from %llu " 1599 ext4_msg(sb, KERN_DEBUG, "resizing filesystem from %llu "
1595 "upto %llu blocks\n", o_blocks_count, n_blocks_count); 1600 "to %llu blocks", o_blocks_count, n_blocks_count);
1596 1601
1597 if (n_blocks_count < o_blocks_count) { 1602 if (n_blocks_count < o_blocks_count) {
1598 /* On-line shrinking not supported */ 1603 /* On-line shrinking not supported */
@@ -1605,7 +1610,7 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
1605 return 0; 1610 return 0;
1606 1611
1607 ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset); 1612 ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset);
1608 ext4_get_group_no_and_offset(sb, o_blocks_count, &o_group, &offset); 1613 ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset);
1609 1614
1610 n_desc_blocks = (n_group + EXT4_DESC_PER_BLOCK(sb)) / 1615 n_desc_blocks = (n_group + EXT4_DESC_PER_BLOCK(sb)) /
1611 EXT4_DESC_PER_BLOCK(sb); 1616 EXT4_DESC_PER_BLOCK(sb);
@@ -1634,10 +1639,12 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
1634 } 1639 }
1635 brelse(bh); 1640 brelse(bh);
1636 1641
1637 if (offset != 0) { 1642 /* extend the last group */
1638 /* extend the last group */ 1643 if (n_group == o_group)
1639 ext4_grpblk_t add; 1644 add = n_blocks_count - o_blocks_count;
1640 add = EXT4_BLOCKS_PER_GROUP(sb) - offset; 1645 else
1646 add = EXT4_BLOCKS_PER_GROUP(sb) - (offset + 1);
1647 if (add > 0) {
1641 err = ext4_group_extend_no_check(sb, o_blocks_count, add); 1648 err = ext4_group_extend_no_check(sb, o_blocks_count, add);
1642 if (err) 1649 if (err)
1643 goto out; 1650 goto out;
@@ -1674,7 +1681,7 @@ out:
1674 1681
1675 iput(resize_inode); 1682 iput(resize_inode);
1676 if (test_opt(sb, DEBUG)) 1683 if (test_opt(sb, DEBUG))
1677 printk(KERN_DEBUG "EXT4-fs: resized filesystem from %llu " 1684 ext4_msg(sb, KERN_DEBUG, "resized filesystem from %llu "
1678 "upto %llu blocks\n", o_blocks_count, n_blocks_count); 1685 "upto %llu blocks", o_blocks_count, n_blocks_count);
1679 return err; 1686 return err;
1680} 1687}
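
The resize math above now derives offset from the last existing block (o_blocks_count - 1), so topping up the old last group takes EXT4_BLOCKS_PER_GROUP(sb) - (offset + 1) blocks, and a resize that stays within one group is just the plain difference. A worked example with hypothetical sizes:

#include <stdio.h>

#define BLOCKS_PER_GROUP 32768ULL	/* assumed group size */

int main(void)
{
	unsigned long long o_blocks = 100000, n_blocks = 200000;
	unsigned long long o_group = (o_blocks - 1) / BLOCKS_PER_GROUP;
	unsigned long long n_group = (n_blocks - 1) / BLOCKS_PER_GROUP;
	unsigned long long offset  = (o_blocks - 1) % BLOCKS_PER_GROUP;
	unsigned long long add;

	if (n_group == o_group)
		add = n_blocks - o_blocks;	/* stays in the same group */
	else
		add = BLOCKS_PER_GROUP - (offset + 1);	/* fill old group */

	printf("extend last group by %llu blocks\n", add);	/* 31072 */
	return 0;
}
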
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 933900909ed0..ceebaf853beb 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -62,6 +62,7 @@ static struct ext4_features *ext4_feat;
62 62
63static int ext4_load_journal(struct super_block *, struct ext4_super_block *, 63static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
64 unsigned long journal_devnum); 64 unsigned long journal_devnum);
65static int ext4_show_options(struct seq_file *seq, struct dentry *root);
65static int ext4_commit_super(struct super_block *sb, int sync); 66static int ext4_commit_super(struct super_block *sb, int sync);
66static void ext4_mark_recovery_complete(struct super_block *sb, 67static void ext4_mark_recovery_complete(struct super_block *sb,
67 struct ext4_super_block *es); 68 struct ext4_super_block *es);
@@ -375,7 +376,7 @@ void ext4_journal_abort_handle(const char *caller, unsigned int line,
375 if (is_handle_aborted(handle)) 376 if (is_handle_aborted(handle))
376 return; 377 return;
377 378
378 printk(KERN_ERR "%s:%d: aborting transaction: %s in %s\n", 379 printk(KERN_ERR "EXT4-fs: %s:%d: aborting transaction: %s in %s\n",
379 caller, line, errstr, err_fn); 380 caller, line, errstr, err_fn);
380 381
381 jbd2_journal_abort_handle(handle); 382 jbd2_journal_abort_handle(handle);
@@ -431,6 +432,22 @@ static int block_device_ejected(struct super_block *sb)
431 return bdi->dev == NULL; 432 return bdi->dev == NULL;
432} 433}
433 434
435static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
436{
437 struct super_block *sb = journal->j_private;
438 struct ext4_sb_info *sbi = EXT4_SB(sb);
439 int error = is_journal_aborted(journal);
440 struct ext4_journal_cb_entry *jce, *tmp;
441
442 spin_lock(&sbi->s_md_lock);
443 list_for_each_entry_safe(jce, tmp, &txn->t_private_list, jce_list) {
444 list_del_init(&jce->jce_list);
445 spin_unlock(&sbi->s_md_lock);
446 jce->jce_func(sb, jce, error);
447 spin_lock(&sbi->s_md_lock);
448 }
449 spin_unlock(&sbi->s_md_lock);
450}
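
Note the locking shape of the new dispatcher: s_md_lock is dropped around each jce_func() call, since the callback may block or free the entry, and the entry is unlinked (list_del_init) before the lock is released. A user-space model of the same discipline with a POSIX mutex (the kernel walks the list with list_for_each_entry_safe(); this sketch pops from the head, but the lock handling is the point):

#include <pthread.h>
#include <stdio.h>

struct entry {
	struct entry *next;
	void (*func)(struct entry *e);
};

static void dispatch(pthread_mutex_t *lock, struct entry **head)
{
	pthread_mutex_lock(lock);
	while (*head) {
		struct entry *e = *head;

		*head = e->next;		/* unlink while locked */
		pthread_mutex_unlock(lock);
		e->func(e);			/* may sleep or free e */
		pthread_mutex_lock(lock);
	}
	pthread_mutex_unlock(lock);
}

static void hello(struct entry *e)
{
	printf("callback %p\n", (void *)e);
}

int main(void)
{
	pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	struct entry b = { NULL, hello };
	struct entry a = { &b, hello };
	struct entry *head = &a;

	dispatch(&lock, &head);
	return 0;
}
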
434 451
435/* Deal with the reporting of failure conditions on a filesystem such as 452/* Deal with the reporting of failure conditions on a filesystem such as
436 * inconsistencies detected or read IO failures. 453 * inconsistencies detected or read IO failures.
@@ -498,11 +515,16 @@ void ext4_error_inode(struct inode *inode, const char *function,
498 va_start(args, fmt); 515 va_start(args, fmt);
499 vaf.fmt = fmt; 516 vaf.fmt = fmt;
500 vaf.va = &args; 517 vaf.va = &args;
501 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: ",
502 inode->i_sb->s_id, function, line, inode->i_ino);
503 if (block) 518 if (block)
504 printk(KERN_CONT "block %llu: ", block); 519 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
505 printk(KERN_CONT "comm %s: %pV\n", current->comm, &vaf); 520 "inode #%lu: block %llu: comm %s: %pV\n",
521 inode->i_sb->s_id, function, line, inode->i_ino,
522 block, current->comm, &vaf);
523 else
524 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
525 "inode #%lu: comm %s: %pV\n",
526 inode->i_sb->s_id, function, line, inode->i_ino,
527 current->comm, &vaf);
506 va_end(args); 528 va_end(args);
507 529
508 ext4_handle_error(inode->i_sb); 530 ext4_handle_error(inode->i_sb);
@@ -524,15 +546,21 @@ void ext4_error_file(struct file *file, const char *function,
524 path = d_path(&(file->f_path), pathname, sizeof(pathname)); 546 path = d_path(&(file->f_path), pathname, sizeof(pathname));
525 if (IS_ERR(path)) 547 if (IS_ERR(path))
526 path = "(unknown)"; 548 path = "(unknown)";
527 printk(KERN_CRIT
528 "EXT4-fs error (device %s): %s:%d: inode #%lu: ",
529 inode->i_sb->s_id, function, line, inode->i_ino);
530 if (block)
531 printk(KERN_CONT "block %llu: ", block);
532 va_start(args, fmt); 549 va_start(args, fmt);
533 vaf.fmt = fmt; 550 vaf.fmt = fmt;
534 vaf.va = &args; 551 vaf.va = &args;
535 printk(KERN_CONT "comm %s: path %s: %pV\n", current->comm, path, &vaf); 552 if (block)
553 printk(KERN_CRIT
554 "EXT4-fs error (device %s): %s:%d: inode #%lu: "
555 "block %llu: comm %s: path %s: %pV\n",
556 inode->i_sb->s_id, function, line, inode->i_ino,
557 block, current->comm, path, &vaf);
558 else
559 printk(KERN_CRIT
560 "EXT4-fs error (device %s): %s:%d: inode #%lu: "
561 "comm %s: path %s: %pV\n",
562 inode->i_sb->s_id, function, line, inode->i_ino,
563 current->comm, path, &vaf);
536 va_end(args); 564 va_end(args);
537 565
538 ext4_handle_error(inode->i_sb); 566 ext4_handle_error(inode->i_sb);
@@ -808,9 +836,6 @@ static void ext4_put_super(struct super_block *sb)
808 destroy_workqueue(sbi->dio_unwritten_wq); 836 destroy_workqueue(sbi->dio_unwritten_wq);
809 837
810 lock_super(sb); 838 lock_super(sb);
811 if (sb->s_dirt)
812 ext4_commit_super(sb, 1);
813
814 if (sbi->s_journal) { 839 if (sbi->s_journal) {
815 err = jbd2_journal_destroy(sbi->s_journal); 840 err = jbd2_journal_destroy(sbi->s_journal);
816 sbi->s_journal = NULL; 841 sbi->s_journal = NULL;
@@ -827,9 +852,12 @@ static void ext4_put_super(struct super_block *sb)
827 if (!(sb->s_flags & MS_RDONLY)) { 852 if (!(sb->s_flags & MS_RDONLY)) {
828 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 853 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
829 es->s_state = cpu_to_le16(sbi->s_mount_state); 854 es->s_state = cpu_to_le16(sbi->s_mount_state);
830 ext4_commit_super(sb, 1);
831 } 855 }
856 if (sb->s_dirt || !(sb->s_flags & MS_RDONLY))
857 ext4_commit_super(sb, 1);
858
832 if (sbi->s_proc) { 859 if (sbi->s_proc) {
860 remove_proc_entry("options", sbi->s_proc);
833 remove_proc_entry(sb->s_id, ext4_proc_root); 861 remove_proc_entry(sb->s_id, ext4_proc_root);
834 } 862 }
835 kobject_del(&sbi->s_kobj); 863 kobject_del(&sbi->s_kobj);
@@ -990,180 +1018,6 @@ void ext4_clear_inode(struct inode *inode)
990 } 1018 }
991} 1019}
992 1020
993static inline void ext4_show_quota_options(struct seq_file *seq,
994 struct super_block *sb)
995{
996#if defined(CONFIG_QUOTA)
997 struct ext4_sb_info *sbi = EXT4_SB(sb);
998
999 if (sbi->s_jquota_fmt) {
1000 char *fmtname = "";
1001
1002 switch (sbi->s_jquota_fmt) {
1003 case QFMT_VFS_OLD:
1004 fmtname = "vfsold";
1005 break;
1006 case QFMT_VFS_V0:
1007 fmtname = "vfsv0";
1008 break;
1009 case QFMT_VFS_V1:
1010 fmtname = "vfsv1";
1011 break;
1012 }
1013 seq_printf(seq, ",jqfmt=%s", fmtname);
1014 }
1015
1016 if (sbi->s_qf_names[USRQUOTA])
1017 seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
1018
1019 if (sbi->s_qf_names[GRPQUOTA])
1020 seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
1021
1022 if (test_opt(sb, USRQUOTA))
1023 seq_puts(seq, ",usrquota");
1024
1025 if (test_opt(sb, GRPQUOTA))
1026 seq_puts(seq, ",grpquota");
1027#endif
1028}
1029
1030/*
1031 * Show an option if
1032 * - it's set to a non-default value OR
1033 * - if the per-sb default is different from the global default
1034 */
1035static int ext4_show_options(struct seq_file *seq, struct dentry *root)
1036{
1037 int def_errors;
1038 unsigned long def_mount_opts;
1039 struct super_block *sb = root->d_sb;
1040 struct ext4_sb_info *sbi = EXT4_SB(sb);
1041 struct ext4_super_block *es = sbi->s_es;
1042
1043 def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
1044 def_errors = le16_to_cpu(es->s_errors);
1045
1046 if (sbi->s_sb_block != 1)
1047 seq_printf(seq, ",sb=%llu", sbi->s_sb_block);
1048 if (test_opt(sb, MINIX_DF))
1049 seq_puts(seq, ",minixdf");
1050 if (test_opt(sb, GRPID) && !(def_mount_opts & EXT4_DEFM_BSDGROUPS))
1051 seq_puts(seq, ",grpid");
1052 if (!test_opt(sb, GRPID) && (def_mount_opts & EXT4_DEFM_BSDGROUPS))
1053 seq_puts(seq, ",nogrpid");
1054 if (sbi->s_resuid != EXT4_DEF_RESUID ||
1055 le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID) {
1056 seq_printf(seq, ",resuid=%u", sbi->s_resuid);
1057 }
1058 if (sbi->s_resgid != EXT4_DEF_RESGID ||
1059 le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID) {
1060 seq_printf(seq, ",resgid=%u", sbi->s_resgid);
1061 }
1062 if (test_opt(sb, ERRORS_RO)) {
1063 if (def_errors == EXT4_ERRORS_PANIC ||
1064 def_errors == EXT4_ERRORS_CONTINUE) {
1065 seq_puts(seq, ",errors=remount-ro");
1066 }
1067 }
1068 if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE)
1069 seq_puts(seq, ",errors=continue");
1070 if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC)
1071 seq_puts(seq, ",errors=panic");
1072 if (test_opt(sb, NO_UID32) && !(def_mount_opts & EXT4_DEFM_UID16))
1073 seq_puts(seq, ",nouid32");
1074 if (test_opt(sb, DEBUG) && !(def_mount_opts & EXT4_DEFM_DEBUG))
1075 seq_puts(seq, ",debug");
1076#ifdef CONFIG_EXT4_FS_XATTR
1077 if (test_opt(sb, XATTR_USER))
1078 seq_puts(seq, ",user_xattr");
1079 if (!test_opt(sb, XATTR_USER))
1080 seq_puts(seq, ",nouser_xattr");
1081#endif
1082#ifdef CONFIG_EXT4_FS_POSIX_ACL
1083 if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL))
1084 seq_puts(seq, ",acl");
1085 if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL))
1086 seq_puts(seq, ",noacl");
1087#endif
1088 if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
1089 seq_printf(seq, ",commit=%u",
1090 (unsigned) (sbi->s_commit_interval / HZ));
1091 }
1092 if (sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) {
1093 seq_printf(seq, ",min_batch_time=%u",
1094 (unsigned) sbi->s_min_batch_time);
1095 }
1096 if (sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) {
1097 seq_printf(seq, ",max_batch_time=%u",
1098 (unsigned) sbi->s_max_batch_time);
1099 }
1100
1101 /*
1102 * We're changing the default of barrier mount option, so
1103 * let's always display its mount state so it's clear what its
1104 * status is.
1105 */
1106 seq_puts(seq, ",barrier=");
1107 seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0");
1108 if (test_opt(sb, JOURNAL_ASYNC_COMMIT))
1109 seq_puts(seq, ",journal_async_commit");
1110 else if (test_opt(sb, JOURNAL_CHECKSUM))
1111 seq_puts(seq, ",journal_checksum");
1112 if (test_opt(sb, I_VERSION))
1113 seq_puts(seq, ",i_version");
1114 if (!test_opt(sb, DELALLOC) &&
1115 !(def_mount_opts & EXT4_DEFM_NODELALLOC))
1116 seq_puts(seq, ",nodelalloc");
1117
1118 if (!test_opt(sb, MBLK_IO_SUBMIT))
1119 seq_puts(seq, ",nomblk_io_submit");
1120 if (sbi->s_stripe)
1121 seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
1122 /*
1123 * journal mode get enabled in different ways
1124 * So just print the value even if we didn't specify it
1125 */
1126 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
1127 seq_puts(seq, ",data=journal");
1128 else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
1129 seq_puts(seq, ",data=ordered");
1130 else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
1131 seq_puts(seq, ",data=writeback");
1132
1133 if (sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS)
1134 seq_printf(seq, ",inode_readahead_blks=%u",
1135 sbi->s_inode_readahead_blks);
1136
1137 if (test_opt(sb, DATA_ERR_ABORT))
1138 seq_puts(seq, ",data_err=abort");
1139
1140 if (test_opt(sb, NO_AUTO_DA_ALLOC))
1141 seq_puts(seq, ",noauto_da_alloc");
1142
1143 if (test_opt(sb, DISCARD) && !(def_mount_opts & EXT4_DEFM_DISCARD))
1144 seq_puts(seq, ",discard");
1145
1146 if (test_opt(sb, NOLOAD))
1147 seq_puts(seq, ",norecovery");
1148
1149 if (test_opt(sb, DIOREAD_NOLOCK))
1150 seq_puts(seq, ",dioread_nolock");
1151
1152 if (test_opt(sb, BLOCK_VALIDITY) &&
1153 !(def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY))
1154 seq_puts(seq, ",block_validity");
1155
1156 if (!test_opt(sb, INIT_INODE_TABLE))
1157 seq_puts(seq, ",noinit_itable");
1158 else if (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)
1159 seq_printf(seq, ",init_itable=%u",
1160 (unsigned) sbi->s_li_wait_mult);
1161
1162 ext4_show_quota_options(seq, sb);
1163
1164 return 0;
1165}
1166
1167static struct inode *ext4_nfs_get_inode(struct super_block *sb, 1021static struct inode *ext4_nfs_get_inode(struct super_block *sb,
1168 u64 ino, u32 generation) 1022 u64 ino, u32 generation)
1169{ 1023{
@@ -1316,18 +1170,17 @@ static const struct export_operations ext4_export_ops = {
1316enum { 1170enum {
1317 Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, 1171 Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
1318 Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, 1172 Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
1319 Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov, 1173 Opt_nouid32, Opt_debug, Opt_removed,
1320 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, 1174 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
1321 Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, Opt_nobh, Opt_bh, 1175 Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload,
1322 Opt_commit, Opt_min_batch_time, Opt_max_batch_time, 1176 Opt_commit, Opt_min_batch_time, Opt_max_batch_time,
1323 Opt_journal_update, Opt_journal_dev, 1177 Opt_journal_dev, Opt_journal_checksum, Opt_journal_async_commit,
1324 Opt_journal_checksum, Opt_journal_async_commit,
1325 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, 1178 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
1326 Opt_data_err_abort, Opt_data_err_ignore, 1179 Opt_data_err_abort, Opt_data_err_ignore,
1327 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, 1180 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
1328 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, 1181 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
1329 Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, 1182 Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
1330 Opt_resize, Opt_usrquota, Opt_grpquota, Opt_i_version, 1183 Opt_usrquota, Opt_grpquota, Opt_i_version,
1331 Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit, 1184 Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit,
1332 Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, 1185 Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
1333 Opt_inode_readahead_blks, Opt_journal_ioprio, 1186 Opt_inode_readahead_blks, Opt_journal_ioprio,
@@ -1350,20 +1203,19 @@ static const match_table_t tokens = {
1350 {Opt_err_ro, "errors=remount-ro"}, 1203 {Opt_err_ro, "errors=remount-ro"},
1351 {Opt_nouid32, "nouid32"}, 1204 {Opt_nouid32, "nouid32"},
1352 {Opt_debug, "debug"}, 1205 {Opt_debug, "debug"},
1353 {Opt_oldalloc, "oldalloc"}, 1206 {Opt_removed, "oldalloc"},
1354 {Opt_orlov, "orlov"}, 1207 {Opt_removed, "orlov"},
1355 {Opt_user_xattr, "user_xattr"}, 1208 {Opt_user_xattr, "user_xattr"},
1356 {Opt_nouser_xattr, "nouser_xattr"}, 1209 {Opt_nouser_xattr, "nouser_xattr"},
1357 {Opt_acl, "acl"}, 1210 {Opt_acl, "acl"},
1358 {Opt_noacl, "noacl"}, 1211 {Opt_noacl, "noacl"},
1359 {Opt_noload, "noload"},
1360 {Opt_noload, "norecovery"}, 1212 {Opt_noload, "norecovery"},
1361 {Opt_nobh, "nobh"}, 1213 {Opt_noload, "noload"},
1362 {Opt_bh, "bh"}, 1214 {Opt_removed, "nobh"},
1215 {Opt_removed, "bh"},
1363 {Opt_commit, "commit=%u"}, 1216 {Opt_commit, "commit=%u"},
1364 {Opt_min_batch_time, "min_batch_time=%u"}, 1217 {Opt_min_batch_time, "min_batch_time=%u"},
1365 {Opt_max_batch_time, "max_batch_time=%u"}, 1218 {Opt_max_batch_time, "max_batch_time=%u"},
1366 {Opt_journal_update, "journal=update"},
1367 {Opt_journal_dev, "journal_dev=%u"}, 1219 {Opt_journal_dev, "journal_dev=%u"},
1368 {Opt_journal_checksum, "journal_checksum"}, 1220 {Opt_journal_checksum, "journal_checksum"},
1369 {Opt_journal_async_commit, "journal_async_commit"}, 1221 {Opt_journal_async_commit, "journal_async_commit"},
@@ -1389,7 +1241,6 @@ static const match_table_t tokens = {
1389 {Opt_nobarrier, "nobarrier"}, 1241 {Opt_nobarrier, "nobarrier"},
1390 {Opt_i_version, "i_version"}, 1242 {Opt_i_version, "i_version"},
1391 {Opt_stripe, "stripe=%u"}, 1243 {Opt_stripe, "stripe=%u"},
1392 {Opt_resize, "resize"},
1393 {Opt_delalloc, "delalloc"}, 1244 {Opt_delalloc, "delalloc"},
1394 {Opt_nodelalloc, "nodelalloc"}, 1245 {Opt_nodelalloc, "nodelalloc"},
1395 {Opt_mblk_io_submit, "mblk_io_submit"}, 1246 {Opt_mblk_io_submit, "mblk_io_submit"},
@@ -1408,6 +1259,11 @@ static const match_table_t tokens = {
1408 {Opt_init_itable, "init_itable=%u"}, 1259 {Opt_init_itable, "init_itable=%u"},
1409 {Opt_init_itable, "init_itable"}, 1260 {Opt_init_itable, "init_itable"},
1410 {Opt_noinit_itable, "noinit_itable"}, 1261 {Opt_noinit_itable, "noinit_itable"},
1262 {Opt_removed, "check=none"}, /* mount option from ext2/3 */
1263 {Opt_removed, "nocheck"}, /* mount option from ext2/3 */
1264 {Opt_removed, "reservation"}, /* mount option from ext2/3 */
1265 {Opt_removed, "noreservation"}, /* mount option from ext2/3 */
1266 {Opt_removed, "journal=%u"}, /* mount option from ext2/3 */
1411 {Opt_err, NULL}, 1267 {Opt_err, NULL},
1412}; 1268};
1413 1269
@@ -1496,420 +1352,273 @@ static int clear_qf_name(struct super_block *sb, int qtype)
1496} 1352}
1497#endif 1353#endif
1498 1354
1499static int parse_options(char *options, struct super_block *sb, 1355#define MOPT_SET 0x0001
1500 unsigned long *journal_devnum, 1356#define MOPT_CLEAR 0x0002
1501 unsigned int *journal_ioprio, 1357#define MOPT_NOSUPPORT 0x0004
1502 ext4_fsblk_t *n_blocks_count, int is_remount) 1358#define MOPT_EXPLICIT 0x0008
1503{ 1359#define MOPT_CLEAR_ERR 0x0010
1504 struct ext4_sb_info *sbi = EXT4_SB(sb); 1360#define MOPT_GTE0 0x0020
1505 char *p;
1506 substring_t args[MAX_OPT_ARGS];
1507 int data_opt = 0;
1508 int option;
1509#ifdef CONFIG_QUOTA 1361#ifdef CONFIG_QUOTA
1510 int qfmt; 1362#define MOPT_Q 0
1363#define MOPT_QFMT 0x0040
1364#else
1365#define MOPT_Q MOPT_NOSUPPORT
1366#define MOPT_QFMT MOPT_NOSUPPORT
1511#endif 1367#endif
1512 1368#define MOPT_DATAJ 0x0080
1513 if (!options) 1369
1514 return 1; 1370static const struct mount_opts {
1515 1371 int token;
1516 while ((p = strsep(&options, ",")) != NULL) { 1372 int mount_opt;
1517 int token; 1373 int flags;
1518 if (!*p) 1374} ext4_mount_opts[] = {
1519 continue; 1375 {Opt_minix_df, EXT4_MOUNT_MINIX_DF, MOPT_SET},
1520 1376 {Opt_bsd_df, EXT4_MOUNT_MINIX_DF, MOPT_CLEAR},
1521 /* 1377 {Opt_grpid, EXT4_MOUNT_GRPID, MOPT_SET},
1522 * Initialize args struct so we know whether arg was 1378 {Opt_nogrpid, EXT4_MOUNT_GRPID, MOPT_CLEAR},
1523 * found; some options take optional arguments. 1379 {Opt_mblk_io_submit, EXT4_MOUNT_MBLK_IO_SUBMIT, MOPT_SET},
1524 */ 1380 {Opt_nomblk_io_submit, EXT4_MOUNT_MBLK_IO_SUBMIT, MOPT_CLEAR},
1525 args[0].to = args[0].from = NULL; 1381 {Opt_block_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_SET},
1526 token = match_token(p, tokens, args); 1382 {Opt_noblock_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_CLEAR},
1527 switch (token) { 1383 {Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_SET},
1528 case Opt_bsd_df: 1384 {Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_CLEAR},
1529 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); 1385 {Opt_discard, EXT4_MOUNT_DISCARD, MOPT_SET},
1530 clear_opt(sb, MINIX_DF); 1386 {Opt_nodiscard, EXT4_MOUNT_DISCARD, MOPT_CLEAR},
1531 break; 1387 {Opt_delalloc, EXT4_MOUNT_DELALLOC, MOPT_SET | MOPT_EXPLICIT},
1532 case Opt_minix_df: 1388 {Opt_nodelalloc, EXT4_MOUNT_DELALLOC, MOPT_CLEAR | MOPT_EXPLICIT},
1533 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); 1389 {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, MOPT_SET},
1534 set_opt(sb, MINIX_DF); 1390 {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT |
1535 1391 EXT4_MOUNT_JOURNAL_CHECKSUM), MOPT_SET},
1536 break; 1392 {Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_SET},
1537 case Opt_grpid: 1393 {Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR},
1538 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); 1394 {Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR},
1539 set_opt(sb, GRPID); 1395 {Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR},
1540 1396 {Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_SET},
1541 break; 1397 {Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_CLEAR},
1542 case Opt_nogrpid: 1398 {Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET},
1543 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); 1399 {Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR},
1544 clear_opt(sb, GRPID); 1400 {Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET},
1545 1401 {Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR},
1546 break; 1402 {Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR},
1547 case Opt_resuid: 1403 {Opt_commit, 0, MOPT_GTE0},
1548 if (match_int(&args[0], &option)) 1404 {Opt_max_batch_time, 0, MOPT_GTE0},
1549 return 0; 1405 {Opt_min_batch_time, 0, MOPT_GTE0},
1550 sbi->s_resuid = option; 1406 {Opt_inode_readahead_blks, 0, MOPT_GTE0},
1551 break; 1407 {Opt_init_itable, 0, MOPT_GTE0},
1552 case Opt_resgid: 1408 {Opt_stripe, 0, MOPT_GTE0},
1553 if (match_int(&args[0], &option)) 1409 {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_DATAJ},
1554 return 0; 1410 {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_DATAJ},
1555 sbi->s_resgid = option; 1411 {Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA, MOPT_DATAJ},
1556 break;
1557 case Opt_sb:
1558 /* handled by get_sb_block() instead of here */
1559 /* *sb_block = match_int(&args[0]); */
1560 break;
1561 case Opt_err_panic:
1562 clear_opt(sb, ERRORS_CONT);
1563 clear_opt(sb, ERRORS_RO);
1564 set_opt(sb, ERRORS_PANIC);
1565 break;
1566 case Opt_err_ro:
1567 clear_opt(sb, ERRORS_CONT);
1568 clear_opt(sb, ERRORS_PANIC);
1569 set_opt(sb, ERRORS_RO);
1570 break;
1571 case Opt_err_cont:
1572 clear_opt(sb, ERRORS_RO);
1573 clear_opt(sb, ERRORS_PANIC);
1574 set_opt(sb, ERRORS_CONT);
1575 break;
1576 case Opt_nouid32:
1577 set_opt(sb, NO_UID32);
1578 break;
1579 case Opt_debug:
1580 set_opt(sb, DEBUG);
1581 break;
1582 case Opt_oldalloc:
1583 ext4_msg(sb, KERN_WARNING,
1584 "Ignoring deprecated oldalloc option");
1585 break;
1586 case Opt_orlov:
1587 ext4_msg(sb, KERN_WARNING,
1588 "Ignoring deprecated orlov option");
1589 break;
1590#ifdef CONFIG_EXT4_FS_XATTR 1412#ifdef CONFIG_EXT4_FS_XATTR
1591 case Opt_user_xattr: 1413 {Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET},
1592 set_opt(sb, XATTR_USER); 1414 {Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR},
1593 break;
1594 case Opt_nouser_xattr:
1595 clear_opt(sb, XATTR_USER);
1596 break;
1597#else 1415#else
1598 case Opt_user_xattr: 1416 {Opt_user_xattr, 0, MOPT_NOSUPPORT},
1599 case Opt_nouser_xattr: 1417 {Opt_nouser_xattr, 0, MOPT_NOSUPPORT},
1600 ext4_msg(sb, KERN_ERR, "(no)user_xattr options not supported");
1601 break;
1602#endif 1418#endif
1603#ifdef CONFIG_EXT4_FS_POSIX_ACL 1419#ifdef CONFIG_EXT4_FS_POSIX_ACL
1604 case Opt_acl: 1420 {Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET},
1605 set_opt(sb, POSIX_ACL); 1421 {Opt_noacl, EXT4_MOUNT_POSIX_ACL, MOPT_CLEAR},
1606 break;
1607 case Opt_noacl:
1608 clear_opt(sb, POSIX_ACL);
1609 break;
1610#else 1422#else
1611 case Opt_acl: 1423 {Opt_acl, 0, MOPT_NOSUPPORT},
1612 case Opt_noacl: 1424 {Opt_noacl, 0, MOPT_NOSUPPORT},
1613 ext4_msg(sb, KERN_ERR, "(no)acl options not supported");
1614 break;
1615#endif 1425#endif
1616 case Opt_journal_update:
1617 /* @@@ FIXME */
1618 /* Eventually we will want to be able to create
1619 a journal file here. For now, only allow the
1620 user to specify an existing inode to be the
1621 journal file. */
1622 if (is_remount) {
1623 ext4_msg(sb, KERN_ERR,
1624 "Cannot specify journal on remount");
1625 return 0;
1626 }
1627 set_opt(sb, UPDATE_JOURNAL);
1628 break;
1629 case Opt_journal_dev:
1630 if (is_remount) {
1426 {Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET},
1427 {Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET},
1428 {Opt_quota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q},
1429 {Opt_usrquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA,
1430 MOPT_SET | MOPT_Q},
1431 {Opt_grpquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_GRPQUOTA,
1432 MOPT_SET | MOPT_Q},
1433 {Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA |
1434 EXT4_MOUNT_GRPQUOTA), MOPT_CLEAR | MOPT_Q},
1435 {Opt_usrjquota, 0, MOPT_Q},
1436 {Opt_grpjquota, 0, MOPT_Q},
1437 {Opt_offusrjquota, 0, MOPT_Q},
1438 {Opt_offgrpjquota, 0, MOPT_Q},
1439 {Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT},
1440 {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT},
1441 {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT},
1442 {Opt_err, 0, 0}
1443};
1444
1445static int handle_mount_opt(struct super_block *sb, char *opt, int token,
1446 substring_t *args, unsigned long *journal_devnum,
1447 unsigned int *journal_ioprio, int is_remount)
1448{
1449 struct ext4_sb_info *sbi = EXT4_SB(sb);
1450 const struct mount_opts *m;
1451 int arg = 0;
1452
1453 if (args->from && match_int(args, &arg))
1454 return -1;
1455 switch (token) {
1456 case Opt_noacl:
1457 case Opt_nouser_xattr:
1458 ext4_msg(sb, KERN_WARNING, deprecated_msg, opt, "3.5");
1459 break;
1460 case Opt_sb:
1461 return 1; /* handled by get_sb_block() */
1462 case Opt_removed:
1463 ext4_msg(sb, KERN_WARNING,
1464 "Ignoring removed %s option", opt);
1465 return 1;
1466 case Opt_resuid:
1467 sbi->s_resuid = arg;
1468 return 1;
1469 case Opt_resgid:
1470 sbi->s_resgid = arg;
1471 return 1;
1472 case Opt_abort:
1473 sbi->s_mount_flags |= EXT4_MF_FS_ABORTED;
1474 return 1;
1475 case Opt_i_version:
1476 sb->s_flags |= MS_I_VERSION;
1477 return 1;
1478 case Opt_journal_dev:
1479 if (is_remount) {
1480 ext4_msg(sb, KERN_ERR,
1481 "Cannot specify journal on remount");
1482 return -1;
1483 }
1484 *journal_devnum = arg;
1485 return 1;
1486 case Opt_journal_ioprio:
1487 if (arg < 0 || arg > 7)
1488 return -1;
1489 *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg);
1490 return 1;
1491 }
1492
1493 for (m = ext4_mount_opts; m->token != Opt_err; m++) {
1494 if (token != m->token)
1495 continue;
1496 if (args->from && (m->flags & MOPT_GTE0) && (arg < 0))
1497 return -1;
1498 if (m->flags & MOPT_EXPLICIT)
1499 set_opt2(sb, EXPLICIT_DELALLOC);
1500 if (m->flags & MOPT_CLEAR_ERR)
1501 clear_opt(sb, ERRORS_MASK);
1502 if (token == Opt_noquota && sb_any_quota_loaded(sb)) {
1503 ext4_msg(sb, KERN_ERR, "Cannot change quota "
1504 "options when quota turned on");
1505 return -1;
1506 }
1507
1508 if (m->flags & MOPT_NOSUPPORT) {
1509 ext4_msg(sb, KERN_ERR, "%s option not supported", opt);
1510 } else if (token == Opt_commit) {
1511 if (arg == 0)
1512 arg = JBD2_DEFAULT_MAX_COMMIT_AGE;
1513 sbi->s_commit_interval = HZ * arg;
1514 } else if (token == Opt_max_batch_time) {
1515 if (arg == 0)
1516 arg = EXT4_DEF_MAX_BATCH_TIME;
1517 sbi->s_max_batch_time = arg;
1518 } else if (token == Opt_min_batch_time) {
1519 sbi->s_min_batch_time = arg;
1520 } else if (token == Opt_inode_readahead_blks) {
1521 if (arg > (1 << 30))
1522 return -1;
1523 if (arg && !is_power_of_2(arg)) {
1631 ext4_msg(sb, KERN_ERR,
1632 "Cannot specify journal on remount");
1633 return 0;
1634 }
1635 if (match_int(&args[0], &option))
1636 return 0;
1637 *journal_devnum = option;
1638 break;
1639 case Opt_journal_checksum:
1640 set_opt(sb, JOURNAL_CHECKSUM);
1641 break;
1642 case Opt_journal_async_commit:
1643 set_opt(sb, JOURNAL_ASYNC_COMMIT);
1644 set_opt(sb, JOURNAL_CHECKSUM);
1524 ext4_msg(sb, KERN_ERR,
1525 "EXT4-fs: inode_readahead_blks"
1526 " must be a power of 2");
1527 return -1;
1528 }
1529 sbi->s_inode_readahead_blks = arg;
1530 } else if (token == Opt_init_itable) {
1531 set_opt(sb, INIT_INODE_TABLE);
1532 if (!args->from)
1533 arg = EXT4_DEF_LI_WAIT_MULT;
1534 sbi->s_li_wait_mult = arg;
1535 } else if (token == Opt_stripe) {
1536 sbi->s_stripe = arg;
1537 } else if (m->flags & MOPT_DATAJ) {
1645 break;
1646 case Opt_noload:
1647 set_opt(sb, NOLOAD);
1648 break;
1649 case Opt_commit:
1650 if (match_int(&args[0], &option))
1651 return 0;
1652 if (option < 0)
1653 return 0;
1654 if (option == 0)
1655 option = JBD2_DEFAULT_MAX_COMMIT_AGE;
1656 sbi->s_commit_interval = HZ * option;
1657 break;
1658 case Opt_max_batch_time:
1659 if (match_int(&args[0], &option))
1660 return 0;
1661 if (option < 0)
1662 return 0;
1663 if (option == 0)
1664 option = EXT4_DEF_MAX_BATCH_TIME;
1665 sbi->s_max_batch_time = option;
1666 break;
1667 case Opt_min_batch_time:
1668 if (match_int(&args[0], &option))
1669 return 0;
1670 if (option < 0)
1671 return 0;
1672 sbi->s_min_batch_time = option;
1673 break;
1674 case Opt_data_journal:
1675 data_opt = EXT4_MOUNT_JOURNAL_DATA;
1676 goto datacheck;
1677 case Opt_data_ordered:
1678 data_opt = EXT4_MOUNT_ORDERED_DATA;
1679 goto datacheck;
1680 case Opt_data_writeback:
1681 data_opt = EXT4_MOUNT_WRITEBACK_DATA;
1682 datacheck:
1683 if (is_remount) {
1684 if (!sbi->s_journal)
1685 ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option");
1686 else if (test_opt(sb, DATA_FLAGS) != data_opt) {
1687 ext4_msg(sb, KERN_ERR,
1688 "Cannot change data mode on remount");
1689 return 0;
1690 }
1691 } else {
1692 clear_opt(sb, DATA_FLAGS);
1693 sbi->s_mount_opt |= data_opt;
1694 }
1538 if (is_remount) {
1539 if (!sbi->s_journal)
1540 ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option");
1541 else if (test_opt(sb, DATA_FLAGS) !=
1542 m->mount_opt) {
1543 ext4_msg(sb, KERN_ERR,
1544 "Cannot change data mode on remount");
1545 return -1;
1546 }
1547 } else {
1548 clear_opt(sb, DATA_FLAGS);
1549 sbi->s_mount_opt |= m->mount_opt;
1550 }
1695 break;
1696 case Opt_data_err_abort:
1697 set_opt(sb, DATA_ERR_ABORT);
1698 break;
1699 case Opt_data_err_ignore:
1700 clear_opt(sb, DATA_ERR_ABORT);
1701 break;
1702#ifdef CONFIG_QUOTA
1703 case Opt_usrjquota:
1704 if (!set_qf_name(sb, USRQUOTA, &args[0]))
1705 return 0;
1706 break;
1707 case Opt_grpjquota:
1708 if (!set_qf_name(sb, GRPQUOTA, &args[0]))
1709 return 0;
1710 break;
1711 case Opt_offusrjquota:
1712 if (!clear_qf_name(sb, USRQUOTA))
1713 return 0;
1714 break;
1715 case Opt_offgrpjquota:
1716 if (!clear_qf_name(sb, GRPQUOTA))
1717 return 0;
1718 break;
1551#ifdef CONFIG_QUOTA
1552 } else if (token == Opt_usrjquota) {
1553 if (!set_qf_name(sb, USRQUOTA, &args[0]))
1554 return -1;
1555 } else if (token == Opt_grpjquota) {
1556 if (!set_qf_name(sb, GRPQUOTA, &args[0]))
1557 return -1;
1558 } else if (token == Opt_offusrjquota) {
1559 if (!clear_qf_name(sb, USRQUOTA))
1560 return -1;
1561 } else if (token == Opt_offgrpjquota) {
1562 if (!clear_qf_name(sb, GRPQUOTA))
1563 return -1;
1564 } else if (m->flags & MOPT_QFMT) {
1719
1720 case Opt_jqfmt_vfsold:
1721 qfmt = QFMT_VFS_OLD;
1722 goto set_qf_format;
1723 case Opt_jqfmt_vfsv0:
1724 qfmt = QFMT_VFS_V0;
1725 goto set_qf_format;
1726 case Opt_jqfmt_vfsv1:
1727 qfmt = QFMT_VFS_V1;
1728set_qf_format:
1729 if (sb_any_quota_loaded(sb) &&
1730 sbi->s_jquota_fmt != qfmt) {
1731 ext4_msg(sb, KERN_ERR, "Cannot change "
1732 "journaled quota options when "
1733 "quota turned on");
1734 return 0;
1565 if (sb_any_quota_loaded(sb) &&
1566 sbi->s_jquota_fmt != m->mount_opt) {
1567 ext4_msg(sb, KERN_ERR, "Cannot "
1568 "change journaled quota options "
1569 "when quota turned on");
1570 return -1;
1735 }
1736 sbi->s_jquota_fmt = qfmt;
1737 break;
1738 case Opt_quota:
1739 case Opt_usrquota:
1740 set_opt(sb, QUOTA);
1741 set_opt(sb, USRQUOTA);
1742 break;
1743 case Opt_grpquota:
1744 set_opt(sb, QUOTA);
1745 set_opt(sb, GRPQUOTA);
1746 break;
1747 case Opt_noquota:
1748 if (sb_any_quota_loaded(sb)) {
1749 ext4_msg(sb, KERN_ERR, "Cannot change quota "
1750 "options when quota turned on");
1751 return 0;
1752 }
1753 clear_opt(sb, QUOTA);
1571 }
1572 sbi->s_jquota_fmt = m->mount_opt;
1754 clear_opt(sb, USRQUOTA);
1755 clear_opt(sb, GRPQUOTA);
1756 break;
1757#else
1758 case Opt_quota:
1759 case Opt_usrquota:
1760 case Opt_grpquota:
1761 ext4_msg(sb, KERN_ERR,
1762 "quota options not supported");
1763 break;
1764 case Opt_usrjquota:
1765 case Opt_grpjquota:
1766 case Opt_offusrjquota:
1767 case Opt_offgrpjquota:
1768 case Opt_jqfmt_vfsold:
1769 case Opt_jqfmt_vfsv0:
1770 case Opt_jqfmt_vfsv1:
1771 ext4_msg(sb, KERN_ERR,
1772 "journaled quota options not supported");
1773 break;
1774 case Opt_noquota:
1775 break;
1776#endif
1777 case Opt_abort:
1778 sbi->s_mount_flags |= EXT4_MF_FS_ABORTED;
1779 break;
1780 case Opt_nobarrier:
1781 clear_opt(sb, BARRIER);
1782 break;
1783 case Opt_barrier:
1784 if (args[0].from) {
1785 if (match_int(&args[0], &option))
1786 return 0;
1787 } else
1788 option = 1; /* No argument, default to 1 */
1789 if (option)
1790 set_opt(sb, BARRIER);
1791 else
1792 clear_opt(sb, BARRIER);
1793 break;
1573#endif
1574 } else {
1575 if (!args->from)
1576 arg = 1;
1577 if (m->flags & MOPT_CLEAR)
1578 arg = !arg;
1579 else if (unlikely(!(m->flags & MOPT_SET))) {
1580 ext4_msg(sb, KERN_WARNING,
1581 "buggy handling of option %s", opt);
1582 WARN_ON(1);
1583 return -1;
1794 case Opt_ignore:
1795 break;
1796 case Opt_resize:
1797 if (!is_remount) {
1798 ext4_msg(sb, KERN_ERR,
1799 "resize option only available "
1800 "for remount");
1801 return 0;
1802 }
1803 if (match_int(&args[0], &option) != 0)
1804 return 0;
1805 *n_blocks_count = option;
1806 break;
1807 case Opt_nobh:
1808 ext4_msg(sb, KERN_WARNING,
1809 "Ignoring deprecated nobh option");
1810 break;
1811 case Opt_bh:
1812 ext4_msg(sb, KERN_WARNING,
1813 "Ignoring deprecated bh option");
1814 break;
1815 case Opt_i_version:
1816 set_opt(sb, I_VERSION);
1817 sb->s_flags |= MS_I_VERSION;
1818 break;
1819 case Opt_nodelalloc:
1820 clear_opt(sb, DELALLOC);
1821 clear_opt2(sb, EXPLICIT_DELALLOC);
1822 break;
1823 case Opt_mblk_io_submit:
1824 set_opt(sb, MBLK_IO_SUBMIT);
1825 break;
1826 case Opt_nomblk_io_submit:
1827 clear_opt(sb, MBLK_IO_SUBMIT);
1828 break;
1829 case Opt_stripe:
1830 if (match_int(&args[0], &option))
1831 return 0;
1832 if (option < 0)
1833 return 0;
1834 sbi->s_stripe = option;
1835 break;
1836 case Opt_delalloc:
1837 set_opt(sb, DELALLOC);
1838 set_opt2(sb, EXPLICIT_DELALLOC);
1839 break;
1840 case Opt_block_validity:
1841 set_opt(sb, BLOCK_VALIDITY);
1842 break;
1843 case Opt_noblock_validity:
1844 clear_opt(sb, BLOCK_VALIDITY);
1845 break;
1846 case Opt_inode_readahead_blks:
1847 if (match_int(&args[0], &option))
1848 return 0;
1849 if (option < 0 || option > (1 << 30))
1850 return 0;
1851 if (option && !is_power_of_2(option)) {
1852 ext4_msg(sb, KERN_ERR,
1853 "EXT4-fs: inode_readahead_blks"
1854 " must be a power of 2");
1855 return 0;
1856 }
1857 sbi->s_inode_readahead_blks = option;
1858 break;
1584 }
1585 if (arg != 0)
1586 sbi->s_mount_opt |= m->mount_opt;
1859 case Opt_journal_ioprio:
1860 if (match_int(&args[0], &option))
1861 return 0;
1862 if (option < 0 || option > 7)
1863 break;
1864 *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE,
1865 option);
1866 break;
1867 case Opt_noauto_da_alloc:
1868 set_opt(sb, NO_AUTO_DA_ALLOC);
1869 break;
1870 case Opt_auto_da_alloc:
1871 if (args[0].from) {
1872 if (match_int(&args[0], &option))
1873 return 0;
1874 } else
1875 option = 1; /* No argument, default to 1 */
1876 if (option)
1877 clear_opt(sb, NO_AUTO_DA_ALLOC);
1878 else
1879 set_opt(sb, NO_AUTO_DA_ALLOC);
1587 else
1588 sbi->s_mount_opt &= ~m->mount_opt;
1880 break;
1881 case Opt_discard:
1882 set_opt(sb, DISCARD);
1883 break;
1884 case Opt_nodiscard:
1885 clear_opt(sb, DISCARD);
1886 break;
1887 case Opt_dioread_nolock:
1888 set_opt(sb, DIOREAD_NOLOCK);
1889 break;
1890 case Opt_dioread_lock:
1891 clear_opt(sb, DIOREAD_NOLOCK);
1892 break;
1893 case Opt_init_itable:
1894 set_opt(sb, INIT_INODE_TABLE);
1895 if (args[0].from) {
1896 if (match_int(&args[0], &option))
1897 return 0;
1898 } else
1899 option = EXT4_DEF_LI_WAIT_MULT;
1900 if (option < 0)
1901 return 0;
1902 sbi->s_li_wait_mult = option;
1903 break;
1904 case Opt_noinit_itable:
1905 clear_opt(sb, INIT_INODE_TABLE);
1906 break;
1907 default:
1908 ext4_msg(sb, KERN_ERR,
1909 "Unrecognized mount option \"%s\" "
1910 "or missing value", p);
1911 return 0;
1912 }
1589 }
1590 return 1;
1591 }
1592 ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" "
1593 "or missing value", opt);
1594 return -1;
1595}
1596
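The new handle_mount_opt() replaces most of the old switch with a walk over
the mount_opts table: each entry names the EXT4_MOUNT_* bits an option
touches and carries MOPT_* flags saying whether the option sets or clears
them. The sketch below shows the same table-driven pattern as self-contained
userspace C; the names (struct mopt, OPT_SET, MNT_*) are illustrative
stand-ins, not the kernel definitions.

#include <stdio.h>
#include <string.h>

#define OPT_SET   0x01	/* option sets bits in the mount state */
#define OPT_CLEAR 0x02	/* option clears bits in the mount state */

#define MNT_BARRIER 0x10
#define MNT_DISCARD 0x20

struct mopt {
	const char *name;	/* option name as typed by the user */
	unsigned int mount_opt;	/* bits it affects */
	unsigned int flags;	/* OPT_SET or OPT_CLEAR */
};

static const struct mopt opts[] = {
	{ "barrier",   MNT_BARRIER, OPT_SET   },
	{ "nobarrier", MNT_BARRIER, OPT_CLEAR },
	{ "discard",   MNT_DISCARD, OPT_SET   },
	{ "nodiscard", MNT_DISCARD, OPT_CLEAR },
	{ NULL, 0, 0 }
};

/* Apply one option token to *state; return 0 on success, -1 if unknown. */
static int handle_opt(const char *tok, unsigned int *state)
{
	const struct mopt *m;

	for (m = opts; m->name; m++) {
		if (strcmp(tok, m->name))
			continue;
		if (m->flags & OPT_SET)
			*state |= m->mount_opt;
		else
			*state &= ~m->mount_opt;
		return 0;
	}
	return -1;
}

int main(void)
{
	unsigned int state = MNT_BARRIER;	/* default */

	handle_opt("nobarrier", &state);
	handle_opt("discard", &state);
	printf("state = 0x%x\n", state);	/* prints state = 0x20 */
	return 0;
}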
1597static int parse_options(char *options, struct super_block *sb,
1598 unsigned long *journal_devnum,
1599 unsigned int *journal_ioprio,
1600 int is_remount)
1601{
1602 struct ext4_sb_info *sbi = EXT4_SB(sb);
1603 char *p;
1604 substring_t args[MAX_OPT_ARGS];
1605 int token;
1606
1607 if (!options)
1608 return 1;
1609
1610 while ((p = strsep(&options, ",")) != NULL) {
1611 if (!*p)
1612 continue;
1613 /*
1614 * Initialize args struct so we know whether arg was
1615 * found; some options take optional arguments.
1616 */
1617 args[0].to = args[0].from = 0;
1618 token = match_token(p, tokens, args);
1619 if (handle_mount_opt(sb, p, token, args, journal_devnum,
1620 journal_ioprio, is_remount) < 0)
1621 return 0;
1913 } 1622 }
1914#ifdef CONFIG_QUOTA 1623#ifdef CONFIG_QUOTA
1915 if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { 1624 if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
@@ -1942,6 +1651,160 @@ set_qf_format:
1942 return 1; 1651 return 1;
1943} 1652}
1944 1653
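parse_options() above splits the option string on commas with strsep() and
hands each non-empty token to handle_mount_opt(). A minimal standalone
illustration of that tokenizing loop (the option string is made up):

#include <stdio.h>
#include <string.h>

int main(void)
{
	char buf[] = "noatime,,data=ordered,commit=5";
	char *options = buf, *p;

	/* strsep() consumes the buffer in place, returning one
	 * comma-separated token per call; empty tokens (",,") come
	 * back as "" and are skipped, as in parse_options(). */
	while ((p = strsep(&options, ",")) != NULL) {
		if (!*p)
			continue;
		printf("token: %s\n", p);
	}
	return 0;
}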
1654static inline void ext4_show_quota_options(struct seq_file *seq,
1655 struct super_block *sb)
1656{
1657#if defined(CONFIG_QUOTA)
1658 struct ext4_sb_info *sbi = EXT4_SB(sb);
1659
1660 if (sbi->s_jquota_fmt) {
1661 char *fmtname = "";
1662
1663 switch (sbi->s_jquota_fmt) {
1664 case QFMT_VFS_OLD:
1665 fmtname = "vfsold";
1666 break;
1667 case QFMT_VFS_V0:
1668 fmtname = "vfsv0";
1669 break;
1670 case QFMT_VFS_V1:
1671 fmtname = "vfsv1";
1672 break;
1673 }
1674 seq_printf(seq, ",jqfmt=%s", fmtname);
1675 }
1676
1677 if (sbi->s_qf_names[USRQUOTA])
1678 seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
1679
1680 if (sbi->s_qf_names[GRPQUOTA])
1681 seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
1682
1683 if (test_opt(sb, USRQUOTA))
1684 seq_puts(seq, ",usrquota");
1685
1686 if (test_opt(sb, GRPQUOTA))
1687 seq_puts(seq, ",grpquota");
1688#endif
1689}
1690
1691static const char *token2str(int token)
1692{
1693 static const struct match_token *t;
1694
1695 for (t = tokens; t->token != Opt_err; t++)
1696 if (t->token == token && !strchr(t->pattern, '='))
1697 break;
1698 return t->pattern;
1699}
1700
1701/*
1702 * Show an option if
1703 * - it's set to a non-default value OR
1704 * - the per-sb default is different from the global default
1705 */
1706static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
1707 int nodefs)
1708{
1709 struct ext4_sb_info *sbi = EXT4_SB(sb);
1710 struct ext4_super_block *es = sbi->s_es;
1711 int def_errors, def_mount_opt = nodefs ? 0 : sbi->s_def_mount_opt;
1712 const struct mount_opts *m;
1713 char sep = nodefs ? '\n' : ',';
1714
1715#define SEQ_OPTS_PUTS(str) seq_printf(seq, "%c" str, sep)
1716#define SEQ_OPTS_PRINT(str, arg) seq_printf(seq, "%c" str, sep, arg)
1717
1718 if (sbi->s_sb_block != 1)
1719 SEQ_OPTS_PRINT("sb=%llu", sbi->s_sb_block);
1720
1721 for (m = ext4_mount_opts; m->token != Opt_err; m++) {
1722 int want_set = m->flags & MOPT_SET;
1723 if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) ||
1724 (m->flags & MOPT_CLEAR_ERR))
1725 continue;
1726 if (!(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt)))
1727 continue; /* skip if same as the default */
1728 if ((want_set &&
1729 (sbi->s_mount_opt & m->mount_opt) != m->mount_opt) ||
1730 (!want_set && (sbi->s_mount_opt & m->mount_opt)))
1731 continue; /* select Opt_noFoo vs Opt_Foo */
1732 SEQ_OPTS_PRINT("%s", token2str(m->token));
1733 }
1734
1735 if (nodefs || sbi->s_resuid != EXT4_DEF_RESUID ||
1736 le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID)
1737 SEQ_OPTS_PRINT("resuid=%u", sbi->s_resuid);
1738 if (nodefs || sbi->s_resgid != EXT4_DEF_RESGID ||
1739 le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID)
1740 SEQ_OPTS_PRINT("resgid=%u", sbi->s_resgid);
1741 def_errors = nodefs ? -1 : le16_to_cpu(es->s_errors);
1742 if (test_opt(sb, ERRORS_RO) && def_errors != EXT4_ERRORS_RO)
1743 SEQ_OPTS_PUTS("errors=remount-ro");
1744 if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE)
1745 SEQ_OPTS_PUTS("errors=continue");
1746 if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC)
1747 SEQ_OPTS_PUTS("errors=panic");
1748 if (nodefs || sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ)
1749 SEQ_OPTS_PRINT("commit=%lu", sbi->s_commit_interval / HZ);
1750 if (nodefs || sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME)
1751 SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time);
1752 if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME)
1753 SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time);
1754 if (sb->s_flags & MS_I_VERSION)
1755 SEQ_OPTS_PUTS("i_version");
1756 if (nodefs || sbi->s_stripe)
1757 SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe);
1758 if (EXT4_MOUNT_DATA_FLAGS & (sbi->s_mount_opt ^ def_mount_opt)) {
1759 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
1760 SEQ_OPTS_PUTS("data=journal");
1761 else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
1762 SEQ_OPTS_PUTS("data=ordered");
1763 else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
1764 SEQ_OPTS_PUTS("data=writeback");
1765 }
1766 if (nodefs ||
1767 sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS)
1768 SEQ_OPTS_PRINT("inode_readahead_blks=%u",
1769 sbi->s_inode_readahead_blks);
1770
1771 if (nodefs || (test_opt(sb, INIT_INODE_TABLE) &&
1772 (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)))
1773 SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult);
1774
1775 ext4_show_quota_options(seq, sb);
1776 return 0;
1777}
1778
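_ext4_show_options() above emits every option behind a separator character:
',' yields the one-line /proc/mounts form and '\n' the one-option-per-line
dump, which is all the SEQ_OPTS_* macros abstract. A small userspace sketch
of the same trick (the function and option names are invented for the
example):

#include <stdio.h>

/* Emit options prefixed by 'sep': ',' gives the /proc/mounts style
 * one-liner, '\n' gives one option per line (the nodefs view). */
static void show_options(char sep, unsigned int stripe)
{
	printf("%cerrors=remount-ro", sep);
	if (stripe)
		printf("%cstripe=%u", sep, stripe);
}

int main(void)
{
	printf("rw");
	show_options(',', 32);	/* rw,errors=remount-ro,stripe=32 */
	printf("\n");
	return 0;
}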
1779static int ext4_show_options(struct seq_file *seq, struct dentry *root)
1780{
1781 return _ext4_show_options(seq, root->d_sb, 0);
1782}
1783
1784static int options_seq_show(struct seq_file *seq, void *offset)
1785{
1786 struct super_block *sb = seq->private;
1787 int rc;
1788
1789 seq_puts(seq, (sb->s_flags & MS_RDONLY) ? "ro" : "rw");
1790 rc = _ext4_show_options(seq, sb, 1);
1791 seq_puts(seq, "\n");
1792 return rc;
1793}
1794
1795static int options_open_fs(struct inode *inode, struct file *file)
1796{
1797 return single_open(file, options_seq_show, PDE(inode)->data);
1798}
1799
1800static const struct file_operations ext4_seq_options_fops = {
1801 .owner = THIS_MODULE,
1802 .open = options_open_fs,
1803 .read = seq_read,
1804 .llseek = seq_lseek,
1805 .release = single_release,
1806};
1807
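ext4_seq_options_fops above wires the new /proc/fs/ext4/<dev>/options file
into the seq_file framework through single_open(). The fragment below is a
generic sketch of that pattern against the procfs API of this era (demo_*
names are placeholders; error handling is trimmed):

#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int demo_show(struct seq_file *m, void *v)
{
	/* seq_printf() appends to an in-kernel buffer; the seq_file
	 * core then handles short reads and llseek for us. */
	seq_printf(m, "hello from %p\n", m->private);
	return 0;
}

static int demo_open(struct inode *inode, struct file *file)
{
	/* PDE(inode)->data is whatever was passed to proc_create_data() */
	return single_open(file, demo_show, PDE(inode)->data);
}

static const struct file_operations demo_fops = {
	.owner   = THIS_MODULE,
	.open    = demo_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = single_release,
};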
1945static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, 1808static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1946 int read_only) 1809 int read_only)
1947{ 1810{
@@ -2945,7 +2808,7 @@ static int ext4_run_lazyinit_thread(void)
2945 ext4_clear_request_list(); 2808 ext4_clear_request_list();
2946 kfree(ext4_li_info); 2809 kfree(ext4_li_info);
2947 ext4_li_info = NULL; 2810 ext4_li_info = NULL;
2948 printk(KERN_CRIT "EXT4: error %d creating inode table "
2811 printk(KERN_CRIT "EXT4-fs: error %d creating inode table "
2949 "initialization thread\n", 2812 "initialization thread\n",
2950 err); 2813 err);
2951 return err; 2814 return err;
@@ -3183,11 +3046,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3183 set_opt(sb, INIT_INODE_TABLE); 3046 set_opt(sb, INIT_INODE_TABLE);
3184 if (def_mount_opts & EXT4_DEFM_DEBUG) 3047 if (def_mount_opts & EXT4_DEFM_DEBUG)
3185 set_opt(sb, DEBUG); 3048 set_opt(sb, DEBUG);
3186 if (def_mount_opts & EXT4_DEFM_BSDGROUPS) {
3187 ext4_msg(sb, KERN_WARNING, deprecated_msg, "bsdgroups",
3188 "2.6.38");
3189 set_opt(sb, GRPID);
3190 }
3049 if (def_mount_opts & EXT4_DEFM_BSDGROUPS)
3050 set_opt(sb, GRPID);
3191 if (def_mount_opts & EXT4_DEFM_UID16) 3051 if (def_mount_opts & EXT4_DEFM_UID16)
3192 set_opt(sb, NO_UID32); 3052 set_opt(sb, NO_UID32);
3193 /* xattr user namespace & acls are now defaulted on */ 3053 /* xattr user namespace & acls are now defaulted on */
@@ -3240,13 +3100,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3240 sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; 3100 sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
3241 3101
3242 if (!parse_options((char *) sbi->s_es->s_mount_opts, sb, 3102 if (!parse_options((char *) sbi->s_es->s_mount_opts, sb,
3243 &journal_devnum, &journal_ioprio, NULL, 0)) {
3103 &journal_devnum, &journal_ioprio, 0)) {
3244 ext4_msg(sb, KERN_WARNING, 3104 ext4_msg(sb, KERN_WARNING,
3245 "failed to parse options in superblock: %s", 3105 "failed to parse options in superblock: %s",
3246 sbi->s_es->s_mount_opts); 3106 sbi->s_es->s_mount_opts);
3247 } 3107 }
3108 sbi->s_def_mount_opt = sbi->s_mount_opt;
3248 if (!parse_options((char *) data, sb, &journal_devnum, 3109 if (!parse_options((char *) data, sb, &journal_devnum,
3249 &journal_ioprio, NULL, 0))
3110 &journal_ioprio, 0))
3250 goto failed_mount; 3111 goto failed_mount;
3251 3112
3252 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { 3113 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
@@ -3416,7 +3277,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3416#else 3277#else
3417 es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); 3278 es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
3418#endif 3279#endif
3419 sb->s_dirt = 1;
3420 } 3280 }
3421 3281
3422 /* Handle clustersize */ 3282 /* Handle clustersize */
@@ -3540,6 +3400,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3540 if (ext4_proc_root) 3400 if (ext4_proc_root)
3541 sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); 3401 sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
3542 3402
3403 if (sbi->s_proc)
3404 proc_create_data("options", S_IRUGO, sbi->s_proc,
3405 &ext4_seq_options_fops, sb);
3406
3543 bgl_lock_init(sbi->s_blockgroup_lock); 3407 bgl_lock_init(sbi->s_blockgroup_lock);
3544 3408
3545 for (i = 0; i < db_count; i++) { 3409 for (i = 0; i < db_count; i++) {
@@ -3694,6 +3558,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3694 } 3558 }
3695 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); 3559 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
3696 3560
3561 sbi->s_journal->j_commit_callback = ext4_journal_commit_callback;
3562
3697 /* 3563 /*
3698 * The journal may have updated the bg summary counts, so we 3564 * The journal may have updated the bg summary counts, so we
3699 * need to update the global counters. 3565 * need to update the global counters.
@@ -3861,6 +3727,7 @@ failed_mount2:
3861 ext4_kvfree(sbi->s_group_desc); 3727 ext4_kvfree(sbi->s_group_desc);
3862failed_mount: 3728failed_mount:
3863 if (sbi->s_proc) { 3729 if (sbi->s_proc) {
3730 remove_proc_entry("options", sbi->s_proc);
3864 remove_proc_entry(sb->s_id, ext4_proc_root); 3731 remove_proc_entry(sb->s_id, ext4_proc_root);
3865 } 3732 }
3866#ifdef CONFIG_QUOTA 3733#ifdef CONFIG_QUOTA
@@ -4090,15 +3957,6 @@ static int ext4_load_journal(struct super_block *sb,
4090 if (!(journal->j_flags & JBD2_BARRIER)) 3957 if (!(journal->j_flags & JBD2_BARRIER))
4091 ext4_msg(sb, KERN_INFO, "barriers disabled"); 3958 ext4_msg(sb, KERN_INFO, "barriers disabled");
4092 3959
4093 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
4094 err = jbd2_journal_update_format(journal);
4095 if (err) {
4096 ext4_msg(sb, KERN_ERR, "error updating journal");
4097 jbd2_journal_destroy(journal);
4098 return err;
4099 }
4100 }
4101
4102 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) 3960 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER))
4103 err = jbd2_journal_wipe(journal, !really_read_only); 3961 err = jbd2_journal_wipe(journal, !really_read_only);
4104 if (!err) { 3962 if (!err) {
@@ -4385,7 +4243,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4385{ 4243{
4386 struct ext4_super_block *es; 4244 struct ext4_super_block *es;
4387 struct ext4_sb_info *sbi = EXT4_SB(sb); 4245 struct ext4_sb_info *sbi = EXT4_SB(sb);
4388 ext4_fsblk_t n_blocks_count = 0;
4389 unsigned long old_sb_flags; 4246 unsigned long old_sb_flags;
4390 struct ext4_mount_options old_opts; 4247 struct ext4_mount_options old_opts;
4391 int enable_quota = 0; 4248 int enable_quota = 0;
@@ -4418,8 +4275,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4418 /* 4275 /*
4419 * Allow the "check" option to be passed as a remount option. 4276 * Allow the "check" option to be passed as a remount option.
4420 */ 4277 */
4421 if (!parse_options(data, sb, NULL, &journal_ioprio,
4422 &n_blocks_count, 1)) {
4278 if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) {
4423 err = -EINVAL; 4279 err = -EINVAL;
4424 goto restore_opts; 4280 goto restore_opts;
4425 } 4281 }
@@ -4437,8 +4293,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4437 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); 4293 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
4438 } 4294 }
4439 4295
4440 if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) ||
4441 n_blocks_count > ext4_blocks_count(es)) {
4296 if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
4442 if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) { 4297 if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) {
4443 err = -EROFS; 4298 err = -EROFS;
4444 goto restore_opts; 4299 goto restore_opts;
@@ -4513,8 +4368,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4513 if (sbi->s_journal) 4368 if (sbi->s_journal)
4514 ext4_clear_journal_err(sb, es); 4369 ext4_clear_journal_err(sb, es);
4515 sbi->s_mount_state = le16_to_cpu(es->s_state); 4370 sbi->s_mount_state = le16_to_cpu(es->s_state);
4516 if ((err = ext4_group_extend(sb, es, n_blocks_count)))
4517 goto restore_opts;
4518 if (!ext4_setup_super(sb, es, 0)) 4371 if (!ext4_setup_super(sb, es, 0))
4519 sb->s_flags &= ~MS_RDONLY; 4372 sb->s_flags &= ~MS_RDONLY;
4520 if (EXT4_HAS_INCOMPAT_FEATURE(sb, 4373 if (EXT4_HAS_INCOMPAT_FEATURE(sb,
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 93a00d89a220..e88748e55c0f 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -82,8 +82,8 @@
82 printk("\n"); \ 82 printk("\n"); \
83 } while (0) 83 } while (0)
84#else 84#else
85# define ea_idebug(f...)
86# define ea_bdebug(f...)
85# define ea_idebug(inode, fmt, ...) no_printk(fmt, ##__VA_ARGS__)
86# define ea_bdebug(bh, fmt, ...) no_printk(fmt, ##__VA_ARGS__)
87#endif 87#endif
88 88
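Defining the disabled ea_idebug()/ea_bdebug() macros in terms of no_printk()
keeps the arguments compiled, so format strings stay type-checked and the
variables they mention do not become "unused", while still generating no
output. A userspace stand-in for the same idea (this dbg() macro is an
illustration, not the kernel's no_printk()):

#include <stdio.h>

/* The if (0) branch is eliminated by the compiler, but gcc still
 * checks the format string against the arguments. */
#define dbg(fmt, ...)					\
	do {						\
		if (0)					\
			printf(fmt, ##__VA_ARGS__);	\
	} while (0)

int main(void)
{
	int blocks = 42;

	dbg("reading block %d\n", blocks);	/* compiles, prints nothing */
	return 0;
}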
89static void ext4_xattr_cache_insert(struct buffer_head *); 89static void ext4_xattr_cache_insert(struct buffer_head *);
@@ -158,13 +158,10 @@ ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end)
158static inline int 158static inline int
159ext4_xattr_check_block(struct buffer_head *bh) 159ext4_xattr_check_block(struct buffer_head *bh)
160{ 160{
161 int error;
162
163 if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || 161 if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
164 BHDR(bh)->h_blocks != cpu_to_le32(1)) 162 BHDR(bh)->h_blocks != cpu_to_le32(1))
165 return -EIO; 163 return -EIO;
166 error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size);
167 return error;
164 return ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size);
168} 165}
169 166
170static inline int 167static inline int
@@ -220,7 +217,8 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
220 error = -ENODATA; 217 error = -ENODATA;
221 if (!EXT4_I(inode)->i_file_acl) 218 if (!EXT4_I(inode)->i_file_acl)
222 goto cleanup; 219 goto cleanup;
223 ea_idebug(inode, "reading block %u", EXT4_I(inode)->i_file_acl);
220 ea_idebug(inode, "reading block %llu",
221 (unsigned long long)EXT4_I(inode)->i_file_acl);
224 bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); 222 bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
225 if (!bh) 223 if (!bh)
226 goto cleanup; 224 goto cleanup;
@@ -363,7 +361,8 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
363 error = 0; 361 error = 0;
364 if (!EXT4_I(inode)->i_file_acl) 362 if (!EXT4_I(inode)->i_file_acl)
365 goto cleanup; 363 goto cleanup;
366 ea_idebug(inode, "reading block %u", EXT4_I(inode)->i_file_acl);
364 ea_idebug(inode, "reading block %llu",
365 (unsigned long long)EXT4_I(inode)->i_file_acl);
367 bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); 366 bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
368 error = -EIO; 367 error = -EIO;
369 if (!bh) 368 if (!bh)
@@ -487,18 +486,19 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
487 ext4_free_blocks(handle, inode, bh, 0, 1, 486 ext4_free_blocks(handle, inode, bh, 0, 1,
488 EXT4_FREE_BLOCKS_METADATA | 487 EXT4_FREE_BLOCKS_METADATA |
489 EXT4_FREE_BLOCKS_FORGET); 488 EXT4_FREE_BLOCKS_FORGET);
489 unlock_buffer(bh);
490 } else { 490 } else {
491 le32_add_cpu(&BHDR(bh)->h_refcount, -1); 491 le32_add_cpu(&BHDR(bh)->h_refcount, -1);
492 if (ce)
493 mb_cache_entry_release(ce);
494 unlock_buffer(bh);
492 error = ext4_handle_dirty_metadata(handle, inode, bh); 495 error = ext4_handle_dirty_metadata(handle, inode, bh);
493 if (IS_SYNC(inode)) 496 if (IS_SYNC(inode))
494 ext4_handle_sync(handle); 497 ext4_handle_sync(handle);
495 dquot_free_block(inode, 1); 498 dquot_free_block(inode, 1);
496 ea_bdebug(bh, "refcount now=%d; releasing", 499 ea_bdebug(bh, "refcount now=%d; releasing",
497 le32_to_cpu(BHDR(bh)->h_refcount)); 500 le32_to_cpu(BHDR(bh)->h_refcount));
498 if (ce)
499 mb_cache_entry_release(ce);
500 } 501 }
501 unlock_buffer(bh);
502out: 502out:
503 ext4_std_error(inode->i_sb, error); 503 ext4_std_error(inode->i_sb, error);
504 return; 504 return;
@@ -834,7 +834,8 @@ inserted:
834 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 834 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
835 BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS); 835 BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS);
836 836
837 ea_idebug(inode, "creating block %d", block);
837 ea_idebug(inode, "creating block %llu",
838 (unsigned long long)block);
838 839
839 new_bh = sb_getblk(sb, block); 840 new_bh = sb_getblk(sb, block);
840 if (!new_bh) { 841 if (!new_bh) {
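The ea_idebug() changes in this file all fix the same class of bug: a
possibly 64-bit block number was handed to a "%u" conversion. The portable
pattern, as used in the hunks above, is an explicit cast plus "%llu":

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t block = 1ULL << 33;	/* does not fit in 32 bits */

	/* Wrong: printf("block %u\n", block) reads only part of the
	 * argument (and can misalign the ones that follow).
	 * Right: cast to unsigned long long and use "%llu". */
	printf("block %llu\n", (unsigned long long)block);
	return 0;
}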
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index a81eb2367d39..98ae804f5273 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -521,57 +521,46 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname,
521 521
522 op = &outname[*outlen * sizeof(wchar_t)]; 522 op = &outname[*outlen * sizeof(wchar_t)];
523 } else { 523 } else {
524 if (nls) {
525 for (i = 0, ip = name, op = outname, *outlen = 0;
526 i < len && *outlen <= FAT_LFN_LEN;
527 *outlen += 1)
528 {
529 if (escape && (*ip == ':')) {
530 if (i > len - 5)
531 return -EINVAL;
532 ec = 0;
533 for (k = 1; k < 5; k++) {
534 nc = ip[k];
535 ec <<= 4;
536 if (nc >= '0' && nc <= '9') {
537 ec |= nc - '0';
538 continue;
539 }
540 if (nc >= 'a' && nc <= 'f') {
541 ec |= nc - ('a' - 10);
542 continue;
543 }
544 if (nc >= 'A' && nc <= 'F') {
545 ec |= nc - ('A' - 10);
546 continue;
547 }
548 return -EINVAL;
549 }
550 *op++ = ec & 0xFF;
551 *op++ = ec >> 8;
552 ip += 5;
553 i += 5;
554 } else {
555 if ((charlen = nls->char2uni(ip, len - i, (wchar_t *)op)) < 0)
556 return -EINVAL;
557 ip += charlen;
558 i += charlen;
559 op += 2;
560 }
561 }
562 if (i < len)
563 return -ENAMETOOLONG;
564 } else {
565 for (i = 0, ip = name, op = outname, *outlen = 0;
566 i < len && *outlen <= FAT_LFN_LEN;
567 i++, *outlen += 1)
568 {
569 *op++ = *ip++;
570 *op++ = 0;
571 }
572 if (i < len)
573 return -ENAMETOOLONG;
574 }
575 }
524 for (i = 0, ip = name, op = outname, *outlen = 0;
525 i < len && *outlen < FAT_LFN_LEN;
526 *outlen += 1) {
527 if (escape && (*ip == ':')) {
528 if (i > len - 5)
529 return -EINVAL;
530 ec = 0;
531 for (k = 1; k < 5; k++) {
532 nc = ip[k];
533 ec <<= 4;
534 if (nc >= '0' && nc <= '9') {
535 ec |= nc - '0';
536 continue;
537 }
538 if (nc >= 'a' && nc <= 'f') {
539 ec |= nc - ('a' - 10);
540 continue;
541 }
542 if (nc >= 'A' && nc <= 'F') {
543 ec |= nc - ('A' - 10);
544 continue;
545 }
546 return -EINVAL;
547 }
548 *op++ = ec & 0xFF;
549 *op++ = ec >> 8;
550 ip += 5;
551 i += 5;
552 } else {
553 charlen = nls->char2uni(ip, len - i,
554 (wchar_t *)op);
555 if (charlen < 0)
556 return -EINVAL;
557 ip += charlen;
558 i += charlen;
559 op += 2;
560 }
561 }
562 if (i < len)
563 return -ENAMETOOLONG;
564 }
576 565
577 *longlen = *outlen; 566 *longlen = *outlen;
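The unified loop that survives above still decodes the ":xxxx" escape: four
hex digits naming a 16-bit code unit, stored into the output low byte first.
A self-contained sketch of just that decoding step (hexval() and unescape()
are names invented for the example):

#include <stdio.h>

/* Decode one hex digit; returns -1 on bad input. Mirrors the chained
 * '0'..'9' / 'a'..'f' / 'A'..'F' tests in xlate_to_uni(). */
static int hexval(unsigned char c)
{
	if (c >= '0' && c <= '9')
		return c - '0';
	if (c >= 'a' && c <= 'f')
		return c - ('a' - 10);
	if (c >= 'A' && c <= 'F')
		return c - ('A' - 10);
	return -1;
}

/* Parse ":xxxx" at s into a 16-bit code unit; returns -1 on error. */
static int unescape(const unsigned char *s)
{
	int ec = 0, v, k;

	for (k = 1; k < 5; k++) {
		v = hexval(s[k]);
		if (v < 0)
			return -1;
		ec = (ec << 4) | v;
	}
	return ec;
}

int main(void)
{
	int ec = unescape((const unsigned char *)":00e9");

	printf("code unit U+%04X\n", ec);	/* U+00E9 */
	return 0;
}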
diff --git a/fs/file.c b/fs/file.c
index 4c6992d8f3ba..3c426de7203a 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -6,7 +6,7 @@
6 * Manage the dynamic fd arrays in the process files_struct. 6 * Manage the dynamic fd arrays in the process files_struct.
7 */ 7 */
8 8
9#include <linux/module.h>
9#include <linux/export.h>
10#include <linux/fs.h> 10#include <linux/fs.h>
11#include <linux/mm.h> 11#include <linux/mm.h>
12#include <linux/mmzone.h> 12#include <linux/mmzone.h>
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 77b535ac7136..539f36cf3e4a 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -14,7 +14,7 @@
14 */ 14 */
15 15
16#include <linux/kernel.h> 16#include <linux/kernel.h>
17#include <linux/module.h>
17#include <linux/export.h>
18#include <linux/spinlock.h> 18#include <linux/spinlock.h>
19#include <linux/slab.h> 19#include <linux/slab.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
@@ -256,7 +256,8 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
256} 256}
257 257
258/* 258/*
259 * Move expired dirty inodes from @delaying_queue to @dispatch_queue.
259 * Move expired (dirtied after work->older_than_this) dirty inodes from
260 * @delaying_queue to @dispatch_queue.
260 */ 261 */
261static int move_expired_inodes(struct list_head *delaying_queue, 262static int move_expired_inodes(struct list_head *delaying_queue,
262 struct list_head *dispatch_queue, 263 struct list_head *dispatch_queue,
@@ -1148,23 +1149,6 @@ out_unlock_inode:
1148} 1149}
1149EXPORT_SYMBOL(__mark_inode_dirty); 1150EXPORT_SYMBOL(__mark_inode_dirty);
1150 1151
1151/*
1152 * Write out a superblock's list of dirty inodes. A wait will be performed
1153 * upon no inodes, all inodes or the final one, depending upon sync_mode.
1154 *
1155 * If older_than_this is non-NULL, then only write out inodes which
1156 * had their first dirtying at a time earlier than *older_than_this.
1157 *
1158 * If `bdi' is non-zero then we're being asked to writeback a specific queue.
1159 * This function assumes that the blockdev superblock's inodes are backed by
1160 * a variety of queues, so all inodes are searched. For other superblocks,
1161 * assume that all inodes are backed by the same queue.
1162 *
1163 * The inodes to be written are parked on bdi->b_io. They are moved back onto
1164 * bdi->b_dirty as they are selected for writing. This way, none can be missed
1165 * on the writer throttling path, and we get decent balancing between many
1166 * throttled threads: we don't want them all piling up on inode_sync_wait.
1167 */
1168static void wait_sb_inodes(struct super_block *sb) 1152static void wait_sb_inodes(struct super_block *sb)
1169{ 1153{
1170 struct inode *inode, *old_inode = NULL; 1154 struct inode *inode, *old_inode = NULL;
@@ -1364,8 +1348,6 @@ int write_inode_now(struct inode *inode, int sync)
1364 ret = writeback_single_inode(inode, wb, &wbc); 1348 ret = writeback_single_inode(inode, wb, &wbc);
1365 spin_unlock(&inode->i_lock); 1349 spin_unlock(&inode->i_lock);
1366 spin_unlock(&wb->list_lock); 1350 spin_unlock(&wb->list_lock);
1367 if (sync)
1368 inode_sync_wait(inode);
1369 return ret; 1351 return ret;
1370} 1352}
1371EXPORT_SYMBOL(write_inode_now); 1353EXPORT_SYMBOL(write_inode_now);
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index 6324c4274959..e159e682ad4c 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -1,4 +1,4 @@
1#include <linux/module.h>
1#include <linux/export.h>
2#include <linux/sched.h> 2#include <linux/sched.h>
3#include <linux/fs.h> 3#include <linux/fs.h>
4#include <linux/path.h> 4#include <linux/path.h>
diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h
index 3cbfa93cd782..1fe731337f07 100644
--- a/fs/hostfs/hostfs.h
+++ b/fs/hostfs/hostfs.h
@@ -67,7 +67,8 @@ extern int access_file(char *path, int r, int w, int x);
67extern int open_file(char *path, int r, int w, int append); 67extern int open_file(char *path, int r, int w, int append);
68extern void *open_dir(char *path, int *err_out); 68extern void *open_dir(char *path, int *err_out);
69extern char *read_dir(void *stream, unsigned long long *pos, 69extern char *read_dir(void *stream, unsigned long long *pos,
70 unsigned long long *ino_out, int *len_out);
70 unsigned long long *ino_out, int *len_out,
71 unsigned int *type_out);
71extern void close_file(void *stream); 72extern void close_file(void *stream);
72extern int replace_file(int oldfd, int fd); 73extern int replace_file(int oldfd, int fd);
73extern void close_dir(void *stream); 74extern void close_dir(void *stream);
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 588d45885a6f..07c516bfea76 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -283,6 +283,7 @@ int hostfs_readdir(struct file *file, void *ent, filldir_t filldir)
283 char *name; 283 char *name;
284 unsigned long long next, ino; 284 unsigned long long next, ino;
285 int error, len; 285 int error, len;
286 unsigned int type;
286 287
287 name = dentry_name(file->f_path.dentry); 288 name = dentry_name(file->f_path.dentry);
288 if (name == NULL) 289 if (name == NULL)
@@ -292,9 +293,9 @@ int hostfs_readdir(struct file *file, void *ent, filldir_t filldir)
292 if (dir == NULL) 293 if (dir == NULL)
293 return -error; 294 return -error;
294 next = file->f_pos; 295 next = file->f_pos;
295 while ((name = read_dir(dir, &next, &ino, &len)) != NULL) {
296 while ((name = read_dir(dir, &next, &ino, &len, &type)) != NULL) {
296 error = (*filldir)(ent, name, len, file->f_pos, 297 error = (*filldir)(ent, name, len, file->f_pos,
297 ino, DT_UNKNOWN);
298 ino, type);
298 if (error) break; 299 if (error) break;
299 file->f_pos = next; 300 file->f_pos = next;
300 } 301 }
diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c
index dd7bc38a3825..a74ad0d371c2 100644
--- a/fs/hostfs/hostfs_user.c
+++ b/fs/hostfs/hostfs_user.c
@@ -98,7 +98,8 @@ void *open_dir(char *path, int *err_out)
98} 98}
99 99
100char *read_dir(void *stream, unsigned long long *pos, 100char *read_dir(void *stream, unsigned long long *pos,
101 unsigned long long *ino_out, int *len_out)
101 unsigned long long *ino_out, int *len_out,
102 unsigned int *type_out)
102{ 103{
103 DIR *dir = stream; 104 DIR *dir = stream;
104 struct dirent *ent; 105 struct dirent *ent;
@@ -109,6 +110,7 @@ char *read_dir(void *stream, unsigned long long *pos,
109 return NULL; 110 return NULL;
110 *len_out = strlen(ent->d_name); 111 *len_out = strlen(ent->d_name);
111 *ino_out = ent->d_ino; 112 *ino_out = ent->d_ino;
113 *type_out = ent->d_type;
112 *pos = telldir(dir); 114 *pos = telldir(dir);
113 return ent->d_name; 115 return ent->d_name;
114} 116}
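With read_dir() now returning d_type, hostfs can report real file types to
filldir instead of DT_UNKNOWN. On the host side this is plain readdir();
note that d_type is a BSD/glibc extension and a filesystem may still report
DT_UNKNOWN, so consumers keep a stat() fallback. A minimal illustration:

#define _DEFAULT_SOURCE		/* for d_type and the DT_* constants */
#include <stdio.h>
#include <dirent.h>

int main(void)
{
	DIR *dir = opendir(".");
	struct dirent *ent;

	if (!dir)
		return 1;
	while ((ent = readdir(dir)) != NULL) {
		/* d_type may legitimately be DT_UNKNOWN */
		printf("%-20s type=%u\n", ent->d_name,
		       (unsigned int)ent->d_type);
	}
	closedir(dir);
	return 0;
}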
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 066836e81848..29167bebe874 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -10,7 +10,7 @@
10#include <linux/file.h> 10#include <linux/file.h>
11#include <linux/fs.h> 11#include <linux/fs.h>
12#include <linux/security.h> 12#include <linux/security.h>
13#include <linux/module.h>
13#include <linux/export.h>
14#include <linux/uaccess.h> 14#include <linux/uaccess.h>
15#include <linux/writeback.h> 15#include <linux/writeback.h>
16#include <linux/buffer_head.h> 16#include <linux/buffer_head.h>
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index d49d202903fb..c78841ee81cf 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -88,14 +88,13 @@ static inline void __buffer_relink_io(struct journal_head *jh)
88 * whole transaction. 88 * whole transaction.
89 * 89 *
90 * Requires j_list_lock 90 * Requires j_list_lock
91 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
92 */ 91 */
93static int __try_to_free_cp_buf(struct journal_head *jh) 92static int __try_to_free_cp_buf(struct journal_head *jh)
94{ 93{
95 int ret = 0; 94 int ret = 0;
96 struct buffer_head *bh = jh2bh(jh); 95 struct buffer_head *bh = jh2bh(jh);
97 96
98 if (jh->b_jlist == BJ_None && !buffer_locked(bh) &&
97 if (jh->b_transaction == NULL && !buffer_locked(bh) &&
99 !buffer_dirty(bh) && !buffer_write_io_error(bh)) { 98 !buffer_dirty(bh) && !buffer_write_io_error(bh)) {
100 /* 99 /*
101 * Get our reference so that bh cannot be freed before 100 * Get our reference so that bh cannot be freed before
@@ -104,11 +103,8 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
104 get_bh(bh); 103 get_bh(bh);
105 JBUFFER_TRACE(jh, "remove from checkpoint list"); 104 JBUFFER_TRACE(jh, "remove from checkpoint list");
106 ret = __jbd2_journal_remove_checkpoint(jh) + 1; 105 ret = __jbd2_journal_remove_checkpoint(jh) + 1;
107 jbd_unlock_bh_state(bh);
108 BUFFER_TRACE(bh, "release"); 106 BUFFER_TRACE(bh, "release");
109 __brelse(bh); 107 __brelse(bh);
110 } else {
111 jbd_unlock_bh_state(bh);
112 } 108 }
113 return ret; 109 return ret;
114} 110}
@@ -180,21 +176,6 @@ void __jbd2_log_wait_for_space(journal_t *journal)
180} 176}
181 177
182/* 178/*
183 * We were unable to perform jbd_trylock_bh_state() inside j_list_lock.
184 * The caller must restart a list walk. Wait for someone else to run
185 * jbd_unlock_bh_state().
186 */
187static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
188 __releases(journal->j_list_lock)
189{
190 get_bh(bh);
191 spin_unlock(&journal->j_list_lock);
192 jbd_lock_bh_state(bh);
193 jbd_unlock_bh_state(bh);
194 put_bh(bh);
195}
196
197/*
198 * Clean up transaction's list of buffers submitted for io. 179 * Clean up transaction's list of buffers submitted for io.
199 * We wait for any pending IO to complete and remove any clean 180 * We wait for any pending IO to complete and remove any clean
200 * buffers. Note that we take the buffers in the opposite ordering 181 * buffers. Note that we take the buffers in the opposite ordering
@@ -222,15 +203,9 @@ restart:
222 while (!released && transaction->t_checkpoint_io_list) { 203 while (!released && transaction->t_checkpoint_io_list) {
223 jh = transaction->t_checkpoint_io_list; 204 jh = transaction->t_checkpoint_io_list;
224 bh = jh2bh(jh); 205 bh = jh2bh(jh);
225 if (!jbd_trylock_bh_state(bh)) {
226 jbd_sync_bh(journal, bh);
227 spin_lock(&journal->j_list_lock);
228 goto restart;
229 }
230 get_bh(bh); 206 get_bh(bh);
231 if (buffer_locked(bh)) { 207 if (buffer_locked(bh)) {
232 spin_unlock(&journal->j_list_lock); 208 spin_unlock(&journal->j_list_lock);
233 jbd_unlock_bh_state(bh);
234 wait_on_buffer(bh); 209 wait_on_buffer(bh);
235 /* the journal_head may have gone by now */ 210 /* the journal_head may have gone by now */
236 BUFFER_TRACE(bh, "brelse"); 211 BUFFER_TRACE(bh, "brelse");
@@ -246,7 +221,6 @@ restart:
246 * it has been written out and so we can drop it from the list 221 * it has been written out and so we can drop it from the list
247 */ 222 */
248 released = __jbd2_journal_remove_checkpoint(jh); 223 released = __jbd2_journal_remove_checkpoint(jh);
249 jbd_unlock_bh_state(bh);
250 __brelse(bh); 224 __brelse(bh);
251 } 225 }
252 226
@@ -266,7 +240,6 @@ __flush_batch(journal_t *journal, int *batch_count)
266 240
267 for (i = 0; i < *batch_count; i++) { 241 for (i = 0; i < *batch_count; i++) {
268 struct buffer_head *bh = journal->j_chkpt_bhs[i]; 242 struct buffer_head *bh = journal->j_chkpt_bhs[i];
269 clear_buffer_jwrite(bh);
270 BUFFER_TRACE(bh, "brelse"); 243 BUFFER_TRACE(bh, "brelse");
271 __brelse(bh); 244 __brelse(bh);
272 } 245 }
@@ -281,7 +254,6 @@ __flush_batch(journal_t *journal, int *batch_count)
281 * be written out. 254 * be written out.
282 * 255 *
283 * Called with j_list_lock held and drops it if 1 is returned 256 * Called with j_list_lock held and drops it if 1 is returned
284 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
285 */ 257 */
286static int __process_buffer(journal_t *journal, struct journal_head *jh, 258static int __process_buffer(journal_t *journal, struct journal_head *jh,
287 int *batch_count, transaction_t *transaction) 259 int *batch_count, transaction_t *transaction)
@@ -292,7 +264,6 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
292 if (buffer_locked(bh)) { 264 if (buffer_locked(bh)) {
293 get_bh(bh); 265 get_bh(bh);
294 spin_unlock(&journal->j_list_lock); 266 spin_unlock(&journal->j_list_lock);
295 jbd_unlock_bh_state(bh);
296 wait_on_buffer(bh); 267 wait_on_buffer(bh);
297 /* the journal_head may have gone by now */ 268 /* the journal_head may have gone by now */
298 BUFFER_TRACE(bh, "brelse"); 269 BUFFER_TRACE(bh, "brelse");
@@ -304,7 +275,6 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
304 275
305 transaction->t_chp_stats.cs_forced_to_close++; 276 transaction->t_chp_stats.cs_forced_to_close++;
306 spin_unlock(&journal->j_list_lock); 277 spin_unlock(&journal->j_list_lock);
307 jbd_unlock_bh_state(bh);
308 if (unlikely(journal->j_flags & JBD2_UNMOUNT)) 278 if (unlikely(journal->j_flags & JBD2_UNMOUNT))
309 /* 279 /*
310 * The journal thread is dead; so starting and 280 * The journal thread is dead; so starting and
@@ -323,11 +293,9 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
323 if (unlikely(buffer_write_io_error(bh))) 293 if (unlikely(buffer_write_io_error(bh)))
324 ret = -EIO; 294 ret = -EIO;
325 get_bh(bh); 295 get_bh(bh);
326 J_ASSERT_JH(jh, !buffer_jbddirty(bh));
327 BUFFER_TRACE(bh, "remove from checkpoint"); 296 BUFFER_TRACE(bh, "remove from checkpoint");
328 __jbd2_journal_remove_checkpoint(jh); 297 __jbd2_journal_remove_checkpoint(jh);
329 spin_unlock(&journal->j_list_lock); 298 spin_unlock(&journal->j_list_lock);
330 jbd_unlock_bh_state(bh);
331 __brelse(bh); 299 __brelse(bh);
332 } else { 300 } else {
333 /* 301 /*
@@ -340,10 +308,8 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
340 BUFFER_TRACE(bh, "queue"); 308 BUFFER_TRACE(bh, "queue");
341 get_bh(bh); 309 get_bh(bh);
342 J_ASSERT_BH(bh, !buffer_jwrite(bh)); 310 J_ASSERT_BH(bh, !buffer_jwrite(bh));
343 set_buffer_jwrite(bh);
344 journal->j_chkpt_bhs[*batch_count] = bh; 311 journal->j_chkpt_bhs[*batch_count] = bh;
345 __buffer_relink_io(jh); 312 __buffer_relink_io(jh);
346 jbd_unlock_bh_state(bh);
347 transaction->t_chp_stats.cs_written++; 313 transaction->t_chp_stats.cs_written++;
348 (*batch_count)++; 314 (*batch_count)++;
349 if (*batch_count == JBD2_NR_BATCH) { 315 if (*batch_count == JBD2_NR_BATCH) {
@@ -407,15 +373,7 @@ restart:
407 int retry = 0, err; 373 int retry = 0, err;
408 374
409 while (!retry && transaction->t_checkpoint_list) { 375 while (!retry && transaction->t_checkpoint_list) {
410 struct buffer_head *bh;
411
412 jh = transaction->t_checkpoint_list; 376 jh = transaction->t_checkpoint_list;
413 bh = jh2bh(jh);
414 if (!jbd_trylock_bh_state(bh)) {
415 jbd_sync_bh(journal, bh);
416 retry = 1;
417 break;
418 }
419 retry = __process_buffer(journal, jh, &batch_count, 377 retry = __process_buffer(journal, jh, &batch_count,
420 transaction); 378 transaction);
421 if (retry < 0 && !result) 379 if (retry < 0 && !result)
@@ -478,79 +436,28 @@ out:
478 436
479int jbd2_cleanup_journal_tail(journal_t *journal) 437int jbd2_cleanup_journal_tail(journal_t *journal)
480{ 438{
481 transaction_t * transaction;
482 tid_t first_tid; 439 tid_t first_tid;
483 unsigned long blocknr, freed;
440 unsigned long blocknr;
484 441
485 if (is_journal_aborted(journal)) 442 if (is_journal_aborted(journal))
486 return 1; 443 return 1;
487 444
445 if (!jbd2_journal_get_log_tail(journal, &first_tid, &blocknr))
488 /* OK, work out the oldest transaction remaining in the log, and
489 * the log block it starts at.
490 *
491 * If the log is now empty, we need to work out which is the
492 * next transaction ID we will write, and where it will
493 * start. */
494
495 write_lock(&journal->j_state_lock);
496 spin_lock(&journal->j_list_lock);
497 transaction = journal->j_checkpoint_transactions;
498 if (transaction) {
499 first_tid = transaction->t_tid;
500 blocknr = transaction->t_log_start;
501 } else if ((transaction = journal->j_committing_transaction) != NULL) {
502 first_tid = transaction->t_tid;
503 blocknr = transaction->t_log_start;
504 } else if ((transaction = journal->j_running_transaction) != NULL) {
505 first_tid = transaction->t_tid;
506 blocknr = journal->j_head;
507 } else {
508 first_tid = journal->j_transaction_sequence;
509 blocknr = journal->j_head;
510 }
511 spin_unlock(&journal->j_list_lock);
512 J_ASSERT(blocknr != 0);
513
514 /* If the oldest pinned transaction is at the tail of the log
515 already then there's not much we can do right now. */
516 if (journal->j_tail_sequence == first_tid) {
517 write_unlock(&journal->j_state_lock);
518 return 1; 446 return 1;
519 }
447 J_ASSERT(blocknr != 0);
520
521 /* OK, update the superblock to recover the freed space.
522 * Physical blocks come first: have we wrapped beyond the end of
523 * the log? */
524 freed = blocknr - journal->j_tail;
525 if (blocknr < journal->j_tail)
526 freed = freed + journal->j_last - journal->j_first;
527
528 trace_jbd2_cleanup_journal_tail(journal, first_tid, blocknr, freed);
529 jbd_debug(1,
530 "Cleaning journal tail from %d to %d (offset %lu), "
531 "freeing %lu\n",
532 journal->j_tail_sequence, first_tid, blocknr, freed);
533
534 journal->j_free += freed;
535 journal->j_tail_sequence = first_tid;
536 journal->j_tail = blocknr;
537 write_unlock(&journal->j_state_lock);
538 448
539 /*
540 * If there is an external journal, we need to make sure that
541 * any data blocks that were recently written out --- perhaps
542 * by jbd2_log_do_checkpoint() --- are flushed out before we
543 * drop the transactions from the external journal. It's
544 * unlikely this will be necessary, especially with a
545 * appropriately sized journal, but we need this to guarantee
546 * correctness. Fortunately jbd2_cleanup_journal_tail()
547 * doesn't get called all that often.
548 */
549 if ((journal->j_fs_dev != journal->j_dev) &&
550 (journal->j_flags & JBD2_BARRIER))
551 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
552 if (!(journal->j_flags & JBD2_ABORT))
553 jbd2_journal_update_superblock(journal, 1);
554 return 0;
555}
449 /*
450 * We need to make sure that any blocks that were recently written out
451 * --- perhaps by jbd2_log_do_checkpoint() --- are flushed out before
452 * we drop the transactions from the journal. It's unlikely this will
453 * be necessary, especially with an appropriately sized journal, but we
454 * need this to guarantee correctness. Fortunately
455 * jbd2_cleanup_journal_tail() doesn't get called all that often.
456 */
457 if (journal->j_flags & JBD2_BARRIER)
458 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
459
460 __jbd2_update_log_tail(journal, first_tid, blocknr);
461 return 0;
462}
556 463
@@ -582,15 +489,12 @@ static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
582 do { 489 do {
583 jh = next_jh; 490 jh = next_jh;
584 next_jh = jh->b_cpnext; 491 next_jh = jh->b_cpnext;
585 /* Use trylock because of the ranking */
586 if (jbd_trylock_bh_state(jh2bh(jh))) {
587 ret = __try_to_free_cp_buf(jh);
588 if (ret) {
589 freed++;
590 if (ret == 2) {
591 *released = 1;
592 return freed;
593 }
594 }
595 }
492 ret = __try_to_free_cp_buf(jh);
493 if (ret) {
494 freed++;
495 if (ret == 2) {
496 *released = 1;
497 return freed;
498 }
499 }
596 /* 500 /*
@@ -673,9 +577,7 @@ out:
673 * The function can free jh and bh. 577 * The function can free jh and bh.
674 * 578 *
675 * This function is called with j_list_lock held. 579 * This function is called with j_list_lock held.
676 * This function is called with jbd_lock_bh_state(jh2bh(jh))
677 */ 580 */
678
679int __jbd2_journal_remove_checkpoint(struct journal_head *jh) 581int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
680{ 582{
681 struct transaction_chp_stats_s *stats; 583 struct transaction_chp_stats_s *stats;
@@ -722,7 +624,7 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
722 transaction->t_tid, stats); 624 transaction->t_tid, stats);
723 625
724 __jbd2_journal_drop_transaction(journal, transaction); 626 __jbd2_journal_drop_transaction(journal, transaction);
725 kfree(transaction); 627 jbd2_journal_free_transaction(transaction);
726 628
727 /* Just in case anybody was waiting for more transactions to be 629 /* Just in case anybody was waiting for more transactions to be
728 checkpointed... */ 630 checkpointed... */
@@ -797,5 +699,7 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
797 J_ASSERT(journal->j_committing_transaction != transaction); 699 J_ASSERT(journal->j_committing_transaction != transaction);
798 J_ASSERT(journal->j_running_transaction != transaction); 700 J_ASSERT(journal->j_running_transaction != transaction);
799 701
702 trace_jbd2_drop_transaction(journal, transaction);
703
800 jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); 704 jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
801} 705}
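jbd2_cleanup_journal_tail() now delegates the tail computation, but the
underlying arithmetic is unchanged: the space reclaimed by moving the tail
of a circular log must account for wrap-around. A standalone sketch of that
calculation (freed_blocks() is an invented name mirroring the freed
computation that commit.c performs below):

#include <stdio.h>

/* Blocks reclaimed when the tail of a circular log moves from 'tail'
 * to 'new_tail'; 'first' and 'last' bound the usable block range.
 * Unsigned modular arithmetic makes the wrapped case come out right. */
static unsigned long freed_blocks(unsigned long tail, unsigned long new_tail,
				  unsigned long first, unsigned long last)
{
	unsigned long freed = new_tail - tail;

	if (new_tail < tail)	/* the log wrapped around */
		freed += last - first;
	return freed;
}

int main(void)
{
	/* log occupies blocks [1, 1000); the tail wraps from 900 to 50 */
	printf("freed = %lu\n", freed_blocks(900, 50, 1, 1000));
	return 0;
}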
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 29853deee5ed..806525a7269c 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -330,6 +330,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
330 struct buffer_head *cbh = NULL; /* For transactional checksums */ 330 struct buffer_head *cbh = NULL; /* For transactional checksums */
331 __u32 crc32_sum = ~0; 331 __u32 crc32_sum = ~0;
332 struct blk_plug plug; 332 struct blk_plug plug;
333 /* Tail of the journal */
334 unsigned long first_block;
335 tid_t first_tid;
336 int update_tail;
333 337
334 /* 338 /*
335 * First job: lock down the current transaction and wait for 339 * First job: lock down the current transaction and wait for
@@ -339,7 +343,18 @@ void jbd2_journal_commit_transaction(journal_t *journal)
339 /* Do we need to erase the effects of a prior jbd2_journal_flush? */ 343 /* Do we need to erase the effects of a prior jbd2_journal_flush? */
340 if (journal->j_flags & JBD2_FLUSHED) { 344 if (journal->j_flags & JBD2_FLUSHED) {
341 jbd_debug(3, "super block updated\n"); 345 jbd_debug(3, "super block updated\n");
342 jbd2_journal_update_superblock(journal, 1);
346 mutex_lock(&journal->j_checkpoint_mutex);
347 /*
348 * We hold j_checkpoint_mutex so tail cannot change under us.
349 * We don't need any special data guarantees for writing sb
350 * since journal is empty and it is ok for write to be
351 * flushed only with transaction commit.
352 */
353 jbd2_journal_update_sb_log_tail(journal,
354 journal->j_tail_sequence,
355 journal->j_tail,
356 WRITE_SYNC);
357 mutex_unlock(&journal->j_checkpoint_mutex);
343 } else { 358 } else {
344 jbd_debug(3, "superblock not updated\n"); 359 jbd_debug(3, "superblock not updated\n");
345 } 360 }
@@ -676,10 +691,30 @@ start_journal_io:
676 err = 0; 691 err = 0;
677 } 692 }
678 693
694 /*
695 * Get current oldest transaction in the log before we issue flush
696 * to the filesystem device. After the flush we can be sure that
697 * blocks of all older transactions are checkpointed to persistent
698 * storage and we will be safe to update journal start in the
699 * superblock with the numbers we get here.
700 */
701 update_tail =
702 jbd2_journal_get_log_tail(journal, &first_tid, &first_block);
703
679 write_lock(&journal->j_state_lock); 704 write_lock(&journal->j_state_lock);
705 if (update_tail) {
706 long freed = first_block - journal->j_tail;
707
708 if (first_block < journal->j_tail)
709 freed += journal->j_last - journal->j_first;
710 /* Update tail only if we free significant amount of space */
711 if (freed < journal->j_maxlen / 4)
712 update_tail = 0;
713 }
680 J_ASSERT(commit_transaction->t_state == T_COMMIT); 714 J_ASSERT(commit_transaction->t_state == T_COMMIT);
681 commit_transaction->t_state = T_COMMIT_DFLUSH; 715 commit_transaction->t_state = T_COMMIT_DFLUSH;
682 write_unlock(&journal->j_state_lock); 716 write_unlock(&journal->j_state_lock);
717
683 /* 718 /*
684 * If the journal is not located on the file system device, 719 * If the journal is not located on the file system device,
685 * then we must flush the file system device before we issue 720 * then we must flush the file system device before we issue
@@ -830,6 +865,14 @@ wait_for_iobuf:
830 if (err) 865 if (err)
831 jbd2_journal_abort(journal, err); 866 jbd2_journal_abort(journal, err);
832 867
868 /*
869 * Now disk caches for filesystem device are flushed so we are safe to
870 * erase checkpointed transactions from the log by updating journal
871 * superblock.
872 */
873 if (update_tail)
874 jbd2_update_log_tail(journal, first_tid, first_block);
875
833 /* End of a transaction! Finally, we can do checkpoint 876 /* End of a transaction! Finally, we can do checkpoint
834 processing: any buffers committed as a result of this 877 processing: any buffers committed as a result of this
835 transaction can be removed from any checkpoint list it was on 878 transaction can be removed from any checkpoint list it was on
@@ -1047,7 +1090,7 @@ restart_loop:
1047 jbd_debug(1, "JBD2: commit %d complete, head %d\n", 1090 jbd_debug(1, "JBD2: commit %d complete, head %d\n",
1048 journal->j_commit_sequence, journal->j_tail_sequence); 1091 journal->j_commit_sequence, journal->j_tail_sequence);
1049 if (to_free) 1092 if (to_free)
1050 kfree(commit_transaction); 1093 jbd2_journal_free_transaction(commit_transaction);
1051 1094
1052 wake_up(&journal->j_wait_done_commit); 1095 wake_up(&journal->j_wait_done_commit);
1053} 1096}
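The tail-update logic added above has two subtleties worth isolating: the log is circular, so the new tail block can numerically precede j_tail, and the superblock write is skipped unless it reclaims at least a quarter of the journal. A standalone sketch of that arithmetic (plain types stand in for journal_t fields; the names are illustrative, not from the tree):

	#include <stdbool.h>

	struct log_geometry {
		unsigned long first;	/* j_first: first usable log block */
		unsigned long last;	/* j_last: one past the last block */
		unsigned long tail;	/* j_tail: current start of live log */
		unsigned long maxlen;	/* j_maxlen: total log size */
	};

	/* Mirror of the commit-path check: is moving the tail to new_tail
	 * worth a superblock write, i.e. does it free >= 1/4 of the log? */
	static bool tail_update_worthwhile(const struct log_geometry *g,
					   unsigned long new_tail)
	{
		long freed = new_tail - g->tail;

		/* Circular log: if new_tail wrapped past j_last and came
		 * back around to the front, add the wrapped distance. */
		if (new_tail < g->tail)
			freed += g->last - g->first;
		return freed >= (long)(g->maxlen / 4);
	}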
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index c6d22745553f..1afb701622b0 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -70,7 +70,6 @@ EXPORT_SYMBOL(jbd2_journal_revoke);
70 70
71EXPORT_SYMBOL(jbd2_journal_init_dev); 71EXPORT_SYMBOL(jbd2_journal_init_dev);
72EXPORT_SYMBOL(jbd2_journal_init_inode); 72EXPORT_SYMBOL(jbd2_journal_init_inode);
73EXPORT_SYMBOL(jbd2_journal_update_format);
74EXPORT_SYMBOL(jbd2_journal_check_used_features); 73EXPORT_SYMBOL(jbd2_journal_check_used_features);
75EXPORT_SYMBOL(jbd2_journal_check_available_features); 74EXPORT_SYMBOL(jbd2_journal_check_available_features);
76EXPORT_SYMBOL(jbd2_journal_set_features); 75EXPORT_SYMBOL(jbd2_journal_set_features);
@@ -95,7 +94,6 @@ EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
95EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); 94EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
96EXPORT_SYMBOL(jbd2_inode_cache); 95EXPORT_SYMBOL(jbd2_inode_cache);
97 96
98static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
99static void __journal_abort_soft (journal_t *journal, int errno); 97static void __journal_abort_soft (journal_t *journal, int errno);
100static int jbd2_journal_create_slab(size_t slab_size); 98static int jbd2_journal_create_slab(size_t slab_size);
101 99
@@ -745,6 +743,98 @@ struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
745 return jbd2_journal_add_journal_head(bh); 743 return jbd2_journal_add_journal_head(bh);
746} 744}
747 745
746/*
747 * Return tid of the oldest transaction in the journal and block in the journal
748 * where the transaction starts.
749 *
 750 * If the journal is now empty, return the ID of the next transaction
 751 * we will write and where that transaction will start.
752 *
 753 * The return value is 0 if the journal tail cannot be pushed any further, 1 if
754 * it can.
755 */
756int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid,
757 unsigned long *block)
758{
759 transaction_t *transaction;
760 int ret;
761
762 read_lock(&journal->j_state_lock);
763 spin_lock(&journal->j_list_lock);
764 transaction = journal->j_checkpoint_transactions;
765 if (transaction) {
766 *tid = transaction->t_tid;
767 *block = transaction->t_log_start;
768 } else if ((transaction = journal->j_committing_transaction) != NULL) {
769 *tid = transaction->t_tid;
770 *block = transaction->t_log_start;
771 } else if ((transaction = journal->j_running_transaction) != NULL) {
772 *tid = transaction->t_tid;
773 *block = journal->j_head;
774 } else {
775 *tid = journal->j_transaction_sequence;
776 *block = journal->j_head;
777 }
778 ret = tid_gt(*tid, journal->j_tail_sequence);
779 spin_unlock(&journal->j_list_lock);
780 read_unlock(&journal->j_state_lock);
781
782 return ret;
783}
784
785/*
786 * Update information in journal structure and in on disk journal superblock
 787 * about the log tail. This function does not check whether the information
 788 * passed in really pushes the log tail further. It is the caller's
 789 * responsibility to make sure the provided log tail is valid (e.g. by
 790 * holding j_checkpoint_mutex all the time between computing the log tail
 791 * and calling this function, as is the case with jbd2_cleanup_journal_tail()).
792 *
793 * Requires j_checkpoint_mutex
794 */
795void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
796{
797 unsigned long freed;
798
799 BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
800
801 /*
 802 * We cannot afford for the write to remain in the drive's caches: as
 803 * soon as we update j_tail, the next transaction can start reusing
 804 * journal space, and if we lost the sb update in a power failure we'd
 805 * replay an old transaction over possibly newly overwritten data.
806 */
807 jbd2_journal_update_sb_log_tail(journal, tid, block, WRITE_FUA);
808 write_lock(&journal->j_state_lock);
809 freed = block - journal->j_tail;
810 if (block < journal->j_tail)
811 freed += journal->j_last - journal->j_first;
812
813 trace_jbd2_update_log_tail(journal, tid, block, freed);
814 jbd_debug(1,
815 "Cleaning journal tail from %d to %d (offset %lu), "
816 "freeing %lu\n",
817 journal->j_tail_sequence, tid, block, freed);
818
819 journal->j_free += freed;
820 journal->j_tail_sequence = tid;
821 journal->j_tail = block;
822 write_unlock(&journal->j_state_lock);
823}
824
825/*
 826 * This is a variation of __jbd2_update_log_tail which checks for validity of
827 * provided log tail and locks j_checkpoint_mutex. So it is safe against races
828 * with other threads updating log tail.
829 */
830void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
831{
832 mutex_lock(&journal->j_checkpoint_mutex);
833 if (tid_gt(tid, journal->j_tail_sequence))
834 __jbd2_update_log_tail(journal, tid, block);
835 mutex_unlock(&journal->j_checkpoint_mutex);
836}
837
748struct jbd2_stats_proc_session { 838struct jbd2_stats_proc_session {
749 journal_t *journal; 839 journal_t *journal;
750 struct transaction_stats_s *stats; 840 struct transaction_stats_s *stats;
@@ -1113,40 +1203,45 @@ static int journal_reset(journal_t *journal)
1113 1203
1114 journal->j_max_transaction_buffers = journal->j_maxlen / 4; 1204 journal->j_max_transaction_buffers = journal->j_maxlen / 4;
1115 1205
1116 /* Add the dynamic fields and write it to disk. */
1117 jbd2_journal_update_superblock(journal, 1);
1118 return jbd2_journal_start_thread(journal);
1119}
1120
1121/**
1122 * void jbd2_journal_update_superblock() - Update journal sb on disk.
1123 * @journal: The journal to update.
1124 * @wait: Set to '0' if you don't want to wait for IO completion.
1125 *
1126 * Update a journal's dynamic superblock fields and write it to disk,
1127 * optionally waiting for the IO to complete.
1128 */
1129void jbd2_journal_update_superblock(journal_t *journal, int wait)
1130{
1131 journal_superblock_t *sb = journal->j_superblock;
1132 struct buffer_head *bh = journal->j_sb_buffer;
1133
1134 /* 1206 /*
1135 * As a special case, if the on-disk copy is already marked as needing 1207 * As a special case, if the on-disk copy is already marked as needing
1136 * no recovery (s_start == 0) and there are no outstanding transactions 1208 * no recovery (s_start == 0), then we can safely defer the superblock
1137 * in the filesystem, then we can safely defer the superblock update 1209 * update until the next commit by setting JBD2_FLUSHED. This avoids
1138 * until the next commit by setting JBD2_FLUSHED. This avoids
 1139 * attempting a write to a potentially read-only device. 1210 * attempting a write to a potentially read-only device.
1140 */ 1211 */
1141 if (sb->s_start == 0 && journal->j_tail_sequence == 1212 if (sb->s_start == 0) {
1142 journal->j_transaction_sequence) {
1143 jbd_debug(1, "JBD2: Skipping superblock update on recovered sb " 1213 jbd_debug(1, "JBD2: Skipping superblock update on recovered sb "
1144 "(start %ld, seq %d, errno %d)\n", 1214 "(start %ld, seq %d, errno %d)\n",
1145 journal->j_tail, journal->j_tail_sequence, 1215 journal->j_tail, journal->j_tail_sequence,
1146 journal->j_errno); 1216 journal->j_errno);
1147 goto out; 1217 journal->j_flags |= JBD2_FLUSHED;
1218 } else {
1219 /* Lock here to make assertions happy... */
1220 mutex_lock(&journal->j_checkpoint_mutex);
1221 /*
 1222 * Update log tail information. We use WRITE_FUA since a new
 1223 * transaction will start reusing journal space and so we
 1224 * must make sure the information about the current log tail
 1225 * is on disk before that.
1226 */
1227 jbd2_journal_update_sb_log_tail(journal,
1228 journal->j_tail_sequence,
1229 journal->j_tail,
1230 WRITE_FUA);
1231 mutex_unlock(&journal->j_checkpoint_mutex);
1148 } 1232 }
1233 return jbd2_journal_start_thread(journal);
1234}
1149 1235
1236static void jbd2_write_superblock(journal_t *journal, int write_op)
1237{
1238 struct buffer_head *bh = journal->j_sb_buffer;
1239 int ret;
1240
1241 trace_jbd2_write_superblock(journal, write_op);
1242 if (!(journal->j_flags & JBD2_BARRIER))
1243 write_op &= ~(REQ_FUA | REQ_FLUSH);
1244 lock_buffer(bh);
1150 if (buffer_write_io_error(bh)) { 1245 if (buffer_write_io_error(bh)) {
1151 /* 1246 /*
1152 * Oh, dear. A previous attempt to write the journal 1247 * Oh, dear. A previous attempt to write the journal
@@ -1162,48 +1257,106 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait)
1162 clear_buffer_write_io_error(bh); 1257 clear_buffer_write_io_error(bh);
1163 set_buffer_uptodate(bh); 1258 set_buffer_uptodate(bh);
1164 } 1259 }
1260 get_bh(bh);
1261 bh->b_end_io = end_buffer_write_sync;
1262 ret = submit_bh(write_op, bh);
1263 wait_on_buffer(bh);
1264 if (buffer_write_io_error(bh)) {
1265 clear_buffer_write_io_error(bh);
1266 set_buffer_uptodate(bh);
1267 ret = -EIO;
1268 }
1269 if (ret) {
1270 printk(KERN_ERR "JBD2: Error %d detected when updating "
1271 "journal superblock for %s.\n", ret,
1272 journal->j_devname);
1273 }
1274}
1275
1276/**
1277 * jbd2_journal_update_sb_log_tail() - Update log tail in journal sb on disk.
1278 * @journal: The journal to update.
1279 * @tail_tid: TID of the new transaction at the tail of the log
1280 * @tail_block: The first block of the transaction at the tail of the log
1281 * @write_op: With which operation should we write the journal sb
1282 *
1283 * Update a journal's superblock information about log tail and write it to
1284 * disk, waiting for the IO to complete.
1285 */
1286void jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
1287 unsigned long tail_block, int write_op)
1288{
1289 journal_superblock_t *sb = journal->j_superblock;
1290
1291 BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
1292 jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n",
1293 tail_block, tail_tid);
1294
1295 sb->s_sequence = cpu_to_be32(tail_tid);
1296 sb->s_start = cpu_to_be32(tail_block);
1297
1298 jbd2_write_superblock(journal, write_op);
1299
1300 /* Log is no longer empty */
1301 write_lock(&journal->j_state_lock);
1302 WARN_ON(!sb->s_sequence);
1303 journal->j_flags &= ~JBD2_FLUSHED;
1304 write_unlock(&journal->j_state_lock);
1305}
1306
1307/**
1308 * jbd2_mark_journal_empty() - Mark on disk journal as empty.
1309 * @journal: The journal to update.
1310 *
 1311 * Update a journal's dynamic superblock fields to show that the journal is empty.
 1312 * Write the updated superblock to disk, waiting for the IO to complete.
1313 */
1314static void jbd2_mark_journal_empty(journal_t *journal)
1315{
1316 journal_superblock_t *sb = journal->j_superblock;
1165 1317
1318 BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
1166 read_lock(&journal->j_state_lock); 1319 read_lock(&journal->j_state_lock);
1167 jbd_debug(1, "JBD2: updating superblock (start %ld, seq %d, errno %d)\n", 1320 jbd_debug(1, "JBD2: Marking journal as empty (seq %d)\n",
1168 journal->j_tail, journal->j_tail_sequence, journal->j_errno); 1321 journal->j_tail_sequence);
1169 1322
1170 sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); 1323 sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
1171 sb->s_start = cpu_to_be32(journal->j_tail); 1324 sb->s_start = cpu_to_be32(0);
1172 sb->s_errno = cpu_to_be32(journal->j_errno);
1173 read_unlock(&journal->j_state_lock); 1325 read_unlock(&journal->j_state_lock);
1174 1326
1175 BUFFER_TRACE(bh, "marking dirty"); 1327 jbd2_write_superblock(journal, WRITE_FUA);
1176 mark_buffer_dirty(bh);
1177 if (wait) {
1178 sync_dirty_buffer(bh);
1179 if (buffer_write_io_error(bh)) {
1180 printk(KERN_ERR "JBD2: I/O error detected "
1181 "when updating journal superblock for %s.\n",
1182 journal->j_devname);
1183 clear_buffer_write_io_error(bh);
1184 set_buffer_uptodate(bh);
1185 }
1186 } else
1187 write_dirty_buffer(bh, WRITE);
1188
1189out:
1190 /* If we have just flushed the log (by marking s_start==0), then
1191 * any future commit will have to be careful to update the
1192 * superblock again to re-record the true start of the log. */
1193 1328
1329 /* Log is no longer empty */
1194 write_lock(&journal->j_state_lock); 1330 write_lock(&journal->j_state_lock);
1195 if (sb->s_start) 1331 journal->j_flags |= JBD2_FLUSHED;
1196 journal->j_flags &= ~JBD2_FLUSHED;
1197 else
1198 journal->j_flags |= JBD2_FLUSHED;
1199 write_unlock(&journal->j_state_lock); 1332 write_unlock(&journal->j_state_lock);
1200} 1333}
1201 1334
1335
1336/**
1337 * jbd2_journal_update_sb_errno() - Update error in the journal.
1338 * @journal: The journal to update.
1339 *
 1340 * Update a journal's errno. Write the updated superblock to disk, waiting for the IO
1341 * to complete.
1342 */
1343static void jbd2_journal_update_sb_errno(journal_t *journal)
1344{
1345 journal_superblock_t *sb = journal->j_superblock;
1346
1347 read_lock(&journal->j_state_lock);
1348 jbd_debug(1, "JBD2: updating superblock error (errno %d)\n",
1349 journal->j_errno);
1350 sb->s_errno = cpu_to_be32(journal->j_errno);
1351 read_unlock(&journal->j_state_lock);
1352
1353 jbd2_write_superblock(journal, WRITE_SYNC);
1354}
1355
1202/* 1356/*
1203 * Read the superblock for a given journal, performing initial 1357 * Read the superblock for a given journal, performing initial
1204 * validation of the format. 1358 * validation of the format.
1205 */ 1359 */
1206
1207static int journal_get_superblock(journal_t *journal) 1360static int journal_get_superblock(journal_t *journal)
1208{ 1361{
1209 struct buffer_head *bh; 1362 struct buffer_head *bh;
@@ -1397,14 +1550,11 @@ int jbd2_journal_destroy(journal_t *journal)
1397 1550
1398 if (journal->j_sb_buffer) { 1551 if (journal->j_sb_buffer) {
1399 if (!is_journal_aborted(journal)) { 1552 if (!is_journal_aborted(journal)) {
1400 /* We can now mark the journal as empty. */ 1553 mutex_lock(&journal->j_checkpoint_mutex);
1401 journal->j_tail = 0; 1554 jbd2_mark_journal_empty(journal);
1402 journal->j_tail_sequence = 1555 mutex_unlock(&journal->j_checkpoint_mutex);
1403 ++journal->j_transaction_sequence; 1556 } else
1404 jbd2_journal_update_superblock(journal, 1);
1405 } else {
1406 err = -EIO; 1557 err = -EIO;
1407 }
1408 brelse(journal->j_sb_buffer); 1558 brelse(journal->j_sb_buffer);
1409 } 1559 }
1410 1560
@@ -1551,61 +1701,6 @@ void jbd2_journal_clear_features(journal_t *journal, unsigned long compat,
1551EXPORT_SYMBOL(jbd2_journal_clear_features); 1701EXPORT_SYMBOL(jbd2_journal_clear_features);
1552 1702
1553/** 1703/**
1554 * int jbd2_journal_update_format () - Update on-disk journal structure.
1555 * @journal: Journal to act on.
1556 *
1557 * Given an initialised but unloaded journal struct, poke about in the
1558 * on-disk structure to update it to the most recent supported version.
1559 */
1560int jbd2_journal_update_format (journal_t *journal)
1561{
1562 journal_superblock_t *sb;
1563 int err;
1564
1565 err = journal_get_superblock(journal);
1566 if (err)
1567 return err;
1568
1569 sb = journal->j_superblock;
1570
1571 switch (be32_to_cpu(sb->s_header.h_blocktype)) {
1572 case JBD2_SUPERBLOCK_V2:
1573 return 0;
1574 case JBD2_SUPERBLOCK_V1:
1575 return journal_convert_superblock_v1(journal, sb);
1576 default:
1577 break;
1578 }
1579 return -EINVAL;
1580}
1581
1582static int journal_convert_superblock_v1(journal_t *journal,
1583 journal_superblock_t *sb)
1584{
1585 int offset, blocksize;
1586 struct buffer_head *bh;
1587
1588 printk(KERN_WARNING
1589 "JBD2: Converting superblock from version 1 to 2.\n");
1590
1591 /* Pre-initialise new fields to zero */
1592 offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb);
1593 blocksize = be32_to_cpu(sb->s_blocksize);
1594 memset(&sb->s_feature_compat, 0, blocksize-offset);
1595
1596 sb->s_nr_users = cpu_to_be32(1);
1597 sb->s_header.h_blocktype = cpu_to_be32(JBD2_SUPERBLOCK_V2);
1598 journal->j_format_version = 2;
1599
1600 bh = journal->j_sb_buffer;
1601 BUFFER_TRACE(bh, "marking dirty");
1602 mark_buffer_dirty(bh);
1603 sync_dirty_buffer(bh);
1604 return 0;
1605}
1606
1607
1608/**
1609 * int jbd2_journal_flush () - Flush journal 1704 * int jbd2_journal_flush () - Flush journal
1610 * @journal: Journal to act on. 1705 * @journal: Journal to act on.
1611 * 1706 *
@@ -1618,7 +1713,6 @@ int jbd2_journal_flush(journal_t *journal)
1618{ 1713{
1619 int err = 0; 1714 int err = 0;
1620 transaction_t *transaction = NULL; 1715 transaction_t *transaction = NULL;
1621 unsigned long old_tail;
1622 1716
1623 write_lock(&journal->j_state_lock); 1717 write_lock(&journal->j_state_lock);
1624 1718
@@ -1653,6 +1747,7 @@ int jbd2_journal_flush(journal_t *journal)
1653 if (is_journal_aborted(journal)) 1747 if (is_journal_aborted(journal))
1654 return -EIO; 1748 return -EIO;
1655 1749
1750 mutex_lock(&journal->j_checkpoint_mutex);
1656 jbd2_cleanup_journal_tail(journal); 1751 jbd2_cleanup_journal_tail(journal);
1657 1752
1658 /* Finally, mark the journal as really needing no recovery. 1753 /* Finally, mark the journal as really needing no recovery.
@@ -1660,14 +1755,9 @@ int jbd2_journal_flush(journal_t *journal)
1660 * the magic code for a fully-recovered superblock. Any future 1755 * the magic code for a fully-recovered superblock. Any future
1661 * commits of data to the journal will restore the current 1756 * commits of data to the journal will restore the current
1662 * s_start value. */ 1757 * s_start value. */
1758 jbd2_mark_journal_empty(journal);
1759 mutex_unlock(&journal->j_checkpoint_mutex);
1663 write_lock(&journal->j_state_lock); 1760 write_lock(&journal->j_state_lock);
1664 old_tail = journal->j_tail;
1665 journal->j_tail = 0;
1666 write_unlock(&journal->j_state_lock);
1667 jbd2_journal_update_superblock(journal, 1);
1668 write_lock(&journal->j_state_lock);
1669 journal->j_tail = old_tail;
1670
1671 J_ASSERT(!journal->j_running_transaction); 1761 J_ASSERT(!journal->j_running_transaction);
1672 J_ASSERT(!journal->j_committing_transaction); 1762 J_ASSERT(!journal->j_committing_transaction);
1673 J_ASSERT(!journal->j_checkpoint_transactions); 1763 J_ASSERT(!journal->j_checkpoint_transactions);
@@ -1707,8 +1797,12 @@ int jbd2_journal_wipe(journal_t *journal, int write)
1707 write ? "Clearing" : "Ignoring"); 1797 write ? "Clearing" : "Ignoring");
1708 1798
1709 err = jbd2_journal_skip_recovery(journal); 1799 err = jbd2_journal_skip_recovery(journal);
1710 if (write) 1800 if (write) {
1711 jbd2_journal_update_superblock(journal, 1); 1801 /* Lock to make assertions happy... */
1802 mutex_lock(&journal->j_checkpoint_mutex);
1803 jbd2_mark_journal_empty(journal);
1804 mutex_unlock(&journal->j_checkpoint_mutex);
1805 }
1712 1806
1713 no_recovery: 1807 no_recovery:
1714 return err; 1808 return err;
@@ -1758,7 +1852,7 @@ static void __journal_abort_soft (journal_t *journal, int errno)
1758 __jbd2_journal_abort_hard(journal); 1852 __jbd2_journal_abort_hard(journal);
1759 1853
1760 if (errno) 1854 if (errno)
1761 jbd2_journal_update_superblock(journal, 1); 1855 jbd2_journal_update_sb_errno(journal);
1762} 1856}
1763 1857
1764/** 1858/**
@@ -2016,7 +2110,7 @@ static struct kmem_cache *jbd2_journal_head_cache;
2016static atomic_t nr_journal_heads = ATOMIC_INIT(0); 2110static atomic_t nr_journal_heads = ATOMIC_INIT(0);
2017#endif 2111#endif
2018 2112
2019static int journal_init_jbd2_journal_head_cache(void) 2113static int jbd2_journal_init_journal_head_cache(void)
2020{ 2114{
2021 int retval; 2115 int retval;
2022 2116
@@ -2034,7 +2128,7 @@ static int journal_init_jbd2_journal_head_cache(void)
2034 return retval; 2128 return retval;
2035} 2129}
2036 2130
2037static void jbd2_journal_destroy_jbd2_journal_head_cache(void) 2131static void jbd2_journal_destroy_journal_head_cache(void)
2038{ 2132{
2039 if (jbd2_journal_head_cache) { 2133 if (jbd2_journal_head_cache) {
2040 kmem_cache_destroy(jbd2_journal_head_cache); 2134 kmem_cache_destroy(jbd2_journal_head_cache);
@@ -2322,7 +2416,7 @@ static void __exit jbd2_remove_jbd_stats_proc_entry(void)
2322 2416
2323struct kmem_cache *jbd2_handle_cache, *jbd2_inode_cache; 2417struct kmem_cache *jbd2_handle_cache, *jbd2_inode_cache;
2324 2418
2325static int __init journal_init_handle_cache(void) 2419static int __init jbd2_journal_init_handle_cache(void)
2326{ 2420{
2327 jbd2_handle_cache = KMEM_CACHE(jbd2_journal_handle, SLAB_TEMPORARY); 2421 jbd2_handle_cache = KMEM_CACHE(jbd2_journal_handle, SLAB_TEMPORARY);
2328 if (jbd2_handle_cache == NULL) { 2422 if (jbd2_handle_cache == NULL) {
@@ -2357,17 +2451,20 @@ static int __init journal_init_caches(void)
2357 2451
2358 ret = jbd2_journal_init_revoke_caches(); 2452 ret = jbd2_journal_init_revoke_caches();
2359 if (ret == 0) 2453 if (ret == 0)
2360 ret = journal_init_jbd2_journal_head_cache(); 2454 ret = jbd2_journal_init_journal_head_cache();
2455 if (ret == 0)
2456 ret = jbd2_journal_init_handle_cache();
2361 if (ret == 0) 2457 if (ret == 0)
2362 ret = journal_init_handle_cache(); 2458 ret = jbd2_journal_init_transaction_cache();
2363 return ret; 2459 return ret;
2364} 2460}
2365 2461
2366static void jbd2_journal_destroy_caches(void) 2462static void jbd2_journal_destroy_caches(void)
2367{ 2463{
2368 jbd2_journal_destroy_revoke_caches(); 2464 jbd2_journal_destroy_revoke_caches();
2369 jbd2_journal_destroy_jbd2_journal_head_cache(); 2465 jbd2_journal_destroy_journal_head_cache();
2370 jbd2_journal_destroy_handle_cache(); 2466 jbd2_journal_destroy_handle_cache();
2467 jbd2_journal_destroy_transaction_cache();
2371 jbd2_journal_destroy_slabs(); 2468 jbd2_journal_destroy_slabs();
2372} 2469}
2373 2470
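jbd2_journal_get_log_tail() above decides whether the tail can move at all with tid_gt(). Transaction IDs are 32-bit counters that eventually wrap, so the ordering test is a signed difference rather than a plain comparison; a minimal sketch of the idiom (the in-tree helper lives in include/linux/jbd2.h and is equivalent in spirit):

	#include <stdio.h>

	typedef unsigned int tid_t;

	/* Wrap-safe "x is newer than y": correct whenever the two tids are
	 * within 2^31 of each other, which the journal guarantees. */
	static int tid_gt(tid_t x, tid_t y)
	{
		int difference = (int)(x - y);
		return difference > 0;
	}

	int main(void)
	{
		printf("%d\n", tid_gt(2u, 0xfffffffeu)); /* 1: 2 is newer across the wrap */
		printf("%d\n", tid_gt(0xfffffffeu, 2u)); /* 0 */
		return 0;
	}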
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index da6d7baf1390..c1a03354a22f 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -21,6 +21,7 @@
21#include <linux/jbd2.h> 21#include <linux/jbd2.h>
22#include <linux/errno.h> 22#include <linux/errno.h>
23#include <linux/crc32.h> 23#include <linux/crc32.h>
24#include <linux/blkdev.h>
24#endif 25#endif
25 26
26/* 27/*
@@ -265,7 +266,9 @@ int jbd2_journal_recover(journal_t *journal)
265 err2 = sync_blockdev(journal->j_fs_dev); 266 err2 = sync_blockdev(journal->j_fs_dev);
266 if (!err) 267 if (!err)
267 err = err2; 268 err = err2;
268 269 /* Make sure all replayed data is on permanent storage */
270 if (journal->j_flags & JBD2_BARRIER)
271 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
269 return err; 272 return err;
270} 273}
271 274
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 30b2867d6cc9..6973705d6a3d 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -208,17 +208,13 @@ int __init jbd2_journal_init_revoke_caches(void)
208 J_ASSERT(!jbd2_revoke_record_cache); 208 J_ASSERT(!jbd2_revoke_record_cache);
209 J_ASSERT(!jbd2_revoke_table_cache); 209 J_ASSERT(!jbd2_revoke_table_cache);
210 210
211 jbd2_revoke_record_cache = kmem_cache_create("jbd2_revoke_record", 211 jbd2_revoke_record_cache = KMEM_CACHE(jbd2_revoke_record_s,
212 sizeof(struct jbd2_revoke_record_s), 212 SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY);
213 0,
214 SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY,
215 NULL);
216 if (!jbd2_revoke_record_cache) 213 if (!jbd2_revoke_record_cache)
217 goto record_cache_failure; 214 goto record_cache_failure;
218 215
219 jbd2_revoke_table_cache = kmem_cache_create("jbd2_revoke_table", 216 jbd2_revoke_table_cache = KMEM_CACHE(jbd2_revoke_table_s,
220 sizeof(struct jbd2_revoke_table_s), 217 SLAB_TEMPORARY);
221 0, SLAB_TEMPORARY, NULL);
222 if (!jbd2_revoke_table_cache) 218 if (!jbd2_revoke_table_cache)
223 goto table_cache_failure; 219 goto table_cache_failure;
224 return 0; 220 return 0;
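The revoke-cache hunk is a pure conversion to the KMEM_CACHE() convenience macro, which derives the slab's name, size and alignment from the struct itself. Roughly (paraphrasing include/linux/slab.h of this era), the first call now expands to:

	#define KMEM_CACHE(__struct, __flags)				\
		kmem_cache_create(#__struct, sizeof(struct __struct),	\
				  __alignof__(struct __struct), (__flags), NULL)

	jbd2_revoke_record_cache =
		kmem_cache_create("jbd2_revoke_record_s",
				  sizeof(struct jbd2_revoke_record_s),
				  __alignof__(struct jbd2_revoke_record_s),
				  SLAB_HWCACHE_ALIGN | SLAB_TEMPORARY, NULL);

Besides brevity, the conversion changes the visible slab name to the struct's name and passes the struct's natural alignment instead of 0.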
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index e5aba56e1fd5..ddcd3549c6c2 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -33,6 +33,35 @@
33static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); 33static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
34static void __jbd2_journal_unfile_buffer(struct journal_head *jh); 34static void __jbd2_journal_unfile_buffer(struct journal_head *jh);
35 35
36static struct kmem_cache *transaction_cache;
37int __init jbd2_journal_init_transaction_cache(void)
38{
39 J_ASSERT(!transaction_cache);
40 transaction_cache = kmem_cache_create("jbd2_transaction_s",
41 sizeof(transaction_t),
42 0,
43 SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY,
44 NULL);
45 if (transaction_cache)
46 return 0;
47 return -ENOMEM;
48}
49
50void jbd2_journal_destroy_transaction_cache(void)
51{
52 if (transaction_cache) {
53 kmem_cache_destroy(transaction_cache);
54 transaction_cache = NULL;
55 }
56}
57
58void jbd2_journal_free_transaction(transaction_t *transaction)
59{
60 if (unlikely(ZERO_OR_NULL_PTR(transaction)))
61 return;
62 kmem_cache_free(transaction_cache, transaction);
63}
64
36/* 65/*
37 * jbd2_get_transaction: obtain a new transaction_t object. 66 * jbd2_get_transaction: obtain a new transaction_t object.
38 * 67 *
@@ -133,7 +162,8 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
133 162
134alloc_transaction: 163alloc_transaction:
135 if (!journal->j_running_transaction) { 164 if (!journal->j_running_transaction) {
136 new_transaction = kzalloc(sizeof(*new_transaction), gfp_mask); 165 new_transaction = kmem_cache_alloc(transaction_cache,
166 gfp_mask | __GFP_ZERO);
137 if (!new_transaction) { 167 if (!new_transaction) {
138 /* 168 /*
139 * If __GFP_FS is not present, then we may be 169 * If __GFP_FS is not present, then we may be
@@ -162,7 +192,7 @@ repeat:
162 if (is_journal_aborted(journal) || 192 if (is_journal_aborted(journal) ||
163 (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) { 193 (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) {
164 read_unlock(&journal->j_state_lock); 194 read_unlock(&journal->j_state_lock);
165 kfree(new_transaction); 195 jbd2_journal_free_transaction(new_transaction);
166 return -EROFS; 196 return -EROFS;
167 } 197 }
168 198
@@ -284,7 +314,7 @@ repeat:
284 read_unlock(&journal->j_state_lock); 314 read_unlock(&journal->j_state_lock);
285 315
286 lock_map_acquire(&handle->h_lockdep_map); 316 lock_map_acquire(&handle->h_lockdep_map);
287 kfree(new_transaction); 317 jbd2_journal_free_transaction(new_transaction);
288 return 0; 318 return 0;
289} 319}
290 320
@@ -1549,9 +1579,9 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh)
1549 * of these pointers, it could go bad. Generally the caller needs to re-read 1579 * of these pointers, it could go bad. Generally the caller needs to re-read
1550 * the pointer from the transaction_t. 1580 * the pointer from the transaction_t.
1551 * 1581 *
1552 * Called under j_list_lock. The journal may not be locked. 1582 * Called under j_list_lock.
1553 */ 1583 */
1554void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) 1584static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
1555{ 1585{
1556 struct journal_head **list = NULL; 1586 struct journal_head **list = NULL;
1557 transaction_t *transaction; 1587 transaction_t *transaction;
@@ -1646,10 +1676,8 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
1646 spin_lock(&journal->j_list_lock); 1676 spin_lock(&journal->j_list_lock);
1647 if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { 1677 if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
1648 /* written-back checkpointed metadata buffer */ 1678 /* written-back checkpointed metadata buffer */
1649 if (jh->b_jlist == BJ_None) { 1679 JBUFFER_TRACE(jh, "remove from checkpoint list");
1650 JBUFFER_TRACE(jh, "remove from checkpoint list"); 1680 __jbd2_journal_remove_checkpoint(jh);
1651 __jbd2_journal_remove_checkpoint(jh);
1652 }
1653 } 1681 }
1654 spin_unlock(&journal->j_list_lock); 1682 spin_unlock(&journal->j_list_lock);
1655out: 1683out:
@@ -1949,6 +1977,8 @@ zap_buffer_unlocked:
1949 clear_buffer_mapped(bh); 1977 clear_buffer_mapped(bh);
1950 clear_buffer_req(bh); 1978 clear_buffer_req(bh);
1951 clear_buffer_new(bh); 1979 clear_buffer_new(bh);
1980 clear_buffer_delay(bh);
1981 clear_buffer_unwritten(bh);
1952 bh->b_bdev = NULL; 1982 bh->b_bdev = NULL;
1953 return may_free; 1983 return may_free;
1954} 1984}
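The allocation-side change in start_this_handle() is mechanical: a dedicated slab replaces the generic heap, with __GFP_ZERO preserving kzalloc()'s zeroing and the new free helper preserving kfree()'s tolerance of NULL. Side by side (a sketch of the call-site shape, not additional patched code):

	/* before: generic allocator, zeroed; kfree(NULL) is a no-op */
	new_transaction = kzalloc(sizeof(*new_transaction), gfp_mask);
	kfree(new_transaction);

	/* after: __GFP_ZERO keeps the zeroed-allocation semantics, and
	 * jbd2_journal_free_transaction() bails out on NULL (via
	 * ZERO_OR_NULL_PTR) so call sites need no extra checks */
	new_transaction = kmem_cache_alloc(transaction_cache,
					   gfp_mask | __GFP_ZERO);
	jbd2_journal_free_transaction(new_transaction);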
diff --git a/fs/libfs.c b/fs/libfs.c
index 722e0d5ba182..4a0d1f06da57 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -3,7 +3,7 @@
 3 * Library for filesystem writers. 3 * Library for filesystem writers.
4 */ 4 */
5 5
6#include <linux/module.h> 6#include <linux/export.h>
7#include <linux/pagemap.h> 7#include <linux/pagemap.h>
8#include <linux/slab.h> 8#include <linux/slab.h>
9#include <linux/mount.h> 9#include <linux/mount.h>
diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c
index f848b52c67b1..3ddcbb1c0a43 100644
--- a/fs/lockd/clnt4xdr.c
+++ b/fs/lockd/clnt4xdr.c
@@ -598,7 +598,7 @@ static struct rpc_procinfo nlm4_procedures[] = {
598 PROC(GRANTED_RES, res, norep), 598 PROC(GRANTED_RES, res, norep),
599}; 599};
600 600
601struct rpc_version nlm_version4 = { 601const struct rpc_version nlm_version4 = {
602 .number = 4, 602 .number = 4,
603 .nrprocs = ARRAY_SIZE(nlm4_procedures), 603 .nrprocs = ARRAY_SIZE(nlm4_procedures),
604 .procs = nlm4_procedures, 604 .procs = nlm4_procedures,
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 8d4ea8351e3d..ba1dc2eebd1e 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -62,7 +62,8 @@ struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init)
62 62
63 host = nlmclnt_lookup_host(nlm_init->address, nlm_init->addrlen, 63 host = nlmclnt_lookup_host(nlm_init->address, nlm_init->addrlen,
64 nlm_init->protocol, nlm_version, 64 nlm_init->protocol, nlm_version,
65 nlm_init->hostname, nlm_init->noresvport); 65 nlm_init->hostname, nlm_init->noresvport,
66 nlm_init->net);
66 if (host == NULL) { 67 if (host == NULL) {
67 lockd_down(); 68 lockd_down();
68 return ERR_PTR(-ENOLCK); 69 return ERR_PTR(-ENOLCK);
diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c
index 180ac34feb9a..3d35e3e80c1c 100644
--- a/fs/lockd/clntxdr.c
+++ b/fs/lockd/clntxdr.c
@@ -596,19 +596,19 @@ static struct rpc_procinfo nlm_procedures[] = {
596 PROC(GRANTED_RES, res, norep), 596 PROC(GRANTED_RES, res, norep),
597}; 597};
598 598
599static struct rpc_version nlm_version1 = { 599static const struct rpc_version nlm_version1 = {
600 .number = 1, 600 .number = 1,
601 .nrprocs = ARRAY_SIZE(nlm_procedures), 601 .nrprocs = ARRAY_SIZE(nlm_procedures),
602 .procs = nlm_procedures, 602 .procs = nlm_procedures,
603}; 603};
604 604
605static struct rpc_version nlm_version3 = { 605static const struct rpc_version nlm_version3 = {
606 .number = 3, 606 .number = 3,
607 .nrprocs = ARRAY_SIZE(nlm_procedures), 607 .nrprocs = ARRAY_SIZE(nlm_procedures),
608 .procs = nlm_procedures, 608 .procs = nlm_procedures,
609}; 609};
610 610
611static struct rpc_version *nlm_versions[] = { 611static const struct rpc_version *nlm_versions[] = {
612 [1] = &nlm_version1, 612 [1] = &nlm_version1,
613 [3] = &nlm_version3, 613 [3] = &nlm_version3,
614#ifdef CONFIG_LOCKD_V4 614#ifdef CONFIG_LOCKD_V4
@@ -618,7 +618,7 @@ static struct rpc_version *nlm_versions[] = {
618 618
619static struct rpc_stat nlm_rpc_stats; 619static struct rpc_stat nlm_rpc_stats;
620 620
621struct rpc_program nlm_program = { 621const struct rpc_program nlm_program = {
622 .name = "lockd", 622 .name = "lockd",
623 .number = NLM_PROGRAM, 623 .number = NLM_PROGRAM,
624 .nrvers = ARRAY_SIZE(nlm_versions), 624 .nrvers = ARRAY_SIZE(nlm_versions),
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 6f29836ec0cb..eb75ca7c2d6e 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -17,6 +17,8 @@
17#include <linux/lockd/lockd.h> 17#include <linux/lockd/lockd.h>
18#include <linux/mutex.h> 18#include <linux/mutex.h>
19 19
20#include <linux/sunrpc/svc_xprt.h>
21
20#include <net/ipv6.h> 22#include <net/ipv6.h>
21 23
22#define NLMDBG_FACILITY NLMDBG_HOSTCACHE 24#define NLMDBG_FACILITY NLMDBG_HOSTCACHE
@@ -54,6 +56,7 @@ struct nlm_lookup_host_info {
54 const char *hostname; /* remote's hostname */ 56 const char *hostname; /* remote's hostname */
55 const size_t hostname_len; /* it's length */ 57 const size_t hostname_len; /* it's length */
56 const int noresvport; /* use non-priv port */ 58 const int noresvport; /* use non-priv port */
59 struct net *net; /* network namespace to bind */
57}; 60};
58 61
59/* 62/*
@@ -155,6 +158,7 @@ static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni,
155 INIT_LIST_HEAD(&host->h_reclaim); 158 INIT_LIST_HEAD(&host->h_reclaim);
156 host->h_nsmhandle = nsm; 159 host->h_nsmhandle = nsm;
157 host->h_addrbuf = nsm->sm_addrbuf; 160 host->h_addrbuf = nsm->sm_addrbuf;
161 host->net = ni->net;
158 162
159out: 163out:
160 return host; 164 return host;
@@ -206,7 +210,8 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
206 const unsigned short protocol, 210 const unsigned short protocol,
207 const u32 version, 211 const u32 version,
208 const char *hostname, 212 const char *hostname,
209 int noresvport) 213 int noresvport,
214 struct net *net)
210{ 215{
211 struct nlm_lookup_host_info ni = { 216 struct nlm_lookup_host_info ni = {
212 .server = 0, 217 .server = 0,
@@ -217,6 +222,7 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
217 .hostname = hostname, 222 .hostname = hostname,
218 .hostname_len = strlen(hostname), 223 .hostname_len = strlen(hostname),
219 .noresvport = noresvport, 224 .noresvport = noresvport,
225 .net = net,
220 }; 226 };
221 struct hlist_head *chain; 227 struct hlist_head *chain;
222 struct hlist_node *pos; 228 struct hlist_node *pos;
@@ -231,6 +237,8 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
231 237
232 chain = &nlm_client_hosts[nlm_hash_address(sap)]; 238 chain = &nlm_client_hosts[nlm_hash_address(sap)];
233 hlist_for_each_entry(host, pos, chain, h_hash) { 239 hlist_for_each_entry(host, pos, chain, h_hash) {
240 if (host->net != net)
241 continue;
234 if (!rpc_cmp_addr(nlm_addr(host), sap)) 242 if (!rpc_cmp_addr(nlm_addr(host), sap))
235 continue; 243 continue;
236 244
@@ -318,6 +326,7 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
318 struct nsm_handle *nsm = NULL; 326 struct nsm_handle *nsm = NULL;
319 struct sockaddr *src_sap = svc_daddr(rqstp); 327 struct sockaddr *src_sap = svc_daddr(rqstp);
320 size_t src_len = rqstp->rq_daddrlen; 328 size_t src_len = rqstp->rq_daddrlen;
329 struct net *net = rqstp->rq_xprt->xpt_net;
321 struct nlm_lookup_host_info ni = { 330 struct nlm_lookup_host_info ni = {
322 .server = 1, 331 .server = 1,
323 .sap = svc_addr(rqstp), 332 .sap = svc_addr(rqstp),
@@ -326,6 +335,7 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
326 .version = rqstp->rq_vers, 335 .version = rqstp->rq_vers,
327 .hostname = hostname, 336 .hostname = hostname,
328 .hostname_len = hostname_len, 337 .hostname_len = hostname_len,
338 .net = net,
329 }; 339 };
330 340
331 dprintk("lockd: %s(host='%*s', vers=%u, proto=%s)\n", __func__, 341 dprintk("lockd: %s(host='%*s', vers=%u, proto=%s)\n", __func__,
@@ -339,6 +349,8 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
339 349
340 chain = &nlm_server_hosts[nlm_hash_address(ni.sap)]; 350 chain = &nlm_server_hosts[nlm_hash_address(ni.sap)];
341 hlist_for_each_entry(host, pos, chain, h_hash) { 351 hlist_for_each_entry(host, pos, chain, h_hash) {
352 if (host->net != net)
353 continue;
342 if (!rpc_cmp_addr(nlm_addr(host), ni.sap)) 354 if (!rpc_cmp_addr(nlm_addr(host), ni.sap))
343 continue; 355 continue;
344 356
@@ -431,7 +443,7 @@ nlm_bind_host(struct nlm_host *host)
431 .to_retries = 5U, 443 .to_retries = 5U,
432 }; 444 };
433 struct rpc_create_args args = { 445 struct rpc_create_args args = {
434 .net = &init_net, 446 .net = host->net,
435 .protocol = host->h_proto, 447 .protocol = host->h_proto,
436 .address = nlm_addr(host), 448 .address = nlm_addr(host),
437 .addrsize = host->h_addrlen, 449 .addrsize = host->h_addrlen,
@@ -553,12 +565,8 @@ void nlm_host_rebooted(const struct nlm_reboot *info)
553 nsm_release(nsm); 565 nsm_release(nsm);
554} 566}
555 567
556/*
557 * Shut down the hosts module.
558 * Note that this routine is called only at server shutdown time.
559 */
560void 568void
561nlm_shutdown_hosts(void) 569nlm_shutdown_hosts_net(struct net *net)
562{ 570{
563 struct hlist_head *chain; 571 struct hlist_head *chain;
564 struct hlist_node *pos; 572 struct hlist_node *pos;
@@ -570,6 +578,8 @@ nlm_shutdown_hosts(void)
570 /* First, make all hosts eligible for gc */ 578 /* First, make all hosts eligible for gc */
571 dprintk("lockd: nuking all hosts...\n"); 579 dprintk("lockd: nuking all hosts...\n");
572 for_each_host(host, pos, chain, nlm_server_hosts) { 580 for_each_host(host, pos, chain, nlm_server_hosts) {
581 if (net && host->net != net)
582 continue;
573 host->h_expires = jiffies - 1; 583 host->h_expires = jiffies - 1;
574 if (host->h_rpcclnt) { 584 if (host->h_rpcclnt) {
575 rpc_shutdown_client(host->h_rpcclnt); 585 rpc_shutdown_client(host->h_rpcclnt);
@@ -580,15 +590,29 @@ nlm_shutdown_hosts(void)
580 /* Then, perform a garbage collection pass */ 590 /* Then, perform a garbage collection pass */
581 nlm_gc_hosts(); 591 nlm_gc_hosts();
582 mutex_unlock(&nlm_host_mutex); 592 mutex_unlock(&nlm_host_mutex);
593}
594
595/*
596 * Shut down the hosts module.
597 * Note that this routine is called only at server shutdown time.
598 */
599void
600nlm_shutdown_hosts(void)
601{
602 struct hlist_head *chain;
603 struct hlist_node *pos;
604 struct nlm_host *host;
605
606 nlm_shutdown_hosts_net(NULL);
583 607
584 /* complain if any hosts are left */ 608 /* complain if any hosts are left */
585 if (nrhosts != 0) { 609 if (nrhosts != 0) {
586 printk(KERN_WARNING "lockd: couldn't shutdown host module!\n"); 610 printk(KERN_WARNING "lockd: couldn't shutdown host module!\n");
587 dprintk("lockd: %lu hosts left:\n", nrhosts); 611 dprintk("lockd: %lu hosts left:\n", nrhosts);
588 for_each_host(host, pos, chain, nlm_server_hosts) { 612 for_each_host(host, pos, chain, nlm_server_hosts) {
589 dprintk(" %s (cnt %d use %d exp %ld)\n", 613 dprintk(" %s (cnt %d use %d exp %ld net %p)\n",
590 host->h_name, atomic_read(&host->h_count), 614 host->h_name, atomic_read(&host->h_count),
591 host->h_inuse, host->h_expires); 615 host->h_inuse, host->h_expires, host->net);
592 } 616 }
593 } 617 }
594} 618}
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 65ba36b80a9e..7ef14b3c5bee 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -47,7 +47,7 @@ struct nsm_res {
47 u32 state; 47 u32 state;
48}; 48};
49 49
50static struct rpc_program nsm_program; 50static const struct rpc_program nsm_program;
51static LIST_HEAD(nsm_handles); 51static LIST_HEAD(nsm_handles);
52static DEFINE_SPINLOCK(nsm_lock); 52static DEFINE_SPINLOCK(nsm_lock);
53 53
@@ -62,14 +62,14 @@ static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm)
62 return (struct sockaddr *)&nsm->sm_addr; 62 return (struct sockaddr *)&nsm->sm_addr;
63} 63}
64 64
65static struct rpc_clnt *nsm_create(void) 65static struct rpc_clnt *nsm_create(struct net *net)
66{ 66{
67 struct sockaddr_in sin = { 67 struct sockaddr_in sin = {
68 .sin_family = AF_INET, 68 .sin_family = AF_INET,
69 .sin_addr.s_addr = htonl(INADDR_LOOPBACK), 69 .sin_addr.s_addr = htonl(INADDR_LOOPBACK),
70 }; 70 };
71 struct rpc_create_args args = { 71 struct rpc_create_args args = {
72 .net = &init_net, 72 .net = net,
73 .protocol = XPRT_TRANSPORT_UDP, 73 .protocol = XPRT_TRANSPORT_UDP,
74 .address = (struct sockaddr *)&sin, 74 .address = (struct sockaddr *)&sin,
75 .addrsize = sizeof(sin), 75 .addrsize = sizeof(sin),
@@ -83,7 +83,8 @@ static struct rpc_clnt *nsm_create(void)
83 return rpc_create(&args); 83 return rpc_create(&args);
84} 84}
85 85
86static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res) 86static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res,
87 struct net *net)
87{ 88{
88 struct rpc_clnt *clnt; 89 struct rpc_clnt *clnt;
89 int status; 90 int status;
@@ -99,7 +100,7 @@ static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res)
99 .rpc_resp = res, 100 .rpc_resp = res,
100 }; 101 };
101 102
102 clnt = nsm_create(); 103 clnt = nsm_create(net);
103 if (IS_ERR(clnt)) { 104 if (IS_ERR(clnt)) {
104 status = PTR_ERR(clnt); 105 status = PTR_ERR(clnt);
105 dprintk("lockd: failed to create NSM upcall transport, " 106 dprintk("lockd: failed to create NSM upcall transport, "
@@ -149,7 +150,7 @@ int nsm_monitor(const struct nlm_host *host)
149 */ 150 */
150 nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf; 151 nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf;
151 152
152 status = nsm_mon_unmon(nsm, NSMPROC_MON, &res); 153 status = nsm_mon_unmon(nsm, NSMPROC_MON, &res, host->net);
153 if (unlikely(res.status != 0)) 154 if (unlikely(res.status != 0))
154 status = -EIO; 155 status = -EIO;
155 if (unlikely(status < 0)) { 156 if (unlikely(status < 0)) {
@@ -183,7 +184,7 @@ void nsm_unmonitor(const struct nlm_host *host)
183 && nsm->sm_monitored && !nsm->sm_sticky) { 184 && nsm->sm_monitored && !nsm->sm_sticky) {
184 dprintk("lockd: nsm_unmonitor(%s)\n", nsm->sm_name); 185 dprintk("lockd: nsm_unmonitor(%s)\n", nsm->sm_name);
185 186
186 status = nsm_mon_unmon(nsm, NSMPROC_UNMON, &res); 187 status = nsm_mon_unmon(nsm, NSMPROC_UNMON, &res, host->net);
187 if (res.status != 0) 188 if (res.status != 0)
188 status = -EIO; 189 status = -EIO;
189 if (status < 0) 190 if (status < 0)
@@ -534,19 +535,19 @@ static struct rpc_procinfo nsm_procedures[] = {
534 }, 535 },
535}; 536};
536 537
537static struct rpc_version nsm_version1 = { 538static const struct rpc_version nsm_version1 = {
538 .number = 1, 539 .number = 1,
539 .nrprocs = ARRAY_SIZE(nsm_procedures), 540 .nrprocs = ARRAY_SIZE(nsm_procedures),
540 .procs = nsm_procedures 541 .procs = nsm_procedures
541}; 542};
542 543
543static struct rpc_version * nsm_version[] = { 544static const struct rpc_version *nsm_version[] = {
544 [1] = &nsm_version1, 545 [1] = &nsm_version1,
545}; 546};
546 547
547static struct rpc_stat nsm_stats; 548static struct rpc_stat nsm_stats;
548 549
549static struct rpc_program nsm_program = { 550static const struct rpc_program nsm_program = {
550 .name = "statd", 551 .name = "statd",
551 .number = NSM_PROGRAM, 552 .number = NSM_PROGRAM,
552 .nrvers = ARRAY_SIZE(nsm_version), 553 .nrvers = ARRAY_SIZE(nsm_version),
diff --git a/fs/lockd/netns.h b/fs/lockd/netns.h
new file mode 100644
index 000000000000..ce227e0fbc5c
--- /dev/null
+++ b/fs/lockd/netns.h
@@ -0,0 +1,12 @@
1#ifndef __LOCKD_NETNS_H__
2#define __LOCKD_NETNS_H__
3
4#include <net/netns/generic.h>
5
6struct lockd_net {
7 unsigned int nlmsvc_users;
8};
9
10extern int lockd_net_id;
11
12#endif
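lockd's new per-namespace state is reached through the pernet machinery: register_pernet_subsys() allocates .size bytes for every network namespace, and net_generic() returns that slot. A minimal sketch of the access pattern, assuming lockd_net_ops has been registered as done in svc.c below (the helper name is illustrative):

	#include <net/net_namespace.h>
	#include <net/netns/generic.h>

	/* Fetch this namespace's lockd state; the slot was sized from
	 * lockd_net_ops.size and zeroed at namespace creation. */
	static unsigned int lockd_users_in(struct net *net)
	{
		struct lockd_net *ln = net_generic(net, lockd_net_id);

		return ln->nlmsvc_users;
	}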
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index c061b9aa7ddb..2774e1013b34 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -35,6 +35,8 @@
35#include <linux/lockd/lockd.h> 35#include <linux/lockd/lockd.h>
36#include <linux/nfs.h> 36#include <linux/nfs.h>
37 37
38#include "netns.h"
39
38#define NLMDBG_FACILITY NLMDBG_SVC 40#define NLMDBG_FACILITY NLMDBG_SVC
39#define LOCKD_BUFSIZE (1024 + NLMSVC_XDRSIZE) 41#define LOCKD_BUFSIZE (1024 + NLMSVC_XDRSIZE)
40#define ALLOWED_SIGS (sigmask(SIGKILL)) 42#define ALLOWED_SIGS (sigmask(SIGKILL))
@@ -50,6 +52,8 @@ static struct task_struct *nlmsvc_task;
50static struct svc_rqst *nlmsvc_rqst; 52static struct svc_rqst *nlmsvc_rqst;
51unsigned long nlmsvc_timeout; 53unsigned long nlmsvc_timeout;
52 54
55int lockd_net_id;
56
53/* 57/*
54 * These can be set at insmod time (useful for NFS as root filesystem), 58 * These can be set at insmod time (useful for NFS as root filesystem),
55 * and also changed through the sysctl interface. -- Jamie Lokier, Aug 2003 59 * and also changed through the sysctl interface. -- Jamie Lokier, Aug 2003
@@ -189,27 +193,29 @@ lockd(void *vrqstp)
189} 193}
190 194
191static int create_lockd_listener(struct svc_serv *serv, const char *name, 195static int create_lockd_listener(struct svc_serv *serv, const char *name,
192 const int family, const unsigned short port) 196 struct net *net, const int family,
197 const unsigned short port)
193{ 198{
194 struct svc_xprt *xprt; 199 struct svc_xprt *xprt;
195 200
196 xprt = svc_find_xprt(serv, name, family, 0); 201 xprt = svc_find_xprt(serv, name, net, family, 0);
197 if (xprt == NULL) 202 if (xprt == NULL)
198 return svc_create_xprt(serv, name, &init_net, family, port, 203 return svc_create_xprt(serv, name, net, family, port,
199 SVC_SOCK_DEFAULTS); 204 SVC_SOCK_DEFAULTS);
200 svc_xprt_put(xprt); 205 svc_xprt_put(xprt);
201 return 0; 206 return 0;
202} 207}
203 208
204static int create_lockd_family(struct svc_serv *serv, const int family) 209static int create_lockd_family(struct svc_serv *serv, struct net *net,
210 const int family)
205{ 211{
206 int err; 212 int err;
207 213
208 err = create_lockd_listener(serv, "udp", family, nlm_udpport); 214 err = create_lockd_listener(serv, "udp", net, family, nlm_udpport);
209 if (err < 0) 215 if (err < 0)
210 return err; 216 return err;
211 217
212 return create_lockd_listener(serv, "tcp", family, nlm_tcpport); 218 return create_lockd_listener(serv, "tcp", net, family, nlm_tcpport);
213} 219}
214 220
215/* 221/*
@@ -222,16 +228,16 @@ static int create_lockd_family(struct svc_serv *serv, const int family)
222 * Returns zero if all listeners are available; otherwise a 228 * Returns zero if all listeners are available; otherwise a
223 * negative errno value is returned. 229 * negative errno value is returned.
224 */ 230 */
225static int make_socks(struct svc_serv *serv) 231static int make_socks(struct svc_serv *serv, struct net *net)
226{ 232{
227 static int warned; 233 static int warned;
228 int err; 234 int err;
229 235
230 err = create_lockd_family(serv, PF_INET); 236 err = create_lockd_family(serv, net, PF_INET);
231 if (err < 0) 237 if (err < 0)
232 goto out_err; 238 goto out_err;
233 239
234 err = create_lockd_family(serv, PF_INET6); 240 err = create_lockd_family(serv, net, PF_INET6);
235 if (err < 0 && err != -EAFNOSUPPORT) 241 if (err < 0 && err != -EAFNOSUPPORT)
236 goto out_err; 242 goto out_err;
237 243
@@ -245,6 +251,47 @@ out_err:
245 return err; 251 return err;
246} 252}
247 253
254static int lockd_up_net(struct net *net)
255{
256 struct lockd_net *ln = net_generic(net, lockd_net_id);
257 struct svc_serv *serv = nlmsvc_rqst->rq_server;
258 int error;
259
260 if (ln->nlmsvc_users)
261 return 0;
262
263 error = svc_rpcb_setup(serv, net);
264 if (error)
265 goto err_rpcb;
266
267 error = make_socks(serv, net);
268 if (error < 0)
269 goto err_socks;
270 return 0;
271
272err_socks:
273 svc_rpcb_cleanup(serv, net);
274err_rpcb:
275 return error;
276}
277
278static void lockd_down_net(struct net *net)
279{
280 struct lockd_net *ln = net_generic(net, lockd_net_id);
281 struct svc_serv *serv = nlmsvc_rqst->rq_server;
282
283 if (ln->nlmsvc_users) {
284 if (--ln->nlmsvc_users == 0) {
285 nlm_shutdown_hosts_net(net);
286 svc_shutdown_net(serv, net);
287 }
288 } else {
289 printk(KERN_ERR "lockd_down_net: no users! task=%p, net=%p\n",
290 nlmsvc_task, net);
291 BUG();
292 }
293}
294
248/* 295/*
249 * Bring up the lockd process if it's not already up. 296 * Bring up the lockd process if it's not already up.
250 */ 297 */
@@ -252,13 +299,16 @@ int lockd_up(void)
252{ 299{
253 struct svc_serv *serv; 300 struct svc_serv *serv;
254 int error = 0; 301 int error = 0;
302 struct net *net = current->nsproxy->net_ns;
255 303
256 mutex_lock(&nlmsvc_mutex); 304 mutex_lock(&nlmsvc_mutex);
257 /* 305 /*
258 * Check whether we're already up and running. 306 * Check whether we're already up and running.
259 */ 307 */
260 if (nlmsvc_rqst) 308 if (nlmsvc_rqst) {
309 error = lockd_up_net(net);
261 goto out; 310 goto out;
311 }
262 312
263 /* 313 /*
264 * Sanity check: if there's no pid, 314 * Sanity check: if there's no pid,
@@ -275,7 +325,7 @@ int lockd_up(void)
275 goto out; 325 goto out;
276 } 326 }
277 327
278 error = make_socks(serv); 328 error = make_socks(serv, net);
279 if (error < 0) 329 if (error < 0)
280 goto destroy_and_out; 330 goto destroy_and_out;
281 331
@@ -313,8 +363,12 @@ int lockd_up(void)
313destroy_and_out: 363destroy_and_out:
314 svc_destroy(serv); 364 svc_destroy(serv);
315out: 365out:
316 if (!error) 366 if (!error) {
367 struct lockd_net *ln = net_generic(net, lockd_net_id);
368
369 ln->nlmsvc_users++;
317 nlmsvc_users++; 370 nlmsvc_users++;
371 }
318 mutex_unlock(&nlmsvc_mutex); 372 mutex_unlock(&nlmsvc_mutex);
319 return error; 373 return error;
320} 374}
@@ -328,8 +382,10 @@ lockd_down(void)
328{ 382{
329 mutex_lock(&nlmsvc_mutex); 383 mutex_lock(&nlmsvc_mutex);
330 if (nlmsvc_users) { 384 if (nlmsvc_users) {
331 if (--nlmsvc_users) 385 if (--nlmsvc_users) {
386 lockd_down_net(current->nsproxy->net_ns);
332 goto out; 387 goto out;
388 }
333 } else { 389 } else {
334 printk(KERN_ERR "lockd_down: no users! task=%p\n", 390 printk(KERN_ERR "lockd_down: no users! task=%p\n",
335 nlmsvc_task); 391 nlmsvc_task);
@@ -497,24 +553,55 @@ module_param_call(nlm_tcpport, param_set_port, param_get_int,
497module_param(nsm_use_hostnames, bool, 0644); 553module_param(nsm_use_hostnames, bool, 0644);
498module_param(nlm_max_connections, uint, 0644); 554module_param(nlm_max_connections, uint, 0644);
499 555
556static int lockd_init_net(struct net *net)
557{
558 return 0;
559}
560
561static void lockd_exit_net(struct net *net)
562{
563}
564
565static struct pernet_operations lockd_net_ops = {
566 .init = lockd_init_net,
567 .exit = lockd_exit_net,
568 .id = &lockd_net_id,
569 .size = sizeof(struct lockd_net),
570};
571
572
500/* 573/*
501 * Initialising and terminating the module. 574 * Initialising and terminating the module.
502 */ 575 */
503 576
504static int __init init_nlm(void) 577static int __init init_nlm(void)
505{ 578{
579 int err;
580
506#ifdef CONFIG_SYSCTL 581#ifdef CONFIG_SYSCTL
582 err = -ENOMEM;
507 nlm_sysctl_table = register_sysctl_table(nlm_sysctl_root); 583 nlm_sysctl_table = register_sysctl_table(nlm_sysctl_root);
508 return nlm_sysctl_table ? 0 : -ENOMEM; 584 if (nlm_sysctl_table == NULL)
509#else 585 goto err_sysctl;
586#endif
587 err = register_pernet_subsys(&lockd_net_ops);
588 if (err)
589 goto err_pernet;
510 return 0; 590 return 0;
591
592err_pernet:
593#ifdef CONFIG_SYSCTL
594 unregister_sysctl_table(nlm_sysctl_table);
511#endif 595#endif
596err_sysctl:
597 return err;
512} 598}
513 599
514static void __exit exit_nlm(void) 600static void __exit exit_nlm(void)
515{ 601{
516 /* FIXME: delete all NLM clients */ 602 /* FIXME: delete all NLM clients */
517 nlm_shutdown_hosts(); 603 nlm_shutdown_hosts();
604 unregister_pernet_subsys(&lockd_net_ops);
518#ifdef CONFIG_SYSCTL 605#ifdef CONFIG_SYSCTL
519 unregister_sysctl_table(nlm_sysctl_table); 606 unregister_sysctl_table(nlm_sysctl_table);
520#endif 607#endif
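After these hunks lockd tracks two user counts: the global nlmsvc_users (does the kernel thread exist at all?) and the per-namespace ln->nlmsvc_users (does this namespace hold listeners?). A hedged sketch of the resulting caller-visible behaviour, with error handling elided:

	int err;

	/* Process in netns A, first user anywhere: starts the lockd
	 * thread and creates A's sockets. */
	err = lockd_up();

	/* Process in netns B, thread already running: only
	 * lockd_up_net() runs, adding B's sockets and rpcbind state. */
	err = lockd_up();

	/* Last user in B: lockd_down_net() tears down B's hosts and
	 * sockets while the shared thread keeps serving A. */
	lockd_down();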
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index f0179c3745d2..e46353f41a42 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -46,7 +46,6 @@ static void nlmsvc_remove_block(struct nlm_block *block);
46static int nlmsvc_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock); 46static int nlmsvc_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock);
47static void nlmsvc_freegrantargs(struct nlm_rqst *call); 47static void nlmsvc_freegrantargs(struct nlm_rqst *call);
48static const struct rpc_call_ops nlmsvc_grant_ops; 48static const struct rpc_call_ops nlmsvc_grant_ops;
49static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie);
50 49
51/* 50/*
52 * The list of blocked locks to retry 51 * The list of blocked locks to retry
@@ -54,6 +53,35 @@ static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie);
54static LIST_HEAD(nlm_blocked); 53static LIST_HEAD(nlm_blocked);
55static DEFINE_SPINLOCK(nlm_blocked_lock); 54static DEFINE_SPINLOCK(nlm_blocked_lock);
56 55
56#ifdef LOCKD_DEBUG
57static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie)
58{
59 /*
60 * We can get away with a static buffer because we're only
61 * called with BKL held.
62 */
63 static char buf[2*NLM_MAXCOOKIELEN+1];
64 unsigned int i, len = sizeof(buf);
65 char *p = buf;
66
67 len--; /* allow for trailing \0 */
68 if (len < 3)
69 return "???";
70 for (i = 0 ; i < cookie->len ; i++) {
71 if (len < 2) {
72 strcpy(p-3, "...");
73 break;
74 }
75 sprintf(p, "%02x", cookie->data[i]);
76 p += 2;
77 len -= 2;
78 }
79 *p = '\0';
80
81 return buf;
82}
83#endif
84
57/* 85/*
58 * Insert a blocked lock into the global list 86 * Insert a blocked lock into the global list
59 */ 87 */
@@ -935,32 +963,3 @@ nlmsvc_retry_blocked(void)
935 963
936 return timeout; 964 return timeout;
937} 965}
938
939#ifdef RPC_DEBUG
940static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie)
941{
942 /*
943 * We can get away with a static buffer because we're only
944 * called with BKL held.
945 */
946 static char buf[2*NLM_MAXCOOKIELEN+1];
947 unsigned int i, len = sizeof(buf);
948 char *p = buf;
949
950 len--; /* allow for trailing \0 */
951 if (len < 3)
952 return "???";
953 for (i = 0 ; i < cookie->len ; i++) {
954 if (len < 2) {
955 strcpy(p-3, "...");
956 break;
957 }
958 sprintf(p, "%02x", cookie->data[i]);
959 p += 2;
960 len -= 2;
961 }
962 *p = '\0';
963
964 return buf;
965}
966#endif
diff --git a/fs/mpage.c b/fs/mpage.c
index 643e9f55ef29..0face1c4d4c6 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -13,7 +13,7 @@
13 */ 13 */
14 14
15#include <linux/kernel.h> 15#include <linux/kernel.h>
16#include <linux/module.h> 16#include <linux/export.h>
17#include <linux/mm.h> 17#include <linux/mm.h>
18#include <linux/kdev_t.h> 18#include <linux/kdev_t.h>
19#include <linux/gfp.h> 19#include <linux/gfp.h>
diff --git a/fs/namei.c b/fs/namei.c
index 73ec863a9896..e615ff37e27d 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -15,7 +15,7 @@
15 */ 15 */
16 16
17#include <linux/init.h> 17#include <linux/init.h>
18#include <linux/module.h> 18#include <linux/export.h>
19#include <linux/slab.h> 19#include <linux/slab.h>
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/namei.h> 21#include <linux/namei.h>
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index dbcd82126aed..2a0e6c599147 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -64,6 +64,7 @@ config NFS_V4
64 bool "NFS client support for NFS version 4" 64 bool "NFS client support for NFS version 4"
65 depends on NFS_FS 65 depends on NFS_FS
66 select SUNRPC_GSS 66 select SUNRPC_GSS
67 select KEYS
67 help 68 help
68 This option enables support for version 4 of the NFS protocol 69 This option enables support for version 4 of the NFS protocol
69 (RFC 3530) in the kernel's NFS client. 70 (RFC 3530) in the kernel's NFS client.
@@ -98,6 +99,18 @@ config PNFS_OBJLAYOUT
98 depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD 99 depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD
99 default m 100 default m
100 101
102config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN
103 string "NFSv4.1 Implementation ID Domain"
104 depends on NFS_V4_1
105 default "kernel.org"
106 help
107 This option defines the domain portion of the implementation ID that
108 may be sent in the NFS exchange_id operation. The value must be in
109 the format of a DNS domain name and should be set to the DNS domain
110 name of the distribution.
111 If the NFS client is unchanged from the upstream kernel, this
112 option should be set to the default "kernel.org".
113
101config ROOT_NFS 114config ROOT_NFS
102 bool "Root file system on NFS" 115 bool "Root file system on NFS"
103 depends on NFS_FS=y && IP_PNP 116 depends on NFS_FS=y && IP_PNP
@@ -130,16 +143,10 @@ config NFS_USE_KERNEL_DNS
130 bool 143 bool
131 depends on NFS_V4 && !NFS_USE_LEGACY_DNS 144 depends on NFS_V4 && !NFS_USE_LEGACY_DNS
132 select DNS_RESOLVER 145 select DNS_RESOLVER
133 select KEYS
134 default y 146 default y
135 147
136config NFS_USE_NEW_IDMAPPER 148config NFS_DEBUG
137 bool "Use the new idmapper upcall routine" 149 bool
138 depends on NFS_V4 && KEYS 150 depends on NFS_FS && SUNRPC_DEBUG
139 help 151 select CRC32
140 Say Y here if you want NFS to use the new idmapper upcall functions. 152 default y
141 You will need /sbin/request-key (usually provided by the keyutils
142 package). For details, read
143 <file:Documentation/filesystems/nfs/idmapper.txt>.
144
145 If you are unsure, say N.
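A Kconfig string option surfaces in C as a quoted-string macro in the generated autoconf.h, so the new domain is directly usable when building the NFSv4.1 implementation ID. A hypothetical use (the real exchange_id plumbing lives in nfs4proc.c hunks beyond this excerpt; the helper is illustrative only):

	#include <linux/kernel.h>
	#include <linux/utsname.h>

	/* CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN expands to the string
	 * configured above, e.g. "kernel.org". */
	static void fill_impl_name(char *buf, size_t len)
	{
		snprintf(buf, len, "%s %s %s",
			 CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN,
			 utsname()->sysname, utsname()->release);
	}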
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 48cfac31f64c..9c94297bb70e 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -46,9 +46,6 @@ MODULE_LICENSE("GPL");
46MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>"); 46MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>");
47MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver"); 47MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver");
48 48
49struct dentry *bl_device_pipe;
50wait_queue_head_t bl_wq;
51
52static void print_page(struct page *page) 49static void print_page(struct page *page)
53{ 50{
54 dprintk("PRINTPAGE page %p\n", page); 51 dprintk("PRINTPAGE page %p\n", page);
@@ -236,12 +233,11 @@ bl_read_pagelist(struct nfs_read_data *rdata)
236 sector_t isect, extent_length = 0; 233 sector_t isect, extent_length = 0;
237 struct parallel_io *par; 234 struct parallel_io *par;
238 loff_t f_offset = rdata->args.offset; 235 loff_t f_offset = rdata->args.offset;
239 size_t count = rdata->args.count;
240 struct page **pages = rdata->args.pages; 236 struct page **pages = rdata->args.pages;
241 int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT; 237 int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT;
242 238
243 dprintk("%s enter nr_pages %u offset %lld count %Zd\n", __func__, 239 dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__,
244 rdata->npages, f_offset, count); 240 rdata->npages, f_offset, (unsigned int)rdata->args.count);
245 241
246 par = alloc_parallel(rdata); 242 par = alloc_parallel(rdata);
247 if (!par) 243 if (!par)
@@ -1025,10 +1021,128 @@ static const struct rpc_pipe_ops bl_upcall_ops = {
1025 .destroy_msg = bl_pipe_destroy_msg, 1021 .destroy_msg = bl_pipe_destroy_msg,
1026}; 1022};
1027 1023
1024static struct dentry *nfs4blocklayout_register_sb(struct super_block *sb,
1025 struct rpc_pipe *pipe)
1026{
1027 struct dentry *dir, *dentry;
1028
1029 dir = rpc_d_lookup_sb(sb, NFS_PIPE_DIRNAME);
1030 if (dir == NULL)
1031 return ERR_PTR(-ENOENT);
1032 dentry = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe);
1033 dput(dir);
1034 return dentry;
1035}
1036
1037static void nfs4blocklayout_unregister_sb(struct super_block *sb,
1038 struct rpc_pipe *pipe)
1039{
1040 if (pipe->dentry)
1041 rpc_unlink(pipe->dentry);
1042}
1043
1044static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
1045 void *ptr)
1046{
1047 struct super_block *sb = ptr;
1048 struct net *net = sb->s_fs_info;
1049 struct nfs_net *nn = net_generic(net, nfs_net_id);
1050 struct dentry *dentry;
1051 int ret = 0;
1052
1053 if (!try_module_get(THIS_MODULE))
1054 return 0;
1055
1056 if (nn->bl_device_pipe == NULL) {
1057 module_put(THIS_MODULE);
1058 return 0;
1059 }
1060
1061 switch (event) {
1062 case RPC_PIPEFS_MOUNT:
1063 dentry = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe);
1064 if (IS_ERR(dentry)) {
1065 ret = PTR_ERR(dentry);
1066 break;
1067 }
1068 nn->bl_device_pipe->dentry = dentry;
1069 break;
1070 case RPC_PIPEFS_UMOUNT:
1071 if (nn->bl_device_pipe->dentry)
1072 nfs4blocklayout_unregister_sb(sb, nn->bl_device_pipe);
1073 break;
1074 default:
1075 ret = -ENOTSUPP;
1076 break;
1077 }
1078 module_put(THIS_MODULE);
1079 return ret;
1080}
1081
1082static struct notifier_block nfs4blocklayout_block = {
1083 .notifier_call = rpc_pipefs_event,
1084};
1085
1086static struct dentry *nfs4blocklayout_register_net(struct net *net,
1087 struct rpc_pipe *pipe)
1088{
1089 struct super_block *pipefs_sb;
1090 struct dentry *dentry;
1091
1092 pipefs_sb = rpc_get_sb_net(net);
1093 if (!pipefs_sb)
1094 return NULL;
1095 dentry = nfs4blocklayout_register_sb(pipefs_sb, pipe);
1096 rpc_put_sb_net(net);
1097 return dentry;
1098}
1099
1100static void nfs4blocklayout_unregister_net(struct net *net,
1101 struct rpc_pipe *pipe)
1102{
1103 struct super_block *pipefs_sb;
1104
1105 pipefs_sb = rpc_get_sb_net(net);
1106 if (pipefs_sb) {
1107 nfs4blocklayout_unregister_sb(pipefs_sb, pipe);
1108 rpc_put_sb_net(net);
1109 }
1110}
1111
1112static int nfs4blocklayout_net_init(struct net *net)
1113{
1114 struct nfs_net *nn = net_generic(net, nfs_net_id);
1115 struct dentry *dentry;
1116
1117 init_waitqueue_head(&nn->bl_wq);
1118 nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0);
1119 if (IS_ERR(nn->bl_device_pipe))
1120 return PTR_ERR(nn->bl_device_pipe);
1121 dentry = nfs4blocklayout_register_net(net, nn->bl_device_pipe);
1122 if (IS_ERR(dentry)) {
1123 rpc_destroy_pipe_data(nn->bl_device_pipe);
1124 return PTR_ERR(dentry);
1125 }
1126 nn->bl_device_pipe->dentry = dentry;
1127 return 0;
1128}
1129
1130static void nfs4blocklayout_net_exit(struct net *net)
1131{
1132 struct nfs_net *nn = net_generic(net, nfs_net_id);
1133
1134 nfs4blocklayout_unregister_net(net, nn->bl_device_pipe);
1135 rpc_destroy_pipe_data(nn->bl_device_pipe);
1136 nn->bl_device_pipe = NULL;
1137}
1138
1139static struct pernet_operations nfs4blocklayout_net_ops = {
1140 .init = nfs4blocklayout_net_init,
1141 .exit = nfs4blocklayout_net_exit,
1142};
1143
1028static int __init nfs4blocklayout_init(void) 1144static int __init nfs4blocklayout_init(void)
1029{ 1145{
1030 struct vfsmount *mnt;
1031 struct path path;
1032 int ret; 1146 int ret;
1033 1147
1034 dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__); 1148 dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__);
@@ -1037,32 +1151,17 @@ static int __init nfs4blocklayout_init(void)
1037 if (ret) 1151 if (ret)
1038 goto out; 1152 goto out;
1039 1153
1040 init_waitqueue_head(&bl_wq); 1154 ret = rpc_pipefs_notifier_register(&nfs4blocklayout_block);
1041 1155 if (ret)
1042 mnt = rpc_get_mount();
1043 if (IS_ERR(mnt)) {
1044 ret = PTR_ERR(mnt);
1045 goto out_remove; 1156 goto out_remove;
1046 } 1157 ret = register_pernet_subsys(&nfs4blocklayout_net_ops);
1047
1048 ret = vfs_path_lookup(mnt->mnt_root,
1049 mnt,
1050 NFS_PIPE_DIRNAME, 0, &path);
1051 if (ret) 1158 if (ret)
1052 goto out_putrpc; 1159 goto out_notifier;
1053
1054 bl_device_pipe = rpc_mkpipe(path.dentry, "blocklayout", NULL,
1055 &bl_upcall_ops, 0);
1056 path_put(&path);
1057 if (IS_ERR(bl_device_pipe)) {
1058 ret = PTR_ERR(bl_device_pipe);
1059 goto out_putrpc;
1060 }
1061out: 1160out:
1062 return ret; 1161 return ret;
1063 1162
1064out_putrpc: 1163out_notifier:
1065 rpc_put_mount(); 1164 rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
1066out_remove: 1165out_remove:
1067 pnfs_unregister_layoutdriver(&blocklayout_type); 1166 pnfs_unregister_layoutdriver(&blocklayout_type);
1068 return ret; 1167 return ret;
@@ -1073,9 +1172,9 @@ static void __exit nfs4blocklayout_exit(void)
1073 dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n", 1172 dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
1074 __func__); 1173 __func__);
1075 1174
1175 rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
1176 unregister_pernet_subsys(&nfs4blocklayout_net_ops);
1076 pnfs_unregister_layoutdriver(&blocklayout_type); 1177 pnfs_unregister_layoutdriver(&blocklayout_type);
1077 rpc_unlink(bl_device_pipe);
1078 rpc_put_mount();
1079} 1178}
1080 1179
1081MODULE_ALIAS("nfs-layouttype4-3"); 1180MODULE_ALIAS("nfs-layouttype4-3");
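
The init/exit changes reduce to a fixed shape for per-net rpc_pipefs users: a pernet subsystem owns the pipe data for each namespace, while a pipefs notifier attaches or detaches the dentry as rpc_pipefs superblocks come and go. Sketched on its own (layoutdriver registration omitted):

	static int __init example_init(void)
	{
		int ret;

		/* Register the notifier before the pernet ops: the pernet
		 * ->init creates nn->bl_device_pipe, and any pipefs mount
		 * happening afterwards must already be observable. */
		ret = rpc_pipefs_notifier_register(&nfs4blocklayout_block);
		if (ret)
			return ret;
		ret = register_pernet_subsys(&nfs4blocklayout_net_ops);
		if (ret)
			rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
		return ret;
	}

	static void __exit example_exit(void)
	{
		/* Teardown in reverse: stop reacting to pipefs events
		 * first, then destroy the per-net pipe data. */
		rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
		unregister_pernet_subsys(&nfs4blocklayout_net_ops);
	}
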
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index e31a2df28e70..03350690118e 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -37,6 +37,7 @@
37#include <linux/sunrpc/rpc_pipe_fs.h> 37#include <linux/sunrpc/rpc_pipe_fs.h>
38 38
39#include "../pnfs.h" 39#include "../pnfs.h"
40#include "../netns.h"
40 41
41#define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> SECTOR_SHIFT) 42#define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> SECTOR_SHIFT)
42#define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT) 43#define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT)
@@ -50,6 +51,7 @@ struct pnfs_block_dev {
50 struct list_head bm_node; 51 struct list_head bm_node;
51 struct nfs4_deviceid bm_mdevid; /* associated devid */ 52 struct nfs4_deviceid bm_mdevid; /* associated devid */
52 struct block_device *bm_mdev; /* meta device itself */ 53 struct block_device *bm_mdev; /* meta device itself */
54 struct net *net;
53}; 55};
54 56
55enum exstate4 { 57enum exstate4 {
@@ -151,9 +153,9 @@ BLK_LSEG2EXT(struct pnfs_layout_segment *lseg)
151 return BLK_LO2EXT(lseg->pls_layout); 153 return BLK_LO2EXT(lseg->pls_layout);
152} 154}
153 155
154struct bl_dev_msg { 156struct bl_pipe_msg {
155 int32_t status; 157 struct rpc_pipe_msg msg;
156 uint32_t major, minor; 158 wait_queue_head_t *bl_wq;
157}; 159};
158 160
159struct bl_msg_hdr { 161struct bl_msg_hdr {
@@ -161,9 +163,6 @@ struct bl_msg_hdr {
161 u16 totallen; /* length of entire message, including hdr itself */ 163 u16 totallen; /* length of entire message, including hdr itself */
162}; 164};
163 165
164extern struct dentry *bl_device_pipe;
165extern wait_queue_head_t bl_wq;
166
167#define BL_DEVICE_UMOUNT 0x0 /* Umount--delete devices */ 166#define BL_DEVICE_UMOUNT 0x0 /* Umount--delete devices */
168#define BL_DEVICE_MOUNT 0x1 /* Mount--create devices*/ 167#define BL_DEVICE_MOUNT 0x1 /* Mount--create devices*/
169#define BL_DEVICE_REQUEST_INIT 0x0 /* Start request */ 168#define BL_DEVICE_REQUEST_INIT 0x0 /* Start request */
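
The added #include "../netns.h" pulls in the per-net container that replaces the deleted globals. netns.h itself is not part of this excerpt; from the way the blocklayout and client hunks use it, its shape is roughly the following (inferred, not the verbatim definition):

	#include <linux/idr.h>
	#include <linux/wait.h>
	#include <net/netns/generic.h>

	struct nfs_net {
		struct rpc_pipe *bl_device_pipe;   /* blocklayout upcall pipe */
		struct bl_dev_msg bl_mount_reply;  /* last downcall payload */
		wait_queue_head_t bl_wq;           /* waiters for the reply */
		struct idr cb_ident_idr;           /* NFSv4.0 callback idents */
		struct list_head nfs_client_list;
		struct list_head nfs_volume_list;
		spinlock_t nfs_client_lock;
	};

	extern int nfs_net_id;

	/* Per-namespace lookup used throughout the series: */
	static inline struct nfs_net *example_nfs_net(struct net *net)
	{
		return net_generic(net, nfs_net_id);
	}
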
diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c
index d08ba9107fde..a5c88a554d92 100644
--- a/fs/nfs/blocklayout/blocklayoutdev.c
+++ b/fs/nfs/blocklayout/blocklayoutdev.c
@@ -46,7 +46,7 @@ static int decode_sector_number(__be32 **rp, sector_t *sp)
46 46
47 *rp = xdr_decode_hyper(*rp, &s); 47 *rp = xdr_decode_hyper(*rp, &s);
48 if (s & 0x1ff) { 48 if (s & 0x1ff) {
49 printk(KERN_WARNING "%s: sector not aligned\n", __func__); 49 printk(KERN_WARNING "NFS: %s: sector not aligned\n", __func__);
50 return -1; 50 return -1;
51 } 51 }
52 *sp = s >> SECTOR_SHIFT; 52 *sp = s >> SECTOR_SHIFT;
@@ -79,27 +79,30 @@ int nfs4_blkdev_put(struct block_device *bdev)
79 return blkdev_put(bdev, FMODE_READ); 79 return blkdev_put(bdev, FMODE_READ);
80} 80}
81 81
82static struct bl_dev_msg bl_mount_reply;
83
84ssize_t bl_pipe_downcall(struct file *filp, const char __user *src, 82ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,
85 size_t mlen) 83 size_t mlen)
86{ 84{
85 struct nfs_net *nn = net_generic(filp->f_dentry->d_sb->s_fs_info,
86 nfs_net_id);
87
87 if (mlen != sizeof (struct bl_dev_msg)) 88 if (mlen != sizeof (struct bl_dev_msg))
88 return -EINVAL; 89 return -EINVAL;
89 90
90 if (copy_from_user(&bl_mount_reply, src, mlen) != 0) 91 if (copy_from_user(&nn->bl_mount_reply, src, mlen) != 0)
91 return -EFAULT; 92 return -EFAULT;
92 93
93 wake_up(&bl_wq); 94 wake_up(&nn->bl_wq);
94 95
95 return mlen; 96 return mlen;
96} 97}
97 98
98void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg) 99void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg)
99{ 100{
101 struct bl_pipe_msg *bl_pipe_msg = container_of(msg, struct bl_pipe_msg, msg);
102
100 if (msg->errno >= 0) 103 if (msg->errno >= 0)
101 return; 104 return;
102 wake_up(&bl_wq); 105 wake_up(bl_pipe_msg->bl_wq);
103} 106}
104 107
105/* 108/*
@@ -111,29 +114,33 @@ nfs4_blk_decode_device(struct nfs_server *server,
111{ 114{
112 struct pnfs_block_dev *rv; 115 struct pnfs_block_dev *rv;
113 struct block_device *bd = NULL; 116 struct block_device *bd = NULL;
114 struct rpc_pipe_msg msg; 117 struct bl_pipe_msg bl_pipe_msg;
118 struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
115 struct bl_msg_hdr bl_msg = { 119 struct bl_msg_hdr bl_msg = {
116 .type = BL_DEVICE_MOUNT, 120 .type = BL_DEVICE_MOUNT,
117 .totallen = dev->mincount, 121 .totallen = dev->mincount,
118 }; 122 };
119 uint8_t *dataptr; 123 uint8_t *dataptr;
120 DECLARE_WAITQUEUE(wq, current); 124 DECLARE_WAITQUEUE(wq, current);
121 struct bl_dev_msg *reply = &bl_mount_reply;
122 int offset, len, i, rc; 125 int offset, len, i, rc;
126 struct net *net = server->nfs_client->net;
127 struct nfs_net *nn = net_generic(net, nfs_net_id);
128 struct bl_dev_msg *reply = &nn->bl_mount_reply;
123 129
124 dprintk("%s CREATING PIPEFS MESSAGE\n", __func__); 130 dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
125 dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data, 131 dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data,
126 dev->mincount); 132 dev->mincount);
127 133
128 memset(&msg, 0, sizeof(msg)); 134 bl_pipe_msg.bl_wq = &nn->bl_wq;
129 msg.data = kzalloc(sizeof(bl_msg) + dev->mincount, GFP_NOFS); 135 memset(msg, 0, sizeof(*msg));
130 if (!msg.data) { 136 msg->data = kzalloc(sizeof(bl_msg) + dev->mincount, GFP_NOFS);
137 if (!msg->data) {
131 rv = ERR_PTR(-ENOMEM); 138 rv = ERR_PTR(-ENOMEM);
132 goto out; 139 goto out;
133 } 140 }
134 141
135 memcpy(msg.data, &bl_msg, sizeof(bl_msg)); 142 memcpy(msg->data, &bl_msg, sizeof(bl_msg));
136 dataptr = (uint8_t *) msg.data; 143 dataptr = (uint8_t *) msg->data;
137 len = dev->mincount; 144 len = dev->mincount;
138 offset = sizeof(bl_msg); 145 offset = sizeof(bl_msg);
139 for (i = 0; len > 0; i++) { 146 for (i = 0; len > 0; i++) {
@@ -142,13 +149,13 @@ nfs4_blk_decode_device(struct nfs_server *server,
142 len -= PAGE_CACHE_SIZE; 149 len -= PAGE_CACHE_SIZE;
143 offset += PAGE_CACHE_SIZE; 150 offset += PAGE_CACHE_SIZE;
144 } 151 }
145 msg.len = sizeof(bl_msg) + dev->mincount; 152 msg->len = sizeof(bl_msg) + dev->mincount;
146 153
147 dprintk("%s CALLING USERSPACE DAEMON\n", __func__); 154 dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
148 add_wait_queue(&bl_wq, &wq); 155 add_wait_queue(&nn->bl_wq, &wq);
149 rc = rpc_queue_upcall(bl_device_pipe->d_inode, &msg); 156 rc = rpc_queue_upcall(nn->bl_device_pipe, msg);
150 if (rc < 0) { 157 if (rc < 0) {
151 remove_wait_queue(&bl_wq, &wq); 158 remove_wait_queue(&nn->bl_wq, &wq);
152 rv = ERR_PTR(rc); 159 rv = ERR_PTR(rc);
153 goto out; 160 goto out;
154 } 161 }
@@ -156,7 +163,7 @@ nfs4_blk_decode_device(struct nfs_server *server,
156 set_current_state(TASK_UNINTERRUPTIBLE); 163 set_current_state(TASK_UNINTERRUPTIBLE);
157 schedule(); 164 schedule();
158 __set_current_state(TASK_RUNNING); 165 __set_current_state(TASK_RUNNING);
159 remove_wait_queue(&bl_wq, &wq); 166 remove_wait_queue(&nn->bl_wq, &wq);
160 167
161 if (reply->status != BL_DEVICE_REQUEST_PROC) { 168 if (reply->status != BL_DEVICE_REQUEST_PROC) {
162 dprintk("%s failed to open device: %d\n", 169 dprintk("%s failed to open device: %d\n",
@@ -181,13 +188,14 @@ nfs4_blk_decode_device(struct nfs_server *server,
181 188
182 rv->bm_mdev = bd; 189 rv->bm_mdev = bd;
183 memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct nfs4_deviceid)); 190 memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct nfs4_deviceid));
191 rv->net = net;
184 dprintk("%s Created device %s with bd_block_size %u\n", 192 dprintk("%s Created device %s with bd_block_size %u\n",
185 __func__, 193 __func__,
186 bd->bd_disk->disk_name, 194 bd->bd_disk->disk_name,
187 bd->bd_block_size); 195 bd->bd_block_size);
188 196
189out: 197out:
190 kfree(msg.data); 198 kfree(msg->data);
191 return rv; 199 return rv;
192} 200}
193 201
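
bl_pipe_msg exists because ->destroy_msg receives only the struct rpc_pipe_msg pointer. Embedding that struct lets the callback recover the enclosing wrapper, and with it the right per-net waitqueue, via container_of. The pattern in isolation:

	struct bl_pipe_msg {
		struct rpc_pipe_msg msg;    /* embedded, never a pointer */
		wait_queue_head_t *bl_wq;   /* which namespace's waiters to wake */
	};

	static void example_destroy_msg(struct rpc_pipe_msg *msg)
	{
		/* Recover the wrapper from the embedded member... */
		struct bl_pipe_msg *bl_pipe_msg =
			container_of(msg, struct bl_pipe_msg, msg);

		/* ...so an aborted upcall wakes the correct waitqueue. */
		if (msg->errno < 0)
			wake_up(bl_pipe_msg->bl_wq);
	}
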
diff --git a/fs/nfs/blocklayout/blocklayoutdm.c b/fs/nfs/blocklayout/blocklayoutdm.c
index d055c7558073..737d839bc17b 100644
--- a/fs/nfs/blocklayout/blocklayoutdm.c
+++ b/fs/nfs/blocklayout/blocklayoutdm.c
@@ -38,9 +38,10 @@
38 38
39#define NFSDBG_FACILITY NFSDBG_PNFS_LD 39#define NFSDBG_FACILITY NFSDBG_PNFS_LD
40 40
41static void dev_remove(dev_t dev) 41static void dev_remove(struct net *net, dev_t dev)
42{ 42{
43 struct rpc_pipe_msg msg; 43 struct bl_pipe_msg bl_pipe_msg;
44 struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
44 struct bl_dev_msg bl_umount_request; 45 struct bl_dev_msg bl_umount_request;
45 struct bl_msg_hdr bl_msg = { 46 struct bl_msg_hdr bl_msg = {
46 .type = BL_DEVICE_UMOUNT, 47 .type = BL_DEVICE_UMOUNT,
@@ -48,36 +49,38 @@ static void dev_remove(dev_t dev)
48 }; 49 };
49 uint8_t *dataptr; 50 uint8_t *dataptr;
50 DECLARE_WAITQUEUE(wq, current); 51 DECLARE_WAITQUEUE(wq, current);
52 struct nfs_net *nn = net_generic(net, nfs_net_id);
51 53
52 dprintk("Entering %s\n", __func__); 54 dprintk("Entering %s\n", __func__);
53 55
54 memset(&msg, 0, sizeof(msg)); 56 bl_pipe_msg.bl_wq = &nn->bl_wq;
55 msg.data = kzalloc(1 + sizeof(bl_umount_request), GFP_NOFS); 57 memset(msg, 0, sizeof(*msg));
56 if (!msg.data) 58 msg->data = kzalloc(1 + sizeof(bl_umount_request), GFP_NOFS);
59 if (!msg->data)
57 goto out; 60 goto out;
58 61
59 memset(&bl_umount_request, 0, sizeof(bl_umount_request)); 62 memset(&bl_umount_request, 0, sizeof(bl_umount_request));
60 bl_umount_request.major = MAJOR(dev); 63 bl_umount_request.major = MAJOR(dev);
61 bl_umount_request.minor = MINOR(dev); 64 bl_umount_request.minor = MINOR(dev);
62 65
63 memcpy(msg.data, &bl_msg, sizeof(bl_msg)); 66 memcpy(msg->data, &bl_msg, sizeof(bl_msg));
64 dataptr = (uint8_t *) msg.data; 67 dataptr = (uint8_t *) msg->data;
65 memcpy(&dataptr[sizeof(bl_msg)], &bl_umount_request, sizeof(bl_umount_request)); 68 memcpy(&dataptr[sizeof(bl_msg)], &bl_umount_request, sizeof(bl_umount_request));
66 msg.len = sizeof(bl_msg) + bl_msg.totallen; 69 msg->len = sizeof(bl_msg) + bl_msg.totallen;
67 70
68 add_wait_queue(&bl_wq, &wq); 71 add_wait_queue(&nn->bl_wq, &wq);
69 if (rpc_queue_upcall(bl_device_pipe->d_inode, &msg) < 0) { 72 if (rpc_queue_upcall(nn->bl_device_pipe, msg) < 0) {
70 remove_wait_queue(&bl_wq, &wq); 73 remove_wait_queue(&nn->bl_wq, &wq);
71 goto out; 74 goto out;
72 } 75 }
73 76
74 set_current_state(TASK_UNINTERRUPTIBLE); 77 set_current_state(TASK_UNINTERRUPTIBLE);
75 schedule(); 78 schedule();
76 __set_current_state(TASK_RUNNING); 79 __set_current_state(TASK_RUNNING);
77 remove_wait_queue(&bl_wq, &wq); 80 remove_wait_queue(&nn->bl_wq, &wq);
78 81
79out: 82out:
80 kfree(msg.data); 83 kfree(msg->data);
81} 84}
82 85
83/* 86/*
@@ -90,10 +93,10 @@ static void nfs4_blk_metadev_release(struct pnfs_block_dev *bdev)
90 dprintk("%s Releasing\n", __func__); 93 dprintk("%s Releasing\n", __func__);
91 rv = nfs4_blkdev_put(bdev->bm_mdev); 94 rv = nfs4_blkdev_put(bdev->bm_mdev);
92 if (rv) 95 if (rv)
93 printk(KERN_ERR "%s nfs4_blkdev_put returns %d\n", 96 printk(KERN_ERR "NFS: %s nfs4_blkdev_put returns %d\n",
94 __func__, rv); 97 __func__, rv);
95 98
96 dev_remove(bdev->bm_mdev->bd_dev); 99 dev_remove(bdev->net, bdev->bm_mdev->bd_dev);
97} 100}
98 101
99void bl_free_block_dev(struct pnfs_block_dev *bdev) 102void bl_free_block_dev(struct pnfs_block_dev *bdev)
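
dev_remove() above and nfs4_blk_decode_device() earlier share one upcall handshake, which after this conversion reads as below. Worth noting: the daemon's reply lands in the shared nn->bl_mount_reply rather than in the request itself, which appears to assume at most one outstanding device upcall per namespace at a time.

	DECLARE_WAITQUEUE(wq, current);

	add_wait_queue(&nn->bl_wq, &wq);            /* register as a waiter first */
	if (rpc_queue_upcall(nn->bl_device_pipe, msg) < 0) {
		remove_wait_queue(&nn->bl_wq, &wq); /* upcall never reached the daemon */
		goto out;
	}

	set_current_state(TASK_UNINTERRUPTIBLE);
	schedule();                                 /* bl_pipe_downcall() wakes us */
	__set_current_state(TASK_RUNNING);
	remove_wait_queue(&nn->bl_wq, &wq);
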
diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
index 1abac09f7cd5..1f9a6032796b 100644
--- a/fs/nfs/blocklayout/extents.c
+++ b/fs/nfs/blocklayout/extents.c
@@ -147,7 +147,7 @@ static int _preload_range(struct pnfs_inval_markings *marks,
147 count = (int)(end - start) / (int)tree->mtt_step_size; 147 count = (int)(end - start) / (int)tree->mtt_step_size;
148 148
149 /* Pre-malloc what memory we might need */ 149 /* Pre-malloc what memory we might need */
150 storage = kmalloc(sizeof(*storage) * count, GFP_NOFS); 150 storage = kcalloc(count, sizeof(*storage), GFP_NOFS);
151 if (!storage) 151 if (!storage)
152 return -ENOMEM; 152 return -ENOMEM;
153 for (i = 0; i < count; i++) { 153 for (i = 0; i < count; i++) {
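
The extents.c one-liner is overflow hardening rather than a behaviour change: kcalloc() checks the count-times-size multiplication before allocating (and zeroes the result), while the open-coded multiply can wrap on a huge count and return an undersized buffer:

	/* Unsafe if count is large: the multiply can wrap. */
	storage = kmalloc(sizeof(*storage) * count, GFP_NOFS);

	/* Safe: returns NULL when count * sizeof(*storage) would overflow. */
	storage = kcalloc(count, sizeof(*storage), GFP_NOFS);
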
diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c
index c98b439332fc..dded26368111 100644
--- a/fs/nfs/cache_lib.c
+++ b/fs/nfs/cache_lib.c
@@ -13,6 +13,7 @@
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/sunrpc/cache.h> 14#include <linux/sunrpc/cache.h>
15#include <linux/sunrpc/rpc_pipe_fs.h> 15#include <linux/sunrpc/rpc_pipe_fs.h>
16#include <net/net_namespace.h>
16 17
17#include "cache_lib.h" 18#include "cache_lib.h"
18 19
@@ -111,30 +112,54 @@ int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq)
111 return 0; 112 return 0;
112} 113}
113 114
114int nfs_cache_register(struct cache_detail *cd) 115int nfs_cache_register_sb(struct super_block *sb, struct cache_detail *cd)
115{ 116{
116 struct vfsmount *mnt;
117 struct path path;
118 int ret; 117 int ret;
118 struct dentry *dir;
119 119
120 mnt = rpc_get_mount(); 120 dir = rpc_d_lookup_sb(sb, "cache");
121 if (IS_ERR(mnt)) 121 BUG_ON(dir == NULL);
122 return PTR_ERR(mnt); 122 ret = sunrpc_cache_register_pipefs(dir, cd->name, 0600, cd);
123 ret = vfs_path_lookup(mnt->mnt_root, mnt, "/cache", 0, &path); 123 dput(dir);
124 if (ret)
125 goto err;
126 ret = sunrpc_cache_register_pipefs(path.dentry, cd->name, 0600, cd);
127 path_put(&path);
128 if (!ret)
129 return ret;
130err:
131 rpc_put_mount();
132 return ret; 124 return ret;
133} 125}
134 126
135void nfs_cache_unregister(struct cache_detail *cd) 127int nfs_cache_register_net(struct net *net, struct cache_detail *cd)
136{ 128{
137 sunrpc_cache_unregister_pipefs(cd); 129 struct super_block *pipefs_sb;
138 rpc_put_mount(); 130 int ret = 0;
131
132 pipefs_sb = rpc_get_sb_net(net);
133 if (pipefs_sb) {
134 ret = nfs_cache_register_sb(pipefs_sb, cd);
135 rpc_put_sb_net(net);
136 }
137 return ret;
138}
139
140void nfs_cache_unregister_sb(struct super_block *sb, struct cache_detail *cd)
141{
142 if (cd->u.pipefs.dir)
143 sunrpc_cache_unregister_pipefs(cd);
144}
145
146void nfs_cache_unregister_net(struct net *net, struct cache_detail *cd)
147{
148 struct super_block *pipefs_sb;
149
150 pipefs_sb = rpc_get_sb_net(net);
151 if (pipefs_sb) {
152 nfs_cache_unregister_sb(pipefs_sb, cd);
153 rpc_put_sb_net(net);
154 }
155}
156
157void nfs_cache_init(struct cache_detail *cd)
158{
159 sunrpc_init_cache_detail(cd);
139} 160}
140 161
162void nfs_cache_destroy(struct cache_detail *cd)
163{
164 sunrpc_destroy_cache_detail(cd);
165}
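
The old nfs_cache_register()/nfs_cache_unregister() pair is split along two axes: the lifetime of the cache_detail itself (nfs_cache_init()/nfs_cache_destroy()) versus its pipefs visibility, and per-net versus per-superblock entry points. The _net variants may be called whether or not rpc_pipefs is mounted in that namespace; the _sb variants serve the pipefs MOUNT/UMOUNT notifier, which already holds a superblock. An illustrative caller (the function name is hypothetical):

	static int example_cache_setup(struct net *net, struct cache_detail *cd)
	{
		int ret;

		nfs_cache_init(cd);                    /* sunrpc_init_cache_detail() */
		ret = nfs_cache_register_net(net, cd); /* 0 even if pipefs not mounted */
		if (ret)
			nfs_cache_destroy(cd);
		return ret;
	}
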
diff --git a/fs/nfs/cache_lib.h b/fs/nfs/cache_lib.h
index 7cf6cafcc007..317db95e37f8 100644
--- a/fs/nfs/cache_lib.h
+++ b/fs/nfs/cache_lib.h
@@ -23,5 +23,11 @@ extern struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void);
23extern void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq); 23extern void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq);
24extern int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq); 24extern int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq);
25 25
26extern int nfs_cache_register(struct cache_detail *cd); 26extern void nfs_cache_init(struct cache_detail *cd);
27extern void nfs_cache_unregister(struct cache_detail *cd); 27extern void nfs_cache_destroy(struct cache_detail *cd);
28extern int nfs_cache_register_net(struct net *net, struct cache_detail *cd);
29extern void nfs_cache_unregister_net(struct net *net, struct cache_detail *cd);
30extern int nfs_cache_register_sb(struct super_block *sb,
31 struct cache_detail *cd);
32extern void nfs_cache_unregister_sb(struct super_block *sb,
33 struct cache_detail *cd);
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 516f3375e067..eb95f5091c1a 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -85,7 +85,7 @@ nfs4_callback_svc(void *vrqstp)
85 } 85 }
86 if (err < 0) { 86 if (err < 0) {
87 if (err != preverr) { 87 if (err != preverr) {
88 printk(KERN_WARNING "%s: unexpected error " 88 printk(KERN_WARNING "NFS: %s: unexpected error "
89 "from svc_recv (%d)\n", __func__, err); 89 "from svc_recv (%d)\n", __func__, err);
90 preverr = err; 90 preverr = err;
91 } 91 }
@@ -101,12 +101,12 @@ nfs4_callback_svc(void *vrqstp)
101/* 101/*
102 * Prepare to bring up the NFSv4 callback service 102 * Prepare to bring up the NFSv4 callback service
103 */ 103 */
104struct svc_rqst * 104static struct svc_rqst *
105nfs4_callback_up(struct svc_serv *serv) 105nfs4_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt)
106{ 106{
107 int ret; 107 int ret;
108 108
109 ret = svc_create_xprt(serv, "tcp", &init_net, PF_INET, 109 ret = svc_create_xprt(serv, "tcp", xprt->xprt_net, PF_INET,
110 nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); 110 nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
111 if (ret <= 0) 111 if (ret <= 0)
112 goto out_err; 112 goto out_err;
@@ -114,7 +114,7 @@ nfs4_callback_up(struct svc_serv *serv)
114 dprintk("NFS: Callback listener port = %u (af %u)\n", 114 dprintk("NFS: Callback listener port = %u (af %u)\n",
115 nfs_callback_tcpport, PF_INET); 115 nfs_callback_tcpport, PF_INET);
116 116
117 ret = svc_create_xprt(serv, "tcp", &init_net, PF_INET6, 117 ret = svc_create_xprt(serv, "tcp", xprt->xprt_net, PF_INET6,
118 nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); 118 nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
119 if (ret > 0) { 119 if (ret > 0) {
120 nfs_callback_tcpport6 = ret; 120 nfs_callback_tcpport6 = ret;
@@ -172,7 +172,7 @@ nfs41_callback_svc(void *vrqstp)
172/* 172/*
173 * Bring up the NFSv4.1 callback service 173 * Bring up the NFSv4.1 callback service
174 */ 174 */
175struct svc_rqst * 175static struct svc_rqst *
176nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt) 176nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt)
177{ 177{
178 struct svc_rqst *rqstp; 178 struct svc_rqst *rqstp;
@@ -183,7 +183,7 @@ nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt)
183 * fore channel connection. 183 * fore channel connection.
184 * Returns the input port (0) and sets the svc_serv bc_xprt on success 184 * Returns the input port (0) and sets the svc_serv bc_xprt on success
185 */ 185 */
186 ret = svc_create_xprt(serv, "tcp-bc", &init_net, PF_INET, 0, 186 ret = svc_create_xprt(serv, "tcp-bc", xprt->xprt_net, PF_INET, 0,
187 SVC_SOCK_ANONYMOUS); 187 SVC_SOCK_ANONYMOUS);
188 if (ret < 0) { 188 if (ret < 0) {
189 rqstp = ERR_PTR(ret); 189 rqstp = ERR_PTR(ret);
@@ -269,7 +269,7 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt)
269 serv, xprt, &rqstp, &callback_svc); 269 serv, xprt, &rqstp, &callback_svc);
270 if (!minorversion_setup) { 270 if (!minorversion_setup) {
271 /* v4.0 callback setup */ 271 /* v4.0 callback setup */
272 rqstp = nfs4_callback_up(serv); 272 rqstp = nfs4_callback_up(serv, xprt);
273 callback_svc = nfs4_callback_svc; 273 callback_svc = nfs4_callback_svc;
274 } 274 }
275 275
@@ -332,7 +332,6 @@ void nfs_callback_down(int minorversion)
332int 332int
333check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp) 333check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp)
334{ 334{
335 struct rpc_clnt *r = clp->cl_rpcclient;
336 char *p = svc_gss_principal(rqstp); 335 char *p = svc_gss_principal(rqstp);
337 336
338 if (rqstp->rq_authop->flavour != RPC_AUTH_GSS) 337 if (rqstp->rq_authop->flavour != RPC_AUTH_GSS)
@@ -353,7 +352,7 @@ check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp)
353 if (memcmp(p, "nfs@", 4) != 0) 352 if (memcmp(p, "nfs@", 4) != 0)
354 return 0; 353 return 0;
355 p += 4; 354 p += 4;
356 if (strcmp(p, r->cl_server) != 0) 355 if (strcmp(p, clp->cl_hostname) != 0)
357 return 0; 356 return 0;
358 return 1; 357 return 1;
359} 358}
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index c89d3b9e483c..a5527c90a5aa 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -38,7 +38,8 @@ enum nfs4_callback_opnum {
38struct cb_process_state { 38struct cb_process_state {
39 __be32 drc_status; 39 __be32 drc_status;
40 struct nfs_client *clp; 40 struct nfs_client *clp;
41 int slotid; 41 u32 slotid;
42 struct net *net;
42}; 43};
43 44
44struct cb_compound_hdr_arg { 45struct cb_compound_hdr_arg {
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 54cea8ad5a76..1b5d809a105e 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -8,6 +8,7 @@
8#include <linux/nfs4.h> 8#include <linux/nfs4.h>
9#include <linux/nfs_fs.h> 9#include <linux/nfs_fs.h>
10#include <linux/slab.h> 10#include <linux/slab.h>
11#include <linux/rcupdate.h>
11#include "nfs4_fs.h" 12#include "nfs4_fs.h"
12#include "callback.h" 13#include "callback.h"
13#include "delegation.h" 14#include "delegation.h"
@@ -33,7 +34,7 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
33 res->bitmap[0] = res->bitmap[1] = 0; 34 res->bitmap[0] = res->bitmap[1] = 0;
34 res->status = htonl(NFS4ERR_BADHANDLE); 35 res->status = htonl(NFS4ERR_BADHANDLE);
35 36
36 dprintk("NFS: GETATTR callback request from %s\n", 37 dprintk_rcu("NFS: GETATTR callback request from %s\n",
37 rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR)); 38 rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
38 39
39 inode = nfs_delegation_find_inode(cps->clp, &args->fh); 40 inode = nfs_delegation_find_inode(cps->clp, &args->fh);
@@ -73,7 +74,7 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,
73 if (!cps->clp) /* Always set for v4.0. Set in cb_sequence for v4.1 */ 74 if (!cps->clp) /* Always set for v4.0. Set in cb_sequence for v4.1 */
74 goto out; 75 goto out;
75 76
76 dprintk("NFS: RECALL callback request from %s\n", 77 dprintk_rcu("NFS: RECALL callback request from %s\n",
77 rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR)); 78 rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
78 79
79 res = htonl(NFS4ERR_BADHANDLE); 80 res = htonl(NFS4ERR_BADHANDLE);
@@ -86,8 +87,7 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,
86 res = 0; 87 res = 0;
87 break; 88 break;
88 case -ENOENT: 89 case -ENOENT:
89 if (res != 0) 90 res = htonl(NFS4ERR_BAD_STATEID);
90 res = htonl(NFS4ERR_BAD_STATEID);
91 break; 91 break;
92 default: 92 default:
93 res = htonl(NFS4ERR_RESOURCE); 93 res = htonl(NFS4ERR_RESOURCE);
@@ -98,52 +98,64 @@ out:
98 return res; 98 return res;
99} 99}
100 100
101int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid)
102{
103 if (delegation == NULL || memcmp(delegation->stateid.data, stateid->data,
104 sizeof(delegation->stateid.data)) != 0)
105 return 0;
106 return 1;
107}
108
109#if defined(CONFIG_NFS_V4_1) 101#if defined(CONFIG_NFS_V4_1)
110 102
111static u32 initiate_file_draining(struct nfs_client *clp, 103/*
112 struct cb_layoutrecallargs *args) 104 * Lookup a layout by filehandle.
105 *
106 * Note: gets a refcount on the layout hdr and on its respective inode.
107 * Caller must put the layout hdr and the inode.
108 *
109 * TODO: keep track of all layouts (and delegations) in a hash table
110 * hashed by filehandle.
111 */
112static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp, struct nfs_fh *fh)
113{ 113{
114 struct nfs_server *server; 114 struct nfs_server *server;
115 struct pnfs_layout_hdr *lo;
116 struct inode *ino; 115 struct inode *ino;
117 bool found = false; 116 struct pnfs_layout_hdr *lo;
118 u32 rv = NFS4ERR_NOMATCHING_LAYOUT;
119 LIST_HEAD(free_me_list);
120 117
121 spin_lock(&clp->cl_lock);
122 rcu_read_lock();
123 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { 118 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
124 list_for_each_entry(lo, &server->layouts, plh_layouts) { 119 list_for_each_entry(lo, &server->layouts, plh_layouts) {
125 if (nfs_compare_fh(&args->cbl_fh, 120 if (nfs_compare_fh(fh, &NFS_I(lo->plh_inode)->fh))
126 &NFS_I(lo->plh_inode)->fh))
127 continue; 121 continue;
128 ino = igrab(lo->plh_inode); 122 ino = igrab(lo->plh_inode);
129 if (!ino) 123 if (!ino)
130 continue; 124 continue;
131 found = true;
132 /* Without this, layout can be freed as soon
133 * as we release cl_lock.
134 */
135 get_layout_hdr(lo); 125 get_layout_hdr(lo);
136 break; 126 return lo;
137 } 127 }
138 if (found)
139 break;
140 } 128 }
129
130 return NULL;
131}
132
133static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp, struct nfs_fh *fh)
134{
135 struct pnfs_layout_hdr *lo;
136
137 spin_lock(&clp->cl_lock);
138 rcu_read_lock();
139 lo = get_layout_by_fh_locked(clp, fh);
141 rcu_read_unlock(); 140 rcu_read_unlock();
142 spin_unlock(&clp->cl_lock); 141 spin_unlock(&clp->cl_lock);
143 142
144 if (!found) 143 return lo;
144}
145
146static u32 initiate_file_draining(struct nfs_client *clp,
147 struct cb_layoutrecallargs *args)
148{
149 struct inode *ino;
150 struct pnfs_layout_hdr *lo;
151 u32 rv = NFS4ERR_NOMATCHING_LAYOUT;
152 LIST_HEAD(free_me_list);
153
154 lo = get_layout_by_fh(clp, &args->cbl_fh);
155 if (!lo)
145 return NFS4ERR_NOMATCHING_LAYOUT; 156 return NFS4ERR_NOMATCHING_LAYOUT;
146 157
158 ino = lo->plh_inode;
147 spin_lock(&ino->i_lock); 159 spin_lock(&ino->i_lock);
148 if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || 160 if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
149 mark_matching_lsegs_invalid(lo, &free_me_list, 161 mark_matching_lsegs_invalid(lo, &free_me_list,
@@ -213,17 +225,13 @@ static u32 initiate_bulk_draining(struct nfs_client *clp,
213static u32 do_callback_layoutrecall(struct nfs_client *clp, 225static u32 do_callback_layoutrecall(struct nfs_client *clp,
214 struct cb_layoutrecallargs *args) 226 struct cb_layoutrecallargs *args)
215{ 227{
216 u32 res = NFS4ERR_DELAY; 228 u32 res;
217 229
218 dprintk("%s enter, type=%i\n", __func__, args->cbl_recall_type); 230 dprintk("%s enter, type=%i\n", __func__, args->cbl_recall_type);
219 if (test_and_set_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state))
220 goto out;
221 if (args->cbl_recall_type == RETURN_FILE) 231 if (args->cbl_recall_type == RETURN_FILE)
222 res = initiate_file_draining(clp, args); 232 res = initiate_file_draining(clp, args);
223 else 233 else
224 res = initiate_bulk_draining(clp, args); 234 res = initiate_bulk_draining(clp, args);
225 clear_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state);
226out:
227 dprintk("%s returning %i\n", __func__, res); 235 dprintk("%s returning %i\n", __func__, res);
228 return res; 236 return res;
229 237
@@ -303,21 +311,6 @@ out:
303 return res; 311 return res;
304} 312}
305 313
306int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid)
307{
308 if (delegation == NULL)
309 return 0;
310
311 if (stateid->stateid.seqid != 0)
312 return 0;
313 if (memcmp(&delegation->stateid.stateid.other,
314 &stateid->stateid.other,
315 NFS4_STATEID_OTHER_SIZE))
316 return 0;
317
318 return 1;
319}
320
321/* 314/*
322 * Validate the sequenceID sent by the server. 315 * Validate the sequenceID sent by the server.
323 * Return success if the sequenceID is one more than what we last saw on 316 * Return success if the sequenceID is one more than what we last saw on
@@ -441,7 +434,7 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
441 int i; 434 int i;
442 __be32 status = htonl(NFS4ERR_BADSESSION); 435 __be32 status = htonl(NFS4ERR_BADSESSION);
443 436
444 clp = nfs4_find_client_sessionid(args->csa_addr, &args->csa_sessionid); 437 clp = nfs4_find_client_sessionid(cps->net, args->csa_addr, &args->csa_sessionid);
445 if (clp == NULL) 438 if (clp == NULL)
446 goto out; 439 goto out;
447 440
@@ -517,7 +510,7 @@ __be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy,
517 if (!cps->clp) /* set in cb_sequence */ 510 if (!cps->clp) /* set in cb_sequence */
518 goto out; 511 goto out;
519 512
520 dprintk("NFS: RECALL_ANY callback request from %s\n", 513 dprintk_rcu("NFS: RECALL_ANY callback request from %s\n",
521 rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR)); 514 rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
522 515
523 status = cpu_to_be32(NFS4ERR_INVAL); 516 status = cpu_to_be32(NFS4ERR_INVAL);
@@ -552,7 +545,7 @@ __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy,
552 if (!cps->clp) /* set in cb_sequence */ 545 if (!cps->clp) /* set in cb_sequence */
553 goto out; 546 goto out;
554 547
555 dprintk("NFS: CB_RECALL_SLOT request from %s target max slots %d\n", 548 dprintk_rcu("NFS: CB_RECALL_SLOT request from %s target max slots %d\n",
556 rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR), 549 rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR),
557 args->crsa_target_max_slots); 550 args->crsa_target_max_slots);
558 551
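
The refactored lookup separates list-walking from locking and, per the comment above, returns with two references held. A sketch of the implied put-side contract for callers, where put_layout_hdr() is taken to be the counterpart of get_layout_hdr() (the actual release sites are outside this excerpt):

	lo = get_layout_by_fh(clp, &args->cbl_fh);
	if (!lo)
		return NFS4ERR_NOMATCHING_LAYOUT;

	ino = lo->plh_inode;        /* pinned by igrab() inside the lookup */
	spin_lock(&ino->i_lock);
	/* ... recall processing against lo ... */
	spin_unlock(&ino->i_lock);

	put_layout_hdr(lo);         /* drop the reference taken under cl_lock */
	iput(ino);                  /* drop the igrab() reference */
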
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index d50b2742f23b..95bfc243992c 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -9,6 +9,8 @@
9#include <linux/sunrpc/svc.h> 9#include <linux/sunrpc/svc.h>
10#include <linux/nfs4.h> 10#include <linux/nfs4.h>
11#include <linux/nfs_fs.h> 11#include <linux/nfs_fs.h>
12#include <linux/ratelimit.h>
13#include <linux/printk.h>
12#include <linux/slab.h> 14#include <linux/slab.h>
13#include <linux/sunrpc/bc_xprt.h> 15#include <linux/sunrpc/bc_xprt.h>
14#include "nfs4_fs.h" 16#include "nfs4_fs.h"
@@ -73,7 +75,7 @@ static __be32 *read_buf(struct xdr_stream *xdr, int nbytes)
73 75
74 p = xdr_inline_decode(xdr, nbytes); 76 p = xdr_inline_decode(xdr, nbytes);
75 if (unlikely(p == NULL)) 77 if (unlikely(p == NULL))
76 printk(KERN_WARNING "NFSv4 callback reply buffer overflowed!\n"); 78 printk(KERN_WARNING "NFS: NFSv4 callback reply buffer overflowed!\n");
77 return p; 79 return p;
78} 80}
79 81
@@ -138,10 +140,10 @@ static __be32 decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
138{ 140{
139 __be32 *p; 141 __be32 *p;
140 142
141 p = read_buf(xdr, 16); 143 p = read_buf(xdr, NFS4_STATEID_SIZE);
142 if (unlikely(p == NULL)) 144 if (unlikely(p == NULL))
143 return htonl(NFS4ERR_RESOURCE); 145 return htonl(NFS4ERR_RESOURCE);
144 memcpy(stateid->data, p, 16); 146 memcpy(stateid, p, NFS4_STATEID_SIZE);
145 return 0; 147 return 0;
146} 148}
147 149
@@ -155,7 +157,7 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound
155 return status; 157 return status;
156 /* We do not like overly long tags! */ 158 /* We do not like overly long tags! */
157 if (hdr->taglen > CB_OP_TAGLEN_MAXSZ - 12) { 159 if (hdr->taglen > CB_OP_TAGLEN_MAXSZ - 12) {
158 printk("NFSv4 CALLBACK %s: client sent tag of length %u\n", 160 printk("NFS: NFSv4 CALLBACK %s: client sent tag of length %u\n",
159 __func__, hdr->taglen); 161 __func__, hdr->taglen);
160 return htonl(NFS4ERR_RESOURCE); 162 return htonl(NFS4ERR_RESOURCE);
161 } 163 }
@@ -167,7 +169,7 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound
167 if (hdr->minorversion <= 1) { 169 if (hdr->minorversion <= 1) {
168 hdr->cb_ident = ntohl(*p++); /* ignored by v4.1 */ 170 hdr->cb_ident = ntohl(*p++); /* ignored by v4.1 */
169 } else { 171 } else {
170 printk(KERN_WARNING "%s: NFSv4 server callback with " 172 pr_warn_ratelimited("NFS: %s: NFSv4 server callback with "
171 "illegal minor version %u!\n", 173 "illegal minor version %u!\n",
172 __func__, hdr->minorversion); 174 __func__, hdr->minorversion);
173 return htonl(NFS4ERR_MINOR_VERS_MISMATCH); 175 return htonl(NFS4ERR_MINOR_VERS_MISMATCH);
@@ -759,14 +761,14 @@ static void nfs4_callback_free_slot(struct nfs4_session *session)
759 * Let the state manager know callback processing done. 761 * Let the state manager know callback processing done.
760 * A single slot, so highest used slotid is either 0 or -1 762 * A single slot, so highest used slotid is either 0 or -1
761 */ 763 */
762 tbl->highest_used_slotid = -1; 764 tbl->highest_used_slotid = NFS4_NO_SLOT;
763 nfs4_check_drain_bc_complete(session); 765 nfs4_check_drain_bc_complete(session);
764 spin_unlock(&tbl->slot_tbl_lock); 766 spin_unlock(&tbl->slot_tbl_lock);
765} 767}
766 768
767static void nfs4_cb_free_slot(struct cb_process_state *cps) 769static void nfs4_cb_free_slot(struct cb_process_state *cps)
768{ 770{
769 if (cps->slotid != -1) 771 if (cps->slotid != NFS4_NO_SLOT)
770 nfs4_callback_free_slot(cps->clp->cl_session); 772 nfs4_callback_free_slot(cps->clp->cl_session);
771} 773}
772 774
@@ -860,7 +862,8 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
860 struct cb_process_state cps = { 862 struct cb_process_state cps = {
861 .drc_status = 0, 863 .drc_status = 0,
862 .clp = NULL, 864 .clp = NULL,
863 .slotid = -1, 865 .slotid = NFS4_NO_SLOT,
866 .net = rqstp->rq_xprt->xpt_net,
864 }; 867 };
865 unsigned int nops = 0; 868 unsigned int nops = 0;
866 869
@@ -876,7 +879,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
876 return rpc_garbage_args; 879 return rpc_garbage_args;
877 880
878 if (hdr_arg.minorversion == 0) { 881 if (hdr_arg.minorversion == 0) {
879 cps.clp = nfs4_find_client_ident(hdr_arg.cb_ident); 882 cps.clp = nfs4_find_client_ident(rqstp->rq_xprt->xpt_net, hdr_arg.cb_ident);
880 if (!cps.clp || !check_gss_callback_principal(cps.clp, rqstp)) 883 if (!cps.clp || !check_gss_callback_principal(cps.clp, rqstp))
881 return rpc_drop_reply; 884 return rpc_drop_reply;
882 } 885 }
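
The -1 sentinels disappear because slotid became u32 in the callback.h hunk above; comparing an unsigned field against a signed literal relies on implicit conversion and trips sign-compare warnings. The constant is assumed to be defined along these lines:

	/* Assumed definition, matching the usage above: */
	#define NFS4_NO_SLOT	((u32)-1)
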
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index ad5565acbf3b..da7b5e4ff9ec 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -40,6 +40,8 @@
40#include <net/ipv6.h> 40#include <net/ipv6.h>
41#include <linux/nfs_xdr.h> 41#include <linux/nfs_xdr.h>
42#include <linux/sunrpc/bc_xprt.h> 42#include <linux/sunrpc/bc_xprt.h>
43#include <linux/nsproxy.h>
44#include <linux/pid_namespace.h>
43 45
44 46
45#include "nfs4_fs.h" 47#include "nfs4_fs.h"
@@ -49,15 +51,12 @@
49#include "internal.h" 51#include "internal.h"
50#include "fscache.h" 52#include "fscache.h"
51#include "pnfs.h" 53#include "pnfs.h"
54#include "netns.h"
52 55
53#define NFSDBG_FACILITY NFSDBG_CLIENT 56#define NFSDBG_FACILITY NFSDBG_CLIENT
54 57
55static DEFINE_SPINLOCK(nfs_client_lock);
56static LIST_HEAD(nfs_client_list);
57static LIST_HEAD(nfs_volume_list);
58static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq); 58static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq);
59#ifdef CONFIG_NFS_V4 59#ifdef CONFIG_NFS_V4
60static DEFINE_IDR(cb_ident_idr); /* Protected by nfs_client_lock */
61 60
62/* 61/*
63 * Get a unique NFSv4.0 callback identifier which will be used 62 * Get a unique NFSv4.0 callback identifier which will be used
@@ -66,15 +65,16 @@ static DEFINE_IDR(cb_ident_idr); /* Protected by nfs_client_lock */
66static int nfs_get_cb_ident_idr(struct nfs_client *clp, int minorversion) 65static int nfs_get_cb_ident_idr(struct nfs_client *clp, int minorversion)
67{ 66{
68 int ret = 0; 67 int ret = 0;
68 struct nfs_net *nn = net_generic(clp->net, nfs_net_id);
69 69
70 if (clp->rpc_ops->version != 4 || minorversion != 0) 70 if (clp->rpc_ops->version != 4 || minorversion != 0)
71 return ret; 71 return ret;
72retry: 72retry:
73 if (!idr_pre_get(&cb_ident_idr, GFP_KERNEL)) 73 if (!idr_pre_get(&nn->cb_ident_idr, GFP_KERNEL))
74 return -ENOMEM; 74 return -ENOMEM;
75 spin_lock(&nfs_client_lock); 75 spin_lock(&nn->nfs_client_lock);
76 ret = idr_get_new(&cb_ident_idr, clp, &clp->cl_cb_ident); 76 ret = idr_get_new(&nn->cb_ident_idr, clp, &clp->cl_cb_ident);
77 spin_unlock(&nfs_client_lock); 77 spin_unlock(&nn->nfs_client_lock);
78 if (ret == -EAGAIN) 78 if (ret == -EAGAIN)
79 goto retry; 79 goto retry;
80 return ret; 80 return ret;
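
The loop above is the classic pre-idr-rework allocation idiom, unchanged here except that the idr and its lock move into struct nfs_net: preallocate outside the spinlock, insert under it, and retry on -EAGAIN when a concurrent insertion consumed the preallocated node. Annotated:

	retry:
		if (!idr_pre_get(&nn->cb_ident_idr, GFP_KERNEL))
			return -ENOMEM;   /* could not preallocate at all */
		spin_lock(&nn->nfs_client_lock);
		ret = idr_get_new(&nn->cb_ident_idr, clp, &clp->cl_cb_ident);
		spin_unlock(&nn->nfs_client_lock);
		if (ret == -EAGAIN)       /* raced; preallocation was consumed */
			goto retry;
		return ret;
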
@@ -89,7 +89,7 @@ static bool nfs4_disable_idmapping = true;
89/* 89/*
90 * RPC cruft for NFS 90 * RPC cruft for NFS
91 */ 91 */
92static struct rpc_version *nfs_version[5] = { 92static const struct rpc_version *nfs_version[5] = {
93 [2] = &nfs_version2, 93 [2] = &nfs_version2,
94#ifdef CONFIG_NFS_V3 94#ifdef CONFIG_NFS_V3
95 [3] = &nfs_version3, 95 [3] = &nfs_version3,
@@ -99,7 +99,7 @@ static struct rpc_version *nfs_version[5] = {
99#endif 99#endif
100}; 100};
101 101
102struct rpc_program nfs_program = { 102const struct rpc_program nfs_program = {
103 .name = "nfs", 103 .name = "nfs",
104 .number = NFS_PROGRAM, 104 .number = NFS_PROGRAM,
105 .nrvers = ARRAY_SIZE(nfs_version), 105 .nrvers = ARRAY_SIZE(nfs_version),
@@ -115,11 +115,11 @@ struct rpc_stat nfs_rpcstat = {
115 115
116#ifdef CONFIG_NFS_V3_ACL 116#ifdef CONFIG_NFS_V3_ACL
117static struct rpc_stat nfsacl_rpcstat = { &nfsacl_program }; 117static struct rpc_stat nfsacl_rpcstat = { &nfsacl_program };
118static struct rpc_version * nfsacl_version[] = { 118static const struct rpc_version *nfsacl_version[] = {
119 [3] = &nfsacl_version3, 119 [3] = &nfsacl_version3,
120}; 120};
121 121
122struct rpc_program nfsacl_program = { 122const struct rpc_program nfsacl_program = {
123 .name = "nfsacl", 123 .name = "nfsacl",
124 .number = NFS_ACL_PROGRAM, 124 .number = NFS_ACL_PROGRAM,
125 .nrvers = ARRAY_SIZE(nfsacl_version), 125 .nrvers = ARRAY_SIZE(nfsacl_version),
@@ -135,6 +135,7 @@ struct nfs_client_initdata {
135 const struct nfs_rpc_ops *rpc_ops; 135 const struct nfs_rpc_ops *rpc_ops;
136 int proto; 136 int proto;
137 u32 minorversion; 137 u32 minorversion;
138 struct net *net;
138}; 139};
139 140
140/* 141/*
@@ -171,6 +172,7 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
171 clp->cl_rpcclient = ERR_PTR(-EINVAL); 172 clp->cl_rpcclient = ERR_PTR(-EINVAL);
172 173
173 clp->cl_proto = cl_init->proto; 174 clp->cl_proto = cl_init->proto;
175 clp->net = get_net(cl_init->net);
174 176
175#ifdef CONFIG_NFS_V4 177#ifdef CONFIG_NFS_V4
176 err = nfs_get_cb_ident_idr(clp, cl_init->minorversion); 178 err = nfs_get_cb_ident_idr(clp, cl_init->minorversion);
@@ -202,8 +204,11 @@ error_0:
202#ifdef CONFIG_NFS_V4_1 204#ifdef CONFIG_NFS_V4_1
203static void nfs4_shutdown_session(struct nfs_client *clp) 205static void nfs4_shutdown_session(struct nfs_client *clp)
204{ 206{
205 if (nfs4_has_session(clp)) 207 if (nfs4_has_session(clp)) {
208 nfs4_deviceid_purge_client(clp);
206 nfs4_destroy_session(clp->cl_session); 209 nfs4_destroy_session(clp->cl_session);
210 }
211
211
207} 212}
208#else /* CONFIG_NFS_V4_1 */ 213#else /* CONFIG_NFS_V4_1 */
209static void nfs4_shutdown_session(struct nfs_client *clp) 214static void nfs4_shutdown_session(struct nfs_client *clp)
@@ -233,16 +238,20 @@ static void nfs4_shutdown_client(struct nfs_client *clp)
233} 238}
234 239
235/* idr_remove_all is not needed as all id's are removed by nfs_put_client */ 240/* idr_remove_all is not needed as all id's are removed by nfs_put_client */
236void nfs_cleanup_cb_ident_idr(void) 241void nfs_cleanup_cb_ident_idr(struct net *net)
237{ 242{
238 idr_destroy(&cb_ident_idr); 243 struct nfs_net *nn = net_generic(net, nfs_net_id);
244
245 idr_destroy(&nn->cb_ident_idr);
239} 246}
240 247
241/* nfs_client_lock held */ 248/* nfs_client_lock held */
242static void nfs_cb_idr_remove_locked(struct nfs_client *clp) 249static void nfs_cb_idr_remove_locked(struct nfs_client *clp)
243{ 250{
251 struct nfs_net *nn = net_generic(clp->net, nfs_net_id);
252
244 if (clp->cl_cb_ident) 253 if (clp->cl_cb_ident)
245 idr_remove(&cb_ident_idr, clp->cl_cb_ident); 254 idr_remove(&nn->cb_ident_idr, clp->cl_cb_ident);
246} 255}
247 256
248static void pnfs_init_server(struct nfs_server *server) 257static void pnfs_init_server(struct nfs_server *server)
@@ -260,7 +269,7 @@ static void nfs4_shutdown_client(struct nfs_client *clp)
260{ 269{
261} 270}
262 271
263void nfs_cleanup_cb_ident_idr(void) 272void nfs_cleanup_cb_ident_idr(struct net *net)
264{ 273{
265} 274}
266 275
@@ -292,10 +301,10 @@ static void nfs_free_client(struct nfs_client *clp)
292 if (clp->cl_machine_cred != NULL) 301 if (clp->cl_machine_cred != NULL)
293 put_rpccred(clp->cl_machine_cred); 302 put_rpccred(clp->cl_machine_cred);
294 303
295 nfs4_deviceid_purge_client(clp); 304 put_net(clp->net);
296
297 kfree(clp->cl_hostname); 305 kfree(clp->cl_hostname);
298 kfree(clp->server_scope); 306 kfree(clp->server_scope);
307 kfree(clp->impl_id);
299 kfree(clp); 308 kfree(clp);
300 309
301 dprintk("<-- nfs_free_client()\n"); 310 dprintk("<-- nfs_free_client()\n");
@@ -306,15 +315,18 @@ static void nfs_free_client(struct nfs_client *clp)
306 */ 315 */
307void nfs_put_client(struct nfs_client *clp) 316void nfs_put_client(struct nfs_client *clp)
308{ 317{
318 struct nfs_net *nn;
319
309 if (!clp) 320 if (!clp)
310 return; 321 return;
311 322
312 dprintk("--> nfs_put_client({%d})\n", atomic_read(&clp->cl_count)); 323 dprintk("--> nfs_put_client({%d})\n", atomic_read(&clp->cl_count));
324 nn = net_generic(clp->net, nfs_net_id);
313 325
314 if (atomic_dec_and_lock(&clp->cl_count, &nfs_client_lock)) { 326 if (atomic_dec_and_lock(&clp->cl_count, &nn->nfs_client_lock)) {
315 list_del(&clp->cl_share_link); 327 list_del(&clp->cl_share_link);
316 nfs_cb_idr_remove_locked(clp); 328 nfs_cb_idr_remove_locked(clp);
317 spin_unlock(&nfs_client_lock); 329 spin_unlock(&nn->nfs_client_lock);
318 330
319 BUG_ON(!list_empty(&clp->cl_superblocks)); 331 BUG_ON(!list_empty(&clp->cl_superblocks));
320 332
@@ -392,6 +404,7 @@ static int nfs_sockaddr_cmp_ip4(const struct sockaddr *sa1,
392 (sin1->sin_port == sin2->sin_port); 404 (sin1->sin_port == sin2->sin_port);
393} 405}
394 406
407#if defined(CONFIG_NFS_V4_1)
395/* 408/*
396 * Test if two socket addresses represent the same actual socket, 409 * Test if two socket addresses represent the same actual socket,
397 * by comparing (only) relevant fields, excluding the port number. 410 * by comparing (only) relevant fields, excluding the port number.
@@ -410,6 +423,7 @@ static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
410 } 423 }
411 return 0; 424 return 0;
412} 425}
426#endif /* CONFIG_NFS_V4_1 */
413 427
414/* 428/*
415 * Test if two socket addresses represent the same actual socket, 429 * Test if two socket addresses represent the same actual socket,
@@ -430,10 +444,10 @@ static int nfs_sockaddr_cmp(const struct sockaddr *sa1,
430 return 0; 444 return 0;
431} 445}
432 446
447#if defined(CONFIG_NFS_V4_1)
433/* Common match routine for v4.0 and v4.1 callback services */ 448/* Common match routine for v4.0 and v4.1 callback services */
434bool 449static bool nfs4_cb_match_client(const struct sockaddr *addr,
435nfs4_cb_match_client(const struct sockaddr *addr, struct nfs_client *clp, 450 struct nfs_client *clp, u32 minorversion)
436 u32 minorversion)
437{ 451{
438 struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr; 452 struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;
439 453
@@ -453,6 +467,7 @@ nfs4_cb_match_client(const struct sockaddr *addr, struct nfs_client *clp,
453 467
454 return true; 468 return true;
455} 469}
470#endif /* CONFIG_NFS_V4_1 */
456 471
457/* 472/*
458 * Find an nfs_client on the list that matches the initialisation data 473 * Find an nfs_client on the list that matches the initialisation data
@@ -462,8 +477,9 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat
462{ 477{
463 struct nfs_client *clp; 478 struct nfs_client *clp;
464 const struct sockaddr *sap = data->addr; 479 const struct sockaddr *sap = data->addr;
480 struct nfs_net *nn = net_generic(data->net, nfs_net_id);
465 481
466 list_for_each_entry(clp, &nfs_client_list, cl_share_link) { 482 list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) {
467 const struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr; 483 const struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;
468 /* Don't match clients that failed to initialise properly */ 484 /* Don't match clients that failed to initialise properly */
469 if (clp->cl_cons_state < 0) 485 if (clp->cl_cons_state < 0)
@@ -501,13 +517,14 @@ nfs_get_client(const struct nfs_client_initdata *cl_init,
501{ 517{
502 struct nfs_client *clp, *new = NULL; 518 struct nfs_client *clp, *new = NULL;
503 int error; 519 int error;
520 struct nfs_net *nn = net_generic(cl_init->net, nfs_net_id);
504 521
505 dprintk("--> nfs_get_client(%s,v%u)\n", 522 dprintk("--> nfs_get_client(%s,v%u)\n",
506 cl_init->hostname ?: "", cl_init->rpc_ops->version); 523 cl_init->hostname ?: "", cl_init->rpc_ops->version);
507 524
508 /* see if the client already exists */ 525 /* see if the client already exists */
509 do { 526 do {
510 spin_lock(&nfs_client_lock); 527 spin_lock(&nn->nfs_client_lock);
511 528
512 clp = nfs_match_client(cl_init); 529 clp = nfs_match_client(cl_init);
513 if (clp) 530 if (clp)
@@ -515,7 +532,7 @@ nfs_get_client(const struct nfs_client_initdata *cl_init,
515 if (new) 532 if (new)
516 goto install_client; 533 goto install_client;
517 534
518 spin_unlock(&nfs_client_lock); 535 spin_unlock(&nn->nfs_client_lock);
519 536
520 new = nfs_alloc_client(cl_init); 537 new = nfs_alloc_client(cl_init);
521 } while (!IS_ERR(new)); 538 } while (!IS_ERR(new));
@@ -526,8 +543,8 @@ nfs_get_client(const struct nfs_client_initdata *cl_init,
526 /* install a new client and return with it unready */ 543 /* install a new client and return with it unready */
527install_client: 544install_client:
528 clp = new; 545 clp = new;
529 list_add(&clp->cl_share_link, &nfs_client_list); 546 list_add(&clp->cl_share_link, &nn->nfs_client_list);
530 spin_unlock(&nfs_client_lock); 547 spin_unlock(&nn->nfs_client_lock);
531 548
532 error = cl_init->rpc_ops->init_client(clp, timeparms, ip_addr, 549 error = cl_init->rpc_ops->init_client(clp, timeparms, ip_addr,
533 authflavour, noresvport); 550 authflavour, noresvport);
@@ -542,7 +559,7 @@ install_client:
542 * - make sure it's ready before returning 559 * - make sure it's ready before returning
543 */ 560 */
544found_client: 561found_client:
545 spin_unlock(&nfs_client_lock); 562 spin_unlock(&nn->nfs_client_lock);
546 563
547 if (new) 564 if (new)
548 nfs_free_client(new); 565 nfs_free_client(new);
@@ -642,7 +659,7 @@ static int nfs_create_rpc_client(struct nfs_client *clp,
642{ 659{
643 struct rpc_clnt *clnt = NULL; 660 struct rpc_clnt *clnt = NULL;
644 struct rpc_create_args args = { 661 struct rpc_create_args args = {
645 .net = &init_net, 662 .net = clp->net,
646 .protocol = clp->cl_proto, 663 .protocol = clp->cl_proto,
647 .address = (struct sockaddr *)&clp->cl_addr, 664 .address = (struct sockaddr *)&clp->cl_addr,
648 .addrsize = clp->cl_addrlen, 665 .addrsize = clp->cl_addrlen,
@@ -696,6 +713,7 @@ static int nfs_start_lockd(struct nfs_server *server)
696 .nfs_version = clp->rpc_ops->version, 713 .nfs_version = clp->rpc_ops->version,
697 .noresvport = server->flags & NFS_MOUNT_NORESVPORT ? 714 .noresvport = server->flags & NFS_MOUNT_NORESVPORT ?
698 1 : 0, 715 1 : 0,
716 .net = clp->net,
699 }; 717 };
700 718
701 if (nlm_init.nfs_version > 3) 719 if (nlm_init.nfs_version > 3)
@@ -831,6 +849,7 @@ static int nfs_init_server(struct nfs_server *server,
831 .addrlen = data->nfs_server.addrlen, 849 .addrlen = data->nfs_server.addrlen,
832 .rpc_ops = &nfs_v2_clientops, 850 .rpc_ops = &nfs_v2_clientops,
833 .proto = data->nfs_server.protocol, 851 .proto = data->nfs_server.protocol,
852 .net = data->net,
834 }; 853 };
835 struct rpc_timeout timeparms; 854 struct rpc_timeout timeparms;
836 struct nfs_client *clp; 855 struct nfs_client *clp;
@@ -1029,25 +1048,30 @@ static void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_serve
1029static void nfs_server_insert_lists(struct nfs_server *server) 1048static void nfs_server_insert_lists(struct nfs_server *server)
1030{ 1049{
1031 struct nfs_client *clp = server->nfs_client; 1050 struct nfs_client *clp = server->nfs_client;
1051 struct nfs_net *nn = net_generic(clp->net, nfs_net_id);
1032 1052
1033 spin_lock(&nfs_client_lock); 1053 spin_lock(&nn->nfs_client_lock);
1034 list_add_tail_rcu(&server->client_link, &clp->cl_superblocks); 1054 list_add_tail_rcu(&server->client_link, &clp->cl_superblocks);
1035 list_add_tail(&server->master_link, &nfs_volume_list); 1055 list_add_tail(&server->master_link, &nn->nfs_volume_list);
1036 clear_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state); 1056 clear_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state);
1037 spin_unlock(&nfs_client_lock); 1057 spin_unlock(&nn->nfs_client_lock);
1038 1058
1039} 1059}
1040 1060
1041static void nfs_server_remove_lists(struct nfs_server *server) 1061static void nfs_server_remove_lists(struct nfs_server *server)
1042{ 1062{
1043 struct nfs_client *clp = server->nfs_client; 1063 struct nfs_client *clp = server->nfs_client;
1064 struct nfs_net *nn;
1044 1065
1045 spin_lock(&nfs_client_lock); 1066 if (clp == NULL)
1067 return;
1068 nn = net_generic(clp->net, nfs_net_id);
1069 spin_lock(&nn->nfs_client_lock);
1046 list_del_rcu(&server->client_link); 1070 list_del_rcu(&server->client_link);
1047 if (clp && list_empty(&clp->cl_superblocks)) 1071 if (list_empty(&clp->cl_superblocks))
1048 set_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state); 1072 set_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state);
1049 list_del(&server->master_link); 1073 list_del(&server->master_link);
1050 spin_unlock(&nfs_client_lock); 1074 spin_unlock(&nn->nfs_client_lock);
1051 1075
1052 synchronize_rcu(); 1076 synchronize_rcu();
1053} 1077}
@@ -1086,6 +1110,8 @@ static struct nfs_server *nfs_alloc_server(void)
1086 return NULL; 1110 return NULL;
1087 } 1111 }
1088 1112
1113 ida_init(&server->openowner_id);
1114 ida_init(&server->lockowner_id);
1089 pnfs_init_server(server); 1115 pnfs_init_server(server);
1090 1116
1091 return server; 1117 return server;
@@ -1111,6 +1137,8 @@ void nfs_free_server(struct nfs_server *server)
1111 1137
1112 nfs_put_client(server->nfs_client); 1138 nfs_put_client(server->nfs_client);
1113 1139
1140 ida_destroy(&server->lockowner_id);
1141 ida_destroy(&server->openowner_id);
1114 nfs_free_iostats(server->io_stats); 1142 nfs_free_iostats(server->io_stats);
1115 bdi_destroy(&server->backing_dev_info); 1143 bdi_destroy(&server->backing_dev_info);
1116 kfree(server); 1144 kfree(server);
@@ -1189,45 +1217,19 @@ error:
1189/* 1217/*
1190 * NFSv4.0 callback thread helper 1218 * NFSv4.0 callback thread helper
1191 * 1219 *
1192 * Find a client by IP address, protocol version, and minorversion
1193 *
1194 * Called from the pg_authenticate method. The callback identifier
1195 * is not used as it has not been decoded.
1196 *
1197 * Returns NULL if no such client
1198 */
1199struct nfs_client *
1200nfs4_find_client_no_ident(const struct sockaddr *addr)
1201{
1202 struct nfs_client *clp;
1203
1204 spin_lock(&nfs_client_lock);
1205 list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
1206 if (nfs4_cb_match_client(addr, clp, 0) == false)
1207 continue;
1208 atomic_inc(&clp->cl_count);
1209 spin_unlock(&nfs_client_lock);
1210 return clp;
1211 }
1212 spin_unlock(&nfs_client_lock);
1213 return NULL;
1214}
1215
1216/*
1217 * NFSv4.0 callback thread helper
1218 *
1219 * Find a client by callback identifier 1220 * Find a client by callback identifier
1220 */ 1221 */
1221struct nfs_client * 1222struct nfs_client *
1222nfs4_find_client_ident(int cb_ident) 1223nfs4_find_client_ident(struct net *net, int cb_ident)
1223{ 1224{
1224 struct nfs_client *clp; 1225 struct nfs_client *clp;
1226 struct nfs_net *nn = net_generic(net, nfs_net_id);
1225 1227
1226 spin_lock(&nfs_client_lock); 1228 spin_lock(&nn->nfs_client_lock);
1227 clp = idr_find(&cb_ident_idr, cb_ident); 1229 clp = idr_find(&nn->cb_ident_idr, cb_ident);
1228 if (clp) 1230 if (clp)
1229 atomic_inc(&clp->cl_count); 1231 atomic_inc(&clp->cl_count);
1230 spin_unlock(&nfs_client_lock); 1232 spin_unlock(&nn->nfs_client_lock);
1231 return clp; 1233 return clp;
1232} 1234}
1233 1235
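
nfs4_find_client_ident() now searches a specific namespace, and the atomic_inc() stays inside nn->nfs_client_lock so the client cannot disappear between the idr_find() and the reference bump. A hedged sketch of the caller side (the v4.0 callback service; the error value here is an assumption, not from this diff):

struct nfs_client *clp;

clp = nfs4_find_client_ident(net, cb_ident);
if (clp == NULL)
	return htonl(NFS4ERR_BADHANDLE);	/* assumed callback error path */
/* ... process the CB_COMPOUND against clp ... */
nfs_put_client(clp);	/* drop the reference taken under nfs_client_lock */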
@@ -1240,13 +1242,14 @@ nfs4_find_client_ident(int cb_ident)
1240 * Returns NULL if no such client 1242 * Returns NULL if no such client
1241 */ 1243 */
1242struct nfs_client * 1244struct nfs_client *
1243nfs4_find_client_sessionid(const struct sockaddr *addr, 1245nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr,
1244 struct nfs4_sessionid *sid) 1246 struct nfs4_sessionid *sid)
1245{ 1247{
1246 struct nfs_client *clp; 1248 struct nfs_client *clp;
1249 struct nfs_net *nn = net_generic(net, nfs_net_id);
1247 1250
1248 spin_lock(&nfs_client_lock); 1251 spin_lock(&nn->nfs_client_lock);
1249 list_for_each_entry(clp, &nfs_client_list, cl_share_link) { 1252 list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) {
1250 if (nfs4_cb_match_client(addr, clp, 1) == false) 1253 if (nfs4_cb_match_client(addr, clp, 1) == false)
1251 continue; 1254 continue;
1252 1255
@@ -1259,17 +1262,17 @@ nfs4_find_client_sessionid(const struct sockaddr *addr,
1259 continue; 1262 continue;
1260 1263
1261 atomic_inc(&clp->cl_count); 1264 atomic_inc(&clp->cl_count);
1262 spin_unlock(&nfs_client_lock); 1265 spin_unlock(&nn->nfs_client_lock);
1263 return clp; 1266 return clp;
1264 } 1267 }
1265 spin_unlock(&nfs_client_lock); 1268 spin_unlock(&nn->nfs_client_lock);
1266 return NULL; 1269 return NULL;
1267} 1270}
1268 1271
1269#else /* CONFIG_NFS_V4_1 */ 1272#else /* CONFIG_NFS_V4_1 */
1270 1273
1271struct nfs_client * 1274struct nfs_client *
1272nfs4_find_client_sessionid(const struct sockaddr *addr, 1275nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr,
1273 struct nfs4_sessionid *sid) 1276 struct nfs4_sessionid *sid)
1274{ 1277{
1275 return NULL; 1278 return NULL;
@@ -1284,16 +1287,18 @@ static int nfs4_init_callback(struct nfs_client *clp)
1284 int error; 1287 int error;
1285 1288
1286 if (clp->rpc_ops->version == 4) { 1289 if (clp->rpc_ops->version == 4) {
1290 struct rpc_xprt *xprt;
1291
1292 xprt = rcu_dereference_raw(clp->cl_rpcclient->cl_xprt);
1293
1287 if (nfs4_has_session(clp)) { 1294 if (nfs4_has_session(clp)) {
1288 error = xprt_setup_backchannel( 1295 error = xprt_setup_backchannel(xprt,
1289 clp->cl_rpcclient->cl_xprt,
1290 NFS41_BC_MIN_CALLBACKS); 1296 NFS41_BC_MIN_CALLBACKS);
1291 if (error < 0) 1297 if (error < 0)
1292 return error; 1298 return error;
1293 } 1299 }
1294 1300
1295 error = nfs_callback_up(clp->cl_mvops->minor_version, 1301 error = nfs_callback_up(clp->cl_mvops->minor_version, xprt);
1296 clp->cl_rpcclient->cl_xprt);
1297 if (error < 0) { 1302 if (error < 0) {
1298 dprintk("%s: failed to start callback. Error = %d\n", 1303 dprintk("%s: failed to start callback. Error = %d\n",
1299 __func__, error); 1304 __func__, error);
@@ -1344,6 +1349,7 @@ int nfs4_init_client(struct nfs_client *clp,
1344 rpc_authflavor_t authflavour, 1349 rpc_authflavor_t authflavour,
1345 int noresvport) 1350 int noresvport)
1346{ 1351{
1352 char buf[INET6_ADDRSTRLEN + 1];
1347 int error; 1353 int error;
1348 1354
1349 if (clp->cl_cons_state == NFS_CS_READY) { 1355 if (clp->cl_cons_state == NFS_CS_READY) {
@@ -1359,6 +1365,20 @@ int nfs4_init_client(struct nfs_client *clp,
1359 1, noresvport); 1365 1, noresvport);
1360 if (error < 0) 1366 if (error < 0)
1361 goto error; 1367 goto error;
1368
1369 /* If no clientaddr= option was specified, find a usable cb address */
1370 if (ip_addr == NULL) {
1371 struct sockaddr_storage cb_addr;
1372 struct sockaddr *sap = (struct sockaddr *)&cb_addr;
1373
1374 error = rpc_localaddr(clp->cl_rpcclient, sap, sizeof(cb_addr));
1375 if (error < 0)
1376 goto error;
1377 error = rpc_ntop(sap, buf, sizeof(buf));
1378 if (error < 0)
1379 goto error;
1380 ip_addr = (const char *)buf;
1381 }
1362 strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); 1382 strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr));
1363 1383
1364 error = nfs_idmap_new(clp); 1384 error = nfs_idmap_new(clp);
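
When the mount supplied no clientaddr= option, nfs4_init_client() now derives a callback address itself: rpc_localaddr() reports the local address the transport uses to reach this server, rpc_ntop() renders it as a string, and strlcpy() captures it into clp->cl_ipaddr before the on-stack buf goes out of scope. The same two calls in isolation, as a sketch:

#include <linux/sunrpc/clnt.h>

static int example_local_addr(struct rpc_clnt *clnt, char *buf, size_t buflen)
{
	struct sockaddr_storage address;
	struct sockaddr *sap = (struct sockaddr *)&address;
	int err;

	err = rpc_localaddr(clnt, sap, sizeof(address));	/* source address toward the server */
	if (err < 0)
		return err;
	return rpc_ntop(sap, buf, buflen);	/* presentation form, e.g. "192.0.2.1" */
}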
@@ -1393,7 +1413,7 @@ static int nfs4_set_client(struct nfs_server *server,
1393 const char *ip_addr, 1413 const char *ip_addr,
1394 rpc_authflavor_t authflavour, 1414 rpc_authflavor_t authflavour,
1395 int proto, const struct rpc_timeout *timeparms, 1415 int proto, const struct rpc_timeout *timeparms,
1396 u32 minorversion) 1416 u32 minorversion, struct net *net)
1397{ 1417{
1398 struct nfs_client_initdata cl_init = { 1418 struct nfs_client_initdata cl_init = {
1399 .hostname = hostname, 1419 .hostname = hostname,
@@ -1402,6 +1422,7 @@ static int nfs4_set_client(struct nfs_server *server,
1402 .rpc_ops = &nfs_v4_clientops, 1422 .rpc_ops = &nfs_v4_clientops,
1403 .proto = proto, 1423 .proto = proto,
1404 .minorversion = minorversion, 1424 .minorversion = minorversion,
1425 .net = net,
1405 }; 1426 };
1406 struct nfs_client *clp; 1427 struct nfs_client *clp;
1407 int error; 1428 int error;
@@ -1453,6 +1474,7 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
1453 .rpc_ops = &nfs_v4_clientops, 1474 .rpc_ops = &nfs_v4_clientops,
1454 .proto = ds_proto, 1475 .proto = ds_proto,
1455 .minorversion = mds_clp->cl_minorversion, 1476 .minorversion = mds_clp->cl_minorversion,
1477 .net = mds_clp->net,
1456 }; 1478 };
1457 struct rpc_timeout ds_timeout = { 1479 struct rpc_timeout ds_timeout = {
1458 .to_initval = 15 * HZ, 1480 .to_initval = 15 * HZ,
@@ -1580,7 +1602,8 @@ static int nfs4_init_server(struct nfs_server *server,
1580 data->auth_flavors[0], 1602 data->auth_flavors[0],
1581 data->nfs_server.protocol, 1603 data->nfs_server.protocol,
1582 &timeparms, 1604 &timeparms,
1583 data->minorversion); 1605 data->minorversion,
1606 data->net);
1584 if (error < 0) 1607 if (error < 0)
1585 goto error; 1608 goto error;
1586 1609
@@ -1675,9 +1698,10 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
1675 data->addrlen, 1698 data->addrlen,
1676 parent_client->cl_ipaddr, 1699 parent_client->cl_ipaddr,
1677 data->authflavor, 1700 data->authflavor,
1678 parent_server->client->cl_xprt->prot, 1701 rpc_protocol(parent_server->client),
1679 parent_server->client->cl_timeout, 1702 parent_server->client->cl_timeout,
1680 parent_client->cl_mvops->minor_version); 1703 parent_client->cl_mvops->minor_version,
1704 parent_client->net);
1681 if (error < 0) 1705 if (error < 0)
1682 goto error; 1706 goto error;
1683 1707
@@ -1770,6 +1794,18 @@ out_free_server:
1770 return ERR_PTR(error); 1794 return ERR_PTR(error);
1771} 1795}
1772 1796
1797void nfs_clients_init(struct net *net)
1798{
1799 struct nfs_net *nn = net_generic(net, nfs_net_id);
1800
1801 INIT_LIST_HEAD(&nn->nfs_client_list);
1802 INIT_LIST_HEAD(&nn->nfs_volume_list);
1803#ifdef CONFIG_NFS_V4
1804 idr_init(&nn->cb_ident_idr);
1805#endif
1806 spin_lock_init(&nn->nfs_client_lock);
1807}
1808
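
nfs_clients_init() gives each namespace fresh lists, a callback IDR and a lock. It is presumably invoked from a pernet init hook registered elsewhere in this series; a sketch of the conventional wiring (the hook names are assumed):

#include <net/net_namespace.h>
#include <net/netns/generic.h>

int nfs_net_id;	/* filled in by register_pernet_subsys() */

static __net_init int nfs_net_init(struct net *net)
{
	nfs_clients_init(net);
	return 0;
}

static struct pernet_operations nfs_net_ops = {
	.init = nfs_net_init,
	.id   = &nfs_net_id,
	.size = sizeof(struct nfs_net),	/* net_generic(net, nfs_net_id) returns this much zeroed space */
};

/* module init would then do: register_pernet_subsys(&nfs_net_ops); */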
1773#ifdef CONFIG_PROC_FS 1809#ifdef CONFIG_PROC_FS
1774static struct proc_dir_entry *proc_fs_nfs; 1810static struct proc_dir_entry *proc_fs_nfs;
1775 1811
@@ -1823,13 +1859,15 @@ static int nfs_server_list_open(struct inode *inode, struct file *file)
1823{ 1859{
1824 struct seq_file *m; 1860 struct seq_file *m;
1825 int ret; 1861 int ret;
1862 struct pid_namespace *pid_ns = file->f_dentry->d_sb->s_fs_info;
1863 struct net *net = pid_ns->child_reaper->nsproxy->net_ns;
1826 1864
1827 ret = seq_open(file, &nfs_server_list_ops); 1865 ret = seq_open(file, &nfs_server_list_ops);
1828 if (ret < 0) 1866 if (ret < 0)
1829 return ret; 1867 return ret;
1830 1868
1831 m = file->private_data; 1869 m = file->private_data;
1832 m->private = PDE(inode)->data; 1870 m->private = net;
1833 1871
1834 return 0; 1872 return 0;
1835} 1873}
@@ -1839,9 +1877,11 @@ static int nfs_server_list_open(struct inode *inode, struct file *file)
1839 */ 1877 */
1840static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos) 1878static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos)
1841{ 1879{
1880 struct nfs_net *nn = net_generic(m->private, nfs_net_id);
1881
1842 /* lock the list against modification */ 1882 /* lock the list against modification */
1843 spin_lock(&nfs_client_lock); 1883 spin_lock(&nn->nfs_client_lock);
1844 return seq_list_start_head(&nfs_client_list, *_pos); 1884 return seq_list_start_head(&nn->nfs_client_list, *_pos);
1845} 1885}
1846 1886
1847/* 1887/*
@@ -1849,7 +1889,9 @@ static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos)
1849 */ 1889 */
1850static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos) 1890static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos)
1851{ 1891{
1852 return seq_list_next(v, &nfs_client_list, pos); 1892 struct nfs_net *nn = net_generic(p->private, nfs_net_id);
1893
1894 return seq_list_next(v, &nn->nfs_client_list, pos);
1853} 1895}
1854 1896
1855/* 1897/*
@@ -1857,7 +1899,9 @@ static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos)
1857 */ 1899 */
1858static void nfs_server_list_stop(struct seq_file *p, void *v) 1900static void nfs_server_list_stop(struct seq_file *p, void *v)
1859{ 1901{
1860 spin_unlock(&nfs_client_lock); 1902 struct nfs_net *nn = net_generic(p->private, nfs_net_id);
1903
1904 spin_unlock(&nn->nfs_client_lock);
1861} 1905}
1862 1906
1863/* 1907/*
@@ -1866,9 +1910,10 @@ static void nfs_server_list_stop(struct seq_file *p, void *v)
1866static int nfs_server_list_show(struct seq_file *m, void *v) 1910static int nfs_server_list_show(struct seq_file *m, void *v)
1867{ 1911{
1868 struct nfs_client *clp; 1912 struct nfs_client *clp;
1913 struct nfs_net *nn = net_generic(m->private, nfs_net_id);
1869 1914
1870 /* display header on line 1 */ 1915 /* display header on line 1 */
1871 if (v == &nfs_client_list) { 1916 if (v == &nn->nfs_client_list) {
1872 seq_puts(m, "NV SERVER PORT USE HOSTNAME\n"); 1917 seq_puts(m, "NV SERVER PORT USE HOSTNAME\n");
1873 return 0; 1918 return 0;
1874 } 1919 }
@@ -1880,12 +1925,14 @@ static int nfs_server_list_show(struct seq_file *m, void *v)
1880 if (clp->cl_cons_state != NFS_CS_READY) 1925 if (clp->cl_cons_state != NFS_CS_READY)
1881 return 0; 1926 return 0;
1882 1927
1928 rcu_read_lock();
1883 seq_printf(m, "v%u %s %s %3d %s\n", 1929 seq_printf(m, "v%u %s %s %3d %s\n",
1884 clp->rpc_ops->version, 1930 clp->rpc_ops->version,
1885 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR), 1931 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR),
1886 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT), 1932 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT),
1887 atomic_read(&clp->cl_count), 1933 atomic_read(&clp->cl_count),
1888 clp->cl_hostname); 1934 clp->cl_hostname);
1935 rcu_read_unlock();
1889 1936
1890 return 0; 1937 return 0;
1891} 1938}
@@ -1897,13 +1944,15 @@ static int nfs_volume_list_open(struct inode *inode, struct file *file)
1897{ 1944{
1898 struct seq_file *m; 1945 struct seq_file *m;
1899 int ret; 1946 int ret;
1947 struct pid_namespace *pid_ns = file->f_dentry->d_sb->s_fs_info;
1948 struct net *net = pid_ns->child_reaper->nsproxy->net_ns;
1900 1949
1901 ret = seq_open(file, &nfs_volume_list_ops); 1950 ret = seq_open(file, &nfs_volume_list_ops);
1902 if (ret < 0) 1951 if (ret < 0)
1903 return ret; 1952 return ret;
1904 1953
1905 m = file->private_data; 1954 m = file->private_data;
1906 m->private = PDE(inode)->data; 1955 m->private = net;
1907 1956
1908 return 0; 1957 return 0;
1909} 1958}
@@ -1913,9 +1962,11 @@ static int nfs_volume_list_open(struct inode *inode, struct file *file)
1913 */ 1962 */
1914static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos) 1963static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos)
1915{ 1964{
1965 struct nfs_net *nn = net_generic(m->private, nfs_net_id);
1966
1916 /* lock the list against modification */ 1967 /* lock the list against modification */
1917 spin_lock(&nfs_client_lock); 1968 spin_lock(&nn->nfs_client_lock);
1918 return seq_list_start_head(&nfs_volume_list, *_pos); 1969 return seq_list_start_head(&nn->nfs_volume_list, *_pos);
1919} 1970}
1920 1971
1921/* 1972/*
@@ -1923,7 +1974,9 @@ static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos)
1923 */ 1974 */
1924static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos) 1975static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos)
1925{ 1976{
1926 return seq_list_next(v, &nfs_volume_list, pos); 1977 struct nfs_net *nn = net_generic(p->private, nfs_net_id);
1978
1979 return seq_list_next(v, &nn->nfs_volume_list, pos);
1927} 1980}
1928 1981
1929/* 1982/*
@@ -1931,7 +1984,9 @@ static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos)
1931 */ 1984 */
1932static void nfs_volume_list_stop(struct seq_file *p, void *v) 1985static void nfs_volume_list_stop(struct seq_file *p, void *v)
1933{ 1986{
1934 spin_unlock(&nfs_client_lock); 1987 struct nfs_net *nn = net_generic(p->private, nfs_net_id);
1988
1989 spin_unlock(&nn->nfs_client_lock);
1935} 1990}
1936 1991
1937/* 1992/*
@@ -1942,9 +1997,10 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)
1942 struct nfs_server *server; 1997 struct nfs_server *server;
1943 struct nfs_client *clp; 1998 struct nfs_client *clp;
1944 char dev[8], fsid[17]; 1999 char dev[8], fsid[17];
2000 struct nfs_net *nn = net_generic(m->private, nfs_net_id);
1945 2001
1946 /* display header on line 1 */ 2002 /* display header on line 1 */
1947 if (v == &nfs_volume_list) { 2003 if (v == &nn->nfs_volume_list) {
1948 seq_puts(m, "NV SERVER PORT DEV FSID FSC\n"); 2004 seq_puts(m, "NV SERVER PORT DEV FSID FSC\n");
1949 return 0; 2005 return 0;
1950 } 2006 }
@@ -1959,6 +2015,7 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)
1959 (unsigned long long) server->fsid.major, 2015 (unsigned long long) server->fsid.major,
1960 (unsigned long long) server->fsid.minor); 2016 (unsigned long long) server->fsid.minor);
1961 2017
2018 rcu_read_lock();
1962 seq_printf(m, "v%u %s %s %-7s %-17s %s\n", 2019 seq_printf(m, "v%u %s %s %-7s %-17s %s\n",
1963 clp->rpc_ops->version, 2020 clp->rpc_ops->version,
1964 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR), 2021 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR),
@@ -1966,6 +2023,7 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)
1966 dev, 2023 dev,
1967 fsid, 2024 fsid,
1968 nfs_server_fscache_state(server)); 2025 nfs_server_fscache_state(server));
2026 rcu_read_unlock();
1969 2027
1970 return 0; 2028 return 0;
1971} 2029}
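
The rcu_read_lock()/rcu_read_unlock() pairs added around rpc_peeraddr2str(), like the rcu_dereference_raw() of cl_rpcclient->cl_xprt in nfs4_init_callback() above, reflect that the RPC client's transport pointer is now RCU-managed so it can be replaced at runtime. The safe read-side pattern, sketched:

#include <linux/rcupdate.h>
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/xprt.h>

static void example_print_peer(struct rpc_clnt *clnt)
{
	struct rpc_xprt *xprt;

	rcu_read_lock();
	xprt = rcu_dereference(clnt->cl_xprt);	/* stays valid until the unlock */
	printk(KERN_DEBUG "peer: %s\n",
	       xprt->address_strings[RPC_DISPLAY_ADDR]);
	rcu_read_unlock();
}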
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 7f2654069806..89af1d269274 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -105,7 +105,7 @@ again:
105 continue; 105 continue;
106 if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) 106 if (!test_bit(NFS_DELEGATED_STATE, &state->flags))
107 continue; 107 continue;
108 if (memcmp(state->stateid.data, stateid->data, sizeof(state->stateid.data)) != 0) 108 if (!nfs4_stateid_match(&state->stateid, stateid))
109 continue; 109 continue;
110 get_nfs_open_context(ctx); 110 get_nfs_open_context(ctx);
111 spin_unlock(&inode->i_lock); 111 spin_unlock(&inode->i_lock);
@@ -139,8 +139,7 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred,
139 if (delegation != NULL) { 139 if (delegation != NULL) {
140 spin_lock(&delegation->lock); 140 spin_lock(&delegation->lock);
141 if (delegation->inode != NULL) { 141 if (delegation->inode != NULL) {
142 memcpy(delegation->stateid.data, res->delegation.data, 142 nfs4_stateid_copy(&delegation->stateid, &res->delegation);
143 sizeof(delegation->stateid.data));
144 delegation->type = res->delegation_type; 143 delegation->type = res->delegation_type;
145 delegation->maxsize = res->maxsize; 144 delegation->maxsize = res->maxsize;
146 oldcred = delegation->cred; 145 oldcred = delegation->cred;
@@ -236,8 +235,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
236 delegation = kmalloc(sizeof(*delegation), GFP_NOFS); 235 delegation = kmalloc(sizeof(*delegation), GFP_NOFS);
237 if (delegation == NULL) 236 if (delegation == NULL)
238 return -ENOMEM; 237 return -ENOMEM;
239 memcpy(delegation->stateid.data, res->delegation.data, 238 nfs4_stateid_copy(&delegation->stateid, &res->delegation);
240 sizeof(delegation->stateid.data));
241 delegation->type = res->delegation_type; 239 delegation->type = res->delegation_type;
242 delegation->maxsize = res->maxsize; 240 delegation->maxsize = res->maxsize;
243 delegation->change_attr = inode->i_version; 241 delegation->change_attr = inode->i_version;
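
The open-coded memcmp()/memcpy() over stateid.data give way to nfs4_stateid_match()/nfs4_stateid_copy(). Their definitions are not part of this diff (the diffstat touches fs/nfs/nfs4_fs.h); they presumably reduce to:

/* sketch; the real inlines live in fs/nfs/nfs4_fs.h */
static inline void nfs4_stateid_copy(nfs4_stateid *dst, const nfs4_stateid *src)
{
	memcpy(dst, src, sizeof(*dst));
}

static inline bool nfs4_stateid_match(const nfs4_stateid *dst, const nfs4_stateid *src)
{
	return memcmp(dst, src, sizeof(*dst)) == 0;
}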
@@ -250,19 +248,22 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
250 old_delegation = rcu_dereference_protected(nfsi->delegation, 248 old_delegation = rcu_dereference_protected(nfsi->delegation,
251 lockdep_is_held(&clp->cl_lock)); 249 lockdep_is_held(&clp->cl_lock));
252 if (old_delegation != NULL) { 250 if (old_delegation != NULL) {
253 if (memcmp(&delegation->stateid, &old_delegation->stateid, 251 if (nfs4_stateid_match(&delegation->stateid,
254 sizeof(old_delegation->stateid)) == 0 && 252 &old_delegation->stateid) &&
255 delegation->type == old_delegation->type) { 253 delegation->type == old_delegation->type) {
256 goto out; 254 goto out;
257 } 255 }
258 /* 256 /*
259 * Deal with broken servers that hand out two 257 * Deal with broken servers that hand out two
260 * delegations for the same file. 258 * delegations for the same file.
259 * Allow for upgrades to a WRITE delegation, but
260 * nothing else.
261 */ 261 */
262 dfprintk(FILE, "%s: server %s handed out " 262 dfprintk(FILE, "%s: server %s handed out "
263 "a duplicate delegation!\n", 263 "a duplicate delegation!\n",
264 __func__, clp->cl_hostname); 264 __func__, clp->cl_hostname);
265 if (delegation->type <= old_delegation->type) { 265 if (delegation->type == old_delegation->type ||
266 !(delegation->type & FMODE_WRITE)) {
266 freeme = delegation; 267 freeme = delegation;
267 delegation = NULL; 268 delegation = NULL;
268 goto out; 269 goto out;
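
The duplicate-delegation check changes from an ordering comparison on fmode_t values to an explicit rule: a replacement delegation is kept only when it differs from the old one and carries FMODE_WRITE, so only a READ-to-WRITE upgrade survives and everything else is freed as a duplicate. The predicate on its own:

#include <linux/types.h>
#include <linux/fs.h>

/* keep the incoming delegation only if it upgrades the cached one to WRITE */
static bool delegation_is_upgrade(fmode_t new_type, fmode_t old_type)
{
	return new_type != old_type && (new_type & FMODE_WRITE) != 0;
}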
@@ -455,17 +456,24 @@ static void nfs_client_mark_return_all_delegation_types(struct nfs_client *clp,
455 rcu_read_unlock(); 456 rcu_read_unlock();
456} 457}
457 458
458static void nfs_client_mark_return_all_delegations(struct nfs_client *clp)
459{
460 nfs_client_mark_return_all_delegation_types(clp, FMODE_READ|FMODE_WRITE);
461}
462
463static void nfs_delegation_run_state_manager(struct nfs_client *clp) 459static void nfs_delegation_run_state_manager(struct nfs_client *clp)
464{ 460{
465 if (test_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) 461 if (test_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state))
466 nfs4_schedule_state_manager(clp); 462 nfs4_schedule_state_manager(clp);
467} 463}
468 464
465void nfs_remove_bad_delegation(struct inode *inode)
466{
467 struct nfs_delegation *delegation;
468
469 delegation = nfs_detach_delegation(NFS_I(inode), NFS_SERVER(inode));
470 if (delegation) {
471 nfs_inode_find_state_and_recover(inode, &delegation->stateid);
472 nfs_free_delegation(delegation);
473 }
474}
475EXPORT_SYMBOL_GPL(nfs_remove_bad_delegation);
476
469/** 477/**
470 * nfs_expire_all_delegation_types 478 * nfs_expire_all_delegation_types
471 * @clp: client to process 479 * @clp: client to process
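
nfs_remove_bad_delegation() is a recovery entry point for delegations the server no longer recognizes: detach the delegation, kick state recovery for any opens still naming its stateid, then free it. A sketch of the assumed call site (stateid error handling in nfs4proc.c; the exact errors and return value are assumptions):

/* sketch: assumed caller when the server rejects a delegated stateid */
static int example_stateid_error(struct inode *inode, int err)
{
	switch (err) {
	case -NFS4ERR_BAD_STATEID:
	case -NFS4ERR_EXPIRED:
		nfs_remove_bad_delegation(inode);	/* detach, recover, free */
		return -EAGAIN;				/* retry without the delegation */
	}
	return err;
}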
@@ -488,18 +496,6 @@ void nfs_expire_all_delegations(struct nfs_client *clp)
488 nfs_expire_all_delegation_types(clp, FMODE_READ|FMODE_WRITE); 496 nfs_expire_all_delegation_types(clp, FMODE_READ|FMODE_WRITE);
489} 497}
490 498
491/**
492 * nfs_handle_cb_pathdown - return all delegations after NFS4ERR_CB_PATH_DOWN
493 * @clp: client to process
494 *
495 */
496void nfs_handle_cb_pathdown(struct nfs_client *clp)
497{
498 if (clp == NULL)
499 return;
500 nfs_client_mark_return_all_delegations(clp);
501}
502
503static void nfs_mark_return_unreferenced_delegations(struct nfs_server *server) 499static void nfs_mark_return_unreferenced_delegations(struct nfs_server *server)
504{ 500{
505 struct nfs_delegation *delegation; 501 struct nfs_delegation *delegation;
@@ -531,7 +527,7 @@ void nfs_expire_unreferenced_delegations(struct nfs_client *clp)
531/** 527/**
532 * nfs_async_inode_return_delegation - asynchronously return a delegation 528 * nfs_async_inode_return_delegation - asynchronously return a delegation
533 * @inode: inode to process 529 * @inode: inode to process
534 * @stateid: state ID information from CB_RECALL arguments 530 * @stateid: state ID information
535 * 531 *
536 * Returns zero on success, or a negative errno value. 532 * Returns zero on success, or a negative errno value.
537 */ 533 */
@@ -545,7 +541,7 @@ int nfs_async_inode_return_delegation(struct inode *inode,
545 rcu_read_lock(); 541 rcu_read_lock();
546 delegation = rcu_dereference(NFS_I(inode)->delegation); 542 delegation = rcu_dereference(NFS_I(inode)->delegation);
547 543
548 if (!clp->cl_mvops->validate_stateid(delegation, stateid)) { 544 if (!clp->cl_mvops->match_stateid(&delegation->stateid, stateid)) {
549 rcu_read_unlock(); 545 rcu_read_unlock();
550 return -ENOENT; 546 return -ENOENT;
551 } 547 }
@@ -684,21 +680,25 @@ int nfs_delegations_present(struct nfs_client *clp)
684 * nfs4_copy_delegation_stateid - Copy inode's state ID information 680 * nfs4_copy_delegation_stateid - Copy inode's state ID information
685 * @dst: stateid data structure to fill in 681 * @dst: stateid data structure to fill in
686 * @inode: inode to check 682 * @inode: inode to check
683 * @flags: delegation type requirement
687 * 684 *
688 * Returns one and fills in "dst->data" if inode had a delegation, 685 * Returns "true" and fills in "dst->data" if inode had a delegation,
689 * otherwise zero is returned. 686 * otherwise "false" is returned.
690 */ 687 */
691int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode) 688bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode,
689 fmode_t flags)
692{ 690{
693 struct nfs_inode *nfsi = NFS_I(inode); 691 struct nfs_inode *nfsi = NFS_I(inode);
694 struct nfs_delegation *delegation; 692 struct nfs_delegation *delegation;
695 int ret = 0; 693 bool ret;
696 694
695 flags &= FMODE_READ|FMODE_WRITE;
697 rcu_read_lock(); 696 rcu_read_lock();
698 delegation = rcu_dereference(nfsi->delegation); 697 delegation = rcu_dereference(nfsi->delegation);
699 if (delegation != NULL) { 698 ret = (delegation != NULL && (delegation->type & flags) == flags);
700 memcpy(dst->data, delegation->stateid.data, sizeof(dst->data)); 699 if (ret) {
701 ret = 1; 700 nfs4_stateid_copy(dst, &delegation->stateid);
701 nfs_mark_delegation_referenced(delegation);
702 } 702 }
703 rcu_read_unlock(); 703 rcu_read_unlock();
704 return ret; 704 return ret;
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index d9322e490c56..cd6a7a8dadae 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -42,9 +42,9 @@ void nfs_super_return_all_delegations(struct super_block *sb);
42void nfs_expire_all_delegations(struct nfs_client *clp); 42void nfs_expire_all_delegations(struct nfs_client *clp);
43void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags); 43void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags);
44void nfs_expire_unreferenced_delegations(struct nfs_client *clp); 44void nfs_expire_unreferenced_delegations(struct nfs_client *clp);
45void nfs_handle_cb_pathdown(struct nfs_client *clp);
46int nfs_client_return_marked_delegations(struct nfs_client *clp); 45int nfs_client_return_marked_delegations(struct nfs_client *clp);
47int nfs_delegations_present(struct nfs_client *clp); 46int nfs_delegations_present(struct nfs_client *clp);
47void nfs_remove_bad_delegation(struct inode *inode);
48 48
49void nfs_delegation_mark_reclaim(struct nfs_client *clp); 49void nfs_delegation_mark_reclaim(struct nfs_client *clp);
50void nfs_delegation_reap_unclaimed(struct nfs_client *clp); 50void nfs_delegation_reap_unclaimed(struct nfs_client *clp);
@@ -53,7 +53,7 @@ void nfs_delegation_reap_unclaimed(struct nfs_client *clp);
53int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync); 53int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync);
54int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid); 54int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid);
55int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl); 55int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl);
56int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode); 56bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, fmode_t flags);
57 57
58void nfs_mark_delegation_referenced(struct nfs_delegation *delegation); 58void nfs_mark_delegation_referenced(struct nfs_delegation *delegation);
59int nfs_have_delegation(struct inode *inode, fmode_t flags); 59int nfs_have_delegation(struct inode *inode, fmode_t flags);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 32aa6917265a..4aaf0316d76a 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -207,7 +207,7 @@ struct nfs_cache_array_entry {
207}; 207};
208 208
209struct nfs_cache_array { 209struct nfs_cache_array {
210 unsigned int size; 210 int size;
211 int eof_index; 211 int eof_index;
212 u64 last_cookie; 212 u64 last_cookie;
213 struct nfs_cache_array_entry array[0]; 213 struct nfs_cache_array_entry array[0];
@@ -1429,6 +1429,7 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
1429 } 1429 }
1430 1430
1431 open_flags = nd->intent.open.flags; 1431 open_flags = nd->intent.open.flags;
1432 attr.ia_valid = 0;
1432 1433
1433 ctx = create_nfs_open_context(dentry, open_flags); 1434 ctx = create_nfs_open_context(dentry, open_flags);
1434 res = ERR_CAST(ctx); 1435 res = ERR_CAST(ctx);
@@ -1437,11 +1438,14 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
1437 1438
1438 if (nd->flags & LOOKUP_CREATE) { 1439 if (nd->flags & LOOKUP_CREATE) {
1439 attr.ia_mode = nd->intent.open.create_mode; 1440 attr.ia_mode = nd->intent.open.create_mode;
1440 attr.ia_valid = ATTR_MODE; 1441 attr.ia_valid |= ATTR_MODE;
1441 attr.ia_mode &= ~current_umask(); 1442 attr.ia_mode &= ~current_umask();
1442 } else { 1443 } else
1443 open_flags &= ~(O_EXCL | O_CREAT); 1444 open_flags &= ~(O_EXCL | O_CREAT);
1444 attr.ia_valid = 0; 1445
1446 if (open_flags & O_TRUNC) {
1447 attr.ia_valid |= ATTR_SIZE;
1448 attr.ia_size = 0;
1445 } 1449 }
1446 1450
1447 /* Open the file on the server */ 1451 /* Open the file on the server */
@@ -1495,6 +1499,7 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
1495 struct inode *inode; 1499 struct inode *inode;
1496 struct inode *dir; 1500 struct inode *dir;
1497 struct nfs_open_context *ctx; 1501 struct nfs_open_context *ctx;
1502 struct iattr attr;
1498 int openflags, ret = 0; 1503 int openflags, ret = 0;
1499 1504
1500 if (nd->flags & LOOKUP_RCU) 1505 if (nd->flags & LOOKUP_RCU)
@@ -1523,19 +1528,27 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
1523 /* We cannot do exclusive creation on a positive dentry */ 1528 /* We cannot do exclusive creation on a positive dentry */
1524 if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL)) 1529 if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL))
1525 goto no_open_dput; 1530 goto no_open_dput;
1526 /* We can't create new files, or truncate existing ones here */ 1531 /* We can't create new files here */
1527 openflags &= ~(O_CREAT|O_EXCL|O_TRUNC); 1532 openflags &= ~(O_CREAT|O_EXCL);
1528 1533
1529 ctx = create_nfs_open_context(dentry, openflags); 1534 ctx = create_nfs_open_context(dentry, openflags);
1530 ret = PTR_ERR(ctx); 1535 ret = PTR_ERR(ctx);
1531 if (IS_ERR(ctx)) 1536 if (IS_ERR(ctx))
1532 goto out; 1537 goto out;
1538
1539 attr.ia_valid = 0;
1540 if (openflags & O_TRUNC) {
1541 attr.ia_valid |= ATTR_SIZE;
1542 attr.ia_size = 0;
1543 nfs_wb_all(inode);
1544 }
1545
1533 /* 1546 /*
1534 * Note: we're not holding inode->i_mutex and so may be racing with 1547 * Note: we're not holding inode->i_mutex and so may be racing with
1535 * operations that change the directory. We therefore save the 1548 * operations that change the directory. We therefore save the
1536 * change attribute *before* we do the RPC call. 1549 * change attribute *before* we do the RPC call.
1537 */ 1550 */
1538 inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, NULL); 1551 inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr);
1539 if (IS_ERR(inode)) { 1552 if (IS_ERR(inode)) {
1540 ret = PTR_ERR(inode); 1553 ret = PTR_ERR(inode);
1541 switch (ret) { 1554 switch (ret) {
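
Both nfs_atomic_lookup() and nfs_open_revalidate() now fold O_TRUNC into the iattr passed to ->open_context() (ATTR_SIZE with ia_size = 0, plus an nfs_wb_all() flush in the revalidate path, which previously just stripped O_TRUNC), so the truncation is carried out with the open itself. The attribute block both paths build:

/* what O_TRUNC turns into on these paths */
struct iattr attr = {
	.ia_valid = ATTR_SIZE,
	.ia_size  = 0,		/* truncate to zero as part of the open */
};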
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index ea5be1262d41..481be7f7bdd3 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -264,9 +264,7 @@ static void nfs_direct_read_release(void *calldata)
264} 264}
265 265
266static const struct rpc_call_ops nfs_read_direct_ops = { 266static const struct rpc_call_ops nfs_read_direct_ops = {
267#if defined(CONFIG_NFS_V4_1)
268 .rpc_call_prepare = nfs_read_prepare, 267 .rpc_call_prepare = nfs_read_prepare,
269#endif /* CONFIG_NFS_V4_1 */
270 .rpc_call_done = nfs_direct_read_result, 268 .rpc_call_done = nfs_direct_read_result,
271 .rpc_release = nfs_direct_read_release, 269 .rpc_release = nfs_direct_read_release,
272}; 270};
@@ -553,9 +551,7 @@ static void nfs_direct_commit_release(void *calldata)
553} 551}
554 552
555static const struct rpc_call_ops nfs_commit_direct_ops = { 553static const struct rpc_call_ops nfs_commit_direct_ops = {
556#if defined(CONFIG_NFS_V4_1)
557 .rpc_call_prepare = nfs_write_prepare, 554 .rpc_call_prepare = nfs_write_prepare,
558#endif /* CONFIG_NFS_V4_1 */
559 .rpc_call_done = nfs_direct_commit_result, 555 .rpc_call_done = nfs_direct_commit_result,
560 .rpc_release = nfs_direct_commit_release, 556 .rpc_release = nfs_direct_commit_release,
561}; 557};
@@ -695,9 +691,7 @@ out_unlock:
695} 691}
696 692
697static const struct rpc_call_ops nfs_write_direct_ops = { 693static const struct rpc_call_ops nfs_write_direct_ops = {
698#if defined(CONFIG_NFS_V4_1)
699 .rpc_call_prepare = nfs_write_prepare, 694 .rpc_call_prepare = nfs_write_prepare,
700#endif /* CONFIG_NFS_V4_1 */
701 .rpc_call_done = nfs_direct_write_result, 695 .rpc_call_done = nfs_direct_write_result,
702 .rpc_release = nfs_direct_write_release, 696 .rpc_release = nfs_direct_write_release,
703}; 697};
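
All three rpc_call_ops tables in the direct I/O path drop their CONFIG_NFS_V4_1 guards: .rpc_call_prepare is wired unconditionally, which presumably means nfs_read_prepare()/nfs_write_prepare() fall through to a plain rpc_call_start() when no session is involved. The resulting shape of each table:

static const struct rpc_call_ops nfs_read_direct_ops = {
	.rpc_call_prepare = nfs_read_prepare,		/* no longer v4.1-only */
	.rpc_call_done    = nfs_direct_read_result,
	.rpc_release      = nfs_direct_read_release,
};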
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index a6e711ad130f..b3924b8a6000 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -10,8 +10,9 @@
10 10
11#include <linux/sunrpc/clnt.h> 11#include <linux/sunrpc/clnt.h>
12#include <linux/dns_resolver.h> 12#include <linux/dns_resolver.h>
13#include "dns_resolve.h"
13 14
14ssize_t nfs_dns_resolve_name(char *name, size_t namelen, 15ssize_t nfs_dns_resolve_name(struct net *net, char *name, size_t namelen,
15 struct sockaddr *sa, size_t salen) 16 struct sockaddr *sa, size_t salen)
16{ 17{
17 ssize_t ret; 18 ssize_t ret;
@@ -20,7 +21,7 @@ ssize_t nfs_dns_resolve_name(char *name, size_t namelen,
20 21
21 ip_len = dns_query(NULL, name, namelen, NULL, &ip_addr, NULL); 22 ip_len = dns_query(NULL, name, namelen, NULL, &ip_addr, NULL);
22 if (ip_len > 0) 23 if (ip_len > 0)
23 ret = rpc_pton(ip_addr, ip_len, sa, salen); 24 ret = rpc_pton(net, ip_addr, ip_len, sa, salen);
24 else 25 else
25 ret = -ESRCH; 26 ret = -ESRCH;
26 kfree(ip_addr); 27 kfree(ip_addr);
@@ -40,15 +41,15 @@ ssize_t nfs_dns_resolve_name(char *name, size_t namelen,
40#include <linux/sunrpc/clnt.h> 41#include <linux/sunrpc/clnt.h>
41#include <linux/sunrpc/cache.h> 42#include <linux/sunrpc/cache.h>
42#include <linux/sunrpc/svcauth.h> 43#include <linux/sunrpc/svcauth.h>
44#include <linux/sunrpc/rpc_pipe_fs.h>
43 45
44#include "dns_resolve.h" 46#include "dns_resolve.h"
45#include "cache_lib.h" 47#include "cache_lib.h"
48#include "netns.h"
46 49
47#define NFS_DNS_HASHBITS 4 50#define NFS_DNS_HASHBITS 4
48#define NFS_DNS_HASHTBL_SIZE (1 << NFS_DNS_HASHBITS) 51#define NFS_DNS_HASHTBL_SIZE (1 << NFS_DNS_HASHBITS)
49 52
50static struct cache_head *nfs_dns_table[NFS_DNS_HASHTBL_SIZE];
51
52struct nfs_dns_ent { 53struct nfs_dns_ent {
53 struct cache_head h; 54 struct cache_head h;
54 55
@@ -224,7 +225,7 @@ static int nfs_dns_parse(struct cache_detail *cd, char *buf, int buflen)
224 len = qword_get(&buf, buf1, sizeof(buf1)); 225 len = qword_get(&buf, buf1, sizeof(buf1));
225 if (len <= 0) 226 if (len <= 0)
226 goto out; 227 goto out;
227 key.addrlen = rpc_pton(buf1, len, 228 key.addrlen = rpc_pton(cd->net, buf1, len,
228 (struct sockaddr *)&key.addr, 229 (struct sockaddr *)&key.addr,
229 sizeof(key.addr)); 230 sizeof(key.addr));
230 231
@@ -259,21 +260,6 @@ out:
259 return ret; 260 return ret;
260} 261}
261 262
262static struct cache_detail nfs_dns_resolve = {
263 .owner = THIS_MODULE,
264 .hash_size = NFS_DNS_HASHTBL_SIZE,
265 .hash_table = nfs_dns_table,
266 .name = "dns_resolve",
267 .cache_put = nfs_dns_ent_put,
268 .cache_upcall = nfs_dns_upcall,
269 .cache_parse = nfs_dns_parse,
270 .cache_show = nfs_dns_show,
271 .match = nfs_dns_match,
272 .init = nfs_dns_ent_init,
273 .update = nfs_dns_ent_update,
274 .alloc = nfs_dns_ent_alloc,
275};
276
277static int do_cache_lookup(struct cache_detail *cd, 263static int do_cache_lookup(struct cache_detail *cd,
278 struct nfs_dns_ent *key, 264 struct nfs_dns_ent *key,
279 struct nfs_dns_ent **item, 265 struct nfs_dns_ent **item,
@@ -336,8 +322,8 @@ out:
336 return ret; 322 return ret;
337} 323}
338 324
339ssize_t nfs_dns_resolve_name(char *name, size_t namelen, 325ssize_t nfs_dns_resolve_name(struct net *net, char *name,
340 struct sockaddr *sa, size_t salen) 326 size_t namelen, struct sockaddr *sa, size_t salen)
341{ 327{
342 struct nfs_dns_ent key = { 328 struct nfs_dns_ent key = {
343 .hostname = name, 329 .hostname = name,
@@ -345,28 +331,118 @@ ssize_t nfs_dns_resolve_name(char *name, size_t namelen,
345 }; 331 };
346 struct nfs_dns_ent *item = NULL; 332 struct nfs_dns_ent *item = NULL;
347 ssize_t ret; 333 ssize_t ret;
334 struct nfs_net *nn = net_generic(net, nfs_net_id);
348 335
349 ret = do_cache_lookup_wait(&nfs_dns_resolve, &key, &item); 336 ret = do_cache_lookup_wait(nn->nfs_dns_resolve, &key, &item);
350 if (ret == 0) { 337 if (ret == 0) {
351 if (salen >= item->addrlen) { 338 if (salen >= item->addrlen) {
352 memcpy(sa, &item->addr, item->addrlen); 339 memcpy(sa, &item->addr, item->addrlen);
353 ret = item->addrlen; 340 ret = item->addrlen;
354 } else 341 } else
355 ret = -EOVERFLOW; 342 ret = -EOVERFLOW;
356 cache_put(&item->h, &nfs_dns_resolve); 343 cache_put(&item->h, nn->nfs_dns_resolve);
357 } else if (ret == -ENOENT) 344 } else if (ret == -ENOENT)
358 ret = -ESRCH; 345 ret = -ESRCH;
359 return ret; 346 return ret;
360} 347}
361 348
349int nfs_dns_resolver_cache_init(struct net *net)
350{
351 int err = -ENOMEM;
352 struct nfs_net *nn = net_generic(net, nfs_net_id);
353 struct cache_detail *cd;
354 struct cache_head **tbl;
355
356 cd = kzalloc(sizeof(struct cache_detail), GFP_KERNEL);
357 if (cd == NULL)
358 goto err_cd;
359
360 tbl = kzalloc(NFS_DNS_HASHTBL_SIZE * sizeof(struct cache_head *),
361 GFP_KERNEL);
362 if (tbl == NULL)
363 goto err_tbl;
364
365 cd->owner = THIS_MODULE;
366 cd->hash_size = NFS_DNS_HASHTBL_SIZE;
367 cd->hash_table = tbl;
368 cd->name = "dns_resolve";
369 cd->cache_put = nfs_dns_ent_put;
370 cd->cache_upcall = nfs_dns_upcall;
371 cd->cache_parse = nfs_dns_parse;
372 cd->cache_show = nfs_dns_show;
373 cd->match = nfs_dns_match;
374 cd->init = nfs_dns_ent_init;
375 cd->update = nfs_dns_ent_update;
376 cd->alloc = nfs_dns_ent_alloc;
377
378 nfs_cache_init(cd);
379 err = nfs_cache_register_net(net, cd);
380 if (err)
381 goto err_reg;
382 nn->nfs_dns_resolve = cd;
383 return 0;
384
385err_reg:
386 nfs_cache_destroy(cd);
387 kfree(cd->hash_table);
388err_tbl:
389 kfree(cd);
390err_cd:
391 return err;
392}
393
394void nfs_dns_resolver_cache_destroy(struct net *net)
395{
396 struct nfs_net *nn = net_generic(net, nfs_net_id);
397 struct cache_detail *cd = nn->nfs_dns_resolve;
398
399 nfs_cache_unregister_net(net, cd);
400 nfs_cache_destroy(cd);
401 kfree(cd->hash_table);
402 kfree(cd);
403}
404
405static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
406 void *ptr)
407{
408 struct super_block *sb = ptr;
409 struct net *net = sb->s_fs_info;
410 struct nfs_net *nn = net_generic(net, nfs_net_id);
411 struct cache_detail *cd = nn->nfs_dns_resolve;
412 int ret = 0;
413
414 if (cd == NULL)
415 return 0;
416
417 if (!try_module_get(THIS_MODULE))
418 return 0;
419
420 switch (event) {
421 case RPC_PIPEFS_MOUNT:
422 ret = nfs_cache_register_sb(sb, cd);
423 break;
424 case RPC_PIPEFS_UMOUNT:
425 nfs_cache_unregister_sb(sb, cd);
426 break;
427 default:
428 ret = -ENOTSUPP;
429 break;
430 }
431 module_put(THIS_MODULE);
432 return ret;
433}
434
435static struct notifier_block nfs_dns_resolver_block = {
436 .notifier_call = rpc_pipefs_event,
437};
438
362int nfs_dns_resolver_init(void) 439int nfs_dns_resolver_init(void)
363{ 440{
364 return nfs_cache_register(&nfs_dns_resolve); 441 return rpc_pipefs_notifier_register(&nfs_dns_resolver_block);
365} 442}
366 443
367void nfs_dns_resolver_destroy(void) 444void nfs_dns_resolver_destroy(void)
368{ 445{
369 nfs_cache_unregister(&nfs_dns_resolve); 446 rpc_pipefs_notifier_unregister(&nfs_dns_resolver_block);
370} 447}
371
372#endif 448#endif
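
The DNS resolver cache stops being a single static cache_detail: nfs_dns_resolver_cache_init() allocates the detail and hash table per namespace and parks the pointer in nfs_net, while the module-wide init/destroy now only register and unregister an rpc_pipefs notifier so the cache's pipefs files track mounts of that superblock. Lookups then take the namespace explicitly; a usage sketch (hostname and caller are illustrative):

#include <linux/string.h>

static ssize_t example_resolve(struct net *net, struct sockaddr_storage *addr)
{
	char name[] = "server.example.com";	/* illustrative hostname */

	/* < 0 on failure: -ESRCH no record, -EOVERFLOW sockaddr buffer too small */
	return nfs_dns_resolve_name(net, name, strlen(name),
				    (struct sockaddr *)addr, sizeof(*addr));
}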
diff --git a/fs/nfs/dns_resolve.h b/fs/nfs/dns_resolve.h
index 199bb5543a91..2e4f596d2923 100644
--- a/fs/nfs/dns_resolve.h
+++ b/fs/nfs/dns_resolve.h
@@ -15,12 +15,22 @@ static inline int nfs_dns_resolver_init(void)
15 15
16static inline void nfs_dns_resolver_destroy(void) 16static inline void nfs_dns_resolver_destroy(void)
17{} 17{}
18
19static inline int nfs_dns_resolver_cache_init(struct net *net)
20{
21 return 0;
22}
23
24static inline void nfs_dns_resolver_cache_destroy(struct net *net)
25{}
18#else 26#else
19extern int nfs_dns_resolver_init(void); 27extern int nfs_dns_resolver_init(void);
20extern void nfs_dns_resolver_destroy(void); 28extern void nfs_dns_resolver_destroy(void);
29extern int nfs_dns_resolver_cache_init(struct net *net);
30extern void nfs_dns_resolver_cache_destroy(struct net *net);
21#endif 31#endif
22 32
23extern ssize_t nfs_dns_resolve_name(char *name, size_t namelen, 33extern ssize_t nfs_dns_resolve_name(struct net *net, char *name,
24 struct sockaddr *sa, size_t salen); 34 size_t namelen, struct sockaddr *sa, size_t salen);
25 35
26#endif 36#endif
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index a77a1f2da5d6..aa9b709fd328 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -529,6 +529,8 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
529 if (mapping != dentry->d_inode->i_mapping) 529 if (mapping != dentry->d_inode->i_mapping)
530 goto out_unlock; 530 goto out_unlock;
531 531
532 wait_on_page_writeback(page);
533
532 pagelen = nfs_page_length(page); 534 pagelen = nfs_page_length(page);
533 if (pagelen == 0) 535 if (pagelen == 0)
534 goto out_unlock; 536 goto out_unlock;
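
nfs_vm_page_mkwrite() now waits out any writeback still in flight on the page before allowing it to be redirtied; otherwise the mmap writer could modify data that an earlier WRITE is still transmitting. The ordering this enforces, sketched as a generic ->page_mkwrite:

#include <linux/mm.h>
#include <linux/pagemap.h>

static int example_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct page *page = vmf->page;

	lock_page(page);
	/* ... check that page->mapping still matches the file ... */
	wait_on_page_writeback(page);	/* do not touch pages with I/O outstanding */
	/* ... dirty the page ... */
	return VM_FAULT_LOCKED;		/* page is returned locked */
}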
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index 419119c371bf..ae65c16b3670 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -327,7 +327,7 @@ void nfs_fscache_reset_inode_cookie(struct inode *inode)
327{ 327{
328 struct nfs_inode *nfsi = NFS_I(inode); 328 struct nfs_inode *nfsi = NFS_I(inode);
329 struct nfs_server *nfss = NFS_SERVER(inode); 329 struct nfs_server *nfss = NFS_SERVER(inode);
330 struct fscache_cookie *old = nfsi->fscache; 330 NFS_IFDEBUG(struct fscache_cookie *old = nfsi->fscache);
331 331
332 nfs_fscache_inode_lock(inode); 332 nfs_fscache_inode_lock(inode);
333 if (nfsi->fscache) { 333 if (nfsi->fscache) {
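
The cookie pointer is only consumed by debug printout, so its declaration is wrapped in NFS_IFDEBUG to avoid a set-but-unused variable warning in non-debug builds. The macro presumably expands along these lines:

/* sketch of the NFS debug helper this relies on */
#ifdef NFS_DEBUG
# define NFS_IFDEBUG(x)		x
#else
# define NFS_IFDEBUG(x)
#endif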
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index a1bbf7780dfc..b7f348bb618b 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -34,11 +34,29 @@
34 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 */ 35 */
36#include <linux/types.h> 36#include <linux/types.h>
37#include <linux/string.h> 37#include <linux/parser.h>
38#include <linux/kernel.h> 38#include <linux/fs.h>
39#include <linux/slab.h>
40#include <linux/nfs_idmap.h> 39#include <linux/nfs_idmap.h>
40#include <net/net_namespace.h>
41#include <linux/sunrpc/rpc_pipe_fs.h>
41#include <linux/nfs_fs.h> 42#include <linux/nfs_fs.h>
43#include <linux/nfs_fs_sb.h>
44#include <linux/key.h>
45#include <linux/keyctl.h>
46#include <linux/key-type.h>
47#include <keys/user-type.h>
48#include <linux/module.h>
49
50#include "internal.h"
51#include "netns.h"
52
53#define NFS_UINT_MAXLEN 11
54
55/* Default cache timeout is 10 minutes */
56unsigned int nfs_idmap_cache_timeout = 600;
57static const struct cred *id_resolver_cache;
58static struct key_type key_type_id_resolver_legacy;
59
42 60
43/** 61/**
44 * nfs_fattr_init_names - initialise the nfs_fattr owner_name/group_name fields 62 * nfs_fattr_init_names - initialise the nfs_fattr owner_name/group_name fields
@@ -142,24 +160,7 @@ static int nfs_map_numeric_to_string(__u32 id, char *buf, size_t buflen)
142 return snprintf(buf, buflen, "%u", id); 160 return snprintf(buf, buflen, "%u", id);
143} 161}
144 162
145#ifdef CONFIG_NFS_USE_NEW_IDMAPPER 163static struct key_type key_type_id_resolver = {
146
147#include <linux/cred.h>
148#include <linux/sunrpc/sched.h>
149#include <linux/nfs4.h>
150#include <linux/nfs_fs_sb.h>
151#include <linux/keyctl.h>
152#include <linux/key-type.h>
153#include <linux/rcupdate.h>
154#include <linux/err.h>
155
156#include <keys/user-type.h>
157
158#define NFS_UINT_MAXLEN 11
159
160const struct cred *id_resolver_cache;
161
162struct key_type key_type_id_resolver = {
163 .name = "id_resolver", 164 .name = "id_resolver",
164 .instantiate = user_instantiate, 165 .instantiate = user_instantiate,
165 .match = user_match, 166 .match = user_match,
@@ -169,13 +170,14 @@ struct key_type key_type_id_resolver = {
169 .read = user_read, 170 .read = user_read,
170}; 171};
171 172
172int nfs_idmap_init(void) 173static int nfs_idmap_init_keyring(void)
173{ 174{
174 struct cred *cred; 175 struct cred *cred;
175 struct key *keyring; 176 struct key *keyring;
176 int ret = 0; 177 int ret = 0;
177 178
178 printk(KERN_NOTICE "Registering the %s key type\n", key_type_id_resolver.name); 179 printk(KERN_NOTICE "NFS: Registering the %s key type\n",
180 key_type_id_resolver.name);
179 181
180 cred = prepare_kernel_cred(NULL); 182 cred = prepare_kernel_cred(NULL);
181 if (!cred) 183 if (!cred)
@@ -211,7 +213,7 @@ failed_put_cred:
211 return ret; 213 return ret;
212} 214}
213 215
214void nfs_idmap_quit(void) 216static void nfs_idmap_quit_keyring(void)
215{ 217{
216 key_revoke(id_resolver_cache->thread_keyring); 218 key_revoke(id_resolver_cache->thread_keyring);
217 unregister_key_type(&key_type_id_resolver); 219 unregister_key_type(&key_type_id_resolver);
@@ -246,8 +248,10 @@ static ssize_t nfs_idmap_get_desc(const char *name, size_t namelen,
246 return desclen; 248 return desclen;
247} 249}
248 250
249static ssize_t nfs_idmap_request_key(const char *name, size_t namelen, 251static ssize_t nfs_idmap_request_key(struct key_type *key_type,
250 const char *type, void *data, size_t data_size) 252 const char *name, size_t namelen,
253 const char *type, void *data,
254 size_t data_size, struct idmap *idmap)
251{ 255{
252 const struct cred *saved_cred; 256 const struct cred *saved_cred;
253 struct key *rkey; 257 struct key *rkey;
@@ -260,8 +264,12 @@ static ssize_t nfs_idmap_request_key(const char *name, size_t namelen,
260 goto out; 264 goto out;
261 265
262 saved_cred = override_creds(id_resolver_cache); 266 saved_cred = override_creds(id_resolver_cache);
263 rkey = request_key(&key_type_id_resolver, desc, ""); 267 if (idmap)
268 rkey = request_key_with_auxdata(key_type, desc, "", 0, idmap);
269 else
270 rkey = request_key(&key_type_id_resolver, desc, "");
264 revert_creds(saved_cred); 271 revert_creds(saved_cred);
272
265 kfree(desc); 273 kfree(desc);
266 if (IS_ERR(rkey)) { 274 if (IS_ERR(rkey)) {
267 ret = PTR_ERR(rkey); 275 ret = PTR_ERR(rkey);
@@ -294,31 +302,46 @@ out:
294 return ret; 302 return ret;
295} 303}
296 304
305static ssize_t nfs_idmap_get_key(const char *name, size_t namelen,
306 const char *type, void *data,
307 size_t data_size, struct idmap *idmap)
308{
309 ssize_t ret = nfs_idmap_request_key(&key_type_id_resolver,
310 name, namelen, type, data,
311 data_size, NULL);
312 if (ret < 0) {
313 ret = nfs_idmap_request_key(&key_type_id_resolver_legacy,
314 name, namelen, type, data,
315 data_size, idmap);
316 }
317 return ret;
318}
297 319
298/* ID -> Name */ 320/* ID -> Name */
299static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, char *buf, size_t buflen) 321static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, char *buf,
322 size_t buflen, struct idmap *idmap)
300{ 323{
301 char id_str[NFS_UINT_MAXLEN]; 324 char id_str[NFS_UINT_MAXLEN];
302 int id_len; 325 int id_len;
303 ssize_t ret; 326 ssize_t ret;
304 327
305 id_len = snprintf(id_str, sizeof(id_str), "%u", id); 328 id_len = snprintf(id_str, sizeof(id_str), "%u", id);
306 ret = nfs_idmap_request_key(id_str, id_len, type, buf, buflen); 329 ret = nfs_idmap_get_key(id_str, id_len, type, buf, buflen, idmap);
307 if (ret < 0) 330 if (ret < 0)
308 return -EINVAL; 331 return -EINVAL;
309 return ret; 332 return ret;
310} 333}
311 334
312/* Name -> ID */ 335/* Name -> ID */
313static int nfs_idmap_lookup_id(const char *name, size_t namelen, 336static int nfs_idmap_lookup_id(const char *name, size_t namelen, const char *type,
314 const char *type, __u32 *id) 337 __u32 *id, struct idmap *idmap)
315{ 338{
316 char id_str[NFS_UINT_MAXLEN]; 339 char id_str[NFS_UINT_MAXLEN];
317 long id_long; 340 long id_long;
318 ssize_t data_size; 341 ssize_t data_size;
319 int ret = 0; 342 int ret = 0;
320 343
321 data_size = nfs_idmap_request_key(name, namelen, type, id_str, NFS_UINT_MAXLEN); 344 data_size = nfs_idmap_get_key(name, namelen, type, id_str, NFS_UINT_MAXLEN, idmap);
322 if (data_size <= 0) { 345 if (data_size <= 0) {
323 ret = -EINVAL; 346 ret = -EINVAL;
324 } else { 347 } else {
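
nfs_idmap_get_key() tries the plain id_resolver key type first (served from userspace by nfs.idmap through /sbin/request-key) and falls back to the legacy key type, whose ->request_key hook raises the old rpc_pipefs upcall to rpc.idmapd. The first path needs a request-key rule installed; a typical /etc/request-key.conf entry (the program path and timeout are site configuration, not taken from this diff):

#OP	TYPE		DESCRIPTION	CALLOUT INFO	PROGRAM ARG1 ARG2 ...
create	id_resolver	*		*		/usr/sbin/nfs.idmap %k %t 600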
@@ -328,114 +351,103 @@ static int nfs_idmap_lookup_id(const char *name, size_t namelen,
328 return ret; 351 return ret;
329} 352}
330 353
331int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid) 354/* idmap classic begins here */
332{ 355module_param(nfs_idmap_cache_timeout, int, 0644);
333 if (nfs_map_string_to_numeric(name, namelen, uid))
334 return 0;
335 return nfs_idmap_lookup_id(name, namelen, "uid", uid);
336}
337
338int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *gid)
339{
340 if (nfs_map_string_to_numeric(name, namelen, gid))
341 return 0;
342 return nfs_idmap_lookup_id(name, namelen, "gid", gid);
343}
344
345int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen)
346{
347 int ret = -EINVAL;
348
349 if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
350 ret = nfs_idmap_lookup_name(uid, "user", buf, buflen);
351 if (ret < 0)
352 ret = nfs_map_numeric_to_string(uid, buf, buflen);
353 return ret;
354}
355int nfs_map_gid_to_group(const struct nfs_server *server, __u32 gid, char *buf, size_t buflen)
356{
357 int ret = -EINVAL;
358 356
359 if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) 357struct idmap {
360 ret = nfs_idmap_lookup_name(gid, "group", buf, buflen); 358 struct rpc_pipe *idmap_pipe;
361 if (ret < 0) 359 struct key_construction *idmap_key_cons;
362 ret = nfs_map_numeric_to_string(gid, buf, buflen);
363 return ret;
364}
365
366#else /* CONFIG_NFS_USE_NEW_IDMAPPER not defined */
367
368#include <linux/module.h>
369#include <linux/mutex.h>
370#include <linux/init.h>
371#include <linux/socket.h>
372#include <linux/in.h>
373#include <linux/sched.h>
374#include <linux/sunrpc/clnt.h>
375#include <linux/workqueue.h>
376#include <linux/sunrpc/rpc_pipe_fs.h>
377
378#include <linux/nfs_fs.h>
379
380#include "nfs4_fs.h"
381
382#define IDMAP_HASH_SZ 128
383
384/* Default cache timeout is 10 minutes */
385unsigned int nfs_idmap_cache_timeout = 600 * HZ;
386
387static int param_set_idmap_timeout(const char *val, struct kernel_param *kp)
388{
389 char *endp;
390 int num = simple_strtol(val, &endp, 0);
391 int jif = num * HZ;
392 if (endp == val || *endp || num < 0 || jif < num)
393 return -EINVAL;
394 *((int *)kp->arg) = jif;
395 return 0;
396}
397
398module_param_call(idmap_cache_timeout, param_set_idmap_timeout, param_get_int,
399 &nfs_idmap_cache_timeout, 0644);
400
401struct idmap_hashent {
402 unsigned long ih_expires;
403 __u32 ih_id;
404 size_t ih_namelen;
405 char ih_name[IDMAP_NAMESZ];
406}; 360};
407 361
408struct idmap_hashtable { 362enum {
409 __u8 h_type; 363 Opt_find_uid, Opt_find_gid, Opt_find_user, Opt_find_group, Opt_find_err
410 struct idmap_hashent h_entries[IDMAP_HASH_SZ];
411}; 364};
412 365
413struct idmap { 366static const match_table_t nfs_idmap_tokens = {
414 struct dentry *idmap_dentry; 367 { Opt_find_uid, "uid:%s" },
415 wait_queue_head_t idmap_wq; 368 { Opt_find_gid, "gid:%s" },
416 struct idmap_msg idmap_im; 369 { Opt_find_user, "user:%s" },
417 struct mutex idmap_lock; /* Serializes upcalls */ 370 { Opt_find_group, "group:%s" },
418 struct mutex idmap_im_lock; /* Protects the hashtable */ 371 { Opt_find_err, NULL }
419 struct idmap_hashtable idmap_user_hash;
420 struct idmap_hashtable idmap_group_hash;
421}; 372};
422 373
374static int nfs_idmap_legacy_upcall(struct key_construction *, const char *, void *);
423static ssize_t idmap_pipe_downcall(struct file *, const char __user *, 375static ssize_t idmap_pipe_downcall(struct file *, const char __user *,
424 size_t); 376 size_t);
425static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *); 377static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *);
426 378
427static unsigned int fnvhash32(const void *, size_t);
428
429static const struct rpc_pipe_ops idmap_upcall_ops = { 379static const struct rpc_pipe_ops idmap_upcall_ops = {
430 .upcall = rpc_pipe_generic_upcall, 380 .upcall = rpc_pipe_generic_upcall,
431 .downcall = idmap_pipe_downcall, 381 .downcall = idmap_pipe_downcall,
432 .destroy_msg = idmap_pipe_destroy_msg, 382 .destroy_msg = idmap_pipe_destroy_msg,
433}; 383};
434 384
385static struct key_type key_type_id_resolver_legacy = {
386 .name = "id_resolver",
387 .instantiate = user_instantiate,
388 .match = user_match,
389 .revoke = user_revoke,
390 .destroy = user_destroy,
391 .describe = user_describe,
392 .read = user_read,
393 .request_key = nfs_idmap_legacy_upcall,
394};
395
396static void __nfs_idmap_unregister(struct rpc_pipe *pipe)
397{
398 if (pipe->dentry)
399 rpc_unlink(pipe->dentry);
400}
401
402static int __nfs_idmap_register(struct dentry *dir,
403 struct idmap *idmap,
404 struct rpc_pipe *pipe)
405{
406 struct dentry *dentry;
407
408 dentry = rpc_mkpipe_dentry(dir, "idmap", idmap, pipe);
409 if (IS_ERR(dentry))
410 return PTR_ERR(dentry);
411 pipe->dentry = dentry;
412 return 0;
413}
414
415static void nfs_idmap_unregister(struct nfs_client *clp,
416 struct rpc_pipe *pipe)
417{
418 struct net *net = clp->net;
419 struct super_block *pipefs_sb;
420
421 pipefs_sb = rpc_get_sb_net(net);
422 if (pipefs_sb) {
423 __nfs_idmap_unregister(pipe);
424 rpc_put_sb_net(net);
425 }
426}
427
428static int nfs_idmap_register(struct nfs_client *clp,
429 struct idmap *idmap,
430 struct rpc_pipe *pipe)
431{
432 struct net *net = clp->net;
433 struct super_block *pipefs_sb;
434 int err = 0;
435
436 pipefs_sb = rpc_get_sb_net(net);
437 if (pipefs_sb) {
438 if (clp->cl_rpcclient->cl_dentry)
439 err = __nfs_idmap_register(clp->cl_rpcclient->cl_dentry,
440 idmap, pipe);
441 rpc_put_sb_net(net);
442 }
443 return err;
444}
445
435int 446int
436nfs_idmap_new(struct nfs_client *clp) 447nfs_idmap_new(struct nfs_client *clp)
437{ 448{
438 struct idmap *idmap; 449 struct idmap *idmap;
450 struct rpc_pipe *pipe;
439 int error; 451 int error;
440 452
441 BUG_ON(clp->cl_idmap != NULL); 453 BUG_ON(clp->cl_idmap != NULL);
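
Buried in the hunk above is a module parameter change: the old code exposed idmap_cache_timeout through a custom setter that converted seconds to jiffies on store, while the new code exports nfs_idmap_cache_timeout in plain seconds (default 600), since the value is now handed to the keys facility, which expects seconds. The visible option name changes with it, so a modprobe configuration would need updating (illustrative):

# /etc/modprobe.d/nfs.conf (illustrative)
# before: options nfs idmap_cache_timeout=1200
# after:  options nfs nfs_idmap_cache_timeout=1200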
@@ -444,19 +456,19 @@ nfs_idmap_new(struct nfs_client *clp)
444 if (idmap == NULL) 456 if (idmap == NULL)
445 return -ENOMEM; 457 return -ENOMEM;
446 458
447 idmap->idmap_dentry = rpc_mkpipe(clp->cl_rpcclient->cl_path.dentry, 459 pipe = rpc_mkpipe_data(&idmap_upcall_ops, 0);
448 "idmap", idmap, &idmap_upcall_ops, 0); 460 if (IS_ERR(pipe)) {
449 if (IS_ERR(idmap->idmap_dentry)) { 461 error = PTR_ERR(pipe);
450 error = PTR_ERR(idmap->idmap_dentry);
451 kfree(idmap); 462 kfree(idmap);
452 return error; 463 return error;
453 } 464 }
454 465 error = nfs_idmap_register(clp, idmap, pipe);
455 mutex_init(&idmap->idmap_lock); 466 if (error) {
456 mutex_init(&idmap->idmap_im_lock); 467 rpc_destroy_pipe_data(pipe);
457 init_waitqueue_head(&idmap->idmap_wq); 468 kfree(idmap);
458 idmap->idmap_user_hash.h_type = IDMAP_TYPE_USER; 469 return error;
459 idmap->idmap_group_hash.h_type = IDMAP_TYPE_GROUP; 470 }
471 idmap->idmap_pipe = pipe;
460 472
461 clp->cl_idmap = idmap; 473 clp->cl_idmap = idmap;
462 return 0; 474 return 0;
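
nfs_idmap_new() now splits the pipe into data and dentry: rpc_mkpipe_data() allocates state that lives as long as the nfs_client, while nfs_idmap_register() creates the pipefs dentry only if the pipefs superblock is currently mounted; the notifier below re-creates it on RPC_PIPEFS_MOUNT. Condensed from the surrounding hunks, the lifetime pairing is:

/* condensed from nfs_idmap_new()/nfs_idmap_delete() above and below */
pipe = rpc_mkpipe_data(&idmap_upcall_ops, 0);	/* lives with the nfs_client */
error = nfs_idmap_register(clp, idmap, pipe);	/* dentry exists only while pipefs is mounted */
/* ... */
nfs_idmap_unregister(clp, idmap->idmap_pipe);	/* rpc_unlink() the dentry, if any */
rpc_destroy_pipe_data(idmap->idmap_pipe);	/* then release the pipe data */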
@@ -469,211 +481,220 @@ nfs_idmap_delete(struct nfs_client *clp)
469 481
470 if (!idmap) 482 if (!idmap)
471 return; 483 return;
472 rpc_unlink(idmap->idmap_dentry); 484 nfs_idmap_unregister(clp, idmap->idmap_pipe);
485 rpc_destroy_pipe_data(idmap->idmap_pipe);
473 clp->cl_idmap = NULL; 486 clp->cl_idmap = NULL;
474 kfree(idmap); 487 kfree(idmap);
475} 488}
476 489
477/* 490static int __rpc_pipefs_event(struct nfs_client *clp, unsigned long event,
478 * Helper routines for manipulating the hashtable 491 struct super_block *sb)
479 */
480static inline struct idmap_hashent *
481idmap_name_hash(struct idmap_hashtable* h, const char *name, size_t len)
482{
483 return &h->h_entries[fnvhash32(name, len) % IDMAP_HASH_SZ];
484}
485
486static struct idmap_hashent *
487idmap_lookup_name(struct idmap_hashtable *h, const char *name, size_t len)
488{ 492{
489 struct idmap_hashent *he = idmap_name_hash(h, name, len); 493 int err = 0;
490 494
491 if (he->ih_namelen != len || memcmp(he->ih_name, name, len) != 0) 495 switch (event) {
492 return NULL; 496 case RPC_PIPEFS_MOUNT:
493 if (time_after(jiffies, he->ih_expires)) 497 BUG_ON(clp->cl_rpcclient->cl_dentry == NULL);
494 return NULL; 498 err = __nfs_idmap_register(clp->cl_rpcclient->cl_dentry,
495 return he; 499 clp->cl_idmap,
500 clp->cl_idmap->idmap_pipe);
501 break;
502 case RPC_PIPEFS_UMOUNT:
503 if (clp->cl_idmap->idmap_pipe) {
504 struct dentry *parent;
505
506 parent = clp->cl_idmap->idmap_pipe->dentry->d_parent;
507 __nfs_idmap_unregister(clp->cl_idmap->idmap_pipe);
508 /*
509 * Note: This is a dirty hack. SUNRPC hook has been
510 * called already but simple_rmdir() call for the
511 * directory returned with error because of idmap pipe
512 * inside. Thus now we have to remove this directory
513 * here.
514 */
515 if (rpc_rmdir(parent))
516 printk(KERN_ERR "NFS: %s: failed to remove "
517 "clnt dir!\n", __func__);
518 }
519 break;
520 default:
521 printk(KERN_ERR "NFS: %s: unknown event: %ld\n", __func__,
522 event);
523 return -ENOTSUPP;
524 }
525 return err;
526}
527
528static struct nfs_client *nfs_get_client_for_event(struct net *net, int event)
529{
530 struct nfs_net *nn = net_generic(net, nfs_net_id);
531 struct dentry *cl_dentry;
532 struct nfs_client *clp;
533
534 spin_lock(&nn->nfs_client_lock);
535 list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) {
536 if (clp->rpc_ops != &nfs_v4_clientops)
537 continue;
538 cl_dentry = clp->cl_idmap->idmap_pipe->dentry;
539 if (((event == RPC_PIPEFS_MOUNT) && cl_dentry) ||
540 ((event == RPC_PIPEFS_UMOUNT) && !cl_dentry))
541 continue;
542 atomic_inc(&clp->cl_count);
543 spin_unlock(&nn->nfs_client_lock);
544 return clp;
545 }
546 spin_unlock(&nn->nfs_client_lock);
547 return NULL;
496} 548}
497 549
498static inline struct idmap_hashent * 550static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
499idmap_id_hash(struct idmap_hashtable* h, __u32 id) 551 void *ptr)
500{ 552{
501 return &h->h_entries[fnvhash32(&id, sizeof(id)) % IDMAP_HASH_SZ]; 553 struct super_block *sb = ptr;
502} 554 struct nfs_client *clp;
555 int error = 0;
503 556
504static struct idmap_hashent * 557 while ((clp = nfs_get_client_for_event(sb->s_fs_info, event))) {
505idmap_lookup_id(struct idmap_hashtable *h, __u32 id) 558 error = __rpc_pipefs_event(clp, event, sb);
506{ 559 nfs_put_client(clp);
507 struct idmap_hashent *he = idmap_id_hash(h, id); 560 if (error)
508 if (he->ih_id != id || he->ih_namelen == 0) 561 break;
509 return NULL; 562 }
510 if (time_after(jiffies, he->ih_expires)) 563 return error;
511 return NULL;
512 return he;
513} 564}
514 565
515/* 566#define PIPEFS_NFS_PRIO 1
516 * Routines for allocating new entries in the hashtable. 567
517 * For now, we just have 1 entry per bucket, so it's all 568static struct notifier_block nfs_idmap_block = {
518 * pretty trivial. 569 .notifier_call = rpc_pipefs_event,
519 */ 570 .priority = SUNRPC_PIPEFS_NFS_PRIO,
520static inline struct idmap_hashent * 571};
521idmap_alloc_name(struct idmap_hashtable *h, char *name, size_t len)
522{
523 return idmap_name_hash(h, name, len);
524}
525 572
526static inline struct idmap_hashent * 573int nfs_idmap_init(void)
527idmap_alloc_id(struct idmap_hashtable *h, __u32 id)
528{ 574{
529 return idmap_id_hash(h, id); 575 int ret;
576 ret = nfs_idmap_init_keyring();
577 if (ret != 0)
578 goto out;
579 ret = rpc_pipefs_notifier_register(&nfs_idmap_block);
580 if (ret != 0)
581 nfs_idmap_quit_keyring();
582out:
583 return ret;
530} 584}
531 585
532static void 586void nfs_idmap_quit(void)
533idmap_update_entry(struct idmap_hashent *he, const char *name,
534 size_t namelen, __u32 id)
535{ 587{
536 he->ih_id = id; 588 rpc_pipefs_notifier_unregister(&nfs_idmap_block);
537 memcpy(he->ih_name, name, namelen); 589 nfs_idmap_quit_keyring();
538 he->ih_name[namelen] = '\0';
539 he->ih_namelen = namelen;
540 he->ih_expires = jiffies + nfs_idmap_cache_timeout;
541} 590}
542 591
543/* 592static int nfs_idmap_prepare_message(char *desc, struct idmap_msg *im,
544 * Name -> ID 593 struct rpc_pipe_msg *msg)
545 */
546static int
547nfs_idmap_id(struct idmap *idmap, struct idmap_hashtable *h,
548 const char *name, size_t namelen, __u32 *id)
549{ 594{
550 struct rpc_pipe_msg msg; 595 substring_t substr;
551 struct idmap_msg *im; 596 int token, ret;
552 struct idmap_hashent *he;
553 DECLARE_WAITQUEUE(wq, current);
554 int ret = -EIO;
555
556 im = &idmap->idmap_im;
557
558 /*
559 * String sanity checks
560 * Note that the userland daemon expects NUL terminated strings
561 */
562 for (;;) {
563 if (namelen == 0)
564 return -EINVAL;
565 if (name[namelen-1] != '\0')
566 break;
567 namelen--;
568 }
569 if (namelen >= IDMAP_NAMESZ)
570 return -EINVAL;
571 597
572 mutex_lock(&idmap->idmap_lock); 598 memset(im, 0, sizeof(*im));
573 mutex_lock(&idmap->idmap_im_lock); 599 memset(msg, 0, sizeof(*msg));
574
575 he = idmap_lookup_name(h, name, namelen);
576 if (he != NULL) {
577 *id = he->ih_id;
578 ret = 0;
579 goto out;
580 }
581 600
582 memset(im, 0, sizeof(*im)); 601 im->im_type = IDMAP_TYPE_GROUP;
583 memcpy(im->im_name, name, namelen); 602 token = match_token(desc, nfs_idmap_tokens, &substr);
584 603
585 im->im_type = h->h_type; 604 switch (token) {
586 im->im_conv = IDMAP_CONV_NAMETOID; 605 case Opt_find_uid:
606 im->im_type = IDMAP_TYPE_USER;
607 case Opt_find_gid:
608 im->im_conv = IDMAP_CONV_NAMETOID;
609 ret = match_strlcpy(im->im_name, &substr, IDMAP_NAMESZ);
610 break;
587 611
588 memset(&msg, 0, sizeof(msg)); 612 case Opt_find_user:
589 msg.data = im; 613 im->im_type = IDMAP_TYPE_USER;
590 msg.len = sizeof(*im); 614 case Opt_find_group:
615 im->im_conv = IDMAP_CONV_IDTONAME;
616 ret = match_int(&substr, &im->im_id);
617 break;
591 618
592 add_wait_queue(&idmap->idmap_wq, &wq); 619 default:
593 if (rpc_queue_upcall(idmap->idmap_dentry->d_inode, &msg) < 0) { 620 ret = -EINVAL;
594 remove_wait_queue(&idmap->idmap_wq, &wq);
595 goto out; 621 goto out;
596 } 622 }
597 623
598 set_current_state(TASK_UNINTERRUPTIBLE); 624 msg->data = im;
599 mutex_unlock(&idmap->idmap_im_lock); 625 msg->len = sizeof(struct idmap_msg);
600 schedule();
601 __set_current_state(TASK_RUNNING);
602 remove_wait_queue(&idmap->idmap_wq, &wq);
603 mutex_lock(&idmap->idmap_im_lock);
604 626
605 if (im->im_status & IDMAP_STATUS_SUCCESS) { 627out:
606 *id = im->im_id;
607 ret = 0;
608 }
609
610 out:
611 memset(im, 0, sizeof(*im));
612 mutex_unlock(&idmap->idmap_im_lock);
613 mutex_unlock(&idmap->idmap_lock);
614 return ret; 628 return ret;
615} 629}
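
The switch above relies on deliberate fall-through: Opt_find_uid sets im_type and then shares Opt_find_gid's NAMETOID branch, and Opt_find_user likewise shares Opt_find_group's IDTONAME branch. The token table being matched lives in an earlier hunk of idmap.c; by assumption it has roughly this shape, which also shows what the key descriptions look like (the example values are illustrative only):

	/* Assumed shape of nfs_idmap_tokens (defined in an earlier hunk). */
	static const match_table_t nfs_idmap_tokens = {
		{ Opt_find_uid,   "uid:%s"   },	/* e.g. "uid:bob"   -> name-to-id */
		{ Opt_find_gid,   "gid:%s"   },	/* e.g. "gid:staff" -> name-to-id */
		{ Opt_find_user,  "user:%d"  },	/* e.g. "user:1000" -> id-to-name */
		{ Opt_find_group, "group:%d" },	/* e.g. "group:50"  -> id-to-name */
		{ Opt_find_err,   NULL       }
	};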
616 630
617/* 631static int nfs_idmap_legacy_upcall(struct key_construction *cons,
618 * ID -> Name 632 const char *op,
619 */ 633 void *aux)
620static int
621nfs_idmap_name(struct idmap *idmap, struct idmap_hashtable *h,
622 __u32 id, char *name)
623{ 634{
624 struct rpc_pipe_msg msg; 635 struct rpc_pipe_msg *msg;
625 struct idmap_msg *im; 636 struct idmap_msg *im;
626 struct idmap_hashent *he; 637 struct idmap *idmap = (struct idmap *)aux;
627 DECLARE_WAITQUEUE(wq, current); 638 struct key *key = cons->key;
628 int ret = -EIO; 639 int ret;
629 unsigned int len;
630
631 im = &idmap->idmap_im;
632 640
633 mutex_lock(&idmap->idmap_lock); 641 /* msg and im are freed in idmap_pipe_destroy_msg */
634 mutex_lock(&idmap->idmap_im_lock); 642 msg = kmalloc(sizeof(*msg), GFP_KERNEL);
643 if (msg == NULL) {
644 ret = -ENOMEM;
645 goto out0;
646 }
635 647
636 he = idmap_lookup_id(h, id); 648 im = kmalloc(sizeof(*im), GFP_KERNEL);
637 if (he) { 649 if (im == NULL) {
638 memcpy(name, he->ih_name, he->ih_namelen); 650 ret = -ENOMEM;
639 ret = he->ih_namelen; 651 goto out1;
640 goto out;
641 } 652 }
642 653
643 memset(im, 0, sizeof(*im)); 654 ret = nfs_idmap_prepare_message(key->description, im, msg);
644 im->im_type = h->h_type; 655 if (ret < 0)
645 im->im_conv = IDMAP_CONV_IDTONAME; 656 goto out2;
646 im->im_id = id;
647 657
648 memset(&msg, 0, sizeof(msg)); 658 idmap->idmap_key_cons = cons;
649 msg.data = im;
650 msg.len = sizeof(*im);
651 659
652 add_wait_queue(&idmap->idmap_wq, &wq); 660 ret = rpc_queue_upcall(idmap->idmap_pipe, msg);
661 if (ret < 0)
662 goto out2;
653 663
654 if (rpc_queue_upcall(idmap->idmap_dentry->d_inode, &msg) < 0) { 664 return ret;
655 remove_wait_queue(&idmap->idmap_wq, &wq); 665
656 goto out; 666out2:
657 } 667 kfree(im);
668out1:
669 kfree(msg);
670out0:
671 key_revoke(cons->key);
672 key_revoke(cons->authkey);
673 return ret;
674}
675
676static int nfs_idmap_instantiate(struct key *key, struct key *authkey, char *data)
677{
678 return key_instantiate_and_link(key, data, strlen(data) + 1,
679 id_resolver_cache->thread_keyring,
680 authkey);
681}
658 682
659 set_current_state(TASK_UNINTERRUPTIBLE); 683static int nfs_idmap_read_message(struct idmap_msg *im, struct key *key, struct key *authkey)
660 mutex_unlock(&idmap->idmap_im_lock); 684{
661 schedule(); 685 char id_str[NFS_UINT_MAXLEN];
662 __set_current_state(TASK_RUNNING); 686 int ret = -EINVAL;
663 remove_wait_queue(&idmap->idmap_wq, &wq); 687
664 mutex_lock(&idmap->idmap_im_lock); 688 switch (im->im_conv) {
665 689 case IDMAP_CONV_NAMETOID:
666 if (im->im_status & IDMAP_STATUS_SUCCESS) { 690 sprintf(id_str, "%d", im->im_id);
667 if ((len = strnlen(im->im_name, IDMAP_NAMESZ)) == 0) 691 ret = nfs_idmap_instantiate(key, authkey, id_str);
668 goto out; 692 break;
669 memcpy(name, im->im_name, len); 693 case IDMAP_CONV_IDTONAME:
670 ret = len; 694 ret = nfs_idmap_instantiate(key, authkey, im->im_name);
695 break;
671 } 696 }
672 697
673 out:
674 memset(im, 0, sizeof(*im));
675 mutex_unlock(&idmap->idmap_im_lock);
676 mutex_unlock(&idmap->idmap_lock);
677 return ret; 698 return ret;
678} 699}
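
After nfs_idmap_instantiate() runs, the key payload is a NUL-terminated string: a decimal id for NAMETOID replies, or the name itself for IDTONAME. A hedged sketch of how a consumer would parse the numeric case (the real consumer, nfs_idmap_lookup_id(), is referenced later in this patch but its body is outside this hunk):

	/* Plausible parse of a NAMETOID payload; not the actual helper. */
	static int example_parse_id(const char *payload, __u32 *id)
	{
		unsigned long val;

		if (kstrtoul(payload, 10, &val) != 0 || val > 0xffffffffUL)
			return -EINVAL;
		*id = (__u32)val;
		return 0;
	}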
679 700
@@ -682,115 +703,51 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
682{ 703{
683 struct rpc_inode *rpci = RPC_I(filp->f_path.dentry->d_inode); 704 struct rpc_inode *rpci = RPC_I(filp->f_path.dentry->d_inode);
684 struct idmap *idmap = (struct idmap *)rpci->private; 705 struct idmap *idmap = (struct idmap *)rpci->private;
685 struct idmap_msg im_in, *im = &idmap->idmap_im; 706 struct key_construction *cons = idmap->idmap_key_cons;
686 struct idmap_hashtable *h; 707 struct idmap_msg im;
687 struct idmap_hashent *he = NULL;
688 size_t namelen_in; 708 size_t namelen_in;
689 int ret; 709 int ret;
690 710
691 if (mlen != sizeof(im_in)) 711 if (mlen != sizeof(im)) {
692 return -ENOSPC; 712 ret = -ENOSPC;
693
694 if (copy_from_user(&im_in, src, mlen) != 0)
695 return -EFAULT;
696
697 mutex_lock(&idmap->idmap_im_lock);
698
699 ret = mlen;
700 im->im_status = im_in.im_status;
701 /* If we got an error, terminate now, and wake up pending upcalls */
702 if (!(im_in.im_status & IDMAP_STATUS_SUCCESS)) {
703 wake_up(&idmap->idmap_wq);
704 goto out; 713 goto out;
705 } 714 }
706 715
707 /* Sanity checking of strings */ 716 if (copy_from_user(&im, src, mlen) != 0) {
708 ret = -EINVAL; 717 ret = -EFAULT;
709 namelen_in = strnlen(im_in.im_name, IDMAP_NAMESZ);
710 if (namelen_in == 0 || namelen_in == IDMAP_NAMESZ)
711 goto out; 718 goto out;
719 }
712 720
713 switch (im_in.im_type) { 721 if (!(im.im_status & IDMAP_STATUS_SUCCESS)) {
714 case IDMAP_TYPE_USER: 722 ret = mlen;
715 h = &idmap->idmap_user_hash; 723 complete_request_key(idmap->idmap_key_cons, -ENOKEY);
716 break; 724 goto out_incomplete;
717 case IDMAP_TYPE_GROUP:
718 h = &idmap->idmap_group_hash;
719 break;
720 default:
721 goto out;
722 } 725 }
723 726
724 switch (im_in.im_conv) { 727 namelen_in = strnlen(im.im_name, IDMAP_NAMESZ);
725 case IDMAP_CONV_IDTONAME: 728 if (namelen_in == 0 || namelen_in == IDMAP_NAMESZ) {
726 /* Did we match the current upcall? */ 729 ret = -EINVAL;
727 if (im->im_conv == IDMAP_CONV_IDTONAME
728 && im->im_type == im_in.im_type
729 && im->im_id == im_in.im_id) {
730 /* Yes: copy string, including the terminating '\0' */
731 memcpy(im->im_name, im_in.im_name, namelen_in);
732 im->im_name[namelen_in] = '\0';
733 wake_up(&idmap->idmap_wq);
734 }
735 he = idmap_alloc_id(h, im_in.im_id);
736 break;
737 case IDMAP_CONV_NAMETOID:
738 /* Did we match the current upcall? */
739 if (im->im_conv == IDMAP_CONV_NAMETOID
740 && im->im_type == im_in.im_type
741 && strnlen(im->im_name, IDMAP_NAMESZ) == namelen_in
742 && memcmp(im->im_name, im_in.im_name, namelen_in) == 0) {
743 im->im_id = im_in.im_id;
744 wake_up(&idmap->idmap_wq);
745 }
746 he = idmap_alloc_name(h, im_in.im_name, namelen_in);
747 break;
748 default:
749 goto out; 730 goto out;
750 } 731 }
751 732
752 /* If the entry is valid, also copy it to the cache */ 733 ret = nfs_idmap_read_message(&im, cons->key, cons->authkey);
753 if (he != NULL) 734 if (ret >= 0) {
754 idmap_update_entry(he, im_in.im_name, namelen_in, im_in.im_id); 735 key_set_timeout(cons->key, nfs_idmap_cache_timeout);
755 ret = mlen; 736 ret = mlen;
737 }
738
756out: 739out:
757 mutex_unlock(&idmap->idmap_im_lock); 740 complete_request_key(idmap->idmap_key_cons, ret);
741out_incomplete:
758 return ret; 742 return ret;
759} 743}
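
For reference, the downcall above accepts only a single write of exactly sizeof(struct idmap_msg). A userspace sketch of a successful NAMETOID reply follows; the struct layout and the status constant are assumptions copied from the nfs_idmap.h of this era (rpc.idmapd historically carries its own copy), so verify them before relying on this:

	#include <unistd.h>

	#define IDMAP_NAMESZ		128
	#define IDMAP_STATUS_SUCCESS	0x08	/* assumed value */

	struct idmap_msg {			/* assumed layout */
		unsigned char	im_type;
		unsigned char	im_conv;
		char		im_name[IDMAP_NAMESZ];
		unsigned int	im_id;
		unsigned char	im_status;
	};

	static int reply_nametoid(int pipe_fd, struct idmap_msg im,
				  unsigned int resolved_id)
	{
		im.im_id = resolved_id;		/* echo name/type/conv back as-is */
		im.im_status = IDMAP_STATUS_SUCCESS;
		/* partial writes are rejected by the kernel with -ENOSPC */
		return write(pipe_fd, &im, sizeof(im)) == (ssize_t)sizeof(im) ? 0 : -1;
	}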
760 744
761static void 745static void
762idmap_pipe_destroy_msg(struct rpc_pipe_msg *msg) 746idmap_pipe_destroy_msg(struct rpc_pipe_msg *msg)
763{ 747{
764 struct idmap_msg *im = msg->data; 748 /* Free memory allocated in nfs_idmap_legacy_upcall() */
765 struct idmap *idmap = container_of(im, struct idmap, idmap_im); 749 kfree(msg->data);
766 750 kfree(msg);
767 if (msg->errno >= 0)
768 return;
769 mutex_lock(&idmap->idmap_im_lock);
770 im->im_status = IDMAP_STATUS_LOOKUPFAIL;
771 wake_up(&idmap->idmap_wq);
772 mutex_unlock(&idmap->idmap_im_lock);
773}
774
775/*
776 * Fowler/Noll/Vo hash
777 * http://www.isthe.com/chongo/tech/comp/fnv/
778 */
779
780#define FNV_P_32 ((unsigned int)0x01000193) /* 16777619 */
781#define FNV_1_32 ((unsigned int)0x811c9dc5) /* 2166136261 */
782
783static unsigned int fnvhash32(const void *buf, size_t buflen)
784{
785 const unsigned char *p, *end = (const unsigned char *)buf + buflen;
786 unsigned int hash = FNV_1_32;
787
788 for (p = buf; p < end; p++) {
789 hash *= FNV_P_32;
790 hash ^= (unsigned int)*p;
791 }
792
793 return hash;
794} 751}
795 752
796int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid) 753int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid)
@@ -799,16 +756,16 @@ int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_
799 756
800 if (nfs_map_string_to_numeric(name, namelen, uid)) 757 if (nfs_map_string_to_numeric(name, namelen, uid))
801 return 0; 758 return 0;
802 return nfs_idmap_id(idmap, &idmap->idmap_user_hash, name, namelen, uid); 759 return nfs_idmap_lookup_id(name, namelen, "uid", uid, idmap);
803} 760}
804 761
805int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid) 762int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *gid)
806{ 763{
807 struct idmap *idmap = server->nfs_client->cl_idmap; 764 struct idmap *idmap = server->nfs_client->cl_idmap;
808 765
809 if (nfs_map_string_to_numeric(name, namelen, uid)) 766 if (nfs_map_string_to_numeric(name, namelen, gid))
810 return 0; 767 return 0;
811 return nfs_idmap_id(idmap, &idmap->idmap_group_hash, name, namelen, uid); 768 return nfs_idmap_lookup_id(name, namelen, "gid", gid, idmap);
812} 769}
813 770
814int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen) 771int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen)
@@ -817,21 +774,19 @@ int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, s
817 int ret = -EINVAL; 774 int ret = -EINVAL;
818 775
819 if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) 776 if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
820 ret = nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf); 777 ret = nfs_idmap_lookup_name(uid, "user", buf, buflen, idmap);
821 if (ret < 0) 778 if (ret < 0)
822 ret = nfs_map_numeric_to_string(uid, buf, buflen); 779 ret = nfs_map_numeric_to_string(uid, buf, buflen);
823 return ret; 780 return ret;
824} 781}
825int nfs_map_gid_to_group(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen) 782int nfs_map_gid_to_group(const struct nfs_server *server, __u32 gid, char *buf, size_t buflen)
826{ 783{
827 struct idmap *idmap = server->nfs_client->cl_idmap; 784 struct idmap *idmap = server->nfs_client->cl_idmap;
828 int ret = -EINVAL; 785 int ret = -EINVAL;
829 786
830 if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) 787 if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
831 ret = nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf); 788 ret = nfs_idmap_lookup_name(gid, "group", buf, buflen, idmap);
832 if (ret < 0) 789 if (ret < 0)
833 ret = nfs_map_numeric_to_string(uid, buf, buflen); 790 ret = nfs_map_numeric_to_string(gid, buf, buflen);
834 return ret; 791 return ret;
835} 792}
836
837#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */
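
The four entry points above now funnel through nfs_idmap_lookup_id() and nfs_idmap_lookup_name(), whose bodies sit in an earlier hunk of idmap.c that is not shown in this section. A plausible sketch of the name-to-id path, assuming the request_key()-based resolver this patch introduces (key_type_id_resolver), purely to show the flow:

	/* Plausible sketch only -- not the committed helper. */
	static int sketch_idmap_lookup_id(const char *name, size_t namelen,
					  const char *type, __u32 *id)
	{
		char desc[IDMAP_NAMESZ + 8];
		struct key *rkey;
		int ret;

		if (snprintf(desc, sizeof(desc), "%s:%.*s",
			     type, (int)namelen, name) >= (int)sizeof(desc))
			return -EINVAL;
		/* may trigger nfs_idmap_legacy_upcall() via the pipe above */
		rkey = request_key(&key_type_id_resolver, desc, "");
		if (IS_ERR(rkey))
			return PTR_ERR(rkey);
		/* payload is the string instantiated above; locking elided */
		ret = kstrtouint(rkey->payload.data, 10, id);
		key_put(rkey);
		return ret;
	}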
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index c2ce8196912c..e8bbfa5b3500 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -39,6 +39,7 @@
39#include <linux/slab.h> 39#include <linux/slab.h>
40#include <linux/compat.h> 40#include <linux/compat.h>
41#include <linux/freezer.h> 41#include <linux/freezer.h>
42#include <linux/crc32.h>
42 43
43#include <asm/uaccess.h> 44#include <asm/uaccess.h>
44 45
@@ -50,6 +51,7 @@
50#include "fscache.h" 51#include "fscache.h"
51#include "dns_resolve.h" 52#include "dns_resolve.h"
52#include "pnfs.h" 53#include "pnfs.h"
54#include "netns.h"
53 55
54#define NFSDBG_FACILITY NFSDBG_VFS 56#define NFSDBG_FACILITY NFSDBG_VFS
55 57
@@ -387,9 +389,10 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
387 unlock_new_inode(inode); 389 unlock_new_inode(inode);
388 } else 390 } else
389 nfs_refresh_inode(inode, fattr); 391 nfs_refresh_inode(inode, fattr);
390 dprintk("NFS: nfs_fhget(%s/%Ld ct=%d)\n", 392 dprintk("NFS: nfs_fhget(%s/%Ld fh_crc=0x%08x ct=%d)\n",
391 inode->i_sb->s_id, 393 inode->i_sb->s_id,
392 (long long)NFS_FILEID(inode), 394 (long long)NFS_FILEID(inode),
395 nfs_display_fhandle_hash(fh),
393 atomic_read(&inode->i_count)); 396 atomic_read(&inode->i_count));
394 397
395out: 398out:
@@ -400,7 +403,7 @@ out_no_inode:
400 goto out; 403 goto out;
401} 404}
402 405
403#define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET|ATTR_FILE) 406#define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET|ATTR_FILE|ATTR_OPEN)
404 407
405int 408int
406nfs_setattr(struct dentry *dentry, struct iattr *attr) 409nfs_setattr(struct dentry *dentry, struct iattr *attr)
@@ -422,7 +425,7 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
422 425
423 /* Optimization: if the end result is no change, don't RPC */ 426 /* Optimization: if the end result is no change, don't RPC */
424 attr->ia_valid &= NFS_VALID_ATTRS; 427 attr->ia_valid &= NFS_VALID_ATTRS;
425 if ((attr->ia_valid & ~ATTR_FILE) == 0) 428 if ((attr->ia_valid & ~(ATTR_FILE|ATTR_OPEN)) == 0)
426 return 0; 429 return 0;
427 430
428 /* Write all dirty data */ 431 /* Write all dirty data */
@@ -1043,6 +1046,67 @@ struct nfs_fh *nfs_alloc_fhandle(void)
1043 return fh; 1046 return fh;
1044} 1047}
1045 1048
1049#ifdef NFS_DEBUG
1050/*
1051 * _nfs_display_fhandle_hash - calculate the crc32 hash for the filehandle
1052 * in the same way that wireshark does
1053 *
1054 * @fh: file handle
1055 *
1056 * For debugging only.
1057 */
1058u32 _nfs_display_fhandle_hash(const struct nfs_fh *fh)
1059{
1060 /* wireshark uses 32-bit AUTODIN crc and does a bitwise
1061 * not on the result */
1062 return ~crc32(0xFFFFFFFF, &fh->data[0], fh->size);
1063}
1064
1065/*
1066 * _nfs_display_fhandle - display an NFS file handle on the console
1067 *
1068 * @fh: file handle to display
1069 * @caption: display caption
1070 *
1071 * For debugging only.
1072 */
1073void _nfs_display_fhandle(const struct nfs_fh *fh, const char *caption)
1074{
1075 unsigned short i;
1076
1077 if (fh == NULL || fh->size == 0) {
1078 printk(KERN_DEFAULT "%s at %p is empty\n", caption, fh);
1079 return;
1080 }
1081
1082 printk(KERN_DEFAULT "%s at %p is %u bytes, crc: 0x%08x:\n",
1083 caption, fh, fh->size, _nfs_display_fhandle_hash(fh));
1084 for (i = 0; i < fh->size; i += 16) {
1085 __be32 *pos = (__be32 *)&fh->data[i];
1086
1087 switch ((fh->size - i - 1) >> 2) {
1088 case 0:
1089 printk(KERN_DEFAULT " %08x\n",
1090 be32_to_cpup(pos));
1091 break;
1092 case 1:
1093 printk(KERN_DEFAULT " %08x %08x\n",
1094 be32_to_cpup(pos), be32_to_cpup(pos + 1));
1095 break;
1096 case 2:
1097 printk(KERN_DEFAULT " %08x %08x %08x\n",
1098 be32_to_cpup(pos), be32_to_cpup(pos + 1),
1099 be32_to_cpup(pos + 2));
1100 break;
1101 default:
1102 printk(KERN_DEFAULT " %08x %08x %08x %08x\n",
1103 be32_to_cpup(pos), be32_to_cpup(pos + 1),
1104 be32_to_cpup(pos + 2), be32_to_cpup(pos + 3));
1105 }
1106 }
1107}
1108#endif
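
The same value can be reproduced in userspace: zlib's crc32() applies the standard CRC-32 pre/post inversion internally, so crc32(0, buf, len) should equal the kernel's ~crc32(0xFFFFFFFF, buf, len) above. That equivalence is stated here as an assumption worth spot-checking against wireshark:

	#include <stdint.h>
	#include <zlib.h>

	/* Userspace twin of _nfs_display_fhandle_hash() (assumed equivalent). */
	static uint32_t fh_hash(const unsigned char *fh_data, unsigned int fh_size)
	{
		return (uint32_t)crc32(0L, fh_data, fh_size);
	}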
1109
1046/** 1110/**
1047 * nfs_inode_attrs_need_update - check if the inode attributes need updating 1111 * nfs_inode_attrs_need_update - check if the inode attributes need updating
1048 * @inode - pointer to inode 1112 * @inode - pointer to inode
@@ -1210,8 +1274,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1210 unsigned long now = jiffies; 1274 unsigned long now = jiffies;
1211 unsigned long save_cache_validity; 1275 unsigned long save_cache_validity;
1212 1276
1213 dfprintk(VFS, "NFS: %s(%s/%ld ct=%d info=0x%x)\n", 1277 dfprintk(VFS, "NFS: %s(%s/%ld fh_crc=0x%08x ct=%d info=0x%x)\n",
1214 __func__, inode->i_sb->s_id, inode->i_ino, 1278 __func__, inode->i_sb->s_id, inode->i_ino,
1279 nfs_display_fhandle_hash(NFS_FH(inode)),
1215 atomic_read(&inode->i_count), fattr->valid); 1280 atomic_read(&inode->i_count), fattr->valid);
1216 1281
1217 if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid) 1282 if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid)
@@ -1405,7 +1470,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1405 /* 1470 /*
1406 * Big trouble! The inode has become a different object. 1471 * Big trouble! The inode has become a different object.
1407 */ 1472 */
1408 printk(KERN_DEBUG "%s: inode %ld mode changed, %07o to %07o\n", 1473 printk(KERN_DEBUG "NFS: %s: inode %ld mode changed, %07o to %07o\n",
1409 __func__, inode->i_ino, inode->i_mode, fattr->mode); 1474 __func__, inode->i_ino, inode->i_mode, fattr->mode);
1410 out_err: 1475 out_err:
1411 /* 1476 /*
@@ -1494,7 +1559,7 @@ static void init_once(void *foo)
1494 INIT_LIST_HEAD(&nfsi->open_files); 1559 INIT_LIST_HEAD(&nfsi->open_files);
1495 INIT_LIST_HEAD(&nfsi->access_cache_entry_lru); 1560 INIT_LIST_HEAD(&nfsi->access_cache_entry_lru);
1496 INIT_LIST_HEAD(&nfsi->access_cache_inode_lru); 1561 INIT_LIST_HEAD(&nfsi->access_cache_inode_lru);
1497 INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC); 1562 INIT_LIST_HEAD(&nfsi->commit_list);
1498 nfsi->npages = 0; 1563 nfsi->npages = 0;
1499 nfsi->ncommit = 0; 1564 nfsi->ncommit = 0;
1500 atomic_set(&nfsi->silly_count, 1); 1565 atomic_set(&nfsi->silly_count, 1);
@@ -1551,6 +1616,28 @@ static void nfsiod_stop(void)
1551 destroy_workqueue(wq); 1616 destroy_workqueue(wq);
1552} 1617}
1553 1618
1619int nfs_net_id;
1620EXPORT_SYMBOL_GPL(nfs_net_id);
1621
1622static int nfs_net_init(struct net *net)
1623{
1624 nfs_clients_init(net);
1625 return nfs_dns_resolver_cache_init(net);
1626}
1627
1628static void nfs_net_exit(struct net *net)
1629{
1630 nfs_dns_resolver_cache_destroy(net);
1631 nfs_cleanup_cb_ident_idr(net);
1632}
1633
1634static struct pernet_operations nfs_net_ops = {
1635 .init = nfs_net_init,
1636 .exit = nfs_net_exit,
1637 .id = &nfs_net_id,
1638 .size = sizeof(struct nfs_net),
1639};
1640
1554/* 1641/*
1555 * Initialize NFS 1642 * Initialize NFS
1556 */ 1643 */
@@ -1560,10 +1647,14 @@ static int __init init_nfs_fs(void)
1560 1647
1561 err = nfs_idmap_init(); 1648 err = nfs_idmap_init();
1562 if (err < 0) 1649 if (err < 0)
1563 goto out9; 1650 goto out10;
1564 1651
1565 err = nfs_dns_resolver_init(); 1652 err = nfs_dns_resolver_init();
1566 if (err < 0) 1653 if (err < 0)
1654 goto out9;
1655
1656 err = register_pernet_subsys(&nfs_net_ops);
1657 if (err < 0)
1567 goto out8; 1658 goto out8;
1568 1659
1569 err = nfs_fscache_register(); 1660 err = nfs_fscache_register();
@@ -1599,14 +1690,14 @@ static int __init init_nfs_fs(void)
1599 goto out0; 1690 goto out0;
1600 1691
1601#ifdef CONFIG_PROC_FS 1692#ifdef CONFIG_PROC_FS
1602 rpc_proc_register(&nfs_rpcstat); 1693 rpc_proc_register(&init_net, &nfs_rpcstat);
1603#endif 1694#endif
1604 if ((err = register_nfs_fs()) != 0) 1695 if ((err = register_nfs_fs()) != 0)
1605 goto out; 1696 goto out;
1606 return 0; 1697 return 0;
1607out: 1698out:
1608#ifdef CONFIG_PROC_FS 1699#ifdef CONFIG_PROC_FS
1609 rpc_proc_unregister("nfs"); 1700 rpc_proc_unregister(&init_net, "nfs");
1610#endif 1701#endif
1611 nfs_destroy_directcache(); 1702 nfs_destroy_directcache();
1612out0: 1703out0:
@@ -1624,10 +1715,12 @@ out5:
1624out6: 1715out6:
1625 nfs_fscache_unregister(); 1716 nfs_fscache_unregister();
1626out7: 1717out7:
1627 nfs_dns_resolver_destroy(); 1718 unregister_pernet_subsys(&nfs_net_ops);
1628out8: 1719out8:
1629 nfs_idmap_quit(); 1720 nfs_dns_resolver_destroy();
1630out9: 1721out9:
1722 nfs_idmap_quit();
1723out10:
1631 return err; 1724 return err;
1632} 1725}
1633 1726
@@ -1639,12 +1732,12 @@ static void __exit exit_nfs_fs(void)
1639 nfs_destroy_inodecache(); 1732 nfs_destroy_inodecache();
1640 nfs_destroy_nfspagecache(); 1733 nfs_destroy_nfspagecache();
1641 nfs_fscache_unregister(); 1734 nfs_fscache_unregister();
1735 unregister_pernet_subsys(&nfs_net_ops);
1642 nfs_dns_resolver_destroy(); 1736 nfs_dns_resolver_destroy();
1643 nfs_idmap_quit(); 1737 nfs_idmap_quit();
1644#ifdef CONFIG_PROC_FS 1738#ifdef CONFIG_PROC_FS
1645 rpc_proc_unregister("nfs"); 1739 rpc_proc_unregister(&init_net, "nfs");
1646#endif 1740#endif
1647 nfs_cleanup_cb_ident_idr();
1648 unregister_nfs_fs(); 1741 unregister_nfs_fs();
1649 nfs_fs_proc_exit(); 1742 nfs_fs_proc_exit();
1650 nfsiod_stop(); 1743 nfsiod_stop();
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 8102db9b926c..2476dc69365f 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -123,6 +123,7 @@ struct nfs_parsed_mount_data {
123 } nfs_server; 123 } nfs_server;
124 124
125 struct security_mnt_opts lsm_opts; 125 struct security_mnt_opts lsm_opts;
126 struct net *net;
126}; 127};
127 128
128/* mount_clnt.c */ 129/* mount_clnt.c */
@@ -137,20 +138,22 @@ struct nfs_mount_request {
137 int noresvport; 138 int noresvport;
138 unsigned int *auth_flav_len; 139 unsigned int *auth_flav_len;
139 rpc_authflavor_t *auth_flavs; 140 rpc_authflavor_t *auth_flavs;
141 struct net *net;
140}; 142};
141 143
142extern int nfs_mount(struct nfs_mount_request *info); 144extern int nfs_mount(struct nfs_mount_request *info);
143extern void nfs_umount(const struct nfs_mount_request *info); 145extern void nfs_umount(const struct nfs_mount_request *info);
144 146
145/* client.c */ 147/* client.c */
146extern struct rpc_program nfs_program; 148extern const struct rpc_program nfs_program;
149extern void nfs_clients_init(struct net *net);
147 150
148extern void nfs_cleanup_cb_ident_idr(void); 151extern void nfs_cleanup_cb_ident_idr(struct net *);
149extern void nfs_put_client(struct nfs_client *); 152extern void nfs_put_client(struct nfs_client *);
150extern struct nfs_client *nfs4_find_client_no_ident(const struct sockaddr *); 153extern struct nfs_client *nfs4_find_client_ident(struct net *, int);
151extern struct nfs_client *nfs4_find_client_ident(int);
152extern struct nfs_client * 154extern struct nfs_client *
153nfs4_find_client_sessionid(const struct sockaddr *, struct nfs4_sessionid *); 155nfs4_find_client_sessionid(struct net *, const struct sockaddr *,
156 struct nfs4_sessionid *);
154extern struct nfs_server *nfs_create_server( 157extern struct nfs_server *nfs_create_server(
155 const struct nfs_parsed_mount_data *, 158 const struct nfs_parsed_mount_data *,
156 struct nfs_fh *); 159 struct nfs_fh *);
@@ -329,6 +332,8 @@ void nfs_retry_commit(struct list_head *page_list,
329void nfs_commit_clear_lock(struct nfs_inode *nfsi); 332void nfs_commit_clear_lock(struct nfs_inode *nfsi);
330void nfs_commitdata_release(void *data); 333void nfs_commitdata_release(void *data);
331void nfs_commit_release_pages(struct nfs_write_data *data); 334void nfs_commit_release_pages(struct nfs_write_data *data);
335void nfs_request_add_commit_list(struct nfs_page *req, struct list_head *head);
336void nfs_request_remove_commit_list(struct nfs_page *req);
332 337
333#ifdef CONFIG_MIGRATION 338#ifdef CONFIG_MIGRATION
334extern int nfs_migrate_page(struct address_space *, 339extern int nfs_migrate_page(struct address_space *,
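
nfs_request_add_commit_list() and nfs_request_remove_commit_list(), declared above, pair with the per-inode commit_list that replaces the radix tree in the inode.c hunk earlier in this section. Their bodies live in fs/nfs/write.c, outside this diff; the add side plausibly amounts to something like this (a sketch, not the committed code):

	void sketch_request_add_commit_list(struct nfs_page *req,
					    struct list_head *head)
	{
		struct inode *inode = req->wb_context->dentry->d_inode;

		set_bit(PG_CLEAN, &req->wb_flags);
		spin_lock(&inode->i_lock);	/* list is i_lock-protected */
		nfs_list_add_request(req, head);
		NFS_I(inode)->ncommit++;
		spin_unlock(&inode->i_lock);
	}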
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index d4c2d6b7507e..8e65c7f1f87c 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -16,7 +16,7 @@
16#include <linux/nfs_fs.h> 16#include <linux/nfs_fs.h>
17#include "internal.h" 17#include "internal.h"
18 18
19#ifdef RPC_DEBUG 19#ifdef NFS_DEBUG
20# define NFSDBG_FACILITY NFSDBG_MOUNT 20# define NFSDBG_FACILITY NFSDBG_MOUNT
21#endif 21#endif
22 22
@@ -67,7 +67,7 @@ enum {
67 MOUNTPROC3_EXPORT = 5, 67 MOUNTPROC3_EXPORT = 5,
68}; 68};
69 69
70static struct rpc_program mnt_program; 70static const struct rpc_program mnt_program;
71 71
72/* 72/*
73 * Defined by OpenGroup XNFS Version 3W, chapter 8 73 * Defined by OpenGroup XNFS Version 3W, chapter 8
@@ -153,7 +153,7 @@ int nfs_mount(struct nfs_mount_request *info)
153 .rpc_resp = &result, 153 .rpc_resp = &result,
154 }; 154 };
155 struct rpc_create_args args = { 155 struct rpc_create_args args = {
156 .net = &init_net, 156 .net = info->net,
157 .protocol = info->protocol, 157 .protocol = info->protocol,
158 .address = info->sap, 158 .address = info->sap,
159 .addrsize = info->salen, 159 .addrsize = info->salen,
@@ -225,7 +225,7 @@ void nfs_umount(const struct nfs_mount_request *info)
225 .to_retries = 2, 225 .to_retries = 2,
226 }; 226 };
227 struct rpc_create_args args = { 227 struct rpc_create_args args = {
228 .net = &init_net, 228 .net = info->net,
229 .protocol = IPPROTO_UDP, 229 .protocol = IPPROTO_UDP,
230 .address = info->sap, 230 .address = info->sap,
231 .addrsize = info->salen, 231 .addrsize = info->salen,
@@ -488,19 +488,19 @@ static struct rpc_procinfo mnt3_procedures[] = {
488}; 488};
489 489
490 490
491static struct rpc_version mnt_version1 = { 491static const struct rpc_version mnt_version1 = {
492 .number = 1, 492 .number = 1,
493 .nrprocs = ARRAY_SIZE(mnt_procedures), 493 .nrprocs = ARRAY_SIZE(mnt_procedures),
494 .procs = mnt_procedures, 494 .procs = mnt_procedures,
495}; 495};
496 496
497static struct rpc_version mnt_version3 = { 497static const struct rpc_version mnt_version3 = {
498 .number = 3, 498 .number = 3,
499 .nrprocs = ARRAY_SIZE(mnt3_procedures), 499 .nrprocs = ARRAY_SIZE(mnt3_procedures),
500 .procs = mnt3_procedures, 500 .procs = mnt3_procedures,
501}; 501};
502 502
503static struct rpc_version *mnt_version[] = { 503static const struct rpc_version *mnt_version[] = {
504 NULL, 504 NULL,
505 &mnt_version1, 505 &mnt_version1,
506 NULL, 506 NULL,
@@ -509,7 +509,7 @@ static struct rpc_version *mnt_version[] = {
509 509
510static struct rpc_stat mnt_stats; 510static struct rpc_stat mnt_stats;
511 511
512static struct rpc_program mnt_program = { 512static const struct rpc_program mnt_program = {
513 .name = "mount", 513 .name = "mount",
514 .number = NFS_MNT_PROGRAM, 514 .number = NFS_MNT_PROGRAM,
515 .nrvers = ARRAY_SIZE(mnt_version), 515 .nrvers = ARRAY_SIZE(mnt_version),
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 8102391bb374..1807866bb3ab 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -276,7 +276,10 @@ out:
276 nfs_free_fattr(fattr); 276 nfs_free_fattr(fattr);
277 nfs_free_fhandle(fh); 277 nfs_free_fhandle(fh);
278out_nofree: 278out_nofree:
279 dprintk("<-- nfs_follow_mountpoint() = %p\n", mnt); 279 if (IS_ERR(mnt))
280 dprintk("<-- %s(): error %ld\n", __func__, PTR_ERR(mnt));
281 else
282 dprintk("<-- %s() = %p\n", __func__, mnt);
280 return mnt; 283 return mnt;
281} 284}
282 285
diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h
new file mode 100644
index 000000000000..aa14ec303e94
--- /dev/null
+++ b/fs/nfs/netns.h
@@ -0,0 +1,27 @@
1#ifndef __NFS_NETNS_H__
2#define __NFS_NETNS_H__
3
4#include <net/net_namespace.h>
5#include <net/netns/generic.h>
6
7struct bl_dev_msg {
8 int32_t status;
9 uint32_t major, minor;
10};
11
12struct nfs_net {
13 struct cache_detail *nfs_dns_resolve;
14 struct rpc_pipe *bl_device_pipe;
15 struct bl_dev_msg bl_mount_reply;
16 wait_queue_head_t bl_wq;
17 struct list_head nfs_client_list;
18 struct list_head nfs_volume_list;
19#ifdef CONFIG_NFS_V4
20 struct idr cb_ident_idr; /* Protected by nfs_client_lock */
21#endif
22 spinlock_t nfs_client_lock;
23};
24
25extern int nfs_net_id;
26
27#endif
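
Callers elsewhere in the series (see nfs_get_client_for_event() in the idmap.c hunk above) reach this structure through the generic per-net area. The accessor pattern is:

	#include <net/netns/generic.h>

	/* nfs_net_id is assigned by register_pernet_subsys() through the
	 * .id field of nfs_net_ops in inode.c. */
	static inline struct nfs_net *example_nfs_net(struct net *net)
	{
		return net_generic(net, nfs_net_id);
	}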
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 792cb13a4304..1f56000fabbd 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -1150,7 +1150,7 @@ struct rpc_procinfo nfs_procedures[] = {
1150 PROC(STATFS, fhandle, statfsres, 0), 1150 PROC(STATFS, fhandle, statfsres, 0),
1151}; 1151};
1152 1152
1153struct rpc_version nfs_version2 = { 1153const struct rpc_version nfs_version2 = {
1154 .number = 2, 1154 .number = 2,
1155 .nrprocs = ARRAY_SIZE(nfs_procedures), 1155 .nrprocs = ARRAY_SIZE(nfs_procedures),
1156 .procs = nfs_procedures 1156 .procs = nfs_procedures
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 7ef23979896d..e4498dc351a8 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -192,7 +192,7 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
192 .pages = pages, 192 .pages = pages,
193 }; 193 };
194 struct nfs3_getaclres res = { 194 struct nfs3_getaclres res = {
195 0 195 NULL,
196 }; 196 };
197 struct rpc_message msg = { 197 struct rpc_message msg = {
198 .rpc_argp = &args, 198 .rpc_argp = &args,
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 91943953a370..5242eae6711a 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -428,6 +428,11 @@ nfs3_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
428 msg->rpc_proc = &nfs3_procedures[NFS3PROC_REMOVE]; 428 msg->rpc_proc = &nfs3_procedures[NFS3PROC_REMOVE];
429} 429}
430 430
431static void nfs3_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data)
432{
433 rpc_call_start(task);
434}
435
431static int 436static int
432nfs3_proc_unlink_done(struct rpc_task *task, struct inode *dir) 437nfs3_proc_unlink_done(struct rpc_task *task, struct inode *dir)
433{ 438{
@@ -445,6 +450,11 @@ nfs3_proc_rename_setup(struct rpc_message *msg, struct inode *dir)
445 msg->rpc_proc = &nfs3_procedures[NFS3PROC_RENAME]; 450 msg->rpc_proc = &nfs3_procedures[NFS3PROC_RENAME];
446} 451}
447 452
453static void nfs3_proc_rename_rpc_prepare(struct rpc_task *task, struct nfs_renamedata *data)
454{
455 rpc_call_start(task);
456}
457
448static int 458static int
449nfs3_proc_rename_done(struct rpc_task *task, struct inode *old_dir, 459nfs3_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
450 struct inode *new_dir) 460 struct inode *new_dir)
@@ -814,6 +824,11 @@ static void nfs3_proc_read_setup(struct nfs_read_data *data, struct rpc_message
814 msg->rpc_proc = &nfs3_procedures[NFS3PROC_READ]; 824 msg->rpc_proc = &nfs3_procedures[NFS3PROC_READ];
815} 825}
816 826
827static void nfs3_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data)
828{
829 rpc_call_start(task);
830}
831
817static int nfs3_write_done(struct rpc_task *task, struct nfs_write_data *data) 832static int nfs3_write_done(struct rpc_task *task, struct nfs_write_data *data)
818{ 833{
819 if (nfs3_async_handle_jukebox(task, data->inode)) 834 if (nfs3_async_handle_jukebox(task, data->inode))
@@ -828,6 +843,11 @@ static void nfs3_proc_write_setup(struct nfs_write_data *data, struct rpc_messag
828 msg->rpc_proc = &nfs3_procedures[NFS3PROC_WRITE]; 843 msg->rpc_proc = &nfs3_procedures[NFS3PROC_WRITE];
829} 844}
830 845
846static void nfs3_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data)
847{
848 rpc_call_start(task);
849}
850
831static int nfs3_commit_done(struct rpc_task *task, struct nfs_write_data *data) 851static int nfs3_commit_done(struct rpc_task *task, struct nfs_write_data *data)
832{ 852{
833 if (nfs3_async_handle_jukebox(task, data->inode)) 853 if (nfs3_async_handle_jukebox(task, data->inode))
@@ -864,9 +884,11 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
864 .create = nfs3_proc_create, 884 .create = nfs3_proc_create,
865 .remove = nfs3_proc_remove, 885 .remove = nfs3_proc_remove,
866 .unlink_setup = nfs3_proc_unlink_setup, 886 .unlink_setup = nfs3_proc_unlink_setup,
887 .unlink_rpc_prepare = nfs3_proc_unlink_rpc_prepare,
867 .unlink_done = nfs3_proc_unlink_done, 888 .unlink_done = nfs3_proc_unlink_done,
868 .rename = nfs3_proc_rename, 889 .rename = nfs3_proc_rename,
869 .rename_setup = nfs3_proc_rename_setup, 890 .rename_setup = nfs3_proc_rename_setup,
891 .rename_rpc_prepare = nfs3_proc_rename_rpc_prepare,
870 .rename_done = nfs3_proc_rename_done, 892 .rename_done = nfs3_proc_rename_done,
871 .link = nfs3_proc_link, 893 .link = nfs3_proc_link,
872 .symlink = nfs3_proc_symlink, 894 .symlink = nfs3_proc_symlink,
@@ -879,8 +901,10 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
879 .pathconf = nfs3_proc_pathconf, 901 .pathconf = nfs3_proc_pathconf,
880 .decode_dirent = nfs3_decode_dirent, 902 .decode_dirent = nfs3_decode_dirent,
881 .read_setup = nfs3_proc_read_setup, 903 .read_setup = nfs3_proc_read_setup,
904 .read_rpc_prepare = nfs3_proc_read_rpc_prepare,
882 .read_done = nfs3_read_done, 905 .read_done = nfs3_read_done,
883 .write_setup = nfs3_proc_write_setup, 906 .write_setup = nfs3_proc_write_setup,
907 .write_rpc_prepare = nfs3_proc_write_rpc_prepare,
884 .write_done = nfs3_write_done, 908 .write_done = nfs3_write_done,
885 .commit_setup = nfs3_proc_commit_setup, 909 .commit_setup = nfs3_proc_commit_setup,
886 .commit_done = nfs3_commit_done, 910 .commit_done = nfs3_commit_done,
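
The new *_rpc_prepare hooks exist so that NFSv4.1 can slot session/sequence setup in before an async call starts; for v2/v3 they collapse to a bare rpc_call_start(), as above. The generic code presumably dispatches through the ops table along these lines (the actual call sites are in read.c/write.c/unlink.c, outside this section):

	/* Sketch of the dispatch path, not the committed call site. */
	static void sketch_read_prepare(struct rpc_task *task, void *calldata)
	{
		struct nfs_read_data *data = calldata;

		NFS_PROTO(data->inode)->read_rpc_prepare(task, data);
	}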
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 183c6b123d0f..a77cc9a3ce55 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -2461,7 +2461,7 @@ struct rpc_procinfo nfs3_procedures[] = {
2461 PROC(COMMIT, commit, commit, 5), 2461 PROC(COMMIT, commit, commit, 5),
2462}; 2462};
2463 2463
2464struct rpc_version nfs_version3 = { 2464const struct rpc_version nfs_version3 = {
2465 .number = 3, 2465 .number = 3,
2466 .nrprocs = ARRAY_SIZE(nfs3_procedures), 2466 .nrprocs = ARRAY_SIZE(nfs3_procedures),
2467 .procs = nfs3_procedures 2467 .procs = nfs3_procedures
@@ -2489,7 +2489,7 @@ static struct rpc_procinfo nfs3_acl_procedures[] = {
2489 }, 2489 },
2490}; 2490};
2491 2491
2492struct rpc_version nfsacl_version3 = { 2492const struct rpc_version nfsacl_version3 = {
2493 .number = 3, 2493 .number = 3,
2494 .nrprocs = sizeof(nfs3_acl_procedures)/ 2494 .nrprocs = sizeof(nfs3_acl_procedures)/
2495 sizeof(nfs3_acl_procedures[0]), 2495 sizeof(nfs3_acl_procedures[0]),
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 4d7d0aedc101..97ecc863dd76 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -20,7 +20,6 @@ enum nfs4_client_state {
20 NFS4CLNT_RECLAIM_REBOOT, 20 NFS4CLNT_RECLAIM_REBOOT,
21 NFS4CLNT_RECLAIM_NOGRACE, 21 NFS4CLNT_RECLAIM_NOGRACE,
22 NFS4CLNT_DELEGRETURN, 22 NFS4CLNT_DELEGRETURN,
23 NFS4CLNT_LAYOUTRECALL,
24 NFS4CLNT_SESSION_RESET, 23 NFS4CLNT_SESSION_RESET,
25 NFS4CLNT_RECALL_SLOT, 24 NFS4CLNT_RECALL_SLOT,
26 NFS4CLNT_LEASE_CONFIRM, 25 NFS4CLNT_LEASE_CONFIRM,
@@ -44,7 +43,7 @@ struct nfs4_minor_version_ops {
44 struct nfs4_sequence_args *args, 43 struct nfs4_sequence_args *args,
45 struct nfs4_sequence_res *res, 44 struct nfs4_sequence_res *res,
46 int cache_reply); 45 int cache_reply);
47 int (*validate_stateid)(struct nfs_delegation *, 46 bool (*match_stateid)(const nfs4_stateid *,
48 const nfs4_stateid *); 47 const nfs4_stateid *);
49 int (*find_root_sec)(struct nfs_server *, struct nfs_fh *, 48 int (*find_root_sec)(struct nfs_server *, struct nfs_fh *,
50 struct nfs_fsinfo *); 49 struct nfs_fsinfo *);
@@ -53,26 +52,25 @@ struct nfs4_minor_version_ops {
53 const struct nfs4_state_maintenance_ops *state_renewal_ops; 52 const struct nfs4_state_maintenance_ops *state_renewal_ops;
54}; 53};
55 54
56/* 55struct nfs_unique_id {
57 * struct rpc_sequence ensures that RPC calls are sent in the exact 56 struct rb_node rb_node;
58 * order that they appear on the list. 57 __u64 id;
59 */
60struct rpc_sequence {
61 struct rpc_wait_queue wait; /* RPC call delay queue */
62 spinlock_t lock; /* Protects the list */
63 struct list_head list; /* Defines sequence of RPC calls */
64}; 58};
65 59
66#define NFS_SEQID_CONFIRMED 1 60#define NFS_SEQID_CONFIRMED 1
67struct nfs_seqid_counter { 61struct nfs_seqid_counter {
68 struct rpc_sequence *sequence; 62 int owner_id;
69 int flags; 63 int flags;
70 u32 counter; 64 u32 counter;
65 spinlock_t lock; /* Protects the list */
66 struct list_head list; /* Defines sequence of RPC calls */
67 struct rpc_wait_queue wait; /* RPC call delay queue */
71}; 68};
72 69
73struct nfs_seqid { 70struct nfs_seqid {
74 struct nfs_seqid_counter *sequence; 71 struct nfs_seqid_counter *sequence;
75 struct list_head list; 72 struct list_head list;
73 struct rpc_task *task;
76}; 74};
77 75
78static inline void nfs_confirm_seqid(struct nfs_seqid_counter *seqid, int status) 76static inline void nfs_confirm_seqid(struct nfs_seqid_counter *seqid, int status)
@@ -81,18 +79,12 @@ static inline void nfs_confirm_seqid(struct nfs_seqid_counter *seqid, int status
81 seqid->flags |= NFS_SEQID_CONFIRMED; 79 seqid->flags |= NFS_SEQID_CONFIRMED;
82} 80}
83 81
84struct nfs_unique_id {
85 struct rb_node rb_node;
86 __u64 id;
87};
88
89/* 82/*
90 * NFS4 state_owners and lock_owners are simply labels for ordered 83 * NFS4 state_owners and lock_owners are simply labels for ordered
91 * sequences of RPC calls. Their sole purpose is to provide once-only 84 * sequences of RPC calls. Their sole purpose is to provide once-only
92 * semantics by allowing the server to identify replayed requests. 85 * semantics by allowing the server to identify replayed requests.
93 */ 86 */
94struct nfs4_state_owner { 87struct nfs4_state_owner {
95 struct nfs_unique_id so_owner_id;
96 struct nfs_server *so_server; 88 struct nfs_server *so_server;
97 struct list_head so_lru; 89 struct list_head so_lru;
98 unsigned long so_expires; 90 unsigned long so_expires;
@@ -105,7 +97,6 @@ struct nfs4_state_owner {
105 unsigned long so_flags; 97 unsigned long so_flags;
106 struct list_head so_states; 98 struct list_head so_states;
107 struct nfs_seqid_counter so_seqid; 99 struct nfs_seqid_counter so_seqid;
108 struct rpc_sequence so_sequence;
109}; 100};
110 101
111enum { 102enum {
@@ -146,8 +137,6 @@ struct nfs4_lock_state {
146#define NFS_LOCK_INITIALIZED 1 137#define NFS_LOCK_INITIALIZED 1
147 int ls_flags; 138 int ls_flags;
148 struct nfs_seqid_counter ls_seqid; 139 struct nfs_seqid_counter ls_seqid;
149 struct rpc_sequence ls_sequence;
150 struct nfs_unique_id ls_id;
151 nfs4_stateid ls_stateid; 140 nfs4_stateid ls_stateid;
152 atomic_t ls_count; 141 atomic_t ls_count;
153 struct nfs4_lock_owner ls_owner; 142 struct nfs4_lock_owner ls_owner;
@@ -193,6 +182,7 @@ struct nfs4_exception {
193 long timeout; 182 long timeout;
194 int retry; 183 int retry;
195 struct nfs4_state *state; 184 struct nfs4_state *state;
185 struct inode *inode;
196}; 186};
197 187
198struct nfs4_state_recovery_ops { 188struct nfs4_state_recovery_ops {
@@ -224,7 +214,7 @@ extern int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait, boo
224extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); 214extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
225extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, 215extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
226 struct nfs4_fs_locations *fs_locations, struct page *page); 216 struct nfs4_fs_locations *fs_locations, struct page *page);
227extern void nfs4_release_lockowner(const struct nfs4_lock_state *); 217extern int nfs4_release_lockowner(struct nfs4_lock_state *);
228extern const struct xattr_handler *nfs4_xattr_handlers[]; 218extern const struct xattr_handler *nfs4_xattr_handlers[];
229 219
230#if defined(CONFIG_NFS_V4_1) 220#if defined(CONFIG_NFS_V4_1)
@@ -233,12 +223,13 @@ static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *ser
233 return server->nfs_client->cl_session; 223 return server->nfs_client->cl_session;
234} 224}
235 225
226extern bool nfs4_set_task_privileged(struct rpc_task *task, void *dummy);
236extern int nfs4_setup_sequence(const struct nfs_server *server, 227extern int nfs4_setup_sequence(const struct nfs_server *server,
237 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, 228 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
238 int cache_reply, struct rpc_task *task); 229 struct rpc_task *task);
239extern int nfs41_setup_sequence(struct nfs4_session *session, 230extern int nfs41_setup_sequence(struct nfs4_session *session,
240 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, 231 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
241 int cache_reply, struct rpc_task *task); 232 struct rpc_task *task);
242extern void nfs4_destroy_session(struct nfs4_session *session); 233extern void nfs4_destroy_session(struct nfs4_session *session);
243extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp); 234extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp);
244extern int nfs4_proc_create_session(struct nfs_client *); 235extern int nfs4_proc_create_session(struct nfs_client *);
@@ -269,7 +260,7 @@ static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *ser
269 260
270static inline int nfs4_setup_sequence(const struct nfs_server *server, 261static inline int nfs4_setup_sequence(const struct nfs_server *server,
271 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, 262 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
272 int cache_reply, struct rpc_task *task) 263 struct rpc_task *task)
273{ 264{
274 return 0; 265 return 0;
275} 266}
@@ -319,7 +310,7 @@ static inline void nfs4_schedule_session_recovery(struct nfs4_session *session)
319} 310}
320#endif /* CONFIG_NFS_V4_1 */ 311#endif /* CONFIG_NFS_V4_1 */
321 312
322extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *); 313extern struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *, gfp_t);
323extern void nfs4_put_state_owner(struct nfs4_state_owner *); 314extern void nfs4_put_state_owner(struct nfs4_state_owner *);
324extern void nfs4_purge_state_owners(struct nfs_server *); 315extern void nfs4_purge_state_owners(struct nfs_server *);
325extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *); 316extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *);
@@ -327,6 +318,8 @@ extern void nfs4_put_open_state(struct nfs4_state *);
327extern void nfs4_close_state(struct nfs4_state *, fmode_t); 318extern void nfs4_close_state(struct nfs4_state *, fmode_t);
328extern void nfs4_close_sync(struct nfs4_state *, fmode_t); 319extern void nfs4_close_sync(struct nfs4_state *, fmode_t);
329extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t); 320extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t);
321extern void nfs_inode_find_state_and_recover(struct inode *inode,
322 const nfs4_stateid *stateid);
330extern void nfs4_schedule_lease_recovery(struct nfs_client *); 323extern void nfs4_schedule_lease_recovery(struct nfs_client *);
331extern void nfs4_schedule_state_manager(struct nfs_client *); 324extern void nfs4_schedule_state_manager(struct nfs_client *);
332extern void nfs4_schedule_path_down_recovery(struct nfs_client *clp); 325extern void nfs4_schedule_path_down_recovery(struct nfs_client *clp);
@@ -337,7 +330,8 @@ extern void nfs41_handle_server_scope(struct nfs_client *,
337 struct server_scope **); 330 struct server_scope **);
338extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); 331extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
339extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); 332extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
340extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t, pid_t); 333extern void nfs4_select_rw_stateid(nfs4_stateid *, struct nfs4_state *,
334 fmode_t, fl_owner_t, pid_t);
341 335
342extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask); 336extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask);
343extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task); 337extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task);
@@ -346,6 +340,8 @@ extern void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid);
346extern void nfs_release_seqid(struct nfs_seqid *seqid); 340extern void nfs_release_seqid(struct nfs_seqid *seqid);
347extern void nfs_free_seqid(struct nfs_seqid *seqid); 341extern void nfs_free_seqid(struct nfs_seqid *seqid);
348 342
343extern void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp);
344
349extern const nfs4_stateid zero_stateid; 345extern const nfs4_stateid zero_stateid;
350 346
351/* nfs4xdr.c */ 347/* nfs4xdr.c */
@@ -357,6 +353,16 @@ struct nfs4_mount_data;
357extern struct svc_version nfs4_callback_version1; 353extern struct svc_version nfs4_callback_version1;
358extern struct svc_version nfs4_callback_version4; 354extern struct svc_version nfs4_callback_version4;
359 355
356static inline void nfs4_stateid_copy(nfs4_stateid *dst, const nfs4_stateid *src)
357{
358 memcpy(dst, src, sizeof(*dst));
359}
360
361static inline bool nfs4_stateid_match(const nfs4_stateid *dst, const nfs4_stateid *src)
362{
363 return memcmp(dst, src, sizeof(*dst)) == 0;
364}
365
360#else 366#else
361 367
362#define nfs4_close_state(a, b) do { } while (0) 368#define nfs4_close_state(a, b) do { } while (0)
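
The nfs4_stateid_copy()/nfs4_stateid_match() inlines added above replace open-coded memcpy()/memcmp() on stateids throughout the v4 code; usage is mechanical:

	/* Illustrative caller of the new helpers. */
	static bool example_update_stateid(nfs4_stateid *cur,
					   const nfs4_stateid *src)
	{
		if (nfs4_stateid_match(cur, src))
			return false;		/* unchanged */
		nfs4_stateid_copy(cur, src);	/* struct-sized memcpy */
		return true;
	}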
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 71ec08617e23..634c0bcb4fd6 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -33,7 +33,10 @@
33#include <linux/nfs_page.h> 33#include <linux/nfs_page.h>
34#include <linux/module.h> 34#include <linux/module.h>
35 35
36#include <linux/sunrpc/metrics.h>
37
36#include "internal.h" 38#include "internal.h"
39#include "delegation.h"
37#include "nfs4filelayout.h" 40#include "nfs4filelayout.h"
38 41
39#define NFSDBG_FACILITY NFSDBG_PNFS_LD 42#define NFSDBG_FACILITY NFSDBG_PNFS_LD
@@ -84,12 +87,27 @@ static int filelayout_async_handle_error(struct rpc_task *task,
84 struct nfs_client *clp, 87 struct nfs_client *clp,
85 int *reset) 88 int *reset)
86{ 89{
90 struct nfs_server *mds_server = NFS_SERVER(state->inode);
91 struct nfs_client *mds_client = mds_server->nfs_client;
92
87 if (task->tk_status >= 0) 93 if (task->tk_status >= 0)
88 return 0; 94 return 0;
89
90 *reset = 0; 95 *reset = 0;
91 96
92 switch (task->tk_status) { 97 switch (task->tk_status) {
98 /* MDS state errors */
99 case -NFS4ERR_DELEG_REVOKED:
100 case -NFS4ERR_ADMIN_REVOKED:
101 case -NFS4ERR_BAD_STATEID:
102 nfs_remove_bad_delegation(state->inode);
103 case -NFS4ERR_OPENMODE:
104 nfs4_schedule_stateid_recovery(mds_server, state);
105 goto wait_on_recovery;
106 case -NFS4ERR_EXPIRED:
107 nfs4_schedule_stateid_recovery(mds_server, state);
108 nfs4_schedule_lease_recovery(mds_client);
109 goto wait_on_recovery;
110 /* DS session errors */
93 case -NFS4ERR_BADSESSION: 111 case -NFS4ERR_BADSESSION:
94 case -NFS4ERR_BADSLOT: 112 case -NFS4ERR_BADSLOT:
95 case -NFS4ERR_BAD_HIGH_SLOT: 113 case -NFS4ERR_BAD_HIGH_SLOT:
@@ -115,8 +133,14 @@ static int filelayout_async_handle_error(struct rpc_task *task,
115 *reset = 1; 133 *reset = 1;
116 break; 134 break;
117 } 135 }
136out:
118 task->tk_status = 0; 137 task->tk_status = 0;
119 return -EAGAIN; 138 return -EAGAIN;
139wait_on_recovery:
140 rpc_sleep_on(&mds_client->cl_rpcwaitq, task, NULL);
141 if (test_bit(NFS4CLNT_MANAGER_RUNNING, &mds_client->cl_state) == 0)
142 rpc_wake_up_queued_task(&mds_client->cl_rpcwaitq, task);
143 goto out;
120} 144}
121 145
122/* NFS_PROTO call done callback routines */ 146/* NFS_PROTO call done callback routines */
@@ -173,7 +197,7 @@ static void filelayout_read_prepare(struct rpc_task *task, void *data)
173 197
174 if (nfs41_setup_sequence(rdata->ds_clp->cl_session, 198 if (nfs41_setup_sequence(rdata->ds_clp->cl_session,
175 &rdata->args.seq_args, &rdata->res.seq_res, 199 &rdata->args.seq_args, &rdata->res.seq_res,
176 0, task)) 200 task))
177 return; 201 return;
178 202
179 rpc_call_start(task); 203 rpc_call_start(task);
@@ -189,10 +213,18 @@ static void filelayout_read_call_done(struct rpc_task *task, void *data)
189 rdata->mds_ops->rpc_call_done(task, data); 213 rdata->mds_ops->rpc_call_done(task, data);
190} 214}
191 215
216static void filelayout_read_count_stats(struct rpc_task *task, void *data)
217{
218 struct nfs_read_data *rdata = (struct nfs_read_data *)data;
219
220 rpc_count_iostats(task, NFS_SERVER(rdata->inode)->client->cl_metrics);
221}
222
192static void filelayout_read_release(void *data) 223static void filelayout_read_release(void *data)
193{ 224{
194 struct nfs_read_data *rdata = (struct nfs_read_data *)data; 225 struct nfs_read_data *rdata = (struct nfs_read_data *)data;
195 226
227 put_lseg(rdata->lseg);
196 rdata->mds_ops->rpc_release(data); 228 rdata->mds_ops->rpc_release(data);
197} 229}
198 230
@@ -254,7 +286,7 @@ static void filelayout_write_prepare(struct rpc_task *task, void *data)
254 286
255 if (nfs41_setup_sequence(wdata->ds_clp->cl_session, 287 if (nfs41_setup_sequence(wdata->ds_clp->cl_session,
256 &wdata->args.seq_args, &wdata->res.seq_res, 288 &wdata->args.seq_args, &wdata->res.seq_res,
257 0, task)) 289 task))
258 return; 290 return;
259 291
260 rpc_call_start(task); 292 rpc_call_start(task);
@@ -268,10 +300,18 @@ static void filelayout_write_call_done(struct rpc_task *task, void *data)
268 wdata->mds_ops->rpc_call_done(task, data); 300 wdata->mds_ops->rpc_call_done(task, data);
269} 301}
270 302
303static void filelayout_write_count_stats(struct rpc_task *task, void *data)
304{
305 struct nfs_write_data *wdata = (struct nfs_write_data *)data;
306
307 rpc_count_iostats(task, NFS_SERVER(wdata->inode)->client->cl_metrics);
308}
309
271static void filelayout_write_release(void *data) 310static void filelayout_write_release(void *data)
272{ 311{
273 struct nfs_write_data *wdata = (struct nfs_write_data *)data; 312 struct nfs_write_data *wdata = (struct nfs_write_data *)data;
274 313
314 put_lseg(wdata->lseg);
275 wdata->mds_ops->rpc_release(data); 315 wdata->mds_ops->rpc_release(data);
276} 316}
277 317
@@ -282,24 +322,28 @@ static void filelayout_commit_release(void *data)
282 nfs_commit_release_pages(wdata); 322 nfs_commit_release_pages(wdata);
283 if (atomic_dec_and_test(&NFS_I(wdata->inode)->commits_outstanding)) 323 if (atomic_dec_and_test(&NFS_I(wdata->inode)->commits_outstanding))
284 nfs_commit_clear_lock(NFS_I(wdata->inode)); 324 nfs_commit_clear_lock(NFS_I(wdata->inode));
325 put_lseg(wdata->lseg);
285 nfs_commitdata_release(wdata); 326 nfs_commitdata_release(wdata);
286} 327}
287 328
288struct rpc_call_ops filelayout_read_call_ops = { 329static const struct rpc_call_ops filelayout_read_call_ops = {
289 .rpc_call_prepare = filelayout_read_prepare, 330 .rpc_call_prepare = filelayout_read_prepare,
290 .rpc_call_done = filelayout_read_call_done, 331 .rpc_call_done = filelayout_read_call_done,
332 .rpc_count_stats = filelayout_read_count_stats,
291 .rpc_release = filelayout_read_release, 333 .rpc_release = filelayout_read_release,
292}; 334};
293 335
294struct rpc_call_ops filelayout_write_call_ops = { 336static const struct rpc_call_ops filelayout_write_call_ops = {
295 .rpc_call_prepare = filelayout_write_prepare, 337 .rpc_call_prepare = filelayout_write_prepare,
296 .rpc_call_done = filelayout_write_call_done, 338 .rpc_call_done = filelayout_write_call_done,
339 .rpc_count_stats = filelayout_write_count_stats,
297 .rpc_release = filelayout_write_release, 340 .rpc_release = filelayout_write_release,
298}; 341};
299 342
300struct rpc_call_ops filelayout_commit_call_ops = { 343static const struct rpc_call_ops filelayout_commit_call_ops = {
301 .rpc_call_prepare = filelayout_write_prepare, 344 .rpc_call_prepare = filelayout_write_prepare,
302 .rpc_call_done = filelayout_write_call_done, 345 .rpc_call_done = filelayout_write_call_done,
346 .rpc_count_stats = filelayout_write_count_stats,
303 .rpc_release = filelayout_commit_release, 347 .rpc_release = filelayout_commit_release,
304}; 348};
305 349
@@ -367,7 +411,8 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync)
 		idx = nfs4_fl_calc_ds_index(lseg, j);
 		ds = nfs4_fl_prepare_ds(lseg, idx);
 		if (!ds) {
-			printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__);
+			printk(KERN_ERR "NFS: %s: prepare_ds failed, use MDS\n",
+				__func__);
 			set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
 			set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
 			return PNFS_NOT_ATTEMPTED;
@@ -575,7 +620,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
 			goto out_err_free;
 		fl->fh_array[i]->size = be32_to_cpup(p++);
 		if (sizeof(struct nfs_fh) < fl->fh_array[i]->size) {
-			printk(KERN_ERR "Too big fh %d received %d\n",
+			printk(KERN_ERR "NFS: Too big fh %d received %d\n",
 				i, fl->fh_array[i]->size);
 			goto out_err_free;
 		}
@@ -640,14 +685,16 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
 		int size = (fl->stripe_type == STRIPE_SPARSE) ?
 			fl->dsaddr->ds_num : fl->dsaddr->stripe_count;
 
-		fl->commit_buckets = kcalloc(size, sizeof(struct list_head), gfp_flags);
+		fl->commit_buckets = kcalloc(size, sizeof(struct nfs4_fl_commit_bucket), gfp_flags);
 		if (!fl->commit_buckets) {
 			filelayout_free_lseg(&fl->generic_hdr);
 			return NULL;
 		}
 		fl->number_of_buckets = size;
-		for (i = 0; i < size; i++)
-			INIT_LIST_HEAD(&fl->commit_buckets[i]);
+		for (i = 0; i < size; i++) {
+			INIT_LIST_HEAD(&fl->commit_buckets[i].written);
+			INIT_LIST_HEAD(&fl->commit_buckets[i].committing);
+		}
 	}
 	return &fl->generic_hdr;
 }
@@ -679,7 +726,7 @@ filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
 	return (p_stripe == r_stripe);
 }
 
-void
+static void
 filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio,
 			struct nfs_page *req)
 {
@@ -696,7 +743,7 @@ filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio,
 		nfs_pageio_reset_read_mds(pgio);
 }
 
-void
+static void
 filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio,
 			struct nfs_page *req)
 {
@@ -725,11 +772,6 @@ static const struct nfs_pageio_ops filelayout_pg_write_ops = {
 	.pg_doio = pnfs_generic_pg_writepages,
 };
 
-static bool filelayout_mark_pnfs_commit(struct pnfs_layout_segment *lseg)
-{
-	return !FILELAYOUT_LSEG(lseg)->commit_through_mds;
-}
-
 static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j)
 {
 	if (fl->stripe_type == STRIPE_SPARSE)
@@ -738,13 +780,49 @@ static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j)
 		return j;
 }
 
-struct list_head *filelayout_choose_commit_list(struct nfs_page *req)
+/* The generic layer is about to remove the req from the commit list.
+ * If this will make the bucket empty, it will need to put the lseg reference.
+ */
+static void
+filelayout_clear_request_commit(struct nfs_page *req)
+{
+	struct pnfs_layout_segment *freeme = NULL;
+	struct inode *inode = req->wb_context->dentry->d_inode;
+
+	spin_lock(&inode->i_lock);
+	if (!test_and_clear_bit(PG_COMMIT_TO_DS, &req->wb_flags))
+		goto out;
+	if (list_is_singular(&req->wb_list)) {
+		struct inode *inode = req->wb_context->dentry->d_inode;
+		struct pnfs_layout_segment *lseg;
+
+		/* From here we can find the bucket, but for the moment,
+		 * since there is only one relevant lseg...
+		 */
+		list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) {
+			if (lseg->pls_range.iomode == IOMODE_RW) {
+				freeme = lseg;
+				break;
+			}
+		}
+	}
+out:
+	nfs_request_remove_commit_list(req);
+	spin_unlock(&inode->i_lock);
+	put_lseg(freeme);
+}
+
+static struct list_head *
+filelayout_choose_commit_list(struct nfs_page *req,
+			      struct pnfs_layout_segment *lseg)
 {
-	struct pnfs_layout_segment *lseg = req->wb_commit_lseg;
 	struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
 	u32 i, j;
 	struct list_head *list;
 
+	if (fl->commit_through_mds)
+		return &NFS_I(req->wb_context->dentry->d_inode)->commit_list;
+
 	/* Note that we are calling nfs4_fl_calc_j_index on each page
 	 * that ends up being committed to a data server. An attractive
 	 * alternative is to add a field to nfs_write_data and nfs_page
@@ -754,14 +832,30 @@ struct list_head *filelayout_choose_commit_list(struct nfs_page *req)
 	j = nfs4_fl_calc_j_index(lseg,
 				 (loff_t)req->wb_index << PAGE_CACHE_SHIFT);
 	i = select_bucket_index(fl, j);
-	list = &fl->commit_buckets[i];
+	list = &fl->commit_buckets[i].written;
 	if (list_empty(list)) {
-		/* Non-empty buckets hold a reference on the lseg */
+		/* Non-empty buckets hold a reference on the lseg. That ref
+		 * is normally transferred to the COMMIT call and released
+		 * there. It could also be released if the last req is pulled
+		 * off due to a rewrite, in which case it will be done in
+		 * filelayout_remove_commit_req
+		 */
 		get_lseg(lseg);
 	}
+	set_bit(PG_COMMIT_TO_DS, &req->wb_flags);
 	return list;
 }
 
+static void
+filelayout_mark_request_commit(struct nfs_page *req,
+			       struct pnfs_layout_segment *lseg)
+{
+	struct list_head *list;
+
+	list = filelayout_choose_commit_list(req, lseg);
+	nfs_request_add_commit_list(req, list);
+}
+
 static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i)
 {
 	struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
@@ -797,11 +891,12 @@ static int filelayout_initiate_commit(struct nfs_write_data *data, int how)
 	idx = calc_ds_index_from_commit(lseg, data->ds_commit_index);
 	ds = nfs4_fl_prepare_ds(lseg, idx);
 	if (!ds) {
-		printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__);
+		printk(KERN_ERR "NFS: %s: prepare_ds failed, use MDS\n",
+			__func__);
 		set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
 		set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
 		prepare_to_resend_writes(data);
-		data->mds_ops->rpc_release(data);
+		filelayout_commit_release(data);
 		return -EAGAIN;
 	}
 	dprintk("%s ino %lu, how %d\n", __func__, data->inode->i_ino, how);
@@ -817,24 +912,87 @@ static int filelayout_initiate_commit(struct nfs_write_data *data, int how)
 /*
  * This is only useful while we are using whole file layouts.
  */
-static struct pnfs_layout_segment *find_only_write_lseg(struct inode *inode)
+static struct pnfs_layout_segment *
+find_only_write_lseg_locked(struct inode *inode)
 {
-	struct pnfs_layout_segment *lseg, *rv = NULL;
+	struct pnfs_layout_segment *lseg;
 
-	spin_lock(&inode->i_lock);
 	list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list)
 		if (lseg->pls_range.iomode == IOMODE_RW)
-			rv = get_lseg(lseg);
+			return lseg;
+	return NULL;
+}
+
+static struct pnfs_layout_segment *find_only_write_lseg(struct inode *inode)
+{
+	struct pnfs_layout_segment *rv;
+
+	spin_lock(&inode->i_lock);
+	rv = find_only_write_lseg_locked(inode);
+	if (rv)
+		get_lseg(rv);
 	spin_unlock(&inode->i_lock);
 	return rv;
 }
 
-static int alloc_ds_commits(struct inode *inode, struct list_head *list)
+static int
+filelayout_scan_ds_commit_list(struct nfs4_fl_commit_bucket *bucket, int max,
+			       spinlock_t *lock)
+{
+	struct list_head *src = &bucket->written;
+	struct list_head *dst = &bucket->committing;
+	struct nfs_page *req, *tmp;
+	int ret = 0;
+
+	list_for_each_entry_safe(req, tmp, src, wb_list) {
+		if (!nfs_lock_request(req))
+			continue;
+		if (cond_resched_lock(lock))
+			list_safe_reset_next(req, tmp, wb_list);
+		nfs_request_remove_commit_list(req);
+		clear_bit(PG_COMMIT_TO_DS, &req->wb_flags);
+		nfs_list_add_request(req, dst);
+		ret++;
+		if (ret == max)
+			break;
+	}
+	return ret;
+}
+
+/* Move reqs from written to committing lists, returning count of number moved.
+ * Note called with i_lock held.
+ */
+static int filelayout_scan_commit_lists(struct inode *inode, int max,
+		spinlock_t *lock)
+{
+	struct pnfs_layout_segment *lseg;
+	struct nfs4_filelayout_segment *fl;
+	int i, rv = 0, cnt;
+
+	lseg = find_only_write_lseg_locked(inode);
+	if (!lseg)
+		goto out_done;
+	fl = FILELAYOUT_LSEG(lseg);
+	if (fl->commit_through_mds)
+		goto out_done;
+	for (i = 0; i < fl->number_of_buckets && max != 0; i++) {
+		cnt = filelayout_scan_ds_commit_list(&fl->commit_buckets[i],
+				max, lock);
+		max -= cnt;
+		rv += cnt;
+	}
+out_done:
+	return rv;
+}
+
+static unsigned int
+alloc_ds_commits(struct inode *inode, struct list_head *list)
 {
 	struct pnfs_layout_segment *lseg;
 	struct nfs4_filelayout_segment *fl;
 	struct nfs_write_data *data;
 	int i, j;
+	unsigned int nreq = 0;
 
 	/* Won't need this when non-whole file layout segments are supported
 	 * instead we will use a pnfs_layout_hdr structure */
@@ -843,28 +1001,27 @@ static int alloc_ds_commits(struct inode *inode, struct list_head *list)
 		return 0;
 	fl = FILELAYOUT_LSEG(lseg);
 	for (i = 0; i < fl->number_of_buckets; i++) {
-		if (list_empty(&fl->commit_buckets[i]))
+		if (list_empty(&fl->commit_buckets[i].committing))
 			continue;
 		data = nfs_commitdata_alloc();
 		if (!data)
-			goto out_bad;
+			break;
 		data->ds_commit_index = i;
 		data->lseg = lseg;
 		list_add(&data->pages, list);
+		nreq++;
 	}
-	put_lseg(lseg);
-	return 0;
 
-out_bad:
+	/* Clean up on error */
 	for (j = i; j < fl->number_of_buckets; j++) {
-		if (list_empty(&fl->commit_buckets[i]))
+		if (list_empty(&fl->commit_buckets[i].committing))
 			continue;
-		nfs_retry_commit(&fl->commit_buckets[i], lseg);
+		nfs_retry_commit(&fl->commit_buckets[i].committing, lseg);
 		put_lseg(lseg); /* associated with emptying bucket */
 	}
 	put_lseg(lseg);
 	/* Caller will clean up entries put on list */
-	return -ENOMEM;
+	return nreq;
 }
 
 /* This follows nfs_commit_list pretty closely */
@@ -874,40 +1031,40 @@ filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
 {
 	struct nfs_write_data *data, *tmp;
 	LIST_HEAD(list);
+	unsigned int nreq = 0;
 
 	if (!list_empty(mds_pages)) {
 		data = nfs_commitdata_alloc();
-		if (!data)
-			goto out_bad;
-		data->lseg = NULL;
-		list_add(&data->pages, &list);
+		if (data != NULL) {
+			data->lseg = NULL;
+			list_add(&data->pages, &list);
+			nreq++;
+		} else
+			nfs_retry_commit(mds_pages, NULL);
 	}
 
-	if (alloc_ds_commits(inode, &list))
-		goto out_bad;
+	nreq += alloc_ds_commits(inode, &list);
+
+	if (nreq == 0) {
+		nfs_commit_clear_lock(NFS_I(inode));
+		goto out;
+	}
+
+	atomic_add(nreq, &NFS_I(inode)->commits_outstanding);
 
 	list_for_each_entry_safe(data, tmp, &list, pages) {
 		list_del_init(&data->pages);
-		atomic_inc(&NFS_I(inode)->commits_outstanding);
 		if (!data->lseg) {
 			nfs_init_commit(data, mds_pages, NULL);
 			nfs_initiate_commit(data, NFS_CLIENT(inode),
 					    data->mds_ops, how);
 		} else {
-			nfs_init_commit(data, &FILELAYOUT_LSEG(data->lseg)->commit_buckets[data->ds_commit_index], data->lseg);
+			nfs_init_commit(data, &FILELAYOUT_LSEG(data->lseg)->commit_buckets[data->ds_commit_index].committing, data->lseg);
 			filelayout_initiate_commit(data, how);
 		}
 	}
-	return 0;
- out_bad:
-	list_for_each_entry_safe(data, tmp, &list, pages) {
-		nfs_retry_commit(&data->pages, data->lseg);
-		list_del_init(&data->pages);
-		nfs_commit_free(data);
-	}
-	nfs_retry_commit(mds_pages, NULL);
-	nfs_commit_clear_lock(NFS_I(inode));
-	return -ENOMEM;
+out:
+	return PNFS_ATTEMPTED;
 }
 
 static void
@@ -924,8 +1081,9 @@ static struct pnfs_layoutdriver_type filelayout_type = {
 	.free_lseg = filelayout_free_lseg,
 	.pg_read_ops = &filelayout_pg_read_ops,
 	.pg_write_ops = &filelayout_pg_write_ops,
-	.mark_pnfs_commit = filelayout_mark_pnfs_commit,
-	.choose_commit_list = filelayout_choose_commit_list,
+	.mark_request_commit = filelayout_mark_request_commit,
+	.clear_request_commit = filelayout_clear_request_commit,
+	.scan_commit_lists = filelayout_scan_commit_lists,
 	.commit_pagelist = filelayout_commit_pagelist,
 	.read_pagelist = filelayout_read_pagelist,
 	.write_pagelist = filelayout_write_pagelist,
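
The nfs4filelayout.c rework above replaces the single per-data-server commit list with a two-list bucket: requests sit on "written" until a commit scan claims them, and filelayout_scan_ds_commit_list() moves at most "max" of them onto "committing" under the inode lock. A minimal user-space sketch of that handoff, with hypothetical req/bucket types standing in for the kernel's list_head machinery:

	#include <stdio.h>

	struct req {
		int index;
		struct req *next;
	};

	struct commit_bucket {
		struct req *written;    /* dirty reqs awaiting a future COMMIT */
		struct req *committing; /* reqs claimed by an in-flight COMMIT */
	};

	/* Move at most 'max' requests; returns how many were moved. */
	static int scan_bucket(struct commit_bucket *b, int max)
	{
		int moved = 0;

		while (b->written != NULL && moved < max) {
			struct req *r = b->written;

			b->written = r->next;
			r->next = b->committing;
			b->committing = r;
			moved++;
		}
		return moved;
	}

	int main(void)
	{
		struct req r2 = { 2, NULL }, r1 = { 1, &r2 };
		struct commit_bucket b = { &r1, NULL };

		printf("moved %d\n", scan_bucket(&b, 10)); /* prints "moved 2" */
		return 0;
	}

The split is what lets a later rewrite pull a request back off "written" (filelayout_clear_request_commit above) without disturbing a COMMIT that is already on the wire.
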
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index 2e42284253fa..21190bb1f5e3 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -74,6 +74,11 @@ struct nfs4_file_layout_dsaddr {
 	struct nfs4_pnfs_ds	*ds_list[1];
 };
 
+struct nfs4_fl_commit_bucket {
+	struct list_head written;
+	struct list_head committing;
+};
+
 struct nfs4_filelayout_segment {
 	struct pnfs_layout_segment generic_hdr;
 	u32 stripe_type;
@@ -84,7 +89,7 @@ struct nfs4_filelayout_segment {
 	struct nfs4_file_layout_dsaddr *dsaddr; /* Point to GETDEVINFO data */
 	unsigned int num_fh;
 	struct nfs_fh **fh_array;
-	struct list_head *commit_buckets; /* Sort commits to ds */
+	struct nfs4_fl_commit_bucket *commit_buckets; /* Sort commits to ds */
 	int number_of_buckets;
 };
 
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index 8ae91908f5aa..a866bbd2890a 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -45,7 +45,7 @@
  * - incremented when a device id maps a data server already in the cache.
  * - decremented when deviceid is removed from the cache.
  */
-DEFINE_SPINLOCK(nfs4_ds_cache_lock);
+static DEFINE_SPINLOCK(nfs4_ds_cache_lock);
 static LIST_HEAD(nfs4_data_server_cache);
 
 /* Debug routines */
@@ -108,58 +108,40 @@ same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2)
 	return false;
 }
 
-/*
- * Lookup DS by addresses. The first matching address returns true.
- * nfs4_ds_cache_lock is held
- */
-static struct nfs4_pnfs_ds *
-_data_server_lookup_locked(struct list_head *dsaddrs)
+static bool
+_same_data_server_addrs_locked(const struct list_head *dsaddrs1,
+			       const struct list_head *dsaddrs2)
 {
-	struct nfs4_pnfs_ds *ds;
 	struct nfs4_pnfs_ds_addr *da1, *da2;
 
-	list_for_each_entry(da1, dsaddrs, da_node) {
-		list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) {
-			list_for_each_entry(da2, &ds->ds_addrs, da_node) {
-				if (same_sockaddr(
-					(struct sockaddr *)&da1->da_addr,
-					(struct sockaddr *)&da2->da_addr))
-					return ds;
-			}
-		}
+	/* step through both lists, comparing as we go */
+	for (da1 = list_first_entry(dsaddrs1, typeof(*da1), da_node),
+	     da2 = list_first_entry(dsaddrs2, typeof(*da2), da_node);
+	     da1 != NULL && da2 != NULL;
+	     da1 = list_entry(da1->da_node.next, typeof(*da1), da_node),
+	     da2 = list_entry(da2->da_node.next, typeof(*da2), da_node)) {
+		if (!same_sockaddr((struct sockaddr *)&da1->da_addr,
+				   (struct sockaddr *)&da2->da_addr))
+			return false;
 	}
-	return NULL;
+	if (da1 == NULL && da2 == NULL)
+		return true;
+
+	return false;
 }
 
 /*
- * Compare two lists of addresses.
+ * Lookup DS by addresses. nfs4_ds_cache_lock is held
  */
-static bool
-_data_server_match_all_addrs_locked(struct list_head *dsaddrs1,
-				    struct list_head *dsaddrs2)
+static struct nfs4_pnfs_ds *
+_data_server_lookup_locked(const struct list_head *dsaddrs)
 {
-	struct nfs4_pnfs_ds_addr *da1, *da2;
-	size_t count1 = 0,
-	       count2 = 0;
-
-	list_for_each_entry(da1, dsaddrs1, da_node)
-		count1++;
-
-	list_for_each_entry(da2, dsaddrs2, da_node) {
-		bool found = false;
-		count2++;
-		list_for_each_entry(da1, dsaddrs1, da_node) {
-			if (same_sockaddr((struct sockaddr *)&da1->da_addr,
-					  (struct sockaddr *)&da2->da_addr)) {
-				found = true;
-				break;
-			}
-		}
-		if (!found)
-			return false;
-	}
+	struct nfs4_pnfs_ds *ds;
 
-	return (count1 == count2);
+	list_for_each_entry(ds, &nfs4_data_server_cache, ds_node)
+		if (_same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs))
+			return ds;
+	return NULL;
 }
 
 /*
@@ -356,11 +338,6 @@ nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags)
 		dprintk("%s add new data server %s\n", __func__,
 			ds->ds_remotestr);
 	} else {
-		if (!_data_server_match_all_addrs_locked(&tmp_ds->ds_addrs,
-							 dsaddrs)) {
-			dprintk("%s: multipath address mismatch: %s != %s",
-				__func__, tmp_ds->ds_remotestr, remotestr);
-		}
 		kfree(remotestr);
 		kfree(ds);
 		atomic_inc(&tmp_ds->ds_count);
@@ -378,7 +355,7 @@ out:
  * Currently only supports ipv4, ipv6 and one multi-path address.
  */
 static struct nfs4_pnfs_ds_addr *
-decode_ds_addr(struct xdr_stream *streamp, gfp_t gfp_flags)
+decode_ds_addr(struct net *net, struct xdr_stream *streamp, gfp_t gfp_flags)
 {
 	struct nfs4_pnfs_ds_addr *da = NULL;
 	char *buf, *portstr;
@@ -457,7 +434,7 @@ decode_ds_addr(struct xdr_stream *streamp, gfp_t gfp_flags)
 
 	INIT_LIST_HEAD(&da->da_node);
 
-	if (!rpc_pton(buf, portstr-buf, (struct sockaddr *)&da->da_addr,
+	if (!rpc_pton(net, buf, portstr-buf, (struct sockaddr *)&da->da_addr,
 		      sizeof(da->da_addr))) {
 		dprintk("%s: error parsing address %s\n", __func__, buf);
 		goto out_free_da;
@@ -554,7 +531,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
 	cnt = be32_to_cpup(p);
 	dprintk("%s stripe count %d\n", __func__, cnt);
 	if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) {
-		printk(KERN_WARNING "%s: stripe count %d greater than "
+		printk(KERN_WARNING "NFS: %s: stripe count %d greater than "
 		       "supported maximum %d\n", __func__,
 			cnt, NFS4_PNFS_MAX_STRIPE_CNT);
 		goto out_err_free_scratch;
@@ -585,7 +562,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
 	num = be32_to_cpup(p);
 	dprintk("%s ds_num %u\n", __func__, num);
 	if (num > NFS4_PNFS_MAX_MULTI_CNT) {
-		printk(KERN_WARNING "%s: multipath count %d greater than "
+		printk(KERN_WARNING "NFS: %s: multipath count %d greater than "
 		       "supported maximum %d\n", __func__,
 			num, NFS4_PNFS_MAX_MULTI_CNT);
 		goto out_err_free_stripe_indices;
@@ -593,7 +570,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
 
 	/* validate stripe indices are all < num */
 	if (max_stripe_index >= num) {
-		printk(KERN_WARNING "%s: stripe index %u >= num ds %u\n",
+		printk(KERN_WARNING "NFS: %s: stripe index %u >= num ds %u\n",
 			__func__, max_stripe_index, num);
 		goto out_err_free_stripe_indices;
 	}
@@ -625,7 +602,8 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
 
 	mp_count = be32_to_cpup(p); /* multipath count */
 	for (j = 0; j < mp_count; j++) {
-		da = decode_ds_addr(&stream, gfp_flags);
+		da = decode_ds_addr(NFS_SERVER(ino)->nfs_client->net,
+				    &stream, gfp_flags);
 		if (da)
 			list_add_tail(&da->da_node, &dsaddrs);
 	}
@@ -686,7 +664,7 @@ decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_fl
 
 	new = decode_device(inode, dev, gfp_flags);
 	if (!new) {
-		printk(KERN_WARNING "%s: Could not decode or add device\n",
+		printk(KERN_WARNING "NFS: %s: Could not decode or add device\n",
 			__func__);
 		return NULL;
 	}
@@ -835,7 +813,7 @@ nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
 	struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx];
 
 	if (ds == NULL) {
-		printk(KERN_ERR "%s: No data server for offset index %d\n",
+		printk(KERN_ERR "NFS: %s: No data server for offset index %d\n",
 			__func__, ds_idx);
 		return NULL;
 	}
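
The _same_data_server_addrs_locked() rewrite above turns the old count-and-match-in-any-order comparison into a strict element-for-element walk of the two address lists: two data servers are considered the same only if the lists match pairwise and have equal length. The same logic on plain pointers, as a sketch (hypothetical addr type; the kernel version iterates struct list_head nodes):

	#include <stdbool.h>
	#include <string.h>

	struct addr {
		char text[64];    /* stand-in for a struct sockaddr_storage */
		struct addr *next;
	};

	static bool same_addr_lists(const struct addr *a, const struct addr *b)
	{
		/* step through both lists, comparing as we go */
		for (; a != NULL && b != NULL; a = a->next, b = b->next)
			if (strcmp(a->text, b->text) != 0)
				return false;
		/* equal only if both lists ran out together */
		return a == NULL && b == NULL;
	}
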
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index bb80c49b6533..9c8eca315f43 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -94,13 +94,14 @@ static int nfs4_validate_fspath(struct dentry *dentry,
 }
 
 static size_t nfs_parse_server_name(char *string, size_t len,
-		struct sockaddr *sa, size_t salen)
+		struct sockaddr *sa, size_t salen, struct nfs_server *server)
 {
+	struct net *net = rpc_net_ns(server->client);
 	ssize_t ret;
 
-	ret = rpc_pton(string, len, sa, salen);
+	ret = rpc_pton(net, string, len, sa, salen);
 	if (ret == 0) {
-		ret = nfs_dns_resolve_name(string, len, sa, salen);
+		ret = nfs_dns_resolve_name(net, string, len, sa, salen);
 		if (ret < 0)
 			ret = 0;
 	}
@@ -137,7 +138,8 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
 			continue;
 
 		mountdata->addrlen = nfs_parse_server_name(buf->data, buf->len,
-				mountdata->addr, addr_bufsize);
+				mountdata->addr, addr_bufsize,
+				NFS_SB(mountdata->sb));
 		if (mountdata->addrlen == 0)
 			continue;
 
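
nfs_parse_server_name() keeps its parse-then-resolve shape: try the string as a presentation-format IP address first, and only fall back to a DNS lookup (now keyed by the mount's network namespace) when that fails. A rough user-space analogue of the same fallback, assuming IPv4 and standard libc calls rather than the rpc_pton()/nfs_dns_resolve_name() pair:

	#include <arpa/inet.h>
	#include <netdb.h>
	#include <string.h>

	/* Returns the sockaddr length on success, 0 on failure (mirroring
	 * the NFS helper's convention). Sketch only: IPv4, no port handling. */
	static socklen_t parse_server_name(const char *name, struct sockaddr_in *sa)
	{
		struct addrinfo *res;
		socklen_t len = 0;

		memset(sa, 0, sizeof(*sa));
		sa->sin_family = AF_INET;
		if (inet_pton(AF_INET, name, &sa->sin_addr) == 1)
			return sizeof(*sa);	/* literal address */

		if (getaddrinfo(name, NULL, NULL, &res) != 0)
			return 0;		/* resolution failed */
		if (res->ai_family == AF_INET) {
			memcpy(sa, res->ai_addr, res->ai_addrlen);
			len = (socklen_t)res->ai_addrlen;
		}
		freeaddrinfo(res);
		return len;
	}
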
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index caf92d05c3a9..e809d2305ebf 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -72,18 +72,21 @@
 
 #define NFS4_MAX_LOOP_ON_RECOVER (10)
 
+static unsigned short max_session_slots = NFS4_DEF_SLOT_TABLE_SIZE;
+
 struct nfs4_opendata;
 static int _nfs4_proc_open(struct nfs4_opendata *data);
 static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
 static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
 static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *);
+static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr);
 static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
 static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
 			    struct nfs_fattr *fattr, struct iattr *sattr,
 			    struct nfs4_state *state);
 #ifdef CONFIG_NFS_V4_1
-static int nfs41_test_stateid(struct nfs_server *, struct nfs4_state *);
-static int nfs41_free_stateid(struct nfs_server *, struct nfs4_state *);
+static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *);
+static int nfs41_free_stateid(struct nfs_server *, nfs4_stateid *);
 #endif
 /* Prevent leaks of NFSv4 errors into userland */
 static int nfs4_map_errors(int err)
@@ -259,15 +262,28 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc
 {
 	struct nfs_client *clp = server->nfs_client;
 	struct nfs4_state *state = exception->state;
+	struct inode *inode = exception->inode;
 	int ret = errorcode;
 
 	exception->retry = 0;
 	switch(errorcode) {
 		case 0:
 			return 0;
+		case -NFS4ERR_OPENMODE:
+			if (nfs_have_delegation(inode, FMODE_READ)) {
+				nfs_inode_return_delegation(inode);
+				exception->retry = 1;
+				return 0;
+			}
+			if (state == NULL)
+				break;
+			nfs4_schedule_stateid_recovery(server, state);
+			goto wait_on_recovery;
+		case -NFS4ERR_DELEG_REVOKED:
 		case -NFS4ERR_ADMIN_REVOKED:
 		case -NFS4ERR_BAD_STATEID:
-		case -NFS4ERR_OPENMODE:
+			if (state != NULL)
+				nfs_remove_bad_delegation(state->inode);
 			if (state == NULL)
 				break;
 			nfs4_schedule_stateid_recovery(server, state);
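
The new -NFS4ERR_OPENMODE branch above plugs into the standard NFSv4 retry idiom: callers wrap the raw operation in nfs4_handle_exception() and loop while the handler sets exception.retry, so returning a read delegation transparently re-drives the call. The caller shape, assembled from the nfs4_do_setattr() hunk later in this patch (a sketch of existing usage, not new API):

	struct nfs4_exception exception = {
		.state = state,
		.inode = inode,
	};
	int err;

	do {
		err = nfs4_handle_exception(server,
				_nfs4_do_setattr(inode, cred, fattr, sattr, state),
				&exception);
	} while (exception.retry);
	return err;
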
@@ -360,16 +376,14 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp
  * When updating highest_used_slotid there may be "holes" in the bitmap
  * so we need to scan down from highest_used_slotid to 0 looking for the now
  * highest slotid in use.
- * If none found, highest_used_slotid is set to -1.
+ * If none found, highest_used_slotid is set to NFS4_NO_SLOT.
  *
  * Must be called while holding tbl->slot_tbl_lock
  */
 static void
-nfs4_free_slot(struct nfs4_slot_table *tbl, u8 free_slotid)
+nfs4_free_slot(struct nfs4_slot_table *tbl, u32 slotid)
 {
-	int slotid = free_slotid;
-
-	BUG_ON(slotid < 0 || slotid >= NFS4_MAX_SLOT_TABLE);
+	BUG_ON(slotid >= NFS4_MAX_SLOT_TABLE);
 	/* clear used bit in bitmap */
 	__clear_bit(slotid, tbl->used_slots);
 
@@ -379,10 +393,16 @@ nfs4_free_slot(struct nfs4_slot_table *tbl, u8 free_slotid)
 	if (slotid < tbl->max_slots)
 		tbl->highest_used_slotid = slotid;
 	else
-		tbl->highest_used_slotid = -1;
+		tbl->highest_used_slotid = NFS4_NO_SLOT;
 	}
-	dprintk("%s: free_slotid %u highest_used_slotid %d\n", __func__,
-		free_slotid, tbl->highest_used_slotid);
+	dprintk("%s: slotid %u highest_used_slotid %d\n", __func__,
+		slotid, tbl->highest_used_slotid);
+}
+
+bool nfs4_set_task_privileged(struct rpc_task *task, void *dummy)
+{
+	rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
+	return true;
 }
 
 /*
@@ -390,16 +410,13 @@ nfs4_free_slot(struct nfs4_slot_table *tbl, u8 free_slotid)
  */
 static void nfs4_check_drain_fc_complete(struct nfs4_session *ses)
 {
-	struct rpc_task *task;
-
 	if (!test_bit(NFS4_SESSION_DRAINING, &ses->session_state)) {
-		task = rpc_wake_up_next(&ses->fc_slot_table.slot_tbl_waitq);
-		if (task)
-			rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
+		rpc_wake_up_first(&ses->fc_slot_table.slot_tbl_waitq,
+				nfs4_set_task_privileged, NULL);
 		return;
 	}
 
-	if (ses->fc_slot_table.highest_used_slotid != -1)
+	if (ses->fc_slot_table.highest_used_slotid != NFS4_NO_SLOT)
 		return;
 
 	dprintk("%s COMPLETE: Session Fore Channel Drained\n", __func__);
@@ -412,7 +429,7 @@ static void nfs4_check_drain_fc_complete(struct nfs4_session *ses)
 void nfs4_check_drain_bc_complete(struct nfs4_session *ses)
 {
 	if (!test_bit(NFS4_SESSION_DRAINING, &ses->session_state) ||
-	    ses->bc_slot_table.highest_used_slotid != -1)
+	    ses->bc_slot_table.highest_used_slotid != NFS4_NO_SLOT)
 		return;
 	dprintk("%s COMPLETE: Session Back Channel Drained\n", __func__);
 	complete(&ses->bc_slot_table.complete);
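
Note the shape of the rpc_wake_up_first() conversion above: instead of waking a task and then raising its priority (a window in which an already-woken task could run unprivileged), the drain path passes a callback that the RPC layer applies to the waiter before releasing it. A generic user-space sketch of that wake-with-action convention (hypothetical queue types):

	#include <stdbool.h>
	#include <pthread.h>

	struct task { int priority; struct task *next; };

	struct waitq {
		pthread_mutex_t lock;
		struct task *head;
	};

	/* Apply 'action' to the first waiter while the queue lock is still
	 * held, and only then dequeue it. */
	static struct task *wake_first(struct waitq *q,
				       bool (*action)(struct task *, void *),
				       void *data)
	{
		struct task *t = NULL;

		pthread_mutex_lock(&q->lock);
		if (q->head != NULL && action(q->head, data)) {
			t = q->head;
			q->head = t->next;
		}
		pthread_mutex_unlock(&q->lock);
		return t;
	}
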
@@ -507,25 +524,25 @@ static int nfs4_sequence_done(struct rpc_task *task,
  * nfs4_find_slot looks for an unset bit in the used_slots bitmap.
  * If found, we mark the slot as used, update the highest_used_slotid,
  * and respectively set up the sequence operation args.
- * The slot number is returned if found, or NFS4_MAX_SLOT_TABLE otherwise.
+ * The slot number is returned if found, or NFS4_NO_SLOT otherwise.
  *
  * Note: must be called with under the slot_tbl_lock.
  */
-static u8
+static u32
 nfs4_find_slot(struct nfs4_slot_table *tbl)
 {
-	int slotid;
-	u8 ret_id = NFS4_MAX_SLOT_TABLE;
-	BUILD_BUG_ON((u8)NFS4_MAX_SLOT_TABLE != (int)NFS4_MAX_SLOT_TABLE);
+	u32 slotid;
+	u32 ret_id = NFS4_NO_SLOT;
 
-	dprintk("--> %s used_slots=%04lx highest_used=%d max_slots=%d\n",
+	dprintk("--> %s used_slots=%04lx highest_used=%u max_slots=%u\n",
 		__func__, tbl->used_slots[0], tbl->highest_used_slotid,
 		tbl->max_slots);
 	slotid = find_first_zero_bit(tbl->used_slots, tbl->max_slots);
 	if (slotid >= tbl->max_slots)
 		goto out;
 	__set_bit(slotid, tbl->used_slots);
-	if (slotid > tbl->highest_used_slotid)
+	if (slotid > tbl->highest_used_slotid ||
+	    tbl->highest_used_slotid == NFS4_NO_SLOT)
 		tbl->highest_used_slotid = slotid;
 	ret_id = slotid;
 out:
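
The slot table above is nothing more than a bitmap plus a cached high-water mark: allocation is find-first-zero-bit, and freeing clears the bit and rescans downward, with NFS4_NO_SLOT replacing the old -1 sentinel now that slotids are unsigned. A compact user-space model of both operations (hypothetical constants, a single 64-bit word instead of the kernel bitmap):

	#include <stdint.h>

	#define MAX_SLOTS 64u
	#define NO_SLOT   UINT32_MAX

	static uint64_t used_slots;
	static uint32_t highest_used = NO_SLOT;

	static uint32_t find_slot(uint32_t max_slots)
	{
		for (uint32_t i = 0; i < max_slots; i++)
			if (!(used_slots & (1ULL << i))) {
				used_slots |= 1ULL << i;
				if (highest_used == NO_SLOT || i > highest_used)
					highest_used = i;
				return i;
			}
		return NO_SLOT; /* caller sleeps on the slot waitqueue */
	}

	static void free_slot(uint32_t slotid)
	{
		used_slots &= ~(1ULL << slotid);
		if (slotid != highest_used)
			return;
		/* scan down for the new highest slotid in use */
		highest_used = NO_SLOT;
		for (uint32_t i = slotid; i-- > 0; )
			if (used_slots & (1ULL << i)) {
				highest_used = i;
				break;
			}
	}
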
@@ -534,15 +551,25 @@ out:
 	return ret_id;
 }
 
+static void nfs41_init_sequence(struct nfs4_sequence_args *args,
+	struct nfs4_sequence_res *res, int cache_reply)
+{
+	args->sa_session = NULL;
+	args->sa_cache_this = 0;
+	if (cache_reply)
+		args->sa_cache_this = 1;
+	res->sr_session = NULL;
+	res->sr_slot = NULL;
+}
+
 int nfs41_setup_sequence(struct nfs4_session *session,
 				struct nfs4_sequence_args *args,
 				struct nfs4_sequence_res *res,
-				int cache_reply,
 				struct rpc_task *task)
 {
 	struct nfs4_slot *slot;
 	struct nfs4_slot_table *tbl;
-	u8 slotid;
+	u32 slotid;
 
 	dprintk("--> %s\n", __func__);
 	/* slot already allocated? */
@@ -570,7 +597,7 @@ int nfs41_setup_sequence(struct nfs4_session *session,
 	}
 
 	slotid = nfs4_find_slot(tbl);
-	if (slotid == NFS4_MAX_SLOT_TABLE) {
+	if (slotid == NFS4_NO_SLOT) {
 		rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL);
 		spin_unlock(&tbl->slot_tbl_lock);
 		dprintk("<-- %s: no free slots\n", __func__);
@@ -582,7 +609,6 @@ int nfs41_setup_sequence(struct nfs4_session *session,
 	slot = tbl->slots + slotid;
 	args->sa_session = session;
 	args->sa_slotid = slotid;
-	args->sa_cache_this = cache_reply;
 
 	dprintk("<-- %s slotid=%d seqid=%d\n", __func__, slotid, slot->seq_nr);
 
@@ -602,24 +628,19 @@ EXPORT_SYMBOL_GPL(nfs41_setup_sequence);
 int nfs4_setup_sequence(const struct nfs_server *server,
 			struct nfs4_sequence_args *args,
 			struct nfs4_sequence_res *res,
-			int cache_reply,
 			struct rpc_task *task)
 {
 	struct nfs4_session *session = nfs4_get_session(server);
 	int ret = 0;
 
-	if (session == NULL) {
-		args->sa_session = NULL;
-		res->sr_session = NULL;
+	if (session == NULL)
 		goto out;
-	}
 
 	dprintk("--> %s clp %p session %p sr_slot %td\n",
 		__func__, session->clp, session, res->sr_slot ?
 		res->sr_slot - session->fc_slot_table.slots : -1);
 
-	ret = nfs41_setup_sequence(session, args, res, cache_reply,
-				   task);
+	ret = nfs41_setup_sequence(session, args, res, task);
 out:
 	dprintk("<-- %s status=%d\n", __func__, ret);
 	return ret;
@@ -629,7 +650,6 @@ struct nfs41_call_sync_data {
 	const struct nfs_server *seq_server;
 	struct nfs4_sequence_args *seq_args;
 	struct nfs4_sequence_res *seq_res;
-	int cache_reply;
 };
 
 static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata)
@@ -639,7 +659,7 @@ static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata)
 	dprintk("--> %s data->seq_server %p\n", __func__, data->seq_server);
 
 	if (nfs4_setup_sequence(data->seq_server, data->seq_args,
-				data->seq_res, data->cache_reply, task))
+				data->seq_res, task))
 		return;
 	rpc_call_start(task);
 }
@@ -657,12 +677,12 @@ static void nfs41_call_sync_done(struct rpc_task *task, void *calldata)
 	nfs41_sequence_done(task, data->seq_res);
 }
 
-struct rpc_call_ops nfs41_call_sync_ops = {
+static const struct rpc_call_ops nfs41_call_sync_ops = {
 	.rpc_call_prepare = nfs41_call_sync_prepare,
 	.rpc_call_done = nfs41_call_sync_done,
 };
 
-struct rpc_call_ops nfs41_call_priv_sync_ops = {
+static const struct rpc_call_ops nfs41_call_priv_sync_ops = {
 	.rpc_call_prepare = nfs41_call_priv_sync_prepare,
 	.rpc_call_done = nfs41_call_sync_done,
 };
@@ -672,7 +692,6 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt,
 			   struct rpc_message *msg,
 			   struct nfs4_sequence_args *args,
 			   struct nfs4_sequence_res *res,
-			   int cache_reply,
 			   int privileged)
 {
 	int ret;
@@ -681,7 +700,6 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt,
 		.seq_server = server,
 		.seq_args = args,
 		.seq_res = res,
-		.cache_reply = cache_reply,
 	};
 	struct rpc_task_setup task_setup = {
 		.rpc_client = clnt,
@@ -690,7 +708,6 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt,
 		.callback_data = &data
 	};
 
-	res->sr_slot = NULL;
 	if (privileged)
 		task_setup.callback_ops = &nfs41_call_priv_sync_ops;
 	task = rpc_run_task(&task_setup);
@@ -710,10 +727,17 @@ int _nfs4_call_sync_session(struct rpc_clnt *clnt,
 			    struct nfs4_sequence_res *res,
 			    int cache_reply)
 {
-	return nfs4_call_sync_sequence(clnt, server, msg, args, res, cache_reply, 0);
+	nfs41_init_sequence(args, res, cache_reply);
+	return nfs4_call_sync_sequence(clnt, server, msg, args, res, 0);
 }
 
 #else
+static inline
+void nfs41_init_sequence(struct nfs4_sequence_args *args,
+	struct nfs4_sequence_res *res, int cache_reply)
+{
+}
+
 static int nfs4_sequence_done(struct rpc_task *task,
 			       struct nfs4_sequence_res *res)
 {
@@ -728,7 +752,7 @@ int _nfs4_call_sync(struct rpc_clnt *clnt,
 		   struct nfs4_sequence_res *res,
 		   int cache_reply)
 {
-	args->sa_session = res->sr_session = NULL;
+	nfs41_init_sequence(args, res, cache_reply);
 	return rpc_call_sync(clnt, msg, 0);
 }
 
@@ -815,20 +839,22 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
 	p->o_arg.open_flags = flags;
 	p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE);
 	p->o_arg.clientid = server->nfs_client->cl_clientid;
-	p->o_arg.id = sp->so_owner_id.id;
+	p->o_arg.id = sp->so_seqid.owner_id;
 	p->o_arg.name = &dentry->d_name;
 	p->o_arg.server = server;
 	p->o_arg.bitmask = server->attr_bitmask;
 	p->o_arg.dir_bitmask = server->cache_consistency_bitmask;
 	p->o_arg.claim = NFS4_OPEN_CLAIM_NULL;
-	if (flags & O_CREAT) {
-		u32 *s;
+	if (attrs != NULL && attrs->ia_valid != 0) {
+		__be32 verf[2];
 
 		p->o_arg.u.attrs = &p->attrs;
 		memcpy(&p->attrs, attrs, sizeof(p->attrs));
-		s = (u32 *) p->o_arg.u.verifier.data;
-		s[0] = jiffies;
-		s[1] = current->pid;
+
+		verf[0] = jiffies;
+		verf[1] = current->pid;
+		memcpy(p->o_arg.u.verifier.data, verf,
+			sizeof(p->o_arg.u.verifier.data));
 	}
 	p->c_arg.fh = &p->o_res.fh;
 	p->c_arg.stateid = &p->o_res.stateid;
@@ -878,7 +904,7 @@ static int can_open_cached(struct nfs4_state *state, fmode_t mode, int open_mode
 {
 	int ret = 0;
 
-	if (open_mode & O_EXCL)
+	if (open_mode & (O_EXCL|O_TRUNC))
 		goto out;
 	switch (mode & (FMODE_READ|FMODE_WRITE)) {
 		case FMODE_READ:
@@ -927,8 +953,8 @@ static void update_open_stateflags(struct nfs4_state *state, fmode_t fmode)
 static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode)
 {
 	if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
-		memcpy(state->stateid.data, stateid->data, sizeof(state->stateid.data));
-	memcpy(state->open_stateid.data, stateid->data, sizeof(state->open_stateid.data));
+		nfs4_stateid_copy(&state->stateid, stateid);
+	nfs4_stateid_copy(&state->open_stateid, stateid);
 	switch (fmode) {
 		case FMODE_READ:
 			set_bit(NFS_O_RDONLY_STATE, &state->flags);
@@ -956,7 +982,7 @@ static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_s
 	 */
 	write_seqlock(&state->seqlock);
 	if (deleg_stateid != NULL) {
-		memcpy(state->stateid.data, deleg_stateid->data, sizeof(state->stateid.data));
+		nfs4_stateid_copy(&state->stateid, deleg_stateid);
 		set_bit(NFS_DELEGATED_STATE, &state->flags);
 	}
 	if (open_stateid != NULL)
@@ -987,7 +1013,7 @@ static int update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stat
 
 	if (delegation == NULL)
 		delegation = &deleg_cur->stateid;
-	else if (memcmp(deleg_cur->stateid.data, delegation->data, NFS4_STATEID_SIZE) != 0)
+	else if (!nfs4_stateid_match(&deleg_cur->stateid, delegation))
 		goto no_delegation_unlock;
 
 	nfs_mark_delegation_referenced(deleg_cur);
@@ -1026,7 +1052,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
 	struct nfs4_state *state = opendata->state;
 	struct nfs_inode *nfsi = NFS_I(state->inode);
 	struct nfs_delegation *delegation;
-	int open_mode = opendata->o_arg.open_flags & O_EXCL;
+	int open_mode = opendata->o_arg.open_flags & (O_EXCL|O_TRUNC);
 	fmode_t fmode = opendata->o_arg.fmode;
 	nfs4_stateid stateid;
 	int ret = -EAGAIN;
@@ -1048,7 +1074,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
 			break;
 		}
 		/* Save the delegation */
-		memcpy(stateid.data, delegation->stateid.data, sizeof(stateid.data));
+		nfs4_stateid_copy(&stateid, &delegation->stateid);
 		rcu_read_unlock();
 		ret = nfs_may_open(state->inode, state->owner->so_cred, open_mode);
 		if (ret != 0)
@@ -1090,6 +1116,7 @@ static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data
 	if (state == NULL)
 		goto err_put_inode;
 	if (data->o_res.delegation_type != 0) {
+		struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
 		int delegation_flags = 0;
 
 		rcu_read_lock();
@@ -1101,7 +1128,7 @@ static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data
 			pr_err_ratelimited("NFS: Broken NFSv4 server %s is "
 					"returning a delegation for "
 					"OPEN(CLAIM_DELEGATE_CUR)\n",
-					NFS_CLIENT(inode)->cl_server);
+					clp->cl_hostname);
 		} else if ((delegation_flags & 1UL<<NFS_DELEGATION_NEED_RECLAIM) == 0)
 			nfs_inode_set_delegation(state->inode,
 					data->owner->so_cred,
@@ -1210,10 +1237,10 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *
 	 * Check if we need to update the current stateid.
 	 */
 	if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0 &&
-	    memcmp(state->stateid.data, state->open_stateid.data, sizeof(state->stateid.data)) != 0) {
+	    !nfs4_stateid_match(&state->stateid, &state->open_stateid)) {
 		write_seqlock(&state->seqlock);
 		if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
-			memcpy(state->stateid.data, state->open_stateid.data, sizeof(state->stateid.data));
+			nfs4_stateid_copy(&state->stateid, &state->open_stateid);
 		write_sequnlock(&state->seqlock);
 	}
 	return 0;
@@ -1282,8 +1309,7 @@ static int _nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs
 	if (IS_ERR(opendata))
 		return PTR_ERR(opendata);
 	opendata->o_arg.claim = NFS4_OPEN_CLAIM_DELEGATE_CUR;
-	memcpy(opendata->o_arg.u.delegation.data, stateid->data,
-			sizeof(opendata->o_arg.u.delegation.data));
+	nfs4_stateid_copy(&opendata->o_arg.u.delegation, stateid);
 	ret = nfs4_open_recover(opendata, state);
 	nfs4_opendata_put(opendata);
 	return ret;
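
All the memcpy()/memcmp() pairs replaced above collapse into two helpers whose definitions are added elsewhere in this series. A plausible shape, assuming they are thin wrappers over the fixed-size stateid buffer (a sketch, not the verified kernel definitions):

	static inline void nfs4_stateid_copy(nfs4_stateid *dst,
					     const nfs4_stateid *src)
	{
		memcpy(dst->data, src->data, sizeof(dst->data));
	}

	static inline bool nfs4_stateid_match(const nfs4_stateid *dst,
					      const nfs4_stateid *src)
	{
		return memcmp(dst->data, src->data, sizeof(dst->data)) == 0;
	}

Centralizing the comparison also makes it harder to get the length argument wrong, as the open-coded NFS4_STATEID_SIZE memcmp above could.
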
@@ -1319,8 +1345,11 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state
 			 * The show must go on: exit, but mark the
 			 * stateid as needing recovery.
 			 */
+			case -NFS4ERR_DELEG_REVOKED:
 			case -NFS4ERR_ADMIN_REVOKED:
 			case -NFS4ERR_BAD_STATEID:
+				nfs_inode_find_state_and_recover(state->inode,
+						stateid);
 				nfs4_schedule_stateid_recovery(server, state);
 			case -EKEYEXPIRED:
 				/*
@@ -1345,8 +1374,7 @@ static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata)
 
 	data->rpc_status = task->tk_status;
 	if (data->rpc_status == 0) {
-		memcpy(data->o_res.stateid.data, data->c_res.stateid.data,
-				sizeof(data->o_res.stateid.data));
+		nfs4_stateid_copy(&data->o_res.stateid, &data->c_res.stateid);
 		nfs_confirm_seqid(&data->owner->so_seqid, 0);
 		renew_lease(data->o_res.server, data->timestamp);
 		data->rpc_done = 1;
@@ -1440,7 +1468,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
 		rcu_read_unlock();
 	}
 	/* Update sequence id. */
-	data->o_arg.id = sp->so_owner_id.id;
+	data->o_arg.id = sp->so_seqid.owner_id;
 	data->o_arg.clientid = sp->so_server->nfs_client->cl_clientid;
 	if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) {
 		task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR];
@@ -1449,7 +1477,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
 	data->timestamp = jiffies;
 	if (nfs4_setup_sequence(data->o_arg.server,
 				&data->o_arg.seq_args,
-				&data->o_res.seq_res, 1, task))
+				&data->o_res.seq_res, task))
 		return;
 	rpc_call_start(task);
 	return;
@@ -1551,6 +1579,7 @@ static int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover)
 	};
 	int status;
 
+	nfs41_init_sequence(&o_arg->seq_args, &o_res->seq_res, 1);
 	kref_get(&data->kref);
 	data->rpc_done = 0;
 	data->rpc_status = 0;
@@ -1712,15 +1741,32 @@ static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *sta
 }
 
 #if defined(CONFIG_NFS_V4_1)
-static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state)
+static int nfs41_check_expired_stateid(struct nfs4_state *state, nfs4_stateid *stateid, unsigned int flags)
 {
-	int status;
+	int status = NFS_OK;
 	struct nfs_server *server = NFS_SERVER(state->inode);
 
-	status = nfs41_test_stateid(server, state);
-	if (status == NFS_OK)
-		return 0;
-	nfs41_free_stateid(server, state);
+	if (state->flags & flags) {
+		status = nfs41_test_stateid(server, stateid);
+		if (status != NFS_OK) {
+			nfs41_free_stateid(server, stateid);
+			state->flags &= ~flags;
+		}
+	}
+	return status;
+}
+
+static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state)
+{
+	int deleg_status, open_status;
+	int deleg_flags = 1 << NFS_DELEGATED_STATE;
+	int open_flags = (1 << NFS_O_RDONLY_STATE) | (1 << NFS_O_WRONLY_STATE) | (1 << NFS_O_RDWR_STATE);
+
+	deleg_status = nfs41_check_expired_stateid(state, &state->stateid, deleg_flags);
+	open_status = nfs41_check_expired_stateid(state, &state->open_stateid, open_flags);
+
+	if ((deleg_status == NFS_OK) && (open_status == NFS_OK))
+		return NFS_OK;
 	return nfs4_open_expired(sp, state);
 }
 #endif
@@ -1754,7 +1800,8 @@ static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, fmode_t fmode
 
 	/* Protect against reboot recovery conflicts */
 	status = -ENOMEM;
-	if (!(sp = nfs4_get_state_owner(server, cred))) {
+	sp = nfs4_get_state_owner(server, cred, GFP_KERNEL);
+	if (sp == NULL) {
 		dprintk("nfs4_do_open: nfs4_get_state_owner failed!\n");
 		goto out_err;
 	}
@@ -1829,7 +1876,7 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, struct dentry *dentry,
 	 * the user though...
 	 */
 	if (status == -NFS4ERR_BAD_SEQID) {
-		printk(KERN_WARNING "NFS: v4 server %s "
+		pr_warn_ratelimited("NFS: v4 server %s "
 		       " returned a bad sequence-id error!\n",
 		       NFS_SERVER(dir)->nfs_client->cl_hostname);
 		exception.retry = 1;
@@ -1882,12 +1929,14 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
 
 	nfs_fattr_init(fattr);
 
-	if (nfs4_copy_delegation_stateid(&arg.stateid, inode)) {
+	if (state != NULL) {
+		nfs4_select_rw_stateid(&arg.stateid, state, FMODE_WRITE,
+				current->files, current->tgid);
+	} else if (nfs4_copy_delegation_stateid(&arg.stateid, inode,
+				FMODE_WRITE)) {
 		/* Use that stateid */
-	} else if (state != NULL) {
-		nfs4_copy_stateid(&arg.stateid, state, current->files, current->tgid);
 	} else
-		memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid));
+		nfs4_stateid_copy(&arg.stateid, &zero_stateid);
 
 	status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
 	if (status == 0 && state != NULL)
@@ -1900,7 +1949,10 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
 			     struct nfs4_state *state)
 {
 	struct nfs_server *server = NFS_SERVER(inode);
-	struct nfs4_exception exception = { };
+	struct nfs4_exception exception = {
+		.state = state,
+		.inode = inode,
+	};
 	int err;
 	do {
 		err = nfs4_handle_exception(server,
@@ -1954,6 +2006,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
 	struct nfs4_state *state = calldata->state;
 	struct nfs_server *server = NFS_SERVER(calldata->inode);
 
+	dprintk("%s: begin!\n", __func__);
 	if (!nfs4_sequence_done(task, &calldata->res.seq_res))
 		return;
 	/* hmm. we are done with the inode, and in the process of freeing
@@ -1981,6 +2034,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
 	}
 	nfs_release_seqid(calldata->arg.seqid);
 	nfs_refresh_inode(calldata->inode, calldata->res.fattr);
+	dprintk("%s: done, ret = %d!\n", __func__, task->tk_status);
 }
 
 static void nfs4_close_prepare(struct rpc_task *task, void *data)
@@ -1989,6 +2043,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
 	struct nfs4_state *state = calldata->state;
 	int call_close = 0;
 
+	dprintk("%s: begin!\n", __func__);
 	if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0)
 		return;
 
@@ -2013,7 +2068,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
 	if (!call_close) {
 		/* Note: exit _without_ calling nfs4_close_done */
 		task->tk_action = NULL;
-		return;
+		goto out;
 	}
 
 	if (calldata->arg.fmode == 0) {
@@ -2022,17 +2077,20 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
 		    pnfs_roc_drain(calldata->inode, &calldata->roc_barrier)) {
 			rpc_sleep_on(&NFS_SERVER(calldata->inode)->roc_rpcwaitq,
 					task, NULL);
-			return;
+			goto out;
 		}
 	}
 
 	nfs_fattr_init(calldata->res.fattr);
2030 calldata->timestamp = jiffies; 2085 calldata->timestamp = jiffies;
2031 if (nfs4_setup_sequence(NFS_SERVER(calldata->inode), 2086 if (nfs4_setup_sequence(NFS_SERVER(calldata->inode),
2032 &calldata->arg.seq_args, &calldata->res.seq_res, 2087 &calldata->arg.seq_args,
2033 1, task)) 2088 &calldata->res.seq_res,
2034 return; 2089 task))
2090 goto out;
2035 rpc_call_start(task); 2091 rpc_call_start(task);
2092out:
2093 dprintk("%s: done!\n", __func__);
2036} 2094}
2037 2095
2038static const struct rpc_call_ops nfs4_close_ops = { 2096static const struct rpc_call_ops nfs4_close_ops = {
@@ -2074,6 +2132,7 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc)
 	calldata = kzalloc(sizeof(*calldata), gfp_mask);
 	if (calldata == NULL)
 		goto out;
+	nfs41_init_sequence(&calldata->arg.seq_args, &calldata->res.seq_res, 1);
 	calldata->inode = state->inode;
 	calldata->state = state;
 	calldata->arg.fh = NFS_FH(state->inode);
@@ -2182,6 +2241,7 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
 		server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE;
 		server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
 		server->acl_bitmask = res.acl_bitmask;
+		server->fh_expire_type = res.fh_expire_type;
 	}
 
 	return status;
@@ -2303,7 +2363,6 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
 	return nfs4_map_errors(status);
 }
 
-static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr);
 /*
  * Get locations and (maybe) other attributes of a referral.
  * Note that we'll actually follow the referral later when
@@ -2420,6 +2479,10 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
 		}
 	}
 
+	/* Deal with open(O_TRUNC) */
+	if (sattr->ia_valid & ATTR_OPEN)
+		sattr->ia_valid &= ~(ATTR_MTIME|ATTR_CTIME|ATTR_OPEN);
+
 	status = nfs4_do_setattr(inode, cred, fattr, sattr, state);
 	if (status == 0)
 		nfs_setattr_update_inode(inode, sattr);
@@ -2494,7 +2557,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry
 	struct nfs_server *server = NFS_SERVER(inode);
 	struct nfs4_accessargs args = {
 		.fh = NFS_FH(inode),
-		.bitmask = server->attr_bitmask,
+		.bitmask = server->cache_consistency_bitmask,
 	};
 	struct nfs4_accessres res = {
 		.server = server,
@@ -2712,8 +2775,18 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
 
 	args->bitmask = server->cache_consistency_bitmask;
 	res->server = server;
-	res->seq_res.sr_slot = NULL;
 	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE];
+	nfs41_init_sequence(&args->seq_args, &res->seq_res, 1);
+}
+
+static void nfs4_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data)
+{
+	if (nfs4_setup_sequence(NFS_SERVER(data->dir),
+				&data->args.seq_args,
+				&data->res.seq_res,
+				task))
+		return;
+	rpc_call_start(task);
 }
 
 static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
@@ -2738,6 +2811,17 @@ static void nfs4_proc_rename_setup(struct rpc_message *msg, struct inode *dir)
 	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME];
 	arg->bitmask = server->attr_bitmask;
 	res->server = server;
+	nfs41_init_sequence(&arg->seq_args, &res->seq_res, 1);
+}
+
+static void nfs4_proc_rename_rpc_prepare(struct rpc_task *task, struct nfs_renamedata *data)
+{
+	if (nfs4_setup_sequence(NFS_SERVER(data->old_dir),
+				&data->args.seq_args,
+				&data->res.seq_res,
+				task))
+		return;
+	rpc_call_start(task);
 }
 
 static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
@@ -3232,6 +3316,17 @@ static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message
 	data->timestamp = jiffies;
 	data->read_done_cb = nfs4_read_done_cb;
 	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
+	nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 0);
+}
+
+static void nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data)
+{
+	if (nfs4_setup_sequence(NFS_SERVER(data->inode),
+				&data->args.seq_args,
+				&data->res.seq_res,
+				task))
+		return;
+	rpc_call_start(task);
 }
 
 /* Reset the the nfs_read_data to send the read to the MDS. */
@@ -3305,6 +3400,17 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag
 	data->timestamp = jiffies;
 
 	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE];
+	nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
+}
+
+static void nfs4_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data)
+{
+	if (nfs4_setup_sequence(NFS_SERVER(data->inode),
+				&data->args.seq_args,
+				&data->res.seq_res,
+				task))
+		return;
+	rpc_call_start(task);
 }
 
 static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_write_data *data)
@@ -3339,6 +3445,7 @@ static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_messa
 	data->write_done_cb = nfs4_commit_done_cb;
 	data->res.server = server;
 	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT];
+	nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
 }
 
 struct nfs4_renewdata {
@@ -3714,8 +3821,11 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
 	if (task->tk_status >= 0)
 		return 0;
 	switch(task->tk_status) {
+		case -NFS4ERR_DELEG_REVOKED:
 		case -NFS4ERR_ADMIN_REVOKED:
 		case -NFS4ERR_BAD_STATEID:
+			if (state != NULL)
+				nfs_remove_bad_delegation(state->inode);
 		case -NFS4ERR_OPENMODE:
 			if (state == NULL)
 				break;
@@ -3764,6 +3874,16 @@ wait_on_recovery:
 	return -EAGAIN;
 }
 
+static void nfs4_construct_boot_verifier(struct nfs_client *clp,
+					 nfs4_verifier *bootverf)
+{
+	__be32 verf[2];
+
+	verf[0] = htonl((u32)clp->cl_boot_time.tv_sec);
+	verf[1] = htonl((u32)clp->cl_boot_time.tv_nsec);
+	memcpy(bootverf->data, verf, sizeof(bootverf->data));
+}
+
 int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
 		unsigned short port, struct rpc_cred *cred,
 		struct nfs4_setclientid_res *res)
@@ -3780,15 +3900,13 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
 		.rpc_resp = res,
 		.rpc_cred = cred,
 	};
-	__be32 *p;
 	int loop = 0;
 	int status;
 
-	p = (__be32*)sc_verifier.data;
-	*p++ = htonl((u32)clp->cl_boot_time.tv_sec);
-	*p = htonl((u32)clp->cl_boot_time.tv_nsec);
+	nfs4_construct_boot_verifier(clp, &sc_verifier);
 
 	for(;;) {
+		rcu_read_lock();
 		setclientid.sc_name_len = scnprintf(setclientid.sc_name,
 				sizeof(setclientid.sc_name), "%s/%s %s %s %u",
 				clp->cl_ipaddr,
@@ -3805,6 +3923,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
 		setclientid.sc_uaddr_len = scnprintf(setclientid.sc_uaddr,
 				sizeof(setclientid.sc_uaddr), "%s.%u.%u",
 				clp->cl_ipaddr, port >> 8, port & 255);
+		rcu_read_unlock();
 
 		status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
 		if (status != -NFS4ERR_CLID_INUSE)
@@ -3891,7 +4010,7 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data)
 
 	if (nfs4_setup_sequence(d_data->res.server,
 				&d_data->args.seq_args,
-				&d_data->res.seq_res, 1, task))
+				&d_data->res.seq_res, task))
 		return;
 	rpc_call_start(task);
 }
@@ -3925,11 +4044,12 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
 	data = kzalloc(sizeof(*data), GFP_NOFS);
 	if (data == NULL)
 		return -ENOMEM;
+	nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
 	data->args.fhandle = &data->fh;
 	data->args.stateid = &data->stateid;
 	data->args.bitmask = server->attr_bitmask;
 	nfs_copy_fh(&data->fh, NFS_FH(inode));
-	memcpy(&data->stateid, stateid, sizeof(data->stateid));
+	nfs4_stateid_copy(&data->stateid, stateid);
 	data->res.fattr = &data->fattr;
 	data->res.server = server;
 	nfs_fattr_init(data->res.fattr);
@@ -4016,7 +4136,7 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock
 	if (status != 0)
 		goto out;
 	lsp = request->fl_u.nfs4_fl.owner;
-	arg.lock_owner.id = lsp->ls_id.id;
+	arg.lock_owner.id = lsp->ls_seqid.owner_id;
 	arg.lock_owner.s_dev = server->s_dev;
 	status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
 	switch (status) {
@@ -4112,9 +4232,8 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
 		return;
 	switch (task->tk_status) {
 		case 0:
-			memcpy(calldata->lsp->ls_stateid.data,
-					calldata->res.stateid.data,
-					sizeof(calldata->lsp->ls_stateid.data));
+			nfs4_stateid_copy(&calldata->lsp->ls_stateid,
+					&calldata->res.stateid);
 			renew_lease(calldata->server, calldata->timestamp);
 			break;
 		case -NFS4ERR_BAD_STATEID:
@@ -4142,7 +4261,7 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data)
 	calldata->timestamp = jiffies;
 	if (nfs4_setup_sequence(calldata->server,
 				&calldata->arg.seq_args,
-				&calldata->res.seq_res, 1, task))
+				&calldata->res.seq_res, task))
 		return;
 	rpc_call_start(task);
 }
@@ -4182,6 +4301,7 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,
 		return ERR_PTR(-ENOMEM);
 	}
 
+	nfs41_init_sequence(&data->arg.seq_args, &data->res.seq_res, 1);
 	msg.rpc_argp = &data->arg;
 	msg.rpc_resp = &data->res;
 	task_setup_data.callback_data = data;
@@ -4261,7 +4381,7 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
 		goto out_free_seqid;
 	p->arg.lock_stateid = &lsp->ls_stateid;
 	p->arg.lock_owner.clientid = server->nfs_client->cl_clientid;
-	p->arg.lock_owner.id = lsp->ls_id.id;
+	p->arg.lock_owner.id = lsp->ls_seqid.owner_id;
 	p->arg.lock_owner.s_dev = server->s_dev;
 	p->res.lock_seqid = p->arg.lock_seqid;
 	p->lsp = lsp;
@@ -4297,7 +4417,7 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
 	data->timestamp = jiffies;
 	if (nfs4_setup_sequence(data->server,
 				&data->arg.seq_args,
-				&data->res.seq_res, 1, task))
+				&data->res.seq_res, task))
 		return;
 	rpc_call_start(task);
 	dprintk("%s: done!, ret = %d\n", __func__, data->rpc_status);
@@ -4326,8 +4446,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
 		goto out;
 	}
 	if (data->rpc_status == 0) {
-		memcpy(data->lsp->ls_stateid.data, data->res.stateid.data,
-					sizeof(data->lsp->ls_stateid.data));
+		nfs4_stateid_copy(&data->lsp->ls_stateid, &data->res.stateid);
 		data->lsp->ls_flags |= NFS_LOCK_INITIALIZED;
 		renew_lease(NFS_SERVER(data->ctx->dentry->d_inode), data->timestamp);
 	}
@@ -4415,6 +4534,7 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
 		data->arg.reclaim = NFS_LOCK_RECLAIM;
 		task_setup_data.callback_ops = &nfs4_recover_lock_ops;
 	}
+	nfs41_init_sequence(&data->arg.seq_args, &data->res.seq_res, 1);
 	msg.rpc_argp = &data->arg;
 	msg.rpc_resp = &data->res;
 	task_setup_data.callback_data = data;
@@ -4479,15 +4599,34 @@ out:
 }
 
 #if defined(CONFIG_NFS_V4_1)
-static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *request)
+static int nfs41_check_expired_locks(struct nfs4_state *state)
 {
-	int status;
+	int status, ret = NFS_OK;
+	struct nfs4_lock_state *lsp;
 	struct nfs_server *server = NFS_SERVER(state->inode);
 
-	status = nfs41_test_stateid(server, state);
+	list_for_each_entry(lsp, &state->lock_states, ls_locks) {
+		if (lsp->ls_flags & NFS_LOCK_INITIALIZED) {
+			status = nfs41_test_stateid(server, &lsp->ls_stateid);
+			if (status != NFS_OK) {
+				nfs41_free_stateid(server, &lsp->ls_stateid);
+				lsp->ls_flags &= ~NFS_LOCK_INITIALIZED;
+				ret = status;
+			}
+		}
+	};
+
+	return ret;
+}
+
+static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *request)
+{
+	int status = NFS_OK;
+
+	if (test_bit(LK_STATE_IN_USE, &state->flags))
+		status = nfs41_check_expired_locks(state);
 	if (status == NFS_OK)
-		return 0;
-	nfs41_free_stateid(server, state);
+		return status;
 	return nfs4_lock_expired(state, request);
 }
 #endif
@@ -4523,7 +4662,8 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
 	/* Note: we always want to sleep here! */
 	request->fl_flags = fl_flags | FL_SLEEP;
 	if (do_vfs_lock(request->fl_file, request) < 0)
-		printk(KERN_WARNING "%s: VFS is out of sync with lock manager!\n", __func__);
+		printk(KERN_WARNING "NFS: %s: VFS is out of sync with lock "
+					"manager!\n", __func__);
 out_unlock:
 	up_read(&nfsi->rwsem);
 out:
@@ -4533,7 +4673,9 @@ out:
 
 static int nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
 {
-	struct nfs4_exception exception = { };
+	struct nfs4_exception exception = {
+		.state = state,
+	};
 	int err;
 
 	do {
@@ -4603,8 +4745,8 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
 		err = _nfs4_do_setlk(state, F_SETLK, fl, NFS_LOCK_NEW);
 		switch (err) {
 			default:
-				printk(KERN_ERR "%s: unhandled error %d.\n",
-						__func__, err);
+				printk(KERN_ERR "NFS: %s: unhandled error "
+					"%d.\n", __func__, err);
 			case 0:
 			case -ESTALE:
 				goto out;
@@ -4626,6 +4768,7 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
 			 * The show must go on: exit, but mark the
 			 * stateid as needing recovery.
 			 */
+			case -NFS4ERR_DELEG_REVOKED:
 			case -NFS4ERR_ADMIN_REVOKED:
 			case -NFS4ERR_BAD_STATEID:
 			case -NFS4ERR_OPENMODE:
@@ -4655,33 +4798,44 @@ out:
 	return err;
 }
 
+struct nfs_release_lockowner_data {
+	struct nfs4_lock_state *lsp;
+	struct nfs_server *server;
+	struct nfs_release_lockowner_args args;
+};
+
 static void nfs4_release_lockowner_release(void *calldata)
 {
+	struct nfs_release_lockowner_data *data = calldata;
+	nfs4_free_lock_state(data->server, data->lsp);
 	kfree(calldata);
 }
 
-const struct rpc_call_ops nfs4_release_lockowner_ops = {
+static const struct rpc_call_ops nfs4_release_lockowner_ops = {
 	.rpc_release = nfs4_release_lockowner_release,
 };
 
-void nfs4_release_lockowner(const struct nfs4_lock_state *lsp)
+int nfs4_release_lockowner(struct nfs4_lock_state *lsp)
 {
 	struct nfs_server *server = lsp->ls_state->owner->so_server;
-	struct nfs_release_lockowner_args *args;
+	struct nfs_release_lockowner_data *data;
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RELEASE_LOCKOWNER],
 	};
 
 	if (server->nfs_client->cl_mvops->minor_version != 0)
-		return;
-	args = kmalloc(sizeof(*args), GFP_NOFS);
-	if (!args)
-		return;
-	args->lock_owner.clientid = server->nfs_client->cl_clientid;
-	args->lock_owner.id = lsp->ls_id.id;
-	args->lock_owner.s_dev = server->s_dev;
-	msg.rpc_argp = args;
-	rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, args);
+		return -EINVAL;
+	data = kmalloc(sizeof(*data), GFP_NOFS);
+	if (!data)
+		return -ENOMEM;
+	data->lsp = lsp;
+	data->server = server;
+	data->args.lock_owner.clientid = server->nfs_client->cl_clientid;
+	data->args.lock_owner.id = lsp->ls_seqid.owner_id;
+	data->args.lock_owner.s_dev = server->s_dev;
+	msg.rpc_argp = &data->args;
+	rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, data);
+	return 0;
 }
 
 #define XATTR_NAME_NFSV4_ACL "system.nfs4_acl"
@@ -4727,11 +4881,11 @@ static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr)
 	if (!(((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) ||
 	       (fattr->valid & NFS_ATTR_FATTR_FILEID)) &&
 	      (fattr->valid & NFS_ATTR_FATTR_FSID) &&
-	      (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)))
+	      (fattr->valid & NFS_ATTR_FATTR_V4_LOCATIONS)))
 		return;
 
 	fattr->valid |= NFS_ATTR_FATTR_TYPE | NFS_ATTR_FATTR_MODE |
-		NFS_ATTR_FATTR_NLINK;
+		NFS_ATTR_FATTR_NLINK | NFS_ATTR_FATTR_V4_REFERRAL;
 	fattr->mode = S_IFDIR | S_IRUGO | S_IXUGO;
 	fattr->nlink = 2;
 }
@@ -4798,7 +4952,8 @@ static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct
 	return status;
 }
 
-int nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct nfs4_secinfo_flavors *flavors)
+static int nfs4_proc_secinfo(struct inode *dir, const struct qstr *name,
+		struct nfs4_secinfo_flavors *flavors)
 {
 	struct nfs4_exception exception = { };
 	int err;
@@ -4852,6 +5007,7 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
 {
 	nfs4_verifier verifier;
 	struct nfs41_exchange_id_args args = {
+		.verifier = &verifier,
 		.client = clp,
 		.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER,
 	};
@@ -4865,15 +5021,11 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
 		.rpc_resp = &res,
 		.rpc_cred = cred,
 	};
-	__be32 *p;
 
 	dprintk("--> %s\n", __func__);
 	BUG_ON(clp == NULL);
 
-	p = (u32 *)verifier.data;
-	*p++ = htonl((u32)clp->cl_boot_time.tv_sec);
-	*p = htonl((u32)clp->cl_boot_time.tv_nsec);
-	args.verifier = &verifier;
+	nfs4_construct_boot_verifier(clp, &verifier);
 
 	args.id_len = scnprintf(args.id, sizeof(args.id),
 				"%s/%s.%s/%u",
@@ -4888,11 +5040,24 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
 		goto out;
 	}
 
+	res.impl_id = kzalloc(sizeof(struct nfs41_impl_id), GFP_KERNEL);
+	if (unlikely(!res.impl_id)) {
+		status = -ENOMEM;
+		goto out_server_scope;
+	}
+
 	status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
 	if (!status)
 		status = nfs4_check_cl_exchange_flags(clp->cl_exchange_flags);
 
 	if (!status) {
+		/* use the most recent implementation id */
+		kfree(clp->impl_id);
+		clp->impl_id = res.impl_id;
+	} else
+		kfree(res.impl_id);
+
+	if (!status) {
 		if (clp->server_scope &&
 		    !nfs41_same_server_scope(clp->server_scope,
 					     res.server_scope)) {
@@ -4908,8 +5073,16 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
 			goto out;
 		}
 	}
+
+out_server_scope:
 	kfree(res.server_scope);
 out:
+	if (clp->impl_id)
+		dprintk("%s: Server Implementation ID: "
+			"domain: %s, name: %s, date: %llu,%u\n",
+			__func__, clp->impl_id->domain, clp->impl_id->name,
+			clp->impl_id->date.seconds,
+			clp->impl_id->date.nseconds);
 	dprintk("<-- %s status= %d\n", __func__, status);
 	return status;
 }
@@ -4933,7 +5106,7 @@ static void nfs4_get_lease_time_prepare(struct rpc_task *task,
 	   since we're invoked within one */
 	ret = nfs41_setup_sequence(data->clp->cl_session,
 				   &data->args->la_seq_args,
-				   &data->res->lr_seq_res, 0, task);
+				   &data->res->lr_seq_res, task);
 
 	BUG_ON(ret == -EAGAIN);
 	rpc_call_start(task);
@@ -4966,7 +5139,7 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata)
 	dprintk("<-- %s\n", __func__);
 }
 
-struct rpc_call_ops nfs4_get_lease_time_ops = {
+static const struct rpc_call_ops nfs4_get_lease_time_ops = {
 	.rpc_call_prepare = nfs4_get_lease_time_prepare,
 	.rpc_call_done = nfs4_get_lease_time_done,
 };
@@ -4997,6 +5170,7 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
 	};
 	int status;
 
+	nfs41_init_sequence(&args.la_seq_args, &res.lr_seq_res, 0);
 	dprintk("--> %s\n", __func__);
 	task = rpc_run_task(&task_setup);
 
@@ -5113,13 +5287,13 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
 		return NULL;
 
 	tbl = &session->fc_slot_table;
-	tbl->highest_used_slotid = -1;
+	tbl->highest_used_slotid = NFS4_NO_SLOT;
 	spin_lock_init(&tbl->slot_tbl_lock);
 	rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table");
 	init_completion(&tbl->complete);
 
 	tbl = &session->bc_slot_table;
-	tbl->highest_used_slotid = -1;
+	tbl->highest_used_slotid = NFS4_NO_SLOT;
 	spin_lock_init(&tbl->slot_tbl_lock);
 	rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table");
 	init_completion(&tbl->complete);
@@ -5132,11 +5306,16 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
 
 void nfs4_destroy_session(struct nfs4_session *session)
 {
+	struct rpc_xprt *xprt;
+
 	nfs4_proc_destroy_session(session);
+
+	rcu_read_lock();
+	xprt = rcu_dereference(session->clp->cl_rpcclient->cl_xprt);
+	rcu_read_unlock();
 	dprintk("%s Destroy backchannel for xprt %p\n",
-		__func__, session->clp->cl_rpcclient->cl_xprt);
-	xprt_destroy_backchannel(session->clp->cl_rpcclient->cl_xprt,
-				NFS41_BC_MIN_CALLBACKS);
+		__func__, xprt);
+	xprt_destroy_backchannel(xprt, NFS41_BC_MIN_CALLBACKS);
 	nfs4_destroy_slot_tables(session);
 	kfree(session);
 }
@@ -5164,7 +5343,7 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
 	args->fc_attrs.max_rqst_sz = mxrqst_sz;
 	args->fc_attrs.max_resp_sz = mxresp_sz;
 	args->fc_attrs.max_ops = NFS4_MAX_OPS;
-	args->fc_attrs.max_reqs = session->clp->cl_rpcclient->cl_xprt->max_reqs;
+	args->fc_attrs.max_reqs = max_session_slots;
 
 	dprintk("%s: Fore Channel : max_rqst_sz=%u max_resp_sz=%u "
 		"max_ops=%u max_reqs=%u\n",
@@ -5204,6 +5383,8 @@ static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args
 		return -EINVAL;
 	if (rcvd->max_reqs == 0)
 		return -EINVAL;
+	if (rcvd->max_reqs > NFS4_MAX_SLOT_TABLE)
+		rcvd->max_reqs = NFS4_MAX_SLOT_TABLE;
 	return 0;
 }
 
@@ -5219,9 +5400,9 @@ static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args
 	if (rcvd->max_resp_sz_cached > sent->max_resp_sz_cached)
 		return -EINVAL;
 	/* These would render the backchannel useless: */
-	if (rcvd->max_ops == 0)
+	if (rcvd->max_ops != sent->max_ops)
 		return -EINVAL;
-	if (rcvd->max_reqs == 0)
+	if (rcvd->max_reqs != sent->max_reqs)
 		return -EINVAL;
 	return 0;
 }
@@ -5324,7 +5505,7 @@ int nfs4_proc_destroy_session(struct nfs4_session *session)
 
 	if (status)
 		printk(KERN_WARNING
-			"Got error %d from the server on DESTROY_SESSION. "
+			"NFS: Got error %d from the server on DESTROY_SESSION. "
 			"Session has been destroyed regardless...\n", status);
 
 	dprintk("<-- nfs4_proc_destroy_session\n");
@@ -5447,7 +5628,7 @@ static void nfs41_sequence_prepare(struct rpc_task *task, void *data)
 	args = task->tk_msg.rpc_argp;
 	res = task->tk_msg.rpc_resp;
 
-	if (nfs41_setup_sequence(clp->cl_session, args, res, 0, task))
+	if (nfs41_setup_sequence(clp->cl_session, args, res, task))
 		return;
 	rpc_call_start(task);
 }
@@ -5479,6 +5660,7 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_
 		nfs_put_client(clp);
 		return ERR_PTR(-ENOMEM);
 	}
+	nfs41_init_sequence(&calldata->args, &calldata->res, 0);
 	msg.rpc_argp = &calldata->args;
 	msg.rpc_resp = &calldata->res;
 	calldata->clp = clp;
@@ -5540,7 +5722,7 @@ static void nfs4_reclaim_complete_prepare(struct rpc_task *task, void *data)
 	rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
 	if (nfs41_setup_sequence(calldata->clp->cl_session,
 				&calldata->arg.seq_args,
-				&calldata->res.seq_res, 0, task))
+				&calldata->res.seq_res, task))
 		return;
 
 	rpc_call_start(task);
@@ -5619,6 +5801,7 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp)
 	calldata->clp = clp;
 	calldata->arg.one_fs = 0;
 
+	nfs41_init_sequence(&calldata->arg.seq_args, &calldata->res.seq_res, 0);
 	msg.rpc_argp = &calldata->arg;
 	msg.rpc_resp = &calldata->res;
 	task_setup_data.callback_data = calldata;
@@ -5650,7 +5833,7 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
 	 * to be no way to prevent it completely.
 	 */
 	if (nfs4_setup_sequence(server, &lgp->args.seq_args,
-				&lgp->res.seq_res, 0, task))
+				&lgp->res.seq_res, task))
 		return;
 	if (pnfs_choose_layoutget_stateid(&lgp->args.stateid,
 					  NFS_I(lgp->args.inode)->layout,
@@ -5725,6 +5908,7 @@ int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
 
 	lgp->res.layoutp = &lgp->args.layout;
 	lgp->res.seq_res.sr_slot = NULL;
+	nfs41_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0);
 	task = rpc_run_task(&task_setup_data);
 	if (IS_ERR(task))
 		return PTR_ERR(task);
@@ -5745,7 +5929,7 @@ nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata)
 
 	dprintk("--> %s\n", __func__);
 	if (nfs41_setup_sequence(lrp->clp->cl_session, &lrp->args.seq_args,
-				&lrp->res.seq_res, 0, task))
+				&lrp->res.seq_res, task))
 		return;
 	rpc_call_start(task);
 }
@@ -5811,6 +5995,7 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)
 	int status;
 
 	dprintk("--> %s\n", __func__);
+	nfs41_init_sequence(&lrp->args.seq_args, &lrp->res.seq_res, 1);
 	task = rpc_run_task(&task_setup_data);
 	if (IS_ERR(task))
 		return PTR_ERR(task);
@@ -5911,7 +6096,7 @@ static void nfs4_layoutcommit_prepare(struct rpc_task *task, void *calldata)
 	struct nfs_server *server = NFS_SERVER(data->args.inode);
 
 	if (nfs4_setup_sequence(server, &data->args.seq_args,
-				&data->res.seq_res, 1, task))
+				&data->res.seq_res, task))
 		return;
 	rpc_call_start(task);
 }
@@ -5998,6 +6183,7 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync)
 		data->args.lastbytewritten,
 		data->args.inode->i_ino);
 
+	nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
 	task = rpc_run_task(&task_setup_data);
 	if (IS_ERR(task))
 		return PTR_ERR(task);
@@ -6091,11 +6277,12 @@ out_freepage:
 out:
 	return err;
 }
-static int _nfs41_test_stateid(struct nfs_server *server, struct nfs4_state *state)
+
+static int _nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid)
 {
 	int status;
 	struct nfs41_test_stateid_args args = {
-		.stateid = &state->stateid,
+		.stateid = stateid,
 	};
 	struct nfs41_test_stateid_res res;
 	struct rpc_message msg = {
@@ -6103,28 +6290,31 @@ static int _nfs41_test_stateid(struct nfs_server *server, struct nfs4_state *sta
 		.rpc_argp = &args,
 		.rpc_resp = &res,
 	};
-	args.seq_args.sa_session = res.seq_res.sr_session = NULL;
-	status = nfs4_call_sync_sequence(server->client, server, &msg, &args.seq_args, &res.seq_res, 0, 1);
+
+	nfs41_init_sequence(&args.seq_args, &res.seq_res, 0);
+	status = nfs4_call_sync_sequence(server->client, server, &msg, &args.seq_args, &res.seq_res, 1);
+
+	if (status == NFS_OK)
+		return res.status;
 	return status;
 }
 
-static int nfs41_test_stateid(struct nfs_server *server, struct nfs4_state *state)
+static int nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid)
 {
 	struct nfs4_exception exception = { };
 	int err;
 	do {
 		err = nfs4_handle_exception(server,
-				_nfs41_test_stateid(server, state),
+				_nfs41_test_stateid(server, stateid),
 				&exception);
 	} while (exception.retry);
 	return err;
 }
 
-static int _nfs4_free_stateid(struct nfs_server *server, struct nfs4_state *state)
+static int _nfs4_free_stateid(struct nfs_server *server, nfs4_stateid *stateid)
 {
-	int status;
 	struct nfs41_free_stateid_args args = {
-		.stateid = &state->stateid,
+		.stateid = stateid,
 	};
 	struct nfs41_free_stateid_res res;
 	struct rpc_message msg = {
@@ -6133,25 +6323,46 @@ static int _nfs4_free_stateid(struct nfs_server *server, struct nfs4_state *stat
 		.rpc_resp = &res,
 	};
 
-	args.seq_args.sa_session = res.seq_res.sr_session = NULL;
-	status = nfs4_call_sync_sequence(server->client, server, &msg, &args.seq_args, &res.seq_res, 0, 1);
-	return status;
+	nfs41_init_sequence(&args.seq_args, &res.seq_res, 0);
+	return nfs4_call_sync_sequence(server->client, server, &msg, &args.seq_args, &res.seq_res, 1);
 }
 
-static int nfs41_free_stateid(struct nfs_server *server, struct nfs4_state *state)
+static int nfs41_free_stateid(struct nfs_server *server, nfs4_stateid *stateid)
 {
 	struct nfs4_exception exception = { };
 	int err;
 	do {
 		err = nfs4_handle_exception(server,
-				_nfs4_free_stateid(server, state),
+				_nfs4_free_stateid(server, stateid),
 				&exception);
 	} while (exception.retry);
 	return err;
 }
+
+static bool nfs41_match_stateid(const nfs4_stateid *s1,
+		const nfs4_stateid *s2)
+{
+	if (memcmp(s1->other, s2->other, sizeof(s1->other)) != 0)
+		return false;
+
+	if (s1->seqid == s2->seqid)
+		return true;
+	if (s1->seqid == 0 || s2->seqid == 0)
+		return true;
+
+	return false;
+}
+
 #endif /* CONFIG_NFS_V4_1 */
 
-struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = {
+static bool nfs4_match_stateid(const nfs4_stateid *s1,
+		const nfs4_stateid *s2)
+{
+	return nfs4_stateid_match(s1, s2);
+}
+
+
+static const struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = {
 	.owner_flag_bit = NFS_OWNER_RECLAIM_REBOOT,
 	.state_flag_bit = NFS_STATE_RECLAIM_REBOOT,
 	.recover_open = nfs4_open_reclaim,
@@ -6161,7 +6372,7 @@ struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = {
 };
 
 #if defined(CONFIG_NFS_V4_1)
-struct nfs4_state_recovery_ops nfs41_reboot_recovery_ops = {
+static const struct nfs4_state_recovery_ops nfs41_reboot_recovery_ops = {
 	.owner_flag_bit = NFS_OWNER_RECLAIM_REBOOT,
 	.state_flag_bit = NFS_STATE_RECLAIM_REBOOT,
 	.recover_open = nfs4_open_reclaim,
@@ -6172,7 +6383,7 @@ struct nfs4_state_recovery_ops nfs41_reboot_recovery_ops = {
 };
 #endif /* CONFIG_NFS_V4_1 */
 
-struct nfs4_state_recovery_ops nfs40_nograce_recovery_ops = {
+static const struct nfs4_state_recovery_ops nfs40_nograce_recovery_ops = {
 	.owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE,
 	.state_flag_bit = NFS_STATE_RECLAIM_NOGRACE,
 	.recover_open = nfs4_open_expired,
@@ -6182,7 +6393,7 @@ struct nfs4_state_recovery_ops nfs40_nograce_recovery_ops = {
 };
 
 #if defined(CONFIG_NFS_V4_1)
-struct nfs4_state_recovery_ops nfs41_nograce_recovery_ops = {
+static const struct nfs4_state_recovery_ops nfs41_nograce_recovery_ops = {
 	.owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE,
 	.state_flag_bit = NFS_STATE_RECLAIM_NOGRACE,
 	.recover_open = nfs41_open_expired,
@@ -6192,14 +6403,14 @@ struct nfs4_state_recovery_ops nfs41_nograce_recovery_ops = {
 };
 #endif /* CONFIG_NFS_V4_1 */
 
-struct nfs4_state_maintenance_ops nfs40_state_renewal_ops = {
+static const struct nfs4_state_maintenance_ops nfs40_state_renewal_ops = {
 	.sched_state_renewal = nfs4_proc_async_renew,
 	.get_state_renewal_cred_locked = nfs4_get_renew_cred_locked,
 	.renew_lease = nfs4_proc_renew,
 };
 
 #if defined(CONFIG_NFS_V4_1)
-struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = {
+static const struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = {
 	.sched_state_renewal = nfs41_proc_async_sequence,
 	.get_state_renewal_cred_locked = nfs4_get_machine_cred_locked,
 	.renew_lease = nfs4_proc_sequence,
@@ -6209,7 +6420,7 @@ struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = {
 static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
 	.minor_version = 0,
 	.call_sync = _nfs4_call_sync,
-	.validate_stateid = nfs4_validate_delegation_stateid,
+	.match_stateid = nfs4_match_stateid,
 	.find_root_sec = nfs4_find_root_sec,
 	.reboot_recovery_ops = &nfs40_reboot_recovery_ops,
 	.nograce_recovery_ops = &nfs40_nograce_recovery_ops,
@@ -6220,7 +6431,7 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
 static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
 	.minor_version = 1,
 	.call_sync = _nfs4_call_sync_session,
-	.validate_stateid = nfs41_validate_delegation_stateid,
+	.match_stateid = nfs41_match_stateid,
 	.find_root_sec = nfs41_find_root_sec,
 	.reboot_recovery_ops = &nfs41_reboot_recovery_ops,
 	.nograce_recovery_ops = &nfs41_nograce_recovery_ops,
@@ -6260,9 +6471,11 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
 	.create = nfs4_proc_create,
 	.remove = nfs4_proc_remove,
 	.unlink_setup = nfs4_proc_unlink_setup,
+	.unlink_rpc_prepare = nfs4_proc_unlink_rpc_prepare,
 	.unlink_done = nfs4_proc_unlink_done,
 	.rename = nfs4_proc_rename,
 	.rename_setup = nfs4_proc_rename_setup,
+	.rename_rpc_prepare = nfs4_proc_rename_rpc_prepare,
 	.rename_done = nfs4_proc_rename_done,
 	.link = nfs4_proc_link,
 	.symlink = nfs4_proc_symlink,
@@ -6276,8 +6489,10 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
 	.set_capabilities = nfs4_server_capabilities,
 	.decode_dirent = nfs4_decode_dirent,
 	.read_setup = nfs4_proc_read_setup,
+	.read_rpc_prepare = nfs4_proc_read_rpc_prepare,
 	.read_done = nfs4_read_done,
 	.write_setup = nfs4_proc_write_setup,
+	.write_rpc_prepare = nfs4_proc_write_rpc_prepare,
 	.write_done = nfs4_write_done,
 	.commit_setup = nfs4_proc_commit_setup,
 	.commit_done = nfs4_commit_done,
@@ -6301,6 +6516,10 @@ const struct xattr_handler *nfs4_xattr_handlers[] = {
 	NULL
 };
 
+module_param(max_session_slots, ushort, 0644);
+MODULE_PARM_DESC(max_session_slots, "Maximum number of outstanding NFSv4.1 "
+		"requests the client will negotiate");
+
 /*
  * Local variables:
  *  c-basic-offset: 8
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 45392032e7bd..0f43414eb25a 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -146,6 +146,11 @@ struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp)
 	struct rpc_cred *cred = NULL;
 	struct nfs_server *server;
 
+	/* Use machine credentials if available */
+	cred = nfs4_get_machine_cred_locked(clp);
+	if (cred != NULL)
+		goto out;
+
 	rcu_read_lock();
 	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
 		cred = nfs4_get_renew_cred_server_locked(server);
@@ -153,6 +158,8 @@ struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp)
 			break;
 	}
 	rcu_read_unlock();
+
+out:
 	return cred;
 }
 
@@ -190,30 +197,29 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp)
 static void nfs4_end_drain_session(struct nfs_client *clp)
 {
 	struct nfs4_session *ses = clp->cl_session;
+	struct nfs4_slot_table *tbl;
 	int max_slots;
 
 	if (ses == NULL)
 		return;
+	tbl = &ses->fc_slot_table;
 	if (test_and_clear_bit(NFS4_SESSION_DRAINING, &ses->session_state)) {
-		spin_lock(&ses->fc_slot_table.slot_tbl_lock);
-		max_slots = ses->fc_slot_table.max_slots;
+		spin_lock(&tbl->slot_tbl_lock);
+		max_slots = tbl->max_slots;
 		while (max_slots--) {
-			struct rpc_task *task;
-
-			task = rpc_wake_up_next(&ses->fc_slot_table.
-						slot_tbl_waitq);
-			if (!task)
+			if (rpc_wake_up_first(&tbl->slot_tbl_waitq,
+						nfs4_set_task_privileged,
+						NULL) == NULL)
 				break;
-			rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
 		}
-		spin_unlock(&ses->fc_slot_table.slot_tbl_lock);
+		spin_unlock(&tbl->slot_tbl_lock);
 	}
 }
 
 static int nfs4_wait_on_slot_tbl(struct nfs4_slot_table *tbl)
 {
 	spin_lock(&tbl->slot_tbl_lock);
-	if (tbl->highest_used_slotid != -1) {
+	if (tbl->highest_used_slotid != NFS4_NO_SLOT) {
 		INIT_COMPLETION(tbl->complete);
 		spin_unlock(&tbl->slot_tbl_lock);
 		return wait_for_completion_interruptible(&tbl->complete);
@@ -317,62 +323,6 @@ out:
 	return cred;
 }
 
-static void nfs_alloc_unique_id_locked(struct rb_root *root,
-				       struct nfs_unique_id *new,
-				       __u64 minval, int maxbits)
-{
-	struct rb_node **p, *parent;
-	struct nfs_unique_id *pos;
-	__u64 mask = ~0ULL;
-
-	if (maxbits < 64)
-		mask = (1ULL << maxbits) - 1ULL;
-
-	/* Ensure distribution is more or less flat */
-	get_random_bytes(&new->id, sizeof(new->id));
-	new->id &= mask;
-	if (new->id < minval)
-		new->id += minval;
-retry:
-	p = &root->rb_node;
-	parent = NULL;
-
-	while (*p != NULL) {
-		parent = *p;
-		pos = rb_entry(parent, struct nfs_unique_id, rb_node);
-
-		if (new->id < pos->id)
-			p = &(*p)->rb_left;
-		else if (new->id > pos->id)
-			p = &(*p)->rb_right;
-		else
-			goto id_exists;
-	}
-	rb_link_node(&new->rb_node, parent, p);
-	rb_insert_color(&new->rb_node, root);
-	return;
-id_exists:
-	for (;;) {
-		new->id++;
-		if (new->id < minval || (new->id & mask) != new->id) {
-			new->id = minval;
-			break;
-		}
-		parent = rb_next(parent);
-		if (parent == NULL)
-			break;
-		pos = rb_entry(parent, struct nfs_unique_id, rb_node);
-		if (new->id < pos->id)
-			break;
-	}
-	goto retry;
-}
-
-static void nfs_free_unique_id(struct rb_root *root, struct nfs_unique_id *id)
-{
-	rb_erase(&id->rb_node, root);
-}
-
 static struct nfs4_state_owner *
 nfs4_find_state_owner_locked(struct nfs_server *server, struct rpc_cred *cred)
 {
@@ -405,6 +355,7 @@ nfs4_insert_state_owner_locked(struct nfs4_state_owner *new)
 	struct rb_node **p = &server->state_owners.rb_node,
 		       *parent = NULL;
 	struct nfs4_state_owner *sp;
+	int err;
 
 	while (*p != NULL) {
 		parent = *p;
@@ -421,8 +372,9 @@ nfs4_insert_state_owner_locked(struct nfs4_state_owner *new)
 			return sp;
 		}
 	}
-	nfs_alloc_unique_id_locked(&server->openowner_id,
-					&new->so_owner_id, 1, 64);
+	err = ida_get_new(&server->openowner_id, &new->so_seqid.owner_id);
+	if (err)
+		return ERR_PTR(err);
 	rb_link_node(&new->so_server_node, parent, p);
 	rb_insert_color(&new->so_server_node, &server->state_owners);
 	return new;
@@ -435,7 +387,23 @@ nfs4_remove_state_owner_locked(struct nfs4_state_owner *sp)
 
 	if (!RB_EMPTY_NODE(&sp->so_server_node))
 		rb_erase(&sp->so_server_node, &server->state_owners);
-	nfs_free_unique_id(&server->openowner_id, &sp->so_owner_id);
+	ida_remove(&server->openowner_id, sp->so_seqid.owner_id);
+}
+
+static void
+nfs4_init_seqid_counter(struct nfs_seqid_counter *sc)
+{
+	sc->flags = 0;
+	sc->counter = 0;
+	spin_lock_init(&sc->lock);
+	INIT_LIST_HEAD(&sc->list);
+	rpc_init_wait_queue(&sc->wait, "Seqid_waitqueue");
+}
+
+static void
+nfs4_destroy_seqid_counter(struct nfs_seqid_counter *sc)
+{
+	rpc_destroy_wait_queue(&sc->wait);
 }
 
 /*
@@ -444,19 +412,20 @@ nfs4_remove_state_owner_locked(struct nfs4_state_owner *sp)
  *
  */
 static struct nfs4_state_owner *
-nfs4_alloc_state_owner(void)
+nfs4_alloc_state_owner(struct nfs_server *server,
+		struct rpc_cred *cred,
+		gfp_t gfp_flags)
 {
 	struct nfs4_state_owner *sp;
 
-	sp = kzalloc(sizeof(*sp),GFP_NOFS);
+	sp = kzalloc(sizeof(*sp), gfp_flags);
 	if (!sp)
 		return NULL;
+	sp->so_server = server;
+	sp->so_cred = get_rpccred(cred);
 	spin_lock_init(&sp->so_lock);
 	INIT_LIST_HEAD(&sp->so_states);
-	rpc_init_wait_queue(&sp->so_sequence.wait, "Seqid_waitqueue");
-	sp->so_seqid.sequence = &sp->so_sequence;
-	spin_lock_init(&sp->so_sequence.lock);
-	INIT_LIST_HEAD(&sp->so_sequence.list);
+	nfs4_init_seqid_counter(&sp->so_seqid);
 	atomic_set(&sp->so_count, 1);
 	INIT_LIST_HEAD(&sp->so_lru);
 	return sp;
@@ -478,7 +447,7 @@ nfs4_drop_state_owner(struct nfs4_state_owner *sp)
 
 static void nfs4_free_state_owner(struct nfs4_state_owner *sp)
 {
-	rpc_destroy_wait_queue(&sp->so_sequence.wait);
+	nfs4_destroy_seqid_counter(&sp->so_seqid);
 	put_rpccred(sp->so_cred);
 	kfree(sp);
 }
@@ -516,7 +485,8 @@ static void nfs4_gc_state_owners(struct nfs_server *server)
  * Returns a pointer to an instantiated nfs4_state_owner struct, or NULL.
517 */ 486 */
518struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, 487struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server,
519 struct rpc_cred *cred) 488 struct rpc_cred *cred,
489 gfp_t gfp_flags)
520{ 490{
521 struct nfs_client *clp = server->nfs_client; 491 struct nfs_client *clp = server->nfs_client;
522 struct nfs4_state_owner *sp, *new; 492 struct nfs4_state_owner *sp, *new;
@@ -526,20 +496,18 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server,
526 spin_unlock(&clp->cl_lock); 496 spin_unlock(&clp->cl_lock);
527 if (sp != NULL) 497 if (sp != NULL)
528 goto out; 498 goto out;
529 new = nfs4_alloc_state_owner(); 499 new = nfs4_alloc_state_owner(server, cred, gfp_flags);
530 if (new == NULL) 500 if (new == NULL)
531 goto out; 501 goto out;
532 new->so_server = server; 502 do {
533 new->so_cred = cred; 503 if (ida_pre_get(&server->openowner_id, gfp_flags) == 0)
534 spin_lock(&clp->cl_lock); 504 break;
535 sp = nfs4_insert_state_owner_locked(new); 505 spin_lock(&clp->cl_lock);
536 spin_unlock(&clp->cl_lock); 506 sp = nfs4_insert_state_owner_locked(new);
537 if (sp == new) 507 spin_unlock(&clp->cl_lock);
538 get_rpccred(cred); 508 } while (sp == ERR_PTR(-EAGAIN));
539 else { 509 if (sp != new)
540 rpc_destroy_wait_queue(&new->so_sequence.wait); 510 nfs4_free_state_owner(new);
541 kfree(new);
542 }
543out: 511out:
544 nfs4_gc_state_owners(server); 512 nfs4_gc_state_owners(server);
545 return sp; 513 return sp;
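The retry loop above follows the two-phase ida protocol of this era:
ida_pre_get() preloads memory outside the spinlock and returns 0 on
allocation failure, while ida_get_new() (called under clp->cl_lock inside
nfs4_insert_state_owner_locked()) returns -EAGAIN when a racing allocator
consumed the preload. A minimal sketch of the pattern, with hypothetical
names:

	static int example_get_id(struct ida *ida, spinlock_t *lock, int *id)
	{
		int err;

		do {
			if (!ida_pre_get(ida, GFP_NOFS))
				return -ENOMEM;		/* preload failed */
			spin_lock(lock);
			err = ida_get_new(ida, id);	/* lowest free id */
			spin_unlock(lock);
		} while (err == -EAGAIN);		/* preload stolen: retry */
		return err;
	}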
@@ -795,15 +763,11 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
795{ 763{
796 struct nfs4_lock_state *lsp; 764 struct nfs4_lock_state *lsp;
797 struct nfs_server *server = state->owner->so_server; 765 struct nfs_server *server = state->owner->so_server;
798 struct nfs_client *clp = server->nfs_client;
799 766
800 lsp = kzalloc(sizeof(*lsp), GFP_NOFS); 767 lsp = kzalloc(sizeof(*lsp), GFP_NOFS);
801 if (lsp == NULL) 768 if (lsp == NULL)
802 return NULL; 769 return NULL;
803 rpc_init_wait_queue(&lsp->ls_sequence.wait, "lock_seqid_waitqueue"); 770 nfs4_init_seqid_counter(&lsp->ls_seqid);
804 spin_lock_init(&lsp->ls_sequence.lock);
805 INIT_LIST_HEAD(&lsp->ls_sequence.list);
806 lsp->ls_seqid.sequence = &lsp->ls_sequence;
807 atomic_set(&lsp->ls_count, 1); 771 atomic_set(&lsp->ls_count, 1);
808 lsp->ls_state = state; 772 lsp->ls_state = state;
809 lsp->ls_owner.lo_type = type; 773 lsp->ls_owner.lo_type = type;
@@ -815,25 +779,22 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
815 lsp->ls_owner.lo_u.posix_owner = fl_owner; 779 lsp->ls_owner.lo_u.posix_owner = fl_owner;
816 break; 780 break;
817 default: 781 default:
818 kfree(lsp); 782 goto out_free;
819 return NULL;
820 } 783 }
821 spin_lock(&clp->cl_lock); 784 lsp->ls_seqid.owner_id = ida_simple_get(&server->lockowner_id, 0, 0, GFP_NOFS);
822 nfs_alloc_unique_id_locked(&server->lockowner_id, &lsp->ls_id, 1, 64); 785 if (lsp->ls_seqid.owner_id < 0)
823 spin_unlock(&clp->cl_lock); 786 goto out_free;
824 INIT_LIST_HEAD(&lsp->ls_locks); 787 INIT_LIST_HEAD(&lsp->ls_locks);
825 return lsp; 788 return lsp;
789out_free:
790 kfree(lsp);
791 return NULL;
826} 792}
827 793
828static void nfs4_free_lock_state(struct nfs4_lock_state *lsp) 794void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
829{ 795{
830 struct nfs_server *server = lsp->ls_state->owner->so_server; 796 ida_simple_remove(&server->lockowner_id, lsp->ls_seqid.owner_id);
831 struct nfs_client *clp = server->nfs_client; 797 nfs4_destroy_seqid_counter(&lsp->ls_seqid);
832
833 spin_lock(&clp->cl_lock);
834 nfs_free_unique_id(&server->lockowner_id, &lsp->ls_id);
835 spin_unlock(&clp->cl_lock);
836 rpc_destroy_wait_queue(&lsp->ls_sequence.wait);
837 kfree(lsp); 798 kfree(lsp);
838} 799}
839 800
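The lock-owner side, by contrast, switches to the self-locking
ida_simple_*() helpers, which embed the preload-and-retry dance behind an
internal lock and need no external spinlock. A hypothetical round trip:

	static int example_simple_id(struct ida *ida)
	{
		int id;

		/* start 0, end 0 => full range [0, INT_MAX] */
		id = ida_simple_get(ida, 0, 0, GFP_NOFS);
		if (id < 0)
			return id;	/* -ENOMEM or -ENOSPC */
		ida_simple_remove(ida, id);
		return 0;
	}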
@@ -865,7 +826,7 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_
865 } 826 }
866 spin_unlock(&state->state_lock); 827 spin_unlock(&state->state_lock);
867 if (new != NULL) 828 if (new != NULL)
868 nfs4_free_lock_state(new); 829 nfs4_free_lock_state(state->owner->so_server, new);
869 return lsp; 830 return lsp;
870} 831}
871 832
@@ -886,9 +847,11 @@ void nfs4_put_lock_state(struct nfs4_lock_state *lsp)
886 if (list_empty(&state->lock_states)) 847 if (list_empty(&state->lock_states))
887 clear_bit(LK_STATE_IN_USE, &state->flags); 848 clear_bit(LK_STATE_IN_USE, &state->flags);
888 spin_unlock(&state->state_lock); 849 spin_unlock(&state->state_lock);
889 if (lsp->ls_flags & NFS_LOCK_INITIALIZED) 850 if (lsp->ls_flags & NFS_LOCK_INITIALIZED) {
890 nfs4_release_lockowner(lsp); 851 if (nfs4_release_lockowner(lsp) == 0)
891 nfs4_free_lock_state(lsp); 852 return;
853 }
854 nfs4_free_lock_state(lsp->ls_state->owner->so_server, lsp);
892} 855}
893 856
894static void nfs4_fl_copy_lock(struct file_lock *dst, struct file_lock *src) 857static void nfs4_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
@@ -918,7 +881,8 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl)
918 if (fl->fl_flags & FL_POSIX) 881 if (fl->fl_flags & FL_POSIX)
919 lsp = nfs4_get_lock_state(state, fl->fl_owner, 0, NFS4_POSIX_LOCK_TYPE); 882 lsp = nfs4_get_lock_state(state, fl->fl_owner, 0, NFS4_POSIX_LOCK_TYPE);
920 else if (fl->fl_flags & FL_FLOCK) 883 else if (fl->fl_flags & FL_FLOCK)
921 lsp = nfs4_get_lock_state(state, 0, fl->fl_pid, NFS4_FLOCK_LOCK_TYPE); 884 lsp = nfs4_get_lock_state(state, NULL, fl->fl_pid,
885 NFS4_FLOCK_LOCK_TYPE);
922 else 886 else
923 return -EINVAL; 887 return -EINVAL;
924 if (lsp == NULL) 888 if (lsp == NULL)
@@ -928,28 +892,49 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl)
928 return 0; 892 return 0;
929} 893}
930 894
931/* 895static bool nfs4_copy_lock_stateid(nfs4_stateid *dst, struct nfs4_state *state,
932 * Byte-range lock aware utility to initialize the stateid of read/write 896 fl_owner_t fl_owner, pid_t fl_pid)
933 * requests.
934 */
935void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid)
936{ 897{
937 struct nfs4_lock_state *lsp; 898 struct nfs4_lock_state *lsp;
938 int seq; 899 bool ret = false;
939 900
940 do {
941 seq = read_seqbegin(&state->seqlock);
942 memcpy(dst, &state->stateid, sizeof(*dst));
943 } while (read_seqretry(&state->seqlock, seq));
944 if (test_bit(LK_STATE_IN_USE, &state->flags) == 0) 901 if (test_bit(LK_STATE_IN_USE, &state->flags) == 0)
945 return; 902 goto out;
946 903
947 spin_lock(&state->state_lock); 904 spin_lock(&state->state_lock);
948 lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE); 905 lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE);
949 if (lsp != NULL && (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) 906 if (lsp != NULL && (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) {
950 memcpy(dst, &lsp->ls_stateid, sizeof(*dst)); 907 nfs4_stateid_copy(dst, &lsp->ls_stateid);
908 ret = true;
909 }
951 spin_unlock(&state->state_lock); 910 spin_unlock(&state->state_lock);
952 nfs4_put_lock_state(lsp); 911 nfs4_put_lock_state(lsp);
912out:
913 return ret;
914}
915
916static void nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state)
917{
918 int seq;
919
920 do {
921 seq = read_seqbegin(&state->seqlock);
922 nfs4_stateid_copy(dst, &state->stateid);
923 } while (read_seqretry(&state->seqlock, seq));
924}
925
926/*
927 * Byte-range lock aware utility to initialize the stateid of read/write
928 * requests.
929 */
930void nfs4_select_rw_stateid(nfs4_stateid *dst, struct nfs4_state *state,
931 fmode_t fmode, fl_owner_t fl_owner, pid_t fl_pid)
932{
933 if (nfs4_copy_delegation_stateid(dst, state->inode, fmode))
934 return;
935 if (nfs4_copy_lock_stateid(dst, state, fl_owner, fl_pid))
936 return;
937 nfs4_copy_open_stateid(dst, state);
953} 938}
954 939
955struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask) 940struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask)
@@ -960,20 +945,28 @@ struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_m
960 if (new != NULL) { 945 if (new != NULL) {
961 new->sequence = counter; 946 new->sequence = counter;
962 INIT_LIST_HEAD(&new->list); 947 INIT_LIST_HEAD(&new->list);
948 new->task = NULL;
963 } 949 }
964 return new; 950 return new;
965} 951}
966 952
967void nfs_release_seqid(struct nfs_seqid *seqid) 953void nfs_release_seqid(struct nfs_seqid *seqid)
968{ 954{
969 if (!list_empty(&seqid->list)) { 955 struct nfs_seqid_counter *sequence;
970 struct rpc_sequence *sequence = seqid->sequence->sequence;
971 956
972 spin_lock(&sequence->lock); 957 if (list_empty(&seqid->list))
973 list_del_init(&seqid->list); 958 return;
974 spin_unlock(&sequence->lock); 959 sequence = seqid->sequence;
975 rpc_wake_up(&sequence->wait); 960 spin_lock(&sequence->lock);
961 list_del_init(&seqid->list);
962 if (!list_empty(&sequence->list)) {
963 struct nfs_seqid *next;
964
965 next = list_first_entry(&sequence->list,
966 struct nfs_seqid, list);
967 rpc_wake_up_queued_task(&sequence->wait, next->task);
976 } 968 }
969 spin_unlock(&sequence->lock);
977} 970}
978 971
979void nfs_free_seqid(struct nfs_seqid *seqid) 972void nfs_free_seqid(struct nfs_seqid *seqid)
@@ -989,14 +982,14 @@ void nfs_free_seqid(struct nfs_seqid *seqid)
989 */ 982 */
990static void nfs_increment_seqid(int status, struct nfs_seqid *seqid) 983static void nfs_increment_seqid(int status, struct nfs_seqid *seqid)
991{ 984{
992 BUG_ON(list_first_entry(&seqid->sequence->sequence->list, struct nfs_seqid, list) != seqid); 985 BUG_ON(list_first_entry(&seqid->sequence->list, struct nfs_seqid, list) != seqid);
993 switch (status) { 986 switch (status) {
994 case 0: 987 case 0:
995 break; 988 break;
996 case -NFS4ERR_BAD_SEQID: 989 case -NFS4ERR_BAD_SEQID:
997 if (seqid->sequence->flags & NFS_SEQID_CONFIRMED) 990 if (seqid->sequence->flags & NFS_SEQID_CONFIRMED)
998 return; 991 return;
999 printk(KERN_WARNING "NFS: v4 server returned a bad" 992 pr_warn_ratelimited("NFS: v4 server returned a bad"
1000 " sequence-id error on an" 993 " sequence-id error on an"
1001 " unconfirmed sequence %p!\n", 994 " unconfirmed sequence %p!\n",
1002 seqid->sequence); 995 seqid->sequence);
@@ -1040,10 +1033,11 @@ void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid)
1040 1033
1041int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task) 1034int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task)
1042{ 1035{
1043 struct rpc_sequence *sequence = seqid->sequence->sequence; 1036 struct nfs_seqid_counter *sequence = seqid->sequence;
1044 int status = 0; 1037 int status = 0;
1045 1038
1046 spin_lock(&sequence->lock); 1039 spin_lock(&sequence->lock);
1040 seqid->task = task;
1047 if (list_empty(&seqid->list)) 1041 if (list_empty(&seqid->list))
1048 list_add_tail(&seqid->list, &sequence->list); 1042 list_add_tail(&seqid->list, &sequence->list);
1049 if (list_first_entry(&sequence->list, struct nfs_seqid, list) == seqid) 1043 if (list_first_entry(&sequence->list, struct nfs_seqid, list) == seqid)
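Taken together, the two seqid hunks above replace a wake-everyone scheme
with a wake-one handoff; a sketch of the resulting protocol:

	/*
	 * nfs_wait_on_sequence(): record seqid->task and queue the seqid
	 *     on sequence->list; only the list head proceeds, the rest
	 *     sleep on sequence->wait.
	 * nfs_release_seqid(): dequeue the finished seqid, then wake only
	 *     the task belonging to the new list head via
	 *     rpc_wake_up_queued_task().
	 *
	 * The old rpc_wake_up() woke every waiter, only for all but the
	 * new head to requeue themselves.
	 */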
@@ -1072,19 +1066,28 @@ static void nfs4_clear_state_manager_bit(struct nfs_client *clp)
1072void nfs4_schedule_state_manager(struct nfs_client *clp) 1066void nfs4_schedule_state_manager(struct nfs_client *clp)
1073{ 1067{
1074 struct task_struct *task; 1068 struct task_struct *task;
1069 char buf[INET6_ADDRSTRLEN + sizeof("-manager") + 1];
1075 1070
1076 if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0) 1071 if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0)
1077 return; 1072 return;
1078 __module_get(THIS_MODULE); 1073 __module_get(THIS_MODULE);
1079 atomic_inc(&clp->cl_count); 1074 atomic_inc(&clp->cl_count);
1080 task = kthread_run(nfs4_run_state_manager, clp, "%s-manager", 1075
1081 rpc_peeraddr2str(clp->cl_rpcclient, 1076 /* The rcu_read_lock() is not strictly necessary, as the state
1082 RPC_DISPLAY_ADDR)); 1077 * manager is the only thread that ever changes the rpc_xprt
1083 if (!IS_ERR(task)) 1078 * after it's initialized. At this point, we're single threaded. */
1084 return; 1079 rcu_read_lock();
1085 nfs4_clear_state_manager_bit(clp); 1080 snprintf(buf, sizeof(buf), "%s-manager",
1086 nfs_put_client(clp); 1081 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR));
1087 module_put(THIS_MODULE); 1082 rcu_read_unlock();
1083 task = kthread_run(nfs4_run_state_manager, clp, buf);
1084 if (IS_ERR(task)) {
1085 printk(KERN_ERR "%s: kthread_run: %ld\n",
1086 __func__, PTR_ERR(task));
1087 nfs4_clear_state_manager_bit(clp);
1088 nfs_put_client(clp);
1089 module_put(THIS_MODULE);
1090 }
1088} 1091}
1089 1092
1090/* 1093/*
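The name buffer above is sized so the snprintf() can never truncate:

	/* INET6_ADDRSTRLEN (48 in the kernel) bounds the printable peer
	 * address, sizeof("-manager") (9) includes the NUL, +1 is slack. */
	char buf[INET6_ADDRSTRLEN + sizeof("-manager") + 1];

Formatting into a local buffer also keeps the rpc_peeraddr2str() call
inside its own rcu_read_lock() section instead of inside kthread_run()'s
varargs expansion.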
@@ -1098,10 +1101,25 @@ void nfs4_schedule_lease_recovery(struct nfs_client *clp)
1098 set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); 1101 set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
1099 nfs4_schedule_state_manager(clp); 1102 nfs4_schedule_state_manager(clp);
1100} 1103}
1104EXPORT_SYMBOL_GPL(nfs4_schedule_lease_recovery);
1105
1106/*
1107 * nfs40_handle_cb_pathdown - return all delegations after NFS4ERR_CB_PATH_DOWN
1108 * @clp: client to process
1109 *
1110 * Set the NFS4CLNT_LEASE_EXPIRED state in order to force a
1111 * resend of the SETCLIENTID and hence re-establish the
1112 * callback channel. Then return all existing delegations.
1113 */
1114static void nfs40_handle_cb_pathdown(struct nfs_client *clp)
1115{
1116 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
1117 nfs_expire_all_delegations(clp);
1118}
1101 1119
1102void nfs4_schedule_path_down_recovery(struct nfs_client *clp) 1120void nfs4_schedule_path_down_recovery(struct nfs_client *clp)
1103{ 1121{
1104 nfs_handle_cb_pathdown(clp); 1122 nfs40_handle_cb_pathdown(clp);
1105 nfs4_schedule_state_manager(clp); 1123 nfs4_schedule_state_manager(clp);
1106} 1124}
1107 1125
@@ -1132,11 +1150,37 @@ void nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4
1132{ 1150{
1133 struct nfs_client *clp = server->nfs_client; 1151 struct nfs_client *clp = server->nfs_client;
1134 1152
1135 if (test_and_clear_bit(NFS_DELEGATED_STATE, &state->flags))
1136 nfs_async_inode_return_delegation(state->inode, &state->stateid);
1137 nfs4_state_mark_reclaim_nograce(clp, state); 1153 nfs4_state_mark_reclaim_nograce(clp, state);
1138 nfs4_schedule_state_manager(clp); 1154 nfs4_schedule_state_manager(clp);
1139} 1155}
1156EXPORT_SYMBOL_GPL(nfs4_schedule_stateid_recovery);
1157
1158void nfs_inode_find_state_and_recover(struct inode *inode,
1159 const nfs4_stateid *stateid)
1160{
1161 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
1162 struct nfs_inode *nfsi = NFS_I(inode);
1163 struct nfs_open_context *ctx;
1164 struct nfs4_state *state;
1165 bool found = false;
1166
1167 spin_lock(&inode->i_lock);
1168 list_for_each_entry(ctx, &nfsi->open_files, list) {
1169 state = ctx->state;
1170 if (state == NULL)
1171 continue;
1172 if (!test_bit(NFS_DELEGATED_STATE, &state->flags))
1173 continue;
1174 if (!nfs4_stateid_match(&state->stateid, stateid))
1175 continue;
1176 nfs4_state_mark_reclaim_nograce(clp, state);
1177 found = true;
1178 }
1179 spin_unlock(&inode->i_lock);
1180 if (found)
1181 nfs4_schedule_state_manager(clp);
1182}
1183
1140 1184
1141static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_recovery_ops *ops) 1185static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_recovery_ops *ops)
1142{ 1186{
@@ -1175,8 +1219,8 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
1175 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: 1219 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
1176 goto out; 1220 goto out;
1177 default: 1221 default:
1178 printk(KERN_ERR "%s: unhandled error %d. Zeroing state\n", 1222 printk(KERN_ERR "NFS: %s: unhandled error %d. "
1179 __func__, status); 1223 "Zeroing state\n", __func__, status);
1180 case -ENOMEM: 1224 case -ENOMEM:
1181 case -NFS4ERR_DENIED: 1225 case -NFS4ERR_DENIED:
1182 case -NFS4ERR_RECLAIM_BAD: 1226 case -NFS4ERR_RECLAIM_BAD:
@@ -1222,8 +1266,9 @@ restart:
1222 spin_lock(&state->state_lock); 1266 spin_lock(&state->state_lock);
1223 list_for_each_entry(lock, &state->lock_states, ls_locks) { 1267 list_for_each_entry(lock, &state->lock_states, ls_locks) {
1224 if (!(lock->ls_flags & NFS_LOCK_INITIALIZED)) 1268 if (!(lock->ls_flags & NFS_LOCK_INITIALIZED))
1225 printk("%s: Lock reclaim failed!\n", 1269 pr_warn_ratelimited("NFS: "
1226 __func__); 1270 "%s: Lock reclaim "
1271 "failed!\n", __func__);
1227 } 1272 }
1228 spin_unlock(&state->state_lock); 1273 spin_unlock(&state->state_lock);
1229 nfs4_put_open_state(state); 1274 nfs4_put_open_state(state);
@@ -1232,8 +1277,8 @@ restart:
1232 } 1277 }
1233 switch (status) { 1278 switch (status) {
1234 default: 1279 default:
1235 printk(KERN_ERR "%s: unhandled error %d. Zeroing state\n", 1280 printk(KERN_ERR "NFS: %s: unhandled error %d. "
1236 __func__, status); 1281 "Zeroing state\n", __func__, status);
1237 case -ENOENT: 1282 case -ENOENT:
1238 case -ENOMEM: 1283 case -ENOMEM:
1239 case -ESTALE: 1284 case -ESTALE:
@@ -1241,8 +1286,8 @@ restart:
1241 * Open state on this file cannot be recovered 1286 * Open state on this file cannot be recovered
1242 * All we can do is revert to using the zero stateid. 1287 * All we can do is revert to using the zero stateid.
1243 */ 1288 */
1244 memset(state->stateid.data, 0, 1289 memset(&state->stateid, 0,
1245 sizeof(state->stateid.data)); 1290 sizeof(state->stateid));
1246 /* Mark the file as being 'closed' */ 1291 /* Mark the file as being 'closed' */
1247 state->state = 0; 1292 state->state = 0;
1248 break; 1293 break;
@@ -1420,7 +1465,7 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
1420 case 0: 1465 case 0:
1421 break; 1466 break;
1422 case -NFS4ERR_CB_PATH_DOWN: 1467 case -NFS4ERR_CB_PATH_DOWN:
1423 nfs_handle_cb_pathdown(clp); 1468 nfs40_handle_cb_pathdown(clp);
1424 break; 1469 break;
1425 case -NFS4ERR_NO_GRACE: 1470 case -NFS4ERR_NO_GRACE:
1426 nfs4_state_end_reclaim_reboot(clp); 1471 nfs4_state_end_reclaim_reboot(clp);
@@ -1801,7 +1846,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
1801 } while (atomic_read(&clp->cl_count) > 1); 1846 } while (atomic_read(&clp->cl_count) > 1);
1802 return; 1847 return;
1803out_error: 1848out_error:
1804 printk(KERN_WARNING "Error: state manager failed on NFSv4 server %s" 1849 pr_warn_ratelimited("NFS: state manager failed on NFSv4 server %s"
1805 " with error %d\n", clp->cl_hostname, -status); 1850 " with error %d\n", clp->cl_hostname, -status);
1806 nfs4_end_drain_session(clp); 1851 nfs4_end_drain_session(clp);
1807 nfs4_clear_state_manager_bit(clp); 1852 nfs4_clear_state_manager_bit(clp);
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 33bd8d0f745d..c74fdb114b48 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -44,6 +44,8 @@
44#include <linux/pagemap.h> 44#include <linux/pagemap.h>
45#include <linux/proc_fs.h> 45#include <linux/proc_fs.h>
46#include <linux/kdev_t.h> 46#include <linux/kdev_t.h>
47#include <linux/module.h>
48#include <linux/utsname.h>
47#include <linux/sunrpc/clnt.h> 49#include <linux/sunrpc/clnt.h>
48#include <linux/sunrpc/msg_prot.h> 50#include <linux/sunrpc/msg_prot.h>
49#include <linux/sunrpc/gss_api.h> 51#include <linux/sunrpc/gss_api.h>
@@ -271,7 +273,12 @@ static int nfs4_stat_to_errno(int);
271 1 /* flags */ + \ 273 1 /* flags */ + \
272 1 /* spa_how */ + \ 274 1 /* spa_how */ + \
273 0 /* SP4_NONE (for now) */ + \ 275 0 /* SP4_NONE (for now) */ + \
274 1 /* zero implemetation id array */) 276 1 /* implementation id array of size 1 */ + \
277 1 /* nii_domain */ + \
278 XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \
279 1 /* nii_name */ + \
280 XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \
281 3 /* nii_date */)
275#define decode_exchange_id_maxsz (op_decode_hdr_maxsz + \ 282#define decode_exchange_id_maxsz (op_decode_hdr_maxsz + \
276 2 /* eir_clientid */ + \ 283 2 /* eir_clientid */ + \
277 1 /* eir_sequenceid */ + \ 284 1 /* eir_sequenceid */ + \
@@ -284,7 +291,11 @@ static int nfs4_stat_to_errno(int);
284 /* eir_server_scope<> */ \ 291 /* eir_server_scope<> */ \
285 XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 + \ 292 XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 + \
286 1 /* eir_server_impl_id array length */ + \ 293 1 /* eir_server_impl_id array length */ + \
287 0 /* ignored eir_server_impl_id contents */) 294 1 /* nii_domain */ + \
295 XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \
296 1 /* nii_name */ + \
297 XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \
298 3 /* nii_date */)
288#define encode_channel_attrs_maxsz (6 + 1 /* ca_rdma_ird.len (0) */) 299#define encode_channel_attrs_maxsz (6 + 1 /* ca_rdma_ird.len (0) */)
289#define decode_channel_attrs_maxsz (6 + \ 300#define decode_channel_attrs_maxsz (6 + \
290 1 /* ca_rdma_ird.len */ + \ 301 1 /* ca_rdma_ird.len */ + \
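A quick check of the size arithmetic in the two maxsz updates above (XDR
sizes count 4-byte words, and XDR_QUADLEN rounds a byte length up to
words):

	/* from include/linux/sunrpc/xdr.h */
	#define XDR_QUADLEN(l)	(((l) + 3) >> 2)

With NFS4_OPAQUE_LIMIT = 1024, XDR_QUADLEN(NFS4_OPAQUE_LIMIT) = 256, so a
single nfs_impl_id4 entry adds 1 + 256 (nii_domain) + 1 + 256 (nii_name)
+ 3 (nii_date, an nfstime4) = 517 words beyond the array-length word.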
@@ -838,6 +849,12 @@ const u32 nfs41_maxread_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
838 XDR_UNIT); 849 XDR_UNIT);
839#endif /* CONFIG_NFS_V4_1 */ 850#endif /* CONFIG_NFS_V4_1 */
840 851
852static unsigned short send_implementation_id = 1;
853
854module_param(send_implementation_id, ushort, 0644);
855MODULE_PARM_DESC(send_implementation_id,
856 "Send implementation ID with NFSv4.1 exchange_id");
857
841static const umode_t nfs_type2fmt[] = { 858static const umode_t nfs_type2fmt[] = {
842 [NF4BAD] = 0, 859 [NF4BAD] = 0,
843 [NF4REG] = S_IFREG, 860 [NF4REG] = S_IFREG,
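The new module parameter can be toggled at load time or at runtime; a
usage sketch:

	/* Assuming nfs4xdr.c is still linked into nfs.ko, so the sysfs
	 * path follows that module name:
	 *
	 *   modprobe nfs send_implementation_id=0
	 *   echo 0 > /sys/module/nfs/parameters/send_implementation_id
	 *
	 * Mode 0644 makes the parameter world-readable and root-writable
	 * at runtime. */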
@@ -868,15 +885,44 @@ static __be32 *reserve_space(struct xdr_stream *xdr, size_t nbytes)
868 return p; 885 return p;
869} 886}
870 887
888static void encode_opaque_fixed(struct xdr_stream *xdr, const void *buf, size_t len)
889{
890 __be32 *p;
891
892 p = xdr_reserve_space(xdr, len);
893 xdr_encode_opaque_fixed(p, buf, len);
894}
895
871static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *str) 896static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *str)
872{ 897{
873 __be32 *p; 898 __be32 *p;
874 899
875 p = xdr_reserve_space(xdr, 4 + len); 900 p = reserve_space(xdr, 4 + len);
876 BUG_ON(p == NULL);
877 xdr_encode_opaque(p, str, len); 901 xdr_encode_opaque(p, str, len);
878} 902}
879 903
904static void encode_uint32(struct xdr_stream *xdr, u32 n)
905{
906 __be32 *p;
907
908 p = reserve_space(xdr, 4);
909 *p = cpu_to_be32(n);
910}
911
912static void encode_uint64(struct xdr_stream *xdr, u64 n)
913{
914 __be32 *p;
915
916 p = reserve_space(xdr, 8);
917 xdr_encode_hyper(p, n);
918}
919
920static void encode_nfs4_seqid(struct xdr_stream *xdr,
921 const struct nfs_seqid *seqid)
922{
923 encode_uint32(xdr, seqid->sequence->counter);
924}
925
880static void encode_compound_hdr(struct xdr_stream *xdr, 926static void encode_compound_hdr(struct xdr_stream *xdr,
881 struct rpc_rqst *req, 927 struct rpc_rqst *req,
882 struct compound_hdr *hdr) 928 struct compound_hdr *hdr)
@@ -889,28 +935,37 @@ static void encode_compound_hdr(struct xdr_stream *xdr,
889 * but this is not required as a MUST for the server to do so. */ 935 * but this is not required as a MUST for the server to do so. */
890 hdr->replen = RPC_REPHDRSIZE + auth->au_rslack + 3 + hdr->taglen; 936 hdr->replen = RPC_REPHDRSIZE + auth->au_rslack + 3 + hdr->taglen;
891 937
892 dprintk("encode_compound: tag=%.*s\n", (int)hdr->taglen, hdr->tag);
893 BUG_ON(hdr->taglen > NFS4_MAXTAGLEN); 938 BUG_ON(hdr->taglen > NFS4_MAXTAGLEN);
894 p = reserve_space(xdr, 4 + hdr->taglen + 8); 939 encode_string(xdr, hdr->taglen, hdr->tag);
895 p = xdr_encode_opaque(p, hdr->tag, hdr->taglen); 940 p = reserve_space(xdr, 8);
896 *p++ = cpu_to_be32(hdr->minorversion); 941 *p++ = cpu_to_be32(hdr->minorversion);
897 hdr->nops_p = p; 942 hdr->nops_p = p;
898 *p = cpu_to_be32(hdr->nops); 943 *p = cpu_to_be32(hdr->nops);
899} 944}
900 945
946static void encode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 op,
947 uint32_t replen,
948 struct compound_hdr *hdr)
949{
950 encode_uint32(xdr, op);
951 hdr->nops++;
952 hdr->replen += replen;
953}
954
901static void encode_nops(struct compound_hdr *hdr) 955static void encode_nops(struct compound_hdr *hdr)
902{ 956{
903 BUG_ON(hdr->nops > NFS4_MAX_OPS); 957 BUG_ON(hdr->nops > NFS4_MAX_OPS);
904 *hdr->nops_p = htonl(hdr->nops); 958 *hdr->nops_p = htonl(hdr->nops);
905} 959}
906 960
907static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *verf) 961static void encode_nfs4_stateid(struct xdr_stream *xdr, const nfs4_stateid *stateid)
908{ 962{
909 __be32 *p; 963 encode_opaque_fixed(xdr, stateid, NFS4_STATEID_SIZE);
964}
910 965
911 p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE); 966static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *verf)
912 BUG_ON(p == NULL); 967{
913 xdr_encode_opaque_fixed(p, verf->data, NFS4_VERIFIER_SIZE); 968 encode_opaque_fixed(xdr, verf->data, NFS4_VERIFIER_SIZE);
914} 969}
915 970
916static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const struct nfs_server *server) 971static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const struct nfs_server *server)
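With encode_op_hdr() accounting for nops and replen, most encoders reduce
to a declarative sequence of helper calls. A hypothetical operation
(OP_EXAMPLE and decode_example_maxsz are stand-ins, not part of the
patch):

	static void encode_example(struct xdr_stream *xdr, u32 flags,
				   u64 cookie, const nfs4_stateid *stateid,
				   struct compound_hdr *hdr)
	{
		encode_op_hdr(xdr, OP_EXAMPLE, decode_example_maxsz, hdr);
		encode_uint32(xdr, flags);		/* 1 XDR word  */
		encode_uint64(xdr, cookie);		/* 2 XDR words */
		encode_nfs4_stateid(xdr, stateid);	/* 16 bytes    */
	}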
@@ -1023,7 +1078,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
1023 * Now we backfill the bitmap and the attribute buffer length. 1078 * Now we backfill the bitmap and the attribute buffer length.
1024 */ 1079 */
1025 if (len != ((char *)p - (char *)q) + 4) { 1080 if (len != ((char *)p - (char *)q) + 4) {
1026 printk(KERN_ERR "nfs: Attr length error, %u != %Zu\n", 1081 printk(KERN_ERR "NFS: Attr length error, %u != %Zu\n",
1027 len, ((char *)p - (char *)q) + 4); 1082 len, ((char *)p - (char *)q) + 4);
1028 BUG(); 1083 BUG();
1029 } 1084 }
@@ -1037,46 +1092,33 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
1037 1092
1038static void encode_access(struct xdr_stream *xdr, u32 access, struct compound_hdr *hdr) 1093static void encode_access(struct xdr_stream *xdr, u32 access, struct compound_hdr *hdr)
1039{ 1094{
1040 __be32 *p; 1095 encode_op_hdr(xdr, OP_ACCESS, decode_access_maxsz, hdr);
1041 1096 encode_uint32(xdr, access);
1042 p = reserve_space(xdr, 8);
1043 *p++ = cpu_to_be32(OP_ACCESS);
1044 *p = cpu_to_be32(access);
1045 hdr->nops++;
1046 hdr->replen += decode_access_maxsz;
1047} 1097}
1048 1098
1049static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr) 1099static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr)
1050{ 1100{
1051 __be32 *p; 1101 encode_op_hdr(xdr, OP_CLOSE, decode_close_maxsz, hdr);
1052 1102 encode_nfs4_seqid(xdr, arg->seqid);
1053 p = reserve_space(xdr, 8+NFS4_STATEID_SIZE); 1103 encode_nfs4_stateid(xdr, arg->stateid);
1054 *p++ = cpu_to_be32(OP_CLOSE);
1055 *p++ = cpu_to_be32(arg->seqid->sequence->counter);
1056 xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
1057 hdr->nops++;
1058 hdr->replen += decode_close_maxsz;
1059} 1104}
1060 1105
1061static void encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *args, struct compound_hdr *hdr) 1106static void encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *args, struct compound_hdr *hdr)
1062{ 1107{
1063 __be32 *p; 1108 __be32 *p;
1064 1109
1065 p = reserve_space(xdr, 16); 1110 encode_op_hdr(xdr, OP_COMMIT, decode_commit_maxsz, hdr);
1066 *p++ = cpu_to_be32(OP_COMMIT); 1111 p = reserve_space(xdr, 12);
1067 p = xdr_encode_hyper(p, args->offset); 1112 p = xdr_encode_hyper(p, args->offset);
1068 *p = cpu_to_be32(args->count); 1113 *p = cpu_to_be32(args->count);
1069 hdr->nops++;
1070 hdr->replen += decode_commit_maxsz;
1071} 1114}
1072 1115
1073static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *create, struct compound_hdr *hdr) 1116static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *create, struct compound_hdr *hdr)
1074{ 1117{
1075 __be32 *p; 1118 __be32 *p;
1076 1119
1077 p = reserve_space(xdr, 8); 1120 encode_op_hdr(xdr, OP_CREATE, decode_create_maxsz, hdr);
1078 *p++ = cpu_to_be32(OP_CREATE); 1121 encode_uint32(xdr, create->ftype);
1079 *p = cpu_to_be32(create->ftype);
1080 1122
1081 switch (create->ftype) { 1123 switch (create->ftype) {
1082 case NF4LNK: 1124 case NF4LNK:
@@ -1096,9 +1138,6 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
1096 } 1138 }
1097 1139
1098 encode_string(xdr, create->name->len, create->name->name); 1140 encode_string(xdr, create->name->len, create->name->name);
1099 hdr->nops++;
1100 hdr->replen += decode_create_maxsz;
1101
1102 encode_attrs(xdr, create->attrs, create->server); 1141 encode_attrs(xdr, create->attrs, create->server);
1103} 1142}
1104 1143
@@ -1106,25 +1145,21 @@ static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct c
1106{ 1145{
1107 __be32 *p; 1146 __be32 *p;
1108 1147
1109 p = reserve_space(xdr, 12); 1148 encode_op_hdr(xdr, OP_GETATTR, decode_getattr_maxsz, hdr);
1110 *p++ = cpu_to_be32(OP_GETATTR); 1149 p = reserve_space(xdr, 8);
1111 *p++ = cpu_to_be32(1); 1150 *p++ = cpu_to_be32(1);
1112 *p = cpu_to_be32(bitmap); 1151 *p = cpu_to_be32(bitmap);
1113 hdr->nops++;
1114 hdr->replen += decode_getattr_maxsz;
1115} 1152}
1116 1153
1117static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm1, struct compound_hdr *hdr) 1154static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm1, struct compound_hdr *hdr)
1118{ 1155{
1119 __be32 *p; 1156 __be32 *p;
1120 1157
1121 p = reserve_space(xdr, 16); 1158 encode_op_hdr(xdr, OP_GETATTR, decode_getattr_maxsz, hdr);
1122 *p++ = cpu_to_be32(OP_GETATTR); 1159 p = reserve_space(xdr, 12);
1123 *p++ = cpu_to_be32(2); 1160 *p++ = cpu_to_be32(2);
1124 *p++ = cpu_to_be32(bm0); 1161 *p++ = cpu_to_be32(bm0);
1125 *p = cpu_to_be32(bm1); 1162 *p = cpu_to_be32(bm1);
1126 hdr->nops++;
1127 hdr->replen += decode_getattr_maxsz;
1128} 1163}
1129 1164
1130static void 1165static void
@@ -1134,8 +1169,7 @@ encode_getattr_three(struct xdr_stream *xdr,
1134{ 1169{
1135 __be32 *p; 1170 __be32 *p;
1136 1171
1137 p = reserve_space(xdr, 4); 1172 encode_op_hdr(xdr, OP_GETATTR, decode_getattr_maxsz, hdr);
1138 *p = cpu_to_be32(OP_GETATTR);
1139 if (bm2) { 1173 if (bm2) {
1140 p = reserve_space(xdr, 16); 1174 p = reserve_space(xdr, 16);
1141 *p++ = cpu_to_be32(3); 1175 *p++ = cpu_to_be32(3);
@@ -1152,8 +1186,6 @@ encode_getattr_three(struct xdr_stream *xdr,
1152 *p++ = cpu_to_be32(1); 1186 *p++ = cpu_to_be32(1);
1153 *p = cpu_to_be32(bm0); 1187 *p = cpu_to_be32(bm0);
1154 } 1188 }
1155 hdr->nops++;
1156 hdr->replen += decode_getattr_maxsz;
1157} 1189}
1158 1190
1159static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) 1191static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
@@ -1179,23 +1211,13 @@ static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, stru
1179 1211
1180static void encode_getfh(struct xdr_stream *xdr, struct compound_hdr *hdr) 1212static void encode_getfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
1181{ 1213{
1182 __be32 *p; 1214 encode_op_hdr(xdr, OP_GETFH, decode_getfh_maxsz, hdr);
1183
1184 p = reserve_space(xdr, 4);
1185 *p = cpu_to_be32(OP_GETFH);
1186 hdr->nops++;
1187 hdr->replen += decode_getfh_maxsz;
1188} 1215}
1189 1216
1190static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr) 1217static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
1191{ 1218{
1192 __be32 *p; 1219 encode_op_hdr(xdr, OP_LINK, decode_link_maxsz, hdr);
1193 1220 encode_string(xdr, name->len, name->name);
1194 p = reserve_space(xdr, 8 + name->len);
1195 *p++ = cpu_to_be32(OP_LINK);
1196 xdr_encode_opaque(p, name->name, name->len);
1197 hdr->nops++;
1198 hdr->replen += decode_link_maxsz;
1199} 1221}
1200 1222
1201static inline int nfs4_lock_type(struct file_lock *fl, int block) 1223static inline int nfs4_lock_type(struct file_lock *fl, int block)
@@ -1232,79 +1254,60 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args
1232{ 1254{
1233 __be32 *p; 1255 __be32 *p;
1234 1256
1235 p = reserve_space(xdr, 32); 1257 encode_op_hdr(xdr, OP_LOCK, decode_lock_maxsz, hdr);
1236 *p++ = cpu_to_be32(OP_LOCK); 1258 p = reserve_space(xdr, 28);
1237 *p++ = cpu_to_be32(nfs4_lock_type(args->fl, args->block)); 1259 *p++ = cpu_to_be32(nfs4_lock_type(args->fl, args->block));
1238 *p++ = cpu_to_be32(args->reclaim); 1260 *p++ = cpu_to_be32(args->reclaim);
1239 p = xdr_encode_hyper(p, args->fl->fl_start); 1261 p = xdr_encode_hyper(p, args->fl->fl_start);
1240 p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); 1262 p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
1241 *p = cpu_to_be32(args->new_lock_owner); 1263 *p = cpu_to_be32(args->new_lock_owner);
1242 if (args->new_lock_owner){ 1264 if (args->new_lock_owner){
1243 p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); 1265 encode_nfs4_seqid(xdr, args->open_seqid);
1244 *p++ = cpu_to_be32(args->open_seqid->sequence->counter); 1266 encode_nfs4_stateid(xdr, args->open_stateid);
1245 p = xdr_encode_opaque_fixed(p, args->open_stateid->data, NFS4_STATEID_SIZE); 1267 encode_nfs4_seqid(xdr, args->lock_seqid);
1246 *p++ = cpu_to_be32(args->lock_seqid->sequence->counter);
1247 encode_lockowner(xdr, &args->lock_owner); 1268 encode_lockowner(xdr, &args->lock_owner);
1248 } 1269 }
1249 else { 1270 else {
1250 p = reserve_space(xdr, NFS4_STATEID_SIZE+4); 1271 encode_nfs4_stateid(xdr, args->lock_stateid);
1251 p = xdr_encode_opaque_fixed(p, args->lock_stateid->data, NFS4_STATEID_SIZE); 1272 encode_nfs4_seqid(xdr, args->lock_seqid);
1252 *p = cpu_to_be32(args->lock_seqid->sequence->counter);
1253 } 1273 }
1254 hdr->nops++;
1255 hdr->replen += decode_lock_maxsz;
1256} 1274}
1257 1275
1258static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *args, struct compound_hdr *hdr) 1276static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *args, struct compound_hdr *hdr)
1259{ 1277{
1260 __be32 *p; 1278 __be32 *p;
1261 1279
1262 p = reserve_space(xdr, 24); 1280 encode_op_hdr(xdr, OP_LOCKT, decode_lockt_maxsz, hdr);
1263 *p++ = cpu_to_be32(OP_LOCKT); 1281 p = reserve_space(xdr, 20);
1264 *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0)); 1282 *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0));
1265 p = xdr_encode_hyper(p, args->fl->fl_start); 1283 p = xdr_encode_hyper(p, args->fl->fl_start);
1266 p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); 1284 p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
1267 encode_lockowner(xdr, &args->lock_owner); 1285 encode_lockowner(xdr, &args->lock_owner);
1268 hdr->nops++;
1269 hdr->replen += decode_lockt_maxsz;
1270} 1286}
1271 1287
1272static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *args, struct compound_hdr *hdr) 1288static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *args, struct compound_hdr *hdr)
1273{ 1289{
1274 __be32 *p; 1290 __be32 *p;
1275 1291
1276 p = reserve_space(xdr, 12+NFS4_STATEID_SIZE+16); 1292 encode_op_hdr(xdr, OP_LOCKU, decode_locku_maxsz, hdr);
1277 *p++ = cpu_to_be32(OP_LOCKU); 1293 encode_uint32(xdr, nfs4_lock_type(args->fl, 0));
1278 *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0)); 1294 encode_nfs4_seqid(xdr, args->seqid);
1279 *p++ = cpu_to_be32(args->seqid->sequence->counter); 1295 encode_nfs4_stateid(xdr, args->stateid);
1280 p = xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE); 1296 p = reserve_space(xdr, 16);
1281 p = xdr_encode_hyper(p, args->fl->fl_start); 1297 p = xdr_encode_hyper(p, args->fl->fl_start);
1282 xdr_encode_hyper(p, nfs4_lock_length(args->fl)); 1298 xdr_encode_hyper(p, nfs4_lock_length(args->fl));
1283 hdr->nops++;
1284 hdr->replen += decode_locku_maxsz;
1285} 1299}
1286 1300
1287static void encode_release_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner, struct compound_hdr *hdr) 1301static void encode_release_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner, struct compound_hdr *hdr)
1288{ 1302{
1289 __be32 *p; 1303 encode_op_hdr(xdr, OP_RELEASE_LOCKOWNER, decode_release_lockowner_maxsz, hdr);
1290
1291 p = reserve_space(xdr, 4);
1292 *p = cpu_to_be32(OP_RELEASE_LOCKOWNER);
1293 encode_lockowner(xdr, lowner); 1304 encode_lockowner(xdr, lowner);
1294 hdr->nops++;
1295 hdr->replen += decode_release_lockowner_maxsz;
1296} 1305}
1297 1306
1298static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr) 1307static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
1299{ 1308{
1300 int len = name->len; 1309 encode_op_hdr(xdr, OP_LOOKUP, decode_lookup_maxsz, hdr);
1301 __be32 *p; 1310 encode_string(xdr, name->len, name->name);
1302
1303 p = reserve_space(xdr, 8 + len);
1304 *p++ = cpu_to_be32(OP_LOOKUP);
1305 xdr_encode_opaque(p, name->name, len);
1306 hdr->nops++;
1307 hdr->replen += decode_lookup_maxsz;
1308} 1311}
1309 1312
1310static void encode_share_access(struct xdr_stream *xdr, fmode_t fmode) 1313static void encode_share_access(struct xdr_stream *xdr, fmode_t fmode)
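The conversions above are size-neutral; as a check:

	/* Wire-size check for the lock encoders above:
	 *   LOCK : old reserve_space(32) = op 4 + type 4 + reclaim 4
	 *          + offset 8 + length 8 + new_lock_owner 4;
	 *          new = encode_op_hdr() 4 + reserve_space(28).
	 *   LOCKT: old reserve_space(24) = op 4 + type 4 + offset 8
	 *          + length 8;
	 *          new = encode_op_hdr() 4 + reserve_space(20).
	 * The bytes on the wire are unchanged; only the opcode emission
	 * moved into the shared helper. */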
@@ -1335,9 +1338,7 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
1335 * opcode 4, seqid 4, share_access 4, share_deny 4, clientid 8, ownerlen 4, 1338 * opcode 4, seqid 4, share_access 4, share_deny 4, clientid 8, ownerlen 4,
1336 * owner 4 = 32 1339 * owner 4 = 32
1337 */ 1340 */
1338 p = reserve_space(xdr, 8); 1341 encode_nfs4_seqid(xdr, arg->seqid);
1339 *p++ = cpu_to_be32(OP_OPEN);
1340 *p = cpu_to_be32(arg->seqid->sequence->counter);
1341 encode_share_access(xdr, arg->fmode); 1342 encode_share_access(xdr, arg->fmode);
1342 p = reserve_space(xdr, 32); 1343 p = reserve_space(xdr, 32);
1343 p = xdr_encode_hyper(p, arg->clientid); 1344 p = xdr_encode_hyper(p, arg->clientid);
@@ -1437,14 +1438,15 @@ static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struc
1437{ 1438{
1438 __be32 *p; 1439 __be32 *p;
1439 1440
1440 p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); 1441 p = reserve_space(xdr, 4);
1441 *p++ = cpu_to_be32(NFS4_OPEN_CLAIM_DELEGATE_CUR); 1442 *p = cpu_to_be32(NFS4_OPEN_CLAIM_DELEGATE_CUR);
1442 xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE); 1443 encode_nfs4_stateid(xdr, stateid);
1443 encode_string(xdr, name->len, name->name); 1444 encode_string(xdr, name->len, name->name);
1444} 1445}
1445 1446
1446static void encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg, struct compound_hdr *hdr) 1447static void encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg, struct compound_hdr *hdr)
1447{ 1448{
1449 encode_op_hdr(xdr, OP_OPEN, decode_open_maxsz, hdr);
1448 encode_openhdr(xdr, arg); 1450 encode_openhdr(xdr, arg);
1449 encode_opentype(xdr, arg); 1451 encode_opentype(xdr, arg);
1450 switch (arg->claim) { 1452 switch (arg->claim) {
@@ -1460,88 +1462,64 @@ static void encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg,
1460 default: 1462 default:
1461 BUG(); 1463 BUG();
1462 } 1464 }
1463 hdr->nops++;
1464 hdr->replen += decode_open_maxsz;
1465} 1465}
1466 1466
1467static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_confirmargs *arg, struct compound_hdr *hdr) 1467static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_confirmargs *arg, struct compound_hdr *hdr)
1468{ 1468{
1469 __be32 *p; 1469 encode_op_hdr(xdr, OP_OPEN_CONFIRM, decode_open_confirm_maxsz, hdr);
1470 1470 encode_nfs4_stateid(xdr, arg->stateid);
1471 p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); 1471 encode_nfs4_seqid(xdr, arg->seqid);
1472 *p++ = cpu_to_be32(OP_OPEN_CONFIRM);
1473 p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
1474 *p = cpu_to_be32(arg->seqid->sequence->counter);
1475 hdr->nops++;
1476 hdr->replen += decode_open_confirm_maxsz;
1477} 1472}
1478 1473
1479static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr) 1474static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr)
1480{ 1475{
1481 __be32 *p; 1476 encode_op_hdr(xdr, OP_OPEN_DOWNGRADE, decode_open_downgrade_maxsz, hdr);
1482 1477 encode_nfs4_stateid(xdr, arg->stateid);
1483 p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); 1478 encode_nfs4_seqid(xdr, arg->seqid);
1484 *p++ = cpu_to_be32(OP_OPEN_DOWNGRADE);
1485 p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
1486 *p = cpu_to_be32(arg->seqid->sequence->counter);
1487 encode_share_access(xdr, arg->fmode); 1479 encode_share_access(xdr, arg->fmode);
1488 hdr->nops++;
1489 hdr->replen += decode_open_downgrade_maxsz;
1490} 1480}
1491 1481
1492static void 1482static void
1493encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh, struct compound_hdr *hdr) 1483encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh, struct compound_hdr *hdr)
1494{ 1484{
1495 int len = fh->size; 1485 encode_op_hdr(xdr, OP_PUTFH, decode_putfh_maxsz, hdr);
1496 __be32 *p; 1486 encode_string(xdr, fh->size, fh->data);
1497
1498 p = reserve_space(xdr, 8 + len);
1499 *p++ = cpu_to_be32(OP_PUTFH);
1500 xdr_encode_opaque(p, fh->data, len);
1501 hdr->nops++;
1502 hdr->replen += decode_putfh_maxsz;
1503} 1487}
1504 1488
1505static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr) 1489static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
1506{ 1490{
1507 __be32 *p; 1491 encode_op_hdr(xdr, OP_PUTROOTFH, decode_putrootfh_maxsz, hdr);
1508
1509 p = reserve_space(xdr, 4);
1510 *p = cpu_to_be32(OP_PUTROOTFH);
1511 hdr->nops++;
1512 hdr->replen += decode_putrootfh_maxsz;
1513} 1492}
1514 1493
1515static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx, int zero_seqid) 1494static void encode_open_stateid(struct xdr_stream *xdr,
1495 const struct nfs_open_context *ctx,
1496 const struct nfs_lock_context *l_ctx,
1497 fmode_t fmode,
1498 int zero_seqid)
1516{ 1499{
1517 nfs4_stateid stateid; 1500 nfs4_stateid stateid;
1518 __be32 *p;
1519 1501
1520 p = reserve_space(xdr, NFS4_STATEID_SIZE);
1521 if (ctx->state != NULL) { 1502 if (ctx->state != NULL) {
1522 nfs4_copy_stateid(&stateid, ctx->state, l_ctx->lockowner, l_ctx->pid); 1503 nfs4_select_rw_stateid(&stateid, ctx->state,
1504 fmode, l_ctx->lockowner, l_ctx->pid);
1523 if (zero_seqid) 1505 if (zero_seqid)
1524 stateid.stateid.seqid = 0; 1506 stateid.seqid = 0;
1525 xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE); 1507 encode_nfs4_stateid(xdr, &stateid);
1526 } else 1508 } else
1527 xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); 1509 encode_nfs4_stateid(xdr, &zero_stateid);
1528} 1510}
1529 1511
1530static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr) 1512static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr)
1531{ 1513{
1532 __be32 *p; 1514 __be32 *p;
1533 1515
1534 p = reserve_space(xdr, 4); 1516 encode_op_hdr(xdr, OP_READ, decode_read_maxsz, hdr);
1535 *p = cpu_to_be32(OP_READ); 1517 encode_open_stateid(xdr, args->context, args->lock_context,
1536 1518 FMODE_READ, hdr->minorversion);
1537 encode_stateid(xdr, args->context, args->lock_context,
1538 hdr->minorversion);
1539 1519
1540 p = reserve_space(xdr, 12); 1520 p = reserve_space(xdr, 12);
1541 p = xdr_encode_hyper(p, args->offset); 1521 p = xdr_encode_hyper(p, args->offset);
1542 *p = cpu_to_be32(args->count); 1522 *p = cpu_to_be32(args->count);
1543 hdr->nops++;
1544 hdr->replen += decode_read_maxsz;
1545} 1523}
1546 1524
1547static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr) 1525static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr)
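One subtlety in encode_open_stateid() above, noted as a hedge rather than
from the patch text itself:

	/* seqid is zeroed whenever hdr->minorversion != 0 because NFSv4.1
	 * treats a stateid whose seqid is 0 as "the current incarnation"
	 * (see RFC 5661), sidestepping OLD_STATEID races between opens and
	 * in-flight reads and writes. */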
@@ -1551,7 +1529,7 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
1551 FATTR4_WORD1_MOUNTED_ON_FILEID, 1529 FATTR4_WORD1_MOUNTED_ON_FILEID,
1552 }; 1530 };
1553 uint32_t dircount = readdir->count >> 1; 1531 uint32_t dircount = readdir->count >> 1;
1554 __be32 *p; 1532 __be32 *p, verf[2];
1555 1533
1556 if (readdir->plus) { 1534 if (readdir->plus) {
1557 attrs[0] |= FATTR4_WORD0_TYPE|FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE| 1535 attrs[0] |= FATTR4_WORD0_TYPE|FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE|
@@ -1566,80 +1544,54 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
1566 if (!(readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) 1544 if (!(readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID))
1567 attrs[0] |= FATTR4_WORD0_FILEID; 1545 attrs[0] |= FATTR4_WORD0_FILEID;
1568 1546
1569 p = reserve_space(xdr, 12+NFS4_VERIFIER_SIZE+20); 1547 encode_op_hdr(xdr, OP_READDIR, decode_readdir_maxsz, hdr);
1570 *p++ = cpu_to_be32(OP_READDIR); 1548 encode_uint64(xdr, readdir->cookie);
1571 p = xdr_encode_hyper(p, readdir->cookie); 1549 encode_nfs4_verifier(xdr, &readdir->verifier);
1572 p = xdr_encode_opaque_fixed(p, readdir->verifier.data, NFS4_VERIFIER_SIZE); 1550 p = reserve_space(xdr, 20);
1573 *p++ = cpu_to_be32(dircount); 1551 *p++ = cpu_to_be32(dircount);
1574 *p++ = cpu_to_be32(readdir->count); 1552 *p++ = cpu_to_be32(readdir->count);
1575 *p++ = cpu_to_be32(2); 1553 *p++ = cpu_to_be32(2);
1576 1554
1577 *p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]); 1555 *p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]);
1578 *p = cpu_to_be32(attrs[1] & readdir->bitmask[1]); 1556 *p = cpu_to_be32(attrs[1] & readdir->bitmask[1]);
1579 hdr->nops++; 1557 memcpy(verf, readdir->verifier.data, sizeof(verf));
1580 hdr->replen += decode_readdir_maxsz;
1581 dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n", 1558 dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n",
1582 __func__, 1559 __func__,
1583 (unsigned long long)readdir->cookie, 1560 (unsigned long long)readdir->cookie,
1584 ((u32 *)readdir->verifier.data)[0], 1561 verf[0], verf[1],
1585 ((u32 *)readdir->verifier.data)[1],
1586 attrs[0] & readdir->bitmask[0], 1562 attrs[0] & readdir->bitmask[0],
1587 attrs[1] & readdir->bitmask[1]); 1563 attrs[1] & readdir->bitmask[1]);
1588} 1564}
1589 1565
1590static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *readlink, struct rpc_rqst *req, struct compound_hdr *hdr) 1566static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *readlink, struct rpc_rqst *req, struct compound_hdr *hdr)
1591{ 1567{
1592 __be32 *p; 1568 encode_op_hdr(xdr, OP_READLINK, decode_readlink_maxsz, hdr);
1593
1594 p = reserve_space(xdr, 4);
1595 *p = cpu_to_be32(OP_READLINK);
1596 hdr->nops++;
1597 hdr->replen += decode_readlink_maxsz;
1598} 1569}
1599 1570
1600static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr) 1571static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
1601{ 1572{
1602 __be32 *p; 1573 encode_op_hdr(xdr, OP_REMOVE, decode_remove_maxsz, hdr);
1603 1574 encode_string(xdr, name->len, name->name);
1604 p = reserve_space(xdr, 8 + name->len);
1605 *p++ = cpu_to_be32(OP_REMOVE);
1606 xdr_encode_opaque(p, name->name, name->len);
1607 hdr->nops++;
1608 hdr->replen += decode_remove_maxsz;
1609} 1575}
1610 1576
1611static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, const struct qstr *newname, struct compound_hdr *hdr) 1577static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, const struct qstr *newname, struct compound_hdr *hdr)
1612{ 1578{
1613 __be32 *p; 1579 encode_op_hdr(xdr, OP_RENAME, decode_rename_maxsz, hdr);
1614
1615 p = reserve_space(xdr, 4);
1616 *p = cpu_to_be32(OP_RENAME);
1617 encode_string(xdr, oldname->len, oldname->name); 1580 encode_string(xdr, oldname->len, oldname->name);
1618 encode_string(xdr, newname->len, newname->name); 1581 encode_string(xdr, newname->len, newname->name);
1619 hdr->nops++;
1620 hdr->replen += decode_rename_maxsz;
1621} 1582}
1622 1583
1623static void encode_renew(struct xdr_stream *xdr, const struct nfs_client *client_stateid, struct compound_hdr *hdr) 1584static void encode_renew(struct xdr_stream *xdr, clientid4 clid,
1585 struct compound_hdr *hdr)
1624{ 1586{
1625 __be32 *p; 1587 encode_op_hdr(xdr, OP_RENEW, decode_renew_maxsz, hdr);
1626 1588 encode_uint64(xdr, clid);
1627 p = reserve_space(xdr, 12);
1628 *p++ = cpu_to_be32(OP_RENEW);
1629 xdr_encode_hyper(p, client_stateid->cl_clientid);
1630 hdr->nops++;
1631 hdr->replen += decode_renew_maxsz;
1632} 1589}
1633 1590
1634static void 1591static void
1635encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr) 1592encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
1636{ 1593{
1637 __be32 *p; 1594 encode_op_hdr(xdr, OP_RESTOREFH, decode_restorefh_maxsz, hdr);
1638
1639 p = reserve_space(xdr, 4);
1640 *p = cpu_to_be32(OP_RESTOREFH);
1641 hdr->nops++;
1642 hdr->replen += decode_restorefh_maxsz;
1643} 1595}
1644 1596
1645static void 1597static void
@@ -1647,9 +1599,8 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun
1647{ 1599{
1648 __be32 *p; 1600 __be32 *p;
1649 1601
1650 p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); 1602 encode_op_hdr(xdr, OP_SETATTR, decode_setacl_maxsz, hdr);
1651 *p++ = cpu_to_be32(OP_SETATTR); 1603 encode_nfs4_stateid(xdr, &zero_stateid);
1652 xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
1653 p = reserve_space(xdr, 2*4); 1604 p = reserve_space(xdr, 2*4);
1654 *p++ = cpu_to_be32(1); 1605 *p++ = cpu_to_be32(1);
1655 *p = cpu_to_be32(FATTR4_WORD0_ACL); 1606 *p = cpu_to_be32(FATTR4_WORD0_ACL);
@@ -1657,30 +1608,18 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun
1657 p = reserve_space(xdr, 4); 1608 p = reserve_space(xdr, 4);
1658 *p = cpu_to_be32(arg->acl_len); 1609 *p = cpu_to_be32(arg->acl_len);
1659 xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len); 1610 xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len);
1660 hdr->nops++;
1661 hdr->replen += decode_setacl_maxsz;
1662} 1611}
1663 1612
1664static void 1613static void
1665encode_savefh(struct xdr_stream *xdr, struct compound_hdr *hdr) 1614encode_savefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
1666{ 1615{
1667 __be32 *p; 1616 encode_op_hdr(xdr, OP_SAVEFH, decode_savefh_maxsz, hdr);
1668
1669 p = reserve_space(xdr, 4);
1670 *p = cpu_to_be32(OP_SAVEFH);
1671 hdr->nops++;
1672 hdr->replen += decode_savefh_maxsz;
1673} 1617}
1674 1618
1675static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs *arg, const struct nfs_server *server, struct compound_hdr *hdr) 1619static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs *arg, const struct nfs_server *server, struct compound_hdr *hdr)
1676{ 1620{
1677 __be32 *p; 1621 encode_op_hdr(xdr, OP_SETATTR, decode_setattr_maxsz, hdr);
1678 1622 encode_nfs4_stateid(xdr, &arg->stateid);
1679 p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
1680 *p++ = cpu_to_be32(OP_SETATTR);
1681 xdr_encode_opaque_fixed(p, arg->stateid.data, NFS4_STATEID_SIZE);
1682 hdr->nops++;
1683 hdr->replen += decode_setattr_maxsz;
1684 encode_attrs(xdr, arg->iap, server); 1623 encode_attrs(xdr, arg->iap, server);
1685} 1624}
1686 1625
@@ -1688,9 +1627,8 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie
1688{ 1627{
1689 __be32 *p; 1628 __be32 *p;
1690 1629
1691 p = reserve_space(xdr, 4 + NFS4_VERIFIER_SIZE); 1630 encode_op_hdr(xdr, OP_SETCLIENTID, decode_setclientid_maxsz, hdr);
1692 *p++ = cpu_to_be32(OP_SETCLIENTID); 1631 encode_nfs4_verifier(xdr, setclientid->sc_verifier);
1693 xdr_encode_opaque_fixed(p, setclientid->sc_verifier->data, NFS4_VERIFIER_SIZE);
1694 1632
1695 encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name); 1633 encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name);
1696 p = reserve_space(xdr, 4); 1634 p = reserve_space(xdr, 4);
@@ -1699,31 +1637,23 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie
1699 encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr); 1637 encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr);
1700 p = reserve_space(xdr, 4); 1638 p = reserve_space(xdr, 4);
1701 *p = cpu_to_be32(setclientid->sc_cb_ident); 1639 *p = cpu_to_be32(setclientid->sc_cb_ident);
1702 hdr->nops++;
1703 hdr->replen += decode_setclientid_maxsz;
1704} 1640}
1705 1641
1706static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs4_setclientid_res *arg, struct compound_hdr *hdr) 1642static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs4_setclientid_res *arg, struct compound_hdr *hdr)
1707{ 1643{
1708 __be32 *p; 1644 encode_op_hdr(xdr, OP_SETCLIENTID_CONFIRM,
1709 1645 decode_setclientid_confirm_maxsz, hdr);
1710 p = reserve_space(xdr, 12 + NFS4_VERIFIER_SIZE); 1646 encode_uint64(xdr, arg->clientid);
1711 *p++ = cpu_to_be32(OP_SETCLIENTID_CONFIRM); 1647 encode_nfs4_verifier(xdr, &arg->confirm);
1712 p = xdr_encode_hyper(p, arg->clientid);
1713 xdr_encode_opaque_fixed(p, arg->confirm.data, NFS4_VERIFIER_SIZE);
1714 hdr->nops++;
1715 hdr->replen += decode_setclientid_confirm_maxsz;
1716} 1648}
1717 1649
1718static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *args, struct compound_hdr *hdr) 1650static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *args, struct compound_hdr *hdr)
1719{ 1651{
1720 __be32 *p; 1652 __be32 *p;
1721 1653
1722 p = reserve_space(xdr, 4); 1654 encode_op_hdr(xdr, OP_WRITE, decode_write_maxsz, hdr);
1723 *p = cpu_to_be32(OP_WRITE); 1655 encode_open_stateid(xdr, args->context, args->lock_context,
1724 1656 FMODE_WRITE, hdr->minorversion);
1725 encode_stateid(xdr, args->context, args->lock_context,
1726 hdr->minorversion);
1727 1657
1728 p = reserve_space(xdr, 16); 1658 p = reserve_space(xdr, 16);
1729 p = xdr_encode_hyper(p, args->offset); 1659 p = xdr_encode_hyper(p, args->offset);
@@ -1731,32 +1661,18 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg
1731 *p = cpu_to_be32(args->count); 1661 *p = cpu_to_be32(args->count);
1732 1662
1733 xdr_write_pages(xdr, args->pages, args->pgbase, args->count); 1663 xdr_write_pages(xdr, args->pages, args->pgbase, args->count);
1734 hdr->nops++;
1735 hdr->replen += decode_write_maxsz;
1736} 1664}
1737 1665
1738static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *stateid, struct compound_hdr *hdr) 1666static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *stateid, struct compound_hdr *hdr)
1739{ 1667{
1740 __be32 *p; 1668 encode_op_hdr(xdr, OP_DELEGRETURN, decode_delegreturn_maxsz, hdr);
1741 1669 encode_nfs4_stateid(xdr, stateid);
1742 p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
1743
1744 *p++ = cpu_to_be32(OP_DELEGRETURN);
1745 xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE);
1746 hdr->nops++;
1747 hdr->replen += decode_delegreturn_maxsz;
1748} 1670}
1749 1671
1750static void encode_secinfo(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr) 1672static void encode_secinfo(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
1751{ 1673{
1752 int len = name->len; 1674 encode_op_hdr(xdr, OP_SECINFO, decode_secinfo_maxsz, hdr);
1753 __be32 *p; 1675 encode_string(xdr, name->len, name->name);
1754
1755 p = reserve_space(xdr, 8 + len);
1756 *p++ = cpu_to_be32(OP_SECINFO);
1757 xdr_encode_opaque(p, name->name, len);
1758 hdr->nops++;
1759 hdr->replen += decode_secinfo_maxsz;
1760} 1676}
1761 1677
1762#if defined(CONFIG_NFS_V4_1) 1678#if defined(CONFIG_NFS_V4_1)
@@ -1766,19 +1682,39 @@ static void encode_exchange_id(struct xdr_stream *xdr,
1766 struct compound_hdr *hdr) 1682 struct compound_hdr *hdr)
1767{ 1683{
1768 __be32 *p; 1684 __be32 *p;
1685 char impl_name[NFS4_OPAQUE_LIMIT];
1686 int len = 0;
1769 1687
1770 p = reserve_space(xdr, 4 + sizeof(args->verifier->data)); 1688 encode_op_hdr(xdr, OP_EXCHANGE_ID, decode_exchange_id_maxsz, hdr);
1771 *p++ = cpu_to_be32(OP_EXCHANGE_ID); 1689 encode_nfs4_verifier(xdr, args->verifier);
1772 xdr_encode_opaque_fixed(p, args->verifier->data, sizeof(args->verifier->data));
1773 1690
1774 encode_string(xdr, args->id_len, args->id); 1691 encode_string(xdr, args->id_len, args->id);
1775 1692
1776 p = reserve_space(xdr, 12); 1693 p = reserve_space(xdr, 12);
1777 *p++ = cpu_to_be32(args->flags); 1694 *p++ = cpu_to_be32(args->flags);
1778 *p++ = cpu_to_be32(0); /* zero length state_protect4_a */ 1695 *p++ = cpu_to_be32(0); /* zero length state_protect4_a */
1779 *p = cpu_to_be32(0); /* zero length implementation id array */ 1696
1780 hdr->nops++; 1697 if (send_implementation_id &&
1781 hdr->replen += decode_exchange_id_maxsz; 1698 sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN) > 1 &&
1699 sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN)
1700 <= NFS4_OPAQUE_LIMIT + 1)
1701 len = snprintf(impl_name, sizeof(impl_name), "%s %s %s %s",
1702 utsname()->sysname, utsname()->release,
1703 utsname()->version, utsname()->machine);
1704
1705 if (len > 0) {
1706 *p = cpu_to_be32(1); /* implementation id array length=1 */
1707
1708 encode_string(xdr,
1709 sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN) - 1,
1710 CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN);
1711 encode_string(xdr, len, impl_name);
1712 /* just send zeros for nii_date - the date is in nii_name */
1713 p = reserve_space(xdr, 12);
1714 p = xdr_encode_hyper(p, 0);
1715 *p = cpu_to_be32(0);
1716 } else
1717 *p = cpu_to_be32(0); /* implementation id array length=0 */
1782} 1718}
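
For reference, the nii_domain/nii_name/nii_date triple encoded above is RFC 5661's nfs_impl_id4 (the XDR below is quoted from the RFC, not from this patch):

struct nfs_impl_id4 {
	utf8str_cis	nii_domain;	/* CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN */
	utf8str_cs	nii_name;	/* "sysname release version machine" from utsname() */
	nfstime4	nii_date;	/* sent as zeros here; the date lives in nii_name */
};

So a client might send, e.g., nii_name = "Linux 3.3.0-rc1 #1 SMP Fri Feb 3 12:00:00 UTC 2012 x86_64" (an illustrative value, not taken from the patch).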
1783 1719
1784static void encode_create_session(struct xdr_stream *xdr, 1720static void encode_create_session(struct xdr_stream *xdr,
@@ -1801,8 +1737,8 @@ static void encode_create_session(struct xdr_stream *xdr,
1801 len = scnprintf(machine_name, sizeof(machine_name), "%s", 1737 len = scnprintf(machine_name, sizeof(machine_name), "%s",
1802 clp->cl_ipaddr); 1738 clp->cl_ipaddr);
1803 1739
1804 p = reserve_space(xdr, 20 + 2*28 + 20 + len + 12); 1740 encode_op_hdr(xdr, OP_CREATE_SESSION, decode_create_session_maxsz, hdr);
1805 *p++ = cpu_to_be32(OP_CREATE_SESSION); 1741 p = reserve_space(xdr, 16 + 2*28 + 20 + len + 12);
1806 p = xdr_encode_hyper(p, clp->cl_clientid); 1742 p = xdr_encode_hyper(p, clp->cl_clientid);
1807 *p++ = cpu_to_be32(clp->cl_seqid); /* Sequence id */ 1743 *p++ = cpu_to_be32(clp->cl_seqid); /* Sequence id */
1808 *p++ = cpu_to_be32(args->flags); /* flags */ 1744 *p++ = cpu_to_be32(args->flags); /* flags */
@@ -1835,33 +1771,22 @@ static void encode_create_session(struct xdr_stream *xdr,
1835 *p++ = cpu_to_be32(0); /* UID */ 1771 *p++ = cpu_to_be32(0); /* UID */
1836 *p++ = cpu_to_be32(0); /* GID */ 1772 *p++ = cpu_to_be32(0); /* GID */
1837 *p = cpu_to_be32(0); /* No more gids */ 1773 *p = cpu_to_be32(0); /* No more gids */
1838 hdr->nops++;
1839 hdr->replen += decode_create_session_maxsz;
1840} 1774}
1841 1775
1842static void encode_destroy_session(struct xdr_stream *xdr, 1776static void encode_destroy_session(struct xdr_stream *xdr,
1843 struct nfs4_session *session, 1777 struct nfs4_session *session,
1844 struct compound_hdr *hdr) 1778 struct compound_hdr *hdr)
1845{ 1779{
1846 __be32 *p; 1780 encode_op_hdr(xdr, OP_DESTROY_SESSION, decode_destroy_session_maxsz, hdr);
1847 p = reserve_space(xdr, 4 + NFS4_MAX_SESSIONID_LEN); 1781 encode_opaque_fixed(xdr, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
1848 *p++ = cpu_to_be32(OP_DESTROY_SESSION);
1849 xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
1850 hdr->nops++;
1851 hdr->replen += decode_destroy_session_maxsz;
1852} 1782}
1853 1783
1854static void encode_reclaim_complete(struct xdr_stream *xdr, 1784static void encode_reclaim_complete(struct xdr_stream *xdr,
1855 struct nfs41_reclaim_complete_args *args, 1785 struct nfs41_reclaim_complete_args *args,
1856 struct compound_hdr *hdr) 1786 struct compound_hdr *hdr)
1857{ 1787{
1858 __be32 *p; 1788 encode_op_hdr(xdr, OP_RECLAIM_COMPLETE, decode_reclaim_complete_maxsz, hdr);
1859 1789 encode_uint32(xdr, args->one_fs);
1860 p = reserve_space(xdr, 8);
1861 *p++ = cpu_to_be32(OP_RECLAIM_COMPLETE);
1862 *p++ = cpu_to_be32(args->one_fs);
1863 hdr->nops++;
1864 hdr->replen += decode_reclaim_complete_maxsz;
1865} 1790}
1866#endif /* CONFIG_NFS_V4_1 */ 1791#endif /* CONFIG_NFS_V4_1 */
1867 1792
@@ -1883,8 +1808,7 @@ static void encode_sequence(struct xdr_stream *xdr,
1883 WARN_ON(args->sa_slotid == NFS4_MAX_SLOT_TABLE); 1808 WARN_ON(args->sa_slotid == NFS4_MAX_SLOT_TABLE);
1884 slot = tp->slots + args->sa_slotid; 1809 slot = tp->slots + args->sa_slotid;
1885 1810
1886 p = reserve_space(xdr, 4 + NFS4_MAX_SESSIONID_LEN + 16); 1811 encode_op_hdr(xdr, OP_SEQUENCE, decode_sequence_maxsz, hdr);
1887 *p++ = cpu_to_be32(OP_SEQUENCE);
1888 1812
1889 /* 1813 /*
1890 * Sessionid + seqid + slotid + max slotid + cache_this 1814 * Sessionid + seqid + slotid + max slotid + cache_this
@@ -1898,13 +1822,12 @@ static void encode_sequence(struct xdr_stream *xdr,
1898 ((u32 *)session->sess_id.data)[3], 1822 ((u32 *)session->sess_id.data)[3],
1899 slot->seq_nr, args->sa_slotid, 1823 slot->seq_nr, args->sa_slotid,
1900 tp->highest_used_slotid, args->sa_cache_this); 1824 tp->highest_used_slotid, args->sa_cache_this);
1825 p = reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 16);
1901 p = xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN); 1826 p = xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
1902 *p++ = cpu_to_be32(slot->seq_nr); 1827 *p++ = cpu_to_be32(slot->seq_nr);
1903 *p++ = cpu_to_be32(args->sa_slotid); 1828 *p++ = cpu_to_be32(args->sa_slotid);
1904 *p++ = cpu_to_be32(tp->highest_used_slotid); 1829 *p++ = cpu_to_be32(tp->highest_used_slotid);
1905 *p = cpu_to_be32(args->sa_cache_this); 1830 *p = cpu_to_be32(args->sa_cache_this);
1906 hdr->nops++;
1907 hdr->replen += decode_sequence_maxsz;
1908#endif /* CONFIG_NFS_V4_1 */ 1831#endif /* CONFIG_NFS_V4_1 */
1909} 1832}
1910 1833
@@ -1919,14 +1842,12 @@ encode_getdevicelist(struct xdr_stream *xdr,
1919 .data = "dummmmmy", 1842 .data = "dummmmmy",
1920 }; 1843 };
1921 1844
1922 p = reserve_space(xdr, 20); 1845 encode_op_hdr(xdr, OP_GETDEVICELIST, decode_getdevicelist_maxsz, hdr);
1923 *p++ = cpu_to_be32(OP_GETDEVICELIST); 1846 p = reserve_space(xdr, 16);
1924 *p++ = cpu_to_be32(args->layoutclass); 1847 *p++ = cpu_to_be32(args->layoutclass);
1925 *p++ = cpu_to_be32(NFS4_PNFS_GETDEVLIST_MAXNUM); 1848 *p++ = cpu_to_be32(NFS4_PNFS_GETDEVLIST_MAXNUM);
1926 xdr_encode_hyper(p, 0ULL); /* cookie */ 1849 xdr_encode_hyper(p, 0ULL); /* cookie */
1927 encode_nfs4_verifier(xdr, &dummy); 1850 encode_nfs4_verifier(xdr, &dummy);
1928 hdr->nops++;
1929 hdr->replen += decode_getdevicelist_maxsz;
1930} 1851}
1931 1852
1932static void 1853static void
@@ -1936,15 +1857,13 @@ encode_getdeviceinfo(struct xdr_stream *xdr,
1936{ 1857{
1937 __be32 *p; 1858 __be32 *p;
1938 1859
1939 p = reserve_space(xdr, 16 + NFS4_DEVICEID4_SIZE); 1860 encode_op_hdr(xdr, OP_GETDEVICEINFO, decode_getdeviceinfo_maxsz, hdr);
1940 *p++ = cpu_to_be32(OP_GETDEVICEINFO); 1861 p = reserve_space(xdr, 12 + NFS4_DEVICEID4_SIZE);
1941 p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data, 1862 p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data,
1942 NFS4_DEVICEID4_SIZE); 1863 NFS4_DEVICEID4_SIZE);
1943 *p++ = cpu_to_be32(args->pdev->layout_type); 1864 *p++ = cpu_to_be32(args->pdev->layout_type);
1944 *p++ = cpu_to_be32(args->pdev->pglen); /* gdia_maxcount */ 1865 *p++ = cpu_to_be32(args->pdev->pglen); /* gdia_maxcount */
1945 *p++ = cpu_to_be32(0); /* bitmap length 0 */ 1866 *p++ = cpu_to_be32(0); /* bitmap length 0 */
1946 hdr->nops++;
1947 hdr->replen += decode_getdeviceinfo_maxsz;
1948} 1867}
1949 1868
1950static void 1869static void
@@ -1954,16 +1873,16 @@ encode_layoutget(struct xdr_stream *xdr,
1954{ 1873{
1955 __be32 *p; 1874 __be32 *p;
1956 1875
1957 p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE); 1876 encode_op_hdr(xdr, OP_LAYOUTGET, decode_layoutget_maxsz, hdr);
1958 *p++ = cpu_to_be32(OP_LAYOUTGET); 1877 p = reserve_space(xdr, 36);
1959 *p++ = cpu_to_be32(0); /* Signal layout available */ 1878 *p++ = cpu_to_be32(0); /* Signal layout available */
1960 *p++ = cpu_to_be32(args->type); 1879 *p++ = cpu_to_be32(args->type);
1961 *p++ = cpu_to_be32(args->range.iomode); 1880 *p++ = cpu_to_be32(args->range.iomode);
1962 p = xdr_encode_hyper(p, args->range.offset); 1881 p = xdr_encode_hyper(p, args->range.offset);
1963 p = xdr_encode_hyper(p, args->range.length); 1882 p = xdr_encode_hyper(p, args->range.length);
1964 p = xdr_encode_hyper(p, args->minlength); 1883 p = xdr_encode_hyper(p, args->minlength);
1965 p = xdr_encode_opaque_fixed(p, &args->stateid.data, NFS4_STATEID_SIZE); 1884 encode_nfs4_stateid(xdr, &args->stateid);
1966 *p = cpu_to_be32(args->maxcount); 1885 encode_uint32(xdr, args->maxcount);
1967 1886
1968 dprintk("%s: 1st type:0x%x iomode:%d off:%lu len:%lu mc:%d\n", 1887 dprintk("%s: 1st type:0x%x iomode:%d off:%lu len:%lu mc:%d\n",
1969 __func__, 1888 __func__,
@@ -1972,8 +1891,6 @@ encode_layoutget(struct xdr_stream *xdr,
1972 (unsigned long)args->range.offset, 1891 (unsigned long)args->range.offset,
1973 (unsigned long)args->range.length, 1892 (unsigned long)args->range.length,
1974 args->maxcount); 1893 args->maxcount);
1975 hdr->nops++;
1976 hdr->replen += decode_layoutget_maxsz;
1977} 1894}
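
For comparison, the fields emitted above line up one-for-one with RFC 5661's LAYOUTGET4args (XDR quoted from the RFC, not from this patch):

struct LAYOUTGET4args {
	/* CURRENT_FH: file */
	bool		loga_signal_layout_avail;	/* the leading zero above */
	layouttype4	loga_layout_type;
	layoutiomode4	loga_iomode;
	offset4		loga_offset;
	length4		loga_length;
	length4		loga_minlength;
	stateid4	loga_stateid;
	count4		loga_maxcount;
};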
1978 1895
1979static int 1896static int
@@ -1987,13 +1904,14 @@ encode_layoutcommit(struct xdr_stream *xdr,
1987 dprintk("%s: lbw: %llu type: %d\n", __func__, args->lastbytewritten, 1904 dprintk("%s: lbw: %llu type: %d\n", __func__, args->lastbytewritten,
1988 NFS_SERVER(args->inode)->pnfs_curr_ld->id); 1905 NFS_SERVER(args->inode)->pnfs_curr_ld->id);
1989 1906
1990 p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE); 1907 encode_op_hdr(xdr, OP_LAYOUTCOMMIT, decode_layoutcommit_maxsz, hdr);
1991 *p++ = cpu_to_be32(OP_LAYOUTCOMMIT); 1908 p = reserve_space(xdr, 20);
1992 /* Only whole file layouts */ 1909 /* Only whole file layouts */
1993 p = xdr_encode_hyper(p, 0); /* offset */ 1910 p = xdr_encode_hyper(p, 0); /* offset */
1994 p = xdr_encode_hyper(p, args->lastbytewritten + 1); /* length */ 1911 p = xdr_encode_hyper(p, args->lastbytewritten + 1); /* length */
1995 *p++ = cpu_to_be32(0); /* reclaim */ 1912 *p = cpu_to_be32(0); /* reclaim */
1996 p = xdr_encode_opaque_fixed(p, args->stateid.data, NFS4_STATEID_SIZE); 1913 encode_nfs4_stateid(xdr, &args->stateid);
1914 p = reserve_space(xdr, 20);
1997 *p++ = cpu_to_be32(1); /* newoffset = TRUE */ 1915 *p++ = cpu_to_be32(1); /* newoffset = TRUE */
1998 p = xdr_encode_hyper(p, args->lastbytewritten); 1916 p = xdr_encode_hyper(p, args->lastbytewritten);
1999 *p++ = cpu_to_be32(0); /* Never send time_modify_changed */ 1917 *p++ = cpu_to_be32(0); /* Never send time_modify_changed */
@@ -2002,13 +1920,9 @@ encode_layoutcommit(struct xdr_stream *xdr,
2002 if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit) 1920 if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit)
2003 NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit( 1921 NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit(
2004 NFS_I(inode)->layout, xdr, args); 1922 NFS_I(inode)->layout, xdr, args);
2005 else { 1923 else
2006 p = reserve_space(xdr, 4); 1924 encode_uint32(xdr, 0); /* no layout-type payload */
2007 *p = cpu_to_be32(0); /* no layout-type payload */
2008 }
2009 1925
2010 hdr->nops++;
2011 hdr->replen += decode_layoutcommit_maxsz;
2012 return 0; 1926 return 0;
2013} 1927}
2014 1928
@@ -2019,27 +1933,23 @@ encode_layoutreturn(struct xdr_stream *xdr,
2019{ 1933{
2020 __be32 *p; 1934 __be32 *p;
2021 1935
2022 p = reserve_space(xdr, 20); 1936 encode_op_hdr(xdr, OP_LAYOUTRETURN, decode_layoutreturn_maxsz, hdr);
2023 *p++ = cpu_to_be32(OP_LAYOUTRETURN); 1937 p = reserve_space(xdr, 16);
2024 *p++ = cpu_to_be32(0); /* reclaim; always 0 for now */ 1938 *p++ = cpu_to_be32(0); /* reclaim; always 0 for now */
2025 *p++ = cpu_to_be32(args->layout_type); 1939 *p++ = cpu_to_be32(args->layout_type);
2026 *p++ = cpu_to_be32(IOMODE_ANY); 1940 *p++ = cpu_to_be32(IOMODE_ANY);
2027 *p = cpu_to_be32(RETURN_FILE); 1941 *p = cpu_to_be32(RETURN_FILE);
2028 p = reserve_space(xdr, 16 + NFS4_STATEID_SIZE); 1942 p = reserve_space(xdr, 16);
2029 p = xdr_encode_hyper(p, 0); 1943 p = xdr_encode_hyper(p, 0);
2030 p = xdr_encode_hyper(p, NFS4_MAX_UINT64); 1944 p = xdr_encode_hyper(p, NFS4_MAX_UINT64);
2031 spin_lock(&args->inode->i_lock); 1945 spin_lock(&args->inode->i_lock);
2032 xdr_encode_opaque_fixed(p, &args->stateid.data, NFS4_STATEID_SIZE); 1946 encode_nfs4_stateid(xdr, &args->stateid);
2033 spin_unlock(&args->inode->i_lock); 1947 spin_unlock(&args->inode->i_lock);
2034 if (NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn) { 1948 if (NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn) {
2035 NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn( 1949 NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn(
2036 NFS_I(args->inode)->layout, xdr, args); 1950 NFS_I(args->inode)->layout, xdr, args);
2037 } else { 1951 } else
2038 p = reserve_space(xdr, 4); 1952 encode_uint32(xdr, 0);
2039 *p = cpu_to_be32(0);
2040 }
2041 hdr->nops++;
2042 hdr->replen += decode_layoutreturn_maxsz;
2043} 1953}
2044 1954
2045static int 1955static int
@@ -2047,12 +1957,8 @@ encode_secinfo_no_name(struct xdr_stream *xdr,
2047 const struct nfs41_secinfo_no_name_args *args, 1957 const struct nfs41_secinfo_no_name_args *args,
2048 struct compound_hdr *hdr) 1958 struct compound_hdr *hdr)
2049{ 1959{
2050 __be32 *p; 1960 encode_op_hdr(xdr, OP_SECINFO_NO_NAME, decode_secinfo_no_name_maxsz, hdr);
2051 p = reserve_space(xdr, 8); 1961 encode_uint32(xdr, args->style);
2052 *p++ = cpu_to_be32(OP_SECINFO_NO_NAME);
2053 *p++ = cpu_to_be32(args->style);
2054 hdr->nops++;
2055 hdr->replen += decode_secinfo_no_name_maxsz;
2056 return 0; 1962 return 0;
2057} 1963}
2058 1964
@@ -2060,26 +1966,17 @@ static void encode_test_stateid(struct xdr_stream *xdr,
2060 struct nfs41_test_stateid_args *args, 1966 struct nfs41_test_stateid_args *args,
2061 struct compound_hdr *hdr) 1967 struct compound_hdr *hdr)
2062{ 1968{
2063 __be32 *p; 1969 encode_op_hdr(xdr, OP_TEST_STATEID, decode_test_stateid_maxsz, hdr);
2064 1970 encode_uint32(xdr, 1);
2065 p = reserve_space(xdr, 8 + NFS4_STATEID_SIZE); 1971 encode_nfs4_stateid(xdr, args->stateid);
2066 *p++ = cpu_to_be32(OP_TEST_STATEID);
2067 *p++ = cpu_to_be32(1);
2068 xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE);
2069 hdr->nops++;
2070 hdr->replen += decode_test_stateid_maxsz;
2071} 1972}
2072 1973
2073static void encode_free_stateid(struct xdr_stream *xdr, 1974static void encode_free_stateid(struct xdr_stream *xdr,
2074 struct nfs41_free_stateid_args *args, 1975 struct nfs41_free_stateid_args *args,
2075 struct compound_hdr *hdr) 1976 struct compound_hdr *hdr)
2076{ 1977{
2077 __be32 *p; 1978 encode_op_hdr(xdr, OP_FREE_STATEID, decode_free_stateid_maxsz, hdr);
2078 p = reserve_space(xdr, 4 + NFS4_STATEID_SIZE); 1979 encode_nfs4_stateid(xdr, args->stateid);
2079 *p++ = cpu_to_be32(OP_FREE_STATEID);
2080 xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE);
2081 hdr->nops++;
2082 hdr->replen += decode_free_stateid_maxsz;
2083} 1980}
2084#endif /* CONFIG_NFS_V4_1 */ 1981#endif /* CONFIG_NFS_V4_1 */
2085 1982
@@ -2633,6 +2530,7 @@ static void nfs4_xdr_enc_server_caps(struct rpc_rqst *req,
2633 encode_sequence(xdr, &args->seq_args, &hdr); 2530 encode_sequence(xdr, &args->seq_args, &hdr);
2634 encode_putfh(xdr, args->fhandle, &hdr); 2531 encode_putfh(xdr, args->fhandle, &hdr);
2635 encode_getattr_one(xdr, FATTR4_WORD0_SUPPORTED_ATTRS| 2532 encode_getattr_one(xdr, FATTR4_WORD0_SUPPORTED_ATTRS|
2533 FATTR4_WORD0_FH_EXPIRE_TYPE|
2636 FATTR4_WORD0_LINK_SUPPORT| 2534 FATTR4_WORD0_LINK_SUPPORT|
2637 FATTR4_WORD0_SYMLINK_SUPPORT| 2535 FATTR4_WORD0_SYMLINK_SUPPORT|
2638 FATTR4_WORD0_ACLSUPPORT, &hdr); 2536 FATTR4_WORD0_ACLSUPPORT, &hdr);
@@ -2650,7 +2548,7 @@ static void nfs4_xdr_enc_renew(struct rpc_rqst *req, struct xdr_stream *xdr,
2650 }; 2548 };
2651 2549
2652 encode_compound_hdr(xdr, req, &hdr); 2550 encode_compound_hdr(xdr, req, &hdr);
2653 encode_renew(xdr, clp, &hdr); 2551 encode_renew(xdr, clp->cl_clientid, &hdr);
2654 encode_nops(&hdr); 2552 encode_nops(&hdr);
2655} 2553}
2656 2554
@@ -3180,6 +3078,28 @@ out_overflow:
3180 return -EIO; 3078 return -EIO;
3181} 3079}
3182 3080
3081static int decode_attr_fh_expire_type(struct xdr_stream *xdr,
3082 uint32_t *bitmap, uint32_t *type)
3083{
3084 __be32 *p;
3085
3086 *type = 0;
3087 if (unlikely(bitmap[0] & (FATTR4_WORD0_FH_EXPIRE_TYPE - 1U)))
3088 return -EIO;
3089 if (likely(bitmap[0] & FATTR4_WORD0_FH_EXPIRE_TYPE)) {
3090 p = xdr_inline_decode(xdr, 4);
3091 if (unlikely(!p))
3092 goto out_overflow;
3093 *type = be32_to_cpup(p);
3094 bitmap[0] &= ~FATTR4_WORD0_FH_EXPIRE_TYPE;
3095 }
3096 dprintk("%s: expire type=0x%x\n", __func__, *type);
3097 return 0;
3098out_overflow:
3099 print_overflow_msg(__func__, xdr);
3100 return -EIO;
3101}
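
The bitmap[0] & (FATTR4_WORD0_FH_EXPIRE_TYPE - 1U) guard works because attributes are decoded in ascending bit order and each decoder clears its own bit on the way through. A worked instance (attribute numbers per the NFSv4 spec; shown only for illustration):

/* fh_expire_type is attribute 2, so FATTR4_WORD0_FH_EXPIRE_TYPE == 1 << 2.
 * (1 << 2) - 1 == 0x3, the mask of all lower-numbered word-0 bits
 * (supported_attrs == bit 0, type == bit 1). If either bit is still set
 * here, an earlier attribute was left undecoded, the stream position is
 * unknown, and the only safe answer is -EIO.
 */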
3102
3183static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change) 3103static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change)
3184{ 3104{
3185 __be32 *p; 3105 __be32 *p;
@@ -3513,16 +3433,17 @@ static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
3513 n = be32_to_cpup(p); 3433 n = be32_to_cpup(p);
3514 if (n == 0) 3434 if (n == 0)
3515 goto root_path; 3435 goto root_path;
3516 dprintk("path "); 3436 dprintk("pathname4: ");
3517 path->ncomponents = 0; 3437 path->ncomponents = 0;
3518 while (path->ncomponents < n) { 3438 while (path->ncomponents < n) {
3519 struct nfs4_string *component = &path->components[path->ncomponents]; 3439 struct nfs4_string *component = &path->components[path->ncomponents];
3520 status = decode_opaque_inline(xdr, &component->len, &component->data); 3440 status = decode_opaque_inline(xdr, &component->len, &component->data);
3521 if (unlikely(status != 0)) 3441 if (unlikely(status != 0))
3522 goto out_eio; 3442 goto out_eio;
3523 if (path->ncomponents != n) 3443 ifdebug (XDR)
3524 dprintk("/"); 3444 pr_cont("%s%.*s ",
3525 dprintk("%s", component->data); 3445 (path->ncomponents != n ? "/ " : ""),
3446 component->len, component->data);
3526 if (path->ncomponents < NFS4_PATHNAME_MAXCOMPONENTS) 3447 if (path->ncomponents < NFS4_PATHNAME_MAXCOMPONENTS)
3527 path->ncomponents++; 3448 path->ncomponents++;
3528 else { 3449 else {
@@ -3531,14 +3452,13 @@ static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
3531 } 3452 }
3532 } 3453 }
3533out: 3454out:
3534 dprintk("\n");
3535 return status; 3455 return status;
3536root_path: 3456root_path:
3537/* a root pathname is sent as a zero component4 */ 3457/* a root pathname is sent as a zero component4 */
3538 path->ncomponents = 1; 3458 path->ncomponents = 1;
3539 path->components[0].len=0; 3459 path->components[0].len=0;
3540 path->components[0].data=NULL; 3460 path->components[0].data=NULL;
3541 dprintk("path /\n"); 3461 dprintk("pathname4: /\n");
3542 goto out; 3462 goto out;
3543out_eio: 3463out_eio:
3544 dprintk(" status %d", status); 3464 dprintk(" status %d", status);
@@ -3560,7 +3480,11 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
3560 status = 0; 3480 status = 0;
3561 if (unlikely(!(bitmap[0] & FATTR4_WORD0_FS_LOCATIONS))) 3481 if (unlikely(!(bitmap[0] & FATTR4_WORD0_FS_LOCATIONS)))
3562 goto out; 3482 goto out;
3563 dprintk("%s: fsroot ", __func__); 3483 status = -EIO;
3484 /* Ignore broken servers that return unrequested attrs */

3485 if (unlikely(res == NULL))
3486 goto out;
3487 dprintk("%s: fsroot:\n", __func__);
3564 status = decode_pathname(xdr, &res->fs_path); 3488 status = decode_pathname(xdr, &res->fs_path);
3565 if (unlikely(status != 0)) 3489 if (unlikely(status != 0))
3566 goto out; 3490 goto out;
@@ -3581,7 +3505,7 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
3581 m = be32_to_cpup(p); 3505 m = be32_to_cpup(p);
3582 3506
3583 loc->nservers = 0; 3507 loc->nservers = 0;
3584 dprintk("%s: servers ", __func__); 3508 dprintk("%s: servers:\n", __func__);
3585 while (loc->nservers < m) { 3509 while (loc->nservers < m) {
3586 struct nfs4_string *server = &loc->servers[loc->nservers]; 3510 struct nfs4_string *server = &loc->servers[loc->nservers];
3587 status = decode_opaque_inline(xdr, &server->len, &server->data); 3511 status = decode_opaque_inline(xdr, &server->len, &server->data);
@@ -3613,7 +3537,7 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
3613 res->nlocations++; 3537 res->nlocations++;
3614 } 3538 }
3615 if (res->nlocations != 0) 3539 if (res->nlocations != 0)
3616 status = NFS_ATTR_FATTR_V4_REFERRAL; 3540 status = NFS_ATTR_FATTR_V4_LOCATIONS;
3617out: 3541out:
3618 dprintk("%s: fs_locations done, error = %d\n", __func__, status); 3542 dprintk("%s: fs_locations done, error = %d\n", __func__, status);
3619 return status; 3543 return status;
@@ -4157,7 +4081,7 @@ static int decode_opaque_fixed(struct xdr_stream *xdr, void *buf, size_t len)
4157 4081
4158static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) 4082static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
4159{ 4083{
4160 return decode_opaque_fixed(xdr, stateid->data, NFS4_STATEID_SIZE); 4084 return decode_opaque_fixed(xdr, stateid, NFS4_STATEID_SIZE);
4161} 4085}
4162 4086
4163static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) 4087static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res)
@@ -4174,7 +4098,7 @@ static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res)
4174 4098
4175static int decode_verifier(struct xdr_stream *xdr, void *verifier) 4099static int decode_verifier(struct xdr_stream *xdr, void *verifier)
4176{ 4100{
4177 return decode_opaque_fixed(xdr, verifier, 8); 4101 return decode_opaque_fixed(xdr, verifier, NFS4_VERIFIER_SIZE);
4178} 4102}
4179 4103
4180static int decode_commit(struct xdr_stream *xdr, struct nfs_writeres *res) 4104static int decode_commit(struct xdr_stream *xdr, struct nfs_writeres *res)
@@ -4224,6 +4148,9 @@ static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_re
4224 goto xdr_error; 4148 goto xdr_error;
4225 if ((status = decode_attr_supported(xdr, bitmap, res->attr_bitmask)) != 0) 4149 if ((status = decode_attr_supported(xdr, bitmap, res->attr_bitmask)) != 0)
4226 goto xdr_error; 4150 goto xdr_error;
4151 if ((status = decode_attr_fh_expire_type(xdr, bitmap,
4152 &res->fh_expire_type)) != 0)
4153 goto xdr_error;
4227 if ((status = decode_attr_link_support(xdr, bitmap, &res->has_links)) != 0) 4154 if ((status = decode_attr_link_support(xdr, bitmap, &res->has_links)) != 0)
4228 goto xdr_error; 4155 goto xdr_error;
4229 if ((status = decode_attr_symlink_support(xdr, bitmap, &res->has_symlinks)) != 0) 4156 if ((status = decode_attr_symlink_support(xdr, bitmap, &res->has_symlinks)) != 0)
@@ -4294,6 +4221,7 @@ xdr_error:
4294 4221
4295static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap, 4222static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
4296 struct nfs_fattr *fattr, struct nfs_fh *fh, 4223 struct nfs_fattr *fattr, struct nfs_fh *fh,
4224 struct nfs4_fs_locations *fs_loc,
4297 const struct nfs_server *server) 4225 const struct nfs_server *server)
4298{ 4226{
4299 int status; 4227 int status;
@@ -4341,9 +4269,7 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
4341 goto xdr_error; 4269 goto xdr_error;
4342 fattr->valid |= status; 4270 fattr->valid |= status;
4343 4271
4344 status = decode_attr_fs_locations(xdr, bitmap, container_of(fattr, 4272 status = decode_attr_fs_locations(xdr, bitmap, fs_loc);
4345 struct nfs4_fs_locations,
4346 fattr));
4347 if (status < 0) 4273 if (status < 0)
4348 goto xdr_error; 4274 goto xdr_error;
4349 fattr->valid |= status; 4275 fattr->valid |= status;
@@ -4407,7 +4333,8 @@ xdr_error:
4407} 4333}
4408 4334
4409static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fattr, 4335static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fattr,
4410 struct nfs_fh *fh, const struct nfs_server *server) 4336 struct nfs_fh *fh, struct nfs4_fs_locations *fs_loc,
4337 const struct nfs_server *server)
4411{ 4338{
4412 __be32 *savep; 4339 __be32 *savep;
4413 uint32_t attrlen, 4340 uint32_t attrlen,
@@ -4426,7 +4353,7 @@ static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fat
4426 if (status < 0) 4353 if (status < 0)
4427 goto xdr_error; 4354 goto xdr_error;
4428 4355
4429 status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, server); 4356 status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, fs_loc, server);
4430 if (status < 0) 4357 if (status < 0)
4431 goto xdr_error; 4358 goto xdr_error;
4432 4359
@@ -4439,7 +4366,7 @@ xdr_error:
4439static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, 4366static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
4440 const struct nfs_server *server) 4367 const struct nfs_server *server)
4441{ 4368{
4442 return decode_getfattr_generic(xdr, fattr, NULL, server); 4369 return decode_getfattr_generic(xdr, fattr, NULL, NULL, server);
4443} 4370}
4444 4371
4445/* 4372/*
@@ -4463,8 +4390,8 @@ static int decode_first_pnfs_layout_type(struct xdr_stream *xdr,
4463 return 0; 4390 return 0;
4464 } 4391 }
4465 if (num > 1) 4392 if (num > 1)
4466 printk(KERN_INFO "%s: Warning: Multiple pNFS layout drivers " 4393 printk(KERN_INFO "NFS: %s: Warning: Multiple pNFS layout "
4467 "per filesystem not supported\n", __func__); 4394 "drivers per filesystem not supported\n", __func__);
4468 4395
4469 /* Decode and set first layout type, move xdr->p past unused types */ 4396 /* Decode and set first layout type, move xdr->p past unused types */
4470 p = xdr_inline_decode(xdr, num * 4); 4397 p = xdr_inline_decode(xdr, num * 4);
@@ -4863,17 +4790,16 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
4863 size_t hdrlen; 4790 size_t hdrlen;
4864 u32 recvd, pglen = rcvbuf->page_len; 4791 u32 recvd, pglen = rcvbuf->page_len;
4865 int status; 4792 int status;
4793 __be32 verf[2];
4866 4794
4867 status = decode_op_hdr(xdr, OP_READDIR); 4795 status = decode_op_hdr(xdr, OP_READDIR);
4868 if (!status) 4796 if (!status)
4869 status = decode_verifier(xdr, readdir->verifier.data); 4797 status = decode_verifier(xdr, readdir->verifier.data);
4870 if (unlikely(status)) 4798 if (unlikely(status))
4871 return status; 4799 return status;
4800 memcpy(verf, readdir->verifier.data, sizeof(verf));
4872 dprintk("%s: verifier = %08x:%08x\n", 4801 dprintk("%s: verifier = %08x:%08x\n",
4873 __func__, 4802 __func__, verf[0], verf[1]);
4874 ((u32 *)readdir->verifier.data)[0],
4875 ((u32 *)readdir->verifier.data)[1]);
4876
4877 4803
4878 hdrlen = (char *) xdr->p - (char *) iov->iov_base; 4804 hdrlen = (char *) xdr->p - (char *) iov->iov_base;
4879 recvd = rcvbuf->len - hdrlen; 4805 recvd = rcvbuf->len - hdrlen;
@@ -5120,7 +5046,7 @@ static int decode_write(struct xdr_stream *xdr, struct nfs_writeres *res)
5120 goto out_overflow; 5046 goto out_overflow;
5121 res->count = be32_to_cpup(p++); 5047 res->count = be32_to_cpup(p++);
5122 res->verf->committed = be32_to_cpup(p++); 5048 res->verf->committed = be32_to_cpup(p++);
5123 memcpy(res->verf->verifier, p, 8); 5049 memcpy(res->verf->verifier, p, NFS4_VERIFIER_SIZE);
5124 return 0; 5050 return 0;
5125out_overflow: 5051out_overflow:
5126 print_overflow_msg(__func__, xdr); 5052 print_overflow_msg(__func__, xdr);
@@ -5214,6 +5140,7 @@ static int decode_exchange_id(struct xdr_stream *xdr,
5214 char *dummy_str; 5140 char *dummy_str;
5215 int status; 5141 int status;
5216 struct nfs_client *clp = res->client; 5142 struct nfs_client *clp = res->client;
5143 uint32_t impl_id_count;
5217 5144
5218 status = decode_op_hdr(xdr, OP_EXCHANGE_ID); 5145 status = decode_op_hdr(xdr, OP_EXCHANGE_ID);
5219 if (status) 5146 if (status)
@@ -5255,11 +5182,38 @@ static int decode_exchange_id(struct xdr_stream *xdr,
5255 memcpy(res->server_scope->server_scope, dummy_str, dummy); 5182 memcpy(res->server_scope->server_scope, dummy_str, dummy);
5256 res->server_scope->server_scope_sz = dummy; 5183 res->server_scope->server_scope_sz = dummy;
5257 5184
5258 /* Throw away Implementation id array */ 5185 /* Implementation Id */
5259 status = decode_opaque_inline(xdr, &dummy, &dummy_str); 5186 p = xdr_inline_decode(xdr, 4);
5260 if (unlikely(status)) 5187 if (unlikely(!p))
5261 return status; 5188 goto out_overflow;
5189 impl_id_count = be32_to_cpup(p++);
5190
5191 if (impl_id_count) {
5192 /* nii_domain */
5193 status = decode_opaque_inline(xdr, &dummy, &dummy_str);
5194 if (unlikely(status))
5195 return status;
5196 if (unlikely(dummy > NFS4_OPAQUE_LIMIT))
5197 return -EIO;
5198 memcpy(res->impl_id->domain, dummy_str, dummy);
5262 5199
5200 /* nii_name */
5201 status = decode_opaque_inline(xdr, &dummy, &dummy_str);
5202 if (unlikely(status))
5203 return status;
5204 if (unlikely(dummy > NFS4_OPAQUE_LIMIT))
5205 return -EIO;
5206 memcpy(res->impl_id->name, dummy_str, dummy);
5207
5208 /* nii_date */
5209 p = xdr_inline_decode(xdr, 12);
5210 if (unlikely(!p))
5211 goto out_overflow;
5212 p = xdr_decode_hyper(p, &res->impl_id->date.seconds);
5213 res->impl_id->date.nseconds = be32_to_cpup(p);
5214
5215 /* if there's more than one entry, ignore the rest */
5216 }
5263 return 0; 5217 return 0;
5264out_overflow: 5218out_overflow:
5265 print_overflow_msg(__func__, xdr); 5219 print_overflow_msg(__func__, xdr);
@@ -5285,8 +5239,8 @@ static int decode_chan_attrs(struct xdr_stream *xdr,
5285 attrs->max_reqs = be32_to_cpup(p++); 5239 attrs->max_reqs = be32_to_cpup(p++);
5286 nr_attrs = be32_to_cpup(p); 5240 nr_attrs = be32_to_cpup(p);
5287 if (unlikely(nr_attrs > 1)) { 5241 if (unlikely(nr_attrs > 1)) {
5288 printk(KERN_WARNING "%s: Invalid rdma channel attrs count %u\n", 5242 printk(KERN_WARNING "NFS: %s: Invalid rdma channel attrs "
5289 __func__, nr_attrs); 5243 "count %u\n", __func__, nr_attrs);
5290 return -EINVAL; 5244 return -EINVAL;
5291 } 5245 }
5292 if (nr_attrs == 1) { 5246 if (nr_attrs == 1) {
@@ -5436,14 +5390,14 @@ static int decode_getdevicelist(struct xdr_stream *xdr,
5436 p += 2; 5390 p += 2;
5437 5391
5438 /* Read verifier */ 5392 /* Read verifier */
5439 p = xdr_decode_opaque_fixed(p, verftemp.verifier, 8); 5393 p = xdr_decode_opaque_fixed(p, verftemp.verifier, NFS4_VERIFIER_SIZE);
5440 5394
5441 res->num_devs = be32_to_cpup(p); 5395 res->num_devs = be32_to_cpup(p);
5442 5396
5443 dprintk("%s: num_dev %d\n", __func__, res->num_devs); 5397 dprintk("%s: num_dev %d\n", __func__, res->num_devs);
5444 5398
5445 if (res->num_devs > NFS4_PNFS_GETDEVLIST_MAXNUM) { 5399 if (res->num_devs > NFS4_PNFS_GETDEVLIST_MAXNUM) {
5446 printk(KERN_ERR "%s too many result dev_num %u\n", 5400 printk(KERN_ERR "NFS: %s too many result dev_num %u\n",
5447 __func__, res->num_devs); 5401 __func__, res->num_devs);
5448 return -EIO; 5402 return -EIO;
5449 } 5403 }
@@ -5537,11 +5491,14 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
5537 status = decode_op_hdr(xdr, OP_LAYOUTGET); 5491 status = decode_op_hdr(xdr, OP_LAYOUTGET);
5538 if (status) 5492 if (status)
5539 return status; 5493 return status;
5540 p = xdr_inline_decode(xdr, 8 + NFS4_STATEID_SIZE); 5494 p = xdr_inline_decode(xdr, 4);
5495 if (unlikely(!p))
5496 goto out_overflow;
5497 res->return_on_close = be32_to_cpup(p);
5498 decode_stateid(xdr, &res->stateid);
5499 p = xdr_inline_decode(xdr, 4);
5541 if (unlikely(!p)) 5500 if (unlikely(!p))
5542 goto out_overflow; 5501 goto out_overflow;
5543 res->return_on_close = be32_to_cpup(p++);
5544 p = xdr_decode_opaque_fixed(p, res->stateid.data, NFS4_STATEID_SIZE);
5545 layout_count = be32_to_cpup(p); 5502 layout_count = be32_to_cpup(p);
5546 if (!layout_count) { 5503 if (!layout_count) {
5547 dprintk("%s: server responded with empty layout array\n", 5504 dprintk("%s: server responded with empty layout array\n",
@@ -5666,7 +5623,8 @@ static int decode_test_stateid(struct xdr_stream *xdr,
5666 if (unlikely(!p)) 5623 if (unlikely(!p))
5667 goto out_overflow; 5624 goto out_overflow;
5668 res->status = be32_to_cpup(p++); 5625 res->status = be32_to_cpup(p++);
5669 return res->status; 5626
5627 return status;
5670out_overflow: 5628out_overflow:
5671 print_overflow_msg(__func__, xdr); 5629 print_overflow_msg(__func__, xdr);
5672out: 5630out:
@@ -6583,8 +6541,9 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req,
6583 if (status) 6541 if (status)
6584 goto out; 6542 goto out;
6585 xdr_enter_page(xdr, PAGE_SIZE); 6543 xdr_enter_page(xdr, PAGE_SIZE);
6586 status = decode_getfattr(xdr, &res->fs_locations->fattr, 6544 status = decode_getfattr_generic(xdr, &res->fs_locations->fattr,
6587 res->fs_locations->server); 6545 NULL, res->fs_locations,
6546 res->fs_locations->server);
6588out: 6547out:
6589 return status; 6548 return status;
6590} 6549}
@@ -6964,7 +6923,7 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
6964 goto out_overflow; 6923 goto out_overflow;
6965 6924
6966 if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh, 6925 if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh,
6967 entry->server) < 0) 6926 NULL, entry->server) < 0)
6968 goto out_overflow; 6927 goto out_overflow;
6969 if (entry->fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) 6928 if (entry->fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID)
6970 entry->ino = entry->fattr->mounted_on_fileid; 6929 entry->ino = entry->fattr->mounted_on_fileid;
@@ -7112,7 +7071,7 @@ struct rpc_procinfo nfs4_procedures[] = {
7112#endif /* CONFIG_NFS_V4_1 */ 7071#endif /* CONFIG_NFS_V4_1 */
7113}; 7072};
7114 7073
7115struct rpc_version nfs_version4 = { 7074const struct rpc_version nfs_version4 = {
7116 .number = 4, 7075 .number = 4,
7117 .nrprocs = ARRAY_SIZE(nfs4_procedures), 7076 .nrprocs = ARRAY_SIZE(nfs4_procedures),
7118 .procs = nfs4_procedures 7077 .procs = nfs4_procedures
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index c4744e1d513c..cd3c910d2d12 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -104,7 +104,7 @@ static char nfs_export_path[NFS_MAXPATHLEN + 1] __initdata = "";
104/* server:export path string passed to super.c */ 104/* server:export path string passed to super.c */
105static char nfs_root_device[NFS_MAXPATHLEN + 1] __initdata = ""; 105static char nfs_root_device[NFS_MAXPATHLEN + 1] __initdata = "";
106 106
107#ifdef RPC_DEBUG 107#ifdef NFS_DEBUG
108/* 108/*
109 * When the "nfsrootdebug" kernel command line option is specified, 109 * When the "nfsrootdebug" kernel command line option is specified,
110 * enable debugging messages for NFSROOT. 110 * enable debugging messages for NFSROOT.
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 55d01280a609..4bff4a3dab46 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -137,6 +137,7 @@ static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay,
137 struct objio_dev_ent *ode; 137 struct objio_dev_ent *ode;
138 struct osd_dev *od; 138 struct osd_dev *od;
139 struct osd_dev_info odi; 139 struct osd_dev_info odi;
140 bool retry_flag = true;
140 int err; 141 int err;
141 142
142 ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id); 143 ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id);
@@ -171,10 +172,18 @@ static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay,
171 goto out; 172 goto out;
172 } 173 }
173 174
175retry_lookup:
174 od = osduld_info_lookup(&odi); 176 od = osduld_info_lookup(&odi);
175 if (unlikely(IS_ERR(od))) { 177 if (unlikely(IS_ERR(od))) {
176 err = PTR_ERR(od); 178 err = PTR_ERR(od);
177 dprintk("%s: osduld_info_lookup => %d\n", __func__, err); 179 dprintk("%s: osduld_info_lookup => %d\n", __func__, err);
180 if (err == -ENODEV && retry_flag) {
181 err = objlayout_autologin(deviceaddr);
182 if (likely(!err)) {
183 retry_flag = false;
184 goto retry_lookup;
185 }
186 }
178 goto out; 187 goto out;
179 } 188 }
180 189
@@ -205,25 +214,36 @@ static void copy_single_comp(struct ore_components *oc, unsigned c,
205int __alloc_objio_seg(unsigned numdevs, gfp_t gfp_flags, 214int __alloc_objio_seg(unsigned numdevs, gfp_t gfp_flags,
206 struct objio_segment **pseg) 215 struct objio_segment **pseg)
207{ 216{
208 struct __alloc_objio_segment { 217/* This is the in-memory structure of the objio_segment
209 struct objio_segment olseg; 218 *
210 struct ore_dev *ods[numdevs]; 219 * struct __alloc_objio_segment {
211 struct ore_comp comps[numdevs]; 220 * struct objio_segment olseg;
212 } *aolseg; 221 * struct ore_dev *ods[numdevs];
213 222 * struct ore_comp comps[numdevs];
214 aolseg = kzalloc(sizeof(*aolseg), gfp_flags); 223 * } *aolseg;
215 if (unlikely(!aolseg)) { 224 * NOTE: The code as above compiles and runs perfectly. It is elegant,
225 * type safe and compact. At some past time, Linus decided he does not
226 * like variable-length arrays; for the sake of this principle we uglify
227 * the code as below.
228 */
229 struct objio_segment *lseg;
230 size_t lseg_size = sizeof(*lseg) +
231 numdevs * sizeof(lseg->oc.ods[0]) +
232 numdevs * sizeof(*lseg->oc.comps);
233
234 lseg = kzalloc(lseg_size, gfp_flags);
235 if (unlikely(!lseg)) {
216 dprintk("%s: Faild allocation numdevs=%d size=%zd\n", __func__, 236 dprintk("%s: Faild allocation numdevs=%d size=%zd\n", __func__,
217 numdevs, sizeof(*aolseg)); 237 numdevs, lseg_size);
218 return -ENOMEM; 238 return -ENOMEM;
219 } 239 }
220 240
221 aolseg->olseg.oc.numdevs = numdevs; 241 lseg->oc.numdevs = numdevs;
222 aolseg->olseg.oc.single_comp = EC_MULTPLE_COMPS; 242 lseg->oc.single_comp = EC_MULTPLE_COMPS;
223 aolseg->olseg.oc.comps = aolseg->comps; 243 lseg->oc.ods = (void *)(lseg + 1);
224 aolseg->olseg.oc.ods = aolseg->ods; 244 lseg->oc.comps = (void *)(lseg->oc.ods + numdevs);
225 245
226 *pseg = &aolseg->olseg; 246 *pseg = lseg;
227 return 0; 247 return 0;
228} 248}
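
The hand-rolled replacement reproduces the layout the variable-length-array struct used to describe; roughly (an illustrative picture of the single kzalloc() above, not part of the patch):

/*
 *   lseg                       struct objio_segment (header)
 *   (void *)(lseg + 1)         ods[0] .. ods[numdevs-1]    (struct ore_dev *)
 *   lseg->oc.ods + numdevs     comps[0] .. comps[numdevs-1] (struct ore_comp)
 *
 * oc.ods and oc.comps point into the same allocation, so a single
 * kfree() of the segment releases everything at once.
 */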
229 249
@@ -582,10 +602,10 @@ objlayout_init(void)
582 602
583 if (ret) 603 if (ret)
584 printk(KERN_INFO 604 printk(KERN_INFO
585 "%s: Registering OSD pNFS Layout Driver failed: error=%d\n", 605 "NFS: %s: Registering OSD pNFS Layout Driver failed: error=%d\n",
586 __func__, ret); 606 __func__, ret);
587 else 607 else
588 printk(KERN_INFO "%s: Registered OSD pNFS Layout Driver\n", 608 printk(KERN_INFO "NFS: %s: Registered OSD pNFS Layout Driver\n",
589 __func__); 609 __func__);
590 return ret; 610 return ret;
591} 611}
@@ -594,7 +614,7 @@ static void __exit
594objlayout_exit(void) 614objlayout_exit(void)
595{ 615{
596 pnfs_unregister_layoutdriver(&objlayout_type); 616 pnfs_unregister_layoutdriver(&objlayout_type);
597 printk(KERN_INFO "%s: Unregistered OSD pNFS Layout Driver\n", 617 printk(KERN_INFO "NFS: %s: Unregistered OSD pNFS Layout Driver\n",
598 __func__); 618 __func__);
599} 619}
600 620
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index b3c29039f5b8..8d45f1c318ce 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -37,6 +37,9 @@
37 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 37 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 */ 38 */
39 39
40#include <linux/kmod.h>
41#include <linux/moduleparam.h>
42#include <linux/ratelimit.h>
40#include <scsi/osd_initiator.h> 43#include <scsi/osd_initiator.h>
41#include "objlayout.h" 44#include "objlayout.h"
42 45
@@ -156,7 +159,7 @@ last_byte_offset(u64 start, u64 len)
156 return end > start ? end - 1 : NFS4_MAX_UINT64; 159 return end > start ? end - 1 : NFS4_MAX_UINT64;
157} 160}
158 161
159void _fix_verify_io_params(struct pnfs_layout_segment *lseg, 162static void _fix_verify_io_params(struct pnfs_layout_segment *lseg,
160 struct page ***p_pages, unsigned *p_pgbase, 163 struct page ***p_pages, unsigned *p_pgbase,
161 u64 offset, unsigned long count) 164 u64 offset, unsigned long count)
162{ 165{
@@ -490,9 +493,9 @@ encode_accumulated_error(struct objlayout *objlay, __be32 *p)
490 if (!ioerr->oer_errno) 493 if (!ioerr->oer_errno)
491 continue; 494 continue;
492 495
493 printk(KERN_ERR "%s: err[%d]: errno=%d is_write=%d " 496 printk(KERN_ERR "NFS: %s: err[%d]: errno=%d "
494 "dev(%llx:%llx) par=0x%llx obj=0x%llx " 497 "is_write=%d dev(%llx:%llx) par=0x%llx "
495 "offset=0x%llx length=0x%llx\n", 498 "obj=0x%llx offset=0x%llx length=0x%llx\n",
496 __func__, i, ioerr->oer_errno, 499 __func__, i, ioerr->oer_errno,
497 ioerr->oer_iswrite, 500 ioerr->oer_iswrite,
498 _DEVID_LO(&ioerr->oer_component.oid_device_id), 501 _DEVID_LO(&ioerr->oer_component.oid_device_id),
@@ -651,3 +654,134 @@ void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr)
651 __free_page(odi->page); 654 __free_page(odi->page);
652 kfree(odi); 655 kfree(odi);
653} 656}
657
658enum {
659 OBJLAYOUT_MAX_URI_LEN = 256, OBJLAYOUT_MAX_OSDNAME_LEN = 64,
660 OBJLAYOUT_MAX_SYSID_HEX_LEN = OSD_SYSTEMID_LEN * 2 + 1,
661 OSD_LOGIN_UPCALL_PATHLEN = 256
662};
663
664static char osd_login_prog[OSD_LOGIN_UPCALL_PATHLEN] = "/sbin/osd_login";
665
666module_param_string(osd_login_prog, osd_login_prog, sizeof(osd_login_prog),
667 0600);
668MODULE_PARM_DESC(osd_login_prog, "Path to the osd_login upcall program");
669
670struct __auto_login {
671 char uri[OBJLAYOUT_MAX_URI_LEN];
672 char osdname[OBJLAYOUT_MAX_OSDNAME_LEN];
673 char systemid_hex[OBJLAYOUT_MAX_SYSID_HEX_LEN];
674};
675
676static int __objlayout_upcall(struct __auto_login *login)
677{
678 static char *envp[] = { "HOME=/",
679 "TERM=linux",
680 "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
681 NULL
682 };
683 char *argv[8];
684 int ret;
685
686 if (unlikely(!osd_login_prog[0])) {
687 dprintk("%s: osd_login_prog is disabled\n", __func__);
688 return -EACCES;
689 }
690
691 dprintk("%s uri: %s\n", __func__, login->uri);
692 dprintk("%s osdname %s\n", __func__, login->osdname);
693 dprintk("%s systemid_hex %s\n", __func__, login->systemid_hex);
694
695 argv[0] = (char *)osd_login_prog;
696 argv[1] = "-u";
697 argv[2] = login->uri;
698 argv[3] = "-o";
699 argv[4] = login->osdname;
700 argv[5] = "-s";
701 argv[6] = login->systemid_hex;
702 argv[7] = NULL;
703
704 ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
705 /*
706 * Disable the upcall mechanism if we're getting an ENOENT or
707 * EACCES error. The admin can re-enable it on the fly by using
708 * sysfs to set the objlayoutdriver.osd_login_prog module parameter once
709 * the problem has been fixed.
710 */
711 if (ret == -ENOENT || ret == -EACCES) {
712 printk(KERN_ERR "PNFS-OBJ: %s was not found please set "
713 "objlayoutdriver.osd_login_prog kernel parameter!\n",
714 osd_login_prog);
715 osd_login_prog[0] = '\0';
716 }
717 dprintk("%s %s return value: %d\n", __func__, osd_login_prog, ret);
718
719 return ret;
720}
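
Putting the argv together, the helper ends up being run as (placeholder values, not from the patch):

	/sbin/osd_login -u <target URI> -o <osdname> -s <systemid in hex>

with call_usermodehelper() blocking until the program exits (UMH_WAIT_PROC), so the script's exit status is what propagates back as ret.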
721
722/* Assume dest is all zeros */
723static void __copy_nfsS_and_zero_terminate(struct nfs4_string s,
724 char *dest, int max_len,
725 const char *var_name)
726{
727 if (!s.len)
728 return;
729
730 if (s.len >= max_len) {
731 pr_warn_ratelimited(
732 "objlayout_autologin: %s: s.len(%d) >= max_len(%d)",
733 var_name, s.len, max_len);
734 s.len = max_len - 1; /* space for null terminator */
735 }
736
737 memcpy(dest, s.data, s.len);
738}
739
740/* Assume sysid is all zeros */
741static void _sysid_2_hex(struct nfs4_string s,
742 char sysid[OBJLAYOUT_MAX_SYSID_HEX_LEN])
743{
744 int i;
745 char *cur;
746
747 if (!s.len)
748 return;
749
750 if (s.len != OSD_SYSTEMID_LEN) {
751 pr_warn_ratelimited(
752 "objlayout_autologin: systemid_len(%d) != OSD_SYSTEMID_LEN",
753 s.len);
754 if (s.len > OSD_SYSTEMID_LEN)
755 s.len = OSD_SYSTEMID_LEN;
756 }
757
758 cur = sysid;
759 for (i = 0; i < s.len; i++)
760 cur = hex_byte_pack(cur, s.data[i]);
761}
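
A small worked example of the packing above (values illustrative): hex_byte_pack() writes two lowercase hex digits per byte and returns the advanced cursor, so a systemid beginning { 0x00, 0x1f } becomes "001f...". The caller hands in a zeroed buffer, and OBJLAYOUT_MAX_SYSID_HEX_LEN == OSD_SYSTEMID_LEN * 2 + 1 leaves exactly one byte for the terminating NUL.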
762
763int objlayout_autologin(struct pnfs_osd_deviceaddr *deviceaddr)
764{
765 int rc;
766 struct __auto_login login;
767
768 if (!deviceaddr->oda_targetaddr.ota_netaddr.r_addr.len)
769 return -ENODEV;
770
771 memset(&login, 0, sizeof(login));
772 __copy_nfsS_and_zero_terminate(
773 deviceaddr->oda_targetaddr.ota_netaddr.r_addr,
774 login.uri, sizeof(login.uri), "URI");
775
776 __copy_nfsS_and_zero_terminate(
777 deviceaddr->oda_osdname,
778 login.osdname, sizeof(login.osdname), "OSDNAME");
779
780 _sysid_2_hex(deviceaddr->oda_systemid, login.systemid_hex);
781
782 rc = __objlayout_upcall(&login);
783 if (rc > 0) /* script returns positive values */
784 rc = -ENODEV;
785
786 return rc;
787}
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
index 8ec34727ed21..880ba086be94 100644
--- a/fs/nfs/objlayout/objlayout.h
+++ b/fs/nfs/objlayout/objlayout.h
@@ -184,4 +184,6 @@ extern void objlayout_encode_layoutreturn(
184 struct xdr_stream *, 184 struct xdr_stream *,
185 const struct nfs4_layoutreturn_args *); 185 const struct nfs4_layoutreturn_args *);
186 186
187extern int objlayout_autologin(struct pnfs_osd_deviceaddr *deviceaddr);
188
187#endif /* _OBJLAYOUT_H */ 189#endif /* _OBJLAYOUT_H */
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 5668f7c54c41..d21fceaa9f62 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -13,6 +13,7 @@
13#include <linux/file.h> 13#include <linux/file.h>
14#include <linux/sched.h> 14#include <linux/sched.h>
15#include <linux/sunrpc/clnt.h> 15#include <linux/sunrpc/clnt.h>
16#include <linux/nfs.h>
16#include <linux/nfs3.h> 17#include <linux/nfs3.h>
17#include <linux/nfs4.h> 18#include <linux/nfs4.h>
18#include <linux/nfs_page.h> 19#include <linux/nfs_page.h>
@@ -106,36 +107,6 @@ void nfs_unlock_request(struct nfs_page *req)
106 nfs_release_request(req); 107 nfs_release_request(req);
107} 108}
108 109
109/**
110 * nfs_set_page_tag_locked - Tag a request as locked
111 * @req:
112 */
113int nfs_set_page_tag_locked(struct nfs_page *req)
114{
115 if (!nfs_lock_request_dontget(req))
116 return 0;
117 if (test_bit(PG_MAPPED, &req->wb_flags))
118 radix_tree_tag_set(&NFS_I(req->wb_context->dentry->d_inode)->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
119 return 1;
120}
121
122/**
123 * nfs_clear_page_tag_locked - Clear request tag and wake up sleepers
124 */
125void nfs_clear_page_tag_locked(struct nfs_page *req)
126{
127 if (test_bit(PG_MAPPED, &req->wb_flags)) {
128 struct inode *inode = req->wb_context->dentry->d_inode;
129 struct nfs_inode *nfsi = NFS_I(inode);
130
131 spin_lock(&inode->i_lock);
132 radix_tree_tag_clear(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
133 nfs_unlock_request(req);
134 spin_unlock(&inode->i_lock);
135 } else
136 nfs_unlock_request(req);
137}
138
139/* 110/*
140 * nfs_clear_request - Free up all resources allocated to the request 111 * nfs_clear_request - Free up all resources allocated to the request
141 * @req: 112 * @req:
@@ -425,67 +396,6 @@ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index)
425 } 396 }
426} 397}
427 398
428#define NFS_SCAN_MAXENTRIES 16
429/**
430 * nfs_scan_list - Scan a list for matching requests
431 * @nfsi: NFS inode
432 * @dst: Destination list
433 * @idx_start: lower bound of page->index to scan
434 * @npages: idx_start + npages sets the upper bound to scan.
435 * @tag: tag to scan for
436 *
437 * Moves elements from one of the inode request lists.
438 * If the number of requests is set to 0, the entire address_space
439 * starting at index idx_start, is scanned.
440 * The requests are *not* checked to ensure that they form a contiguous set.
441 * You must be holding the inode's i_lock when calling this function
442 */
443int nfs_scan_list(struct nfs_inode *nfsi,
444 struct list_head *dst, pgoff_t idx_start,
445 unsigned int npages, int tag)
446{
447 struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES];
448 struct nfs_page *req;
449 pgoff_t idx_end;
450 int found, i;
451 int res;
452 struct list_head *list;
453
454 res = 0;
455 if (npages == 0)
456 idx_end = ~0;
457 else
458 idx_end = idx_start + npages - 1;
459
460 for (;;) {
461 found = radix_tree_gang_lookup_tag(&nfsi->nfs_page_tree,
462 (void **)&pgvec[0], idx_start,
463 NFS_SCAN_MAXENTRIES, tag);
464 if (found <= 0)
465 break;
466 for (i = 0; i < found; i++) {
467 req = pgvec[i];
468 if (req->wb_index > idx_end)
469 goto out;
470 idx_start = req->wb_index + 1;
471 if (nfs_set_page_tag_locked(req)) {
472 kref_get(&req->wb_kref);
473 radix_tree_tag_clear(&nfsi->nfs_page_tree,
474 req->wb_index, tag);
475 list = pnfs_choose_commit_list(req, dst);
476 nfs_list_add_request(req, list);
477 res++;
478 if (res == INT_MAX)
479 goto out;
480 }
481 }
482 /* for latency reduction */
483 cond_resched_lock(&nfsi->vfs_inode.i_lock);
484 }
485out:
486 return res;
487}
488
489int __init nfs_init_nfspagecache(void) 399int __init nfs_init_nfspagecache(void)
490{ 400{
491 nfs_page_cachep = kmem_cache_create("nfs_page", 401 nfs_page_cachep = kmem_cache_create("nfs_page",
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 17149a490065..b5d451586943 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -101,8 +101,8 @@ set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh,
101 goto out_no_driver; 101 goto out_no_driver;
102 if (!(server->nfs_client->cl_exchange_flags & 102 if (!(server->nfs_client->cl_exchange_flags &
103 (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) { 103 (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) {
104 printk(KERN_ERR "%s: id %u cl_exchange_flags 0x%x\n", __func__, 104 printk(KERN_ERR "NFS: %s: id %u cl_exchange_flags 0x%x\n",
105 id, server->nfs_client->cl_exchange_flags); 105 __func__, id, server->nfs_client->cl_exchange_flags);
106 goto out_no_driver; 106 goto out_no_driver;
107 } 107 }
108 ld_type = find_pnfs_driver(id); 108 ld_type = find_pnfs_driver(id);
@@ -122,8 +122,8 @@ set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh,
122 server->pnfs_curr_ld = ld_type; 122 server->pnfs_curr_ld = ld_type;
123 if (ld_type->set_layoutdriver 123 if (ld_type->set_layoutdriver
124 && ld_type->set_layoutdriver(server, mntfh)) { 124 && ld_type->set_layoutdriver(server, mntfh)) {
125 printk(KERN_ERR "%s: Error initializing pNFS layout driver %u.\n", 125 printk(KERN_ERR "NFS: %s: Error initializing pNFS layout "
126 __func__, id); 126 "driver %u.\n", __func__, id);
127 module_put(ld_type->owner); 127 module_put(ld_type->owner);
128 goto out_no_driver; 128 goto out_no_driver;
129 } 129 }
@@ -143,11 +143,11 @@ pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
143 struct pnfs_layoutdriver_type *tmp; 143 struct pnfs_layoutdriver_type *tmp;
144 144
145 if (ld_type->id == 0) { 145 if (ld_type->id == 0) {
146 printk(KERN_ERR "%s id 0 is reserved\n", __func__); 146 printk(KERN_ERR "NFS: %s id 0 is reserved\n", __func__);
147 return status; 147 return status;
148 } 148 }
149 if (!ld_type->alloc_lseg || !ld_type->free_lseg) { 149 if (!ld_type->alloc_lseg || !ld_type->free_lseg) {
150 printk(KERN_ERR "%s Layout driver must provide " 150 printk(KERN_ERR "NFS: %s Layout driver must provide "
151 "alloc_lseg and free_lseg.\n", __func__); 151 "alloc_lseg and free_lseg.\n", __func__);
152 return status; 152 return status;
153 } 153 }
@@ -160,7 +160,7 @@ pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
160 dprintk("%s Registering id:%u name:%s\n", __func__, ld_type->id, 160 dprintk("%s Registering id:%u name:%s\n", __func__, ld_type->id,
161 ld_type->name); 161 ld_type->name);
162 } else { 162 } else {
163 printk(KERN_ERR "%s Module with id %d already loaded!\n", 163 printk(KERN_ERR "NFS: %s Module with id %d already loaded!\n",
164 __func__, ld_type->id); 164 __func__, ld_type->id);
165 } 165 }
166 spin_unlock(&pnfs_spinlock); 166 spin_unlock(&pnfs_spinlock);
@@ -496,12 +496,12 @@ pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
496{ 496{
497 u32 oldseq, newseq; 497 u32 oldseq, newseq;
498 498
499 oldseq = be32_to_cpu(lo->plh_stateid.stateid.seqid); 499 oldseq = be32_to_cpu(lo->plh_stateid.seqid);
500 newseq = be32_to_cpu(new->stateid.seqid); 500 newseq = be32_to_cpu(new->seqid);
501 if ((int)(newseq - oldseq) > 0) { 501 if ((int)(newseq - oldseq) > 0) {
502 memcpy(&lo->plh_stateid, &new->stateid, sizeof(new->stateid)); 502 nfs4_stateid_copy(&lo->plh_stateid, new);
503 if (update_barrier) { 503 if (update_barrier) {
504 u32 new_barrier = be32_to_cpu(new->stateid.seqid); 504 u32 new_barrier = be32_to_cpu(new->seqid);
505 505
506 if ((int)(new_barrier - lo->plh_barrier)) 506 if ((int)(new_barrier - lo->plh_barrier))
507 lo->plh_barrier = new_barrier; 507 lo->plh_barrier = new_barrier;
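
The new ->seqid accesses and nfs4_stateid_copy() calls rely on the reworked stateid type introduced earlier in this series; presumably something like the sketch below, assuming it mirrors the on-the-wire stateid4 (a 4-byte seqid followed by a 12-byte "other" field):

typedef struct {
	union {
		char data[NFS4_STATEID_SIZE];		/* 16 bytes, as before */
		struct {
			__be32 seqid;
			char other[NFS4_STATEID_OTHER_SIZE];
		} __attribute__ ((packed));
	};
} nfs4_stateid;

static inline void nfs4_stateid_copy(nfs4_stateid *dst, const nfs4_stateid *src)
{
	memcpy(dst, src, sizeof(*dst));
}

The union keeps the old .data accessors working while letting callers such as pnfs_set_layout_stateid() read the sequence number directly.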
@@ -525,7 +525,7 @@ pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid,
 			int lget)
 {
 	if ((stateid) &&
-	    (int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0)
+	    (int)(lo->plh_barrier - be32_to_cpu(stateid->seqid)) >= 0)
 		return true;
 	return lo->plh_block_lgets ||
 		test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags) ||
@@ -549,11 +549,10 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
 
 		do {
 			seq = read_seqbegin(&open_state->seqlock);
-			memcpy(dst->data, open_state->stateid.data,
-			       sizeof(open_state->stateid.data));
+			nfs4_stateid_copy(dst, &open_state->stateid);
 		} while (read_seqretry(&open_state->seqlock, seq));
 	} else
-		memcpy(dst->data, lo->plh_stateid.data, sizeof(lo->plh_stateid.data));
+		nfs4_stateid_copy(dst, &lo->plh_stateid);
 	spin_unlock(&lo->plh_inode->i_lock);
 	dprintk("<-- %s\n", __func__);
 	return status;
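The conversions above replace open-coded memcpy()s of stateid buffers with one helper. The patch does not show the helper's body; a plausible shape (an assumption, not taken from this diff) is a plain structure copy. The surrounding seqlock loop then just retries that copy whenever a writer raced with the reader:

    /* Assumed shape of the helper -- its body is not shown in this diff. */
    static inline void nfs4_stateid_copy(nfs4_stateid *dst, const nfs4_stateid *src)
    {
        memcpy(dst, src, sizeof(*dst));
    }

    /* Reader side of the seqlock, as in pnfs_choose_layoutget_stateid() above:
     * the copy is tentative and is redone if read_seqretry() reports a race. */
    do {
        seq = read_seqbegin(&open_state->seqlock);
        nfs4_stateid_copy(dst, &open_state->stateid);
    } while (read_seqretry(&open_state->seqlock, seq));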
@@ -590,7 +589,7 @@ send_layoutget(struct pnfs_layout_hdr *lo,
 	max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
 	max_pages = max_resp_sz >> PAGE_SHIFT;
 
-	pages = kzalloc(max_pages * sizeof(struct page *), gfp_flags);
+	pages = kcalloc(max_pages, sizeof(struct page *), gfp_flags);
 	if (!pages)
 		goto out_err_free;
 
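The kzalloc-to-kcalloc switch above is more than style: `kzalloc(n * size, ...)` can overflow the multiplication and silently allocate a short buffer, while `kcalloc(n, size, ...)` checks the product and returns NULL on overflow. A userspace analogue of the check it performs (a sketch, not the kernel implementation):

    #include <stdint.h>
    #include <stdlib.h>

    /* Roughly what an overflow-checked array allocation must do. */
    static void *checked_alloc_array(size_t n, size_t size)
    {
        if (size != 0 && n > SIZE_MAX / size)
            return NULL;            /* n * size would wrap */
        return calloc(n, size);     /* zeroed, like kcalloc */
    }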
@@ -760,7 +759,7 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier)
 	}
 	if (!found) {
 		struct pnfs_layout_hdr *lo = nfsi->layout;
-		u32 current_seqid = be32_to_cpu(lo->plh_stateid.stateid.seqid);
+		u32 current_seqid = be32_to_cpu(lo->plh_stateid.seqid);
 
 		/* Since close does not return a layout stateid for use as
 		 * a barrier, we choose the worst-case barrier.
@@ -966,8 +965,7 @@ pnfs_update_layout(struct inode *ino,
 	}
 
 	/* Do we even need to bother with this? */
-	if (test_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state) ||
-	    test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
+	if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
 		dprintk("%s matches recall, use MDS\n", __func__);
 		goto out_unlock;
 	}
@@ -1032,7 +1030,6 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
 	struct nfs4_layoutget_res *res = &lgp->res;
 	struct pnfs_layout_segment *lseg;
 	struct inode *ino = lo->plh_inode;
-	struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
 	int status = 0;
 
 	/* Inject layout blob into I/O device driver */
@@ -1048,8 +1045,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
 	}
 
 	spin_lock(&ino->i_lock);
-	if (test_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state) ||
-	    test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
+	if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
 		dprintk("%s forget reply due to recall\n", __func__);
 		goto out_forget_reply;
 	}
@@ -1214,6 +1210,7 @@ void pnfs_ld_write_done(struct nfs_write_data *data)
 		}
 		data->task.tk_status = pnfs_write_done_resend_to_mds(data->inode, &data->pages);
 	}
+	put_lseg(data->lseg);
 	data->mds_ops->rpc_release(data);
 }
 EXPORT_SYMBOL_GPL(pnfs_ld_write_done);
@@ -1227,6 +1224,7 @@ pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,
 	nfs_list_add_request(data->req, &desc->pg_list);
 	nfs_pageio_reset_write_mds(desc);
 	desc->pg_recoalesce = 1;
+	put_lseg(data->lseg);
 	nfs_writedata_release(data);
 }
 
@@ -1327,6 +1325,7 @@ void pnfs_ld_read_done(struct nfs_read_data *data)
 		data->mds_ops->rpc_call_done(&data->task, data);
 	} else
 		pnfs_ld_handle_read_error(data);
+	put_lseg(data->lseg);
 	data->mds_ops->rpc_release(data);
 }
 EXPORT_SYMBOL_GPL(pnfs_ld_read_done);
@@ -1530,8 +1529,7 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
 	end_pos = nfsi->layout->plh_lwb;
 	nfsi->layout->plh_lwb = 0;
 
-	memcpy(&data->args.stateid.data, nfsi->layout->plh_stateid.data,
-		sizeof(nfsi->layout->plh_stateid.data));
+	nfs4_stateid_copy(&data->args.stateid, &nfsi->layout->plh_stateid);
 	spin_unlock(&inode->i_lock);
 
 	data->args.inode = inode;
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 53d593a0a4f2..442ebf68eeec 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -94,11 +94,10 @@ struct pnfs_layoutdriver_type {
 	const struct nfs_pageio_ops *pg_read_ops;
 	const struct nfs_pageio_ops *pg_write_ops;
 
-	/* Returns true if layoutdriver wants to divert this request to
-	 * driver's commit routine.
-	 */
-	bool (*mark_pnfs_commit)(struct pnfs_layout_segment *lseg);
-	struct list_head * (*choose_commit_list) (struct nfs_page *req);
+	void (*mark_request_commit) (struct nfs_page *req,
+				     struct pnfs_layout_segment *lseg);
+	void (*clear_request_commit) (struct nfs_page *req);
+	int (*scan_commit_lists) (struct inode *inode, int max, spinlock_t *lock);
 	int (*commit_pagelist)(struct inode *inode, struct list_head *mds_pages, int how);
 
 	/*
@@ -229,7 +228,6 @@ struct nfs4_deviceid_node {
 	atomic_t ref;
 };
 
-void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id);
 struct nfs4_deviceid_node *nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
 void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
 void nfs4_init_deviceid_node(struct nfs4_deviceid_node *,
@@ -262,20 +260,6 @@ static inline int pnfs_enabled_sb(struct nfs_server *nfss)
 	return nfss->pnfs_curr_ld != NULL;
 }
 
-static inline void
-pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)
-{
-	if (lseg) {
-		struct pnfs_layoutdriver_type *ld;
-
-		ld = NFS_SERVER(req->wb_page->mapping->host)->pnfs_curr_ld;
-		if (ld->mark_pnfs_commit && ld->mark_pnfs_commit(lseg)) {
-			set_bit(PG_PNFS_COMMIT, &req->wb_flags);
-			req->wb_commit_lseg = get_lseg(lseg);
-		}
-	}
-}
-
 static inline int
 pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how)
 {
@@ -284,27 +268,42 @@ pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how)
 	return NFS_SERVER(inode)->pnfs_curr_ld->commit_pagelist(inode, mds_pages, how);
 }
 
-static inline struct list_head *
-pnfs_choose_commit_list(struct nfs_page *req, struct list_head *mds)
+static inline bool
+pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)
 {
-	struct list_head *rv;
+	struct inode *inode = req->wb_context->dentry->d_inode;
+	struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
 
-	if (test_and_clear_bit(PG_PNFS_COMMIT, &req->wb_flags)) {
-		struct inode *inode = req->wb_commit_lseg->pls_layout->plh_inode;
+	if (lseg == NULL || ld->mark_request_commit == NULL)
+		return false;
+	ld->mark_request_commit(req, lseg);
+	return true;
+}
 
-		set_bit(NFS_INO_PNFS_COMMIT, &NFS_I(inode)->flags);
-		rv = NFS_SERVER(inode)->pnfs_curr_ld->choose_commit_list(req);
-		/* matched by ref taken when PG_PNFS_COMMIT is set */
-		put_lseg(req->wb_commit_lseg);
-	} else
-		rv = mds;
-	return rv;
+static inline bool
+pnfs_clear_request_commit(struct nfs_page *req)
+{
+	struct inode *inode = req->wb_context->dentry->d_inode;
+	struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
+
+	if (ld == NULL || ld->clear_request_commit == NULL)
+		return false;
+	ld->clear_request_commit(req);
+	return true;
 }
 
-static inline void pnfs_clear_request_commit(struct nfs_page *req)
+static inline int
+pnfs_scan_commit_lists(struct inode *inode, int max, spinlock_t *lock)
 {
-	if (test_and_clear_bit(PG_PNFS_COMMIT, &req->wb_flags))
-		put_lseg(req->wb_commit_lseg);
+	struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
+	int ret;
+
+	if (ld == NULL || ld->scan_commit_lists == NULL)
+		return 0;
+	ret = ld->scan_commit_lists(inode, max, lock);
+	if (ret != 0)
+		set_bit(NFS_INO_PNFS_COMMIT, &NFS_I(inode)->flags);
+	return ret;
 }
 
 /* Should the pNFS client commit and return the layout upon a setattr */
@@ -328,6 +327,13 @@ static inline int pnfs_return_layout(struct inode *ino)
 	return 0;
 }
 
+#ifdef NFS_DEBUG
+void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id);
+#else
+static inline void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id)
+{
+}
+#endif /* NFS_DEBUG */
 #else /* CONFIG_NFS_V4_1 */
 
 static inline void pnfs_destroy_all_layouts(struct nfs_client *clp)
@@ -400,35 +406,35 @@ static inline bool pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, st
 	return false;
 }
 
-static inline void
-pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)
-{
-}
-
 static inline int
 pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how)
 {
 	return PNFS_NOT_ATTEMPTED;
 }
 
-static inline struct list_head *
-pnfs_choose_commit_list(struct nfs_page *req, struct list_head *mds)
+static inline bool
+pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)
 {
-	return mds;
+	return false;
 }
 
-static inline void pnfs_clear_request_commit(struct nfs_page *req)
+static inline bool
+pnfs_clear_request_commit(struct nfs_page *req)
 {
+	return false;
 }
 
-static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync)
+static inline int
+pnfs_scan_commit_lists(struct inode *inode, int max, spinlock_t *lock)
 {
 	return 0;
 }
 
-static inline void nfs4_deviceid_purge_client(struct nfs_client *ncl)
+static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync)
 {
+	return 0;
 }
+
 #endif /* CONFIG_NFS_V4_1 */
 
 #endif /* FS_NFS_PNFS_H */
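The new pnfs.h wrappers above all follow one pattern: each layout-driver hook is optional, the wrapper reports via its bool/int return whether the driver handled the request, and callers fall back to the generic MDS path otherwise. A self-contained sketch of that dispatch pattern (the struct and function names here are hypothetical, not the kernel's):

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    struct commit_ops {
        void (*mark_request_commit)(int req);  /* optional hook */
    };

    static bool mark_request_commit(const struct commit_ops *ld, int req)
    {
        if (ld == NULL || ld->mark_request_commit == NULL)
            return false;              /* caller uses the generic list */
        ld->mark_request_commit(req);
        return true;                   /* driver diverted the request */
    }

    static void my_mark(int req) { printf("driver took req %d\n", req); }

    int main(void)
    {
        struct commit_ops ld = { my_mark };
        if (!mark_request_commit(NULL, 1))
            printf("req 1 falls back to the MDS commit list\n");
        mark_request_commit(&ld, 2);
        return 0;
    }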
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
index 4f359d2a26eb..73f701f1f4d3 100644
--- a/fs/nfs/pnfs_dev.c
+++ b/fs/nfs/pnfs_dev.c
@@ -43,6 +43,7 @@
 static struct hlist_head nfs4_deviceid_cache[NFS4_DEVICE_ID_HASH_SIZE];
 static DEFINE_SPINLOCK(nfs4_deviceid_lock);
 
+#ifdef NFS_DEBUG
 void
 nfs4_print_deviceid(const struct nfs4_deviceid *id)
 {
@@ -52,6 +53,7 @@ nfs4_print_deviceid(const struct nfs4_deviceid *id)
 		p[0], p[1], p[2], p[3]);
 }
 EXPORT_SYMBOL_GPL(nfs4_print_deviceid);
+#endif
 
 static inline u32
 nfs4_deviceid_hash(const struct nfs4_deviceid *id)
@@ -92,7 +94,7 @@ _lookup_deviceid(const struct pnfs_layoutdriver_type *ld,
  * @clp nfs_client associated with deviceid
  * @id deviceid to look up
  */
-struct nfs4_deviceid_node *
+static struct nfs4_deviceid_node *
 _find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
 		   const struct nfs_client *clp, const struct nfs4_deviceid *id,
 		   long hash)
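Moving nfs4_print_deviceid() under NFS_DEBUG pairs the real definition with the empty static inline stub added to pnfs.h above, so call sites need no #ifdefs and the call compiles away entirely in non-debug builds. The generic shape of the idiom, with illustrative names:

    #include <stdio.h>

    #ifdef MYDEBUG
    static void debug_dump(unsigned v) { fprintf(stderr, "v=%u\n", v); }
    #else
    static inline void debug_dump(unsigned v) { (void)v; }  /* optimized out */
    #endif

    int main(void)
    {
        debug_dump(42);  /* a no-op unless built with -DMYDEBUG */
        return 0;
    }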
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 0c672588fe5a..b63b6f4d14fb 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -358,6 +358,11 @@ nfs_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
 	msg->rpc_proc = &nfs_procedures[NFSPROC_REMOVE];
 }
 
+static void nfs_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data)
+{
+	rpc_call_start(task);
+}
+
 static int nfs_proc_unlink_done(struct rpc_task *task, struct inode *dir)
 {
 	if (nfs_async_handle_expired_key(task))
@@ -372,6 +377,11 @@ nfs_proc_rename_setup(struct rpc_message *msg, struct inode *dir)
 	msg->rpc_proc = &nfs_procedures[NFSPROC_RENAME];
 }
 
+static void nfs_proc_rename_rpc_prepare(struct rpc_task *task, struct nfs_renamedata *data)
+{
+	rpc_call_start(task);
+}
+
 static int
 nfs_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
 		     struct inode *new_dir)
@@ -651,6 +661,11 @@ static void nfs_proc_read_setup(struct nfs_read_data *data, struct rpc_message *
 	msg->rpc_proc = &nfs_procedures[NFSPROC_READ];
 }
 
+static void nfs_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data)
+{
+	rpc_call_start(task);
+}
+
 static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data)
 {
 	if (nfs_async_handle_expired_key(task))
@@ -668,6 +683,11 @@ static void nfs_proc_write_setup(struct nfs_write_data *data, struct rpc_message
 	msg->rpc_proc = &nfs_procedures[NFSPROC_WRITE];
 }
 
+static void nfs_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data)
+{
+	rpc_call_start(task);
+}
+
 static void
 nfs_proc_commit_setup(struct nfs_write_data *data, struct rpc_message *msg)
 {
@@ -721,9 +741,11 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
 	.create		= nfs_proc_create,
 	.remove		= nfs_proc_remove,
 	.unlink_setup	= nfs_proc_unlink_setup,
+	.unlink_rpc_prepare = nfs_proc_unlink_rpc_prepare,
 	.unlink_done	= nfs_proc_unlink_done,
 	.rename		= nfs_proc_rename,
 	.rename_setup	= nfs_proc_rename_setup,
+	.rename_rpc_prepare = nfs_proc_rename_rpc_prepare,
 	.rename_done	= nfs_proc_rename_done,
 	.link		= nfs_proc_link,
 	.symlink	= nfs_proc_symlink,
@@ -736,8 +758,10 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
 	.pathconf	= nfs_proc_pathconf,
 	.decode_dirent	= nfs2_decode_dirent,
 	.read_setup	= nfs_proc_read_setup,
+	.read_rpc_prepare = nfs_proc_read_rpc_prepare,
 	.read_done	= nfs_read_done,
 	.write_setup	= nfs_proc_write_setup,
+	.write_rpc_prepare = nfs_proc_write_rpc_prepare,
 	.write_done	= nfs_write_done,
 	.commit_setup	= nfs_proc_commit_setup,
 	.lock		= nfs_proc_lock,
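These trivial *_rpc_prepare helpers exist so that the generic read/write/unlink/rename paths (see the read.c, unlink.c and write.c hunks below) can always call through the nfs_rpc_ops table: NFSv2/v3 just start the call, while NFSv4.1 can set up its session slot first. A miniature of the indirection, with hypothetical types rather than the kernel's:

    #include <stdio.h>

    struct rpc_ops {
        void (*read_rpc_prepare)(void);  /* per-version hook */
    };

    static void v2_read_prepare(void)  { puts("rpc_call_start"); }
    static void v41_read_prepare(void) { puts("set up session slot, then start"); }

    /* Generic code no longer needs #if defined(CONFIG_NFS_V4_1) here. */
    static void nfs_read_prepare(const struct rpc_ops *ops)
    {
        ops->read_rpc_prepare();
    }

    int main(void)
    {
        struct rpc_ops v2 = { v2_read_prepare }, v41 = { v41_read_prepare };
        nfs_read_prepare(&v2);
        nfs_read_prepare(&v41);
        return 0;
    }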
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index b83e89bf4a74..9a0e8ef4a409 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -65,7 +65,6 @@ void nfs_readdata_free(struct nfs_read_data *p)
 
 void nfs_readdata_release(struct nfs_read_data *rdata)
 {
-	put_lseg(rdata->lseg);
 	put_nfs_open_context(rdata->args.context);
 	nfs_readdata_free(rdata);
 }
@@ -464,23 +463,14 @@ static void nfs_readpage_release_partial(void *calldata)
 	nfs_readdata_release(calldata);
 }
 
-#if defined(CONFIG_NFS_V4_1)
 void nfs_read_prepare(struct rpc_task *task, void *calldata)
 {
 	struct nfs_read_data *data = calldata;
-
-	if (nfs4_setup_sequence(NFS_SERVER(data->inode),
-				&data->args.seq_args, &data->res.seq_res,
-				0, task))
-		return;
-	rpc_call_start(task);
+	NFS_PROTO(data->inode)->read_rpc_prepare(task, data);
 }
-#endif /* CONFIG_NFS_V4_1 */
 
 static const struct rpc_call_ops nfs_read_partial_ops = {
-#if defined(CONFIG_NFS_V4_1)
 	.rpc_call_prepare = nfs_read_prepare,
-#endif /* CONFIG_NFS_V4_1 */
 	.rpc_call_done = nfs_readpage_result_partial,
 	.rpc_release = nfs_readpage_release_partial,
 };
@@ -544,9 +534,7 @@ static void nfs_readpage_release_full(void *calldata)
 }
 
 static const struct rpc_call_ops nfs_read_full_ops = {
-#if defined(CONFIG_NFS_V4_1)
 	.rpc_call_prepare = nfs_read_prepare,
-#endif /* CONFIG_NFS_V4_1 */
 	.rpc_call_done = nfs_readpage_result_full,
 	.rpc_release = nfs_readpage_release_full,
 };
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index e3f6b2349411..37412f706b32 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -52,6 +52,8 @@
 #include <linux/nfs_xdr.h>
 #include <linux/magic.h>
 #include <linux/parser.h>
+#include <linux/nsproxy.h>
+#include <linux/rcupdate.h>
 
 #include <asm/uaccess.h>
 
@@ -78,7 +80,6 @@ enum {
 	Opt_cto, Opt_nocto,
 	Opt_ac, Opt_noac,
 	Opt_lock, Opt_nolock,
-	Opt_v2, Opt_v3, Opt_v4,
 	Opt_udp, Opt_tcp, Opt_rdma,
 	Opt_acl, Opt_noacl,
 	Opt_rdirplus, Opt_nordirplus,
@@ -96,10 +97,10 @@ enum {
 	Opt_namelen,
 	Opt_mountport,
 	Opt_mountvers,
-	Opt_nfsvers,
 	Opt_minorversion,
 
 	/* Mount options that take string arguments */
+	Opt_nfsvers,
 	Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost,
 	Opt_addr, Opt_mountaddr, Opt_clientaddr,
 	Opt_lookupcache,
@@ -131,9 +132,6 @@ static const match_table_t nfs_mount_option_tokens = {
 	{ Opt_noac, "noac" },
 	{ Opt_lock, "lock" },
 	{ Opt_nolock, "nolock" },
-	{ Opt_v2, "v2" },
-	{ Opt_v3, "v3" },
-	{ Opt_v4, "v4" },
 	{ Opt_udp, "udp" },
 	{ Opt_tcp, "tcp" },
 	{ Opt_rdma, "rdma" },
@@ -162,9 +160,10 @@ static const match_table_t nfs_mount_option_tokens = {
 	{ Opt_namelen, "namlen=%s" },
 	{ Opt_mountport, "mountport=%s" },
 	{ Opt_mountvers, "mountvers=%s" },
+	{ Opt_minorversion, "minorversion=%s" },
+
 	{ Opt_nfsvers, "nfsvers=%s" },
 	{ Opt_nfsvers, "vers=%s" },
-	{ Opt_minorversion, "minorversion=%s" },
 
 	{ Opt_sec, "sec=%s" },
 	{ Opt_proto, "proto=%s" },
@@ -178,6 +177,9 @@ static const match_table_t nfs_mount_option_tokens = {
 	{ Opt_fscache_uniq, "fsc=%s" },
 	{ Opt_local_lock, "local_lock=%s" },
 
+	/* The following needs to be listed after all other options */
+	{ Opt_nfsvers, "v%s" },
+
 	{ Opt_err, NULL }
 };
 
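The comment above the new "v%s" entry matters because token matching returns the first pattern that fits; placed any earlier, the "v%s" catch-all would swallow strings meant for other options. A standalone first-match demonstration (plain strncmp, not the kernel's match_token()):

    #include <stdio.h>
    #include <string.h>

    static const char *patterns[] = { "nfsvers=", "vers=", "v", NULL };

    static int first_match(const char *opt)
    {
        for (int i = 0; patterns[i] != NULL; i++)
            if (strncmp(opt, patterns[i], strlen(patterns[i])) == 0)
                return i;
        return -1;
    }

    int main(void)
    {
        printf("%d\n", first_match("vers=3")); /* 1: hits "vers=" first */
        printf("%d\n", first_match("v4.1"));   /* 2: only the catch-all fits */
        return 0;
    }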
@@ -258,6 +260,22 @@ static match_table_t nfs_local_lock_tokens = {
 	{ Opt_local_lock_err, NULL }
 };
 
+enum {
+	Opt_vers_2, Opt_vers_3, Opt_vers_4, Opt_vers_4_0,
+	Opt_vers_4_1,
+
+	Opt_vers_err
+};
+
+static match_table_t nfs_vers_tokens = {
+	{ Opt_vers_2, "2" },
+	{ Opt_vers_3, "3" },
+	{ Opt_vers_4, "4" },
+	{ Opt_vers_4_0, "4.0" },
+	{ Opt_vers_4_1, "4.1" },
+
+	{ Opt_vers_err, NULL }
+};
 
 static void nfs_umount_begin(struct super_block *);
 static int  nfs_statfs(struct dentry *, struct kstatfs *);
@@ -619,7 +637,6 @@ static void nfs_show_nfsv4_options(struct seq_file *m, struct nfs_server *nfss,
 	struct nfs_client *clp = nfss->nfs_client;
 
 	seq_printf(m, ",clientaddr=%s", clp->cl_ipaddr);
-	seq_printf(m, ",minorversion=%u", clp->cl_minorversion);
 }
 #else
 static void nfs_show_nfsv4_options(struct seq_file *m, struct nfs_server *nfss,
@@ -628,6 +645,15 @@ static void nfs_show_nfsv4_options(struct seq_file *m, struct nfs_server *nfss,
 }
 #endif
 
+static void nfs_show_nfs_version(struct seq_file *m,
+		unsigned int version,
+		unsigned int minorversion)
+{
+	seq_printf(m, ",vers=%u", version);
+	if (version == 4)
+		seq_printf(m, ".%u", minorversion);
+}
+
 /*
  * Describe the mount options in force on this server representation
  */
@@ -655,7 +681,7 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
 	u32 version = clp->rpc_ops->version;
 	int local_flock, local_fcntl;
 
-	seq_printf(m, ",vers=%u", version);
+	nfs_show_nfs_version(m, version, clp->cl_minorversion);
 	seq_printf(m, ",rsize=%u", nfss->rsize);
 	seq_printf(m, ",wsize=%u", nfss->wsize);
 	if (nfss->bsize != 0)
@@ -675,8 +701,10 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
 		else
 			seq_puts(m, nfs_infop->nostr);
 	}
+	rcu_read_lock();
 	seq_printf(m, ",proto=%s",
 		   rpc_peeraddr2str(nfss->client, RPC_DISPLAY_NETID));
+	rcu_read_unlock();
 	if (version == 4) {
 		if (nfss->port != NFS_PORT)
 			seq_printf(m, ",port=%u", nfss->port);
@@ -725,9 +753,11 @@ static int nfs_show_options(struct seq_file *m, struct dentry *root)
 
 	nfs_show_mount_options(m, nfss, 0);
 
+	rcu_read_lock();
 	seq_printf(m, ",addr=%s",
 		   rpc_peeraddr2str(nfss->nfs_client->cl_rpcclient,
 				    RPC_DISPLAY_ADDR));
+	rcu_read_unlock();
 
 	return 0;
 }
@@ -744,7 +774,6 @@ static void show_sessions(struct seq_file *m, struct nfs_server *server) {}
 #endif
 #endif
 
-#ifdef CONFIG_NFS_V4
 #ifdef CONFIG_NFS_V4_1
 static void show_pnfs(struct seq_file *m, struct nfs_server *server)
 {
@@ -754,9 +783,26 @@ static void show_pnfs(struct seq_file *m, struct nfs_server *server)
 	else
 		seq_printf(m, "not configured");
 }
+
+static void show_implementation_id(struct seq_file *m, struct nfs_server *nfss)
+{
+	if (nfss->nfs_client && nfss->nfs_client->impl_id) {
+		struct nfs41_impl_id *impl_id = nfss->nfs_client->impl_id;
+		seq_printf(m, "\n\timpl_id:\tname='%s',domain='%s',"
+			   "date='%llu,%u'",
+			   impl_id->name, impl_id->domain,
+			   impl_id->date.seconds, impl_id->date.nseconds);
+	}
+}
 #else
-static void show_pnfs(struct seq_file *m, struct nfs_server *server) {}
+#ifdef CONFIG_NFS_V4
+static void show_pnfs(struct seq_file *m, struct nfs_server *server)
+{
+}
 #endif
+static void show_implementation_id(struct seq_file *m, struct nfs_server *nfss)
+{
+}
 #endif
 
 static int nfs_show_devname(struct seq_file *m, struct dentry *root)
@@ -805,6 +851,8 @@ static int nfs_show_stats(struct seq_file *m, struct dentry *root)
 
 	seq_printf(m, "\n\tage:\t%lu", (jiffies - nfss->mount_time) / HZ);
 
+	show_implementation_id(m, nfss);
+
 	seq_printf(m, "\n\tcaps:\t");
 	seq_printf(m, "caps=0x%x", nfss->caps);
 	seq_printf(m, ",wtmult=%u", nfss->wtmult);
@@ -907,6 +955,7 @@ static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int ve
 		data->auth_flavor_len	= 1;
 		data->version		= version;
 		data->minorversion	= 0;
+		data->net		= current->nsproxy->net_ns;
 		security_init_mnt_opts(&data->lsm_opts);
 	}
 	return data;
@@ -1051,6 +1100,40 @@ static int nfs_parse_security_flavors(char *value,
 	return 1;
 }
 
+static int nfs_parse_version_string(char *string,
+		struct nfs_parsed_mount_data *mnt,
+		substring_t *args)
+{
+	mnt->flags &= ~NFS_MOUNT_VER3;
+	switch (match_token(string, nfs_vers_tokens, args)) {
+	case Opt_vers_2:
+		mnt->version = 2;
+		break;
+	case Opt_vers_3:
+		mnt->flags |= NFS_MOUNT_VER3;
+		mnt->version = 3;
+		break;
+	case Opt_vers_4:
+		/* Backward compatibility option. In future,
+		 * the mount program should always supply
+		 * a NFSv4 minor version number.
+		 */
+		mnt->version = 4;
+		break;
+	case Opt_vers_4_0:
+		mnt->version = 4;
+		mnt->minorversion = 0;
+		break;
+	case Opt_vers_4_1:
+		mnt->version = 4;
+		mnt->minorversion = 1;
+		break;
+	default:
+		return 0;
+	}
+	return 1;
+}
+
 static int nfs_get_option_str(substring_t args[], char **option)
 {
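nfs_parse_version_string() above maps the matched token to a (version, minorversion) pair and clears NFS_MOUNT_VER3 up front. The accepted strings and their results, in a userspace sketch of the same mapping:

    #include <stdio.h>
    #include <string.h>

    /* Same semantics as nfs_vers_tokens/nfs_parse_version_string (a sketch). */
    static int parse_version(const char *s, unsigned *ver, unsigned *minor)
    {
        *minor = 0;
        if (strcmp(s, "2") == 0) { *ver = 2; return 1; }
        if (strcmp(s, "3") == 0) { *ver = 3; return 1; }
        if (strcmp(s, "4") == 0 || strcmp(s, "4.0") == 0) { *ver = 4; return 1; }
        if (strcmp(s, "4.1") == 0) { *ver = 4; *minor = 1; return 1; }
        return 0;  /* mirrors the Opt_vers_err -> out_invalid_value path */
    }

    int main(void)
    {
        unsigned v, m;
        if (parse_version("4.1", &v, &m))
            printf("vers=%u minorversion=%u\n", v, m); /* vers=4 minorversion=1 */
        return 0;
    }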
1056 kfree(*option); 1139 kfree(*option);
@@ -1156,18 +1239,6 @@ static int nfs_parse_mount_options(char *raw,
1156 mnt->flags |= (NFS_MOUNT_LOCAL_FLOCK | 1239 mnt->flags |= (NFS_MOUNT_LOCAL_FLOCK |
1157 NFS_MOUNT_LOCAL_FCNTL); 1240 NFS_MOUNT_LOCAL_FCNTL);
1158 break; 1241 break;
1159 case Opt_v2:
1160 mnt->flags &= ~NFS_MOUNT_VER3;
1161 mnt->version = 2;
1162 break;
1163 case Opt_v3:
1164 mnt->flags |= NFS_MOUNT_VER3;
1165 mnt->version = 3;
1166 break;
1167 case Opt_v4:
1168 mnt->flags &= ~NFS_MOUNT_VER3;
1169 mnt->version = 4;
1170 break;
1171 case Opt_udp: 1242 case Opt_udp:
1172 mnt->flags &= ~NFS_MOUNT_TCP; 1243 mnt->flags &= ~NFS_MOUNT_TCP;
1173 mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; 1244 mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
@@ -1294,26 +1365,6 @@ static int nfs_parse_mount_options(char *raw,
1294 goto out_invalid_value; 1365 goto out_invalid_value;
1295 mnt->mount_server.version = option; 1366 mnt->mount_server.version = option;
1296 break; 1367 break;
1297 case Opt_nfsvers:
1298 if (nfs_get_option_ul(args, &option))
1299 goto out_invalid_value;
1300 switch (option) {
1301 case NFS2_VERSION:
1302 mnt->flags &= ~NFS_MOUNT_VER3;
1303 mnt->version = 2;
1304 break;
1305 case NFS3_VERSION:
1306 mnt->flags |= NFS_MOUNT_VER3;
1307 mnt->version = 3;
1308 break;
1309 case NFS4_VERSION:
1310 mnt->flags &= ~NFS_MOUNT_VER3;
1311 mnt->version = 4;
1312 break;
1313 default:
1314 goto out_invalid_value;
1315 }
1316 break;
1317 case Opt_minorversion: 1368 case Opt_minorversion:
1318 if (nfs_get_option_ul(args, &option)) 1369 if (nfs_get_option_ul(args, &option))
1319 goto out_invalid_value; 1370 goto out_invalid_value;
@@ -1325,6 +1376,15 @@ static int nfs_parse_mount_options(char *raw,
1325 /* 1376 /*
1326 * options that take text values 1377 * options that take text values
1327 */ 1378 */
1379 case Opt_nfsvers:
1380 string = match_strdup(args);
1381 if (string == NULL)
1382 goto out_nomem;
1383 rc = nfs_parse_version_string(string, mnt, args);
1384 kfree(string);
1385 if (!rc)
1386 goto out_invalid_value;
1387 break;
1328 case Opt_sec: 1388 case Opt_sec:
1329 string = match_strdup(args); 1389 string = match_strdup(args);
1330 if (string == NULL) 1390 if (string == NULL)
@@ -1404,7 +1464,7 @@ static int nfs_parse_mount_options(char *raw,
1404 if (string == NULL) 1464 if (string == NULL)
1405 goto out_nomem; 1465 goto out_nomem;
1406 mnt->nfs_server.addrlen = 1466 mnt->nfs_server.addrlen =
1407 rpc_pton(string, strlen(string), 1467 rpc_pton(mnt->net, string, strlen(string),
1408 (struct sockaddr *) 1468 (struct sockaddr *)
1409 &mnt->nfs_server.address, 1469 &mnt->nfs_server.address,
1410 sizeof(mnt->nfs_server.address)); 1470 sizeof(mnt->nfs_server.address));
@@ -1426,7 +1486,7 @@ static int nfs_parse_mount_options(char *raw,
1426 if (string == NULL) 1486 if (string == NULL)
1427 goto out_nomem; 1487 goto out_nomem;
1428 mnt->mount_server.addrlen = 1488 mnt->mount_server.addrlen =
1429 rpc_pton(string, strlen(string), 1489 rpc_pton(mnt->net, string, strlen(string),
1430 (struct sockaddr *) 1490 (struct sockaddr *)
1431 &mnt->mount_server.address, 1491 &mnt->mount_server.address,
1432 sizeof(mnt->mount_server.address)); 1492 sizeof(mnt->mount_server.address));
@@ -1515,6 +1575,9 @@ static int nfs_parse_mount_options(char *raw,
1515 if (!sloppy && invalid_option) 1575 if (!sloppy && invalid_option)
1516 return 0; 1576 return 0;
1517 1577
1578 if (mnt->minorversion && mnt->version != 4)
1579 goto out_minorversion_mismatch;
1580
1518 /* 1581 /*
1519 * verify that any proto=/mountproto= options match the address 1582 * verify that any proto=/mountproto= options match the address
1520 * familiies in the addr=/mountaddr= options. 1583 * familiies in the addr=/mountaddr= options.
@@ -1548,6 +1611,10 @@ out_invalid_address:
1548out_invalid_value: 1611out_invalid_value:
1549 printk(KERN_INFO "NFS: bad mount option value specified: %s\n", p); 1612 printk(KERN_INFO "NFS: bad mount option value specified: %s\n", p);
1550 return 0; 1613 return 0;
1614out_minorversion_mismatch:
1615 printk(KERN_INFO "NFS: mount option vers=%u does not support "
1616 "minorversion=%u\n", mnt->version, mnt->minorversion);
1617 return 0;
1551out_nomem: 1618out_nomem:
1552 printk(KERN_INFO "NFS: not enough memory to parse option\n"); 1619 printk(KERN_INFO "NFS: not enough memory to parse option\n");
1553 return 0; 1620 return 0;
@@ -1621,6 +1688,7 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
1621 .noresvport = args->flags & NFS_MOUNT_NORESVPORT, 1688 .noresvport = args->flags & NFS_MOUNT_NORESVPORT,
1622 .auth_flav_len = &server_authlist_len, 1689 .auth_flav_len = &server_authlist_len,
1623 .auth_flavs = server_authlist, 1690 .auth_flavs = server_authlist,
1691 .net = args->net,
1624 }; 1692 };
1625 int status; 1693 int status;
1626 1694
@@ -2046,7 +2114,7 @@ static inline void nfs_initialise_sb(struct super_block *sb)
2046 2114
2047 /* We probably want something more informative here */ 2115 /* We probably want something more informative here */
2048 snprintf(sb->s_id, sizeof(sb->s_id), 2116 snprintf(sb->s_id, sizeof(sb->s_id),
2049 "%x:%x", MAJOR(sb->s_dev), MINOR(sb->s_dev)); 2117 "%u:%u", MAJOR(sb->s_dev), MINOR(sb->s_dev));
2050 2118
2051 if (sb->s_blocksize == 0) 2119 if (sb->s_blocksize == 0)
2052 sb->s_blocksize = nfs_block_bits(server->wsize, 2120 sb->s_blocksize = nfs_block_bits(server->wsize,
@@ -2498,12 +2566,6 @@ static int nfs4_validate_text_mount_data(void *options,
2498 return -EINVAL; 2566 return -EINVAL;
2499 } 2567 }
2500 2568
2501 if (args->client_address == NULL) {
2502 dfprintk(MOUNT,
2503 "NFS4: mount program didn't pass callback address\n");
2504 return -EINVAL;
2505 }
2506
2507 return nfs_parse_devname(dev_name, 2569 return nfs_parse_devname(dev_name,
2508 &args->nfs_server.hostname, 2570 &args->nfs_server.hostname,
2509 NFS4_MAXNAMLEN, 2571 NFS4_MAXNAMLEN,
@@ -2662,8 +2724,7 @@ nfs4_remote_mount(struct file_system_type *fs_type, int flags,
2662 if (!s->s_root) { 2724 if (!s->s_root) {
2663 /* initial superblock/root creation */ 2725 /* initial superblock/root creation */
2664 nfs4_fill_super(s); 2726 nfs4_fill_super(s);
2665 nfs_fscache_get_super_cookie( 2727 nfs_fscache_get_super_cookie(s, data->fscache_uniq, NULL);
2666 s, data ? data->fscache_uniq : NULL, NULL);
2667 } 2728 }
2668 2729
2669 mntroot = nfs4_get_root(s, mntfh, dev_name); 2730 mntroot = nfs4_get_root(s, mntfh, dev_name);
diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c
index 978aaeb8a093..ad4d2e787b20 100644
--- a/fs/nfs/sysctl.c
+++ b/fs/nfs/sysctl.c
@@ -32,7 +32,6 @@ static ctl_table nfs_cb_sysctls[] = {
 		.extra1 = (int *)&nfs_set_port_min,
 		.extra2 = (int *)&nfs_set_port_max,
 	},
-#ifndef CONFIG_NFS_USE_NEW_IDMAPPER
 	{
 		.procname = "idmap_cache_timeout",
 		.data = &nfs_idmap_cache_timeout,
@@ -40,7 +39,6 @@ static ctl_table nfs_cb_sysctls[] = {
 		.mode = 0644,
 		.proc_handler = proc_dointvec_jiffies,
 	},
-#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */
 #endif
 	{
 		.procname = "nfs_mountpoint_timeout",
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 4f9319a2e567..3210a03342f9 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -20,15 +20,6 @@
 #include "iostat.h"
 #include "delegation.h"
 
-struct nfs_unlinkdata {
-	struct hlist_node list;
-	struct nfs_removeargs args;
-	struct nfs_removeres res;
-	struct inode *dir;
-	struct rpc_cred *cred;
-	struct nfs_fattr dir_attr;
-};
-
 /**
  * nfs_free_unlinkdata - release data from a sillydelete operation.
  * @data: pointer to unlink structure.
@@ -107,25 +98,16 @@ static void nfs_async_unlink_release(void *calldata)
 	nfs_sb_deactive(sb);
 }
 
-#if defined(CONFIG_NFS_V4_1)
-void nfs_unlink_prepare(struct rpc_task *task, void *calldata)
+static void nfs_unlink_prepare(struct rpc_task *task, void *calldata)
 {
 	struct nfs_unlinkdata *data = calldata;
-	struct nfs_server *server = NFS_SERVER(data->dir);
-
-	if (nfs4_setup_sequence(server, &data->args.seq_args,
-				&data->res.seq_res, 1, task))
-		return;
-	rpc_call_start(task);
+	NFS_PROTO(data->dir)->unlink_rpc_prepare(task, data);
 }
-#endif /* CONFIG_NFS_V4_1 */
 
 static const struct rpc_call_ops nfs_unlink_ops = {
 	.rpc_call_done = nfs_async_unlink_done,
 	.rpc_release = nfs_async_unlink_release,
-#if defined(CONFIG_NFS_V4_1)
 	.rpc_call_prepare = nfs_unlink_prepare,
-#endif /* CONFIG_NFS_V4_1 */
 };
 
 static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct nfs_unlinkdata *data)
@@ -341,18 +323,6 @@ nfs_cancel_async_unlink(struct dentry *dentry)
 	spin_unlock(&dentry->d_lock);
 }
 
-struct nfs_renamedata {
-	struct nfs_renameargs args;
-	struct nfs_renameres res;
-	struct rpc_cred *cred;
-	struct inode *old_dir;
-	struct dentry *old_dentry;
-	struct nfs_fattr old_fattr;
-	struct inode *new_dir;
-	struct dentry *new_dentry;
-	struct nfs_fattr new_fattr;
-};
-
 /**
  * nfs_async_rename_done - Sillyrename post-processing
  * @task: rpc_task of the sillyrename
@@ -403,25 +373,16 @@ static void nfs_async_rename_release(void *calldata)
 	kfree(data);
 }
 
-#if defined(CONFIG_NFS_V4_1)
 static void nfs_rename_prepare(struct rpc_task *task, void *calldata)
 {
 	struct nfs_renamedata *data = calldata;
-	struct nfs_server *server = NFS_SERVER(data->old_dir);
-
-	if (nfs4_setup_sequence(server, &data->args.seq_args,
-				&data->res.seq_res, 1, task))
-		return;
-	rpc_call_start(task);
+	NFS_PROTO(data->old_dir)->rename_rpc_prepare(task, data);
 }
-#endif /* CONFIG_NFS_V4_1 */
 
 static const struct rpc_call_ops nfs_rename_ops = {
 	.rpc_call_done = nfs_async_rename_done,
 	.rpc_release = nfs_async_rename_release,
-#if defined(CONFIG_NFS_V4_1)
 	.rpc_call_prepare = nfs_rename_prepare,
-#endif /* CONFIG_NFS_V4_1 */
 };
 
 /**
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 834f0fe96f89..2c68818f68ac 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -100,7 +100,6 @@ void nfs_writedata_free(struct nfs_write_data *p)
 
 void nfs_writedata_release(struct nfs_write_data *wdata)
 {
-	put_lseg(wdata->lseg);
 	put_nfs_open_context(wdata->args.context);
 	nfs_writedata_free(wdata);
 }
@@ -236,10 +235,10 @@ static struct nfs_page *nfs_find_and_lock_request(struct page *page, bool nonblo
 		req = nfs_page_find_request_locked(page);
 		if (req == NULL)
 			break;
-		if (nfs_set_page_tag_locked(req))
+		if (nfs_lock_request_dontget(req))
 			break;
 		/* Note: If we hold the page lock, as is the case in nfs_writepage,
-		 *	 then the call to nfs_set_page_tag_locked() will always
+		 *	 then the call to nfs_lock_request_dontget() will always
 		 *	 succeed provided that someone hasn't already marked the
 		 *	 request as dirty (in which case we don't care).
 		 */
@@ -375,21 +374,14 @@ out_err:
 /*
  * Insert a write request into an inode
  */
-static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
+static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
-	int error;
-
-	error = radix_tree_preload(GFP_NOFS);
-	if (error != 0)
-		goto out;
 
 	/* Lock the request! */
 	nfs_lock_request_dontget(req);
 
 	spin_lock(&inode->i_lock);
-	error = radix_tree_insert(&nfsi->nfs_page_tree, req->wb_index, req);
-	BUG_ON(error);
 	if (!nfsi->npages && nfs_have_delegation(inode, FMODE_WRITE))
 		inode->i_version++;
 	set_bit(PG_MAPPED, &req->wb_flags);
@@ -397,12 +389,7 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
 	set_page_private(req->wb_page, (unsigned long)req);
 	nfsi->npages++;
 	kref_get(&req->wb_kref);
-	radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index,
-			NFS_PAGE_TAG_LOCKED);
 	spin_unlock(&inode->i_lock);
-	radix_tree_preload_end();
-out:
-	return error;
 }
 
 /*
@@ -419,7 +406,6 @@ static void nfs_inode_remove_request(struct nfs_page *req)
 	set_page_private(req->wb_page, 0);
 	ClearPagePrivate(req->wb_page);
 	clear_bit(PG_MAPPED, &req->wb_flags);
-	radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index);
 	nfsi->npages--;
 	spin_unlock(&inode->i_lock);
 	nfs_release_request(req);
@@ -432,39 +418,90 @@ nfs_mark_request_dirty(struct nfs_page *req)
 }
 
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
-/*
- * Add a request to the inode's commit list.
+/**
+ * nfs_request_add_commit_list - add request to a commit list
+ * @req: pointer to a struct nfs_page
+ * @head: commit list head
+ *
+ * This sets the PG_CLEAN bit, updates the inode global count of
+ * number of outstanding requests requiring a commit as well as
+ * the MM page stats.
+ *
+ * The caller must _not_ hold the inode->i_lock, but must be
+ * holding the nfs_page lock.
  */
-static void
-nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)
+void
+nfs_request_add_commit_list(struct nfs_page *req, struct list_head *head)
 {
 	struct inode *inode = req->wb_context->dentry->d_inode;
-	struct nfs_inode *nfsi = NFS_I(inode);
 
-	spin_lock(&inode->i_lock);
 	set_bit(PG_CLEAN, &(req)->wb_flags);
-	radix_tree_tag_set(&nfsi->nfs_page_tree,
-			req->wb_index,
-			NFS_PAGE_TAG_COMMIT);
-	nfsi->ncommit++;
+	spin_lock(&inode->i_lock);
+	nfs_list_add_request(req, head);
+	NFS_I(inode)->ncommit++;
 	spin_unlock(&inode->i_lock);
-	pnfs_mark_request_commit(req, lseg);
 	inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
 	inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE);
 	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
 }
+EXPORT_SYMBOL_GPL(nfs_request_add_commit_list);
 
-static int
+/**
+ * nfs_request_remove_commit_list - Remove request from a commit list
+ * @req: pointer to a nfs_page
+ *
+ * This clears the PG_CLEAN bit, and updates the inode global count of
+ * number of outstanding requests requiring a commit
+ * It does not update the MM page stats.
+ *
+ * The caller _must_ hold the inode->i_lock and the nfs_page lock.
+ */
+void
+nfs_request_remove_commit_list(struct nfs_page *req)
+{
+	struct inode *inode = req->wb_context->dentry->d_inode;
+
+	if (!test_and_clear_bit(PG_CLEAN, &(req)->wb_flags))
+		return;
+	nfs_list_remove_request(req);
+	NFS_I(inode)->ncommit--;
+}
+EXPORT_SYMBOL_GPL(nfs_request_remove_commit_list);
+
+
+/*
+ * Add a request to the inode's commit list.
+ */
+static void
+nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)
+{
+	struct inode *inode = req->wb_context->dentry->d_inode;
+
+	if (pnfs_mark_request_commit(req, lseg))
+		return;
+	nfs_request_add_commit_list(req, &NFS_I(inode)->commit_list);
+}
+
+static void
+nfs_clear_page_commit(struct page *page)
+{
+	dec_zone_page_state(page, NR_UNSTABLE_NFS);
+	dec_bdi_stat(page->mapping->backing_dev_info, BDI_RECLAIMABLE);
+}
+
+static void
 nfs_clear_request_commit(struct nfs_page *req)
 {
-	struct page *page = req->wb_page;
+	if (test_bit(PG_CLEAN, &req->wb_flags)) {
+		struct inode *inode = req->wb_context->dentry->d_inode;
 
-	if (test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) {
-		dec_zone_page_state(page, NR_UNSTABLE_NFS);
-		dec_bdi_stat(page->mapping->backing_dev_info, BDI_RECLAIMABLE);
-		return 1;
+		if (!pnfs_clear_request_commit(req)) {
+			spin_lock(&inode->i_lock);
+			nfs_request_remove_commit_list(req);
+			spin_unlock(&inode->i_lock);
+		}
+		nfs_clear_page_commit(req->wb_page);
 	}
-	return 0;
 }
 
 static inline
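The kerneldoc comments added above spell out an asymmetric locking contract: nfs_request_add_commit_list() takes inode->i_lock itself, while nfs_request_remove_commit_list() must be entered with it held (plus the nfs_page lock). That asymmetry is what lets a scan pop many requests under a single lock acquisition, roughly like this sketch of a caller (illustrative only; the real callers are nfs_clear_request_commit() above and nfs_scan_commit_list() below):

    spin_lock(&inode->i_lock);
    list_for_each_entry_safe(req, tmp, &NFS_I(inode)->commit_list, wb_list) {
        if (!nfs_lock_request(req))        /* contract also wants the page lock */
            continue;
        nfs_request_remove_commit_list(req); /* no per-request relock needed */
    }
    spin_unlock(&inode->i_lock);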
@@ -491,15 +528,14 @@ int nfs_reschedule_unstable_write(struct nfs_page *req,
 	return 0;
 }
 #else
-static inline void
+static void
 nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)
 {
 }
 
-static inline int
+static void
 nfs_clear_request_commit(struct nfs_page *req)
 {
-	return 0;
 }
 
 static inline
@@ -520,46 +556,65 @@ int nfs_reschedule_unstable_write(struct nfs_page *req,
 static int
 nfs_need_commit(struct nfs_inode *nfsi)
 {
-	return radix_tree_tagged(&nfsi->nfs_page_tree, NFS_PAGE_TAG_COMMIT);
+	return nfsi->ncommit > 0;
+}
+
+/* i_lock held by caller */
+static int
+nfs_scan_commit_list(struct list_head *src, struct list_head *dst, int max,
+		spinlock_t *lock)
+{
+	struct nfs_page *req, *tmp;
+	int ret = 0;
+
+	list_for_each_entry_safe(req, tmp, src, wb_list) {
+		if (!nfs_lock_request(req))
+			continue;
+		if (cond_resched_lock(lock))
+			list_safe_reset_next(req, tmp, wb_list);
+		nfs_request_remove_commit_list(req);
+		nfs_list_add_request(req, dst);
+		ret++;
+		if (ret == max)
+			break;
+	}
+	return ret;
 }
 
 /*
  * nfs_scan_commit - Scan an inode for commit requests
  * @inode: NFS inode to scan
  * @dst: destination list
- * @idx_start: lower bound of page->index to scan.
- * @npages: idx_start + npages sets the upper bound to scan.
  *
  * Moves requests from the inode's 'commit' request list.
  * The requests are *not* checked to ensure that they form a contiguous set.
 */
 static int
-nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages)
+nfs_scan_commit(struct inode *inode, struct list_head *dst)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
-	int ret;
-
-	if (!nfs_need_commit(nfsi))
-		return 0;
+	int ret = 0;
 
 	spin_lock(&inode->i_lock);
-	ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT);
-	if (ret > 0)
-		nfsi->ncommit -= ret;
-	spin_unlock(&inode->i_lock);
-
-	if (nfs_need_commit(NFS_I(inode)))
-		__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+	if (nfsi->ncommit > 0) {
+		const int max = INT_MAX;
 
+		ret = nfs_scan_commit_list(&nfsi->commit_list, dst, max,
+				&inode->i_lock);
+		ret += pnfs_scan_commit_lists(inode, max - ret,
+				&inode->i_lock);
+	}
+	spin_unlock(&inode->i_lock);
 	return ret;
 }
+
 #else
 static inline int nfs_need_commit(struct nfs_inode *nfsi)
 {
 	return 0;
 }
 
-static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages)
+static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst)
 {
 	return 0;
 }
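nfs_scan_commit_list() above walks the commit list entirely under inode->i_lock yet stays preemption-friendly: cond_resched_lock() may drop and retake the lock at a reschedule point, and when it reports that it did, the cached next pointer can be stale, so list_safe_reset_next() re-reads it before the entry is moved. The core of the loop again, annotated:

    list_for_each_entry_safe(req, tmp, src, wb_list) {
        if (!nfs_lock_request(req))
            continue;                   /* skip requests someone else holds */
        if (cond_resched_lock(lock))    /* lock was dropped and retaken */
            list_safe_reset_next(req, tmp, wb_list); /* refresh stale 'tmp' */
        nfs_request_remove_commit_list(req);         /* i_lock is held here */
        nfs_list_add_request(req, dst);
        ret++;
        if (ret == max)
            break;
    }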
@@ -604,7 +659,7 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
 		    || end < req->wb_offset)
 			goto out_flushme;
 
-		if (nfs_set_page_tag_locked(req))
+		if (nfs_lock_request_dontget(req))
 			break;
 
 		/* The request is locked, so wait and then retry */
@@ -616,13 +671,6 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
 		spin_lock(&inode->i_lock);
 	}
 
-	if (nfs_clear_request_commit(req) &&
-	    radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree,
-				 req->wb_index, NFS_PAGE_TAG_COMMIT) != NULL) {
-		NFS_I(inode)->ncommit--;
-		pnfs_clear_request_commit(req);
-	}
-
 	/* Okay, the request matches. Update the region */
 	if (offset < req->wb_offset) {
 		req->wb_offset = offset;
@@ -634,6 +682,7 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
 		req->wb_bytes = rqend - req->wb_offset;
 out_unlock:
 	spin_unlock(&inode->i_lock);
+	nfs_clear_request_commit(req);
 	return req;
 out_flushme:
 	spin_unlock(&inode->i_lock);
@@ -655,7 +704,6 @@ static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx,
 {
 	struct inode *inode = page->mapping->host;
 	struct nfs_page *req;
-	int error;
 
 	req = nfs_try_to_update_request(inode, page, offset, bytes);
 	if (req != NULL)
@@ -663,11 +711,7 @@ static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx,
 	req = nfs_create_request(ctx, inode, page, offset, bytes);
 	if (IS_ERR(req))
 		goto out;
-	error = nfs_inode_add_request(inode, req);
-	if (error != 0) {
-		nfs_release_request(req);
-		req = ERR_PTR(error);
-	}
+	nfs_inode_add_request(inode, req);
 out:
 	return req;
 }
@@ -684,7 +728,7 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
 	nfs_grow_file(page, offset, count);
 	nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes);
 	nfs_mark_request_dirty(req);
-	nfs_clear_page_tag_locked(req);
+	nfs_unlock_request(req);
 	return 0;
 }
 
@@ -777,7 +821,7 @@ static void nfs_writepage_release(struct nfs_page *req,
 
 	if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req, data))
 		nfs_inode_remove_request(req);
-	nfs_clear_page_tag_locked(req);
+	nfs_unlock_request(req);
 	nfs_end_page_writeback(page);
 }
 
@@ -925,7 +969,7 @@ static void nfs_redirty_request(struct nfs_page *req)
 	struct page *page = req->wb_page;
 
 	nfs_mark_request_dirty(req);
-	nfs_clear_page_tag_locked(req);
+	nfs_unlock_request(req);
 	nfs_end_page_writeback(page);
 }
 
@@ -1128,23 +1172,14 @@ out:
 	nfs_writedata_release(calldata);
 }
 
-#if defined(CONFIG_NFS_V4_1)
 void nfs_write_prepare(struct rpc_task *task, void *calldata)
 {
 	struct nfs_write_data *data = calldata;
-
-	if (nfs4_setup_sequence(NFS_SERVER(data->inode),
-				&data->args.seq_args,
-				&data->res.seq_res, 1, task))
-		return;
-	rpc_call_start(task);
+	NFS_PROTO(data->inode)->write_rpc_prepare(task, data);
 }
-#endif /* CONFIG_NFS_V4_1 */
 
 static const struct rpc_call_ops nfs_write_partial_ops = {
-#if defined(CONFIG_NFS_V4_1)
 	.rpc_call_prepare = nfs_write_prepare,
-#endif /* CONFIG_NFS_V4_1 */
 	.rpc_call_done = nfs_writeback_done_partial,
 	.rpc_release = nfs_writeback_release_partial,
 };
@@ -1199,16 +1234,14 @@ static void nfs_writeback_release_full(void *calldata)
 remove_request:
 		nfs_inode_remove_request(req);
 	next:
-		nfs_clear_page_tag_locked(req);
+		nfs_unlock_request(req);
 		nfs_end_page_writeback(page);
 	}
 	nfs_writedata_release(calldata);
 }
 
 static const struct rpc_call_ops nfs_write_full_ops = {
-#if defined(CONFIG_NFS_V4_1)
 	.rpc_call_prepare = nfs_write_prepare,
-#endif /* CONFIG_NFS_V4_1 */
 	.rpc_call_done = nfs_writeback_done_full,
 	.rpc_release = nfs_writeback_release_full,
 };
@@ -1325,7 +1358,6 @@ void nfs_commitdata_release(void *data)
 {
 	struct nfs_write_data *wdata = data;
 
-	put_lseg(wdata->lseg);
 	put_nfs_open_context(wdata->args.context);
 	nfs_commit_free(wdata);
 }
@@ -1411,7 +1443,7 @@ void nfs_retry_commit(struct list_head *page_list,
 		dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
 		dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
 			     BDI_RECLAIMABLE);
-		nfs_clear_page_tag_locked(req);
+		nfs_unlock_request(req);
 	}
 }
 EXPORT_SYMBOL_GPL(nfs_retry_commit);
@@ -1460,7 +1492,7 @@ void nfs_commit_release_pages(struct nfs_write_data *data)
 	while (!list_empty(&data->pages)) {
 		req = nfs_list_entry(data->pages.next);
 		nfs_list_remove_request(req);
-		nfs_clear_request_commit(req);
+		nfs_clear_page_commit(req->wb_page);
 
 		dprintk("NFS: commit (%s/%lld %d@%lld)",
 			req->wb_context->dentry->d_sb->s_id,
@@ -1486,7 +1518,7 @@ void nfs_commit_release_pages(struct nfs_write_data *data)
 		dprintk(" mismatch\n");
 		nfs_mark_request_dirty(req);
 	next:
-		nfs_clear_page_tag_locked(req);
+		nfs_unlock_request(req);
 	}
 }
 EXPORT_SYMBOL_GPL(nfs_commit_release_pages);
@@ -1501,9 +1533,7 @@ static void nfs_commit_release(void *calldata)
 }
 
 static const struct rpc_call_ops nfs_commit_ops = {
-#if defined(CONFIG_NFS_V4_1)
 	.rpc_call_prepare = nfs_write_prepare,
-#endif /* CONFIG_NFS_V4_1 */
 	.rpc_call_done = nfs_commit_done,
 	.rpc_release = nfs_commit_release,
 };
@@ -1517,7 +1547,7 @@ int nfs_commit_inode(struct inode *inode, int how)
 	res = nfs_commit_set_lock(NFS_I(inode), may_wait);
 	if (res <= 0)
 		goto out_mark_dirty;
-	res = nfs_scan_commit(inode, &head, 0, 0);
+	res = nfs_scan_commit(inode, &head);
 	if (res) {
1522 int error; 1552 int error;
1523 1553
@@ -1635,6 +1665,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
1635 if (req == NULL) 1665 if (req == NULL)
1636 break; 1666 break;
1637 if (nfs_lock_request_dontget(req)) { 1667 if (nfs_lock_request_dontget(req)) {
1668 nfs_clear_request_commit(req);
1638 nfs_inode_remove_request(req); 1669 nfs_inode_remove_request(req);
1639 /* 1670 /*
1640 * In case nfs_inode_remove_request has marked the 1671 * In case nfs_inode_remove_request has marked the
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 6f3ebb48b12f..0e262f32ac41 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -605,24 +605,24 @@ static struct rpc_version nfs_cb_version4 = {
605 .procs = nfs4_cb_procedures 605 .procs = nfs4_cb_procedures
606}; 606};
607 607
608static struct rpc_version *nfs_cb_version[] = { 608static const struct rpc_version *nfs_cb_version[] = {
609 &nfs_cb_version4, 609 &nfs_cb_version4,
610}; 610};
611 611
612static struct rpc_program cb_program; 612static const struct rpc_program cb_program;
613 613
614static struct rpc_stat cb_stats = { 614static struct rpc_stat cb_stats = {
615 .program = &cb_program 615 .program = &cb_program
616}; 616};
617 617
618#define NFS4_CALLBACK 0x40000000 618#define NFS4_CALLBACK 0x40000000
619static struct rpc_program cb_program = { 619static const struct rpc_program cb_program = {
620 .name = "nfs4_cb", 620 .name = "nfs4_cb",
621 .number = NFS4_CALLBACK, 621 .number = NFS4_CALLBACK,
622 .nrvers = ARRAY_SIZE(nfs_cb_version), 622 .nrvers = ARRAY_SIZE(nfs_cb_version),
623 .version = nfs_cb_version, 623 .version = nfs_cb_version,
624 .stats = &cb_stats, 624 .stats = &cb_stats,
625 .pipe_dir_name = "/nfsd4_cb", 625 .pipe_dir_name = "nfsd4_cb",
626}; 626};
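
Beyond the constification, note the idiom that makes this hunk compile: cb_stats points at cb_program and cb_program points back at cb_stats, so the earlier "static const struct rpc_program cb_program;" is a tentative definition that breaks the cycle. A compilable model of the same trick (all names invented; gcc accepts a tentative const definition completed later, though some compilers warn about it):

#include <stdio.h>

struct program;

struct stat_block { const struct program *program; };
struct program    { const char *name; struct stat_block *stats; };

/* Tentative definition: lets cb_stats take &cb_program before the
 * full initializer, which points back at cb_stats, has been seen. */
static const struct program cb_program;

static struct stat_block cb_stats = { .program = &cb_program };

static const struct program cb_program = {
	.name  = "nfs4_cb",
	.stats = &cb_stats,
};

int main(void)
{
	printf("%s\n", cb_stats.program->name);
	return 0;
}
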
627 627
628static int max_cb_time(void) 628static int max_cb_time(void)
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index e8c98f009670..c5cddd659429 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1308,7 +1308,7 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_r
1308 else 1308 else
1309 goto out_err; 1309 goto out_err;
1310 1310
1311 conn->cb_addrlen = rpc_uaddr2sockaddr(se->se_callback_addr_val, 1311 conn->cb_addrlen = rpc_uaddr2sockaddr(&init_net, se->se_callback_addr_val,
1312 se->se_callback_addr_len, 1312 se->se_callback_addr_len,
1313 (struct sockaddr *)&conn->cb_addr, 1313 (struct sockaddr *)&conn->cb_addr,
1314 sizeof(conn->cb_addr)); 1314 sizeof(conn->cb_addr));
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 748eda93ce59..64c24af8d7ea 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -223,7 +223,7 @@ static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)
223 if (qword_get(&buf, fo_path, size) < 0) 223 if (qword_get(&buf, fo_path, size) < 0)
224 return -EINVAL; 224 return -EINVAL;
225 225
226 if (rpc_pton(fo_path, size, sap, salen) == 0) 226 if (rpc_pton(&init_net, fo_path, size, sap, salen) == 0)
227 return -EINVAL; 227 return -EINVAL;
228 228
229 return nlmsvc_unlock_all_by_ip(sap); 229 return nlmsvc_unlock_all_by_ip(sap);
@@ -722,7 +722,7 @@ static ssize_t __write_ports_addxprt(char *buf)
722 nfsd_serv->sv_nrthreads--; 722 nfsd_serv->sv_nrthreads--;
723 return 0; 723 return 0;
724out_close: 724out_close:
725 xprt = svc_find_xprt(nfsd_serv, transport, PF_INET, port); 725 xprt = svc_find_xprt(nfsd_serv, transport, &init_net, PF_INET, port);
726 if (xprt != NULL) { 726 if (xprt != NULL) {
727 svc_close_xprt(xprt); 727 svc_close_xprt(xprt);
728 svc_xprt_put(xprt); 728 svc_xprt_put(xprt);
@@ -748,7 +748,7 @@ static ssize_t __write_ports_delxprt(char *buf)
748 if (port < 1 || port > USHRT_MAX || nfsd_serv == NULL) 748 if (port < 1 || port > USHRT_MAX || nfsd_serv == NULL)
749 return -EINVAL; 749 return -EINVAL;
750 750
751 xprt = svc_find_xprt(nfsd_serv, transport, AF_UNSPEC, port); 751 xprt = svc_find_xprt(nfsd_serv, transport, &init_net, AF_UNSPEC, port);
752 if (xprt == NULL) 752 if (xprt == NULL)
753 return -ENOTCONN; 753 return -ENOTCONN;
754 754
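
The &init_net arguments in these nfsctl.c hunks are the visible end of making sunrpc address parsing and transport lookup network-namespace aware: rpc_pton() and svc_find_xprt() now take a struct net, and callers with no container context pin the boot-time namespace. A toy model of that conversion pattern (none of this is the real sunrpc API):

#include <stdio.h>

/* Toy stand-in for struct net; the kernel passes &init_net here. */
struct net { const char *name; };

static struct net init_net = { "init_net" };

/* Before the conversion this lookup was implicitly global; now every
 * caller must say which namespace it means. */
static int xprt_find(struct net *net, const char *transport, int port)
{
	printf("lookup %s:%d in %s\n", transport, port, net->name);
	return 0;
}

int main(void)
{
	/* Callers that predate containers, like write_unlock_ip() and
	 * __write_ports_addxprt() above, simply pin init_net. */
	return xprt_find(&init_net, "tcp", 2049);
}
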
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index eda7d7e55e05..fce472f5f39e 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -251,13 +251,13 @@ static void nfsd_shutdown(void)
251 nfsd_up = false; 251 nfsd_up = false;
252} 252}
253 253
254static void nfsd_last_thread(struct svc_serv *serv) 254static void nfsd_last_thread(struct svc_serv *serv, struct net *net)
255{ 255{
256 /* When last nfsd thread exits we need to do some clean-up */ 256 /* When last nfsd thread exits we need to do some clean-up */
257 nfsd_serv = NULL; 257 nfsd_serv = NULL;
258 nfsd_shutdown(); 258 nfsd_shutdown();
259 259
260 svc_rpcb_cleanup(serv); 260 svc_rpcb_cleanup(serv, net);
261 261
262 printk(KERN_WARNING "nfsd: last server has exited, flushing export " 262 printk(KERN_WARNING "nfsd: last server has exited, flushing export "
263 "cache\n"); 263 "cache\n");
diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c
index a2e2402b2afb..6d4521feb6e3 100644
--- a/fs/nfsd/stats.c
+++ b/fs/nfsd/stats.c
@@ -25,6 +25,7 @@
25#include <linux/module.h> 25#include <linux/module.h>
26#include <linux/sunrpc/stats.h> 26#include <linux/sunrpc/stats.h>
27#include <linux/nfsd/stats.h> 27#include <linux/nfsd/stats.h>
28#include <net/net_namespace.h>
28 29
29#include "nfsd.h" 30#include "nfsd.h"
30 31
@@ -94,11 +95,11 @@ static const struct file_operations nfsd_proc_fops = {
94void 95void
95nfsd_stat_init(void) 96nfsd_stat_init(void)
96{ 97{
97 svc_proc_register(&nfsd_svcstats, &nfsd_proc_fops); 98 svc_proc_register(&init_net, &nfsd_svcstats, &nfsd_proc_fops);
98} 99}
99 100
100void 101void
101nfsd_stat_shutdown(void) 102nfsd_stat_shutdown(void)
102{ 103{
103 svc_proc_unregister("nfsd"); 104 svc_proc_unregister(&init_net, "nfsd");
104} 105}
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index ee188158a224..c887b1378f7e 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -447,7 +447,7 @@ struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
447 return event; 447 return event;
448} 448}
449 449
450__init int fsnotify_notification_init(void) 450static __init int fsnotify_notification_init(void)
451{ 451{
452 fsnotify_event_cachep = KMEM_CACHE(fsnotify_event, SLAB_PANIC); 452 fsnotify_event_cachep = KMEM_CACHE(fsnotify_event, SLAB_PANIC);
453 fsnotify_event_holder_cachep = KMEM_CACHE(fsnotify_event_holder, SLAB_PANIC); 453 fsnotify_event_holder_cachep = KMEM_CACHE(fsnotify_event_holder, SLAB_PANIC);
@@ -461,4 +461,3 @@ __init int fsnotify_notification_init(void)
461 return 0; 461 return 0;
462} 462}
463subsys_initcall(fsnotify_notification_init); 463subsys_initcall(fsnotify_notification_init);
464
diff --git a/fs/pipe.c b/fs/pipe.c
index fe0502f9beb2..25feaa3faac0 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -13,6 +13,7 @@
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/log2.h> 14#include <linux/log2.h>
15#include <linux/mount.h> 15#include <linux/mount.h>
16#include <linux/magic.h>
16#include <linux/pipe_fs_i.h> 17#include <linux/pipe_fs_i.h>
17#include <linux/uio.h> 18#include <linux/uio.h>
18#include <linux/highmem.h> 19#include <linux/highmem.h>
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index cea4623f1ed6..5e325a42e33d 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -18,7 +18,7 @@
18#include <linux/fs.h> 18#include <linux/fs.h>
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/posix_acl.h> 20#include <linux/posix_acl.h>
21#include <linux/module.h> 21#include <linux/export.h>
22 22
23#include <linux/errno.h> 23#include <linux/errno.h>
24 24
diff --git a/fs/proc/array.c b/fs/proc/array.c
index c602b8d20f06..fbb53c249086 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -462,59 +462,56 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
462 /* convert nsec -> ticks */ 462 /* convert nsec -> ticks */
463 start_time = nsec_to_clock_t(start_time); 463 start_time = nsec_to_clock_t(start_time);
464 464
465 seq_printf(m, "%d (%s) %c %d %d %d %d %d %u %lu \ 465 seq_printf(m, "%d (%s) %c", pid_nr_ns(pid, ns), tcomm, state);
466%lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \ 466 seq_put_decimal_ll(m, ' ', ppid);
467%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld %lu %lu %lu\n", 467 seq_put_decimal_ll(m, ' ', pgid);
468 pid_nr_ns(pid, ns), 468 seq_put_decimal_ll(m, ' ', sid);
469 tcomm, 469 seq_put_decimal_ll(m, ' ', tty_nr);
470 state, 470 seq_put_decimal_ll(m, ' ', tty_pgrp);
471 ppid, 471 seq_put_decimal_ull(m, ' ', task->flags);
472 pgid, 472 seq_put_decimal_ull(m, ' ', min_flt);
473 sid, 473 seq_put_decimal_ull(m, ' ', cmin_flt);
474 tty_nr, 474 seq_put_decimal_ull(m, ' ', maj_flt);
475 tty_pgrp, 475 seq_put_decimal_ull(m, ' ', cmaj_flt);
476 task->flags, 476 seq_put_decimal_ull(m, ' ', cputime_to_clock_t(utime));
477 min_flt, 477 seq_put_decimal_ull(m, ' ', cputime_to_clock_t(stime));
478 cmin_flt, 478 seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cutime));
479 maj_flt, 479 seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cstime));
480 cmaj_flt, 480 seq_put_decimal_ll(m, ' ', priority);
481 cputime_to_clock_t(utime), 481 seq_put_decimal_ll(m, ' ', nice);
482 cputime_to_clock_t(stime), 482 seq_put_decimal_ll(m, ' ', num_threads);
483 cputime_to_clock_t(cutime), 483 seq_put_decimal_ull(m, ' ', 0);
484 cputime_to_clock_t(cstime), 484 seq_put_decimal_ull(m, ' ', start_time);
485 priority, 485 seq_put_decimal_ull(m, ' ', vsize);
486 nice, 486 seq_put_decimal_ll(m, ' ', mm ? get_mm_rss(mm) : 0);
487 num_threads, 487 seq_put_decimal_ull(m, ' ', rsslim);
488 start_time, 488 seq_put_decimal_ull(m, ' ', mm ? (permitted ? mm->start_code : 1) : 0);
489 vsize, 489 seq_put_decimal_ull(m, ' ', mm ? (permitted ? mm->end_code : 1) : 0);
490 mm ? get_mm_rss(mm) : 0, 490 seq_put_decimal_ull(m, ' ', (permitted && mm) ? mm->start_stack : 0);
491 rsslim, 491 seq_put_decimal_ull(m, ' ', esp);
492 mm ? (permitted ? mm->start_code : 1) : 0, 492 seq_put_decimal_ull(m, ' ', eip);
493 mm ? (permitted ? mm->end_code : 1) : 0, 493 /* The signal information here is obsolete.
494 (permitted && mm) ? mm->start_stack : 0, 494 * It must be decimal for Linux 2.0 compatibility.
495 esp, 495 * Use /proc/#/status for real-time signals.
496 eip, 496 */
497 /* The signal information here is obsolete. 497 seq_put_decimal_ull(m, ' ', task->pending.signal.sig[0] & 0x7fffffffUL);
498 * It must be decimal for Linux 2.0 compatibility. 498 seq_put_decimal_ull(m, ' ', task->blocked.sig[0] & 0x7fffffffUL);
499 * Use /proc/#/status for real-time signals. 499 seq_put_decimal_ull(m, ' ', sigign.sig[0] & 0x7fffffffUL);
500 */ 500 seq_put_decimal_ull(m, ' ', sigcatch.sig[0] & 0x7fffffffUL);
501 task->pending.signal.sig[0] & 0x7fffffffUL, 501 seq_put_decimal_ull(m, ' ', wchan);
502 task->blocked.sig[0] & 0x7fffffffUL, 502 seq_put_decimal_ull(m, ' ', 0);
503 sigign .sig[0] & 0x7fffffffUL, 503 seq_put_decimal_ull(m, ' ', 0);
504 sigcatch .sig[0] & 0x7fffffffUL, 504 seq_put_decimal_ll(m, ' ', task->exit_signal);
505 wchan, 505 seq_put_decimal_ll(m, ' ', task_cpu(task));
506 0UL, 506 seq_put_decimal_ull(m, ' ', task->rt_priority);
507 0UL, 507 seq_put_decimal_ull(m, ' ', task->policy);
508 task->exit_signal, 508 seq_put_decimal_ull(m, ' ', delayacct_blkio_ticks(task));
509 task_cpu(task), 509 seq_put_decimal_ull(m, ' ', cputime_to_clock_t(gtime));
510 task->rt_priority, 510 seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cgtime));
511 task->policy, 511 seq_put_decimal_ull(m, ' ', (mm && permitted) ? mm->start_data : 0);
512 (unsigned long long)delayacct_blkio_ticks(task), 512 seq_put_decimal_ull(m, ' ', (mm && permitted) ? mm->end_data : 0);
513 cputime_to_clock_t(gtime), 513 seq_put_decimal_ull(m, ' ', (mm && permitted) ? mm->start_brk : 0);
514 cputime_to_clock_t(cgtime), 514 seq_putc(m, '\n');
515 (mm && permitted) ? mm->start_data : 0,
516 (mm && permitted) ? mm->end_data : 0,
517 (mm && permitted) ? mm->start_brk : 0);
518 if (mm) 515 if (mm)
519 mmput(mm); 516 mmput(mm);
520 return 0; 517 return 0;
@@ -542,8 +539,20 @@ int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns,
542 size = task_statm(mm, &shared, &text, &data, &resident); 539 size = task_statm(mm, &shared, &text, &data, &resident);
543 mmput(mm); 540 mmput(mm);
544 } 541 }
545 seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n", 542 /*
546 size, resident, shared, text, data); 543 * For quick read, open code by putting numbers directly
544 * expected format is
545 * seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n",
546 * size, resident, shared, text, data);
547 */
548 seq_put_decimal_ull(m, 0, size);
549 seq_put_decimal_ull(m, ' ', resident);
550 seq_put_decimal_ull(m, ' ', shared);
551 seq_put_decimal_ull(m, ' ', text);
552 seq_put_decimal_ull(m, ' ', 0);
553 seq_put_decimal_ull(m, ' ', data);
554 seq_put_decimal_ull(m, ' ', 0);
555 seq_putc(m, '\n');
547 556
548 return 0; 557 return 0;
549} 558}
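
Both conversions in fs/proc/array.c exist because vsnprintf format parsing dominates on these hot /proc paths; seq_put_decimal_ull() emits the separator and the digits directly into the seq_file buffer. A stand-alone approximation of the helper (the real one also bounds-checks the buffer and handles overflow):

#include <stdio.h>

/* Minimal model of seq_put_decimal_ull(): append an optional
 * separator and a decimal number without a format string. */
static size_t put_decimal_ull(char *buf, char sep, unsigned long long num)
{
	char tmp[21];	/* enough digits for 2^64 - 1 */
	int i = 0;
	size_t n = 0;

	if (sep)
		buf[n++] = sep;
	do {
		tmp[i++] = '0' + num % 10;
		num /= 10;
	} while (num);
	while (i)
		buf[n++] = tmp[--i];	/* digits were built backwards */
	return n;
}

int main(void)
{
	char line[64];
	size_t n = 0;

	n += put_decimal_ull(line + n, 0, 1234);
	n += put_decimal_ull(line + n, ' ', 567);
	line[n] = '\0';
	printf("%s\n", line);	/* prints "1234 567" */
	return 0;
}
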
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index c44efe19798f..5f79bb8b4c60 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -10,12 +10,15 @@
10 */ 10 */
11 11
12#include <linux/proc_fs.h> 12#include <linux/proc_fs.h>
13struct ctl_table_header;
13 14
14extern struct proc_dir_entry proc_root; 15extern struct proc_dir_entry proc_root;
15#ifdef CONFIG_PROC_SYSCTL 16#ifdef CONFIG_PROC_SYSCTL
16extern int proc_sys_init(void); 17extern int proc_sys_init(void);
18extern void sysctl_head_put(struct ctl_table_header *head);
17#else 19#else
18static inline void proc_sys_init(void) { } 20static inline void proc_sys_init(void) { }
21static inline void sysctl_head_put(struct ctl_table_header *head) { }
19#endif 22#endif
20#ifdef CONFIG_NET 23#ifdef CONFIG_NET
21extern int proc_net_init(void); 24extern int proc_net_init(void);
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index e5e69aff6c69..86c67eee439f 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -157,7 +157,8 @@ static int kcore_update_ram(void)
157 157
158#ifdef CONFIG_SPARSEMEM_VMEMMAP 158#ifdef CONFIG_SPARSEMEM_VMEMMAP
159/* calculate vmemmap's address from given system ram pfn and register it */ 159/* calculate vmemmap's address from given system ram pfn and register it */
160int get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head) 160static int
161get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head)
161{ 162{
162 unsigned long pfn = __pa(ent->addr) >> PAGE_SHIFT; 163 unsigned long pfn = __pa(ent->addr) >> PAGE_SHIFT;
163 unsigned long nr_pages = ent->size >> PAGE_SHIFT; 164 unsigned long nr_pages = ent->size >> PAGE_SHIFT;
@@ -189,7 +190,8 @@ int get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head)
189 190
190} 191}
191#else 192#else
192int get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head) 193static int
194get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head)
193{ 195{
194 return 1; 196 return 1;
195} 197}
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 27da860115c6..3551f1f839eb 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -53,7 +53,7 @@ static struct dentry *proc_ns_instantiate(struct inode *dir,
53 ei->ns_ops = ns_ops; 53 ei->ns_ops = ns_ops;
54 ei->ns = ns; 54 ei->ns = ns;
55 55
56 dentry->d_op = &pid_dentry_operations; 56 d_set_d_op(dentry, &pid_dentry_operations);
57 d_add(dentry, inode); 57 d_add(dentry, inode);
58 /* Close the race of the process dying before we return the dentry */ 58 /* Close the race of the process dying before we return the dentry */
59 if (pid_revalidate(dentry, NULL)) 59 if (pid_revalidate(dentry, NULL))
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 67bbf6e4e197..21d836f40292 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -9,6 +9,7 @@
9#include <linux/sched.h> 9#include <linux/sched.h>
10#include <linux/namei.h> 10#include <linux/namei.h>
11#include <linux/mm.h> 11#include <linux/mm.h>
12#include <linux/module.h>
12#include "internal.h" 13#include "internal.h"
13 14
14static const struct dentry_operations proc_sys_dentry_operations; 15static const struct dentry_operations proc_sys_dentry_operations;
@@ -26,6 +27,371 @@ void proc_sys_poll_notify(struct ctl_table_poll *poll)
26 wake_up_interruptible(&poll->wait); 27 wake_up_interruptible(&poll->wait);
27} 28}
28 29
30static struct ctl_table root_table[] = {
31 {
32 .procname = "",
33 .mode = S_IFDIR|S_IRUGO|S_IXUGO,
34 },
35 { }
36};
37static struct ctl_table_root sysctl_table_root = {
38 .default_set.dir.header = {
39 {{.count = 1,
40 .nreg = 1,
41 .ctl_table = root_table }},
42 .ctl_table_arg = root_table,
43 .root = &sysctl_table_root,
44 .set = &sysctl_table_root.default_set,
45 },
46};
47
48static DEFINE_SPINLOCK(sysctl_lock);
49
50static void drop_sysctl_table(struct ctl_table_header *header);
51static int sysctl_follow_link(struct ctl_table_header **phead,
52 struct ctl_table **pentry, struct nsproxy *namespaces);
53static int insert_links(struct ctl_table_header *head);
54static void put_links(struct ctl_table_header *header);
55
56static void sysctl_print_dir(struct ctl_dir *dir)
57{
58 if (dir->header.parent)
59 sysctl_print_dir(dir->header.parent);
60 printk(KERN_CONT "%s/", dir->header.ctl_table[0].procname);
61}
62
63static int namecmp(const char *name1, int len1, const char *name2, int len2)
64{
65 int minlen;
66 int cmp;
67
68 minlen = len1;
69 if (minlen > len2)
70 minlen = len2;
71
72 cmp = memcmp(name1, name2, minlen);
73 if (cmp == 0)
74 cmp = len1 - len2;
75 return cmp;
76}
77
78/* Called under sysctl_lock */
79static struct ctl_table *find_entry(struct ctl_table_header **phead,
80 struct ctl_dir *dir, const char *name, int namelen)
81{
82 struct ctl_table_header *head;
83 struct ctl_table *entry;
84 struct rb_node *node = dir->root.rb_node;
85
86 while (node)
87 {
88 struct ctl_node *ctl_node;
89 const char *procname;
90 int cmp;
91
92 ctl_node = rb_entry(node, struct ctl_node, node);
93 head = ctl_node->header;
94 entry = &head->ctl_table[ctl_node - head->node];
95 procname = entry->procname;
96
97 cmp = namecmp(name, namelen, procname, strlen(procname));
98 if (cmp < 0)
99 node = node->rb_left;
100 else if (cmp > 0)
101 node = node->rb_right;
102 else {
103 *phead = head;
104 return entry;
105 }
106 }
107 return NULL;
108}
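
find_entry() keys the per-directory rbtree on namecmp(), which compares the common prefix with memcmp() and breaks ties on length, so names sort bytewise with shorter strings first. The same total order demonstrated with qsort() in user space (a sketch; the kernel walks the tree rather than sorting an array):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Same order as namecmp() above: "rpc" < "rpcs" < "rq". */
static int namecmp(const char *n1, int l1, const char *n2, int l2)
{
	int minlen = l1 < l2 ? l1 : l2;
	int cmp = memcmp(n1, n2, minlen);

	return cmp ? cmp : l1 - l2;
}

static int qcmp(const void *a, const void *b)
{
	const char *s1 = *(const char *const *)a;
	const char *s2 = *(const char *const *)b;

	return namecmp(s1, strlen(s1), s2, strlen(s2));
}

int main(void)
{
	/* A sorted array stands in for one directory's rbtree. */
	const char *names[] = { "rq", "rpcs", "rpc", "overflowuid" };
	size_t i, n = sizeof(names) / sizeof(names[0]);

	qsort(names, n, sizeof(names[0]), qcmp);
	for (i = 0; i < n; i++)
		printf("%s\n", names[i]);
	return 0;
}
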
109
110static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry)
111{
112 struct rb_node *node = &head->node[entry - head->ctl_table].node;
113 struct rb_node **p = &head->parent->root.rb_node;
114 struct rb_node *parent = NULL;
115 const char *name = entry->procname;
116 int namelen = strlen(name);
117
118 while (*p) {
119 struct ctl_table_header *parent_head;
120 struct ctl_table *parent_entry;
121 struct ctl_node *parent_node;
122 const char *parent_name;
123 int cmp;
124
125 parent = *p;
126 parent_node = rb_entry(parent, struct ctl_node, node);
127 parent_head = parent_node->header;
128 parent_entry = &parent_head->ctl_table[parent_node - parent_head->node];
129 parent_name = parent_entry->procname;
130
131 cmp = namecmp(name, namelen, parent_name, strlen(parent_name));
132 if (cmp < 0)
133 p = &(*p)->rb_left;
134 else if (cmp > 0)
135 p = &(*p)->rb_right;
136 else {
137 printk(KERN_ERR "sysctl duplicate entry: ");
138 sysctl_print_dir(head->parent);
139 printk(KERN_CONT "/%s\n", entry->procname);
140 return -EEXIST;
141 }
142 }
143
144 rb_link_node(node, parent, p);
145 return 0;
146}
147
148static void erase_entry(struct ctl_table_header *head, struct ctl_table *entry)
149{
150 struct rb_node *node = &head->node[entry - head->ctl_table].node;
151
152 rb_erase(node, &head->parent->root);
153}
154
155static void init_header(struct ctl_table_header *head,
156 struct ctl_table_root *root, struct ctl_table_set *set,
157 struct ctl_node *node, struct ctl_table *table)
158{
159 head->ctl_table = table;
160 head->ctl_table_arg = table;
161 head->used = 0;
162 head->count = 1;
163 head->nreg = 1;
164 head->unregistering = NULL;
165 head->root = root;
166 head->set = set;
167 head->parent = NULL;
168 head->node = node;
169 if (node) {
170 struct ctl_table *entry;
171 for (entry = table; entry->procname; entry++, node++) {
172 rb_init_node(&node->node);
173 node->header = head;
174 }
175 }
176}
177
178static void erase_header(struct ctl_table_header *head)
179{
180 struct ctl_table *entry;
181 for (entry = head->ctl_table; entry->procname; entry++)
182 erase_entry(head, entry);
183}
184
185static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header)
186{
187 struct ctl_table *entry;
188 int err;
189
190 dir->header.nreg++;
191 header->parent = dir;
192 err = insert_links(header);
193 if (err)
194 goto fail_links;
195 for (entry = header->ctl_table; entry->procname; entry++) {
196 err = insert_entry(header, entry);
197 if (err)
198 goto fail;
199 }
200 return 0;
201fail:
202 erase_header(header);
203 put_links(header);
204fail_links:
205 header->parent = NULL;
206 drop_sysctl_table(&dir->header);
207 return err;
208}
209
210/* called under sysctl_lock */
211static int use_table(struct ctl_table_header *p)
212{
213 if (unlikely(p->unregistering))
214 return 0;
215 p->used++;
216 return 1;
217}
218
219/* called under sysctl_lock */
220static void unuse_table(struct ctl_table_header *p)
221{
222 if (!--p->used)
223 if (unlikely(p->unregistering))
224 complete(p->unregistering);
225}
226
227/* called under sysctl_lock, will reacquire if has to wait */
228static void start_unregistering(struct ctl_table_header *p)
229{
230 /*
231 * if p->used is 0, nobody will ever touch that entry again;
232 * we'll eliminate all paths to it before dropping sysctl_lock
233 */
234 if (unlikely(p->used)) {
235 struct completion wait;
236 init_completion(&wait);
237 p->unregistering = &wait;
238 spin_unlock(&sysctl_lock);
239 wait_for_completion(&wait);
240 spin_lock(&sysctl_lock);
241 } else {
242 /* anything non-NULL; we'll never dereference it */
243 p->unregistering = ERR_PTR(-EINVAL);
244 }
245 /*
246 * do not remove from the list until nobody holds it; walking the
247 * list in do_sysctl() relies on that.
248 */
249 erase_header(p);
250}
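
use_table(), unuse_table() and start_unregistering() together form a drain-before-teardown handshake: readers hold a use count, and the unregistering thread parks on a completion until the last reader drops it. The same protocol in portable C, with a mutex for sysctl_lock and a condition variable standing in for struct completion:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  drained = PTHREAD_COND_INITIALIZER;
static int used;
static int unregistering;

/* use_table(): refuse new readers once unregistration has begun. */
static int use(void)
{
	int ok;

	pthread_mutex_lock(&lock);
	ok = !unregistering;
	if (ok)
		used++;
	pthread_mutex_unlock(&lock);
	return ok;
}

/* unuse_table(): the last reader wakes the unregistering thread. */
static void unuse(void)
{
	pthread_mutex_lock(&lock);
	if (!--used)
		pthread_cond_signal(&drained);
	pthread_mutex_unlock(&lock);
}

static void start_unregistering(void)
{
	pthread_mutex_lock(&lock);
	unregistering = 1;
	while (used)
		pthread_cond_wait(&drained, &lock);
	pthread_mutex_unlock(&lock);
	/* no reader can hold the table now; safe to erase it */
}

int main(void)
{
	if (use()) {
		puts("reading table");
		unuse();
	}
	start_unregistering();
	puts("unregistered");
	return 0;
}
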
251
252static void sysctl_head_get(struct ctl_table_header *head)
253{
254 spin_lock(&sysctl_lock);
255 head->count++;
256 spin_unlock(&sysctl_lock);
257}
258
259void sysctl_head_put(struct ctl_table_header *head)
260{
261 spin_lock(&sysctl_lock);
262 if (!--head->count)
263 kfree_rcu(head, rcu);
264 spin_unlock(&sysctl_lock);
265}
266
267static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head)
268{
269 if (!head)
270 BUG();
271 spin_lock(&sysctl_lock);
272 if (!use_table(head))
273 head = ERR_PTR(-ENOENT);
274 spin_unlock(&sysctl_lock);
275 return head;
276}
277
278static void sysctl_head_finish(struct ctl_table_header *head)
279{
280 if (!head)
281 return;
282 spin_lock(&sysctl_lock);
283 unuse_table(head);
284 spin_unlock(&sysctl_lock);
285}
286
287static struct ctl_table_set *
288lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces)
289{
290 struct ctl_table_set *set = &root->default_set;
291 if (root->lookup)
292 set = root->lookup(root, namespaces);
293 return set;
294}
295
296static struct ctl_table *lookup_entry(struct ctl_table_header **phead,
297 struct ctl_dir *dir,
298 const char *name, int namelen)
299{
300 struct ctl_table_header *head;
301 struct ctl_table *entry;
302
303 spin_lock(&sysctl_lock);
304 entry = find_entry(&head, dir, name, namelen);
305 if (entry && use_table(head))
306 *phead = head;
307 else
308 entry = NULL;
309 spin_unlock(&sysctl_lock);
310 return entry;
311}
312
313static struct ctl_node *first_usable_entry(struct rb_node *node)
314{
315 struct ctl_node *ctl_node;
316
317 for (;node; node = rb_next(node)) {
318 ctl_node = rb_entry(node, struct ctl_node, node);
319 if (use_table(ctl_node->header))
320 return ctl_node;
321 }
322 return NULL;
323}
324
325static void first_entry(struct ctl_dir *dir,
326 struct ctl_table_header **phead, struct ctl_table **pentry)
327{
328 struct ctl_table_header *head = NULL;
329 struct ctl_table *entry = NULL;
330 struct ctl_node *ctl_node;
331
332 spin_lock(&sysctl_lock);
333 ctl_node = first_usable_entry(rb_first(&dir->root));
334 spin_unlock(&sysctl_lock);
335 if (ctl_node) {
336 head = ctl_node->header;
337 entry = &head->ctl_table[ctl_node - head->node];
338 }
339 *phead = head;
340 *pentry = entry;
341}
342
343static void next_entry(struct ctl_table_header **phead, struct ctl_table **pentry)
344{
345 struct ctl_table_header *head = *phead;
346 struct ctl_table *entry = *pentry;
347 struct ctl_node *ctl_node = &head->node[entry - head->ctl_table];
348
349 spin_lock(&sysctl_lock);
350 unuse_table(head);
351
352 ctl_node = first_usable_entry(rb_next(&ctl_node->node));
353 spin_unlock(&sysctl_lock);
354 head = NULL;
355 if (ctl_node) {
356 head = ctl_node->header;
357 entry = &head->ctl_table[ctl_node - head->node];
358 }
359 *phead = head;
360 *pentry = entry;
361}
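
first_entry() and next_entry() form an iterator that keeps a use count pinned on whichever header it currently yields and moves that pin forward one step at a time, so readdir can block between entries without the table disappearing underneath it. The shape of the hand-off, minus the locking, in a self-contained sketch (the refs field and helper names are invented):

#include <stdio.h>

struct item { int refs; int val; struct item *next; };

static struct item *iter_first(struct item *head)
{
	if (head)
		head->refs++;		/* pin before yielding */
	return head;
}

static struct item *iter_next(struct item *cur)
{
	struct item *next = cur->next;

	if (next)
		next->refs++;		/* pin the successor first... */
	cur->refs--;			/* ...then drop the old pin */
	return next;
}

int main(void)
{
	struct item c = { 0, 3, NULL };
	struct item b = { 0, 2, &c };
	struct item a = { 0, 1, &b };
	struct item *i;

	for (i = iter_first(&a); i; i = iter_next(i))
		printf("val=%d refs=%d\n", i->val, i->refs);
	return 0;
}
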
362
363void register_sysctl_root(struct ctl_table_root *root)
364{
365}
366
367/*
368 * sysctl_perm does NOT grant the superuser all rights automatically, because
369 * some sysctl variables are readonly even to root.
370 */
371
372static int test_perm(int mode, int op)
373{
374 if (!current_euid())
375 mode >>= 6;
376 else if (in_egroup_p(0))
377 mode >>= 3;
378 if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0)
379 return 0;
380 return -EACCES;
381}
382
383static int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
384{
385 int mode;
386
387 if (root->permissions)
388 mode = root->permissions(root, current->nsproxy, table);
389 else
390 mode = table->mode;
391
392 return test_perm(mode, op);
393}
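
test_perm() is plain rwxrwxrwx arithmetic: a zero euid selects the owner triple by shifting the mode right six bits, group membership shifts by three, and every requested op bit must be present in the selected triple. That is also exactly how a 0444 table stays read-only even for root, as the comment above promises. Worked in user space with the identity passed in explicitly (the MAY_* values match the kernel's; -13 is -EACCES):

#include <stdio.h>

#define MAY_EXEC  0x1
#define MAY_WRITE 0x2
#define MAY_READ  0x4

static int test_perm(int mode, int op, int is_root, int in_group)
{
	if (is_root)
		mode >>= 6;		/* owner rwx bits */
	else if (in_group)
		mode >>= 3;		/* group rwx bits */
	if ((op & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
		return 0;
	return -13;			/* -EACCES */
}

int main(void)
{
	/* 0644: owner may write, group and others may only read. */
	printf("root write on 0644: %d\n", test_perm(0644, MAY_WRITE, 1, 0));
	printf("group write on 0644: %d\n", test_perm(0644, MAY_WRITE, 0, 1));
	/* 0444: read-only even for root. */
	printf("root write on 0444: %d\n", test_perm(0444, MAY_WRITE, 1, 0));
	return 0;
}
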
394
29static struct inode *proc_sys_make_inode(struct super_block *sb, 395static struct inode *proc_sys_make_inode(struct super_block *sb,
30 struct ctl_table_header *head, struct ctl_table *table) 396 struct ctl_table_header *head, struct ctl_table *table)
31{ 397{
@@ -45,13 +411,12 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
45 411
46 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 412 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
47 inode->i_mode = table->mode; 413 inode->i_mode = table->mode;
48 if (!table->child) { 414 if (!S_ISDIR(table->mode)) {
49 inode->i_mode |= S_IFREG; 415 inode->i_mode |= S_IFREG;
50 inode->i_op = &proc_sys_inode_operations; 416 inode->i_op = &proc_sys_inode_operations;
51 inode->i_fop = &proc_sys_file_operations; 417 inode->i_fop = &proc_sys_file_operations;
52 } else { 418 } else {
53 inode->i_mode |= S_IFDIR; 419 inode->i_mode |= S_IFDIR;
54 clear_nlink(inode);
55 inode->i_op = &proc_sys_dir_operations; 420 inode->i_op = &proc_sys_dir_operations;
56 inode->i_fop = &proc_sys_dir_file_operations; 421 inode->i_fop = &proc_sys_dir_file_operations;
57 } 422 }
@@ -59,70 +424,42 @@ out:
59 return inode; 424 return inode;
60} 425}
61 426
62static struct ctl_table *find_in_table(struct ctl_table *p, struct qstr *name)
63{
64 int len;
65 for ( ; p->procname; p++) {
66
67 if (!p->procname)
68 continue;
69
70 len = strlen(p->procname);
71 if (len != name->len)
72 continue;
73
74 if (memcmp(p->procname, name->name, len) != 0)
75 continue;
76
77 /* I have a match */
78 return p;
79 }
80 return NULL;
81}
82
83static struct ctl_table_header *grab_header(struct inode *inode) 427static struct ctl_table_header *grab_header(struct inode *inode)
84{ 428{
85 if (PROC_I(inode)->sysctl) 429 struct ctl_table_header *head = PROC_I(inode)->sysctl;
86 return sysctl_head_grab(PROC_I(inode)->sysctl); 430 if (!head)
87 else 431 head = &sysctl_table_root.default_set.dir.header;
88 return sysctl_head_next(NULL); 432 return sysctl_head_grab(head);
89} 433}
90 434
91static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, 435static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
92 struct nameidata *nd) 436 struct nameidata *nd)
93{ 437{
94 struct ctl_table_header *head = grab_header(dir); 438 struct ctl_table_header *head = grab_header(dir);
95 struct ctl_table *table = PROC_I(dir)->sysctl_entry;
96 struct ctl_table_header *h = NULL; 439 struct ctl_table_header *h = NULL;
97 struct qstr *name = &dentry->d_name; 440 struct qstr *name = &dentry->d_name;
98 struct ctl_table *p; 441 struct ctl_table *p;
99 struct inode *inode; 442 struct inode *inode;
100 struct dentry *err = ERR_PTR(-ENOENT); 443 struct dentry *err = ERR_PTR(-ENOENT);
444 struct ctl_dir *ctl_dir;
445 int ret;
101 446
102 if (IS_ERR(head)) 447 if (IS_ERR(head))
103 return ERR_CAST(head); 448 return ERR_CAST(head);
104 449
105 if (table && !table->child) { 450 ctl_dir = container_of(head, struct ctl_dir, header);
106 WARN_ON(1);
107 goto out;
108 }
109
110 table = table ? table->child : head->ctl_table;
111
112 p = find_in_table(table, name);
113 if (!p) {
114 for (h = sysctl_head_next(NULL); h; h = sysctl_head_next(h)) {
115 if (h->attached_to != table)
116 continue;
117 p = find_in_table(h->attached_by, name);
118 if (p)
119 break;
120 }
121 }
122 451
452 p = lookup_entry(&h, ctl_dir, name->name, name->len);
123 if (!p) 453 if (!p)
124 goto out; 454 goto out;
125 455
456 if (S_ISLNK(p->mode)) {
457 ret = sysctl_follow_link(&h, &p, current->nsproxy);
458 err = ERR_PTR(ret);
459 if (ret)
460 goto out;
461 }
462
126 err = ERR_PTR(-ENOMEM); 463 err = ERR_PTR(-ENOMEM);
127 inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p); 464 inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p);
128 if (h) 465 if (h)
@@ -190,20 +527,32 @@ static ssize_t proc_sys_write(struct file *filp, const char __user *buf,
190 527
191static int proc_sys_open(struct inode *inode, struct file *filp) 528static int proc_sys_open(struct inode *inode, struct file *filp)
192{ 529{
530 struct ctl_table_header *head = grab_header(inode);
193 struct ctl_table *table = PROC_I(inode)->sysctl_entry; 531 struct ctl_table *table = PROC_I(inode)->sysctl_entry;
194 532
533 /* sysctl was unregistered */
534 if (IS_ERR(head))
535 return PTR_ERR(head);
536
195 if (table->poll) 537 if (table->poll)
196 filp->private_data = proc_sys_poll_event(table->poll); 538 filp->private_data = proc_sys_poll_event(table->poll);
197 539
540 sysctl_head_finish(head);
541
198 return 0; 542 return 0;
199} 543}
200 544
201static unsigned int proc_sys_poll(struct file *filp, poll_table *wait) 545static unsigned int proc_sys_poll(struct file *filp, poll_table *wait)
202{ 546{
203 struct inode *inode = filp->f_path.dentry->d_inode; 547 struct inode *inode = filp->f_path.dentry->d_inode;
548 struct ctl_table_header *head = grab_header(inode);
204 struct ctl_table *table = PROC_I(inode)->sysctl_entry; 549 struct ctl_table *table = PROC_I(inode)->sysctl_entry;
205 unsigned long event = (unsigned long)filp->private_data;
206 unsigned int ret = DEFAULT_POLLMASK; 550 unsigned int ret = DEFAULT_POLLMASK;
551 unsigned long event;
552
553 /* sysctl was unregistered */
554 if (IS_ERR(head))
555 return POLLERR | POLLHUP;
207 556
208 if (!table->proc_handler) 557 if (!table->proc_handler)
209 goto out; 558 goto out;
@@ -211,6 +560,7 @@ static unsigned int proc_sys_poll(struct file *filp, poll_table *wait)
211 if (!table->poll) 560 if (!table->poll)
212 goto out; 561 goto out;
213 562
563 event = (unsigned long)filp->private_data;
214 poll_wait(filp, &table->poll->wait, wait); 564 poll_wait(filp, &table->poll->wait, wait);
215 565
216 if (event != atomic_read(&table->poll->event)) { 566 if (event != atomic_read(&table->poll->event)) {
@@ -219,6 +569,8 @@ static unsigned int proc_sys_poll(struct file *filp, poll_table *wait)
219 } 569 }
220 570
221out: 571out:
572 sysctl_head_finish(head);
573
222 return ret; 574 return ret;
223} 575}
224 576
@@ -260,28 +612,45 @@ static int proc_sys_fill_cache(struct file *filp, void *dirent,
260 return !!filldir(dirent, qname.name, qname.len, filp->f_pos, ino, type); 612 return !!filldir(dirent, qname.name, qname.len, filp->f_pos, ino, type);
261} 613}
262 614
615static int proc_sys_link_fill_cache(struct file *filp, void *dirent,
616 filldir_t filldir,
617 struct ctl_table_header *head,
618 struct ctl_table *table)
619{
620 int err, ret = 0;
621 head = sysctl_head_grab(head);
622
623 if (S_ISLNK(table->mode)) {
624 /* It is not an error if we cannot follow the link; just ignore it */
625 err = sysctl_follow_link(&head, &table, current->nsproxy);
626 if (err)
627 goto out;
628 }
629
630 ret = proc_sys_fill_cache(filp, dirent, filldir, head, table);
631out:
632 sysctl_head_finish(head);
633 return ret;
634}
635
263static int scan(struct ctl_table_header *head, ctl_table *table, 636static int scan(struct ctl_table_header *head, ctl_table *table,
264 unsigned long *pos, struct file *file, 637 unsigned long *pos, struct file *file,
265 void *dirent, filldir_t filldir) 638 void *dirent, filldir_t filldir)
266{ 639{
640 int res;
267 641
268 for (; table->procname; table++, (*pos)++) { 642 if ((*pos)++ < file->f_pos)
269 int res; 643 return 0;
270
271 /* Can't do anything without a proc name */
272 if (!table->procname)
273 continue;
274
275 if (*pos < file->f_pos)
276 continue;
277 644
645 if (unlikely(S_ISLNK(table->mode)))
646 res = proc_sys_link_fill_cache(file, dirent, filldir, head, table);
647 else
278 res = proc_sys_fill_cache(file, dirent, filldir, head, table); 648 res = proc_sys_fill_cache(file, dirent, filldir, head, table);
279 if (res)
280 return res;
281 649
282 file->f_pos = *pos + 1; 650 if (res == 0)
283 } 651 file->f_pos = *pos;
284 return 0; 652
653 return res;
285} 654}
286 655
287static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir) 656static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir)
@@ -289,20 +658,16 @@ static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir)
289 struct dentry *dentry = filp->f_path.dentry; 658 struct dentry *dentry = filp->f_path.dentry;
290 struct inode *inode = dentry->d_inode; 659 struct inode *inode = dentry->d_inode;
291 struct ctl_table_header *head = grab_header(inode); 660 struct ctl_table_header *head = grab_header(inode);
292 struct ctl_table *table = PROC_I(inode)->sysctl_entry;
293 struct ctl_table_header *h = NULL; 661 struct ctl_table_header *h = NULL;
662 struct ctl_table *entry;
663 struct ctl_dir *ctl_dir;
294 unsigned long pos; 664 unsigned long pos;
295 int ret = -EINVAL; 665 int ret = -EINVAL;
296 666
297 if (IS_ERR(head)) 667 if (IS_ERR(head))
298 return PTR_ERR(head); 668 return PTR_ERR(head);
299 669
300 if (table && !table->child) { 670 ctl_dir = container_of(head, struct ctl_dir, header);
301 WARN_ON(1);
302 goto out;
303 }
304
305 table = table ? table->child : head->ctl_table;
306 671
307 ret = 0; 672 ret = 0;
308 /* Avoid a switch here: arm builds fail with missing __cmpdi2 */ 673 /* Avoid a switch here: arm builds fail with missing __cmpdi2 */
@@ -320,14 +685,8 @@ static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir)
320 } 685 }
321 pos = 2; 686 pos = 2;
322 687
323 ret = scan(head, table, &pos, filp, dirent, filldir); 688 for (first_entry(ctl_dir, &h, &entry); h; next_entry(&h, &entry)) {
324 if (ret) 689 ret = scan(h, entry, &pos, filp, dirent, filldir);
325 goto out;
326
327 for (h = sysctl_head_next(NULL); h; h = sysctl_head_next(h)) {
328 if (h->attached_to != table)
329 continue;
330 ret = scan(h, h->attached_by, &pos, filp, dirent, filldir);
331 if (ret) { 690 if (ret) {
332 sysctl_head_finish(h); 691 sysctl_head_finish(h);
333 break; 692 break;
@@ -447,6 +806,21 @@ static int proc_sys_delete(const struct dentry *dentry)
447 return !!PROC_I(dentry->d_inode)->sysctl->unregistering; 806 return !!PROC_I(dentry->d_inode)->sysctl->unregistering;
448} 807}
449 808
809static int sysctl_is_seen(struct ctl_table_header *p)
810{
811 struct ctl_table_set *set = p->set;
812 int res;
813 spin_lock(&sysctl_lock);
814 if (p->unregistering)
815 res = 0;
816 else if (!set->is_seen)
817 res = 1;
818 else
819 res = set->is_seen(set);
820 spin_unlock(&sysctl_lock);
821 return res;
822}
823
450static int proc_sys_compare(const struct dentry *parent, 824static int proc_sys_compare(const struct dentry *parent,
451 const struct inode *pinode, 825 const struct inode *pinode,
452 const struct dentry *dentry, const struct inode *inode, 826 const struct dentry *dentry, const struct inode *inode,
@@ -472,6 +846,753 @@ static const struct dentry_operations proc_sys_dentry_operations = {
472 .d_compare = proc_sys_compare, 846 .d_compare = proc_sys_compare,
473}; 847};
474 848
849static struct ctl_dir *find_subdir(struct ctl_dir *dir,
850 const char *name, int namelen)
851{
852 struct ctl_table_header *head;
853 struct ctl_table *entry;
854
855 entry = find_entry(&head, dir, name, namelen);
856 if (!entry)
857 return ERR_PTR(-ENOENT);
858 if (!S_ISDIR(entry->mode))
859 return ERR_PTR(-ENOTDIR);
860 return container_of(head, struct ctl_dir, header);
861}
862
863static struct ctl_dir *new_dir(struct ctl_table_set *set,
864 const char *name, int namelen)
865{
866 struct ctl_table *table;
867 struct ctl_dir *new;
868 struct ctl_node *node;
869 char *new_name;
870
871 new = kzalloc(sizeof(*new) + sizeof(struct ctl_node) +
872 sizeof(struct ctl_table)*2 + namelen + 1,
873 GFP_KERNEL);
874 if (!new)
875 return NULL;
876
877 node = (struct ctl_node *)(new + 1);
878 table = (struct ctl_table *)(node + 1);
879 new_name = (char *)(table + 2);
880 memcpy(new_name, name, namelen);
881 new_name[namelen] = '\0';
882 table[0].procname = new_name;
883 table[0].mode = S_IFDIR|S_IRUGO|S_IXUGO;
884 init_header(&new->header, set->dir.header.root, set, node, table);
885
886 return new;
887}
888
889/**
890 * get_subdir - find or create a subdir with the specified name.
891 * @dir: Directory to create the subdirectory in
892 * @name: The name of the subdirectory to find or create
893 * @namelen: The length of name
894 *
895 * Takes a directory with an elevated reference count so we know that
896 * if we drop the lock the directory will not go away. Upon success
897 * the reference is moved from @dir to the returned subdirectory.
898 * Upon error an error code is returned and the reference on @dir is
899 * simply dropped.
900 */
901static struct ctl_dir *get_subdir(struct ctl_dir *dir,
902 const char *name, int namelen)
903{
904 struct ctl_table_set *set = dir->header.set;
905 struct ctl_dir *subdir, *new = NULL;
906 int err;
907
908 spin_lock(&sysctl_lock);
909 subdir = find_subdir(dir, name, namelen);
910 if (!IS_ERR(subdir))
911 goto found;
912 if (PTR_ERR(subdir) != -ENOENT)
913 goto failed;
914
915 spin_unlock(&sysctl_lock);
916 new = new_dir(set, name, namelen);
917 spin_lock(&sysctl_lock);
918 subdir = ERR_PTR(-ENOMEM);
919 if (!new)
920 goto failed;
921
922 /* Was the subdir added while we dropped the lock? */
923 subdir = find_subdir(dir, name, namelen);
924 if (!IS_ERR(subdir))
925 goto found;
926 if (PTR_ERR(subdir) != -ENOENT)
927 goto failed;
928
929 /* Nope. Use our freshly made directory entry. */
930 err = insert_header(dir, &new->header);
931 subdir = ERR_PTR(err);
932 if (err)
933 goto failed;
934 subdir = new;
935found:
936 subdir->header.nreg++;
937failed:
938 if (unlikely(IS_ERR(subdir))) {
939 printk(KERN_ERR "sysctl could not get directory: ");
940 sysctl_print_dir(dir);
941 printk(KERN_CONT "/%*.*s %ld\n",
942 namelen, namelen, name, PTR_ERR(subdir));
943 }
944 drop_sysctl_table(&dir->header);
945 if (new)
946 drop_sysctl_table(&new->header);
947 spin_unlock(&sysctl_lock);
948 return subdir;
949}
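
get_subdir() is the standard optimistic pattern for allocating under a spinlock: drop the lock, allocate the new directory, retake the lock, and redo the lookup in case another CPU inserted the same name meanwhile, discarding the fresh copy on a lost race. A compilable miniature with a mutex and a single slot standing in for the rbtree:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static char *slot;			/* "the directory" */

static char *get_subdir(const char *name)
{
	char *found, *new = NULL;

	pthread_mutex_lock(&lock);
	found = (slot && !strcmp(slot, name)) ? slot : NULL;
	if (!found) {
		/* Not there: allocate with the lock dropped... */
		pthread_mutex_unlock(&lock);
		new = strdup(name);
		pthread_mutex_lock(&lock);
		/* ...then re-check: was it inserted while we slept? */
		found = (slot && !strcmp(slot, name)) ? slot : NULL;
		if (!found && new) {
			slot = new;	/* we won the race: insert ours */
			found = new;
			new = NULL;
		}
	}
	pthread_mutex_unlock(&lock);
	free(new);			/* raced and lost: discard our copy */
	return found;
}

int main(void)
{
	printf("%s\n", get_subdir("net"));
	printf("%s\n", get_subdir("net"));	/* second call just finds it */
	return 0;
}
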
950
951static struct ctl_dir *xlate_dir(struct ctl_table_set *set, struct ctl_dir *dir)
952{
953 struct ctl_dir *parent;
954 const char *procname;
955 if (!dir->header.parent)
956 return &set->dir;
957 parent = xlate_dir(set, dir->header.parent);
958 if (IS_ERR(parent))
959 return parent;
960 procname = dir->header.ctl_table[0].procname;
961 return find_subdir(parent, procname, strlen(procname));
962}
963
964static int sysctl_follow_link(struct ctl_table_header **phead,
965 struct ctl_table **pentry, struct nsproxy *namespaces)
966{
967 struct ctl_table_header *head;
968 struct ctl_table_root *root;
969 struct ctl_table_set *set;
970 struct ctl_table *entry;
971 struct ctl_dir *dir;
972 int ret;
973
974 ret = 0;
975 spin_lock(&sysctl_lock);
976 root = (*pentry)->data;
977 set = lookup_header_set(root, namespaces);
978 dir = xlate_dir(set, (*phead)->parent);
979 if (IS_ERR(dir))
980 ret = PTR_ERR(dir);
981 else {
982 const char *procname = (*pentry)->procname;
983 head = NULL;
984 entry = find_entry(&head, dir, procname, strlen(procname));
985 ret = -ENOENT;
986 if (entry && use_table(head)) {
987 unuse_table(*phead);
988 *phead = head;
989 *pentry = entry;
990 ret = 0;
991 }
992 }
993
994 spin_unlock(&sysctl_lock);
995 return ret;
996}
997
998static int sysctl_err(const char *path, struct ctl_table *table, char *fmt, ...)
999{
1000 struct va_format vaf;
1001 va_list args;
1002
1003 va_start(args, fmt);
1004 vaf.fmt = fmt;
1005 vaf.va = &args;
1006
1007 printk(KERN_ERR "sysctl table check failed: %s/%s %pV\n",
1008 path, table->procname, &vaf);
1009
1010 va_end(args);
1011 return -EINVAL;
1012}
1013
1014static int sysctl_check_table(const char *path, struct ctl_table *table)
1015{
1016 int err = 0;
1017 for (; table->procname; table++) {
1018 if (table->child)
1019 err = sysctl_err(path, table, "Not a file");
1020
1021 if ((table->proc_handler == proc_dostring) ||
1022 (table->proc_handler == proc_dointvec) ||
1023 (table->proc_handler == proc_dointvec_minmax) ||
1024 (table->proc_handler == proc_dointvec_jiffies) ||
1025 (table->proc_handler == proc_dointvec_userhz_jiffies) ||
1026 (table->proc_handler == proc_dointvec_ms_jiffies) ||
1027 (table->proc_handler == proc_doulongvec_minmax) ||
1028 (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) {
1029 if (!table->data)
1030 err = sysctl_err(path, table, "No data");
1031 if (!table->maxlen)
1032 err = sysctl_err(path, table, "No maxlen");
1033 }
1034 if (!table->proc_handler)
1035 err = sysctl_err(path, table, "No proc_handler");
1036
1037 if ((table->mode & (S_IRUGO|S_IWUGO)) != table->mode)
1038 err = sysctl_err(path, table, "bogus .mode 0%o",
1039 table->mode);
1040 }
1041 return err;
1042}
1043
1044static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table *table,
1045 struct ctl_table_root *link_root)
1046{
1047 struct ctl_table *link_table, *entry, *link;
1048 struct ctl_table_header *links;
1049 struct ctl_node *node;
1050 char *link_name;
1051 int nr_entries, name_bytes;
1052
1053 name_bytes = 0;
1054 nr_entries = 0;
1055 for (entry = table; entry->procname; entry++) {
1056 nr_entries++;
1057 name_bytes += strlen(entry->procname) + 1;
1058 }
1059
1060 links = kzalloc(sizeof(struct ctl_table_header) +
1061 sizeof(struct ctl_node)*nr_entries +
1062 sizeof(struct ctl_table)*(nr_entries + 1) +
1063 name_bytes,
1064 GFP_KERNEL);
1065
1066 if (!links)
1067 return NULL;
1068
1069 node = (struct ctl_node *)(links + 1);
1070 link_table = (struct ctl_table *)(node + nr_entries);
1071 link_name = (char *)&link_table[nr_entries + 1];
1072
1073 for (link = link_table, entry = table; entry->procname; link++, entry++) {
1074 int len = strlen(entry->procname) + 1;
1075 memcpy(link_name, entry->procname, len);
1076 link->procname = link_name;
1077 link->mode = S_IFLNK|S_IRWXUGO;
1078 link->data = link_root;
1079 link_name += len;
1080 }
1081 init_header(links, dir->header.root, dir->header.set, node, link_table);
1082 links->nreg = nr_entries;
1083
1084 return links;
1085}
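
new_dir() and new_links() both use the single-allocation layout trick: one kzalloc sized for the header, the ctl_node array, the ctl_table array plus its zero sentinel, and all the procname strings, carved up with pointer arithmetic so that a single kfree releases everything. The same carving in stand-alone C (struct names are simplified stand-ins for the ctl_* types):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct node   { struct node *rb; };	/* stands in for ctl_node */
struct entry  { const char *procname; };
struct header { struct node *node; struct entry *table; };

static struct header *new_links(const char *const *names, int n)
{
	struct header *h;
	struct node *node;
	struct entry *table;
	char *name;
	size_t bytes = 0;
	int i;

	for (i = 0; i < n; i++)
		bytes += strlen(names[i]) + 1;

	/* One block: header, n nodes, n+1 entries, all the strings. */
	h = calloc(1, sizeof(*h) + sizeof(*node) * n +
		      sizeof(*table) * (n + 1) + bytes);
	if (!h)
		return NULL;

	node  = (struct node *)(h + 1);
	table = (struct entry *)(node + n);
	name  = (char *)(table + n + 1);	/* entry n is the sentinel */

	for (i = 0; i < n; i++) {
		size_t len = strlen(names[i]) + 1;

		memcpy(name, names[i], len);
		table[i].procname = name;
		name += len;
	}
	h->node = node;
	h->table = table;
	return h;
}

int main(void)
{
	const char *const names[] = { "aio-nr", "aio-max-nr" };
	struct header *h = new_links(names, 2);

	if (h) {
		printf("%s %s\n", h->table[0].procname, h->table[1].procname);
		free(h);	/* one free tears the whole thing down */
	}
	return 0;
}
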
1086
1087static bool get_links(struct ctl_dir *dir,
1088 struct ctl_table *table, struct ctl_table_root *link_root)
1089{
1090 struct ctl_table_header *head;
1091 struct ctl_table *entry, *link;
1092
1093 /* Are there links available for every entry in table? */
1094 for (entry = table; entry->procname; entry++) {
1095 const char *procname = entry->procname;
1096 link = find_entry(&head, dir, procname, strlen(procname));
1097 if (!link)
1098 return false;
1099 if (S_ISDIR(link->mode) && S_ISDIR(entry->mode))
1100 continue;
1101 if (S_ISLNK(link->mode) && (link->data == link_root))
1102 continue;
1103 return false;
1104 }
1105
1106 /* The checks passed. Increase the registration count on the links */
1107 for (entry = table; entry->procname; entry++) {
1108 const char *procname = entry->procname;
1109 link = find_entry(&head, dir, procname, strlen(procname));
1110 head->nreg++;
1111 }
1112 return true;
1113}
1114
1115static int insert_links(struct ctl_table_header *head)
1116{
1117 struct ctl_table_set *root_set = &sysctl_table_root.default_set;
1118 struct ctl_dir *core_parent = NULL;
1119 struct ctl_table_header *links;
1120 int err;
1121
1122 if (head->set == root_set)
1123 return 0;
1124
1125 core_parent = xlate_dir(root_set, head->parent);
1126 if (IS_ERR(core_parent))
1127 return 0;
1128
1129 if (get_links(core_parent, head->ctl_table, head->root))
1130 return 0;
1131
1132 core_parent->header.nreg++;
1133 spin_unlock(&sysctl_lock);
1134
1135 links = new_links(core_parent, head->ctl_table, head->root);
1136
1137 spin_lock(&sysctl_lock);
1138 err = -ENOMEM;
1139 if (!links)
1140 goto out;
1141
1142 err = 0;
1143 if (get_links(core_parent, head->ctl_table, head->root)) {
1144 kfree(links);
1145 goto out;
1146 }
1147
1148 err = insert_header(core_parent, links);
1149 if (err)
1150 kfree(links);
1151out:
1152 drop_sysctl_table(&core_parent->header);
1153 return err;
1154}
1155
1156/**
1157 * __register_sysctl_table - register a leaf sysctl table
1158 * @set: Sysctl tree to register on
1159 * @path: The path to the directory the sysctl table is in.
1160 * @table: the top-level table structure
1161 *
1162 * Register a leaf sysctl table. @table should be a filled in ctl_table
1163 * array. A completely 0 filled entry terminates the table.
1164 *
1165 * The members of the &struct ctl_table structure are used as follows:
1166 *
1167 * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not
1168 * enter a sysctl file
1169 *
1170 * data - a pointer to data for use by proc_handler
1171 *
1172 * maxlen - the maximum size in bytes of the data
1173 *
1174 * mode - the file permissions for the /proc/sys file
1175 *
1176 * child - must be %NULL.
1177 *
1178 * proc_handler - the text handler routine (described below)
1179 *
1180 * extra1, extra2 - extra pointers usable by the proc handler routines
1181 *
1182 * Leaf nodes in the sysctl tree will be represented by a single file
1183 * under /proc; non-leaf nodes will be represented by directories.
1184 *
1185 * There must be a proc_handler routine for any terminal nodes.
1186 * Several default handlers are available to cover common cases -
1187 *
1188 * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(),
1189 * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(),
1190 * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax()
1191 *
1192 * It is the handler's job to read the input buffer from user memory
1193 * and process it. The handler should return 0 on success.
1194 *
1195 * This routine returns %NULL on a failure to register, and a pointer
1196 * to the table header on success.
1197 */
1198struct ctl_table_header *__register_sysctl_table(
1199 struct ctl_table_set *set,
1200 const char *path, struct ctl_table *table)
1201{
1202 struct ctl_table_root *root = set->dir.header.root;
1203 struct ctl_table_header *header;
1204 const char *name, *nextname;
1205 struct ctl_dir *dir;
1206 struct ctl_table *entry;
1207 struct ctl_node *node;
1208 int nr_entries = 0;
1209
1210 for (entry = table; entry->procname; entry++)
1211 nr_entries++;
1212
1213 header = kzalloc(sizeof(struct ctl_table_header) +
1214 sizeof(struct ctl_node)*nr_entries, GFP_KERNEL);
1215 if (!header)
1216 return NULL;
1217
1218 node = (struct ctl_node *)(header + 1);
1219 init_header(header, root, set, node, table);
1220 if (sysctl_check_table(path, table))
1221 goto fail;
1222
1223 spin_lock(&sysctl_lock);
1224 dir = &set->dir;
1225 /* Reference moved down the directory tree by get_subdir */
1226 dir->header.nreg++;
1227 spin_unlock(&sysctl_lock);
1228
1229 /* Find the directory for the ctl_table */
1230 for (name = path; name; name = nextname) {
1231 int namelen;
1232 nextname = strchr(name, '/');
1233 if (nextname) {
1234 namelen = nextname - name;
1235 nextname++;
1236 } else {
1237 namelen = strlen(name);
1238 }
1239 if (namelen == 0)
1240 continue;
1241
1242 dir = get_subdir(dir, name, namelen);
1243 if (IS_ERR(dir))
1244 goto fail;
1245 }
1246
1247 spin_lock(&sysctl_lock);
1248 if (insert_header(dir, header))
1249 goto fail_put_dir_locked;
1250
1251 drop_sysctl_table(&dir->header);
1252 spin_unlock(&sysctl_lock);
1253
1254 return header;
1255
1256fail_put_dir_locked:
1257 drop_sysctl_table(&dir->header);
1258 spin_unlock(&sysctl_lock);
1259fail:
1260 kfree(header);
1261 dump_stack();
1262 return NULL;
1263}
1264
1265/**
1266 * register_sysctl - register a sysctl table
1267 * @path: The path to the directory the sysctl table is in.
1268 * @table: the table structure
1269 *
1270 * Register a sysctl table. @table should be a filled in ctl_table
1271 * array. A completely 0 filled entry terminates the table.
1272 *
1273 * See __register_sysctl_table for more details.
1274 */
1275struct ctl_table_header *register_sysctl(const char *path, struct ctl_table *table)
1276{
1277 return __register_sysctl_table(&sysctl_table_root.default_set,
1278 path, table);
1279}
1280EXPORT_SYMBOL(register_sysctl);
1281
1282static char *append_path(const char *path, char *pos, const char *name)
1283{
1284 int namelen;
1285 namelen = strlen(name);
1286 if (((pos - path) + namelen + 2) >= PATH_MAX)
1287 return NULL;
1288 memcpy(pos, name, namelen);
1289 pos[namelen] = '/';
1290 pos[namelen + 1] = '\0';
1291 pos += namelen + 1;
1292 return pos;
1293}
1294
1295static int count_subheaders(struct ctl_table *table)
1296{
1297 int has_files = 0;
1298 int nr_subheaders = 0;
1299 struct ctl_table *entry;
1300
1301 /* special case: no directory and empty directory */
1302 if (!table || !table->procname)
1303 return 1;
1304
1305 for (entry = table; entry->procname; entry++) {
1306 if (entry->child)
1307 nr_subheaders += count_subheaders(entry->child);
1308 else
1309 has_files = 1;
1310 }
1311 return nr_subheaders + has_files;
1312}
1313
1314static int register_leaf_sysctl_tables(const char *path, char *pos,
1315 struct ctl_table_header ***subheader, struct ctl_table_set *set,
1316 struct ctl_table *table)
1317{
1318 struct ctl_table *ctl_table_arg = NULL;
1319 struct ctl_table *entry, *files;
1320 int nr_files = 0;
1321 int nr_dirs = 0;
1322 int err = -ENOMEM;
1323
1324 for (entry = table; entry->procname; entry++) {
1325 if (entry->child)
1326 nr_dirs++;
1327 else
1328 nr_files++;
1329 }
1330
1331 files = table;
1332 /* If there are mixed files and directories we need a new table */
1333 if (nr_dirs && nr_files) {
1334 struct ctl_table *new;
1335 files = kzalloc(sizeof(struct ctl_table) * (nr_files + 1),
1336 GFP_KERNEL);
1337 if (!files)
1338 goto out;
1339
1340 ctl_table_arg = files;
1341 for (new = files, entry = table; entry->procname; entry++) {
1342 if (entry->child)
1343 continue;
1344 *new = *entry;
1345 new++;
1346 }
1347 }
1348
1349 /* Register everything except a directory full of subdirectories */
1350 if (nr_files || !nr_dirs) {
1351 struct ctl_table_header *header;
1352 header = __register_sysctl_table(set, path, files);
1353 if (!header) {
1354 kfree(ctl_table_arg);
1355 goto out;
1356 }
1357
1358 /* Remember if we need to free the file table */
1359 header->ctl_table_arg = ctl_table_arg;
1360 **subheader = header;
1361 (*subheader)++;
1362 }
1363
1364 /* Recurse into the subdirectories. */
1365 for (entry = table; entry->procname; entry++) {
1366 char *child_pos;
1367
1368 if (!entry->child)
1369 continue;
1370
1371 err = -ENAMETOOLONG;
1372 child_pos = append_path(path, pos, entry->procname);
1373 if (!child_pos)
1374 goto out;
1375
1376 err = register_leaf_sysctl_tables(path, child_pos, subheader,
1377 set, entry->child);
1378 pos[0] = '\0';
1379 if (err)
1380 goto out;
1381 }
1382 err = 0;
1383out:
1384 /* On failure our caller will unregister all registered subheaders */
1385 return err;
1386}
1387
1388/**
1389 * __register_sysctl_paths - register a sysctl table hierarchy
1390 * @set: Sysctl tree to register on
1391 * @path: The path to the directory the sysctl table is in.
1392 * @table: the top-level table structure
1393 *
1394 * Register a sysctl table hierarchy. @table should be a filled in ctl_table
1395 * array. A completely 0 filled entry terminates the table.
1396 *
1397 * See __register_sysctl_table for more details.
1398 */
1399struct ctl_table_header *__register_sysctl_paths(
1400 struct ctl_table_set *set,
1401 const struct ctl_path *path, struct ctl_table *table)
1402{
1403 struct ctl_table *ctl_table_arg = table;
1404 int nr_subheaders = count_subheaders(table);
1405 struct ctl_table_header *header = NULL, **subheaders, **subheader;
1406 const struct ctl_path *component;
1407 char *new_path, *pos;
1408
1409 pos = new_path = kmalloc(PATH_MAX, GFP_KERNEL);
1410 if (!new_path)
1411 return NULL;
1412
1413 pos[0] = '\0';
1414 for (component = path; component->procname; component++) {
1415 pos = append_path(new_path, pos, component->procname);
1416 if (!pos)
1417 goto out;
1418 }
1419 while (table->procname && table->child && !table[1].procname) {
1420 pos = append_path(new_path, pos, table->procname);
1421 if (!pos)
1422 goto out;
1423 table = table->child;
1424 }
1425 if (nr_subheaders == 1) {
1426 header = __register_sysctl_table(set, new_path, table);
1427 if (header)
1428 header->ctl_table_arg = ctl_table_arg;
1429 } else {
1430 header = kzalloc(sizeof(*header) +
1431 sizeof(*subheaders)*nr_subheaders, GFP_KERNEL);
1432 if (!header)
1433 goto out;
1434
1435 subheaders = (struct ctl_table_header **) (header + 1);
1436 subheader = subheaders;
1437 header->ctl_table_arg = ctl_table_arg;
1438
1439 if (register_leaf_sysctl_tables(new_path, pos, &subheader,
1440 set, table))
1441 goto err_register_leaves;
1442 }
1443
1444out:
1445 kfree(new_path);
1446 return header;
1447
1448err_register_leaves:
1449 while (subheader > subheaders) {
1450 struct ctl_table_header *subh = *(--subheader);
1451 struct ctl_table *table = subh->ctl_table_arg;
1452 unregister_sysctl_table(subh);
1453 kfree(table);
1454 }
1455 kfree(header);
1456 header = NULL;
1457 goto out;
1458}
1459
1460/**
1461 * register_sysctl_paths - register a sysctl table hierarchy
1462 * @path: The path to the directory the sysctl table is in.
1463 * @table: the top-level table structure
1464 *
1465 * Register a sysctl table hierarchy. @table should be a filled-in ctl_table
1466 * array. A completely zero-filled entry terminates the table.
1467 *
1468 * See __register_sysctl_paths for more details.
1469 */
1470struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
1471 struct ctl_table *table)
1472{
1473 return __register_sysctl_paths(&sysctl_table_root.default_set,
1474 path, table);
1475}
1476EXPORT_SYMBOL(register_sysctl_paths);
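
As a usage sketch of this interface (every demo_* name below is invented), a caller wanting its knob to show up as /proc/sys/net/demo/value would build a terminated ctl_path array alongside the table:

static int demo_value;				/* invented backing variable */

static const struct ctl_path demo_path[] = {
	{ .procname = "net" },
	{ .procname = "demo" },
	{ }					/* terminator */
};

static struct ctl_table demo_ctl[] = {
	{
		.procname	= "value",
		.data		= &demo_value,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};

static struct ctl_table_header *demo_header;

static int __init demo_sysctl_init(void)
{
	demo_header = register_sysctl_paths(demo_path, demo_ctl);
	return demo_header ? 0 : -ENOMEM;
}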
1477
1478/**
1479 * register_sysctl_table - register a sysctl table hierarchy
1480 * @table: the top-level table structure
1481 *
1482 * Register a sysctl table hierarchy. @table should be a filled-in ctl_table
1483 * array. A completely zero-filled entry terminates the table.
1484 *
1485 * See register_sysctl_paths for more details.
1486 */
1487struct ctl_table_header *register_sysctl_table(struct ctl_table *table)
1488{
1489 static const struct ctl_path null_path[] = { {} };
1490
1491 return register_sysctl_paths(null_path, table);
1492}
1493EXPORT_SYMBOL(register_sysctl_table);
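
When no path components are needed, the table registers at the sysctl root and teardown is symmetric in both cases; a sketch reusing the hypothetical demo_ctl from above:

static int __init demo_init(void)
{
	demo_header = register_sysctl_table(demo_ctl);	/* /proc/sys/value */
	return demo_header ? 0 : -ENOMEM;
}

static void __exit demo_exit(void)
{
	/* NULL-safe; may sleep, so never call under a spinlock */
	unregister_sysctl_table(demo_header);
}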
1494
1495static void put_links(struct ctl_table_header *header)
1496{
1497 struct ctl_table_set *root_set = &sysctl_table_root.default_set;
1498 struct ctl_table_root *root = header->root;
1499 struct ctl_dir *parent = header->parent;
1500 struct ctl_dir *core_parent;
1501 struct ctl_table *entry;
1502
1503 if (header->set == root_set)
1504 return;
1505
1506 core_parent = xlate_dir(root_set, parent);
1507 if (IS_ERR(core_parent))
1508 return;
1509
1510 for (entry = header->ctl_table; entry->procname; entry++) {
1511 struct ctl_table_header *link_head;
1512 struct ctl_table *link;
1513 const char *name = entry->procname;
1514
1515 link = find_entry(&link_head, core_parent, name, strlen(name));
1516 if (link &&
1517 ((S_ISDIR(link->mode) && S_ISDIR(entry->mode)) ||
1518 (S_ISLNK(link->mode) && (link->data == root)))) {
1519 drop_sysctl_table(link_head);
1520 }
1521 else {
1522 printk(KERN_ERR "sysctl link missing during unregister: ");
1523 sysctl_print_dir(parent);
1524 printk(KERN_CONT "/%s\n", name);
1525 }
1526 }
1527}
1528
1529static void drop_sysctl_table(struct ctl_table_header *header)
1530{
1531 struct ctl_dir *parent = header->parent;
1532
1533 if (--header->nreg)
1534 return;
1535
1536 put_links(header);
1537 start_unregistering(header);
1538 if (!--header->count)
1539 kfree_rcu(header, rcu);
1540
1541 if (parent)
1542 drop_sysctl_table(&parent->header);
1543}
1544
1545/**
1546 * unregister_sysctl_table - unregister a sysctl table hierarchy
1547 * @header: the header returned from register_sysctl_table
1548 *
 1549 * Unregisters the sysctl table and all children. The /proc entries may not
 1550 * actually be removed until they are no longer used by anyone.
1551 */
1552void unregister_sysctl_table(struct ctl_table_header * header)
1553{
1554 int nr_subheaders;
1555 might_sleep();
1556
1557 if (header == NULL)
1558 return;
1559
1560 nr_subheaders = count_subheaders(header->ctl_table_arg);
1561 if (unlikely(nr_subheaders > 1)) {
1562 struct ctl_table_header **subheaders;
1563 int i;
1564
1565 subheaders = (struct ctl_table_header **)(header + 1);
 1566 for (i = nr_subheaders - 1; i >= 0; i--) {
1567 struct ctl_table_header *subh = subheaders[i];
1568 struct ctl_table *table = subh->ctl_table_arg;
1569 unregister_sysctl_table(subh);
1570 kfree(table);
1571 }
1572 kfree(header);
1573 return;
1574 }
1575
1576 spin_lock(&sysctl_lock);
1577 drop_sysctl_table(header);
1578 spin_unlock(&sysctl_lock);
1579}
1580EXPORT_SYMBOL(unregister_sysctl_table);
1581
1582void setup_sysctl_set(struct ctl_table_set *set,
1583 struct ctl_table_root *root,
1584 int (*is_seen)(struct ctl_table_set *))
1585{
1586 memset(set, 0, sizeof(*set));
1587 set->is_seen = is_seen;
1588 init_header(&set->dir.header, root, set, NULL, root_table);
1589}
1590
1591void retire_sysctl_set(struct ctl_table_set *set)
1592{
1593 WARN_ON(!RB_EMPTY_ROOT(&set->dir.root));
1594}
1595
475int __init proc_sys_init(void) 1596int __init proc_sys_init(void)
476{ 1597{
477 struct proc_dir_entry *proc_sys_root; 1598 struct proc_dir_entry *proc_sys_root;
@@ -480,5 +1601,6 @@ int __init proc_sys_init(void)
480 proc_sys_root->proc_iops = &proc_sys_dir_operations; 1601 proc_sys_root->proc_iops = &proc_sys_dir_operations;
481 proc_sys_root->proc_fops = &proc_sys_dir_file_operations; 1602 proc_sys_root->proc_fops = &proc_sys_dir_file_operations;
482 proc_sys_root->nlink = 0; 1603 proc_sys_root->nlink = 0;
483 return 0; 1604
1605 return sysctl_init();
484} 1606}
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 121f77cfef76..6a0c62d6e442 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -89,18 +89,19 @@ static int show_stat(struct seq_file *p, void *v)
89 } 89 }
90 sum += arch_irq_stat(); 90 sum += arch_irq_stat();
91 91
92 seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu " 92 seq_puts(p, "cpu ");
93 "%llu\n", 93 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(user));
94 (unsigned long long)cputime64_to_clock_t(user), 94 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(nice));
95 (unsigned long long)cputime64_to_clock_t(nice), 95 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(system));
96 (unsigned long long)cputime64_to_clock_t(system), 96 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(idle));
97 (unsigned long long)cputime64_to_clock_t(idle), 97 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(iowait));
98 (unsigned long long)cputime64_to_clock_t(iowait), 98 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(irq));
99 (unsigned long long)cputime64_to_clock_t(irq), 99 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(softirq));
100 (unsigned long long)cputime64_to_clock_t(softirq), 100 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(steal));
101 (unsigned long long)cputime64_to_clock_t(steal), 101 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest));
102 (unsigned long long)cputime64_to_clock_t(guest), 102 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest_nice));
103 (unsigned long long)cputime64_to_clock_t(guest_nice)); 103 seq_putc(p, '\n');
104
104 for_each_online_cpu(i) { 105 for_each_online_cpu(i) {
105 /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ 106 /* Copy values here to work around gcc-2.95.3, gcc-2.96 */
106 user = kcpustat_cpu(i).cpustat[CPUTIME_USER]; 107 user = kcpustat_cpu(i).cpustat[CPUTIME_USER];
@@ -113,26 +114,24 @@ static int show_stat(struct seq_file *p, void *v)
113 steal = kcpustat_cpu(i).cpustat[CPUTIME_STEAL]; 114 steal = kcpustat_cpu(i).cpustat[CPUTIME_STEAL];
114 guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST]; 115 guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
115 guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE]; 116 guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
116 seq_printf(p, 117 seq_printf(p, "cpu%d", i);
117 "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu " 118 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(user));
118 "%llu\n", 119 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(nice));
119 i, 120 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(system));
120 (unsigned long long)cputime64_to_clock_t(user), 121 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(idle));
121 (unsigned long long)cputime64_to_clock_t(nice), 122 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(iowait));
122 (unsigned long long)cputime64_to_clock_t(system), 123 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(irq));
123 (unsigned long long)cputime64_to_clock_t(idle), 124 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(softirq));
124 (unsigned long long)cputime64_to_clock_t(iowait), 125 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(steal));
125 (unsigned long long)cputime64_to_clock_t(irq), 126 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest));
126 (unsigned long long)cputime64_to_clock_t(softirq), 127 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest_nice));
127 (unsigned long long)cputime64_to_clock_t(steal), 128 seq_putc(p, '\n');
128 (unsigned long long)cputime64_to_clock_t(guest),
129 (unsigned long long)cputime64_to_clock_t(guest_nice));
130 } 129 }
131 seq_printf(p, "intr %llu", (unsigned long long)sum); 130 seq_printf(p, "intr %llu", (unsigned long long)sum);
132 131
133 /* sum again ? it could be updated? */ 132 /* sum again ? it could be updated? */
134 for_each_irq_nr(j) 133 for_each_irq_nr(j)
135 seq_printf(p, " %u", kstat_irqs(j)); 134 seq_put_decimal_ull(p, ' ', kstat_irqs(j));
136 135
137 seq_printf(p, 136 seq_printf(p,
138 "\nctxt %llu\n" 137 "\nctxt %llu\n"
@@ -149,7 +148,7 @@ static int show_stat(struct seq_file *p, void *v)
149 seq_printf(p, "softirq %llu", (unsigned long long)sum_softirq); 148 seq_printf(p, "softirq %llu", (unsigned long long)sum_softirq);
150 149
151 for (i = 0; i < NR_SOFTIRQS; i++) 150 for (i = 0; i < NR_SOFTIRQS; i++)
152 seq_printf(p, " %u", per_softirq_sums[i]); 151 seq_put_decimal_ull(p, ' ', per_softirq_sums[i]);
153 seq_putc(p, '\n'); 152 seq_putc(p, '\n');
154 153
155 return 0; 154 return 0;
@@ -157,11 +156,14 @@ static int show_stat(struct seq_file *p, void *v)
157 156
158static int stat_open(struct inode *inode, struct file *file) 157static int stat_open(struct inode *inode, struct file *file)
159{ 158{
160 unsigned size = 4096 * (1 + num_possible_cpus() / 32); 159 unsigned size = 1024 + 128 * num_possible_cpus();
161 char *buf; 160 char *buf;
162 struct seq_file *m; 161 struct seq_file *m;
163 int res; 162 int res;
164 163
 164 /* minimum size to display an interrupt count: 2 bytes */
165 size += 2 * nr_irqs;
166
165 /* don't ask for more than the kmalloc() max size */ 167 /* don't ask for more than the kmalloc() max size */
166 if (size > KMALLOC_MAX_SIZE) 168 if (size > KMALLOC_MAX_SIZE)
167 size = KMALLOC_MAX_SIZE; 169 size = KMALLOC_MAX_SIZE;
@@ -173,7 +175,7 @@ static int stat_open(struct inode *inode, struct file *file)
173 if (!res) { 175 if (!res) {
174 m = file->private_data; 176 m = file->private_data;
175 m->buf = buf; 177 m->buf = buf;
176 m->size = size; 178 m->size = ksize(buf);
177 } else 179 } else
178 kfree(buf); 180 kfree(buf);
179 return res; 181 return res;
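
The effect of the new sizing heuristic is easiest to see with sample numbers (machine parameters invented): for 8 possible CPUs and 256 IRQs, the old formula reserved 4096 * (1 + 8/32) = 4096 bytes with no regard for nr_irqs, while the new one yields 1024 + 128*8 + 2*256 = 2560 bytes. A condensed sketch of the calculation:

static unsigned int stat_buf_estimate(unsigned int cpus, unsigned int irqs)
{
	unsigned int size = 1024 + 128 * cpus;	/* fixed part + per-cpu lines */

	size += 2 * irqs;	/* at least " 0" == 2 bytes per interrupt */
	return min_t(unsigned int, size, KMALLOC_MAX_SIZE);
}

/* stat_buf_estimate(8, 256) == 2560; the old formula gave 4096 */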
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 9ec22d3b4293..82c585f715e3 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -68,9 +68,25 @@ void pstore_set_kmsg_bytes(int bytes)
68/* Tag each group of saved records with a sequence number */ 68/* Tag each group of saved records with a sequence number */
69static int oopscount; 69static int oopscount;
70 70
71static char *reason_str[] = { 71static const char *get_reason_str(enum kmsg_dump_reason reason)
72 "Oops", "Panic", "Kexec", "Restart", "Halt", "Poweroff", "Emergency" 72{
73}; 73 switch (reason) {
74 case KMSG_DUMP_PANIC:
75 return "Panic";
76 case KMSG_DUMP_OOPS:
77 return "Oops";
78 case KMSG_DUMP_EMERG:
79 return "Emergency";
80 case KMSG_DUMP_RESTART:
81 return "Restart";
82 case KMSG_DUMP_HALT:
83 return "Halt";
84 case KMSG_DUMP_POWEROFF:
85 return "Poweroff";
86 default:
87 return "Unknown";
88 }
89}
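
What the switch buys over the removed array is robustness against enum churn; the old pattern (reconstructed below from the deleted lines) was only correct while the array order happened to track the numeric order of enum kmsg_dump_reason:

static const char *reason_str_old[] = {
	"Oops", "Panic", "Kexec", "Restart", "Halt", "Poweroff", "Emergency"
};

/* silently returns the wrong name if the enum is ever reordered */
const char *why_old = (reason < ARRAY_SIZE(reason_str_old))
			? reason_str_old[reason] : "Unknown";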
74 90
75/* 91/*
76 * callback from kmsg_dump. (s2,l2) has the most recently 92 * callback from kmsg_dump. (s2,l2) has the most recently
@@ -85,17 +101,15 @@ static void pstore_dump(struct kmsg_dumper *dumper,
85 unsigned long s1_start, s2_start; 101 unsigned long s1_start, s2_start;
86 unsigned long l1_cpy, l2_cpy; 102 unsigned long l1_cpy, l2_cpy;
87 unsigned long size, total = 0; 103 unsigned long size, total = 0;
88 char *dst, *why; 104 char *dst;
105 const char *why;
89 u64 id; 106 u64 id;
90 int hsize, ret; 107 int hsize, ret;
91 unsigned int part = 1; 108 unsigned int part = 1;
92 unsigned long flags = 0; 109 unsigned long flags = 0;
93 int is_locked = 0; 110 int is_locked = 0;
94 111
95 if (reason < ARRAY_SIZE(reason_str)) 112 why = get_reason_str(reason);
96 why = reason_str[reason];
97 else
98 why = "Unknown";
99 113
100 if (in_nmi()) { 114 if (in_nmi()) {
101 is_locked = spin_trylock(&psinfo->buf_lock); 115 is_locked = spin_trylock(&psinfo->buf_lock);
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 8b4f12b33f57..d69a1d1d7e15 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -1110,6 +1110,13 @@ static void dquot_decr_space(struct dquot *dquot, qsize_t number)
1110 clear_bit(DQ_BLKS_B, &dquot->dq_flags); 1110 clear_bit(DQ_BLKS_B, &dquot->dq_flags);
1111} 1111}
1112 1112
1113struct dquot_warn {
1114 struct super_block *w_sb;
1115 qid_t w_dq_id;
1116 short w_dq_type;
1117 short w_type;
1118};
1119
1113static int warning_issued(struct dquot *dquot, const int warntype) 1120static int warning_issued(struct dquot *dquot, const int warntype)
1114{ 1121{
1115 int flag = (warntype == QUOTA_NL_BHARDWARN || 1122 int flag = (warntype == QUOTA_NL_BHARDWARN ||
@@ -1125,41 +1132,42 @@ static int warning_issued(struct dquot *dquot, const int warntype)
1125#ifdef CONFIG_PRINT_QUOTA_WARNING 1132#ifdef CONFIG_PRINT_QUOTA_WARNING
1126static int flag_print_warnings = 1; 1133static int flag_print_warnings = 1;
1127 1134
1128static int need_print_warning(struct dquot *dquot) 1135static int need_print_warning(struct dquot_warn *warn)
1129{ 1136{
1130 if (!flag_print_warnings) 1137 if (!flag_print_warnings)
1131 return 0; 1138 return 0;
1132 1139
1133 switch (dquot->dq_type) { 1140 switch (warn->w_dq_type) {
1134 case USRQUOTA: 1141 case USRQUOTA:
1135 return current_fsuid() == dquot->dq_id; 1142 return current_fsuid() == warn->w_dq_id;
1136 case GRPQUOTA: 1143 case GRPQUOTA:
1137 return in_group_p(dquot->dq_id); 1144 return in_group_p(warn->w_dq_id);
1138 } 1145 }
1139 return 0; 1146 return 0;
1140} 1147}
1141 1148
1142/* Print warning to user which exceeded quota */ 1149/* Print warning to user which exceeded quota */
1143static void print_warning(struct dquot *dquot, const int warntype) 1150static void print_warning(struct dquot_warn *warn)
1144{ 1151{
1145 char *msg = NULL; 1152 char *msg = NULL;
1146 struct tty_struct *tty; 1153 struct tty_struct *tty;
1154 int warntype = warn->w_type;
1147 1155
1148 if (warntype == QUOTA_NL_IHARDBELOW || 1156 if (warntype == QUOTA_NL_IHARDBELOW ||
1149 warntype == QUOTA_NL_ISOFTBELOW || 1157 warntype == QUOTA_NL_ISOFTBELOW ||
1150 warntype == QUOTA_NL_BHARDBELOW || 1158 warntype == QUOTA_NL_BHARDBELOW ||
1151 warntype == QUOTA_NL_BSOFTBELOW || !need_print_warning(dquot)) 1159 warntype == QUOTA_NL_BSOFTBELOW || !need_print_warning(warn))
1152 return; 1160 return;
1153 1161
1154 tty = get_current_tty(); 1162 tty = get_current_tty();
1155 if (!tty) 1163 if (!tty)
1156 return; 1164 return;
1157 tty_write_message(tty, dquot->dq_sb->s_id); 1165 tty_write_message(tty, warn->w_sb->s_id);
1158 if (warntype == QUOTA_NL_ISOFTWARN || warntype == QUOTA_NL_BSOFTWARN) 1166 if (warntype == QUOTA_NL_ISOFTWARN || warntype == QUOTA_NL_BSOFTWARN)
1159 tty_write_message(tty, ": warning, "); 1167 tty_write_message(tty, ": warning, ");
1160 else 1168 else
1161 tty_write_message(tty, ": write failed, "); 1169 tty_write_message(tty, ": write failed, ");
1162 tty_write_message(tty, quotatypes[dquot->dq_type]); 1170 tty_write_message(tty, quotatypes[warn->w_dq_type]);
1163 switch (warntype) { 1171 switch (warntype) {
1164 case QUOTA_NL_IHARDWARN: 1172 case QUOTA_NL_IHARDWARN:
1165 msg = " file limit reached.\r\n"; 1173 msg = " file limit reached.\r\n";
@@ -1185,26 +1193,34 @@ static void print_warning(struct dquot *dquot, const int warntype)
1185} 1193}
1186#endif 1194#endif
1187 1195
1196static void prepare_warning(struct dquot_warn *warn, struct dquot *dquot,
1197 int warntype)
1198{
1199 if (warning_issued(dquot, warntype))
1200 return;
1201 warn->w_type = warntype;
1202 warn->w_sb = dquot->dq_sb;
1203 warn->w_dq_id = dquot->dq_id;
1204 warn->w_dq_type = dquot->dq_type;
1205}
1206
1188/* 1207/*
1189 * Write warnings to the console and send warning messages over netlink. 1208 * Write warnings to the console and send warning messages over netlink.
1190 * 1209 *
1191 * Note that this function can sleep. 1210 * Note that this function can call into tty and networking code.
1192 */ 1211 */
1193static void flush_warnings(struct dquot *const *dquots, char *warntype) 1212static void flush_warnings(struct dquot_warn *warn)
1194{ 1213{
1195 struct dquot *dq;
1196 int i; 1214 int i;
1197 1215
1198 for (i = 0; i < MAXQUOTAS; i++) { 1216 for (i = 0; i < MAXQUOTAS; i++) {
1199 dq = dquots[i]; 1217 if (warn[i].w_type == QUOTA_NL_NOWARN)
1200 if (dq && warntype[i] != QUOTA_NL_NOWARN && 1218 continue;
1201 !warning_issued(dq, warntype[i])) {
1202#ifdef CONFIG_PRINT_QUOTA_WARNING 1219#ifdef CONFIG_PRINT_QUOTA_WARNING
1203 print_warning(dq, warntype[i]); 1220 print_warning(&warn[i]);
1204#endif 1221#endif
1205 quota_send_warning(dq->dq_type, dq->dq_id, 1222 quota_send_warning(warn[i].w_dq_type, warn[i].w_dq_id,
1206 dq->dq_sb->s_dev, warntype[i]); 1223 warn[i].w_sb->s_dev, warn[i].w_type);
1207 }
1208 } 1224 }
1209} 1225}
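
The calling convention, condensed from the call sites converted later in this patch: the warn array is initialized to QUOTA_NL_NOWARN, filled via prepare_warning() while dq_data_lock is held, and flushed only after every lock is dropped, because flush_warnings() may reach tty and netlink code that can sleep.

struct dquot_warn warn[MAXQUOTAS];
int cnt;

for (cnt = 0; cnt < MAXQUOTAS; cnt++)
	warn[cnt].w_type = QUOTA_NL_NOWARN;

spin_lock(&dq_data_lock);
/* check_idq()/check_bdq() record failures through prepare_warning() */
spin_unlock(&dq_data_lock);

flush_warnings(warn);	/* no spinlocks held: free to sleep */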
1210 1226
@@ -1218,11 +1234,11 @@ static int ignore_hardlimit(struct dquot *dquot)
1218} 1234}
1219 1235
1220/* needs dq_data_lock */ 1236/* needs dq_data_lock */
1221static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype) 1237static int check_idq(struct dquot *dquot, qsize_t inodes,
1238 struct dquot_warn *warn)
1222{ 1239{
1223 qsize_t newinodes = dquot->dq_dqb.dqb_curinodes + inodes; 1240 qsize_t newinodes = dquot->dq_dqb.dqb_curinodes + inodes;
1224 1241
1225 *warntype = QUOTA_NL_NOWARN;
1226 if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) || 1242 if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) ||
1227 test_bit(DQ_FAKE_B, &dquot->dq_flags)) 1243 test_bit(DQ_FAKE_B, &dquot->dq_flags))
1228 return 0; 1244 return 0;
@@ -1230,7 +1246,7 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)
1230 if (dquot->dq_dqb.dqb_ihardlimit && 1246 if (dquot->dq_dqb.dqb_ihardlimit &&
1231 newinodes > dquot->dq_dqb.dqb_ihardlimit && 1247 newinodes > dquot->dq_dqb.dqb_ihardlimit &&
1232 !ignore_hardlimit(dquot)) { 1248 !ignore_hardlimit(dquot)) {
1233 *warntype = QUOTA_NL_IHARDWARN; 1249 prepare_warning(warn, dquot, QUOTA_NL_IHARDWARN);
1234 return -EDQUOT; 1250 return -EDQUOT;
1235 } 1251 }
1236 1252
@@ -1239,14 +1255,14 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)
1239 dquot->dq_dqb.dqb_itime && 1255 dquot->dq_dqb.dqb_itime &&
1240 get_seconds() >= dquot->dq_dqb.dqb_itime && 1256 get_seconds() >= dquot->dq_dqb.dqb_itime &&
1241 !ignore_hardlimit(dquot)) { 1257 !ignore_hardlimit(dquot)) {
1242 *warntype = QUOTA_NL_ISOFTLONGWARN; 1258 prepare_warning(warn, dquot, QUOTA_NL_ISOFTLONGWARN);
1243 return -EDQUOT; 1259 return -EDQUOT;
1244 } 1260 }
1245 1261
1246 if (dquot->dq_dqb.dqb_isoftlimit && 1262 if (dquot->dq_dqb.dqb_isoftlimit &&
1247 newinodes > dquot->dq_dqb.dqb_isoftlimit && 1263 newinodes > dquot->dq_dqb.dqb_isoftlimit &&
1248 dquot->dq_dqb.dqb_itime == 0) { 1264 dquot->dq_dqb.dqb_itime == 0) {
1249 *warntype = QUOTA_NL_ISOFTWARN; 1265 prepare_warning(warn, dquot, QUOTA_NL_ISOFTWARN);
1250 dquot->dq_dqb.dqb_itime = get_seconds() + 1266 dquot->dq_dqb.dqb_itime = get_seconds() +
1251 sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_igrace; 1267 sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_igrace;
1252 } 1268 }
@@ -1255,12 +1271,12 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)
1255} 1271}
1256 1272
1257/* needs dq_data_lock */ 1273/* needs dq_data_lock */
1258static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *warntype) 1274static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc,
1275 struct dquot_warn *warn)
1259{ 1276{
1260 qsize_t tspace; 1277 qsize_t tspace;
1261 struct super_block *sb = dquot->dq_sb; 1278 struct super_block *sb = dquot->dq_sb;
1262 1279
1263 *warntype = QUOTA_NL_NOWARN;
1264 if (!sb_has_quota_limits_enabled(sb, dquot->dq_type) || 1280 if (!sb_has_quota_limits_enabled(sb, dquot->dq_type) ||
1265 test_bit(DQ_FAKE_B, &dquot->dq_flags)) 1281 test_bit(DQ_FAKE_B, &dquot->dq_flags))
1266 return 0; 1282 return 0;
@@ -1272,7 +1288,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
1272 tspace > dquot->dq_dqb.dqb_bhardlimit && 1288 tspace > dquot->dq_dqb.dqb_bhardlimit &&
1273 !ignore_hardlimit(dquot)) { 1289 !ignore_hardlimit(dquot)) {
1274 if (!prealloc) 1290 if (!prealloc)
1275 *warntype = QUOTA_NL_BHARDWARN; 1291 prepare_warning(warn, dquot, QUOTA_NL_BHARDWARN);
1276 return -EDQUOT; 1292 return -EDQUOT;
1277 } 1293 }
1278 1294
@@ -1282,7 +1298,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
1282 get_seconds() >= dquot->dq_dqb.dqb_btime && 1298 get_seconds() >= dquot->dq_dqb.dqb_btime &&
1283 !ignore_hardlimit(dquot)) { 1299 !ignore_hardlimit(dquot)) {
1284 if (!prealloc) 1300 if (!prealloc)
1285 *warntype = QUOTA_NL_BSOFTLONGWARN; 1301 prepare_warning(warn, dquot, QUOTA_NL_BSOFTLONGWARN);
1286 return -EDQUOT; 1302 return -EDQUOT;
1287 } 1303 }
1288 1304
@@ -1290,7 +1306,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
1290 tspace > dquot->dq_dqb.dqb_bsoftlimit && 1306 tspace > dquot->dq_dqb.dqb_bsoftlimit &&
1291 dquot->dq_dqb.dqb_btime == 0) { 1307 dquot->dq_dqb.dqb_btime == 0) {
1292 if (!prealloc) { 1308 if (!prealloc) {
1293 *warntype = QUOTA_NL_BSOFTWARN; 1309 prepare_warning(warn, dquot, QUOTA_NL_BSOFTWARN);
1294 dquot->dq_dqb.dqb_btime = get_seconds() + 1310 dquot->dq_dqb.dqb_btime = get_seconds() +
1295 sb_dqopt(sb)->info[dquot->dq_type].dqi_bgrace; 1311 sb_dqopt(sb)->info[dquot->dq_type].dqi_bgrace;
1296 } 1312 }
@@ -1543,10 +1559,9 @@ static void inode_decr_space(struct inode *inode, qsize_t number, int reserve)
1543int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags) 1559int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
1544{ 1560{
1545 int cnt, ret = 0; 1561 int cnt, ret = 0;
1546 char warntype[MAXQUOTAS]; 1562 struct dquot_warn warn[MAXQUOTAS];
1547 int warn = flags & DQUOT_SPACE_WARN; 1563 struct dquot **dquots = inode->i_dquot;
1548 int reserve = flags & DQUOT_SPACE_RESERVE; 1564 int reserve = flags & DQUOT_SPACE_RESERVE;
1549 int nofail = flags & DQUOT_SPACE_NOFAIL;
1550 1565
1551 /* 1566 /*
1552 * First test before acquiring mutex - solves deadlocks when we 1567 * First test before acquiring mutex - solves deadlocks when we
@@ -1559,36 +1574,36 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
1559 1574
1560 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1575 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1561 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 1576 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1562 warntype[cnt] = QUOTA_NL_NOWARN; 1577 warn[cnt].w_type = QUOTA_NL_NOWARN;
1563 1578
1564 spin_lock(&dq_data_lock); 1579 spin_lock(&dq_data_lock);
1565 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1580 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1566 if (!inode->i_dquot[cnt]) 1581 if (!dquots[cnt])
1567 continue; 1582 continue;
1568 ret = check_bdq(inode->i_dquot[cnt], number, !warn, 1583 ret = check_bdq(dquots[cnt], number,
1569 warntype+cnt); 1584 !(flags & DQUOT_SPACE_WARN), &warn[cnt]);
1570 if (ret && !nofail) { 1585 if (ret && !(flags & DQUOT_SPACE_NOFAIL)) {
1571 spin_unlock(&dq_data_lock); 1586 spin_unlock(&dq_data_lock);
1572 goto out_flush_warn; 1587 goto out_flush_warn;
1573 } 1588 }
1574 } 1589 }
1575 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1590 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1576 if (!inode->i_dquot[cnt]) 1591 if (!dquots[cnt])
1577 continue; 1592 continue;
1578 if (reserve) 1593 if (reserve)
1579 dquot_resv_space(inode->i_dquot[cnt], number); 1594 dquot_resv_space(dquots[cnt], number);
1580 else 1595 else
1581 dquot_incr_space(inode->i_dquot[cnt], number); 1596 dquot_incr_space(dquots[cnt], number);
1582 } 1597 }
1583 inode_incr_space(inode, number, reserve); 1598 inode_incr_space(inode, number, reserve);
1584 spin_unlock(&dq_data_lock); 1599 spin_unlock(&dq_data_lock);
1585 1600
1586 if (reserve) 1601 if (reserve)
1587 goto out_flush_warn; 1602 goto out_flush_warn;
1588 mark_all_dquot_dirty(inode->i_dquot); 1603 mark_all_dquot_dirty(dquots);
1589out_flush_warn: 1604out_flush_warn:
1590 flush_warnings(inode->i_dquot, warntype);
1591 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1605 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1606 flush_warnings(warn);
1592out: 1607out:
1593 return ret; 1608 return ret;
1594} 1609}
@@ -1600,36 +1615,37 @@ EXPORT_SYMBOL(__dquot_alloc_space);
1600int dquot_alloc_inode(const struct inode *inode) 1615int dquot_alloc_inode(const struct inode *inode)
1601{ 1616{
1602 int cnt, ret = 0; 1617 int cnt, ret = 0;
1603 char warntype[MAXQUOTAS]; 1618 struct dquot_warn warn[MAXQUOTAS];
1619 struct dquot * const *dquots = inode->i_dquot;
1604 1620
1605 /* First test before acquiring mutex - solves deadlocks when we 1621 /* First test before acquiring mutex - solves deadlocks when we
1606 * re-enter the quota code and are already holding the mutex */ 1622 * re-enter the quota code and are already holding the mutex */
1607 if (!dquot_active(inode)) 1623 if (!dquot_active(inode))
1608 return 0; 1624 return 0;
1609 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 1625 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1610 warntype[cnt] = QUOTA_NL_NOWARN; 1626 warn[cnt].w_type = QUOTA_NL_NOWARN;
1611 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1627 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1612 spin_lock(&dq_data_lock); 1628 spin_lock(&dq_data_lock);
1613 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1629 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1614 if (!inode->i_dquot[cnt]) 1630 if (!dquots[cnt])
1615 continue; 1631 continue;
1616 ret = check_idq(inode->i_dquot[cnt], 1, warntype + cnt); 1632 ret = check_idq(dquots[cnt], 1, &warn[cnt]);
1617 if (ret) 1633 if (ret)
1618 goto warn_put_all; 1634 goto warn_put_all;
1619 } 1635 }
1620 1636
1621 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1637 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1622 if (!inode->i_dquot[cnt]) 1638 if (!dquots[cnt])
1623 continue; 1639 continue;
1624 dquot_incr_inodes(inode->i_dquot[cnt], 1); 1640 dquot_incr_inodes(dquots[cnt], 1);
1625 } 1641 }
1626 1642
1627warn_put_all: 1643warn_put_all:
1628 spin_unlock(&dq_data_lock); 1644 spin_unlock(&dq_data_lock);
1629 if (ret == 0) 1645 if (ret == 0)
1630 mark_all_dquot_dirty(inode->i_dquot); 1646 mark_all_dquot_dirty(dquots);
1631 flush_warnings(inode->i_dquot, warntype);
1632 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1647 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1648 flush_warnings(warn);
1633 return ret; 1649 return ret;
1634} 1650}
1635EXPORT_SYMBOL(dquot_alloc_inode); 1651EXPORT_SYMBOL(dquot_alloc_inode);
@@ -1669,7 +1685,8 @@ EXPORT_SYMBOL(dquot_claim_space_nodirty);
1669void __dquot_free_space(struct inode *inode, qsize_t number, int flags) 1685void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
1670{ 1686{
1671 unsigned int cnt; 1687 unsigned int cnt;
1672 char warntype[MAXQUOTAS]; 1688 struct dquot_warn warn[MAXQUOTAS];
1689 struct dquot **dquots = inode->i_dquot;
1673 int reserve = flags & DQUOT_SPACE_RESERVE; 1690 int reserve = flags & DQUOT_SPACE_RESERVE;
1674 1691
1675 /* First test before acquiring mutex - solves deadlocks when we 1692 /* First test before acquiring mutex - solves deadlocks when we
@@ -1682,23 +1699,28 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
1682 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1699 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1683 spin_lock(&dq_data_lock); 1700 spin_lock(&dq_data_lock);
1684 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1701 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1685 if (!inode->i_dquot[cnt]) 1702 int wtype;
1703
1704 warn[cnt].w_type = QUOTA_NL_NOWARN;
1705 if (!dquots[cnt])
1686 continue; 1706 continue;
1687 warntype[cnt] = info_bdq_free(inode->i_dquot[cnt], number); 1707 wtype = info_bdq_free(dquots[cnt], number);
1708 if (wtype != QUOTA_NL_NOWARN)
1709 prepare_warning(&warn[cnt], dquots[cnt], wtype);
1688 if (reserve) 1710 if (reserve)
1689 dquot_free_reserved_space(inode->i_dquot[cnt], number); 1711 dquot_free_reserved_space(dquots[cnt], number);
1690 else 1712 else
1691 dquot_decr_space(inode->i_dquot[cnt], number); 1713 dquot_decr_space(dquots[cnt], number);
1692 } 1714 }
1693 inode_decr_space(inode, number, reserve); 1715 inode_decr_space(inode, number, reserve);
1694 spin_unlock(&dq_data_lock); 1716 spin_unlock(&dq_data_lock);
1695 1717
1696 if (reserve) 1718 if (reserve)
1697 goto out_unlock; 1719 goto out_unlock;
1698 mark_all_dquot_dirty(inode->i_dquot); 1720 mark_all_dquot_dirty(dquots);
1699out_unlock: 1721out_unlock:
1700 flush_warnings(inode->i_dquot, warntype);
1701 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1722 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1723 flush_warnings(warn);
1702} 1724}
1703EXPORT_SYMBOL(__dquot_free_space); 1725EXPORT_SYMBOL(__dquot_free_space);
1704 1726
@@ -1708,7 +1730,8 @@ EXPORT_SYMBOL(__dquot_free_space);
1708void dquot_free_inode(const struct inode *inode) 1730void dquot_free_inode(const struct inode *inode)
1709{ 1731{
1710 unsigned int cnt; 1732 unsigned int cnt;
1711 char warntype[MAXQUOTAS]; 1733 struct dquot_warn warn[MAXQUOTAS];
1734 struct dquot * const *dquots = inode->i_dquot;
1712 1735
1713 /* First test before acquiring mutex - solves deadlocks when we 1736 /* First test before acquiring mutex - solves deadlocks when we
1714 * re-enter the quota code and are already holding the mutex */ 1737 * re-enter the quota code and are already holding the mutex */
@@ -1718,15 +1741,20 @@ void dquot_free_inode(const struct inode *inode)
1718 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1741 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1719 spin_lock(&dq_data_lock); 1742 spin_lock(&dq_data_lock);
1720 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1743 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1721 if (!inode->i_dquot[cnt]) 1744 int wtype;
1745
1746 warn[cnt].w_type = QUOTA_NL_NOWARN;
1747 if (!dquots[cnt])
1722 continue; 1748 continue;
1723 warntype[cnt] = info_idq_free(inode->i_dquot[cnt], 1); 1749 wtype = info_idq_free(dquots[cnt], 1);
1724 dquot_decr_inodes(inode->i_dquot[cnt], 1); 1750 if (wtype != QUOTA_NL_NOWARN)
1751 prepare_warning(&warn[cnt], dquots[cnt], wtype);
1752 dquot_decr_inodes(dquots[cnt], 1);
1725 } 1753 }
1726 spin_unlock(&dq_data_lock); 1754 spin_unlock(&dq_data_lock);
1727 mark_all_dquot_dirty(inode->i_dquot); 1755 mark_all_dquot_dirty(dquots);
1728 flush_warnings(inode->i_dquot, warntype);
1729 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1756 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1757 flush_warnings(warn);
1730} 1758}
1731EXPORT_SYMBOL(dquot_free_inode); 1759EXPORT_SYMBOL(dquot_free_inode);
1732 1760
@@ -1747,16 +1775,20 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
1747 struct dquot *transfer_from[MAXQUOTAS] = {}; 1775 struct dquot *transfer_from[MAXQUOTAS] = {};
1748 int cnt, ret = 0; 1776 int cnt, ret = 0;
1749 char is_valid[MAXQUOTAS] = {}; 1777 char is_valid[MAXQUOTAS] = {};
1750 char warntype_to[MAXQUOTAS]; 1778 struct dquot_warn warn_to[MAXQUOTAS];
1751 char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS]; 1779 struct dquot_warn warn_from_inodes[MAXQUOTAS];
1780 struct dquot_warn warn_from_space[MAXQUOTAS];
1752 1781
1753 /* First test before acquiring mutex - solves deadlocks when we 1782 /* First test before acquiring mutex - solves deadlocks when we
1754 * re-enter the quota code and are already holding the mutex */ 1783 * re-enter the quota code and are already holding the mutex */
1755 if (IS_NOQUOTA(inode)) 1784 if (IS_NOQUOTA(inode))
1756 return 0; 1785 return 0;
1757 /* Initialize the arrays */ 1786 /* Initialize the arrays */
1758 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 1787 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1759 warntype_to[cnt] = QUOTA_NL_NOWARN; 1788 warn_to[cnt].w_type = QUOTA_NL_NOWARN;
1789 warn_from_inodes[cnt].w_type = QUOTA_NL_NOWARN;
1790 warn_from_space[cnt].w_type = QUOTA_NL_NOWARN;
1791 }
1760 down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1792 down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1761 if (IS_NOQUOTA(inode)) { /* File without quota accounting? */ 1793 if (IS_NOQUOTA(inode)) { /* File without quota accounting? */
1762 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1794 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
@@ -1778,10 +1810,10 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
1778 continue; 1810 continue;
1779 is_valid[cnt] = 1; 1811 is_valid[cnt] = 1;
1780 transfer_from[cnt] = inode->i_dquot[cnt]; 1812 transfer_from[cnt] = inode->i_dquot[cnt];
1781 ret = check_idq(transfer_to[cnt], 1, warntype_to + cnt); 1813 ret = check_idq(transfer_to[cnt], 1, &warn_to[cnt]);
1782 if (ret) 1814 if (ret)
1783 goto over_quota; 1815 goto over_quota;
1784 ret = check_bdq(transfer_to[cnt], space, 0, warntype_to + cnt); 1816 ret = check_bdq(transfer_to[cnt], space, 0, &warn_to[cnt]);
1785 if (ret) 1817 if (ret)
1786 goto over_quota; 1818 goto over_quota;
1787 } 1819 }
@@ -1794,10 +1826,15 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
1794 continue; 1826 continue;
1795 /* Due to IO error we might not have transfer_from[] structure */ 1827 /* Due to IO error we might not have transfer_from[] structure */
1796 if (transfer_from[cnt]) { 1828 if (transfer_from[cnt]) {
1797 warntype_from_inodes[cnt] = 1829 int wtype;
1798 info_idq_free(transfer_from[cnt], 1); 1830 wtype = info_idq_free(transfer_from[cnt], 1);
1799 warntype_from_space[cnt] = 1831 if (wtype != QUOTA_NL_NOWARN)
1800 info_bdq_free(transfer_from[cnt], space); 1832 prepare_warning(&warn_from_inodes[cnt],
1833 transfer_from[cnt], wtype);
1834 wtype = info_bdq_free(transfer_from[cnt], space);
1835 if (wtype != QUOTA_NL_NOWARN)
1836 prepare_warning(&warn_from_space[cnt],
1837 transfer_from[cnt], wtype);
1801 dquot_decr_inodes(transfer_from[cnt], 1); 1838 dquot_decr_inodes(transfer_from[cnt], 1);
1802 dquot_decr_space(transfer_from[cnt], cur_space); 1839 dquot_decr_space(transfer_from[cnt], cur_space);
1803 dquot_free_reserved_space(transfer_from[cnt], 1840 dquot_free_reserved_space(transfer_from[cnt],
@@ -1815,9 +1852,9 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
1815 1852
1816 mark_all_dquot_dirty(transfer_from); 1853 mark_all_dquot_dirty(transfer_from);
1817 mark_all_dquot_dirty(transfer_to); 1854 mark_all_dquot_dirty(transfer_to);
1818 flush_warnings(transfer_to, warntype_to); 1855 flush_warnings(warn_to);
1819 flush_warnings(transfer_from, warntype_from_inodes); 1856 flush_warnings(warn_from_inodes);
1820 flush_warnings(transfer_from, warntype_from_space); 1857 flush_warnings(warn_from_space);
1821 /* Pass back references to put */ 1858 /* Pass back references to put */
1822 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 1859 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1823 if (is_valid[cnt]) 1860 if (is_valid[cnt])
@@ -1826,7 +1863,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
1826over_quota: 1863over_quota:
1827 spin_unlock(&dq_data_lock); 1864 spin_unlock(&dq_data_lock);
1828 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1865 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1829 flush_warnings(transfer_to, warntype_to); 1866 flush_warnings(warn_to);
1830 return ret; 1867 return ret;
1831} 1868}
1832EXPORT_SYMBOL(__dquot_transfer); 1869EXPORT_SYMBOL(__dquot_transfer);
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index fc2c4388d126..9a391204ca27 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -282,10 +282,9 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
282 case Q_XGETQUOTA: 282 case Q_XGETQUOTA:
283 return quota_getxquota(sb, type, id, addr); 283 return quota_getxquota(sb, type, id, addr);
284 case Q_XQUOTASYNC: 284 case Q_XQUOTASYNC:
285 /* caller already holds s_umount */
286 if (sb->s_flags & MS_RDONLY) 285 if (sb->s_flags & MS_RDONLY)
287 return -EROFS; 286 return -EROFS;
288 writeback_inodes_sb(sb, WB_REASON_SYNC); 287 /* XFS quotas are fully coherent now, making this call a noop */
289 return 0; 288 return 0;
290 default: 289 default:
291 return -EINVAL; 290 return -EINVAL;
diff --git a/fs/read_write.c b/fs/read_write.c
index 5ad4248b0cd8..ffc99d22e0a3 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -11,7 +11,7 @@
11#include <linux/uio.h> 11#include <linux/uio.h>
12#include <linux/fsnotify.h> 12#include <linux/fsnotify.h>
13#include <linux/security.h> 13#include <linux/security.h>
14#include <linux/module.h> 14#include <linux/export.h>
15#include <linux/syscalls.h> 15#include <linux/syscalls.h>
16#include <linux/pagemap.h> 16#include <linux/pagemap.h>
17#include <linux/splice.h> 17#include <linux/splice.h>
diff --git a/fs/readdir.c b/fs/readdir.c
index 356f71528ad6..cc0a8227cddf 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -6,7 +6,7 @@
6 6
7#include <linux/stddef.h> 7#include <linux/stddef.h>
8#include <linux/kernel.h> 8#include <linux/kernel.h>
9#include <linux/module.h> 9#include <linux/export.h>
10#include <linux/time.h> 10#include <linux/time.h>
11#include <linux/mm.h> 11#include <linux/mm.h>
12#include <linux/errno.h> 12#include <linux/errno.h>
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index 445d768eea44..a59d27126338 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -7,6 +7,7 @@
7#include <linux/slab.h> 7#include <linux/slab.h>
8#include <linux/interrupt.h> 8#include <linux/interrupt.h>
9#include <linux/sched.h> 9#include <linux/sched.h>
10#include <linux/bug.h>
10#include <linux/workqueue.h> 11#include <linux/workqueue.h>
11#include <asm/unaligned.h> 12#include <asm/unaligned.h>
12#include <linux/bitops.h> 13#include <linux/bitops.h>
diff --git a/fs/select.c b/fs/select.c
index e782258d0de3..6fb8943d580b 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -17,7 +17,7 @@
17#include <linux/kernel.h> 17#include <linux/kernel.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/syscalls.h> 19#include <linux/syscalls.h>
20#include <linux/module.h> 20#include <linux/export.h>
21#include <linux/slab.h> 21#include <linux/slab.h>
22#include <linux/poll.h> 22#include <linux/poll.h>
23#include <linux/personality.h> /* for STICKY_TIMEOUTS */ 23#include <linux/personality.h> /* for STICKY_TIMEOUTS */
@@ -223,7 +223,7 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
223 get_file(filp); 223 get_file(filp);
224 entry->filp = filp; 224 entry->filp = filp;
225 entry->wait_address = wait_address; 225 entry->wait_address = wait_address;
226 entry->key = p->key; 226 entry->key = p->_key;
227 init_waitqueue_func_entry(&entry->wait, pollwake); 227 init_waitqueue_func_entry(&entry->wait, pollwake);
228 entry->wait.private = pwq; 228 entry->wait.private = pwq;
229 add_wait_queue(wait_address, &entry->wait); 229 add_wait_queue(wait_address, &entry->wait);
@@ -386,13 +386,11 @@ get_max:
386static inline void wait_key_set(poll_table *wait, unsigned long in, 386static inline void wait_key_set(poll_table *wait, unsigned long in,
387 unsigned long out, unsigned long bit) 387 unsigned long out, unsigned long bit)
388{ 388{
389 if (wait) { 389 wait->_key = POLLEX_SET;
390 wait->key = POLLEX_SET; 390 if (in & bit)
391 if (in & bit) 391 wait->_key |= POLLIN_SET;
392 wait->key |= POLLIN_SET; 392 if (out & bit)
393 if (out & bit) 393 wait->_key |= POLLOUT_SET;
394 wait->key |= POLLOUT_SET;
395 }
396} 394}
397 395
398int do_select(int n, fd_set_bits *fds, struct timespec *end_time) 396int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
@@ -414,7 +412,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
414 poll_initwait(&table); 412 poll_initwait(&table);
415 wait = &table.pt; 413 wait = &table.pt;
416 if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { 414 if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
417 wait = NULL; 415 wait->_qproc = NULL;
418 timed_out = 1; 416 timed_out = 1;
419 } 417 }
420 418
@@ -459,17 +457,17 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
459 if ((mask & POLLIN_SET) && (in & bit)) { 457 if ((mask & POLLIN_SET) && (in & bit)) {
460 res_in |= bit; 458 res_in |= bit;
461 retval++; 459 retval++;
462 wait = NULL; 460 wait->_qproc = NULL;
463 } 461 }
464 if ((mask & POLLOUT_SET) && (out & bit)) { 462 if ((mask & POLLOUT_SET) && (out & bit)) {
465 res_out |= bit; 463 res_out |= bit;
466 retval++; 464 retval++;
467 wait = NULL; 465 wait->_qproc = NULL;
468 } 466 }
469 if ((mask & POLLEX_SET) && (ex & bit)) { 467 if ((mask & POLLEX_SET) && (ex & bit)) {
470 res_ex |= bit; 468 res_ex |= bit;
471 retval++; 469 retval++;
472 wait = NULL; 470 wait->_qproc = NULL;
473 } 471 }
474 } 472 }
475 } 473 }
@@ -481,7 +479,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
481 *rexp = res_ex; 479 *rexp = res_ex;
482 cond_resched(); 480 cond_resched();
483 } 481 }
484 wait = NULL; 482 wait->_qproc = NULL;
485 if (retval || timed_out || signal_pending(current)) 483 if (retval || timed_out || signal_pending(current))
486 break; 484 break;
487 if (table.error) { 485 if (table.error) {
@@ -720,7 +718,7 @@ struct poll_list {
720 * interested in events matching the pollfd->events mask, and the result 718 * interested in events matching the pollfd->events mask, and the result
721 * matching that mask is both recorded in pollfd->revents and returned. The 719 * matching that mask is both recorded in pollfd->revents and returned. The
722 * pwait poll_table will be used by the fd-provided poll handler for waiting, 720 * pwait poll_table will be used by the fd-provided poll handler for waiting,
723 * if non-NULL. 721 * if pwait->_qproc is non-NULL.
724 */ 722 */
725static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait) 723static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
726{ 724{
@@ -738,9 +736,7 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
738 if (file != NULL) { 736 if (file != NULL) {
739 mask = DEFAULT_POLLMASK; 737 mask = DEFAULT_POLLMASK;
740 if (file->f_op && file->f_op->poll) { 738 if (file->f_op && file->f_op->poll) {
741 if (pwait) 739 pwait->_key = pollfd->events|POLLERR|POLLHUP;
742 pwait->key = pollfd->events |
743 POLLERR | POLLHUP;
744 mask = file->f_op->poll(file, pwait); 740 mask = file->f_op->poll(file, pwait);
745 } 741 }
746 /* Mask out unneeded events. */ 742 /* Mask out unneeded events. */
@@ -763,7 +759,7 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
763 759
764 /* Optimise the no-wait case */ 760 /* Optimise the no-wait case */
765 if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { 761 if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
766 pt = NULL; 762 pt->_qproc = NULL;
767 timed_out = 1; 763 timed_out = 1;
768 } 764 }
769 765
@@ -781,22 +777,22 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
781 for (; pfd != pfd_end; pfd++) { 777 for (; pfd != pfd_end; pfd++) {
782 /* 778 /*
783 * Fish for events. If we found one, record it 779 * Fish for events. If we found one, record it
784 * and kill the poll_table, so we don't 780 * and kill poll_table->_qproc, so we don't
785 * needlessly register any other waiters after 781 * needlessly register any other waiters after
786 * this. They'll get immediately deregistered 782 * this. They'll get immediately deregistered
787 * when we break out and return. 783 * when we break out and return.
788 */ 784 */
789 if (do_pollfd(pfd, pt)) { 785 if (do_pollfd(pfd, pt)) {
790 count++; 786 count++;
791 pt = NULL; 787 pt->_qproc = NULL;
792 } 788 }
793 } 789 }
794 } 790 }
795 /* 791 /*
796 * All waiters have already been registered, so don't provide 792 * All waiters have already been registered, so don't provide
797 * a poll_table to them on the next loop iteration. 793 * a poll_table->_qproc to them on the next loop iteration.
798 */ 794 */
799 pt = NULL; 795 pt->_qproc = NULL;
800 if (!count) { 796 if (!count) {
801 count = wait->error; 797 count = wait->error;
802 if (signal_pending(current)) 798 if (signal_pending(current))
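
Driver code needs no change for this: poll_wait() already tests the queueing callback before invoking it, so a poll_table whose _qproc has been cleared behaves exactly like the NULL table the old code passed (sketch, foo_* names invented):

static DECLARE_WAIT_QUEUE_HEAD(foo_wq);
static bool foo_ready;

static unsigned int foo_poll(struct file *file, poll_table *wait)
{
	/* a no-op once do_poll()/do_select() has cleared wait->_qproc */
	poll_wait(file, &foo_wq, wait);
	return foo_ready ? (POLLIN | POLLRDNORM) : 0;
}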
diff --git a/fs/seq_file.c b/fs/seq_file.c
index aa242dc99373..0cbd0494b79e 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -6,13 +6,29 @@
6 */ 6 */
7 7
8#include <linux/fs.h> 8#include <linux/fs.h>
9#include <linux/module.h> 9#include <linux/export.h>
10#include <linux/seq_file.h> 10#include <linux/seq_file.h>
11#include <linux/slab.h> 11#include <linux/slab.h>
12 12
13#include <asm/uaccess.h> 13#include <asm/uaccess.h>
14#include <asm/page.h> 14#include <asm/page.h>
15 15
16
17/*
 18 * seq_files have a buffer which may overflow. When this happens, a larger
19 * buffer is reallocated and all the data will be printed again.
20 * The overflow state is true when m->count == m->size.
21 */
22static bool seq_overflow(struct seq_file *m)
23{
24 return m->count == m->size;
25}
26
27static void seq_set_overflow(struct seq_file *m)
28{
29 m->count = m->size;
30}
31
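
Any helper that writes into m->buf directly is expected to follow the same protocol as the routines below: when the buffer is too small, flag it with seq_set_overflow() and return -1, and the core will rerun the show() callback with a larger buffer. A hypothetical single-byte helper for illustration:

static int seq_put_hex_digit(struct seq_file *m, unsigned int v)
{
	if (m->count >= m->size) {
		seq_set_overflow(m);	/* show() is retried, buffer doubled */
		return -1;
	}
	m->buf[m->count++] = "0123456789abcdef"[v & 0xf];
	return 0;
}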
16/** 32/**
17 * seq_open - initialize sequential file 33 * seq_open - initialize sequential file
18 * @file: file we initialize 34 * @file: file we initialize
@@ -92,7 +108,7 @@ static int traverse(struct seq_file *m, loff_t offset)
92 error = 0; 108 error = 0;
93 m->count = 0; 109 m->count = 0;
94 } 110 }
95 if (m->count == m->size) 111 if (seq_overflow(m))
96 goto Eoverflow; 112 goto Eoverflow;
97 if (pos + m->count > offset) { 113 if (pos + m->count > offset) {
98 m->from = offset - pos; 114 m->from = offset - pos;
@@ -234,7 +250,7 @@ Fill:
234 break; 250 break;
235 } 251 }
236 err = m->op->show(m, p); 252 err = m->op->show(m, p);
237 if (m->count == m->size || err) { 253 if (seq_overflow(m) || err) {
238 m->count = offs; 254 m->count = offs;
239 if (likely(err <= 0)) 255 if (likely(err <= 0))
240 break; 256 break;
@@ -361,7 +377,7 @@ int seq_escape(struct seq_file *m, const char *s, const char *esc)
361 *p++ = '0' + (c & 07); 377 *p++ = '0' + (c & 07);
362 continue; 378 continue;
363 } 379 }
364 m->count = m->size; 380 seq_set_overflow(m);
365 return -1; 381 return -1;
366 } 382 }
367 m->count = p - m->buf; 383 m->count = p - m->buf;
@@ -383,7 +399,7 @@ int seq_printf(struct seq_file *m, const char *f, ...)
383 return 0; 399 return 0;
384 } 400 }
385 } 401 }
386 m->count = m->size; 402 seq_set_overflow(m);
387 return -1; 403 return -1;
388} 404}
389EXPORT_SYMBOL(seq_printf); 405EXPORT_SYMBOL(seq_printf);
@@ -512,7 +528,7 @@ int seq_bitmap(struct seq_file *m, const unsigned long *bits,
512 return 0; 528 return 0;
513 } 529 }
514 } 530 }
515 m->count = m->size; 531 seq_set_overflow(m);
516 return -1; 532 return -1;
517} 533}
518EXPORT_SYMBOL(seq_bitmap); 534EXPORT_SYMBOL(seq_bitmap);
@@ -528,7 +544,7 @@ int seq_bitmap_list(struct seq_file *m, const unsigned long *bits,
528 return 0; 544 return 0;
529 } 545 }
530 } 546 }
531 m->count = m->size; 547 seq_set_overflow(m);
532 return -1; 548 return -1;
533} 549}
534EXPORT_SYMBOL(seq_bitmap_list); 550EXPORT_SYMBOL(seq_bitmap_list);
@@ -639,11 +655,63 @@ int seq_puts(struct seq_file *m, const char *s)
639 m->count += len; 655 m->count += len;
640 return 0; 656 return 0;
641 } 657 }
642 m->count = m->size; 658 seq_set_overflow(m);
643 return -1; 659 return -1;
644} 660}
645EXPORT_SYMBOL(seq_puts); 661EXPORT_SYMBOL(seq_puts);
646 662
663/*
 664 * A helper routine for writing decimal numbers without the rich formatting
 665 * of printf(); only 'unsigned long long' is supported.
 666 * It writes one delimiter byte followed by the number into the seq_file.
 667 * This routine is much faster than seq_printf() when emitting many numbers,
 668 * but in the usual case seq_printf() remains the more readable choice.
669 */
670int seq_put_decimal_ull(struct seq_file *m, char delimiter,
671 unsigned long long num)
672{
673 int len;
674
 675 if (m->count + 2 >= m->size) /* we will write at least 2 bytes */
676 goto overflow;
677
678 if (delimiter)
679 m->buf[m->count++] = delimiter;
680
681 if (num < 10) {
682 m->buf[m->count++] = num + '0';
683 return 0;
684 }
685
686 len = num_to_str(m->buf + m->count, m->size - m->count, num);
687 if (!len)
688 goto overflow;
689 m->count += len;
690 return 0;
691overflow:
692 seq_set_overflow(m);
693 return -1;
694}
695EXPORT_SYMBOL(seq_put_decimal_ull);
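
Usage mirrors the /proc/stat conversion earlier in this patch: write the row prefix with seq_puts()/seq_printf(), then one seq_put_decimal_ull() per value with the separator as the delimiter (sketch, foo_* names invented):

static int foo_show(struct seq_file *m, void *v)
{
	unsigned long long counts[3] = { 7, 42, 9000 };	/* sample data */
	int i;

	seq_puts(m, "foo");
	for (i = 0; i < 3; i++)
		seq_put_decimal_ull(m, ' ', counts[i]);
	seq_putc(m, '\n');	/* emits "foo 7 42 9000\n" */
	return 0;
}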
696
697int seq_put_decimal_ll(struct seq_file *m, char delimiter,
698 long long num)
699{
700 if (num < 0) {
701 if (m->count + 3 >= m->size) {
702 seq_set_overflow(m);
703 return -1;
704 }
705 if (delimiter)
706 m->buf[m->count++] = delimiter;
707 num = -num;
708 delimiter = '-';
709 }
710 return seq_put_decimal_ull(m, delimiter, num);
711
712}
713EXPORT_SYMBOL(seq_put_decimal_ll);
714
647/** 715/**
648 * seq_write - write arbitrary data to buffer 716 * seq_write - write arbitrary data to buffer
649 * @seq: seq_file identifying the buffer to which data should be written 717 * @seq: seq_file identifying the buffer to which data should be written
@@ -659,7 +727,7 @@ int seq_write(struct seq_file *seq, const void *data, size_t len)
659 seq->count += len; 727 seq->count += len;
660 return 0; 728 return 0;
661 } 729 }
662 seq->count = seq->size; 730 seq_set_overflow(seq);
663 return -1; 731 return -1;
664} 732}
665EXPORT_SYMBOL(seq_write); 733EXPORT_SYMBOL(seq_write);
diff --git a/fs/splice.c b/fs/splice.c
index f16402ed915c..5f883de7ef3a 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -25,7 +25,7 @@
25#include <linux/mm_inline.h> 25#include <linux/mm_inline.h>
26#include <linux/swap.h> 26#include <linux/swap.h>
27#include <linux/writeback.h> 27#include <linux/writeback.h>
28#include <linux/module.h> 28#include <linux/export.h>
29#include <linux/syscalls.h> 29#include <linux/syscalls.h>
30#include <linux/uio.h> 30#include <linux/uio.h>
31#include <linux/security.h> 31#include <linux/security.h>
diff --git a/fs/stack.c b/fs/stack.c
index 9c11519245a6..5b5388250e29 100644
--- a/fs/stack.c
+++ b/fs/stack.c
@@ -1,4 +1,4 @@
1#include <linux/module.h> 1#include <linux/export.h>
2#include <linux/fs.h> 2#include <linux/fs.h>
3#include <linux/fs_stack.h> 3#include <linux/fs_stack.h>
4 4
diff --git a/fs/stat.c b/fs/stat.c
index 86f13563a463..c733dc5753ae 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -4,7 +4,7 @@
4 * Copyright (C) 1991, 1992 Linus Torvalds 4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */ 5 */
6 6
7#include <linux/module.h> 7#include <linux/export.h>
8#include <linux/mm.h> 8#include <linux/mm.h>
9#include <linux/errno.h> 9#include <linux/errno.h>
10#include <linux/file.h> 10#include <linux/file.h>
diff --git a/fs/statfs.c b/fs/statfs.c
index 2aa6a22e0be2..43e6b6fe4e85 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -1,5 +1,5 @@
1#include <linux/syscalls.h> 1#include <linux/syscalls.h>
2#include <linux/module.h> 2#include <linux/export.h>
3#include <linux/fs.h> 3#include <linux/fs.h>
4#include <linux/file.h> 4#include <linux/file.h>
5#include <linux/mount.h> 5#include <linux/mount.h>
diff --git a/fs/super.c b/fs/super.c
index 7fcb1354c554..cf001775617f 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -20,7 +20,7 @@
20 * Heavily rewritten for 'one fs - one tree' dcache architecture. AV, Mar 2000 20 * Heavily rewritten for 'one fs - one tree' dcache architecture. AV, Mar 2000
21 */ 21 */
22 22
23#include <linux/module.h> 23#include <linux/export.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/acct.h> 25#include <linux/acct.h>
26#include <linux/blkdev.h> 26#include <linux/blkdev.h>
diff --git a/fs/sync.c b/fs/sync.c
index f3501ef39235..0e8db939d96f 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -6,7 +6,7 @@
6#include <linux/file.h> 6#include <linux/file.h>
7#include <linux/fs.h> 7#include <linux/fs.h>
8#include <linux/slab.h> 8#include <linux/slab.h>
9#include <linux/module.h> 9#include <linux/export.h>
10#include <linux/namei.h> 10#include <linux/namei.h>
11#include <linux/sched.h> 11#include <linux/sched.h>
12#include <linux/writeback.h> 12#include <linux/writeback.h>
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index f922cbacdb96..1934084e2088 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -36,7 +36,7 @@
36 36
37#ifdef CONFIG_UBIFS_FS_DEBUG 37#ifdef CONFIG_UBIFS_FS_DEBUG
38 38
39DEFINE_SPINLOCK(dbg_lock); 39static DEFINE_SPINLOCK(dbg_lock);
40 40
41static const char *get_key_fmt(int fmt) 41static const char *get_key_fmt(int fmt)
42{ 42{
@@ -221,15 +221,15 @@ const char *dbg_jhead(int jhead)
221 221
222static void dump_ch(const struct ubifs_ch *ch) 222static void dump_ch(const struct ubifs_ch *ch)
223{ 223{
224 printk(KERN_DEBUG "\tmagic %#x\n", le32_to_cpu(ch->magic)); 224 printk(KERN_ERR "\tmagic %#x\n", le32_to_cpu(ch->magic));
225 printk(KERN_DEBUG "\tcrc %#x\n", le32_to_cpu(ch->crc)); 225 printk(KERN_ERR "\tcrc %#x\n", le32_to_cpu(ch->crc));
226 printk(KERN_DEBUG "\tnode_type %d (%s)\n", ch->node_type, 226 printk(KERN_ERR "\tnode_type %d (%s)\n", ch->node_type,
227 dbg_ntype(ch->node_type)); 227 dbg_ntype(ch->node_type));
228 printk(KERN_DEBUG "\tgroup_type %d (%s)\n", ch->group_type, 228 printk(KERN_ERR "\tgroup_type %d (%s)\n", ch->group_type,
229 dbg_gtype(ch->group_type)); 229 dbg_gtype(ch->group_type));
230 printk(KERN_DEBUG "\tsqnum %llu\n", 230 printk(KERN_ERR "\tsqnum %llu\n",
231 (unsigned long long)le64_to_cpu(ch->sqnum)); 231 (unsigned long long)le64_to_cpu(ch->sqnum));
232 printk(KERN_DEBUG "\tlen %u\n", le32_to_cpu(ch->len)); 232 printk(KERN_ERR "\tlen %u\n", le32_to_cpu(ch->len));
233} 233}
234 234
235void dbg_dump_inode(struct ubifs_info *c, const struct inode *inode) 235void dbg_dump_inode(struct ubifs_info *c, const struct inode *inode)
@@ -240,43 +240,43 @@ void dbg_dump_inode(struct ubifs_info *c, const struct inode *inode)
240 struct ubifs_dent_node *dent, *pdent = NULL; 240 struct ubifs_dent_node *dent, *pdent = NULL;
241 int count = 2; 241 int count = 2;
242 242
243 printk(KERN_DEBUG "Dump in-memory inode:"); 243 printk(KERN_ERR "Dump in-memory inode:");
244 printk(KERN_DEBUG "\tinode %lu\n", inode->i_ino); 244 printk(KERN_ERR "\tinode %lu\n", inode->i_ino);
245 printk(KERN_DEBUG "\tsize %llu\n", 245 printk(KERN_ERR "\tsize %llu\n",
246 (unsigned long long)i_size_read(inode)); 246 (unsigned long long)i_size_read(inode));
247 printk(KERN_DEBUG "\tnlink %u\n", inode->i_nlink); 247 printk(KERN_ERR "\tnlink %u\n", inode->i_nlink);
248 printk(KERN_DEBUG "\tuid %u\n", (unsigned int)inode->i_uid); 248 printk(KERN_ERR "\tuid %u\n", (unsigned int)inode->i_uid);
249 printk(KERN_DEBUG "\tgid %u\n", (unsigned int)inode->i_gid); 249 printk(KERN_ERR "\tgid %u\n", (unsigned int)inode->i_gid);
250 printk(KERN_DEBUG "\tatime %u.%u\n", 250 printk(KERN_ERR "\tatime %u.%u\n",
251 (unsigned int)inode->i_atime.tv_sec, 251 (unsigned int)inode->i_atime.tv_sec,
252 (unsigned int)inode->i_atime.tv_nsec); 252 (unsigned int)inode->i_atime.tv_nsec);
253 printk(KERN_DEBUG "\tmtime %u.%u\n", 253 printk(KERN_ERR "\tmtime %u.%u\n",
254 (unsigned int)inode->i_mtime.tv_sec, 254 (unsigned int)inode->i_mtime.tv_sec,
255 (unsigned int)inode->i_mtime.tv_nsec); 255 (unsigned int)inode->i_mtime.tv_nsec);
256 printk(KERN_DEBUG "\tctime %u.%u\n", 256 printk(KERN_ERR "\tctime %u.%u\n",
257 (unsigned int)inode->i_ctime.tv_sec, 257 (unsigned int)inode->i_ctime.tv_sec,
258 (unsigned int)inode->i_ctime.tv_nsec); 258 (unsigned int)inode->i_ctime.tv_nsec);
259 printk(KERN_DEBUG "\tcreat_sqnum %llu\n", ui->creat_sqnum); 259 printk(KERN_ERR "\tcreat_sqnum %llu\n", ui->creat_sqnum);
260 printk(KERN_DEBUG "\txattr_size %u\n", ui->xattr_size); 260 printk(KERN_ERR "\txattr_size %u\n", ui->xattr_size);
261 printk(KERN_DEBUG "\txattr_cnt %u\n", ui->xattr_cnt); 261 printk(KERN_ERR "\txattr_cnt %u\n", ui->xattr_cnt);
262 printk(KERN_DEBUG "\txattr_names %u\n", ui->xattr_names); 262 printk(KERN_ERR "\txattr_names %u\n", ui->xattr_names);
263 printk(KERN_DEBUG "\tdirty %u\n", ui->dirty); 263 printk(KERN_ERR "\tdirty %u\n", ui->dirty);
264 printk(KERN_DEBUG "\txattr %u\n", ui->xattr); 264 printk(KERN_ERR "\txattr %u\n", ui->xattr);
265 printk(KERN_DEBUG "\tbulk_read %u\n", ui->xattr); 265 printk(KERN_ERR "\tbulk_read %u\n", ui->xattr);
266 printk(KERN_DEBUG "\tsynced_i_size %llu\n", 266 printk(KERN_ERR "\tsynced_i_size %llu\n",
267 (unsigned long long)ui->synced_i_size); 267 (unsigned long long)ui->synced_i_size);
268 printk(KERN_DEBUG "\tui_size %llu\n", 268 printk(KERN_ERR "\tui_size %llu\n",
269 (unsigned long long)ui->ui_size); 269 (unsigned long long)ui->ui_size);
270 printk(KERN_DEBUG "\tflags %d\n", ui->flags); 270 printk(KERN_ERR "\tflags %d\n", ui->flags);
271 printk(KERN_DEBUG "\tcompr_type %d\n", ui->compr_type); 271 printk(KERN_ERR "\tcompr_type %d\n", ui->compr_type);
272 printk(KERN_DEBUG "\tlast_page_read %lu\n", ui->last_page_read); 272 printk(KERN_ERR "\tlast_page_read %lu\n", ui->last_page_read);
273 printk(KERN_DEBUG "\tread_in_a_row %lu\n", ui->read_in_a_row); 273 printk(KERN_ERR "\tread_in_a_row %lu\n", ui->read_in_a_row);
274 printk(KERN_DEBUG "\tdata_len %d\n", ui->data_len); 274 printk(KERN_ERR "\tdata_len %d\n", ui->data_len);
275 275
276 if (!S_ISDIR(inode->i_mode)) 276 if (!S_ISDIR(inode->i_mode))
277 return; 277 return;
278 278
279 printk(KERN_DEBUG "List of directory entries:\n"); 279 printk(KERN_ERR "List of directory entries:\n");
280 ubifs_assert(!mutex_is_locked(&c->tnc_mutex)); 280 ubifs_assert(!mutex_is_locked(&c->tnc_mutex));
281 281
282 lowest_dent_key(c, &key, inode->i_ino); 282 lowest_dent_key(c, &key, inode->i_ino);
@@ -284,11 +284,11 @@ void dbg_dump_inode(struct ubifs_info *c, const struct inode *inode)
284 dent = ubifs_tnc_next_ent(c, &key, &nm); 284 dent = ubifs_tnc_next_ent(c, &key, &nm);
285 if (IS_ERR(dent)) { 285 if (IS_ERR(dent)) {
286 if (PTR_ERR(dent) != -ENOENT) 286 if (PTR_ERR(dent) != -ENOENT)
287 printk(KERN_DEBUG "error %ld\n", PTR_ERR(dent)); 287 printk(KERN_ERR "error %ld\n", PTR_ERR(dent));
288 break; 288 break;
289 } 289 }
290 290
291 printk(KERN_DEBUG "\t%d: %s (%s)\n", 291 printk(KERN_ERR "\t%d: %s (%s)\n",
292 count++, dent->name, get_dent_type(dent->type)); 292 count++, dent->name, get_dent_type(dent->type));
293 293
294 nm.name = dent->name; 294 nm.name = dent->name;
@@ -312,8 +312,8 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
312 312
313 /* If the magic is incorrect, just hexdump the first bytes */ 313 /* If the magic is incorrect, just hexdump the first bytes */
314 if (le32_to_cpu(ch->magic) != UBIFS_NODE_MAGIC) { 314 if (le32_to_cpu(ch->magic) != UBIFS_NODE_MAGIC) {
315 printk(KERN_DEBUG "Not a node, first %zu bytes:", UBIFS_CH_SZ); 315 printk(KERN_ERR "Not a node, first %zu bytes:", UBIFS_CH_SZ);
316 print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 1, 316 print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 32, 1,
317 (void *)node, UBIFS_CH_SZ, 1); 317 (void *)node, UBIFS_CH_SZ, 1);
318 return; 318 return;
319 } 319 }
@@ -326,7 +326,7 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
326 { 326 {
327 const struct ubifs_pad_node *pad = node; 327 const struct ubifs_pad_node *pad = node;
328 328
329 printk(KERN_DEBUG "\tpad_len %u\n", 329 printk(KERN_ERR "\tpad_len %u\n",
330 le32_to_cpu(pad->pad_len)); 330 le32_to_cpu(pad->pad_len));
331 break; 331 break;
332 } 332 }
@@ -335,50 +335,50 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
335 const struct ubifs_sb_node *sup = node; 335 const struct ubifs_sb_node *sup = node;
336 unsigned int sup_flags = le32_to_cpu(sup->flags); 336 unsigned int sup_flags = le32_to_cpu(sup->flags);
337 337
338 printk(KERN_DEBUG "\tkey_hash %d (%s)\n", 338 printk(KERN_ERR "\tkey_hash %d (%s)\n",
339 (int)sup->key_hash, get_key_hash(sup->key_hash)); 339 (int)sup->key_hash, get_key_hash(sup->key_hash));
340 printk(KERN_DEBUG "\tkey_fmt %d (%s)\n", 340 printk(KERN_ERR "\tkey_fmt %d (%s)\n",
341 (int)sup->key_fmt, get_key_fmt(sup->key_fmt)); 341 (int)sup->key_fmt, get_key_fmt(sup->key_fmt));
342 printk(KERN_DEBUG "\tflags %#x\n", sup_flags); 342 printk(KERN_ERR "\tflags %#x\n", sup_flags);
343 printk(KERN_DEBUG "\t big_lpt %u\n", 343 printk(KERN_ERR "\t big_lpt %u\n",
344 !!(sup_flags & UBIFS_FLG_BIGLPT)); 344 !!(sup_flags & UBIFS_FLG_BIGLPT));
345 printk(KERN_DEBUG "\t space_fixup %u\n", 345 printk(KERN_ERR "\t space_fixup %u\n",
346 !!(sup_flags & UBIFS_FLG_SPACE_FIXUP)); 346 !!(sup_flags & UBIFS_FLG_SPACE_FIXUP));
347 printk(KERN_DEBUG "\tmin_io_size %u\n", 347 printk(KERN_ERR "\tmin_io_size %u\n",
348 le32_to_cpu(sup->min_io_size)); 348 le32_to_cpu(sup->min_io_size));
349 printk(KERN_DEBUG "\tleb_size %u\n", 349 printk(KERN_ERR "\tleb_size %u\n",
350 le32_to_cpu(sup->leb_size)); 350 le32_to_cpu(sup->leb_size));
351 printk(KERN_DEBUG "\tleb_cnt %u\n", 351 printk(KERN_ERR "\tleb_cnt %u\n",
352 le32_to_cpu(sup->leb_cnt)); 352 le32_to_cpu(sup->leb_cnt));
353 printk(KERN_DEBUG "\tmax_leb_cnt %u\n", 353 printk(KERN_ERR "\tmax_leb_cnt %u\n",
354 le32_to_cpu(sup->max_leb_cnt)); 354 le32_to_cpu(sup->max_leb_cnt));
355 printk(KERN_DEBUG "\tmax_bud_bytes %llu\n", 355 printk(KERN_ERR "\tmax_bud_bytes %llu\n",
356 (unsigned long long)le64_to_cpu(sup->max_bud_bytes)); 356 (unsigned long long)le64_to_cpu(sup->max_bud_bytes));
357 printk(KERN_DEBUG "\tlog_lebs %u\n", 357 printk(KERN_ERR "\tlog_lebs %u\n",
358 le32_to_cpu(sup->log_lebs)); 358 le32_to_cpu(sup->log_lebs));
359 printk(KERN_DEBUG "\tlpt_lebs %u\n", 359 printk(KERN_ERR "\tlpt_lebs %u\n",
360 le32_to_cpu(sup->lpt_lebs)); 360 le32_to_cpu(sup->lpt_lebs));
361 printk(KERN_DEBUG "\torph_lebs %u\n", 361 printk(KERN_ERR "\torph_lebs %u\n",
362 le32_to_cpu(sup->orph_lebs)); 362 le32_to_cpu(sup->orph_lebs));
363 printk(KERN_DEBUG "\tjhead_cnt %u\n", 363 printk(KERN_ERR "\tjhead_cnt %u\n",
364 le32_to_cpu(sup->jhead_cnt)); 364 le32_to_cpu(sup->jhead_cnt));
365 printk(KERN_DEBUG "\tfanout %u\n", 365 printk(KERN_ERR "\tfanout %u\n",
366 le32_to_cpu(sup->fanout)); 366 le32_to_cpu(sup->fanout));
367 printk(KERN_DEBUG "\tlsave_cnt %u\n", 367 printk(KERN_ERR "\tlsave_cnt %u\n",
368 le32_to_cpu(sup->lsave_cnt)); 368 le32_to_cpu(sup->lsave_cnt));
369 printk(KERN_DEBUG "\tdefault_compr %u\n", 369 printk(KERN_ERR "\tdefault_compr %u\n",
370 (int)le16_to_cpu(sup->default_compr)); 370 (int)le16_to_cpu(sup->default_compr));
371 printk(KERN_DEBUG "\trp_size %llu\n", 371 printk(KERN_ERR "\trp_size %llu\n",
372 (unsigned long long)le64_to_cpu(sup->rp_size)); 372 (unsigned long long)le64_to_cpu(sup->rp_size));
373 printk(KERN_DEBUG "\trp_uid %u\n", 373 printk(KERN_ERR "\trp_uid %u\n",
374 le32_to_cpu(sup->rp_uid)); 374 le32_to_cpu(sup->rp_uid));
375 printk(KERN_DEBUG "\trp_gid %u\n", 375 printk(KERN_ERR "\trp_gid %u\n",
376 le32_to_cpu(sup->rp_gid)); 376 le32_to_cpu(sup->rp_gid));
377 printk(KERN_DEBUG "\tfmt_version %u\n", 377 printk(KERN_ERR "\tfmt_version %u\n",
378 le32_to_cpu(sup->fmt_version)); 378 le32_to_cpu(sup->fmt_version));
379 printk(KERN_DEBUG "\ttime_gran %u\n", 379 printk(KERN_ERR "\ttime_gran %u\n",
380 le32_to_cpu(sup->time_gran)); 380 le32_to_cpu(sup->time_gran));
381 printk(KERN_DEBUG "\tUUID %pUB\n", 381 printk(KERN_ERR "\tUUID %pUB\n",
382 sup->uuid); 382 sup->uuid);
383 break; 383 break;
384 } 384 }
@@ -386,61 +386,61 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
386 { 386 {
387 const struct ubifs_mst_node *mst = node; 387 const struct ubifs_mst_node *mst = node;
388 388
389 printk(KERN_DEBUG "\thighest_inum %llu\n", 389 printk(KERN_ERR "\thighest_inum %llu\n",
390 (unsigned long long)le64_to_cpu(mst->highest_inum)); 390 (unsigned long long)le64_to_cpu(mst->highest_inum));
391 printk(KERN_DEBUG "\tcommit number %llu\n", 391 printk(KERN_ERR "\tcommit number %llu\n",
392 (unsigned long long)le64_to_cpu(mst->cmt_no)); 392 (unsigned long long)le64_to_cpu(mst->cmt_no));
393 printk(KERN_DEBUG "\tflags %#x\n", 393 printk(KERN_ERR "\tflags %#x\n",
394 le32_to_cpu(mst->flags)); 394 le32_to_cpu(mst->flags));
395 printk(KERN_DEBUG "\tlog_lnum %u\n", 395 printk(KERN_ERR "\tlog_lnum %u\n",
396 le32_to_cpu(mst->log_lnum)); 396 le32_to_cpu(mst->log_lnum));
397 printk(KERN_DEBUG "\troot_lnum %u\n", 397 printk(KERN_ERR "\troot_lnum %u\n",
398 le32_to_cpu(mst->root_lnum)); 398 le32_to_cpu(mst->root_lnum));
399 printk(KERN_DEBUG "\troot_offs %u\n", 399 printk(KERN_ERR "\troot_offs %u\n",
400 le32_to_cpu(mst->root_offs)); 400 le32_to_cpu(mst->root_offs));
401 printk(KERN_DEBUG "\troot_len %u\n", 401 printk(KERN_ERR "\troot_len %u\n",
402 le32_to_cpu(mst->root_len)); 402 le32_to_cpu(mst->root_len));
403 printk(KERN_DEBUG "\tgc_lnum %u\n", 403 printk(KERN_ERR "\tgc_lnum %u\n",
404 le32_to_cpu(mst->gc_lnum)); 404 le32_to_cpu(mst->gc_lnum));
405 printk(KERN_DEBUG "\tihead_lnum %u\n", 405 printk(KERN_ERR "\tihead_lnum %u\n",
406 le32_to_cpu(mst->ihead_lnum)); 406 le32_to_cpu(mst->ihead_lnum));
407 printk(KERN_DEBUG "\tihead_offs %u\n", 407 printk(KERN_ERR "\tihead_offs %u\n",
408 le32_to_cpu(mst->ihead_offs)); 408 le32_to_cpu(mst->ihead_offs));
409 printk(KERN_DEBUG "\tindex_size %llu\n", 409 printk(KERN_ERR "\tindex_size %llu\n",
410 (unsigned long long)le64_to_cpu(mst->index_size)); 410 (unsigned long long)le64_to_cpu(mst->index_size));
411 printk(KERN_DEBUG "\tlpt_lnum %u\n", 411 printk(KERN_ERR "\tlpt_lnum %u\n",
412 le32_to_cpu(mst->lpt_lnum)); 412 le32_to_cpu(mst->lpt_lnum));
413 printk(KERN_DEBUG "\tlpt_offs %u\n", 413 printk(KERN_ERR "\tlpt_offs %u\n",
414 le32_to_cpu(mst->lpt_offs)); 414 le32_to_cpu(mst->lpt_offs));
415 printk(KERN_DEBUG "\tnhead_lnum %u\n", 415 printk(KERN_ERR "\tnhead_lnum %u\n",
416 le32_to_cpu(mst->nhead_lnum)); 416 le32_to_cpu(mst->nhead_lnum));
417 printk(KERN_DEBUG "\tnhead_offs %u\n", 417 printk(KERN_ERR "\tnhead_offs %u\n",
418 le32_to_cpu(mst->nhead_offs)); 418 le32_to_cpu(mst->nhead_offs));
419 printk(KERN_DEBUG "\tltab_lnum %u\n", 419 printk(KERN_ERR "\tltab_lnum %u\n",
420 le32_to_cpu(mst->ltab_lnum)); 420 le32_to_cpu(mst->ltab_lnum));
421 printk(KERN_DEBUG "\tltab_offs %u\n", 421 printk(KERN_ERR "\tltab_offs %u\n",
422 le32_to_cpu(mst->ltab_offs)); 422 le32_to_cpu(mst->ltab_offs));
423 printk(KERN_DEBUG "\tlsave_lnum %u\n", 423 printk(KERN_ERR "\tlsave_lnum %u\n",
424 le32_to_cpu(mst->lsave_lnum)); 424 le32_to_cpu(mst->lsave_lnum));
425 printk(KERN_DEBUG "\tlsave_offs %u\n", 425 printk(KERN_ERR "\tlsave_offs %u\n",
426 le32_to_cpu(mst->lsave_offs)); 426 le32_to_cpu(mst->lsave_offs));
427 printk(KERN_DEBUG "\tlscan_lnum %u\n", 427 printk(KERN_ERR "\tlscan_lnum %u\n",
428 le32_to_cpu(mst->lscan_lnum)); 428 le32_to_cpu(mst->lscan_lnum));
429 printk(KERN_DEBUG "\tleb_cnt %u\n", 429 printk(KERN_ERR "\tleb_cnt %u\n",
430 le32_to_cpu(mst->leb_cnt)); 430 le32_to_cpu(mst->leb_cnt));
431 printk(KERN_DEBUG "\tempty_lebs %u\n", 431 printk(KERN_ERR "\tempty_lebs %u\n",
432 le32_to_cpu(mst->empty_lebs)); 432 le32_to_cpu(mst->empty_lebs));
433 printk(KERN_DEBUG "\tidx_lebs %u\n", 433 printk(KERN_ERR "\tidx_lebs %u\n",
434 le32_to_cpu(mst->idx_lebs)); 434 le32_to_cpu(mst->idx_lebs));
435 printk(KERN_DEBUG "\ttotal_free %llu\n", 435 printk(KERN_ERR "\ttotal_free %llu\n",
436 (unsigned long long)le64_to_cpu(mst->total_free)); 436 (unsigned long long)le64_to_cpu(mst->total_free));
437 printk(KERN_DEBUG "\ttotal_dirty %llu\n", 437 printk(KERN_ERR "\ttotal_dirty %llu\n",
438 (unsigned long long)le64_to_cpu(mst->total_dirty)); 438 (unsigned long long)le64_to_cpu(mst->total_dirty));
439 printk(KERN_DEBUG "\ttotal_used %llu\n", 439 printk(KERN_ERR "\ttotal_used %llu\n",
440 (unsigned long long)le64_to_cpu(mst->total_used)); 440 (unsigned long long)le64_to_cpu(mst->total_used));
441 printk(KERN_DEBUG "\ttotal_dead %llu\n", 441 printk(KERN_ERR "\ttotal_dead %llu\n",
442 (unsigned long long)le64_to_cpu(mst->total_dead)); 442 (unsigned long long)le64_to_cpu(mst->total_dead));
443 printk(KERN_DEBUG "\ttotal_dark %llu\n", 443 printk(KERN_ERR "\ttotal_dark %llu\n",
444 (unsigned long long)le64_to_cpu(mst->total_dark)); 444 (unsigned long long)le64_to_cpu(mst->total_dark));
445 break; 445 break;
446 } 446 }
@@ -448,11 +448,11 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
448 { 448 {
449 const struct ubifs_ref_node *ref = node; 449 const struct ubifs_ref_node *ref = node;
450 450
451 printk(KERN_DEBUG "\tlnum %u\n", 451 printk(KERN_ERR "\tlnum %u\n",
452 le32_to_cpu(ref->lnum)); 452 le32_to_cpu(ref->lnum));
453 printk(KERN_DEBUG "\toffs %u\n", 453 printk(KERN_ERR "\toffs %u\n",
454 le32_to_cpu(ref->offs)); 454 le32_to_cpu(ref->offs));
455 printk(KERN_DEBUG "\tjhead %u\n", 455 printk(KERN_ERR "\tjhead %u\n",
456 le32_to_cpu(ref->jhead)); 456 le32_to_cpu(ref->jhead));
457 break; 457 break;
458 } 458 }
@@ -461,40 +461,40 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
461 const struct ubifs_ino_node *ino = node; 461 const struct ubifs_ino_node *ino = node;
462 462
463 key_read(c, &ino->key, &key); 463 key_read(c, &ino->key, &key);
464 printk(KERN_DEBUG "\tkey %s\n", 464 printk(KERN_ERR "\tkey %s\n",
465 dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN)); 465 dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN));
466 printk(KERN_DEBUG "\tcreat_sqnum %llu\n", 466 printk(KERN_ERR "\tcreat_sqnum %llu\n",
467 (unsigned long long)le64_to_cpu(ino->creat_sqnum)); 467 (unsigned long long)le64_to_cpu(ino->creat_sqnum));
468 printk(KERN_DEBUG "\tsize %llu\n", 468 printk(KERN_ERR "\tsize %llu\n",
469 (unsigned long long)le64_to_cpu(ino->size)); 469 (unsigned long long)le64_to_cpu(ino->size));
470 printk(KERN_DEBUG "\tnlink %u\n", 470 printk(KERN_ERR "\tnlink %u\n",
471 le32_to_cpu(ino->nlink)); 471 le32_to_cpu(ino->nlink));
472 printk(KERN_DEBUG "\tatime %lld.%u\n", 472 printk(KERN_ERR "\tatime %lld.%u\n",
473 (long long)le64_to_cpu(ino->atime_sec), 473 (long long)le64_to_cpu(ino->atime_sec),
474 le32_to_cpu(ino->atime_nsec)); 474 le32_to_cpu(ino->atime_nsec));
475 printk(KERN_DEBUG "\tmtime %lld.%u\n", 475 printk(KERN_ERR "\tmtime %lld.%u\n",
476 (long long)le64_to_cpu(ino->mtime_sec), 476 (long long)le64_to_cpu(ino->mtime_sec),
477 le32_to_cpu(ino->mtime_nsec)); 477 le32_to_cpu(ino->mtime_nsec));
478 printk(KERN_DEBUG "\tctime %lld.%u\n", 478 printk(KERN_ERR "\tctime %lld.%u\n",
479 (long long)le64_to_cpu(ino->ctime_sec), 479 (long long)le64_to_cpu(ino->ctime_sec),
480 le32_to_cpu(ino->ctime_nsec)); 480 le32_to_cpu(ino->ctime_nsec));
481 printk(KERN_DEBUG "\tuid %u\n", 481 printk(KERN_ERR "\tuid %u\n",
482 le32_to_cpu(ino->uid)); 482 le32_to_cpu(ino->uid));
483 printk(KERN_DEBUG "\tgid %u\n", 483 printk(KERN_ERR "\tgid %u\n",
484 le32_to_cpu(ino->gid)); 484 le32_to_cpu(ino->gid));
485 printk(KERN_DEBUG "\tmode %u\n", 485 printk(KERN_ERR "\tmode %u\n",
486 le32_to_cpu(ino->mode)); 486 le32_to_cpu(ino->mode));
487 printk(KERN_DEBUG "\tflags %#x\n", 487 printk(KERN_ERR "\tflags %#x\n",
488 le32_to_cpu(ino->flags)); 488 le32_to_cpu(ino->flags));
489 printk(KERN_DEBUG "\txattr_cnt %u\n", 489 printk(KERN_ERR "\txattr_cnt %u\n",
490 le32_to_cpu(ino->xattr_cnt)); 490 le32_to_cpu(ino->xattr_cnt));
491 printk(KERN_DEBUG "\txattr_size %u\n", 491 printk(KERN_ERR "\txattr_size %u\n",
492 le32_to_cpu(ino->xattr_size)); 492 le32_to_cpu(ino->xattr_size));
493 printk(KERN_DEBUG "\txattr_names %u\n", 493 printk(KERN_ERR "\txattr_names %u\n",
494 le32_to_cpu(ino->xattr_names)); 494 le32_to_cpu(ino->xattr_names));
495 printk(KERN_DEBUG "\tcompr_type %#x\n", 495 printk(KERN_ERR "\tcompr_type %#x\n",
496 (int)le16_to_cpu(ino->compr_type)); 496 (int)le16_to_cpu(ino->compr_type));
497 printk(KERN_DEBUG "\tdata len %u\n", 497 printk(KERN_ERR "\tdata len %u\n",
498 le32_to_cpu(ino->data_len)); 498 le32_to_cpu(ino->data_len));
499 break; 499 break;
500 } 500 }
@@ -505,16 +505,16 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
505 int nlen = le16_to_cpu(dent->nlen); 505 int nlen = le16_to_cpu(dent->nlen);
506 506
507 key_read(c, &dent->key, &key); 507 key_read(c, &dent->key, &key);
508 printk(KERN_DEBUG "\tkey %s\n", 508 printk(KERN_ERR "\tkey %s\n",
509 dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN)); 509 dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN));
510 printk(KERN_DEBUG "\tinum %llu\n", 510 printk(KERN_ERR "\tinum %llu\n",
511 (unsigned long long)le64_to_cpu(dent->inum)); 511 (unsigned long long)le64_to_cpu(dent->inum));
512 printk(KERN_DEBUG "\ttype %d\n", (int)dent->type); 512 printk(KERN_ERR "\ttype %d\n", (int)dent->type);
513 printk(KERN_DEBUG "\tnlen %d\n", nlen); 513 printk(KERN_ERR "\tnlen %d\n", nlen);
514 printk(KERN_DEBUG "\tname "); 514 printk(KERN_ERR "\tname ");
515 515
516 if (nlen > UBIFS_MAX_NLEN) 516 if (nlen > UBIFS_MAX_NLEN)
517 printk(KERN_DEBUG "(bad name length, not printing, " 517 printk(KERN_ERR "(bad name length, not printing, "
518 "bad or corrupted node)"); 518 "bad or corrupted node)");
519 else { 519 else {
520 for (i = 0; i < nlen && dent->name[i]; i++) 520 for (i = 0; i < nlen && dent->name[i]; i++)
@@ -530,16 +530,16 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
530 int dlen = le32_to_cpu(ch->len) - UBIFS_DATA_NODE_SZ; 530 int dlen = le32_to_cpu(ch->len) - UBIFS_DATA_NODE_SZ;
531 531
532 key_read(c, &dn->key, &key); 532 key_read(c, &dn->key, &key);
533 printk(KERN_DEBUG "\tkey %s\n", 533 printk(KERN_ERR "\tkey %s\n",
534 dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN)); 534 dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN));
535 printk(KERN_DEBUG "\tsize %u\n", 535 printk(KERN_ERR "\tsize %u\n",
536 le32_to_cpu(dn->size)); 536 le32_to_cpu(dn->size));
537 printk(KERN_DEBUG "\tcompr_typ %d\n", 537 printk(KERN_ERR "\tcompr_typ %d\n",
538 (int)le16_to_cpu(dn->compr_type)); 538 (int)le16_to_cpu(dn->compr_type));
539 printk(KERN_DEBUG "\tdata size %d\n", 539 printk(KERN_ERR "\tdata size %d\n",
540 dlen); 540 dlen);
541 printk(KERN_DEBUG "\tdata:\n"); 541 printk(KERN_ERR "\tdata:\n");
542 print_hex_dump(KERN_DEBUG, "\t", DUMP_PREFIX_OFFSET, 32, 1, 542 print_hex_dump(KERN_ERR, "\t", DUMP_PREFIX_OFFSET, 32, 1,
543 (void *)&dn->data, dlen, 0); 543 (void *)&dn->data, dlen, 0);
544 break; 544 break;
545 } 545 }
@@ -547,11 +547,11 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
547 { 547 {
548 const struct ubifs_trun_node *trun = node; 548 const struct ubifs_trun_node *trun = node;
549 549
550 printk(KERN_DEBUG "\tinum %u\n", 550 printk(KERN_ERR "\tinum %u\n",
551 le32_to_cpu(trun->inum)); 551 le32_to_cpu(trun->inum));
552 printk(KERN_DEBUG "\told_size %llu\n", 552 printk(KERN_ERR "\told_size %llu\n",
553 (unsigned long long)le64_to_cpu(trun->old_size)); 553 (unsigned long long)le64_to_cpu(trun->old_size));
554 printk(KERN_DEBUG "\tnew_size %llu\n", 554 printk(KERN_ERR "\tnew_size %llu\n",
555 (unsigned long long)le64_to_cpu(trun->new_size)); 555 (unsigned long long)le64_to_cpu(trun->new_size));
556 break; 556 break;
557 } 557 }
@@ -560,17 +560,17 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
560 const struct ubifs_idx_node *idx = node; 560 const struct ubifs_idx_node *idx = node;
561 561
562 n = le16_to_cpu(idx->child_cnt); 562 n = le16_to_cpu(idx->child_cnt);
563 printk(KERN_DEBUG "\tchild_cnt %d\n", n); 563 printk(KERN_ERR "\tchild_cnt %d\n", n);
564 printk(KERN_DEBUG "\tlevel %d\n", 564 printk(KERN_ERR "\tlevel %d\n",
565 (int)le16_to_cpu(idx->level)); 565 (int)le16_to_cpu(idx->level));
566 printk(KERN_DEBUG "\tBranches:\n"); 566 printk(KERN_ERR "\tBranches:\n");
567 567
568 for (i = 0; i < n && i < c->fanout - 1; i++) { 568 for (i = 0; i < n && i < c->fanout - 1; i++) {
569 const struct ubifs_branch *br; 569 const struct ubifs_branch *br;
570 570
571 br = ubifs_idx_branch(c, idx, i); 571 br = ubifs_idx_branch(c, idx, i);
572 key_read(c, &br->key, &key); 572 key_read(c, &br->key, &key);
573 printk(KERN_DEBUG "\t%d: LEB %d:%d len %d key %s\n", 573 printk(KERN_ERR "\t%d: LEB %d:%d len %d key %s\n",
574 i, le32_to_cpu(br->lnum), le32_to_cpu(br->offs), 574 i, le32_to_cpu(br->lnum), le32_to_cpu(br->offs),
575 le32_to_cpu(br->len), 575 le32_to_cpu(br->len),
576 dbg_snprintf_key(c, &key, key_buf, 576 dbg_snprintf_key(c, &key, key_buf,
@@ -584,20 +584,20 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
584 { 584 {
585 const struct ubifs_orph_node *orph = node; 585 const struct ubifs_orph_node *orph = node;
586 586
587 printk(KERN_DEBUG "\tcommit number %llu\n", 587 printk(KERN_ERR "\tcommit number %llu\n",
588 (unsigned long long) 588 (unsigned long long)
589 le64_to_cpu(orph->cmt_no) & LLONG_MAX); 589 le64_to_cpu(orph->cmt_no) & LLONG_MAX);
590 printk(KERN_DEBUG "\tlast node flag %llu\n", 590 printk(KERN_ERR "\tlast node flag %llu\n",
591 (unsigned long long)(le64_to_cpu(orph->cmt_no)) >> 63); 591 (unsigned long long)(le64_to_cpu(orph->cmt_no)) >> 63);
592 n = (le32_to_cpu(ch->len) - UBIFS_ORPH_NODE_SZ) >> 3; 592 n = (le32_to_cpu(ch->len) - UBIFS_ORPH_NODE_SZ) >> 3;
593 printk(KERN_DEBUG "\t%d orphan inode numbers:\n", n); 593 printk(KERN_ERR "\t%d orphan inode numbers:\n", n);
594 for (i = 0; i < n; i++) 594 for (i = 0; i < n; i++)
595 printk(KERN_DEBUG "\t ino %llu\n", 595 printk(KERN_ERR "\t ino %llu\n",
596 (unsigned long long)le64_to_cpu(orph->inos[i])); 596 (unsigned long long)le64_to_cpu(orph->inos[i]));
597 break; 597 break;
598 } 598 }
599 default: 599 default:
600 printk(KERN_DEBUG "node type %d was not recognized\n", 600 printk(KERN_ERR "node type %d was not recognized\n",
601 (int)ch->node_type); 601 (int)ch->node_type);
602 } 602 }
603 spin_unlock(&dbg_lock); 603 spin_unlock(&dbg_lock);
@@ -606,16 +606,16 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
606void dbg_dump_budget_req(const struct ubifs_budget_req *req) 606void dbg_dump_budget_req(const struct ubifs_budget_req *req)
607{ 607{
608 spin_lock(&dbg_lock); 608 spin_lock(&dbg_lock);
609 printk(KERN_DEBUG "Budgeting request: new_ino %d, dirtied_ino %d\n", 609 printk(KERN_ERR "Budgeting request: new_ino %d, dirtied_ino %d\n",
610 req->new_ino, req->dirtied_ino); 610 req->new_ino, req->dirtied_ino);
611 printk(KERN_DEBUG "\tnew_ino_d %d, dirtied_ino_d %d\n", 611 printk(KERN_ERR "\tnew_ino_d %d, dirtied_ino_d %d\n",
612 req->new_ino_d, req->dirtied_ino_d); 612 req->new_ino_d, req->dirtied_ino_d);
613 printk(KERN_DEBUG "\tnew_page %d, dirtied_page %d\n", 613 printk(KERN_ERR "\tnew_page %d, dirtied_page %d\n",
614 req->new_page, req->dirtied_page); 614 req->new_page, req->dirtied_page);
615 printk(KERN_DEBUG "\tnew_dent %d, mod_dent %d\n", 615 printk(KERN_ERR "\tnew_dent %d, mod_dent %d\n",
616 req->new_dent, req->mod_dent); 616 req->new_dent, req->mod_dent);
617 printk(KERN_DEBUG "\tidx_growth %d\n", req->idx_growth); 617 printk(KERN_ERR "\tidx_growth %d\n", req->idx_growth);
618 printk(KERN_DEBUG "\tdata_growth %d dd_growth %d\n", 618 printk(KERN_ERR "\tdata_growth %d dd_growth %d\n",
619 req->data_growth, req->dd_growth); 619 req->data_growth, req->dd_growth);
620 spin_unlock(&dbg_lock); 620 spin_unlock(&dbg_lock);
621} 621}
@@ -623,12 +623,12 @@ void dbg_dump_budget_req(const struct ubifs_budget_req *req)
623void dbg_dump_lstats(const struct ubifs_lp_stats *lst) 623void dbg_dump_lstats(const struct ubifs_lp_stats *lst)
624{ 624{
625 spin_lock(&dbg_lock); 625 spin_lock(&dbg_lock);
626 printk(KERN_DEBUG "(pid %d) Lprops statistics: empty_lebs %d, " 626 printk(KERN_ERR "(pid %d) Lprops statistics: empty_lebs %d, "
627 "idx_lebs %d\n", current->pid, lst->empty_lebs, lst->idx_lebs); 627 "idx_lebs %d\n", current->pid, lst->empty_lebs, lst->idx_lebs);
628 printk(KERN_DEBUG "\ttaken_empty_lebs %d, total_free %lld, " 628 printk(KERN_ERR "\ttaken_empty_lebs %d, total_free %lld, "
629 "total_dirty %lld\n", lst->taken_empty_lebs, lst->total_free, 629 "total_dirty %lld\n", lst->taken_empty_lebs, lst->total_free,
630 lst->total_dirty); 630 lst->total_dirty);
631 printk(KERN_DEBUG "\ttotal_used %lld, total_dark %lld, " 631 printk(KERN_ERR "\ttotal_used %lld, total_dark %lld, "
632 "total_dead %lld\n", lst->total_used, lst->total_dark, 632 "total_dead %lld\n", lst->total_used, lst->total_dark,
633 lst->total_dead); 633 lst->total_dead);
634 spin_unlock(&dbg_lock); 634 spin_unlock(&dbg_lock);
@@ -644,21 +644,21 @@ void dbg_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi)
644 644
645 spin_lock(&c->space_lock); 645 spin_lock(&c->space_lock);
646 spin_lock(&dbg_lock); 646 spin_lock(&dbg_lock);
647 printk(KERN_DEBUG "(pid %d) Budgeting info: data budget sum %lld, " 647 printk(KERN_ERR "(pid %d) Budgeting info: data budget sum %lld, "
648 "total budget sum %lld\n", current->pid, 648 "total budget sum %lld\n", current->pid,
649 bi->data_growth + bi->dd_growth, 649 bi->data_growth + bi->dd_growth,
650 bi->data_growth + bi->dd_growth + bi->idx_growth); 650 bi->data_growth + bi->dd_growth + bi->idx_growth);
651 printk(KERN_DEBUG "\tbudg_data_growth %lld, budg_dd_growth %lld, " 651 printk(KERN_ERR "\tbudg_data_growth %lld, budg_dd_growth %lld, "
652 "budg_idx_growth %lld\n", bi->data_growth, bi->dd_growth, 652 "budg_idx_growth %lld\n", bi->data_growth, bi->dd_growth,
653 bi->idx_growth); 653 bi->idx_growth);
654 printk(KERN_DEBUG "\tmin_idx_lebs %d, old_idx_sz %llu, " 654 printk(KERN_ERR "\tmin_idx_lebs %d, old_idx_sz %llu, "
655 "uncommitted_idx %lld\n", bi->min_idx_lebs, bi->old_idx_sz, 655 "uncommitted_idx %lld\n", bi->min_idx_lebs, bi->old_idx_sz,
656 bi->uncommitted_idx); 656 bi->uncommitted_idx);
657 printk(KERN_DEBUG "\tpage_budget %d, inode_budget %d, dent_budget %d\n", 657 printk(KERN_ERR "\tpage_budget %d, inode_budget %d, dent_budget %d\n",
658 bi->page_budget, bi->inode_budget, bi->dent_budget); 658 bi->page_budget, bi->inode_budget, bi->dent_budget);
659 printk(KERN_DEBUG "\tnospace %u, nospace_rp %u\n", 659 printk(KERN_ERR "\tnospace %u, nospace_rp %u\n",
660 bi->nospace, bi->nospace_rp); 660 bi->nospace, bi->nospace_rp);
661 printk(KERN_DEBUG "\tdark_wm %d, dead_wm %d, max_idx_node_sz %d\n", 661 printk(KERN_ERR "\tdark_wm %d, dead_wm %d, max_idx_node_sz %d\n",
662 c->dark_wm, c->dead_wm, c->max_idx_node_sz); 662 c->dark_wm, c->dead_wm, c->max_idx_node_sz);
663 663
664 if (bi != &c->bi) 664 if (bi != &c->bi)
@@ -669,38 +669,38 @@ void dbg_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi)
669 */ 669 */
670 goto out_unlock; 670 goto out_unlock;
671 671
672 printk(KERN_DEBUG "\tfreeable_cnt %d, calc_idx_sz %lld, idx_gc_cnt %d\n", 672 printk(KERN_ERR "\tfreeable_cnt %d, calc_idx_sz %lld, idx_gc_cnt %d\n",
673 c->freeable_cnt, c->calc_idx_sz, c->idx_gc_cnt); 673 c->freeable_cnt, c->calc_idx_sz, c->idx_gc_cnt);
674 printk(KERN_DEBUG "\tdirty_pg_cnt %ld, dirty_zn_cnt %ld, " 674 printk(KERN_ERR "\tdirty_pg_cnt %ld, dirty_zn_cnt %ld, "
675 "clean_zn_cnt %ld\n", atomic_long_read(&c->dirty_pg_cnt), 675 "clean_zn_cnt %ld\n", atomic_long_read(&c->dirty_pg_cnt),
676 atomic_long_read(&c->dirty_zn_cnt), 676 atomic_long_read(&c->dirty_zn_cnt),
677 atomic_long_read(&c->clean_zn_cnt)); 677 atomic_long_read(&c->clean_zn_cnt));
678 printk(KERN_DEBUG "\tgc_lnum %d, ihead_lnum %d\n", 678 printk(KERN_ERR "\tgc_lnum %d, ihead_lnum %d\n",
679 c->gc_lnum, c->ihead_lnum); 679 c->gc_lnum, c->ihead_lnum);
680 680
681 /* If we are in R/O mode, journal heads do not exist */ 681 /* If we are in R/O mode, journal heads do not exist */
682 if (c->jheads) 682 if (c->jheads)
683 for (i = 0; i < c->jhead_cnt; i++) 683 for (i = 0; i < c->jhead_cnt; i++)
684 printk(KERN_DEBUG "\tjhead %s\t LEB %d\n", 684 printk(KERN_ERR "\tjhead %s\t LEB %d\n",
685 dbg_jhead(c->jheads[i].wbuf.jhead), 685 dbg_jhead(c->jheads[i].wbuf.jhead),
686 c->jheads[i].wbuf.lnum); 686 c->jheads[i].wbuf.lnum);
687 for (rb = rb_first(&c->buds); rb; rb = rb_next(rb)) { 687 for (rb = rb_first(&c->buds); rb; rb = rb_next(rb)) {
688 bud = rb_entry(rb, struct ubifs_bud, rb); 688 bud = rb_entry(rb, struct ubifs_bud, rb);
689 printk(KERN_DEBUG "\tbud LEB %d\n", bud->lnum); 689 printk(KERN_ERR "\tbud LEB %d\n", bud->lnum);
690 } 690 }
691 list_for_each_entry(bud, &c->old_buds, list) 691 list_for_each_entry(bud, &c->old_buds, list)
692 printk(KERN_DEBUG "\told bud LEB %d\n", bud->lnum); 692 printk(KERN_ERR "\told bud LEB %d\n", bud->lnum);
693 list_for_each_entry(idx_gc, &c->idx_gc, list) 693 list_for_each_entry(idx_gc, &c->idx_gc, list)
694 printk(KERN_DEBUG "\tGC'ed idx LEB %d unmap %d\n", 694 printk(KERN_ERR "\tGC'ed idx LEB %d unmap %d\n",
695 idx_gc->lnum, idx_gc->unmap); 695 idx_gc->lnum, idx_gc->unmap);
696 printk(KERN_DEBUG "\tcommit state %d\n", c->cmt_state); 696 printk(KERN_ERR "\tcommit state %d\n", c->cmt_state);
697 697
698 /* Print budgeting predictions */ 698 /* Print budgeting predictions */
699 available = ubifs_calc_available(c, c->bi.min_idx_lebs); 699 available = ubifs_calc_available(c, c->bi.min_idx_lebs);
700 outstanding = c->bi.data_growth + c->bi.dd_growth; 700 outstanding = c->bi.data_growth + c->bi.dd_growth;
701 free = ubifs_get_free_space_nolock(c); 701 free = ubifs_get_free_space_nolock(c);
702 printk(KERN_DEBUG "Budgeting predictions:\n"); 702 printk(KERN_ERR "Budgeting predictions:\n");
703 printk(KERN_DEBUG "\tavailable: %lld, outstanding %lld, free %lld\n", 703 printk(KERN_ERR "\tavailable: %lld, outstanding %lld, free %lld\n",
704 available, outstanding, free); 704 available, outstanding, free);
705out_unlock: 705out_unlock:
706 spin_unlock(&dbg_lock); 706 spin_unlock(&dbg_lock);
@@ -720,11 +720,11 @@ void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp)
720 dark = ubifs_calc_dark(c, spc); 720 dark = ubifs_calc_dark(c, spc);
721 721
722 if (lp->flags & LPROPS_INDEX) 722 if (lp->flags & LPROPS_INDEX)
723 printk(KERN_DEBUG "LEB %-7d free %-8d dirty %-8d used %-8d " 723 printk(KERN_ERR "LEB %-7d free %-8d dirty %-8d used %-8d "
724 "free + dirty %-8d flags %#x (", lp->lnum, lp->free, 724 "free + dirty %-8d flags %#x (", lp->lnum, lp->free,
725 lp->dirty, c->leb_size - spc, spc, lp->flags); 725 lp->dirty, c->leb_size - spc, spc, lp->flags);
726 else 726 else
727 printk(KERN_DEBUG "LEB %-7d free %-8d dirty %-8d used %-8d " 727 printk(KERN_ERR "LEB %-7d free %-8d dirty %-8d used %-8d "
728 "free + dirty %-8d dark %-4d dead %-4d nodes fit %-3d " 728 "free + dirty %-8d dark %-4d dead %-4d nodes fit %-3d "
729 "flags %#-4x (", lp->lnum, lp->free, lp->dirty, 729 "flags %#-4x (", lp->lnum, lp->free, lp->dirty,
730 c->leb_size - spc, spc, dark, dead, 730 c->leb_size - spc, spc, dark, dead,
@@ -807,7 +807,7 @@ void dbg_dump_lprops(struct ubifs_info *c)
807 struct ubifs_lprops lp; 807 struct ubifs_lprops lp;
808 struct ubifs_lp_stats lst; 808 struct ubifs_lp_stats lst;
809 809
810 printk(KERN_DEBUG "(pid %d) start dumping LEB properties\n", 810 printk(KERN_ERR "(pid %d) start dumping LEB properties\n",
811 current->pid); 811 current->pid);
812 ubifs_get_lp_stats(c, &lst); 812 ubifs_get_lp_stats(c, &lst);
813 dbg_dump_lstats(&lst); 813 dbg_dump_lstats(&lst);
@@ -819,7 +819,7 @@ void dbg_dump_lprops(struct ubifs_info *c)
819 819
820 dbg_dump_lprop(c, &lp); 820 dbg_dump_lprop(c, &lp);
821 } 821 }
822 printk(KERN_DEBUG "(pid %d) finish dumping LEB properties\n", 822 printk(KERN_ERR "(pid %d) finish dumping LEB properties\n",
823 current->pid); 823 current->pid);
824} 824}
825 825
@@ -828,35 +828,35 @@ void dbg_dump_lpt_info(struct ubifs_info *c)
828 int i; 828 int i;
829 829
830 spin_lock(&dbg_lock); 830 spin_lock(&dbg_lock);
831 printk(KERN_DEBUG "(pid %d) dumping LPT information\n", current->pid); 831 printk(KERN_ERR "(pid %d) dumping LPT information\n", current->pid);
832 printk(KERN_DEBUG "\tlpt_sz: %lld\n", c->lpt_sz); 832 printk(KERN_ERR "\tlpt_sz: %lld\n", c->lpt_sz);
833 printk(KERN_DEBUG "\tpnode_sz: %d\n", c->pnode_sz); 833 printk(KERN_ERR "\tpnode_sz: %d\n", c->pnode_sz);
834 printk(KERN_DEBUG "\tnnode_sz: %d\n", c->nnode_sz); 834 printk(KERN_ERR "\tnnode_sz: %d\n", c->nnode_sz);
835 printk(KERN_DEBUG "\tltab_sz: %d\n", c->ltab_sz); 835 printk(KERN_ERR "\tltab_sz: %d\n", c->ltab_sz);
836 printk(KERN_DEBUG "\tlsave_sz: %d\n", c->lsave_sz); 836 printk(KERN_ERR "\tlsave_sz: %d\n", c->lsave_sz);
837 printk(KERN_DEBUG "\tbig_lpt: %d\n", c->big_lpt); 837 printk(KERN_ERR "\tbig_lpt: %d\n", c->big_lpt);
838 printk(KERN_DEBUG "\tlpt_hght: %d\n", c->lpt_hght); 838 printk(KERN_ERR "\tlpt_hght: %d\n", c->lpt_hght);
839 printk(KERN_DEBUG "\tpnode_cnt: %d\n", c->pnode_cnt); 839 printk(KERN_ERR "\tpnode_cnt: %d\n", c->pnode_cnt);
840 printk(KERN_DEBUG "\tnnode_cnt: %d\n", c->nnode_cnt); 840 printk(KERN_ERR "\tnnode_cnt: %d\n", c->nnode_cnt);
841 printk(KERN_DEBUG "\tdirty_pn_cnt: %d\n", c->dirty_pn_cnt); 841 printk(KERN_ERR "\tdirty_pn_cnt: %d\n", c->dirty_pn_cnt);
842 printk(KERN_DEBUG "\tdirty_nn_cnt: %d\n", c->dirty_nn_cnt); 842 printk(KERN_ERR "\tdirty_nn_cnt: %d\n", c->dirty_nn_cnt);
843 printk(KERN_DEBUG "\tlsave_cnt: %d\n", c->lsave_cnt); 843 printk(KERN_ERR "\tlsave_cnt: %d\n", c->lsave_cnt);
844 printk(KERN_DEBUG "\tspace_bits: %d\n", c->space_bits); 844 printk(KERN_ERR "\tspace_bits: %d\n", c->space_bits);
845 printk(KERN_DEBUG "\tlpt_lnum_bits: %d\n", c->lpt_lnum_bits); 845 printk(KERN_ERR "\tlpt_lnum_bits: %d\n", c->lpt_lnum_bits);
846 printk(KERN_DEBUG "\tlpt_offs_bits: %d\n", c->lpt_offs_bits); 846 printk(KERN_ERR "\tlpt_offs_bits: %d\n", c->lpt_offs_bits);
847 printk(KERN_DEBUG "\tlpt_spc_bits: %d\n", c->lpt_spc_bits); 847 printk(KERN_ERR "\tlpt_spc_bits: %d\n", c->lpt_spc_bits);
848 printk(KERN_DEBUG "\tpcnt_bits: %d\n", c->pcnt_bits); 848 printk(KERN_ERR "\tpcnt_bits: %d\n", c->pcnt_bits);
849 printk(KERN_DEBUG "\tlnum_bits: %d\n", c->lnum_bits); 849 printk(KERN_ERR "\tlnum_bits: %d\n", c->lnum_bits);
850 printk(KERN_DEBUG "\tLPT root is at %d:%d\n", c->lpt_lnum, c->lpt_offs); 850 printk(KERN_ERR "\tLPT root is at %d:%d\n", c->lpt_lnum, c->lpt_offs);
851 printk(KERN_DEBUG "\tLPT head is at %d:%d\n", 851 printk(KERN_ERR "\tLPT head is at %d:%d\n",
852 c->nhead_lnum, c->nhead_offs); 852 c->nhead_lnum, c->nhead_offs);
853 printk(KERN_DEBUG "\tLPT ltab is at %d:%d\n", 853 printk(KERN_ERR "\tLPT ltab is at %d:%d\n",
854 c->ltab_lnum, c->ltab_offs); 854 c->ltab_lnum, c->ltab_offs);
855 if (c->big_lpt) 855 if (c->big_lpt)
856 printk(KERN_DEBUG "\tLPT lsave is at %d:%d\n", 856 printk(KERN_ERR "\tLPT lsave is at %d:%d\n",
857 c->lsave_lnum, c->lsave_offs); 857 c->lsave_lnum, c->lsave_offs);
858 for (i = 0; i < c->lpt_lebs; i++) 858 for (i = 0; i < c->lpt_lebs; i++)
859 printk(KERN_DEBUG "\tLPT LEB %d free %d dirty %d tgc %d " 859 printk(KERN_ERR "\tLPT LEB %d free %d dirty %d tgc %d "
860 "cmt %d\n", i + c->lpt_first, c->ltab[i].free, 860 "cmt %d\n", i + c->lpt_first, c->ltab[i].free,
861 c->ltab[i].dirty, c->ltab[i].tgc, c->ltab[i].cmt); 861 c->ltab[i].dirty, c->ltab[i].tgc, c->ltab[i].cmt);
862 spin_unlock(&dbg_lock); 862 spin_unlock(&dbg_lock);
@@ -867,12 +867,12 @@ void dbg_dump_sleb(const struct ubifs_info *c,
867{ 867{
868 struct ubifs_scan_node *snod; 868 struct ubifs_scan_node *snod;
869 869
870 printk(KERN_DEBUG "(pid %d) start dumping scanned data from LEB %d:%d\n", 870 printk(KERN_ERR "(pid %d) start dumping scanned data from LEB %d:%d\n",
871 current->pid, sleb->lnum, offs); 871 current->pid, sleb->lnum, offs);
872 872
873 list_for_each_entry(snod, &sleb->nodes, list) { 873 list_for_each_entry(snod, &sleb->nodes, list) {
874 cond_resched(); 874 cond_resched();
875 printk(KERN_DEBUG "Dumping node at LEB %d:%d len %d\n", sleb->lnum, 875 printk(KERN_ERR "Dumping node at LEB %d:%d len %d\n", sleb->lnum,
876 snod->offs, snod->len); 876 snod->offs, snod->len);
877 dbg_dump_node(c, snod->node); 877 dbg_dump_node(c, snod->node);
878 } 878 }
@@ -887,7 +887,7 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum)
887 if (dbg_is_tst_rcvry(c)) 887 if (dbg_is_tst_rcvry(c))
888 return; 888 return;
889 889
890 printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n", 890 printk(KERN_ERR "(pid %d) start dumping LEB %d\n",
891 current->pid, lnum); 891 current->pid, lnum);
892 892
893 buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); 893 buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL);
@@ -902,17 +902,17 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum)
902 goto out; 902 goto out;
903 } 903 }
904 904
905 printk(KERN_DEBUG "LEB %d has %d nodes ending at %d\n", lnum, 905 printk(KERN_ERR "LEB %d has %d nodes ending at %d\n", lnum,
906 sleb->nodes_cnt, sleb->endpt); 906 sleb->nodes_cnt, sleb->endpt);
907 907
908 list_for_each_entry(snod, &sleb->nodes, list) { 908 list_for_each_entry(snod, &sleb->nodes, list) {
909 cond_resched(); 909 cond_resched();
910 printk(KERN_DEBUG "Dumping node at LEB %d:%d len %d\n", lnum, 910 printk(KERN_ERR "Dumping node at LEB %d:%d len %d\n", lnum,
911 snod->offs, snod->len); 911 snod->offs, snod->len);
912 dbg_dump_node(c, snod->node); 912 dbg_dump_node(c, snod->node);
913 } 913 }
914 914
915 printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n", 915 printk(KERN_ERR "(pid %d) finish dumping LEB %d\n",
916 current->pid, lnum); 916 current->pid, lnum);
917 ubifs_scan_destroy(sleb); 917 ubifs_scan_destroy(sleb);
918 918
@@ -934,7 +934,7 @@ void dbg_dump_znode(const struct ubifs_info *c,
934 else 934 else
935 zbr = &c->zroot; 935 zbr = &c->zroot;
936 936
937 printk(KERN_DEBUG "znode %p, LEB %d:%d len %d parent %p iip %d level %d" 937 printk(KERN_ERR "znode %p, LEB %d:%d len %d parent %p iip %d level %d"
938 " child_cnt %d flags %lx\n", znode, zbr->lnum, zbr->offs, 938 " child_cnt %d flags %lx\n", znode, zbr->lnum, zbr->offs,
939 zbr->len, znode->parent, znode->iip, znode->level, 939 zbr->len, znode->parent, znode->iip, znode->level,
940 znode->child_cnt, znode->flags); 940 znode->child_cnt, znode->flags);
@@ -944,18 +944,18 @@ void dbg_dump_znode(const struct ubifs_info *c,
944 return; 944 return;
945 } 945 }
946 946
947 printk(KERN_DEBUG "zbranches:\n"); 947 printk(KERN_ERR "zbranches:\n");
948 for (n = 0; n < znode->child_cnt; n++) { 948 for (n = 0; n < znode->child_cnt; n++) {
949 zbr = &znode->zbranch[n]; 949 zbr = &znode->zbranch[n];
950 if (znode->level > 0) 950 if (znode->level > 0)
951 printk(KERN_DEBUG "\t%d: znode %p LEB %d:%d len %d key " 951 printk(KERN_ERR "\t%d: znode %p LEB %d:%d len %d key "
952 "%s\n", n, zbr->znode, zbr->lnum, 952 "%s\n", n, zbr->znode, zbr->lnum,
953 zbr->offs, zbr->len, 953 zbr->offs, zbr->len,
954 dbg_snprintf_key(c, &zbr->key, 954 dbg_snprintf_key(c, &zbr->key,
955 key_buf, 955 key_buf,
956 DBG_KEY_BUF_LEN)); 956 DBG_KEY_BUF_LEN));
957 else 957 else
958 printk(KERN_DEBUG "\t%d: LNC %p LEB %d:%d len %d key " 958 printk(KERN_ERR "\t%d: LNC %p LEB %d:%d len %d key "
959 "%s\n", n, zbr->znode, zbr->lnum, 959 "%s\n", n, zbr->znode, zbr->lnum,
960 zbr->offs, zbr->len, 960 zbr->offs, zbr->len,
961 dbg_snprintf_key(c, &zbr->key, 961 dbg_snprintf_key(c, &zbr->key,
@@ -969,16 +969,16 @@ void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat)
969{ 969{
970 int i; 970 int i;
971 971
972 printk(KERN_DEBUG "(pid %d) start dumping heap cat %d (%d elements)\n", 972 printk(KERN_ERR "(pid %d) start dumping heap cat %d (%d elements)\n",
973 current->pid, cat, heap->cnt); 973 current->pid, cat, heap->cnt);
974 for (i = 0; i < heap->cnt; i++) { 974 for (i = 0; i < heap->cnt; i++) {
975 struct ubifs_lprops *lprops = heap->arr[i]; 975 struct ubifs_lprops *lprops = heap->arr[i];
976 976
977 printk(KERN_DEBUG "\t%d. LEB %d hpos %d free %d dirty %d " 977 printk(KERN_ERR "\t%d. LEB %d hpos %d free %d dirty %d "
978 "flags %d\n", i, lprops->lnum, lprops->hpos, 978 "flags %d\n", i, lprops->lnum, lprops->hpos,
979 lprops->free, lprops->dirty, lprops->flags); 979 lprops->free, lprops->dirty, lprops->flags);
980 } 980 }
981 printk(KERN_DEBUG "(pid %d) finish dumping heap\n", current->pid); 981 printk(KERN_ERR "(pid %d) finish dumping heap\n", current->pid);
982} 982}
983 983
984void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, 984void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
@@ -986,15 +986,15 @@ void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
986{ 986{
987 int i; 987 int i;
988 988
989 printk(KERN_DEBUG "(pid %d) dumping pnode:\n", current->pid); 989 printk(KERN_ERR "(pid %d) dumping pnode:\n", current->pid);
990 printk(KERN_DEBUG "\taddress %zx parent %zx cnext %zx\n", 990 printk(KERN_ERR "\taddress %zx parent %zx cnext %zx\n",
991 (size_t)pnode, (size_t)parent, (size_t)pnode->cnext); 991 (size_t)pnode, (size_t)parent, (size_t)pnode->cnext);
992 printk(KERN_DEBUG "\tflags %lu iip %d level %d num %d\n", 992 printk(KERN_ERR "\tflags %lu iip %d level %d num %d\n",
993 pnode->flags, iip, pnode->level, pnode->num); 993 pnode->flags, iip, pnode->level, pnode->num);
994 for (i = 0; i < UBIFS_LPT_FANOUT; i++) { 994 for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
995 struct ubifs_lprops *lp = &pnode->lprops[i]; 995 struct ubifs_lprops *lp = &pnode->lprops[i];
996 996
997 printk(KERN_DEBUG "\t%d: free %d dirty %d flags %d lnum %d\n", 997 printk(KERN_ERR "\t%d: free %d dirty %d flags %d lnum %d\n",
998 i, lp->free, lp->dirty, lp->flags, lp->lnum); 998 i, lp->free, lp->dirty, lp->flags, lp->lnum);
999 } 999 }
1000} 1000}
@@ -1004,20 +1004,20 @@ void dbg_dump_tnc(struct ubifs_info *c)
1004 struct ubifs_znode *znode; 1004 struct ubifs_znode *znode;
1005 int level; 1005 int level;
1006 1006
1007 printk(KERN_DEBUG "\n"); 1007 printk(KERN_ERR "\n");
1008 printk(KERN_DEBUG "(pid %d) start dumping TNC tree\n", current->pid); 1008 printk(KERN_ERR "(pid %d) start dumping TNC tree\n", current->pid);
1009 znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL); 1009 znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL);
1010 level = znode->level; 1010 level = znode->level;
1011 printk(KERN_DEBUG "== Level %d ==\n", level); 1011 printk(KERN_ERR "== Level %d ==\n", level);
1012 while (znode) { 1012 while (znode) {
1013 if (level != znode->level) { 1013 if (level != znode->level) {
1014 level = znode->level; 1014 level = znode->level;
1015 printk(KERN_DEBUG "== Level %d ==\n", level); 1015 printk(KERN_ERR "== Level %d ==\n", level);
1016 } 1016 }
1017 dbg_dump_znode(c, znode); 1017 dbg_dump_znode(c, znode);
1018 znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode); 1018 znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode);
1019 } 1019 }
1020 printk(KERN_DEBUG "(pid %d) finish dumping TNC tree\n", current->pid); 1020 printk(KERN_ERR "(pid %d) finish dumping TNC tree\n", current->pid);
1021} 1021}
1022 1022
1023static int dump_znode(struct ubifs_info *c, struct ubifs_znode *znode, 1023static int dump_znode(struct ubifs_info *c, struct ubifs_znode *znode,
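Every debug.c hunk above makes the same substitution: these dump helpers
typically run only after corruption or an assertion failure has already been
detected, yet they printed at KERN_DEBUG, which the default console_loglevel
suppresses, so the dumps were invisible exactly when they mattered. A minimal
sketch of that gating, outside the kernel (console_loglevel here, the
LOGLEVEL_* values and emit() are illustrative stand-ins, not kernel API):

	#include <stdio.h>

	#define LOGLEVEL_ERR	3	/* passes the default filter */
	#define LOGLEVEL_DEBUG	7	/* dropped unless verbosity is raised */

	static int console_loglevel = 4;	/* typical default: show < 4 */

	static void emit(int level, const char *msg)
	{
		if (level < console_loglevel)	/* same shape as printk's check */
			printf("%s\n", msg);
	}

	/* emit(LOGLEVEL_DEBUG, ...) prints nothing at the default level,
	 * emit(LOGLEVEL_ERR, ...) always prints, which is why the dump
	 * functions switch to KERN_ERR. */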
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index ad1a6fee6010..9f717655df18 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -164,9 +164,7 @@ struct ubifs_global_debug_info {
164#define dbg_dump_stack() dump_stack() 164#define dbg_dump_stack() dump_stack()
165 165
166#define dbg_err(fmt, ...) do { \ 166#define dbg_err(fmt, ...) do { \
167 spin_lock(&dbg_lock); \
168 ubifs_err(fmt, ##__VA_ARGS__); \ 167 ubifs_err(fmt, ##__VA_ARGS__); \
169 spin_unlock(&dbg_lock); \
170} while (0) 168} while (0)
171 169
172#define ubifs_dbg_msg(type, fmt, ...) \ 170#define ubifs_dbg_msg(type, fmt, ...) \
@@ -217,7 +215,6 @@ struct ubifs_global_debug_info {
217/* Additional recovery messages */ 215/* Additional recovery messages */
218#define dbg_rcvry(fmt, ...) ubifs_dbg_msg("rcvry", fmt, ##__VA_ARGS__) 216#define dbg_rcvry(fmt, ...) ubifs_dbg_msg("rcvry", fmt, ##__VA_ARGS__)
219 217
220extern spinlock_t dbg_lock;
221extern struct ubifs_global_debug_info ubifs_dbg; 218extern struct ubifs_global_debug_info ubifs_dbg;
222 219
223static inline int dbg_is_chk_gen(const struct ubifs_info *c) 220static inline int dbg_is_chk_gen(const struct ubifs_info *c)
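With the lock calls gone, dbg_err() keeps only its do { } while (0) wrapper:
ubifs_err() is a single printk call and needs no serialization, while
dbg_lock itself becomes static to debug.c (first hunk above), where it still
keeps multi-line dumps from interleaving. The wrapper is retained so the
macro parses as one statement; a sketch of the caller shape it protects (the
if/else caller is illustrative):

	#define dbg_err(fmt, ...) do { \
		ubifs_err(fmt, ##__VA_ARGS__); \
	} while (0)

	if (err)
		dbg_err("unexpected return value %d", err);
	else			/* without do/while(0), a multi-statement */
		err = 0;	/* macro body would break this else */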
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index d6fe1c79f18b..ec9f1870ab7f 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -566,6 +566,7 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
566 int sz_change = CALC_DENT_SIZE(dentry->d_name.len); 566 int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
567 int err, budgeted = 1; 567 int err, budgeted = 1;
568 struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 }; 568 struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 };
569 unsigned int saved_nlink = inode->i_nlink;
569 570
570 /* 571 /*
571 * Budget request settings: deletion direntry, deletion inode (+1 for 572 * Budget request settings: deletion direntry, deletion inode (+1 for
@@ -613,7 +614,7 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
613out_cancel: 614out_cancel:
614 dir->i_size += sz_change; 615 dir->i_size += sz_change;
615 dir_ui->ui_size = dir->i_size; 616 dir_ui->ui_size = dir->i_size;
616 inc_nlink(inode); 617 set_nlink(inode, saved_nlink);
617 unlock_2_inodes(dir, inode); 618 unlock_2_inodes(dir, inode);
618 if (budgeted) 619 if (budgeted)
619 ubifs_release_budget(c, &req); 620 ubifs_release_budget(c, &req);
@@ -704,8 +705,7 @@ out_cancel:
704 dir->i_size += sz_change; 705 dir->i_size += sz_change;
705 dir_ui->ui_size = dir->i_size; 706 dir_ui->ui_size = dir->i_size;
706 inc_nlink(dir); 707 inc_nlink(dir);
707 inc_nlink(inode); 708 set_nlink(inode, 2);
708 inc_nlink(inode);
709 unlock_2_inodes(dir, inode); 709 unlock_2_inodes(dir, inode);
710 if (budgeted) 710 if (budgeted)
711 ubifs_release_budget(c, &req); 711 ubifs_release_budget(c, &req);
@@ -977,6 +977,7 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
977 struct ubifs_budget_req ino_req = { .dirtied_ino = 1, 977 struct ubifs_budget_req ino_req = { .dirtied_ino = 1,
978 .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) }; 978 .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) };
979 struct timespec time; 979 struct timespec time;
980 unsigned int saved_nlink;
980 981
981 /* 982 /*
982 * Budget request settings: deletion direntry, new direntry, removing 983 * Budget request settings: deletion direntry, new direntry, removing
@@ -1059,13 +1060,14 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
1059 if (unlink) { 1060 if (unlink) {
1060 /* 1061 /*
1061 * Directories cannot have hard-links, so if this is a 1062 * Directories cannot have hard-links, so if this is a
1062 * directory, decrement its @i_nlink twice because an empty 1063 * directory, just clear @i_nlink.
1063 * directory has @i_nlink 2.
1064 */ 1064 */
1065 saved_nlink = new_inode->i_nlink;
1065 if (is_dir) 1066 if (is_dir)
1067 clear_nlink(new_inode);
1068 else
1066 drop_nlink(new_inode); 1069 drop_nlink(new_inode);
1067 new_inode->i_ctime = time; 1070 new_inode->i_ctime = time;
1068 drop_nlink(new_inode);
1069 } else { 1071 } else {
1070 new_dir->i_size += new_sz; 1072 new_dir->i_size += new_sz;
1071 ubifs_inode(new_dir)->ui_size = new_dir->i_size; 1073 ubifs_inode(new_dir)->ui_size = new_dir->i_size;
@@ -1102,9 +1104,7 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
1102 1104
1103out_cancel: 1105out_cancel:
1104 if (unlink) { 1106 if (unlink) {
1105 if (is_dir) 1107 set_nlink(new_inode, saved_nlink);
1106 inc_nlink(new_inode);
1107 inc_nlink(new_inode);
1108 } else { 1108 } else {
1109 new_dir->i_size -= new_sz; 1109 new_dir->i_size -= new_sz;
1110 ubifs_inode(new_dir)->ui_size = new_dir->i_size; 1110 ubifs_inode(new_dir)->ui_size = new_dir->i_size;
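All three dir.c fixes above replace the same fragile idiom: the error paths
used to rebuild the link count with one or two inc_nlink() calls, which can
trip the VFS warning once i_nlink has already reached zero and gets the count
wrong for directories. Snapshotting the count first and restoring it with
set_nlink() rolls back exactly, whatever the mode. A sketch of the pattern,
where commit_op() is an illustrative stand-in for the journal update done in
the real code:

	unsigned int saved_nlink = inode->i_nlink;	/* snapshot first */

	if (is_dir)
		clear_nlink(inode);	/* empty dir goes 2 -> 0 directly */
	else
		drop_nlink(inode);

	err = commit_op(c, inode);		/* illustrative stand-in */
	if (err)
		set_nlink(inode, saved_nlink);	/* exact rollback, safe
						 * even from i_nlink == 0 */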
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index ee4f43f4bb99..2a935b317232 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -679,7 +679,8 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
679 ret == SCANNED_GARBAGE || 679 ret == SCANNED_GARBAGE ||
680 ret == SCANNED_A_BAD_PAD_NODE || 680 ret == SCANNED_A_BAD_PAD_NODE ||
681 ret == SCANNED_A_CORRUPT_NODE) { 681 ret == SCANNED_A_CORRUPT_NODE) {
682 dbg_rcvry("found corruption - %d", ret); 682 dbg_rcvry("found corruption (%d) at %d:%d",
683 ret, lnum, offs);
683 break; 684 break;
684 } else { 685 } else {
685 dbg_err("unexpected return value %d", ret); 686 dbg_err("unexpected return value %d", ret);
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 6094c5a5d7a8..771f7fb6ce92 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -410,13 +410,23 @@ static int validate_sb(struct ubifs_info *c, struct ubifs_sb_node *sup)
410 } 410 }
411 411
412 if (c->main_lebs < UBIFS_MIN_MAIN_LEBS) { 412 if (c->main_lebs < UBIFS_MIN_MAIN_LEBS) {
413 err = 7; 413 ubifs_err("too few main LEBs count %d, must be at least %d",
414 c->main_lebs, UBIFS_MIN_MAIN_LEBS);
414 goto failed; 415 goto failed;
415 } 416 }
416 417
417 if (c->max_bud_bytes < (long long)c->leb_size * UBIFS_MIN_BUD_LEBS || 418 max_bytes = (long long)c->leb_size * UBIFS_MIN_BUD_LEBS;
418 c->max_bud_bytes > (long long)c->leb_size * c->main_lebs) { 419 if (c->max_bud_bytes < max_bytes) {
419 err = 8; 420 ubifs_err("too small journal (%lld bytes), must be at least "
421 "%lld bytes", c->max_bud_bytes, max_bytes);
422 goto failed;
423 }
424
425 max_bytes = (long long)c->leb_size * c->main_lebs;
426 if (c->max_bud_bytes > max_bytes) {
427 ubifs_err("too large journal size (%lld bytes), only %lld bytes"
428 "available in the main area",
429 c->max_bud_bytes, max_bytes);
420 goto failed; 430 goto failed;
421 } 431 }
422 432
@@ -450,7 +460,6 @@ static int validate_sb(struct ubifs_info *c, struct ubifs_sb_node *sup)
450 goto failed; 460 goto failed;
451 } 461 }
452 462
453 max_bytes = c->main_lebs * (long long)c->leb_size;
454 if (c->rp_size < 0 || max_bytes < c->rp_size) { 463 if (c->rp_size < 0 || max_bytes < c->rp_size) {
455 err = 14; 464 err = 14;
456 goto failed; 465 goto failed;
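Two notes on the validate_sb() hunks: the combined journal-size test is split
in two so each failure names the violated bound instead of the bare
"err = 8", and dropping the max_bytes assignment before the rp_size check is
safe because max_bytes still holds leb_size * main_lebs from the new code at
line 425. Illustrative numbers for the two bounds (the value 3 for
UBIFS_MIN_BUD_LEBS is an assumption here):

	long long leb_size  = 128 * 1024;	/* 128 KiB LEB */
	long long main_lebs = 1000;
	long long min_bud   = leb_size * 3;		/*    393216 bytes */
	long long max_bud   = leb_size * main_lebs;	/* 131072000 bytes */
	/* validate_sb() now rejects max_bud_bytes outside [min_bud, max_bud]
	 * with a message quoting both the value and the bound. */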
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 12e94774aa88..93d59aceaaef 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -84,9 +84,6 @@
84#define INUM_WARN_WATERMARK 0xFFF00000 84#define INUM_WARN_WATERMARK 0xFFF00000
85#define INUM_WATERMARK 0xFFFFFF00 85#define INUM_WATERMARK 0xFFFFFF00
86 86
87/* Largest key size supported in this implementation */
88#define CUR_MAX_KEY_LEN UBIFS_SK_LEN
89
90/* Maximum number of entries in each LPT (LEB category) heap */ 87/* Maximum number of entries in each LPT (LEB category) heap */
91#define LPT_HEAP_SZ 256 88#define LPT_HEAP_SZ 256
92 89
@@ -277,10 +274,10 @@ struct ubifs_old_idx {
277 274
278/* The below union makes it easier to deal with keys */ 275/* The below union makes it easier to deal with keys */
279union ubifs_key { 276union ubifs_key {
280 uint8_t u8[CUR_MAX_KEY_LEN]; 277 uint8_t u8[UBIFS_SK_LEN];
281 uint32_t u32[CUR_MAX_KEY_LEN/4]; 278 uint32_t u32[UBIFS_SK_LEN/4];
282 uint64_t u64[CUR_MAX_KEY_LEN/8]; 279 uint64_t u64[UBIFS_SK_LEN/8];
283 __le32 j32[CUR_MAX_KEY_LEN/4]; 280 __le32 j32[UBIFS_SK_LEN/4];
284}; 281};
285 282
286/** 283/**
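CUR_MAX_KEY_LEN only ever aliased UBIFS_SK_LEN, so the union can size itself
off the on-media constant directly. The u64 member still requires the key
length to be a multiple of 8; a compile-time guard one could place in an init
function (illustrative, not part of this patch):

	BUILD_BUG_ON(UBIFS_SK_LEN % 8);
	BUILD_BUG_ON(sizeof(union ubifs_key) != UBIFS_SK_LEN);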
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 987585bb0a1d..1ba2baaf4367 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -105,7 +105,6 @@ static void udf_add_free_space(struct super_block *sb, u16 partition, u32 cnt)
105} 105}
106 106
107static void udf_bitmap_free_blocks(struct super_block *sb, 107static void udf_bitmap_free_blocks(struct super_block *sb,
108 struct inode *inode,
109 struct udf_bitmap *bitmap, 108 struct udf_bitmap *bitmap,
110 struct kernel_lb_addr *bloc, 109 struct kernel_lb_addr *bloc,
111 uint32_t offset, 110 uint32_t offset,
@@ -172,7 +171,6 @@ error_return:
172} 171}
173 172
174static int udf_bitmap_prealloc_blocks(struct super_block *sb, 173static int udf_bitmap_prealloc_blocks(struct super_block *sb,
175 struct inode *inode,
176 struct udf_bitmap *bitmap, 174 struct udf_bitmap *bitmap,
177 uint16_t partition, uint32_t first_block, 175 uint16_t partition, uint32_t first_block,
178 uint32_t block_count) 176 uint32_t block_count)
@@ -223,7 +221,6 @@ out:
223} 221}
224 222
225static int udf_bitmap_new_block(struct super_block *sb, 223static int udf_bitmap_new_block(struct super_block *sb,
226 struct inode *inode,
227 struct udf_bitmap *bitmap, uint16_t partition, 224 struct udf_bitmap *bitmap, uint16_t partition,
228 uint32_t goal, int *err) 225 uint32_t goal, int *err)
229{ 226{
@@ -349,7 +346,6 @@ error_return:
349} 346}
350 347
351static void udf_table_free_blocks(struct super_block *sb, 348static void udf_table_free_blocks(struct super_block *sb,
352 struct inode *inode,
353 struct inode *table, 349 struct inode *table,
354 struct kernel_lb_addr *bloc, 350 struct kernel_lb_addr *bloc,
355 uint32_t offset, 351 uint32_t offset,
@@ -581,7 +577,6 @@ error_return:
581} 577}
582 578
583static int udf_table_prealloc_blocks(struct super_block *sb, 579static int udf_table_prealloc_blocks(struct super_block *sb,
584 struct inode *inode,
585 struct inode *table, uint16_t partition, 580 struct inode *table, uint16_t partition,
586 uint32_t first_block, uint32_t block_count) 581 uint32_t first_block, uint32_t block_count)
587{ 582{
@@ -643,7 +638,6 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
643} 638}
644 639
645static int udf_table_new_block(struct super_block *sb, 640static int udf_table_new_block(struct super_block *sb,
646 struct inode *inode,
647 struct inode *table, uint16_t partition, 641 struct inode *table, uint16_t partition,
648 uint32_t goal, int *err) 642 uint32_t goal, int *err)
649{ 643{
@@ -743,18 +737,23 @@ void udf_free_blocks(struct super_block *sb, struct inode *inode,
743 struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition]; 737 struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition];
744 738
745 if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) { 739 if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) {
746 udf_bitmap_free_blocks(sb, inode, map->s_uspace.s_bitmap, 740 udf_bitmap_free_blocks(sb, map->s_uspace.s_bitmap,
747 bloc, offset, count); 741 bloc, offset, count);
748 } else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) { 742 } else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) {
749 udf_table_free_blocks(sb, inode, map->s_uspace.s_table, 743 udf_table_free_blocks(sb, map->s_uspace.s_table,
750 bloc, offset, count); 744 bloc, offset, count);
751 } else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) { 745 } else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) {
752 udf_bitmap_free_blocks(sb, inode, map->s_fspace.s_bitmap, 746 udf_bitmap_free_blocks(sb, map->s_fspace.s_bitmap,
753 bloc, offset, count); 747 bloc, offset, count);
754 } else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) { 748 } else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) {
755 udf_table_free_blocks(sb, inode, map->s_fspace.s_table, 749 udf_table_free_blocks(sb, map->s_fspace.s_table,
756 bloc, offset, count); 750 bloc, offset, count);
757 } 751 }
752
753 if (inode) {
754 inode_sub_bytes(inode,
755 ((sector_t)count) << sb->s_blocksize_bits);
756 }
758} 757}
759 758
760inline int udf_prealloc_blocks(struct super_block *sb, 759inline int udf_prealloc_blocks(struct super_block *sb,
@@ -763,29 +762,34 @@ inline int udf_prealloc_blocks(struct super_block *sb,
763 uint32_t block_count) 762 uint32_t block_count)
764{ 763{
765 struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition]; 764 struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition];
765 sector_t allocated;
766 766
767 if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) 767 if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP)
768 return udf_bitmap_prealloc_blocks(sb, inode, 768 allocated = udf_bitmap_prealloc_blocks(sb,
769 map->s_uspace.s_bitmap, 769 map->s_uspace.s_bitmap,
770 partition, first_block, 770 partition, first_block,
771 block_count); 771 block_count);
772 else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) 772 else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE)
773 return udf_table_prealloc_blocks(sb, inode, 773 allocated = udf_table_prealloc_blocks(sb,
774 map->s_uspace.s_table, 774 map->s_uspace.s_table,
775 partition, first_block, 775 partition, first_block,
776 block_count); 776 block_count);
777 else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) 777 else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP)
778 return udf_bitmap_prealloc_blocks(sb, inode, 778 allocated = udf_bitmap_prealloc_blocks(sb,
779 map->s_fspace.s_bitmap, 779 map->s_fspace.s_bitmap,
780 partition, first_block, 780 partition, first_block,
781 block_count); 781 block_count);
782 else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) 782 else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE)
783 return udf_table_prealloc_blocks(sb, inode, 783 allocated = udf_table_prealloc_blocks(sb,
784 map->s_fspace.s_table, 784 map->s_fspace.s_table,
785 partition, first_block, 785 partition, first_block,
786 block_count); 786 block_count);
787 else 787 else
788 return 0; 788 return 0;
789
790 if (inode && allocated > 0)
791 inode_add_bytes(inode, allocated << sb->s_blocksize_bits);
792 return allocated;
789} 793}
790 794
791inline int udf_new_block(struct super_block *sb, 795inline int udf_new_block(struct super_block *sb,
@@ -793,25 +797,29 @@ inline int udf_new_block(struct super_block *sb,
 			 uint16_t partition, uint32_t goal, int *err)
 {
 	struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition];
+	int block;
 
 	if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP)
-		return udf_bitmap_new_block(sb, inode,
+		block = udf_bitmap_new_block(sb,
 					    map->s_uspace.s_bitmap,
 					    partition, goal, err);
 	else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE)
-		return udf_table_new_block(sb, inode,
+		block = udf_table_new_block(sb,
 					   map->s_uspace.s_table,
 					   partition, goal, err);
 	else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP)
-		return udf_bitmap_new_block(sb, inode,
+		block = udf_bitmap_new_block(sb,
 					    map->s_fspace.s_bitmap,
 					    partition, goal, err);
 	else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE)
-		return udf_table_new_block(sb, inode,
+		block = udf_table_new_block(sb,
 					   map->s_fspace.s_table,
 					   partition, goal, err);
 	else {
 		*err = -EIO;
 		return 0;
 	}
+	if (inode && block)
+		inode_add_bytes(inode, sb->s_blocksize);
+	return block;
 }
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 05ab48195be9..7e5aae4bf46f 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -116,6 +116,7 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err)
116 iinfo->i_lenEAttr = 0; 116 iinfo->i_lenEAttr = 0;
117 iinfo->i_lenAlloc = 0; 117 iinfo->i_lenAlloc = 0;
118 iinfo->i_use = 0; 118 iinfo->i_use = 0;
119 iinfo->i_checkpoint = 1;
119 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_AD_IN_ICB)) 120 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_AD_IN_ICB))
120 iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; 121 iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB;
121 else if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) 122 else if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD))
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 7699df7b3198..7d7528008359 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1358,6 +1358,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1358 iinfo->i_unique = le64_to_cpu(fe->uniqueID); 1358 iinfo->i_unique = le64_to_cpu(fe->uniqueID);
1359 iinfo->i_lenEAttr = le32_to_cpu(fe->lengthExtendedAttr); 1359 iinfo->i_lenEAttr = le32_to_cpu(fe->lengthExtendedAttr);
1360 iinfo->i_lenAlloc = le32_to_cpu(fe->lengthAllocDescs); 1360 iinfo->i_lenAlloc = le32_to_cpu(fe->lengthAllocDescs);
1361 iinfo->i_checkpoint = le32_to_cpu(fe->checkpoint);
1361 offset = sizeof(struct fileEntry) + iinfo->i_lenEAttr; 1362 offset = sizeof(struct fileEntry) + iinfo->i_lenEAttr;
1362 } else { 1363 } else {
1363 inode->i_blocks = le64_to_cpu(efe->logicalBlocksRecorded) << 1364 inode->i_blocks = le64_to_cpu(efe->logicalBlocksRecorded) <<
@@ -1379,6 +1380,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1379 iinfo->i_unique = le64_to_cpu(efe->uniqueID); 1380 iinfo->i_unique = le64_to_cpu(efe->uniqueID);
1380 iinfo->i_lenEAttr = le32_to_cpu(efe->lengthExtendedAttr); 1381 iinfo->i_lenEAttr = le32_to_cpu(efe->lengthExtendedAttr);
1381 iinfo->i_lenAlloc = le32_to_cpu(efe->lengthAllocDescs); 1382 iinfo->i_lenAlloc = le32_to_cpu(efe->lengthAllocDescs);
1383 iinfo->i_checkpoint = le32_to_cpu(efe->checkpoint);
1382 offset = sizeof(struct extendedFileEntry) + 1384 offset = sizeof(struct extendedFileEntry) +
1383 iinfo->i_lenEAttr; 1385 iinfo->i_lenEAttr;
1384 } 1386 }
@@ -1495,6 +1497,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
1495 struct buffer_head *bh = NULL; 1497 struct buffer_head *bh = NULL;
1496 struct fileEntry *fe; 1498 struct fileEntry *fe;
1497 struct extendedFileEntry *efe; 1499 struct extendedFileEntry *efe;
1500 uint64_t lb_recorded;
1498 uint32_t udfperms; 1501 uint32_t udfperms;
1499 uint16_t icbflags; 1502 uint16_t icbflags;
1500 uint16_t crclen; 1503 uint16_t crclen;
@@ -1589,13 +1592,18 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 		dsea->minorDeviceIdent = cpu_to_le32(iminor(inode));
 	}
 
+	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
+		lb_recorded = 0; /* No extents => no blocks! */
+	else
+		lb_recorded =
+			(inode->i_blocks + (1 << (blocksize_bits - 9)) - 1) >>
+			(blocksize_bits - 9);
+
 	if (iinfo->i_efe == 0) {
 		memcpy(bh->b_data + sizeof(struct fileEntry),
 		       iinfo->i_ext.i_data,
 		       inode->i_sb->s_blocksize - sizeof(struct fileEntry));
-		fe->logicalBlocksRecorded = cpu_to_le64(
-			(inode->i_blocks + (1 << (blocksize_bits - 9)) - 1) >>
-			(blocksize_bits - 9));
+		fe->logicalBlocksRecorded = cpu_to_le64(lb_recorded);
 
 		udf_time_to_disk_stamp(&fe->accessTime, inode->i_atime);
 		udf_time_to_disk_stamp(&fe->modificationTime, inode->i_mtime);
@@ -1607,6 +1615,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
1607 fe->uniqueID = cpu_to_le64(iinfo->i_unique); 1615 fe->uniqueID = cpu_to_le64(iinfo->i_unique);
1608 fe->lengthExtendedAttr = cpu_to_le32(iinfo->i_lenEAttr); 1616 fe->lengthExtendedAttr = cpu_to_le32(iinfo->i_lenEAttr);
1609 fe->lengthAllocDescs = cpu_to_le32(iinfo->i_lenAlloc); 1617 fe->lengthAllocDescs = cpu_to_le32(iinfo->i_lenAlloc);
1618 fe->checkpoint = cpu_to_le32(iinfo->i_checkpoint);
1610 fe->descTag.tagIdent = cpu_to_le16(TAG_IDENT_FE); 1619 fe->descTag.tagIdent = cpu_to_le16(TAG_IDENT_FE);
1611 crclen = sizeof(struct fileEntry); 1620 crclen = sizeof(struct fileEntry);
1612 } else { 1621 } else {
@@ -1615,9 +1624,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 		       inode->i_sb->s_blocksize -
 					sizeof(struct extendedFileEntry));
 		efe->objectSize = cpu_to_le64(inode->i_size);
-		efe->logicalBlocksRecorded = cpu_to_le64(
-			(inode->i_blocks + (1 << (blocksize_bits - 9)) - 1) >>
-			(blocksize_bits - 9));
+		efe->logicalBlocksRecorded = cpu_to_le64(lb_recorded);
 
 		if (iinfo->i_crtime.tv_sec > inode->i_atime.tv_sec ||
 		    (iinfo->i_crtime.tv_sec == inode->i_atime.tv_sec &&
@@ -1646,6 +1653,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
1646 efe->uniqueID = cpu_to_le64(iinfo->i_unique); 1653 efe->uniqueID = cpu_to_le64(iinfo->i_unique);
1647 efe->lengthExtendedAttr = cpu_to_le32(iinfo->i_lenEAttr); 1654 efe->lengthExtendedAttr = cpu_to_le32(iinfo->i_lenEAttr);
1648 efe->lengthAllocDescs = cpu_to_le32(iinfo->i_lenAlloc); 1655 efe->lengthAllocDescs = cpu_to_le32(iinfo->i_lenAlloc);
1656 efe->checkpoint = cpu_to_le32(iinfo->i_checkpoint);
1649 efe->descTag.tagIdent = cpu_to_le16(TAG_IDENT_EFE); 1657 efe->descTag.tagIdent = cpu_to_le16(TAG_IDENT_EFE);
1650 crclen = sizeof(struct extendedFileEntry); 1658 crclen = sizeof(struct extendedFileEntry);
1651 } 1659 }
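[Annotation: with the change above, both file entry variants now share one lb_recorded value. inode->i_blocks counts 512-byte sectors, so the expression rounds that count up to whole filesystem blocks. The same arithmetic in isolation, as a sketch assuming blocksize_bits >= 9:]

	/* Round a count of 512-byte sectors up to filesystem blocks. */
	static uint64_t sectors_to_fs_blocks(uint64_t i_blocks,
					     unsigned int blocksize_bits)
	{
		unsigned int shift = blocksize_bits - 9; /* 512 << shift == block size */

		return (i_blocks + (1ULL << shift) - 1) >> shift;
	}
	/* e.g. 4K blocks (shift == 3): 9 sectors -> (9 + 7) >> 3 == 2 blocks */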
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 85067b4c7e14..ac8a348dcb69 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -950,11 +950,8 @@ static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index)
 	else
 		bitmap = vzalloc(size); /* TODO: get rid of vzalloc */
 
-	if (bitmap == NULL) {
-		udf_err(sb, "Unable to allocate space for bitmap and %d buffer_head pointers\n",
-			nr_groups);
+	if (bitmap == NULL)
 		return NULL;
-	}
 
 	bitmap->s_block_bitmap = (struct buffer_head **)(bitmap + 1);
 	bitmap->s_nr_groups = nr_groups;
diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h
index d1bd31ea724e..bb8309dcd5c1 100644
--- a/fs/udf/udf_i.h
+++ b/fs/udf/udf_i.h
@@ -23,6 +23,7 @@ struct udf_inode_info {
23 __u64 i_lenExtents; 23 __u64 i_lenExtents;
24 __u32 i_next_alloc_block; 24 __u32 i_next_alloc_block;
25 __u32 i_next_alloc_goal; 25 __u32 i_next_alloc_goal;
26 __u32 i_checkpoint;
26 unsigned i_alloc_type : 3; 27 unsigned i_alloc_type : 3;
27 unsigned i_efe : 1; /* extendedFileEntry */ 28 unsigned i_efe : 1; /* extendedFileEntry */
28 unsigned i_use : 1; /* unallocSpaceEntry */ 29 unsigned i_use : 1; /* unallocSpaceEntry */
diff --git a/fs/xattr.c b/fs/xattr.c
index 82f43376c7cd..d6dfd247bb2f 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -16,7 +16,7 @@
 #include <linux/security.h>
 #include <linux/evm.h>
 #include <linux/syscalls.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/fsnotify.h>
 #include <linux/audit.h>
 #include <asm/uaccess.h>
diff --git a/fs/xattr_acl.c b/fs/xattr_acl.c
index 8d5a506c82eb..69d06b07b169 100644
--- a/fs/xattr_acl.c
+++ b/fs/xattr_acl.c
@@ -5,7 +5,7 @@
  * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
  */
 
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/fs.h>
 #include <linux/posix_acl_xattr.h>
 #include <linux/gfp.h>
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 427a4e82a588..0a9977983f92 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -96,9 +96,6 @@ xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \
 				   xfs_qm_bhv.o \
 				   xfs_qm.o \
 				   xfs_quotaops.o
-ifeq ($(CONFIG_XFS_QUOTA),y)
-xfs-$(CONFIG_PROC_FS)		+= xfs_qm_stats.o
-endif
 xfs-$(CONFIG_XFS_RT)		+= xfs_rtalloc.o
 xfs-$(CONFIG_XFS_POSIX_ACL)	+= xfs_acl.o
 xfs-$(CONFIG_PROC_FS)		+= xfs_stats.o
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index ce84ffd0264c..0f0df2759b09 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -35,6 +35,7 @@
 #include "xfs_error.h"
 #include "xfs_trace.h"
 
+struct workqueue_struct *xfs_alloc_wq;
 
 #define	XFS_ABSDIFF(a,b)	(((a) <= (b)) ? ((b) - (a)) : ((a) - (b)))
 
@@ -68,7 +69,7 @@ xfs_alloc_lookup_eq(
  * Lookup the first record greater than or equal to [bno, len]
  * in the btree given by cur.
  */
-STATIC int				/* error */
+int					/* error */
 xfs_alloc_lookup_ge(
 	struct xfs_btree_cur	*cur,	/* btree cursor */
 	xfs_agblock_t		bno,	/* starting block of extent */
@@ -2207,7 +2208,7 @@ xfs_alloc_read_agf(
  * group or loop over the allocation groups to find the result.
  */
 int				/* error */
-xfs_alloc_vextent(
+__xfs_alloc_vextent(
 	xfs_alloc_arg_t	*args)	/* allocation argument structure */
 {
 	xfs_agblock_t	agsize;	/* allocation group size */
@@ -2417,6 +2418,37 @@ error0:
 	return error;
 }
 
+static void
+xfs_alloc_vextent_worker(
+	struct work_struct	*work)
+{
+	struct xfs_alloc_arg	*args = container_of(work,
+						     struct xfs_alloc_arg, work);
+	unsigned long		pflags;
+
+	/* we are in a transaction context here */
+	current_set_flags_nested(&pflags, PF_FSTRANS);
+
+	args->result = __xfs_alloc_vextent(args);
+	complete(args->done);
+
+	current_restore_flags_nested(&pflags, PF_FSTRANS);
+}
+
+
+int				/* error */
+xfs_alloc_vextent(
+	xfs_alloc_arg_t	*args)	/* allocation argument structure */
+{
+	DECLARE_COMPLETION_ONSTACK(done);
+
+	args->done = &done;
+	INIT_WORK(&args->work, xfs_alloc_vextent_worker);
+	queue_work(xfs_alloc_wq, &args->work);
+	wait_for_completion(&done);
+	return args->result;
+}
+
 /*
  * Free an extent.
  * Just break up the extent address and hand off to xfs_free_ag_extent
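[Annotation: the new xfs_alloc_vextent() wrapper ships the actual allocation off to a workqueue and sleeps on a completion, which puts the deep allocation call chain on a fresh worker stack while the caller blocks. The same synchronous hand-off idiom reduced to a self-contained sketch; do_expensive_work() is a hypothetical stand-in:]

	struct offload_arg {
		struct work_struct	work;
		struct completion	*done;
		int			result;
	};

	static void offload_worker(struct work_struct *work)
	{
		struct offload_arg *arg = container_of(work,
						       struct offload_arg, work);

		arg->result = do_expensive_work();	/* hypothetical payload */
		complete(arg->done);			/* wake the submitter */
	}

	static int offload_sync(struct workqueue_struct *wq)
	{
		DECLARE_COMPLETION_ONSTACK(done);
		struct offload_arg arg = { .done = &done };

		INIT_WORK_ONSTACK(&arg.work, offload_worker);
		queue_work(wq, &arg.work);
		wait_for_completion(&done);	/* worker ran on its own stack */
		return arg.result;
	}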
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 2f52b924be79..3a7e7d8f8ded 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -25,6 +25,8 @@ struct xfs_perag;
25struct xfs_trans; 25struct xfs_trans;
26struct xfs_busy_extent; 26struct xfs_busy_extent;
27 27
28extern struct workqueue_struct *xfs_alloc_wq;
29
28/* 30/*
29 * Freespace allocation types. Argument to xfs_alloc_[v]extent. 31 * Freespace allocation types. Argument to xfs_alloc_[v]extent.
30 */ 32 */
@@ -119,6 +121,9 @@ typedef struct xfs_alloc_arg {
119 char isfl; /* set if is freelist blocks - !acctg */ 121 char isfl; /* set if is freelist blocks - !acctg */
120 char userdata; /* set if this is user data */ 122 char userdata; /* set if this is user data */
121 xfs_fsblock_t firstblock; /* io first block allocated */ 123 xfs_fsblock_t firstblock; /* io first block allocated */
124 struct completion *done;
125 struct work_struct work;
126 int result;
122} xfs_alloc_arg_t; 127} xfs_alloc_arg_t;
123 128
124/* 129/*
@@ -243,6 +248,13 @@ xfs_alloc_lookup_le(
243 xfs_extlen_t len, /* length of extent */ 248 xfs_extlen_t len, /* length of extent */
244 int *stat); /* success/failure */ 249 int *stat); /* success/failure */
245 250
251int /* error */
252xfs_alloc_lookup_ge(
253 struct xfs_btree_cur *cur, /* btree cursor */
254 xfs_agblock_t bno, /* starting block of extent */
255 xfs_extlen_t len, /* length of extent */
256 int *stat); /* success/failure */
257
246int /* error */ 258int /* error */
247xfs_alloc_get_rec( 259xfs_alloc_get_rec(
248 struct xfs_btree_cur *cur, /* btree cursor */ 260 struct xfs_btree_cur *cur, /* btree cursor */
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 74b9baf36ac3..0dbb9e70fe21 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -26,6 +26,7 @@
26#include "xfs_bmap_btree.h" 26#include "xfs_bmap_btree.h"
27#include "xfs_dinode.h" 27#include "xfs_dinode.h"
28#include "xfs_inode.h" 28#include "xfs_inode.h"
29#include "xfs_inode_item.h"
29#include "xfs_alloc.h" 30#include "xfs_alloc.h"
30#include "xfs_error.h" 31#include "xfs_error.h"
31#include "xfs_rw.h" 32#include "xfs_rw.h"
@@ -99,23 +100,6 @@ xfs_destroy_ioend(
 }
 
 /*
- * If the end of the current ioend is beyond the current EOF,
- * return the new EOF value, otherwise zero.
- */
-STATIC xfs_fsize_t
-xfs_ioend_new_eof(
-	xfs_ioend_t		*ioend)
-{
-	xfs_inode_t		*ip = XFS_I(ioend->io_inode);
-	xfs_fsize_t		isize;
-	xfs_fsize_t		bsize;
-
-	bsize = ioend->io_offset + ioend->io_size;
-	isize = MIN(i_size_read(VFS_I(ip)), bsize);
-	return isize > ip->i_d.di_size ? isize : 0;
-}
-
-/*
  * Fast and loose check if this write could update the on-disk inode size.
  */
 static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
@@ -124,32 +108,65 @@ static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
 		XFS_I(ioend->io_inode)->i_d.di_size;
 }
 
+STATIC int
+xfs_setfilesize_trans_alloc(
+	struct xfs_ioend	*ioend)
+{
+	struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;
+	struct xfs_trans	*tp;
+	int			error;
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
+
+	error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		return error;
+	}
+
+	ioend->io_append_trans = tp;
+
+	/*
+	 * We hand off the transaction to the completion thread now, so
+	 * clear the flag here.
+	 */
+	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+	return 0;
+}
+
 /*
  * Update on-disk file size now that data has been written to disk.
- *
- * This function does not block as blocking on the inode lock in IO completion
- * can lead to IO completion order dependency deadlocks.. If it can't get the
- * inode ilock it will return EAGAIN. Callers must handle this.
  */
 STATIC int
 xfs_setfilesize(
-	xfs_ioend_t		*ioend)
+	struct xfs_ioend	*ioend)
 {
-	xfs_inode_t		*ip = XFS_I(ioend->io_inode);
+	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
+	struct xfs_trans	*tp = ioend->io_append_trans;
 	xfs_fsize_t		isize;
 
-	if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
-		return EAGAIN;
+	/*
+	 * The transaction was allocated in the I/O submission thread,
+	 * thus we need to mark ourselves as being in a transaction
+	 * manually.
+	 */
+	current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
 
-	isize = xfs_ioend_new_eof(ioend);
-	if (isize) {
-		trace_xfs_setfilesize(ip, ioend->io_offset, ioend->io_size);
-		ip->i_d.di_size = isize;
-		xfs_mark_inode_dirty(ip);
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size);
+	if (!isize) {
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		xfs_trans_cancel(tp, 0);
+		return 0;
 	}
 
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	return 0;
+	trace_xfs_setfilesize(ip, ioend->io_offset, ioend->io_size);
+
+	ip->i_d.di_size = isize;
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+	return xfs_trans_commit(tp, 0);
 }
 
155/* 172/*
@@ -163,10 +180,12 @@ xfs_finish_ioend(
 	struct xfs_ioend	*ioend)
 {
 	if (atomic_dec_and_test(&ioend->io_remaining)) {
+		struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;
+
 		if (ioend->io_type == IO_UNWRITTEN)
-			queue_work(xfsconvertd_workqueue, &ioend->io_work);
-		else if (xfs_ioend_is_append(ioend))
-			queue_work(xfsdatad_workqueue, &ioend->io_work);
+			queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
+		else if (ioend->io_append_trans)
+			queue_work(mp->m_data_workqueue, &ioend->io_work);
 		else
 			xfs_destroy_ioend(ioend);
 	}
@@ -195,35 +214,36 @@ xfs_end_io(
 	 * range to normal written extents after the data I/O has finished.
 	 */
 	if (ioend->io_type == IO_UNWRITTEN) {
+		/*
+		 * For buffered I/O we never preallocate a transaction when
+		 * doing the unwritten extent conversion, but for direct I/O
+		 * we do not know if we are converting an unwritten extent
+		 * or not at the point where we preallocate the transaction.
+		 */
+		if (ioend->io_append_trans) {
+			ASSERT(ioend->io_isdirect);
+
+			current_set_flags_nested(
+				&ioend->io_append_trans->t_pflags, PF_FSTRANS);
+			xfs_trans_cancel(ioend->io_append_trans, 0);
+		}
+
 		error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
 						  ioend->io_size);
 		if (error) {
 			ioend->io_error = -error;
 			goto done;
 		}
+	} else if (ioend->io_append_trans) {
+		error = xfs_setfilesize(ioend);
+		if (error)
+			ioend->io_error = -error;
+	} else {
+		ASSERT(!xfs_ioend_is_append(ioend));
 	}
 
-	/*
-	 * We might have to update the on-disk file size after extending
-	 * writes.
-	 */
-	error = xfs_setfilesize(ioend);
-	ASSERT(!error || error == EAGAIN);
-
 done:
-	/*
-	 * If we didn't complete processing of the ioend, requeue it to the
-	 * tail of the workqueue for another attempt later. Otherwise destroy
-	 * it.
-	 */
-	if (error == EAGAIN) {
-		atomic_inc(&ioend->io_remaining);
-		xfs_finish_ioend(ioend);
-		/* ensure we don't spin on blocked ioends */
-		delay(1);
-	} else {
-		xfs_destroy_ioend(ioend);
-	}
+	xfs_destroy_ioend(ioend);
 }
228 248
229/* 249/*
@@ -259,6 +279,7 @@ xfs_alloc_ioend(
259 */ 279 */
260 atomic_set(&ioend->io_remaining, 1); 280 atomic_set(&ioend->io_remaining, 1);
261 ioend->io_isasync = 0; 281 ioend->io_isasync = 0;
282 ioend->io_isdirect = 0;
262 ioend->io_error = 0; 283 ioend->io_error = 0;
263 ioend->io_list = NULL; 284 ioend->io_list = NULL;
264 ioend->io_type = type; 285 ioend->io_type = type;
@@ -269,6 +290,7 @@ xfs_alloc_ioend(
269 ioend->io_size = 0; 290 ioend->io_size = 0;
270 ioend->io_iocb = NULL; 291 ioend->io_iocb = NULL;
271 ioend->io_result = 0; 292 ioend->io_result = 0;
293 ioend->io_append_trans = NULL;
272 294
273 INIT_WORK(&ioend->io_work, xfs_end_io); 295 INIT_WORK(&ioend->io_work, xfs_end_io);
274 return ioend; 296 return ioend;
@@ -379,14 +401,6 @@ xfs_submit_ioend_bio(
379 atomic_inc(&ioend->io_remaining); 401 atomic_inc(&ioend->io_remaining);
380 bio->bi_private = ioend; 402 bio->bi_private = ioend;
381 bio->bi_end_io = xfs_end_bio; 403 bio->bi_end_io = xfs_end_bio;
382
383 /*
384 * If the I/O is beyond EOF we mark the inode dirty immediately
385 * but don't update the inode size until I/O completion.
386 */
387 if (xfs_ioend_new_eof(ioend))
388 xfs_mark_inode_dirty(XFS_I(ioend->io_inode));
389
390 submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio); 404 submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio);
391} 405}
392 406
@@ -1033,8 +1047,20 @@ xfs_vm_writepage(
 					wbc, end_index);
 	}
 
-	if (iohead)
+	if (iohead) {
+		/*
+		 * Reserve log space if we might write beyond the on-disk
+		 * inode size.
+		 */
+		if (ioend->io_type != IO_UNWRITTEN &&
+		    xfs_ioend_is_append(ioend)) {
+			err = xfs_setfilesize_trans_alloc(ioend);
+			if (err)
+				goto error;
+		}
+
 		xfs_submit_ioend(wbc, iohead);
+	}
 
 	return 0;
 
@@ -1314,17 +1340,32 @@ xfs_vm_direct_IO(
 {
 	struct inode		*inode = iocb->ki_filp->f_mapping->host;
 	struct block_device	*bdev = xfs_find_bdev_for_inode(inode);
+	struct xfs_ioend	*ioend = NULL;
 	ssize_t			ret;
 
 	if (rw & WRITE) {
-		iocb->private = xfs_alloc_ioend(inode, IO_DIRECT);
+		size_t size = iov_length(iov, nr_segs);
+
+		/*
+		 * We need to preallocate a transaction for a size update
+		 * here.  In the case that this write both updates the size
+		 * and converts at least one unwritten extent we will cancel
+		 * the still clean transaction after the I/O has finished.
+		 */
+		iocb->private = ioend = xfs_alloc_ioend(inode, IO_DIRECT);
+		if (offset + size > XFS_I(inode)->i_d.di_size) {
+			ret = xfs_setfilesize_trans_alloc(ioend);
+			if (ret)
+				goto out_destroy_ioend;
+			ioend->io_isdirect = 1;
+		}
 
 		ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
 					   offset, nr_segs,
 					   xfs_get_blocks_direct,
 					   xfs_end_io_direct_write, NULL, 0);
 		if (ret != -EIOCBQUEUED && iocb->private)
-			xfs_destroy_ioend(iocb->private);
+			goto out_trans_cancel;
 	} else {
 		ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
 					   offset, nr_segs,
@@ -1333,6 +1374,16 @@ xfs_vm_direct_IO(
 	}
 
 	return ret;
+
+out_trans_cancel:
+	if (ioend->io_append_trans) {
+		current_set_flags_nested(&ioend->io_append_trans->t_pflags,
+					 PF_FSTRANS);
+		xfs_trans_cancel(ioend->io_append_trans, 0);
+	}
+out_destroy_ioend:
+	xfs_destroy_ioend(ioend);
+	return ret;
 }
1337 1388
1338STATIC void 1389STATIC void
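[Annotation: the common thread in these aops changes is that the file-size-update transaction is now reserved in the submission context, where blocking is safe, and handed to I/O completion to be committed or cancelled. A condensed, illustrative-only sketch of the decision made at submit time, using the names introduced by the diff:]

	/* Sketch: reserve log space before the write is queued whenever
	 * completion might have to move the on-disk EOF. */
	static int prepare_append_write(struct xfs_ioend *ioend)
	{
		if (ioend->io_type == IO_UNWRITTEN)
			return 0;	/* conversion path manages its own transaction */
		if (!xfs_ioend_is_append(ioend))
			return 0;	/* on-disk size cannot change */
		return xfs_setfilesize_trans_alloc(ioend);
	}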
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index 116dd5c37034..84eafbcb0d9d 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -18,8 +18,6 @@
18#ifndef __XFS_AOPS_H__ 18#ifndef __XFS_AOPS_H__
19#define __XFS_AOPS_H__ 19#define __XFS_AOPS_H__
20 20
21extern struct workqueue_struct *xfsdatad_workqueue;
22extern struct workqueue_struct *xfsconvertd_workqueue;
23extern mempool_t *xfs_ioend_pool; 21extern mempool_t *xfs_ioend_pool;
24 22
25/* 23/*
@@ -48,12 +46,14 @@ typedef struct xfs_ioend {
48 int io_error; /* I/O error code */ 46 int io_error; /* I/O error code */
49 atomic_t io_remaining; /* hold count */ 47 atomic_t io_remaining; /* hold count */
50 unsigned int io_isasync : 1; /* needs aio_complete */ 48 unsigned int io_isasync : 1; /* needs aio_complete */
49 unsigned int io_isdirect : 1;/* direct I/O */
51 struct inode *io_inode; /* file being written to */ 50 struct inode *io_inode; /* file being written to */
52 struct buffer_head *io_buffer_head;/* buffer linked list head */ 51 struct buffer_head *io_buffer_head;/* buffer linked list head */
53 struct buffer_head *io_buffer_tail;/* buffer linked list tail */ 52 struct buffer_head *io_buffer_tail;/* buffer linked list tail */
54 size_t io_size; /* size of the extent */ 53 size_t io_size; /* size of the extent */
55 xfs_off_t io_offset; /* offset in the file */ 54 xfs_off_t io_offset; /* offset in the file */
56 struct work_struct io_work; /* xfsdatad work queue */ 55 struct work_struct io_work; /* xfsdatad work queue */
56 struct xfs_trans *io_append_trans;/* xact. for size update */
57 struct kiocb *io_iocb; 57 struct kiocb *io_iocb;
58 int io_result; 58 int io_result;
59} xfs_ioend_t; 59} xfs_ioend_t;
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 08b9ac644c31..65d61b948ead 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -853,6 +853,8 @@ xfs_attr_shortform_addname(xfs_da_args_t *args)
853{ 853{
854 int newsize, forkoff, retval; 854 int newsize, forkoff, retval;
855 855
856 trace_xfs_attr_sf_addname(args);
857
856 retval = xfs_attr_shortform_lookup(args); 858 retval = xfs_attr_shortform_lookup(args);
857 if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) { 859 if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) {
858 return(retval); 860 return(retval);
@@ -896,6 +898,8 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
896 xfs_dabuf_t *bp; 898 xfs_dabuf_t *bp;
897 int retval, error, committed, forkoff; 899 int retval, error, committed, forkoff;
898 900
901 trace_xfs_attr_leaf_addname(args);
902
899 /* 903 /*
900 * Read the (only) block in the attribute list in. 904 * Read the (only) block in the attribute list in.
901 */ 905 */
@@ -920,6 +924,9 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
920 xfs_da_brelse(args->trans, bp); 924 xfs_da_brelse(args->trans, bp);
921 return(retval); 925 return(retval);
922 } 926 }
927
928 trace_xfs_attr_leaf_replace(args);
929
923 args->op_flags |= XFS_DA_OP_RENAME; /* an atomic rename */ 930 args->op_flags |= XFS_DA_OP_RENAME; /* an atomic rename */
924 args->blkno2 = args->blkno; /* set 2nd entry info*/ 931 args->blkno2 = args->blkno; /* set 2nd entry info*/
925 args->index2 = args->index; 932 args->index2 = args->index;
@@ -1090,6 +1097,8 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
1090 xfs_dabuf_t *bp; 1097 xfs_dabuf_t *bp;
1091 int error, committed, forkoff; 1098 int error, committed, forkoff;
1092 1099
1100 trace_xfs_attr_leaf_removename(args);
1101
1093 /* 1102 /*
1094 * Remove the attribute. 1103 * Remove the attribute.
1095 */ 1104 */
@@ -1223,6 +1232,8 @@ xfs_attr_node_addname(xfs_da_args_t *args)
1223 xfs_mount_t *mp; 1232 xfs_mount_t *mp;
1224 int committed, retval, error; 1233 int committed, retval, error;
1225 1234
1235 trace_xfs_attr_node_addname(args);
1236
1226 /* 1237 /*
1227 * Fill in bucket of arguments/results/context to carry around. 1238 * Fill in bucket of arguments/results/context to carry around.
1228 */ 1239 */
@@ -1249,6 +1260,9 @@ restart:
1249 } else if (retval == EEXIST) { 1260 } else if (retval == EEXIST) {
1250 if (args->flags & ATTR_CREATE) 1261 if (args->flags & ATTR_CREATE)
1251 goto out; 1262 goto out;
1263
1264 trace_xfs_attr_node_replace(args);
1265
1252 args->op_flags |= XFS_DA_OP_RENAME; /* atomic rename op */ 1266 args->op_flags |= XFS_DA_OP_RENAME; /* atomic rename op */
1253 args->blkno2 = args->blkno; /* set 2nd entry info*/ 1267 args->blkno2 = args->blkno; /* set 2nd entry info*/
1254 args->index2 = args->index; 1268 args->index2 = args->index;
@@ -1480,6 +1494,8 @@ xfs_attr_node_removename(xfs_da_args_t *args)
1480 xfs_dabuf_t *bp; 1494 xfs_dabuf_t *bp;
1481 int retval, error, committed, forkoff; 1495 int retval, error, committed, forkoff;
1482 1496
1497 trace_xfs_attr_node_removename(args);
1498
1483 /* 1499 /*
1484 * Tie a string around our finger to remind us where we are. 1500 * Tie a string around our finger to remind us where we are.
1485 */ 1501 */
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index d25eafd4d28d..76d93dc953e1 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -235,6 +235,8 @@ xfs_attr_shortform_create(xfs_da_args_t *args)
235 xfs_inode_t *dp; 235 xfs_inode_t *dp;
236 xfs_ifork_t *ifp; 236 xfs_ifork_t *ifp;
237 237
238 trace_xfs_attr_sf_create(args);
239
238 dp = args->dp; 240 dp = args->dp;
239 ASSERT(dp != NULL); 241 ASSERT(dp != NULL);
240 ifp = dp->i_afp; 242 ifp = dp->i_afp;
@@ -268,6 +270,8 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff)
268 xfs_inode_t *dp; 270 xfs_inode_t *dp;
269 xfs_ifork_t *ifp; 271 xfs_ifork_t *ifp;
270 272
273 trace_xfs_attr_sf_add(args);
274
271 dp = args->dp; 275 dp = args->dp;
272 mp = dp->i_mount; 276 mp = dp->i_mount;
273 dp->i_d.di_forkoff = forkoff; 277 dp->i_d.di_forkoff = forkoff;
@@ -337,6 +341,8 @@ xfs_attr_shortform_remove(xfs_da_args_t *args)
337 xfs_mount_t *mp; 341 xfs_mount_t *mp;
338 xfs_inode_t *dp; 342 xfs_inode_t *dp;
339 343
344 trace_xfs_attr_sf_remove(args);
345
340 dp = args->dp; 346 dp = args->dp;
341 mp = dp->i_mount; 347 mp = dp->i_mount;
342 base = sizeof(xfs_attr_sf_hdr_t); 348 base = sizeof(xfs_attr_sf_hdr_t);
@@ -405,6 +411,8 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args)
405 int i; 411 int i;
406 xfs_ifork_t *ifp; 412 xfs_ifork_t *ifp;
407 413
414 trace_xfs_attr_sf_lookup(args);
415
408 ifp = args->dp->i_afp; 416 ifp = args->dp->i_afp;
409 ASSERT(ifp->if_flags & XFS_IFINLINE); 417 ASSERT(ifp->if_flags & XFS_IFINLINE);
410 sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; 418 sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
@@ -476,6 +484,8 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
476 xfs_dabuf_t *bp; 484 xfs_dabuf_t *bp;
477 xfs_ifork_t *ifp; 485 xfs_ifork_t *ifp;
478 486
487 trace_xfs_attr_sf_to_leaf(args);
488
479 dp = args->dp; 489 dp = args->dp;
480 ifp = dp->i_afp; 490 ifp = dp->i_afp;
481 sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; 491 sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
@@ -775,6 +785,8 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff)
775 char *tmpbuffer; 785 char *tmpbuffer;
776 int error, i; 786 int error, i;
777 787
788 trace_xfs_attr_leaf_to_sf(args);
789
778 dp = args->dp; 790 dp = args->dp;
779 tmpbuffer = kmem_alloc(XFS_LBSIZE(dp->i_mount), KM_SLEEP); 791 tmpbuffer = kmem_alloc(XFS_LBSIZE(dp->i_mount), KM_SLEEP);
780 ASSERT(tmpbuffer != NULL); 792 ASSERT(tmpbuffer != NULL);
@@ -848,6 +860,8 @@ xfs_attr_leaf_to_node(xfs_da_args_t *args)
848 xfs_dablk_t blkno; 860 xfs_dablk_t blkno;
849 int error; 861 int error;
850 862
863 trace_xfs_attr_leaf_to_node(args);
864
851 dp = args->dp; 865 dp = args->dp;
852 bp1 = bp2 = NULL; 866 bp1 = bp2 = NULL;
853 error = xfs_da_grow_inode(args, &blkno); 867 error = xfs_da_grow_inode(args, &blkno);
@@ -911,6 +925,8 @@ xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t blkno, xfs_dabuf_t **bpp)
911 xfs_dabuf_t *bp; 925 xfs_dabuf_t *bp;
912 int error; 926 int error;
913 927
928 trace_xfs_attr_leaf_create(args);
929
914 dp = args->dp; 930 dp = args->dp;
915 ASSERT(dp != NULL); 931 ASSERT(dp != NULL);
916 error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp, 932 error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp,
@@ -948,6 +964,8 @@ xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
948 xfs_dablk_t blkno; 964 xfs_dablk_t blkno;
949 int error; 965 int error;
950 966
967 trace_xfs_attr_leaf_split(state->args);
968
951 /* 969 /*
952 * Allocate space for a new leaf node. 970 * Allocate space for a new leaf node.
953 */ 971 */
@@ -977,10 +995,13 @@ xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
 	 *
 	 * Insert the "new" entry in the correct block.
 	 */
-	if (state->inleaf)
+	if (state->inleaf) {
+		trace_xfs_attr_leaf_add_old(state->args);
 		error = xfs_attr_leaf_add(oldblk->bp, state->args);
-	else
+	} else {
+		trace_xfs_attr_leaf_add_new(state->args);
 		error = xfs_attr_leaf_add(newblk->bp, state->args);
+	}
 
 	/*
 	 * Update last hashval in each block since we added the name.
@@ -1001,6 +1022,8 @@ xfs_attr_leaf_add(xfs_dabuf_t *bp, xfs_da_args_t *args)
1001 xfs_attr_leaf_map_t *map; 1022 xfs_attr_leaf_map_t *map;
1002 int tablesize, entsize, sum, tmp, i; 1023 int tablesize, entsize, sum, tmp, i;
1003 1024
1025 trace_xfs_attr_leaf_add(args);
1026
1004 leaf = bp->data; 1027 leaf = bp->data;
1005 ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); 1028 ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
1006 ASSERT((args->index >= 0) 1029 ASSERT((args->index >= 0)
@@ -1128,8 +1151,6 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex)
1128 (be32_to_cpu(entry->hashval) <= be32_to_cpu((entry+1)->hashval))); 1151 (be32_to_cpu(entry->hashval) <= be32_to_cpu((entry+1)->hashval)));
1129 1152
1130 /* 1153 /*
1131 * Copy the attribute name and value into the new space.
1132 *
1133 * For "remote" attribute values, simply note that we need to 1154 * For "remote" attribute values, simply note that we need to
1134 * allocate space for the "remote" value. We can't actually 1155 * allocate space for the "remote" value. We can't actually
1135 * allocate the extents in this transaction, and we can't decide 1156 * allocate the extents in this transaction, and we can't decide
@@ -1265,6 +1286,8 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
1265 ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); 1286 ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
1266 args = state->args; 1287 args = state->args;
1267 1288
1289 trace_xfs_attr_leaf_rebalance(args);
1290
1268 /* 1291 /*
1269 * Check ordering of blocks, reverse if it makes things simpler. 1292 * Check ordering of blocks, reverse if it makes things simpler.
1270 * 1293 *
@@ -1810,6 +1833,8 @@ xfs_attr_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
1810 xfs_mount_t *mp; 1833 xfs_mount_t *mp;
1811 char *tmpbuffer; 1834 char *tmpbuffer;
1812 1835
1836 trace_xfs_attr_leaf_unbalance(state->args);
1837
1813 /* 1838 /*
1814 * Set up environment. 1839 * Set up environment.
1815 */ 1840 */
@@ -1919,6 +1944,8 @@ xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args)
1919 int probe, span; 1944 int probe, span;
1920 xfs_dahash_t hashval; 1945 xfs_dahash_t hashval;
1921 1946
1947 trace_xfs_attr_leaf_lookup(args);
1948
1922 leaf = bp->data; 1949 leaf = bp->data;
1923 ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); 1950 ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
1924 ASSERT(be16_to_cpu(leaf->hdr.count) 1951 ASSERT(be16_to_cpu(leaf->hdr.count)
@@ -2445,6 +2472,7 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args)
2445 char *name; 2472 char *name;
2446#endif /* DEBUG */ 2473#endif /* DEBUG */
2447 2474
2475 trace_xfs_attr_leaf_clearflag(args);
2448 /* 2476 /*
2449 * Set up the operation. 2477 * Set up the operation.
2450 */ 2478 */
@@ -2509,6 +2537,8 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args)
2509 xfs_dabuf_t *bp; 2537 xfs_dabuf_t *bp;
2510 int error; 2538 int error;
2511 2539
2540 trace_xfs_attr_leaf_setflag(args);
2541
2512 /* 2542 /*
2513 * Set up the operation. 2543 * Set up the operation.
2514 */ 2544 */
@@ -2565,6 +2595,8 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
2565 char *name1, *name2; 2595 char *name1, *name2;
2566#endif /* DEBUG */ 2596#endif /* DEBUG */
2567 2597
2598 trace_xfs_attr_leaf_flipflags(args);
2599
2568 /* 2600 /*
2569 * Read the block containing the "old" attr 2601 * Read the block containing the "old" attr
2570 */ 2602 */
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 188ef2fbd628..85e7e327bcd8 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -5124,6 +5124,15 @@ xfs_bunmapi(
5124 cur->bc_private.b.flags = 0; 5124 cur->bc_private.b.flags = 0;
5125 } else 5125 } else
5126 cur = NULL; 5126 cur = NULL;
5127
5128 if (isrt) {
5129 /*
5130 * Synchronize by locking the bitmap inode.
5131 */
5132 xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
5133 xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
5134 }
5135
5127 extno = 0; 5136 extno = 0;
5128 while (bno != (xfs_fileoff_t)-1 && bno >= start && lastx >= 0 && 5137 while (bno != (xfs_fileoff_t)-1 && bno >= start && lastx >= 0 &&
5129 (nexts == 0 || extno < nexts)) { 5138 (nexts == 0 || extno < nexts)) {
@@ -5536,8 +5545,12 @@ xfs_getbmap(
 	if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx))
 		return XFS_ERROR(ENOMEM);
 	out = kmem_zalloc(bmv->bmv_count * sizeof(struct getbmapx), KM_MAYFAIL);
-	if (!out)
-		return XFS_ERROR(ENOMEM);
+	if (!out) {
+		out = kmem_zalloc_large(bmv->bmv_count *
+					sizeof(struct getbmapx));
+		if (!out)
+			return XFS_ERROR(ENOMEM);
+	}
 
 	xfs_ilock(ip, XFS_IOLOCK_SHARED);
 	if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) {
@@ -5661,7 +5674,10 @@ xfs_getbmap(
 			break;
 	}
 
-	kmem_free(out);
+	if (is_vmalloc_addr(out))
+		kmem_free_large(out);
+	else
+		kmem_free(out);
 	return error;
 }
 
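[Annotation: the getbmap change retries a failed kmalloc-backed allocation with a vmalloc-backed one and uses is_vmalloc_addr() to free symmetrically; kmem_zalloc_large()/kmem_free_large() are XFS's wrappers around that idea. The generic shape of the pattern, sketched with plain kernel primitives:]

	#include <linux/mm.h>		/* is_vmalloc_addr() */
	#include <linux/slab.h>
	#include <linux/vmalloc.h>

	static void *zalloc_fallback(size_t size)
	{
		void *p = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);

		if (!p)
			p = vzalloc(size);	/* large buffers: virtually contiguous */
		return p;
	}

	static void zfree_fallback(void *p)
	{
		if (is_vmalloc_addr(p))
			vfree(p);
		else
			kfree(p);
	}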
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 4dff85c7d7eb..6819b5163e33 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -45,8 +45,6 @@ static kmem_zone_t *xfs_buf_zone;
45STATIC int xfsbufd(void *); 45STATIC int xfsbufd(void *);
46 46
47static struct workqueue_struct *xfslogd_workqueue; 47static struct workqueue_struct *xfslogd_workqueue;
48struct workqueue_struct *xfsdatad_workqueue;
49struct workqueue_struct *xfsconvertd_workqueue;
50 48
51#ifdef XFS_BUF_LOCK_TRACKING 49#ifdef XFS_BUF_LOCK_TRACKING
52# define XB_SET_OWNER(bp) ((bp)->b_last_holder = current->pid) 50# define XB_SET_OWNER(bp) ((bp)->b_last_holder = current->pid)
@@ -1793,21 +1791,8 @@ xfs_buf_init(void)
1793 if (!xfslogd_workqueue) 1791 if (!xfslogd_workqueue)
1794 goto out_free_buf_zone; 1792 goto out_free_buf_zone;
1795 1793
1796 xfsdatad_workqueue = alloc_workqueue("xfsdatad", WQ_MEM_RECLAIM, 1);
1797 if (!xfsdatad_workqueue)
1798 goto out_destroy_xfslogd_workqueue;
1799
1800 xfsconvertd_workqueue = alloc_workqueue("xfsconvertd",
1801 WQ_MEM_RECLAIM, 1);
1802 if (!xfsconvertd_workqueue)
1803 goto out_destroy_xfsdatad_workqueue;
1804
1805 return 0; 1794 return 0;
1806 1795
1807 out_destroy_xfsdatad_workqueue:
1808 destroy_workqueue(xfsdatad_workqueue);
1809 out_destroy_xfslogd_workqueue:
1810 destroy_workqueue(xfslogd_workqueue);
1811 out_free_buf_zone: 1796 out_free_buf_zone:
1812 kmem_zone_destroy(xfs_buf_zone); 1797 kmem_zone_destroy(xfs_buf_zone);
1813 out: 1798 out:
@@ -1817,8 +1802,6 @@ xfs_buf_init(void)
1817void 1802void
1818xfs_buf_terminate(void) 1803xfs_buf_terminate(void)
1819{ 1804{
1820 destroy_workqueue(xfsconvertd_workqueue);
1821 destroy_workqueue(xfsdatad_workqueue);
1822 destroy_workqueue(xfslogd_workqueue); 1805 destroy_workqueue(xfslogd_workqueue);
1823 kmem_zone_destroy(xfs_buf_zone); 1806 kmem_zone_destroy(xfs_buf_zone);
1824} 1807}
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 77c74257c2a3..7f1a6f5b05a6 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -108,6 +108,8 @@ xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level,
108 int error; 108 int error;
109 xfs_trans_t *tp; 109 xfs_trans_t *tp;
110 110
111 trace_xfs_da_node_create(args);
112
111 tp = args->trans; 113 tp = args->trans;
112 error = xfs_da_get_buf(tp, args->dp, blkno, -1, &bp, whichfork); 114 error = xfs_da_get_buf(tp, args->dp, blkno, -1, &bp, whichfork);
113 if (error) 115 if (error)
@@ -140,6 +142,8 @@ xfs_da_split(xfs_da_state_t *state)
140 xfs_dabuf_t *bp; 142 xfs_dabuf_t *bp;
141 int max, action, error, i; 143 int max, action, error, i;
142 144
145 trace_xfs_da_split(state->args);
146
143 /* 147 /*
144 * Walk back up the tree splitting/inserting/adjusting as necessary. 148 * Walk back up the tree splitting/inserting/adjusting as necessary.
145 * If we need to insert and there isn't room, split the node, then 149 * If we need to insert and there isn't room, split the node, then
@@ -178,10 +182,12 @@ xfs_da_split(xfs_da_state_t *state)
178 state->extravalid = 1; 182 state->extravalid = 1;
179 if (state->inleaf) { 183 if (state->inleaf) {
180 state->extraafter = 0; /* before newblk */ 184 state->extraafter = 0; /* before newblk */
185 trace_xfs_attr_leaf_split_before(state->args);
181 error = xfs_attr_leaf_split(state, oldblk, 186 error = xfs_attr_leaf_split(state, oldblk,
182 &state->extrablk); 187 &state->extrablk);
183 } else { 188 } else {
184 state->extraafter = 1; /* after newblk */ 189 state->extraafter = 1; /* after newblk */
190 trace_xfs_attr_leaf_split_after(state->args);
185 error = xfs_attr_leaf_split(state, newblk, 191 error = xfs_attr_leaf_split(state, newblk,
186 &state->extrablk); 192 &state->extrablk);
187 } 193 }
@@ -300,6 +306,8 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
300 xfs_mount_t *mp; 306 xfs_mount_t *mp;
301 xfs_dir2_leaf_t *leaf; 307 xfs_dir2_leaf_t *leaf;
302 308
309 trace_xfs_da_root_split(state->args);
310
303 /* 311 /*
304 * Copy the existing (incorrect) block from the root node position 312 * Copy the existing (incorrect) block from the root node position
305 * to a free space somewhere. 313 * to a free space somewhere.
@@ -380,6 +388,8 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
380 int newcount, error; 388 int newcount, error;
381 int useextra; 389 int useextra;
382 390
391 trace_xfs_da_node_split(state->args);
392
383 node = oldblk->bp->data; 393 node = oldblk->bp->data;
384 ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); 394 ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
385 395
@@ -466,6 +476,8 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
466 int count, tmp; 476 int count, tmp;
467 xfs_trans_t *tp; 477 xfs_trans_t *tp;
468 478
479 trace_xfs_da_node_rebalance(state->args);
480
469 node1 = blk1->bp->data; 481 node1 = blk1->bp->data;
470 node2 = blk2->bp->data; 482 node2 = blk2->bp->data;
471 /* 483 /*
@@ -574,6 +586,8 @@ xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
574 xfs_da_node_entry_t *btree; 586 xfs_da_node_entry_t *btree;
575 int tmp; 587 int tmp;
576 588
589 trace_xfs_da_node_add(state->args);
590
577 node = oldblk->bp->data; 591 node = oldblk->bp->data;
578 ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); 592 ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
579 ASSERT((oldblk->index >= 0) && (oldblk->index <= be16_to_cpu(node->hdr.count))); 593 ASSERT((oldblk->index >= 0) && (oldblk->index <= be16_to_cpu(node->hdr.count)));
@@ -619,6 +633,8 @@ xfs_da_join(xfs_da_state_t *state)
619 xfs_da_state_blk_t *drop_blk, *save_blk; 633 xfs_da_state_blk_t *drop_blk, *save_blk;
620 int action, error; 634 int action, error;
621 635
636 trace_xfs_da_join(state->args);
637
622 action = 0; 638 action = 0;
623 drop_blk = &state->path.blk[ state->path.active-1 ]; 639 drop_blk = &state->path.blk[ state->path.active-1 ];
624 save_blk = &state->altpath.blk[ state->path.active-1 ]; 640 save_blk = &state->altpath.blk[ state->path.active-1 ];
@@ -723,6 +739,8 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk)
723 xfs_dabuf_t *bp; 739 xfs_dabuf_t *bp;
724 int error; 740 int error;
725 741
742 trace_xfs_da_root_join(state->args);
743
726 args = state->args; 744 args = state->args;
727 ASSERT(args != NULL); 745 ASSERT(args != NULL);
728 ASSERT(root_blk->magic == XFS_DA_NODE_MAGIC); 746 ASSERT(root_blk->magic == XFS_DA_NODE_MAGIC);
@@ -941,6 +959,8 @@ xfs_da_node_remove(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk)
941 xfs_da_node_entry_t *btree; 959 xfs_da_node_entry_t *btree;
942 int tmp; 960 int tmp;
943 961
962 trace_xfs_da_node_remove(state->args);
963
944 node = drop_blk->bp->data; 964 node = drop_blk->bp->data;
945 ASSERT(drop_blk->index < be16_to_cpu(node->hdr.count)); 965 ASSERT(drop_blk->index < be16_to_cpu(node->hdr.count));
946 ASSERT(drop_blk->index >= 0); 966 ASSERT(drop_blk->index >= 0);
@@ -984,6 +1004,8 @@ xfs_da_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
984 int tmp; 1004 int tmp;
985 xfs_trans_t *tp; 1005 xfs_trans_t *tp;
986 1006
1007 trace_xfs_da_node_unbalance(state->args);
1008
987 drop_node = drop_blk->bp->data; 1009 drop_node = drop_blk->bp->data;
988 save_node = save_blk->bp->data; 1010 save_node = save_blk->bp->data;
989 ASSERT(drop_node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); 1011 ASSERT(drop_node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
@@ -1230,6 +1252,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
1230 /* 1252 /*
1231 * Link new block in before existing block. 1253 * Link new block in before existing block.
1232 */ 1254 */
1255 trace_xfs_da_link_before(args);
1233 new_info->forw = cpu_to_be32(old_blk->blkno); 1256 new_info->forw = cpu_to_be32(old_blk->blkno);
1234 new_info->back = old_info->back; 1257 new_info->back = old_info->back;
1235 if (old_info->back) { 1258 if (old_info->back) {
@@ -1251,6 +1274,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
1251 /* 1274 /*
1252 * Link new block in after existing block. 1275 * Link new block in after existing block.
1253 */ 1276 */
1277 trace_xfs_da_link_after(args);
1254 new_info->forw = old_info->forw; 1278 new_info->forw = old_info->forw;
1255 new_info->back = cpu_to_be32(old_blk->blkno); 1279 new_info->back = cpu_to_be32(old_blk->blkno);
1256 if (old_info->forw) { 1280 if (old_info->forw) {
@@ -1348,6 +1372,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
1348 * Unlink the leaf block from the doubly linked chain of leaves. 1372 * Unlink the leaf block from the doubly linked chain of leaves.
1349 */ 1373 */
1350 if (be32_to_cpu(save_info->back) == drop_blk->blkno) { 1374 if (be32_to_cpu(save_info->back) == drop_blk->blkno) {
1375 trace_xfs_da_unlink_back(args);
1351 save_info->back = drop_info->back; 1376 save_info->back = drop_info->back;
1352 if (drop_info->back) { 1377 if (drop_info->back) {
1353 error = xfs_da_read_buf(args->trans, args->dp, 1378 error = xfs_da_read_buf(args->trans, args->dp,
@@ -1365,6 +1390,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
1365 xfs_da_buf_done(bp); 1390 xfs_da_buf_done(bp);
1366 } 1391 }
1367 } else { 1392 } else {
1393 trace_xfs_da_unlink_forward(args);
1368 save_info->forw = drop_info->forw; 1394 save_info->forw = drop_info->forw;
1369 if (drop_info->forw) { 1395 if (drop_info->forw) {
1370 error = xfs_da_read_buf(args->trans, args->dp, 1396 error = xfs_da_read_buf(args->trans, args->dp,
@@ -1652,6 +1678,8 @@ xfs_da_grow_inode(
1652 int count; 1678 int count;
1653 int error; 1679 int error;
1654 1680
1681 trace_xfs_da_grow_inode(args);
1682
1655 if (args->whichfork == XFS_DATA_FORK) { 1683 if (args->whichfork == XFS_DATA_FORK) {
1656 bno = args->dp->i_mount->m_dirleafblk; 1684 bno = args->dp->i_mount->m_dirleafblk;
1657 count = args->dp->i_mount->m_dirblkfsbs; 1685 count = args->dp->i_mount->m_dirblkfsbs;
@@ -1690,6 +1718,8 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop,
1690 xfs_dir2_leaf_t *dead_leaf2; 1718 xfs_dir2_leaf_t *dead_leaf2;
1691 xfs_dahash_t dead_hash; 1719 xfs_dahash_t dead_hash;
1692 1720
1721 trace_xfs_da_swap_lastblock(args);
1722
1693 dead_buf = *dead_bufp; 1723 dead_buf = *dead_bufp;
1694 dead_blkno = *dead_blknop; 1724 dead_blkno = *dead_blknop;
1695 tp = args->trans; 1725 tp = args->trans;
@@ -1878,6 +1908,8 @@ xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
1878 xfs_trans_t *tp; 1908 xfs_trans_t *tp;
1879 xfs_mount_t *mp; 1909 xfs_mount_t *mp;
1880 1910
1911 trace_xfs_da_shrink_inode(args);
1912
1881 dp = args->dp; 1913 dp = args->dp;
1882 w = args->whichfork; 1914 w = args->whichfork;
1883 tp = args->trans; 1915 tp = args->trans;
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index dd974a55c77d..1137bbc5eccb 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -215,7 +215,7 @@ xfs_swap_extents(
 	xfs_trans_t	*tp;
 	xfs_bstat_t	*sbp = &sxp->sx_stat;
 	xfs_ifork_t	*tempifp, *ifp, *tifp;
-	int		ilf_fields, tilf_fields;
+	int		src_log_flags, target_log_flags;
 	int		error = 0;
 	int		aforkblks = 0;
 	int		taforkblks = 0;
@@ -385,9 +385,8 @@ xfs_swap_extents(
 	tip->i_delayed_blks = ip->i_delayed_blks;
 	ip->i_delayed_blks = 0;
 
-	ilf_fields = XFS_ILOG_CORE;
-
-	switch(ip->i_d.di_format) {
+	src_log_flags = XFS_ILOG_CORE;
+	switch (ip->i_d.di_format) {
 	case XFS_DINODE_FMT_EXTENTS:
 		/* If the extents fit in the inode, fix the
 		 * pointer.  Otherwise it's already NULL or
@@ -397,16 +396,15 @@ xfs_swap_extents(
 			ifp->if_u1.if_extents =
 				ifp->if_u2.if_inline_ext;
 		}
-		ilf_fields |= XFS_ILOG_DEXT;
+		src_log_flags |= XFS_ILOG_DEXT;
 		break;
 	case XFS_DINODE_FMT_BTREE:
-		ilf_fields |= XFS_ILOG_DBROOT;
+		src_log_flags |= XFS_ILOG_DBROOT;
 		break;
 	}
 
-	tilf_fields = XFS_ILOG_CORE;
-
-	switch(tip->i_d.di_format) {
+	target_log_flags = XFS_ILOG_CORE;
+	switch (tip->i_d.di_format) {
 	case XFS_DINODE_FMT_EXTENTS:
 		/* If the extents fit in the inode, fix the
 		 * pointer.  Otherwise it's already NULL or
@@ -416,10 +414,10 @@ xfs_swap_extents(
 			tifp->if_u1.if_extents =
 				tifp->if_u2.if_inline_ext;
 		}
-		tilf_fields |= XFS_ILOG_DEXT;
+		target_log_flags |= XFS_ILOG_DEXT;
 		break;
 	case XFS_DINODE_FMT_BTREE:
-		tilf_fields |= XFS_ILOG_DBROOT;
+		target_log_flags |= XFS_ILOG_DBROOT;
 		break;
 	}
 
@@ -427,8 +425,8 @@ xfs_swap_extents(
 	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
 	xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
 
-	xfs_trans_log_inode(tp, ip, ilf_fields);
-	xfs_trans_log_inode(tp, tip, tilf_fields);
+	xfs_trans_log_inode(tp, ip, src_log_flags);
+	xfs_trans_log_inode(tp, tip, target_log_flags);
 
 	/*
 	 * If this is a synchronous mount, make sure that the
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index 9245e029b8ea..d3b63aefd01d 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -29,6 +29,7 @@
29#include "xfs_dinode.h" 29#include "xfs_dinode.h"
30#include "xfs_inode.h" 30#include "xfs_inode.h"
31#include "xfs_inode_item.h" 31#include "xfs_inode_item.h"
32#include "xfs_dir2.h"
32#include "xfs_dir2_format.h" 33#include "xfs_dir2_format.h"
33#include "xfs_dir2_priv.h" 34#include "xfs_dir2_priv.h"
34#include "xfs_error.h" 35#include "xfs_error.h"
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 286a051f12cf..1ad3a4b8ca40 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -37,9 +37,9 @@ STATIC int
 xfs_trim_extents(
 	struct xfs_mount	*mp,
 	xfs_agnumber_t		agno,
-	xfs_fsblock_t		start,
-	xfs_fsblock_t		end,
-	xfs_fsblock_t		minlen,
+	xfs_daddr_t		start,
+	xfs_daddr_t		end,
+	xfs_daddr_t		minlen,
 	__uint64_t		*blocks_trimmed)
 {
 	struct block_device	*bdev = mp->m_ddev_targp->bt_bdev;
@@ -67,7 +67,7 @@ xfs_trim_extents(
 	/*
 	 * Look up the longest btree in the AGF and start with it.
 	 */
-	error = xfs_alloc_lookup_le(cur, 0,
+	error = xfs_alloc_lookup_ge(cur, 0,
 			    be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest), &i);
 	if (error)
 		goto out_del_cursor;
@@ -77,8 +77,10 @@ xfs_trim_extents(
77 * enough to be worth discarding. 77 * enough to be worth discarding.
78 */ 78 */
79 while (i) { 79 while (i) {
80 xfs_agblock_t fbno; 80 xfs_agblock_t fbno;
81 xfs_extlen_t flen; 81 xfs_extlen_t flen;
82 xfs_daddr_t dbno;
83 xfs_extlen_t dlen;
82 84
83 error = xfs_alloc_get_rec(cur, &fbno, &flen, &i); 85 error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
84 if (error) 86 if (error)
@@ -87,9 +89,17 @@ xfs_trim_extents(
 		ASSERT(flen <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest));
 
 		/*
+		 * use daddr format for all range/len calculations as that is
+		 * the format the range/len variables are supplied in by
+		 * userspace.
+		 */
+		dbno = XFS_AGB_TO_DADDR(mp, agno, fbno);
+		dlen = XFS_FSB_TO_BB(mp, flen);
+
+		/*
 		 * Too small?  Give up.
 		 */
-		if (flen < minlen) {
+		if (dlen < minlen) {
 			trace_xfs_discard_toosmall(mp, agno, fbno, flen);
 			goto out_del_cursor;
 		}
@@ -99,8 +109,7 @@ xfs_trim_extents(
99 * supposed to discard skip it. Do not bother to trim 109 * supposed to discard skip it. Do not bother to trim
100 * down partially overlapping ranges for now. 110 * down partially overlapping ranges for now.
101 */ 111 */
102 if (XFS_AGB_TO_FSB(mp, agno, fbno) + flen < start || 112 if (dbno + dlen < start || dbno > end) {
103 XFS_AGB_TO_FSB(mp, agno, fbno) > end) {
104 trace_xfs_discard_exclude(mp, agno, fbno, flen); 113 trace_xfs_discard_exclude(mp, agno, fbno, flen);
105 goto next_extent; 114 goto next_extent;
106 } 115 }
@@ -115,10 +124,7 @@ xfs_trim_extents(
115 } 124 }
116 125
117 trace_xfs_discard_extent(mp, agno, fbno, flen); 126 trace_xfs_discard_extent(mp, agno, fbno, flen);
118 error = -blkdev_issue_discard(bdev, 127 error = -blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS, 0);
119 XFS_AGB_TO_DADDR(mp, agno, fbno),
120 XFS_FSB_TO_BB(mp, flen),
121 GFP_NOFS, 0);
122 if (error) 128 if (error)
123 goto out_del_cursor; 129 goto out_del_cursor;
124 *blocks_trimmed += flen; 130 *blocks_trimmed += flen;
@@ -137,6 +143,15 @@ out_put_perag:
137 return error; 143 return error;
138} 144}
139 145
146/*
147 * trim a range of the filesystem.
148 *
149 * Note: the parameters passed from userspace are byte ranges into the
150 * filesystem which does not match to the format we use for filesystem block
151 * addressing. FSB addressing is sparse (AGNO|AGBNO), while the incoming format
152 * is a linear address range. Hence we need to use DADDR based conversions and
153 * comparisons for determining the correct offset and regions to trim.
154 */
140int 155int
141xfs_ioc_trim( 156xfs_ioc_trim(
142 struct xfs_mount *mp, 157 struct xfs_mount *mp,
@@ -145,7 +160,7 @@ xfs_ioc_trim(
145 struct request_queue *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue; 160 struct request_queue *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue;
146 unsigned int granularity = q->limits.discard_granularity; 161 unsigned int granularity = q->limits.discard_granularity;
147 struct fstrim_range range; 162 struct fstrim_range range;
148 xfs_fsblock_t start, end, minlen; 163 xfs_daddr_t start, end, minlen;
149 xfs_agnumber_t start_agno, end_agno, agno; 164 xfs_agnumber_t start_agno, end_agno, agno;
150 __uint64_t blocks_trimmed = 0; 165 __uint64_t blocks_trimmed = 0;
151 int error, last_error = 0; 166 int error, last_error = 0;
@@ -159,22 +174,22 @@ xfs_ioc_trim(
159 174
160 /* 175 /*
161 * Truncating down the len isn't actually quite correct, but using 176 * Truncating down the len isn't actually quite correct, but using
162 * XFS_B_TO_FSB would mean we trivially get overflows for values 177 * BBTOB would mean we trivially get overflows for values
163 * of ULLONG_MAX or slightly lower. And ULLONG_MAX is the default 178 * of ULLONG_MAX or slightly lower. And ULLONG_MAX is the default
164 * used by the fstrim application. In the end it really doesn't 179 * used by the fstrim application. In the end it really doesn't
165 * matter as trimming blocks is an advisory interface. 180 * matter as trimming blocks is an advisory interface.
166 */ 181 */
167 start = XFS_B_TO_FSBT(mp, range.start); 182 start = BTOBB(range.start);
168 end = start + XFS_B_TO_FSBT(mp, range.len) - 1; 183 end = start + BTOBBT(range.len) - 1;
169 minlen = XFS_B_TO_FSB(mp, max_t(u64, granularity, range.minlen)); 184 minlen = BTOBB(max_t(u64, granularity, range.minlen));
170 185
171 if (start >= mp->m_sb.sb_dblocks) 186 if (XFS_BB_TO_FSB(mp, start) >= mp->m_sb.sb_dblocks)
172 return -XFS_ERROR(EINVAL); 187 return -XFS_ERROR(EINVAL);
173 if (end > mp->m_sb.sb_dblocks - 1) 188 if (end > XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1)
174 end = mp->m_sb.sb_dblocks - 1; 189 end = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)- 1;
175 190
176 start_agno = XFS_FSB_TO_AGNO(mp, start); 191 start_agno = xfs_daddr_to_agno(mp, start);
177 end_agno = XFS_FSB_TO_AGNO(mp, end); 192 end_agno = xfs_daddr_to_agno(mp, end);
178 193
179 for (agno = start_agno; agno <= end_agno; agno++) { 194 for (agno = start_agno; agno <= end_agno; agno++) {
180 error = -xfs_trim_extents(mp, agno, start, end, minlen, 195 error = -xfs_trim_extents(mp, agno, start, end, minlen,
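The conversions above rely on XFS's 512-byte "basic block" units: BTOBB rounds a byte count up to basic blocks, BTOBBT truncates. A small self-contained sketch of that arithmetic as used by the new xfs_ioc_trim() range setup (BBSHIFT is 9 in XFS; the driver values are illustrative only):

#include <stdio.h>
#include <stdint.h>

#define BBSHIFT 9
#define BBSIZE  (1 << BBSHIFT)
#define BTOBB(bytes)  ((((uint64_t)(bytes)) + BBSIZE - 1) >> BBSHIFT)	/* round up */
#define BTOBBT(bytes) (((uint64_t)(bytes)) >> BBSHIFT)			/* truncate */

int main(void)
{
	uint64_t start_bytes = 1000, len_bytes = 4096;

	/* mirror xfs_ioc_trim(): round the start up, truncate the length */
	uint64_t start = BTOBB(start_bytes);
	uint64_t end = start + BTOBBT(len_bytes) - 1;

	printf("daddr range: [%llu, %llu]\n",
	       (unsigned long long)start, (unsigned long long)end);
	return 0;
}

Because daddrs are a linear address space, range comparisons like "dbno + dlen < start" are valid on them directly, which is exactly why the patch converts away from the sparse FSB format.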
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 53db20ee3e77..1155208fa830 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -43,11 +43,10 @@
  * Lock order:
  *
  * ip->i_lock
- *   qh->qh_lock
- *     qi->qi_dqlist_lock
- *       dquot->q_qlock (xfs_dqlock() and friends)
- *         dquot->q_flush (xfs_dqflock() and friends)
- *           xfs_Gqm->qm_dqfrlist_lock
+ *   qi->qi_tree_lock
+ *     dquot->q_qlock (xfs_dqlock() and friends)
+ *       dquot->q_flush (xfs_dqflock() and friends)
+ *         qi->qi_lru_lock
  *
  * If two dquots need to be locked the order is user before group/project,
  * otherwise by the lowest id first, see xfs_dqlock2.
@@ -60,6 +59,9 @@ int xfs_dqreq_num;
 int xfs_dqerror_mod = 33;
 #endif
 
+struct kmem_zone		*xfs_qm_dqtrxzone;
+static struct kmem_zone	*xfs_qm_dqzone;
+
 static struct lock_class_key xfs_dquot_other_class;
 
 /*
@@ -69,12 +71,12 @@ void
 xfs_qm_dqdestroy(
 	xfs_dquot_t	*dqp)
 {
-	ASSERT(list_empty(&dqp->q_freelist));
+	ASSERT(list_empty(&dqp->q_lru));
 
 	mutex_destroy(&dqp->q_qlock);
-	kmem_zone_free(xfs_Gqm->qm_dqzone, dqp);
+	kmem_zone_free(xfs_qm_dqzone, dqp);
 
-	atomic_dec(&xfs_Gqm->qm_totaldquots);
+	XFS_STATS_DEC(xs_qm_dquot);
 }
 
 /*
@@ -282,7 +284,7 @@ xfs_qm_dqalloc(
 	 * Return if this type of quotas is turned off while we didn't
 	 * have an inode lock
 	 */
-	if (XFS_IS_THIS_QUOTA_OFF(dqp)) {
+	if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) {
 		xfs_iunlock(quotip, XFS_ILOCK_EXCL);
 		return (ESRCH);
 	}
@@ -384,7 +386,7 @@ xfs_qm_dqtobp(
 	dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk;
 
 	xfs_ilock(quotip, XFS_ILOCK_SHARED);
-	if (XFS_IS_THIS_QUOTA_OFF(dqp)) {
+	if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) {
 		/*
 		 * Return if this type of quotas is turned off while we
 		 * didn't have the quota inode lock.
@@ -492,12 +494,12 @@ xfs_qm_dqread(
 	int			cancelflags = 0;
 
 
-	dqp = kmem_zone_zalloc(xfs_Gqm->qm_dqzone, KM_SLEEP);
+	dqp = kmem_zone_zalloc(xfs_qm_dqzone, KM_SLEEP);
 
 	dqp->dq_flags = type;
 	dqp->q_core.d_id = cpu_to_be32(id);
 	dqp->q_mount = mp;
-	INIT_LIST_HEAD(&dqp->q_freelist);
+	INIT_LIST_HEAD(&dqp->q_lru);
 	mutex_init(&dqp->q_qlock);
 	init_waitqueue_head(&dqp->q_pinwait);
 
@@ -516,7 +518,7 @@
 	if (!(type & XFS_DQ_USER))
 		lockdep_set_class(&dqp->q_qlock, &xfs_dquot_other_class);
 
-	atomic_inc(&xfs_Gqm->qm_totaldquots);
+	XFS_STATS_INC(xs_qm_dquot);
 
 	trace_xfs_dqread(dqp);
 
@@ -602,60 +604,6 @@ error0:
 }
 
 /*
- * Lookup a dquot in the incore dquot hashtable. We keep two separate
- * hashtables for user and group dquots; and, these are global tables
- * inside the XQM, not per-filesystem tables.
- * The hash chain must be locked by caller, and it is left locked
- * on return. Returning dquot is locked.
- */
-STATIC int
-xfs_qm_dqlookup(
-	xfs_mount_t		*mp,
-	xfs_dqid_t		id,
-	xfs_dqhash_t		*qh,
-	xfs_dquot_t		**O_dqpp)
-{
-	xfs_dquot_t		*dqp;
-
-	ASSERT(mutex_is_locked(&qh->qh_lock));
-
-	/*
-	 * Traverse the hashchain looking for a match
-	 */
-	list_for_each_entry(dqp, &qh->qh_list, q_hashlist) {
-		/*
-		 * We already have the hashlock. We don't need the
-		 * dqlock to look at the id field of the dquot, since the
-		 * id can't be modified without the hashlock anyway.
-		 */
-		if (be32_to_cpu(dqp->q_core.d_id) != id || dqp->q_mount != mp)
-			continue;
-
-		trace_xfs_dqlookup_found(dqp);
-
-		xfs_dqlock(dqp);
-		if (dqp->dq_flags & XFS_DQ_FREEING) {
-			*O_dqpp = NULL;
-			xfs_dqunlock(dqp);
-			return -1;
-		}
-
-		dqp->q_nrefs++;
-
-		/*
-		 * move the dquot to the front of the hashchain
-		 */
-		list_move(&dqp->q_hashlist, &qh->qh_list);
-		trace_xfs_dqlookup_done(dqp);
-		*O_dqpp = dqp;
-		return 0;
-	}
-
-	*O_dqpp = NULL;
-	return 1;
-}
-
-/*
  * Given the file system, inode OR id, and type (UDQUOT/GDQUOT), return a
  * a locked dquot, doing an allocation (if requested) as needed.
  * When both an inode and an id are given, the inode's id takes precedence.
@@ -672,10 +620,10 @@ xfs_qm_dqget(
 	uint		flags,	  /* DQALLOC, DQSUSER, DQREPAIR, DOWARN */
 	xfs_dquot_t	**O_dqpp) /* OUT : locked incore dquot */
 {
-	xfs_dquot_t	*dqp;
-	xfs_dqhash_t	*h;
-	uint		version;
-	int		error;
+	struct xfs_quotainfo	*qi = mp->m_quotainfo;
+	struct radix_tree_root *tree = XFS_DQUOT_TREE(qi, type);
+	struct xfs_dquot	*dqp;
+	int			error;
 
 	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
 	if ((! XFS_IS_UQUOTA_ON(mp) && type == XFS_DQ_USER) ||
@@ -683,7 +631,6 @@ xfs_qm_dqget(
 	    (! XFS_IS_GQUOTA_ON(mp) && type == XFS_DQ_GROUP)) {
 		return (ESRCH);
 	}
-	h = XFS_DQ_HASH(mp, id, type);
 
 #ifdef DEBUG
 	if (xfs_do_dqerror) {
@@ -699,42 +646,33 @@ xfs_qm_dqget(
 		type == XFS_DQ_GROUP);
 	if (ip) {
 		ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-		if (type == XFS_DQ_USER)
-			ASSERT(ip->i_udquot == NULL);
-		else
-			ASSERT(ip->i_gdquot == NULL);
+		ASSERT(xfs_inode_dquot(ip, type) == NULL);
 	}
 #endif
 
 restart:
-	mutex_lock(&h->qh_lock);
+	mutex_lock(&qi->qi_tree_lock);
+	dqp = radix_tree_lookup(tree, id);
+	if (dqp) {
+		xfs_dqlock(dqp);
+		if (dqp->dq_flags & XFS_DQ_FREEING) {
+			xfs_dqunlock(dqp);
+			mutex_unlock(&qi->qi_tree_lock);
+			trace_xfs_dqget_freeing(dqp);
+			delay(1);
+			goto restart;
+		}
 
-	/*
-	 * Look in the cache (hashtable).
-	 * The chain is kept locked during lookup.
-	 */
-	switch (xfs_qm_dqlookup(mp, id, h, O_dqpp)) {
-	case -1:
-		XQM_STATS_INC(xqmstats.xs_qm_dquot_dups);
-		mutex_unlock(&h->qh_lock);
-		delay(1);
-		goto restart;
-	case 0:
-		XQM_STATS_INC(xqmstats.xs_qm_dqcachehits);
-		/*
-		 * The dquot was found, moved to the front of the chain,
-		 * taken off the freelist if it was on it, and locked
-		 * at this point. Just unlock the hashchain and return.
-		 */
-		ASSERT(*O_dqpp);
-		ASSERT(XFS_DQ_IS_LOCKED(*O_dqpp));
-		mutex_unlock(&h->qh_lock);
-		trace_xfs_dqget_hit(*O_dqpp);
-		return 0;	/* success */
-	default:
-		XQM_STATS_INC(xqmstats.xs_qm_dqcachemisses);
-		break;
+		dqp->q_nrefs++;
+		mutex_unlock(&qi->qi_tree_lock);
+
+		trace_xfs_dqget_hit(dqp);
+		XFS_STATS_INC(xs_qm_dqcachehits);
+		*O_dqpp = dqp;
+		return 0;
 	}
+	mutex_unlock(&qi->qi_tree_lock);
+	XFS_STATS_INC(xs_qm_dqcachemisses);
 
 	/*
 	 * Dquot cache miss. We don't want to keep the inode lock across
@@ -745,12 +683,6 @@ restart:
 	 */
 	if (ip)
 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	/*
-	 * Save the hashchain version stamp, and unlock the chain, so that
-	 * we don't keep the lock across a disk read
-	 */
-	version = h->qh_version;
-	mutex_unlock(&h->qh_lock);
 
 	error = xfs_qm_dqread(mp, id, type, flags, &dqp);
 
@@ -760,97 +692,53 @@ restart:
 	if (error)
 		return error;
 
-	/*
-	 * Dquot lock comes after hashlock in the lock ordering
-	 */
 	if (ip) {
 		/*
 		 * A dquot could be attached to this inode by now, since
 		 * we had dropped the ilock.
 		 */
-		if (type == XFS_DQ_USER) {
-			if (!XFS_IS_UQUOTA_ON(mp)) {
-				/* inode stays locked on return */
-				xfs_qm_dqdestroy(dqp);
-				return XFS_ERROR(ESRCH);
-			}
-			if (ip->i_udquot) {
+		if (xfs_this_quota_on(mp, type)) {
+			struct xfs_dquot	*dqp1;
+
+			dqp1 = xfs_inode_dquot(ip, type);
+			if (dqp1) {
 				xfs_qm_dqdestroy(dqp);
-				dqp = ip->i_udquot;
+				dqp = dqp1;
 				xfs_dqlock(dqp);
 				goto dqret;
 			}
 		} else {
-			if (!XFS_IS_OQUOTA_ON(mp)) {
-				/* inode stays locked on return */
-				xfs_qm_dqdestroy(dqp);
-				return XFS_ERROR(ESRCH);
-			}
-			if (ip->i_gdquot) {
-				xfs_qm_dqdestroy(dqp);
-				dqp = ip->i_gdquot;
-				xfs_dqlock(dqp);
-				goto dqret;
-			}
+			/* inode stays locked on return */
+			xfs_qm_dqdestroy(dqp);
+			return XFS_ERROR(ESRCH);
 		}
 	}
 
-	/*
-	 * Hashlock comes after ilock in lock order
-	 */
-	mutex_lock(&h->qh_lock);
-	if (version != h->qh_version) {
-		xfs_dquot_t *tmpdqp;
+	mutex_lock(&qi->qi_tree_lock);
+	error = -radix_tree_insert(tree, id, dqp);
+	if (unlikely(error)) {
+		WARN_ON(error != EEXIST);
+
 		/*
-		 * Now, see if somebody else put the dquot in the
-		 * hashtable before us. This can happen because we didn't
-		 * keep the hashchain lock. We don't have to worry about
-		 * lock order between the two dquots here since dqp isn't
-		 * on any findable lists yet.
+		 * Duplicate found. Just throw away the new dquot and start
+		 * over.
 		 */
-		switch (xfs_qm_dqlookup(mp, id, h, &tmpdqp)) {
-		case 0:
-		case -1:
-			/*
-			 * Duplicate found, either in cache or on its way out.
-			 * Just throw away the new dquot and start over.
-			 */
-			if (tmpdqp)
-				xfs_qm_dqput(tmpdqp);
-			mutex_unlock(&h->qh_lock);
-			xfs_qm_dqdestroy(dqp);
-			XQM_STATS_INC(xqmstats.xs_qm_dquot_dups);
-			goto restart;
-		default:
-			break;
-		}
+		mutex_unlock(&qi->qi_tree_lock);
+		trace_xfs_dqget_dup(dqp);
+		xfs_qm_dqdestroy(dqp);
+		XFS_STATS_INC(xs_qm_dquot_dups);
+		goto restart;
 	}
 
 	/*
-	 * Put the dquot at the beginning of the hash-chain and mp's list
-	 * LOCK ORDER: hashlock, freelistlock, mplistlock, udqlock, gdqlock ..
-	 */
-	ASSERT(mutex_is_locked(&h->qh_lock));
-	dqp->q_hash = h;
-	list_add(&dqp->q_hashlist, &h->qh_list);
-	h->qh_version++;
-
-	/*
-	 * Attach this dquot to this filesystem's list of all dquots,
-	 * kept inside the mount structure in m_quotainfo field
-	 */
-	mutex_lock(&mp->m_quotainfo->qi_dqlist_lock);
-
-	/*
 	 * We return a locked dquot to the caller, with a reference taken
 	 */
 	xfs_dqlock(dqp);
 	dqp->q_nrefs = 1;
 
-	list_add(&dqp->q_mplist, &mp->m_quotainfo->qi_dqlist);
-	mp->m_quotainfo->qi_dquots++;
-	mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
-	mutex_unlock(&h->qh_lock);
+	qi->qi_dquots++;
+	mutex_unlock(&qi->qi_tree_lock);
+
 dqret:
 	ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL));
 	trace_xfs_dqget_miss(dqp);
@@ -859,37 +747,22 @@ restart:
 }
 
 
-/*
- * Release a reference to the dquot (decrement ref-count)
- * and unlock it. If there is a group quota attached to this
- * dquot, carefully release that too without tripping over
- * deadlocks'n'stuff.
- */
-void
-xfs_qm_dqput(
+STATIC void
+xfs_qm_dqput_final(
 	struct xfs_dquot	*dqp)
 {
+	struct xfs_quotainfo	*qi = dqp->q_mount->m_quotainfo;
 	struct xfs_dquot	*gdqp;
 
-	ASSERT(dqp->q_nrefs > 0);
-	ASSERT(XFS_DQ_IS_LOCKED(dqp));
-
-	trace_xfs_dqput(dqp);
-
-recurse:
-	if (--dqp->q_nrefs > 0) {
-		xfs_dqunlock(dqp);
-		return;
-	}
-
 	trace_xfs_dqput_free(dqp);
 
-	mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
-	if (list_empty(&dqp->q_freelist)) {
-		list_add_tail(&dqp->q_freelist, &xfs_Gqm->qm_dqfrlist);
-		xfs_Gqm->qm_dqfrlist_cnt++;
+	mutex_lock(&qi->qi_lru_lock);
+	if (list_empty(&dqp->q_lru)) {
+		list_add_tail(&dqp->q_lru, &qi->qi_lru_list);
+		qi->qi_lru_count++;
+		XFS_STATS_INC(xs_qm_dquot_unused);
 	}
-	mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
+	mutex_unlock(&qi->qi_lru_lock);
 
 	/*
 	 * If we just added a udquot to the freelist, then we want to release
@@ -906,10 +779,29 @@ recurse:
 	/*
 	 * If we had a group quota hint, release it now.
 	 */
-	if (gdqp) {
-		dqp = gdqp;
-		goto recurse;
-	}
+	if (gdqp)
+		xfs_qm_dqput(gdqp);
+}
+
+/*
+ * Release a reference to the dquot (decrement ref-count) and unlock it.
+ *
+ * If there is a group quota attached to this dquot, carefully release that
+ * too without tripping over deadlocks'n'stuff.
+ */
+void
+xfs_qm_dqput(
+	struct xfs_dquot	*dqp)
+{
+	ASSERT(dqp->q_nrefs > 0);
+	ASSERT(XFS_DQ_IS_LOCKED(dqp));
+
+	trace_xfs_dqput(dqp);
+
+	if (--dqp->q_nrefs > 0)
+		xfs_dqunlock(dqp);
+	else
+		xfs_qm_dqput_final(dqp);
 }
 
 /*
@@ -1091,17 +983,6 @@ xfs_qm_dqflush(
 
 }
 
-void
-xfs_dqunlock(
-	xfs_dquot_t *dqp)
-{
-	xfs_dqunlock_nonotify(dqp);
-	if (dqp->q_logitem.qli_dquot == dqp) {
-		xfs_trans_unlocked_item(dqp->q_logitem.qli_item.li_ailp,
-					&dqp->q_logitem.qli_item);
-	}
-}
-
 /*
  * Lock two xfs_dquot structures.
  *
@@ -1131,85 +1012,6 @@ xfs_dqlock2(
 }
 
 /*
- * Take a dquot out of the mount's dqlist as well as the hashlist. This is
- * called via unmount as well as quotaoff, and the purge will always succeed.
- */
-void
-xfs_qm_dqpurge(
-	struct xfs_dquot	*dqp)
-{
-	struct xfs_mount	*mp = dqp->q_mount;
-	struct xfs_dqhash	*qh = dqp->q_hash;
-
-	xfs_dqlock(dqp);
-
-	/*
-	 * If we're turning off quotas, we have to make sure that, for
-	 * example, we don't delete quota disk blocks while dquots are
-	 * in the process of getting written to those disk blocks.
-	 * This dquot might well be on AIL, and we can't leave it there
-	 * if we're turning off quotas. Basically, we need this flush
-	 * lock, and are willing to block on it.
-	 */
-	if (!xfs_dqflock_nowait(dqp)) {
-		/*
-		 * Block on the flush lock after nudging dquot buffer,
-		 * if it is incore.
-		 */
-		xfs_dqflock_pushbuf_wait(dqp);
-	}
-
-	/*
-	 * If we are turning this type of quotas off, we don't care
-	 * about the dirty metadata sitting in this dquot. OTOH, if
-	 * we're unmounting, we do care, so we flush it and wait.
-	 */
-	if (XFS_DQ_IS_DIRTY(dqp)) {
-		int	error;
-
-		/*
-		 * We don't care about getting disk errors here. We need
-		 * to purge this dquot anyway, so we go ahead regardless.
-		 */
-		error = xfs_qm_dqflush(dqp, SYNC_WAIT);
-		if (error)
-			xfs_warn(mp, "%s: dquot %p flush failed",
-				__func__, dqp);
-		xfs_dqflock(dqp);
-	}
-
-	ASSERT(atomic_read(&dqp->q_pincount) == 0);
-	ASSERT(XFS_FORCED_SHUTDOWN(mp) ||
-	       !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL));
-
-	xfs_dqfunlock(dqp);
-	xfs_dqunlock(dqp);
-
-	mutex_lock(&qh->qh_lock);
-	list_del_init(&dqp->q_hashlist);
-	qh->qh_version++;
-	mutex_unlock(&qh->qh_lock);
-
-	mutex_lock(&mp->m_quotainfo->qi_dqlist_lock);
-	list_del_init(&dqp->q_mplist);
-	mp->m_quotainfo->qi_dqreclaims++;
-	mp->m_quotainfo->qi_dquots--;
-	mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
-
-	/*
-	 * We move dquots to the freelist as soon as their reference count
-	 * hits zero, so it really should be on the freelist here.
-	 */
-	mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
-	ASSERT(!list_empty(&dqp->q_freelist));
-	list_del_init(&dqp->q_freelist);
-	xfs_Gqm->qm_dqfrlist_cnt--;
-	mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
-
-	xfs_qm_dqdestroy(dqp);
-}
-
-/*
  * Give the buffer a little push if it is incore and
  * wait on the flush lock.
  */
@@ -1241,3 +1043,31 @@ xfs_dqflock_pushbuf_wait(
 out_lock:
 	xfs_dqflock(dqp);
 }
+
+int __init
+xfs_qm_init(void)
+{
+	xfs_qm_dqzone =
+		kmem_zone_init(sizeof(struct xfs_dquot), "xfs_dquot");
+	if (!xfs_qm_dqzone)
+		goto out;
+
+	xfs_qm_dqtrxzone =
+		kmem_zone_init(sizeof(struct xfs_dquot_acct), "xfs_dqtrx");
+	if (!xfs_qm_dqtrxzone)
+		goto out_free_dqzone;
+
+	return 0;
+
+out_free_dqzone:
+	kmem_zone_destroy(xfs_qm_dqzone);
+out:
+	return -ENOMEM;
+}
+
+void
+xfs_qm_exit(void)
+{
+	kmem_zone_destroy(xfs_qm_dqtrxzone);
+	kmem_zone_destroy(xfs_qm_dqzone);
+}
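The restructured xfs_qm_dqget() above is a classic lookup-or-insert-with-retry pattern against a per-mount tree. A user-space approximation of just that control flow, with a toy fixed-size table and a pthread mutex standing in for the radix tree and qi_tree_lock (all names and types here are invented for illustration):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define NIDS 128

struct dquot {
	unsigned int	id;
	int		nrefs;
	int		freeing;	/* models XFS_DQ_FREEING */
};

static struct dquot *tree[NIDS];
static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;

static struct dquot *dqget(unsigned int id)
{
restart:
	pthread_mutex_lock(&tree_lock);
	if (tree[id]) {
		struct dquot *dqp = tree[id];

		if (dqp->freeing) {
			/* racing with a free: back off and retry */
			pthread_mutex_unlock(&tree_lock);
			goto restart;
		}
		dqp->nrefs++;			/* cache hit */
		pthread_mutex_unlock(&tree_lock);
		return dqp;
	}
	pthread_mutex_unlock(&tree_lock);

	/* cache miss: "read" a new dquot without holding the lock */
	struct dquot *dqp = calloc(1, sizeof(*dqp));
	dqp->id = id;

	pthread_mutex_lock(&tree_lock);
	if (tree[id]) {
		/* somebody beat us to the insert: throw ours away */
		pthread_mutex_unlock(&tree_lock);
		free(dqp);
		goto restart;
	}
	tree[id] = dqp;
	dqp->nrefs = 1;
	pthread_mutex_unlock(&tree_lock);
	return dqp;
}

int main(void)
{
	struct dquot *d = dqget(7);

	printf("id=%u nrefs=%d\n", d->id, d->nrefs);
	return 0;
}

The key property, mirrored from the patch, is that the expensive read happens outside the lock, so a duplicate insert must be detected and discarded rather than prevented.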
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index a1d91d8f1802..ef9190bd8b30 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -29,16 +29,6 @@
  * when quotas are off.
  */
 
-/*
- * The hash chain headers (hash buckets)
- */
-typedef struct xfs_dqhash {
-	struct list_head  qh_list;
-	struct mutex	  qh_lock;
-	uint		  qh_version;	/* ever increasing version */
-	uint		  qh_nelems;	/* number of dquots on the list */
-} xfs_dqhash_t;
-
 struct xfs_mount;
 struct xfs_trans;
 
@@ -47,10 +37,7 @@ struct xfs_trans;
  */
 typedef struct xfs_dquot {
 	uint		 dq_flags;	/* various flags (XFS_DQ_*) */
-	struct list_head q_freelist;	/* global free list of dquots */
-	struct list_head q_mplist;	/* mount's list of dquots */
-	struct list_head q_hashlist;	/* gloabl hash list of dquots */
-	xfs_dqhash_t	*q_hash;	/* the hashchain header */
+	struct list_head q_lru;		/* global free list of dquots */
 	struct xfs_mount*q_mount;	/* filesystem this relates to */
 	struct xfs_trans*q_transp;	/* trans this belongs to currently */
 	uint		 q_nrefs;	/* # active refs from inodes */
@@ -110,11 +97,37 @@ static inline void xfs_dqlock(struct xfs_dquot *dqp)
 	mutex_lock(&dqp->q_qlock);
 }
 
-static inline void xfs_dqunlock_nonotify(struct xfs_dquot *dqp)
+static inline void xfs_dqunlock(struct xfs_dquot *dqp)
 {
 	mutex_unlock(&dqp->q_qlock);
 }
 
+static inline int xfs_this_quota_on(struct xfs_mount *mp, int type)
+{
+	switch (type & XFS_DQ_ALLTYPES) {
+	case XFS_DQ_USER:
+		return XFS_IS_UQUOTA_ON(mp);
+	case XFS_DQ_GROUP:
+	case XFS_DQ_PROJ:
+		return XFS_IS_OQUOTA_ON(mp);
+	default:
+		return 0;
+	}
+}
+
+static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type)
+{
+	switch (type & XFS_DQ_ALLTYPES) {
+	case XFS_DQ_USER:
+		return ip->i_udquot;
+	case XFS_DQ_GROUP:
+	case XFS_DQ_PROJ:
+		return ip->i_gdquot;
+	default:
+		return NULL;
+	}
+}
+
 #define XFS_DQ_IS_LOCKED(dqp)	(mutex_is_locked(&((dqp)->q_qlock)))
 #define XFS_DQ_IS_DIRTY(dqp)	((dqp)->dq_flags & XFS_DQ_DIRTY)
 #define XFS_QM_ISUDQ(dqp)	((dqp)->dq_flags & XFS_DQ_USER)
@@ -125,15 +138,10 @@ static inline void xfs_dqunlock_nonotify(struct xfs_dquot *dqp)
 		XFS_DQ_TO_QINF(dqp)->qi_uquotaip : \
 		XFS_DQ_TO_QINF(dqp)->qi_gquotaip)
 
-#define XFS_IS_THIS_QUOTA_OFF(d) (! (XFS_QM_ISUDQ(d) ? \
-				     (XFS_IS_UQUOTA_ON((d)->q_mount)) : \
-				     (XFS_IS_OQUOTA_ON((d)->q_mount))))
-
 extern int		xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint,
 					uint, struct xfs_dquot **);
 extern void		xfs_qm_dqdestroy(xfs_dquot_t *);
 extern int		xfs_qm_dqflush(xfs_dquot_t *, uint);
-extern void		xfs_qm_dqpurge(xfs_dquot_t *);
 extern void		xfs_qm_dqunpin_wait(xfs_dquot_t *);
 extern void		xfs_qm_adjust_dqtimers(xfs_mount_t *,
 					xfs_disk_dquot_t *);
@@ -144,7 +152,6 @@ extern int xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *,
 extern void		xfs_qm_dqput(xfs_dquot_t *);
 
 extern void		xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *);
-extern void		xfs_dqunlock(struct xfs_dquot *);
 extern void		xfs_dqflock_pushbuf_wait(struct xfs_dquot *dqp);
 
 static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp)
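The xfs_qm_dqput()/xfs_qm_dqput_final() split in xfs_dquot.c keeps the common reference-drop path short and pushes the freelist handling into a helper, which also lets the group-quota release become a plain call instead of a goto loop. A simplified stand-alone model of that control flow (the types and fields are invented for illustration):

#include <assert.h>
#include <stdio.h>

struct dquot {
	int nrefs;
	int on_lru;	/* stands in for list_empty(&dqp->q_lru) */
};

static void dqput_final(struct dquot *dqp)
{
	/* last reference gone: park the dquot on the reclaimable LRU */
	if (!dqp->on_lru)
		dqp->on_lru = 1;
}

static void dqput(struct dquot *dqp)
{
	assert(dqp->nrefs > 0);
	if (--dqp->nrefs > 0)
		return;		/* fast path: still referenced */
	dqput_final(dqp);	/* slow path in its own helper */
}

int main(void)
{
	struct dquot dq = { .nrefs = 2 };

	dqput(&dq);
	dqput(&dq);
	printf("nrefs=%d on_lru=%d\n", dq.nrefs, dq.on_lru);
	return 0;
}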
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 7e5bc872f2b4..54a67dd9ac0a 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -163,7 +163,6 @@ xfs_file_fsync(
 	struct inode		*inode = file->f_mapping->host;
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_trans	*tp;
 	int			error = 0;
 	int			log_flushed = 0;
 	xfs_lsn_t		lsn = 0;
@@ -194,75 +193,18 @@ xfs_file_fsync(
 	}
 
 	/*
-	 * We always need to make sure that the required inode state is safe on
-	 * disk. The inode might be clean but we still might need to force the
-	 * log because of committed transactions that haven't hit the disk yet.
-	 * Likewise, there could be unflushed non-transactional changes to the
-	 * inode core that have to go to disk and this requires us to issue
-	 * a synchronous transaction to capture these changes correctly.
-	 *
-	 * This code relies on the assumption that if the i_update_core field
-	 * of the inode is clear and the inode is unpinned then it is clean
-	 * and no action is required.
+	 * All metadata updates are logged, which means that we just have
+	 * to flush the log up to the latest LSN that touched the inode.
 	 */
 	xfs_ilock(ip, XFS_ILOCK_SHARED);
-
-	/*
-	 * First check if the VFS inode is marked dirty. All the dirtying
-	 * of non-transactional updates do not go through mark_inode_dirty*,
-	 * which allows us to distinguish between pure timestamp updates
-	 * and i_size updates which need to be caught for fdatasync.
-	 * After that also check for the dirty state in the XFS inode, which
-	 * might gets cleared when the inode gets written out via the AIL
-	 * or xfs_iflush_cluster.
-	 */
-	if (((inode->i_state & I_DIRTY_DATASYNC) ||
-	    ((inode->i_state & I_DIRTY_SYNC) && !datasync)) &&
-	    ip->i_update_core) {
-		/*
-		 * Kick off a transaction to log the inode core to get the
-		 * updates. The sync transaction will also force the log.
-		 */
-		xfs_iunlock(ip, XFS_ILOCK_SHARED);
-		tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
-		error = xfs_trans_reserve(tp, 0,
-				XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
-		if (error) {
-			xfs_trans_cancel(tp, 0);
-			return -error;
-		}
-		xfs_ilock(ip, XFS_ILOCK_EXCL);
-
-		/*
-		 * Note - it's possible that we might have pushed ourselves out
-		 * of the way during trans_reserve which would flush the inode.
-		 * But there's no guarantee that the inode buffer has actually
-		 * gone out yet (it's delwri). Plus the buffer could be pinned
-		 * anyway if it's part of an inode in another recent
-		 * transaction. So we play it safe and fire off the
-		 * transaction anyway.
-		 */
-		xfs_trans_ijoin(tp, ip, 0);
-		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-		error = xfs_trans_commit(tp, 0);
-
-		lsn = ip->i_itemp->ili_last_lsn;
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	} else {
-		/*
-		 * Timestamps/size haven't changed since last inode flush or
-		 * inode transaction commit. That means either nothing got
-		 * written or a transaction committed which caught the updates.
-		 * If the latter happened and the transaction hasn't hit the
-		 * disk yet, the inode will be still be pinned. If it is,
-		 * force the log.
-		 */
-		if (xfs_ipincount(ip))
+	if (xfs_ipincount(ip)) {
+		if (!datasync ||
+		    (ip->i_itemp->ili_fields & ~XFS_ILOG_TIMESTAMP))
 			lsn = ip->i_itemp->ili_last_lsn;
-		xfs_iunlock(ip, XFS_ILOCK_SHARED);
 	}
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 
-	if (!error && lsn)
+	if (lsn)
 		error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
 
 	/*
@@ -659,9 +601,6 @@ restart:
 		return error;
 	}
 
-	if (likely(!(file->f_mode & FMODE_NOCMTIME)))
-		file_update_time(file);
-
 	/*
 	 * If the offset is beyond the size of the file, we need to zero any
 	 * blocks that fall between the existing EOF and the start of this
@@ -685,6 +624,15 @@ restart:
 		return error;
 
 	/*
+	 * Updating the timestamps will grab the ilock again from
+	 * xfs_fs_dirty_inode, so we have to call it after dropping the
+	 * lock above. Eventually we should look into a way to avoid
+	 * the pointless lock roundtrip.
+	 */
+	if (likely(!(file->f_mode & FMODE_NOCMTIME)))
+		file_update_time(file);
+
+	/*
 	 * If we're writing the file then make sure to clear the setuid and
 	 * setgid bits if the process is not being run by root. This keeps
 	 * people from modifying setuid and setgid binaries.
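With every metadata update logged, the fsync path above reduces to: if the inode is still pinned by a transaction that has not reached disk, force the log up to the inode's last LSN, unless the only dirty state is a timestamp and the caller asked for fdatasync. A rough stand-alone model of that decision (all structures, the flag value, and the log_force callback are stand-ins, not the kernel API):

#include <stdio.h>
#include <stdint.h>

#define ILOG_TIMESTAMP 0x4000	/* illustrative flag value */

struct inode_item {
	uint64_t	last_lsn;	/* LSN of the last transaction */
	unsigned int	fields;		/* dirty-field mask */
};

struct inode {
	int			pincount;
	struct inode_item	item;
};

static int log_force_lsn(uint64_t lsn)
{
	printf("forcing log to lsn %llu\n", (unsigned long long)lsn);
	return 0;
}

static int fsync_inode(struct inode *ip, int datasync)
{
	uint64_t lsn = 0;

	if (ip->pincount) {
		/* a pure timestamp update may be skipped for fdatasync */
		if (!datasync || (ip->item.fields & ~ILOG_TIMESTAMP))
			lsn = ip->item.last_lsn;
	}
	return lsn ? log_force_lsn(lsn) : 0;
}

int main(void)
{
	struct inode ip = { .pincount = 1,
			    .item = { .last_lsn = 42, .fields = 0x1 } };

	return fsync_inode(&ip, 1);
}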
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 8c3e46394d48..bcc6c249b2c7 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -91,7 +91,6 @@ xfs_inode_alloc(
 	ip->i_afp = NULL;
 	memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
 	ip->i_flags = 0;
-	ip->i_update_core = 0;
 	ip->i_delayed_blks = 0;
 	memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
 
@@ -290,7 +289,7 @@ xfs_iget_cache_hit(
 	if (lock_flags != 0)
 		xfs_ilock(ip, lock_flags);
 
-	xfs_iflags_clear(ip, XFS_ISTALE);
+	xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE);
 	XFS_STATS_INC(xs_ig_found);
 
 	return 0;
@@ -315,6 +314,7 @@ xfs_iget_cache_miss(
 	struct xfs_inode	*ip;
 	int			error;
 	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ino);
+	int			iflags;
 
 	ip = xfs_inode_alloc(mp, ino);
 	if (!ip)
@@ -350,9 +350,23 @@
 		BUG();
 	}
 
-	spin_lock(&pag->pag_ici_lock);
+	/*
+	 * These values must be set before inserting the inode into the radix
+	 * tree as the moment it is inserted a concurrent lookup (allowed by the
+	 * RCU locking mechanism) can find it and that lookup must see that this
+	 * is an inode currently under construction (i.e. that XFS_INEW is set).
+	 * The ip->i_flags_lock that protects the XFS_INEW flag forms the
+	 * memory barrier that ensures this detection works correctly at lookup
+	 * time.
+	 */
+	iflags = XFS_INEW;
+	if (flags & XFS_IGET_DONTCACHE)
+		iflags |= XFS_IDONTCACHE;
+	ip->i_udquot = ip->i_gdquot = NULL;
+	xfs_iflags_set(ip, iflags);
 
 	/* insert the new inode */
+	spin_lock(&pag->pag_ici_lock);
 	error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
 	if (unlikely(error)) {
 		WARN_ON(error != -EEXIST);
@@ -360,11 +374,6 @@
 		error = EAGAIN;
 		goto out_preload_end;
 	}
-
-	/* These values _must_ be set before releasing the radix tree lock! */
-	ip->i_udquot = ip->i_gdquot = NULL;
-	xfs_iflags_set(ip, XFS_INEW);
-
 	spin_unlock(&pag->pag_ici_lock);
 	radix_tree_preload_end();
 
@@ -418,6 +427,15 @@ xfs_iget(
 	xfs_perag_t	*pag;
 	xfs_agino_t	agino;
 
+	/*
+	 * xfs_reclaim_inode() uses the ILOCK to ensure an inode
+	 * doesn't get freed while it's being referenced during a
+	 * radix tree traversal here. It assumes this function
+	 * aqcuires only the ILOCK (and therefore it has no need to
+	 * involve the IOLOCK in this synchronization).
+	 */
+	ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);
+
 	/* reject inode numbers outside existing AGs */
 	if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
 		return EINVAL;
@@ -642,8 +660,7 @@ xfs_iunlock(
 	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
 	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
 	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
-	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_IUNLOCK_NONOTIFY |
-			XFS_LOCK_DEP_MASK)) == 0);
+	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
 	ASSERT(lock_flags != 0);
 
 	if (lock_flags & XFS_IOLOCK_EXCL)
@@ -656,16 +673,6 @@ xfs_iunlock(
 	else if (lock_flags & XFS_ILOCK_SHARED)
 		mrunlock_shared(&ip->i_lock);
 
-	if ((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) &&
-	    !(lock_flags & XFS_IUNLOCK_NONOTIFY) && ip->i_itemp) {
-		/*
-		 * Let the AIL know that this item has been unlocked in case
-		 * it is in the AIL and anyone is waiting on it. Don't do
-		 * this if the caller has asked us not to.
-		 */
-		xfs_trans_unlocked_item(ip->i_itemp->ili_item.li_ailp,
-					(xfs_log_item_t*)(ip->i_itemp));
-	}
 	trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
 }
 
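The comment added to xfs_iget_cache_miss() states a publish-after-initialise rule: every field a concurrent (RCU-protected) lookup may read, including the XFS_INEW flag, must be set before the radix tree insert makes the inode findable. A deliberately single-threaded sketch of that ordering; the table, flag values, and helper are all hypothetical stand-ins, and the real memory-barrier guarantee comes from the kernel's i_flags_lock, which is not modelled here:

#include <stdio.h>

#define INEW      0x1
#define DONTCACHE 0x2

struct inode {
	unsigned int	flags;
	void		*udquot, *gdquot;
};

static struct inode *table[16];	/* toy stand-in for the radix tree */

static int insert_inode(struct inode *ip, unsigned int slot, int dontcache)
{
	unsigned int iflags = INEW;

	if (dontcache)
		iflags |= DONTCACHE;

	/*
	 * Set every field a concurrent lookup could inspect *before*
	 * publishing the inode; once table[slot] is assigned, other
	 * threads may already see it.
	 */
	ip->udquot = ip->gdquot = NULL;
	ip->flags |= iflags;

	if (table[slot])
		return -1;	/* models radix_tree_insert() -EEXIST */
	table[slot] = ip;
	return 0;
}

int main(void)
{
	struct inode ino = { 0 };

	if (insert_inode(&ino, 3, 1) == 0)
		printf("inserted with flags 0x%x\n", ino.flags);
	return 0;
}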
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index b21022499c2e..bc46c0a133d3 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1656,14 +1656,13 @@ retry:
 		iip = ip->i_itemp;
 		if (!iip || xfs_inode_clean(ip)) {
 			ASSERT(ip != free_ip);
-			ip->i_update_core = 0;
 			xfs_ifunlock(ip);
 			xfs_iunlock(ip, XFS_ILOCK_EXCL);
 			continue;
 		}
 
-		iip->ili_last_fields = iip->ili_format.ilf_fields;
-		iip->ili_format.ilf_fields = 0;
+		iip->ili_last_fields = iip->ili_fields;
+		iip->ili_fields = 0;
 		iip->ili_logged = 1;
 		xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
 					&iip->ili_item.li_lsn);
@@ -2177,7 +2176,7 @@ xfs_iflush_fork(
 	mp = ip->i_mount;
 	switch (XFS_IFORK_FORMAT(ip, whichfork)) {
 	case XFS_DINODE_FMT_LOCAL:
-		if ((iip->ili_format.ilf_fields & dataflag[whichfork]) &&
+		if ((iip->ili_fields & dataflag[whichfork]) &&
 		    (ifp->if_bytes > 0)) {
 			ASSERT(ifp->if_u1.if_data != NULL);
 			ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
@@ -2187,8 +2186,8 @@
 
 	case XFS_DINODE_FMT_EXTENTS:
 		ASSERT((ifp->if_flags & XFS_IFEXTENTS) ||
-		       !(iip->ili_format.ilf_fields & extflag[whichfork]));
-		if ((iip->ili_format.ilf_fields & extflag[whichfork]) &&
+		       !(iip->ili_fields & extflag[whichfork]));
+		if ((iip->ili_fields & extflag[whichfork]) &&
 		    (ifp->if_bytes > 0)) {
 			ASSERT(xfs_iext_get_ext(ifp, 0));
 			ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0);
@@ -2198,7 +2197,7 @@
 		break;
 
 	case XFS_DINODE_FMT_BTREE:
-		if ((iip->ili_format.ilf_fields & brootflag[whichfork]) &&
+		if ((iip->ili_fields & brootflag[whichfork]) &&
 		    (ifp->if_broot_bytes > 0)) {
 			ASSERT(ifp->if_broot != NULL);
 			ASSERT(ifp->if_broot_bytes <=
@@ -2211,14 +2210,14 @@
 		break;
 
 	case XFS_DINODE_FMT_DEV:
-		if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) {
+		if (iip->ili_fields & XFS_ILOG_DEV) {
 			ASSERT(whichfork == XFS_DATA_FORK);
 			xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev);
 		}
 		break;
 
 	case XFS_DINODE_FMT_UUID:
-		if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) {
+		if (iip->ili_fields & XFS_ILOG_UUID) {
 			ASSERT(whichfork == XFS_DATA_FORK);
 			memcpy(XFS_DFORK_DPTR(dip),
 			       &ip->i_df.if_u2.if_uuid,
@@ -2451,9 +2450,8 @@ xfs_iflush(
 	 * to disk, because the log record didn't make it to disk!
 	 */
 	if (XFS_FORCED_SHUTDOWN(mp)) {
-		ip->i_update_core = 0;
 		if (iip)
-			iip->ili_format.ilf_fields = 0;
+			iip->ili_fields = 0;
 		xfs_ifunlock(ip);
 		return XFS_ERROR(EIO);
 	}
@@ -2533,26 +2531,6 @@ xfs_iflush_int(
 	/* set *dip = inode's place in the buffer */
 	dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
 
-	/*
-	 * Clear i_update_core before copying out the data.
-	 * This is for coordination with our timestamp updates
-	 * that don't hold the inode lock. They will always
-	 * update the timestamps BEFORE setting i_update_core,
-	 * so if we clear i_update_core after they set it we
-	 * are guaranteed to see their updates to the timestamps.
-	 * I believe that this depends on strongly ordered memory
-	 * semantics, but we have that. We use the SYNCHRONIZE
-	 * macro to make sure that the compiler does not reorder
-	 * the i_update_core access below the data copy below.
-	 */
-	ip->i_update_core = 0;
-	SYNCHRONIZE();
-
-	/*
-	 * Make sure to get the latest timestamps from the Linux inode.
-	 */
-	xfs_synchronize_times(ip);
-
 	if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC),
 			       mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
 		xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
@@ -2663,36 +2641,33 @@ xfs_iflush_int(
 	xfs_inobp_check(mp, bp);
 
 	/*
-	 * We've recorded everything logged in the inode, so we'd
-	 * like to clear the ilf_fields bits so we don't log and
-	 * flush things unnecessarily. However, we can't stop
-	 * logging all this information until the data we've copied
-	 * into the disk buffer is written to disk. If we did we might
-	 * overwrite the copy of the inode in the log with all the
-	 * data after re-logging only part of it, and in the face of
-	 * a crash we wouldn't have all the data we need to recover.
+	 * We've recorded everything logged in the inode, so we'd like to clear
+	 * the ili_fields bits so we don't log and flush things unnecessarily.
+	 * However, we can't stop logging all this information until the data
+	 * we've copied into the disk buffer is written to disk. If we did we
+	 * might overwrite the copy of the inode in the log with all the data
+	 * after re-logging only part of it, and in the face of a crash we
+	 * wouldn't have all the data we need to recover.
 	 *
-	 * What we do is move the bits to the ili_last_fields field.
-	 * When logging the inode, these bits are moved back to the
-	 * ilf_fields field. In the xfs_iflush_done() routine we
-	 * clear ili_last_fields, since we know that the information
-	 * those bits represent is permanently on disk. As long as
-	 * the flush completes before the inode is logged again, then
-	 * both ilf_fields and ili_last_fields will be cleared.
+	 * What we do is move the bits to the ili_last_fields field. When
+	 * logging the inode, these bits are moved back to the ili_fields field.
+	 * In the xfs_iflush_done() routine we clear ili_last_fields, since we
+	 * know that the information those bits represent is permanently on
+	 * disk. As long as the flush completes before the inode is logged
+	 * again, then both ili_fields and ili_last_fields will be cleared.
 	 *
-	 * We can play with the ilf_fields bits here, because the inode
-	 * lock must be held exclusively in order to set bits there
-	 * and the flush lock protects the ili_last_fields bits.
-	 * Set ili_logged so the flush done
-	 * routine can tell whether or not to look in the AIL.
-	 * Also, store the current LSN of the inode so that we can tell
-	 * whether the item has moved in the AIL from xfs_iflush_done().
-	 * In order to read the lsn we need the AIL lock, because
-	 * it is a 64 bit value that cannot be read atomically.
+	 * We can play with the ili_fields bits here, because the inode lock
+	 * must be held exclusively in order to set bits there and the flush
+	 * lock protects the ili_last_fields bits. Set ili_logged so the flush
+	 * done routine can tell whether or not to look in the AIL. Also, store
+	 * the current LSN of the inode so that we can tell whether the item has
+	 * moved in the AIL from xfs_iflush_done(). In order to read the lsn we
+	 * need the AIL lock, because it is a 64 bit value that cannot be read
+	 * atomically.
 	 */
-	if (iip != NULL && iip->ili_format.ilf_fields != 0) {
-		iip->ili_last_fields = iip->ili_format.ilf_fields;
-		iip->ili_format.ilf_fields = 0;
+	if (iip != NULL && iip->ili_fields != 0) {
+		iip->ili_last_fields = iip->ili_fields;
+		iip->ili_fields = 0;
 		iip->ili_logged = 1;
 
 		xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
@@ -2711,8 +2686,7 @@ xfs_iflush_int(
 	} else {
 		/*
 		 * We're flushing an inode which is not in the AIL and has
-		 * not been logged but has i_update_core set. For this
-		 * case we can use a B_DELWRI flush and immediately drop
+		 * not been logged. For this case we can immediately drop
 		 * the inode flush lock because we can avoid the whole
 		 * AIL state thing. It's OK to drop the flush lock now,
 		 * because we've already locked the buffer and to do anything
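The long comment above describes a two-stage dirty-bit handoff between logging and flushing. A compact stand-alone model of the ili_fields/ili_last_fields lifecycle, with plain ints replacing the kernel's inode log item (the flag values are arbitrary):

#include <stdio.h>

struct inode_item {
	unsigned int	fields;		/* dirtied since last flush */
	unsigned int	last_fields;	/* bits covered by the in-flight flush */
	int		logged;
};

static void flush_start(struct inode_item *iip)
{
	/* hand the bits off; new dirtying accumulates in fields again */
	iip->last_fields = iip->fields;
	iip->fields = 0;
	iip->logged = 1;
}

static void flush_done(struct inode_item *iip)
{
	/* the data for these bits is now permanently on disk */
	iip->last_fields = 0;
	iip->logged = 0;
}

int main(void)
{
	struct inode_item iip = { .fields = 0x6 };

	flush_start(&iip);
	iip.fields |= 0x2;	/* relogged while the flush is in flight */
	flush_done(&iip);
	printf("fields=0x%x last=0x%x\n", iip.fields, iip.last_fields);
	return 0;
}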
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 2f27b7454085..7fee3387e1c8 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -241,7 +241,6 @@ typedef struct xfs_inode {
 	spinlock_t		i_flags_lock;	/* inode i_flags lock */
 	/* Miscellaneous state. */
 	unsigned long		i_flags;	/* see defined flags below */
-	unsigned char		i_update_core;	/* timestamps/size is dirty */
 	unsigned int		i_delayed_blks;	/* count of delay alloc blks */
 
 	xfs_icdinode_t		i_d;		/* most of ondisk inode */
@@ -275,6 +274,20 @@ static inline xfs_fsize_t XFS_ISIZE(struct xfs_inode *ip)
 }
 
 /*
+ * If this I/O goes past the on-disk inode size update it unless it would
+ * be past the current in-core inode size.
+ */
+static inline xfs_fsize_t
+xfs_new_eof(struct xfs_inode *ip, xfs_fsize_t new_size)
+{
+	xfs_fsize_t i_size = i_size_read(VFS_I(ip));
+
+	if (new_size > i_size)
+		new_size = i_size;
+	return new_size > ip->i_d.di_size ? new_size : 0;
+}
+
+/*
  * i_flags helper functions
  */
 static inline void
@@ -374,10 +387,11 @@ xfs_set_projid(struct xfs_inode *ip,
 #define XFS_IFLOCK		(1 << __XFS_IFLOCK_BIT)
 #define __XFS_IPINNED_BIT	8	 /* wakeup key for zero pin count */
 #define XFS_IPINNED		(1 << __XFS_IPINNED_BIT)
+#define XFS_IDONTCACHE		(1 << 9) /* don't cache the inode long term */
 
 /*
  * Per-lifetime flags need to be reset when re-using a reclaimable inode during
- * inode lookup. Thi prevents unintended behaviour on the new inode from
+ * inode lookup. This prevents unintended behaviour on the new inode from
  * ocurring.
  */
 #define XFS_IRECLAIM_RESET_FLAGS	\
@@ -422,7 +436,6 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
 #define	XFS_IOLOCK_SHARED	(1<<1)
 #define	XFS_ILOCK_EXCL		(1<<2)
 #define	XFS_ILOCK_SHARED	(1<<3)
-#define	XFS_IUNLOCK_NONOTIFY	(1<<4)
 
 #define XFS_LOCK_MASK		(XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \
 				| XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)
@@ -431,8 +444,7 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
 	{ XFS_IOLOCK_EXCL,	"IOLOCK_EXCL" }, \
 	{ XFS_IOLOCK_SHARED,	"IOLOCK_SHARED" }, \
 	{ XFS_ILOCK_EXCL,	"ILOCK_EXCL" }, \
-	{ XFS_ILOCK_SHARED,	"ILOCK_SHARED" }, \
-	{ XFS_IUNLOCK_NONOTIFY,	"IUNLOCK_NONOTIFY" }
+	{ XFS_ILOCK_SHARED,	"ILOCK_SHARED" }
 
 
 /*
@@ -522,10 +534,6 @@ void xfs_promote_inode(struct xfs_inode *);
 void		xfs_lock_inodes(xfs_inode_t **, int, uint);
 void		xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
 
-void		xfs_synchronize_times(xfs_inode_t *);
-void		xfs_mark_inode_dirty(xfs_inode_t *);
-void		xfs_mark_inode_dirty_sync(xfs_inode_t *);
-
 #define IHOLD(ip) \
 do { \
 	ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \
@@ -546,6 +554,7 @@ do { \
  */
 #define XFS_IGET_CREATE		0x1
 #define XFS_IGET_UNTRUSTED	0x2
+#define XFS_IGET_DONTCACHE	0x4
 
 int		xfs_inotobp(struct xfs_mount *, struct xfs_trans *,
 			  xfs_ino_t, struct xfs_dinode **,
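The new xfs_new_eof() helper above clamps a proposed on-disk size update: never past the in-core size, and zero (meaning "no update needed") when the write does not extend the on-disk size. A stand-alone copy of that logic with a tiny driver; the types are simplified to plain 64-bit integers:

#include <stdio.h>
#include <stdint.h>

struct toy_inode {
	int64_t	vfs_size;	/* in-core size, i_size_read() in XFS */
	int64_t	di_size;	/* on-disk size */
};

static int64_t new_eof(struct toy_inode *ip, int64_t new_size)
{
	/* never extend past the in-core size... */
	if (new_size > ip->vfs_size)
		new_size = ip->vfs_size;
	/* ...and return 0 when the on-disk size needs no update */
	return new_size > ip->di_size ? new_size : 0;
}

int main(void)
{
	struct toy_inode ip = { .vfs_size = 8192, .di_size = 4096 };

	printf("%lld\n", (long long)new_eof(&ip, 6000));	/* 6000 */
	printf("%lld\n", (long long)new_eof(&ip, 100000));	/* clamped: 8192 */
	printf("%lld\n", (long long)new_eof(&ip, 1000));	/* 0: no update */
	return 0;
}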
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 91d71dcd4852..05d924efceaf 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -57,77 +57,28 @@ xfs_inode_item_size(
 	struct xfs_inode	*ip = iip->ili_inode;
 	uint			nvecs = 2;
 
-	/*
-	 * Only log the data/extents/b-tree root if there is something
-	 * left to log.
-	 */
-	iip->ili_format.ilf_fields |= XFS_ILOG_CORE;
-
 	switch (ip->i_d.di_format) {
 	case XFS_DINODE_FMT_EXTENTS:
-		iip->ili_format.ilf_fields &=
-			~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
-			  XFS_ILOG_DEV | XFS_ILOG_UUID);
-		if ((iip->ili_format.ilf_fields & XFS_ILOG_DEXT) &&
-		    (ip->i_d.di_nextents > 0) &&
-		    (ip->i_df.if_bytes > 0)) {
-			ASSERT(ip->i_df.if_u1.if_extents != NULL);
+		if ((iip->ili_fields & XFS_ILOG_DEXT) &&
+		    ip->i_d.di_nextents > 0 &&
+		    ip->i_df.if_bytes > 0)
 			nvecs++;
-		} else {
-			iip->ili_format.ilf_fields &= ~XFS_ILOG_DEXT;
-		}
 		break;
 
 	case XFS_DINODE_FMT_BTREE:
-		iip->ili_format.ilf_fields &=
-			~(XFS_ILOG_DDATA | XFS_ILOG_DEXT |
-			  XFS_ILOG_DEV | XFS_ILOG_UUID);
-		if ((iip->ili_format.ilf_fields & XFS_ILOG_DBROOT) &&
-		    (ip->i_df.if_broot_bytes > 0)) {
-			ASSERT(ip->i_df.if_broot != NULL);
+		if ((iip->ili_fields & XFS_ILOG_DBROOT) &&
+		    ip->i_df.if_broot_bytes > 0)
 			nvecs++;
-		} else {
-			ASSERT(!(iip->ili_format.ilf_fields &
-				 XFS_ILOG_DBROOT));
-#ifdef XFS_TRANS_DEBUG
-			if (iip->ili_root_size > 0) {
-				ASSERT(iip->ili_root_size ==
-				       ip->i_df.if_broot_bytes);
-				ASSERT(memcmp(iip->ili_orig_root,
-					      ip->i_df.if_broot,
-					      iip->ili_root_size) == 0);
-			} else {
-				ASSERT(ip->i_df.if_broot_bytes == 0);
-			}
-#endif
-			iip->ili_format.ilf_fields &= ~XFS_ILOG_DBROOT;
-		}
 		break;
 
 	case XFS_DINODE_FMT_LOCAL:
-		iip->ili_format.ilf_fields &=
-			~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT |
-			  XFS_ILOG_DEV | XFS_ILOG_UUID);
-		if ((iip->ili_format.ilf_fields & XFS_ILOG_DDATA) &&
-		    (ip->i_df.if_bytes > 0)) {
-			ASSERT(ip->i_df.if_u1.if_data != NULL);
-			ASSERT(ip->i_d.di_size > 0);
+		if ((iip->ili_fields & XFS_ILOG_DDATA) &&
+		    ip->i_df.if_bytes > 0)
 			nvecs++;
-		} else {
-			iip->ili_format.ilf_fields &= ~XFS_ILOG_DDATA;
-		}
 		break;
 
 	case XFS_DINODE_FMT_DEV:
-		iip->ili_format.ilf_fields &=
-			~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
-			  XFS_ILOG_DEXT | XFS_ILOG_UUID);
-		break;
-
 	case XFS_DINODE_FMT_UUID:
-		iip->ili_format.ilf_fields &=
-			~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
-			  XFS_ILOG_DEXT | XFS_ILOG_DEV);
 		break;
 
 	default:
@@ -135,56 +86,31 @@ xfs_inode_item_size(
 		break;
 	}
 
-	/*
-	 * If there are no attributes associated with this file,
-	 * then there cannot be anything more to log.
-	 * Clear all attribute-related log flags.
-	 */
-	if (!XFS_IFORK_Q(ip)) {
-		iip->ili_format.ilf_fields &=
-			~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT);
+	if (!XFS_IFORK_Q(ip))
 		return nvecs;
-	}
+
 
 	/*
 	 * Log any necessary attribute data.
 	 */
 	switch (ip->i_d.di_aformat) {
 	case XFS_DINODE_FMT_EXTENTS:
-		iip->ili_format.ilf_fields &=
-			~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT);
-		if ((iip->ili_format.ilf_fields & XFS_ILOG_AEXT) &&
-		    (ip->i_d.di_anextents > 0) &&
-		    (ip->i_afp->if_bytes > 0)) {
-			ASSERT(ip->i_afp->if_u1.if_extents != NULL);
+		if ((iip->ili_fields & XFS_ILOG_AEXT) &&
+		    ip->i_d.di_anextents > 0 &&
+		    ip->i_afp->if_bytes > 0)
 			nvecs++;
-		} else {
-			iip->ili_format.ilf_fields &= ~XFS_ILOG_AEXT;
-		}
 		break;
 
 	case XFS_DINODE_FMT_BTREE:
-		iip->ili_format.ilf_fields &=
-			~(XFS_ILOG_ADATA | XFS_ILOG_AEXT);
-		if ((iip->ili_format.ilf_fields & XFS_ILOG_ABROOT) &&
-		    (ip->i_afp->if_broot_bytes > 0)) {
-			ASSERT(ip->i_afp->if_broot != NULL);
+		if ((iip->ili_fields & XFS_ILOG_ABROOT) &&
+		    ip->i_afp->if_broot_bytes > 0)
 			nvecs++;
-		} else {
-			iip->ili_format.ilf_fields &= ~XFS_ILOG_ABROOT;
-		}
 		break;
 
 	case XFS_DINODE_FMT_LOCAL:
-		iip->ili_format.ilf_fields &=
-			~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT);
-		if ((iip->ili_format.ilf_fields & XFS_ILOG_ADATA) &&
-		    (ip->i_afp->if_bytes > 0)) {
-			ASSERT(ip->i_afp->if_u1.if_data != NULL);
+		if ((iip->ili_fields & XFS_ILOG_ADATA) &&
+		    ip->i_afp->if_bytes > 0)
 			nvecs++;
-		} else {
-			iip->ili_format.ilf_fields &= ~XFS_ILOG_ADATA;
-		}
 		break;
 
 	default:
@@ -254,48 +180,11 @@ xfs_inode_item_format(
 	vecp++;
 	nvecs	     = 1;
 
-	/*
-	 * Clear i_update_core if the timestamps (or any other
-	 * non-transactional modification) need flushing/logging
-	 * and we're about to log them with the rest of the core.
-	 *
-	 * This is the same logic as xfs_iflush() but this code can't
-	 * run at the same time as xfs_iflush because we're in commit
-	 * processing here and so we have the inode lock held in
-	 * exclusive mode. Although it doesn't really matter
-	 * for the timestamps if both routines were to grab the
-	 * timestamps or not. That would be ok.
-	 *
-	 * We clear i_update_core before copying out the data.
-	 * This is for coordination with our timestamp updates
-	 * that don't hold the inode lock. They will always
-	 * update the timestamps BEFORE setting i_update_core,
-	 * so if we clear i_update_core after they set it we
-	 * are guaranteed to see their updates to the timestamps
-	 * either here. Likewise, if they set it after we clear it
-	 * here, we'll see it either on the next commit of this
-	 * inode or the next time the inode gets flushed via
-	 * xfs_iflush(). This depends on strongly ordered memory
-	 * semantics, but we have that. We use the SYNCHRONIZE
-	 * macro to make sure that the compiler does not reorder
-	 * the i_update_core access below the data copy below.
-	 */
-	if (ip->i_update_core) {
-		ip->i_update_core = 0;
-		SYNCHRONIZE();
-	}
-
-	/*
-	 * Make sure to get the latest timestamps from the Linux inode.
-	 */
-	xfs_synchronize_times(ip);
-
 	vecp->i_addr = &ip->i_d;
 	vecp->i_len  = sizeof(struct xfs_icdinode);
 	vecp->i_type = XLOG_REG_TYPE_ICORE;
 	vecp++;
 	nvecs++;
-	iip->ili_format.ilf_fields |= XFS_ILOG_CORE;
 
 	/*
 	 * If this is really an old format inode, then we need to
@@ -328,16 +217,17 @@ xfs_inode_item_format(
 
 	switch (ip->i_d.di_format) {
 	case XFS_DINODE_FMT_EXTENTS:
-		ASSERT(!(iip->ili_format.ilf_fields &
-			 (XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
-			  XFS_ILOG_DEV | XFS_ILOG_UUID)));
-		if (iip->ili_format.ilf_fields & XFS_ILOG_DEXT) {
-			ASSERT(ip->i_df.if_bytes > 0);
+		iip->ili_fields &=
+			~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
+			  XFS_ILOG_DEV | XFS_ILOG_UUID);
+
+		if ((iip->ili_fields & XFS_ILOG_DEXT) &&
+		    ip->i_d.di_nextents > 0 &&
+		    ip->i_df.if_bytes > 0) {
 			ASSERT(ip->i_df.if_u1.if_extents != NULL);
-			ASSERT(ip->i_d.di_nextents > 0);
+			ASSERT(ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) > 0);
 			ASSERT(iip->ili_extents_buf == NULL);
-			ASSERT((ip->i_df.if_bytes /
-				(uint)sizeof(xfs_bmbt_rec_t)) > 0);
+
 #ifdef XFS_NATIVE_HOST
 			if (ip->i_d.di_nextents == ip->i_df.if_bytes /
 			    (uint)sizeof(xfs_bmbt_rec_t)) {
@@ -359,15 +249,18 @@ xfs_inode_item_format(
 			iip->ili_format.ilf_dsize = vecp->i_len;
 			vecp++;
 			nvecs++;
+		} else {
+			iip->ili_fields &= ~XFS_ILOG_DEXT;
 		}
 		break;
 
 	case XFS_DINODE_FMT_BTREE:
-		ASSERT(!(iip->ili_format.ilf_fields &
-			 (XFS_ILOG_DDATA | XFS_ILOG_DEXT |
-			  XFS_ILOG_DEV | XFS_ILOG_UUID)));
+		iip->ili_fields &=
+			~(XFS_ILOG_DDATA | XFS_ILOG_DEXT |
+			  XFS_ILOG_DEV | XFS_ILOG_UUID);
369 if (iip->ili_format.ilf_fields & XFS_ILOG_DBROOT) { 261
370 ASSERT(ip->i_df.if_broot_bytes > 0); 262 if ((iip->ili_fields & XFS_ILOG_DBROOT) &&
263 ip->i_df.if_broot_bytes > 0) {
371 ASSERT(ip->i_df.if_broot != NULL); 264 ASSERT(ip->i_df.if_broot != NULL);
372 vecp->i_addr = ip->i_df.if_broot; 265 vecp->i_addr = ip->i_df.if_broot;
373 vecp->i_len = ip->i_df.if_broot_bytes; 266 vecp->i_len = ip->i_df.if_broot_bytes;
@@ -375,15 +268,30 @@ xfs_inode_item_format(
375 vecp++; 268 vecp++;
376 nvecs++; 269 nvecs++;
377 iip->ili_format.ilf_dsize = ip->i_df.if_broot_bytes; 270 iip->ili_format.ilf_dsize = ip->i_df.if_broot_bytes;
271 } else {
272 ASSERT(!(iip->ili_fields &
273 XFS_ILOG_DBROOT));
274#ifdef XFS_TRANS_DEBUG
275 if (iip->ili_root_size > 0) {
276 ASSERT(iip->ili_root_size ==
277 ip->i_df.if_broot_bytes);
278 ASSERT(memcmp(iip->ili_orig_root,
279 ip->i_df.if_broot,
280 iip->ili_root_size) == 0);
281 } else {
282 ASSERT(ip->i_df.if_broot_bytes == 0);
283 }
284#endif
285 iip->ili_fields &= ~XFS_ILOG_DBROOT;
378 } 286 }
379 break; 287 break;
380 288
381 case XFS_DINODE_FMT_LOCAL: 289 case XFS_DINODE_FMT_LOCAL:
382 ASSERT(!(iip->ili_format.ilf_fields & 290 iip->ili_fields &=
383 (XFS_ILOG_DBROOT | XFS_ILOG_DEXT | 291 ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT |
384 XFS_ILOG_DEV | XFS_ILOG_UUID))); 292 XFS_ILOG_DEV | XFS_ILOG_UUID);
385 if (iip->ili_format.ilf_fields & XFS_ILOG_DDATA) { 293 if ((iip->ili_fields & XFS_ILOG_DDATA) &&
386 ASSERT(ip->i_df.if_bytes > 0); 294 ip->i_df.if_bytes > 0) {
387 ASSERT(ip->i_df.if_u1.if_data != NULL); 295 ASSERT(ip->i_df.if_u1.if_data != NULL);
388 ASSERT(ip->i_d.di_size > 0); 296 ASSERT(ip->i_d.di_size > 0);
389 297
@@ -401,24 +309,26 @@ xfs_inode_item_format(
401 vecp++; 309 vecp++;
402 nvecs++; 310 nvecs++;
403 iip->ili_format.ilf_dsize = (unsigned)data_bytes; 311 iip->ili_format.ilf_dsize = (unsigned)data_bytes;
312 } else {
313 iip->ili_fields &= ~XFS_ILOG_DDATA;
404 } 314 }
405 break; 315 break;
406 316
407 case XFS_DINODE_FMT_DEV: 317 case XFS_DINODE_FMT_DEV:
408 ASSERT(!(iip->ili_format.ilf_fields & 318 iip->ili_fields &=
409 (XFS_ILOG_DBROOT | XFS_ILOG_DEXT | 319 ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
410 XFS_ILOG_DDATA | XFS_ILOG_UUID))); 320 XFS_ILOG_DEXT | XFS_ILOG_UUID);
411 if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) { 321 if (iip->ili_fields & XFS_ILOG_DEV) {
412 iip->ili_format.ilf_u.ilfu_rdev = 322 iip->ili_format.ilf_u.ilfu_rdev =
413 ip->i_df.if_u2.if_rdev; 323 ip->i_df.if_u2.if_rdev;
414 } 324 }
415 break; 325 break;
416 326
417 case XFS_DINODE_FMT_UUID: 327 case XFS_DINODE_FMT_UUID:
418 ASSERT(!(iip->ili_format.ilf_fields & 328 iip->ili_fields &=
419 (XFS_ILOG_DBROOT | XFS_ILOG_DEXT | 329 ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
420 XFS_ILOG_DDATA | XFS_ILOG_DEV))); 330 XFS_ILOG_DEXT | XFS_ILOG_DEV);
421 if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) { 331 if (iip->ili_fields & XFS_ILOG_UUID) {
422 iip->ili_format.ilf_u.ilfu_uuid = 332 iip->ili_format.ilf_u.ilfu_uuid =
423 ip->i_df.if_u2.if_uuid; 333 ip->i_df.if_u2.if_uuid;
424 } 334 }
@@ -430,31 +340,25 @@ xfs_inode_item_format(
430 } 340 }
431 341
432 /* 342 /*
433 * If there are no attributes associated with the file, 343 * If there are no attributes associated with the file, then we're done.
434 * then we're done.
435 * Assert that no attribute-related log flags are set.
436 */ 344 */
437 if (!XFS_IFORK_Q(ip)) { 345 if (!XFS_IFORK_Q(ip)) {
438 iip->ili_format.ilf_size = nvecs; 346 iip->ili_fields &=
439 ASSERT(!(iip->ili_format.ilf_fields & 347 ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT);
440 (XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT))); 348 goto out;
441 return;
442 } 349 }
443 350
444 switch (ip->i_d.di_aformat) { 351 switch (ip->i_d.di_aformat) {
445 case XFS_DINODE_FMT_EXTENTS: 352 case XFS_DINODE_FMT_EXTENTS:
446 ASSERT(!(iip->ili_format.ilf_fields & 353 iip->ili_fields &=
447 (XFS_ILOG_ADATA | XFS_ILOG_ABROOT))); 354 ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT);
448 if (iip->ili_format.ilf_fields & XFS_ILOG_AEXT) { 355
449#ifdef DEBUG 356 if ((iip->ili_fields & XFS_ILOG_AEXT) &&
450 int nrecs = ip->i_afp->if_bytes / 357 ip->i_d.di_anextents > 0 &&
451 (uint)sizeof(xfs_bmbt_rec_t); 358 ip->i_afp->if_bytes > 0) {
452 ASSERT(nrecs > 0); 359 ASSERT(ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) ==
453 ASSERT(nrecs == ip->i_d.di_anextents); 360 ip->i_d.di_anextents);
454 ASSERT(ip->i_afp->if_bytes > 0);
455 ASSERT(ip->i_afp->if_u1.if_extents != NULL); 361 ASSERT(ip->i_afp->if_u1.if_extents != NULL);
456 ASSERT(ip->i_d.di_anextents > 0);
457#endif
458#ifdef XFS_NATIVE_HOST 362#ifdef XFS_NATIVE_HOST
459 /* 363 /*
460 * There are no delayed allocation extents 364
@@ -471,29 +375,36 @@ xfs_inode_item_format(
471 iip->ili_format.ilf_asize = vecp->i_len; 375 iip->ili_format.ilf_asize = vecp->i_len;
472 vecp++; 376 vecp++;
473 nvecs++; 377 nvecs++;
378 } else {
379 iip->ili_fields &= ~XFS_ILOG_AEXT;
474 } 380 }
475 break; 381 break;
476 382
477 case XFS_DINODE_FMT_BTREE: 383 case XFS_DINODE_FMT_BTREE:
478 ASSERT(!(iip->ili_format.ilf_fields & 384 iip->ili_fields &=
479 (XFS_ILOG_ADATA | XFS_ILOG_AEXT))); 385 ~(XFS_ILOG_ADATA | XFS_ILOG_AEXT);
480 if (iip->ili_format.ilf_fields & XFS_ILOG_ABROOT) { 386
481 ASSERT(ip->i_afp->if_broot_bytes > 0); 387 if ((iip->ili_fields & XFS_ILOG_ABROOT) &&
388 ip->i_afp->if_broot_bytes > 0) {
482 ASSERT(ip->i_afp->if_broot != NULL); 389 ASSERT(ip->i_afp->if_broot != NULL);
390
483 vecp->i_addr = ip->i_afp->if_broot; 391 vecp->i_addr = ip->i_afp->if_broot;
484 vecp->i_len = ip->i_afp->if_broot_bytes; 392 vecp->i_len = ip->i_afp->if_broot_bytes;
485 vecp->i_type = XLOG_REG_TYPE_IATTR_BROOT; 393 vecp->i_type = XLOG_REG_TYPE_IATTR_BROOT;
486 vecp++; 394 vecp++;
487 nvecs++; 395 nvecs++;
488 iip->ili_format.ilf_asize = ip->i_afp->if_broot_bytes; 396 iip->ili_format.ilf_asize = ip->i_afp->if_broot_bytes;
397 } else {
398 iip->ili_fields &= ~XFS_ILOG_ABROOT;
489 } 399 }
490 break; 400 break;
491 401
492 case XFS_DINODE_FMT_LOCAL: 402 case XFS_DINODE_FMT_LOCAL:
493 ASSERT(!(iip->ili_format.ilf_fields & 403 iip->ili_fields &=
494 (XFS_ILOG_ABROOT | XFS_ILOG_AEXT))); 404 ~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT);
495 if (iip->ili_format.ilf_fields & XFS_ILOG_ADATA) { 405
496 ASSERT(ip->i_afp->if_bytes > 0); 406 if ((iip->ili_fields & XFS_ILOG_ADATA) &&
407 ip->i_afp->if_bytes > 0) {
497 ASSERT(ip->i_afp->if_u1.if_data != NULL); 408 ASSERT(ip->i_afp->if_u1.if_data != NULL);
498 409
499 vecp->i_addr = ip->i_afp->if_u1.if_data; 410 vecp->i_addr = ip->i_afp->if_u1.if_data;
@@ -510,6 +421,8 @@ xfs_inode_item_format(
510 vecp++; 421 vecp++;
511 nvecs++; 422 nvecs++;
512 iip->ili_format.ilf_asize = (unsigned)data_bytes; 423 iip->ili_format.ilf_asize = (unsigned)data_bytes;
424 } else {
425 iip->ili_fields &= ~XFS_ILOG_ADATA;
513 } 426 }
514 break; 427 break;
515 428
@@ -518,6 +431,15 @@ xfs_inode_item_format(
518 break; 431 break;
519 } 432 }
520 433
434out:
435 /*
436 * Now update the log format that goes out to disk from the in-core
437 * values. We always write the inode core to make the arithmetic
438 * games in recovery easier, which isn't a big deal as just about any
439 * transaction would dirty it anyway.
440 */
441 iip->ili_format.ilf_fields = XFS_ILOG_CORE |
442 (iip->ili_fields & ~XFS_ILOG_TIMESTAMP);
521 iip->ili_format.ilf_size = nvecs; 443 iip->ili_format.ilf_size = nvecs;
522} 444}
523 445
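Note: the new out: tail above is the single point where the on-disk ilf_fields
is derived from the in-core state. A standalone sketch of that derivation
(XFS_ILOG_CORE is assumed to be 0x001; the helper name is hypothetical):

	#define XFS_ILOG_CORE		0x001	/* assumed value */
	#define XFS_ILOG_TIMESTAMP	0x4000	/* in-core only, see below */

	/* hypothetical helper mirroring the tail of xfs_inode_item_format() */
	static unsigned int
	ilf_fields_for_disk(unsigned int ili_fields)
	{
		/* always log the core; never let the timestamp bit hit disk */
		return XFS_ILOG_CORE | (ili_fields & ~XFS_ILOG_TIMESTAMP);
	}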
@@ -596,17 +518,13 @@ xfs_inode_item_trylock(
596 /* Stale items should force out the iclog */ 518 /* Stale items should force out the iclog */
597 if (ip->i_flags & XFS_ISTALE) { 519 if (ip->i_flags & XFS_ISTALE) {
598 xfs_ifunlock(ip); 520 xfs_ifunlock(ip);
599 /* 521 xfs_iunlock(ip, XFS_ILOCK_SHARED);
600 * we hold the AIL lock - notify the unlock routine of this
601 * so it doesn't try to get the lock again.
602 */
603 xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY);
604 return XFS_ITEM_PINNED; 522 return XFS_ITEM_PINNED;
605 } 523 }
606 524
607#ifdef DEBUG 525#ifdef DEBUG
608 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { 526 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
609 ASSERT(iip->ili_format.ilf_fields != 0); 527 ASSERT(iip->ili_fields != 0);
610 ASSERT(iip->ili_logged == 0); 528 ASSERT(iip->ili_logged == 0);
611 ASSERT(lip->li_flags & XFS_LI_IN_AIL); 529 ASSERT(lip->li_flags & XFS_LI_IN_AIL);
612 } 530 }
@@ -638,7 +556,7 @@ xfs_inode_item_unlock(
638 if (iip->ili_extents_buf != NULL) { 556 if (iip->ili_extents_buf != NULL) {
639 ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS); 557 ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS);
640 ASSERT(ip->i_d.di_nextents > 0); 558 ASSERT(ip->i_d.di_nextents > 0);
641 ASSERT(iip->ili_format.ilf_fields & XFS_ILOG_DEXT); 559 ASSERT(iip->ili_fields & XFS_ILOG_DEXT);
642 ASSERT(ip->i_df.if_bytes > 0); 560 ASSERT(ip->i_df.if_bytes > 0);
643 kmem_free(iip->ili_extents_buf); 561 kmem_free(iip->ili_extents_buf);
644 iip->ili_extents_buf = NULL; 562 iip->ili_extents_buf = NULL;
@@ -646,7 +564,7 @@ xfs_inode_item_unlock(
646 if (iip->ili_aextents_buf != NULL) { 564 if (iip->ili_aextents_buf != NULL) {
647 ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS); 565 ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS);
648 ASSERT(ip->i_d.di_anextents > 0); 566 ASSERT(ip->i_d.di_anextents > 0);
649 ASSERT(iip->ili_format.ilf_fields & XFS_ILOG_AEXT); 567 ASSERT(iip->ili_fields & XFS_ILOG_AEXT);
650 ASSERT(ip->i_afp->if_bytes > 0); 568 ASSERT(ip->i_afp->if_bytes > 0);
651 kmem_free(iip->ili_aextents_buf); 569 kmem_free(iip->ili_aextents_buf);
652 iip->ili_aextents_buf = NULL; 570 iip->ili_aextents_buf = NULL;
@@ -761,8 +679,7 @@ xfs_inode_item_push(
761 * lock without sleeping, then there must not have been 679 * lock without sleeping, then there must not have been
762 * anyone in the process of flushing the inode. 680 * anyone in the process of flushing the inode.
763 */ 681 */
764 ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || 682 ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || iip->ili_fields != 0);
765 iip->ili_format.ilf_fields != 0);
766 683
767 /* 684 /*
768 * Push the inode to its backing buffer. This will not remove the 685
@@ -985,7 +902,7 @@ xfs_iflush_abort(
985 * Clear the inode logging fields so no more flushes are 902 * Clear the inode logging fields so no more flushes are
986 * attempted. 903 * attempted.
987 */ 904 */
988 iip->ili_format.ilf_fields = 0; 905 iip->ili_fields = 0;
989 } 906 }
990 /* 907 /*
991 * Release the inode's flush lock since we're done with it. 908 * Release the inode's flush lock since we're done with it.
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index d3dee61e6d91..41d61c3b7a36 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -86,6 +86,15 @@ typedef struct xfs_inode_log_format_64 {
86#define XFS_ILOG_AEXT 0x080 /* log i_af.if_extents */ 86#define XFS_ILOG_AEXT 0x080 /* log i_af.if_extents */
87#define XFS_ILOG_ABROOT 0x100 /* log i_af.i_broot */ 87#define XFS_ILOG_ABROOT 0x100 /* log i_af.i_broot */
88 88
89
90/*
91 * The timestamps are dirty, but not necessarily anything else in the inode
92 * core. Unlike the other fields above, this one must never make it to disk
93 * in the ilf_fields of the inode_log_format; it is purely stored in-memory
94 * in ili_fields in the inode_log_item.
95 */
96#define XFS_ILOG_TIMESTAMP 0x4000
97
89#define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \ 98#define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
90 XFS_ILOG_DBROOT | XFS_ILOG_DEV | \ 99 XFS_ILOG_DBROOT | XFS_ILOG_DEV | \
91 XFS_ILOG_UUID | XFS_ILOG_ADATA | \ 100 XFS_ILOG_UUID | XFS_ILOG_ADATA | \
@@ -101,7 +110,7 @@ typedef struct xfs_inode_log_format_64 {
101 XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \ 110 XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \
102 XFS_ILOG_DEV | XFS_ILOG_UUID | \ 111 XFS_ILOG_DEV | XFS_ILOG_UUID | \
103 XFS_ILOG_ADATA | XFS_ILOG_AEXT | \ 112 XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
104 XFS_ILOG_ABROOT) 113 XFS_ILOG_ABROOT | XFS_ILOG_TIMESTAMP)
105 114
106static inline int xfs_ilog_fbroot(int w) 115static inline int xfs_ilog_fbroot(int w)
107{ 116{
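Note: XFS_ILOG_TIMESTAMP is placed at 0x4000, well clear of the on-disk flags
above, precisely because it may never be written out. After
xfs_inode_item_format() runs, the following invariant should hold
(assertion-style sketch, variables as in xfs_inode_item.c, not part of the
patch):

	/* the in-core-only bit has been masked off the on-disk copy */
	ASSERT(!(iip->ili_format.ilf_fields & XFS_ILOG_TIMESTAMP));
	ASSERT(iip->ili_format.ilf_fields & XFS_ILOG_CORE);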
@@ -134,6 +143,7 @@ typedef struct xfs_inode_log_item {
134 unsigned short ili_lock_flags; /* lock flags */ 143 unsigned short ili_lock_flags; /* lock flags */
135 unsigned short ili_logged; /* flushed logged data */ 144 unsigned short ili_logged; /* flushed logged data */
136 unsigned int ili_last_fields; /* fields when flushed */ 145 unsigned int ili_last_fields; /* fields when flushed */
146 unsigned int ili_fields; /* fields to be logged */
137 struct xfs_bmbt_rec *ili_extents_buf; /* array of logged 147 struct xfs_bmbt_rec *ili_extents_buf; /* array of logged
138 data exts */ 148 data exts */
139 struct xfs_bmbt_rec *ili_aextents_buf; /* array of logged 149 struct xfs_bmbt_rec *ili_aextents_buf; /* array of logged
@@ -148,9 +158,7 @@ typedef struct xfs_inode_log_item {
148 158
149static inline int xfs_inode_clean(xfs_inode_t *ip) 159static inline int xfs_inode_clean(xfs_inode_t *ip)
150{ 160{
151 return (!ip->i_itemp || 161 return !ip->i_itemp || !(ip->i_itemp->ili_fields & XFS_ILOG_ALL);
152 !(ip->i_itemp->ili_format.ilf_fields & XFS_ILOG_ALL)) &&
153 !ip->i_update_core;
154} 162}
155 163
156extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *); 164extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *);
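Note: folding XFS_ILOG_TIMESTAMP into XFS_ILOG_ALL is what lets
xfs_inode_clean() drop the i_update_core test: a timestamp-only update now
keeps the inode dirty through ili_fields alone. Sketched as a hypothetical
check:

	xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP); /* timestamps only */
	ASSERT(!xfs_inode_clean(ip));	/* dirty via ili_fields, no side channel */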
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 76f3ca5cfc36..91f8ff547ab3 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -209,6 +209,7 @@ xfs_open_by_handle(
209 struct file *filp; 209 struct file *filp;
210 struct inode *inode; 210 struct inode *inode;
211 struct dentry *dentry; 211 struct dentry *dentry;
212 fmode_t fmode;
212 213
213 if (!capable(CAP_SYS_ADMIN)) 214 if (!capable(CAP_SYS_ADMIN))
214 return -XFS_ERROR(EPERM); 215 return -XFS_ERROR(EPERM);
@@ -228,26 +229,21 @@ xfs_open_by_handle(
228 hreq->oflags |= O_LARGEFILE; 229 hreq->oflags |= O_LARGEFILE;
229#endif 230#endif
230 231
231 /* Put open permission in namei format. */
232 permflag = hreq->oflags; 232 permflag = hreq->oflags;
233 if ((permflag+1) & O_ACCMODE) 233 fmode = OPEN_FMODE(permflag);
234 permflag++;
235 if (permflag & O_TRUNC)
236 permflag |= 2;
237
238 if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) && 234 if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) &&
239 (permflag & FMODE_WRITE) && IS_APPEND(inode)) { 235 (fmode & FMODE_WRITE) && IS_APPEND(inode)) {
240 error = -XFS_ERROR(EPERM); 236 error = -XFS_ERROR(EPERM);
241 goto out_dput; 237 goto out_dput;
242 } 238 }
243 239
244 if ((permflag & FMODE_WRITE) && IS_IMMUTABLE(inode)) { 240 if ((fmode & FMODE_WRITE) && IS_IMMUTABLE(inode)) {
245 error = -XFS_ERROR(EACCES); 241 error = -XFS_ERROR(EACCES);
246 goto out_dput; 242 goto out_dput;
247 } 243 }
248 244
249 /* Can't write directories. */ 245 /* Can't write directories. */
250 if (S_ISDIR(inode->i_mode) && (permflag & FMODE_WRITE)) { 246 if (S_ISDIR(inode->i_mode) && (fmode & FMODE_WRITE)) {
251 error = -XFS_ERROR(EISDIR); 247 error = -XFS_ERROR(EISDIR);
252 goto out_dput; 248 goto out_dput;
253 } 249 }
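Note: OPEN_FMODE() replaces the hand-rolled "namei format" conversion deleted
above. The generic <linux/fs.h> helper is believed to reduce to the same
(flags + 1) & O_ACCMODE trick:

	/* sketch of the helper this code now relies on */
	#define OPEN_FMODE(flag) ((__force fmode_t)(((flag + 1) & O_ACCMODE) | \
						    (flag & __FMODE_NONOTIFY)))

so fmode carries FMODE_READ/FMODE_WRITE derived from the open flags, while
O_APPEND and O_TRUNC are still tested on the raw permflag.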
@@ -450,9 +446,12 @@ xfs_attrmulti_attr_get(
450 446
451 if (*len > XATTR_SIZE_MAX) 447 if (*len > XATTR_SIZE_MAX)
452 return EINVAL; 448 return EINVAL;
453 kbuf = kmalloc(*len, GFP_KERNEL); 449 kbuf = kmem_zalloc(*len, KM_SLEEP | KM_MAYFAIL);
454 if (!kbuf) 450 if (!kbuf) {
455 return ENOMEM; 451 kbuf = kmem_zalloc_large(*len);
452 if (!kbuf)
453 return ENOMEM;
454 }
456 455
457 error = xfs_attr_get(XFS_I(inode), name, kbuf, (int *)len, flags); 456 error = xfs_attr_get(XFS_I(inode), name, kbuf, (int *)len, flags);
458 if (error) 457 if (error)
@@ -462,7 +461,10 @@ xfs_attrmulti_attr_get(
462 error = EFAULT; 461 error = EFAULT;
463 462
464 out_kfree: 463 out_kfree:
465 kfree(kbuf); 464 if (is_vmalloc_addr(kbuf))
465 kmem_free_large(kbuf);
466 else
467 kmem_free(kbuf);
466 return error; 468 return error;
467} 469}
468 470
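Note: the allocation change lets xattr buffers up to XATTR_SIZE_MAX succeed
even when physically contiguous pages are scarce: try the slab allocator
first, fall back to a vmalloc-backed buffer, and pick the matching free path
by address. The same pattern in plain kernel primitives (a hedged sketch; the
XFS kmem_* wrappers are assumed to map onto these):

	#include <linux/mm.h>		/* is_vmalloc_addr() */
	#include <linux/slab.h>
	#include <linux/vmalloc.h>

	static void *zalloc_maybe_large(size_t len)
	{
		void *buf = kzalloc(len, GFP_KERNEL | __GFP_NOWARN);

		return buf ? buf : vzalloc(len);	/* vmalloc fallback */
	}

	static void free_maybe_large(void *buf)
	{
		if (is_vmalloc_addr(buf))
			vfree(buf);
		else
			kfree(buf);
	}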
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index f9ccb7b7c043..a849a5473aff 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -293,7 +293,7 @@ xfs_compat_ioc_bulkstat(
293 int res; 293 int res;
294 294
295 error = xfs_bulkstat_one_compat(mp, inlast, bulkreq.ubuffer, 295 error = xfs_bulkstat_one_compat(mp, inlast, bulkreq.ubuffer,
296 sizeof(compat_xfs_bstat_t), 0, &res); 296 sizeof(compat_xfs_bstat_t), NULL, &res);
297 } else if (cmd == XFS_IOC_FSBULKSTAT_32) { 297 } else if (cmd == XFS_IOC_FSBULKSTAT_32) {
298 error = xfs_bulkstat(mp, &inlast, &count, 298 error = xfs_bulkstat(mp, &inlast, &count,
299 xfs_bulkstat_one_compat, sizeof(compat_xfs_bstat_t), 299 xfs_bulkstat_one_compat, sizeof(compat_xfs_bstat_t),
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 246c7d57c6f9..71a464503c43 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -31,6 +31,7 @@
31#include "xfs_ialloc_btree.h" 31#include "xfs_ialloc_btree.h"
32#include "xfs_dinode.h" 32#include "xfs_dinode.h"
33#include "xfs_inode.h" 33#include "xfs_inode.h"
34#include "xfs_inode_item.h"
34#include "xfs_btree.h" 35#include "xfs_btree.h"
35#include "xfs_bmap.h" 36#include "xfs_bmap.h"
36#include "xfs_rtalloc.h" 37#include "xfs_rtalloc.h"
@@ -645,6 +646,7 @@ xfs_iomap_write_unwritten(
645 xfs_trans_t *tp; 646 xfs_trans_t *tp;
646 xfs_bmbt_irec_t imap; 647 xfs_bmbt_irec_t imap;
647 xfs_bmap_free_t free_list; 648 xfs_bmap_free_t free_list;
649 xfs_fsize_t i_size;
648 uint resblks; 650 uint resblks;
649 int committed; 651 int committed;
650 int error; 652 int error;
@@ -705,7 +707,22 @@ xfs_iomap_write_unwritten(
705 if (error) 707 if (error)
706 goto error_on_bmapi_transaction; 708 goto error_on_bmapi_transaction;
707 709
708 error = xfs_bmap_finish(&(tp), &(free_list), &committed); 710 /*
711 * Log the updated inode size as we go. We have to be careful
712 * to only log it up to the actual write offset if it is
713 * halfway into a block.
714 */
715 i_size = XFS_FSB_TO_B(mp, offset_fsb + count_fsb);
716 if (i_size > offset + count)
717 i_size = offset + count;
718
719 i_size = xfs_new_eof(ip, i_size);
720 if (i_size) {
721 ip->i_d.di_size = i_size;
722 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
723 }
724
725 error = xfs_bmap_finish(&tp, &free_list, &committed);
709 if (error) 726 if (error)
710 goto error_on_bmapi_transaction; 727 goto error_on_bmapi_transaction;
711 728
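Note: the unwritten-extent conversion works in whole filesystem blocks, so the
candidate EOF computed from offset_fsb + count_fsb can overshoot the bytes
actually written; the clamp to offset + count keeps a partial final block from
inflating the logged size. The update, annotated in isolation:

	i_size = XFS_FSB_TO_B(mp, offset_fsb + count_fsb); /* block-rounded */
	if (i_size > offset + count)
		i_size = offset + count;	/* clamp to the real write */

	i_size = xfs_new_eof(ip, i_size);	/* nonzero only if EOF grows */
	if (i_size) {
		ip->i_d.di_size = i_size;
		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
	}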
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index ab302539e5b9..3011b879f850 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -50,65 +50,15 @@
50#include <linux/fiemap.h> 50#include <linux/fiemap.h>
51#include <linux/slab.h> 51#include <linux/slab.h>
52 52
53/* 53static int
54 * Bring the timestamps in the XFS inode uptodate. 54xfs_initxattrs(
55 * 55 struct inode *inode,
56 * Used before writing the inode to disk. 56 const struct xattr *xattr_array,
57 */ 57 void *fs_info)
58void
59xfs_synchronize_times(
60 xfs_inode_t *ip)
61{
62 struct inode *inode = VFS_I(ip);
63
64 ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec;
65 ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec;
66 ip->i_d.di_ctime.t_sec = (__int32_t)inode->i_ctime.tv_sec;
67 ip->i_d.di_ctime.t_nsec = (__int32_t)inode->i_ctime.tv_nsec;
68 ip->i_d.di_mtime.t_sec = (__int32_t)inode->i_mtime.tv_sec;
69 ip->i_d.di_mtime.t_nsec = (__int32_t)inode->i_mtime.tv_nsec;
70}
71
72/*
73 * If the linux inode is valid, mark it dirty, else mark the dirty state
74 * in the XFS inode to make sure we pick it up when reclaiming the inode.
75 */
76void
77xfs_mark_inode_dirty_sync(
78 xfs_inode_t *ip)
79{
80 struct inode *inode = VFS_I(ip);
81
82 if (!(inode->i_state & (I_WILL_FREE|I_FREEING)))
83 mark_inode_dirty_sync(inode);
84 else {
85 barrier();
86 ip->i_update_core = 1;
87 }
88}
89
90void
91xfs_mark_inode_dirty(
92 xfs_inode_t *ip)
93{
94 struct inode *inode = VFS_I(ip);
95
96 if (!(inode->i_state & (I_WILL_FREE|I_FREEING)))
97 mark_inode_dirty(inode);
98 else {
99 barrier();
100 ip->i_update_core = 1;
101 }
102
103}
104
105
106int xfs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
107 void *fs_info)
108{ 58{
109 const struct xattr *xattr; 59 const struct xattr *xattr;
110 struct xfs_inode *ip = XFS_I(inode); 60 struct xfs_inode *ip = XFS_I(inode);
111 int error = 0; 61 int error = 0;
112 62
113 for (xattr = xattr_array; xattr->name != NULL; xattr++) { 63 for (xattr = xattr_array; xattr->name != NULL; xattr++) {
114 error = xfs_attr_set(ip, xattr->name, xattr->value, 64 error = xfs_attr_set(ip, xattr->name, xattr->value,
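Note: xfs_synchronize_times() and the xfs_mark_inode_dirty*() helpers removed
above were the remaining users of i_update_core. Timestamp changes are now
expected to reach the incore dinode through a transaction, keeping the copy
current at commit time instead of patching it up at flush time; a hedged
sketch of the replacement idiom:

	/* update the VFS inode and the incore dinode together, then log */
	inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb);
	ip->i_d.di_mtime.t_sec = (__int32_t)inode->i_mtime.tv_sec;
	ip->i_d.di_mtime.t_nsec = (__int32_t)inode->i_mtime.tv_nsec;
	xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP);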
@@ -678,19 +628,16 @@ xfs_setattr_nonsize(
678 inode->i_atime = iattr->ia_atime; 628 inode->i_atime = iattr->ia_atime;
679 ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec; 629 ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec;
680 ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec; 630 ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec;
681 ip->i_update_core = 1;
682 } 631 }
683 if (mask & ATTR_CTIME) { 632 if (mask & ATTR_CTIME) {
684 inode->i_ctime = iattr->ia_ctime; 633 inode->i_ctime = iattr->ia_ctime;
685 ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; 634 ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
686 ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec; 635 ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
687 ip->i_update_core = 1;
688 } 636 }
689 if (mask & ATTR_MTIME) { 637 if (mask & ATTR_MTIME) {
690 inode->i_mtime = iattr->ia_mtime; 638 inode->i_mtime = iattr->ia_mtime;
691 ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; 639 ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
692 ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; 640 ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
693 ip->i_update_core = 1;
694 } 641 }
695 642
696 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 643 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
@@ -918,13 +865,11 @@ xfs_setattr_size(
918 inode->i_ctime = iattr->ia_ctime; 865 inode->i_ctime = iattr->ia_ctime;
919 ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; 866 ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
920 ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec; 867 ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
921 ip->i_update_core = 1;
922 } 868 }
923 if (mask & ATTR_MTIME) { 869 if (mask & ATTR_MTIME) {
924 inode->i_mtime = iattr->ia_mtime; 870 inode->i_mtime = iattr->ia_mtime;
925 ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; 871 ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
926 ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; 872 ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
927 ip->i_update_core = 1;
928 } 873 }
929 874
930 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 875 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 751e94fe1f77..acc2bf264dab 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -62,7 +62,6 @@ xfs_bulkstat_one_int(
62{ 62{
63 struct xfs_icdinode *dic; /* dinode core info pointer */ 63 struct xfs_icdinode *dic; /* dinode core info pointer */
64 struct xfs_inode *ip; /* incore inode pointer */ 64 struct xfs_inode *ip; /* incore inode pointer */
65 struct inode *inode;
66 struct xfs_bstat *buf; /* return buffer */ 65 struct xfs_bstat *buf; /* return buffer */
67 int error = 0; /* error value */ 66 int error = 0; /* error value */
68 67
@@ -76,7 +75,8 @@ xfs_bulkstat_one_int(
76 return XFS_ERROR(ENOMEM); 75 return XFS_ERROR(ENOMEM);
77 76
78 error = xfs_iget(mp, NULL, ino, 77 error = xfs_iget(mp, NULL, ino,
79 XFS_IGET_UNTRUSTED, XFS_ILOCK_SHARED, &ip); 78 (XFS_IGET_DONTCACHE | XFS_IGET_UNTRUSTED),
79 XFS_ILOCK_SHARED, &ip);
80 if (error) { 80 if (error) {
81 *stat = BULKSTAT_RV_NOTHING; 81 *stat = BULKSTAT_RV_NOTHING;
82 goto out_free; 82 goto out_free;
@@ -86,7 +86,6 @@ xfs_bulkstat_one_int(
86 ASSERT(ip->i_imap.im_blkno != 0); 86 ASSERT(ip->i_imap.im_blkno != 0);
87 87
88 dic = &ip->i_d; 88 dic = &ip->i_d;
89 inode = VFS_I(ip);
90 89
91 /* xfs_iget returns the following without needing 90 /* xfs_iget returns the following without needing
92 * further change. 91 * further change.
@@ -99,19 +98,12 @@ xfs_bulkstat_one_int(
99 buf->bs_uid = dic->di_uid; 98 buf->bs_uid = dic->di_uid;
100 buf->bs_gid = dic->di_gid; 99 buf->bs_gid = dic->di_gid;
101 buf->bs_size = dic->di_size; 100 buf->bs_size = dic->di_size;
102 101 buf->bs_atime.tv_sec = dic->di_atime.t_sec;
103 /* 102 buf->bs_atime.tv_nsec = dic->di_atime.t_nsec;
104 * We need to read the timestamps from the Linux inode because 103 buf->bs_mtime.tv_sec = dic->di_mtime.t_sec;
105 * the VFS keeps writing directly into the inode structure instead 104 buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec;
106 * of telling us about the updates. 105 buf->bs_ctime.tv_sec = dic->di_ctime.t_sec;
107 */ 106 buf->bs_ctime.tv_nsec = dic->di_ctime.t_nsec;
108 buf->bs_atime.tv_sec = inode->i_atime.tv_sec;
109 buf->bs_atime.tv_nsec = inode->i_atime.tv_nsec;
110 buf->bs_mtime.tv_sec = inode->i_mtime.tv_sec;
111 buf->bs_mtime.tv_nsec = inode->i_mtime.tv_nsec;
112 buf->bs_ctime.tv_sec = inode->i_ctime.tv_sec;
113 buf->bs_ctime.tv_nsec = inode->i_ctime.tv_nsec;
114
115 buf->bs_xflags = xfs_ip2xflags(ip); 107 buf->bs_xflags = xfs_ip2xflags(ip);
116 buf->bs_extsize = dic->di_extsize << mp->m_sb.sb_blocklog; 108 buf->bs_extsize = dic->di_extsize << mp->m_sb.sb_blocklog;
117 buf->bs_extents = dic->di_nextents; 109 buf->bs_extents = dic->di_nextents;
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index e2cc3568c299..6db1fef38bff 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -67,15 +67,10 @@ STATIC void xlog_state_switch_iclogs(xlog_t *log,
67 int eventual_size); 67 int eventual_size);
68STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog); 68STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog);
69 69
70/* local functions to manipulate grant head */
71STATIC int xlog_grant_log_space(xlog_t *log,
72 xlog_ticket_t *xtic);
73STATIC void xlog_grant_push_ail(struct log *log, 70STATIC void xlog_grant_push_ail(struct log *log,
74 int need_bytes); 71 int need_bytes);
75STATIC void xlog_regrant_reserve_log_space(xlog_t *log, 72STATIC void xlog_regrant_reserve_log_space(xlog_t *log,
76 xlog_ticket_t *ticket); 73 xlog_ticket_t *ticket);
77STATIC int xlog_regrant_write_log_space(xlog_t *log,
78 xlog_ticket_t *ticket);
79STATIC void xlog_ungrant_log_space(xlog_t *log, 74STATIC void xlog_ungrant_log_space(xlog_t *log,
80 xlog_ticket_t *ticket); 75 xlog_ticket_t *ticket);
81 76
@@ -150,78 +145,93 @@ xlog_grant_add_space(
150 } while (head_val != old); 145 } while (head_val != old);
151} 146}
152 147
153STATIC bool 148STATIC void
154xlog_reserveq_wake( 149xlog_grant_head_init(
155 struct log *log, 150 struct xlog_grant_head *head)
156 int *free_bytes) 151{
152 xlog_assign_grant_head(&head->grant, 1, 0);
153 INIT_LIST_HEAD(&head->waiters);
154 spin_lock_init(&head->lock);
155}
156
157STATIC void
158xlog_grant_head_wake_all(
159 struct xlog_grant_head *head)
157{ 160{
158 struct xlog_ticket *tic; 161 struct xlog_ticket *tic;
159 int need_bytes;
160 162
161 list_for_each_entry(tic, &log->l_reserveq, t_queue) { 163 spin_lock(&head->lock);
164 list_for_each_entry(tic, &head->waiters, t_queue)
165 wake_up_process(tic->t_task);
166 spin_unlock(&head->lock);
167}
168
169static inline int
170xlog_ticket_reservation(
171 struct log *log,
172 struct xlog_grant_head *head,
173 struct xlog_ticket *tic)
174{
175 if (head == &log->l_write_head) {
176 ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
177 return tic->t_unit_res;
178 } else {
162 if (tic->t_flags & XLOG_TIC_PERM_RESERV) 179 if (tic->t_flags & XLOG_TIC_PERM_RESERV)
163 need_bytes = tic->t_unit_res * tic->t_cnt; 180 return tic->t_unit_res * tic->t_cnt;
164 else 181 else
165 need_bytes = tic->t_unit_res; 182 return tic->t_unit_res;
166
167 if (*free_bytes < need_bytes)
168 return false;
169 *free_bytes -= need_bytes;
170
171 trace_xfs_log_grant_wake_up(log, tic);
172 wake_up(&tic->t_wait);
173 } 183 }
174
175 return true;
176} 184}
177 185
178STATIC bool 186STATIC bool
179xlog_writeq_wake( 187xlog_grant_head_wake(
180 struct log *log, 188 struct log *log,
189 struct xlog_grant_head *head,
181 int *free_bytes) 190 int *free_bytes)
182{ 191{
183 struct xlog_ticket *tic; 192 struct xlog_ticket *tic;
184 int need_bytes; 193 int need_bytes;
185 194
186 list_for_each_entry(tic, &log->l_writeq, t_queue) { 195 list_for_each_entry(tic, &head->waiters, t_queue) {
187 ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV); 196 need_bytes = xlog_ticket_reservation(log, head, tic);
188
189 need_bytes = tic->t_unit_res;
190
191 if (*free_bytes < need_bytes) 197 if (*free_bytes < need_bytes)
192 return false; 198 return false;
193 *free_bytes -= need_bytes;
194 199
195 trace_xfs_log_regrant_write_wake_up(log, tic); 200 *free_bytes -= need_bytes;
196 wake_up(&tic->t_wait); 201 trace_xfs_log_grant_wake_up(log, tic);
202 wake_up_process(tic->t_task);
197 } 203 }
198 204
199 return true; 205 return true;
200} 206}
201 207
202STATIC int 208STATIC int
203xlog_reserveq_wait( 209xlog_grant_head_wait(
204 struct log *log, 210 struct log *log,
211 struct xlog_grant_head *head,
205 struct xlog_ticket *tic, 212 struct xlog_ticket *tic,
206 int need_bytes) 213 int need_bytes)
207{ 214{
208 list_add_tail(&tic->t_queue, &log->l_reserveq); 215 list_add_tail(&tic->t_queue, &head->waiters);
209 216
210 do { 217 do {
211 if (XLOG_FORCED_SHUTDOWN(log)) 218 if (XLOG_FORCED_SHUTDOWN(log))
212 goto shutdown; 219 goto shutdown;
213 xlog_grant_push_ail(log, need_bytes); 220 xlog_grant_push_ail(log, need_bytes);
214 221
222 __set_current_state(TASK_UNINTERRUPTIBLE);
223 spin_unlock(&head->lock);
224
215 XFS_STATS_INC(xs_sleep_logspace); 225 XFS_STATS_INC(xs_sleep_logspace);
216 trace_xfs_log_grant_sleep(log, tic);
217 226
218 xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock); 227 trace_xfs_log_grant_sleep(log, tic);
228 schedule();
219 trace_xfs_log_grant_wake(log, tic); 229 trace_xfs_log_grant_wake(log, tic);
220 230
221 spin_lock(&log->l_grant_reserve_lock); 231 spin_lock(&head->lock);
222 if (XLOG_FORCED_SHUTDOWN(log)) 232 if (XLOG_FORCED_SHUTDOWN(log))
223 goto shutdown; 233 goto shutdown;
224 } while (xlog_space_left(log, &log->l_grant_reserve_head) < need_bytes); 234 } while (xlog_space_left(log, &head->grant) < need_bytes);
225 235
226 list_del_init(&tic->t_queue); 236 list_del_init(&tic->t_queue);
227 return 0; 237 return 0;
@@ -230,35 +240,58 @@ shutdown:
230 return XFS_ERROR(EIO); 240 return XFS_ERROR(EIO);
231} 241}
232 242
243/*
244 * Atomically get the log space required for a log ticket.
245 *
246 * Once a ticket gets put onto head->waiters, the caller will only return
247 * after the needed reservation is satisfied.
248 *
249 * This function is structured so that it has a lock-free fast path. This is
250 * necessary because every new transaction reservation will come through this
251 * path. Hence any lock will be globally hot if we take it unconditionally on
252 * every pass.
253 *
254 * As tickets are only ever moved on and off head->waiters under head->lock, we
255 * only need to take that lock if we are going to add the ticket to the queue
256 * and sleep. We can avoid taking the lock if the ticket was never added to
257 * head->waiters because the t_queue list head will be empty and we hold the
258 * only reference to it so it can safely be checked unlocked.
259 */
233STATIC int 260STATIC int
234xlog_writeq_wait( 261xlog_grant_head_check(
235 struct log *log, 262 struct log *log,
263 struct xlog_grant_head *head,
236 struct xlog_ticket *tic, 264 struct xlog_ticket *tic,
237 int need_bytes) 265 int *need_bytes)
238{ 266{
239 list_add_tail(&tic->t_queue, &log->l_writeq); 267 int free_bytes;
240 268 int error = 0;
241 do {
242 if (XLOG_FORCED_SHUTDOWN(log))
243 goto shutdown;
244 xlog_grant_push_ail(log, need_bytes);
245
246 XFS_STATS_INC(xs_sleep_logspace);
247 trace_xfs_log_regrant_write_sleep(log, tic);
248 269
249 xlog_wait(&tic->t_wait, &log->l_grant_write_lock); 270 ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
250 trace_xfs_log_regrant_write_wake(log, tic);
251 271
252 spin_lock(&log->l_grant_write_lock); 272 /*
253 if (XLOG_FORCED_SHUTDOWN(log)) 273 * If there are other waiters on the queue then give them a chance at
254 goto shutdown; 274 * logspace before us. Wake up the first waiters; if we do not wake
255 } while (xlog_space_left(log, &log->l_grant_write_head) < need_bytes); 275 * up all the waiters, go to sleep waiting for more free space;
276 * otherwise try to get some space for this transaction.
277 */
278 *need_bytes = xlog_ticket_reservation(log, head, tic);
279 free_bytes = xlog_space_left(log, &head->grant);
280 if (!list_empty_careful(&head->waiters)) {
281 spin_lock(&head->lock);
282 if (!xlog_grant_head_wake(log, head, &free_bytes) ||
283 free_bytes < *need_bytes) {
284 error = xlog_grant_head_wait(log, head, tic,
285 *need_bytes);
286 }
287 spin_unlock(&head->lock);
288 } else if (free_bytes < *need_bytes) {
289 spin_lock(&head->lock);
290 error = xlog_grant_head_wait(log, head, tic, *need_bytes);
291 spin_unlock(&head->lock);
292 }
256 293
257 list_del_init(&tic->t_queue); 294 return error;
258 return 0;
259shutdown:
260 list_del_init(&tic->t_queue);
261 return XFS_ERROR(EIO);
262} 295}
263 296
264static void 297static void
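Note: the comment above now covers both grant heads with one shared fast path.
Reduced to its control flow (a simplified sketch that omits the waiter wake-up
pass; variables as in xlog_grant_head_check(), and safety rests on the stated
rule that t_queue membership only changes under head->lock, which makes the
unlocked list_empty_careful() check sound):

	free_bytes = xlog_space_left(log, &head->grant);
	if (list_empty_careful(&head->waiters) && free_bytes >= need_bytes)
		return 0;		/* fast path: no lock taken */

	spin_lock(&head->lock);		/* slow path: queue up and sleep */
	error = xlog_grant_head_wait(log, head, tic, need_bytes);
	spin_unlock(&head->lock);
	return error;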
@@ -286,6 +319,128 @@ xlog_tic_add_region(xlog_ticket_t *tic, uint len, uint type)
286} 319}
287 320
288/* 321/*
322 * Replenish the byte reservation required by moving the grant write head.
323 */
324int
325xfs_log_regrant(
326 struct xfs_mount *mp,
327 struct xlog_ticket *tic)
328{
329 struct log *log = mp->m_log;
330 int need_bytes;
331 int error = 0;
332
333 if (XLOG_FORCED_SHUTDOWN(log))
334 return XFS_ERROR(EIO);
335
336 XFS_STATS_INC(xs_try_logspace);
337
338 /*
339 * This is a new transaction on the ticket, so we need to change the
340 * transaction ID so that the next transaction has a different TID in
341 * the log. Just add one to the existing tid so that we can see chains
342 * of rolling transactions in the log easily.
343 */
344 tic->t_tid++;
345
346 xlog_grant_push_ail(log, tic->t_unit_res);
347
348 tic->t_curr_res = tic->t_unit_res;
349 xlog_tic_reset_res(tic);
350
351 if (tic->t_cnt > 0)
352 return 0;
353
354 trace_xfs_log_regrant(log, tic);
355
356 error = xlog_grant_head_check(log, &log->l_write_head, tic,
357 &need_bytes);
358 if (error)
359 goto out_error;
360
361 xlog_grant_add_space(log, &log->l_write_head.grant, need_bytes);
362 trace_xfs_log_regrant_exit(log, tic);
363 xlog_verify_grant_tail(log);
364 return 0;
365
366out_error:
367 /*
368 * If we are failing, make sure the ticket doesn't have any current
369 * reservations. We don't want to add this back when the ticket/
370 * transaction gets cancelled.
371 */
372 tic->t_curr_res = 0;
373 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
374 return error;
375}
376
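Note: xfs_log_regrant() is the half of the old xfs_log_reserve() that serviced
an already-issued ticket, and its early-out is worth annotating:

	tic->t_tid++;			/* next link in the rolling chain */
	tic->t_curr_res = tic->t_unit_res;
	xlog_tic_reset_res(tic);
	if (tic->t_cnt > 0)		/* a prepaid unit is still unused */
		return 0;		/* no grant-head traffic at all */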
377/*
378 * Reserve log space and return a ticket corresponding to the reservation.
379 *
380 * Each reservation is going to reserve extra space for a log record header.
381 * When writes happen to the on-disk log, we don't subtract the length of the
382 * log record header from any reservation. By wasting space in each
383 * reservation, we prevent over-allocation problems.
384 */
385int
386xfs_log_reserve(
387 struct xfs_mount *mp,
388 int unit_bytes,
389 int cnt,
390 struct xlog_ticket **ticp,
391 __uint8_t client,
392 bool permanent,
393 uint t_type)
394{
395 struct log *log = mp->m_log;
396 struct xlog_ticket *tic;
397 int need_bytes;
398 int error = 0;
399
400 ASSERT(client == XFS_TRANSACTION || client == XFS_LOG);
401
402 if (XLOG_FORCED_SHUTDOWN(log))
403 return XFS_ERROR(EIO);
404
405 XFS_STATS_INC(xs_try_logspace);
406
407 ASSERT(*ticp == NULL);
408 tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent,
409 KM_SLEEP | KM_MAYFAIL);
410 if (!tic)
411 return XFS_ERROR(ENOMEM);
412
413 tic->t_trans_type = t_type;
414 *ticp = tic;
415
416 xlog_grant_push_ail(log, tic->t_unit_res * tic->t_cnt);
417
418 trace_xfs_log_reserve(log, tic);
419
420 error = xlog_grant_head_check(log, &log->l_reserve_head, tic,
421 &need_bytes);
422 if (error)
423 goto out_error;
424
425 xlog_grant_add_space(log, &log->l_reserve_head.grant, need_bytes);
426 xlog_grant_add_space(log, &log->l_write_head.grant, need_bytes);
427 trace_xfs_log_reserve_exit(log, tic);
428 xlog_verify_grant_tail(log);
429 return 0;
430
431out_error:
432 /*
433 * If we are failing, make sure the ticket doesn't have any current
434 * reservations. We don't want to add this back when the ticket/
435 * transaction gets cancelled.
436 */
437 tic->t_curr_res = 0;
438 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
439 return error;
440}
441
442
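Note: with the ticket-reuse branch gone from xfs_log_reserve(), choosing
between the two entry points moves to the caller. A hedged sketch of the
expected xfs_trans_reserve() call site (not part of this hunk; names assumed):

	if (tp->t_ticket != NULL) {
		/* rolling transaction: regrant on the permanent ticket */
		ASSERT(flags & XFS_TRANS_PERM_LOG_RES);
		error = xfs_log_regrant(tp->t_mountp, tp->t_ticket);
	} else {
		bool permanent = flags & XFS_TRANS_PERM_LOG_RES;

		error = xfs_log_reserve(tp->t_mountp, logspace, logcount,
					&tp->t_ticket, XFS_TRANSACTION,
					permanent, tp->t_type);
	}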
443/*
289 * NOTES: 444 * NOTES:
290 * 445 *
291 * 1. currblock field gets updated at startup and after in-core logs 446 * 1. currblock field gets updated at startup and after in-core logs
@@ -395,88 +550,6 @@ xfs_log_release_iclog(
395} 550}
396 551
397/* 552/*
398 * 1. Reserve an amount of on-disk log space and return a ticket corresponding
399 * to the reservation.
400 * 2. Potentially, push buffers at tail of log to disk.
401 *
402 * Each reservation is going to reserve extra space for a log record header.
403 * When writes happen to the on-disk log, we don't subtract the length of the
404 * log record header from any reservation. By wasting space in each
405 * reservation, we prevent over allocation problems.
406 */
407int
408xfs_log_reserve(
409 struct xfs_mount *mp,
410 int unit_bytes,
411 int cnt,
412 struct xlog_ticket **ticket,
413 __uint8_t client,
414 uint flags,
415 uint t_type)
416{
417 struct log *log = mp->m_log;
418 struct xlog_ticket *internal_ticket;
419 int retval = 0;
420
421 ASSERT(client == XFS_TRANSACTION || client == XFS_LOG);
422
423 if (XLOG_FORCED_SHUTDOWN(log))
424 return XFS_ERROR(EIO);
425
426 XFS_STATS_INC(xs_try_logspace);
427
428
429 if (*ticket != NULL) {
430 ASSERT(flags & XFS_LOG_PERM_RESERV);
431 internal_ticket = *ticket;
432
433 /*
434 * this is a new transaction on the ticket, so we need to
435 * change the transaction ID so that the next transaction has a
436 * different TID in the log. Just add one to the existing tid
437 * so that we can see chains of rolling transactions in the log
438 * easily.
439 */
440 internal_ticket->t_tid++;
441
442 trace_xfs_log_reserve(log, internal_ticket);
443
444 xlog_grant_push_ail(log, internal_ticket->t_unit_res);
445 retval = xlog_regrant_write_log_space(log, internal_ticket);
446 } else {
447 /* may sleep if need to allocate more tickets */
448 internal_ticket = xlog_ticket_alloc(log, unit_bytes, cnt,
449 client, flags,
450 KM_SLEEP|KM_MAYFAIL);
451 if (!internal_ticket)
452 return XFS_ERROR(ENOMEM);
453 internal_ticket->t_trans_type = t_type;
454 *ticket = internal_ticket;
455
456 trace_xfs_log_reserve(log, internal_ticket);
457
458 xlog_grant_push_ail(log,
459 (internal_ticket->t_unit_res *
460 internal_ticket->t_cnt));
461 retval = xlog_grant_log_space(log, internal_ticket);
462 }
463
464 if (unlikely(retval)) {
465 /*
466 * If we are failing, make sure the ticket doesn't have any
467 * current reservations. We don't want to add this back
468 * when the ticket/ transaction gets cancelled.
469 */
470 internal_ticket->t_curr_res = 0;
471 /* ungrant will give back unit_res * t_cnt. */
472 internal_ticket->t_cnt = 0;
473 }
474
475 return retval;
476}
477
478
479/*
480 * Mount a log filesystem 553 * Mount a log filesystem
481 * 554 *
482 * mp - ubiquitous xfs mount point structure 555 * mp - ubiquitous xfs mount point structure
@@ -653,8 +726,9 @@ xfs_log_unmount_write(xfs_mount_t *mp)
653 .lv_iovecp = &reg, 726 .lv_iovecp = &reg,
654 }; 727 };
655 728
656 /* remove inited flag */ 729 /* remove inited flag, and account for space used */
657 tic->t_flags = 0; 730 tic->t_flags = 0;
731 tic->t_curr_res -= sizeof(magic);
658 error = xlog_write(log, &vec, tic, &lsn, 732 error = xlog_write(log, &vec, tic, &lsn,
659 NULL, XLOG_UNMOUNT_TRANS); 733 NULL, XLOG_UNMOUNT_TRANS);
660 /* 734 /*
@@ -760,64 +834,35 @@ xfs_log_item_init(
760 INIT_LIST_HEAD(&item->li_cil); 834 INIT_LIST_HEAD(&item->li_cil);
761} 835}
762 836
837/*
838 * Wake up processes waiting for log space after we have moved the log tail.
839 */
763void 840void
764xfs_log_move_tail(xfs_mount_t *mp, 841xfs_log_space_wake(
765 xfs_lsn_t tail_lsn) 842 struct xfs_mount *mp)
766{ 843{
767 xlog_ticket_t *tic; 844 struct log *log = mp->m_log;
768 xlog_t *log = mp->m_log; 845 int free_bytes;
769 int need_bytes, free_bytes;
770 846
771 if (XLOG_FORCED_SHUTDOWN(log)) 847 if (XLOG_FORCED_SHUTDOWN(log))
772 return; 848 return;
773 849
774 if (tail_lsn == 0) 850 if (!list_empty_careful(&log->l_write_head.waiters)) {
775 tail_lsn = atomic64_read(&log->l_last_sync_lsn); 851 ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
776
777 /* tail_lsn == 1 implies that we weren't passed a valid value. */
778 if (tail_lsn != 1)
779 atomic64_set(&log->l_tail_lsn, tail_lsn);
780
781 if (!list_empty_careful(&log->l_writeq)) {
782#ifdef DEBUG
783 if (log->l_flags & XLOG_ACTIVE_RECOVERY)
784 panic("Recovery problem");
785#endif
786 spin_lock(&log->l_grant_write_lock);
787 free_bytes = xlog_space_left(log, &log->l_grant_write_head);
788 list_for_each_entry(tic, &log->l_writeq, t_queue) {
789 ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
790 852
791 if (free_bytes < tic->t_unit_res && tail_lsn != 1) 853 spin_lock(&log->l_write_head.lock);
792 break; 854 free_bytes = xlog_space_left(log, &log->l_write_head.grant);
793 tail_lsn = 0; 855 xlog_grant_head_wake(log, &log->l_write_head, &free_bytes);
794 free_bytes -= tic->t_unit_res; 856 spin_unlock(&log->l_write_head.lock);
795 trace_xfs_log_regrant_write_wake_up(log, tic);
796 wake_up(&tic->t_wait);
797 }
798 spin_unlock(&log->l_grant_write_lock);
799 } 857 }
800 858
801 if (!list_empty_careful(&log->l_reserveq)) { 859 if (!list_empty_careful(&log->l_reserve_head.waiters)) {
802#ifdef DEBUG 860 ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
803 if (log->l_flags & XLOG_ACTIVE_RECOVERY) 861
804 panic("Recovery problem"); 862 spin_lock(&log->l_reserve_head.lock);
805#endif 863 free_bytes = xlog_space_left(log, &log->l_reserve_head.grant);
806 spin_lock(&log->l_grant_reserve_lock); 864 xlog_grant_head_wake(log, &log->l_reserve_head, &free_bytes);
807 free_bytes = xlog_space_left(log, &log->l_grant_reserve_head); 865 spin_unlock(&log->l_reserve_head.lock);
808 list_for_each_entry(tic, &log->l_reserveq, t_queue) {
809 if (tic->t_flags & XLOG_TIC_PERM_RESERV)
810 need_bytes = tic->t_unit_res*tic->t_cnt;
811 else
812 need_bytes = tic->t_unit_res;
813 if (free_bytes < need_bytes && tail_lsn != 1)
814 break;
815 tail_lsn = 0;
816 free_bytes -= need_bytes;
817 trace_xfs_log_grant_wake_up(log, tic);
818 wake_up(&tic->t_wait);
819 }
820 spin_unlock(&log->l_grant_reserve_lock);
821 } 866 }
822} 867}
823 868
@@ -867,21 +912,7 @@ xfs_log_need_covered(xfs_mount_t *mp)
867 return needed; 912 return needed;
868} 913}
869 914
870/****************************************************************************** 915/*
871 *
872 * local routines
873 *
874 ******************************************************************************
875 */
876
877/* xfs_trans_tail_ail returns 0 when there is nothing in the list.
878 * The log manager must keep track of the last LR which was committed
879 * to disk. The lsn of this LR will become the new tail_lsn whenever
880 * xfs_trans_tail_ail returns 0. If we don't do this, we run into
881 * the situation where stuff could be written into the log but nothing
882 * was ever in the AIL when asked. Eventually, we panic since the
883 * tail hits the head.
884 *
885 * We may be holding the log iclog lock upon entering this routine. 916 * We may be holding the log iclog lock upon entering this routine.
886 */ 917 */
887xfs_lsn_t 918xfs_lsn_t
@@ -891,10 +922,17 @@ xlog_assign_tail_lsn(
891 xfs_lsn_t tail_lsn; 922 xfs_lsn_t tail_lsn;
892 struct log *log = mp->m_log; 923 struct log *log = mp->m_log;
893 924
925 /*
926 * To make sure we always have a valid LSN for the log tail, we keep
927 * track of the last LSN which was committed in log->l_last_sync_lsn,
928 * and use that when the AIL is empty and xfs_ail_min_lsn returns 0.
929 *
930 * If the AIL has been emptied we also need to wake any process
931 * waiting for this condition.
932 */
894 tail_lsn = xfs_ail_min_lsn(mp->m_ail); 933 tail_lsn = xfs_ail_min_lsn(mp->m_ail);
895 if (!tail_lsn) 934 if (!tail_lsn)
896 tail_lsn = atomic64_read(&log->l_last_sync_lsn); 935 tail_lsn = atomic64_read(&log->l_last_sync_lsn);
897
898 atomic64_set(&log->l_tail_lsn, tail_lsn); 936 atomic64_set(&log->l_tail_lsn, tail_lsn);
899 return tail_lsn; 937 return tail_lsn;
900} 938}
@@ -1100,12 +1138,9 @@ xlog_alloc_log(xfs_mount_t *mp,
1100 xlog_assign_atomic_lsn(&log->l_tail_lsn, 1, 0); 1138 xlog_assign_atomic_lsn(&log->l_tail_lsn, 1, 0);
1101 xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0); 1139 xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0);
1102 log->l_curr_cycle = 1; /* 0 is bad since this is initial value */ 1140 log->l_curr_cycle = 1; /* 0 is bad since this is initial value */
1103 xlog_assign_grant_head(&log->l_grant_reserve_head, 1, 0); 1141
1104 xlog_assign_grant_head(&log->l_grant_write_head, 1, 0); 1142 xlog_grant_head_init(&log->l_reserve_head);
1105 INIT_LIST_HEAD(&log->l_reserveq); 1143 xlog_grant_head_init(&log->l_write_head);
1106 INIT_LIST_HEAD(&log->l_writeq);
1107 spin_lock_init(&log->l_grant_reserve_lock);
1108 spin_lock_init(&log->l_grant_write_lock);
1109 1144
1110 error = EFSCORRUPTED; 1145 error = EFSCORRUPTED;
1111 if (xfs_sb_version_hassector(&mp->m_sb)) { 1146 if (xfs_sb_version_hassector(&mp->m_sb)) {
@@ -1280,7 +1315,7 @@ xlog_grant_push_ail(
1280 1315
1281 ASSERT(BTOBB(need_bytes) < log->l_logBBsize); 1316 ASSERT(BTOBB(need_bytes) < log->l_logBBsize);
1282 1317
1283 free_bytes = xlog_space_left(log, &log->l_grant_reserve_head); 1318 free_bytes = xlog_space_left(log, &log->l_reserve_head.grant);
1284 free_blocks = BTOBBT(free_bytes); 1319 free_blocks = BTOBBT(free_bytes);
1285 1320
1286 /* 1321 /*
@@ -1412,8 +1447,8 @@ xlog_sync(xlog_t *log,
1412 roundoff < BBTOB(1))); 1447 roundoff < BBTOB(1)));
1413 1448
1414 /* move grant heads by roundoff in sync */ 1449 /* move grant heads by roundoff in sync */
1415 xlog_grant_add_space(log, &log->l_grant_reserve_head, roundoff); 1450 xlog_grant_add_space(log, &log->l_reserve_head.grant, roundoff);
1416 xlog_grant_add_space(log, &log->l_grant_write_head, roundoff); 1451 xlog_grant_add_space(log, &log->l_write_head.grant, roundoff);
1417 1452
1418 /* put cycle number in every block */ 1453 /* put cycle number in every block */
1419 xlog_pack_data(log, iclog, roundoff); 1454 xlog_pack_data(log, iclog, roundoff);
@@ -2566,119 +2601,6 @@ restart:
2566 return 0; 2601 return 0;
2567} /* xlog_state_get_iclog_space */ 2602} /* xlog_state_get_iclog_space */
2568 2603
2569/*
2570 * Atomically get the log space required for a log ticket.
2571 *
2572 * Once a ticket gets put onto the reserveq, it will only return after the
2573 * needed reservation is satisfied.
2574 *
2575 * This function is structured so that it has a lock free fast path. This is
2576 * necessary because every new transaction reservation will come through this
2577 * path. Hence any lock will be globally hot if we take it unconditionally on
2578 * every pass.
2579 *
2580 * As tickets are only ever moved on and off the reserveq under the
2581 * l_grant_reserve_lock, we only need to take that lock if we are going to add
2582 * the ticket to the queue and sleep. We can avoid taking the lock if the ticket
2583 * was never added to the reserveq because the t_queue list head will be empty
2584 * and we hold the only reference to it so it can safely be checked unlocked.
2585 */
2586STATIC int
2587xlog_grant_log_space(
2588 struct log *log,
2589 struct xlog_ticket *tic)
2590{
2591 int free_bytes, need_bytes;
2592 int error = 0;
2593
2594 ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
2595
2596 trace_xfs_log_grant_enter(log, tic);
2597
2598 /*
2599 * If there are other waiters on the queue then give them a chance at
2600 * logspace before us. Wake up the first waiters, if we do not wake
2601 * up all the waiters then go to sleep waiting for more free space,
2602 * otherwise try to get some space for this transaction.
2603 */
2604 need_bytes = tic->t_unit_res;
2605 if (tic->t_flags & XFS_LOG_PERM_RESERV)
2606 need_bytes *= tic->t_ocnt;
2607 free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
2608 if (!list_empty_careful(&log->l_reserveq)) {
2609 spin_lock(&log->l_grant_reserve_lock);
2610 if (!xlog_reserveq_wake(log, &free_bytes) ||
2611 free_bytes < need_bytes)
2612 error = xlog_reserveq_wait(log, tic, need_bytes);
2613 spin_unlock(&log->l_grant_reserve_lock);
2614 } else if (free_bytes < need_bytes) {
2615 spin_lock(&log->l_grant_reserve_lock);
2616 error = xlog_reserveq_wait(log, tic, need_bytes);
2617 spin_unlock(&log->l_grant_reserve_lock);
2618 }
2619 if (error)
2620 return error;
2621
2622 xlog_grant_add_space(log, &log->l_grant_reserve_head, need_bytes);
2623 xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
2624 trace_xfs_log_grant_exit(log, tic);
2625 xlog_verify_grant_tail(log);
2626 return 0;
2627}
2628
2629/*
2630 * Replenish the byte reservation required by moving the grant write head.
2631 *
2632 * Similar to xlog_grant_log_space, the function is structured to have a lock
2633 * free fast path.
2634 */
2635STATIC int
2636xlog_regrant_write_log_space(
2637 struct log *log,
2638 struct xlog_ticket *tic)
2639{
2640 int free_bytes, need_bytes;
2641 int error = 0;
2642
2643 tic->t_curr_res = tic->t_unit_res;
2644 xlog_tic_reset_res(tic);
2645
2646 if (tic->t_cnt > 0)
2647 return 0;
2648
2649 ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
2650
2651 trace_xfs_log_regrant_write_enter(log, tic);
2652
2653 /*
2654 * If there are other waiters on the queue then give them a chance at
2655 * logspace before us. Wake up the first waiters, if we do not wake
2656 * up all the waiters then go to sleep waiting for more free space,
2657 * otherwise try to get some space for this transaction.
2658 */
2659 need_bytes = tic->t_unit_res;
2660 free_bytes = xlog_space_left(log, &log->l_grant_write_head);
2661 if (!list_empty_careful(&log->l_writeq)) {
2662 spin_lock(&log->l_grant_write_lock);
2663 if (!xlog_writeq_wake(log, &free_bytes) ||
2664 free_bytes < need_bytes)
2665 error = xlog_writeq_wait(log, tic, need_bytes);
2666 spin_unlock(&log->l_grant_write_lock);
2667 } else if (free_bytes < need_bytes) {
2668 spin_lock(&log->l_grant_write_lock);
2669 error = xlog_writeq_wait(log, tic, need_bytes);
2670 spin_unlock(&log->l_grant_write_lock);
2671 }
2672
2673 if (error)
2674 return error;
2675
2676 xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
2677 trace_xfs_log_regrant_write_exit(log, tic);
2678 xlog_verify_grant_tail(log);
2679 return 0;
2680}
2681
2682/* The first cnt-1 times through here we don't need to 2604/* The first cnt-1 times through here we don't need to
2683 * move the grant write head because the permanent 2605 * move the grant write head because the permanent
2684 * reservation has reserved cnt times the unit amount. 2606 * reservation has reserved cnt times the unit amount.
@@ -2695,9 +2617,9 @@ xlog_regrant_reserve_log_space(xlog_t *log,
2695 if (ticket->t_cnt > 0) 2617 if (ticket->t_cnt > 0)
2696 ticket->t_cnt--; 2618 ticket->t_cnt--;
2697 2619
2698 xlog_grant_sub_space(log, &log->l_grant_reserve_head, 2620 xlog_grant_sub_space(log, &log->l_reserve_head.grant,
2699 ticket->t_curr_res); 2621 ticket->t_curr_res);
2700 xlog_grant_sub_space(log, &log->l_grant_write_head, 2622 xlog_grant_sub_space(log, &log->l_write_head.grant,
2701 ticket->t_curr_res); 2623 ticket->t_curr_res);
2702 ticket->t_curr_res = ticket->t_unit_res; 2624 ticket->t_curr_res = ticket->t_unit_res;
2703 xlog_tic_reset_res(ticket); 2625 xlog_tic_reset_res(ticket);
@@ -2708,7 +2630,7 @@ xlog_regrant_reserve_log_space(xlog_t *log,
2708 if (ticket->t_cnt > 0) 2630 if (ticket->t_cnt > 0)
2709 return; 2631 return;
2710 2632
2711 xlog_grant_add_space(log, &log->l_grant_reserve_head, 2633 xlog_grant_add_space(log, &log->l_reserve_head.grant,
2712 ticket->t_unit_res); 2634 ticket->t_unit_res);
2713 2635
2714 trace_xfs_log_regrant_reserve_exit(log, ticket); 2636 trace_xfs_log_regrant_reserve_exit(log, ticket);
@@ -2754,14 +2676,13 @@ xlog_ungrant_log_space(xlog_t *log,
2754 bytes += ticket->t_unit_res*ticket->t_cnt; 2676 bytes += ticket->t_unit_res*ticket->t_cnt;
2755 } 2677 }
2756 2678
2757 xlog_grant_sub_space(log, &log->l_grant_reserve_head, bytes); 2679 xlog_grant_sub_space(log, &log->l_reserve_head.grant, bytes);
2758 xlog_grant_sub_space(log, &log->l_grant_write_head, bytes); 2680 xlog_grant_sub_space(log, &log->l_write_head.grant, bytes);
2759 2681
2760 trace_xfs_log_ungrant_exit(log, ticket); 2682 trace_xfs_log_ungrant_exit(log, ticket);
2761 2683
2762 xfs_log_move_tail(log->l_mp, 1); 2684 xfs_log_space_wake(log->l_mp);
2763} /* xlog_ungrant_log_space */ 2685}
2764
2765 2686
2766/* 2687/*
2767 * Flush iclog to disk if this is the last reference to the given iclog and 2688 * Flush iclog to disk if this is the last reference to the given iclog and
@@ -3219,7 +3140,7 @@ xlog_ticket_alloc(
3219 int unit_bytes, 3140 int unit_bytes,
3220 int cnt, 3141 int cnt,
3221 char client, 3142 char client,
3222 uint xflags, 3143 bool permanent,
3223 int alloc_flags) 3144 int alloc_flags)
3224{ 3145{
3225 struct xlog_ticket *tic; 3146 struct xlog_ticket *tic;
@@ -3313,6 +3234,7 @@ xlog_ticket_alloc(
3313 } 3234 }
3314 3235
3315 atomic_set(&tic->t_ref, 1); 3236 atomic_set(&tic->t_ref, 1);
3237 tic->t_task = current;
3316 INIT_LIST_HEAD(&tic->t_queue); 3238 INIT_LIST_HEAD(&tic->t_queue);
3317 tic->t_unit_res = unit_bytes; 3239 tic->t_unit_res = unit_bytes;
3318 tic->t_curr_res = unit_bytes; 3240 tic->t_curr_res = unit_bytes;
@@ -3322,9 +3244,8 @@ xlog_ticket_alloc(
3322 tic->t_clientid = client; 3244 tic->t_clientid = client;
3323 tic->t_flags = XLOG_TIC_INITED; 3245 tic->t_flags = XLOG_TIC_INITED;
3324 tic->t_trans_type = 0; 3246 tic->t_trans_type = 0;
3325 if (xflags & XFS_LOG_PERM_RESERV) 3247 if (permanent)
3326 tic->t_flags |= XLOG_TIC_PERM_RESERV; 3248 tic->t_flags |= XLOG_TIC_PERM_RESERV;
3327 init_waitqueue_head(&tic->t_wait);
3328 3249
3329 xlog_tic_reset_res(tic); 3250 xlog_tic_reset_res(tic);
3330 3251
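
The t_task field added above replaces the per-ticket wait queue that this patch removes: a waiter records its task, sleeps uninterruptibly, and is woken directly with wake_up_process(). A hedged sketch of the resulting wait/wake pair; space_left() stands in for xlog_space_left(), and the function names are illustrative, not necessarily the ones this patch introduces:

static void grant_head_wait(struct xlog_grant_head *head,
                            struct xlog_ticket *tic, int need_bytes)
{
        list_add_tail(&tic->t_queue, &head->waiters);
        do {
                __set_current_state(TASK_UNINTERRUPTIBLE);
                spin_unlock(&head->lock);
                schedule();             /* until a waker runs wake_up_process() */
                spin_lock(&head->lock);
        } while (space_left(head) < need_bytes);
        list_del_init(&tic->t_queue);
}

static void grant_head_wake(struct xlog_grant_head *head)
{
        struct xlog_ticket      *tic;

        list_for_each_entry(tic, &head->waiters, t_queue)
                wake_up_process(tic->t_task);   /* no per-ticket waitqueue */
}
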
@@ -3380,7 +3301,7 @@ xlog_verify_grant_tail(
3380 int tail_cycle, tail_blocks; 3301 int tail_cycle, tail_blocks;
3381 int cycle, space; 3302 int cycle, space;
3382 3303
3383 xlog_crack_grant_head(&log->l_grant_write_head, &cycle, &space); 3304 xlog_crack_grant_head(&log->l_write_head.grant, &cycle, &space);
3384 xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks); 3305 xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks);
3385 if (tail_cycle != cycle) { 3306 if (tail_cycle != cycle) {
3386 if (cycle - 1 != tail_cycle && 3307 if (cycle - 1 != tail_cycle &&
@@ -3582,7 +3503,6 @@ xfs_log_force_umount(
3582 struct xfs_mount *mp, 3503 struct xfs_mount *mp,
3583 int logerror) 3504 int logerror)
3584{ 3505{
3585 xlog_ticket_t *tic;
3586 xlog_t *log; 3506 xlog_t *log;
3587 int retval; 3507 int retval;
3588 3508
@@ -3650,15 +3570,8 @@ xfs_log_force_umount(
3650 * we don't enqueue anything once the SHUTDOWN flag is set, and this 3570 * we don't enqueue anything once the SHUTDOWN flag is set, and this
3651 * action is protected by the grant locks. 3571 * action is protected by the grant locks.
3652 */ 3572 */
3653 spin_lock(&log->l_grant_reserve_lock); 3573 xlog_grant_head_wake_all(&log->l_reserve_head);
3654 list_for_each_entry(tic, &log->l_reserveq, t_queue) 3574 xlog_grant_head_wake_all(&log->l_write_head);
3655 wake_up(&tic->t_wait);
3656 spin_unlock(&log->l_grant_reserve_lock);
3657
3658 spin_lock(&log->l_grant_write_lock);
3659 list_for_each_entry(tic, &log->l_writeq, t_queue)
3660 wake_up(&tic->t_wait);
3661 spin_unlock(&log->l_grant_write_lock);
3662 3575
3663 if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) { 3576 if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) {
3664 ASSERT(!logerror); 3577 ASSERT(!logerror);
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 2aee3b22d29c..2c622bedb302 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -53,15 +53,6 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
53#define XFS_LOG_REL_PERM_RESERV 0x1 53#define XFS_LOG_REL_PERM_RESERV 0x1
54 54
55/* 55/*
56 * Flags to xfs_log_reserve()
57 *
58 * XFS_LOG_PERM_RESERV: Permanent reservation. When writes are
59 * performed against this type of reservation, the reservation
60 * is not decreased. Long running transactions should use this.
61 */
62#define XFS_LOG_PERM_RESERV 0x2
63
64/*
65 * Flags to xfs_log_force() 56 * Flags to xfs_log_force()
66 * 57 *
67 * XFS_LOG_SYNC: Synchronous force in-core log to disk 58 * XFS_LOG_SYNC: Synchronous force in-core log to disk
@@ -160,8 +151,8 @@ int xfs_log_mount(struct xfs_mount *mp,
160 xfs_daddr_t start_block, 151 xfs_daddr_t start_block,
161 int num_bblocks); 152 int num_bblocks);
162int xfs_log_mount_finish(struct xfs_mount *mp); 153int xfs_log_mount_finish(struct xfs_mount *mp);
163void xfs_log_move_tail(struct xfs_mount *mp, 154xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
164 xfs_lsn_t tail_lsn); 155void xfs_log_space_wake(struct xfs_mount *mp);
165int xfs_log_notify(struct xfs_mount *mp, 156int xfs_log_notify(struct xfs_mount *mp,
166 struct xlog_in_core *iclog, 157 struct xlog_in_core *iclog,
167 xfs_log_callback_t *callback_entry); 158 xfs_log_callback_t *callback_entry);
@@ -172,8 +163,9 @@ int xfs_log_reserve(struct xfs_mount *mp,
172 int count, 163 int count,
173 struct xlog_ticket **ticket, 164 struct xlog_ticket **ticket,
174 __uint8_t clientid, 165 __uint8_t clientid,
175 uint flags, 166 bool permanent,
176 uint t_type); 167 uint t_type);
168int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic);
177int xfs_log_unmount_write(struct xfs_mount *mp); 169int xfs_log_unmount_write(struct xfs_mount *mp);
178void xfs_log_unmount(struct xfs_mount *mp); 170void xfs_log_unmount(struct xfs_mount *mp);
179int xfs_log_force_umount(struct xfs_mount *mp, int logerror); 171int xfs_log_force_umount(struct xfs_mount *mp, int logerror);
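
With the flag argument gone, callers pass a plain bool for permanent reservations and use the new xfs_log_regrant() to refill an existing ticket when rolling a transaction. A sketch of plausible caller usage; the unit size, count, and zero t_type are arbitrary placeholders, not values taken from this patch:

int
example_start_logged_op(
        struct xfs_mount        *mp,
        struct xlog_ticket      **ticp)
{
        /* First call: take a permanent reservation good for two uses. */
        if (!*ticp)
                return xfs_log_reserve(mp, 65536, 2, ticp, XFS_TRANSACTION,
                                       true, 0);
        /* Rolling transaction: top the existing ticket back up. */
        return xfs_log_regrant(mp, *ticp);
}
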
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 2d3b6a498d63..2152900b79d4 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -239,8 +239,8 @@ typedef struct xlog_res {
239} xlog_res_t; 239} xlog_res_t;
240 240
241typedef struct xlog_ticket { 241typedef struct xlog_ticket {
242 wait_queue_head_t t_wait; /* ticket wait queue */
243 struct list_head t_queue; /* reserve/write queue */ 242 struct list_head t_queue; /* reserve/write queue */
243 struct task_struct *t_task; /* task that owns this ticket */
244 xlog_tid_t t_tid; /* transaction identifier : 4 */ 244 xlog_tid_t t_tid; /* transaction identifier : 4 */
245 atomic_t t_ref; /* ticket reference count : 4 */ 245 atomic_t t_ref; /* ticket reference count : 4 */
246 int t_curr_res; /* current reservation in bytes : 4 */ 246 int t_curr_res; /* current reservation in bytes : 4 */
@@ -470,6 +470,16 @@ struct xfs_cil {
470#define XLOG_CIL_HARD_SPACE_LIMIT(log) (3 * (log->l_logsize >> 4)) 470#define XLOG_CIL_HARD_SPACE_LIMIT(log) (3 * (log->l_logsize >> 4))
471 471
472/* 472/*
473 * ticket grant locks, queues and accounting have their own cachelines
474 * as these are quite hot and can be operated on concurrently.
475 */
476struct xlog_grant_head {
477 spinlock_t lock ____cacheline_aligned_in_smp;
478 struct list_head waiters;
479 atomic64_t grant;
480};
481
482/*
473 * The reservation head lsn is not made up of a cycle number and block number. 483 * The reservation head lsn is not made up of a cycle number and block number.
474 * Instead, it uses a cycle number and byte number. Logs don't expect to 484 * Instead, it uses a cycle number and byte number. Logs don't expect to
475 * overflow 31 bits worth of byte offset, so using a byte number will mean 485 * overflow 31 bits worth of byte offset, so using a byte number will mean
@@ -520,17 +530,8 @@ typedef struct log {
520 /* lsn of 1st LR with unflushed * buffers */ 530 /* lsn of 1st LR with unflushed * buffers */
521 atomic64_t l_tail_lsn ____cacheline_aligned_in_smp; 531 atomic64_t l_tail_lsn ____cacheline_aligned_in_smp;
522 532
523 /* 533 struct xlog_grant_head l_reserve_head;
524 * ticket grant locks, queues and accounting have their own cachelines 534 struct xlog_grant_head l_write_head;
525 * as these are quite hot and can be operated on concurrently.
526 */
527 spinlock_t l_grant_reserve_lock ____cacheline_aligned_in_smp;
528 struct list_head l_reserveq;
529 atomic64_t l_grant_reserve_head;
530
531 spinlock_t l_grant_write_lock ____cacheline_aligned_in_smp;
532 struct list_head l_writeq;
533 atomic64_t l_grant_write_head;
534 535
535 /* The following field are used for debugging; need to hold icloglock */ 536 /* The following field are used for debugging; need to hold icloglock */
536#ifdef DEBUG 537#ifdef DEBUG
@@ -545,14 +546,13 @@ typedef struct log {
545#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) 546#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR)
546 547
547/* common routines */ 548/* common routines */
548extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
549extern int xlog_recover(xlog_t *log); 549extern int xlog_recover(xlog_t *log);
550extern int xlog_recover_finish(xlog_t *log); 550extern int xlog_recover_finish(xlog_t *log);
551extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); 551extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int);
552 552
553extern kmem_zone_t *xfs_log_ticket_zone; 553extern kmem_zone_t *xfs_log_ticket_zone;
554struct xlog_ticket *xlog_ticket_alloc(struct log *log, int unit_bytes, 554struct xlog_ticket *xlog_ticket_alloc(struct log *log, int unit_bytes,
555 int count, char client, uint xflags, 555 int count, char client, bool permanent,
556 int alloc_flags); 556 int alloc_flags);
557 557
558 558
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 0ed9ee77937c..8ecad5bad66c 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -965,9 +965,9 @@ xlog_find_tail(
965 log->l_curr_cycle++; 965 log->l_curr_cycle++;
966 atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn)); 966 atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
967 atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn)); 967 atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
968 xlog_assign_grant_head(&log->l_grant_reserve_head, log->l_curr_cycle, 968 xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle,
969 BBTOB(log->l_curr_block)); 969 BBTOB(log->l_curr_block));
970 xlog_assign_grant_head(&log->l_grant_write_head, log->l_curr_cycle, 970 xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle,
971 BBTOB(log->l_curr_block)); 971 BBTOB(log->l_curr_block));
972 972
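
Both grant heads assigned here are single atomic64 values that pack the cycle number into the high 32 bits and the byte offset into the low 32 bits, which is what lets xlog_assign_grant_head() and xlog_crack_grant_head() update and read them without taking a lock. A standalone model of the packing (not the kernel helpers themselves):

#include <stdint.h>
#include <stdio.h>

static uint64_t assign_grant_head(int cycle, int bytes)
{
        return ((uint64_t)cycle << 32) | (uint32_t)bytes;
}

static void crack_grant_head(uint64_t head, int *cycle, int *bytes)
{
        *cycle = (int)(head >> 32);
        *bytes = (int)(head & 0xffffffffu);
}

int main(void)
{
        int cycle, bytes;

        crack_grant_head(assign_grant_head(7, 4096), &cycle, &bytes);
        printf("cycle %d, byte offset %d\n", cycle, bytes);     /* 7, 4096 */
        return 0;
}
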
973 /* 973 /*
@@ -3161,37 +3161,26 @@ xlog_recover_process_iunlinks(
3161 */ 3161 */
3162 continue; 3162 continue;
3163 } 3163 }
3164 /*
3165 * Unlock the buffer so that it can be acquired in the normal
3166 * course of the transaction to truncate and free each inode.
3167 * Because we are not racing with anyone else here for the AGI
3168 * buffer, we don't even need to hold it locked to read the
3169 * initial unlinked bucket entries out of the buffer. We keep a
3170 * buffer reference, though, so that it stays pinned in memory
3171 * while we need it.
3172 */
3164 agi = XFS_BUF_TO_AGI(agibp); 3173 agi = XFS_BUF_TO_AGI(agibp);
3174 xfs_buf_unlock(agibp);
3165 3175
3166 for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) { 3176 for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
3167 agino = be32_to_cpu(agi->agi_unlinked[bucket]); 3177 agino = be32_to_cpu(agi->agi_unlinked[bucket]);
3168 while (agino != NULLAGINO) { 3178 while (agino != NULLAGINO) {
3169 /*
3170 * Release the agi buffer so that it can
3171 * be acquired in the normal course of the
3172 * transaction to truncate and free the inode.
3173 */
3174 xfs_buf_relse(agibp);
3175
3176 agino = xlog_recover_process_one_iunlink(mp, 3179 agino = xlog_recover_process_one_iunlink(mp,
3177 agno, agino, bucket); 3180 agno, agino, bucket);
3178
3179 /*
3180 * Reacquire the agi buffer and continue around
3181 * the loop. This should never fail as we know
3182 * the buffer was good earlier on.
3183 */
3184 error = xfs_read_agi(mp, NULL, agno, &agibp);
3185 ASSERT(error == 0);
3186 agi = XFS_BUF_TO_AGI(agibp);
3187 } 3181 }
3188 } 3182 }
3189 3183 xfs_buf_rele(agibp);
3190 /*
3191 * Release the buffer for the current agi so we can
3192 * go on to the next one.
3193 */
3194 xfs_buf_relse(agibp);
3195 } 3184 }
3196 3185
3197 mp->m_dmevmask = mp_dmevmask; 3186 mp->m_dmevmask = mp_dmevmask;
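
The restructured loop above relies on the distinction between a buffer's lock and its reference: xfs_buf_unlock() releases only the lock, while the held reference keeps the AGI buffer pinned until the final xfs_buf_rele(). In outline, the new flow per AG is:

error = xfs_read_agi(mp, NULL, agno, &agibp);   /* locked + referenced */
if (error)
        continue;
agi = XFS_BUF_TO_AGI(agibp);
xfs_buf_unlock(agibp);  /* drop only the lock; the reference pins it */

/* ... walk agi->agi_unlinked[] and run transactions that relock it ... */

xfs_buf_rele(agibp);    /* drop the reference once the walk is done */

This removes the relse/re-read cycle per inode that the deleted comments describe.
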
@@ -3695,7 +3684,7 @@ xlog_do_recover(
3695 3684
3696 /* Convert superblock from on-disk format */ 3685 /* Convert superblock from on-disk format */
3697 sbp = &log->l_mp->m_sb; 3686 sbp = &log->l_mp->m_sb;
3698 xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp)); 3687 xfs_sb_from_disk(log->l_mp, XFS_BUF_TO_SBP(bp));
3699 ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC); 3688 ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC);
3700 ASSERT(xfs_sb_good_version(sbp)); 3689 ASSERT(xfs_sb_good_version(sbp));
3701 xfs_buf_relse(bp); 3690 xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index d06afbc3540d..1ffead4b2296 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -158,7 +158,7 @@ xfs_uuid_mount(
158 158
159 out_duplicate: 159 out_duplicate:
160 mutex_unlock(&xfs_uuid_table_mutex); 160 mutex_unlock(&xfs_uuid_table_mutex);
161 xfs_warn(mp, "Filesystem has duplicate UUID - can't mount"); 161 xfs_warn(mp, "Filesystem has duplicate UUID %pU - can't mount", uuid);
162 return XFS_ERROR(EINVAL); 162 return XFS_ERROR(EINVAL);
163} 163}
164 164
@@ -553,9 +553,11 @@ out_unwind:
553 553
554void 554void
555xfs_sb_from_disk( 555xfs_sb_from_disk(
556 xfs_sb_t *to, 556 struct xfs_mount *mp,
557 xfs_dsb_t *from) 557 xfs_dsb_t *from)
558{ 558{
559 struct xfs_sb *to = &mp->m_sb;
560
559 to->sb_magicnum = be32_to_cpu(from->sb_magicnum); 561 to->sb_magicnum = be32_to_cpu(from->sb_magicnum);
560 to->sb_blocksize = be32_to_cpu(from->sb_blocksize); 562 to->sb_blocksize = be32_to_cpu(from->sb_blocksize);
561 to->sb_dblocks = be64_to_cpu(from->sb_dblocks); 563 to->sb_dblocks = be64_to_cpu(from->sb_dblocks);
@@ -693,7 +695,7 @@ reread:
693 * Initialize the mount structure from the superblock. 695 * Initialize the mount structure from the superblock.
694 * But first do some basic consistency checking. 696 * But first do some basic consistency checking.
695 */ 697 */
696 xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp)); 698 xfs_sb_from_disk(mp, XFS_BUF_TO_SBP(bp));
697 error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags); 699 error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags);
698 if (error) { 700 if (error) {
699 if (loud) 701 if (loud)
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 19f69e232509..9eba73887829 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -211,6 +211,9 @@ typedef struct xfs_mount {
211 struct shrinker m_inode_shrink; /* inode reclaim shrinker */ 211 struct shrinker m_inode_shrink; /* inode reclaim shrinker */
212 int64_t m_low_space[XFS_LOWSP_MAX]; 212 int64_t m_low_space[XFS_LOWSP_MAX];
213 /* low free space thresholds */ 213 /* low free space thresholds */
214
215 struct workqueue_struct *m_data_workqueue;
216 struct workqueue_struct *m_unwritten_workqueue;
214} xfs_mount_t; 217} xfs_mount_t;
215 218
216/* 219/*
@@ -395,7 +398,7 @@ extern void xfs_set_low_space_thresholds(struct xfs_mount *);
395extern void xfs_mod_sb(struct xfs_trans *, __int64_t); 398extern void xfs_mod_sb(struct xfs_trans *, __int64_t);
396extern int xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t, 399extern int xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t,
397 xfs_agnumber_t *); 400 xfs_agnumber_t *);
398extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *); 401extern void xfs_sb_from_disk(struct xfs_mount *, struct xfs_dsb *);
399extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t); 402extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
400 403
401#endif /* __XFS_MOUNT_H__ */ 404#endif /* __XFS_MOUNT_H__ */
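
The two workqueues added to struct xfs_mount above arrive in this hunk without their setup code; allocation presumably follows the conventional alloc_workqueue() pattern, roughly as sketched below. The function name, queue name strings, and flags are assumptions, not taken from this diff:

static int
xfs_init_mount_workqueues(
        struct xfs_mount        *mp)
{
        mp->m_data_workqueue = alloc_workqueue("xfs-data/%s",
                        WQ_MEM_RECLAIM, 0, mp->m_fsname);
        if (!mp->m_data_workqueue)
                return -ENOMEM;

        mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s",
                        WQ_MEM_RECLAIM, 0, mp->m_fsname);
        if (!mp->m_unwritten_workqueue) {
                destroy_workqueue(mp->m_data_workqueue);
                return -ENOMEM;
        }
        return 0;
}

Teardown would mirror this with destroy_workqueue() on both queues at unmount.
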
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index c436def733bf..55c6afedc879 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -48,194 +48,189 @@
48 * quota functionality, including maintaining the freelist and hash 48 * quota functionality, including maintaining the freelist and hash
49 * tables of dquots. 49 * tables of dquots.
50 */ 50 */
51struct mutex xfs_Gqm_lock;
52struct xfs_qm *xfs_Gqm;
53
54kmem_zone_t *qm_dqzone;
55kmem_zone_t *qm_dqtrxzone;
56
57STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int);
58STATIC void xfs_qm_list_destroy(xfs_dqlist_t *);
59
60STATIC int xfs_qm_init_quotainos(xfs_mount_t *); 51STATIC int xfs_qm_init_quotainos(xfs_mount_t *);
61STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); 52STATIC int xfs_qm_init_quotainfo(xfs_mount_t *);
62STATIC int xfs_qm_shake(struct shrinker *, struct shrink_control *); 53STATIC int xfs_qm_shake(struct shrinker *, struct shrink_control *);
63 54
64static struct shrinker xfs_qm_shaker = {
65 .shrink = xfs_qm_shake,
66 .seeks = DEFAULT_SEEKS,
67};
68
69/* 55/*
70 * Initialize the XQM structure. 56 * We use the batch lookup interface to iterate over the dquots as it
71 * Note that there is not one quota manager per file system. 57 * currently is the only interface into the radix tree code that allows
58 * fuzzy lookups instead of exact matches. Holding the lock over multiple
59 * operations is fine as all callers run either during mount/umount
60 * or quotaoff.
72 */ 61 */
73STATIC struct xfs_qm * 62#define XFS_DQ_LOOKUP_BATCH 32
74xfs_Gqm_init(void) 63
64STATIC int
65xfs_qm_dquot_walk(
66 struct xfs_mount *mp,
67 int type,
68 int (*execute)(struct xfs_dquot *dqp))
75{ 69{
76 xfs_dqhash_t *udqhash, *gdqhash; 70 struct xfs_quotainfo *qi = mp->m_quotainfo;
77 xfs_qm_t *xqm; 71 struct radix_tree_root *tree = XFS_DQUOT_TREE(qi, type);
78 size_t hsize; 72 uint32_t next_index;
79 uint i; 73 int last_error = 0;
74 int skipped;
75 int nr_found;
76
77restart:
78 skipped = 0;
79 next_index = 0;
80 nr_found = 0;
81
82 while (1) {
83 struct xfs_dquot *batch[XFS_DQ_LOOKUP_BATCH];
84 int error = 0;
85 int i;
86
87 mutex_lock(&qi->qi_tree_lock);
88 nr_found = radix_tree_gang_lookup(tree, (void **)batch,
89 next_index, XFS_DQ_LOOKUP_BATCH);
90 if (!nr_found) {
91 mutex_unlock(&qi->qi_tree_lock);
92 break;
93 }
80 94
81 /* 95 for (i = 0; i < nr_found; i++) {
82 * Initialize the dquot hash tables. 96 struct xfs_dquot *dqp = batch[i];
83 */
84 udqhash = kmem_zalloc_greedy(&hsize,
85 XFS_QM_HASHSIZE_LOW * sizeof(xfs_dqhash_t),
86 XFS_QM_HASHSIZE_HIGH * sizeof(xfs_dqhash_t));
87 if (!udqhash)
88 goto out;
89 97
90 gdqhash = kmem_zalloc_large(hsize); 98 next_index = be32_to_cpu(dqp->q_core.d_id) + 1;
91 if (!gdqhash)
92 goto out_free_udqhash;
93 99
94 hsize /= sizeof(xfs_dqhash_t); 100 error = execute(batch[i]);
101 if (error == EAGAIN) {
102 skipped++;
103 continue;
104 }
105 if (error && last_error != EFSCORRUPTED)
106 last_error = error;
107 }
95 108
96 xqm = kmem_zalloc(sizeof(xfs_qm_t), KM_SLEEP); 109 mutex_unlock(&qi->qi_tree_lock);
97 xqm->qm_dqhashmask = hsize - 1;
98 xqm->qm_usr_dqhtable = udqhash;
99 xqm->qm_grp_dqhtable = gdqhash;
100 ASSERT(xqm->qm_usr_dqhtable != NULL);
101 ASSERT(xqm->qm_grp_dqhtable != NULL);
102 110
103 for (i = 0; i < hsize; i++) { 111 /* bail out if the filesystem is corrupted. */
104 xfs_qm_list_init(&(xqm->qm_usr_dqhtable[i]), "uxdqh", i); 112 if (last_error == EFSCORRUPTED) {
105 xfs_qm_list_init(&(xqm->qm_grp_dqhtable[i]), "gxdqh", i); 113 skipped = 0;
114 break;
115 }
106 } 116 }
107 117
108 /* 118 if (skipped) {
109 * Freelist of all dquots of all file systems 119 delay(1);
110 */ 120 goto restart;
111 INIT_LIST_HEAD(&xqm->qm_dqfrlist); 121 }
112 xqm->qm_dqfrlist_cnt = 0;
113 mutex_init(&xqm->qm_dqfrlist_lock);
114
115 /*
116 * dquot zone. we register our own low-memory callback.
117 */
118 if (!qm_dqzone) {
119 xqm->qm_dqzone = kmem_zone_init(sizeof(xfs_dquot_t),
120 "xfs_dquots");
121 qm_dqzone = xqm->qm_dqzone;
122 } else
123 xqm->qm_dqzone = qm_dqzone;
124
125 register_shrinker(&xfs_qm_shaker);
126
127 /*
128 * The t_dqinfo portion of transactions.
129 */
130 if (!qm_dqtrxzone) {
131 xqm->qm_dqtrxzone = kmem_zone_init(sizeof(xfs_dquot_acct_t),
132 "xfs_dqtrx");
133 qm_dqtrxzone = xqm->qm_dqtrxzone;
134 } else
135 xqm->qm_dqtrxzone = qm_dqtrxzone;
136
137 atomic_set(&xqm->qm_totaldquots, 0);
138 xqm->qm_nrefs = 0;
139 return xqm;
140 122
141 out_free_udqhash: 123 return last_error;
142 kmem_free_large(udqhash);
143 out:
144 return NULL;
145} 124}
146 125
126
147/* 127/*
148 * Destroy the global quota manager when its reference count goes to zero. 128 * Purge a dquot from all tracking data structures and free it.
149 */ 129 */
150STATIC void 130STATIC int
151xfs_qm_destroy( 131xfs_qm_dqpurge(
152 struct xfs_qm *xqm) 132 struct xfs_dquot *dqp)
153{ 133{
154 int hsize, i; 134 struct xfs_mount *mp = dqp->q_mount;
135 struct xfs_quotainfo *qi = mp->m_quotainfo;
136 struct xfs_dquot *gdqp = NULL;
155 137
156 ASSERT(xqm != NULL); 138 xfs_dqlock(dqp);
157 ASSERT(xqm->qm_nrefs == 0); 139 if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) {
140 xfs_dqunlock(dqp);
141 return EAGAIN;
142 }
158 143
159 unregister_shrinker(&xfs_qm_shaker); 144 /*
145 * If this quota has a group hint attached, prepare for releasing it
146 * now.
147 */
148 gdqp = dqp->q_gdquot;
149 if (gdqp) {
150 xfs_dqlock(gdqp);
151 dqp->q_gdquot = NULL;
152 }
160 153
161 mutex_lock(&xqm->qm_dqfrlist_lock); 154 dqp->dq_flags |= XFS_DQ_FREEING;
162 ASSERT(list_empty(&xqm->qm_dqfrlist));
163 mutex_unlock(&xqm->qm_dqfrlist_lock);
164 155
165 hsize = xqm->qm_dqhashmask + 1; 156 /*
166 for (i = 0; i < hsize; i++) { 157 * If we're turning off quotas, we have to make sure that, for
167 xfs_qm_list_destroy(&(xqm->qm_usr_dqhtable[i])); 158 * example, we don't delete quota disk blocks while dquots are
168 xfs_qm_list_destroy(&(xqm->qm_grp_dqhtable[i])); 159 * in the process of getting written to those disk blocks.
160 * This dquot might well be on the AIL, and we can't leave it there
161 * if we're turning off quotas. Basically, we need this flush
162 * lock, and are willing to block on it.
163 */
164 if (!xfs_dqflock_nowait(dqp)) {
165 /*
166 * Block on the flush lock after nudging the dquot buffer,
167 * if it is incore.
168 */
169 xfs_dqflock_pushbuf_wait(dqp);
169 } 170 }
170 kmem_free_large(xqm->qm_usr_dqhtable);
171 kmem_free_large(xqm->qm_grp_dqhtable);
172 xqm->qm_usr_dqhtable = NULL;
173 xqm->qm_grp_dqhtable = NULL;
174 xqm->qm_dqhashmask = 0;
175 171
176 kmem_free(xqm);
177}
178
179/*
180 * Called at mount time to let XQM know that another file system is
181 * starting quotas. This isn't crucial information as the individual mount
182 * structures are pretty independent, but it helps the XQM keep a
183 * global view of what's going on.
184 */
185/* ARGSUSED */
186STATIC int
187xfs_qm_hold_quotafs_ref(
188 struct xfs_mount *mp)
189{
190 /* 172 /*
191 * Need to lock the xfs_Gqm structure for things like this. For example, 173 * If we are turning this type of quotas off, we don't care
192 * the structure could disappear between the entry to this routine and 174 * about the dirty metadata sitting in this dquot. OTOH, if
193 * a HOLD operation if not locked. 175 * we're unmounting, we do care, so we flush it and wait.
194 */ 176 */
195 mutex_lock(&xfs_Gqm_lock); 177 if (XFS_DQ_IS_DIRTY(dqp)) {
178 int error;
196 179
197 if (!xfs_Gqm) { 180 /*
198 xfs_Gqm = xfs_Gqm_init(); 181 * We don't care about getting disk errors here. We need
199 if (!xfs_Gqm) { 182 * to purge this dquot anyway, so we go ahead regardless.
200 mutex_unlock(&xfs_Gqm_lock); 183 */
201 return ENOMEM; 184 error = xfs_qm_dqflush(dqp, SYNC_WAIT);
202 } 185 if (error)
186 xfs_warn(mp, "%s: dquot %p flush failed",
187 __func__, dqp);
188 xfs_dqflock(dqp);
203 } 189 }
204 190
191 ASSERT(atomic_read(&dqp->q_pincount) == 0);
192 ASSERT(XFS_FORCED_SHUTDOWN(mp) ||
193 !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL));
194
195 xfs_dqfunlock(dqp);
196 xfs_dqunlock(dqp);
197
198 radix_tree_delete(XFS_DQUOT_TREE(qi, dqp->q_core.d_flags),
199 be32_to_cpu(dqp->q_core.d_id));
200 qi->qi_dquots--;
201
205 /* 202 /*
206 * We can keep a list of all filesystems with quotas mounted for 203 * We move dquots to the freelist as soon as their reference count
207 * debugging and statistical purposes, but ... 204 * hits zero, so it really should be on the freelist here.
208 * Just take a reference and get out.
209 */ 205 */
210 xfs_Gqm->qm_nrefs++; 206 mutex_lock(&qi->qi_lru_lock);
211 mutex_unlock(&xfs_Gqm_lock); 207 ASSERT(!list_empty(&dqp->q_lru));
208 list_del_init(&dqp->q_lru);
209 qi->qi_lru_count--;
210 XFS_STATS_DEC(xs_qm_dquot_unused);
211 mutex_unlock(&qi->qi_lru_lock);
212 212
213 xfs_qm_dqdestroy(dqp);
214
215 if (gdqp)
216 xfs_qm_dqput(gdqp);
213 return 0; 217 return 0;
214} 218}
215 219
216
217/* 220/*
218 * Release the reference that a filesystem took at mount time, 221 * Purge the dquot cache.
219 * so that we know when we need to destroy the entire quota manager.
220 */ 222 */
221/* ARGSUSED */ 223void
222STATIC void 224xfs_qm_dqpurge_all(
223xfs_qm_rele_quotafs_ref( 225 struct xfs_mount *mp,
224 struct xfs_mount *mp) 226 uint flags)
225{ 227{
226 ASSERT(xfs_Gqm); 228 if (flags & XFS_QMOPT_UQUOTA)
227 ASSERT(xfs_Gqm->qm_nrefs > 0); 229 xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge);
228 230 if (flags & XFS_QMOPT_GQUOTA)
229 /* 231 xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_dqpurge);
230 * Destroy the entire XQM. If somebody mounts with quotaon, this'll 232 if (flags & XFS_QMOPT_PQUOTA)
231 * be restarted. 233 xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_dqpurge);
232 */
233 mutex_lock(&xfs_Gqm_lock);
234 if (--xfs_Gqm->qm_nrefs == 0) {
235 xfs_qm_destroy(xfs_Gqm);
236 xfs_Gqm = NULL;
237 }
238 mutex_unlock(&xfs_Gqm_lock);
239} 234}
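
xfs_qm_dquot_walk() above iterates with radix_tree_gang_lookup() in batches, dropping qi_tree_lock between batches; the next_index bookkeeping is what makes the walk resumable once the lock has been released. A standalone model of that pattern, using a sorted array in place of the radix tree:

#include <stdio.h>

#define BATCH 4

static unsigned ids[] = { 3, 7, 8, 15, 16, 23, 42, 99 };
#define NIDS (sizeof(ids) / sizeof(ids[0]))

/* Return up to max entries with id >= first, like a gang lookup. */
static int gang_lookup(unsigned first, unsigned *batch, int max)
{
        int n = 0;

        for (unsigned i = 0; i < NIDS && n < max; i++)
                if (ids[i] >= first)
                        batch[n++] = ids[i];
        return n;
}

int main(void)
{
        unsigned next_index = 0;

        for (;;) {
                unsigned batch[BATCH];
                int nr = gang_lookup(next_index, batch, BATCH);

                if (!nr)
                        break;          /* tree exhausted */
                for (int i = 0; i < nr; i++) {
                        next_index = batch[i] + 1;      /* resume point */
                        printf("visit dquot id %u\n", batch[i]);
                }
        }
        return 0;
}
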
240 235
241/* 236/*
@@ -376,175 +371,6 @@ xfs_qm_unmount_quotas(
376 } 371 }
377} 372}
378 373
379/*
380 * Flush all dquots of the given file system to disk. The dquots are
381 * _not_ purged from memory here, just their data written to disk.
382 */
383STATIC int
384xfs_qm_dqflush_all(
385 struct xfs_mount *mp)
386{
387 struct xfs_quotainfo *q = mp->m_quotainfo;
388 int recl;
389 struct xfs_dquot *dqp;
390 int error;
391
392 if (!q)
393 return 0;
394again:
395 mutex_lock(&q->qi_dqlist_lock);
396 list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
397 xfs_dqlock(dqp);
398 if ((dqp->dq_flags & XFS_DQ_FREEING) ||
399 !XFS_DQ_IS_DIRTY(dqp)) {
400 xfs_dqunlock(dqp);
401 continue;
402 }
403
404 /* XXX a sentinel would be better */
405 recl = q->qi_dqreclaims;
406 if (!xfs_dqflock_nowait(dqp)) {
407 /*
408 * If we can't grab the flush lock then check
409 * to see if the dquot has been flushed out as a
410 * delayed write. If so, grab its buffer and send it
411 * out immediately. We'll be able to acquire
412 * the flush lock when the I/O completes.
413 */
414 xfs_dqflock_pushbuf_wait(dqp);
415 }
416 /*
417 * Let go of the mplist lock. We don't want to hold it
418 * across a disk write.
419 */
420 mutex_unlock(&q->qi_dqlist_lock);
421 error = xfs_qm_dqflush(dqp, 0);
422 xfs_dqunlock(dqp);
423 if (error)
424 return error;
425
426 mutex_lock(&q->qi_dqlist_lock);
427 if (recl != q->qi_dqreclaims) {
428 mutex_unlock(&q->qi_dqlist_lock);
429 /* XXX restart limit */
430 goto again;
431 }
432 }
433
434 mutex_unlock(&q->qi_dqlist_lock);
435 /* return ! busy */
436 return 0;
437}
438
439/*
440 * Release the group dquot pointers the user dquots may be
441 * carrying around as a hint. mplist is locked on entry and exit.
442 */
443STATIC void
444xfs_qm_detach_gdquots(
445 struct xfs_mount *mp)
446{
447 struct xfs_quotainfo *q = mp->m_quotainfo;
448 struct xfs_dquot *dqp, *gdqp;
449
450 again:
451 ASSERT(mutex_is_locked(&q->qi_dqlist_lock));
452 list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
453 xfs_dqlock(dqp);
454 if (dqp->dq_flags & XFS_DQ_FREEING) {
455 xfs_dqunlock(dqp);
456 mutex_unlock(&q->qi_dqlist_lock);
457 delay(1);
458 mutex_lock(&q->qi_dqlist_lock);
459 goto again;
460 }
461
462 gdqp = dqp->q_gdquot;
463 if (gdqp)
464 dqp->q_gdquot = NULL;
465 xfs_dqunlock(dqp);
466
467 if (gdqp)
468 xfs_qm_dqrele(gdqp);
469 }
470}
471
472/*
473 * Go through all the incore dquots of this file system and take them
474 * off the mplist and hashlist, if the dquot type matches the dqtype
475 * parameter. This is used when turning off quota accounting for
476 * users and/or groups, as well as when the filesystem is unmounting.
477 */
478STATIC int
479xfs_qm_dqpurge_int(
480 struct xfs_mount *mp,
481 uint flags)
482{
483 struct xfs_quotainfo *q = mp->m_quotainfo;
484 struct xfs_dquot *dqp, *n;
485 uint dqtype;
486 int nmisses = 0;
487 LIST_HEAD(dispose_list);
488
489 if (!q)
490 return 0;
491
492 dqtype = (flags & XFS_QMOPT_UQUOTA) ? XFS_DQ_USER : 0;
493 dqtype |= (flags & XFS_QMOPT_PQUOTA) ? XFS_DQ_PROJ : 0;
494 dqtype |= (flags & XFS_QMOPT_GQUOTA) ? XFS_DQ_GROUP : 0;
495
496 mutex_lock(&q->qi_dqlist_lock);
497
498 /*
499 * In the first pass through all incore dquots of this filesystem,
500 * we release the group dquot pointers the user dquots may be
501 * carrying around as a hint. We need to do this irrespective of
502 * what's being turned off.
503 */
504 xfs_qm_detach_gdquots(mp);
505
506 /*
507 * Try to get rid of all of the unwanted dquots.
508 */
509 list_for_each_entry_safe(dqp, n, &q->qi_dqlist, q_mplist) {
510 xfs_dqlock(dqp);
511 if ((dqp->dq_flags & dqtype) != 0 &&
512 !(dqp->dq_flags & XFS_DQ_FREEING)) {
513 if (dqp->q_nrefs == 0) {
514 dqp->dq_flags |= XFS_DQ_FREEING;
515 list_move_tail(&dqp->q_mplist, &dispose_list);
516 } else
517 nmisses++;
518 }
519 xfs_dqunlock(dqp);
520 }
521 mutex_unlock(&q->qi_dqlist_lock);
522
523 list_for_each_entry_safe(dqp, n, &dispose_list, q_mplist)
524 xfs_qm_dqpurge(dqp);
525
526 return nmisses;
527}
528
529int
530xfs_qm_dqpurge_all(
531 xfs_mount_t *mp,
532 uint flags)
533{
534 int ndquots;
535
536 /*
537 * Purge the dquot cache.
538 * None of the dquots should really be busy at this point.
539 */
540 if (mp->m_quotainfo) {
541 while ((ndquots = xfs_qm_dqpurge_int(mp, flags))) {
542 delay(ndquots * 10);
543 }
544 }
545 return 0;
546}
547
548STATIC int 374STATIC int
549xfs_qm_dqattach_one( 375xfs_qm_dqattach_one(
550 xfs_inode_t *ip, 376 xfs_inode_t *ip,
@@ -783,14 +609,6 @@ xfs_qm_dqdetach(
783} 609}
784 610
785/* 611/*
786 * The hash chains and the mplist use the same xfs_dqhash structure as
787 * their list head, but we can take the mplist qh_lock and one of the
788 * hash qh_locks at the same time without any problem as they aren't
789 * related.
790 */
791static struct lock_class_key xfs_quota_mplist_class;
792
793/*
794 * This initializes all the quota information that's kept in the 612 * This initializes all the quota information that's kept in the
795 * mount structure 613 * mount structure
796 */ 614 */
@@ -804,13 +622,6 @@ xfs_qm_init_quotainfo(
804 622
805 ASSERT(XFS_IS_QUOTA_RUNNING(mp)); 623 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
806 624
807 /*
808 * Tell XQM that we exist as soon as possible.
809 */
810 if ((error = xfs_qm_hold_quotafs_ref(mp))) {
811 return error;
812 }
813
814 qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP); 625 qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP);
815 626
816 /* 627 /*
@@ -823,11 +634,13 @@ xfs_qm_init_quotainfo(
823 return error; 634 return error;
824 } 635 }
825 636
826 INIT_LIST_HEAD(&qinf->qi_dqlist); 637 INIT_RADIX_TREE(&qinf->qi_uquota_tree, GFP_NOFS);
827 mutex_init(&qinf->qi_dqlist_lock); 638 INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_NOFS);
828 lockdep_set_class(&qinf->qi_dqlist_lock, &xfs_quota_mplist_class); 639 mutex_init(&qinf->qi_tree_lock);
829 640
830 qinf->qi_dqreclaims = 0; 641 INIT_LIST_HEAD(&qinf->qi_lru_list);
642 qinf->qi_lru_count = 0;
643 mutex_init(&qinf->qi_lru_lock);
831 644
832 /* mutex used to serialize quotaoffs */ 645 /* mutex used to serialize quotaoffs */
833 mutex_init(&qinf->qi_quotaofflock); 646 mutex_init(&qinf->qi_quotaofflock);
@@ -894,6 +707,9 @@ xfs_qm_init_quotainfo(
894 qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT; 707 qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT;
895 } 708 }
896 709
710 qinf->qi_shrinker.shrink = xfs_qm_shake;
711 qinf->qi_shrinker.seeks = DEFAULT_SEEKS;
712 register_shrinker(&qinf->qi_shrinker);
897 return 0; 713 return 0;
898} 714}
899 715
@@ -911,17 +727,8 @@ xfs_qm_destroy_quotainfo(
911 727
912 qi = mp->m_quotainfo; 728 qi = mp->m_quotainfo;
913 ASSERT(qi != NULL); 729 ASSERT(qi != NULL);
914 ASSERT(xfs_Gqm != NULL);
915
916 /*
917 * Release the reference that XQM kept, so that we know
918 * when the XQM structure should be freed. We cannot assume
919 * that xfs_Gqm is non-null after this point.
920 */
921 xfs_qm_rele_quotafs_ref(mp);
922 730
923 ASSERT(list_empty(&qi->qi_dqlist)); 731 unregister_shrinker(&qi->qi_shrinker);
924 mutex_destroy(&qi->qi_dqlist_lock);
925 732
926 if (qi->qi_uquotaip) { 733 if (qi->qi_uquotaip) {
927 IRELE(qi->qi_uquotaip); 734 IRELE(qi->qi_uquotaip);
@@ -936,30 +743,6 @@ xfs_qm_destroy_quotainfo(
936 mp->m_quotainfo = NULL; 743 mp->m_quotainfo = NULL;
937} 744}
938 745
939
940
941/* ------------------- PRIVATE STATIC FUNCTIONS ----------------------- */
942
943/* ARGSUSED */
944STATIC void
945xfs_qm_list_init(
946 xfs_dqlist_t *list,
947 char *str,
948 int n)
949{
950 mutex_init(&list->qh_lock);
951 INIT_LIST_HEAD(&list->qh_list);
952 list->qh_version = 0;
953 list->qh_nelems = 0;
954}
955
956STATIC void
957xfs_qm_list_destroy(
958 xfs_dqlist_t *list)
959{
960 mutex_destroy(&(list->qh_lock));
961}
962
963/* 746/*
964 * Create an inode and return with a reference already taken, but unlocked 747 * Create an inode and return with a reference already taken, but unlocked
965 * This is how we create quota inodes 748 * This is how we create quota inodes
@@ -1397,6 +1180,28 @@ error0:
1397 return error; 1180 return error;
1398} 1181}
1399 1182
1183STATIC int
1184xfs_qm_flush_one(
1185 struct xfs_dquot *dqp)
1186{
1187 int error = 0;
1188
1189 xfs_dqlock(dqp);
1190 if (dqp->dq_flags & XFS_DQ_FREEING)
1191 goto out_unlock;
1192 if (!XFS_DQ_IS_DIRTY(dqp))
1193 goto out_unlock;
1194
1195 if (!xfs_dqflock_nowait(dqp))
1196 xfs_dqflock_pushbuf_wait(dqp);
1197
1198 error = xfs_qm_dqflush(dqp, 0);
1199
1200out_unlock:
1201 xfs_dqunlock(dqp);
1202 return error;
1203}
1204
1400/* 1205/*
1401 * Walk thru all the filesystem inodes and construct a consistent view 1206 * Walk thru all the filesystem inodes and construct a consistent view
1402 * of the disk quota world. If the quotacheck fails, disable quotas. 1207 * of the disk quota world. If the quotacheck fails, disable quotas.
@@ -1405,7 +1210,7 @@ int
1405xfs_qm_quotacheck( 1210xfs_qm_quotacheck(
1406 xfs_mount_t *mp) 1211 xfs_mount_t *mp)
1407{ 1212{
1408 int done, count, error; 1213 int done, count, error, error2;
1409 xfs_ino_t lastino; 1214 xfs_ino_t lastino;
1410 size_t structsz; 1215 size_t structsz;
1411 xfs_inode_t *uip, *gip; 1216 xfs_inode_t *uip, *gip;
@@ -1419,12 +1224,6 @@ xfs_qm_quotacheck(
1419 ASSERT(mp->m_quotainfo->qi_uquotaip || mp->m_quotainfo->qi_gquotaip); 1224 ASSERT(mp->m_quotainfo->qi_uquotaip || mp->m_quotainfo->qi_gquotaip);
1420 ASSERT(XFS_IS_QUOTA_RUNNING(mp)); 1225 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
1421 1226
1422 /*
1423 * There should be no cached dquots. The (simplistic) quotacheck
1424 * algorithm doesn't like that.
1425 */
1426 ASSERT(list_empty(&mp->m_quotainfo->qi_dqlist));
1427
1428 xfs_notice(mp, "Quotacheck needed: Please wait."); 1227 xfs_notice(mp, "Quotacheck needed: Please wait.");
1429 1228
1430 /* 1229 /*
@@ -1463,12 +1262,21 @@ xfs_qm_quotacheck(
1463 } while (!done); 1262 } while (!done);
1464 1263
1465 /* 1264 /*
1466 * We've made all the changes that we need to make incore. 1265 * We've made all the changes that we need to make incore. Flush them
1467 * Flush them down to disk buffers if everything was updated 1266 * down to disk buffers if everything was updated successfully.
1468 * successfully.
1469 */ 1267 */
1470 if (!error) 1268 if (XFS_IS_UQUOTA_ON(mp))
1471 error = xfs_qm_dqflush_all(mp); 1269 error = xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_flush_one);
1270 if (XFS_IS_GQUOTA_ON(mp)) {
1271 error2 = xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_flush_one);
1272 if (!error)
1273 error = error2;
1274 }
1275 if (XFS_IS_PQUOTA_ON(mp)) {
1276 error2 = xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_flush_one);
1277 if (!error)
1278 error = error2;
1279 }
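
The error/error2 dance above is the usual first-error-wins idiom: every quota type is still flushed, but the error eventually reported is the first one hit. As a standalone fragment (positive errno-style values, as in this code):

#include <stdio.h>

static int flush_type(int rc) { return rc; }    /* stand-in for a walk */

int main(void)
{
        int error, error2;

        error = flush_type(0);          /* user quotas: ok */
        error2 = flush_type(5);         /* group quotas: fails */
        if (!error)
                error = error2;
        error2 = flush_type(7);         /* project quotas: also fails */
        if (!error)
                error = error2;         /* not taken: first failure kept */
        printf("reported error: %d\n", error);  /* prints 5 */
        return 0;
}
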
1472 1280
1473 /* 1281 /*
1474 * We can get this error if we couldn't do a dquot allocation inside 1282 * We can get this error if we couldn't do a dquot allocation inside
@@ -1496,7 +1304,7 @@ xfs_qm_quotacheck(
1496 * quotachecked status, since we won't be doing accounting for 1304 * quotachecked status, since we won't be doing accounting for
1497 * that type anymore. 1305 * that type anymore.
1498 */ 1306 */
1499 mp->m_qflags &= ~(XFS_OQUOTA_CHKD | XFS_UQUOTA_CHKD); 1307 mp->m_qflags &= ~XFS_ALL_QUOTA_CHKD;
1500 mp->m_qflags |= flags; 1308 mp->m_qflags |= flags;
1501 1309
1502 error_return: 1310 error_return:
@@ -1508,7 +1316,6 @@ xfs_qm_quotacheck(
1508 * We must turn off quotas. 1316 * We must turn off quotas.
1509 */ 1317 */
1510 ASSERT(mp->m_quotainfo != NULL); 1318 ASSERT(mp->m_quotainfo != NULL);
1511 ASSERT(xfs_Gqm != NULL);
1512 xfs_qm_destroy_quotainfo(mp); 1319 xfs_qm_destroy_quotainfo(mp);
1513 if (xfs_mount_reset_sbqflags(mp)) { 1320 if (xfs_mount_reset_sbqflags(mp)) {
1514 xfs_warn(mp, 1321 xfs_warn(mp,
@@ -1604,16 +1411,12 @@ xfs_qm_dqfree_one(
1604 struct xfs_mount *mp = dqp->q_mount; 1411 struct xfs_mount *mp = dqp->q_mount;
1605 struct xfs_quotainfo *qi = mp->m_quotainfo; 1412 struct xfs_quotainfo *qi = mp->m_quotainfo;
1606 1413
1607 mutex_lock(&dqp->q_hash->qh_lock); 1414 mutex_lock(&qi->qi_tree_lock);
1608 list_del_init(&dqp->q_hashlist); 1415 radix_tree_delete(XFS_DQUOT_TREE(qi, dqp->q_core.d_flags),
1609 dqp->q_hash->qh_version++; 1416 be32_to_cpu(dqp->q_core.d_id));
1610 mutex_unlock(&dqp->q_hash->qh_lock);
1611 1417
1612 mutex_lock(&qi->qi_dqlist_lock);
1613 list_del_init(&dqp->q_mplist);
1614 qi->qi_dquots--; 1418 qi->qi_dquots--;
1615 qi->qi_dqreclaims++; 1419 mutex_unlock(&qi->qi_tree_lock);
1616 mutex_unlock(&qi->qi_dqlist_lock);
1617 1420
1618 xfs_qm_dqdestroy(dqp); 1421 xfs_qm_dqdestroy(dqp);
1619} 1422}
@@ -1624,6 +1427,7 @@ xfs_qm_dqreclaim_one(
1624 struct list_head *dispose_list) 1427 struct list_head *dispose_list)
1625{ 1428{
1626 struct xfs_mount *mp = dqp->q_mount; 1429 struct xfs_mount *mp = dqp->q_mount;
1430 struct xfs_quotainfo *qi = mp->m_quotainfo;
1627 int error; 1431 int error;
1628 1432
1629 if (!xfs_dqlock_nowait(dqp)) 1433 if (!xfs_dqlock_nowait(dqp))
@@ -1637,16 +1441,14 @@ xfs_qm_dqreclaim_one(
1637 xfs_dqunlock(dqp); 1441 xfs_dqunlock(dqp);
1638 1442
1639 trace_xfs_dqreclaim_want(dqp); 1443 trace_xfs_dqreclaim_want(dqp);
1640 XQM_STATS_INC(xqmstats.xs_qm_dqwants); 1444 XFS_STATS_INC(xs_qm_dqwants);
1641 1445
1642 list_del_init(&dqp->q_freelist); 1446 list_del_init(&dqp->q_lru);
1643 xfs_Gqm->qm_dqfrlist_cnt--; 1447 qi->qi_lru_count--;
1448 XFS_STATS_DEC(xs_qm_dquot_unused);
1644 return; 1449 return;
1645 } 1450 }
1646 1451
1647 ASSERT(dqp->q_hash);
1648 ASSERT(!list_empty(&dqp->q_mplist));
1649
1650 /* 1452 /*
1651 * Try to grab the flush lock. If this dquot is in the process of 1453 * Try to grab the flush lock. If this dquot is in the process of
1652 * getting flushed to disk, we don't want to reclaim it. 1454 * getting flushed to disk, we don't want to reclaim it.
@@ -1688,11 +1490,12 @@ xfs_qm_dqreclaim_one(
1688 xfs_dqunlock(dqp); 1490 xfs_dqunlock(dqp);
1689 1491
1690 ASSERT(dqp->q_nrefs == 0); 1492 ASSERT(dqp->q_nrefs == 0);
1691 list_move_tail(&dqp->q_freelist, dispose_list); 1493 list_move_tail(&dqp->q_lru, dispose_list);
1692 xfs_Gqm->qm_dqfrlist_cnt--; 1494 qi->qi_lru_count--;
1495 XFS_STATS_DEC(xs_qm_dquot_unused);
1693 1496
1694 trace_xfs_dqreclaim_done(dqp); 1497 trace_xfs_dqreclaim_done(dqp);
1695 XQM_STATS_INC(xqmstats.xs_qm_dqreclaims); 1498 XFS_STATS_INC(xs_qm_dqreclaims);
1696 return; 1499 return;
1697 1500
1698out_busy: 1501out_busy:
@@ -1701,10 +1504,10 @@ out_busy:
1701 /* 1504 /*
1702 * Move the dquot to the tail of the list so that we don't spin on it. 1505 * Move the dquot to the tail of the list so that we don't spin on it.
1703 */ 1506 */
1704 list_move_tail(&dqp->q_freelist, &xfs_Gqm->qm_dqfrlist); 1507 list_move_tail(&dqp->q_lru, &qi->qi_lru_list);
1705 1508
1706 trace_xfs_dqreclaim_busy(dqp); 1509 trace_xfs_dqreclaim_busy(dqp);
1707 XQM_STATS_INC(xqmstats.xs_qm_dqreclaim_misses); 1510 XFS_STATS_INC(xs_qm_dqreclaim_misses);
1708} 1511}
1709 1512
1710STATIC int 1513STATIC int
@@ -1712,6 +1515,8 @@ xfs_qm_shake(
1712 struct shrinker *shrink, 1515 struct shrinker *shrink,
1713 struct shrink_control *sc) 1516 struct shrink_control *sc)
1714{ 1517{
1518 struct xfs_quotainfo *qi =
1519 container_of(shrink, struct xfs_quotainfo, qi_shrinker);
1715 int nr_to_scan = sc->nr_to_scan; 1520 int nr_to_scan = sc->nr_to_scan;
1716 LIST_HEAD(dispose_list); 1521 LIST_HEAD(dispose_list);
1717 struct xfs_dquot *dqp; 1522 struct xfs_dquot *dqp;
@@ -1721,24 +1526,23 @@ xfs_qm_shake(
1721 if (!nr_to_scan) 1526 if (!nr_to_scan)
1722 goto out; 1527 goto out;
1723 1528
1724 mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); 1529 mutex_lock(&qi->qi_lru_lock);
1725 while (!list_empty(&xfs_Gqm->qm_dqfrlist)) { 1530 while (!list_empty(&qi->qi_lru_list)) {
1726 if (nr_to_scan-- <= 0) 1531 if (nr_to_scan-- <= 0)
1727 break; 1532 break;
1728 dqp = list_first_entry(&xfs_Gqm->qm_dqfrlist, struct xfs_dquot, 1533 dqp = list_first_entry(&qi->qi_lru_list, struct xfs_dquot,
1729 q_freelist); 1534 q_lru);
1730 xfs_qm_dqreclaim_one(dqp, &dispose_list); 1535 xfs_qm_dqreclaim_one(dqp, &dispose_list);
1731 } 1536 }
1732 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); 1537 mutex_unlock(&qi->qi_lru_lock);
1733 1538
1734 while (!list_empty(&dispose_list)) { 1539 while (!list_empty(&dispose_list)) {
1735 dqp = list_first_entry(&dispose_list, struct xfs_dquot, 1540 dqp = list_first_entry(&dispose_list, struct xfs_dquot, q_lru);
1736 q_freelist); 1541 list_del_init(&dqp->q_lru);
1737 list_del_init(&dqp->q_freelist);
1738 xfs_qm_dqfree_one(dqp); 1542 xfs_qm_dqfree_one(dqp);
1739 } 1543 }
1740out: 1544out:
1741 return (xfs_Gqm->qm_dqfrlist_cnt / 100) * sysctl_vfs_cache_pressure; 1545 return (qi->qi_lru_count / 100) * sysctl_vfs_cache_pressure;
1742} 1546}
1743 1547
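
xfs_qm_shake() above follows the standard shrinker shape: candidates are moved from the shared LRU to a private dispose list while qi_lru_lock is held, and the potentially blocking frees happen only after the lock is dropped. A standalone model of the two phases (arrays stand in for the kernel list heads; nothing here is XFS API):

#include <stdio.h>

int main(void)
{
        int lru[] = { 11, 12, 13, 14, 15 };
        int lru_count = 5;
        int dispose[5], ndispose = 0;
        int nr_to_scan = 3;
        int i;

        /* phase 1: under qi_lru_lock in the real code */
        for (i = 0; i < lru_count && nr_to_scan--; i++)
                dispose[ndispose++] = lru[i];   /* list_move_tail() */
        lru_count -= ndispose;

        /* phase 2: lock dropped; the free path may block safely */
        for (i = 0; i < ndispose; i++)
                printf("freeing dquot %d\n", dispose[i]);
        printf("%d left on lru\n", lru_count);
        return 0;
}
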
1744/* 1548/*
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 9a9b997e1a0a..44b858b79d71 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -21,21 +21,10 @@
21#include "xfs_dquot_item.h" 21#include "xfs_dquot_item.h"
22#include "xfs_dquot.h" 22#include "xfs_dquot.h"
23#include "xfs_quota_priv.h" 23#include "xfs_quota_priv.h"
24#include "xfs_qm_stats.h"
25 24
26struct xfs_qm;
27struct xfs_inode; 25struct xfs_inode;
28 26
29extern struct mutex xfs_Gqm_lock; 27extern struct kmem_zone *xfs_qm_dqtrxzone;
30extern struct xfs_qm *xfs_Gqm;
31extern kmem_zone_t *qm_dqzone;
32extern kmem_zone_t *qm_dqtrxzone;
33
34/*
35 * Dquot hashtable constants/threshold values.
36 */
37#define XFS_QM_HASHSIZE_LOW (PAGE_SIZE / sizeof(xfs_dqhash_t))
38#define XFS_QM_HASHSIZE_HIGH ((PAGE_SIZE * 4) / sizeof(xfs_dqhash_t))
39 28
40/* 29/*
41 * This defines the unit of allocation of dquots. 30 * This defines the unit of allocation of dquots.
@@ -48,36 +37,20 @@ extern kmem_zone_t *qm_dqtrxzone;
48 */ 37 */
49#define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1 38#define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1
50 39
51typedef xfs_dqhash_t xfs_dqlist_t;
52
53/*
54 * Quota Manager (global) structure. Lives only in core.
55 */
56typedef struct xfs_qm {
57 xfs_dqlist_t *qm_usr_dqhtable;/* udquot hash table */
58 xfs_dqlist_t *qm_grp_dqhtable;/* gdquot hash table */
59 uint qm_dqhashmask; /* # buckets in dq hashtab - 1 */
60 struct list_head qm_dqfrlist; /* freelist of dquots */
61 struct mutex qm_dqfrlist_lock;
62 int qm_dqfrlist_cnt;
63 atomic_t qm_totaldquots; /* total incore dquots */
64 uint qm_nrefs; /* file systems with quota on */
65 kmem_zone_t *qm_dqzone; /* dquot mem-alloc zone */
66 kmem_zone_t *qm_dqtrxzone; /* t_dqinfo of transactions */
67} xfs_qm_t;
68
69/* 40/*
70 * Various quota information for individual filesystems. 41 * Various quota information for individual filesystems.
71 * The mount structure keeps a pointer to this. 42 * The mount structure keeps a pointer to this.
72 */ 43 */
73typedef struct xfs_quotainfo { 44typedef struct xfs_quotainfo {
45 struct radix_tree_root qi_uquota_tree;
46 struct radix_tree_root qi_gquota_tree;
47 struct mutex qi_tree_lock;
74 xfs_inode_t *qi_uquotaip; /* user quota inode */ 48 xfs_inode_t *qi_uquotaip; /* user quota inode */
75 xfs_inode_t *qi_gquotaip; /* group quota inode */ 49 xfs_inode_t *qi_gquotaip; /* group quota inode */
76 struct list_head qi_dqlist; /* all dquots in filesys */ 50 struct list_head qi_lru_list;
77 struct mutex qi_dqlist_lock; 51 struct mutex qi_lru_lock;
52 int qi_lru_count;
78 int qi_dquots; 53 int qi_dquots;
79 int qi_dqreclaims; /* a change here indicates
80 a removal in the dqlist */
81 time_t qi_btimelimit; /* limit for blks timer */ 54 time_t qi_btimelimit; /* limit for blks timer */
82 time_t qi_itimelimit; /* limit for inodes timer */ 55 time_t qi_itimelimit; /* limit for inodes timer */
83 time_t qi_rtbtimelimit;/* limit for rt blks timer */ 56 time_t qi_rtbtimelimit;/* limit for rt blks timer */
@@ -93,8 +66,14 @@ typedef struct xfs_quotainfo {
93 xfs_qcnt_t qi_isoftlimit; /* default inode count soft limit */ 66 xfs_qcnt_t qi_isoftlimit; /* default inode count soft limit */
94 xfs_qcnt_t qi_rtbhardlimit;/* default realtime blk hard limit */ 67 xfs_qcnt_t qi_rtbhardlimit;/* default realtime blk hard limit */
95 xfs_qcnt_t qi_rtbsoftlimit;/* default realtime blk soft limit */ 68 xfs_qcnt_t qi_rtbsoftlimit;/* default realtime blk soft limit */
69 struct shrinker qi_shrinker;
96} xfs_quotainfo_t; 70} xfs_quotainfo_t;
97 71
72#define XFS_DQUOT_TREE(qi, type) \
73 ((type & XFS_DQ_USER) ? \
74 &((qi)->qi_uquota_tree) : \
75 &((qi)->qi_gquota_tree))
76
98 77
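
XFS_DQUOT_TREE() routes user dquots to qi_uquota_tree and everything else, including project dquots, to qi_gquota_tree. A hedged sketch of a lookup helper built on it; the real code also takes a dquot reference before dropping qi_tree_lock, which is omitted here, and the function name is illustrative:

static struct xfs_dquot *
example_dq_lookup(
        struct xfs_quotainfo    *qi,
        xfs_dqid_t              id,
        uint                    type)
{
        struct radix_tree_root  *tree = XFS_DQUOT_TREE(qi, type);
        struct xfs_dquot        *dqp;

        mutex_lock(&qi->qi_tree_lock);
        dqp = radix_tree_lookup(tree, id);      /* NULL when not cached */
        mutex_unlock(&qi->qi_tree_lock);
        return dqp;
}
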
99extern void xfs_trans_mod_dquot(xfs_trans_t *, xfs_dquot_t *, uint, long); 78extern void xfs_trans_mod_dquot(xfs_trans_t *, xfs_dquot_t *, uint, long);
100extern int xfs_trans_reserve_quota_bydquots(xfs_trans_t *, xfs_mount_t *, 79extern int xfs_trans_reserve_quota_bydquots(xfs_trans_t *, xfs_mount_t *,
@@ -130,7 +109,7 @@ extern int xfs_qm_quotacheck(xfs_mount_t *);
130extern int xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t); 109extern int xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t);
131 110
132/* dquot stuff */ 111/* dquot stuff */
133extern int xfs_qm_dqpurge_all(xfs_mount_t *, uint); 112extern void xfs_qm_dqpurge_all(xfs_mount_t *, uint);
134extern void xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint); 113extern void xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint);
135 114
136/* quota ops */ 115/* quota ops */
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index a0a829addca9..e6986b5d80d8 100644
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -40,28 +40,28 @@
40STATIC void 40STATIC void
41xfs_fill_statvfs_from_dquot( 41xfs_fill_statvfs_from_dquot(
42 struct kstatfs *statp, 42 struct kstatfs *statp,
43 xfs_disk_dquot_t *dp) 43 struct xfs_dquot *dqp)
44{ 44{
45 __uint64_t limit; 45 __uint64_t limit;
46 46
47 limit = dp->d_blk_softlimit ? 47 limit = dqp->q_core.d_blk_softlimit ?
48 be64_to_cpu(dp->d_blk_softlimit) : 48 be64_to_cpu(dqp->q_core.d_blk_softlimit) :
49 be64_to_cpu(dp->d_blk_hardlimit); 49 be64_to_cpu(dqp->q_core.d_blk_hardlimit);
50 if (limit && statp->f_blocks > limit) { 50 if (limit && statp->f_blocks > limit) {
51 statp->f_blocks = limit; 51 statp->f_blocks = limit;
52 statp->f_bfree = statp->f_bavail = 52 statp->f_bfree = statp->f_bavail =
53 (statp->f_blocks > be64_to_cpu(dp->d_bcount)) ? 53 (statp->f_blocks > dqp->q_res_bcount) ?
54 (statp->f_blocks - be64_to_cpu(dp->d_bcount)) : 0; 54 (statp->f_blocks - dqp->q_res_bcount) : 0;
55 } 55 }
56 56
57 limit = dp->d_ino_softlimit ? 57 limit = dqp->q_core.d_ino_softlimit ?
58 be64_to_cpu(dp->d_ino_softlimit) : 58 be64_to_cpu(dqp->q_core.d_ino_softlimit) :
59 be64_to_cpu(dp->d_ino_hardlimit); 59 be64_to_cpu(dqp->q_core.d_ino_hardlimit);
60 if (limit && statp->f_files > limit) { 60 if (limit && statp->f_files > limit) {
61 statp->f_files = limit; 61 statp->f_files = limit;
62 statp->f_ffree = 62 statp->f_ffree =
63 (statp->f_files > be64_to_cpu(dp->d_icount)) ? 63 (statp->f_files > dqp->q_res_icount) ?
64 (statp->f_ffree - be64_to_cpu(dp->d_icount)) : 0; 64 (statp->f_ffree - dqp->q_res_icount) : 0;
65 } 65 }
66} 66}
67 67
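
xfs_fill_statvfs_from_dquot() now clamps the statfs numbers against the in-core reservation counters (q_res_bcount, q_res_icount) rather than the on-disk d_bcount/d_icount, so blocks that are only reserved, not yet written, are already visible to df. A standalone model of the block clamping, with made-up numbers:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t f_blocks = 1000000, f_bfree;
        uint64_t soft = 5000, hard = 8000;      /* project quota limits */
        uint64_t res_bcount = 3200;             /* blocks reserved so far */
        uint64_t limit = soft ? soft : hard;    /* soft limit wins if set */

        if (limit && f_blocks > limit) {
                f_blocks = limit;
                f_bfree = f_blocks > res_bcount ? f_blocks - res_bcount : 0;
                printf("df sees: %llu blocks, %llu free\n",
                       (unsigned long long)f_blocks,
                       (unsigned long long)f_bfree);
        }
        return 0;
}
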
@@ -82,7 +82,7 @@ xfs_qm_statvfs(
82 xfs_dquot_t *dqp; 82 xfs_dquot_t *dqp;
83 83
84 if (!xfs_qm_dqget(mp, NULL, xfs_get_projid(ip), XFS_DQ_PROJ, 0, &dqp)) { 84 if (!xfs_qm_dqget(mp, NULL, xfs_get_projid(ip), XFS_DQ_PROJ, 0, &dqp)) {
85 xfs_fill_statvfs_from_dquot(statp, &dqp->q_core); 85 xfs_fill_statvfs_from_dquot(statp, dqp);
86 xfs_qm_dqput(dqp); 86 xfs_qm_dqput(dqp);
87 } 87 }
88} 88}
@@ -156,21 +156,3 @@ xfs_qm_newmount(
156 156
157 return 0; 157 return 0;
158} 158}
159
160void __init
161xfs_qm_init(void)
162{
163 printk(KERN_INFO "SGI XFS Quota Management subsystem\n");
164 mutex_init(&xfs_Gqm_lock);
165 xfs_qm_init_procfs();
166}
167
168void __exit
169xfs_qm_exit(void)
170{
171 xfs_qm_cleanup_procfs();
172 if (qm_dqzone)
173 kmem_zone_destroy(qm_dqzone);
174 if (qm_dqtrxzone)
175 kmem_zone_destroy(qm_dqtrxzone);
176}
diff --git a/fs/xfs/xfs_qm_stats.c b/fs/xfs/xfs_qm_stats.c
deleted file mode 100644
index 5729ba570877..000000000000
--- a/fs/xfs/xfs_qm_stats.c
+++ /dev/null
@@ -1,105 +0,0 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_bit.h"
21#include "xfs_log.h"
22#include "xfs_inum.h"
23#include "xfs_trans.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_alloc.h"
27#include "xfs_quota.h"
28#include "xfs_mount.h"
29#include "xfs_bmap_btree.h"
30#include "xfs_inode.h"
31#include "xfs_itable.h"
32#include "xfs_bmap.h"
33#include "xfs_rtalloc.h"
34#include "xfs_error.h"
35#include "xfs_attr.h"
36#include "xfs_buf_item.h"
37#include "xfs_qm.h"
38
39struct xqmstats xqmstats;
40
41static int xqm_proc_show(struct seq_file *m, void *v)
42{
43 /* maximum; incore; ratio free to inuse; freelist */
44 seq_printf(m, "%d\t%d\t%d\t%u\n",
45 0,
46 xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0,
47 0,
48 xfs_Gqm? xfs_Gqm->qm_dqfrlist_cnt : 0);
49 return 0;
50}
51
52static int xqm_proc_open(struct inode *inode, struct file *file)
53{
54 return single_open(file, xqm_proc_show, NULL);
55}
56
57static const struct file_operations xqm_proc_fops = {
58 .owner = THIS_MODULE,
59 .open = xqm_proc_open,
60 .read = seq_read,
61 .llseek = seq_lseek,
62 .release = single_release,
63};
64
65static int xqmstat_proc_show(struct seq_file *m, void *v)
66{
67 /* quota performance statistics */
68 seq_printf(m, "qm %u %u %u %u %u %u %u %u\n",
69 xqmstats.xs_qm_dqreclaims,
70 xqmstats.xs_qm_dqreclaim_misses,
71 xqmstats.xs_qm_dquot_dups,
72 xqmstats.xs_qm_dqcachemisses,
73 xqmstats.xs_qm_dqcachehits,
74 xqmstats.xs_qm_dqwants,
75 xqmstats.xs_qm_dqshake_reclaims,
76 xqmstats.xs_qm_dqinact_reclaims);
77 return 0;
78}
79
80static int xqmstat_proc_open(struct inode *inode, struct file *file)
81{
82 return single_open(file, xqmstat_proc_show, NULL);
83}
84
85static const struct file_operations xqmstat_proc_fops = {
86 .owner = THIS_MODULE,
87 .open = xqmstat_proc_open,
88 .read = seq_read,
89 .llseek = seq_lseek,
90 .release = single_release,
91};
92
93void
94xfs_qm_init_procfs(void)
95{
96 proc_create("fs/xfs/xqmstat", 0, NULL, &xqmstat_proc_fops);
97 proc_create("fs/xfs/xqm", 0, NULL, &xqm_proc_fops);
98}
99
100void
101xfs_qm_cleanup_procfs(void)
102{
103 remove_proc_entry("fs/xfs/xqm", NULL);
104 remove_proc_entry("fs/xfs/xqmstat", NULL);
105}
diff --git a/fs/xfs/xfs_qm_stats.h b/fs/xfs/xfs_qm_stats.h
deleted file mode 100644
index 5b964fc0dc09..000000000000
--- a/fs/xfs/xfs_qm_stats.h
+++ /dev/null
@@ -1,53 +0,0 @@
1/*
2 * Copyright (c) 2002 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_QM_STATS_H__
19#define __XFS_QM_STATS_H__
20
21#if defined(CONFIG_PROC_FS) && !defined(XFS_STATS_OFF)
22
23/*
24 * XQM global statistics
25 */
26struct xqmstats {
27 __uint32_t xs_qm_dqreclaims;
28 __uint32_t xs_qm_dqreclaim_misses;
29 __uint32_t xs_qm_dquot_dups;
30 __uint32_t xs_qm_dqcachemisses;
31 __uint32_t xs_qm_dqcachehits;
32 __uint32_t xs_qm_dqwants;
33 __uint32_t xs_qm_dqshake_reclaims;
34 __uint32_t xs_qm_dqinact_reclaims;
35};
36
37extern struct xqmstats xqmstats;
38
39# define XQM_STATS_INC(count) ( (count)++ )
40
41extern void xfs_qm_init_procfs(void);
42extern void xfs_qm_cleanup_procfs(void);
43
44#else
45
46# define XQM_STATS_INC(count) do { } while (0)
47
48static inline void xfs_qm_init_procfs(void) { };
49static inline void xfs_qm_cleanup_procfs(void) { };
50
51#endif
52
53#endif /* __XFS_QM_STATS_H__ */
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 711a86e39ff0..c4f396e437a8 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -47,9 +47,6 @@ STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *,
47 uint); 47 uint);
48STATIC uint xfs_qm_export_flags(uint); 48STATIC uint xfs_qm_export_flags(uint);
49STATIC uint xfs_qm_export_qtype_flags(uint); 49STATIC uint xfs_qm_export_qtype_flags(uint);
50STATIC void xfs_qm_export_dquot(xfs_mount_t *, xfs_disk_dquot_t *,
51 fs_disk_quota_t *);
52
53 50
54/* 51/*
55 * Turn off quota accounting and/or enforcement for all udquots and/or 52 * Turn off quota accounting and/or enforcement for all udquots and/or
@@ -69,7 +66,6 @@ xfs_qm_scall_quotaoff(
69 int error; 66 int error;
70 uint inactivate_flags; 67 uint inactivate_flags;
71 xfs_qoff_logitem_t *qoffstart; 68 xfs_qoff_logitem_t *qoffstart;
72 int nculprits;
73 69
74 /* 70 /*
75 * No file system can have quotas enabled on disk but not in core. 71 * No file system can have quotas enabled on disk but not in core.
@@ -175,18 +171,13 @@ xfs_qm_scall_quotaoff(
175 * This isn't protected by a particular lock directly, because we 171 * This isn't protected by a particular lock directly, because we
176 * don't want to take a mrlock every time we depend on quotas being on. 172 * don't want to take a mrlock every time we depend on quotas being on.
177 */ 173 */
178 mp->m_qflags &= ~(flags); 174 mp->m_qflags &= ~flags;
179 175
180 /* 176 /*
181 * Go through all the dquots of this file system and purge them, 177 * Go through all the dquots of this file system and purge them,
182 * according to what was turned off. We may not be able to get rid 178 * according to what was turned off.
183 * of all dquots, because dquots can have temporary references that
184 * are not attached to inodes. eg. xfs_setattr, xfs_create.
185 * So, if we couldn't purge all the dquots from the filesystem,
186 * we can't get rid of the incore data structures.
187 */ 179 */
188 while ((nculprits = xfs_qm_dqpurge_all(mp, dqtype))) 180 xfs_qm_dqpurge_all(mp, dqtype);
189 delay(10 * nculprits);
190 181
191 /* 182 /*
192 * Transactions that had started before ACTIVE state bit was cleared 183 * Transactions that had started before ACTIVE state bit was cleared
@@ -635,42 +626,6 @@ xfs_qm_scall_setqlim(
635 return error; 626 return error;
636} 627}
637 628
638int
639xfs_qm_scall_getquota(
640 xfs_mount_t *mp,
641 xfs_dqid_t id,
642 uint type,
643 fs_disk_quota_t *out)
644{
645 xfs_dquot_t *dqp;
646 int error;
647
648 /*
649 * Try to get the dquot. We don't want it allocated on disk, so
650 * we aren't passing the XFS_QMOPT_DOALLOC flag. If it doesn't
651 * exist, we'll get ENOENT back.
652 */
653 if ((error = xfs_qm_dqget(mp, NULL, id, type, 0, &dqp))) {
654 return (error);
655 }
656
657 /*
658 * If everything's NULL, this dquot doesn't quite exist as far as
659 * our utility programs are concerned.
660 */
661 if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) {
662 xfs_qm_dqput(dqp);
663 return XFS_ERROR(ENOENT);
664 }
665 /*
666 * Convert the disk dquot to the exportable format
667 */
668 xfs_qm_export_dquot(mp, &dqp->q_core, out);
669 xfs_qm_dqput(dqp);
670 return (error ? XFS_ERROR(EFAULT) : 0);
671}
672
673
674STATIC int 629STATIC int
675xfs_qm_log_quotaoff_end( 630xfs_qm_log_quotaoff_end(
676 xfs_mount_t *mp, 631 xfs_mount_t *mp,
@@ -759,50 +714,66 @@ error0:
759} 714}
760 715
761 716
762/* 717int
763 * Translate an internal style on-disk-dquot to the exportable format. 718xfs_qm_scall_getquota(
764 * The main differences are that the counters/limits are all in Basic 719 struct xfs_mount *mp,
765 * Blocks (BBs) instead of the internal FSBs, and all on-disk data has 720 xfs_dqid_t id,
766 * to be converted to the native endianness. 721 uint type,
767 */
768STATIC void
769xfs_qm_export_dquot(
770 xfs_mount_t *mp,
771 xfs_disk_dquot_t *src,
772 struct fs_disk_quota *dst) 722 struct fs_disk_quota *dst)
773{ 723{
724 struct xfs_dquot *dqp;
725 int error;
726
727 /*
728 * Try to get the dquot. We don't want it allocated on disk, so
729 * we aren't passing the XFS_QMOPT_DOALLOC flag. If it doesn't
730 * exist, we'll get ENOENT back.
731 */
732 error = xfs_qm_dqget(mp, NULL, id, type, 0, &dqp);
733 if (error)
734 return error;
735
736 /*
737 * If everything's NULL, this dquot doesn't quite exist as far as
738 * our utility programs are concerned.
739 */
740 if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) {
741 error = XFS_ERROR(ENOENT);
742 goto out_put;
743 }
744
774 memset(dst, 0, sizeof(*dst)); 745 memset(dst, 0, sizeof(*dst));
775 dst->d_version = FS_DQUOT_VERSION; /* different from src->d_version */ 746 dst->d_version = FS_DQUOT_VERSION;
776 dst->d_flags = xfs_qm_export_qtype_flags(src->d_flags); 747 dst->d_flags = xfs_qm_export_qtype_flags(dqp->q_core.d_flags);
777 dst->d_id = be32_to_cpu(src->d_id); 748 dst->d_id = be32_to_cpu(dqp->q_core.d_id);
778 dst->d_blk_hardlimit = 749 dst->d_blk_hardlimit =
779 XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_blk_hardlimit)); 750 XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_blk_hardlimit));
780 dst->d_blk_softlimit = 751 dst->d_blk_softlimit =
781 XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_blk_softlimit)); 752 XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_blk_softlimit));
782 dst->d_ino_hardlimit = be64_to_cpu(src->d_ino_hardlimit); 753 dst->d_ino_hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit);
783 dst->d_ino_softlimit = be64_to_cpu(src->d_ino_softlimit); 754 dst->d_ino_softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit);
784 dst->d_bcount = XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_bcount)); 755 dst->d_bcount = XFS_FSB_TO_BB(mp, dqp->q_res_bcount);
785 dst->d_icount = be64_to_cpu(src->d_icount); 756 dst->d_icount = dqp->q_res_icount;
786 dst->d_btimer = be32_to_cpu(src->d_btimer); 757 dst->d_btimer = be32_to_cpu(dqp->q_core.d_btimer);
787 dst->d_itimer = be32_to_cpu(src->d_itimer); 758 dst->d_itimer = be32_to_cpu(dqp->q_core.d_itimer);
788 dst->d_iwarns = be16_to_cpu(src->d_iwarns); 759 dst->d_iwarns = be16_to_cpu(dqp->q_core.d_iwarns);
789 dst->d_bwarns = be16_to_cpu(src->d_bwarns); 760 dst->d_bwarns = be16_to_cpu(dqp->q_core.d_bwarns);
790 dst->d_rtb_hardlimit = 761 dst->d_rtb_hardlimit =
791 XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_rtb_hardlimit)); 762 XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_rtb_hardlimit));
792 dst->d_rtb_softlimit = 763 dst->d_rtb_softlimit =
793 XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_rtb_softlimit)); 764 XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_rtb_softlimit));
794 dst->d_rtbcount = XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_rtbcount)); 765 dst->d_rtbcount = XFS_FSB_TO_BB(mp, dqp->q_res_rtbcount);
795 dst->d_rtbtimer = be32_to_cpu(src->d_rtbtimer); 766 dst->d_rtbtimer = be32_to_cpu(dqp->q_core.d_rtbtimer);
796 dst->d_rtbwarns = be16_to_cpu(src->d_rtbwarns); 767 dst->d_rtbwarns = be16_to_cpu(dqp->q_core.d_rtbwarns);
797 768
798 /* 769 /*
799 * Internally, we don't reset all the timers when quota enforcement 770 * Internally, we don't reset all the timers when quota enforcement
800 * gets turned off. No need to confuse the user level code, 771 * gets turned off. No need to confuse the user level code,
801 * so return zeroes in that case. 772 * so return zeroes in that case.
802 */ 773 */
803 if ((!XFS_IS_UQUOTA_ENFORCED(mp) && src->d_flags == XFS_DQ_USER) || 774 if ((!XFS_IS_UQUOTA_ENFORCED(mp) && dqp->q_core.d_flags == XFS_DQ_USER) ||
804 (!XFS_IS_OQUOTA_ENFORCED(mp) && 775 (!XFS_IS_OQUOTA_ENFORCED(mp) &&
805 (src->d_flags & (XFS_DQ_PROJ | XFS_DQ_GROUP)))) { 776 (dqp->q_core.d_flags & (XFS_DQ_PROJ | XFS_DQ_GROUP)))) {
806 dst->d_btimer = 0; 777 dst->d_btimer = 0;
807 dst->d_itimer = 0; 778 dst->d_itimer = 0;
808 dst->d_rtbtimer = 0; 779 dst->d_rtbtimer = 0;
@@ -823,6 +794,9 @@ xfs_qm_export_dquot(
823 } 794 }
824 } 795 }
825#endif 796#endif
797out_put:
798 xfs_qm_dqput(dqp);
799 return error;
826} 800}
827 801
828STATIC uint 802STATIC uint
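The rewritten xfs_qm_scall_getquota() above folds the old export helper into the lookup path and funnels every failure through a single out_put label, so the dquot reference is dropped exactly once on all paths. A minimal userspace sketch of that shape; dq_get, dq_put, and the struct are invented stand-ins, not kernel APIs, and errors are positive errno values in the XFS style.

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct dquot { int refcount; int uninitialized; };

    /* invented lookup: returns a positive errno on failure */
    static int dq_get(int id, struct dquot **out)
    {
        struct dquot *d = calloc(1, sizeof(*d));

        if (!d)
            return ENOMEM;
        d->refcount = 1;
        d->uninitialized = (id == 0);
        *out = d;
        return 0;
    }

    static void dq_put(struct dquot *d)
    {
        if (--d->refcount == 0)
            free(d);
    }

    static int getquota(int id, int *out)
    {
        struct dquot *dqp;
        int error = dq_get(id, &dqp);

        if (error)
            return error;        /* nothing acquired yet */

        if (dqp->uninitialized) {
            error = ENOENT;      /* fail, but still drop the reference */
            goto out_put;
        }
        *out = dqp->refcount;    /* stands in for the export step */
    out_put:
        dq_put(dqp);             /* single release point for all paths */
        return error;
    }

    int main(void)
    {
        int v = 0;

        printf("id 1 -> error %d\n", getquota(1, &v));
        printf("id 0 -> error %d\n", getquota(0, &v));
        return 0;
    }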
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 8a0807e0f979..b50ec5b95d5a 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -174,6 +174,8 @@ typedef struct xfs_qoff_logformat {
174#define XFS_UQUOTA_ACTIVE 0x0100 /* uquotas are being turned off */ 174#define XFS_UQUOTA_ACTIVE 0x0100 /* uquotas are being turned off */
175#define XFS_PQUOTA_ACTIVE 0x0200 /* pquotas are being turned off */ 175#define XFS_PQUOTA_ACTIVE 0x0200 /* pquotas are being turned off */
176#define XFS_GQUOTA_ACTIVE 0x0400 /* gquotas are being turned off */ 176#define XFS_GQUOTA_ACTIVE 0x0400 /* gquotas are being turned off */
177#define XFS_ALL_QUOTA_ACTIVE \
178 (XFS_UQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE)
177 179
178/* 180/*
179 * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees 181 * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees
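XFS_ALL_QUOTA_ACTIVE bundles the three per-type ACTIVE bits so callers can clear them with one mask, which is exactly what the xfs_super.c mount-option hunk later in this patch does. A toy illustration of the masking; the flag values are copied from the header above, everything else is invented.

    #include <stdio.h>

    #define XFS_UQUOTA_ACTIVE 0x0100
    #define XFS_PQUOTA_ACTIVE 0x0200
    #define XFS_GQUOTA_ACTIVE 0x0400
    #define XFS_ALL_QUOTA_ACTIVE \
        (XFS_UQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE)

    int main(void)
    {
        unsigned int qflags = XFS_UQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE | 0x0001;

        qflags &= ~XFS_ALL_QUOTA_ACTIVE;      /* clears all three at once */
        printf("qflags = 0x%04x\n", qflags);  /* only 0x0001 survives */
        return 0;
    }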
diff --git a/fs/xfs/xfs_quota_priv.h b/fs/xfs/xfs_quota_priv.h
index 94a3d927d716..6d86219d93da 100644
--- a/fs/xfs/xfs_quota_priv.h
+++ b/fs/xfs/xfs_quota_priv.h
@@ -24,17 +24,6 @@
24 */ 24 */
25#define XFS_DQITER_MAP_SIZE 10 25#define XFS_DQITER_MAP_SIZE 10
26 26
27/*
28 * Hash into a bucket in the dquot hash table, based on <mp, id>.
29 */
30#define XFS_DQ_HASHVAL(mp, id) (((__psunsigned_t)(mp) + \
31 (__psunsigned_t)(id)) & \
32 (xfs_Gqm->qm_dqhashmask - 1))
33#define XFS_DQ_HASH(mp, id, type) (type == XFS_DQ_USER ? \
34 (xfs_Gqm->qm_usr_dqhtable + \
35 XFS_DQ_HASHVAL(mp, id)) : \
36 (xfs_Gqm->qm_grp_dqhtable + \
37 XFS_DQ_HASHVAL(mp, id)))
38#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \ 27#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \
39 !dqp->q_core.d_blk_hardlimit && \ 28 !dqp->q_core.d_blk_hardlimit && \
40 !dqp->q_core.d_blk_softlimit && \ 29 !dqp->q_core.d_blk_softlimit && \
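The removed XFS_DQ_HASHVAL folded the mount pointer and the dquot id into a bucket index with a power-of-two mask; the global hash table it fed is what this series retires. For reference, a sketch of what the old macro computed, assuming a power-of-two table size; NBUCKETS and dq_hash are illustrative names.

    #include <stdint.h>
    #include <stdio.h>

    #define NBUCKETS 64u   /* must be a power of two for the mask to work */

    /* mount pointer plus id, masked down to a bucket, as the macro did */
    static unsigned int dq_hash(const void *mp, uint32_t id)
    {
        return (unsigned int)(((uintptr_t)mp + id) & (NBUCKETS - 1));
    }

    int main(void)
    {
        int mount_a, mount_b;   /* two distinct "mounts" */

        printf("a/id 7 -> bucket %u\n", dq_hash(&mount_a, 7));
        printf("b/id 7 -> bucket %u\n", dq_hash(&mount_b, 7));
        return 0;
    }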
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 87323f1ded64..ca4f31534a0a 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -183,6 +183,7 @@ error_cancel:
183 oblocks = map.br_startoff + map.br_blockcount; 183 oblocks = map.br_startoff + map.br_blockcount;
184 } 184 }
185 return 0; 185 return 0;
186
186error: 187error:
187 return error; 188 return error;
188} 189}
@@ -2139,11 +2140,9 @@ xfs_rtfree_extent(
2139 xfs_buf_t *sumbp; /* summary file block buffer */ 2140 xfs_buf_t *sumbp; /* summary file block buffer */
2140 2141
2141 mp = tp->t_mountp; 2142 mp = tp->t_mountp;
2142 /* 2143
2143 * Synchronize by locking the bitmap inode. 2144 ASSERT(mp->m_rbmip->i_itemp != NULL);
2144 */ 2145 ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
2145 xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
2146 xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
2147 2146
2148#if defined(__KERNEL__) && defined(DEBUG) 2147#if defined(__KERNEL__) && defined(DEBUG)
2149 /* 2148 /*
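The xfs_rtfree_extent() hunk above stops taking the bitmap inode lock itself and instead asserts that the caller already holds it, pushing the locking up one layer. A hedged sketch of that caller-holds-the-lock contract; the lock_held flag stands in for xfs_isilocked(), and all names are invented.

    #include <assert.h>
    #include <stdio.h>

    struct inode { int lock_held; };   /* stand-in for the real ilock */

    static void ilock(struct inode *ip)          { ip->lock_held = 1; }
    static void iunlock(struct inode *ip)        { ip->lock_held = 0; }
    static int  isilocked(const struct inode *ip){ return ip->lock_held; }

    /* the callee no longer locks; it asserts the caller's contract */
    static void rtfree_extent(struct inode *bitmap_ip)
    {
        assert(isilocked(bitmap_ip));   /* like ASSERT(xfs_isilocked(...)) */
        puts("freeing extent under the caller-held bitmap lock");
    }

    int main(void)
    {
        struct inode rbmip = { 0 };

        ilock(&rbmip);          /* transaction setup now does this */
        rtfree_extent(&rbmip);
        iunlock(&rbmip);
        return 0;
    }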
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index cb6ae715814a..f429d9d5d325 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -529,7 +529,6 @@ static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp)
529#define XFS_BB_TO_FSB(mp,bb) \ 529#define XFS_BB_TO_FSB(mp,bb) \
530 (((bb) + (XFS_FSB_TO_BB(mp,1) - 1)) >> (mp)->m_blkbb_log) 530 (((bb) + (XFS_FSB_TO_BB(mp,1) - 1)) >> (mp)->m_blkbb_log)
531#define XFS_BB_TO_FSBT(mp,bb) ((bb) >> (mp)->m_blkbb_log) 531#define XFS_BB_TO_FSBT(mp,bb) ((bb) >> (mp)->m_blkbb_log)
532#define XFS_BB_FSB_OFFSET(mp,bb) ((bb) & ((mp)->m_bsize - 1))
533 532
534/* 533/*
535 * File system block to byte conversions. 534 * File system block to byte conversions.
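Quota counters are exported in 512-byte basic blocks while XFS accounts internally in filesystem blocks, so the BB/FSB conversions are plain shifts by m_blkbb_log; the dropped XFS_BB_FSB_OFFSET was the now-unused companion mask. A self-contained sketch of the two directions, with an invented struct and field name.

    #include <stdint.h>
    #include <stdio.h>

    struct mount { int blkbb_log; };   /* log2(fs block size / 512) */

    static uint64_t fsb_to_bb(const struct mount *mp, uint64_t fsb)
    {
        return fsb << mp->blkbb_log;   /* fs blocks -> basic blocks */
    }

    static uint64_t bb_to_fsb(const struct mount *mp, uint64_t bb)
    {
        /* round up: a partial block still occupies a whole fs block */
        return (bb + (1ull << mp->blkbb_log) - 1) >> mp->blkbb_log;
    }

    int main(void)
    {
        struct mount mp = { 3 };   /* 4096-byte blocks: 4096 / 512 = 8 */

        printf("10 fsb -> %llu bb\n",  (unsigned long long)fsb_to_bb(&mp, 10));
        printf("9 bb   -> %llu fsb\n", (unsigned long long)bb_to_fsb(&mp, 9));
        return 0;
    }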
diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
index 76fdc5861932..ce372b7d5644 100644
--- a/fs/xfs/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
@@ -20,9 +20,18 @@
20 20
21DEFINE_PER_CPU(struct xfsstats, xfsstats); 21DEFINE_PER_CPU(struct xfsstats, xfsstats);
22 22
23static int counter_val(int idx)
24{
25 int val = 0, cpu;
26
27 for_each_possible_cpu(cpu)
28 val += *(((__u32 *)&per_cpu(xfsstats, cpu) + idx));
29 return val;
30}
31
23static int xfs_stat_proc_show(struct seq_file *m, void *v) 32static int xfs_stat_proc_show(struct seq_file *m, void *v)
24{ 33{
25 int c, i, j, val; 34 int i, j;
26 __uint64_t xs_xstrat_bytes = 0; 35 __uint64_t xs_xstrat_bytes = 0;
27 __uint64_t xs_write_bytes = 0; 36 __uint64_t xs_write_bytes = 0;
28 __uint64_t xs_read_bytes = 0; 37 __uint64_t xs_read_bytes = 0;
@@ -50,20 +59,16 @@ static int xfs_stat_proc_show(struct seq_file *m, void *v)
50 { "abtc2", XFSSTAT_END_ABTC_V2 }, 59 { "abtc2", XFSSTAT_END_ABTC_V2 },
51 { "bmbt2", XFSSTAT_END_BMBT_V2 }, 60 { "bmbt2", XFSSTAT_END_BMBT_V2 },
52 { "ibt2", XFSSTAT_END_IBT_V2 }, 61 { "ibt2", XFSSTAT_END_IBT_V2 },
62 /* we print both series of quota information together */
63 { "qm", XFSSTAT_END_QM },
53 }; 64 };
54 65
55 /* Loop over all stats groups */ 66 /* Loop over all stats groups */
56 for (i=j = 0; i < ARRAY_SIZE(xstats); i++) { 67 for (i = j = 0; i < ARRAY_SIZE(xstats); i++) {
57 seq_printf(m, "%s", xstats[i].desc); 68 seq_printf(m, "%s", xstats[i].desc);
58 /* inner loop does each group */ 69 /* inner loop does each group */
59 while (j < xstats[i].endpoint) { 70 for (; j < xstats[i].endpoint; j++)
60 val = 0; 71 seq_printf(m, " %u", counter_val(j));
61 /* sum over all cpus */
62 for_each_possible_cpu(c)
63 val += *(((__u32*)&per_cpu(xfsstats, c) + j));
64 seq_printf(m, " %u", val);
65 j++;
66 }
67 seq_putc(m, '\n'); 72 seq_putc(m, '\n');
68 } 73 }
69 /* extra precision counters */ 74 /* extra precision counters */
@@ -97,6 +102,58 @@ static const struct file_operations xfs_stat_proc_fops = {
97 .release = single_release, 102 .release = single_release,
98}; 103};
99 104
105/* legacy quota interfaces */
106#ifdef CONFIG_XFS_QUOTA
107static int xqm_proc_show(struct seq_file *m, void *v)
108{
109 /* maximum; incore; ratio free to inuse; freelist */
110 seq_printf(m, "%d\t%d\t%d\t%u\n",
111 0,
112 counter_val(XFSSTAT_END_XQMSTAT),
113 0,
114 counter_val(XFSSTAT_END_XQMSTAT + 1));
115 return 0;
116}
117
118static int xqm_proc_open(struct inode *inode, struct file *file)
119{
120 return single_open(file, xqm_proc_show, NULL);
121}
122
123static const struct file_operations xqm_proc_fops = {
124 .owner = THIS_MODULE,
125 .open = xqm_proc_open,
126 .read = seq_read,
127 .llseek = seq_lseek,
128 .release = single_release,
129};
130
131/* legacy quota stats interface no 2 */
132static int xqmstat_proc_show(struct seq_file *m, void *v)
133{
134 int j;
135
136 seq_printf(m, "qm");
137 for (j = XFSSTAT_END_IBT_V2; j < XFSSTAT_END_XQMSTAT; j++)
138 seq_printf(m, " %u", counter_val(j));
139 seq_putc(m, '\n');
140 return 0;
141}
142
143static int xqmstat_proc_open(struct inode *inode, struct file *file)
144{
145 return single_open(file, xqmstat_proc_show, NULL);
146}
147
148static const struct file_operations xqmstat_proc_fops = {
149 .owner = THIS_MODULE,
150 .open = xqmstat_proc_open,
151 .read = seq_read,
152 .llseek = seq_lseek,
153 .release = single_release,
154};
155#endif /* CONFIG_XFS_QUOTA */
156
100int 157int
101xfs_init_procfs(void) 158xfs_init_procfs(void)
102{ 159{
@@ -105,10 +162,24 @@ xfs_init_procfs(void)
105 162
106 if (!proc_create("fs/xfs/stat", 0, NULL, 163 if (!proc_create("fs/xfs/stat", 0, NULL,
107 &xfs_stat_proc_fops)) 164 &xfs_stat_proc_fops))
108 goto out_remove_entry; 165 goto out_remove_xfs_dir;
166#ifdef CONFIG_XFS_QUOTA
167 if (!proc_create("fs/xfs/xqmstat", 0, NULL,
168 &xqmstat_proc_fops))
169 goto out_remove_stat_file;
170 if (!proc_create("fs/xfs/xqm", 0, NULL,
171 &xqm_proc_fops))
172 goto out_remove_xqmstat_file;
173#endif
109 return 0; 174 return 0;
110 175
111 out_remove_entry: 176#ifdef CONFIG_XFS_QUOTA
177 out_remove_xqmstat_file:
178 remove_proc_entry("fs/xfs/xqmstat", NULL);
179 out_remove_stat_file:
180 remove_proc_entry("fs/xfs/stat", NULL);
181#endif
182 out_remove_xfs_dir:
112 remove_proc_entry("fs/xfs", NULL); 183 remove_proc_entry("fs/xfs", NULL);
113 out: 184 out:
114 return -ENOMEM; 185 return -ENOMEM;
@@ -117,6 +188,10 @@ xfs_init_procfs(void)
117void 188void
118xfs_cleanup_procfs(void) 189xfs_cleanup_procfs(void)
119{ 190{
191#ifdef CONFIG_XFS_QUOTA
192 remove_proc_entry("fs/xfs/xqm", NULL);
193 remove_proc_entry("fs/xfs/xqmstat", NULL);
194#endif
120 remove_proc_entry("fs/xfs/stat", NULL); 195 remove_proc_entry("fs/xfs/stat", NULL);
121 remove_proc_entry("fs/xfs", NULL); 196 remove_proc_entry("fs/xfs", NULL);
122} 197}
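Hoisting the per-CPU summation into counter_val() is what lets the new xqm and xqmstat handlers above reuse it instead of open-coding the loop. A userspace analogue, where plain arrays stand in for the per_cpu(xfsstats, cpu) instances; the sizes are invented.

    #include <stdio.h>

    #define NCPUS     4
    #define NCOUNTERS 8

    /* one flat counter array per cpu, like struct xfsstats per cpu */
    static unsigned int percpu_stats[NCPUS][NCOUNTERS];

    static unsigned int counter_val(int idx)
    {
        unsigned int val = 0;
        int cpu;

        for (cpu = 0; cpu < NCPUS; cpu++)
            val += percpu_stats[cpu][idx];
        return val;
    }

    int main(void)
    {
        percpu_stats[0][3] = 5;
        percpu_stats[2][3] = 7;
        printf("counter 3 totals %u\n", counter_val(3));   /* prints 12 */
        return 0;
    }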
diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h
index 736854b1ca1a..c03ad38ceaeb 100644
--- a/fs/xfs/xfs_stats.h
+++ b/fs/xfs/xfs_stats.h
@@ -183,6 +183,16 @@ struct xfsstats {
183 __uint32_t xs_ibt_2_alloc; 183 __uint32_t xs_ibt_2_alloc;
184 __uint32_t xs_ibt_2_free; 184 __uint32_t xs_ibt_2_free;
185 __uint32_t xs_ibt_2_moves; 185 __uint32_t xs_ibt_2_moves;
186#define XFSSTAT_END_XQMSTAT (XFSSTAT_END_IBT_V2+6)
187 __uint32_t xs_qm_dqreclaims;
188 __uint32_t xs_qm_dqreclaim_misses;
189 __uint32_t xs_qm_dquot_dups;
190 __uint32_t xs_qm_dqcachemisses;
191 __uint32_t xs_qm_dqcachehits;
192 __uint32_t xs_qm_dqwants;
193#define XFSSTAT_END_QM (XFSSTAT_END_XQMSTAT+2)
194 __uint32_t xs_qm_dquot;
195 __uint32_t xs_qm_dquot_unused;
186/* Extra precision counters */ 196/* Extra precision counters */
187 __uint64_t xs_xstrat_bytes; 197 __uint64_t xs_xstrat_bytes;
188 __uint64_t xs_write_bytes; 198 __uint64_t xs_write_bytes;
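The XFSSTAT_END_* markers give each counter group a numeric endpoint computed from the previous one, so the printing loop can walk the flat __uint32_t array group by group without naming any field; the new quota counters simply extend the array and add two markers. A compilable sketch of that indexing scheme, with invented group names and sizes.

    #include <stdio.h>

    /* each marker is the previous one plus the fields in the group */
    #define END_EXTENT_ALLOC 4
    #define END_ABT          (END_EXTENT_ALLOC + 2)
    #define END_QM           (END_ABT + 3)

    static unsigned int stats[END_QM];

    int main(void)
    {
        static const struct { const char *name; int end; } groups[] = {
            { "extent_alloc", END_EXTENT_ALLOC },
            { "abt",          END_ABT },
            { "qm",           END_QM },
        };
        size_t i;
        int j = 0;

        stats[END_ABT] = 42;   /* first counter of the "qm" group */

        for (i = 0; i < sizeof(groups) / sizeof(groups[0]); i++) {
            printf("%s", groups[i].name);
            for (; j < groups[i].end; j++)
                printf(" %u", stats[j]);
            putchar('\n');
        }
        return 0;
    }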
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index baf40e378d35..dab9a5f6dfd6 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -324,10 +324,9 @@ xfs_parseargs(
324 } else if (!strcmp(this_char, MNTOPT_FILESTREAM)) { 324 } else if (!strcmp(this_char, MNTOPT_FILESTREAM)) {
325 mp->m_flags |= XFS_MOUNT_FILESTREAMS; 325 mp->m_flags |= XFS_MOUNT_FILESTREAMS;
326 } else if (!strcmp(this_char, MNTOPT_NOQUOTA)) { 326 } else if (!strcmp(this_char, MNTOPT_NOQUOTA)) {
327 mp->m_qflags &= ~(XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE | 327 mp->m_qflags &= ~XFS_ALL_QUOTA_ACCT;
328 XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE | 328 mp->m_qflags &= ~XFS_ALL_QUOTA_ENFD;
329 XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE | 329 mp->m_qflags &= ~XFS_ALL_QUOTA_ACTIVE;
330 XFS_UQUOTA_ENFD | XFS_OQUOTA_ENFD);
331 } else if (!strcmp(this_char, MNTOPT_QUOTA) || 330 } else if (!strcmp(this_char, MNTOPT_QUOTA) ||
332 !strcmp(this_char, MNTOPT_UQUOTA) || 331 !strcmp(this_char, MNTOPT_UQUOTA) ||
333 !strcmp(this_char, MNTOPT_USRQUOTA)) { 332 !strcmp(this_char, MNTOPT_USRQUOTA)) {
@@ -760,6 +759,36 @@ xfs_setup_devices(
760 return 0; 759 return 0;
761} 760}
762 761
762STATIC int
763xfs_init_mount_workqueues(
764 struct xfs_mount *mp)
765{
766 mp->m_data_workqueue = alloc_workqueue("xfs-data/%s",
767 WQ_MEM_RECLAIM, 0, mp->m_fsname);
768 if (!mp->m_data_workqueue)
769 goto out;
770
771 mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s",
772 WQ_MEM_RECLAIM, 0, mp->m_fsname);
773 if (!mp->m_unwritten_workqueue)
774 goto out_destroy_data_iodone_queue;
775
776 return 0;
777
778out_destroy_data_iodone_queue:
779 destroy_workqueue(mp->m_data_workqueue);
780out:
781 return -ENOMEM;
782}
783
784STATIC void
785xfs_destroy_mount_workqueues(
786 struct xfs_mount *mp)
787{
788 destroy_workqueue(mp->m_data_workqueue);
789 destroy_workqueue(mp->m_unwritten_workqueue);
790}
791
763/* Catch misguided souls that try to use this interface on XFS */ 792/* Catch misguided souls that try to use this interface on XFS */
764STATIC struct inode * 793STATIC struct inode *
765xfs_fs_alloc_inode( 794xfs_fs_alloc_inode(
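xfs_init_mount_workqueues() above uses the usual goto ladder: allocate in order, unwind on failure only what already exists, and pair the whole thing with a destroy function for the teardown path. A userspace analogue of that shape; wq_alloc and wq_free are stand-ins for alloc_workqueue() and destroy_workqueue(), not real APIs.

    #include <stdio.h>
    #include <stdlib.h>

    struct mount { void *data_wq; void *unwritten_wq; };

    static void *wq_alloc(const char *name)
    {
        printf("alloc %s\n", name);
        return malloc(1);
    }
    static void wq_free(void *wq) { free(wq); }

    static int init_mount_workqueues(struct mount *mp)
    {
        mp->data_wq = wq_alloc("xfs-data");
        if (!mp->data_wq)
            goto out;

        mp->unwritten_wq = wq_alloc("xfs-conv");
        if (!mp->unwritten_wq)
            goto out_destroy_data;

        return 0;

    out_destroy_data:
        wq_free(mp->data_wq);   /* unwind only what was created */
    out:
        return -1;              /* stands in for -ENOMEM */
    }

    static void destroy_mount_workqueues(struct mount *mp)
    {
        wq_free(mp->data_wq);
        wq_free(mp->unwritten_wq);
    }

    int main(void)
    {
        struct mount mp;

        if (!init_mount_workqueues(&mp))
            destroy_mount_workqueues(&mp);
        return 0;
    }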
@@ -834,91 +863,58 @@ xfs_fs_inode_init_once(
834} 863}
835 864
836/* 865/*
837 * Dirty the XFS inode when mark_inode_dirty_sync() is called so that 866 * This is called by the VFS when dirtying inode metadata. This can happen
838 * we catch unlogged VFS level updates to the inode. 867 * for a few reasons, but we only care about timestamp updates, given that
868 * we handled the rest ourselves. In theory no other calls should happen,
869 * but for example generic_write_end() keeps dirtying the inode after
870 * updating i_size. Thus we check that the flags are exactly I_DIRTY_SYNC,
871 * and skip this call otherwise.
839 * 872 *
840 * We need the barrier() to maintain correct ordering between unlogged 873 * We'll hopefully get a different method just for updating timestamps soon,
841 * updates and the transaction commit code that clears the i_update_core 874 * at which point this hack can go away, and maybe we'll also get real
842 * field. This requires all updates to be completed before marking the 875 * error handling here.
843 * inode dirty.
844 */ 876 */
845STATIC void 877STATIC void
846xfs_fs_dirty_inode( 878xfs_fs_dirty_inode(
847 struct inode *inode,
848 int flags)
849{
850 barrier();
851 XFS_I(inode)->i_update_core = 1;
852}
853
854STATIC int
855xfs_fs_write_inode(
856 struct inode *inode, 879 struct inode *inode,
857 struct writeback_control *wbc) 880 int flags)
858{ 881{
859 struct xfs_inode *ip = XFS_I(inode); 882 struct xfs_inode *ip = XFS_I(inode);
860 struct xfs_mount *mp = ip->i_mount; 883 struct xfs_mount *mp = ip->i_mount;
861 int error = EAGAIN; 884 struct xfs_trans *tp;
862 885 int error;
863 trace_xfs_write_inode(ip);
864
865 if (XFS_FORCED_SHUTDOWN(mp))
866 return -XFS_ERROR(EIO);
867 886
868 if (wbc->sync_mode == WB_SYNC_ALL || wbc->for_kupdate) { 887 if (flags != I_DIRTY_SYNC)
869 /* 888 return;
870 * Make sure the inode has made it it into the log. Instead
871 * of forcing it all the way to stable storage using a
872 * synchronous transaction we let the log force inside the
873 * ->sync_fs call do that for thus, which reduces the number
874 * of synchronous log forces dramatically.
875 */
876 error = xfs_log_dirty_inode(ip, NULL, 0);
877 if (error)
878 goto out;
879 return 0;
880 } else {
881 if (!ip->i_update_core)
882 return 0;
883 889
884 /* 890 trace_xfs_dirty_inode(ip);
885 * We make this non-blocking if the inode is contended, return
886 * EAGAIN to indicate to the caller that they did not succeed.
887 * This prevents the flush path from blocking on inodes inside
888 * another operation right now, they get caught later by
889 * xfs_sync.
890 */
891 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
892 goto out;
893 891
894 if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) 892 tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
895 goto out_unlock; 893 error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
896 894 if (error) {
897 /* 895 xfs_trans_cancel(tp, 0);
898 * Now we have the flush lock and the inode is not pinned, we 896 goto trouble;
899 * can check if the inode is really clean as we know that
900 * there are no pending transaction completions, it is not
901 * waiting on the delayed write queue and there is no IO in
902 * progress.
903 */
904 if (xfs_inode_clean(ip)) {
905 xfs_ifunlock(ip);
906 error = 0;
907 goto out_unlock;
908 }
909 error = xfs_iflush(ip, SYNC_TRYLOCK);
910 } 897 }
911 898 xfs_ilock(ip, XFS_ILOCK_EXCL);
912 out_unlock:
913 xfs_iunlock(ip, XFS_ILOCK_SHARED);
914 out:
915 /* 899 /*
916 * if we failed to write out the inode then mark 900 * Grab all the latest timestamps from the Linux inode.
917 * it dirty again so we'll try again later.
918 */ 901 */
902 ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec;
903 ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec;
904 ip->i_d.di_ctime.t_sec = (__int32_t)inode->i_ctime.tv_sec;
905 ip->i_d.di_ctime.t_nsec = (__int32_t)inode->i_ctime.tv_nsec;
906 ip->i_d.di_mtime.t_sec = (__int32_t)inode->i_mtime.tv_sec;
907 ip->i_d.di_mtime.t_nsec = (__int32_t)inode->i_mtime.tv_nsec;
908
909 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
910 xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP);
911 error = xfs_trans_commit(tp, 0);
919 if (error) 912 if (error)
920 xfs_mark_inode_dirty_sync(ip); 913 goto trouble;
921 return -error; 914 return;
915
916trouble:
917 xfs_warn(mp, "failed to update timestamps for inode 0x%llx", ip->i_ino);
922} 918}
923 919
924STATIC void 920STATIC void
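The new dirty_inode path above is a tiny transaction: allocate, reserve, cancel on reservation failure, otherwise log the timestamps and commit, warning if either step fails. A sketch of just that control flow; every type and helper here is invented, and the reservation outcome is passed in rather than taken from a real log.

    #include <stdio.h>

    struct trans { int reserved; };

    static int trans_reserve(struct trans *tp, int have_space)
    {
        tp->reserved = have_space;
        return have_space ? 0 : -1;
    }
    static void trans_cancel(struct trans *tp) { (void)tp; }
    static int  trans_commit(struct trans *tp) { (void)tp; return 0; }

    /* a failed reservation is cancelled; a reserved transaction is the
     * only thing that may be committed; both failures only warn */
    static void log_timestamps(int have_space)
    {
        struct trans tp = { 0 };

        if (trans_reserve(&tp, have_space)) {
            trans_cancel(&tp);
            goto trouble;
        }
        /* ... copy i_atime/i_ctime/i_mtime into the on-disk inode ... */
        if (trans_commit(&tp))
            goto trouble;
        puts("timestamps logged");
        return;
    trouble:
        fprintf(stderr, "failed to update timestamps\n");
    }

    int main(void)
    {
        log_timestamps(1);
        log_timestamps(0);
        return 0;
    }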
@@ -954,6 +950,22 @@ xfs_fs_evict_inode(
954 xfs_inactive(ip); 950 xfs_inactive(ip);
955} 951}
956 952
953/*
954 * We do an unlocked check for XFS_IDONTCACHE here because we are already
955 * serialised against cache hits here via the inode->i_lock and igrab() in
956 * xfs_iget_cache_hit(). Hence a lookup that might clear this flag will not be
957 * racing with us, and it avoids needing to grab a spinlock here for every inode
958 * we drop the final reference on.
959 */
960STATIC int
961xfs_fs_drop_inode(
962 struct inode *inode)
963{
964 struct xfs_inode *ip = XFS_I(inode);
965
966 return generic_drop_inode(inode) || (ip->i_flags & XFS_IDONTCACHE);
967}
968
957STATIC void 969STATIC void
958xfs_free_fsname( 970xfs_free_fsname(
959 struct xfs_mount *mp) 971 struct xfs_mount *mp)
@@ -983,6 +995,7 @@ xfs_fs_put_super(
983 xfs_unmountfs(mp); 995 xfs_unmountfs(mp);
984 xfs_freesb(mp); 996 xfs_freesb(mp);
985 xfs_icsb_destroy_counters(mp); 997 xfs_icsb_destroy_counters(mp);
998 xfs_destroy_mount_workqueues(mp);
986 xfs_close_devices(mp); 999 xfs_close_devices(mp);
987 xfs_free_fsname(mp); 1000 xfs_free_fsname(mp);
988 kfree(mp); 1001 kfree(mp);
@@ -1309,10 +1322,14 @@ xfs_fs_fill_super(
1309 if (error) 1322 if (error)
1310 goto out_free_fsname; 1323 goto out_free_fsname;
1311 1324
1312 error = xfs_icsb_init_counters(mp); 1325 error = xfs_init_mount_workqueues(mp);
1313 if (error) 1326 if (error)
1314 goto out_close_devices; 1327 goto out_close_devices;
1315 1328
1329 error = xfs_icsb_init_counters(mp);
1330 if (error)
1331 goto out_destroy_workqueues;
1332
1316 error = xfs_readsb(mp, flags); 1333 error = xfs_readsb(mp, flags);
1317 if (error) 1334 if (error)
1318 goto out_destroy_counters; 1335 goto out_destroy_counters;
@@ -1376,6 +1393,8 @@ xfs_fs_fill_super(
1376 xfs_freesb(mp); 1393 xfs_freesb(mp);
1377 out_destroy_counters: 1394 out_destroy_counters:
1378 xfs_icsb_destroy_counters(mp); 1395 xfs_icsb_destroy_counters(mp);
1396out_destroy_workqueues:
1397 xfs_destroy_mount_workqueues(mp);
1379 out_close_devices: 1398 out_close_devices:
1380 xfs_close_devices(mp); 1399 xfs_close_devices(mp);
1381 out_free_fsname: 1400 out_free_fsname:
@@ -1429,8 +1448,8 @@ static const struct super_operations xfs_super_operations = {
1429 .alloc_inode = xfs_fs_alloc_inode, 1448 .alloc_inode = xfs_fs_alloc_inode,
1430 .destroy_inode = xfs_fs_destroy_inode, 1449 .destroy_inode = xfs_fs_destroy_inode,
1431 .dirty_inode = xfs_fs_dirty_inode, 1450 .dirty_inode = xfs_fs_dirty_inode,
1432 .write_inode = xfs_fs_write_inode,
1433 .evict_inode = xfs_fs_evict_inode, 1451 .evict_inode = xfs_fs_evict_inode,
1452 .drop_inode = xfs_fs_drop_inode,
1434 .put_super = xfs_fs_put_super, 1453 .put_super = xfs_fs_put_super,
1435 .sync_fs = xfs_fs_sync_fs, 1454 .sync_fs = xfs_fs_sync_fs,
1436 .freeze_fs = xfs_fs_freeze, 1455 .freeze_fs = xfs_fs_freeze,
@@ -1604,12 +1623,28 @@ xfs_init_workqueues(void)
1604 xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_NON_REENTRANT, 0); 1623 xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_NON_REENTRANT, 0);
1605 if (!xfs_syncd_wq) 1624 if (!xfs_syncd_wq)
1606 return -ENOMEM; 1625 return -ENOMEM;
1626
1627 /*
1628 * The allocation workqueue can be used in memory reclaim situations
1629 * (writepage path), and parallelism is only limited by the number of
1630 * AGs in all the filesystems mounted. Hence use the default large
1631 * max_active value for this workqueue.
1632 */
1633 xfs_alloc_wq = alloc_workqueue("xfsalloc", WQ_MEM_RECLAIM, 0);
1634 if (!xfs_alloc_wq)
1635 goto out_destroy_syncd;
1636
1607 return 0; 1637 return 0;
1638
1639out_destroy_syncd:
1640 destroy_workqueue(xfs_syncd_wq);
1641 return -ENOMEM;
1608} 1642}
1609 1643
1610STATIC void 1644STATIC void
1611xfs_destroy_workqueues(void) 1645xfs_destroy_workqueues(void)
1612{ 1646{
1647 destroy_workqueue(xfs_alloc_wq);
1613 destroy_workqueue(xfs_syncd_wq); 1648 destroy_workqueue(xfs_syncd_wq);
1614} 1649}
1615 1650
@@ -1651,13 +1686,17 @@ init_xfs_fs(void)
1651 if (error) 1686 if (error)
1652 goto out_cleanup_procfs; 1687 goto out_cleanup_procfs;
1653 1688
1654 vfs_initquota(); 1689 error = xfs_qm_init();
1690 if (error)
1691 goto out_sysctl_unregister;
1655 1692
1656 error = register_filesystem(&xfs_fs_type); 1693 error = register_filesystem(&xfs_fs_type);
1657 if (error) 1694 if (error)
1658 goto out_sysctl_unregister; 1695 goto out_qm_exit;
1659 return 0; 1696 return 0;
1660 1697
1698 out_qm_exit:
1699 xfs_qm_exit();
1661 out_sysctl_unregister: 1700 out_sysctl_unregister:
1662 xfs_sysctl_unregister(); 1701 xfs_sysctl_unregister();
1663 out_cleanup_procfs: 1702 out_cleanup_procfs:
@@ -1679,7 +1718,7 @@ init_xfs_fs(void)
1679STATIC void __exit 1718STATIC void __exit
1680exit_xfs_fs(void) 1719exit_xfs_fs(void)
1681{ 1720{
1682 vfs_exitquota(); 1721 xfs_qm_exit();
1683 unregister_filesystem(&xfs_fs_type); 1722 unregister_filesystem(&xfs_fs_type);
1684 xfs_sysctl_unregister(); 1723 xfs_sysctl_unregister();
1685 xfs_cleanup_procfs(); 1724 xfs_cleanup_procfs();
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index 50a3266c999e..09b0c26b2245 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -21,13 +21,11 @@
21#include <linux/exportfs.h> 21#include <linux/exportfs.h>
22 22
23#ifdef CONFIG_XFS_QUOTA 23#ifdef CONFIG_XFS_QUOTA
24extern void xfs_qm_init(void); 24extern int xfs_qm_init(void);
25extern void xfs_qm_exit(void); 25extern void xfs_qm_exit(void);
26# define vfs_initquota() xfs_qm_init()
27# define vfs_exitquota() xfs_qm_exit()
28#else 26#else
29# define vfs_initquota() do { } while (0) 27# define xfs_qm_init() (0)
30# define vfs_exitquota() do { } while (0) 28# define xfs_qm_exit() do { } while (0)
31#endif 29#endif
32 30
33#ifdef CONFIG_XFS_POSIX_ACL 31#ifdef CONFIG_XFS_POSIX_ACL
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index 40b75eecd2b4..205ebcb34d9e 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -336,32 +336,6 @@ xfs_sync_fsdata(
336 return error; 336 return error;
337} 337}
338 338
339int
340xfs_log_dirty_inode(
341 struct xfs_inode *ip,
342 struct xfs_perag *pag,
343 int flags)
344{
345 struct xfs_mount *mp = ip->i_mount;
346 struct xfs_trans *tp;
347 int error;
348
349 if (!ip->i_update_core)
350 return 0;
351
352 tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
353 error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
354 if (error) {
355 xfs_trans_cancel(tp, 0);
356 return error;
357 }
358
359 xfs_ilock(ip, XFS_ILOCK_EXCL);
360 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
361 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
362 return xfs_trans_commit(tp, 0);
363}
364
365/* 339/*
366 * When remounting a filesystem read-only or freezing the filesystem, we have 340 * When remounting a filesystem read-only or freezing the filesystem, we have
367 * two phases to execute. This first phase is syncing the data before we 341 * two phases to execute. This first phase is syncing the data before we
@@ -385,16 +359,6 @@ xfs_quiesce_data(
385{ 359{
386 int error, error2 = 0; 360 int error, error2 = 0;
387 361
388 /*
389 * Log all pending size and timestamp updates. The vfs writeback
390 * code is supposed to do this, but due to its overagressive
391 * livelock detection it will skip inodes where appending writes
392 * were written out in the first non-blocking sync phase if their
393 * completion took long enough that it happened after taking the
394 * timestamp for the cut-off in the blocking phase.
395 */
396 xfs_inode_ag_iterator(mp, xfs_log_dirty_inode, 0);
397
398 /* force out the log */ 362 /* force out the log */
399 xfs_log_force(mp, XFS_LOG_SYNC); 363 xfs_log_force(mp, XFS_LOG_SYNC);
400 364
@@ -913,17 +877,15 @@ reclaim:
913 * can reference the inodes in the cache without taking references. 877 * can reference the inodes in the cache without taking references.
914 * 878 *
915 * We make that OK here by ensuring that we wait until the inode is 879 * We make that OK here by ensuring that we wait until the inode is
916 * unlocked after the lookup before we go ahead and free it. We get 880 * unlocked after the lookup before we go ahead and free it.
917 * both the ilock and the iolock because the code may need to drop the
918 * ilock one but will still hold the iolock.
919 */ 881 */
920 xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); 882 xfs_ilock(ip, XFS_ILOCK_EXCL);
921 xfs_qm_dqdetach(ip); 883 xfs_qm_dqdetach(ip);
922 xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); 884 xfs_iunlock(ip, XFS_ILOCK_EXCL);
923 885
924 xfs_inode_free(ip); 886 xfs_inode_free(ip);
925 return error;
926 887
888 return error;
927} 889}
928 890
929/* 891/*
diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h
index fa965479d788..941202e7ac6e 100644
--- a/fs/xfs/xfs_sync.h
+++ b/fs/xfs/xfs_sync.h
@@ -34,8 +34,6 @@ void xfs_quiesce_attr(struct xfs_mount *mp);
34 34
35void xfs_flush_inodes(struct xfs_inode *ip); 35void xfs_flush_inodes(struct xfs_inode *ip);
36 36
37int xfs_log_dirty_inode(struct xfs_inode *ip, struct xfs_perag *pag, int flags);
38
39int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); 37int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
40int xfs_reclaim_inodes_count(struct xfs_mount *mp); 38int xfs_reclaim_inodes_count(struct xfs_mount *mp);
41void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); 39void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index bb134a819930..06838c42b2a0 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -580,7 +580,7 @@ DEFINE_INODE_EVENT(xfs_ioctl_setattr);
580DEFINE_INODE_EVENT(xfs_dir_fsync); 580DEFINE_INODE_EVENT(xfs_dir_fsync);
581DEFINE_INODE_EVENT(xfs_file_fsync); 581DEFINE_INODE_EVENT(xfs_file_fsync);
582DEFINE_INODE_EVENT(xfs_destroy_inode); 582DEFINE_INODE_EVENT(xfs_destroy_inode);
583DEFINE_INODE_EVENT(xfs_write_inode); 583DEFINE_INODE_EVENT(xfs_dirty_inode);
584DEFINE_INODE_EVENT(xfs_evict_inode); 584DEFINE_INODE_EVENT(xfs_evict_inode);
585 585
586DEFINE_INODE_EVENT(xfs_dquot_dqalloc); 586DEFINE_INODE_EVENT(xfs_dquot_dqalloc);
@@ -627,16 +627,19 @@ DECLARE_EVENT_CLASS(xfs_namespace_class,
627 TP_STRUCT__entry( 627 TP_STRUCT__entry(
628 __field(dev_t, dev) 628 __field(dev_t, dev)
629 __field(xfs_ino_t, dp_ino) 629 __field(xfs_ino_t, dp_ino)
630 __field(int, namelen)
630 __dynamic_array(char, name, name->len) 631 __dynamic_array(char, name, name->len)
631 ), 632 ),
632 TP_fast_assign( 633 TP_fast_assign(
633 __entry->dev = VFS_I(dp)->i_sb->s_dev; 634 __entry->dev = VFS_I(dp)->i_sb->s_dev;
634 __entry->dp_ino = dp->i_ino; 635 __entry->dp_ino = dp->i_ino;
636 __entry->namelen = name->len;
635 memcpy(__get_str(name), name->name, name->len); 637 memcpy(__get_str(name), name->name, name->len);
636 ), 638 ),
637 TP_printk("dev %d:%d dp ino 0x%llx name %s", 639 TP_printk("dev %d:%d dp ino 0x%llx name %.*s",
638 MAJOR(__entry->dev), MINOR(__entry->dev), 640 MAJOR(__entry->dev), MINOR(__entry->dev),
639 __entry->dp_ino, 641 __entry->dp_ino,
642 __entry->namelen,
640 __get_str(name)) 643 __get_str(name))
641) 644)
642 645
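The __dynamic_array name buffers in these events are not NUL-terminated, so the hunk records the length and prints with the "%.*s" precision form, which stops after exactly namelen bytes. The same idiom in plain C:

    #include <stdio.h>

    int main(void)
    {
        /* a name buffer that is NOT NUL-terminated, like __dynamic_array */
        const char name[4] = { 'd', 'a', 't', 'a' };
        int namelen = (int)sizeof(name);

        /* "%.*s" prints at most namelen bytes, so no terminator is needed */
        printf("name %.*s\n", namelen, name);
        return 0;
    }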
@@ -658,6 +661,8 @@ TRACE_EVENT(xfs_rename,
658 __field(dev_t, dev) 661 __field(dev_t, dev)
659 __field(xfs_ino_t, src_dp_ino) 662 __field(xfs_ino_t, src_dp_ino)
660 __field(xfs_ino_t, target_dp_ino) 663 __field(xfs_ino_t, target_dp_ino)
664 __field(int, src_namelen)
665 __field(int, target_namelen)
661 __dynamic_array(char, src_name, src_name->len) 666 __dynamic_array(char, src_name, src_name->len)
662 __dynamic_array(char, target_name, target_name->len) 667 __dynamic_array(char, target_name, target_name->len)
663 ), 668 ),
@@ -665,15 +670,20 @@ TRACE_EVENT(xfs_rename,
665 __entry->dev = VFS_I(src_dp)->i_sb->s_dev; 670 __entry->dev = VFS_I(src_dp)->i_sb->s_dev;
666 __entry->src_dp_ino = src_dp->i_ino; 671 __entry->src_dp_ino = src_dp->i_ino;
667 __entry->target_dp_ino = target_dp->i_ino; 672 __entry->target_dp_ino = target_dp->i_ino;
673 __entry->src_namelen = src_name->len;
674 __entry->target_namelen = target_name->len;
668 memcpy(__get_str(src_name), src_name->name, src_name->len); 675 memcpy(__get_str(src_name), src_name->name, src_name->len);
669 memcpy(__get_str(target_name), target_name->name, target_name->len); 676 memcpy(__get_str(target_name), target_name->name,
677 target_name->len);
670 ), 678 ),
671 TP_printk("dev %d:%d src dp ino 0x%llx target dp ino 0x%llx" 679 TP_printk("dev %d:%d src dp ino 0x%llx target dp ino 0x%llx"
672 " src name %s target name %s", 680 " src name %.*s target name %.*s",
673 MAJOR(__entry->dev), MINOR(__entry->dev), 681 MAJOR(__entry->dev), MINOR(__entry->dev),
674 __entry->src_dp_ino, 682 __entry->src_dp_ino,
675 __entry->target_dp_ino, 683 __entry->target_dp_ino,
684 __entry->src_namelen,
676 __get_str(src_name), 685 __get_str(src_name),
686 __entry->target_namelen,
677 __get_str(target_name)) 687 __get_str(target_name))
678) 688)
679 689
@@ -741,10 +751,10 @@ DEFINE_DQUOT_EVENT(xfs_dqalloc);
741DEFINE_DQUOT_EVENT(xfs_dqtobp_read); 751DEFINE_DQUOT_EVENT(xfs_dqtobp_read);
742DEFINE_DQUOT_EVENT(xfs_dqread); 752DEFINE_DQUOT_EVENT(xfs_dqread);
743DEFINE_DQUOT_EVENT(xfs_dqread_fail); 753DEFINE_DQUOT_EVENT(xfs_dqread_fail);
744DEFINE_DQUOT_EVENT(xfs_dqlookup_found);
745DEFINE_DQUOT_EVENT(xfs_dqlookup_done);
746DEFINE_DQUOT_EVENT(xfs_dqget_hit); 754DEFINE_DQUOT_EVENT(xfs_dqget_hit);
747DEFINE_DQUOT_EVENT(xfs_dqget_miss); 755DEFINE_DQUOT_EVENT(xfs_dqget_miss);
756DEFINE_DQUOT_EVENT(xfs_dqget_freeing);
757DEFINE_DQUOT_EVENT(xfs_dqget_dup);
748DEFINE_DQUOT_EVENT(xfs_dqput); 758DEFINE_DQUOT_EVENT(xfs_dqput);
749DEFINE_DQUOT_EVENT(xfs_dqput_wait); 759DEFINE_DQUOT_EVENT(xfs_dqput_wait);
750DEFINE_DQUOT_EVENT(xfs_dqput_free); 760DEFINE_DQUOT_EVENT(xfs_dqput_free);
@@ -782,12 +792,12 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
782 __entry->curr_res = tic->t_curr_res; 792 __entry->curr_res = tic->t_curr_res;
783 __entry->unit_res = tic->t_unit_res; 793 __entry->unit_res = tic->t_unit_res;
784 __entry->flags = tic->t_flags; 794 __entry->flags = tic->t_flags;
785 __entry->reserveq = list_empty(&log->l_reserveq); 795 __entry->reserveq = list_empty(&log->l_reserve_head.waiters);
786 __entry->writeq = list_empty(&log->l_writeq); 796 __entry->writeq = list_empty(&log->l_write_head.waiters);
787 xlog_crack_grant_head(&log->l_grant_reserve_head, 797 xlog_crack_grant_head(&log->l_reserve_head.grant,
788 &__entry->grant_reserve_cycle, 798 &__entry->grant_reserve_cycle,
789 &__entry->grant_reserve_bytes); 799 &__entry->grant_reserve_bytes);
790 xlog_crack_grant_head(&log->l_grant_write_head, 800 xlog_crack_grant_head(&log->l_write_head.grant,
791 &__entry->grant_write_cycle, 801 &__entry->grant_write_cycle,
792 &__entry->grant_write_bytes); 802 &__entry->grant_write_bytes);
793 __entry->curr_cycle = log->l_curr_cycle; 803 __entry->curr_cycle = log->l_curr_cycle;
@@ -826,20 +836,14 @@ DEFINE_EVENT(xfs_loggrant_class, name, \
826 TP_ARGS(log, tic)) 836 TP_ARGS(log, tic))
827DEFINE_LOGGRANT_EVENT(xfs_log_done_nonperm); 837DEFINE_LOGGRANT_EVENT(xfs_log_done_nonperm);
828DEFINE_LOGGRANT_EVENT(xfs_log_done_perm); 838DEFINE_LOGGRANT_EVENT(xfs_log_done_perm);
829DEFINE_LOGGRANT_EVENT(xfs_log_reserve);
830DEFINE_LOGGRANT_EVENT(xfs_log_umount_write); 839DEFINE_LOGGRANT_EVENT(xfs_log_umount_write);
831DEFINE_LOGGRANT_EVENT(xfs_log_grant_enter);
832DEFINE_LOGGRANT_EVENT(xfs_log_grant_exit);
833DEFINE_LOGGRANT_EVENT(xfs_log_grant_error);
834DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep); 840DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep);
835DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake); 841DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake);
836DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up); 842DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up);
837DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter); 843DEFINE_LOGGRANT_EVENT(xfs_log_reserve);
838DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit); 844DEFINE_LOGGRANT_EVENT(xfs_log_reserve_exit);
839DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error); 845DEFINE_LOGGRANT_EVENT(xfs_log_regrant);
840DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep); 846DEFINE_LOGGRANT_EVENT(xfs_log_regrant_exit);
841DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake);
842DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake_up);
843DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter); 847DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter);
844DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit); 848DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit);
845DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub); 849DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub);
@@ -1414,7 +1418,7 @@ DEFINE_ALLOC_EVENT(xfs_alloc_vextent_noagbp);
1414DEFINE_ALLOC_EVENT(xfs_alloc_vextent_loopfailed); 1418DEFINE_ALLOC_EVENT(xfs_alloc_vextent_loopfailed);
1415DEFINE_ALLOC_EVENT(xfs_alloc_vextent_allfailed); 1419DEFINE_ALLOC_EVENT(xfs_alloc_vextent_allfailed);
1416 1420
1417DECLARE_EVENT_CLASS(xfs_dir2_class, 1421DECLARE_EVENT_CLASS(xfs_da_class,
1418 TP_PROTO(struct xfs_da_args *args), 1422 TP_PROTO(struct xfs_da_args *args),
1419 TP_ARGS(args), 1423 TP_ARGS(args),
1420 TP_STRUCT__entry( 1424 TP_STRUCT__entry(
@@ -1449,7 +1453,7 @@ DECLARE_EVENT_CLASS(xfs_dir2_class,
1449) 1453)
1450 1454
1451#define DEFINE_DIR2_EVENT(name) \ 1455#define DEFINE_DIR2_EVENT(name) \
1452DEFINE_EVENT(xfs_dir2_class, name, \ 1456DEFINE_EVENT(xfs_da_class, name, \
1453 TP_PROTO(struct xfs_da_args *args), \ 1457 TP_PROTO(struct xfs_da_args *args), \
1454 TP_ARGS(args)) 1458 TP_ARGS(args))
1455DEFINE_DIR2_EVENT(xfs_dir2_sf_addname); 1459DEFINE_DIR2_EVENT(xfs_dir2_sf_addname);
@@ -1478,6 +1482,64 @@ DEFINE_DIR2_EVENT(xfs_dir2_node_replace);
1478DEFINE_DIR2_EVENT(xfs_dir2_node_removename); 1482DEFINE_DIR2_EVENT(xfs_dir2_node_removename);
1479DEFINE_DIR2_EVENT(xfs_dir2_node_to_leaf); 1483DEFINE_DIR2_EVENT(xfs_dir2_node_to_leaf);
1480 1484
1485#define DEFINE_ATTR_EVENT(name) \
1486DEFINE_EVENT(xfs_da_class, name, \
1487 TP_PROTO(struct xfs_da_args *args), \
1488 TP_ARGS(args))
1489DEFINE_ATTR_EVENT(xfs_attr_sf_add);
1490DEFINE_ATTR_EVENT(xfs_attr_sf_addname);
1491DEFINE_ATTR_EVENT(xfs_attr_sf_create);
1492DEFINE_ATTR_EVENT(xfs_attr_sf_lookup);
1493DEFINE_ATTR_EVENT(xfs_attr_sf_remove);
1494DEFINE_ATTR_EVENT(xfs_attr_sf_removename);
1495DEFINE_ATTR_EVENT(xfs_attr_sf_to_leaf);
1496
1497DEFINE_ATTR_EVENT(xfs_attr_leaf_add);
1498DEFINE_ATTR_EVENT(xfs_attr_leaf_add_old);
1499DEFINE_ATTR_EVENT(xfs_attr_leaf_add_new);
1500DEFINE_ATTR_EVENT(xfs_attr_leaf_addname);
1501DEFINE_ATTR_EVENT(xfs_attr_leaf_create);
1502DEFINE_ATTR_EVENT(xfs_attr_leaf_lookup);
1503DEFINE_ATTR_EVENT(xfs_attr_leaf_replace);
1504DEFINE_ATTR_EVENT(xfs_attr_leaf_removename);
1505DEFINE_ATTR_EVENT(xfs_attr_leaf_split);
1506DEFINE_ATTR_EVENT(xfs_attr_leaf_split_before);
1507DEFINE_ATTR_EVENT(xfs_attr_leaf_split_after);
1508DEFINE_ATTR_EVENT(xfs_attr_leaf_clearflag);
1509DEFINE_ATTR_EVENT(xfs_attr_leaf_setflag);
1510DEFINE_ATTR_EVENT(xfs_attr_leaf_flipflags);
1511DEFINE_ATTR_EVENT(xfs_attr_leaf_to_sf);
1512DEFINE_ATTR_EVENT(xfs_attr_leaf_to_node);
1513DEFINE_ATTR_EVENT(xfs_attr_leaf_rebalance);
1514DEFINE_ATTR_EVENT(xfs_attr_leaf_unbalance);
1515
1516DEFINE_ATTR_EVENT(xfs_attr_node_addname);
1517DEFINE_ATTR_EVENT(xfs_attr_node_lookup);
1518DEFINE_ATTR_EVENT(xfs_attr_node_replace);
1519DEFINE_ATTR_EVENT(xfs_attr_node_removename);
1520
1521#define DEFINE_DA_EVENT(name) \
1522DEFINE_EVENT(xfs_da_class, name, \
1523 TP_PROTO(struct xfs_da_args *args), \
1524 TP_ARGS(args))
1525DEFINE_DA_EVENT(xfs_da_split);
1526DEFINE_DA_EVENT(xfs_da_join);
1527DEFINE_DA_EVENT(xfs_da_link_before);
1528DEFINE_DA_EVENT(xfs_da_link_after);
1529DEFINE_DA_EVENT(xfs_da_unlink_back);
1530DEFINE_DA_EVENT(xfs_da_unlink_forward);
1531DEFINE_DA_EVENT(xfs_da_root_split);
1532DEFINE_DA_EVENT(xfs_da_root_join);
1533DEFINE_DA_EVENT(xfs_da_node_add);
1534DEFINE_DA_EVENT(xfs_da_node_create);
1535DEFINE_DA_EVENT(xfs_da_node_split);
1536DEFINE_DA_EVENT(xfs_da_node_remove);
1537DEFINE_DA_EVENT(xfs_da_node_rebalance);
1538DEFINE_DA_EVENT(xfs_da_node_unbalance);
1539DEFINE_DA_EVENT(xfs_da_swap_lastblock);
1540DEFINE_DA_EVENT(xfs_da_grow_inode);
1541DEFINE_DA_EVENT(xfs_da_shrink_inode);
1542
1481DECLARE_EVENT_CLASS(xfs_dir2_space_class, 1543DECLARE_EVENT_CLASS(xfs_dir2_space_class,
1482 TP_PROTO(struct xfs_da_args *args, int idx), 1544 TP_PROTO(struct xfs_da_args *args, int idx),
1483 TP_ARGS(args, idx), 1545 TP_ARGS(args, idx),
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 7adcdf15ae0c..103b00c90004 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -681,7 +681,6 @@ xfs_trans_reserve(
681 uint flags, 681 uint flags,
682 uint logcount) 682 uint logcount)
683{ 683{
684 int log_flags;
685 int error = 0; 684 int error = 0;
686 int rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; 685 int rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
687 686
@@ -707,24 +706,32 @@ xfs_trans_reserve(
707 * Reserve the log space needed for this transaction. 706 * Reserve the log space needed for this transaction.
708 */ 707 */
709 if (logspace > 0) { 708 if (logspace > 0) {
710 ASSERT((tp->t_log_res == 0) || (tp->t_log_res == logspace)); 709 bool permanent = false;
711 ASSERT((tp->t_log_count == 0) || 710
712 (tp->t_log_count == logcount)); 711 ASSERT(tp->t_log_res == 0 || tp->t_log_res == logspace);
712 ASSERT(tp->t_log_count == 0 || tp->t_log_count == logcount);
713
713 if (flags & XFS_TRANS_PERM_LOG_RES) { 714 if (flags & XFS_TRANS_PERM_LOG_RES) {
714 log_flags = XFS_LOG_PERM_RESERV;
715 tp->t_flags |= XFS_TRANS_PERM_LOG_RES; 715 tp->t_flags |= XFS_TRANS_PERM_LOG_RES;
716 permanent = true;
716 } else { 717 } else {
717 ASSERT(tp->t_ticket == NULL); 718 ASSERT(tp->t_ticket == NULL);
718 ASSERT(!(tp->t_flags & XFS_TRANS_PERM_LOG_RES)); 719 ASSERT(!(tp->t_flags & XFS_TRANS_PERM_LOG_RES));
719 log_flags = 0;
720 } 720 }
721 721
722 error = xfs_log_reserve(tp->t_mountp, logspace, logcount, 722 if (tp->t_ticket != NULL) {
723 &tp->t_ticket, 723 ASSERT(flags & XFS_TRANS_PERM_LOG_RES);
724 XFS_TRANSACTION, log_flags, tp->t_type); 724 error = xfs_log_regrant(tp->t_mountp, tp->t_ticket);
725 if (error) { 725 } else {
726 goto undo_blocks; 726 error = xfs_log_reserve(tp->t_mountp, logspace,
727 logcount, &tp->t_ticket,
728 XFS_TRANSACTION, permanent,
729 tp->t_type);
727 } 730 }
731
732 if (error)
733 goto undo_blocks;
734
728 tp->t_log_res = logspace; 735 tp->t_log_res = logspace;
729 tp->t_log_count = logcount; 736 tp->t_log_count = logcount;
730 } 737 }
@@ -752,6 +759,8 @@ xfs_trans_reserve(
752 */ 759 */
753undo_log: 760undo_log:
754 if (logspace > 0) { 761 if (logspace > 0) {
762 int log_flags;
763
755 if (flags & XFS_TRANS_PERM_LOG_RES) { 764 if (flags & XFS_TRANS_PERM_LOG_RES) {
756 log_flags = XFS_LOG_REL_PERM_RESERV; 765 log_flags = XFS_LOG_REL_PERM_RESERV;
757 } else { 766 } else {
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index ed9252bcdac9..1dead07f092c 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -611,50 +611,6 @@ xfs_ail_push_all(
611} 611}
612 612
613/* 613/*
614 * This is to be called when an item is unlocked that may have
615 * been in the AIL. It will wake up the first member of the AIL
616 * wait list if this item's unlocking might allow it to progress.
617 * If the item is in the AIL, then we need to get the AIL lock
618 * while doing our checking so we don't race with someone going
619 * to sleep waiting for this event in xfs_trans_push_ail().
620 */
621void
622xfs_trans_unlocked_item(
623 struct xfs_ail *ailp,
624 xfs_log_item_t *lip)
625{
626 xfs_log_item_t *min_lip;
627
628 /*
629 * If we're forcibly shutting down, we may have
630 * unlocked log items arbitrarily. The last thing
631 * we want to do is to move the tail of the log
632 * over some potentially valid data.
633 */
634 if (!(lip->li_flags & XFS_LI_IN_AIL) ||
635 XFS_FORCED_SHUTDOWN(ailp->xa_mount)) {
636 return;
637 }
638
639 /*
640 * This is the one case where we can call into xfs_ail_min()
641 * without holding the AIL lock because we only care about the
642 * case where we are at the tail of the AIL. If the object isn't
643 * at the tail, it doesn't matter what result we get back. This
644 * is slightly racy because since we were just unlocked, we could
645 * go to sleep between the call to xfs_ail_min and the call to
646 * xfs_log_move_tail, have someone else lock us, commit to us disk,
647 * move us out of the tail of the AIL, and then we wake up. However,
648 * the call to xfs_log_move_tail() doesn't do anything if there's
649 * not enough free space to wake people up so we're safe calling it.
650 */
651 min_lip = xfs_ail_min(ailp);
652
653 if (min_lip == lip)
654 xfs_log_move_tail(ailp->xa_mount, 1);
655} /* xfs_trans_unlocked_item */
656
657/*
658 * xfs_trans_ail_update - bulk AIL insertion operation. 614 * xfs_trans_ail_update - bulk AIL insertion operation.
659 * 615 *
660 * @xfs_trans_ail_update takes an array of log items that all need to be 616 * @xfs_trans_ail_update takes an array of log items that all need to be
@@ -685,7 +641,6 @@ xfs_trans_ail_update_bulk(
685 xfs_lsn_t lsn) __releases(ailp->xa_lock) 641 xfs_lsn_t lsn) __releases(ailp->xa_lock)
686{ 642{
687 xfs_log_item_t *mlip; 643 xfs_log_item_t *mlip;
688 xfs_lsn_t tail_lsn;
689 int mlip_changed = 0; 644 int mlip_changed = 0;
690 int i; 645 int i;
691 LIST_HEAD(tmp); 646 LIST_HEAD(tmp);
@@ -712,22 +667,12 @@ xfs_trans_ail_update_bulk(
712 667
713 if (!list_empty(&tmp)) 668 if (!list_empty(&tmp))
714 xfs_ail_splice(ailp, cur, &tmp, lsn); 669 xfs_ail_splice(ailp, cur, &tmp, lsn);
670 spin_unlock(&ailp->xa_lock);
715 671
716 if (!mlip_changed) { 672 if (mlip_changed && !XFS_FORCED_SHUTDOWN(ailp->xa_mount)) {
717 spin_unlock(&ailp->xa_lock); 673 xlog_assign_tail_lsn(ailp->xa_mount);
718 return; 674 xfs_log_space_wake(ailp->xa_mount);
719 } 675 }
720
721 /*
722 * It is not safe to access mlip after the AIL lock is dropped, so we
723 * must get a copy of li_lsn before we do so. This is especially
724 * important on 32-bit platforms where accessing and updating 64-bit
725 * values like li_lsn is not atomic.
726 */
727 mlip = xfs_ail_min(ailp);
728 tail_lsn = mlip->li_lsn;
729 spin_unlock(&ailp->xa_lock);
730 xfs_log_move_tail(ailp->xa_mount, tail_lsn);
731} 676}
732 677
733/* 678/*
@@ -758,7 +703,6 @@ xfs_trans_ail_delete_bulk(
758 int nr_items) __releases(ailp->xa_lock) 703 int nr_items) __releases(ailp->xa_lock)
759{ 704{
760 xfs_log_item_t *mlip; 705 xfs_log_item_t *mlip;
761 xfs_lsn_t tail_lsn;
762 int mlip_changed = 0; 706 int mlip_changed = 0;
763 int i; 707 int i;
764 708
@@ -785,23 +729,12 @@ xfs_trans_ail_delete_bulk(
785 if (mlip == lip) 729 if (mlip == lip)
786 mlip_changed = 1; 730 mlip_changed = 1;
787 } 731 }
732 spin_unlock(&ailp->xa_lock);
788 733
789 if (!mlip_changed) { 734 if (mlip_changed && !XFS_FORCED_SHUTDOWN(ailp->xa_mount)) {
790 spin_unlock(&ailp->xa_lock); 735 xlog_assign_tail_lsn(ailp->xa_mount);
791 return; 736 xfs_log_space_wake(ailp->xa_mount);
792 } 737 }
793
794 /*
795 * It is not safe to access mlip after the AIL lock is dropped, so we
796 * must get a copy of li_lsn before we do so. This is especially
797 * important on 32-bit platforms where accessing and updating 64-bit
798 * values like li_lsn is not atomic. It is possible we've emptied the
799 * AIL here, so if that is the case, pass an LSN of 0 to the tail move.
800 */
801 mlip = xfs_ail_min(ailp);
802 tail_lsn = mlip ? mlip->li_lsn : 0;
803 spin_unlock(&ailp->xa_lock);
804 xfs_log_move_tail(ailp->xa_mount, tail_lsn);
805} 738}
806 739
807/* 740/*
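Both AIL bulk operations now drop xa_lock before doing any tail or wakeup work, recomputing the tail LSN afterwards instead of sampling the minimum item under the lock. A sketch of that lock-scope shape; the flag stands in for the spinlock and every name is invented.

    #include <stdio.h>

    static int ail_lock;   /* stand-in for the xa_lock spinlock */

    static void lock(void)   { ail_lock = 1; }
    static void unlock(void) { ail_lock = 0; }

    static void assign_tail_and_wake(void)
    {
        /* must not run under the lock: it may do wakeups and more work */
        printf("recompute tail LSN, wake waiters (locked=%d)\n", ail_lock);
    }

    static void ail_update_bulk(void)
    {
        int mlip_changed;

        lock();
        /* ... splice items; note whether the minimum item changed ... */
        mlip_changed = 1;
        unlock();   /* drop the lock before tail/wakeup processing */

        if (mlip_changed)
            assign_tail_and_wake();
    }

    int main(void) { ail_update_bulk(); return 0; }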
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 475a4ded4f41..1302d1d95a58 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -463,19 +463,7 @@ xfs_trans_brelse(xfs_trans_t *tp,
463 * Default to a normal brelse() call if the tp is NULL. 463 * Default to a normal brelse() call if the tp is NULL.
464 */ 464 */
465 if (tp == NULL) { 465 if (tp == NULL) {
466 struct xfs_log_item *lip = bp->b_fspriv;
467
468 ASSERT(bp->b_transp == NULL); 466 ASSERT(bp->b_transp == NULL);
469
470 /*
471 * If there's a buf log item attached to the buffer,
472 * then let the AIL know that the buffer is being
473 * unlocked.
474 */
475 if (lip != NULL && lip->li_type == XFS_LI_BUF) {
476 bip = bp->b_fspriv;
477 xfs_trans_unlocked_item(bip->bli_item.li_ailp, lip);
478 }
479 xfs_buf_relse(bp); 467 xfs_buf_relse(bp);
480 return; 468 return;
481 } 469 }
@@ -550,21 +538,10 @@ xfs_trans_brelse(xfs_trans_t *tp,
550 ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL)); 538 ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL));
551 ASSERT(!(bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF)); 539 ASSERT(!(bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF));
552 xfs_buf_item_relse(bp); 540 xfs_buf_item_relse(bp);
553 bip = NULL;
554 }
555 bp->b_transp = NULL;
556
557 /*
558 * If we've still got a buf log item on the buffer, then
559 * tell the AIL that the buffer is being unlocked.
560 */
561 if (bip != NULL) {
562 xfs_trans_unlocked_item(bip->bli_item.li_ailp,
563 (xfs_log_item_t*)bip);
564 } 541 }
565 542
543 bp->b_transp = NULL;
566 xfs_buf_relse(bp); 544 xfs_buf_relse(bp);
567 return;
568} 545}
569 546
570/* 547/*
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index c4ba366d24e6..279099717ed2 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -605,7 +605,7 @@ xfs_trans_dqresv(
605 time_t timer; 605 time_t timer;
606 xfs_qwarncnt_t warns; 606 xfs_qwarncnt_t warns;
607 xfs_qwarncnt_t warnlimit; 607 xfs_qwarncnt_t warnlimit;
608 xfs_qcnt_t count; 608 xfs_qcnt_t total_count;
609 xfs_qcnt_t *resbcountp; 609 xfs_qcnt_t *resbcountp;
610 xfs_quotainfo_t *q = mp->m_quotainfo; 610 xfs_quotainfo_t *q = mp->m_quotainfo;
611 611
@@ -648,13 +648,12 @@ xfs_trans_dqresv(
648 * hardlimit or exceed the timelimit if we allocate 648 * hardlimit or exceed the timelimit if we allocate
649 * nblks. 649 * nblks.
650 */ 650 */
651 if (hardlimit > 0ULL && 651 total_count = *resbcountp + nblks;
652 hardlimit < nblks + *resbcountp) { 652 if (hardlimit && total_count > hardlimit) {
653 xfs_quota_warn(mp, dqp, QUOTA_NL_BHARDWARN); 653 xfs_quota_warn(mp, dqp, QUOTA_NL_BHARDWARN);
654 goto error_return; 654 goto error_return;
655 } 655 }
656 if (softlimit > 0ULL && 656 if (softlimit && total_count > softlimit) {
657 softlimit < nblks + *resbcountp) {
658 if ((timer != 0 && get_seconds() > timer) || 657 if ((timer != 0 && get_seconds() > timer) ||
659 (warns != 0 && warns >= warnlimit)) { 658 (warns != 0 && warns >= warnlimit)) {
660 xfs_quota_warn(mp, dqp, 659 xfs_quota_warn(mp, dqp,
@@ -666,7 +665,7 @@ xfs_trans_dqresv(
666 } 665 }
667 } 666 }
668 if (ninos > 0) { 667 if (ninos > 0) {
669 count = be64_to_cpu(dqp->q_core.d_icount); 668 total_count = be64_to_cpu(dqp->q_core.d_icount) + ninos;
670 timer = be32_to_cpu(dqp->q_core.d_itimer); 669 timer = be32_to_cpu(dqp->q_core.d_itimer);
671 warns = be16_to_cpu(dqp->q_core.d_iwarns); 670 warns = be16_to_cpu(dqp->q_core.d_iwarns);
672 warnlimit = dqp->q_mount->m_quotainfo->qi_iwarnlimit; 671 warnlimit = dqp->q_mount->m_quotainfo->qi_iwarnlimit;
@@ -677,13 +676,11 @@ xfs_trans_dqresv(
677 if (!softlimit) 676 if (!softlimit)
678 softlimit = q->qi_isoftlimit; 677 softlimit = q->qi_isoftlimit;
679 678
680 if (hardlimit > 0ULL && 679 if (hardlimit && total_count > hardlimit) {
681 hardlimit < ninos + count) {
682 xfs_quota_warn(mp, dqp, QUOTA_NL_IHARDWARN); 680 xfs_quota_warn(mp, dqp, QUOTA_NL_IHARDWARN);
683 goto error_return; 681 goto error_return;
684 } 682 }
685 if (softlimit > 0ULL && 683 if (softlimit && total_count > softlimit) {
686 softlimit < ninos + count) {
687 if ((timer != 0 && get_seconds() > timer) || 684 if ((timer != 0 && get_seconds() > timer) ||
688 (warns != 0 && warns >= warnlimit)) { 685 (warns != 0 && warns >= warnlimit)) {
689 xfs_quota_warn(mp, dqp, 686 xfs_quota_warn(mp, dqp,
@@ -878,7 +875,7 @@ STATIC void
878xfs_trans_alloc_dqinfo( 875xfs_trans_alloc_dqinfo(
879 xfs_trans_t *tp) 876 xfs_trans_t *tp)
880{ 877{
881 tp->t_dqinfo = kmem_zone_zalloc(xfs_Gqm->qm_dqtrxzone, KM_SLEEP); 878 tp->t_dqinfo = kmem_zone_zalloc(xfs_qm_dqtrxzone, KM_SLEEP);
882} 879}
883 880
884void 881void
@@ -887,6 +884,6 @@ xfs_trans_free_dqinfo(
887{ 884{
888 if (!tp->t_dqinfo) 885 if (!tp->t_dqinfo)
889 return; 886 return;
890 kmem_zone_free(xfs_Gqm->qm_dqtrxzone, tp->t_dqinfo); 887 kmem_zone_free(xfs_qm_dqtrxzone, tp->t_dqinfo);
891 tp->t_dqinfo = NULL; 888 tp->t_dqinfo = NULL;
892} 889}
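Precomputing total_count once turns the quota checks above into the readable "would the new total exceed the limit" form, replacing the inverted hardlimit < nblks + *resbcountp comparisons. A runnable distillation of the block-count check, with simplified types and invented names.

    #include <stdio.h>

    typedef unsigned long long qcnt_t;

    static int would_exceed(qcnt_t resbcount, qcnt_t nblks,
                            qcnt_t hardlimit, qcnt_t softlimit)
    {
        qcnt_t total_count = resbcount + nblks;   /* compute once */

        if (hardlimit && total_count > hardlimit)
            return 2;   /* hard failure: reject the reservation */
        if (softlimit && total_count > softlimit)
            return 1;   /* soft: warn or start the grace timer */
        return 0;
    }

    int main(void)
    {
        printf("%d\n", would_exceed(90, 15, 100, 80));   /* 2: over hard */
        printf("%d\n", would_exceed(70, 15, 100, 80));   /* 1: over soft */
        printf("%d\n", would_exceed(10, 15, 100, 80));   /* 0: fine */
        return 0;
    }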
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index 32f0288ae10f..7a7442c03f2b 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -95,10 +95,14 @@ xfs_trans_ichgtime(
95 if ((flags & XFS_ICHGTIME_MOD) && 95 if ((flags & XFS_ICHGTIME_MOD) &&
96 !timespec_equal(&inode->i_mtime, &tv)) { 96 !timespec_equal(&inode->i_mtime, &tv)) {
97 inode->i_mtime = tv; 97 inode->i_mtime = tv;
98 ip->i_d.di_mtime.t_sec = tv.tv_sec;
99 ip->i_d.di_mtime.t_nsec = tv.tv_nsec;
98 } 100 }
99 if ((flags & XFS_ICHGTIME_CHG) && 101 if ((flags & XFS_ICHGTIME_CHG) &&
100 !timespec_equal(&inode->i_ctime, &tv)) { 102 !timespec_equal(&inode->i_ctime, &tv)) {
101 inode->i_ctime = tv; 103 inode->i_ctime = tv;
104 ip->i_d.di_ctime.t_sec = tv.tv_sec;
105 ip->i_d.di_ctime.t_nsec = tv.tv_nsec;
102 } 106 }
103} 107}
104 108
@@ -126,12 +130,12 @@ xfs_trans_log_inode(
126 /* 130 /*
127 * Always OR in the bits from the ili_last_fields field. 131 * Always OR in the bits from the ili_last_fields field.
128 * This is to coordinate with the xfs_iflush() and xfs_iflush_done() 132 * This is to coordinate with the xfs_iflush() and xfs_iflush_done()
129 * routines in the eventual clearing of the ilf_fields bits. 133 * routines in the eventual clearing of the ili_fields bits.
130 * See the big comment in xfs_iflush() for an explanation of 134 * See the big comment in xfs_iflush() for an explanation of
131 * this coordination mechanism. 135 * this coordination mechanism.
132 */ 136 */
133 flags |= ip->i_itemp->ili_last_fields; 137 flags |= ip->i_itemp->ili_last_fields;
134 ip->i_itemp->ili_format.ilf_fields |= flags; 138 ip->i_itemp->ili_fields |= flags;
135} 139}
136 140
137#ifdef XFS_TRANS_DEBUG 141#ifdef XFS_TRANS_DEBUG
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 44820b9fcb43..8ab2ced415f1 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -104,9 +104,6 @@ void xfs_ail_push(struct xfs_ail *, xfs_lsn_t);
104void xfs_ail_push_all(struct xfs_ail *); 104void xfs_ail_push_all(struct xfs_ail *);
105xfs_lsn_t xfs_ail_min_lsn(struct xfs_ail *ailp); 105xfs_lsn_t xfs_ail_min_lsn(struct xfs_ail *ailp);
106 106
107void xfs_trans_unlocked_item(struct xfs_ail *,
108 xfs_log_item_t *);
109
110struct xfs_log_item * xfs_trans_ail_cursor_first(struct xfs_ail *ailp, 107struct xfs_log_item * xfs_trans_ail_cursor_first(struct xfs_ail *ailp,
111 struct xfs_ail_cursor *cur, 108 struct xfs_ail_cursor *cur,
112 xfs_lsn_t lsn); 109 xfs_lsn_t lsn);
diff --git a/fs/xfs/xfs_vnode.h b/fs/xfs/xfs_vnode.h
index 7c220b4227bc..db14d0c08682 100644
--- a/fs/xfs/xfs_vnode.h
+++ b/fs/xfs/xfs_vnode.h
@@ -22,7 +22,6 @@
22 22
23struct file; 23struct file;
24struct xfs_inode; 24struct xfs_inode;
25struct xfs_iomap;
26struct attrlist_cursor_kern; 25struct attrlist_cursor_kern;
27 26
28/* 27/*
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 0c877cbde142..447e146b2ba6 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -10,7 +10,6 @@ struct kiocb;
10struct pipe_inode_info; 10struct pipe_inode_info;
11struct uio; 11struct uio;
12struct xfs_inode; 12struct xfs_inode;
13struct xfs_iomap;
14 13
15 14
16int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap, int flags); 15int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap, int flags);
@@ -49,8 +48,6 @@ int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
49int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags); 48int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags);
50int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, 49int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
51 int flags, struct attrlist_cursor_kern *cursor); 50 int flags, struct attrlist_cursor_kern *cursor);
52int xfs_bmap(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
53 int flags, struct xfs_iomap *iomapp, int *niomaps);
54void xfs_tosspages(struct xfs_inode *inode, xfs_off_t first, 51void xfs_tosspages(struct xfs_inode *inode, xfs_off_t first,
55 xfs_off_t last, int fiopt); 52 xfs_off_t last, int fiopt);
56int xfs_flushinval_pages(struct xfs_inode *ip, xfs_off_t first, 53int xfs_flushinval_pages(struct xfs_inode *ip, xfs_off_t first,