path: root/fs
Diffstat (limited to 'fs')
-rw-r--r-- fs/9p/v9fs.c | 2
-rw-r--r-- fs/9p/vfs_file.c | 2
-rw-r--r-- fs/Kconfig | 24
-rw-r--r-- fs/Kconfig.binfmt | 7
-rw-r--r-- fs/Makefile | 2
-rw-r--r-- fs/affs/affs.h | 2
-rw-r--r-- fs/affs/amigaffs.c | 13
-rw-r--r-- fs/affs/bitmap.c | 1
-rw-r--r-- fs/affs/dir.c | 11
-rw-r--r-- fs/affs/file.c | 49
-rw-r--r-- fs/affs/inode.c | 7
-rw-r--r-- fs/affs/namei.c | 47
-rw-r--r-- fs/affs/super.c | 69
-rw-r--r-- fs/afs/rxrpc.c | 14
-rw-r--r-- fs/afs/volume.c | 2
-rw-r--r-- fs/aio.c | 13
-rw-r--r-- fs/befs/linuxvfs.c | 6
-rw-r--r-- fs/binfmt_som.c | 299
-rw-r--r-- fs/block_dev.c | 77
-rw-r--r-- fs/btrfs/Kconfig | 1
-rw-r--r-- fs/btrfs/disk-io.c | 6
-rw-r--r-- fs/btrfs/extent_io.c | 2
-rw-r--r-- fs/btrfs/file.c | 3
-rw-r--r-- fs/btrfs/inode.c | 6
-rw-r--r-- fs/ceph/addr.c | 1
-rw-r--r-- fs/ceph/file.c | 2
-rw-r--r-- fs/ceph/inode.c | 2
-rw-r--r-- fs/ceph/locks.c | 63
-rw-r--r-- fs/ceph/mds_client.c | 4
-rw-r--r-- fs/ceph/super.c | 20
-rw-r--r-- fs/char_dev.c | 24
-rw-r--r-- fs/cifs/connect.c | 2
-rw-r--r-- fs/cifs/file.c | 35
-rw-r--r-- fs/cifs/inode.c | 2
-rw-r--r-- fs/coda/dir.c | 138
-rw-r--r-- fs/coda/inode.c | 2
-rw-r--r-- fs/configfs/configfs_internal.h | 2
-rw-r--r-- fs/configfs/inode.c | 17
-rw-r--r-- fs/configfs/mount.c | 11
-rw-r--r-- fs/dax.c | 534
-rw-r--r-- fs/dcache.c | 189
-rw-r--r-- fs/debugfs/inode.c | 291
-rw-r--r-- fs/dlm/netlink.c | 7
-rw-r--r-- fs/drop_caches.c | 14
-rw-r--r-- fs/ecryptfs/inode.c | 1
-rw-r--r-- fs/ecryptfs/main.c | 2
-rw-r--r-- fs/efivarfs/Kconfig | 1
-rw-r--r-- fs/efivarfs/super.c | 2
-rw-r--r-- fs/eventfd.c | 12
-rw-r--r-- fs/eventpoll.c | 4
-rw-r--r-- fs/exec.c | 10
-rw-r--r-- fs/exofs/inode.c | 3
-rw-r--r-- fs/exofs/super.c | 2
-rw-r--r-- fs/ext2/Kconfig | 11
-rw-r--r-- fs/ext2/Makefile | 1
-rw-r--r-- fs/ext2/ext2.h | 10
-rw-r--r-- fs/ext2/file.c | 44
-rw-r--r-- fs/ext2/ialloc.c | 2
-rw-r--r-- fs/ext2/inode.c | 38
-rw-r--r-- fs/ext2/namei.c | 13
-rw-r--r-- fs/ext2/super.c | 53
-rw-r--r-- fs/ext2/xip.c | 91
-rw-r--r-- fs/ext2/xip.h | 26
-rw-r--r-- fs/ext3/super.c | 2
-rw-r--r-- fs/ext4/ext4.h | 6
-rw-r--r-- fs/ext4/file.c | 50
-rw-r--r-- fs/ext4/indirect.c | 18
-rw-r--r-- fs/ext4/inode.c | 159
-rw-r--r-- fs/ext4/namei.c | 10
-rw-r--r-- fs/ext4/super.c | 93
-rw-r--r-- fs/f2fs/Kconfig | 10
-rw-r--r-- fs/f2fs/Makefile | 1
-rw-r--r-- fs/f2fs/acl.c | 6
-rw-r--r-- fs/f2fs/checkpoint.c | 95
-rw-r--r-- fs/f2fs/data.c | 218
-rw-r--r-- fs/f2fs/debug.c | 59
-rw-r--r-- fs/f2fs/dir.c | 3
-rw-r--r-- fs/f2fs/f2fs.h | 120
-rw-r--r-- fs/f2fs/file.c | 101
-rw-r--r-- fs/f2fs/gc.c | 38
-rw-r--r-- fs/f2fs/gc.h | 33
-rw-r--r-- fs/f2fs/inline.c | 32
-rw-r--r-- fs/f2fs/inode.c | 37
-rw-r--r-- fs/f2fs/namei.c | 2
-rw-r--r-- fs/f2fs/node.c | 154
-rw-r--r-- fs/f2fs/node.h | 45
-rw-r--r-- fs/f2fs/recovery.c | 11
-rw-r--r-- fs/f2fs/segment.c | 194
-rw-r--r-- fs/f2fs/segment.h | 29
-rw-r--r-- fs/f2fs/super.c | 75
-rw-r--r-- fs/f2fs/trace.c | 159
-rw-r--r-- fs/f2fs/trace.h | 46
-rw-r--r-- fs/fat/inode.c | 2
-rw-r--r-- fs/fs-writeback.c | 76
-rw-r--r-- fs/fs_pin.c | 96
-rw-r--r-- fs/fuse/file.c | 11
-rw-r--r-- fs/fuse/inode.c | 1
-rw-r--r-- fs/gfs2/acl.c | 2
-rw-r--r-- fs/gfs2/aops.c | 2
-rw-r--r-- fs/gfs2/dir.c | 3
-rw-r--r-- fs/gfs2/file.c | 5
-rw-r--r-- fs/gfs2/glock.c | 14
-rw-r--r-- fs/gfs2/inode.c | 3
-rw-r--r-- fs/gfs2/ops_fstype.c | 1
-rw-r--r-- fs/gfs2/quota.c | 11
-rw-r--r-- fs/gfs2/recovery.c | 2
-rw-r--r-- fs/gfs2/super.c | 2
-rw-r--r-- fs/gfs2/sys.c | 2
-rw-r--r-- fs/hugetlbfs/inode.c | 13
-rw-r--r-- fs/inode.c | 138
-rw-r--r-- fs/internal.h | 9
-rw-r--r-- fs/ioctl.c | 5
-rw-r--r-- fs/isofs/util.c | 18
-rw-r--r-- fs/jffs2/compr_rubin.c | 5
-rw-r--r-- fs/jffs2/scan.c | 5
-rw-r--r-- fs/jfs/endian24.h | 49
-rw-r--r-- fs/jfs/file.c | 2
-rw-r--r-- fs/jfs/jfs_dtree.c | 4
-rw-r--r-- fs/jfs/jfs_types.h | 55
-rw-r--r-- fs/jfs/jfs_xtree.h | 25
-rw-r--r-- fs/jfs/super.c | 3
-rw-r--r-- fs/kernfs/dir.c | 24
-rw-r--r-- fs/kernfs/file.c | 4
-rw-r--r-- fs/kernfs/inode.c | 13
-rw-r--r-- fs/kernfs/kernfs-internal.h | 1
-rw-r--r-- fs/kernfs/mount.c | 1
-rw-r--r-- fs/libfs.c | 2
-rw-r--r-- fs/lockd/mon.c | 13
-rw-r--r-- fs/lockd/svclock.c | 4
-rw-r--r-- fs/lockd/svcsubs.c | 26
-rw-r--r-- fs/lockd/xdr.c | 8
-rw-r--r-- fs/locks.c | 588
-rw-r--r-- fs/mount.h | 4
-rw-r--r-- fs/namei.c | 143
-rw-r--r-- fs/namespace.c | 50
-rw-r--r-- fs/ncpfs/dir.c | 98
-rw-r--r-- fs/ncpfs/inode.c | 3
-rw-r--r-- fs/ncpfs/ncp_fs_i.h | 1
-rw-r--r-- fs/ncpfs/ncplib_kernel.h | 30
-rw-r--r-- fs/nfs/Kconfig | 5
-rw-r--r-- fs/nfs/Makefile | 3
-rw-r--r-- fs/nfs/blocklayout/blocklayout.c | 2
-rw-r--r-- fs/nfs/callback.c | 8
-rw-r--r-- fs/nfs/delegation.c | 43
-rw-r--r-- fs/nfs/direct.c | 112
-rw-r--r-- fs/nfs/file.c | 1
-rw-r--r-- fs/nfs/filelayout/filelayout.c | 317
-rw-r--r-- fs/nfs/filelayout/filelayout.h | 40
-rw-r--r-- fs/nfs/filelayout/filelayoutdev.c | 469
-rw-r--r-- fs/nfs/flexfilelayout/Makefile | 5
-rw-r--r-- fs/nfs/flexfilelayout/flexfilelayout.c | 1574
-rw-r--r-- fs/nfs/flexfilelayout/flexfilelayout.h | 155
-rw-r--r-- fs/nfs/flexfilelayout/flexfilelayoutdev.c | 552
-rw-r--r-- fs/nfs/idmap.c | 3
-rw-r--r-- fs/nfs/inode.c | 8
-rw-r--r-- fs/nfs/internal.h | 54
-rw-r--r-- fs/nfs/nfs2xdr.c | 10
-rw-r--r-- fs/nfs/nfs3_fs.h | 2
-rw-r--r-- fs/nfs/nfs3client.c | 41
-rw-r--r-- fs/nfs/nfs3proc.c | 9
-rw-r--r-- fs/nfs/nfs3super.c | 2
-rw-r--r-- fs/nfs/nfs3xdr.c | 3
-rw-r--r-- fs/nfs/nfs4_fs.h | 9
-rw-r--r-- fs/nfs/nfs4client.c | 7
-rw-r--r-- fs/nfs/nfs4proc.c | 315
-rw-r--r-- fs/nfs/nfs4state.c | 101
-rw-r--r-- fs/nfs/nfs4super.c | 4
-rw-r--r-- fs/nfs/nfs4xdr.c | 113
-rw-r--r-- fs/nfs/nfsroot.c | 4
-rw-r--r-- fs/nfs/objlayout/objio_osd.c | 5
-rw-r--r-- fs/nfs/pagelist.c | 300
-rw-r--r-- fs/nfs/pnfs.c | 471
-rw-r--r-- fs/nfs/pnfs.h | 135
-rw-r--r-- fs/nfs/pnfs_nfs.c | 840
-rw-r--r-- fs/nfs/read.c | 33
-rw-r--r-- fs/nfs/super.c | 33
-rw-r--r-- fs/nfs/write.c | 99
-rw-r--r-- fs/nfsd/Kconfig | 10
-rw-r--r-- fs/nfsd/Makefile | 8
-rw-r--r-- fs/nfsd/blocklayout.c | 189
-rw-r--r-- fs/nfsd/blocklayoutxdr.c | 157
-rw-r--r-- fs/nfsd/blocklayoutxdr.h | 62
-rw-r--r-- fs/nfsd/export.c | 8
-rw-r--r-- fs/nfsd/export.h | 2
-rw-r--r-- fs/nfsd/nfs4callback.c | 99
-rw-r--r-- fs/nfsd/nfs4layouts.c | 721
-rw-r--r-- fs/nfsd/nfs4proc.c | 310
-rw-r--r-- fs/nfsd/nfs4state.c | 97
-rw-r--r-- fs/nfsd/nfs4xdr.c | 362
-rw-r--r-- fs/nfsd/nfsctl.c | 9
-rw-r--r-- fs/nfsd/nfsd.h | 16
-rw-r--r-- fs/nfsd/nfsfh.h | 18
-rw-r--r-- fs/nfsd/nfssvc.c | 1
-rw-r--r-- fs/nfsd/pnfs.h | 86
-rw-r--r-- fs/nfsd/state.h | 43
-rw-r--r-- fs/nfsd/trace.c | 5
-rw-r--r-- fs/nfsd/trace.h | 54
-rw-r--r-- fs/nfsd/xdr4.h | 59
-rw-r--r-- fs/nfsd/xdr4cb.h | 7
-rw-r--r-- fs/nilfs2/file.c | 1
-rw-r--r-- fs/nilfs2/gcinode.c | 1
-rw-r--r-- fs/nilfs2/mdt.c | 6
-rw-r--r-- fs/nilfs2/page.c | 4
-rw-r--r-- fs/nilfs2/page.h | 3
-rw-r--r-- fs/nilfs2/super.c | 6
-rw-r--r-- fs/notify/Kconfig | 1
-rw-r--r-- fs/notify/fanotify/fanotify.c | 2
-rw-r--r-- fs/notify/fanotify/fanotify_user.c | 35
-rw-r--r-- fs/ntfs/file.c | 3
-rw-r--r-- fs/ocfs2/acl.c | 14
-rw-r--r-- fs/ocfs2/alloc.c | 18
-rw-r--r-- fs/ocfs2/aops.c | 242
-rw-r--r-- fs/ocfs2/cluster/tcp.c | 3
-rw-r--r-- fs/ocfs2/cluster/tcp_internal.h | 12
-rw-r--r-- fs/ocfs2/dir.c | 10
-rw-r--r-- fs/ocfs2/dlm/dlmast.c | 6
-rw-r--r-- fs/ocfs2/dlm/dlmdebug.c | 4
-rw-r--r-- fs/ocfs2/dlm/dlmdomain.c | 14
-rw-r--r-- fs/ocfs2/dlm/dlmdomain.h | 1
-rw-r--r-- fs/ocfs2/dlm/dlmrecovery.c | 7
-rw-r--r-- fs/ocfs2/dlmfs/dlmfs.c | 14
-rw-r--r-- fs/ocfs2/dlmglue.c | 3
-rw-r--r-- fs/ocfs2/file.c | 80
-rw-r--r-- fs/ocfs2/file.h | 9
-rw-r--r-- fs/ocfs2/inode.c | 2
-rw-r--r-- fs/ocfs2/inode.h | 2
-rw-r--r-- fs/ocfs2/journal.c | 111
-rw-r--r-- fs/ocfs2/journal.h | 5
-rw-r--r-- fs/ocfs2/mmap.c | 1
-rw-r--r-- fs/ocfs2/namei.c | 284
-rw-r--r-- fs/ocfs2/namei.h | 8
-rw-r--r-- fs/ocfs2/ocfs2.h | 25
-rw-r--r-- fs/ocfs2/ocfs2_fs.h | 14
-rw-r--r-- fs/ocfs2/quota.h | 1
-rw-r--r-- fs/ocfs2/quota_local.c | 20
-rw-r--r-- fs/ocfs2/refcounttree.c | 2
-rw-r--r-- fs/ocfs2/reservations.c | 2
-rw-r--r-- fs/ocfs2/super.c | 51
-rw-r--r-- fs/ocfs2/xattr.c | 10
-rw-r--r-- fs/open.c | 15
-rw-r--r-- fs/proc/array.c | 44
-rw-r--r-- fs/proc/generic.c | 27
-rw-r--r-- fs/proc/inode.c | 2
-rw-r--r-- fs/proc/page.c | 16
-rw-r--r-- fs/proc/task_mmu.c | 250
-rw-r--r-- fs/proc/vmcore.c | 8
-rw-r--r-- fs/proc_namespace.c | 1
-rw-r--r-- fs/pstore/Kconfig | 10
-rw-r--r-- fs/pstore/Makefile | 2
-rw-r--r-- fs/pstore/inode.c | 26
-rw-r--r-- fs/pstore/internal.h | 6
-rw-r--r-- fs/pstore/platform.c | 5
-rw-r--r-- fs/pstore/pmsg.c | 114
-rw-r--r-- fs/pstore/ram.c | 53
-rw-r--r-- fs/quota/Kconfig | 1
-rw-r--r-- fs/quota/dquot.c | 107
-rw-r--r-- fs/quota/quota.c | 52
-rw-r--r-- fs/quota/quota_v1.c | 4
-rw-r--r-- fs/quota/quota_v2.c | 16
-rw-r--r-- fs/ramfs/file-nommu.c | 7
-rw-r--r-- fs/ramfs/inode.c | 21
-rw-r--r-- fs/read_write.c | 48
-rw-r--r-- fs/reiserfs/inode.c | 2
-rw-r--r-- fs/romfs/mmap-nommu.c | 10
-rw-r--r-- fs/romfs/super.c | 3
-rw-r--r-- fs/select.c | 2
-rw-r--r-- fs/seq_file.c | 32
-rw-r--r-- fs/splice.c | 23
-rw-r--r-- fs/super.c | 63
-rw-r--r-- fs/sync.c | 8
-rw-r--r-- fs/sysfs/file.c | 2
-rw-r--r-- fs/sysfs/group.c | 2
-rw-r--r-- fs/ubifs/debug.c | 4
-rw-r--r-- fs/ubifs/dir.c | 18
-rw-r--r-- fs/ubifs/file.c | 5
-rw-r--r-- fs/ubifs/replay.c | 19
-rw-r--r-- fs/ubifs/super.c | 6
-rw-r--r-- fs/ubifs/ubifs.h | 4
-rw-r--r-- fs/ubifs/xattr.c | 112
-rw-r--r-- fs/udf/Kconfig | 10
-rw-r--r-- fs/udf/inode.c | 32
-rw-r--r-- fs/udf/super.c | 5
-rw-r--r-- fs/ufs/super.c | 8
-rw-r--r-- fs/xfs/kmem.c | 10
-rw-r--r-- fs/xfs/kmem.h | 5
-rw-r--r-- fs/xfs/libxfs/xfs_attr_leaf.c | 2
-rw-r--r-- fs/xfs/libxfs/xfs_bmap.c | 20
-rw-r--r-- fs/xfs/libxfs/xfs_bmap.h | 33
-rw-r--r-- fs/xfs/libxfs/xfs_format.h | 24
-rw-r--r-- fs/xfs/libxfs/xfs_fs.h (renamed from fs/xfs/xfs_fs.h) | 0
-rw-r--r-- fs/xfs/libxfs/xfs_sb.c | 320
-rw-r--r-- fs/xfs/libxfs/xfs_sb.h | 11
-rw-r--r-- fs/xfs/libxfs/xfs_shared.h | 33
-rw-r--r-- fs/xfs/libxfs/xfs_symlink_remote.c | 2
-rw-r--r-- fs/xfs/libxfs/xfs_trans_resv.c | 14
-rw-r--r-- fs/xfs/libxfs/xfs_trans_resv.h | 1
-rw-r--r-- fs/xfs/libxfs/xfs_types.h (renamed from fs/xfs/xfs_types.h) | 0
-rw-r--r-- fs/xfs/xfs_aops.c | 149
-rw-r--r-- fs/xfs/xfs_aops.h | 3
-rw-r--r-- fs/xfs/xfs_bmap_util.h | 37
-rw-r--r-- fs/xfs/xfs_buf.c | 13
-rw-r--r-- fs/xfs/xfs_buf_item.c | 6
-rw-r--r-- fs/xfs/xfs_dquot.h | 2
-rw-r--r-- fs/xfs/xfs_file.c | 67
-rw-r--r-- fs/xfs/xfs_fsops.c | 34
-rw-r--r-- fs/xfs/xfs_inode.c | 136
-rw-r--r-- fs/xfs/xfs_inode.h | 11
-rw-r--r-- fs/xfs/xfs_ioctl.c | 501
-rw-r--r-- fs/xfs/xfs_ioctl32.c | 2
-rw-r--r-- fs/xfs/xfs_iomap.c | 2
-rw-r--r-- fs/xfs/xfs_iomap.h | 2
-rw-r--r-- fs/xfs/xfs_iops.c | 21
-rw-r--r-- fs/xfs/xfs_log.c | 28
-rw-r--r-- fs/xfs/xfs_mount.c | 107
-rw-r--r-- fs/xfs/xfs_mount.h | 5
-rw-r--r-- fs/xfs/xfs_qm.c | 55
-rw-r--r-- fs/xfs/xfs_qm.h | 1
-rw-r--r-- fs/xfs/xfs_qm_syscalls.c | 88
-rw-r--r-- fs/xfs/xfs_quotaops.c | 59
-rw-r--r-- fs/xfs/xfs_super.c | 27
-rw-r--r-- fs/xfs/xfs_sysctl.c | 18
-rw-r--r-- fs/xfs/xfs_trans.c | 1
-rw-r--r-- fs/xfs/xfs_trans_buf.c | 5
323 files changed, 12718 insertions, 5323 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 6894b085f0ee..620d93489539 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -335,7 +335,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 	}
 	init_rwsem(&v9ses->rename_sem);
 
-	rc = bdi_setup_and_register(&v9ses->bdi, "9p", BDI_CAP_MAP_COPY);
+	rc = bdi_setup_and_register(&v9ses->bdi, "9p");
 	if (rc) {
 		kfree(v9ses->aname);
 		kfree(v9ses->uname);
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 5594505e6e73..b40133796b87 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -831,7 +831,6 @@ static const struct vm_operations_struct v9fs_file_vm_ops = {
 	.fault = filemap_fault,
 	.map_pages = filemap_map_pages,
 	.page_mkwrite = v9fs_vm_page_mkwrite,
-	.remap_pages = generic_file_remap_pages,
 };
 
 static const struct vm_operations_struct v9fs_mmap_file_vm_ops = {
@@ -839,7 +838,6 @@ static const struct vm_operations_struct v9fs_mmap_file_vm_ops = {
 	.fault = filemap_fault,
 	.map_pages = filemap_map_pages,
 	.page_mkwrite = v9fs_vm_page_mkwrite,
-	.remap_pages = generic_file_remap_pages,
 };
 
 
diff --git a/fs/Kconfig b/fs/Kconfig
index 664991afe0c0..ec35851e5b71 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -13,13 +13,6 @@ if BLOCK
 source "fs/ext2/Kconfig"
 source "fs/ext3/Kconfig"
 source "fs/ext4/Kconfig"
-
-config FS_XIP
-# execute in place
-	bool
-	depends on EXT2_FS_XIP
-	default y
-
 source "fs/jbd/Kconfig"
 source "fs/jbd2/Kconfig"
 
@@ -40,6 +33,21 @@ source "fs/ocfs2/Kconfig"
 source "fs/btrfs/Kconfig"
 source "fs/nilfs2/Kconfig"
 
+config FS_DAX
+	bool "Direct Access (DAX) support"
+	depends on MMU
+	depends on !(ARM || MIPS || SPARC)
+	help
+	  Direct Access (DAX) can be used on memory-backed block devices.
+	  If the block device supports DAX and the filesystem supports DAX,
+	  then you can avoid using the pagecache to buffer I/Os.  Turning
+	  on this option will compile in support for DAX; you will need to
+	  mount the filesystem using the -o dax option.
+
+	  If you do not have a block device that is capable of using this,
+	  or if unsure, say N.  Saying Y will increase the size of the kernel
+	  by about 5kB.
+
 endif # BLOCK
 
 # Posix ACL utility routines
@@ -165,6 +173,7 @@ config HUGETLB_PAGE
 	def_bool HUGETLBFS
 
 source "fs/configfs/Kconfig"
+source "fs/efivarfs/Kconfig"
 
 endmenu
 
@@ -209,7 +218,6 @@ source "fs/sysv/Kconfig"
 source "fs/ufs/Kconfig"
 source "fs/exofs/Kconfig"
 source "fs/f2fs/Kconfig"
-source "fs/efivarfs/Kconfig"
 
 endif # MISC_FILESYSTEMS
 
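For orientation, the FS_DAX help above says DAX support must exist in both the block device and the filesystem, and the mount needs -o dax. A minimal sketch of how a filesystem's direct-I/O entry point is then expected to route requests — assuming the IS_DAX() and dax_do_io() helpers added by fs/dax.c elsewhere in this series; example_get_block is a hypothetical stand-in for the filesystem's own get_block_t callback:

static int example_get_block(struct inode *inode, sector_t iblock,
			     struct buffer_head *bh_result, int create);

static ssize_t example_direct_IO(int rw, struct kiocb *iocb,
				 struct iov_iter *iter, loff_t offset)
{
	struct inode *inode = file_inode(iocb->ki_filp);

	if (IS_DAX(inode))
		/* memory-backed device: copy in place, bypass the pagecache */
		return dax_do_io(rw, iocb, inode, iter, offset,
				 example_get_block, NULL, DIO_LOCKING);

	return blockdev_direct_IO(rw, iocb, inode, iter, offset,
				  example_get_block);
}

Note the decision is per-inode (the S_DAX flag) rather than per-mount, so DAX and pagecache-backed inodes can in principle coexist on one superblock.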
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index c055d56ec63d..270c48148f79 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -149,13 +149,6 @@ config BINFMT_EM86
 	  later load the module when you want to use a Linux/Intel binary. The
 	  module will be called binfmt_em86.  If unsure, say Y.
 
-config BINFMT_SOM
-	tristate "Kernel support for SOM binaries"
-	depends on PARISC && HPUX
-	help
-	  SOM is a binary executable format inherited from HP/UX.  Say
-	  Y here to be able to load and execute SOM binaries directly.
-
 config BINFMT_MISC
 	tristate "Kernel support for MISC binaries"
 	---help---
diff --git a/fs/Makefile b/fs/Makefile
index bedff48e8fdc..a88ac4838c9e 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -28,6 +28,7 @@ obj-$(CONFIG_SIGNALFD) += signalfd.o
 obj-$(CONFIG_TIMERFD)		+= timerfd.o
 obj-$(CONFIG_EVENTFD)		+= eventfd.o
 obj-$(CONFIG_AIO)		+= aio.o
+obj-$(CONFIG_FS_DAX)		+= dax.o
 obj-$(CONFIG_FILE_LOCKING)	+= locks.o
 obj-$(CONFIG_COMPAT)		+= compat.o compat_ioctl.o
 obj-$(CONFIG_BINFMT_AOUT)	+= binfmt_aout.o
@@ -37,7 +38,6 @@ obj-$(CONFIG_BINFMT_SCRIPT) += binfmt_script.o
 obj-$(CONFIG_BINFMT_ELF)	+= binfmt_elf.o
 obj-$(CONFIG_COMPAT_BINFMT_ELF)	+= compat_binfmt_elf.o
 obj-$(CONFIG_BINFMT_ELF_FDPIC)	+= binfmt_elf_fdpic.o
-obj-$(CONFIG_BINFMT_SOM)	+= binfmt_som.o
 obj-$(CONFIG_BINFMT_FLAT)	+= binfmt_flat.o
 
 obj-$(CONFIG_FS_MBCACHE)	+= mbcache.o
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index ff44ff3ff015..c8764bd7497d 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -30,6 +30,8 @@
 #define AFFS_AC_SIZE	(AFFS_CACHE_SIZE/sizeof(struct affs_ext_key)/2)
 #define AFFS_AC_MASK	(AFFS_AC_SIZE-1)
 
+#define AFFSNAMEMAX 30U
+
 struct affs_ext_key {
 	u32	ext;	/* idx of the extended block */
 	u32	key;	/* block number */
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index c852f2fa1710..388da1ea815d 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -30,7 +30,7 @@ affs_insert_hash(struct inode *dir, struct buffer_head *bh)
 	ino = bh->b_blocknr;
 	offset = affs_hash_name(sb, AFFS_TAIL(sb, bh)->name + 1, AFFS_TAIL(sb, bh)->name[0]);
 
-	pr_debug("%s(dir=%u, ino=%d)\n", __func__, (u32)dir->i_ino, ino);
+	pr_debug("%s(dir=%lu, ino=%d)\n", __func__, dir->i_ino, ino);
 
 	dir_bh = affs_bread(sb, dir->i_ino);
 	if (!dir_bh)
@@ -80,8 +80,8 @@ affs_remove_hash(struct inode *dir, struct buffer_head *rem_bh)
 	sb = dir->i_sb;
 	rem_ino = rem_bh->b_blocknr;
 	offset = affs_hash_name(sb, AFFS_TAIL(sb, rem_bh)->name+1, AFFS_TAIL(sb, rem_bh)->name[0]);
-	pr_debug("%s(dir=%d, ino=%d, hashval=%d)\n",
-		 __func__, (u32)dir->i_ino, rem_ino, offset);
+	pr_debug("%s(dir=%lu, ino=%d, hashval=%d)\n", __func__, dir->i_ino,
+		 rem_ino, offset);
 
 	bh = affs_bread(sb, dir->i_ino);
 	if (!bh)
@@ -483,11 +483,10 @@ affs_check_name(const unsigned char *name, int len, bool notruncate)
 {
 	int	 i;
 
-	if (len > 30) {
+	if (len > AFFSNAMEMAX) {
 		if (notruncate)
 			return -ENAMETOOLONG;
-		else
-			len = 30;
+		len = AFFSNAMEMAX;
 	}
 	for (i = 0; i < len; i++) {
 		if (name[i] < ' ' || name[i] == ':'
@@ -508,7 +507,7 @@ affs_check_name(const unsigned char *name, int len, bool notruncate)
 int
 affs_copy_name(unsigned char *bstr, struct dentry *dentry)
 {
-	int len = min(dentry->d_name.len, 30u);
+	u32 len = min(dentry->d_name.len, AFFSNAMEMAX);
 
 	*bstr++ = len;
 	memcpy(bstr, dentry->d_name.name, len);
diff --git a/fs/affs/bitmap.c b/fs/affs/bitmap.c
index c8de51185c23..675148950fed 100644
--- a/fs/affs/bitmap.c
+++ b/fs/affs/bitmap.c
@@ -99,7 +99,6 @@ err_bh_read:
 
 err_range:
 	affs_error(sb, "affs_free_block","Block %u outside partition", block);
-	return;
 }
 
 /*
diff --git a/fs/affs/dir.c b/fs/affs/dir.c
index 59f07bec92a6..ac4f318aafba 100644
--- a/fs/affs/dir.c
+++ b/fs/affs/dir.c
@@ -54,8 +54,7 @@ affs_readdir(struct file *file, struct dir_context *ctx)
 	u32	ino;
 	int	error = 0;
 
-	pr_debug("%s(ino=%lu,f_pos=%lx)\n",
-		 __func__, inode->i_ino, (unsigned long)ctx->pos);
+	pr_debug("%s(ino=%lu,f_pos=%llx)\n", __func__, inode->i_ino, ctx->pos);
 
 	if (ctx->pos < 2) {
 		file->private_data = (void *)0;
@@ -115,11 +114,11 @@ inside:
 			break;
 		}
 
-		namelen = min(AFFS_TAIL(sb, fh_bh)->name[0], (u8)30);
+		namelen = min(AFFS_TAIL(sb, fh_bh)->name[0],
+			      (u8)AFFSNAMEMAX);
 		name = AFFS_TAIL(sb, fh_bh)->name + 1;
-		pr_debug("readdir(): dir_emit(\"%.*s\", "
-			 "ino=%u), hash=%d, f_pos=%x\n",
-			 namelen, name, ino, hash_pos, (u32)ctx->pos);
+		pr_debug("readdir(): dir_emit(\"%.*s\", ino=%u), hash=%d, f_pos=%llx\n",
+			 namelen, name, ino, hash_pos, ctx->pos);
 
 		if (!dir_emit(ctx, name, namelen, ino, DT_UNKNOWN))
 			goto done;
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 8faa6593ca6d..d2468bf95669 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -180,8 +180,7 @@ affs_get_extblock_slow(struct inode *inode, u32 ext)
 	ext_key = be32_to_cpu(AFFS_TAIL(sb, bh)->extension);
 	if (ext < AFFS_I(inode)->i_extcnt)
 		goto read_ext;
-	if (ext > AFFS_I(inode)->i_extcnt)
-		BUG();
+	BUG_ON(ext > AFFS_I(inode)->i_extcnt);
 	bh = affs_alloc_extblock(inode, bh, ext);
 	if (IS_ERR(bh))
 		return bh;
@@ -198,8 +197,7 @@ affs_get_extblock_slow(struct inode *inode, u32 ext)
 		struct buffer_head *prev_bh;
 
 		/* allocate a new extended block */
-		if (ext > AFFS_I(inode)->i_extcnt)
-			BUG();
+		BUG_ON(ext > AFFS_I(inode)->i_extcnt);
 
 		/* get previous extended block */
 		prev_bh = affs_get_extblock(inode, ext - 1);
@@ -299,8 +297,8 @@ affs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_resul
 	struct buffer_head	*ext_bh;
 	u32	 ext;
 
-	pr_debug("%s(%u, %lu)\n",
-		 __func__, (u32)inode->i_ino, (unsigned long)block);
+	pr_debug("%s(%lu, %llu)\n", __func__, inode->i_ino,
+		 (unsigned long long)block);
 
 	BUG_ON(block > (sector_t)0x7fffffffUL);
 
@@ -330,8 +328,9 @@ affs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_resul
 
 	/* store new block */
 	if (bh_result->b_blocknr)
-		affs_warning(sb, "get_block", "block already set (%lx)",
-			     (unsigned long)bh_result->b_blocknr);
+		affs_warning(sb, "get_block",
+			     "block already set (%llx)",
+			     (unsigned long long)bh_result->b_blocknr);
 	AFFS_BLOCK(sb, ext_bh, block) = cpu_to_be32(blocknr);
 	AFFS_HEAD(ext_bh)->block_count = cpu_to_be32(block + 1);
 	affs_adjust_checksum(ext_bh, blocknr - bh_result->b_blocknr + 1);
@@ -353,8 +352,8 @@ affs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_resul
 	return 0;
 
 err_big:
-	affs_error(inode->i_sb, "get_block", "strange block request %d",
-		   (int)block);
+	affs_error(inode->i_sb, "get_block", "strange block request %llu",
+		   (unsigned long long)block);
 	return -EIO;
 err_ext:
 	// unlock cache
@@ -399,6 +398,13 @@ affs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
 	size_t count = iov_iter_count(iter);
 	ssize_t ret;
 
+	if (rw == WRITE) {
+		loff_t size = offset + count;
+
+		if (AFFS_I(inode)->mmu_private < size)
+			return 0;
+	}
+
 	ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, affs_get_block);
 	if (ret < 0 && (rw & WRITE))
 		affs_write_failed(mapping, offset + count);
@@ -503,7 +509,7 @@ affs_do_readpage_ofs(struct page *page, unsigned to)
 	u32 bidx, boff, bsize;
 	u32 tmp;
 
-	pr_debug("%s(%u, %ld, 0, %d)\n", __func__, (u32)inode->i_ino,
+	pr_debug("%s(%lu, %ld, 0, %d)\n", __func__, inode->i_ino,
 		 page->index, to);
 	BUG_ON(to > PAGE_CACHE_SIZE);
 	kmap(page);
@@ -539,7 +545,7 @@ affs_extent_file_ofs(struct inode *inode, u32 newsize)
 	u32 size, bsize;
 	u32 tmp;
 
-	pr_debug("%s(%u, %d)\n", __func__, (u32)inode->i_ino, newsize);
+	pr_debug("%s(%lu, %d)\n", __func__, inode->i_ino, newsize);
 	bsize = AFFS_SB(sb)->s_data_blksize;
 	bh = NULL;
 	size = AFFS_I(inode)->mmu_private;
@@ -608,7 +614,7 @@ affs_readpage_ofs(struct file *file, struct page *page)
 	u32 to;
 	int err;
 
-	pr_debug("%s(%u, %ld)\n", __func__, (u32)inode->i_ino, page->index);
+	pr_debug("%s(%lu, %ld)\n", __func__, inode->i_ino, page->index);
 	to = PAGE_CACHE_SIZE;
 	if (((page->index + 1) << PAGE_CACHE_SHIFT) > inode->i_size) {
 		to = inode->i_size & ~PAGE_CACHE_MASK;
@@ -631,8 +637,8 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping
 	pgoff_t index;
 	int err = 0;
 
-	pr_debug("%s(%u, %llu, %llu)\n", __func__, (u32)inode->i_ino,
-		 (unsigned long long)pos, (unsigned long long)pos + len);
+	pr_debug("%s(%lu, %llu, %llu)\n", __func__, inode->i_ino, pos,
+		 pos + len);
 	if (pos > AFFS_I(inode)->mmu_private) {
 		/* XXX: this probably leaves a too-big i_size in case of
 		 * failure. Should really be updating i_size at write_end time
@@ -681,9 +687,8 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
 	 * due to write_begin.
 	 */
 
-	pr_debug("%s(%u, %llu, %llu)\n",
-		 __func__, (u32)inode->i_ino, (unsigned long long)pos,
-		 (unsigned long long)pos + len);
+	pr_debug("%s(%lu, %llu, %llu)\n", __func__, inode->i_ino, pos,
+		 pos + len);
 	bsize = AFFS_SB(sb)->s_data_blksize;
 	data = page_address(page);
 
@@ -831,8 +836,8 @@ affs_truncate(struct inode *inode)
 	struct buffer_head	*ext_bh;
 	int	 i;
 
-	pr_debug("truncate(inode=%d, oldsize=%u, newsize=%u)\n",
-		 (u32)inode->i_ino, (u32)AFFS_I(inode)->mmu_private, (u32)inode->i_size);
+	pr_debug("truncate(inode=%lu, oldsize=%llu, newsize=%llu)\n",
+		 inode->i_ino, AFFS_I(inode)->mmu_private, inode->i_size);
 
 	last_blk = 0;
 	ext = 0;
@@ -863,7 +868,7 @@ affs_truncate(struct inode *inode)
 		if (IS_ERR(ext_bh)) {
 			affs_warning(sb, "truncate",
 				     "unexpected read error for ext block %u (%ld)",
-				     (unsigned int)ext, PTR_ERR(ext_bh));
+				     ext, PTR_ERR(ext_bh));
 			return;
 		}
 		if (AFFS_I(inode)->i_lc) {
@@ -911,7 +916,7 @@ affs_truncate(struct inode *inode)
 		if (IS_ERR(bh)) {
 			affs_warning(sb, "truncate",
 				     "unexpected read error for last block %u (%ld)",
-				     (unsigned int)ext, PTR_ERR(bh));
+				     ext, PTR_ERR(bh));
 			return;
 		}
 		tmp = be32_to_cpu(AFFS_DATA_HEAD(bh)->next);
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index d0609a282e1d..6f34510449e8 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -13,8 +13,6 @@
 #include <linux/gfp.h>
 #include "affs.h"
 
-extern const struct inode_operations affs_symlink_inode_operations;
-
 struct inode *affs_iget(struct super_block *sb, unsigned long ino)
 {
 	struct affs_sb_info	*sbi = AFFS_SB(sb);
@@ -348,9 +346,8 @@ affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s3
 	u32 block = 0;
 	int retval;
 
-	pr_debug("%s(dir=%u, inode=%u, \"%pd\", type=%d)\n",
-		 __func__, (u32)dir->i_ino,
-		 (u32)inode->i_ino, dentry, type);
+	pr_debug("%s(dir=%lu, inode=%lu, \"%pd\", type=%d)\n", __func__,
+		 dir->i_ino, inode->i_ino, dentry, type);
 
 	retval = -EIO;
 	bh = affs_bread(sb, inode->i_ino);
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index bbc38530e924..ffb7bd82c2a5 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -64,15 +64,16 @@ __affs_hash_dentry(struct qstr *qstr, toupper_t toupper, bool notruncate)
 {
 	const u8 *name = qstr->name;
 	unsigned long hash;
-	int i;
+	int retval;
+	u32 len;
 
-	i = affs_check_name(qstr->name, qstr->len, notruncate);
-	if (i)
-		return i;
+	retval = affs_check_name(qstr->name, qstr->len, notruncate);
+	if (retval)
+		return retval;
 
 	hash = init_name_hash();
-	i = min(qstr->len, 30u);
-	for (; i > 0; name++, i--)
+	len = min(qstr->len, AFFSNAMEMAX);
+	for (; len > 0; name++, len--)
 		hash = partial_name_hash(toupper(*name), hash);
 	qstr->hash = end_name_hash(hash);
 
@@ -114,10 +115,10 @@ static inline int __affs_compare_dentry(unsigned int len,
 	 * If the names are longer than the allowed 30 chars,
 	 * the excess is ignored, so their length may differ.
 	 */
-	if (len >= 30) {
-		if (name->len < 30)
+	if (len >= AFFSNAMEMAX) {
+		if (name->len < AFFSNAMEMAX)
 			return 1;
-		len = 30;
+		len = AFFSNAMEMAX;
 	} else if (len != name->len)
 		return 1;
 
@@ -156,10 +157,10 @@ affs_match(struct dentry *dentry, const u8 *name2, toupper_t toupper)
 	const u8 *name = dentry->d_name.name;
 	int len = dentry->d_name.len;
 
-	if (len >= 30) {
-		if (*name2 < 30)
+	if (len >= AFFSNAMEMAX) {
+		if (*name2 < AFFSNAMEMAX)
 			return 0;
-		len = 30;
+		len = AFFSNAMEMAX;
 	} else if (len != *name2)
 		return 0;
 
@@ -173,9 +174,9 @@ int
 affs_hash_name(struct super_block *sb, const u8 *name, unsigned int len)
 {
 	toupper_t toupper = affs_get_toupper(sb);
-	int hash;
+	u32 hash;
 
-	hash = len = min(len, 30u);
+	hash = len = min(len, AFFSNAMEMAX);
 	for (; len > 0; len--)
 		hash = (hash * 13 + toupper(*name++)) & 0x7ff;
 
@@ -248,9 +249,8 @@ affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
 int
 affs_unlink(struct inode *dir, struct dentry *dentry)
 {
-	pr_debug("%s(dir=%d, %lu \"%pd\")\n",
-		 __func__, (u32)dir->i_ino, dentry->d_inode->i_ino,
-		 dentry);
+	pr_debug("%s(dir=%lu, %lu \"%pd\")\n", __func__, dir->i_ino,
+		 dentry->d_inode->i_ino, dentry);
 
 	return affs_remove_header(dentry);
 }
@@ -317,9 +317,8 @@ affs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 int
 affs_rmdir(struct inode *dir, struct dentry *dentry)
 {
-	pr_debug("%s(dir=%u, %lu \"%pd\")\n",
-		 __func__, (u32)dir->i_ino, dentry->d_inode->i_ino,
-		 dentry);
+	pr_debug("%s(dir=%lu, %lu \"%pd\")\n", __func__, dir->i_ino,
+		 dentry->d_inode->i_ino, dentry);
 
 	return affs_remove_header(dentry);
 }
@@ -404,8 +403,7 @@ affs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
 {
 	struct inode *inode = old_dentry->d_inode;
 
-	pr_debug("%s(%u, %u, \"%pd\")\n",
-		 __func__, (u32)inode->i_ino, (u32)dir->i_ino,
+	pr_debug("%s(%lu, %lu, \"%pd\")\n", __func__, inode->i_ino, dir->i_ino,
 		 dentry);
 
 	return affs_add_entry(dir, inode, dentry, ST_LINKFILE);
@@ -419,9 +417,8 @@ affs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct buffer_head *bh = NULL;
 	int retval;
 
-	pr_debug("%s(old=%u,\"%pd\" to new=%u,\"%pd\")\n",
-		 __func__, (u32)old_dir->i_ino, old_dentry,
-		 (u32)new_dir->i_ino, new_dentry);
+	pr_debug("%s(old=%lu,\"%pd\" to new=%lu,\"%pd\")\n", __func__,
+		 old_dir->i_ino, old_dentry, new_dir->i_ino, new_dentry);
 
 	retval = affs_check_name(new_dentry->d_name.name,
 				 new_dentry->d_name.len,
diff --git a/fs/affs/super.c b/fs/affs/super.c
index f754ab68a840..4cf0e9113fb6 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -432,39 +432,39 @@ got_root:
 		sb->s_flags |= MS_RDONLY;
 	}
 	switch (chksum) {
-		case MUFS_FS:
-		case MUFS_INTLFFS:
-		case MUFS_DCFFS:
-			sbi->s_flags |= SF_MUFS;
-			/* fall thru */
-		case FS_INTLFFS:
-		case FS_DCFFS:
-			sbi->s_flags |= SF_INTL;
-			break;
-		case MUFS_FFS:
-			sbi->s_flags |= SF_MUFS;
-			break;
-		case FS_FFS:
-			break;
-		case MUFS_OFS:
-			sbi->s_flags |= SF_MUFS;
-			/* fall thru */
-		case FS_OFS:
-			sbi->s_flags |= SF_OFS;
-			sb->s_flags |= MS_NOEXEC;
-			break;
-		case MUFS_DCOFS:
-		case MUFS_INTLOFS:
-			sbi->s_flags |= SF_MUFS;
-		case FS_DCOFS:
-		case FS_INTLOFS:
-			sbi->s_flags |= SF_INTL | SF_OFS;
-			sb->s_flags |= MS_NOEXEC;
-			break;
-		default:
-			pr_err("Unknown filesystem on device %s: %08X\n",
-			       sb->s_id, chksum);
-			return -EINVAL;
+	case MUFS_FS:
+	case MUFS_INTLFFS:
+	case MUFS_DCFFS:
+		sbi->s_flags |= SF_MUFS;
+		/* fall thru */
+	case FS_INTLFFS:
+	case FS_DCFFS:
+		sbi->s_flags |= SF_INTL;
+		break;
+	case MUFS_FFS:
+		sbi->s_flags |= SF_MUFS;
+		break;
+	case FS_FFS:
+		break;
+	case MUFS_OFS:
+		sbi->s_flags |= SF_MUFS;
+		/* fall thru */
+	case FS_OFS:
+		sbi->s_flags |= SF_OFS;
+		sb->s_flags |= MS_NOEXEC;
+		break;
+	case MUFS_DCOFS:
+	case MUFS_INTLOFS:
+		sbi->s_flags |= SF_MUFS;
+	case FS_DCOFS:
+	case FS_INTLOFS:
+		sbi->s_flags |= SF_INTL | SF_OFS;
+		sb->s_flags |= MS_NOEXEC;
+		break;
+	default:
+		pr_err("Unknown filesystem on device %s: %08X\n",
+		       sb->s_id, chksum);
+		return -EINVAL;
 	}
 
 	if (mount_flags & SF_VERBOSE) {
@@ -584,7 +584,7 @@ affs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_bavail  = free;
 	buf->f_fsid.val[0] = (u32)id;
 	buf->f_fsid.val[1] = (u32)(id >> 32);
-	buf->f_namelen = 30;
+	buf->f_namelen = AFFSNAMEMAX;
 	return 0;
 }
 
@@ -602,6 +602,7 @@ static void affs_kill_sb(struct super_block *sb)
 		affs_free_bitmap(sb);
 		affs_brelse(sbi->s_root_bh);
 		kfree(sbi->s_prefix);
+		mutex_destroy(&sbi->s_bmlock);
 		kfree(sbi);
 	}
 }
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 06e14bfb3496..dbc732e9a5c0 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -306,8 +306,8 @@ static int afs_send_pages(struct afs_call *call, struct msghdr *msg,
 
 		_debug("- range %u-%u%s",
 		       offset, to, msg->msg_flags ? " [more]" : "");
-		iov_iter_init(&msg->msg_iter, WRITE,
-			      (struct iovec *) iov, 1, to - offset);
+		iov_iter_kvec(&msg->msg_iter, WRITE | ITER_KVEC,
+			      iov, 1, to - offset);
 
 		/* have to change the state *before* sending the last
 		 * packet as RxRPC might give us the reply before it
@@ -384,7 +384,7 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
 
 	msg.msg_name		= NULL;
 	msg.msg_namelen		= 0;
-	iov_iter_init(&msg.msg_iter, WRITE, (struct iovec *)iov, 1,
+	iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, iov, 1,
 		      call->request_size);
 	msg.msg_control		= NULL;
 	msg.msg_controllen	= 0;
@@ -770,7 +770,7 @@ static int afs_deliver_cm_op_id(struct afs_call *call, struct sk_buff *skb,
 void afs_send_empty_reply(struct afs_call *call)
 {
 	struct msghdr msg;
-	struct iovec iov[1];
+	struct kvec iov[1];
 
 	_enter("");
 
@@ -778,7 +778,7 @@ void afs_send_empty_reply(struct afs_call *call)
 	iov[0].iov_len		= 0;
 	msg.msg_name		= NULL;
 	msg.msg_namelen		= 0;
-	iov_iter_init(&msg.msg_iter, WRITE, iov, 0, 0);	/* WTF? */
+	iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, iov, 0, 0);	/* WTF? */
 	msg.msg_control		= NULL;
 	msg.msg_controllen	= 0;
 	msg.msg_flags		= 0;
@@ -805,7 +805,7 @@ void afs_send_empty_reply(struct afs_call *call)
 void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
 {
 	struct msghdr msg;
-	struct iovec iov[1];
+	struct kvec iov[1];
 	int n;
 
 	_enter("");
@@ -814,7 +814,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
 	iov[0].iov_len		= len;
 	msg.msg_name		= NULL;
 	msg.msg_namelen		= 0;
-	iov_iter_init(&msg.msg_iter, WRITE, iov, 1, len);
+	iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, iov, 1, len);
 	msg.msg_control		= NULL;
 	msg.msg_controllen	= 0;
 	msg.msg_flags		= 0;
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 2b607257820c..d142a2449e65 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -106,7 +106,7 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params)
 	volume->cell		= params->cell;
 	volume->vid		= vlocation->vldb.vid[params->type];
 
-	ret = bdi_setup_and_register(&volume->bdi, "afs", BDI_CAP_MAP_COPY);
+	ret = bdi_setup_and_register(&volume->bdi, "afs");
 	if (ret)
 		goto error_bdi;
 
diff --git a/fs/aio.c b/fs/aio.c
index c428871f1093..118a2e0088d8 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -165,15 +165,6 @@ static struct vfsmount *aio_mnt;
 static const struct file_operations aio_ring_fops;
 static const struct address_space_operations aio_ctx_aops;
 
-/* Backing dev info for aio fs.
- * -no dirty page accounting or writeback happens
- */
-static struct backing_dev_info aio_fs_backing_dev_info = {
-	.name           = "aiofs",
-	.state          = 0,
-	.capabilities   = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_MAP_COPY,
-};
-
 static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
 {
 	struct qstr this = QSTR_INIT("[aio]", 5);
@@ -185,7 +176,6 @@ static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
 
 	inode->i_mapping->a_ops = &aio_ctx_aops;
 	inode->i_mapping->private_data = ctx;
-	inode->i_mapping->backing_dev_info = &aio_fs_backing_dev_info;
 	inode->i_size = PAGE_SIZE * nr_pages;
 
 	path.dentry = d_alloc_pseudo(aio_mnt->mnt_sb, &this);
@@ -230,9 +220,6 @@ static int __init aio_setup(void)
 	if (IS_ERR(aio_mnt))
 		panic("Failed to create aio fs mount.");
 
-	if (bdi_init(&aio_fs_backing_dev_info))
-		panic("Failed to init aio fs backing dev info.");
-
 	kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
 	kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
 
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index edf47774b03d..e089f1985fca 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -274,9 +274,9 @@ more:
 static struct inode *
 befs_alloc_inode(struct super_block *sb)
 {
 	struct befs_inode_info *bi;
-	bi = (struct befs_inode_info *)kmem_cache_alloc(befs_inode_cachep,
-							GFP_KERNEL);
+
+	bi = kmem_cache_alloc(befs_inode_cachep, GFP_KERNEL);
 	if (!bi)
 		return NULL;
 	return &bi->vfs_inode;
diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c
deleted file mode 100644
index 4e00ed68d4a6..000000000000
--- a/fs/binfmt_som.c
+++ /dev/null
@@ -1,299 +0,0 @@
-/*
- * linux/fs/binfmt_som.c
- *
- * These are the functions used to load SOM format executables as used
- * by HP-UX.
- *
- * Copyright 1999 Matthew Wilcox <willy@bofh.ai>
- * based on binfmt_elf which is
- * Copyright 1993, 1994: Eric Youngdale (ericy@cais.com).
- */
-
-#include <linux/module.h>
-
-#include <linux/fs.h>
-#include <linux/stat.h>
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/mman.h>
-#include <linux/errno.h>
-#include <linux/signal.h>
-#include <linux/binfmts.h>
-#include <linux/som.h>
-#include <linux/string.h>
-#include <linux/file.h>
-#include <linux/fcntl.h>
-#include <linux/ptrace.h>
-#include <linux/slab.h>
-#include <linux/shm.h>
-#include <linux/personality.h>
-#include <linux/init.h>
-
-#include <asm/uaccess.h>
-#include <asm/pgtable.h>
-
-
-#include <linux/elf.h>
-
-static int load_som_binary(struct linux_binprm * bprm);
-static int load_som_library(struct file *);
-
-/*
- * If we don't support core dumping, then supply a NULL so we
- * don't even try.
- */
-#if 0
-static int som_core_dump(struct coredump_params *cprm);
-#else
-#define som_core_dump	NULL
-#endif
-
-#define SOM_PAGESTART(_v) ((_v) & ~(unsigned long)(SOM_PAGESIZE-1))
-#define SOM_PAGEOFFSET(_v) ((_v) & (SOM_PAGESIZE-1))
-#define SOM_PAGEALIGN(_v) (((_v) + SOM_PAGESIZE - 1) & ~(SOM_PAGESIZE - 1))
-
-static struct linux_binfmt som_format = {
-	.module		= THIS_MODULE,
-	.load_binary	= load_som_binary,
-	.load_shlib	= load_som_library,
-	.core_dump	= som_core_dump,
-	.min_coredump	= SOM_PAGESIZE
-};
-
-/*
- * create_som_tables() parses the env- and arg-strings in new user
- * memory and creates the pointer tables from them, and puts their
- * addresses on the "stack", returning the new stack pointer value.
- */
-static void create_som_tables(struct linux_binprm *bprm)
-{
-	char **argv, **envp;
-	int argc = bprm->argc;
-	int envc = bprm->envc;
-	unsigned long p;
-	unsigned long *sp;
-
-	/* Word-align the stack pointer */
-	sp = (unsigned long *)((bprm->p + 3) & ~3);
-
-	envp = (char **) sp;
-	sp += envc + 1;
-	argv = (char **) sp;
-	sp += argc + 1;
-
-	__put_user((unsigned long) envp,++sp);
-	__put_user((unsigned long) argv,++sp);
-
-	__put_user(argc, ++sp);
-
-	bprm->p = (unsigned long) sp;
-
-	p = current->mm->arg_start;
-	while (argc-- > 0) {
-		__put_user((char *)p,argv++);
-		p += strlen_user((char *)p);
-	}
-	__put_user(NULL, argv);
-	current->mm->arg_end = current->mm->env_start = p;
-	while (envc-- > 0) {
-		__put_user((char *)p,envp++);
-		p += strlen_user((char *)p);
-	}
-	__put_user(NULL, envp);
-	current->mm->env_end = p;
-}
-
-static int check_som_header(struct som_hdr *som_ex)
-{
-	int *buf = (int *)som_ex;
-	int i, ck;
-
-	if (som_ex->system_id != SOM_SID_PARISC_1_0 &&
-	    som_ex->system_id != SOM_SID_PARISC_1_1 &&
-	    som_ex->system_id != SOM_SID_PARISC_2_0)
-		return -ENOEXEC;
-
-	if (som_ex->a_magic != SOM_EXEC_NONSHARE &&
-	    som_ex->a_magic != SOM_EXEC_SHARE &&
-	    som_ex->a_magic != SOM_EXEC_DEMAND)
-		return -ENOEXEC;
-
-	if (som_ex->version_id != SOM_ID_OLD &&
-	    som_ex->version_id != SOM_ID_NEW)
-		return -ENOEXEC;
-
-	ck = 0;
-	for (i=0; i<32; i++)
-		ck ^= buf[i];
-	if (ck != 0)
-		return -ENOEXEC;
-
-	return 0;
-}
-
-static int map_som_binary(struct file *file,
-		const struct som_exec_auxhdr *hpuxhdr)
-{
-	unsigned long code_start, code_size, data_start, data_size;
-	unsigned long bss_start, som_brk;
-	int retval;
-	int prot = PROT_READ | PROT_EXEC;
-	int flags = MAP_FIXED|MAP_PRIVATE|MAP_DENYWRITE|MAP_EXECUTABLE;
-
-	mm_segment_t old_fs = get_fs();
-	set_fs(get_ds());
-
-	code_start = SOM_PAGESTART(hpuxhdr->exec_tmem);
-	code_size = SOM_PAGEALIGN(hpuxhdr->exec_tsize);
-	current->mm->start_code = code_start;
-	current->mm->end_code = code_start + code_size;
-	retval = vm_mmap(file, code_start, code_size, prot,
-			flags, SOM_PAGESTART(hpuxhdr->exec_tfile));
-	if (retval < 0 && retval > -1024)
-		goto out;
-
-	data_start = SOM_PAGESTART(hpuxhdr->exec_dmem);
-	data_size = SOM_PAGEALIGN(hpuxhdr->exec_dsize);
-	current->mm->start_data = data_start;
-	current->mm->end_data = bss_start = data_start + data_size;
-	retval = vm_mmap(file, data_start, data_size,
-			prot | PROT_WRITE, flags,
-			SOM_PAGESTART(hpuxhdr->exec_dfile));
-	if (retval < 0 && retval > -1024)
-		goto out;
-
-	som_brk = bss_start + SOM_PAGEALIGN(hpuxhdr->exec_bsize);
-	current->mm->start_brk = current->mm->brk = som_brk;
-	retval = vm_mmap(NULL, bss_start, som_brk - bss_start,
-			prot | PROT_WRITE, MAP_FIXED | MAP_PRIVATE, 0);
-	if (retval > 0 || retval < -1024)
-		retval = 0;
-out:
-	set_fs(old_fs);
-	return retval;
-}
-
-
-/*
- * These are the functions used to load SOM executables and shared
- * libraries.  There is no binary dependent code anywhere else.
- */
-
-static int
-load_som_binary(struct linux_binprm * bprm)
-{
-	int retval;
-	unsigned int size;
-	unsigned long som_entry;
-	struct som_hdr *som_ex;
-	struct som_exec_auxhdr *hpuxhdr;
-	struct pt_regs *regs = current_pt_regs();
-
-	/* Get the exec-header */
-	som_ex = (struct som_hdr *) bprm->buf;
-
-	retval = check_som_header(som_ex);
-	if (retval != 0)
-		goto out;
-
-	/* Now read in the auxiliary header information */
-
-	retval = -ENOMEM;
-	size = som_ex->aux_header_size;
-	if (size > SOM_PAGESIZE)
-		goto out;
-	hpuxhdr = kmalloc(size, GFP_KERNEL);
-	if (!hpuxhdr)
-		goto out;
-
-	retval = kernel_read(bprm->file, som_ex->aux_header_location,
-			(char *) hpuxhdr, size);
-	if (retval != size) {
-		if (retval >= 0)
-			retval = -EIO;
-		goto out_free;
-	}
-
-	/* Flush all traces of the currently running executable */
-	retval = flush_old_exec(bprm);
-	if (retval)
-		goto out_free;
-
-	/* OK, This is the point of no return */
-	current->personality = PER_HPUX;
-	setup_new_exec(bprm);
-
-	/* Set the task size for HP-UX processes such that
-	 * the gateway page is outside the address space.
-	 * This can be fixed later, but for now, this is much
-	 * easier.
-	 */
-
-	current->thread.task_size = 0xc0000000;
-
-	/* Set map base to allow enough room for hp-ux heap growth */
-
-	current->thread.map_base = 0x80000000;
-
-	retval = map_som_binary(bprm->file, hpuxhdr);
-	if (retval < 0)
-		goto out_free;
-
-	som_entry = hpuxhdr->exec_entry;
-	kfree(hpuxhdr);
-
-	set_binfmt(&som_format);
-	install_exec_creds(bprm);
-	setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT);
-
-	create_som_tables(bprm);
-
-	current->mm->start_stack = bprm->p;
-
-#if 0
-	printk("(start_brk) %08lx\n" , (unsigned long) current->mm->start_brk);
-	printk("(end_code) %08lx\n" , (unsigned long) current->mm->end_code);
-	printk("(start_code) %08lx\n" , (unsigned long) current->mm->start_code);
-	printk("(end_data) %08lx\n" , (unsigned long) current->mm->end_data);
-	printk("(start_stack) %08lx\n" , (unsigned long) current->mm->start_stack);
-	printk("(brk) %08lx\n" , (unsigned long) current->mm->brk);
-#endif
-
-	map_hpux_gateway_page(current,current->mm);
-
-	start_thread_som(regs, som_entry, bprm->p);
-	return 0;
-
-	/* error cleanup */
-out_free:
-	kfree(hpuxhdr);
-out:
-	return retval;
-}
-
-static int load_som_library(struct file *f)
-{
-/* No lib support in SOM yet.  gizza chance.. */
-	return -ENOEXEC;
-}
-	/* Install the SOM loader.
-	 * N.B. We *rely* on the table being the right size with the
-	 * right number of free slots...
-	 */
-
-static int __init init_som_binfmt(void)
-{
-	register_binfmt(&som_format);
-	return 0;
-}
-
-static void __exit exit_som_binfmt(void)
-{
-	/* Remove the SOM loader. */
-	unregister_binfmt(&som_format);
-}
-
-core_initcall(init_som_binfmt);
-module_exit(exit_som_binfmt);
-
-MODULE_LICENSE("GPL");
diff --git a/fs/block_dev.c b/fs/block_dev.c
index b48c41bf0f86..975266be67d3 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -49,23 +49,15 @@ inline struct block_device *I_BDEV(struct inode *inode)
 }
 EXPORT_SYMBOL(I_BDEV);
 
-/*
- * Move the inode from its current bdi to a new bdi. Make sure the inode
- * is clean before moving so that it doesn't linger on the old bdi.
- */
-static void bdev_inode_switch_bdi(struct inode *inode,
-			struct backing_dev_info *dst)
+static void bdev_write_inode(struct inode *inode)
 {
-	while (true) {
-		spin_lock(&inode->i_lock);
-		if (!(inode->i_state & I_DIRTY)) {
-			inode->i_data.backing_dev_info = dst;
-			spin_unlock(&inode->i_lock);
-			return;
-		}
+	spin_lock(&inode->i_lock);
+	while (inode->i_state & I_DIRTY) {
 		spin_unlock(&inode->i_lock);
 		WARN_ON_ONCE(write_inode_now(inode, true));
+		spin_lock(&inode->i_lock);
 	}
+	spin_unlock(&inode->i_lock);
 }
 
 /* Kill _all_ buffers and pagecache , dirty or not.. */
@@ -429,6 +421,46 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
 }
 EXPORT_SYMBOL_GPL(bdev_write_page);
 
+/**
+ * bdev_direct_access() - Get the address for directly-accessibly memory
+ * @bdev: The device containing the memory
+ * @sector: The offset within the device
+ * @addr: Where to put the address of the memory
+ * @pfn: The Page Frame Number for the memory
+ * @size: The number of bytes requested
+ *
+ * If a block device is made up of directly addressable memory, this function
+ * will tell the caller the PFN and the address of the memory.  The address
+ * may be directly dereferenced within the kernel without the need to call
+ * ioremap(), kmap() or similar.  The PFN is suitable for inserting into
+ * page tables.
+ *
+ * Return: negative errno if an error occurs, otherwise the number of bytes
+ * accessible at this address.
+ */
+long bdev_direct_access(struct block_device *bdev, sector_t sector,
+			void **addr, unsigned long *pfn, long size)
+{
+	long avail;
+	const struct block_device_operations *ops = bdev->bd_disk->fops;
+
+	if (size < 0)
+		return size;
+	if (!ops->direct_access)
+		return -EOPNOTSUPP;
+	if ((sector + DIV_ROUND_UP(size, 512)) >
+					part_nr_sects_read(bdev->bd_part))
+		return -ERANGE;
+	sector += get_start_sect(bdev);
+	if (sector % (PAGE_SIZE / 512))
+		return -EINVAL;
+	avail = ops->direct_access(bdev, sector, addr, pfn, size);
+	if (!avail)
+		return -ERANGE;
+	return min(avail, size);
+}
+EXPORT_SYMBOL_GPL(bdev_direct_access);
+
 /*
  * pseudo-fs
  */
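A minimal caller sketch for the helper added above (hypothetical, not part of the patch): it assumes a page-aligned sector within the partition and a driver that actually implements ->direct_access, per the checks in bdev_direct_access().

static int example_read_dax_page(struct block_device *bdev,
				 sector_t sector, void *dst)
{
	void *addr;
	unsigned long pfn;
	long avail;

	avail = bdev_direct_access(bdev, sector, &addr, &pfn, PAGE_SIZE);
	if (avail < 0)
		return avail;		/* -EOPNOTSUPP, -ERANGE or -EINVAL */
	if (avail < PAGE_SIZE)
		return -ERANGE;		/* device ended inside the page */
	/* per the kernel-doc, addr may be dereferenced directly */
	memcpy(dst, addr, PAGE_SIZE);
	return 0;
}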
@@ -584,7 +616,6 @@ struct block_device *bdget(dev_t dev)
 		inode->i_bdev = bdev;
 		inode->i_data.a_ops = &def_blk_aops;
 		mapping_set_gfp_mask(&inode->i_data, GFP_USER);
-		inode->i_data.backing_dev_info = &default_backing_dev_info;
 		spin_lock(&bdev_lock);
 		list_add(&bdev->bd_list, &all_bdevs);
 		spin_unlock(&bdev_lock);
@@ -1145,8 +1176,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 		bdev->bd_queue = disk->queue;
 		bdev->bd_contains = bdev;
 		if (!partno) {
-			struct backing_dev_info *bdi;
-
 			ret = -ENXIO;
 			bdev->bd_part = disk_get_part(disk, partno);
 			if (!bdev->bd_part)
@@ -1172,11 +1201,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 			}
 		}
 
-		if (!ret) {
+		if (!ret)
 			bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
-			bdi = blk_get_backing_dev_info(bdev);
-			bdev_inode_switch_bdi(bdev->bd_inode, bdi);
-		}
 
 		/*
 		 * If the device is invalidated, rescan partition
@@ -1203,8 +1229,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 		if (ret)
 			goto out_clear;
 		bdev->bd_contains = whole;
-		bdev_inode_switch_bdi(bdev->bd_inode,
-			whole->bd_inode->i_data.backing_dev_info);
 		bdev->bd_part = disk_get_part(disk, partno);
 		if (!(disk->flags & GENHD_FL_UP) ||
 		    !bdev->bd_part || !bdev->bd_part->nr_sects) {
@@ -1244,7 +1268,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 	bdev->bd_disk = NULL;
 	bdev->bd_part = NULL;
 	bdev->bd_queue = NULL;
-	bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info);
 	if (bdev != bdev->bd_contains)
 		__blkdev_put(bdev->bd_contains, mode, 1);
 	bdev->bd_contains = NULL;
@@ -1464,11 +1487,11 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
1464 WARN_ON_ONCE(bdev->bd_holders); 1487 WARN_ON_ONCE(bdev->bd_holders);
1465 sync_blockdev(bdev); 1488 sync_blockdev(bdev);
1466 kill_bdev(bdev); 1489 kill_bdev(bdev);
1467 /* ->release can cause the old bdi to disappear, 1490 /*
1468 * so must switch it out first 1491 * ->release can cause the queue to disappear, so flush all
1492 * dirty data before.
1469 */ 1493 */
1470 bdev_inode_switch_bdi(bdev->bd_inode, 1494 bdev_write_inode(bdev->bd_inode);
1471 &default_backing_dev_info);
1472 } 1495 }
1473 if (bdev->bd_contains == bdev) { 1496 if (bdev->bd_contains == bdev) {
1474 if (disk->fops->release) 1497 if (disk->fops->release)
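The bdev_write_inode() helper called here is outside this excerpt. A sketch consistent with the call site — keep writing the inode back until it is clean, so no dirty state can outlive ->release — might look like this (assumed, not taken from the hunk):

	static void bdev_write_inode(struct inode *inode)
	{
		spin_lock(&inode->i_lock);
		while (inode->i_state & I_DIRTY) {
			/* Drop i_lock around the actual writeback. */
			spin_unlock(&inode->i_lock);
			WARN_ON_ONCE(write_inode_now(inode, true));
			spin_lock(&inode->i_lock);
		}
		spin_unlock(&inode->i_lock);
	}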
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index a66768ebc8d1..80e9c18ea64f 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -8,6 +8,7 @@ config BTRFS_FS
8 select LZO_DECOMPRESS 8 select LZO_DECOMPRESS
9 select RAID6_PQ 9 select RAID6_PQ
10 select XOR_BLOCKS 10 select XOR_BLOCKS
11 select SRCU
11 12
12 help 13 help
13 Btrfs is a general purpose copy-on-write filesystem with extents, 14 Btrfs is a general purpose copy-on-write filesystem with extents,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8c63419a7f70..1afb18226da8 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1715,12 +1715,11 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
1715{ 1715{
1716 int err; 1716 int err;
1717 1717
1718 bdi->capabilities = BDI_CAP_MAP_COPY; 1718 err = bdi_setup_and_register(bdi, "btrfs");
1719 err = bdi_setup_and_register(bdi, "btrfs", BDI_CAP_MAP_COPY);
1720 if (err) 1719 if (err)
1721 return err; 1720 return err;
1722 1721
1723 bdi->ra_pages = default_backing_dev_info.ra_pages; 1722 bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE;
1724 bdi->congested_fn = btrfs_congested_fn; 1723 bdi->congested_fn = btrfs_congested_fn;
1725 bdi->congested_data = info; 1724 bdi->congested_data = info;
1726 return 0; 1725 return 0;
@@ -2319,7 +2318,6 @@ int open_ctree(struct super_block *sb,
2319 */ 2318 */
2320 fs_info->btree_inode->i_size = OFFSET_MAX; 2319 fs_info->btree_inode->i_size = OFFSET_MAX;
2321 fs_info->btree_inode->i_mapping->a_ops = &btree_aops; 2320 fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
2322 fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi;
2323 2321
2324 RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node); 2322 RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
2325 extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree, 2323 extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
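Two interface changes meet in setup_bdi(): bdi_setup_and_register() has dropped its capabilities argument (BDI_CAP_MAP_COPY no longer exists), and default readahead is now derived from VM_MAX_READAHEAD rather than copied out of the deleted default_backing_dev_info. A filesystem needing nothing beyond the defaults reduces to a sketch like this (names illustrative):

	static int examplefs_setup_bdi(struct super_block *sb,
				       struct backing_dev_info *bdi)
	{
		int err;

		/* Two-argument form; the capabilities parameter is gone. */
		err = bdi_setup_and_register(bdi, "examplefs");
		if (err)
			return err;

		bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE;
		sb->s_bdi = bdi;
		return 0;
	}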
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 790dbae3343c..c73df6a7c9b6 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1407,8 +1407,8 @@ int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
1407 while (index <= end_index) { 1407 while (index <= end_index) {
1408 page = find_get_page(inode->i_mapping, index); 1408 page = find_get_page(inode->i_mapping, index);
1409 BUG_ON(!page); /* Pages should be in the extent_io_tree */ 1409 BUG_ON(!page); /* Pages should be in the extent_io_tree */
1410 account_page_redirty(page);
1411 __set_page_dirty_nobuffers(page); 1410 __set_page_dirty_nobuffers(page);
1411 account_page_redirty(page);
1412 page_cache_release(page); 1412 page_cache_release(page);
1413 index++; 1413 index++;
1414 } 1414 }
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index e4090259569b..b78bbbac900d 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1746,7 +1746,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
1746 1746
1747 mutex_lock(&inode->i_mutex); 1747 mutex_lock(&inode->i_mutex);
1748 1748
1749 current->backing_dev_info = inode->i_mapping->backing_dev_info; 1749 current->backing_dev_info = inode_to_bdi(inode);
1750 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); 1750 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
1751 if (err) { 1751 if (err) {
1752 mutex_unlock(&inode->i_mutex); 1752 mutex_unlock(&inode->i_mutex);
@@ -2081,7 +2081,6 @@ static const struct vm_operations_struct btrfs_file_vm_ops = {
2081 .fault = filemap_fault, 2081 .fault = filemap_fault,
2082 .map_pages = filemap_map_pages, 2082 .map_pages = filemap_map_pages,
2083 .page_mkwrite = btrfs_page_mkwrite, 2083 .page_mkwrite = btrfs_page_mkwrite,
2084 .remap_pages = generic_file_remap_pages,
2085}; 2084};
2086 2085
2087static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) 2086static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
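Writers used to copy mapping->backing_dev_info into current->backing_dev_info; with that field removed, they ask inode_to_bdi() instead. Reconstructed from the call sites (not quoted from this diff), the helper resolves to the superblock's BDI, with a special case for block-device inodes, roughly:

	struct backing_dev_info *inode_to_bdi(struct inode *inode)
	{
		struct super_block *sb;

		if (!inode)
			return &noop_backing_dev_info;

		sb = inode->i_sb;
	#ifdef CONFIG_BLOCK
		/* Block-device inodes take the BDI from the request queue. */
		if (sb_is_blkdev_sb(sb))
			return blk_get_backing_dev_info(I_BDEV(inode));
	#endif
		return sb->s_bdi;
	}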
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8bf326affb94..54bcf639d1cf 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3608,7 +3608,6 @@ cache_acl:
3608 switch (inode->i_mode & S_IFMT) { 3608 switch (inode->i_mode & S_IFMT) {
3609 case S_IFREG: 3609 case S_IFREG:
3610 inode->i_mapping->a_ops = &btrfs_aops; 3610 inode->i_mapping->a_ops = &btrfs_aops;
3611 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
3612 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 3611 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
3613 inode->i_fop = &btrfs_file_operations; 3612 inode->i_fop = &btrfs_file_operations;
3614 inode->i_op = &btrfs_file_inode_operations; 3613 inode->i_op = &btrfs_file_inode_operations;
@@ -3623,7 +3622,6 @@ cache_acl:
3623 case S_IFLNK: 3622 case S_IFLNK:
3624 inode->i_op = &btrfs_symlink_inode_operations; 3623 inode->i_op = &btrfs_symlink_inode_operations;
3625 inode->i_mapping->a_ops = &btrfs_symlink_aops; 3624 inode->i_mapping->a_ops = &btrfs_symlink_aops;
3626 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
3627 break; 3625 break;
3628 default: 3626 default:
3629 inode->i_op = &btrfs_special_inode_operations; 3627 inode->i_op = &btrfs_special_inode_operations;
@@ -6088,7 +6086,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
6088 inode->i_fop = &btrfs_file_operations; 6086 inode->i_fop = &btrfs_file_operations;
6089 inode->i_op = &btrfs_file_inode_operations; 6087 inode->i_op = &btrfs_file_inode_operations;
6090 inode->i_mapping->a_ops = &btrfs_aops; 6088 inode->i_mapping->a_ops = &btrfs_aops;
6091 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
6092 6089
6093 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 6090 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
6094 if (err) 6091 if (err)
@@ -9203,7 +9200,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
9203 inode->i_fop = &btrfs_file_operations; 9200 inode->i_fop = &btrfs_file_operations;
9204 inode->i_op = &btrfs_file_inode_operations; 9201 inode->i_op = &btrfs_file_inode_operations;
9205 inode->i_mapping->a_ops = &btrfs_aops; 9202 inode->i_mapping->a_ops = &btrfs_aops;
9206 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
9207 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 9203 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
9208 9204
9209 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 9205 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
@@ -9247,7 +9243,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
9247 9243
9248 inode->i_op = &btrfs_symlink_inode_operations; 9244 inode->i_op = &btrfs_symlink_inode_operations;
9249 inode->i_mapping->a_ops = &btrfs_symlink_aops; 9245 inode->i_mapping->a_ops = &btrfs_symlink_aops;
9250 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
9251 inode_set_bytes(inode, name_len); 9246 inode_set_bytes(inode, name_len);
9252 btrfs_i_size_write(inode, name_len); 9247 btrfs_i_size_write(inode, name_len);
9253 err = btrfs_update_inode(trans, root, inode); 9248 err = btrfs_update_inode(trans, root, inode);
@@ -9459,7 +9454,6 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
9459 inode->i_op = &btrfs_file_inode_operations; 9454 inode->i_op = &btrfs_file_inode_operations;
9460 9455
9461 inode->i_mapping->a_ops = &btrfs_aops; 9456 inode->i_mapping->a_ops = &btrfs_aops;
9462 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
9463 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 9457 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
9464 9458
9465 ret = btrfs_init_inode_security(trans, inode, dir, NULL); 9459 ret = btrfs_init_inode_security(trans, inode, dir, NULL);
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 7d05e37874d4..fd5599d32362 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1574,7 +1574,6 @@ out:
1574static struct vm_operations_struct ceph_vmops = { 1574static struct vm_operations_struct ceph_vmops = {
1575 .fault = ceph_filemap_fault, 1575 .fault = ceph_filemap_fault,
1576 .page_mkwrite = ceph_page_mkwrite, 1576 .page_mkwrite = ceph_page_mkwrite,
1577 .remap_pages = generic_file_remap_pages,
1578}; 1577};
1579 1578
1580int ceph_mmap(struct file *file, struct vm_area_struct *vma) 1579int ceph_mmap(struct file *file, struct vm_area_struct *vma)
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 848969ee24db..a3d774b35149 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -952,7 +952,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
952 mutex_lock(&inode->i_mutex); 952 mutex_lock(&inode->i_mutex);
953 953
954 /* We can write back this queue in page reclaim */ 954 /* We can write back this queue in page reclaim */
955 current->backing_dev_info = file->f_mapping->backing_dev_info; 955 current->backing_dev_info = inode_to_bdi(inode);
956 956
957 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); 957 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
958 if (err) 958 if (err)
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index be3af18e4cf1..119c43c80638 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -783,8 +783,6 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
783 } 783 }
784 784
785 inode->i_mapping->a_ops = &ceph_aops; 785 inode->i_mapping->a_ops = &ceph_aops;
786 inode->i_mapping->backing_dev_info =
787 &ceph_sb_to_client(inode->i_sb)->backing_dev_info;
788 786
789 switch (inode->i_mode & S_IFMT) { 787 switch (inode->i_mode & S_IFMT) {
790 case S_IFIFO: 788 case S_IFIFO:
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index c35c5c614e38..4347039ecc18 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -239,23 +239,26 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
239 return err; 239 return err;
240} 240}
241 241
242/** 242/*
243 * Must be called with lock_flocks() already held. Fills in the passed 243 * Fills in the passed counter variables, so you can prepare pagelist metadata
244 * counter variables, so you can prepare pagelist metadata before calling 244 * before calling ceph_encode_locks.
245 * ceph_encode_locks.
246 */ 245 */
247void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count) 246void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
248{ 247{
249 struct file_lock *lock; 248 struct file_lock *lock;
249 struct file_lock_context *ctx;
250 250
251 *fcntl_count = 0; 251 *fcntl_count = 0;
252 *flock_count = 0; 252 *flock_count = 0;
253 253
254 for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { 254 ctx = inode->i_flctx;
255 if (lock->fl_flags & FL_POSIX) 255 if (ctx) {
256 spin_lock(&ctx->flc_lock);
257 list_for_each_entry(lock, &ctx->flc_posix, fl_list)
256 ++(*fcntl_count); 258 ++(*fcntl_count);
257 else if (lock->fl_flags & FL_FLOCK) 259 list_for_each_entry(lock, &ctx->flc_flock, fl_list)
258 ++(*flock_count); 260 ++(*flock_count);
261 spin_unlock(&ctx->flc_lock);
259 } 262 }
260 dout("counted %d flock locks and %d fcntl locks", 263 dout("counted %d flock locks and %d fcntl locks",
261 *flock_count, *fcntl_count); 264 *flock_count, *fcntl_count);
@@ -271,6 +274,7 @@ int ceph_encode_locks_to_buffer(struct inode *inode,
271 int num_fcntl_locks, int num_flock_locks) 274 int num_fcntl_locks, int num_flock_locks)
272{ 275{
273 struct file_lock *lock; 276 struct file_lock *lock;
277 struct file_lock_context *ctx = inode->i_flctx;
274 int err = 0; 278 int err = 0;
275 int seen_fcntl = 0; 279 int seen_fcntl = 0;
276 int seen_flock = 0; 280 int seen_flock = 0;
@@ -279,33 +283,34 @@ int ceph_encode_locks_to_buffer(struct inode *inode,
279 dout("encoding %d flock and %d fcntl locks", num_flock_locks, 283 dout("encoding %d flock and %d fcntl locks", num_flock_locks,
280 num_fcntl_locks); 284 num_fcntl_locks);
281 285
282 for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { 286 if (!ctx)
283 if (lock->fl_flags & FL_POSIX) { 287 return 0;
284 ++seen_fcntl; 288
285 if (seen_fcntl > num_fcntl_locks) { 289 spin_lock(&ctx->flc_lock);
 286 err = -ENOSPC; 290 list_for_each_entry(lock, &ctx->flc_posix, fl_list) {
287 goto fail; 291 ++seen_fcntl;
288 } 292 if (seen_fcntl > num_fcntl_locks) {
289 err = lock_to_ceph_filelock(lock, &flocks[l]); 293 err = -ENOSPC;
290 if (err) 294 goto fail;
291 goto fail;
292 ++l;
293 } 295 }
296 err = lock_to_ceph_filelock(lock, &flocks[l]);
297 if (err)
298 goto fail;
299 ++l;
294 } 300 }
295 for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { 301 list_for_each_entry(lock, &ctx->flc_flock, fl_list) {
296 if (lock->fl_flags & FL_FLOCK) { 302 ++seen_flock;
297 ++seen_flock; 303 if (seen_flock > num_flock_locks) {
298 if (seen_flock > num_flock_locks) { 304 err = -ENOSPC;
299 err = -ENOSPC; 305 goto fail;
300 goto fail;
301 }
302 err = lock_to_ceph_filelock(lock, &flocks[l]);
303 if (err)
304 goto fail;
305 ++l;
306 } 306 }
307 err = lock_to_ceph_filelock(lock, &flocks[l]);
308 if (err)
309 goto fail;
310 ++l;
307 } 311 }
308fail: 312fail:
313 spin_unlock(&ctx->flc_lock);
309 return err; 314 return err;
310} 315}
311 316
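These conversions (and the cifs ones below) target the new per-inode file_lock_context, which replaces the single inode->i_flock chain with per-type lists under one spinlock. Roughly (field set as assumed by the code above):

	struct file_lock_context {
		spinlock_t		flc_lock;	/* guards the three lists */
		struct list_head	flc_flock;	/* flock(2) locks */
		struct list_head	flc_posix;	/* POSIX/fcntl locks */
		struct list_head	flc_lease;	/* leases */
	};

Individual locks hang off these lists via file_lock.fl_list, hence the list_for_each_entry(..., fl_list) walks that replace the old fl_next chaining.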
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 4c1e36a171af..71c073f38e54 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2764,20 +2764,16 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
2764 struct ceph_filelock *flocks; 2764 struct ceph_filelock *flocks;
2765 2765
2766encode_again: 2766encode_again:
2767 spin_lock(&inode->i_lock);
2768 ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks); 2767 ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
2769 spin_unlock(&inode->i_lock);
2770 flocks = kmalloc((num_fcntl_locks+num_flock_locks) * 2768 flocks = kmalloc((num_fcntl_locks+num_flock_locks) *
2771 sizeof(struct ceph_filelock), GFP_NOFS); 2769 sizeof(struct ceph_filelock), GFP_NOFS);
2772 if (!flocks) { 2770 if (!flocks) {
2773 err = -ENOMEM; 2771 err = -ENOMEM;
2774 goto out_free; 2772 goto out_free;
2775 } 2773 }
2776 spin_lock(&inode->i_lock);
2777 err = ceph_encode_locks_to_buffer(inode, flocks, 2774 err = ceph_encode_locks_to_buffer(inode, flocks,
2778 num_fcntl_locks, 2775 num_fcntl_locks,
2779 num_flock_locks); 2776 num_flock_locks);
2780 spin_unlock(&inode->i_lock);
2781 if (err) { 2777 if (err) {
2782 kfree(flocks); 2778 kfree(flocks);
2783 if (err == -ENOSPC) 2779 if (err == -ENOSPC)
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 8f8983f38b82..a63997b8bcff 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -40,17 +40,6 @@ static void ceph_put_super(struct super_block *s)
40 40
41 dout("put_super\n"); 41 dout("put_super\n");
42 ceph_mdsc_close_sessions(fsc->mdsc); 42 ceph_mdsc_close_sessions(fsc->mdsc);
43
44 /*
45 * ensure we release the bdi before put_anon_super releases
46 * the device name.
47 */
48 if (s->s_bdi == &fsc->backing_dev_info) {
49 bdi_unregister(&fsc->backing_dev_info);
50 s->s_bdi = NULL;
51 }
52
53 return;
54} 43}
55 44
56static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) 45static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -914,7 +903,7 @@ static int ceph_register_bdi(struct super_block *sb,
914 >> PAGE_SHIFT; 903 >> PAGE_SHIFT;
915 else 904 else
916 fsc->backing_dev_info.ra_pages = 905 fsc->backing_dev_info.ra_pages =
917 default_backing_dev_info.ra_pages; 906 VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE;
918 907
919 err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld", 908 err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld",
920 atomic_long_inc_return(&bdi_seq)); 909 atomic_long_inc_return(&bdi_seq));
@@ -1006,11 +995,16 @@ out_final:
1006static void ceph_kill_sb(struct super_block *s) 995static void ceph_kill_sb(struct super_block *s)
1007{ 996{
1008 struct ceph_fs_client *fsc = ceph_sb_to_client(s); 997 struct ceph_fs_client *fsc = ceph_sb_to_client(s);
998 dev_t dev = s->s_dev;
999
1009 dout("kill_sb %p\n", s); 1000 dout("kill_sb %p\n", s);
1001
1010 ceph_mdsc_pre_umount(fsc->mdsc); 1002 ceph_mdsc_pre_umount(fsc->mdsc);
1011 kill_anon_super(s); /* will call put_super after sb is r/o */ 1003 generic_shutdown_super(s);
1012 ceph_mdsc_destroy(fsc); 1004 ceph_mdsc_destroy(fsc);
1005
1013 destroy_fs_client(fsc); 1006 destroy_fs_client(fsc);
1007 free_anon_bdev(dev);
1014} 1008}
1015 1009
1016static struct file_system_type ceph_fs_type = { 1010static struct file_system_type ceph_fs_type = {
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 67b2007f10fe..ea06a3d0364c 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -24,27 +24,6 @@
24 24
25#include "internal.h" 25#include "internal.h"
26 26
27/*
28 * capabilities for /dev/mem, /dev/kmem and similar directly mappable character
29 * devices
30 * - permits shared-mmap for read, write and/or exec
31 * - does not permit private mmap in NOMMU mode (can't do COW)
32 * - no readahead or I/O queue unplugging required
33 */
34struct backing_dev_info directly_mappable_cdev_bdi = {
35 .name = "char",
36 .capabilities = (
37#ifdef CONFIG_MMU
38 /* permit private copies of the data to be taken */
39 BDI_CAP_MAP_COPY |
40#endif
41 /* permit direct mmap, for read, write or exec */
42 BDI_CAP_MAP_DIRECT |
43 BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP |
44 /* no writeback happens */
45 BDI_CAP_NO_ACCT_AND_WRITEBACK),
46};
47
48static struct kobj_map *cdev_map; 27static struct kobj_map *cdev_map;
49 28
50static DEFINE_MUTEX(chrdevs_lock); 29static DEFINE_MUTEX(chrdevs_lock);
@@ -575,8 +554,6 @@ static struct kobject *base_probe(dev_t dev, int *part, void *data)
575void __init chrdev_init(void) 554void __init chrdev_init(void)
576{ 555{
577 cdev_map = kobj_map_init(base_probe, &chrdevs_lock); 556 cdev_map = kobj_map_init(base_probe, &chrdevs_lock);
578 if (bdi_init(&directly_mappable_cdev_bdi))
579 panic("Failed to init directly mappable cdev bdi");
580} 557}
581 558
582 559
@@ -590,4 +567,3 @@ EXPORT_SYMBOL(cdev_del);
590EXPORT_SYMBOL(cdev_add); 567EXPORT_SYMBOL(cdev_add);
591EXPORT_SYMBOL(__register_chrdev); 568EXPORT_SYMBOL(__register_chrdev);
592EXPORT_SYMBOL(__unregister_chrdev); 569EXPORT_SYMBOL(__unregister_chrdev);
593EXPORT_SYMBOL(directly_mappable_cdev_bdi);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 2a772da16b83..d3aa999ab785 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -3446,7 +3446,7 @@ cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info)
3446 int referral_walks_count = 0; 3446 int referral_walks_count = 0;
3447#endif 3447#endif
3448 3448
3449 rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY); 3449 rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs");
3450 if (rc) 3450 if (rc)
3451 return rc; 3451 return rc;
3452 3452
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 74f12877493a..a94b3e673182 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1113,11 +1113,6 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
1113 return rc; 1113 return rc;
1114} 1114}
1115 1115
1116/* copied from fs/locks.c with a name change */
1117#define cifs_for_each_lock(inode, lockp) \
1118 for (lockp = &inode->i_flock; *lockp != NULL; \
1119 lockp = &(*lockp)->fl_next)
1120
1121struct lock_to_push { 1116struct lock_to_push {
1122 struct list_head llist; 1117 struct list_head llist;
1123 __u64 offset; 1118 __u64 offset;
@@ -1132,8 +1127,9 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
1132{ 1127{
1133 struct inode *inode = cfile->dentry->d_inode; 1128 struct inode *inode = cfile->dentry->d_inode;
1134 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); 1129 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
1135 struct file_lock *flock, **before; 1130 struct file_lock *flock;
1136 unsigned int count = 0, i = 0; 1131 struct file_lock_context *flctx = inode->i_flctx;
1132 unsigned int count = 0, i;
1137 int rc = 0, xid, type; 1133 int rc = 0, xid, type;
1138 struct list_head locks_to_send, *el; 1134 struct list_head locks_to_send, *el;
1139 struct lock_to_push *lck, *tmp; 1135 struct lock_to_push *lck, *tmp;
@@ -1141,12 +1137,14 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
1141 1137
1142 xid = get_xid(); 1138 xid = get_xid();
1143 1139
1144 spin_lock(&inode->i_lock); 1140 if (!flctx)
1145 cifs_for_each_lock(inode, before) { 1141 goto out;
1146 if ((*before)->fl_flags & FL_POSIX) 1142
1147 count++; 1143 spin_lock(&flctx->flc_lock);
1144 list_for_each(el, &flctx->flc_posix) {
1145 count++;
1148 } 1146 }
1149 spin_unlock(&inode->i_lock); 1147 spin_unlock(&flctx->flc_lock);
1150 1148
1151 INIT_LIST_HEAD(&locks_to_send); 1149 INIT_LIST_HEAD(&locks_to_send);
1152 1150
@@ -1155,7 +1153,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
1155 * added to the list while we are holding cinode->lock_sem that 1153 * added to the list while we are holding cinode->lock_sem that
1156 * protects locking operations of this inode. 1154 * protects locking operations of this inode.
1157 */ 1155 */
1158 for (; i < count; i++) { 1156 for (i = 0; i < count; i++) {
1159 lck = kmalloc(sizeof(struct lock_to_push), GFP_KERNEL); 1157 lck = kmalloc(sizeof(struct lock_to_push), GFP_KERNEL);
1160 if (!lck) { 1158 if (!lck) {
1161 rc = -ENOMEM; 1159 rc = -ENOMEM;
@@ -1165,11 +1163,8 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
1165 } 1163 }
1166 1164
1167 el = locks_to_send.next; 1165 el = locks_to_send.next;
1168 spin_lock(&inode->i_lock); 1166 spin_lock(&flctx->flc_lock);
1169 cifs_for_each_lock(inode, before) { 1167 list_for_each_entry(flock, &flctx->flc_posix, fl_list) {
1170 flock = *before;
1171 if ((flock->fl_flags & FL_POSIX) == 0)
1172 continue;
1173 if (el == &locks_to_send) { 1168 if (el == &locks_to_send) {
1174 /* 1169 /*
1175 * The list ended. We don't have enough allocated 1170 * The list ended. We don't have enough allocated
@@ -1189,9 +1184,8 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
1189 lck->length = length; 1184 lck->length = length;
1190 lck->type = type; 1185 lck->type = type;
1191 lck->offset = flock->fl_start; 1186 lck->offset = flock->fl_start;
1192 el = el->next;
1193 } 1187 }
1194 spin_unlock(&inode->i_lock); 1188 spin_unlock(&flctx->flc_lock);
1195 1189
1196 list_for_each_entry_safe(lck, tmp, &locks_to_send, llist) { 1190 list_for_each_entry_safe(lck, tmp, &locks_to_send, llist) {
1197 int stored_rc; 1191 int stored_rc;
@@ -3248,7 +3242,6 @@ static struct vm_operations_struct cifs_file_vm_ops = {
3248 .fault = filemap_fault, 3242 .fault = filemap_fault,
3249 .map_pages = filemap_map_pages, 3243 .map_pages = filemap_map_pages,
3250 .page_mkwrite = cifs_page_mkwrite, 3244 .page_mkwrite = cifs_page_mkwrite,
3251 .remap_pages = generic_file_remap_pages,
3252}; 3245};
3253 3246
3254int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma) 3247int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma)
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 0c3ce464cae4..2d4f37235ed0 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -937,8 +937,6 @@ retry_iget5_locked:
937 inode->i_flags |= S_NOATIME | S_NOCMTIME; 937 inode->i_flags |= S_NOATIME | S_NOCMTIME;
938 if (inode->i_state & I_NEW) { 938 if (inode->i_state & I_NEW) {
939 inode->i_ino = hash; 939 inode->i_ino = hash;
940 if (S_ISREG(inode->i_mode))
941 inode->i_data.backing_dev_info = sb->s_bdi;
942#ifdef CONFIG_CIFS_FSCACHE 940#ifdef CONFIG_CIFS_FSCACHE
943 /* initialize per-inode cache cookie pointer */ 941 /* initialize per-inode cache cookie pointer */
944 CIFS_I(inode)->fscache = NULL; 942 CIFS_I(inode)->fscache = NULL;
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 86c893884eb9..281ee011bb6a 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -28,29 +28,6 @@
28 28
29#include "coda_int.h" 29#include "coda_int.h"
30 30
31/* dir inode-ops */
32static int coda_create(struct inode *dir, struct dentry *new, umode_t mode, bool excl);
33static struct dentry *coda_lookup(struct inode *dir, struct dentry *target, unsigned int flags);
34static int coda_link(struct dentry *old_dentry, struct inode *dir_inode,
35 struct dentry *entry);
36static int coda_unlink(struct inode *dir_inode, struct dentry *entry);
37static int coda_symlink(struct inode *dir_inode, struct dentry *entry,
38 const char *symname);
39static int coda_mkdir(struct inode *dir_inode, struct dentry *entry, umode_t mode);
40static int coda_rmdir(struct inode *dir_inode, struct dentry *entry);
41static int coda_rename(struct inode *old_inode, struct dentry *old_dentry,
42 struct inode *new_inode, struct dentry *new_dentry);
43
44/* dir file-ops */
45static int coda_readdir(struct file *file, struct dir_context *ctx);
46
47/* dentry ops */
48static int coda_dentry_revalidate(struct dentry *de, unsigned int flags);
49static int coda_dentry_delete(const struct dentry *);
50
51/* support routines */
52static int coda_venus_readdir(struct file *, struct dir_context *);
53
54/* same as fs/bad_inode.c */ 31/* same as fs/bad_inode.c */
55static int coda_return_EIO(void) 32static int coda_return_EIO(void)
56{ 33{
@@ -58,38 +35,6 @@ static int coda_return_EIO(void)
58} 35}
59#define CODA_EIO_ERROR ((void *) (coda_return_EIO)) 36#define CODA_EIO_ERROR ((void *) (coda_return_EIO))
60 37
61const struct dentry_operations coda_dentry_operations =
62{
63 .d_revalidate = coda_dentry_revalidate,
64 .d_delete = coda_dentry_delete,
65};
66
67const struct inode_operations coda_dir_inode_operations =
68{
69 .create = coda_create,
70 .lookup = coda_lookup,
71 .link = coda_link,
72 .unlink = coda_unlink,
73 .symlink = coda_symlink,
74 .mkdir = coda_mkdir,
75 .rmdir = coda_rmdir,
76 .mknod = CODA_EIO_ERROR,
77 .rename = coda_rename,
78 .permission = coda_permission,
79 .getattr = coda_getattr,
80 .setattr = coda_setattr,
81};
82
83const struct file_operations coda_dir_operations = {
84 .llseek = generic_file_llseek,
85 .read = generic_read_dir,
86 .iterate = coda_readdir,
87 .open = coda_open,
88 .release = coda_release,
89 .fsync = coda_fsync,
90};
91
92
93/* inode operations for directories */ 38/* inode operations for directories */
94/* access routines: lookup, readlink, permission */ 39/* access routines: lookup, readlink, permission */
95static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, unsigned int flags) 40static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, unsigned int flags)
@@ -374,33 +319,6 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry,
374 return error; 319 return error;
375} 320}
376 321
377
378/* file operations for directories */
379static int coda_readdir(struct file *coda_file, struct dir_context *ctx)
380{
381 struct coda_file_info *cfi;
382 struct file *host_file;
383 int ret;
384
385 cfi = CODA_FTOC(coda_file);
386 BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
387 host_file = cfi->cfi_container;
388
389 if (host_file->f_op->iterate) {
390 struct inode *host_inode = file_inode(host_file);
391 mutex_lock(&host_inode->i_mutex);
392 ret = -ENOENT;
393 if (!IS_DEADDIR(host_inode)) {
394 ret = host_file->f_op->iterate(host_file, ctx);
395 file_accessed(host_file);
396 }
397 mutex_unlock(&host_inode->i_mutex);
398 return ret;
399 }
400 /* Venus: we must read Venus dirents from a file */
401 return coda_venus_readdir(coda_file, ctx);
402}
403
404static inline unsigned int CDT2DT(unsigned char cdt) 322static inline unsigned int CDT2DT(unsigned char cdt)
405{ 323{
406 unsigned int dt; 324 unsigned int dt;
@@ -495,6 +413,33 @@ out:
495 return 0; 413 return 0;
496} 414}
497 415
416/* file operations for directories */
417static int coda_readdir(struct file *coda_file, struct dir_context *ctx)
418{
419 struct coda_file_info *cfi;
420 struct file *host_file;
421 int ret;
422
423 cfi = CODA_FTOC(coda_file);
424 BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
425 host_file = cfi->cfi_container;
426
427 if (host_file->f_op->iterate) {
428 struct inode *host_inode = file_inode(host_file);
429
430 mutex_lock(&host_inode->i_mutex);
431 ret = -ENOENT;
432 if (!IS_DEADDIR(host_inode)) {
433 ret = host_file->f_op->iterate(host_file, ctx);
434 file_accessed(host_file);
435 }
436 mutex_unlock(&host_inode->i_mutex);
437 return ret;
438 }
439 /* Venus: we must read Venus dirents from a file */
440 return coda_venus_readdir(coda_file, ctx);
441}
442
498/* called when a cache lookup succeeds */ 443/* called when a cache lookup succeeds */
499static int coda_dentry_revalidate(struct dentry *de, unsigned int flags) 444static int coda_dentry_revalidate(struct dentry *de, unsigned int flags)
500{ 445{
@@ -603,3 +548,32 @@ int coda_revalidate_inode(struct inode *inode)
603 } 548 }
604 return 0; 549 return 0;
605} 550}
551
552const struct dentry_operations coda_dentry_operations = {
553 .d_revalidate = coda_dentry_revalidate,
554 .d_delete = coda_dentry_delete,
555};
556
557const struct inode_operations coda_dir_inode_operations = {
558 .create = coda_create,
559 .lookup = coda_lookup,
560 .link = coda_link,
561 .unlink = coda_unlink,
562 .symlink = coda_symlink,
563 .mkdir = coda_mkdir,
564 .rmdir = coda_rmdir,
565 .mknod = CODA_EIO_ERROR,
566 .rename = coda_rename,
567 .permission = coda_permission,
568 .getattr = coda_getattr,
569 .setattr = coda_setattr,
570};
571
572const struct file_operations coda_dir_operations = {
573 .llseek = generic_file_llseek,
574 .read = generic_read_dir,
575 .iterate = coda_readdir,
576 .open = coda_open,
577 .release = coda_release,
578 .fsync = coda_fsync,
579};
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index b945410bfcd5..82ec68b59208 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -183,7 +183,7 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
183 goto unlock_out; 183 goto unlock_out;
184 } 184 }
185 185
186 error = bdi_setup_and_register(&vc->bdi, "coda", BDI_CAP_MAP_COPY); 186 error = bdi_setup_and_register(&vc->bdi, "coda");
187 if (error) 187 if (error)
188 goto unlock_out; 188 goto unlock_out;
189 189
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index bd4a3c167091..a315677e44d3 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -70,8 +70,6 @@ extern int configfs_is_root(struct config_item *item);
70 70
71extern struct inode * configfs_new_inode(umode_t mode, struct configfs_dirent *, struct super_block *); 71extern struct inode * configfs_new_inode(umode_t mode, struct configfs_dirent *, struct super_block *);
72extern int configfs_create(struct dentry *, umode_t mode, int (*init)(struct inode *)); 72extern int configfs_create(struct dentry *, umode_t mode, int (*init)(struct inode *));
73extern int configfs_inode_init(void);
74extern void configfs_inode_exit(void);
75 73
76extern int configfs_create_file(struct config_item *, const struct configfs_attribute *); 74extern int configfs_create_file(struct config_item *, const struct configfs_attribute *);
77extern int configfs_make_dirent(struct configfs_dirent *, 75extern int configfs_make_dirent(struct configfs_dirent *,
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 5946ad98053f..65af86147154 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -50,12 +50,6 @@ static const struct address_space_operations configfs_aops = {
50 .write_end = simple_write_end, 50 .write_end = simple_write_end,
51}; 51};
52 52
53static struct backing_dev_info configfs_backing_dev_info = {
54 .name = "configfs",
55 .ra_pages = 0, /* No readahead */
56 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
57};
58
59static const struct inode_operations configfs_inode_operations ={ 53static const struct inode_operations configfs_inode_operations ={
60 .setattr = configfs_setattr, 54 .setattr = configfs_setattr,
61}; 55};
@@ -137,7 +131,6 @@ struct inode *configfs_new_inode(umode_t mode, struct configfs_dirent *sd,
137 if (inode) { 131 if (inode) {
138 inode->i_ino = get_next_ino(); 132 inode->i_ino = get_next_ino();
139 inode->i_mapping->a_ops = &configfs_aops; 133 inode->i_mapping->a_ops = &configfs_aops;
140 inode->i_mapping->backing_dev_info = &configfs_backing_dev_info;
141 inode->i_op = &configfs_inode_operations; 134 inode->i_op = &configfs_inode_operations;
142 135
143 if (sd->s_iattr) { 136 if (sd->s_iattr) {
@@ -283,13 +276,3 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name)
283 } 276 }
284 mutex_unlock(&dir->d_inode->i_mutex); 277 mutex_unlock(&dir->d_inode->i_mutex);
285} 278}
286
287int __init configfs_inode_init(void)
288{
289 return bdi_init(&configfs_backing_dev_info);
290}
291
292void configfs_inode_exit(void)
293{
294 bdi_destroy(&configfs_backing_dev_info);
295}
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index f6c285833390..da94e41bdbf6 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -145,19 +145,13 @@ static int __init configfs_init(void)
145 if (!config_kobj) 145 if (!config_kobj)
146 goto out2; 146 goto out2;
147 147
148 err = configfs_inode_init();
149 if (err)
150 goto out3;
151
152 err = register_filesystem(&configfs_fs_type); 148 err = register_filesystem(&configfs_fs_type);
153 if (err) 149 if (err)
154 goto out4; 150 goto out3;
155 151
156 return 0; 152 return 0;
157out4:
158 pr_err("Unable to register filesystem!\n");
159 configfs_inode_exit();
160out3: 153out3:
154 pr_err("Unable to register filesystem!\n");
161 kobject_put(config_kobj); 155 kobject_put(config_kobj);
162out2: 156out2:
163 kmem_cache_destroy(configfs_dir_cachep); 157 kmem_cache_destroy(configfs_dir_cachep);
@@ -172,7 +166,6 @@ static void __exit configfs_exit(void)
172 kobject_put(config_kobj); 166 kobject_put(config_kobj);
173 kmem_cache_destroy(configfs_dir_cachep); 167 kmem_cache_destroy(configfs_dir_cachep);
174 configfs_dir_cachep = NULL; 168 configfs_dir_cachep = NULL;
175 configfs_inode_exit();
176} 169}
177 170
178MODULE_AUTHOR("Oracle"); 171MODULE_AUTHOR("Oracle");
diff --git a/fs/dax.c b/fs/dax.c
new file mode 100644
index 000000000000..ed1619ec6537
--- /dev/null
+++ b/fs/dax.c
@@ -0,0 +1,534 @@
1/*
2 * fs/dax.c - Direct Access filesystem code
3 * Copyright (c) 2013-2014 Intel Corporation
4 * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
5 * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
6 *
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms and conditions of the GNU General Public License,
9 * version 2, as published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14 * more details.
15 */
16
17#include <linux/atomic.h>
18#include <linux/blkdev.h>
19#include <linux/buffer_head.h>
20#include <linux/fs.h>
21#include <linux/genhd.h>
22#include <linux/highmem.h>
23#include <linux/memcontrol.h>
24#include <linux/mm.h>
25#include <linux/mutex.h>
26#include <linux/sched.h>
27#include <linux/uio.h>
28#include <linux/vmstat.h>
29
30int dax_clear_blocks(struct inode *inode, sector_t block, long size)
31{
32 struct block_device *bdev = inode->i_sb->s_bdev;
33 sector_t sector = block << (inode->i_blkbits - 9);
34
35 might_sleep();
36 do {
37 void *addr;
38 unsigned long pfn;
39 long count;
40
41 count = bdev_direct_access(bdev, sector, &addr, &pfn, size);
42 if (count < 0)
43 return count;
44 BUG_ON(size < count);
45 while (count > 0) {
46 unsigned pgsz = PAGE_SIZE - offset_in_page(addr);
47 if (pgsz > count)
48 pgsz = count;
49 if (pgsz < PAGE_SIZE)
50 memset(addr, 0, pgsz);
51 else
52 clear_page(addr);
53 addr += pgsz;
54 size -= pgsz;
55 count -= pgsz;
56 BUG_ON(pgsz & 511);
57 sector += pgsz / 512;
58 cond_resched();
59 }
60 } while (size);
61
62 return 0;
63}
64EXPORT_SYMBOL_GPL(dax_clear_blocks);
65
66static long dax_get_addr(struct buffer_head *bh, void **addr, unsigned blkbits)
67{
68 unsigned long pfn;
69 sector_t sector = bh->b_blocknr << (blkbits - 9);
70 return bdev_direct_access(bh->b_bdev, sector, addr, &pfn, bh->b_size);
71}
72
73static void dax_new_buf(void *addr, unsigned size, unsigned first, loff_t pos,
74 loff_t end)
75{
76 loff_t final = end - pos + first; /* The final byte of the buffer */
77
78 if (first > 0)
79 memset(addr, 0, first);
80 if (final < size)
81 memset(addr + final, 0, size - final);
82}
83
84static bool buffer_written(struct buffer_head *bh)
85{
86 return buffer_mapped(bh) && !buffer_unwritten(bh);
87}
88
89/*
90 * When ext4 encounters a hole, it returns without modifying the buffer_head
91 * which means that we can't trust b_size. To cope with this, we set b_state
92 * to 0 before calling get_block and, if any bit is set, we know we can trust
93 * b_size. Unfortunate, really, since ext4 knows precisely how long a hole is
 94 * and could save us the repeated get_block calls if it told us.
95 */
96static bool buffer_size_valid(struct buffer_head *bh)
97{
98 return bh->b_state != 0;
99}
100
101static ssize_t dax_io(int rw, struct inode *inode, struct iov_iter *iter,
102 loff_t start, loff_t end, get_block_t get_block,
103 struct buffer_head *bh)
104{
105 ssize_t retval = 0;
106 loff_t pos = start;
107 loff_t max = start;
108 loff_t bh_max = start;
109 void *addr;
110 bool hole = false;
111
112 if (rw != WRITE)
113 end = min(end, i_size_read(inode));
114
115 while (pos < end) {
116 unsigned len;
117 if (pos == max) {
118 unsigned blkbits = inode->i_blkbits;
119 sector_t block = pos >> blkbits;
120 unsigned first = pos - (block << blkbits);
121 long size;
122
123 if (pos == bh_max) {
124 bh->b_size = PAGE_ALIGN(end - pos);
125 bh->b_state = 0;
126 retval = get_block(inode, block, bh,
127 rw == WRITE);
128 if (retval)
129 break;
130 if (!buffer_size_valid(bh))
131 bh->b_size = 1 << blkbits;
132 bh_max = pos - first + bh->b_size;
133 } else {
134 unsigned done = bh->b_size -
135 (bh_max - (pos - first));
136 bh->b_blocknr += done >> blkbits;
137 bh->b_size -= done;
138 }
139
140 hole = (rw != WRITE) && !buffer_written(bh);
141 if (hole) {
142 addr = NULL;
143 size = bh->b_size - first;
144 } else {
145 retval = dax_get_addr(bh, &addr, blkbits);
146 if (retval < 0)
147 break;
148 if (buffer_unwritten(bh) || buffer_new(bh))
149 dax_new_buf(addr, retval, first, pos,
150 end);
151 addr += first;
152 size = retval - first;
153 }
154 max = min(pos + size, end);
155 }
156
157 if (rw == WRITE)
158 len = copy_from_iter(addr, max - pos, iter);
159 else if (!hole)
160 len = copy_to_iter(addr, max - pos, iter);
161 else
162 len = iov_iter_zero(max - pos, iter);
163
164 if (!len)
165 break;
166
167 pos += len;
168 addr += len;
169 }
170
171 return (pos == start) ? retval : pos - start;
172}
173
174/**
175 * dax_do_io - Perform I/O to a DAX file
176 * @rw: READ to read or WRITE to write
177 * @iocb: The control block for this I/O
178 * @inode: The file which the I/O is directed at
179 * @iter: The addresses to do I/O from or to
180 * @pos: The file offset where the I/O starts
181 * @get_block: The filesystem method used to translate file offsets to blocks
182 * @end_io: A filesystem callback for I/O completion
183 * @flags: See below
184 *
185 * This function uses the same locking scheme as do_blockdev_direct_IO:
186 * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
187 * caller for writes. For reads, we take and release the i_mutex ourselves.
188 * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
189 * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
190 * is in progress.
191 */
192ssize_t dax_do_io(int rw, struct kiocb *iocb, struct inode *inode,
193 struct iov_iter *iter, loff_t pos,
194 get_block_t get_block, dio_iodone_t end_io, int flags)
195{
196 struct buffer_head bh;
197 ssize_t retval = -EINVAL;
198 loff_t end = pos + iov_iter_count(iter);
199
200 memset(&bh, 0, sizeof(bh));
201
202 if ((flags & DIO_LOCKING) && (rw == READ)) {
203 struct address_space *mapping = inode->i_mapping;
204 mutex_lock(&inode->i_mutex);
205 retval = filemap_write_and_wait_range(mapping, pos, end - 1);
206 if (retval) {
207 mutex_unlock(&inode->i_mutex);
208 goto out;
209 }
210 }
211
212 /* Protects against truncate */
213 atomic_inc(&inode->i_dio_count);
214
215 retval = dax_io(rw, inode, iter, pos, end, get_block, &bh);
216
217 if ((flags & DIO_LOCKING) && (rw == READ))
218 mutex_unlock(&inode->i_mutex);
219
220 if ((retval > 0) && end_io)
221 end_io(iocb, pos, retval, bh.b_private);
222
223 inode_dio_done(inode);
224 out:
225 return retval;
226}
227EXPORT_SYMBOL_GPL(dax_do_io);
228
229/*
230 * The user has performed a load from a hole in the file. Allocating
231 * a new page in the file would cause excessive storage usage for
232 * workloads with sparse files. We allocate a page cache page instead.
233 * We'll kick it out of the page cache if it's ever written to,
234 * otherwise it will simply fall out of the page cache under memory
235 * pressure without ever having been dirtied.
236 */
237static int dax_load_hole(struct address_space *mapping, struct page *page,
238 struct vm_fault *vmf)
239{
240 unsigned long size;
241 struct inode *inode = mapping->host;
242 if (!page)
243 page = find_or_create_page(mapping, vmf->pgoff,
244 GFP_KERNEL | __GFP_ZERO);
245 if (!page)
246 return VM_FAULT_OOM;
247 /* Recheck i_size under page lock to avoid truncate race */
248 size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
249 if (vmf->pgoff >= size) {
250 unlock_page(page);
251 page_cache_release(page);
252 return VM_FAULT_SIGBUS;
253 }
254
255 vmf->page = page;
256 return VM_FAULT_LOCKED;
257}
258
259static int copy_user_bh(struct page *to, struct buffer_head *bh,
260 unsigned blkbits, unsigned long vaddr)
261{
262 void *vfrom, *vto;
263 if (dax_get_addr(bh, &vfrom, blkbits) < 0)
264 return -EIO;
265 vto = kmap_atomic(to);
266 copy_user_page(vto, vfrom, vaddr, to);
267 kunmap_atomic(vto);
268 return 0;
269}
270
271static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
272 struct vm_area_struct *vma, struct vm_fault *vmf)
273{
274 struct address_space *mapping = inode->i_mapping;
275 sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
276 unsigned long vaddr = (unsigned long)vmf->virtual_address;
277 void *addr;
278 unsigned long pfn;
279 pgoff_t size;
280 int error;
281
282 i_mmap_lock_read(mapping);
283
284 /*
285 * Check truncate didn't happen while we were allocating a block.
286 * If it did, this block may or may not be still allocated to the
287 * file. We can't tell the filesystem to free it because we can't
288 * take i_mutex here. In the worst case, the file still has blocks
289 * allocated past the end of the file.
290 */
291 size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
292 if (unlikely(vmf->pgoff >= size)) {
293 error = -EIO;
294 goto out;
295 }
296
297 error = bdev_direct_access(bh->b_bdev, sector, &addr, &pfn, bh->b_size);
298 if (error < 0)
299 goto out;
300 if (error < PAGE_SIZE) {
301 error = -EIO;
302 goto out;
303 }
304
305 if (buffer_unwritten(bh) || buffer_new(bh))
306 clear_page(addr);
307
308 error = vm_insert_mixed(vma, vaddr, pfn);
309
310 out:
311 i_mmap_unlock_read(mapping);
312
313 if (bh->b_end_io)
314 bh->b_end_io(bh, 1);
315
316 return error;
317}
318
319static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
320 get_block_t get_block)
321{
322 struct file *file = vma->vm_file;
323 struct address_space *mapping = file->f_mapping;
324 struct inode *inode = mapping->host;
325 struct page *page;
326 struct buffer_head bh;
327 unsigned long vaddr = (unsigned long)vmf->virtual_address;
328 unsigned blkbits = inode->i_blkbits;
329 sector_t block;
330 pgoff_t size;
331 int error;
332 int major = 0;
333
334 size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
335 if (vmf->pgoff >= size)
336 return VM_FAULT_SIGBUS;
337
338 memset(&bh, 0, sizeof(bh));
339 block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
340 bh.b_size = PAGE_SIZE;
341
342 repeat:
343 page = find_get_page(mapping, vmf->pgoff);
344 if (page) {
345 if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
346 page_cache_release(page);
347 return VM_FAULT_RETRY;
348 }
349 if (unlikely(page->mapping != mapping)) {
350 unlock_page(page);
351 page_cache_release(page);
352 goto repeat;
353 }
354 size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
355 if (unlikely(vmf->pgoff >= size)) {
356 /*
357 * We have a struct page covering a hole in the file
358 * from a read fault and we've raced with a truncate
359 */
360 error = -EIO;
361 goto unlock_page;
362 }
363 }
364
365 error = get_block(inode, block, &bh, 0);
366 if (!error && (bh.b_size < PAGE_SIZE))
367 error = -EIO; /* fs corruption? */
368 if (error)
369 goto unlock_page;
370
371 if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
372 if (vmf->flags & FAULT_FLAG_WRITE) {
373 error = get_block(inode, block, &bh, 1);
374 count_vm_event(PGMAJFAULT);
375 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
376 major = VM_FAULT_MAJOR;
377 if (!error && (bh.b_size < PAGE_SIZE))
378 error = -EIO;
379 if (error)
380 goto unlock_page;
381 } else {
382 return dax_load_hole(mapping, page, vmf);
383 }
384 }
385
386 if (vmf->cow_page) {
387 struct page *new_page = vmf->cow_page;
388 if (buffer_written(&bh))
389 error = copy_user_bh(new_page, &bh, blkbits, vaddr);
390 else
391 clear_user_highpage(new_page, vaddr);
392 if (error)
393 goto unlock_page;
394 vmf->page = page;
395 if (!page) {
396 i_mmap_lock_read(mapping);
397 /* Check we didn't race with truncate */
398 size = (i_size_read(inode) + PAGE_SIZE - 1) >>
399 PAGE_SHIFT;
400 if (vmf->pgoff >= size) {
401 i_mmap_unlock_read(mapping);
402 error = -EIO;
403 goto out;
404 }
405 }
406 return VM_FAULT_LOCKED;
407 }
408
409 /* Check we didn't race with a read fault installing a new page */
410 if (!page && major)
411 page = find_lock_page(mapping, vmf->pgoff);
412
413 if (page) {
414 unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
415 PAGE_CACHE_SIZE, 0);
416 delete_from_page_cache(page);
417 unlock_page(page);
418 page_cache_release(page);
419 }
420
421 error = dax_insert_mapping(inode, &bh, vma, vmf);
422
423 out:
424 if (error == -ENOMEM)
425 return VM_FAULT_OOM | major;
426 /* -EBUSY is fine, somebody else faulted on the same PTE */
427 if ((error < 0) && (error != -EBUSY))
428 return VM_FAULT_SIGBUS | major;
429 return VM_FAULT_NOPAGE | major;
430
431 unlock_page:
432 if (page) {
433 unlock_page(page);
434 page_cache_release(page);
435 }
436 goto out;
437}
438
439/**
440 * dax_fault - handle a page fault on a DAX file
441 * @vma: The virtual memory area where the fault occurred
442 * @vmf: The description of the fault
443 * @get_block: The filesystem method used to translate file offsets to blocks
444 *
445 * When a page fault occurs, filesystems may call this helper in their
446 * fault handler for DAX files.
447 */
448int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
449 get_block_t get_block)
450{
451 int result;
452 struct super_block *sb = file_inode(vma->vm_file)->i_sb;
453
454 if (vmf->flags & FAULT_FLAG_WRITE) {
455 sb_start_pagefault(sb);
456 file_update_time(vma->vm_file);
457 }
458 result = do_dax_fault(vma, vmf, get_block);
459 if (vmf->flags & FAULT_FLAG_WRITE)
460 sb_end_pagefault(sb);
461
462 return result;
463}
464EXPORT_SYMBOL_GPL(dax_fault);
465
466/**
467 * dax_zero_page_range - zero a range within a page of a DAX file
468 * @inode: The file being truncated
469 * @from: The file offset that is being truncated to
470 * @length: The number of bytes to zero
471 * @get_block: The filesystem method used to translate file offsets to blocks
472 *
473 * This function can be called by a filesystem when it is zeroing part of a
474 * page in a DAX file. This is intended for hole-punch operations. If
475 * you are truncating a file, the helper function dax_truncate_page() may be
476 * more convenient.
477 *
478 * We work in terms of PAGE_CACHE_SIZE here for commonality with
479 * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
480 * took care of disposing of the unnecessary blocks. Even if the filesystem
481 * block size is smaller than PAGE_SIZE, we have to zero the rest of the page
482 * since the file might be mmapped.
483 */
484int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
485 get_block_t get_block)
486{
487 struct buffer_head bh;
488 pgoff_t index = from >> PAGE_CACHE_SHIFT;
489 unsigned offset = from & (PAGE_CACHE_SIZE-1);
490 int err;
491
492 /* Block boundary? Nothing to do */
493 if (!length)
494 return 0;
495 BUG_ON((offset + length) > PAGE_CACHE_SIZE);
496
497 memset(&bh, 0, sizeof(bh));
498 bh.b_size = PAGE_CACHE_SIZE;
499 err = get_block(inode, index, &bh, 0);
500 if (err < 0)
501 return err;
502 if (buffer_written(&bh)) {
503 void *addr;
504 err = dax_get_addr(&bh, &addr, inode->i_blkbits);
505 if (err < 0)
506 return err;
507 memset(addr + offset, 0, length);
508 }
509
510 return 0;
511}
512EXPORT_SYMBOL_GPL(dax_zero_page_range);
513
514/**
515 * dax_truncate_page - handle a partial page being truncated in a DAX file
516 * @inode: The file being truncated
517 * @from: The file offset that is being truncated to
518 * @get_block: The filesystem method used to translate file offsets to blocks
519 *
520 * Similar to block_truncate_page(), this function can be called by a
521 * filesystem when it is truncating a DAX file to handle the partial page.
522 *
523 * We work in terms of PAGE_CACHE_SIZE here for commonality with
524 * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
525 * took care of disposing of the unnecessary blocks. Even if the filesystem
526 * block size is smaller than PAGE_SIZE, we have to zero the rest of the page
527 * since the file might be mmapped.
528 */
529int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
530{
531 unsigned length = PAGE_CACHE_ALIGN(from) - from;
532 return dax_zero_page_range(inode, from, length, get_block);
533}
534EXPORT_SYMBOL_GPL(dax_truncate_page);
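Taken together, these exports are the whole DAX surface a filesystem needs: dax_do_io() for the read/write path, dax_fault() for mmap, and dax_zero_page_range()/dax_truncate_page() for truncation. A hedged sketch of the wiring, loosely following the ext2 hookup elsewhere in this series (the example_* names and the get_block callback are placeholders):

	static int example_dax_fault(struct vm_area_struct *vma,
				     struct vm_fault *vmf)
	{
		/* example_get_block is the filesystem's get_block_t. */
		return dax_fault(vma, vmf, example_get_block);
	}

	static const struct vm_operations_struct example_dax_vm_ops = {
		.fault		= example_dax_fault,
		.page_mkwrite	= example_dax_fault,	/* write faults allocate */
	};

	static int example_file_mmap(struct file *file, struct vm_area_struct *vma)
	{
		if (!IS_DAX(file_inode(file)))
			return generic_file_mmap(file, vma);

		file_accessed(file);
		vma->vm_ops = &example_dax_vm_ops;
		/* dax_insert_mapping() uses vm_insert_mixed(), so: */
		vma->vm_flags |= VM_MIXEDMAP;
		return 0;
	}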
diff --git a/fs/dcache.c b/fs/dcache.c
index e368d4f412f9..dc400fd29f4d 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -38,6 +38,8 @@
38#include <linux/prefetch.h> 38#include <linux/prefetch.h>
39#include <linux/ratelimit.h> 39#include <linux/ratelimit.h>
40#include <linux/list_lru.h> 40#include <linux/list_lru.h>
41#include <linux/kasan.h>
42
41#include "internal.h" 43#include "internal.h"
42#include "mount.h" 44#include "mount.h"
43 45
@@ -400,19 +402,20 @@ static void d_shrink_add(struct dentry *dentry, struct list_head *list)
400 * LRU lists entirely, while shrink_move moves it to the indicated 402 * LRU lists entirely, while shrink_move moves it to the indicated
401 * private list. 403 * private list.
402 */ 404 */
403static void d_lru_isolate(struct dentry *dentry) 405static void d_lru_isolate(struct list_lru_one *lru, struct dentry *dentry)
404{ 406{
405 D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); 407 D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
406 dentry->d_flags &= ~DCACHE_LRU_LIST; 408 dentry->d_flags &= ~DCACHE_LRU_LIST;
407 this_cpu_dec(nr_dentry_unused); 409 this_cpu_dec(nr_dentry_unused);
408 list_del_init(&dentry->d_lru); 410 list_lru_isolate(lru, &dentry->d_lru);
409} 411}
410 412
411static void d_lru_shrink_move(struct dentry *dentry, struct list_head *list) 413static void d_lru_shrink_move(struct list_lru_one *lru, struct dentry *dentry,
414 struct list_head *list)
412{ 415{
413 D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); 416 D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
414 dentry->d_flags |= DCACHE_SHRINK_LIST; 417 dentry->d_flags |= DCACHE_SHRINK_LIST;
415 list_move_tail(&dentry->d_lru, list); 418 list_lru_isolate_move(lru, &dentry->d_lru, list);
416} 419}
417 420
418/* 421/*
@@ -508,7 +511,7 @@ static void __dentry_kill(struct dentry *dentry)
508 * dentry_iput drops the locks, at which point nobody (except 511 * dentry_iput drops the locks, at which point nobody (except
509 * transient RCU lookups) can reach this dentry. 512 * transient RCU lookups) can reach this dentry.
510 */ 513 */
511 BUG_ON((int)dentry->d_lockref.count > 0); 514 BUG_ON(dentry->d_lockref.count > 0);
512 this_cpu_dec(nr_dentry); 515 this_cpu_dec(nr_dentry);
513 if (dentry->d_op && dentry->d_op->d_release) 516 if (dentry->d_op && dentry->d_op->d_release)
514 dentry->d_op->d_release(dentry); 517 dentry->d_op->d_release(dentry);
@@ -561,7 +564,7 @@ static inline struct dentry *lock_parent(struct dentry *dentry)
561 struct dentry *parent = dentry->d_parent; 564 struct dentry *parent = dentry->d_parent;
562 if (IS_ROOT(dentry)) 565 if (IS_ROOT(dentry))
563 return NULL; 566 return NULL;
564 if (unlikely((int)dentry->d_lockref.count < 0)) 567 if (unlikely(dentry->d_lockref.count < 0))
565 return NULL; 568 return NULL;
566 if (likely(spin_trylock(&parent->d_lock))) 569 if (likely(spin_trylock(&parent->d_lock)))
567 return parent; 570 return parent;
@@ -590,6 +593,110 @@ again:
590 return parent; 593 return parent;
591} 594}
592 595
596/*
597 * Try to do a lockless dput(), and return whether that was successful.
598 *
599 * If unsuccessful, we return false, having already taken the dentry lock.
600 *
601 * The caller needs to hold the RCU read lock, so that the dentry is
602 * guaranteed to stay around even if the refcount goes down to zero!
603 */
604static inline bool fast_dput(struct dentry *dentry)
605{
606 int ret;
607 unsigned int d_flags;
608
609 /*
 610 * If we have a d_op->d_delete() operation, we should not
 611 * let the dentry count go to zero, so use "put_or_lock".
612 */
613 if (unlikely(dentry->d_flags & DCACHE_OP_DELETE))
614 return lockref_put_or_lock(&dentry->d_lockref);
615
616 /*
617 * .. otherwise, we can try to just decrement the
618 * lockref optimistically.
619 */
620 ret = lockref_put_return(&dentry->d_lockref);
621
622 /*
623 * If the lockref_put_return() failed due to the lock being held
624 * by somebody else, the fast path has failed. We will need to
625 * get the lock, and then check the count again.
626 */
627 if (unlikely(ret < 0)) {
628 spin_lock(&dentry->d_lock);
629 if (dentry->d_lockref.count > 1) {
630 dentry->d_lockref.count--;
631 spin_unlock(&dentry->d_lock);
632 return 1;
633 }
634 return 0;
635 }
636
637 /*
638 * If we weren't the last ref, we're done.
639 */
640 if (ret)
641 return 1;
642
643 /*
644 * Careful, careful. The reference count went down
645 * to zero, but we don't hold the dentry lock, so
646 * somebody else could get it again, and do another
647 * dput(), and we need to not race with that.
648 *
649 * However, there is a very special and common case
650 * where we don't care, because there is nothing to
651 * do: the dentry is still hashed, it does not have
652 * a 'delete' op, and it's referenced and already on
653 * the LRU list.
654 *
655 * NOTE! Since we aren't locked, these values are
656 * not "stable". However, it is sufficient that at
657 * some point after we dropped the reference the
658 * dentry was hashed and the flags had the proper
659 * value. Other dentry users may have re-gotten
 660 * a reference to the dentry and changed that, but
661 * our work is done - we can leave the dentry
662 * around with a zero refcount.
663 */
664 smp_rmb();
665 d_flags = ACCESS_ONCE(dentry->d_flags);
666 d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST;
667
668 /* Nothing to do? Dropping the reference was all we needed? */
669 if (d_flags == (DCACHE_REFERENCED | DCACHE_LRU_LIST) && !d_unhashed(dentry))
670 return 1;
671
672 /*
673 * Not the fast normal case? Get the lock. We've already decremented
674 * the refcount, but we'll need to re-check the situation after
675 * getting the lock.
676 */
677 spin_lock(&dentry->d_lock);
678
679 /*
680 * Did somebody else grab a reference to it in the meantime, and
681 * we're no longer the last user after all? Alternatively, somebody
682 * else could have killed it and marked it dead. Either way, we
683 * don't need to do anything else.
684 */
685 if (dentry->d_lockref.count) {
686 spin_unlock(&dentry->d_lock);
687 return 1;
688 }
689
690 /*
691 * Re-get the reference we optimistically dropped. We hold the
692 * lock, and we just tested that it was zero, so we can just
693 * set it to 1.
694 */
695 dentry->d_lockref.count = 1;
696 return 0;
697}
698
699
593/* 700/*
594 * This is dput 701 * This is dput
595 * 702 *
@@ -622,8 +729,14 @@ void dput(struct dentry *dentry)
622 return; 729 return;
623 730
624repeat: 731repeat:
625 if (lockref_put_or_lock(&dentry->d_lockref)) 732 rcu_read_lock();
733 if (likely(fast_dput(dentry))) {
734 rcu_read_unlock();
626 return; 735 return;
736 }
737
738 /* Slow case: now with the dentry lock held */
739 rcu_read_unlock();
627 740
628 /* Unreachable? Get rid of it */ 741 /* Unreachable? Get rid of it */
629 if (unlikely(d_unhashed(dentry))) 742 if (unlikely(d_unhashed(dentry)))
@@ -810,7 +923,7 @@ static void shrink_dentry_list(struct list_head *list)
810 * We found an inuse dentry which was not removed from 923 * We found an inuse dentry which was not removed from
811 * the LRU because of laziness during lookup. Do not free it. 924 * the LRU because of laziness during lookup. Do not free it.
812 */ 925 */
813 if ((int)dentry->d_lockref.count > 0) { 926 if (dentry->d_lockref.count > 0) {
814 spin_unlock(&dentry->d_lock); 927 spin_unlock(&dentry->d_lock);
815 if (parent) 928 if (parent)
816 spin_unlock(&parent->d_lock); 929 spin_unlock(&parent->d_lock);
@@ -869,8 +982,8 @@ static void shrink_dentry_list(struct list_head *list)
869 } 982 }
870} 983}
871 984
872static enum lru_status 985static enum lru_status dentry_lru_isolate(struct list_head *item,
873dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) 986 struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
874{ 987{
875 struct list_head *freeable = arg; 988 struct list_head *freeable = arg;
876 struct dentry *dentry = container_of(item, struct dentry, d_lru); 989 struct dentry *dentry = container_of(item, struct dentry, d_lru);
@@ -890,7 +1003,7 @@ dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
890 * another pass through the LRU. 1003 * another pass through the LRU.
891 */ 1004 */
892 if (dentry->d_lockref.count) { 1005 if (dentry->d_lockref.count) {
893 d_lru_isolate(dentry); 1006 d_lru_isolate(lru, dentry);
894 spin_unlock(&dentry->d_lock); 1007 spin_unlock(&dentry->d_lock);
895 return LRU_REMOVED; 1008 return LRU_REMOVED;
896 } 1009 }
@@ -921,7 +1034,7 @@ dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
921 return LRU_ROTATE; 1034 return LRU_ROTATE;
922 } 1035 }
923 1036
924 d_lru_shrink_move(dentry, freeable); 1037 d_lru_shrink_move(lru, dentry, freeable);
925 spin_unlock(&dentry->d_lock); 1038 spin_unlock(&dentry->d_lock);
926 1039
927 return LRU_REMOVED; 1040 return LRU_REMOVED;
@@ -930,30 +1043,28 @@ dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
930/** 1043/**
931 * prune_dcache_sb - shrink the dcache 1044 * prune_dcache_sb - shrink the dcache
932 * @sb: superblock 1045 * @sb: superblock
933 * @nr_to_scan : number of entries to try to free 1046 * @sc: shrink control, passed to list_lru_shrink_walk()
934 * @nid: which node to scan for freeable entities
935 * 1047 *
936 * Attempt to shrink the superblock dcache LRU by @nr_to_scan entries. This is 1048 * Attempt to shrink the superblock dcache LRU by @sc->nr_to_scan entries. This
937 * done when we need more memory an called from the superblock shrinker 1049 * is done when we need more memory and called from the superblock shrinker
938 * function. 1050 * function.
939 * 1051 *
940 * This function may fail to free any resources if all the dentries are in 1052 * This function may fail to free any resources if all the dentries are in
941 * use. 1053 * use.
942 */ 1054 */
943long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan, 1055long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc)
944 int nid)
945{ 1056{
946 LIST_HEAD(dispose); 1057 LIST_HEAD(dispose);
947 long freed; 1058 long freed;
948 1059
949 freed = list_lru_walk_node(&sb->s_dentry_lru, nid, dentry_lru_isolate, 1060 freed = list_lru_shrink_walk(&sb->s_dentry_lru, sc,
950 &dispose, &nr_to_scan); 1061 dentry_lru_isolate, &dispose);
951 shrink_dentry_list(&dispose); 1062 shrink_dentry_list(&dispose);
952 return freed; 1063 return freed;
953} 1064}
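
The extra struct list_lru_one * argument threaded through dentry_lru_isolate() above comes from the list_lru API growing per-memcg lists: every walk callback now receives the list the item currently sits on and must pass it down to the move/isolate helpers. A hedged sketch of the updated callback shape for some other cache, where my_obj, my_isolate and my_lru_move are hypothetical names (dcache's real helpers are d_lru_isolate() and d_lru_shrink_move(), as seen above):

	static enum lru_status my_isolate(struct list_head *item,
			struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
	{
		struct list_head *freeable = arg;
		struct my_obj *obj = container_of(item, struct my_obj, lru);

		/* Don't block the walk on a contended object. */
		if (!spin_trylock(&obj->lock))
			return LRU_SKIP;

		/* The lru argument is what the per-memcg accounting keys
		 * off; my_lru_move() stands in for the subsystem's own
		 * helper that moves the object to the dispose list. */
		my_lru_move(lru, obj, freeable);
		spin_unlock(&obj->lock);
		return LRU_REMOVED;
	}
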
954 1065
955static enum lru_status dentry_lru_isolate_shrink(struct list_head *item, 1066static enum lru_status dentry_lru_isolate_shrink(struct list_head *item,
956 spinlock_t *lru_lock, void *arg) 1067 struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
957{ 1068{
958 struct list_head *freeable = arg; 1069 struct list_head *freeable = arg;
959 struct dentry *dentry = container_of(item, struct dentry, d_lru); 1070 struct dentry *dentry = container_of(item, struct dentry, d_lru);
@@ -966,7 +1077,7 @@ static enum lru_status dentry_lru_isolate_shrink(struct list_head *item,
966 if (!spin_trylock(&dentry->d_lock)) 1077 if (!spin_trylock(&dentry->d_lock))
967 return LRU_SKIP; 1078 return LRU_SKIP;
968 1079
969 d_lru_shrink_move(dentry, freeable); 1080 d_lru_shrink_move(lru, dentry, freeable);
970 spin_unlock(&dentry->d_lock); 1081 spin_unlock(&dentry->d_lock);
971 1082
972 return LRU_REMOVED; 1083 return LRU_REMOVED;
@@ -1430,6 +1541,9 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
1430 } 1541 }
1431 atomic_set(&p->u.count, 1); 1542 atomic_set(&p->u.count, 1);
1432 dname = p->name; 1543 dname = p->name;
1544 if (IS_ENABLED(CONFIG_DCACHE_WORD_ACCESS))
1545 kasan_unpoison_shadow(dname,
1546 round_up(name->len + 1, sizeof(unsigned long)));
1433 } else { 1547 } else {
1434 dname = dentry->d_iname; 1548 dname = dentry->d_iname;
1435 } 1549 }
@@ -2187,37 +2301,6 @@ struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name)
2187} 2301}
2188EXPORT_SYMBOL(d_hash_and_lookup); 2302EXPORT_SYMBOL(d_hash_and_lookup);
2189 2303
2190/**
2191 * d_validate - verify dentry provided from insecure source (deprecated)
2192 * @dentry: The dentry alleged to be valid child of @dparent
2193 * @dparent: The parent dentry (known to be valid)
2194 *
2195 * An insecure source has sent us a dentry, here we verify it and dget() it.
2196 * This is used by ncpfs in its readdir implementation.
2197 * Zero is returned in the dentry is invalid.
2198 *
2199 * This function is slow for big directories, and deprecated, do not use it.
2200 */
2201int d_validate(struct dentry *dentry, struct dentry *dparent)
2202{
2203 struct dentry *child;
2204
2205 spin_lock(&dparent->d_lock);
2206 list_for_each_entry(child, &dparent->d_subdirs, d_child) {
2207 if (dentry == child) {
2208 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
2209 __dget_dlock(dentry);
2210 spin_unlock(&dentry->d_lock);
2211 spin_unlock(&dparent->d_lock);
2212 return 1;
2213 }
2214 }
2215 spin_unlock(&dparent->d_lock);
2216
2217 return 0;
2218}
2219EXPORT_SYMBOL(d_validate);
2220
2221/* 2304/*
2222 * When a file is deleted, we have two options: 2305 * When a file is deleted, we have two options:
2223 * - turn this dentry into a negative dentry 2306 * - turn this dentry into a negative dentry
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 05f2960ed7c3..45b18a5e225c 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -34,93 +34,16 @@ static struct vfsmount *debugfs_mount;
34static int debugfs_mount_count; 34static int debugfs_mount_count;
35static bool debugfs_registered; 35static bool debugfs_registered;
36 36
37static struct inode *debugfs_get_inode(struct super_block *sb, umode_t mode, dev_t dev, 37static struct inode *debugfs_get_inode(struct super_block *sb)
38 void *data, const struct file_operations *fops)
39
40{ 38{
41 struct inode *inode = new_inode(sb); 39 struct inode *inode = new_inode(sb);
42
43 if (inode) { 40 if (inode) {
44 inode->i_ino = get_next_ino(); 41 inode->i_ino = get_next_ino();
45 inode->i_mode = mode;
46 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 42 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
47 switch (mode & S_IFMT) {
48 default:
49 init_special_inode(inode, mode, dev);
50 break;
51 case S_IFREG:
52 inode->i_fop = fops ? fops : &debugfs_file_operations;
53 inode->i_private = data;
54 break;
55 case S_IFLNK:
56 inode->i_op = &debugfs_link_operations;
57 inode->i_private = data;
58 break;
59 case S_IFDIR:
60 inode->i_op = &simple_dir_inode_operations;
61 inode->i_fop = &simple_dir_operations;
62
63 /* directory inodes start off with i_nlink == 2
64 * (for "." entry) */
65 inc_nlink(inode);
66 break;
67 }
68 } 43 }
69 return inode; 44 return inode;
70} 45}
71 46
72/* SMP-safe */
73static int debugfs_mknod(struct inode *dir, struct dentry *dentry,
74 umode_t mode, dev_t dev, void *data,
75 const struct file_operations *fops)
76{
77 struct inode *inode;
78 int error = -EPERM;
79
80 if (dentry->d_inode)
81 return -EEXIST;
82
83 inode = debugfs_get_inode(dir->i_sb, mode, dev, data, fops);
84 if (inode) {
85 d_instantiate(dentry, inode);
86 dget(dentry);
87 error = 0;
88 }
89 return error;
90}
91
92static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
93{
94 int res;
95
96 mode = (mode & (S_IRWXUGO | S_ISVTX)) | S_IFDIR;
97 res = debugfs_mknod(dir, dentry, mode, 0, NULL, NULL);
98 if (!res) {
99 inc_nlink(dir);
100 fsnotify_mkdir(dir, dentry);
101 }
102 return res;
103}
104
105static int debugfs_link(struct inode *dir, struct dentry *dentry, umode_t mode,
106 void *data)
107{
108 mode = (mode & S_IALLUGO) | S_IFLNK;
109 return debugfs_mknod(dir, dentry, mode, 0, data, NULL);
110}
111
112static int debugfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
113 void *data, const struct file_operations *fops)
114{
115 int res;
116
117 mode = (mode & S_IALLUGO) | S_IFREG;
118 res = debugfs_mknod(dir, dentry, mode, 0, data, fops);
119 if (!res)
120 fsnotify_create(dir, dentry);
121 return res;
122}
123
124static inline int debugfs_positive(struct dentry *dentry) 47static inline int debugfs_positive(struct dentry *dentry)
125{ 48{
126 return dentry->d_inode && !d_unhashed(dentry); 49 return dentry->d_inode && !d_unhashed(dentry);
@@ -252,6 +175,18 @@ static const struct super_operations debugfs_super_operations = {
252 .show_options = debugfs_show_options, 175 .show_options = debugfs_show_options,
253}; 176};
254 177
178static struct vfsmount *debugfs_automount(struct path *path)
179{
180 struct vfsmount *(*f)(void *);
181 f = (struct vfsmount *(*)(void *))path->dentry->d_fsdata;
182 return f(path->dentry->d_inode->i_private);
183}
184
185static const struct dentry_operations debugfs_dops = {
186 .d_delete = always_delete_dentry,
187 .d_automount = debugfs_automount,
188};
189
255static int debug_fill_super(struct super_block *sb, void *data, int silent) 190static int debug_fill_super(struct super_block *sb, void *data, int silent)
256{ 191{
257 static struct tree_descr debug_files[] = {{""}}; 192 static struct tree_descr debug_files[] = {{""}};
@@ -276,6 +211,7 @@ static int debug_fill_super(struct super_block *sb, void *data, int silent)
276 goto fail; 211 goto fail;
277 212
278 sb->s_op = &debugfs_super_operations; 213 sb->s_op = &debugfs_super_operations;
214 sb->s_d_op = &debugfs_dops;
279 215
280 debugfs_apply_options(sb); 216 debugfs_apply_options(sb);
281 217
@@ -302,11 +238,9 @@ static struct file_system_type debug_fs_type = {
302}; 238};
303MODULE_ALIAS_FS("debugfs"); 239MODULE_ALIAS_FS("debugfs");
304 240
305static struct dentry *__create_file(const char *name, umode_t mode, 241static struct dentry *start_creating(const char *name, struct dentry *parent)
306 struct dentry *parent, void *data,
307 const struct file_operations *fops)
308{ 242{
309 struct dentry *dentry = NULL; 243 struct dentry *dentry;
310 int error; 244 int error;
311 245
312 pr_debug("debugfs: creating file '%s'\n",name); 246 pr_debug("debugfs: creating file '%s'\n",name);
@@ -314,7 +248,7 @@ static struct dentry *__create_file(const char *name, umode_t mode,
314 error = simple_pin_fs(&debug_fs_type, &debugfs_mount, 248 error = simple_pin_fs(&debug_fs_type, &debugfs_mount,
315 &debugfs_mount_count); 249 &debugfs_mount_count);
316 if (error) 250 if (error)
317 goto exit; 251 return ERR_PTR(error);
318 252
319 /* If the parent is not specified, we create it in the root. 253 /* If the parent is not specified, we create it in the root.
320 * We need the root dentry to do this, which is in the super 254 * We need the root dentry to do this, which is in the super
@@ -326,31 +260,26 @@ static struct dentry *__create_file(const char *name, umode_t mode,
326 260
327 mutex_lock(&parent->d_inode->i_mutex); 261 mutex_lock(&parent->d_inode->i_mutex);
328 dentry = lookup_one_len(name, parent, strlen(name)); 262 dentry = lookup_one_len(name, parent, strlen(name));
329 if (!IS_ERR(dentry)) { 263 if (!IS_ERR(dentry) && dentry->d_inode) {
330 switch (mode & S_IFMT) {
331 case S_IFDIR:
332 error = debugfs_mkdir(parent->d_inode, dentry, mode);
333
334 break;
335 case S_IFLNK:
336 error = debugfs_link(parent->d_inode, dentry, mode,
337 data);
338 break;
339 default:
340 error = debugfs_create(parent->d_inode, dentry, mode,
341 data, fops);
342 break;
343 }
344 dput(dentry); 264 dput(dentry);
345 } else 265 dentry = ERR_PTR(-EEXIST);
346 error = PTR_ERR(dentry);
347 mutex_unlock(&parent->d_inode->i_mutex);
348
349 if (error) {
350 dentry = NULL;
351 simple_release_fs(&debugfs_mount, &debugfs_mount_count);
352 } 266 }
353exit: 267 if (IS_ERR(dentry))
268 mutex_unlock(&parent->d_inode->i_mutex);
269 return dentry;
270}
271
272static struct dentry *failed_creating(struct dentry *dentry)
273{
274 mutex_unlock(&dentry->d_parent->d_inode->i_mutex);
275 dput(dentry);
276 simple_release_fs(&debugfs_mount, &debugfs_mount_count);
277 return NULL;
278}
279
280static struct dentry *end_creating(struct dentry *dentry)
281{
282 mutex_unlock(&dentry->d_parent->d_inode->i_mutex);
354 return dentry; 283 return dentry;
355} 284}
356 285
@@ -384,19 +313,71 @@ struct dentry *debugfs_create_file(const char *name, umode_t mode,
384 struct dentry *parent, void *data, 313 struct dentry *parent, void *data,
385 const struct file_operations *fops) 314 const struct file_operations *fops)
386{ 315{
387 switch (mode & S_IFMT) { 316 struct dentry *dentry;
388 case S_IFREG: 317 struct inode *inode;
389 case 0: 318
390 break; 319 if (!(mode & S_IFMT))
391 default: 320 mode |= S_IFREG;
392 BUG(); 321 BUG_ON(!S_ISREG(mode));
393 } 322 dentry = start_creating(name, parent);
323
324 if (IS_ERR(dentry))
325 return NULL;
394 326
395 return __create_file(name, mode, parent, data, fops); 327 inode = debugfs_get_inode(dentry->d_sb);
328 if (unlikely(!inode))
329 return failed_creating(dentry);
330
331 inode->i_mode = mode;
332 inode->i_fop = fops ? fops : &debugfs_file_operations;
333 inode->i_private = data;
334 d_instantiate(dentry, inode);
335 fsnotify_create(dentry->d_parent->d_inode, dentry);
336 return end_creating(dentry);
396} 337}
397EXPORT_SYMBOL_GPL(debugfs_create_file); 338EXPORT_SYMBOL_GPL(debugfs_create_file);
398 339
399/** 340/**
341 * debugfs_create_file_size - create a file in the debugfs filesystem
342 * @name: a pointer to a string containing the name of the file to create.
343 * @mode: the permission that the file should have.
344 * @parent: a pointer to the parent dentry for this file. This should be a
345 * directory dentry if set. If this parameter is NULL, then the
346 * file will be created in the root of the debugfs filesystem.
347 * @data: a pointer to something that the caller will want to get to later
348 * on. The inode.i_private pointer will point to this value on
349 * the open() call.
350 * @fops: a pointer to a struct file_operations that should be used for
351 * this file.
352 * @file_size: initial file size
353 *
354 * This is the basic "create a file" function for debugfs. It allows for a
355 * wide range of flexibility in creating a file, or a directory (if you want
356 * to create a directory, the debugfs_create_dir() function should
357 * be used instead.)
358 *
359 * This function will return a pointer to a dentry if it succeeds. This
360 * pointer must be passed to the debugfs_remove() function when the file is
361 * to be removed (no automatic cleanup happens if your module is unloaded,
362 * you are responsible here.) If an error occurs, %NULL will be returned.
363 *
364 * If debugfs is not enabled in the kernel, the value -%ENODEV will be
365 * returned.
366 */
367struct dentry *debugfs_create_file_size(const char *name, umode_t mode,
368 struct dentry *parent, void *data,
369 const struct file_operations *fops,
370 loff_t file_size)
371{
372 struct dentry *de = debugfs_create_file(name, mode, parent, data, fops);
373
374 if (de)
375 de->d_inode->i_size = file_size;
376 return de;
377}
378EXPORT_SYMBOL_GPL(debugfs_create_file_size);
379
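
A hedged sketch of a caller for the new helper: a module creates a directory and a fixed-size, read-only file backed by a static buffer. debugfs_create_dir(), simple_read_from_buffer() and debugfs_remove_recursive() are existing kernel APIs; everything named mydrv_* is hypothetical.

	#include <linux/module.h>
	#include <linux/debugfs.h>
	#include <linux/fs.h>
	#include <linux/uaccess.h>

	static char mydrv_regs[64];		/* hypothetical register snapshot */
	static struct dentry *mydrv_dir;

	static ssize_t mydrv_read(struct file *file, char __user *buf,
				  size_t count, loff_t *ppos)
	{
		return simple_read_from_buffer(buf, count, ppos, mydrv_regs,
					       sizeof(mydrv_regs));
	}

	static const struct file_operations mydrv_fops = {
		.owner = THIS_MODULE,
		.read  = mydrv_read,
	};

	static int __init mydrv_init(void)
	{
		mydrv_dir = debugfs_create_dir("mydrv", NULL);
		if (!mydrv_dir)
			return -ENODEV;
		/* i_size is set up front, so stat() reports 64 bytes
		 * without a read having to run first. */
		debugfs_create_file_size("regs", 0444, mydrv_dir, NULL,
					 &mydrv_fops, sizeof(mydrv_regs));
		return 0;
	}

	static void __exit mydrv_exit(void)
	{
		debugfs_remove_recursive(mydrv_dir);
	}

	module_init(mydrv_init);
	module_exit(mydrv_exit);
	MODULE_LICENSE("GPL");
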
380/**
400 * debugfs_create_dir - create a directory in the debugfs filesystem 381 * debugfs_create_dir - create a directory in the debugfs filesystem
401 * @name: a pointer to a string containing the name of the directory to 382 * @name: a pointer to a string containing the name of the directory to
402 * create. 383 * create.
@@ -416,12 +397,65 @@ EXPORT_SYMBOL_GPL(debugfs_create_file);
416 */ 397 */
417struct dentry *debugfs_create_dir(const char *name, struct dentry *parent) 398struct dentry *debugfs_create_dir(const char *name, struct dentry *parent)
418{ 399{
419 return __create_file(name, S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO, 400 struct dentry *dentry = start_creating(name, parent);
420 parent, NULL, NULL); 401 struct inode *inode;
402
403 if (IS_ERR(dentry))
404 return NULL;
405
406 inode = debugfs_get_inode(dentry->d_sb);
407 if (unlikely(!inode))
408 return failed_creating(dentry);
409
410 inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
411 inode->i_op = &simple_dir_inode_operations;
412 inode->i_fop = &simple_dir_operations;
413
414 /* directory inodes start off with i_nlink == 2 (for "." entry) */
415 inc_nlink(inode);
416 d_instantiate(dentry, inode);
417 inc_nlink(dentry->d_parent->d_inode);
418 fsnotify_mkdir(dentry->d_parent->d_inode, dentry);
419 return end_creating(dentry);
421} 420}
422EXPORT_SYMBOL_GPL(debugfs_create_dir); 421EXPORT_SYMBOL_GPL(debugfs_create_dir);
423 422
424/** 423/**
424 * debugfs_create_automount - create automount point in the debugfs filesystem
425 * @name: a pointer to a string containing the name of the file to create.
426 * @parent: a pointer to the parent dentry for this file. This should be a
427 * directory dentry if set. If this parameter is NULL, then the
428 * file will be created in the root of the debugfs filesystem.
429 * @f: function to be called when pathname resolution steps on that one.
430 * @data: opaque argument to pass to f().
431 *
432 * @f should return what ->d_automount() would.
433 */
434struct dentry *debugfs_create_automount(const char *name,
435 struct dentry *parent,
436 struct vfsmount *(*f)(void *),
437 void *data)
438{
439 struct dentry *dentry = start_creating(name, parent);
440 struct inode *inode;
441
442 if (IS_ERR(dentry))
443 return NULL;
444
445 inode = debugfs_get_inode(dentry->d_sb);
446 if (unlikely(!inode))
447 return failed_creating(dentry);
448
449 inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
450 inode->i_flags |= S_AUTOMOUNT;
451 inode->i_private = data;
452 dentry->d_fsdata = (void *)f;
453 d_instantiate(dentry, inode);
454 return end_creating(dentry);
455}
456EXPORT_SYMBOL(debugfs_create_automount);
457
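
A hedged sketch of a debugfs_create_automount() caller: the callback hands back what ->d_automount() would whenever path resolution crosses the point. mydrv_get_mount() is a hypothetical stand-in for however the subsystem actually produces its vfsmount.

	/* Hypothetical: the subsystem helper that produces the mount. */
	extern struct vfsmount *mydrv_get_mount(void *data);

	static struct vfsmount *mydrv_automount(void *data)
	{
		/* Return what ->d_automount() would (see the kerneldoc above). */
		return mydrv_get_mount(data);
	}

	static int __init mydrv_automount_init(void)
	{
		struct dentry *de;

		de = debugfs_create_automount("instances", NULL,
					      mydrv_automount, NULL);
		return de ? 0 : -ENODEV;
	}
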
458/**
425 * debugfs_create_symlink- create a symbolic link in the debugfs filesystem 459 * debugfs_create_symlink- create a symbolic link in the debugfs filesystem
426 * @name: a pointer to a string containing the name of the symbolic link to 460 * @name: a pointer to a string containing the name of the symbolic link to
427 * create. 461 * create.
@@ -447,17 +481,28 @@ EXPORT_SYMBOL_GPL(debugfs_create_dir);
447struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent, 481struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent,
448 const char *target) 482 const char *target)
449{ 483{
450 struct dentry *result; 484 struct dentry *dentry;
451 char *link; 485 struct inode *inode;
452 486 char *link = kstrdup(target, GFP_KERNEL);
453 link = kstrdup(target, GFP_KERNEL);
454 if (!link) 487 if (!link)
455 return NULL; 488 return NULL;
456 489
457 result = __create_file(name, S_IFLNK | S_IRWXUGO, parent, link, NULL); 490 dentry = start_creating(name, parent);
458 if (!result) 491 if (IS_ERR(dentry)) {
459 kfree(link); 492 kfree(link);
460 return result; 493 return NULL;
494 }
495
496 inode = debugfs_get_inode(dentry->d_sb);
497 if (unlikely(!inode)) {
498 kfree(link);
499 return failed_creating(dentry);
500 }
501 inode->i_mode = S_IFLNK | S_IRWXUGO;
502 inode->i_op = &debugfs_link_operations;
503 inode->i_private = link;
504 d_instantiate(dentry, inode);
505 return end_creating(dentry);
461} 506}
462EXPORT_SYMBOL_GPL(debugfs_create_symlink); 507EXPORT_SYMBOL_GPL(debugfs_create_symlink);
463 508
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index e7cfbaf8d0e2..1e6e227134d7 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -56,13 +56,8 @@ static int send_data(struct sk_buff *skb)
56{ 56{
57 struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); 57 struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
58 void *data = genlmsg_data(genlhdr); 58 void *data = genlmsg_data(genlhdr);
59 int rv;
60 59
61 rv = genlmsg_end(skb, data); 60 genlmsg_end(skb, data);
62 if (rv < 0) {
63 nlmsg_free(skb);
64 return rv;
65 }
66 61
67 return genlmsg_unicast(&init_net, skb, listener_nlportid); 62 return genlmsg_unicast(&init_net, skb, listener_nlportid);
68} 63}
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 2bc2c87f35e7..5718cb9f7273 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -37,20 +37,6 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
37 iput(toput_inode); 37 iput(toput_inode);
38} 38}
39 39
40static void drop_slab(void)
41{
42 int nr_objects;
43
44 do {
45 int nid;
46
47 nr_objects = 0;
48 for_each_online_node(nid)
49 nr_objects += shrink_node_slabs(GFP_KERNEL, nid,
50 1000, 1000);
51 } while (nr_objects > 10);
52}
53
54int drop_caches_sysctl_handler(struct ctl_table *table, int write, 40int drop_caches_sysctl_handler(struct ctl_table *table, int write,
55 void __user *buffer, size_t *length, loff_t *ppos) 41 void __user *buffer, size_t *length, loff_t *ppos)
56{ 42{
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 1686dc2da9fd..34b36a504059 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -67,7 +67,6 @@ static int ecryptfs_inode_set(struct inode *inode, void *opaque)
67 inode->i_ino = lower_inode->i_ino; 67 inode->i_ino = lower_inode->i_ino;
68 inode->i_version++; 68 inode->i_version++;
69 inode->i_mapping->a_ops = &ecryptfs_aops; 69 inode->i_mapping->a_ops = &ecryptfs_aops;
70 inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi;
71 70
72 if (S_ISLNK(inode->i_mode)) 71 if (S_ISLNK(inode->i_mode))
73 inode->i_op = &ecryptfs_symlink_iops; 72 inode->i_op = &ecryptfs_symlink_iops;
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index d9eb84bda559..1895d60f4122 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -520,7 +520,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
520 goto out; 520 goto out;
521 } 521 }
522 522
523 rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs", BDI_CAP_MAP_COPY); 523 rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs");
524 if (rc) 524 if (rc)
525 goto out1; 525 goto out1;
526 526
diff --git a/fs/efivarfs/Kconfig b/fs/efivarfs/Kconfig
index 367bbb10c543..c2499ef174a2 100644
--- a/fs/efivarfs/Kconfig
+++ b/fs/efivarfs/Kconfig
@@ -1,6 +1,7 @@
1config EFIVAR_FS 1config EFIVAR_FS
2 tristate "EFI Variable filesystem" 2 tristate "EFI Variable filesystem"
3 depends on EFI 3 depends on EFI
4 default m
4 help 5 help
5 efivarfs is a replacement filesystem for the old EFI 6 efivarfs is a replacement filesystem for the old EFI
6 variable support via sysfs, as it doesn't suffer from the 7 variable support via sysfs, as it doesn't suffer from the
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index 6dad1176ec52..ddbce42548c9 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -140,7 +140,7 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor,
140 140
141 name[len] = '-'; 141 name[len] = '-';
142 142
143 efi_guid_unparse(&entry->var.VendorGuid, name + len + 1); 143 efi_guid_to_str(&entry->var.VendorGuid, name + len + 1);
144 144
145 name[len + EFI_VARIABLE_GUID_LEN+1] = '\0'; 145 name[len + EFI_VARIABLE_GUID_LEN+1] = '\0';
146 146
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 4b0a226024fa..8d0c0df01854 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -118,18 +118,18 @@ static unsigned int eventfd_poll(struct file *file, poll_table *wait)
118{ 118{
119 struct eventfd_ctx *ctx = file->private_data; 119 struct eventfd_ctx *ctx = file->private_data;
120 unsigned int events = 0; 120 unsigned int events = 0;
121 unsigned long flags; 121 u64 count;
122 122
123 poll_wait(file, &ctx->wqh, wait); 123 poll_wait(file, &ctx->wqh, wait);
124 smp_rmb();
125 count = ctx->count;
124 126
125 spin_lock_irqsave(&ctx->wqh.lock, flags); 127 if (count > 0)
126 if (ctx->count > 0)
127 events |= POLLIN; 128 events |= POLLIN;
128 if (ctx->count == ULLONG_MAX) 129 if (count == ULLONG_MAX)
129 events |= POLLERR; 130 events |= POLLERR;
130 if (ULLONG_MAX - 1 > ctx->count) 131 if (ULLONG_MAX - 1 > count)
131 events |= POLLOUT; 132 events |= POLLOUT;
132 spin_unlock_irqrestore(&ctx->wqh.lock, flags);
133 133
134 return events; 134 return events;
135} 135}
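
The semantics made lockless above are visible from user space: POLLIN once the counter is non-zero, POLLOUT while another write could still succeed, and POLLERR only at the ULLONG_MAX overflow value. A small self-contained demo, assuming Linux's eventfd(2):

	#include <sys/eventfd.h>
	#include <poll.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int efd = eventfd(0, 0);
		uint64_t v = 3;
		struct pollfd pfd;

		if (efd < 0)
			return 1;
		if (write(efd, &v, sizeof(v)) != sizeof(v))	/* counter: 0 -> 3 */
			return 1;

		pfd.fd = efd;
		pfd.events = POLLIN | POLLOUT;
		pfd.revents = 0;
		poll(&pfd, 1, 0);
		/* Expect POLLIN (count > 0) and POLLOUT (room for more). */
		printf("POLLIN=%d POLLOUT=%d\n",
		       !!(pfd.revents & POLLIN), !!(pfd.revents & POLLOUT));

		if (read(efd, &v, sizeof(v)) == sizeof(v))	/* drains to 0 */
			printf("drained %llu\n", (unsigned long long)v);
		close(efd);
		return 0;
	}
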
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index d77f94491352..1e009cad8d5c 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1639,9 +1639,9 @@ fetch_events:
1639 1639
1640 spin_lock_irqsave(&ep->lock, flags); 1640 spin_lock_irqsave(&ep->lock, flags);
1641 } 1641 }
1642 __remove_wait_queue(&ep->wq, &wait);
1643 1642
1644 set_current_state(TASK_RUNNING); 1643 __remove_wait_queue(&ep->wq, &wait);
1644 __set_current_state(TASK_RUNNING);
1645 } 1645 }
1646check_events: 1646check_events:
1647 /* Is it worth to try to dig for events ? */ 1647 /* Is it worth to try to dig for events ? */
diff --git a/fs/exec.c b/fs/exec.c
index ad8798e26be9..c7f9b733406d 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -794,8 +794,14 @@ exit:
794 794
795struct file *open_exec(const char *name) 795struct file *open_exec(const char *name)
796{ 796{
797 struct filename tmp = { .name = name }; 797 struct filename *filename = getname_kernel(name);
798 return do_open_execat(AT_FDCWD, &tmp, 0); 798 struct file *f = ERR_CAST(filename);
799
800 if (!IS_ERR(filename)) {
801 f = do_open_execat(AT_FDCWD, filename, 0);
802 putname(filename);
803 }
804 return f;
799} 805}
800EXPORT_SYMBOL(open_exec); 806EXPORT_SYMBOL(open_exec);
801 807
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index f1d3d4eb8c4f..a198e94813fe 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -985,7 +985,6 @@ const struct address_space_operations exofs_aops = {
985 .direct_IO = exofs_direct_IO, 985 .direct_IO = exofs_direct_IO,
986 986
987 /* With these NULL has special meaning or default is not exported */ 987 /* With these NULL has special meaning or default is not exported */
988 .get_xip_mem = NULL,
989 .migratepage = NULL, 988 .migratepage = NULL,
990 .launder_page = NULL, 989 .launder_page = NULL,
991 .is_partially_uptodate = NULL, 990 .is_partially_uptodate = NULL,
@@ -1214,7 +1213,6 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
1214 memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data)); 1213 memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data));
1215 } 1214 }
1216 1215
1217 inode->i_mapping->backing_dev_info = sb->s_bdi;
1218 if (S_ISREG(inode->i_mode)) { 1216 if (S_ISREG(inode->i_mode)) {
1219 inode->i_op = &exofs_file_inode_operations; 1217 inode->i_op = &exofs_file_inode_operations;
1220 inode->i_fop = &exofs_file_operations; 1218 inode->i_fop = &exofs_file_operations;
@@ -1314,7 +1312,6 @@ struct inode *exofs_new_inode(struct inode *dir, umode_t mode)
1314 1312
1315 set_obj_2bcreated(oi); 1313 set_obj_2bcreated(oi);
1316 1314
1317 inode->i_mapping->backing_dev_info = sb->s_bdi;
1318 inode_init_owner(inode, dir, mode); 1315 inode_init_owner(inode, dir, mode);
1319 inode->i_ino = sbi->s_nextid++; 1316 inode->i_ino = sbi->s_nextid++;
1320 inode->i_blkbits = EXOFS_BLKSHIFT; 1317 inode->i_blkbits = EXOFS_BLKSHIFT;
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 95965503afcb..fcc2e565f540 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -836,7 +836,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
836 goto free_sbi; 836 goto free_sbi;
837 } 837 }
838 838
839 ret = bdi_setup_and_register(&sbi->bdi, "exofs", BDI_CAP_MAP_COPY); 839 ret = bdi_setup_and_register(&sbi->bdi, "exofs");
840 if (ret) { 840 if (ret) {
841 EXOFS_DBGMSG("Failed to bdi_setup_and_register\n"); 841 EXOFS_DBGMSG("Failed to bdi_setup_and_register\n");
842 dput(sb->s_root); 842 dput(sb->s_root);
diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig
index 14a6780fd034..c634874e12d9 100644
--- a/fs/ext2/Kconfig
+++ b/fs/ext2/Kconfig
@@ -42,14 +42,3 @@ config EXT2_FS_SECURITY
42 42
43 If you are not using a security module that requires using 43 If you are not using a security module that requires using
44 extended attributes for file security labels, say N. 44 extended attributes for file security labels, say N.
45
46config EXT2_FS_XIP
47 bool "Ext2 execute in place support"
48 depends on EXT2_FS && MMU
49 help
50 Execute in place can be used on memory-backed block devices. If you
51 enable this option, you can select to mount block devices which are
52 capable of this feature without using the page cache.
53
54 If you do not use a block device that is capable of using this,
55 or if unsure, say N.
diff --git a/fs/ext2/Makefile b/fs/ext2/Makefile
index f42af45cfd88..445b0e996a12 100644
--- a/fs/ext2/Makefile
+++ b/fs/ext2/Makefile
@@ -10,4 +10,3 @@ ext2-y := balloc.o dir.o file.o ialloc.o inode.o \
10ext2-$(CONFIG_EXT2_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o 10ext2-$(CONFIG_EXT2_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
11ext2-$(CONFIG_EXT2_FS_POSIX_ACL) += acl.o 11ext2-$(CONFIG_EXT2_FS_POSIX_ACL) += acl.o
12ext2-$(CONFIG_EXT2_FS_SECURITY) += xattr_security.o 12ext2-$(CONFIG_EXT2_FS_SECURITY) += xattr_security.o
13ext2-$(CONFIG_EXT2_FS_XIP) += xip.o
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index e4279ead4a05..678f9ab08c48 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -380,10 +380,15 @@ struct ext2_inode {
380#define EXT2_MOUNT_NO_UID32 0x000200 /* Disable 32-bit UIDs */ 380#define EXT2_MOUNT_NO_UID32 0x000200 /* Disable 32-bit UIDs */
381#define EXT2_MOUNT_XATTR_USER 0x004000 /* Extended user attributes */ 381#define EXT2_MOUNT_XATTR_USER 0x004000 /* Extended user attributes */
382#define EXT2_MOUNT_POSIX_ACL 0x008000 /* POSIX Access Control Lists */ 382#define EXT2_MOUNT_POSIX_ACL 0x008000 /* POSIX Access Control Lists */
383#define EXT2_MOUNT_XIP 0x010000 /* Execute in place */ 383#define EXT2_MOUNT_XIP 0x010000 /* Obsolete, use DAX */
384#define EXT2_MOUNT_USRQUOTA 0x020000 /* user quota */ 384#define EXT2_MOUNT_USRQUOTA 0x020000 /* user quota */
385#define EXT2_MOUNT_GRPQUOTA 0x040000 /* group quota */ 385#define EXT2_MOUNT_GRPQUOTA 0x040000 /* group quota */
386#define EXT2_MOUNT_RESERVATION 0x080000 /* Preallocation */ 386#define EXT2_MOUNT_RESERVATION 0x080000 /* Preallocation */
387#ifdef CONFIG_FS_DAX
388#define EXT2_MOUNT_DAX 0x100000 /* Direct Access */
389#else
390#define EXT2_MOUNT_DAX 0
391#endif
387 392
388 393
389#define clear_opt(o, opt) o &= ~EXT2_MOUNT_##opt 394#define clear_opt(o, opt) o &= ~EXT2_MOUNT_##opt
@@ -788,11 +793,10 @@ extern int ext2_fsync(struct file *file, loff_t start, loff_t end,
788 int datasync); 793 int datasync);
789extern const struct inode_operations ext2_file_inode_operations; 794extern const struct inode_operations ext2_file_inode_operations;
790extern const struct file_operations ext2_file_operations; 795extern const struct file_operations ext2_file_operations;
791extern const struct file_operations ext2_xip_file_operations; 796extern const struct file_operations ext2_dax_file_operations;
792 797
793/* inode.c */ 798/* inode.c */
794extern const struct address_space_operations ext2_aops; 799extern const struct address_space_operations ext2_aops;
795extern const struct address_space_operations ext2_aops_xip;
796extern const struct address_space_operations ext2_nobh_aops; 800extern const struct address_space_operations ext2_nobh_aops;
797 801
798/* namei.c */ 802/* namei.c */
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 7c87b22a7228..e31701713516 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -25,6 +25,36 @@
25#include "xattr.h" 25#include "xattr.h"
26#include "acl.h" 26#include "acl.h"
27 27
28#ifdef CONFIG_FS_DAX
29static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
30{
31 return dax_fault(vma, vmf, ext2_get_block);
32}
33
34static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
35{
36 return dax_mkwrite(vma, vmf, ext2_get_block);
37}
38
39static const struct vm_operations_struct ext2_dax_vm_ops = {
40 .fault = ext2_dax_fault,
41 .page_mkwrite = ext2_dax_mkwrite,
42};
43
44static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma)
45{
46 if (!IS_DAX(file_inode(file)))
47 return generic_file_mmap(file, vma);
48
49 file_accessed(file);
50 vma->vm_ops = &ext2_dax_vm_ops;
51 vma->vm_flags |= VM_MIXEDMAP;
52 return 0;
53}
54#else
55#define ext2_file_mmap generic_file_mmap
56#endif
57
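
From user space, a DAX mapping is set up exactly like any other file mapping; the difference is that, on an ext2 filesystem mounted with the dax option, faults in the mapping go through ext2_dax_fault() straight to the device rather than through the page cache. A minimal sketch using plain POSIX calls, assuming a hypothetical path to an existing file at least one page long:

	#include <fcntl.h>
	#include <string.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int main(void)
	{
		/* Hypothetical file on an ext2 fs mounted with -o dax. */
		int fd = open("/mnt/dax/test", O_RDWR);
		if (fd < 0)
			return 1;

		char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			       MAP_SHARED, fd, 0);
		if (p == MAP_FAILED)
			return 1;

		/* On a DAX mount this store faults via ext2_dax_fault();
		 * on a normal mount it goes through the page cache. */
		strcpy(p, "hello");
		msync(p, 4096, MS_SYNC);
		munmap(p, 4096);
		close(fd);
		return 0;
	}
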
28/* 58/*
29 * Called when filp is released. This happens when all file descriptors 59 * Called when filp is released. This happens when all file descriptors
30 * for a single struct file are closed. Note that different open() calls 60 * for a single struct file are closed. Note that different open() calls
@@ -70,7 +100,7 @@ const struct file_operations ext2_file_operations = {
70#ifdef CONFIG_COMPAT 100#ifdef CONFIG_COMPAT
71 .compat_ioctl = ext2_compat_ioctl, 101 .compat_ioctl = ext2_compat_ioctl,
72#endif 102#endif
73 .mmap = generic_file_mmap, 103 .mmap = ext2_file_mmap,
74 .open = dquot_file_open, 104 .open = dquot_file_open,
75 .release = ext2_release_file, 105 .release = ext2_release_file,
76 .fsync = ext2_fsync, 106 .fsync = ext2_fsync,
@@ -78,16 +108,18 @@ const struct file_operations ext2_file_operations = {
78 .splice_write = iter_file_splice_write, 108 .splice_write = iter_file_splice_write,
79}; 109};
80 110
81#ifdef CONFIG_EXT2_FS_XIP 111#ifdef CONFIG_FS_DAX
82const struct file_operations ext2_xip_file_operations = { 112const struct file_operations ext2_dax_file_operations = {
83 .llseek = generic_file_llseek, 113 .llseek = generic_file_llseek,
84 .read = xip_file_read, 114 .read = new_sync_read,
85 .write = xip_file_write, 115 .write = new_sync_write,
116 .read_iter = generic_file_read_iter,
117 .write_iter = generic_file_write_iter,
86 .unlocked_ioctl = ext2_ioctl, 118 .unlocked_ioctl = ext2_ioctl,
87#ifdef CONFIG_COMPAT 119#ifdef CONFIG_COMPAT
88 .compat_ioctl = ext2_compat_ioctl, 120 .compat_ioctl = ext2_compat_ioctl,
89#endif 121#endif
90 .mmap = xip_file_mmap, 122 .mmap = ext2_file_mmap,
91 .open = dquot_file_open, 123 .open = dquot_file_open,
92 .release = ext2_release_file, 124 .release = ext2_release_file,
93 .fsync = ext2_fsync, 125 .fsync = ext2_fsync,
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 7d66fb0e4cca..6c14bb8322fa 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -170,7 +170,7 @@ static void ext2_preread_inode(struct inode *inode)
170 struct ext2_group_desc * gdp; 170 struct ext2_group_desc * gdp;
171 struct backing_dev_info *bdi; 171 struct backing_dev_info *bdi;
172 172
173 bdi = inode->i_mapping->backing_dev_info; 173 bdi = inode_to_bdi(inode);
174 if (bdi_read_congested(bdi)) 174 if (bdi_read_congested(bdi))
175 return; 175 return;
176 if (bdi_write_congested(bdi)) 176 if (bdi_write_congested(bdi))
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 36d35c36311d..6434bc000125 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -34,7 +34,6 @@
34#include <linux/aio.h> 34#include <linux/aio.h>
35#include "ext2.h" 35#include "ext2.h"
36#include "acl.h" 36#include "acl.h"
37#include "xip.h"
38#include "xattr.h" 37#include "xattr.h"
39 38
40static int __ext2_write_inode(struct inode *inode, int do_sync); 39static int __ext2_write_inode(struct inode *inode, int do_sync);
@@ -731,12 +730,14 @@ static int ext2_get_blocks(struct inode *inode,
731 goto cleanup; 730 goto cleanup;
732 } 731 }
733 732
734 if (ext2_use_xip(inode->i_sb)) { 733 if (IS_DAX(inode)) {
735 /* 734 /*
736 * we need to clear the block 735 * block must be initialised before we put it in the tree
736 * so that it's not found by another thread before it's
737 * initialised
737 */ 738 */
738 err = ext2_clear_xip_target (inode, 739 err = dax_clear_blocks(inode, le32_to_cpu(chain[depth-1].key),
739 le32_to_cpu(chain[depth-1].key)); 740 1 << inode->i_blkbits);
740 if (err) { 741 if (err) {
741 mutex_unlock(&ei->truncate_mutex); 742 mutex_unlock(&ei->truncate_mutex);
742 goto cleanup; 743 goto cleanup;
@@ -859,7 +860,12 @@ ext2_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
859 size_t count = iov_iter_count(iter); 860 size_t count = iov_iter_count(iter);
860 ssize_t ret; 861 ssize_t ret;
861 862
862 ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, ext2_get_block); 863 if (IS_DAX(inode))
864 ret = dax_do_io(rw, iocb, inode, iter, offset, ext2_get_block,
865 NULL, DIO_LOCKING);
866 else
867 ret = blockdev_direct_IO(rw, iocb, inode, iter, offset,
868 ext2_get_block);
863 if (ret < 0 && (rw & WRITE)) 869 if (ret < 0 && (rw & WRITE))
864 ext2_write_failed(mapping, offset + count); 870 ext2_write_failed(mapping, offset + count);
865 return ret; 871 return ret;
@@ -885,11 +891,6 @@ const struct address_space_operations ext2_aops = {
885 .error_remove_page = generic_error_remove_page, 891 .error_remove_page = generic_error_remove_page,
886}; 892};
887 893
888const struct address_space_operations ext2_aops_xip = {
889 .bmap = ext2_bmap,
890 .get_xip_mem = ext2_get_xip_mem,
891};
892
893const struct address_space_operations ext2_nobh_aops = { 894const struct address_space_operations ext2_nobh_aops = {
894 .readpage = ext2_readpage, 895 .readpage = ext2_readpage,
895 .readpages = ext2_readpages, 896 .readpages = ext2_readpages,
@@ -1201,8 +1202,8 @@ static int ext2_setsize(struct inode *inode, loff_t newsize)
1201 1202
1202 inode_dio_wait(inode); 1203 inode_dio_wait(inode);
1203 1204
1204 if (mapping_is_xip(inode->i_mapping)) 1205 if (IS_DAX(inode))
1205 error = xip_truncate_page(inode->i_mapping, newsize); 1206 error = dax_truncate_page(inode, newsize, ext2_get_block);
1206 else if (test_opt(inode->i_sb, NOBH)) 1207 else if (test_opt(inode->i_sb, NOBH))
1207 error = nobh_truncate_page(inode->i_mapping, 1208 error = nobh_truncate_page(inode->i_mapping,
1208 newsize, ext2_get_block); 1209 newsize, ext2_get_block);
@@ -1273,7 +1274,8 @@ void ext2_set_inode_flags(struct inode *inode)
1273{ 1274{
1274 unsigned int flags = EXT2_I(inode)->i_flags; 1275 unsigned int flags = EXT2_I(inode)->i_flags;
1275 1276
1276 inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); 1277 inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME |
1278 S_DIRSYNC | S_DAX);
1277 if (flags & EXT2_SYNC_FL) 1279 if (flags & EXT2_SYNC_FL)
1278 inode->i_flags |= S_SYNC; 1280 inode->i_flags |= S_SYNC;
1279 if (flags & EXT2_APPEND_FL) 1281 if (flags & EXT2_APPEND_FL)
@@ -1284,6 +1286,8 @@ void ext2_set_inode_flags(struct inode *inode)
1284 inode->i_flags |= S_NOATIME; 1286 inode->i_flags |= S_NOATIME;
1285 if (flags & EXT2_DIRSYNC_FL) 1287 if (flags & EXT2_DIRSYNC_FL)
1286 inode->i_flags |= S_DIRSYNC; 1288 inode->i_flags |= S_DIRSYNC;
1289 if (test_opt(inode->i_sb, DAX))
1290 inode->i_flags |= S_DAX;
1287} 1291}
1288 1292
1289/* Propagate flags from i_flags to EXT2_I(inode)->i_flags */ 1293/* Propagate flags from i_flags to EXT2_I(inode)->i_flags */
@@ -1384,9 +1388,9 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
1384 1388
1385 if (S_ISREG(inode->i_mode)) { 1389 if (S_ISREG(inode->i_mode)) {
1386 inode->i_op = &ext2_file_inode_operations; 1390 inode->i_op = &ext2_file_inode_operations;
1387 if (ext2_use_xip(inode->i_sb)) { 1391 if (test_opt(inode->i_sb, DAX)) {
1388 inode->i_mapping->a_ops = &ext2_aops_xip; 1392 inode->i_mapping->a_ops = &ext2_aops;
1389 inode->i_fop = &ext2_xip_file_operations; 1393 inode->i_fop = &ext2_dax_file_operations;
1390 } else if (test_opt(inode->i_sb, NOBH)) { 1394 } else if (test_opt(inode->i_sb, NOBH)) {
1391 inode->i_mapping->a_ops = &ext2_nobh_aops; 1395 inode->i_mapping->a_ops = &ext2_nobh_aops;
1392 inode->i_fop = &ext2_file_operations; 1396 inode->i_fop = &ext2_file_operations;
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index c268d0af1db9..148f6e3789ea 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -35,7 +35,6 @@
35#include "ext2.h" 35#include "ext2.h"
36#include "xattr.h" 36#include "xattr.h"
37#include "acl.h" 37#include "acl.h"
38#include "xip.h"
39 38
40static inline int ext2_add_nondir(struct dentry *dentry, struct inode *inode) 39static inline int ext2_add_nondir(struct dentry *dentry, struct inode *inode)
41{ 40{
@@ -105,9 +104,9 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, umode_t mode
105 return PTR_ERR(inode); 104 return PTR_ERR(inode);
106 105
107 inode->i_op = &ext2_file_inode_operations; 106 inode->i_op = &ext2_file_inode_operations;
108 if (ext2_use_xip(inode->i_sb)) { 107 if (test_opt(inode->i_sb, DAX)) {
109 inode->i_mapping->a_ops = &ext2_aops_xip; 108 inode->i_mapping->a_ops = &ext2_aops;
110 inode->i_fop = &ext2_xip_file_operations; 109 inode->i_fop = &ext2_dax_file_operations;
111 } else if (test_opt(inode->i_sb, NOBH)) { 110 } else if (test_opt(inode->i_sb, NOBH)) {
112 inode->i_mapping->a_ops = &ext2_nobh_aops; 111 inode->i_mapping->a_ops = &ext2_nobh_aops;
113 inode->i_fop = &ext2_file_operations; 112 inode->i_fop = &ext2_file_operations;
@@ -126,9 +125,9 @@ static int ext2_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
126 return PTR_ERR(inode); 125 return PTR_ERR(inode);
127 126
128 inode->i_op = &ext2_file_inode_operations; 127 inode->i_op = &ext2_file_inode_operations;
129 if (ext2_use_xip(inode->i_sb)) { 128 if (test_opt(inode->i_sb, DAX)) {
130 inode->i_mapping->a_ops = &ext2_aops_xip; 129 inode->i_mapping->a_ops = &ext2_aops;
131 inode->i_fop = &ext2_xip_file_operations; 130 inode->i_fop = &ext2_dax_file_operations;
132 } else if (test_opt(inode->i_sb, NOBH)) { 131 } else if (test_opt(inode->i_sb, NOBH)) {
133 inode->i_mapping->a_ops = &ext2_nobh_aops; 132 inode->i_mapping->a_ops = &ext2_nobh_aops;
134 inode->i_fop = &ext2_file_operations; 133 inode->i_fop = &ext2_file_operations;
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index ae55fddc26a9..d0e746e96511 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -35,7 +35,6 @@
35#include "ext2.h" 35#include "ext2.h"
36#include "xattr.h" 36#include "xattr.h"
37#include "acl.h" 37#include "acl.h"
38#include "xip.h"
39 38
40static void ext2_sync_super(struct super_block *sb, 39static void ext2_sync_super(struct super_block *sb,
41 struct ext2_super_block *es, int wait); 40 struct ext2_super_block *es, int wait);
@@ -292,9 +291,11 @@ static int ext2_show_options(struct seq_file *seq, struct dentry *root)
292 seq_puts(seq, ",grpquota"); 291 seq_puts(seq, ",grpquota");
293#endif 292#endif
294 293
295#if defined(CONFIG_EXT2_FS_XIP) 294#ifdef CONFIG_FS_DAX
296 if (sbi->s_mount_opt & EXT2_MOUNT_XIP) 295 if (sbi->s_mount_opt & EXT2_MOUNT_XIP)
297 seq_puts(seq, ",xip"); 296 seq_puts(seq, ",xip");
297 if (sbi->s_mount_opt & EXT2_MOUNT_DAX)
298 seq_puts(seq, ",dax");
298#endif 299#endif
299 300
300 if (!test_opt(sb, RESERVATION)) 301 if (!test_opt(sb, RESERVATION))
@@ -403,7 +404,7 @@ enum {
403 Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, 404 Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic,
404 Opt_err_ro, Opt_nouid32, Opt_nocheck, Opt_debug, 405 Opt_err_ro, Opt_nouid32, Opt_nocheck, Opt_debug,
405 Opt_oldalloc, Opt_orlov, Opt_nobh, Opt_user_xattr, Opt_nouser_xattr, 406 Opt_oldalloc, Opt_orlov, Opt_nobh, Opt_user_xattr, Opt_nouser_xattr,
406 Opt_acl, Opt_noacl, Opt_xip, Opt_ignore, Opt_err, Opt_quota, 407 Opt_acl, Opt_noacl, Opt_xip, Opt_dax, Opt_ignore, Opt_err, Opt_quota,
407 Opt_usrquota, Opt_grpquota, Opt_reservation, Opt_noreservation 408 Opt_usrquota, Opt_grpquota, Opt_reservation, Opt_noreservation
408}; 409};
409 410
@@ -432,6 +433,7 @@ static const match_table_t tokens = {
432 {Opt_acl, "acl"}, 433 {Opt_acl, "acl"},
433 {Opt_noacl, "noacl"}, 434 {Opt_noacl, "noacl"},
434 {Opt_xip, "xip"}, 435 {Opt_xip, "xip"},
436 {Opt_dax, "dax"},
435 {Opt_grpquota, "grpquota"}, 437 {Opt_grpquota, "grpquota"},
436 {Opt_ignore, "noquota"}, 438 {Opt_ignore, "noquota"},
437 {Opt_quota, "quota"}, 439 {Opt_quota, "quota"},
@@ -559,10 +561,14 @@ static int parse_options(char *options, struct super_block *sb)
559 break; 561 break;
560#endif 562#endif
561 case Opt_xip: 563 case Opt_xip:
562#ifdef CONFIG_EXT2_FS_XIP 564 ext2_msg(sb, KERN_INFO, "use dax instead of xip");
563 set_opt (sbi->s_mount_opt, XIP); 565 set_opt(sbi->s_mount_opt, XIP);
566 /* Fall through */
567 case Opt_dax:
568#ifdef CONFIG_FS_DAX
569 set_opt(sbi->s_mount_opt, DAX);
564#else 570#else
565 ext2_msg(sb, KERN_INFO, "xip option not supported"); 571 ext2_msg(sb, KERN_INFO, "dax option not supported");
566#endif 572#endif
567 break; 573 break;
568 574
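
The xip handling above is the standard deprecated-alias pattern: the old token logs a notice and deliberately falls through to the new one, so "-o xip" keeps working while steering users toward "-o dax". A toy user-space miniature of the same alias-and-fall-through idea (nothing here is kernel code):

	#include <stdio.h>
	#include <string.h>

	#define OPT_DAX 0x1u

	static int parse_opt(const char *tok, unsigned int *flags)
	{
		if (!strcmp(tok, "xip")) {
			fprintf(stderr, "note: use dax instead of xip\n");
			tok = "dax";	/* fall through to the new name */
		}
		if (!strcmp(tok, "dax")) {
			*flags |= OPT_DAX;
			return 0;
		}
		return -1;
	}

	int main(void)
	{
		unsigned int flags = 0;

		parse_opt("xip", &flags);	/* old spelling still works */
		printf("flags=%#x\n", flags);
		return 0;
	}
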
@@ -877,9 +883,6 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
877 ((EXT2_SB(sb)->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? 883 ((EXT2_SB(sb)->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ?
878 MS_POSIXACL : 0); 884 MS_POSIXACL : 0);
879 885
880 ext2_xip_verify_sb(sb); /* see if bdev supports xip, unset
881 EXT2_MOUNT_XIP if not */
882
883 if (le32_to_cpu(es->s_rev_level) == EXT2_GOOD_OLD_REV && 886 if (le32_to_cpu(es->s_rev_level) == EXT2_GOOD_OLD_REV &&
884 (EXT2_HAS_COMPAT_FEATURE(sb, ~0U) || 887 (EXT2_HAS_COMPAT_FEATURE(sb, ~0U) ||
885 EXT2_HAS_RO_COMPAT_FEATURE(sb, ~0U) || 888 EXT2_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
@@ -909,11 +912,17 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
909 912
910 blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); 913 blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
911 914
912 if (ext2_use_xip(sb) && blocksize != PAGE_SIZE) { 915 if (sbi->s_mount_opt & EXT2_MOUNT_DAX) {
913 if (!silent) 916 if (blocksize != PAGE_SIZE) {
914 ext2_msg(sb, KERN_ERR, 917 ext2_msg(sb, KERN_ERR,
915 "error: unsupported blocksize for xip"); 918 "error: unsupported blocksize for dax");
916 goto failed_mount; 919 goto failed_mount;
920 }
921 if (!sb->s_bdev->bd_disk->fops->direct_access) {
922 ext2_msg(sb, KERN_ERR,
923 "error: device does not support dax");
924 goto failed_mount;
925 }
917 } 926 }
918 927
919 /* If the blocksize doesn't match, re-read the thing.. */ 928 /* If the blocksize doesn't match, re-read the thing.. */
@@ -1259,7 +1268,6 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1259{ 1268{
1260 struct ext2_sb_info * sbi = EXT2_SB(sb); 1269 struct ext2_sb_info * sbi = EXT2_SB(sb);
1261 struct ext2_super_block * es; 1270 struct ext2_super_block * es;
1262 unsigned long old_mount_opt = sbi->s_mount_opt;
1263 struct ext2_mount_options old_opts; 1271 struct ext2_mount_options old_opts;
1264 unsigned long old_sb_flags; 1272 unsigned long old_sb_flags;
1265 int err; 1273 int err;
@@ -1284,22 +1292,11 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1284 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | 1292 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
1285 ((sbi->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); 1293 ((sbi->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
1286 1294
1287 ext2_xip_verify_sb(sb); /* see if bdev supports xip, unset
1288 EXT2_MOUNT_XIP if not */
1289
1290 if ((ext2_use_xip(sb)) && (sb->s_blocksize != PAGE_SIZE)) {
1291 ext2_msg(sb, KERN_WARNING,
1292 "warning: unsupported blocksize for xip");
1293 err = -EINVAL;
1294 goto restore_opts;
1295 }
1296
1297 es = sbi->s_es; 1295 es = sbi->s_es;
1298 if ((sbi->s_mount_opt ^ old_mount_opt) & EXT2_MOUNT_XIP) { 1296 if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT2_MOUNT_DAX) {
1299 ext2_msg(sb, KERN_WARNING, "warning: refusing change of " 1297 ext2_msg(sb, KERN_WARNING, "warning: refusing change of "
1300 "xip flag with busy inodes while remounting"); 1298 "dax flag with busy inodes while remounting");
1301 sbi->s_mount_opt &= ~EXT2_MOUNT_XIP; 1299 sbi->s_mount_opt ^= EXT2_MOUNT_DAX;
1302 sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP;
1303 } 1300 }
1304 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { 1301 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
1305 spin_unlock(&sbi->s_lock); 1302 spin_unlock(&sbi->s_lock);
diff --git a/fs/ext2/xip.c b/fs/ext2/xip.c
deleted file mode 100644
index e98171a11cfe..000000000000
--- a/fs/ext2/xip.c
+++ /dev/null
@@ -1,91 +0,0 @@
1/*
2 * linux/fs/ext2/xip.c
3 *
4 * Copyright (C) 2005 IBM Corporation
5 * Author: Carsten Otte (cotte@de.ibm.com)
6 */
7
8#include <linux/mm.h>
9#include <linux/fs.h>
10#include <linux/genhd.h>
11#include <linux/buffer_head.h>
12#include <linux/blkdev.h>
13#include "ext2.h"
14#include "xip.h"
15
16static inline int
17__inode_direct_access(struct inode *inode, sector_t block,
18 void **kaddr, unsigned long *pfn)
19{
20 struct block_device *bdev = inode->i_sb->s_bdev;
21 const struct block_device_operations *ops = bdev->bd_disk->fops;
22 sector_t sector;
23
24 sector = block * (PAGE_SIZE / 512); /* ext2 block to bdev sector */
25
26 BUG_ON(!ops->direct_access);
27 return ops->direct_access(bdev, sector, kaddr, pfn);
28}
29
30static inline int
31__ext2_get_block(struct inode *inode, pgoff_t pgoff, int create,
32 sector_t *result)
33{
34 struct buffer_head tmp;
35 int rc;
36
37 memset(&tmp, 0, sizeof(struct buffer_head));
38 tmp.b_size = 1 << inode->i_blkbits;
39 rc = ext2_get_block(inode, pgoff, &tmp, create);
40 *result = tmp.b_blocknr;
41
42 /* did we get a sparse block (hole in the file)? */
43 if (!tmp.b_blocknr && !rc) {
44 BUG_ON(create);
45 rc = -ENODATA;
46 }
47
48 return rc;
49}
50
51int
52ext2_clear_xip_target(struct inode *inode, sector_t block)
53{
54 void *kaddr;
55 unsigned long pfn;
56 int rc;
57
58 rc = __inode_direct_access(inode, block, &kaddr, &pfn);
59 if (!rc)
60 clear_page(kaddr);
61 return rc;
62}
63
64void ext2_xip_verify_sb(struct super_block *sb)
65{
66 struct ext2_sb_info *sbi = EXT2_SB(sb);
67
68 if ((sbi->s_mount_opt & EXT2_MOUNT_XIP) &&
69 !sb->s_bdev->bd_disk->fops->direct_access) {
70 sbi->s_mount_opt &= (~EXT2_MOUNT_XIP);
71 ext2_msg(sb, KERN_WARNING,
72 "warning: ignoring xip option - "
73 "not supported by bdev");
74 }
75}
76
77int ext2_get_xip_mem(struct address_space *mapping, pgoff_t pgoff, int create,
78 void **kmem, unsigned long *pfn)
79{
80 int rc;
81 sector_t block;
82
83 /* first, retrieve the sector number */
84 rc = __ext2_get_block(mapping->host, pgoff, create, &block);
85 if (rc)
86 return rc;
87
88 /* retrieve address of the target data */
89 rc = __inode_direct_access(mapping->host, block, kmem, pfn);
90 return rc;
91}
diff --git a/fs/ext2/xip.h b/fs/ext2/xip.h
deleted file mode 100644
index 18b34d2f31b3..000000000000
--- a/fs/ext2/xip.h
+++ /dev/null
@@ -1,26 +0,0 @@
1/*
2 * linux/fs/ext2/xip.h
3 *
4 * Copyright (C) 2005 IBM Corporation
5 * Author: Carsten Otte (cotte@de.ibm.com)
6 */
7
8#ifdef CONFIG_EXT2_FS_XIP
9extern void ext2_xip_verify_sb (struct super_block *);
10extern int ext2_clear_xip_target (struct inode *, sector_t);
11
12static inline int ext2_use_xip (struct super_block *sb)
13{
14 struct ext2_sb_info *sbi = EXT2_SB(sb);
15 return (sbi->s_mount_opt & EXT2_MOUNT_XIP);
16}
17int ext2_get_xip_mem(struct address_space *, pgoff_t, int,
18 void **, unsigned long *);
19#define mapping_is_xip(map) unlikely(map->a_ops->get_xip_mem)
20#else
21#define mapping_is_xip(map) 0
22#define ext2_xip_verify_sb(sb) do { } while (0)
23#define ext2_use_xip(sb) 0
24#define ext2_clear_xip_target(inode, chain) 0
25#define ext2_get_xip_mem NULL
26#endif
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 9b4e7d750d4f..d4dbf3c259b3 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -466,6 +466,8 @@ static void ext3_put_super (struct super_block * sb)
466 } 466 }
467 sb->s_fs_info = NULL; 467 sb->s_fs_info = NULL;
468 kfree(sbi->s_blockgroup_lock); 468 kfree(sbi->s_blockgroup_lock);
469 mutex_destroy(&sbi->s_orphan_lock);
470 mutex_destroy(&sbi->s_resize_lock);
469 kfree(sbi); 471 kfree(sbi);
470} 472}
471 473
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index a75fba67bb1f..982d934fd9ac 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -965,6 +965,11 @@ struct ext4_inode_info {
965#define EXT4_MOUNT_ERRORS_MASK 0x00070 965#define EXT4_MOUNT_ERRORS_MASK 0x00070
966#define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ 966#define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */
967#define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ 967#define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/
968#ifdef CONFIG_FS_DAX
969#define EXT4_MOUNT_DAX 0x00200 /* Direct Access */
970#else
971#define EXT4_MOUNT_DAX 0
972#endif
968#define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ 973#define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */
969#define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */ 974#define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */
970#define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */ 975#define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */
@@ -2578,6 +2583,7 @@ extern const struct file_operations ext4_dir_operations;
2578/* file.c */ 2583/* file.c */
2579extern const struct inode_operations ext4_file_inode_operations; 2584extern const struct inode_operations ext4_file_inode_operations;
2580extern const struct file_operations ext4_file_operations; 2585extern const struct file_operations ext4_file_operations;
2586extern const struct file_operations ext4_dax_file_operations;
2581extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); 2587extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);
2582 2588
2583/* inline.c */ 2589/* inline.c */
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 8131be8c0af3..33a09da16c9c 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -95,7 +95,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
95 struct inode *inode = file_inode(iocb->ki_filp); 95 struct inode *inode = file_inode(iocb->ki_filp);
96 struct mutex *aio_mutex = NULL; 96 struct mutex *aio_mutex = NULL;
97 struct blk_plug plug; 97 struct blk_plug plug;
98 int o_direct = file->f_flags & O_DIRECT; 98 int o_direct = io_is_direct(file);
99 int overwrite = 0; 99 int overwrite = 0;
100 size_t length = iov_iter_count(from); 100 size_t length = iov_iter_count(from);
101 ssize_t ret; 101 ssize_t ret;
@@ -191,17 +191,41 @@ errout:
191 return ret; 191 return ret;
192} 192}
193 193
194#ifdef CONFIG_FS_DAX
195static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
196{
197 return dax_fault(vma, vmf, ext4_get_block);
198 /* Is this the right get_block? */
199}
200
201static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
202{
203 return dax_mkwrite(vma, vmf, ext4_get_block);
204}
205
206static const struct vm_operations_struct ext4_dax_vm_ops = {
207 .fault = ext4_dax_fault,
208 .page_mkwrite = ext4_dax_mkwrite,
209};
210#else
211#define ext4_dax_vm_ops ext4_file_vm_ops
212#endif
213
194static const struct vm_operations_struct ext4_file_vm_ops = { 214static const struct vm_operations_struct ext4_file_vm_ops = {
195 .fault = filemap_fault, 215 .fault = filemap_fault,
196 .map_pages = filemap_map_pages, 216 .map_pages = filemap_map_pages,
197 .page_mkwrite = ext4_page_mkwrite, 217 .page_mkwrite = ext4_page_mkwrite,
198 .remap_pages = generic_file_remap_pages,
199}; 218};
200 219
201static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) 220static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
202{ 221{
203 file_accessed(file); 222 file_accessed(file);
204 vma->vm_ops = &ext4_file_vm_ops; 223 if (IS_DAX(file_inode(file))) {
224 vma->vm_ops = &ext4_dax_vm_ops;
225 vma->vm_flags |= VM_MIXEDMAP;
226 } else {
227 vma->vm_ops = &ext4_file_vm_ops;
228 }
205 return 0; 229 return 0;
206} 230}
207 231
@@ -600,6 +624,26 @@ const struct file_operations ext4_file_operations = {
600 .fallocate = ext4_fallocate, 624 .fallocate = ext4_fallocate,
601}; 625};
602 626
627#ifdef CONFIG_FS_DAX
628const struct file_operations ext4_dax_file_operations = {
629 .llseek = ext4_llseek,
630 .read = new_sync_read,
631 .write = new_sync_write,
632 .read_iter = generic_file_read_iter,
633 .write_iter = ext4_file_write_iter,
634 .unlocked_ioctl = ext4_ioctl,
635#ifdef CONFIG_COMPAT
636 .compat_ioctl = ext4_compat_ioctl,
637#endif
638 .mmap = ext4_file_mmap,
639 .open = ext4_file_open,
640 .release = ext4_release_file,
641 .fsync = ext4_sync_file,
642 /* Splice not yet supported with DAX */
643 .fallocate = ext4_fallocate,
644};
645#endif
646
603const struct inode_operations ext4_file_inode_operations = { 647const struct inode_operations ext4_file_inode_operations = {
604 .setattr = ext4_setattr, 648 .setattr = ext4_setattr,
605 .getattr = ext4_getattr, 649 .getattr = ext4_getattr,
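The file.c hunks above split mmap behavior per inode: DAX inodes get fault handlers that go through dax_fault()/dax_mkwrite() instead of the page cache, and the VMA is flagged VM_MIXEDMAP because DAX installs raw pfn mappings. A minimal userspace sketch of what this enables (the mount point and file name are hypothetical, and a DAX-capable device is assumed):

	/* Hypothetical smoke test: stores through an mmap of a file on an
	 * ext4 filesystem mounted with -o dax reach the device directly,
	 * with no page-cache copy in between. */
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/mnt/pmem/testfile", O_RDWR);
		if (fd < 0) { perror("open"); return 1; }

		/* the write fault below is served by ext4_dax_fault()/
		 * ext4_dax_mkwrite() rather than ext4_page_mkwrite() */
		char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			       MAP_SHARED, fd, 0);
		if (p == MAP_FAILED) { perror("mmap"); return 1; }

		memcpy(p, "hello", 5);
		munmap(p, 4096);
		close(fd);
		return 0;
	}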
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 36b369697a13..6b9878a24182 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -689,14 +689,22 @@ retry:
689 inode_dio_done(inode); 689 inode_dio_done(inode);
690 goto locked; 690 goto locked;
691 } 691 }
692 ret = __blockdev_direct_IO(rw, iocb, inode, 692 if (IS_DAX(inode))
693 inode->i_sb->s_bdev, iter, offset, 693 ret = dax_do_io(rw, iocb, inode, iter, offset,
694 ext4_get_block, NULL, NULL, 0); 694 ext4_get_block, NULL, 0);
695 else
696 ret = __blockdev_direct_IO(rw, iocb, inode,
697 inode->i_sb->s_bdev, iter, offset,
698 ext4_get_block, NULL, NULL, 0);
695 inode_dio_done(inode); 699 inode_dio_done(inode);
696 } else { 700 } else {
697locked: 701locked:
698 ret = blockdev_direct_IO(rw, iocb, inode, iter, 702 if (IS_DAX(inode))
699 offset, ext4_get_block); 703 ret = dax_do_io(rw, iocb, inode, iter, offset,
704 ext4_get_block, NULL, DIO_LOCKING);
705 else
706 ret = blockdev_direct_IO(rw, iocb, inode, iter,
707 offset, ext4_get_block);
700 708
701 if (unlikely((rw & WRITE) && ret < 0)) { 709 if (unlikely((rw & WRITE) && ret < 0)) {
702 loff_t isize = i_size_read(inode); 710 loff_t isize = i_size_read(inode);
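Both direct-I/O paths in indirect.c now fork on IS_DAX(): for DAX inodes, dax_do_io() copies straight to or from the device's memory, while the classic path still builds bios via __blockdev_direct_IO(). The repeated dispatch is shaped roughly like the following helper (a sketch, not part of the patch; note the locked path in the patch actually goes through the blockdev_direct_IO() wrapper, which supplies its own flags):

	static ssize_t ext4_ind_dio(int rw, struct kiocb *iocb,
				    struct inode *inode, struct iov_iter *iter,
				    loff_t offset, int flags)
	{
		if (IS_DAX(inode))
			return dax_do_io(rw, iocb, inode, iter, offset,
					 ext4_get_block, NULL, flags);
		return __blockdev_direct_IO(rw, iocb, inode,
					    inode->i_sb->s_bdev, iter, offset,
					    ext4_get_block, NULL, NULL, flags);
	}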
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5653fa42930b..85404f15e53a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -657,6 +657,18 @@ has_zeroout:
657 return retval; 657 return retval;
658} 658}
659 659
660static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
661{
662 struct inode *inode = bh->b_assoc_map->host;
663 /* XXX: breaks on 32-bit > 16GB. Is that even supported? */
664 loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
665 int err;
666 if (!uptodate)
667 return;
668 WARN_ON(!buffer_unwritten(bh));
669 err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
670}
671
660/* Maximum number of blocks we map for direct IO at once. */ 672/* Maximum number of blocks we map for direct IO at once. */
661#define DIO_MAX_BLOCKS 4096 673#define DIO_MAX_BLOCKS 4096
662 674
@@ -694,6 +706,11 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
694 706
695 map_bh(bh, inode->i_sb, map.m_pblk); 707 map_bh(bh, inode->i_sb, map.m_pblk);
696 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; 708 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
709 if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) {
710 bh->b_assoc_map = inode->i_mapping;
711 bh->b_private = (void *)(unsigned long)iblock;
712 bh->b_end_io = ext4_end_io_unwritten;
713 }
697 if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN) 714 if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
698 set_buffer_defer_completion(bh); 715 set_buffer_defer_completion(bh);
699 bh->b_size = inode->i_sb->s_blocksize * map.m_len; 716 bh->b_size = inode->i_sb->s_blocksize * map.m_len;
@@ -3010,13 +3027,14 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3010 get_block_func = ext4_get_block_write; 3027 get_block_func = ext4_get_block_write;
3011 dio_flags = DIO_LOCKING; 3028 dio_flags = DIO_LOCKING;
3012 } 3029 }
3013 ret = __blockdev_direct_IO(rw, iocb, inode, 3030 if (IS_DAX(inode))
3014 inode->i_sb->s_bdev, iter, 3031 ret = dax_do_io(rw, iocb, inode, iter, offset, get_block_func,
3015 offset, 3032 ext4_end_io_dio, dio_flags);
3016 get_block_func, 3033 else
3017 ext4_end_io_dio, 3034 ret = __blockdev_direct_IO(rw, iocb, inode,
3018 NULL, 3035 inode->i_sb->s_bdev, iter, offset,
3019 dio_flags); 3036 get_block_func,
3037 ext4_end_io_dio, NULL, dio_flags);
3020 3038
3021 /* 3039 /*
3022 * Put our reference to io_end. This can free the io_end structure e.g. 3040 * Put our reference to io_end. This can free the io_end structure e.g.
@@ -3180,19 +3198,12 @@ void ext4_set_aops(struct inode *inode)
3180 inode->i_mapping->a_ops = &ext4_aops; 3198 inode->i_mapping->a_ops = &ext4_aops;
3181} 3199}
3182 3200
3183/* 3201static int __ext4_block_zero_page_range(handle_t *handle,
3184 * ext4_block_zero_page_range() zeros out a mapping of length 'length'
3185 * starting from file offset 'from'. The range to be zero'd must
3186 * be contained within one block. If the specified range exceeds
3187 * the end of the block it will be shortened to the end of the block
3188 * that corresponds to 'from'
3189 */
3190static int ext4_block_zero_page_range(handle_t *handle,
3191 struct address_space *mapping, loff_t from, loff_t length) 3202 struct address_space *mapping, loff_t from, loff_t length)
3192{ 3203{
3193 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; 3204 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
3194 unsigned offset = from & (PAGE_CACHE_SIZE-1); 3205 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3195 unsigned blocksize, max, pos; 3206 unsigned blocksize, pos;
3196 ext4_lblk_t iblock; 3207 ext4_lblk_t iblock;
3197 struct inode *inode = mapping->host; 3208 struct inode *inode = mapping->host;
3198 struct buffer_head *bh; 3209 struct buffer_head *bh;
@@ -3205,14 +3216,6 @@ static int ext4_block_zero_page_range(handle_t *handle,
3205 return -ENOMEM; 3216 return -ENOMEM;
3206 3217
3207 blocksize = inode->i_sb->s_blocksize; 3218 blocksize = inode->i_sb->s_blocksize;
3208 max = blocksize - (offset & (blocksize - 1));
3209
3210 /*
3211 * correct length if it does not fall between
3212 * 'from' and the end of the block
3213 */
3214 if (length > max || length < 0)
3215 length = max;
3216 3219
3217 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); 3220 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
3218 3221
@@ -3278,6 +3281,33 @@ unlock:
3278} 3281}
3279 3282
3280/* 3283/*
3284 * ext4_block_zero_page_range() zeros out a mapping of length 'length'
3285 * starting from file offset 'from'. The range to be zero'd must
3286 * be contained within one block. If the specified range exceeds
3287 * the end of the block it will be shortened to the end of the block
3288 * that corresponds to 'from'
3289 */
3290static int ext4_block_zero_page_range(handle_t *handle,
3291 struct address_space *mapping, loff_t from, loff_t length)
3292{
3293 struct inode *inode = mapping->host;
3294 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3295 unsigned blocksize = inode->i_sb->s_blocksize;
3296 unsigned max = blocksize - (offset & (blocksize - 1));
3297
3298 /*
3299 * correct length if it does not fall between
3300 * 'from' and the end of the block
3301 */
3302 if (length > max || length < 0)
3303 length = max;
3304
3305 if (IS_DAX(inode))
3306 return dax_zero_page_range(inode, from, length, ext4_get_block);
3307 return __ext4_block_zero_page_range(handle, mapping, from, length);
3308}
3309
3310/*
3281 * ext4_block_truncate_page() zeroes out a mapping from file offset `from' 3311 * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
3282 * up to the end of the block which corresponds to `from'. 3312 * up to the end of the block which corresponds to `from'.
3283 * This is required during truncate. We need to physically zero the tail end 3313 * This is required during truncate. We need to physically zero the tail end
@@ -3798,8 +3828,10 @@ void ext4_set_inode_flags(struct inode *inode)
3798 new_fl |= S_NOATIME; 3828 new_fl |= S_NOATIME;
3799 if (flags & EXT4_DIRSYNC_FL) 3829 if (flags & EXT4_DIRSYNC_FL)
3800 new_fl |= S_DIRSYNC; 3830 new_fl |= S_DIRSYNC;
3831 if (test_opt(inode->i_sb, DAX))
3832 new_fl |= S_DAX;
3801 inode_set_flags(inode, new_fl, 3833 inode_set_flags(inode, new_fl,
3802 S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); 3834 S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX);
3803} 3835}
3804 3836
3805/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ 3837/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
@@ -4052,7 +4084,10 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4052 4084
4053 if (S_ISREG(inode->i_mode)) { 4085 if (S_ISREG(inode->i_mode)) {
4054 inode->i_op = &ext4_file_inode_operations; 4086 inode->i_op = &ext4_file_inode_operations;
4055 inode->i_fop = &ext4_file_operations; 4087 if (test_opt(inode->i_sb, DAX))
4088 inode->i_fop = &ext4_dax_file_operations;
4089 else
4090 inode->i_fop = &ext4_file_operations;
4056 ext4_set_aops(inode); 4091 ext4_set_aops(inode);
4057 } else if (S_ISDIR(inode->i_mode)) { 4092 } else if (S_ISDIR(inode->i_mode)) {
4058 inode->i_op = &ext4_dir_inode_operations; 4093 inode->i_op = &ext4_dir_inode_operations;
@@ -4139,6 +4174,65 @@ static int ext4_inode_blocks_set(handle_t *handle,
4139 return 0; 4174 return 0;
4140} 4175}
4141 4176
4177struct other_inode {
4178 unsigned long orig_ino;
4179 struct ext4_inode *raw_inode;
4180};
4181
4182static int other_inode_match(struct inode * inode, unsigned long ino,
4183 void *data)
4184{
4185 struct other_inode *oi = (struct other_inode *) data;
4186
4187 if ((inode->i_ino != ino) ||
4188 (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW |
4189 I_DIRTY_SYNC | I_DIRTY_DATASYNC)) ||
4190 ((inode->i_state & I_DIRTY_TIME) == 0))
4191 return 0;
4192 spin_lock(&inode->i_lock);
4193 if (((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW |
4194 I_DIRTY_SYNC | I_DIRTY_DATASYNC)) == 0) &&
4195 (inode->i_state & I_DIRTY_TIME)) {
4196 struct ext4_inode_info *ei = EXT4_I(inode);
4197
4198 inode->i_state &= ~(I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED);
4199 spin_unlock(&inode->i_lock);
4200
4201 spin_lock(&ei->i_raw_lock);
4202 EXT4_INODE_SET_XTIME(i_ctime, inode, oi->raw_inode);
4203 EXT4_INODE_SET_XTIME(i_mtime, inode, oi->raw_inode);
4204 EXT4_INODE_SET_XTIME(i_atime, inode, oi->raw_inode);
4205 ext4_inode_csum_set(inode, oi->raw_inode, ei);
4206 spin_unlock(&ei->i_raw_lock);
4207 trace_ext4_other_inode_update_time(inode, oi->orig_ino);
4208 return -1;
4209 }
4210 spin_unlock(&inode->i_lock);
4211 return -1;
4212}
4213
4214/*
4215 * Opportunistically update the timestamp fields of other inodes in
4216 * the same inode table block.
4217 */
4218static void ext4_update_other_inodes_time(struct super_block *sb,
4219 unsigned long orig_ino, char *buf)
4220{
4221 struct other_inode oi;
4222 unsigned long ino;
4223 int i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
4224 int inode_size = EXT4_INODE_SIZE(sb);
4225
4226 oi.orig_ino = orig_ino;
4227 ino = orig_ino & ~(inodes_per_block - 1);
4228 for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) {
4229 if (ino == orig_ino)
4230 continue;
4231 oi.raw_inode = (struct ext4_inode *) buf;
4232 (void) find_inode_nowait(sb, ino, other_inode_match, &oi);
4233 }
4234}
4235
4142/* 4236/*
4143 * Post the struct inode info into an on-disk inode location in the 4237 * Post the struct inode info into an on-disk inode location in the
4144 * buffer-cache. This gobbles the caller's reference to the 4238 * buffer-cache. This gobbles the caller's reference to the
@@ -4248,10 +4342,11 @@ static int ext4_do_update_inode(handle_t *handle,
4248 cpu_to_le16(ei->i_extra_isize); 4342 cpu_to_le16(ei->i_extra_isize);
4249 } 4343 }
4250 } 4344 }
4251
4252 ext4_inode_csum_set(inode, raw_inode, ei); 4345 ext4_inode_csum_set(inode, raw_inode, ei);
4253
4254 spin_unlock(&ei->i_raw_lock); 4346 spin_unlock(&ei->i_raw_lock);
4347 if (inode->i_sb->s_flags & MS_LAZYTIME)
4348 ext4_update_other_inodes_time(inode->i_sb, inode->i_ino,
4349 bh->b_data);
4255 4350
4256 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 4351 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
4257 rc = ext4_handle_dirty_metadata(handle, NULL, bh); 4352 rc = ext4_handle_dirty_metadata(handle, NULL, bh);
@@ -4534,7 +4629,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
4534 * Truncate pagecache after we've waited for commit 4629 * Truncate pagecache after we've waited for commit
4535 * in data=journal mode to make pages freeable. 4630 * in data=journal mode to make pages freeable.
4536 */ 4631 */
4537 truncate_pagecache(inode, inode->i_size); 4632 truncate_pagecache(inode, inode->i_size);
4538 } 4633 }
4539 /* 4634 /*
4540 * We want to call ext4_truncate() even if attr->ia_size == 4635 * We want to call ext4_truncate() even if attr->ia_size ==
@@ -4840,11 +4935,17 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
4840 * If the inode is marked synchronous, we don't honour that here - doing 4935 * If the inode is marked synchronous, we don't honour that here - doing
4841 * so would cause a commit on atime updates, which we don't bother doing. 4936 * so would cause a commit on atime updates, which we don't bother doing.
4842 * We handle synchronous inodes at the highest possible level. 4937 * We handle synchronous inodes at the highest possible level.
4938 *
4939 * If only the I_DIRTY_TIME flag is set, we can skip everything. If
4940 * I_DIRTY_TIME and I_DIRTY_SYNC are set, the only inode fields we need
4941 * to copy into the on-disk inode structure are the timestamp fields.
4843 */ 4942 */
4844void ext4_dirty_inode(struct inode *inode, int flags) 4943void ext4_dirty_inode(struct inode *inode, int flags)
4845{ 4944{
4846 handle_t *handle; 4945 handle_t *handle;
4847 4946
4947 if (flags == I_DIRTY_TIME)
4948 return;
4848 handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); 4949 handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
4849 if (IS_ERR(handle)) 4950 if (IS_ERR(handle))
4850 goto out; 4951 goto out;
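Two lazytime pieces land in inode.c: ext4_dirty_inode() returns early when only I_DIRTY_TIME is set, so a pure timestamp update starts no journal handle, and ext4_update_other_inodes_time() piggybacks the expired timestamps of neighboring inodes onto an inode-table block that is being written anyway. The rounding it uses to find the first inode of that block, worked with illustrative numbers (inodes_per_block is a power of two on ext4):

	/* Illustration of: ino = orig_ino & ~(inodes_per_block - 1); */
	unsigned long inodes_per_block = 16;	/* e.g. 4096-byte block / 256-byte inode */
	unsigned long orig_ino = 4242;
	unsigned long first = orig_ino & ~(inodes_per_block - 1);
	/* first == 4240: inodes 4240..4255 share one itable block, and
	 * find_inode_nowait() is probed for every slot except orig_ino */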
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 2291923dae4e..28fe71a2904c 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2235,7 +2235,10 @@ retry:
2235 err = PTR_ERR(inode); 2235 err = PTR_ERR(inode);
2236 if (!IS_ERR(inode)) { 2236 if (!IS_ERR(inode)) {
2237 inode->i_op = &ext4_file_inode_operations; 2237 inode->i_op = &ext4_file_inode_operations;
2238 inode->i_fop = &ext4_file_operations; 2238 if (test_opt(inode->i_sb, DAX))
2239 inode->i_fop = &ext4_dax_file_operations;
2240 else
2241 inode->i_fop = &ext4_file_operations;
2239 ext4_set_aops(inode); 2242 ext4_set_aops(inode);
2240 err = ext4_add_nondir(handle, dentry, inode); 2243 err = ext4_add_nondir(handle, dentry, inode);
2241 if (!err && IS_DIRSYNC(dir)) 2244 if (!err && IS_DIRSYNC(dir))
@@ -2299,7 +2302,10 @@ retry:
2299 err = PTR_ERR(inode); 2302 err = PTR_ERR(inode);
2300 if (!IS_ERR(inode)) { 2303 if (!IS_ERR(inode)) {
2301 inode->i_op = &ext4_file_inode_operations; 2304 inode->i_op = &ext4_file_inode_operations;
2302 inode->i_fop = &ext4_file_operations; 2305 if (test_opt(inode->i_sb, DAX))
2306 inode->i_fop = &ext4_dax_file_operations;
2307 else
2308 inode->i_fop = &ext4_file_operations;
2303 ext4_set_aops(inode); 2309 ext4_set_aops(inode);
2304 d_tmpfile(dentry, inode); 2310 d_tmpfile(dentry, inode);
2305 err = ext4_orphan_add(handle, inode); 2311 err = ext4_orphan_add(handle, inode);
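With these hunks the test_opt(sb, DAX) choice of i_fop appears three times: ext4_iget() plus ext4_create() and ext4_tmpfile() above. A hypothetical helper (not part of the patch) would keep the copies from drifting:

	/* Sketch only: centralize the DAX/non-DAX file_operations choice.
	 * Call sites become: inode->i_fop = ext4_pick_file_ops(inode->i_sb); */
	static inline const struct file_operations *
	ext4_pick_file_ops(struct super_block *sb)
	{
		if (test_opt(sb, DAX))
			return &ext4_dax_file_operations;
		return &ext4_file_operations;
	}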
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 74c5f53595fb..1adac6868e6f 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -334,7 +334,7 @@ static void save_error_info(struct super_block *sb, const char *func,
334static int block_device_ejected(struct super_block *sb) 334static int block_device_ejected(struct super_block *sb)
335{ 335{
336 struct inode *bd_inode = sb->s_bdev->bd_inode; 336 struct inode *bd_inode = sb->s_bdev->bd_inode;
337 struct backing_dev_info *bdi = bd_inode->i_mapping->backing_dev_info; 337 struct backing_dev_info *bdi = inode_to_bdi(bd_inode);
338 338
339 return bdi->dev == NULL; 339 return bdi->dev == NULL;
340} 340}
@@ -1046,10 +1046,7 @@ static int ext4_mark_dquot_dirty(struct dquot *dquot);
1046static int ext4_write_info(struct super_block *sb, int type); 1046static int ext4_write_info(struct super_block *sb, int type);
1047static int ext4_quota_on(struct super_block *sb, int type, int format_id, 1047static int ext4_quota_on(struct super_block *sb, int type, int format_id,
1048 struct path *path); 1048 struct path *path);
1049static int ext4_quota_on_sysfile(struct super_block *sb, int type,
1050 int format_id);
1051static int ext4_quota_off(struct super_block *sb, int type); 1049static int ext4_quota_off(struct super_block *sb, int type);
1052static int ext4_quota_off_sysfile(struct super_block *sb, int type);
1053static int ext4_quota_on_mount(struct super_block *sb, int type); 1050static int ext4_quota_on_mount(struct super_block *sb, int type);
1054static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, 1051static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
1055 size_t len, loff_t off); 1052 size_t len, loff_t off);
@@ -1084,16 +1081,6 @@ static const struct quotactl_ops ext4_qctl_operations = {
1084 .get_dqblk = dquot_get_dqblk, 1081 .get_dqblk = dquot_get_dqblk,
1085 .set_dqblk = dquot_set_dqblk 1082 .set_dqblk = dquot_set_dqblk
1086}; 1083};
1087
1088static const struct quotactl_ops ext4_qctl_sysfile_operations = {
1089 .quota_on_meta = ext4_quota_on_sysfile,
1090 .quota_off = ext4_quota_off_sysfile,
1091 .quota_sync = dquot_quota_sync,
1092 .get_info = dquot_get_dqinfo,
1093 .set_info = dquot_set_dqinfo,
1094 .get_dqblk = dquot_get_dqblk,
1095 .set_dqblk = dquot_set_dqblk
1096};
1097#endif 1084#endif
1098 1085
1099static const struct super_operations ext4_sops = { 1086static const struct super_operations ext4_sops = {
@@ -1137,8 +1124,9 @@ enum {
1137 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, 1124 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
1138 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, 1125 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
1139 Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, 1126 Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
1140 Opt_usrquota, Opt_grpquota, Opt_i_version, 1127 Opt_usrquota, Opt_grpquota, Opt_i_version, Opt_dax,
1141 Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit, 1128 Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit,
1129 Opt_lazytime, Opt_nolazytime,
1142 Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, 1130 Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
1143 Opt_inode_readahead_blks, Opt_journal_ioprio, 1131 Opt_inode_readahead_blks, Opt_journal_ioprio,
1144 Opt_dioread_nolock, Opt_dioread_lock, 1132 Opt_dioread_nolock, Opt_dioread_lock,
@@ -1200,8 +1188,11 @@ static const match_table_t tokens = {
1200 {Opt_barrier, "barrier"}, 1188 {Opt_barrier, "barrier"},
1201 {Opt_nobarrier, "nobarrier"}, 1189 {Opt_nobarrier, "nobarrier"},
1202 {Opt_i_version, "i_version"}, 1190 {Opt_i_version, "i_version"},
1191 {Opt_dax, "dax"},
1203 {Opt_stripe, "stripe=%u"}, 1192 {Opt_stripe, "stripe=%u"},
1204 {Opt_delalloc, "delalloc"}, 1193 {Opt_delalloc, "delalloc"},
1194 {Opt_lazytime, "lazytime"},
1195 {Opt_nolazytime, "nolazytime"},
1205 {Opt_nodelalloc, "nodelalloc"}, 1196 {Opt_nodelalloc, "nodelalloc"},
1206 {Opt_removed, "mblk_io_submit"}, 1197 {Opt_removed, "mblk_io_submit"},
1207 {Opt_removed, "nomblk_io_submit"}, 1198 {Opt_removed, "nomblk_io_submit"},
@@ -1384,6 +1375,7 @@ static const struct mount_opts {
1384 {Opt_min_batch_time, 0, MOPT_GTE0}, 1375 {Opt_min_batch_time, 0, MOPT_GTE0},
1385 {Opt_inode_readahead_blks, 0, MOPT_GTE0}, 1376 {Opt_inode_readahead_blks, 0, MOPT_GTE0},
1386 {Opt_init_itable, 0, MOPT_GTE0}, 1377 {Opt_init_itable, 0, MOPT_GTE0},
1378 {Opt_dax, EXT4_MOUNT_DAX, MOPT_SET},
1387 {Opt_stripe, 0, MOPT_GTE0}, 1379 {Opt_stripe, 0, MOPT_GTE0},
1388 {Opt_resuid, 0, MOPT_GTE0}, 1380 {Opt_resuid, 0, MOPT_GTE0},
1389 {Opt_resgid, 0, MOPT_GTE0}, 1381 {Opt_resgid, 0, MOPT_GTE0},
@@ -1459,6 +1451,12 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
1459 case Opt_i_version: 1451 case Opt_i_version:
1460 sb->s_flags |= MS_I_VERSION; 1452 sb->s_flags |= MS_I_VERSION;
1461 return 1; 1453 return 1;
1454 case Opt_lazytime:
1455 sb->s_flags |= MS_LAZYTIME;
1456 return 1;
1457 case Opt_nolazytime:
1458 sb->s_flags &= ~MS_LAZYTIME;
1459 return 1;
1462 } 1460 }
1463 1461
1464 for (m = ext4_mount_opts; m->token != Opt_err; m++) 1462 for (m = ext4_mount_opts; m->token != Opt_err; m++)
@@ -1620,6 +1618,11 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
1620 } 1618 }
1621 sbi->s_jquota_fmt = m->mount_opt; 1619 sbi->s_jquota_fmt = m->mount_opt;
1622#endif 1620#endif
1621#ifndef CONFIG_FS_DAX
1622 } else if (token == Opt_dax) {
1623 ext4_msg(sb, KERN_INFO, "dax option not supported");
1624 return -1;
1625#endif
1623 } else { 1626 } else {
1624 if (!args->from) 1627 if (!args->from)
1625 arg = 1; 1628 arg = 1;
@@ -3602,6 +3605,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3602 "both data=journal and dioread_nolock"); 3605 "both data=journal and dioread_nolock");
3603 goto failed_mount; 3606 goto failed_mount;
3604 } 3607 }
3608 if (test_opt(sb, DAX)) {
3609 ext4_msg(sb, KERN_ERR, "can't mount with "
3610 "both data=journal and dax");
3611 goto failed_mount;
3612 }
3605 if (test_opt(sb, DELALLOC)) 3613 if (test_opt(sb, DELALLOC))
3606 clear_opt(sb, DELALLOC); 3614 clear_opt(sb, DELALLOC);
3607 } 3615 }
@@ -3665,6 +3673,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3665 goto failed_mount; 3673 goto failed_mount;
3666 } 3674 }
3667 3675
3676 if (sbi->s_mount_opt & EXT4_MOUNT_DAX) {
3677 if (blocksize != PAGE_SIZE) {
3678 ext4_msg(sb, KERN_ERR,
3679 "error: unsupported blocksize for dax");
3680 goto failed_mount;
3681 }
3682 if (!sb->s_bdev->bd_disk->fops->direct_access) {
3683 ext4_msg(sb, KERN_ERR,
3684 "error: device does not support dax");
3685 goto failed_mount;
3686 }
3687 }
3688
3668 if (sb->s_blocksize != blocksize) { 3689 if (sb->s_blocksize != blocksize) {
3669 /* Validate the filesystem blocksize */ 3690 /* Validate the filesystem blocksize */
3670 if (!sb_set_blocksize(sb, blocksize)) { 3691 if (!sb_set_blocksize(sb, blocksize)) {
@@ -3935,7 +3956,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3935#ifdef CONFIG_QUOTA 3956#ifdef CONFIG_QUOTA
3936 sb->dq_op = &ext4_quota_operations; 3957 sb->dq_op = &ext4_quota_operations;
3937 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) 3958 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA))
3938 sb->s_qcop = &ext4_qctl_sysfile_operations; 3959 sb->s_qcop = &dquot_quotactl_sysfile_ops;
3939 else 3960 else
3940 sb->s_qcop = &ext4_qctl_operations; 3961 sb->s_qcop = &ext4_qctl_operations;
3941 sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; 3962 sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
@@ -4882,6 +4903,18 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4882 err = -EINVAL; 4903 err = -EINVAL;
4883 goto restore_opts; 4904 goto restore_opts;
4884 } 4905 }
4906 if (test_opt(sb, DAX)) {
4907 ext4_msg(sb, KERN_ERR, "can't mount with "
4908 "both data=journal and dax");
4909 err = -EINVAL;
4910 goto restore_opts;
4911 }
4912 }
4913
4914 if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX) {
4915 ext4_msg(sb, KERN_WARNING, "warning: refusing change of "
4916 "dax flag with busy inodes while remounting");
4917 sbi->s_mount_opt ^= EXT4_MOUNT_DAX;
4885 } 4918 }
4886 4919
4887 if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) 4920 if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
@@ -5020,6 +5053,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
5020 } 5053 }
5021#endif 5054#endif
5022 5055
5056 *flags = (*flags & ~MS_LAZYTIME) | (sb->s_flags & MS_LAZYTIME);
5023 ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data); 5057 ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data);
5024 kfree(orig_data); 5058 kfree(orig_data);
5025 return 0; 5059 return 0;
@@ -5288,21 +5322,6 @@ static int ext4_enable_quotas(struct super_block *sb)
5288 return 0; 5322 return 0;
5289} 5323}
5290 5324
5291/*
5292 * quota_on function that is used when QUOTA feature is set.
5293 */
5294static int ext4_quota_on_sysfile(struct super_block *sb, int type,
5295 int format_id)
5296{
5297 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA))
5298 return -EINVAL;
5299
5300 /*
5301 * USAGE was enabled at mount time. Only need to enable LIMITS now.
5302 */
5303 return ext4_quota_enable(sb, type, format_id, DQUOT_LIMITS_ENABLED);
5304}
5305
5306static int ext4_quota_off(struct super_block *sb, int type) 5325static int ext4_quota_off(struct super_block *sb, int type)
5307{ 5326{
5308 struct inode *inode = sb_dqopt(sb)->files[type]; 5327 struct inode *inode = sb_dqopt(sb)->files[type];
@@ -5329,18 +5348,6 @@ out:
5329 return dquot_quota_off(sb, type); 5348 return dquot_quota_off(sb, type);
5330} 5349}
5331 5350
5332/*
5333 * quota_off function that is used when QUOTA feature is set.
5334 */
5335static int ext4_quota_off_sysfile(struct super_block *sb, int type)
5336{
5337 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA))
5338 return -EINVAL;
5339
5340 /* Disable only the limits. */
5341 return dquot_disable(sb, type, DQUOT_LIMITS_ENABLED);
5342}
5343
5344/* Read data from quotafile - avoid pagecache and such because we cannot afford 5351/* Read data from quotafile - avoid pagecache and such because we cannot afford
5345 * acquiring the locks... As quota files are never truncated and quota code 5352 * acquiring the locks... As quota files are never truncated and quota code
5346 * itself serializes the operations (and no one else should touch the files) 5353 * itself serializes the operations (and no one else should touch the files)
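Summing up the super.c changes: dax and lazytime become ordinary mount options; dax is rejected when CONFIG_FS_DAX is off, is incompatible with data=journal, requires a PAGE_SIZE block size and a device with ->direct_access(), and a remount may not toggle it. A hypothetical session, assuming /dev/pmem0 meets those requirements:

	# mount -t ext4 -o dax,lazytime /dev/pmem0 /mnt/pmem
	# mount -o remount,nolazytime /mnt/pmem   # lazytime may be toggled
	# mount -o remount /mnt/pmem              # omitting dax only logs the
	                                          # "refusing change of dax flag"
	                                          # warning; the old state is kept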
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index 736a348509f7..94e2d2ffabe1 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -71,3 +71,13 @@ config F2FS_CHECK_FS
71 Enables BUG_ONs which check the filesystem consistency in runtime. 71 Enables BUG_ONs which check the filesystem consistency in runtime.
72 72
73 If you want to improve the performance, say N. 73 If you want to improve the performance, say N.
74
75config F2FS_IO_TRACE
76 bool "F2FS IO tracer"
77 depends on F2FS_FS
78 depends on FUNCTION_TRACER
79 help
80 F2FS IO trace is based on the function tracer, which gathers process
81 information and block IO patterns at the filesystem level.
82
83 If unsure, say N.
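Per the two depends lines, the tracer only builds when both the base filesystem and the function tracer are enabled; a .config fragment that satisfies this might look like:

	CONFIG_FUNCTION_TRACER=y
	CONFIG_F2FS_FS=y
	CONFIG_F2FS_IO_TRACE=y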
diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile
index 2e35da12d292..d92397731db8 100644
--- a/fs/f2fs/Makefile
+++ b/fs/f2fs/Makefile
@@ -5,3 +5,4 @@ f2fs-y += checkpoint.o gc.o data.o node.o segment.o recovery.o
5f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o 5f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o
6f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o 6f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o
7f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o 7f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o
8f2fs-$(CONFIG_F2FS_IO_TRACE) += trace.o
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index 1ccb26bc2a0b..742202779bd5 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -62,7 +62,7 @@ static struct posix_acl *f2fs_acl_from_disk(const char *value, size_t size)
62 if (count == 0) 62 if (count == 0)
63 return NULL; 63 return NULL;
64 64
65 acl = posix_acl_alloc(count, GFP_KERNEL); 65 acl = posix_acl_alloc(count, GFP_NOFS);
66 if (!acl) 66 if (!acl)
67 return ERR_PTR(-ENOMEM); 67 return ERR_PTR(-ENOMEM);
68 68
@@ -116,7 +116,7 @@ static void *f2fs_acl_to_disk(const struct posix_acl *acl, size_t *size)
116 int i; 116 int i;
117 117
118 f2fs_acl = kmalloc(sizeof(struct f2fs_acl_header) + acl->a_count * 118 f2fs_acl = kmalloc(sizeof(struct f2fs_acl_header) + acl->a_count *
119 sizeof(struct f2fs_acl_entry), GFP_KERNEL); 119 sizeof(struct f2fs_acl_entry), GFP_NOFS);
120 if (!f2fs_acl) 120 if (!f2fs_acl)
121 return ERR_PTR(-ENOMEM); 121 return ERR_PTR(-ENOMEM);
122 122
@@ -396,7 +396,7 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage,
396 posix_acl_release(default_acl); 396 posix_acl_release(default_acl);
397 } 397 }
398 if (acl) { 398 if (acl) {
399 if (error) 399 if (!error)
400 error = __f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl, 400 error = __f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl,
401 ipage); 401 ipage);
402 posix_acl_release(acl); 402 posix_acl_release(acl);
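Two separate fixes sit in acl.c: the GFP_KERNEL to GFP_NOFS switches stop ACL allocations from re-entering the filesystem through memory reclaim, and the `if (error)` to `if (!error)` change makes f2fs_init_acl() apply the access ACL only when the preceding default-ACL step succeeded. A condensed sketch of the corrected flow (the default-ACL half is reconstructed from context, not shown in the hunk):

	if (default_acl) {
		error = __f2fs_set_acl(inode, ACL_TYPE_DEFAULT,
				       default_acl, ipage);
		posix_acl_release(default_acl);
	}
	if (acl) {
		if (!error)	/* was: if (error), an inverted test */
			error = __f2fs_set_acl(inode, ACL_TYPE_ACCESS,
					       acl, ipage);
		posix_acl_release(acl);
	}
	return error;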
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index e6c271fefaca..7f794b72b3b7 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -20,10 +20,11 @@
20#include "f2fs.h" 20#include "f2fs.h"
21#include "node.h" 21#include "node.h"
22#include "segment.h" 22#include "segment.h"
23#include "trace.h"
23#include <trace/events/f2fs.h> 24#include <trace/events/f2fs.h>
24 25
25static struct kmem_cache *ino_entry_slab; 26static struct kmem_cache *ino_entry_slab;
26static struct kmem_cache *inode_entry_slab; 27struct kmem_cache *inode_entry_slab;
27 28
28/* 29/*
29 * We guarantee no failure on the returned page. 30 * We guarantee no failure on the returned page.
@@ -50,6 +51,11 @@ struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
50{ 51{
51 struct address_space *mapping = META_MAPPING(sbi); 52 struct address_space *mapping = META_MAPPING(sbi);
52 struct page *page; 53 struct page *page;
54 struct f2fs_io_info fio = {
55 .type = META,
56 .rw = READ_SYNC | REQ_META | REQ_PRIO,
57 .blk_addr = index,
58 };
53repeat: 59repeat:
54 page = grab_cache_page(mapping, index); 60 page = grab_cache_page(mapping, index);
55 if (!page) { 61 if (!page) {
@@ -59,8 +65,7 @@ repeat:
59 if (PageUptodate(page)) 65 if (PageUptodate(page))
60 goto out; 66 goto out;
61 67
62 if (f2fs_submit_page_bio(sbi, page, index, 68 if (f2fs_submit_page_bio(sbi, page, &fio))
63 READ_SYNC | REQ_META | REQ_PRIO))
64 goto repeat; 69 goto repeat;
65 70
66 lock_page(page); 71 lock_page(page);
@@ -112,14 +117,12 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type
112 block_t prev_blk_addr = 0; 117 block_t prev_blk_addr = 0;
113 struct page *page; 118 struct page *page;
114 block_t blkno = start; 119 block_t blkno = start;
115
116 struct f2fs_io_info fio = { 120 struct f2fs_io_info fio = {
117 .type = META, 121 .type = META,
118 .rw = READ_SYNC | REQ_META | REQ_PRIO 122 .rw = READ_SYNC | REQ_META | REQ_PRIO
119 }; 123 };
120 124
121 for (; nrpages-- > 0; blkno++) { 125 for (; nrpages-- > 0; blkno++) {
122 block_t blk_addr;
123 126
124 if (!is_valid_blkaddr(sbi, blkno, type)) 127 if (!is_valid_blkaddr(sbi, blkno, type))
125 goto out; 128 goto out;
@@ -130,27 +133,27 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type
130 NAT_BLOCK_OFFSET(NM_I(sbi)->max_nid))) 133 NAT_BLOCK_OFFSET(NM_I(sbi)->max_nid)))
131 blkno = 0; 134 blkno = 0;
132 /* get nat block addr */ 135 /* get nat block addr */
133 blk_addr = current_nat_addr(sbi, 136 fio.blk_addr = current_nat_addr(sbi,
134 blkno * NAT_ENTRY_PER_BLOCK); 137 blkno * NAT_ENTRY_PER_BLOCK);
135 break; 138 break;
136 case META_SIT: 139 case META_SIT:
137 /* get sit block addr */ 140 /* get sit block addr */
138 blk_addr = current_sit_addr(sbi, 141 fio.blk_addr = current_sit_addr(sbi,
139 blkno * SIT_ENTRY_PER_BLOCK); 142 blkno * SIT_ENTRY_PER_BLOCK);
140 if (blkno != start && prev_blk_addr + 1 != blk_addr) 143 if (blkno != start && prev_blk_addr + 1 != fio.blk_addr)
141 goto out; 144 goto out;
142 prev_blk_addr = blk_addr; 145 prev_blk_addr = fio.blk_addr;
143 break; 146 break;
144 case META_SSA: 147 case META_SSA:
145 case META_CP: 148 case META_CP:
146 case META_POR: 149 case META_POR:
147 blk_addr = blkno; 150 fio.blk_addr = blkno;
148 break; 151 break;
149 default: 152 default:
150 BUG(); 153 BUG();
151 } 154 }
152 155
153 page = grab_cache_page(META_MAPPING(sbi), blk_addr); 156 page = grab_cache_page(META_MAPPING(sbi), fio.blk_addr);
154 if (!page) 157 if (!page)
155 continue; 158 continue;
156 if (PageUptodate(page)) { 159 if (PageUptodate(page)) {
@@ -158,7 +161,7 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type
158 continue; 161 continue;
159 } 162 }
160 163
161 f2fs_submit_page_mbio(sbi, page, blk_addr, &fio); 164 f2fs_submit_page_mbio(sbi, page, &fio);
162 f2fs_put_page(page, 0); 165 f2fs_put_page(page, 0);
163 } 166 }
164out: 167out:
@@ -187,7 +190,7 @@ static int f2fs_write_meta_page(struct page *page,
187 190
188 trace_f2fs_writepage(page, META); 191 trace_f2fs_writepage(page, META);
189 192
190 if (unlikely(sbi->por_doing)) 193 if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
191 goto redirty_out; 194 goto redirty_out;
192 if (wbc->for_reclaim && page->index < GET_SUM_BLOCK(sbi, 0)) 195 if (wbc->for_reclaim && page->index < GET_SUM_BLOCK(sbi, 0))
193 goto redirty_out; 196 goto redirty_out;
@@ -299,6 +302,8 @@ static int f2fs_set_meta_page_dirty(struct page *page)
299 if (!PageDirty(page)) { 302 if (!PageDirty(page)) {
300 __set_page_dirty_nobuffers(page); 303 __set_page_dirty_nobuffers(page);
301 inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META); 304 inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META);
305 SetPagePrivate(page);
306 f2fs_trace_pid(page);
302 return 1; 307 return 1;
303 } 308 }
304 return 0; 309 return 0;
@@ -308,6 +313,8 @@ const struct address_space_operations f2fs_meta_aops = {
308 .writepage = f2fs_write_meta_page, 313 .writepage = f2fs_write_meta_page,
309 .writepages = f2fs_write_meta_pages, 314 .writepages = f2fs_write_meta_pages,
310 .set_page_dirty = f2fs_set_meta_page_dirty, 315 .set_page_dirty = f2fs_set_meta_page_dirty,
316 .invalidatepage = f2fs_invalidate_page,
317 .releasepage = f2fs_release_page,
311}; 318};
312 319
313static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) 320static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
@@ -462,7 +469,7 @@ void recover_orphan_inodes(struct f2fs_sb_info *sbi)
462 if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG)) 469 if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG))
463 return; 470 return;
464 471
465 sbi->por_doing = true; 472 set_sbi_flag(sbi, SBI_POR_DOING);
466 473
467 start_blk = __start_cp_addr(sbi) + 1 + 474 start_blk = __start_cp_addr(sbi) + 1 +
468 le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload); 475 le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload);
@@ -483,7 +490,7 @@ void recover_orphan_inodes(struct f2fs_sb_info *sbi)
483 } 490 }
484 /* clear Orphan Flag */ 491 /* clear Orphan Flag */
485 clear_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG); 492 clear_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG);
486 sbi->por_doing = false; 493 clear_sbi_flag(sbi, SBI_POR_DOING);
487 return; 494 return;
488} 495}
489 496
@@ -567,7 +574,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
567 if (crc_offset >= blk_size) 574 if (crc_offset >= blk_size)
568 goto invalid_cp1; 575 goto invalid_cp1;
569 576
570 crc = le32_to_cpu(*((__u32 *)((unsigned char *)cp_block + crc_offset))); 577 crc = le32_to_cpu(*((__le32 *)((unsigned char *)cp_block + crc_offset)));
571 if (!f2fs_crc_valid(crc, cp_block, crc_offset)) 578 if (!f2fs_crc_valid(crc, cp_block, crc_offset))
572 goto invalid_cp1; 579 goto invalid_cp1;
573 580
@@ -582,7 +589,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
582 if (crc_offset >= blk_size) 589 if (crc_offset >= blk_size)
583 goto invalid_cp2; 590 goto invalid_cp2;
584 591
585 crc = le32_to_cpu(*((__u32 *)((unsigned char *)cp_block + crc_offset))); 592 crc = le32_to_cpu(*((__le32 *)((unsigned char *)cp_block + crc_offset)));
586 if (!f2fs_crc_valid(crc, cp_block, crc_offset)) 593 if (!f2fs_crc_valid(crc, cp_block, crc_offset))
587 goto invalid_cp2; 594 goto invalid_cp2;
588 595
@@ -669,7 +676,7 @@ fail_no_cp:
669 return -EINVAL; 676 return -EINVAL;
670} 677}
671 678
672static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new) 679static int __add_dirty_inode(struct inode *inode, struct inode_entry *new)
673{ 680{
674 struct f2fs_sb_info *sbi = F2FS_I_SB(inode); 681 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
675 682
@@ -686,7 +693,7 @@ static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new)
686void update_dirty_page(struct inode *inode, struct page *page) 693void update_dirty_page(struct inode *inode, struct page *page)
687{ 694{
688 struct f2fs_sb_info *sbi = F2FS_I_SB(inode); 695 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
689 struct dir_inode_entry *new; 696 struct inode_entry *new;
690 int ret = 0; 697 int ret = 0;
691 698
692 if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode)) 699 if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode))
@@ -710,12 +717,13 @@ void update_dirty_page(struct inode *inode, struct page *page)
710 kmem_cache_free(inode_entry_slab, new); 717 kmem_cache_free(inode_entry_slab, new);
711out: 718out:
712 SetPagePrivate(page); 719 SetPagePrivate(page);
720 f2fs_trace_pid(page);
713} 721}
714 722
715void add_dirty_dir_inode(struct inode *inode) 723void add_dirty_dir_inode(struct inode *inode)
716{ 724{
717 struct f2fs_sb_info *sbi = F2FS_I_SB(inode); 725 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
718 struct dir_inode_entry *new = 726 struct inode_entry *new =
719 f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS); 727 f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
720 int ret = 0; 728 int ret = 0;
721 729
@@ -733,7 +741,7 @@ void add_dirty_dir_inode(struct inode *inode)
733void remove_dirty_dir_inode(struct inode *inode) 741void remove_dirty_dir_inode(struct inode *inode)
734{ 742{
735 struct f2fs_sb_info *sbi = F2FS_I_SB(inode); 743 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
736 struct dir_inode_entry *entry; 744 struct inode_entry *entry;
737 745
738 if (!S_ISDIR(inode->i_mode)) 746 if (!S_ISDIR(inode->i_mode))
739 return; 747 return;
@@ -763,7 +771,7 @@ void remove_dirty_dir_inode(struct inode *inode)
763void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi) 771void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi)
764{ 772{
765 struct list_head *head; 773 struct list_head *head;
766 struct dir_inode_entry *entry; 774 struct inode_entry *entry;
767 struct inode *inode; 775 struct inode *inode;
768retry: 776retry:
769 if (unlikely(f2fs_cp_error(sbi))) 777 if (unlikely(f2fs_cp_error(sbi)))
@@ -776,7 +784,7 @@ retry:
776 spin_unlock(&sbi->dir_inode_lock); 784 spin_unlock(&sbi->dir_inode_lock);
777 return; 785 return;
778 } 786 }
779 entry = list_entry(head->next, struct dir_inode_entry, list); 787 entry = list_entry(head->next, struct inode_entry, list);
780 inode = igrab(entry->inode); 788 inode = igrab(entry->inode);
781 spin_unlock(&sbi->dir_inode_lock); 789 spin_unlock(&sbi->dir_inode_lock);
782 if (inode) { 790 if (inode) {
@@ -922,7 +930,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
922 ckpt->next_free_nid = cpu_to_le32(last_nid); 930 ckpt->next_free_nid = cpu_to_le32(last_nid);
923 931
924 /* 2 cp + n data seg summary + orphan inode blocks */ 932 /* 2 cp + n data seg summary + orphan inode blocks */
925 data_sum_blocks = npages_for_summary_flush(sbi); 933 data_sum_blocks = npages_for_summary_flush(sbi, false);
926 if (data_sum_blocks < NR_CURSEG_DATA_TYPE) 934 if (data_sum_blocks < NR_CURSEG_DATA_TYPE)
927 set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); 935 set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
928 else 936 else
@@ -932,24 +940,31 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
932 ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks + 940 ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks +
933 orphan_blocks); 941 orphan_blocks);
934 942
935 if (cpc->reason == CP_UMOUNT) { 943 if (__remain_node_summaries(cpc->reason))
936 set_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
937 ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS+ 944 ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS+
938 cp_payload_blks + data_sum_blocks + 945 cp_payload_blks + data_sum_blocks +
939 orphan_blocks + NR_CURSEG_NODE_TYPE); 946 orphan_blocks + NR_CURSEG_NODE_TYPE);
940 } else { 947 else
941 clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
942 ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS + 948 ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS +
943 cp_payload_blks + data_sum_blocks + 949 cp_payload_blks + data_sum_blocks +
944 orphan_blocks); 950 orphan_blocks);
945 } 951
952 if (cpc->reason == CP_UMOUNT)
953 set_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
954 else
955 clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
956
957 if (cpc->reason == CP_FASTBOOT)
958 set_ckpt_flags(ckpt, CP_FASTBOOT_FLAG);
959 else
960 clear_ckpt_flags(ckpt, CP_FASTBOOT_FLAG);
946 961
947 if (orphan_num) 962 if (orphan_num)
948 set_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); 963 set_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
949 else 964 else
950 clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); 965 clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
951 966
952 if (sbi->need_fsck) 967 if (is_sbi_flag_set(sbi, SBI_NEED_FSCK))
953 set_ckpt_flags(ckpt, CP_FSCK_FLAG); 968 set_ckpt_flags(ckpt, CP_FSCK_FLAG);
954 969
955 /* update SIT/NAT bitmap */ 970 /* update SIT/NAT bitmap */
@@ -966,15 +981,14 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
966 /* write out checkpoint buffer at block 0 */ 981 /* write out checkpoint buffer at block 0 */
967 cp_page = grab_meta_page(sbi, start_blk++); 982 cp_page = grab_meta_page(sbi, start_blk++);
968 kaddr = page_address(cp_page); 983 kaddr = page_address(cp_page);
969 memcpy(kaddr, ckpt, (1 << sbi->log_blocksize)); 984 memcpy(kaddr, ckpt, F2FS_BLKSIZE);
970 set_page_dirty(cp_page); 985 set_page_dirty(cp_page);
971 f2fs_put_page(cp_page, 1); 986 f2fs_put_page(cp_page, 1);
972 987
973 for (i = 1; i < 1 + cp_payload_blks; i++) { 988 for (i = 1; i < 1 + cp_payload_blks; i++) {
974 cp_page = grab_meta_page(sbi, start_blk++); 989 cp_page = grab_meta_page(sbi, start_blk++);
975 kaddr = page_address(cp_page); 990 kaddr = page_address(cp_page);
976 memcpy(kaddr, (char *)ckpt + i * F2FS_BLKSIZE, 991 memcpy(kaddr, (char *)ckpt + i * F2FS_BLKSIZE, F2FS_BLKSIZE);
977 (1 << sbi->log_blocksize));
978 set_page_dirty(cp_page); 992 set_page_dirty(cp_page);
979 f2fs_put_page(cp_page, 1); 993 f2fs_put_page(cp_page, 1);
980 } 994 }
@@ -986,7 +1000,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
986 1000
987 write_data_summaries(sbi, start_blk); 1001 write_data_summaries(sbi, start_blk);
988 start_blk += data_sum_blocks; 1002 start_blk += data_sum_blocks;
989 if (cpc->reason == CP_UMOUNT) { 1003 if (__remain_node_summaries(cpc->reason)) {
990 write_node_summaries(sbi, start_blk); 1004 write_node_summaries(sbi, start_blk);
991 start_blk += NR_CURSEG_NODE_TYPE; 1005 start_blk += NR_CURSEG_NODE_TYPE;
992 } 1006 }
@@ -994,7 +1008,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
994 /* writeout checkpoint block */ 1008 /* writeout checkpoint block */
995 cp_page = grab_meta_page(sbi, start_blk); 1009 cp_page = grab_meta_page(sbi, start_blk);
996 kaddr = page_address(cp_page); 1010 kaddr = page_address(cp_page);
997 memcpy(kaddr, ckpt, (1 << sbi->log_blocksize)); 1011 memcpy(kaddr, ckpt, F2FS_BLKSIZE);
998 set_page_dirty(cp_page); 1012 set_page_dirty(cp_page);
999 f2fs_put_page(cp_page, 1); 1013 f2fs_put_page(cp_page, 1);
1000 1014
@@ -1023,7 +1037,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
1023 return; 1037 return;
1024 1038
1025 clear_prefree_segments(sbi); 1039 clear_prefree_segments(sbi);
1026 F2FS_RESET_SB_DIRT(sbi); 1040 clear_sbi_flag(sbi, SBI_IS_DIRTY);
1027} 1041}
1028 1042
1029/* 1043/*
@@ -1038,10 +1052,13 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
1038 1052
1039 mutex_lock(&sbi->cp_mutex); 1053 mutex_lock(&sbi->cp_mutex);
1040 1054
1041 if (!sbi->s_dirty && cpc->reason != CP_DISCARD) 1055 if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) &&
1056 cpc->reason != CP_DISCARD && cpc->reason != CP_UMOUNT)
1042 goto out; 1057 goto out;
1043 if (unlikely(f2fs_cp_error(sbi))) 1058 if (unlikely(f2fs_cp_error(sbi)))
1044 goto out; 1059 goto out;
1060 if (f2fs_readonly(sbi->sb))
1061 goto out;
1045 if (block_operations(sbi)) 1062 if (block_operations(sbi))
1046 goto out; 1063 goto out;
1047 1064
@@ -1102,8 +1119,8 @@ int __init create_checkpoint_caches(void)
1102 sizeof(struct ino_entry)); 1119 sizeof(struct ino_entry));
1103 if (!ino_entry_slab) 1120 if (!ino_entry_slab)
1104 return -ENOMEM; 1121 return -ENOMEM;
1105 inode_entry_slab = f2fs_kmem_cache_create("f2fs_dirty_dir_entry", 1122 inode_entry_slab = f2fs_kmem_cache_create("f2fs_inode_entry",
1106 sizeof(struct dir_inode_entry)); 1123 sizeof(struct inode_entry));
1107 if (!inode_entry_slab) { 1124 if (!inode_entry_slab) {
1108 kmem_cache_destroy(ino_entry_slab); 1125 kmem_cache_destroy(ino_entry_slab);
1109 return -ENOMEM; 1126 return -ENOMEM;
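The recurring refactor in checkpoint.c (and in data.c below) moves the target block address into struct f2fs_io_info, so the submit paths and tracepoints receive one self-describing descriptor instead of a loose blk_addr argument. The new calling convention, exactly as get_meta_page() above uses it:

	struct f2fs_io_info fio = {
		.type = META,
		.rw = READ_SYNC | REQ_META | REQ_PRIO,
		.blk_addr = index,	/* previously a separate argument */
	};

	if (f2fs_submit_page_bio(sbi, page, &fio))
		goto repeat;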
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 7ec697b37f19..985ed023a750 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -22,6 +22,7 @@
22#include "f2fs.h" 22#include "f2fs.h"
23#include "node.h" 23#include "node.h"
24#include "segment.h" 24#include "segment.h"
25#include "trace.h"
25#include <trace/events/f2fs.h> 26#include <trace/events/f2fs.h>
26 27
27static void f2fs_read_end_io(struct bio *bio, int err) 28static void f2fs_read_end_io(struct bio *bio, int err)
@@ -95,11 +96,9 @@ static void __submit_merged_bio(struct f2fs_bio_info *io)
95 return; 96 return;
96 97
97 if (is_read_io(fio->rw)) 98 if (is_read_io(fio->rw))
98 trace_f2fs_submit_read_bio(io->sbi->sb, fio->rw, 99 trace_f2fs_submit_read_bio(io->sbi->sb, fio, io->bio);
99 fio->type, io->bio);
100 else 100 else
101 trace_f2fs_submit_write_bio(io->sbi->sb, fio->rw, 101 trace_f2fs_submit_write_bio(io->sbi->sb, fio, io->bio);
102 fio->type, io->bio);
103 102
104 submit_bio(fio->rw, io->bio); 103 submit_bio(fio->rw, io->bio);
105 io->bio = NULL; 104 io->bio = NULL;
@@ -132,14 +131,15 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
132 * Return unlocked page. 131 * Return unlocked page.
133 */ 132 */
134int f2fs_submit_page_bio(struct f2fs_sb_info *sbi, struct page *page, 133int f2fs_submit_page_bio(struct f2fs_sb_info *sbi, struct page *page,
135 block_t blk_addr, int rw) 134 struct f2fs_io_info *fio)
136{ 135{
137 struct bio *bio; 136 struct bio *bio;
138 137
139 trace_f2fs_submit_page_bio(page, blk_addr, rw); 138 trace_f2fs_submit_page_bio(page, fio);
139 f2fs_trace_ios(page, fio, 0);
140 140
141 /* Allocate a new bio */ 141 /* Allocate a new bio */
142 bio = __bio_alloc(sbi, blk_addr, 1, is_read_io(rw)); 142 bio = __bio_alloc(sbi, fio->blk_addr, 1, is_read_io(fio->rw));
143 143
144 if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { 144 if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
145 bio_put(bio); 145 bio_put(bio);
@@ -147,12 +147,12 @@ int f2fs_submit_page_bio(struct f2fs_sb_info *sbi, struct page *page,
147 return -EFAULT; 147 return -EFAULT;
148 } 148 }
149 149
150 submit_bio(rw, bio); 150 submit_bio(fio->rw, bio);
151 return 0; 151 return 0;
152} 152}
153 153
154void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page, 154void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page,
155 block_t blk_addr, struct f2fs_io_info *fio) 155 struct f2fs_io_info *fio)
156{ 156{
157 enum page_type btype = PAGE_TYPE_OF_BIO(fio->type); 157 enum page_type btype = PAGE_TYPE_OF_BIO(fio->type);
158 struct f2fs_bio_info *io; 158 struct f2fs_bio_info *io;
@@ -160,21 +160,21 @@ void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page,
160 160
161 io = is_read ? &sbi->read_io : &sbi->write_io[btype]; 161 io = is_read ? &sbi->read_io : &sbi->write_io[btype];
162 162
163 verify_block_addr(sbi, blk_addr); 163 verify_block_addr(sbi, fio->blk_addr);
164 164
165 down_write(&io->io_rwsem); 165 down_write(&io->io_rwsem);
166 166
167 if (!is_read) 167 if (!is_read)
168 inc_page_count(sbi, F2FS_WRITEBACK); 168 inc_page_count(sbi, F2FS_WRITEBACK);
169 169
170 if (io->bio && (io->last_block_in_bio != blk_addr - 1 || 170 if (io->bio && (io->last_block_in_bio != fio->blk_addr - 1 ||
171 io->fio.rw != fio->rw)) 171 io->fio.rw != fio->rw))
172 __submit_merged_bio(io); 172 __submit_merged_bio(io);
173alloc_new: 173alloc_new:
174 if (io->bio == NULL) { 174 if (io->bio == NULL) {
175 int bio_blocks = MAX_BIO_BLOCKS(sbi); 175 int bio_blocks = MAX_BIO_BLOCKS(sbi);
176 176
177 io->bio = __bio_alloc(sbi, blk_addr, bio_blocks, is_read); 177 io->bio = __bio_alloc(sbi, fio->blk_addr, bio_blocks, is_read);
178 io->fio = *fio; 178 io->fio = *fio;
179 } 179 }
180 180
@@ -184,10 +184,11 @@ alloc_new:
184 goto alloc_new; 184 goto alloc_new;
185 } 185 }
186 186
187 io->last_block_in_bio = blk_addr; 187 io->last_block_in_bio = fio->blk_addr;
188 f2fs_trace_ios(page, fio, 0);
188 189
189 up_write(&io->io_rwsem); 190 up_write(&io->io_rwsem);
190 trace_f2fs_submit_page_mbio(page, fio->rw, fio->type, blk_addr); 191 trace_f2fs_submit_page_mbio(page, fio);
191} 192}
192 193
193/* 194/*
@@ -196,7 +197,7 @@ alloc_new:
196 * ->node_page 197 * ->node_page
197 * update block addresses in the node page 198 * update block addresses in the node page
198 */ 199 */
199static void __set_data_blkaddr(struct dnode_of_data *dn, block_t new_addr) 200static void __set_data_blkaddr(struct dnode_of_data *dn)
200{ 201{
201 struct f2fs_node *rn; 202 struct f2fs_node *rn;
202 __le32 *addr_array; 203 __le32 *addr_array;
@@ -209,7 +210,7 @@ static void __set_data_blkaddr(struct dnode_of_data *dn, block_t new_addr)
209 210
210 /* Get physical address of data block */ 211 /* Get physical address of data block */
211 addr_array = blkaddr_in_node(rn); 212 addr_array = blkaddr_in_node(rn);
212 addr_array[ofs_in_node] = cpu_to_le32(new_addr); 213 addr_array[ofs_in_node] = cpu_to_le32(dn->data_blkaddr);
213 set_page_dirty(node_page); 214 set_page_dirty(node_page);
214} 215}
215 216
@@ -224,8 +225,8 @@ int reserve_new_block(struct dnode_of_data *dn)
224 225
225 trace_f2fs_reserve_new_block(dn->inode, dn->nid, dn->ofs_in_node); 226 trace_f2fs_reserve_new_block(dn->inode, dn->nid, dn->ofs_in_node);
226 227
227 __set_data_blkaddr(dn, NEW_ADDR);
228 dn->data_blkaddr = NEW_ADDR; 228 dn->data_blkaddr = NEW_ADDR;
229 __set_data_blkaddr(dn);
229 mark_inode_dirty(dn->inode); 230 mark_inode_dirty(dn->inode);
230 sync_inode_page(dn); 231 sync_inode_page(dn);
231 return 0; 232 return 0;
@@ -273,7 +274,7 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
273 unsigned int blkbits = inode->i_sb->s_blocksize_bits; 274 unsigned int blkbits = inode->i_sb->s_blocksize_bits;
274 size_t count; 275 size_t count;
275 276
276 clear_buffer_new(bh_result); 277 set_buffer_new(bh_result);
277 map_bh(bh_result, inode->i_sb, 278 map_bh(bh_result, inode->i_sb,
278 start_blkaddr + pgofs - start_fofs); 279 start_blkaddr + pgofs - start_fofs);
279 count = end_fofs - pgofs + 1; 280 count = end_fofs - pgofs + 1;
@@ -290,23 +291,24 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
290 return 0; 291 return 0;
291} 292}
292 293
293void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn) 294void update_extent_cache(struct dnode_of_data *dn)
294{ 295{
295 struct f2fs_inode_info *fi = F2FS_I(dn->inode); 296 struct f2fs_inode_info *fi = F2FS_I(dn->inode);
296 pgoff_t fofs, start_fofs, end_fofs; 297 pgoff_t fofs, start_fofs, end_fofs;
297 block_t start_blkaddr, end_blkaddr; 298 block_t start_blkaddr, end_blkaddr;
298 int need_update = true; 299 int need_update = true;
299 300
300 f2fs_bug_on(F2FS_I_SB(dn->inode), blk_addr == NEW_ADDR); 301 f2fs_bug_on(F2FS_I_SB(dn->inode), dn->data_blkaddr == NEW_ADDR);
301 fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
302 dn->ofs_in_node;
303 302
304 /* Update the page address in the parent node */ 303 /* Update the page address in the parent node */
305 __set_data_blkaddr(dn, blk_addr); 304 __set_data_blkaddr(dn);
306 305
307 if (is_inode_flag_set(fi, FI_NO_EXTENT)) 306 if (is_inode_flag_set(fi, FI_NO_EXTENT))
308 return; 307 return;
309 308
309 fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
310 dn->ofs_in_node;
311
310 write_lock(&fi->ext.ext_lock); 312 write_lock(&fi->ext.ext_lock);
311 313
312 start_fofs = fi->ext.fofs; 314 start_fofs = fi->ext.fofs;
@@ -320,16 +322,16 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn)
320 322
321 /* Initial extent */ 323 /* Initial extent */
322 if (fi->ext.len == 0) { 324 if (fi->ext.len == 0) {
323 if (blk_addr != NULL_ADDR) { 325 if (dn->data_blkaddr != NULL_ADDR) {
324 fi->ext.fofs = fofs; 326 fi->ext.fofs = fofs;
325 fi->ext.blk_addr = blk_addr; 327 fi->ext.blk_addr = dn->data_blkaddr;
326 fi->ext.len = 1; 328 fi->ext.len = 1;
327 } 329 }
328 goto end_update; 330 goto end_update;
329 } 331 }
330 332
331 /* Front merge */ 333 /* Front merge */
332 if (fofs == start_fofs - 1 && blk_addr == start_blkaddr - 1) { 334 if (fofs == start_fofs - 1 && dn->data_blkaddr == start_blkaddr - 1) {
333 fi->ext.fofs--; 335 fi->ext.fofs--;
334 fi->ext.blk_addr--; 336 fi->ext.blk_addr--;
335 fi->ext.len++; 337 fi->ext.len++;
@@ -337,7 +339,7 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn)
337 } 339 }
338 340
339 /* Back merge */ 341 /* Back merge */
340 if (fofs == end_fofs + 1 && blk_addr == end_blkaddr + 1) { 342 if (fofs == end_fofs + 1 && dn->data_blkaddr == end_blkaddr + 1) {
341 fi->ext.len++; 343 fi->ext.len++;
342 goto end_update; 344 goto end_update;
343 } 345 }
@@ -376,6 +378,10 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
376 struct dnode_of_data dn; 378 struct dnode_of_data dn;
377 struct page *page; 379 struct page *page;
378 int err; 380 int err;
381 struct f2fs_io_info fio = {
382 .type = DATA,
383 .rw = sync ? READ_SYNC : READA,
384 };
379 385
380 page = find_get_page(mapping, index); 386 page = find_get_page(mapping, index);
381 if (page && PageUptodate(page)) 387 if (page && PageUptodate(page))
@@ -404,8 +410,8 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
404 return page; 410 return page;
405 } 411 }
406 412
407 err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, dn.data_blkaddr, 413 fio.blk_addr = dn.data_blkaddr;
408 sync ? READ_SYNC : READA); 414 err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, &fio);
409 if (err) 415 if (err)
410 return ERR_PTR(err); 416 return ERR_PTR(err);
411 417
@@ -430,7 +436,10 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index)
430 struct dnode_of_data dn; 436 struct dnode_of_data dn;
431 struct page *page; 437 struct page *page;
432 int err; 438 int err;
433 439 struct f2fs_io_info fio = {
440 .type = DATA,
441 .rw = READ_SYNC,
442 };
434repeat: 443repeat:
435 page = grab_cache_page(mapping, index); 444 page = grab_cache_page(mapping, index);
436 if (!page) 445 if (!page)
@@ -464,8 +473,8 @@ repeat:
464 return page; 473 return page;
465 } 474 }
466 475
467 err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, 476 fio.blk_addr = dn.data_blkaddr;
468 dn.data_blkaddr, READ_SYNC); 477 err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, &fio);
469 if (err) 478 if (err)
470 return ERR_PTR(err); 479 return ERR_PTR(err);
471 480
@@ -515,8 +524,12 @@ repeat:
515 zero_user_segment(page, 0, PAGE_CACHE_SIZE); 524 zero_user_segment(page, 0, PAGE_CACHE_SIZE);
516 SetPageUptodate(page); 525 SetPageUptodate(page);
517 } else { 526 } else {
518 err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, 527 struct f2fs_io_info fio = {
519 dn.data_blkaddr, READ_SYNC); 528 .type = DATA,
529 .rw = READ_SYNC,
530 .blk_addr = dn.data_blkaddr,
531 };
532 err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, &fio);
520 if (err) 533 if (err)
521 goto put_err; 534 goto put_err;
522 535
@@ -550,30 +563,25 @@ static int __allocate_data_block(struct dnode_of_data *dn)
 	struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
 	struct f2fs_inode_info *fi = F2FS_I(dn->inode);
 	struct f2fs_summary sum;
-	block_t new_blkaddr;
 	struct node_info ni;
+	int seg = CURSEG_WARM_DATA;
 	pgoff_t fofs;
-	int type;
 
 	if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
 		return -EPERM;
 	if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1)))
 		return -ENOSPC;
 
-	__set_data_blkaddr(dn, NEW_ADDR);
-	dn->data_blkaddr = NEW_ADDR;
-
 	get_node_info(sbi, dn->nid, &ni);
 	set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
 
-	type = CURSEG_WARM_DATA;
+	if (dn->ofs_in_node == 0 && dn->inode_page == dn->node_page)
+		seg = CURSEG_DIRECT_IO;
 
-	allocate_data_block(sbi, NULL, NULL_ADDR, &new_blkaddr, &sum, type);
+	allocate_data_block(sbi, NULL, NULL_ADDR, &dn->data_blkaddr, &sum, seg);
 
 	/* direct IO doesn't use extent cache to maximize the performance */
-	set_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT);
-	update_extent_cache(new_blkaddr, dn);
-	clear_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT);
+	__set_data_blkaddr(dn);
 
 	/* update i_size */
 	fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
@@ -581,10 +589,59 @@ static int __allocate_data_block(struct dnode_of_data *dn)
 	if (i_size_read(dn->inode) < ((fofs + 1) << PAGE_CACHE_SHIFT))
 		i_size_write(dn->inode, ((fofs + 1) << PAGE_CACHE_SHIFT));
 
-	dn->data_blkaddr = new_blkaddr;
 	return 0;
 }
 
+static void __allocate_data_blocks(struct inode *inode, loff_t offset,
+							size_t count)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct dnode_of_data dn;
+	u64 start = F2FS_BYTES_TO_BLK(offset);
+	u64 len = F2FS_BYTES_TO_BLK(count);
+	bool allocated;
+	u64 end_offset;
+
+	while (len) {
+		f2fs_balance_fs(sbi);
+		f2fs_lock_op(sbi);
+
+		/* When reading holes, we need its node page */
+		set_new_dnode(&dn, inode, NULL, NULL, 0);
+		if (get_dnode_of_data(&dn, start, ALLOC_NODE))
+			goto out;
+
+		allocated = false;
+		end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
+
+		while (dn.ofs_in_node < end_offset && len) {
+			if (dn.data_blkaddr == NULL_ADDR) {
+				if (__allocate_data_block(&dn))
+					goto sync_out;
+				allocated = true;
+			}
+			len--;
+			start++;
+			dn.ofs_in_node++;
+		}
+
+		if (allocated)
+			sync_inode_page(&dn);
+
+		f2fs_put_dnode(&dn);
+		f2fs_unlock_op(sbi);
+	}
+	return;
+
+sync_out:
+	if (allocated)
+		sync_inode_page(&dn);
+	f2fs_put_dnode(&dn);
+out:
+	f2fs_unlock_op(sbi);
+	return;
+}
+
 /*
  * get_data_block() now supported readahead/bmap/rw direct_IO with mapped bh.
  * If original data blocks are allocated, then give them to blockdev.
@@ -610,10 +667,8 @@ static int __get_data_block(struct inode *inode, sector_t iblock,
 	if (check_extent_cache(inode, pgofs, bh_result))
 		goto out;
 
-	if (create) {
-		f2fs_balance_fs(F2FS_I_SB(inode));
+	if (create)
 		f2fs_lock_op(F2FS_I_SB(inode));
-	}
 
 	/* When reading holes, we need its node page */
 	set_new_dnode(&dn, inode, NULL, NULL, 0);
@@ -627,12 +682,14 @@ static int __get_data_block(struct inode *inode, sector_t iblock,
 		goto put_out;
 
 	if (dn.data_blkaddr != NULL_ADDR) {
+		set_buffer_new(bh_result);
 		map_bh(bh_result, inode->i_sb, dn.data_blkaddr);
 	} else if (create) {
 		err = __allocate_data_block(&dn);
 		if (err)
 			goto put_out;
 		allocated = true;
+		set_buffer_new(bh_result);
 		map_bh(bh_result, inode->i_sb, dn.data_blkaddr);
 	} else {
 		goto put_out;
@@ -745,7 +802,6 @@ static int f2fs_read_data_pages(struct file *file,
 int do_write_data_page(struct page *page, struct f2fs_io_info *fio)
 {
 	struct inode *inode = page->mapping->host;
-	block_t old_blkaddr, new_blkaddr;
 	struct dnode_of_data dn;
 	int err = 0;
 
@@ -754,10 +810,10 @@ int do_write_data_page(struct page *page, struct f2fs_io_info *fio)
 	if (err)
 		return err;
 
-	old_blkaddr = dn.data_blkaddr;
+	fio->blk_addr = dn.data_blkaddr;
 
 	/* This page is already truncated */
-	if (old_blkaddr == NULL_ADDR)
+	if (fio->blk_addr == NULL_ADDR)
 		goto out_writepage;
 
 	set_page_writeback(page);
@@ -766,14 +822,14 @@ int do_write_data_page(struct page *page, struct f2fs_io_info *fio)
 	 * If current allocation needs SSR,
 	 * it had better in-place writes for updated data.
 	 */
-	if (unlikely(old_blkaddr != NEW_ADDR &&
+	if (unlikely(fio->blk_addr != NEW_ADDR &&
 			!is_cold_data(page) &&
 			need_inplace_update(inode))) {
-		rewrite_data_page(page, old_blkaddr, fio);
+		rewrite_data_page(page, fio);
 		set_inode_flag(F2FS_I(inode), FI_UPDATE_WRITE);
 	} else {
-		write_data_page(page, &dn, &new_blkaddr, fio);
-		update_extent_cache(new_blkaddr, &dn);
+		write_data_page(page, &dn, fio);
+		update_extent_cache(&dn);
 		set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE);
 	}
 out_writepage:
@@ -812,7 +868,12 @@ static int f2fs_write_data_page(struct page *page,
 
 	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
 write:
-	if (unlikely(sbi->por_doing))
+	if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
+		goto redirty_out;
+	if (f2fs_is_drop_cache(inode))
+		goto out;
+	if (f2fs_is_volatile_file(inode) && !wbc->for_reclaim &&
+			available_free_memory(sbi, BASE_CHECK))
 		goto redirty_out;
 
 	/* Dentry blocks are controlled by checkpoint */
@@ -826,7 +887,6 @@ write:
 	/* we should bypass data pages to proceed the kworkder jobs */
 	if (unlikely(f2fs_cp_error(sbi))) {
 		SetPageError(page);
-		unlock_page(page);
 		goto out;
 	}
 
@@ -1002,8 +1062,12 @@ put_next:
 	if (dn.data_blkaddr == NEW_ADDR) {
 		zero_user_segment(page, 0, PAGE_CACHE_SIZE);
 	} else {
-		err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr,
-						READ_SYNC);
+		struct f2fs_io_info fio = {
+			.type = DATA,
+			.rw = READ_SYNC,
+			.blk_addr = dn.data_blkaddr,
+		};
+		err = f2fs_submit_page_bio(sbi, page, &fio);
 		if (err)
 			goto fail;
 
@@ -1092,6 +1156,9 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
 
 	trace_f2fs_direct_IO_enter(inode, offset, count, rw);
 
+	if (rw & WRITE)
+		__allocate_data_blocks(inode, offset, count);
+
 	err = blockdev_direct_IO(rw, iocb, inode, iter, offset, get_data_block);
 	if (err < 0 && (rw & WRITE))
 		f2fs_write_failed(mapping, offset + count);
@@ -1101,24 +1168,33 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
 	return err;
 }
 
-static void f2fs_invalidate_data_page(struct page *page, unsigned int offset,
-				      unsigned int length)
+void f2fs_invalidate_page(struct page *page, unsigned int offset,
+							unsigned int length)
 {
 	struct inode *inode = page->mapping->host;
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 
-	if (offset % PAGE_CACHE_SIZE || length != PAGE_CACHE_SIZE)
+	if (inode->i_ino >= F2FS_ROOT_INO(sbi) &&
+		(offset % PAGE_CACHE_SIZE || length != PAGE_CACHE_SIZE))
 		return;
 
-	if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode))
-		invalidate_inmem_page(inode, page);
-
-	if (PageDirty(page))
-		inode_dec_dirty_pages(inode);
+	if (PageDirty(page)) {
+		if (inode->i_ino == F2FS_META_INO(sbi))
+			dec_page_count(sbi, F2FS_DIRTY_META);
+		else if (inode->i_ino == F2FS_NODE_INO(sbi))
+			dec_page_count(sbi, F2FS_DIRTY_NODES);
+		else
+			inode_dec_dirty_pages(inode);
+	}
 	ClearPagePrivate(page);
 }
 
-static int f2fs_release_data_page(struct page *page, gfp_t wait)
+int f2fs_release_page(struct page *page, gfp_t wait)
 {
+	/* If this is dirty page, keep PagePrivate */
+	if (PageDirty(page))
+		return 0;
+
 	ClearPagePrivate(page);
 	return 1;
 }
@@ -1132,7 +1208,7 @@ static int f2fs_set_data_page_dirty(struct page *page)
 
 	SetPageUptodate(page);
 
-	if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode)) {
+	if (f2fs_is_atomic_file(inode)) {
 		register_inmem_page(inode, page);
 		return 1;
 	}
@@ -1168,8 +1244,8 @@ const struct address_space_operations f2fs_dblock_aops = {
 	.write_begin	= f2fs_write_begin,
 	.write_end	= f2fs_write_end,
 	.set_page_dirty	= f2fs_set_data_page_dirty,
-	.invalidatepage	= f2fs_invalidate_data_page,
-	.releasepage	= f2fs_release_data_page,
+	.invalidatepage	= f2fs_invalidate_page,
+	.releasepage	= f2fs_release_page,
 	.direct_IO	= f2fs_direct_IO,
 	.bmap		= f2fs_bmap,
 };
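Note on the data.c conversion above: every read and write path now packs its I/O parameters into one struct f2fs_io_info and hands that to the submission helpers, instead of passing blk_addr and rw flags as loose arguments. A minimal sketch of the resulting caller shape, assuming only the struct layout from the f2fs.h hunk further down (the wrapper function itself is illustrative, not part of the patch):

    static int read_one_data_block(struct f2fs_sb_info *sbi,
                                   struct page *page, block_t blkaddr)
    {
            struct f2fs_io_info fio = {
                    .type = DATA,           /* page type, used for bio merging */
                    .rw = READ_SYNC,        /* request flags */
                    .blk_addr = blkaddr,    /* address now rides inside fio */
            };

            /* one descriptor pointer instead of (page, blk_addr, rw) triples */
            return f2fs_submit_page_bio(sbi, page, &fio);
    }

Carrying the block address inside fio is also what lets do_write_data_page() above drop its old_blkaddr/new_blkaddr locals.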
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 91e8f699ab30..e671373cc8ab 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -40,6 +40,7 @@ static void update_general_status(struct f2fs_sb_info *sbi)
 	si->ndirty_dirs = sbi->n_dirty_dirs;
 	si->ndirty_meta = get_pages(sbi, F2FS_DIRTY_META);
 	si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES);
+	si->wb_pages = get_pages(sbi, F2FS_WRITEBACK);
 	si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg;
 	si->rsvd_segs = reserved_segments(sbi);
 	si->overp_segs = overprovision_segments(sbi);
@@ -57,7 +58,9 @@ static void update_general_status(struct f2fs_sb_info *sbi)
 	si->node_pages = NODE_MAPPING(sbi)->nrpages;
 	si->meta_pages = META_MAPPING(sbi)->nrpages;
 	si->nats = NM_I(sbi)->nat_cnt;
-	si->sits = SIT_I(sbi)->dirty_sentries;
+	si->dirty_nats = NM_I(sbi)->dirty_nat_cnt;
+	si->sits = MAIN_SEGS(sbi);
+	si->dirty_sits = SIT_I(sbi)->dirty_sentries;
 	si->fnids = NM_I(sbi)->fcnt;
 	si->bg_gc = sbi->bg_gc;
 	si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg)
@@ -79,6 +82,8 @@ static void update_general_status(struct f2fs_sb_info *sbi)
 		si->segment_count[i] = sbi->segment_count[i];
 		si->block_count[i] = sbi->block_count[i];
 	}
+
+	si->inplace_count = atomic_read(&sbi->inplace_count);
 }
 
 /*
@@ -137,6 +142,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
 	si->base_mem += MAIN_SEGS(sbi) * sizeof(struct seg_entry);
 	si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi));
 	si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi);
+	si->base_mem += SIT_VBLOCK_MAP_SIZE;
 	if (sbi->segs_per_sec > 1)
 		si->base_mem += MAIN_SECS(sbi) * sizeof(struct sec_entry);
 	si->base_mem += __bitmap_size(sbi, SIT_BITMAP);
@@ -159,20 +165,32 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
 	si->base_mem += sizeof(struct f2fs_nm_info);
 	si->base_mem += __bitmap_size(sbi, NAT_BITMAP);
 
+get_cache:
+	si->cache_mem = 0;
+
 	/* build gc */
-	si->base_mem += sizeof(struct f2fs_gc_kthread);
+	if (sbi->gc_thread)
+		si->cache_mem += sizeof(struct f2fs_gc_kthread);
+
+	/* build merge flush thread */
+	if (SM_I(sbi)->cmd_control_info)
+		si->cache_mem += sizeof(struct flush_cmd_control);
 
-get_cache:
 	/* free nids */
-	si->cache_mem = NM_I(sbi)->fcnt;
-	si->cache_mem += NM_I(sbi)->nat_cnt;
-	npages = NODE_MAPPING(sbi)->nrpages;
-	si->cache_mem += npages << PAGE_CACHE_SHIFT;
-	npages = META_MAPPING(sbi)->nrpages;
-	si->cache_mem += npages << PAGE_CACHE_SHIFT;
-	si->cache_mem += sbi->n_dirty_dirs * sizeof(struct dir_inode_entry);
+	si->cache_mem += NM_I(sbi)->fcnt * sizeof(struct free_nid);
+	si->cache_mem += NM_I(sbi)->nat_cnt * sizeof(struct nat_entry);
+	si->cache_mem += NM_I(sbi)->dirty_nat_cnt *
+					sizeof(struct nat_entry_set);
+	si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages);
+	si->cache_mem += sbi->n_dirty_dirs * sizeof(struct inode_entry);
 	for (i = 0; i <= UPDATE_INO; i++)
 		si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry);
+
+	si->page_mem = 0;
+	npages = NODE_MAPPING(sbi)->nrpages;
+	si->page_mem += npages << PAGE_CACHE_SHIFT;
+	npages = META_MAPPING(sbi)->nrpages;
+	si->page_mem += npages << PAGE_CACHE_SHIFT;
 }
 
 static int stat_show(struct seq_file *s, void *v)
@@ -250,16 +268,16 @@ static int stat_show(struct seq_file *s, void *v)
 		seq_printf(s, "\nExtent Hit Ratio: %d / %d\n",
 			   si->hit_ext, si->total_ext);
 		seq_puts(s, "\nBalancing F2FS Async:\n");
-		seq_printf(s, "  - inmem: %4d\n",
-			   si->inmem_pages);
+		seq_printf(s, "  - inmem: %4d, wb: %4d\n",
+			   si->inmem_pages, si->wb_pages);
 		seq_printf(s, "  - nodes: %4d in %4d\n",
 			   si->ndirty_node, si->node_pages);
 		seq_printf(s, "  - dents: %4d in dirs:%4d\n",
 			   si->ndirty_dent, si->ndirty_dirs);
 		seq_printf(s, "  - meta: %4d in %4d\n",
 			   si->ndirty_meta, si->meta_pages);
-		seq_printf(s, "  - NATs: %9d\n  - SITs: %9d\n",
-			   si->nats, si->sits);
+		seq_printf(s, "  - NATs: %9d/%9d\n  - SITs: %9d/%9d\n",
+			   si->dirty_nats, si->nats, si->dirty_sits, si->sits);
 		seq_printf(s, "  - free_nids: %9d\n",
 			   si->fnids);
 		seq_puts(s, "\nDistribution of User Blocks:");
@@ -277,6 +295,7 @@ static int stat_show(struct seq_file *s, void *v)
 		for (j = 0; j < si->util_free; j++)
 			seq_putc(s, '-');
 		seq_puts(s, "]\n\n");
+		seq_printf(s, "IPU: %u blocks\n", si->inplace_count);
 		seq_printf(s, "SSR: %u blocks in %u segments\n",
 			   si->block_count[SSR], si->segment_count[SSR]);
 		seq_printf(s, "LFS: %u blocks in %u segments\n",
@@ -289,9 +308,14 @@ static int stat_show(struct seq_file *s, void *v)
 
 		/* memory footprint */
 		update_mem_info(si->sbi);
-		seq_printf(s, "\nMemory: %u KB = static: %u + cached: %u\n",
-			   (si->base_mem + si->cache_mem) >> 10,
-			   si->base_mem >> 10, si->cache_mem >> 10);
+		seq_printf(s, "\nMemory: %u KB\n",
+			(si->base_mem + si->cache_mem + si->page_mem) >> 10);
+		seq_printf(s, "  - static: %u KB\n",
+				si->base_mem >> 10);
+		seq_printf(s, "  - cached: %u KB\n",
+				si->cache_mem >> 10);
+		seq_printf(s, "  - paged : %u KB\n",
+				si->page_mem >> 10);
 	}
 	mutex_unlock(&f2fs_stat_mutex);
 	return 0;
@@ -331,6 +355,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
 
 	atomic_set(&sbi->inline_inode, 0);
 	atomic_set(&sbi->inline_dir, 0);
+	atomic_set(&sbi->inplace_count, 0);
 
 	mutex_lock(&f2fs_stat_mutex);
 	list_add_tail(&si->stat_list, &f2fs_stat_list);
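With the update_mem_info() and stat_show() changes above, the memory footprint in the f2fs status file splits into three buckets instead of one static/cached pair. Output should now look roughly like this (values invented purely for illustration):

    Memory: 1284 KB
      - static: 1024 KB
      - cached:   68 KB
      - paged :  192 KB

static counts fixed mount-time metadata, cached counts slab objects such as nat_entry and free_nid (now weighted by their actual sizes rather than raw counts), and paged counts node/meta page-cache pages, which previously inflated the cached figure.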
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index b1a7d5737cd0..b74097a7f6d9 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -286,8 +286,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
 	f2fs_wait_on_page_writeback(page, type);
 	de->ino = cpu_to_le32(inode->i_ino);
 	set_de_type(de, inode);
-	if (!f2fs_has_inline_dentry(dir))
-		kunmap(page);
+	f2fs_dentry_kunmap(dir, page);
 	set_page_dirty(page);
 	dir->i_mtime = dir->i_ctime = CURRENT_TIME;
 	mark_inode_dirty(dir);
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index ec58bb2373fc..7fa3313ab0e2 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -28,7 +28,7 @@
 	do {								\
 		if (unlikely(condition)) {				\
 			WARN_ON(1);					\
-			sbi->need_fsck = true;				\
+			set_sbi_flag(sbi, SBI_NEED_FSCK);		\
 		}							\
 	} while (0)
 #define f2fs_down_write(x, y)	down_write(x)
@@ -100,10 +100,15 @@ enum {
 
 enum {
 	CP_UMOUNT,
+	CP_FASTBOOT,
 	CP_SYNC,
 	CP_DISCARD,
 };
 
+#define DEF_BATCHED_TRIM_SECTIONS	32
+#define BATCHED_TRIM_SEGMENTS(sbi)	\
+		(SM_I(sbi)->trim_sections * (sbi)->segs_per_sec)
+
 struct cp_control {
 	int reason;
 	__u64 trim_start;
@@ -136,8 +141,14 @@ struct ino_entry {
 	nid_t ino;		/* inode number */
 };
 
-/* for the list of directory inodes */
-struct dir_inode_entry {
+/*
+ * for the list of directory inodes or gc inodes.
+ * NOTE: there are two slab users for this structure, if we add/modify/delete
+ * fields in structure for one of slab users, it may affect fields or size of
+ * other one, in this condition, it's better to split both of slab and related
+ * data structure.
+ */
+struct inode_entry {
 	struct list_head list;	/* list head */
 	struct inode *inode;	/* vfs inode pointer */
 };
@@ -196,11 +207,14 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size,
  */
 #define F2FS_IOC_GETFLAGS		FS_IOC_GETFLAGS
 #define F2FS_IOC_SETFLAGS		FS_IOC_SETFLAGS
+#define F2FS_IOC_GETVERSION		FS_IOC_GETVERSION
 
 #define F2FS_IOCTL_MAGIC		0xf5
 #define F2FS_IOC_START_ATOMIC_WRITE	_IO(F2FS_IOCTL_MAGIC, 1)
 #define F2FS_IOC_COMMIT_ATOMIC_WRITE	_IO(F2FS_IOCTL_MAGIC, 2)
 #define F2FS_IOC_START_VOLATILE_WRITE	_IO(F2FS_IOCTL_MAGIC, 3)
+#define F2FS_IOC_RELEASE_VOLATILE_WRITE	_IO(F2FS_IOCTL_MAGIC, 4)
+#define F2FS_IOC_ABORT_VOLATILE_WRITE	_IO(F2FS_IOCTL_MAGIC, 5)
 
 #if defined(__KERNEL__) && defined(CONFIG_COMPAT)
 /*
@@ -295,7 +309,7 @@ struct f2fs_inode_info {
 	nid_t i_xattr_nid;		/* node id that contains xattrs */
 	unsigned long long xattr_ver;	/* cp version of xattr modification */
 	struct extent_info ext;		/* in-memory extent cache entry */
-	struct dir_inode_entry *dirty_dir;	/* the pointer of dirty dir */
+	struct inode_entry *dirty_dir;	/* the pointer of dirty dir */
 
 	struct radix_tree_root inmem_root;	/* radix tree for inmem pages */
 	struct list_head inmem_pages;	/* inmemory pages managed by f2fs */
@@ -398,7 +412,8 @@ enum {
 	CURSEG_HOT_NODE,	/* direct node blocks of directory files */
 	CURSEG_WARM_NODE,	/* direct node blocks of normal files */
 	CURSEG_COLD_NODE,	/* indirect node blocks */
-	NO_CHECK_TYPE
+	NO_CHECK_TYPE,
+	CURSEG_DIRECT_IO,	/* to use for the direct IO path */
 };
 
 struct flush_cmd {
@@ -437,6 +452,9 @@ struct f2fs_sm_info {
 	int nr_discards;			/* # of discards in the list */
 	int max_discards;			/* max. discards to be issued */
 
+	/* for batched trimming */
+	unsigned int trim_sections;		/* # of sections to trim */
+
 	struct list_head sit_entry_set;	/* sit entry set list */
 
 	unsigned int ipu_policy;	/* in-place-update policy */
@@ -489,6 +507,7 @@ enum page_type {
 struct f2fs_io_info {
 	enum page_type type;	/* contains DATA/NODE/META/META_FLUSH */
 	int rw;			/* contains R/RS/W/WS with REQ_META/REQ_PRIO */
+	block_t blk_addr;	/* block address to be written */
 };
 
 #define is_read_io(rw)	(((rw) & 1) == READ)
@@ -508,13 +527,20 @@ struct inode_management {
 	unsigned long ino_num;			/* number of entries */
 };
 
+/* For s_flag in struct f2fs_sb_info */
+enum {
+	SBI_IS_DIRTY,			/* dirty flag for checkpoint */
+	SBI_IS_CLOSE,			/* specify unmounting */
+	SBI_NEED_FSCK,			/* need fsck.f2fs to fix */
+	SBI_POR_DOING,			/* recovery is doing or not */
+};
+
 struct f2fs_sb_info {
 	struct super_block *sb;			/* pointer to VFS super block */
 	struct proc_dir_entry *s_proc;		/* proc entry */
 	struct buffer_head *raw_super_buf;	/* buffer head of raw sb */
 	struct f2fs_super_block *raw_super;	/* raw super block pointer */
-	int s_dirty;				/* dirty flag for checkpoint */
-	bool need_fsck;				/* need fsck.f2fs to fix */
+	int s_flag;				/* flags for sbi */
 
 	/* for node-related operations */
 	struct f2fs_nm_info *nm_info;		/* node manager */
@@ -534,7 +560,6 @@ struct f2fs_sb_info {
 	struct rw_semaphore cp_rwsem;		/* blocking FS operations */
 	struct rw_semaphore node_write;		/* locking node writes */
 	struct mutex writepages;		/* mutex for writepages() */
-	bool por_doing;				/* recovery is doing or not */
 	wait_queue_head_t cp_wait;
 
 	struct inode_management im[MAX_INO_ENTRY];	/* manage inode cache */
@@ -589,6 +614,7 @@ struct f2fs_sb_info {
 	struct f2fs_stat_info *stat_info;	/* FS status information */
 	unsigned int segment_count[2];		/* # of allocated segments */
 	unsigned int block_count[2];		/* # of allocated blocks */
+	atomic_t inplace_count;			/* # of inplace update */
 	int total_hit_ext, read_hit_ext;	/* extent cache hit ratio */
 	atomic_t inline_inode;			/* # of inline_data inodes */
 	atomic_t inline_dir;			/* # of inline_dentry inodes */
@@ -686,14 +712,19 @@ static inline struct address_space *NODE_MAPPING(struct f2fs_sb_info *sbi)
 	return sbi->node_inode->i_mapping;
 }
 
-static inline void F2FS_SET_SB_DIRT(struct f2fs_sb_info *sbi)
+static inline bool is_sbi_flag_set(struct f2fs_sb_info *sbi, unsigned int type)
 {
-	sbi->s_dirty = 1;
+	return sbi->s_flag & (0x01 << type);
 }
 
-static inline void F2FS_RESET_SB_DIRT(struct f2fs_sb_info *sbi)
+static inline void set_sbi_flag(struct f2fs_sb_info *sbi, unsigned int type)
 {
-	sbi->s_dirty = 0;
+	sbi->s_flag |= (0x01 << type);
+}
+
+static inline void clear_sbi_flag(struct f2fs_sb_info *sbi, unsigned int type)
+{
+	sbi->s_flag &= ~(0x01 << type);
 }
 
 static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp)
@@ -741,6 +772,28 @@ static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi)
 	up_write(&sbi->cp_rwsem);
 }
 
+static inline int __get_cp_reason(struct f2fs_sb_info *sbi)
+{
+	int reason = CP_SYNC;
+
+	if (test_opt(sbi, FASTBOOT))
+		reason = CP_FASTBOOT;
+	if (is_sbi_flag_set(sbi, SBI_IS_CLOSE))
+		reason = CP_UMOUNT;
+	return reason;
+}
+
+static inline bool __remain_node_summaries(int reason)
+{
+	return (reason == CP_UMOUNT || reason == CP_FASTBOOT);
+}
+
+static inline bool __exist_node_summaries(struct f2fs_sb_info *sbi)
+{
+	return (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG) ||
+			is_set_ckpt_flags(F2FS_CKPT(sbi), CP_FASTBOOT_FLAG));
+}
+
 /*
  * Check whether the given nid is within node id range.
  */
@@ -805,7 +858,7 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi,
 static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type)
 {
 	atomic_inc(&sbi->nr_pages[count_type]);
-	F2FS_SET_SB_DIRT(sbi);
+	set_sbi_flag(sbi, SBI_IS_DIRTY);
 }
 
 static inline void inode_inc_dirty_pages(struct inode *inode)
@@ -1113,6 +1166,7 @@ enum {
 	FI_NEED_IPU,		/* used for ipu per file */
 	FI_ATOMIC_FILE,		/* indicate atomic file */
 	FI_VOLATILE_FILE,	/* indicate volatile file */
+	FI_DROP_CACHE,		/* drop dirty page cache */
 	FI_DATA_EXIST,		/* indicate data exists */
 };
 
@@ -1220,6 +1274,11 @@ static inline bool f2fs_is_volatile_file(struct inode *inode)
 	return is_inode_flag_set(F2FS_I(inode), FI_VOLATILE_FILE);
 }
 
+static inline bool f2fs_is_drop_cache(struct inode *inode)
+{
+	return is_inode_flag_set(F2FS_I(inode), FI_DROP_CACHE);
+}
+
 static inline void *inline_data_addr(struct page *page)
 {
 	struct f2fs_inode *ri = F2FS_INODE(page);
@@ -1389,7 +1448,6 @@ void destroy_node_manager_caches(void);
  * segment.c
  */
 void register_inmem_page(struct inode *, struct page *);
-void invalidate_inmem_page(struct inode *, struct page *);
 void commit_inmem_pages(struct inode *, bool);
 void f2fs_balance_fs(struct f2fs_sb_info *);
 void f2fs_balance_fs_bg(struct f2fs_sb_info *);
@@ -1401,16 +1459,16 @@ void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t);
 void clear_prefree_segments(struct f2fs_sb_info *);
 void release_discard_addrs(struct f2fs_sb_info *);
 void discard_next_dnode(struct f2fs_sb_info *, block_t);
-int npages_for_summary_flush(struct f2fs_sb_info *);
+int npages_for_summary_flush(struct f2fs_sb_info *, bool);
 void allocate_new_segments(struct f2fs_sb_info *);
 int f2fs_trim_fs(struct f2fs_sb_info *, struct fstrim_range *);
 struct page *get_sum_page(struct f2fs_sb_info *, unsigned int);
 void write_meta_page(struct f2fs_sb_info *, struct page *);
 void write_node_page(struct f2fs_sb_info *, struct page *,
-		struct f2fs_io_info *, unsigned int, block_t, block_t *);
-void write_data_page(struct page *, struct dnode_of_data *, block_t *,
+		unsigned int, struct f2fs_io_info *);
+void write_data_page(struct page *, struct dnode_of_data *,
 		struct f2fs_io_info *);
-void rewrite_data_page(struct page *, block_t, struct f2fs_io_info *);
+void rewrite_data_page(struct page *, struct f2fs_io_info *);
 void recover_data_page(struct f2fs_sb_info *, struct page *,
 		struct f2fs_summary *, block_t, block_t);
 void allocate_data_block(struct f2fs_sb_info *, struct page *,
@@ -1457,17 +1515,20 @@ void destroy_checkpoint_caches(void);
  * data.c
  */
 void f2fs_submit_merged_bio(struct f2fs_sb_info *, enum page_type, int);
-int f2fs_submit_page_bio(struct f2fs_sb_info *, struct page *, block_t, int);
-void f2fs_submit_page_mbio(struct f2fs_sb_info *, struct page *, block_t,
+int f2fs_submit_page_bio(struct f2fs_sb_info *, struct page *,
+						struct f2fs_io_info *);
+void f2fs_submit_page_mbio(struct f2fs_sb_info *, struct page *,
 		struct f2fs_io_info *);
 int reserve_new_block(struct dnode_of_data *);
 int f2fs_reserve_block(struct dnode_of_data *, pgoff_t);
-void update_extent_cache(block_t, struct dnode_of_data *);
+void update_extent_cache(struct dnode_of_data *);
 struct page *find_data_page(struct inode *, pgoff_t, bool);
 struct page *get_lock_data_page(struct inode *, pgoff_t);
 struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool);
 int do_write_data_page(struct page *, struct f2fs_io_info *);
 int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64);
+void f2fs_invalidate_page(struct page *, unsigned int, unsigned int);
+int f2fs_release_page(struct page *, gfp_t);
 
 /*
  * gc.c
@@ -1477,8 +1538,6 @@ void stop_gc_thread(struct f2fs_sb_info *);
 block_t start_bidx_of_node(unsigned int, struct f2fs_inode_info *);
 int f2fs_gc(struct f2fs_sb_info *);
 void build_gc_manager(struct f2fs_sb_info *);
-int __init create_gc_caches(void);
-void destroy_gc_caches(void);
 
 /*
  * recovery.c
@@ -1497,9 +1556,9 @@ struct f2fs_stat_info {
 	int main_area_segs, main_area_sections, main_area_zones;
 	int hit_ext, total_ext;
 	int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta;
-	int nats, sits, fnids;
+	int nats, dirty_nats, sits, dirty_sits, fnids;
 	int total_count, utilization;
-	int bg_gc, inline_inode, inline_dir, inmem_pages;
+	int bg_gc, inline_inode, inline_dir, inmem_pages, wb_pages;
 	unsigned int valid_count, valid_node_count, valid_inode_count;
 	unsigned int bimodal, avg_vblocks;
 	int util_free, util_valid, util_invalid;
@@ -1514,7 +1573,8 @@ struct f2fs_stat_info {
 
 	unsigned int segment_count[2];
 	unsigned int block_count[2];
-	unsigned base_mem, cache_mem;
+	unsigned int inplace_count;
+	unsigned base_mem, cache_mem, page_mem;
 };
 
 static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
@@ -1553,7 +1613,8 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
 		((sbi)->segment_count[(curseg)->alloc_type]++)
 #define stat_inc_block_count(sbi, curseg)				\
 		((sbi)->block_count[(curseg)->alloc_type]++)
-
+#define stat_inc_inplace_blocks(sbi)					\
+		(atomic_inc(&(sbi)->inplace_count))
 #define stat_inc_seg_count(sbi, type)					\
 	do {								\
 		struct f2fs_stat_info *si = F2FS_STAT(sbi);		\
@@ -1599,6 +1660,7 @@ void f2fs_destroy_root_stats(void);
 #define stat_dec_inline_dir(inode)
 #define stat_inc_seg_type(sbi, curseg)
 #define stat_inc_block_count(sbi, curseg)
+#define stat_inc_inplace_blocks(sbi)
 #define stat_inc_seg_count(si, type)
 #define stat_inc_tot_blk_count(si, blks)
 #define stat_inc_data_blk_count(si, blks)
@@ -1619,6 +1681,7 @@ extern const struct address_space_operations f2fs_meta_aops;
 extern const struct inode_operations f2fs_dir_inode_operations;
 extern const struct inode_operations f2fs_symlink_inode_operations;
 extern const struct inode_operations f2fs_special_inode_operations;
+extern struct kmem_cache *inode_entry_slab;
 
 /*
  * inline.c
@@ -1629,7 +1692,6 @@ int f2fs_read_inline_data(struct inode *, struct page *);
 int f2fs_convert_inline_page(struct dnode_of_data *, struct page *);
 int f2fs_convert_inline_inode(struct inode *);
 int f2fs_write_inline_data(struct inode *, struct page *);
-void truncate_inline_data(struct page *, u64);
 bool recover_inline_data(struct inode *, struct page *);
 struct f2fs_dir_entry *find_in_inline_dir(struct inode *, struct qstr *,
 							struct page **);
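The f2fs.h hunks above replace the s_dirty and need_fsck booleans with a single s_flag bit field driven by the SBI_* enum. A standalone model of the three helpers, compilable outside the kernel (the sb_model struct is a stand-in for struct f2fs_sb_info; everything else mirrors the patch):

    #include <assert.h>

    enum { SBI_IS_DIRTY, SBI_IS_CLOSE, SBI_NEED_FSCK, SBI_POR_DOING };

    struct sb_model { int s_flag; };

    static int is_sbi_flag_set(struct sb_model *sbi, unsigned int type)
    {
            return sbi->s_flag & (0x01 << type);
    }

    static void set_sbi_flag(struct sb_model *sbi, unsigned int type)
    {
            sbi->s_flag |= (0x01 << type);
    }

    static void clear_sbi_flag(struct sb_model *sbi, unsigned int type)
    {
            sbi->s_flag &= ~(0x01 << type);
    }

    int main(void)
    {
            struct sb_model sbi = { 0 };

            set_sbi_flag(&sbi, SBI_NEED_FSCK);      /* replaces need_fsck = true */
            assert(is_sbi_flag_set(&sbi, SBI_NEED_FSCK));
            clear_sbi_flag(&sbi, SBI_NEED_FSCK);
            assert(!is_sbi_flag_set(&sbi, SBI_NEED_FSCK));
            return 0;
    }

Packing the states into one word also makes room for the new SBI_POR_DOING and SBI_IS_CLOSE bits without growing struct f2fs_sb_info per flag.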
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 3c27e0ecb3bc..98dac27bc3f7 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -26,6 +26,7 @@
 #include "segment.h"
 #include "xattr.h"
 #include "acl.h"
+#include "trace.h"
 #include <trace/events/f2fs.h>
 
 static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
@@ -92,7 +93,6 @@ static const struct vm_operations_struct f2fs_file_vm_ops = {
 	.fault		= filemap_fault,
 	.map_pages	= filemap_map_pages,
 	.page_mkwrite	= f2fs_vm_page_mkwrite,
-	.remap_pages	= generic_file_remap_pages,
 };
 
 static int get_parent_ino(struct inode *inode, nid_t *pino)
@@ -246,6 +246,10 @@ go_write:
 sync_nodes:
 	sync_node_pages(sbi, ino, &wbc);
 
+	/* if cp_error was enabled, we should avoid infinite loop */
+	if (unlikely(f2fs_cp_error(sbi)))
+		goto out;
+
 	if (need_inode_block_update(sbi, ino)) {
 		mark_inode_dirty_sync(inode);
 		f2fs_write_inode(inode, NULL);
@@ -265,6 +269,7 @@ flush_out:
 		ret = f2fs_issue_flush(sbi);
 out:
 	trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
+	f2fs_trace_ios(NULL, NULL, 1);
 	return ret;
 }
 
@@ -351,7 +356,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
 		/* find data/hole in dnode block */
 		for (; dn.ofs_in_node < end_offset;
 				dn.ofs_in_node++, pgofs++,
-				data_ofs = pgofs << PAGE_CACHE_SHIFT) {
+				data_ofs = (loff_t)pgofs << PAGE_CACHE_SHIFT) {
 			block_t blkaddr;
 			blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
 
@@ -427,7 +432,8 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
 		if (blkaddr == NULL_ADDR)
 			continue;
 
-		update_extent_cache(NULL_ADDR, dn);
+		dn->data_blkaddr = NULL_ADDR;
+		update_extent_cache(dn);
 		invalidate_blocks(sbi, blkaddr);
 		nr_free++;
 	}
@@ -484,8 +490,7 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock)
 
 	trace_f2fs_truncate_blocks_enter(inode, from);
 
-	free_from = (pgoff_t)
-		((from + blocksize - 1) >> (sbi->log_blocksize));
+	free_from = (pgoff_t)F2FS_BYTES_TO_BLK(from + blocksize - 1);
 
 	if (lock)
 		f2fs_lock_op(sbi);
@@ -836,6 +841,19 @@ static long f2fs_fallocate(struct file *file, int mode,
 	return ret;
 }
 
+static int f2fs_release_file(struct inode *inode, struct file *filp)
+{
+	/* some remained atomic pages should discarded */
+	if (f2fs_is_atomic_file(inode))
+		commit_inmem_pages(inode, true);
+	if (f2fs_is_volatile_file(inode)) {
+		set_inode_flag(F2FS_I(inode), FI_DROP_CACHE);
+		filemap_fdatawrite(inode->i_mapping);
+		clear_inode_flag(F2FS_I(inode), FI_DROP_CACHE);
+	}
+	return 0;
+}
+
 #define F2FS_REG_FLMASK		(~(FS_DIRSYNC_FL | FS_TOPDIR_FL))
 #define F2FS_OTHER_FLMASK	(FS_NODUMP_FL | FS_NOATIME_FL)
 
@@ -906,29 +924,30 @@ out:
 	return ret;
 }
 
+static int f2fs_ioc_getversion(struct file *filp, unsigned long arg)
+{
+	struct inode *inode = file_inode(filp);
+
+	return put_user(inode->i_generation, (int __user *)arg);
+}
+
 static int f2fs_ioc_start_atomic_write(struct file *filp)
 {
 	struct inode *inode = file_inode(filp);
-	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 
 	if (!inode_owner_or_capable(inode))
 		return -EACCES;
 
-	f2fs_balance_fs(sbi);
+	f2fs_balance_fs(F2FS_I_SB(inode));
+
+	if (f2fs_is_atomic_file(inode))
+		return 0;
 
 	set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
 
 	return f2fs_convert_inline_inode(inode);
 }
 
-static int f2fs_release_file(struct inode *inode, struct file *filp)
-{
-	/* some remained atomic pages should discarded */
-	if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode))
-		commit_inmem_pages(inode, true);
-	return 0;
-}
-
 static int f2fs_ioc_commit_atomic_write(struct file *filp)
 {
 	struct inode *inode = file_inode(filp);
@@ -949,6 +968,7 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp)
 
 	ret = f2fs_sync_file(filp, 0, LONG_MAX, 0);
 	mnt_drop_write_file(filp);
+	clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
 	return ret;
 }
 
@@ -959,11 +979,56 @@ static int f2fs_ioc_start_volatile_write(struct file *filp)
 	if (!inode_owner_or_capable(inode))
 		return -EACCES;
 
+	if (f2fs_is_volatile_file(inode))
+		return 0;
+
 	set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
 
 	return f2fs_convert_inline_inode(inode);
 }
 
+static int f2fs_ioc_release_volatile_write(struct file *filp)
+{
+	struct inode *inode = file_inode(filp);
+
+	if (!inode_owner_or_capable(inode))
+		return -EACCES;
+
+	if (!f2fs_is_volatile_file(inode))
+		return 0;
+
+	punch_hole(inode, 0, F2FS_BLKSIZE);
+	return 0;
+}
+
+static int f2fs_ioc_abort_volatile_write(struct file *filp)
+{
+	struct inode *inode = file_inode(filp);
+	int ret;
+
+	if (!inode_owner_or_capable(inode))
+		return -EACCES;
+
+	ret = mnt_want_write_file(filp);
+	if (ret)
+		return ret;
+
+	f2fs_balance_fs(F2FS_I_SB(inode));
+
+	if (f2fs_is_atomic_file(inode)) {
+		commit_inmem_pages(inode, false);
+		clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
+	}
+
+	if (f2fs_is_volatile_file(inode)) {
+		clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
+		filemap_fdatawrite(inode->i_mapping);
+		set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
+	}
+	mnt_drop_write_file(filp);
+	return ret;
+}
+
 static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg)
 {
 	struct inode *inode = file_inode(filp);
@@ -1001,12 +1066,18 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		return f2fs_ioc_getflags(filp, arg);
 	case F2FS_IOC_SETFLAGS:
 		return f2fs_ioc_setflags(filp, arg);
+	case F2FS_IOC_GETVERSION:
+		return f2fs_ioc_getversion(filp, arg);
 	case F2FS_IOC_START_ATOMIC_WRITE:
 		return f2fs_ioc_start_atomic_write(filp);
 	case F2FS_IOC_COMMIT_ATOMIC_WRITE:
 		return f2fs_ioc_commit_atomic_write(filp);
 	case F2FS_IOC_START_VOLATILE_WRITE:
 		return f2fs_ioc_start_volatile_write(filp);
+	case F2FS_IOC_RELEASE_VOLATILE_WRITE:
+		return f2fs_ioc_release_volatile_write(filp);
+	case F2FS_IOC_ABORT_VOLATILE_WRITE:
+		return f2fs_ioc_abort_volatile_write(filp);
 	case FITRIM:
 		return f2fs_ioc_fitrim(filp, arg);
 	default:
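The file.c hunks above wire up three new ioctls. A hypothetical userspace sequence for a volatile write region (the ioctl numbers are copied from the f2fs.h hunk earlier in this diff; the function, its semantics comments, and the error handling are illustrative, not part of the patch):

    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>

    #define F2FS_IOCTL_MAGIC                0xf5
    #define F2FS_IOC_START_VOLATILE_WRITE   _IO(F2FS_IOCTL_MAGIC, 3)
    #define F2FS_IOC_RELEASE_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 4)
    #define F2FS_IOC_ABORT_VOLATILE_WRITE   _IO(F2FS_IOCTL_MAGIC, 5)

    int volatile_write(const char *path, const void *buf, size_t len)
    {
            int fd = open(path, O_RDWR);

            if (fd < 0)
                    return -1;
            /* writes after this point may stay in page cache only */
            ioctl(fd, F2FS_IOC_START_VOLATILE_WRITE);
            write(fd, buf, len);
            /* discard the volatile region; ABORT instead flushes it out */
            ioctl(fd, F2FS_IOC_RELEASE_VOLATILE_WRITE);
            return close(fd);
    }

Per the kernel side above, RELEASE punches a hole over the volatile data, while ABORT commits any in-memory atomic pages and forces writeback of the volatile pages.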
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index eec0933a4819..76adbc3641f1 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -24,8 +24,6 @@
 #include "gc.h"
 #include <trace/events/f2fs.h>
 
-static struct kmem_cache *winode_slab;
-
 static int gc_thread_func(void *data)
 {
 	struct f2fs_sb_info *sbi = data;
@@ -46,7 +44,7 @@ static int gc_thread_func(void *data)
 			break;
 
 		if (sbi->sb->s_writers.frozen >= SB_FREEZE_WRITE) {
-			wait_ms = increase_sleep_time(gc_th, wait_ms);
+			increase_sleep_time(gc_th, &wait_ms);
 			continue;
 		}
 
@@ -67,15 +65,15 @@ static int gc_thread_func(void *data)
 			continue;
 
 		if (!is_idle(sbi)) {
-			wait_ms = increase_sleep_time(gc_th, wait_ms);
+			increase_sleep_time(gc_th, &wait_ms);
 			mutex_unlock(&sbi->gc_mutex);
 			continue;
 		}
 
 		if (has_enough_invalid_blocks(sbi))
-			wait_ms = decrease_sleep_time(gc_th, wait_ms);
+			decrease_sleep_time(gc_th, &wait_ms);
 		else
-			wait_ms = increase_sleep_time(gc_th, wait_ms);
+			increase_sleep_time(gc_th, &wait_ms);
 
 		stat_inc_bggc_count(sbi);
 
@@ -356,13 +354,10 @@ static void add_gc_inode(struct gc_inode_list *gc_list, struct inode *inode)
 		iput(inode);
 		return;
 	}
-	new_ie = f2fs_kmem_cache_alloc(winode_slab, GFP_NOFS);
+	new_ie = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
 	new_ie->inode = inode;
-retry:
-	if (radix_tree_insert(&gc_list->iroot, inode->i_ino, new_ie)) {
-		cond_resched();
-		goto retry;
-	}
+
+	f2fs_radix_tree_insert(&gc_list->iroot, inode->i_ino, new_ie);
 	list_add_tail(&new_ie->list, &gc_list->ilist);
 }
 
@@ -373,7 +368,7 @@ static void put_gc_inode(struct gc_inode_list *gc_list)
 		radix_tree_delete(&gc_list->iroot, ie->inode->i_ino);
 		iput(ie->inode);
 		list_del(&ie->list);
-		kmem_cache_free(winode_slab, ie);
+		kmem_cache_free(inode_entry_slab, ie);
 	}
 }
 
@@ -703,8 +698,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi)
 		.iroot = RADIX_TREE_INIT(GFP_NOFS),
 	};
 
-	cpc.reason = test_opt(sbi, FASTBOOT) ? CP_UMOUNT : CP_SYNC;
-
+	cpc.reason = __get_cp_reason(sbi);
 gc_more:
 	if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE)))
 		goto stop;
@@ -750,17 +744,3 @@ void build_gc_manager(struct f2fs_sb_info *sbi)
 {
 	DIRTY_I(sbi)->v_ops = &default_v_ops;
 }
-
-int __init create_gc_caches(void)
-{
-	winode_slab = f2fs_kmem_cache_create("f2fs_gc_inodes",
-				sizeof(struct inode_entry));
-	if (!winode_slab)
-		return -ENOMEM;
-	return 0;
-}
-
-void destroy_gc_caches(void)
-{
-	kmem_cache_destroy(winode_slab);
-}
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
index 6ff7ad38463e..b4a65be9f7d3 100644
--- a/fs/f2fs/gc.h
+++ b/fs/f2fs/gc.h
@@ -35,11 +35,6 @@ struct f2fs_gc_kthread {
 	unsigned int gc_idle;
 };
 
-struct inode_entry {
-	struct list_head list;
-	struct inode *inode;
-};
-
 struct gc_inode_list {
 	struct list_head ilist;
 	struct radix_tree_root iroot;
@@ -69,26 +64,26 @@ static inline block_t limit_free_user_blocks(struct f2fs_sb_info *sbi)
 	return (long)(reclaimable_user_blocks * LIMIT_FREE_BLOCK) / 100;
 }
 
-static inline long increase_sleep_time(struct f2fs_gc_kthread *gc_th, long wait)
+static inline void increase_sleep_time(struct f2fs_gc_kthread *gc_th,
+							long *wait)
 {
-	if (wait == gc_th->no_gc_sleep_time)
-		return wait;
+	if (*wait == gc_th->no_gc_sleep_time)
+		return;
 
-	wait += gc_th->min_sleep_time;
-	if (wait > gc_th->max_sleep_time)
-		wait = gc_th->max_sleep_time;
-	return wait;
+	*wait += gc_th->min_sleep_time;
+	if (*wait > gc_th->max_sleep_time)
+		*wait = gc_th->max_sleep_time;
 }
 
-static inline long decrease_sleep_time(struct f2fs_gc_kthread *gc_th, long wait)
+static inline void decrease_sleep_time(struct f2fs_gc_kthread *gc_th,
+							long *wait)
 {
-	if (wait == gc_th->no_gc_sleep_time)
-		wait = gc_th->max_sleep_time;
+	if (*wait == gc_th->no_gc_sleep_time)
+		*wait = gc_th->max_sleep_time;
 
-	wait -= gc_th->min_sleep_time;
-	if (wait <= gc_th->min_sleep_time)
-		wait = gc_th->min_sleep_time;
-	return wait;
+	*wait -= gc_th->min_sleep_time;
+	if (*wait <= gc_th->min_sleep_time)
+		*wait = gc_th->min_sleep_time;
 }
 
 static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi)
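The gc.h refactor above switches the sleep-time helpers from returning the new wait value to updating it through a pointer, so a call site can no longer silently drop the result (the old wait_ms = increase_sleep_time(...) pattern in gc.c). A compilable toy model of the refactored shape, using made-up timing constants that mirror the millisecond defaults:

    #include <stdio.h>

    struct gc_times { long min, max, no_gc; };  /* stand-in for f2fs_gc_kthread */

    static void increase_sleep_time(const struct gc_times *t, long *wait)
    {
            if (*wait == t->no_gc)
                    return;                     /* thread is parked, leave it */
            *wait += t->min;
            if (*wait > t->max)
                    *wait = t->max;
    }

    int main(void)
    {
            struct gc_times t = { .min = 30000, .max = 60000, .no_gc = 300000 };
            long wait_ms = 30000;

            increase_sleep_time(&t, &wait_ms);  /* updates in place */
            printf("wait_ms = %ld\n", wait_ms); /* prints 60000 */
            return 0;
    }

Making the helpers void also lets the no_gc_sleep_time case bail out early without inventing a sentinel return value.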
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index f2d3c581e776..1484c00133cd 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -50,6 +50,12 @@ void read_inline_data(struct page *page, struct page *ipage)
 	SetPageUptodate(page);
 }
 
+static void truncate_inline_data(struct page *ipage)
+{
+	f2fs_wait_on_page_writeback(ipage, NODE);
+	memset(inline_data_addr(ipage), 0, MAX_INLINE_DATA);
+}
+
 int f2fs_read_inline_data(struct inode *inode, struct page *page)
 {
 	struct page *ipage;
@@ -79,7 +85,6 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page)
 int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
 {
 	void *src_addr, *dst_addr;
-	block_t new_blk_addr;
 	struct f2fs_io_info fio = {
 		.type = DATA,
 		.rw = WRITE_SYNC | REQ_PRIO,
@@ -115,9 +120,9 @@ no_update:
 
 	/* write data page to try to make data consistent */
 	set_page_writeback(page);
-
-	write_data_page(page, dn, &new_blk_addr, &fio);
-	update_extent_cache(new_blk_addr, dn);
+	fio.blk_addr = dn->data_blkaddr;
+	write_data_page(page, dn, &fio);
+	update_extent_cache(dn);
 	f2fs_wait_on_page_writeback(page, DATA);
 	if (dirty)
 		inode_dec_dirty_pages(dn->inode);
@@ -126,7 +131,7 @@ no_update:
 	set_inode_flag(F2FS_I(dn->inode), FI_APPEND_WRITE);
 
 	/* clear inline data and flag after data writeback */
-	truncate_inline_data(dn->inode_page, 0);
+	truncate_inline_data(dn->inode_page);
 clear_out:
 	stat_dec_inline_inode(dn->inode);
 	f2fs_clear_inline_inode(dn->inode);
@@ -199,19 +204,6 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page)
199 return 0; 204 return 0;
200} 205}
201 206
202void truncate_inline_data(struct page *ipage, u64 from)
203{
204 void *addr;
205
206 if (from >= MAX_INLINE_DATA)
207 return;
208
209 f2fs_wait_on_page_writeback(ipage, NODE);
210
211 addr = inline_data_addr(ipage);
212 memset(addr + from, 0, MAX_INLINE_DATA - from);
213}
214
215bool recover_inline_data(struct inode *inode, struct page *npage) 207bool recover_inline_data(struct inode *inode, struct page *npage)
216{ 208{
217 struct f2fs_sb_info *sbi = F2FS_I_SB(inode); 209 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
@@ -253,7 +245,7 @@ process_inline:
 	if (f2fs_has_inline_data(inode)) {
 		ipage = get_node_page(sbi, inode->i_ino);
 		f2fs_bug_on(sbi, IS_ERR(ipage));
-		truncate_inline_data(ipage, 0);
+		truncate_inline_data(ipage);
 		f2fs_clear_inline_inode(inode);
 		update_inode(inode, ipage);
 		f2fs_put_page(ipage, 1);
@@ -371,7 +363,7 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage,
 	set_page_dirty(page);
 
 	/* clear inline dir and flag after data writeback */
-	truncate_inline_data(ipage, 0);
+	truncate_inline_data(ipage);
 
 	stat_dec_inline_dir(dir);
 	clear_inode_flag(F2FS_I(dir), FI_INLINE_DENTRY);
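Every surviving caller passed from == 0, so the u64 argument was dead weight; the helper becomes static to inline.c and always clears the whole inline area. An equivalence sketch (the size constant is illustrative, not f2fs's real MAX_INLINE_DATA):

	#include <string.h>

	#define INLINE_AREA_SKETCH 3488	/* illustrative size */

	/* old shape: zero from an offset to the end of the inline area */
	static void truncate_from(char *addr, unsigned long from)
	{
		if (from >= INLINE_AREA_SKETCH)
			return;
		memset(addr + from, 0, INLINE_AREA_SKETCH - from);
	}

	/* new shape: the only call pattern left, from == 0 */
	static void truncate_all(char *addr)
	{
		truncate_from(addr, 0);
	}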
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 196cc7843aaf..2d002e3738a7 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -67,29 +67,23 @@ static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
 	}
 }
 
-static int __recover_inline_status(struct inode *inode, struct page *ipage)
+static void __recover_inline_status(struct inode *inode, struct page *ipage)
 {
 	void *inline_data = inline_data_addr(ipage);
-	struct f2fs_inode *ri;
-	void *zbuf;
-
-	zbuf = kzalloc(MAX_INLINE_DATA, GFP_NOFS);
-	if (!zbuf)
-		return -ENOMEM;
-
-	if (!memcmp(zbuf, inline_data, MAX_INLINE_DATA)) {
-		kfree(zbuf);
-		return 0;
-	}
-	kfree(zbuf);
-
-	f2fs_wait_on_page_writeback(ipage, NODE);
-	set_inode_flag(F2FS_I(inode), FI_DATA_EXIST);
-
-	ri = F2FS_INODE(ipage);
-	set_raw_inline(F2FS_I(inode), ri);
-	set_page_dirty(ipage);
-	return 0;
+	__le32 *start = inline_data;
+	__le32 *end = start + MAX_INLINE_DATA / sizeof(__le32);
+
+	while (start < end) {
+		if (*start++) {
+			f2fs_wait_on_page_writeback(ipage, NODE);
+
+			set_inode_flag(F2FS_I(inode), FI_DATA_EXIST);
+			set_raw_inline(F2FS_I(inode), F2FS_INODE(ipage));
+			set_page_dirty(ipage);
+			return;
+		}
+	}
+	return;
 }
 
 static int do_read_inode(struct inode *inode)
@@ -98,7 +92,6 @@ static int do_read_inode(struct inode *inode)
 	struct f2fs_inode_info *fi = F2FS_I(inode);
 	struct page *node_page;
 	struct f2fs_inode *ri;
-	int err = 0;
 
 	/* Check if ino is within scope */
 	if (check_nid_range(sbi, inode->i_ino)) {
@@ -142,7 +135,7 @@ static int do_read_inode(struct inode *inode)
 
 	/* check data exist */
 	if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode))
-		err = __recover_inline_status(inode, node_page);
+		__recover_inline_status(inode, node_page);
 
 	/* get rdev by using inline_info */
 	__get_inode_rdev(inode, ri);
@@ -152,7 +145,7 @@ static int do_read_inode(struct inode *inode)
 	stat_inc_inline_inode(inode);
 	stat_inc_inline_dir(inode);
 
-	return err;
+	return 0;
 }
 
 struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
@@ -304,7 +297,7 @@ void f2fs_evict_inode(struct inode *inode)
 	nid_t xnid = F2FS_I(inode)->i_xattr_nid;
 
 	/* some remained atomic pages should discarded */
-	if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode))
+	if (f2fs_is_atomic_file(inode))
 		commit_inmem_pages(inode, true);
 
 	trace_f2fs_evict_inode(inode);
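The rewritten __recover_inline_status() drops the kzalloc()+memcmp() round trip and scans the inline area one 32-bit word at a time, which also removes the function's only failure path. The same test, restated as a standalone sketch in plain C:

	#include <stdbool.h>
	#include <stddef.h>
	#include <stdint.h>

	/* Detect whether a buffer contains any nonzero 32-bit word --
	 * the check the patch performs on the inline data area without
	 * allocating a zeroed comparison buffer. */
	static bool buffer_has_data(const void *buf, size_t len)
	{
		const uint32_t *start = buf;
		const uint32_t *end = start + len / sizeof(uint32_t);

		while (start < end)
			if (*start++)
				return true;
		return false;
	}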
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 547a2deeb1ac..e79639a9787a 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -299,7 +299,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	inode->i_op = &f2fs_dir_inode_operations;
 	inode->i_fop = &f2fs_dir_operations;
 	inode->i_mapping->a_ops = &f2fs_dblock_aops;
-	mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO);
+	mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO);
 
 	set_inode_flag(F2FS_I(inode), FI_INC_LINK);
 	f2fs_lock_op(sbi);
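The new mask presumably extends the zeroing mask with __GFP_HIGHMEM so that directory page-cache pages may be allocated from highmem. The definitions below are restated from fs/f2fs/f2fs.h as an assumption; they are not part of this hunk:

	/* Assumed definitions (not shown in this diff):
	 *   #define GFP_F2FS_ZERO      (GFP_NOFS | __GFP_ZERO)
	 *   #define GFP_F2FS_HIGH_ZERO (GFP_NOFS | __GFP_ZERO | __GFP_HIGHMEM)
	 * Directory pages are only touched through kmapped addresses,
	 * so allowing highmem for them is safe. */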
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index f83326ca32ef..97bd9d3db882 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -19,6 +19,7 @@
 #include "f2fs.h"
 #include "node.h"
 #include "segment.h"
+#include "trace.h"
 #include <trace/events/f2fs.h>
 
 #define on_build_free_nids(nmi) mutex_is_locked(&nm_i->build_lock)
@@ -57,12 +58,13 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
 	} else if (type == INO_ENTRIES) {
 		int i;
 
-		if (sbi->sb->s_bdi->dirty_exceeded)
-			return false;
 		for (i = 0; i <= UPDATE_INO; i++)
 			mem_size += (sbi->im[i].ino_num *
 				sizeof(struct ino_entry)) >> PAGE_CACHE_SHIFT;
 		res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
+	} else {
+		if (sbi->sb->s_bdi->dirty_exceeded)
+			return false;
 	}
 	return res;
 }
@@ -268,7 +270,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
 	e = __lookup_nat_cache(nm_i, ni->nid);
 	if (!e) {
 		e = grab_nat_entry(nm_i, ni->nid);
-		e->ni = *ni;
+		copy_node_info(&e->ni, ni);
 		f2fs_bug_on(sbi, ni->blk_addr == NEW_ADDR);
 	} else if (new_blkaddr == NEW_ADDR) {
 		/*
@@ -276,7 +278,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
 		 * previous nat entry can be remained in nat cache.
 		 * So, reinitialize it with new information.
 		 */
-		e->ni = *ni;
+		copy_node_info(&e->ni, ni);
 		f2fs_bug_on(sbi, ni->blk_addr != NULL_ADDR);
 	}
 
@@ -346,7 +348,6 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
 	struct nat_entry *e;
 	int i;
 
-	memset(&ne, 0, sizeof(struct f2fs_nat_entry));
 	ni->nid = nid;
 
 	/* Check nat cache */
@@ -361,6 +362,8 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
 	if (e)
 		return;
 
+	memset(&ne, 0, sizeof(struct f2fs_nat_entry));
+
 	/* Check current segment summary */
 	mutex_lock(&curseg->curseg_mutex);
 	i = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 0);
@@ -471,7 +474,7 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
 	struct page *npage[4];
-	struct page *parent;
+	struct page *parent = NULL;
 	int offset[4];
 	unsigned int noffset[4];
 	nid_t nids[4];
@@ -488,6 +491,14 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
 		if (IS_ERR(npage[0]))
 			return PTR_ERR(npage[0]);
 	}
+
+	/* if inline_data is set, should not report any block indices */
+	if (f2fs_has_inline_data(dn->inode) && index) {
+		err = -EINVAL;
+		f2fs_put_page(npage[0], 1);
+		goto release_out;
+	}
+
 	parent = npage[0];
 	if (level != 0)
 		nids[1] = get_nid(parent, offset[0], true);
@@ -585,7 +596,7 @@ static void truncate_node(struct dnode_of_data *dn)
 	}
invalidate:
 	clear_node_page_dirty(dn->node_page);
-	F2FS_SET_SB_DIRT(sbi);
+	set_sbi_flag(sbi, SBI_IS_DIRTY);
 
 	f2fs_put_page(dn->node_page, 1);
 
@@ -976,6 +987,10 @@ static int read_node_page(struct page *page, int rw)
 {
 	struct f2fs_sb_info *sbi = F2FS_P_SB(page);
 	struct node_info ni;
+	struct f2fs_io_info fio = {
+		.type = NODE,
+		.rw = rw,
+	};
 
 	get_node_info(sbi, page->index, &ni);
 
@@ -987,7 +1002,8 @@ static int read_node_page(struct page *page, int rw)
 	if (PageUptodate(page))
 		return LOCKED_PAGE;
 
-	return f2fs_submit_page_bio(sbi, page, ni.blk_addr, rw);
+	fio.blk_addr = ni.blk_addr;
+	return f2fs_submit_page_bio(sbi, page, &fio);
 }
 
 /*
@@ -1028,11 +1044,11 @@ repeat:
 	err = read_node_page(page, READ_SYNC);
 	if (err < 0)
 		return ERR_PTR(err);
-	else if (err == LOCKED_PAGE)
-		goto got_it;
+	else if (err != LOCKED_PAGE)
+		lock_page(page);
 
-	lock_page(page);
 	if (unlikely(!PageUptodate(page) || nid != nid_of_node(page))) {
+		ClearPageUptodate(page);
 		f2fs_put_page(page, 1);
 		return ERR_PTR(-EIO);
 	}
@@ -1040,7 +1056,6 @@ repeat:
 		f2fs_put_page(page, 1);
 		goto repeat;
 	}
-got_it:
 	return page;
 }
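The got_it label disappears: locking happens only when the read path did not already return the page locked, and both paths fall through one validation block. A compilable control-flow sketch (the stub stands in for lock_page(); not kernel code):

	#include <stdio.h>

	enum { LOCKED_PAGE = 1 };	/* as in fs/f2fs/node.h */

	static void lock_page_stub(void *page) { (void)page; }

	/* Lock only when the read did not return the page locked, then
	 * fall into a single validation path; validation failures now
	 * also clear the uptodate state before dropping the page. */
	static int fetch_and_validate(int err, void *page)
	{
		if (err < 0)
			return err;
		if (err != LOCKED_PAGE)
			lock_page_stub(page);
		/* ... validate page here ... */
		return 0;
	}

	int main(void)
	{
		printf("%d\n", fetch_and_validate(LOCKED_PAGE, NULL));
		return 0;
	}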
 
@@ -1268,7 +1283,6 @@ static int f2fs_write_node_page(struct page *page,
 {
 	struct f2fs_sb_info *sbi = F2FS_P_SB(page);
 	nid_t nid;
-	block_t new_addr;
 	struct node_info ni;
 	struct f2fs_io_info fio = {
 		.type = NODE,
@@ -1277,7 +1291,7 @@ static int f2fs_write_node_page(struct page *page,
 
 	trace_f2fs_writepage(page, NODE);
 
-	if (unlikely(sbi->por_doing))
+	if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
 		goto redirty_out;
 	if (unlikely(f2fs_cp_error(sbi)))
 		goto redirty_out;
@@ -1303,9 +1317,11 @@ static int f2fs_write_node_page(struct page *page,
 	} else {
 		down_read(&sbi->node_write);
 	}
+
 	set_page_writeback(page);
-	write_node_page(sbi, page, &fio, nid, ni.blk_addr, &new_addr);
-	set_node_addr(sbi, &ni, new_addr, is_fsync_dnode(page));
+	fio.blk_addr = ni.blk_addr;
+	write_node_page(sbi, page, nid, &fio);
+	set_node_addr(sbi, &ni, fio.blk_addr, is_fsync_dnode(page));
 	dec_page_count(sbi, F2FS_DIRTY_NODES);
 	up_read(&sbi->node_write);
 	unlock_page(page);
@@ -1355,26 +1371,12 @@ static int f2fs_set_node_page_dirty(struct page *page)
 		__set_page_dirty_nobuffers(page);
 		inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES);
 		SetPagePrivate(page);
+		f2fs_trace_pid(page);
 		return 1;
 	}
 	return 0;
 }
 
-static void f2fs_invalidate_node_page(struct page *page, unsigned int offset,
-					unsigned int length)
-{
-	struct inode *inode = page->mapping->host;
-	if (PageDirty(page))
-		dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_NODES);
-	ClearPagePrivate(page);
-}
-
-static int f2fs_release_node_page(struct page *page, gfp_t wait)
-{
-	ClearPagePrivate(page);
-	return 1;
-}
-
 /*
  * Structure of the f2fs node operations
  */
@@ -1382,8 +1384,8 @@ const struct address_space_operations f2fs_node_aops = {
 	.writepage	= f2fs_write_node_page,
 	.writepages	= f2fs_write_node_pages,
 	.set_page_dirty	= f2fs_set_node_page_dirty,
-	.invalidatepage	= f2fs_invalidate_node_page,
-	.releasepage	= f2fs_release_node_page,
+	.invalidatepage	= f2fs_invalidate_page,
+	.releasepage	= f2fs_release_page,
 };
 
 static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i,
@@ -1726,80 +1728,41 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
 	return 0;
 }
 
-/*
- * ra_sum_pages() merge contiguous pages into one bio and submit.
- * these pre-read pages are allocated in bd_inode's mapping tree.
- */
-static int ra_sum_pages(struct f2fs_sb_info *sbi, struct page **pages,
-				int start, int nrpages)
-{
-	struct inode *inode = sbi->sb->s_bdev->bd_inode;
-	struct address_space *mapping = inode->i_mapping;
-	int i, page_idx = start;
-	struct f2fs_io_info fio = {
-		.type = META,
-		.rw = READ_SYNC | REQ_META | REQ_PRIO
-	};
-
-	for (i = 0; page_idx < start + nrpages; page_idx++, i++) {
-		/* alloc page in bd_inode for reading node summary info */
-		pages[i] = grab_cache_page(mapping, page_idx);
-		if (!pages[i])
-			break;
-		f2fs_submit_page_mbio(sbi, pages[i], page_idx, &fio);
-	}
-
-	f2fs_submit_merged_bio(sbi, META, READ);
-	return i;
-}
-
 int restore_node_summary(struct f2fs_sb_info *sbi,
 			unsigned int segno, struct f2fs_summary_block *sum)
 {
 	struct f2fs_node *rn;
 	struct f2fs_summary *sum_entry;
-	struct inode *inode = sbi->sb->s_bdev->bd_inode;
 	block_t addr;
 	int bio_blocks = MAX_BIO_BLOCKS(sbi);
-	struct page *pages[bio_blocks];
-	int i, idx, last_offset, nrpages, err = 0;
+	int i, idx, last_offset, nrpages;
 
 	/* scan the node segment */
 	last_offset = sbi->blocks_per_seg;
 	addr = START_BLOCK(sbi, segno);
 	sum_entry = &sum->entries[0];
 
-	for (i = 0; !err && i < last_offset; i += nrpages, addr += nrpages) {
+	for (i = 0; i < last_offset; i += nrpages, addr += nrpages) {
 		nrpages = min(last_offset - i, bio_blocks);
 
 		/* readahead node pages */
-		nrpages = ra_sum_pages(sbi, pages, addr, nrpages);
-		if (!nrpages)
-			return -ENOMEM;
+		ra_meta_pages(sbi, addr, nrpages, META_POR);
 
-		for (idx = 0; idx < nrpages; idx++) {
-			if (err)
-				goto skip;
+		for (idx = addr; idx < addr + nrpages; idx++) {
+			struct page *page = get_meta_page(sbi, idx);
 
-			lock_page(pages[idx]);
-			if (unlikely(!PageUptodate(pages[idx]))) {
-				err = -EIO;
-			} else {
-				rn = F2FS_NODE(pages[idx]);
-				sum_entry->nid = rn->footer.nid;
-				sum_entry->version = 0;
-				sum_entry->ofs_in_node = 0;
-				sum_entry++;
-			}
-			unlock_page(pages[idx]);
-skip:
-			page_cache_release(pages[idx]);
+			rn = F2FS_NODE(page);
+			sum_entry->nid = rn->footer.nid;
+			sum_entry->version = 0;
+			sum_entry->ofs_in_node = 0;
+			sum_entry++;
+			f2fs_put_page(page, 1);
 		}
 
-		invalidate_mapping_pages(inode->i_mapping, addr,
+		invalidate_mapping_pages(META_MAPPING(sbi), addr,
 						addr + nrpages);
 	}
-	return err;
+	return 0;
 }
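The open-coded ra_sum_pages() gives way to the generic meta readahead: issue one merged readahead for the whole batch, then fetch each page from the meta mapping. A runnable model of that pattern, with hypothetical stand-ins prefetch_range()/read_one() for ra_meta_pages()/get_meta_page():

	#include <stdio.h>

	static void prefetch_range(unsigned long start, int n)
	{
		printf("readahead [%lu, %lu)\n", start, start + n);
	}

	static void read_one(unsigned long idx)
	{
		(void)idx;	/* would copy one summary entry here */
	}

	/* Batch loop: one merged readahead per batch, then per-page reads
	 * that hit the already-populated cache. */
	static void scan_segment(unsigned long first, int total, int batch)
	{
		for (int i = 0; i < total; i += batch) {
			int n = (total - i < batch) ? total - i : batch;

			prefetch_range(first + i, n);
			for (int idx = 0; idx < n; idx++)
				read_one(first + i + idx);
		}
	}

	int main(void)
	{
		scan_segment(4096, 512, 64);	/* illustrative geometry */
		return 0;
	}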
 
 static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
@@ -1923,7 +1886,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
 	struct f2fs_nm_info *nm_i = NM_I(sbi);
 	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
 	struct f2fs_summary_block *sum = curseg->sum_blk;
-	struct nat_entry_set *setvec[NATVEC_SIZE];
+	struct nat_entry_set *setvec[SETVEC_SIZE];
 	struct nat_entry_set *set, *tmp;
 	unsigned int found;
 	nid_t set_idx = 0;
@@ -1940,7 +1903,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
 	remove_nats_in_journal(sbi);
 
 	while ((found = __gang_lookup_nat_set(nm_i,
-			set_idx, NATVEC_SIZE, setvec))) {
+			set_idx, SETVEC_SIZE, setvec))) {
 		unsigned idx;
 		set_idx = setvec[found - 1]->set + 1;
 		for (idx = 0; idx < found; idx++)
@@ -2020,6 +1983,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
 	struct f2fs_nm_info *nm_i = NM_I(sbi);
 	struct free_nid *i, *next_i;
 	struct nat_entry *natvec[NATVEC_SIZE];
+	struct nat_entry_set *setvec[SETVEC_SIZE];
 	nid_t nid = 0;
 	unsigned int found;
 
@@ -2044,11 +2008,27 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
 	while ((found = __gang_lookup_nat_cache(nm_i,
 			nid, NATVEC_SIZE, natvec))) {
 		unsigned idx;
+
 		nid = nat_get_nid(natvec[found - 1]) + 1;
 		for (idx = 0; idx < found; idx++)
 			__del_from_nat_cache(nm_i, natvec[idx]);
 	}
 	f2fs_bug_on(sbi, nm_i->nat_cnt);
+
+	/* destroy nat set cache */
+	nid = 0;
+	while ((found = __gang_lookup_nat_set(nm_i,
+			nid, SETVEC_SIZE, setvec))) {
+		unsigned idx;
+
+		nid = setvec[found - 1]->set + 1;
+		for (idx = 0; idx < found; idx++) {
+			/* entry_cnt is not zero, when cp_error was occurred */
+			f2fs_bug_on(sbi, !list_empty(&setvec[idx]->entry_list));
+			radix_tree_delete(&nm_i->nat_set_root, setvec[idx]->set);
+			kmem_cache_free(nat_entry_set_slab, setvec[idx]);
+		}
+	}
 	up_write(&nm_i->nat_tree_lock);
 
 	kfree(nm_i->nat_bitmap);
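destroy_node_manager() now drains the nat set cache with the same batched gang-lookup sweep it already used for plain nat entries. A minimal runnable model of the teardown pattern, with a flat array standing in for the radix tree:

	#include <stdlib.h>

	#define VEC_SKETCH 32
	#define KEYS_SKETCH 256

	struct set_entry { unsigned int set; };	/* stand-in for nat_entry_set */

	static struct set_entry *table[KEYS_SKETCH];	/* stand-in for nat_set_root */

	/* gang lookup: collect up to max live entries with key >= start */
	static unsigned int gang_lookup(unsigned int start, unsigned int max,
					struct set_entry **vec)
	{
		unsigned int found = 0;

		for (unsigned int k = start; k < KEYS_SKETCH && found < max; k++)
			if (table[k])
				vec[found++] = table[k];
		return found;
	}

	/* Sweep in batches, advance the cursor past the last hit, then
	 * delete and free each batch member -- the same shape as the
	 * "destroy nat set cache" loop above. */
	static void destroy_all(void)
	{
		struct set_entry *vec[VEC_SKETCH];
		unsigned int nid = 0, found;

		while ((found = gang_lookup(nid, VEC_SKETCH, vec))) {
			nid = vec[found - 1]->set + 1;
			for (unsigned int i = 0; i < found; i++) {
				table[vec[i]->set] = NULL;	/* radix_tree_delete() */
				free(vec[i]);			/* kmem_cache_free() */
			}
		}
	}

	int main(void)
	{
		for (unsigned int k = 0; k < 100; k += 3) {
			table[k] = malloc(sizeof(*table[k]));
			table[k]->set = k;
		}
		destroy_all();
		return 0;
	}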
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index d10b6448a671..f405bbf2435a 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -25,10 +25,19 @@
 
 /* vector size for gang look-up from nat cache that consists of radix tree */
 #define NATVEC_SIZE	64
+#define SETVEC_SIZE	32
 
 /* return value for read_node_page */
 #define LOCKED_PAGE	1
 
+/* For flag in struct node_info */
+enum {
+	IS_CHECKPOINTED,	/* is it checkpointed before? */
+	HAS_FSYNCED_INODE,	/* is the inode fsynced before? */
+	HAS_LAST_FSYNC,		/* has the latest node fsync mark? */
+	IS_DIRTY,		/* this nat entry is dirty? */
+};
+
 /*
  * For node information
  */
@@ -37,18 +46,11 @@ struct node_info {
 	nid_t ino;		/* inode number of the node's owner */
 	block_t blk_addr;	/* block address of the node */
 	unsigned char version;	/* version of the node */
-};
-
-enum {
-	IS_CHECKPOINTED,	/* is it checkpointed before? */
-	HAS_FSYNCED_INODE,	/* is the inode fsynced before? */
-	HAS_LAST_FSYNC,		/* has the latest node fsync mark? */
-	IS_DIRTY,		/* this nat entry is dirty? */
+	unsigned char flag;	/* for node information bits */
 };
 
 struct nat_entry {
 	struct list_head list;	/* for clean or dirty nat list */
-	unsigned char flag;	/* for node information bits */
 	struct node_info ni;	/* in-memory node information */
 };
 
@@ -63,20 +65,30 @@ struct nat_entry {
 
 #define inc_node_version(version)	(++version)
 
+static inline void copy_node_info(struct node_info *dst,
+						struct node_info *src)
+{
+	dst->nid = src->nid;
+	dst->ino = src->ino;
+	dst->blk_addr = src->blk_addr;
+	dst->version = src->version;
+	/* should not copy flag here */
+}
+
 static inline void set_nat_flag(struct nat_entry *ne,
 				unsigned int type, bool set)
 {
 	unsigned char mask = 0x01 << type;
 	if (set)
-		ne->flag |= mask;
+		ne->ni.flag |= mask;
 	else
-		ne->flag &= ~mask;
+		ne->ni.flag &= ~mask;
 }
 
 static inline bool get_nat_flag(struct nat_entry *ne, unsigned int type)
 {
 	unsigned char mask = 0x01 << type;
-	return ne->flag & mask;
+	return ne->ni.flag & mask;
 }
 
 static inline void nat_reset_flag(struct nat_entry *ne)
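With the flag byte relocated from nat_entry into struct node_info (note copy_node_info() deliberately skips it), the helpers still index single bits by enum value. A standalone model of the bit manipulation:

	#include <assert.h>
	#include <stdbool.h>

	/* One unsigned char holds single-bit flags indexed by the enum:
	 * IS_CHECKPOINTED is bit 0, HAS_FSYNCED_INODE bit 1, and so on. */
	enum { IS_CHECKPOINTED, HAS_FSYNCED_INODE, HAS_LAST_FSYNC, IS_DIRTY };

	static void set_flag(unsigned char *flag, unsigned int type, bool set)
	{
		unsigned char mask = 0x01 << type;

		if (set)
			*flag |= mask;
		else
			*flag &= ~mask;
	}

	static bool get_flag(unsigned char flag, unsigned int type)
	{
		return flag & (0x01 << type);
	}

	int main(void)
	{
		unsigned char flag = 0;

		set_flag(&flag, IS_DIRTY, true);
		assert(get_flag(flag, IS_DIRTY));
		assert(!get_flag(flag, IS_CHECKPOINTED));
		set_flag(&flag, IS_DIRTY, false);
		assert(flag == 0);
		return 0;
	}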
@@ -108,6 +120,7 @@ enum mem_type {
 	NAT_ENTRIES,	/* indicates the cached nat entry */
 	DIRTY_DENTS,	/* indicates dirty dentry pages */
 	INO_ENTRIES,	/* indicates inode entries */
+	BASE_CHECK,	/* check kernel status */
 };
 
 struct nat_entry_set {
@@ -200,11 +213,19 @@ static inline void fill_node_footer(struct page *page, nid_t nid,
 				nid_t ino, unsigned int ofs, bool reset)
 {
 	struct f2fs_node *rn = F2FS_NODE(page);
+	unsigned int old_flag = 0;
+
 	if (reset)
 		memset(rn, 0, sizeof(*rn));
+	else
+		old_flag = le32_to_cpu(rn->footer.flag);
+
 	rn->footer.nid = cpu_to_le32(nid);
 	rn->footer.ino = cpu_to_le32(ino);
-	rn->footer.flag = cpu_to_le32(ofs << OFFSET_BIT_SHIFT);
+
+	/* should remain old flag bits such as COLD_BIT_SHIFT */
+	rn->footer.flag = cpu_to_le32((ofs << OFFSET_BIT_SHIFT) |
+					(old_flag & OFFSET_BIT_MASK));
 }
 
 static inline void copy_node_footer(struct page *dst, struct page *src)
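A worked example of the footer-flag arithmetic: the node offset occupies the bits at and above OFFSET_BIT_SHIFT, while OFFSET_BIT_MASK covers the low per-node flag bits (cold/fsync/dentry) that must now survive a refill. The shift and mask values below mirror f2fs's definitions but are restated here as assumptions:

	#include <assert.h>
	#include <stdint.h>

	#define OFFSET_BIT_SHIFT	3
	#define OFFSET_BIT_MASK		0x07	/* (1 << OFFSET_BIT_SHIFT) - 1 */

	int main(void)
	{
		/* old footer: node offset 5, cold bit (bit 0) set */
		uint32_t old_flag = (5u << OFFSET_BIT_SHIFT) | 0x01;
		/* refill with node offset 9, keeping the low flag bits */
		uint32_t new_flag = (9u << OFFSET_BIT_SHIFT) |
					(old_flag & OFFSET_BIT_MASK);

		assert(new_flag >> OFFSET_BIT_SHIFT == 9);	/* offset replaced */
		assert(new_flag & 0x01);			/* cold bit preserved */
		return 0;
	}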
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 9160a37e1c7a..41afb9534bbd 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -346,6 +346,10 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
 	if (IS_INODE(page)) {
 		recover_inline_xattr(inode, page);
 	} else if (f2fs_has_xattr_block(ofs_of_node(page))) {
+		/*
+		 * Deprecated; xattr blocks should be found from cold log.
+		 * But, we should remain this for backward compatibility.
+		 */
 		recover_xattr_data(inode, page, blkaddr);
 		goto out;
 	}
@@ -396,7 +400,8 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
 
 			/* write dummy data page */
 			recover_data_page(sbi, NULL, &sum, src, dest);
-			update_extent_cache(dest, &dn);
+			dn.data_blkaddr = dest;
+			update_extent_cache(&dn);
 			recovered++;
 		}
 		dn.ofs_in_node++;
@@ -503,7 +508,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
 	INIT_LIST_HEAD(&inode_list);
 
 	/* step #1: find fsynced inode numbers */
-	sbi->por_doing = true;
+	set_sbi_flag(sbi, SBI_POR_DOING);
 
 	/* prevent checkpoint */
 	mutex_lock(&sbi->cp_mutex);
@@ -536,7 +541,7 @@ out:
 		truncate_inode_pages_final(META_MAPPING(sbi));
 	}
 
-	sbi->por_doing = false;
+	clear_sbi_flag(sbi, SBI_POR_DOING);
 	if (err) {
 		discard_next_dnode(sbi, blkaddr);
 
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 42607a679923..daee4ab913da 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -20,6 +20,7 @@
 #include "f2fs.h"
 #include "segment.h"
 #include "node.h"
+#include "trace.h"
 #include <trace/events/f2fs.h>
 
 #define __reverse_ffz(x) __reverse_ffs(~(x))
@@ -181,6 +182,7 @@ void register_inmem_page(struct inode *inode, struct page *page)
 	int err;
 
 	SetPagePrivate(page);
+	f2fs_trace_pid(page);
 
 	new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS);
 
@@ -205,23 +207,6 @@ retry:
 	mutex_unlock(&fi->inmem_lock);
 }
 
-void invalidate_inmem_page(struct inode *inode, struct page *page)
-{
-	struct f2fs_inode_info *fi = F2FS_I(inode);
-	struct inmem_pages *cur;
-
-	mutex_lock(&fi->inmem_lock);
-	cur = radix_tree_lookup(&fi->inmem_root, page->index);
-	if (cur) {
-		radix_tree_delete(&fi->inmem_root, cur->page->index);
-		f2fs_put_page(cur->page, 0);
-		list_del(&cur->list);
-		kmem_cache_free(inmem_entry_slab, cur);
-		dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES);
-	}
-	mutex_unlock(&fi->inmem_lock);
-}
-
 void commit_inmem_pages(struct inode *inode, bool abort)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
227 struct f2fs_sb_info *sbi = F2FS_I_SB(inode); 212 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
@@ -230,7 +215,7 @@ void commit_inmem_pages(struct inode *inode, bool abort)
230 bool submit_bio = false; 215 bool submit_bio = false;
231 struct f2fs_io_info fio = { 216 struct f2fs_io_info fio = {
232 .type = DATA, 217 .type = DATA,
233 .rw = WRITE_SYNC, 218 .rw = WRITE_SYNC | REQ_PRIO,
234 }; 219 };
235 220
236 /* 221 /*
@@ -240,33 +225,38 @@ void commit_inmem_pages(struct inode *inode, bool abort)
 	 * Otherwise, f2fs_gc in f2fs_balance_fs can wait forever until this
 	 * inode becomes free by iget_locked in f2fs_iget.
 	 */
-	if (!abort)
+	if (!abort) {
 		f2fs_balance_fs(sbi);
-
-	f2fs_lock_op(sbi);
+		f2fs_lock_op(sbi);
+	}
 
 	mutex_lock(&fi->inmem_lock);
 	list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) {
-		lock_page(cur->page);
-		if (!abort && cur->page->mapping == inode->i_mapping) {
-			f2fs_wait_on_page_writeback(cur->page, DATA);
-			if (clear_page_dirty_for_io(cur->page))
-				inode_dec_dirty_pages(inode);
-			do_write_data_page(cur->page, &fio);
-			submit_bio = true;
+		if (!abort) {
+			lock_page(cur->page);
+			if (cur->page->mapping == inode->i_mapping) {
+				f2fs_wait_on_page_writeback(cur->page, DATA);
+				if (clear_page_dirty_for_io(cur->page))
+					inode_dec_dirty_pages(inode);
+				do_write_data_page(cur->page, &fio);
+				submit_bio = true;
+			}
+			f2fs_put_page(cur->page, 1);
+		} else {
+			put_page(cur->page);
 		}
 		radix_tree_delete(&fi->inmem_root, cur->page->index);
-		f2fs_put_page(cur->page, 1);
 		list_del(&cur->list);
 		kmem_cache_free(inmem_entry_slab, cur);
 		dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES);
 	}
-	if (submit_bio)
-		f2fs_submit_merged_bio(sbi, DATA, WRITE);
 	mutex_unlock(&fi->inmem_lock);
 
-	filemap_fdatawait_range(inode->i_mapping, 0, LLONG_MAX);
-	f2fs_unlock_op(sbi);
+	if (!abort) {
+		f2fs_unlock_op(sbi);
+		if (submit_bio)
+			f2fs_submit_merged_bio(sbi, DATA, WRITE);
+	}
 }
 
 /*
@@ -290,7 +280,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
 	/* check the # of cached NAT entries and prefree segments */
 	if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK) ||
 			excess_prefree_segs(sbi) ||
-			available_free_memory(sbi, INO_ENTRIES))
+			!available_free_memory(sbi, INO_ENTRIES))
 		f2fs_sync_fs(sbi->sb, true);
 }
 
@@ -515,12 +505,13 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 	struct seg_entry *se = get_seg_entry(sbi, cpc->trim_start);
 	unsigned long *cur_map = (unsigned long *)se->cur_valid_map;
 	unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map;
-	unsigned long dmap[entries];
+	unsigned long *dmap = SIT_I(sbi)->tmp_map;
 	unsigned int start = 0, end = -1;
 	bool force = (cpc->reason == CP_DISCARD);
 	int i;
 
-	if (!force && !test_opt(sbi, DISCARD))
+	if (!force && (!test_opt(sbi, DISCARD) ||
+			SM_I(sbi)->nr_discards >= SM_I(sbi)->max_discards))
 		return;
 
 	if (force && !se->valid_blocks) {
@@ -548,7 +539,8 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 
 	/* SIT_VBLOCK_MAP_SIZE should be multiple of sizeof(unsigned long) */
 	for (i = 0; i < entries; i++)
-		dmap[i] = ~(cur_map[i] | ckpt_map[i]);
+		dmap[i] = force ? ~ckpt_map[i] :
+				(cur_map[i] ^ ckpt_map[i]) & ckpt_map[i];
 
 	while (force || SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) {
 		start = __find_rev_next_bit(dmap, max_blocks, end + 1);
@@ -735,7 +727,7 @@ static void __add_sum_entry(struct f2fs_sb_info *sbi, int type,
 /*
  * Calculate the number of current summary pages for writing
  */
-int npages_for_summary_flush(struct f2fs_sb_info *sbi)
+int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra)
 {
 	int valid_sum_count = 0;
 	int i, sum_in_page;
@@ -743,8 +735,13 @@ int npages_for_summary_flush(struct f2fs_sb_info *sbi)
 	for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
 		if (sbi->ckpt->alloc_type[i] == SSR)
 			valid_sum_count += sbi->blocks_per_seg;
-		else
-			valid_sum_count += curseg_blkoff(sbi, i);
+		else {
+			if (for_ra)
+				valid_sum_count += le16_to_cpu(
+					F2FS_CKPT(sbi)->cur_data_blkoff[i]);
+			else
+				valid_sum_count += curseg_blkoff(sbi, i);
+		}
 	}
 
 	sum_in_page = (PAGE_CACHE_SIZE - 2 * SUM_JOURNAL_SIZE -
@@ -803,7 +800,7 @@ static void get_new_segment(struct f2fs_sb_info *sbi,
 	int go_left = 0;
 	int i;
 
-	write_lock(&free_i->segmap_lock);
+	spin_lock(&free_i->segmap_lock);
 
 	if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) {
 		segno = find_next_zero_bit(free_i->free_segmap,
@@ -876,7 +873,7 @@ got_it:
 	f2fs_bug_on(sbi, test_bit(segno, free_i->free_segmap));
 	__set_inuse(sbi, segno);
 	*newseg = segno;
-	write_unlock(&free_i->segmap_lock);
+	spin_unlock(&free_i->segmap_lock);
 }
 
 static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified)
@@ -927,7 +924,7 @@ static void __next_free_blkoff(struct f2fs_sb_info *sbi,
 {
 	struct seg_entry *se = get_seg_entry(sbi, seg->segno);
 	int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long);
-	unsigned long target_map[entries];
+	unsigned long *target_map = SIT_I(sbi)->tmp_map;
 	unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map;
 	unsigned long *cur_map = (unsigned long *)se->cur_valid_map;
 	int i, pos;
@@ -1027,18 +1024,22 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi,
 	stat_inc_seg_type(sbi, curseg);
 }
 
+static void __allocate_new_segments(struct f2fs_sb_info *sbi, int type)
+{
+	struct curseg_info *curseg = CURSEG_I(sbi, type);
+	unsigned int old_segno;
+
+	old_segno = curseg->segno;
+	SIT_I(sbi)->s_ops->allocate_segment(sbi, type, true);
+	locate_dirty_segment(sbi, old_segno);
+}
+
 void allocate_new_segments(struct f2fs_sb_info *sbi)
 {
-	struct curseg_info *curseg;
-	unsigned int old_curseg;
 	int i;
 
-	for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
-		curseg = CURSEG_I(sbi, i);
-		old_curseg = curseg->segno;
-		SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true);
-		locate_dirty_segment(sbi, old_curseg);
-	}
+	for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++)
+		__allocate_new_segments(sbi, i);
 }
 
 static const struct segment_allocation default_salloc_ops = {
@@ -1047,8 +1048,8 @@ static const struct segment_allocation default_salloc_ops = {
 
 int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
 {
-	__u64 start = range->start >> sbi->log_blocksize;
-	__u64 end = start + (range->len >> sbi->log_blocksize) - 1;
+	__u64 start = F2FS_BYTES_TO_BLK(range->start);
+	__u64 end = start + F2FS_BYTES_TO_BLK(range->len) - 1;
 	unsigned int start_segno, end_segno;
 	struct cp_control cpc;
 
@@ -1065,16 +1066,21 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
 	end_segno = (end >= MAX_BLKADDR(sbi)) ? MAIN_SEGS(sbi) - 1 :
 						GET_SEGNO(sbi, end);
 	cpc.reason = CP_DISCARD;
-	cpc.trim_start = start_segno;
-	cpc.trim_end = end_segno;
-	cpc.trim_minlen = range->minlen >> sbi->log_blocksize;
+	cpc.trim_minlen = F2FS_BYTES_TO_BLK(range->minlen);
 
 	/* do checkpoint to issue discard commands safely */
-	mutex_lock(&sbi->gc_mutex);
-	write_checkpoint(sbi, &cpc);
-	mutex_unlock(&sbi->gc_mutex);
+	for (; start_segno <= end_segno; start_segno = cpc.trim_end + 1) {
+		cpc.trim_start = start_segno;
+		cpc.trim_end = min_t(unsigned int, rounddown(start_segno +
+				BATCHED_TRIM_SEGMENTS(sbi),
+				sbi->segs_per_sec) - 1, end_segno);
+
+		mutex_lock(&sbi->gc_mutex);
+		write_checkpoint(sbi, &cpc);
+		mutex_unlock(&sbi->gc_mutex);
+	}
out:
-	range->len = cpc.trimmed << sbi->log_blocksize;
+	range->len = F2FS_BLK_TO_BYTES(cpc.trimmed);
 	return 0;
 }
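FITRIM now runs one checkpoint per batch instead of one for the whole device, each batch end rounded down to a section boundary. A worked example of that arithmetic; the numbers are illustrative (BATCHED_TRIM_SEGMENTS presumably expands to trim_sections * segs_per_sec), not f2fs defaults:

	#include <stdio.h>

	static unsigned int rounddown_to(unsigned int x, unsigned int align)
	{
		return x - (x % align);
	}

	int main(void)
	{
		unsigned int start = 0, end = 199;		/* segment range */
		unsigned int batch = 64, segs_per_sec = 4;	/* illustrative */
		unsigned int trim_start, trim_end;

		for (trim_start = start; trim_start <= end;
						trim_start = trim_end + 1) {
			trim_end = rounddown_to(trim_start + batch,
						segs_per_sec) - 1;
			if (trim_end > end)
				trim_end = end;
			/* prints [0,63] [64,127] [128,191] [192,199] */
			printf("checkpoint over segments [%u, %u]\n",
						trim_start, trim_end);
		}
		return 0;
	}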
 
@@ -1151,11 +1157,18 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
 {
 	struct sit_info *sit_i = SIT_I(sbi);
 	struct curseg_info *curseg;
+	bool direct_io = (type == CURSEG_DIRECT_IO);
+
+	type = direct_io ? CURSEG_WARM_DATA : type;
 
 	curseg = CURSEG_I(sbi, type);
 
 	mutex_lock(&curseg->curseg_mutex);
 
+	/* direct_io'ed data is aligned to the segment for better performance */
+	if (direct_io && curseg->next_blkoff)
+		__allocate_new_segments(sbi, type);
+
 	*new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
 
 	/*
@@ -1187,39 +1200,39 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
 }
 
 static void do_write_page(struct f2fs_sb_info *sbi, struct page *page,
-			block_t old_blkaddr, block_t *new_blkaddr,
-			struct f2fs_summary *sum, struct f2fs_io_info *fio)
+			struct f2fs_summary *sum,
+			struct f2fs_io_info *fio)
 {
 	int type = __get_segment_type(page, fio->type);
 
-	allocate_data_block(sbi, page, old_blkaddr, new_blkaddr, sum, type);
+	allocate_data_block(sbi, page, fio->blk_addr, &fio->blk_addr, sum, type);
 
 	/* writeout dirty page into bdev */
-	f2fs_submit_page_mbio(sbi, page, *new_blkaddr, fio);
+	f2fs_submit_page_mbio(sbi, page, fio);
 }
 
 void write_meta_page(struct f2fs_sb_info *sbi, struct page *page)
 {
 	struct f2fs_io_info fio = {
 		.type = META,
-		.rw = WRITE_SYNC | REQ_META | REQ_PRIO
+		.rw = WRITE_SYNC | REQ_META | REQ_PRIO,
+		.blk_addr = page->index,
 	};
 
 	set_page_writeback(page);
-	f2fs_submit_page_mbio(sbi, page, page->index, &fio);
+	f2fs_submit_page_mbio(sbi, page, &fio);
 }
 
 void write_node_page(struct f2fs_sb_info *sbi, struct page *page,
-			struct f2fs_io_info *fio,
-			unsigned int nid, block_t old_blkaddr, block_t *new_blkaddr)
+			unsigned int nid, struct f2fs_io_info *fio)
 {
 	struct f2fs_summary sum;
 	set_summary(&sum, nid, 0, 0);
-	do_write_page(sbi, page, old_blkaddr, new_blkaddr, &sum, fio);
+	do_write_page(sbi, page, &sum, fio);
 }
 
 void write_data_page(struct page *page, struct dnode_of_data *dn,
-		block_t *new_blkaddr, struct f2fs_io_info *fio)
+		struct f2fs_io_info *fio)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
 	struct f2fs_summary sum;
@@ -1228,14 +1241,14 @@ void write_data_page(struct page *page, struct dnode_of_data *dn,
 	f2fs_bug_on(sbi, dn->data_blkaddr == NULL_ADDR);
 	get_node_info(sbi, dn->nid, &ni);
 	set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
-
-	do_write_page(sbi, page, dn->data_blkaddr, new_blkaddr, &sum, fio);
+	do_write_page(sbi, page, &sum, fio);
+	dn->data_blkaddr = fio->blk_addr;
 }
 
-void rewrite_data_page(struct page *page, block_t old_blkaddr,
-		struct f2fs_io_info *fio)
+void rewrite_data_page(struct page *page, struct f2fs_io_info *fio)
 {
-	f2fs_submit_page_mbio(F2FS_P_SB(page), page, old_blkaddr, fio);
+	stat_inc_inplace_blocks(F2FS_P_SB(page));
+	f2fs_submit_page_mbio(F2FS_P_SB(page), page, fio);
 }
 
 void recover_data_page(struct f2fs_sb_info *sbi,
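The recurring theme of these hunks: the target block address moves into the I/O descriptor instead of riding along as extra in/out parameters. A sketch of that design with a hypothetical io_info (f2fs's real struct f2fs_io_info carries type, rw flags and blk_addr):

	#include <stdio.h>
	#include <stdint.h>

	struct io_info {
		int type;		/* DATA, NODE, META ... */
		int rw;			/* request flags */
		uint64_t blk_addr;	/* in: old address, out: new address */
	};

	static void submit(struct io_info *io)
	{
		/* callee reads everything from one struct, no extra args */
		printf("submit type=%d rw=%d blk=%llu\n", io->type, io->rw,
			(unsigned long long)io->blk_addr);
	}

	int main(void)
	{
		struct io_info io = { .type = 0, .rw = 1, .blk_addr = 42 };

		io.blk_addr = 100;	/* the allocator updates it in place */
		submit(&io);
		return 0;
	}

The upside is visible in write_data_page() above: after do_write_page() returns, the new address is simply read back from fio->blk_addr rather than threaded through a block_t pointer.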
@@ -1393,7 +1406,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
 		segno = le32_to_cpu(ckpt->cur_data_segno[type]);
 		blk_off = le16_to_cpu(ckpt->cur_data_blkoff[type -
 							CURSEG_HOT_DATA]);
-		if (is_set_ckpt_flags(ckpt, CP_UMOUNT_FLAG))
+		if (__exist_node_summaries(sbi))
 			blk_addr = sum_blk_addr(sbi, NR_CURSEG_TYPE, type);
 		else
 			blk_addr = sum_blk_addr(sbi, NR_CURSEG_DATA_TYPE, type);
@@ -1402,7 +1415,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
 							CURSEG_HOT_NODE]);
 		blk_off = le16_to_cpu(ckpt->cur_node_blkoff[type -
 							CURSEG_HOT_NODE]);
-		if (is_set_ckpt_flags(ckpt, CP_UMOUNT_FLAG))
+		if (__exist_node_summaries(sbi))
 			blk_addr = sum_blk_addr(sbi, NR_CURSEG_NODE_TYPE,
 						type - CURSEG_HOT_NODE);
 		else
@@ -1413,7 +1426,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
 	sum = (struct f2fs_summary_block *)page_address(new);
 
 	if (IS_NODESEG(type)) {
-		if (is_set_ckpt_flags(ckpt, CP_UMOUNT_FLAG)) {
+		if (__exist_node_summaries(sbi)) {
 			struct f2fs_summary *ns = &sum->entries[0];
 			int i;
 			for (i = 0; i < sbi->blocks_per_seg; i++, ns++) {
@@ -1450,12 +1463,22 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi)
 	int err;
 
 	if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) {
+		int npages = npages_for_summary_flush(sbi, true);
+
+		if (npages >= 2)
+			ra_meta_pages(sbi, start_sum_block(sbi), npages,
+								META_CP);
+
 		/* restore for compacted data summary */
 		if (read_compacted_summaries(sbi))
 			return -EINVAL;
 		type = CURSEG_HOT_NODE;
 	}
 
+	if (__exist_node_summaries(sbi))
+		ra_meta_pages(sbi, sum_blk_addr(sbi, NR_CURSEG_TYPE, type),
+					NR_CURSEG_TYPE - type, META_CP);
+
 	for (; type <= CURSEG_COLD_NODE; type++) {
 		err = read_normal_summaries(sbi, type);
 		if (err)
@@ -1549,8 +1572,7 @@ void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk)
 
 void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk)
 {
-	if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG))
-		write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE);
+	write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE);
 }
 
 int lookup_journal_in_cursum(struct f2fs_summary_block *sum, int type,
@@ -1754,7 +1776,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 		se = get_seg_entry(sbi, segno);
 
 		/* add discard candidates */
-		if (SM_I(sbi)->nr_discards < SM_I(sbi)->max_discards) {
+		if (cpc->reason != CP_DISCARD) {
 			cpc->trim_start = segno;
 			add_discard_addrs(sbi, cpc);
 		}
@@ -1833,6 +1855,10 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
 		return -ENOMEM;
 	}
 
+	sit_i->tmp_map = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
+	if (!sit_i->tmp_map)
+		return -ENOMEM;
+
 	if (sbi->segs_per_sec > 1) {
 		sit_i->sec_entries = vzalloc(MAIN_SECS(sbi) *
 					sizeof(struct sec_entry));
@@ -1897,7 +1923,7 @@ static int build_free_segmap(struct f2fs_sb_info *sbi)
 	free_i->start_segno = GET_SEGNO_FROM_SEG0(sbi, MAIN_BLKADDR(sbi));
 	free_i->free_segments = 0;
 	free_i->free_sections = 0;
-	rwlock_init(&free_i->segmap_lock);
+	spin_lock_init(&free_i->segmap_lock);
 	return 0;
 }
 
@@ -2110,6 +2136,8 @@ int build_segment_manager(struct f2fs_sb_info *sbi)
 	sm_info->nr_discards = 0;
 	sm_info->max_discards = 0;
 
+	sm_info->trim_sections = DEF_BATCHED_TRIM_SECTIONS;
+
 	INIT_LIST_HEAD(&sm_info->sit_entry_set);
 
 	if (test_opt(sbi, FLUSH_MERGE) && !f2fs_readonly(sbi->sb)) {
@@ -2212,6 +2240,8 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi)
 			kfree(sit_i->sentries[start].ckpt_valid_map);
 		}
 	}
+	kfree(sit_i->tmp_map);
+
 	vfree(sit_i->sentries);
 	vfree(sit_i->sec_entries);
 	kfree(sit_i->dirty_sentries_bitmap);
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 7f327c0ba4e3..7fd35111cf62 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -189,6 +189,7 @@ struct sit_info {
 	char *sit_bitmap;		/* SIT bitmap pointer */
 	unsigned int bitmap_size;	/* SIT bitmap size */
 
+	unsigned long *tmp_map;			/* bitmap for temporal use */
 	unsigned long *dirty_sentries_bitmap;	/* bitmap for dirty sentries */
 	unsigned int dirty_sentries;		/* # of dirty sentries */
 	unsigned int sents_per_block;		/* # of SIT entries per block */
@@ -207,7 +208,7 @@ struct free_segmap_info {
 	unsigned int start_segno;	/* start segment number logically */
 	unsigned int free_segments;	/* # of free segments */
 	unsigned int free_sections;	/* # of free sections */
-	rwlock_t segmap_lock;		/* free segmap lock */
+	spinlock_t segmap_lock;		/* free segmap lock */
 	unsigned long *free_segmap;	/* free segment bitmap */
 	unsigned long *free_secmap;	/* free section bitmap */
 };
@@ -318,9 +319,9 @@ static inline unsigned int find_next_inuse(struct free_segmap_info *free_i,
 		unsigned int max, unsigned int segno)
 {
 	unsigned int ret;
-	read_lock(&free_i->segmap_lock);
+	spin_lock(&free_i->segmap_lock);
 	ret = find_next_bit(free_i->free_segmap, max, segno);
-	read_unlock(&free_i->segmap_lock);
+	spin_unlock(&free_i->segmap_lock);
 	return ret;
 }
 
@@ -331,7 +332,7 @@ static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno)
 	unsigned int start_segno = secno * sbi->segs_per_sec;
 	unsigned int next;
 
-	write_lock(&free_i->segmap_lock);
+	spin_lock(&free_i->segmap_lock);
 	clear_bit(segno, free_i->free_segmap);
 	free_i->free_segments++;
 
@@ -340,7 +341,7 @@ static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno)
 		clear_bit(secno, free_i->free_secmap);
 		free_i->free_sections++;
 	}
-	write_unlock(&free_i->segmap_lock);
+	spin_unlock(&free_i->segmap_lock);
 }
 
 static inline void __set_inuse(struct f2fs_sb_info *sbi,
@@ -362,7 +363,7 @@ static inline void __set_test_and_free(struct f2fs_sb_info *sbi,
 	unsigned int start_segno = secno * sbi->segs_per_sec;
 	unsigned int next;
 
-	write_lock(&free_i->segmap_lock);
+	spin_lock(&free_i->segmap_lock);
 	if (test_and_clear_bit(segno, free_i->free_segmap)) {
 		free_i->free_segments++;
 
@@ -373,7 +374,7 @@ static inline void __set_test_and_free(struct f2fs_sb_info *sbi,
 			free_i->free_sections++;
 		}
 	}
-	write_unlock(&free_i->segmap_lock);
+	spin_unlock(&free_i->segmap_lock);
 }
 
 static inline void __set_test_and_inuse(struct f2fs_sb_info *sbi,
@@ -381,13 +382,13 @@ static inline void __set_test_and_inuse(struct f2fs_sb_info *sbi,
 {
 	struct free_segmap_info *free_i = FREE_I(sbi);
 	unsigned int secno = segno / sbi->segs_per_sec;
-	write_lock(&free_i->segmap_lock);
+	spin_lock(&free_i->segmap_lock);
 	if (!test_and_set_bit(segno, free_i->free_segmap)) {
 		free_i->free_segments--;
 		if (!test_and_set_bit(secno, free_i->free_secmap))
 			free_i->free_sections--;
 	}
-	write_unlock(&free_i->segmap_lock);
+	spin_unlock(&free_i->segmap_lock);
 }
 
 static inline void get_sit_bitmap(struct f2fs_sb_info *sbi,
@@ -460,7 +461,7 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed)
 	int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES);
 	int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS);
 
-	if (unlikely(sbi->por_doing))
+	if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
 		return false;
 
 	return (free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs +
@@ -599,13 +600,13 @@ static inline void check_block_count(struct f2fs_sb_info *sbi,
 static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno)
 {
 	if (segno > TOTAL_SEGS(sbi) - 1)
-		sbi->need_fsck = true;
+		set_sbi_flag(sbi, SBI_NEED_FSCK);
 }
 
 static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr)
 {
 	if (blk_addr < SEG0_BLKADDR(sbi) || blk_addr >= MAX_BLKADDR(sbi))
-		sbi->need_fsck = true;
+		set_sbi_flag(sbi, SBI_NEED_FSCK);
 }
 
 /*
@@ -616,11 +617,11 @@ static inline void check_block_count(struct f2fs_sb_info *sbi,
616{ 617{
617 /* check segment usage */ 618 /* check segment usage */
618 if (GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg) 619 if (GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg)
619 sbi->need_fsck = true; 620 set_sbi_flag(sbi, SBI_NEED_FSCK);
620 621
621 /* check boundary of a given segment number */ 622 /* check boundary of a given segment number */
622 if (segno > TOTAL_SEGS(sbi) - 1) 623 if (segno > TOTAL_SEGS(sbi) - 1)
623 sbi->need_fsck = true; 624 set_sbi_flag(sbi, SBI_NEED_FSCK);
624} 625}
625#endif 626#endif
626 627
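Two patterns in the segment.h hunks above recur throughout this series: free_i->segmap_lock becomes a plain spinlock (every holder does test_and_set/test_and_clear work, so the rwlock's read/write split bought nothing), and one-off booleans such as por_doing and need_fsck are folded into a single flag word behind set_sbi_flag()/is_sbi_flag_set(). A minimal sketch of how such helpers can look; the real definitions live in fs/f2fs/f2fs.h, which this diff does not show, so the names and layout below are assumptions:

enum sbi_flag_sketch {
	SBI_IS_DIRTY,		/* need a checkpoint before unmount */
	SBI_IS_CLOSE,		/* unmount in progress */
	SBI_NEED_FSCK,		/* inconsistency found, fsck needed */
	SBI_POR_DOING,		/* power-off recovery in progress */
};

struct sbi_sketch {
	unsigned long s_flag;	/* one bit per sbi_flag_sketch value */
};

static inline int is_sbi_flag_set(struct sbi_sketch *sbi, int type)
{
	return sbi->s_flag & (1UL << type);
}

static inline void set_sbi_flag(struct sbi_sketch *sbi, int type)
{
	sbi->s_flag |= (1UL << type);	/* non-atomic: callers serialize */
}

static inline void clear_sbi_flag(struct sbi_sketch *sbi, int type)
{
	sbi->s_flag &= ~(1UL << type);
}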
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index f71421d70475..f2fe666a6ea9 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -30,6 +30,7 @@
30#include "segment.h" 30#include "segment.h"
31#include "xattr.h" 31#include "xattr.h"
32#include "gc.h" 32#include "gc.h"
33#include "trace.h"
33 34
34#define CREATE_TRACE_POINTS 35#define CREATE_TRACE_POINTS
35#include <trace/events/f2fs.h> 36#include <trace/events/f2fs.h>
@@ -41,6 +42,7 @@ static struct kset *f2fs_kset;
41enum { 42enum {
42 Opt_gc_background, 43 Opt_gc_background,
43 Opt_disable_roll_forward, 44 Opt_disable_roll_forward,
45 Opt_norecovery,
44 Opt_discard, 46 Opt_discard,
45 Opt_noheap, 47 Opt_noheap,
46 Opt_user_xattr, 48 Opt_user_xattr,
@@ -61,6 +63,7 @@ enum {
61static match_table_t f2fs_tokens = { 63static match_table_t f2fs_tokens = {
62 {Opt_gc_background, "background_gc=%s"}, 64 {Opt_gc_background, "background_gc=%s"},
63 {Opt_disable_roll_forward, "disable_roll_forward"}, 65 {Opt_disable_roll_forward, "disable_roll_forward"},
66 {Opt_norecovery, "norecovery"},
64 {Opt_discard, "discard"}, 67 {Opt_discard, "discard"},
65 {Opt_noheap, "no_heap"}, 68 {Opt_noheap, "no_heap"},
66 {Opt_user_xattr, "user_xattr"}, 69 {Opt_user_xattr, "user_xattr"},
@@ -192,6 +195,7 @@ F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time);
192F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle); 195F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle);
193F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); 196F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments);
194F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards); 197F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards);
198F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections);
195F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); 199F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy);
196F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); 200F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util);
197F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); 201F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks);
@@ -207,6 +211,7 @@ static struct attribute *f2fs_attrs[] = {
207 ATTR_LIST(gc_idle), 211 ATTR_LIST(gc_idle),
208 ATTR_LIST(reclaim_segments), 212 ATTR_LIST(reclaim_segments),
209 ATTR_LIST(max_small_discards), 213 ATTR_LIST(max_small_discards),
214 ATTR_LIST(batched_trim_sections),
210 ATTR_LIST(ipu_policy), 215 ATTR_LIST(ipu_policy),
211 ATTR_LIST(min_ipu_util), 216 ATTR_LIST(min_ipu_util),
212 ATTR_LIST(min_fsync_blocks), 217 ATTR_LIST(min_fsync_blocks),
@@ -286,6 +291,12 @@ static int parse_options(struct super_block *sb, char *options)
286 case Opt_disable_roll_forward: 291 case Opt_disable_roll_forward:
287 set_opt(sbi, DISABLE_ROLL_FORWARD); 292 set_opt(sbi, DISABLE_ROLL_FORWARD);
288 break; 293 break;
294 case Opt_norecovery:
295 /* this option requires a read-only (ro) mount */
296 set_opt(sbi, DISABLE_ROLL_FORWARD);
297 if (!f2fs_readonly(sb))
298 return -EINVAL;
299 break;
289 case Opt_discard: 300 case Opt_discard:
290 set_opt(sbi, DISCARD); 301 set_opt(sbi, DISCARD);
291 break; 302 break;
@@ -446,8 +457,13 @@ static void f2fs_put_super(struct super_block *sb)
446 f2fs_destroy_stats(sbi); 457 f2fs_destroy_stats(sbi);
447 stop_gc_thread(sbi); 458 stop_gc_thread(sbi);
448 459
449 /* We don't need to do checkpoint when it's clean */ 460 /*
450 if (sbi->s_dirty) { 461 * We don't need to do a checkpoint when the superblock is clean.
462 * But if the previous checkpoint was not done by umount, we need to
463 * do a clean checkpoint again.
464 */
465 if (is_sbi_flag_set(sbi, SBI_IS_DIRTY) ||
466 !is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG)) {
451 struct cp_control cpc = { 467 struct cp_control cpc = {
452 .reason = CP_UMOUNT, 468 .reason = CP_UMOUNT,
453 }; 469 };
@@ -486,13 +502,15 @@ int f2fs_sync_fs(struct super_block *sb, int sync)
486 if (sync) { 502 if (sync) {
487 struct cp_control cpc; 503 struct cp_control cpc;
488 504
489 cpc.reason = test_opt(sbi, FASTBOOT) ? CP_UMOUNT : CP_SYNC; 505 cpc.reason = __get_cp_reason(sbi);
506
490 mutex_lock(&sbi->gc_mutex); 507 mutex_lock(&sbi->gc_mutex);
491 write_checkpoint(sbi, &cpc); 508 write_checkpoint(sbi, &cpc);
492 mutex_unlock(&sbi->gc_mutex); 509 mutex_unlock(&sbi->gc_mutex);
493 } else { 510 } else {
494 f2fs_balance_fs(sbi); 511 f2fs_balance_fs(sbi);
495 } 512 }
513 f2fs_trace_ios(NULL, NULL, 1);
496 514
497 return 0; 515 return 0;
498} 516}
@@ -887,7 +905,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
887 atomic_set(&sbi->nr_pages[i], 0); 905 atomic_set(&sbi->nr_pages[i], 0);
888 906
889 sbi->dir_level = DEF_DIR_LEVEL; 907 sbi->dir_level = DEF_DIR_LEVEL;
890 sbi->need_fsck = false; 908 clear_sbi_flag(sbi, SBI_NEED_FSCK);
891} 909}
892 910
893/* 911/*
@@ -942,6 +960,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
942 struct inode *root; 960 struct inode *root;
943 long err = -EINVAL; 961 long err = -EINVAL;
944 bool retry = true; 962 bool retry = true;
963 char *options = NULL;
945 int i; 964 int i;
946 965
947try_onemore: 966try_onemore:
@@ -973,9 +992,15 @@ try_onemore:
973 set_opt(sbi, POSIX_ACL); 992 set_opt(sbi, POSIX_ACL);
974#endif 993#endif
975 /* parse mount options */ 994 /* parse mount options */
976 err = parse_options(sb, (char *)data); 995 options = kstrdup((const char *)data, GFP_KERNEL);
977 if (err) 996 if (data && !options) {
997 err = -ENOMEM;
978 goto free_sb_buf; 998 goto free_sb_buf;
999 }
1000
1001 err = parse_options(sb, options);
1002 if (err)
1003 goto free_options;
979 1004
980 sb->s_maxbytes = max_file_size(le32_to_cpu(raw_super->log_blocksize)); 1005 sb->s_maxbytes = max_file_size(le32_to_cpu(raw_super->log_blocksize));
981 sb->s_max_links = F2FS_LINK_MAX; 1006 sb->s_max_links = F2FS_LINK_MAX;
@@ -998,7 +1023,7 @@ try_onemore:
998 mutex_init(&sbi->writepages); 1023 mutex_init(&sbi->writepages);
999 mutex_init(&sbi->cp_mutex); 1024 mutex_init(&sbi->cp_mutex);
1000 init_rwsem(&sbi->node_write); 1025 init_rwsem(&sbi->node_write);
1001 sbi->por_doing = false; 1026 clear_sbi_flag(sbi, SBI_POR_DOING);
1002 spin_lock_init(&sbi->stat_lock); 1027 spin_lock_init(&sbi->stat_lock);
1003 1028
1004 init_rwsem(&sbi->read_io.io_rwsem); 1029 init_rwsem(&sbi->read_io.io_rwsem);
@@ -1019,7 +1044,7 @@ try_onemore:
1019 if (IS_ERR(sbi->meta_inode)) { 1044 if (IS_ERR(sbi->meta_inode)) {
1020 f2fs_msg(sb, KERN_ERR, "Failed to read F2FS meta data inode"); 1045 f2fs_msg(sb, KERN_ERR, "Failed to read F2FS meta data inode");
1021 err = PTR_ERR(sbi->meta_inode); 1046 err = PTR_ERR(sbi->meta_inode);
1022 goto free_sb_buf; 1047 goto free_options;
1023 } 1048 }
1024 1049
1025 err = get_valid_checkpoint(sbi); 1050 err = get_valid_checkpoint(sbi);
@@ -1122,10 +1147,19 @@ try_onemore:
1122 goto free_proc; 1147 goto free_proc;
1123 1148
1124 if (!retry) 1149 if (!retry)
1125 sbi->need_fsck = true; 1150 set_sbi_flag(sbi, SBI_NEED_FSCK);
1126 1151
1127 /* recover fsynced data */ 1152 /* recover fsynced data */
1128 if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { 1153 if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) {
1154 /*
1155 * the mount should fail when the device is read-only and the
1156 * previous checkpoint was not done by a clean system shutdown.
1157 */
1158 if (bdev_read_only(sb->s_bdev) &&
1159 !is_set_ckpt_flags(sbi->ckpt, CP_UMOUNT_FLAG)) {
1160 err = -EROFS;
1161 goto free_kobj;
1162 }
1129 err = recover_fsync_data(sbi); 1163 err = recover_fsync_data(sbi);
1130 if (err) { 1164 if (err) {
1131 f2fs_msg(sb, KERN_ERR, 1165 f2fs_msg(sb, KERN_ERR,
@@ -1144,6 +1178,7 @@ try_onemore:
1144 if (err) 1178 if (err)
1145 goto free_kobj; 1179 goto free_kobj;
1146 } 1180 }
1181 kfree(options);
1147 return 0; 1182 return 0;
1148 1183
1149free_kobj: 1184free_kobj:
@@ -1168,6 +1203,8 @@ free_cp:
1168free_meta_inode: 1203free_meta_inode:
1169 make_bad_inode(sbi->meta_inode); 1204 make_bad_inode(sbi->meta_inode);
1170 iput(sbi->meta_inode); 1205 iput(sbi->meta_inode);
1206free_options:
1207 kfree(options);
1171free_sb_buf: 1208free_sb_buf:
1172 brelse(raw_super_buf); 1209 brelse(raw_super_buf);
1173free_sbi: 1210free_sbi:
@@ -1188,11 +1225,18 @@ static struct dentry *f2fs_mount(struct file_system_type *fs_type, int flags,
1188 return mount_bdev(fs_type, flags, dev_name, data, f2fs_fill_super); 1225 return mount_bdev(fs_type, flags, dev_name, data, f2fs_fill_super);
1189} 1226}
1190 1227
1228static void kill_f2fs_super(struct super_block *sb)
1229{
1230 if (sb->s_root)
1231 set_sbi_flag(F2FS_SB(sb), SBI_IS_CLOSE);
1232 kill_block_super(sb);
1233}
1234
1191static struct file_system_type f2fs_fs_type = { 1235static struct file_system_type f2fs_fs_type = {
1192 .owner = THIS_MODULE, 1236 .owner = THIS_MODULE,
1193 .name = "f2fs", 1237 .name = "f2fs",
1194 .mount = f2fs_mount, 1238 .mount = f2fs_mount,
1195 .kill_sb = kill_block_super, 1239 .kill_sb = kill_f2fs_super,
1196 .fs_flags = FS_REQUIRES_DEV, 1240 .fs_flags = FS_REQUIRES_DEV,
1197}; 1241};
1198MODULE_ALIAS_FS("f2fs"); 1242MODULE_ALIAS_FS("f2fs");
@@ -1220,6 +1264,8 @@ static int __init init_f2fs_fs(void)
1220{ 1264{
1221 int err; 1265 int err;
1222 1266
1267 f2fs_build_trace_ios();
1268
1223 err = init_inodecache(); 1269 err = init_inodecache();
1224 if (err) 1270 if (err)
1225 goto fail; 1271 goto fail;
@@ -1229,12 +1275,9 @@ static int __init init_f2fs_fs(void)
1229 err = create_segment_manager_caches(); 1275 err = create_segment_manager_caches();
1230 if (err) 1276 if (err)
1231 goto free_node_manager_caches; 1277 goto free_node_manager_caches;
1232 err = create_gc_caches();
1233 if (err)
1234 goto free_segment_manager_caches;
1235 err = create_checkpoint_caches(); 1278 err = create_checkpoint_caches();
1236 if (err) 1279 if (err)
1237 goto free_gc_caches; 1280 goto free_segment_manager_caches;
1238 f2fs_kset = kset_create_and_add("f2fs", NULL, fs_kobj); 1281 f2fs_kset = kset_create_and_add("f2fs", NULL, fs_kobj);
1239 if (!f2fs_kset) { 1282 if (!f2fs_kset) {
1240 err = -ENOMEM; 1283 err = -ENOMEM;
@@ -1251,8 +1294,6 @@ free_kset:
1251 kset_unregister(f2fs_kset); 1294 kset_unregister(f2fs_kset);
1252free_checkpoint_caches: 1295free_checkpoint_caches:
1253 destroy_checkpoint_caches(); 1296 destroy_checkpoint_caches();
1254free_gc_caches:
1255 destroy_gc_caches();
1256free_segment_manager_caches: 1297free_segment_manager_caches:
1257 destroy_segment_manager_caches(); 1298 destroy_segment_manager_caches();
1258free_node_manager_caches: 1299free_node_manager_caches:
@@ -1269,11 +1310,11 @@ static void __exit exit_f2fs_fs(void)
1269 f2fs_destroy_root_stats(); 1310 f2fs_destroy_root_stats();
1270 unregister_filesystem(&f2fs_fs_type); 1311 unregister_filesystem(&f2fs_fs_type);
1271 destroy_checkpoint_caches(); 1312 destroy_checkpoint_caches();
1272 destroy_gc_caches();
1273 destroy_segment_manager_caches(); 1313 destroy_segment_manager_caches();
1274 destroy_node_manager_caches(); 1314 destroy_node_manager_caches();
1275 destroy_inodecache(); 1315 destroy_inodecache();
1276 kset_unregister(f2fs_kset); 1316 kset_unregister(f2fs_kset);
1317 f2fs_destroy_trace_ios();
1277} 1318}
1278 1319
1279module_init(init_f2fs_fs) 1320module_init(init_f2fs_fs)
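Two details in the super.c hunks are easy to miss: norecovery is accepted only on read-only mounts (and implies disable_roll_forward), and f2fs_fill_super() now parses a kstrdup()'d copy of the mount options instead of the caller's data buffer. match_token()-style parsing walks the string with strsep(), which mangles it in place, and fill_super can loop back to try_onemore, so the original buffer has to survive the first pass. A standalone user-space sketch of that hazard, assuming nothing beyond libc:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char data[] = "background_gc=on,discard,norecovery";
	char copy[sizeof(data)];
	char *cur, *p;

	memcpy(copy, data, sizeof(data));	/* the kstrdup() stand-in */
	cur = copy;
	while ((p = strsep(&cur, ",")) != NULL)
		printf("token: %s\n", p);	/* copy is chopped up now */

	printf("original intact for a retry: %s\n", data);
	return 0;
}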
diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c
new file mode 100644
index 000000000000..875aa8179bc1
--- /dev/null
+++ b/fs/f2fs/trace.c
@@ -0,0 +1,159 @@
1/*
2 * f2fs IO tracer
3 *
4 * Copyright (c) 2014 Motorola Mobility
5 * Copyright (c) 2014 Jaegeuk Kim <jaegeuk@kernel.org>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11#include <linux/fs.h>
12#include <linux/f2fs_fs.h>
13#include <linux/sched.h>
14#include <linux/radix-tree.h>
15
16#include "f2fs.h"
17#include "trace.h"
18
19static RADIX_TREE(pids, GFP_ATOMIC);
20static spinlock_t pids_lock;
21static struct last_io_info last_io;
22
23static inline void __print_last_io(void)
24{
25 if (!last_io.len)
26 return;
27
28 trace_printk("%3x:%3x %4x %-16s %2x %5x %12x %4x\n",
29 last_io.major, last_io.minor,
30 last_io.pid, "----------------",
31 last_io.type,
32 last_io.fio.rw, last_io.fio.blk_addr,
33 last_io.len);
34 memset(&last_io, 0, sizeof(last_io));
35}
36
37static int __file_type(struct inode *inode, pid_t pid)
38{
39 if (f2fs_is_atomic_file(inode))
40 return __ATOMIC_FILE;
41 else if (f2fs_is_volatile_file(inode))
42 return __VOLATILE_FILE;
43 else if (S_ISDIR(inode->i_mode))
44 return __DIR_FILE;
45 else if (inode->i_ino == F2FS_NODE_INO(F2FS_I_SB(inode)))
46 return __NODE_FILE;
47 else if (inode->i_ino == F2FS_META_INO(F2FS_I_SB(inode)))
48 return __META_FILE;
49 else if (pid)
50 return __NORMAL_FILE;
51 else
52 return __MISC_FILE;
53}
54
55void f2fs_trace_pid(struct page *page)
56{
57 struct inode *inode = page->mapping->host;
58 pid_t pid = task_pid_nr(current);
59 void *p;
60
61 page->private = pid;
62
63 if (radix_tree_preload(GFP_NOFS))
64 return;
65
66 spin_lock(&pids_lock);
67 p = radix_tree_lookup(&pids, pid);
68 if (p == current)
69 goto out;
70 if (p)
71 radix_tree_delete(&pids, pid);
72
73 f2fs_radix_tree_insert(&pids, pid, current);
74
75 trace_printk("%3x:%3x %4x %-16s\n",
76 MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev),
77 pid, current->comm);
78out:
79 spin_unlock(&pids_lock);
80 radix_tree_preload_end();
81}
82
83void f2fs_trace_ios(struct page *page, struct f2fs_io_info *fio, int flush)
84{
85 struct inode *inode;
86 pid_t pid;
87 int major, minor;
88
89 if (flush) {
90 __print_last_io();
91 return;
92 }
93
94 inode = page->mapping->host;
95 pid = page_private(page);
96
97 major = MAJOR(inode->i_sb->s_dev);
98 minor = MINOR(inode->i_sb->s_dev);
99
100 if (last_io.major == major && last_io.minor == minor &&
101 last_io.pid == pid &&
102 last_io.type == __file_type(inode, pid) &&
103 last_io.fio.rw == fio->rw &&
104 last_io.fio.blk_addr + last_io.len == fio->blk_addr) {
105 last_io.len++;
106 return;
107 }
108
109 __print_last_io();
110
111 last_io.major = major;
112 last_io.minor = minor;
113 last_io.pid = pid;
114 last_io.type = __file_type(inode, pid);
115 last_io.fio = *fio;
116 last_io.len = 1;
117 return;
118}
119
120void f2fs_build_trace_ios(void)
121{
122 spin_lock_init(&pids_lock);
123}
124
125#define PIDVEC_SIZE 128
126static unsigned int gang_lookup_pids(pid_t *results, unsigned long first_index,
127 unsigned int max_items)
128{
129 struct radix_tree_iter iter;
130 void **slot;
131 unsigned int ret = 0;
132
133 if (unlikely(!max_items))
134 return 0;
135
136 radix_tree_for_each_slot(slot, &pids, &iter, first_index) {
137 results[ret] = iter.index;
138 if (++ret == PIDVEC_SIZE)
139 break;
140 }
141 return ret;
142}
143
144void f2fs_destroy_trace_ios(void)
145{
146 pid_t pid[PIDVEC_SIZE];
147 pid_t next_pid = 0;
148 unsigned int found;
149
150 spin_lock(&pids_lock);
151 while ((found = gang_lookup_pids(pid, next_pid, PIDVEC_SIZE))) {
152 unsigned idx;
153
154 next_pid = pid[found - 1] + 1;
155 for (idx = 0; idx < found; idx++)
156 radix_tree_delete(&pids, pid[idx]);
157 }
158 spin_unlock(&pids_lock);
159}
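The tracer above keeps one pending record (last_io) and emits it only when the next IO cannot extend it: device, pid, file type and rw flags must match and the block address must continue the run. A standalone user-space sketch of that run-length merge, with the field set trimmed down and all names illustrative:

#include <stdio.h>

struct rec { int pid, rw; unsigned int blk, len; };

static struct rec last;

static void flush_rec(void)
{
	if (last.len)
		printf("pid %d rw %d blk %u len %u\n",
		       last.pid, last.rw, last.blk, last.len);
	last.len = 0;
}

static void trace_io(int pid, int rw, unsigned int blk)
{
	if (last.len && last.pid == pid && last.rw == rw &&
	    last.blk + last.len == blk) {
		last.len++;		/* contiguous: extend the record */
		return;
	}
	flush_rec();			/* discontinuity: emit and restart */
	last = (struct rec){ .pid = pid, .rw = rw, .blk = blk, .len = 1 };
}

int main(void)
{
	trace_io(1, 0, 100);
	trace_io(1, 0, 101);
	trace_io(1, 0, 102);	/* merged into one len-3 record */
	trace_io(2, 1, 500);
	flush_rec();		/* mirrors f2fs_trace_ios(NULL, NULL, 1) */
	return 0;
}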
diff --git a/fs/f2fs/trace.h b/fs/f2fs/trace.h
new file mode 100644
index 000000000000..1041dbeb52ae
--- /dev/null
+++ b/fs/f2fs/trace.h
@@ -0,0 +1,46 @@
1/*
2 * f2fs IO tracer
3 *
4 * Copyright (c) 2014 Motorola Mobility
5 * Copyright (c) 2014 Jaegeuk Kim <jaegeuk@kernel.org>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11#ifndef __F2FS_TRACE_H__
12#define __F2FS_TRACE_H__
13
14#ifdef CONFIG_F2FS_IO_TRACE
15#include <trace/events/f2fs.h>
16
17enum file_type {
18 __NORMAL_FILE,
19 __DIR_FILE,
20 __NODE_FILE,
21 __META_FILE,
22 __ATOMIC_FILE,
23 __VOLATILE_FILE,
24 __MISC_FILE,
25};
26
27struct last_io_info {
28 int major, minor;
29 pid_t pid;
30 enum file_type type;
31 struct f2fs_io_info fio;
32 block_t len;
33};
34
35extern void f2fs_trace_pid(struct page *);
36extern void f2fs_trace_ios(struct page *, struct f2fs_io_info *, int);
37extern void f2fs_build_trace_ios(void);
38extern void f2fs_destroy_trace_ios(void);
39#else
40#define f2fs_trace_pid(p)
41#define f2fs_trace_ios(p, i, n)
42#define f2fs_build_trace_ios()
43#define f2fs_destroy_trace_ios()
44
45#endif
46#endif /* __F2FS_TRACE_H__ */
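The #else branch above is the usual compiled-out-hook pattern: with CONFIG_F2FS_IO_TRACE disabled, each call site collapses to an empty statement and needs no #ifdef of its own. A tiny standalone demonstration (the macro name is invented for the demo):

#include <stdio.h>

#ifdef DEMO_TRACE
#define trace_pid(p)	printf("pid hook: %d\n", (p))
#else
#define trace_pid(p)	/* expands to nothing */
#endif

int main(void)
{
	trace_pid(42);	/* a bare ';' unless built with -DDEMO_TRACE */
	return 0;
}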
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 7b41a2dcdd76..497c7c5263c7 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -580,7 +580,7 @@ static void fat_set_state(struct super_block *sb,
580{ 580{
581 struct buffer_head *bh; 581 struct buffer_head *bh;
582 struct fat_boot_sector *b; 582 struct fat_boot_sector *b;
583 struct msdos_sb_info *sbi = sb->s_fs_info; 583 struct msdos_sb_info *sbi = MSDOS_SB(sb);
584 584
585 /* do not change anything if mounted read-only */ 585
586 if ((sb->s_flags & MS_RDONLY) && !force) 586 if ((sb->s_flags & MS_RDONLY) && !force)
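The fat change above swaps a raw sb->s_fs_info read for the MSDOS_SB() accessor, the usual idiom for the per-filesystem private pointer: one typed helper instead of scattered casts. A sketch of the accessor's shape, with placeholder struct names rather than the real fat headers:

struct super_block_sketch {
	void *s_fs_info;		/* filesystem-private data */
};

struct msdos_sb_info_sketch {
	unsigned short fat_bits;	/* ... plus the rest of the fat state */
};

static inline struct msdos_sb_info_sketch *
MSDOS_SB_sketch(struct super_block_sketch *sb)
{
	return sb->s_fs_info;		/* void * converts implicitly in C */
}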
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 2d609a5fbfea..073657f755d4 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -66,15 +66,21 @@ int writeback_in_progress(struct backing_dev_info *bdi)
66} 66}
67EXPORT_SYMBOL(writeback_in_progress); 67EXPORT_SYMBOL(writeback_in_progress);
68 68
69static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) 69struct backing_dev_info *inode_to_bdi(struct inode *inode)
70{ 70{
71 struct super_block *sb = inode->i_sb; 71 struct super_block *sb;
72 72
73 if (sb_is_blkdev_sb(sb)) 73 if (!inode)
74 return inode->i_mapping->backing_dev_info; 74 return &noop_backing_dev_info;
75 75
76 sb = inode->i_sb;
77#ifdef CONFIG_BLOCK
78 if (sb_is_blkdev_sb(sb))
79 return blk_get_backing_dev_info(I_BDEV(inode));
80#endif
76 return sb->s_bdi; 81 return sb->s_bdi;
77} 82}
83EXPORT_SYMBOL_GPL(inode_to_bdi);
78 84
79static inline struct inode *wb_inode(struct list_head *head) 85static inline struct inode *wb_inode(struct list_head *head)
80{ 86{
@@ -247,14 +253,19 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
247 return ret; 253 return ret;
248} 254}
249 255
256#define EXPIRE_DIRTY_ATIME 0x0001
257
250/* 258/*
251 * Move expired (dirtied before work->older_than_this) dirty inodes from 259 * Move expired (dirtied before work->older_than_this) dirty inodes from
252 * @delaying_queue to @dispatch_queue. 260 * @delaying_queue to @dispatch_queue.
253 */ 261 */
254static int move_expired_inodes(struct list_head *delaying_queue, 262static int move_expired_inodes(struct list_head *delaying_queue,
255 struct list_head *dispatch_queue, 263 struct list_head *dispatch_queue,
264 int flags,
256 struct wb_writeback_work *work) 265 struct wb_writeback_work *work)
257{ 266{
267 unsigned long *older_than_this = NULL;
268 unsigned long expire_time;
258 LIST_HEAD(tmp); 269 LIST_HEAD(tmp);
259 struct list_head *pos, *node; 270 struct list_head *pos, *node;
260 struct super_block *sb = NULL; 271 struct super_block *sb = NULL;
@@ -262,13 +273,21 @@ static int move_expired_inodes(struct list_head *delaying_queue,
262 int do_sb_sort = 0; 273 int do_sb_sort = 0;
263 int moved = 0; 274 int moved = 0;
264 275
276 if ((flags & EXPIRE_DIRTY_ATIME) == 0)
277 older_than_this = work->older_than_this;
278 else if ((work->reason == WB_REASON_SYNC) == 0) {
279 expire_time = jiffies - (HZ * 86400);
280 older_than_this = &expire_time;
281 }
265 while (!list_empty(delaying_queue)) { 282 while (!list_empty(delaying_queue)) {
266 inode = wb_inode(delaying_queue->prev); 283 inode = wb_inode(delaying_queue->prev);
267 if (work->older_than_this && 284 if (older_than_this &&
268 inode_dirtied_after(inode, *work->older_than_this)) 285 inode_dirtied_after(inode, *older_than_this))
269 break; 286 break;
270 list_move(&inode->i_wb_list, &tmp); 287 list_move(&inode->i_wb_list, &tmp);
271 moved++; 288 moved++;
289 if (flags & EXPIRE_DIRTY_ATIME)
290 set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state);
272 if (sb_is_blkdev_sb(inode->i_sb)) 291 if (sb_is_blkdev_sb(inode->i_sb))
273 continue; 292 continue;
274 if (sb && sb != inode->i_sb) 293 if (sb && sb != inode->i_sb)
@@ -309,9 +328,12 @@ out:
309static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work) 328static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
310{ 329{
311 int moved; 330 int moved;
331
312 assert_spin_locked(&wb->list_lock); 332 assert_spin_locked(&wb->list_lock);
313 list_splice_init(&wb->b_more_io, &wb->b_io); 333 list_splice_init(&wb->b_more_io, &wb->b_io);
314 moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, work); 334 moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work);
335 moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
336 EXPIRE_DIRTY_ATIME, work);
315 trace_writeback_queue_io(wb, work, moved); 337 trace_writeback_queue_io(wb, work, moved);
316} 338}
317 339
@@ -435,6 +457,8 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
435 * updates after data IO completion. 457 * updates after data IO completion.
436 */ 458 */
437 redirty_tail(inode, wb); 459 redirty_tail(inode, wb);
460 } else if (inode->i_state & I_DIRTY_TIME) {
461 list_move(&inode->i_wb_list, &wb->b_dirty_time);
438 } else { 462 } else {
439 /* The inode is clean. Remove from writeback lists. */ 463 /* The inode is clean. Remove from writeback lists. */
440 list_del_init(&inode->i_wb_list); 464 list_del_init(&inode->i_wb_list);
@@ -481,7 +505,13 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
481 spin_lock(&inode->i_lock); 505 spin_lock(&inode->i_lock);
482 506
483 dirty = inode->i_state & I_DIRTY; 507 dirty = inode->i_state & I_DIRTY;
484 inode->i_state &= ~I_DIRTY; 508 if (((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) &&
509 (inode->i_state & I_DIRTY_TIME)) ||
510 (inode->i_state & I_DIRTY_TIME_EXPIRED)) {
511 dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED;
512 trace_writeback_lazytime(inode);
513 }
514 inode->i_state &= ~dirty;
485 515
486 /* 516 /*
487 * Paired with smp_mb() in __mark_inode_dirty(). This allows 517 * Paired with smp_mb() in __mark_inode_dirty(). This allows
@@ -501,8 +531,10 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
501 531
502 spin_unlock(&inode->i_lock); 532 spin_unlock(&inode->i_lock);
503 533
534 if (dirty & I_DIRTY_TIME)
535 mark_inode_dirty_sync(inode);
504 /* Don't write the inode if only I_DIRTY_PAGES was set */ 536 /* Don't write the inode if only I_DIRTY_PAGES was set */
505 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { 537 if (dirty & ~I_DIRTY_PAGES) {
506 int err = write_inode(inode, wbc); 538 int err = write_inode(inode, wbc);
507 if (ret == 0) 539 if (ret == 0)
508 ret = err; 540 ret = err;
@@ -550,7 +582,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
550 * make sure inode is on some writeback list and leave it there unless 582 * make sure inode is on some writeback list and leave it there unless
551 * we have completely cleaned the inode. 583 * we have completely cleaned the inode.
552 */ 584 */
553 if (!(inode->i_state & I_DIRTY) && 585 if (!(inode->i_state & I_DIRTY_ALL) &&
554 (wbc->sync_mode != WB_SYNC_ALL || 586 (wbc->sync_mode != WB_SYNC_ALL ||
555 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK))) 587 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
556 goto out; 588 goto out;
@@ -565,7 +597,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
565 * If inode is clean, remove it from writeback lists. Otherwise don't 597 * If inode is clean, remove it from writeback lists. Otherwise don't
566 * touch it. See comment above for explanation. 598 * touch it. See comment above for explanation.
567 */ 599 */
568 if (!(inode->i_state & I_DIRTY)) 600 if (!(inode->i_state & I_DIRTY_ALL))
569 list_del_init(&inode->i_wb_list); 601 list_del_init(&inode->i_wb_list);
570 spin_unlock(&wb->list_lock); 602 spin_unlock(&wb->list_lock);
571 inode_sync_complete(inode); 603 inode_sync_complete(inode);
@@ -707,7 +739,7 @@ static long writeback_sb_inodes(struct super_block *sb,
707 wrote += write_chunk - wbc.nr_to_write; 739 wrote += write_chunk - wbc.nr_to_write;
708 spin_lock(&wb->list_lock); 740 spin_lock(&wb->list_lock);
709 spin_lock(&inode->i_lock); 741 spin_lock(&inode->i_lock);
710 if (!(inode->i_state & I_DIRTY)) 742 if (!(inode->i_state & I_DIRTY_ALL))
711 wrote++; 743 wrote++;
712 requeue_inode(inode, wb, &wbc); 744 requeue_inode(inode, wb, &wbc);
713 inode_sync_complete(inode); 745 inode_sync_complete(inode);
@@ -1145,16 +1177,20 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode)
1145 * page->mapping->host, so the page-dirtying time is recorded in the internal 1177 * page->mapping->host, so the page-dirtying time is recorded in the internal
1146 * blockdev inode. 1178 * blockdev inode.
1147 */ 1179 */
1180#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
1148void __mark_inode_dirty(struct inode *inode, int flags) 1181void __mark_inode_dirty(struct inode *inode, int flags)
1149{ 1182{
1150 struct super_block *sb = inode->i_sb; 1183 struct super_block *sb = inode->i_sb;
1151 struct backing_dev_info *bdi = NULL; 1184 struct backing_dev_info *bdi = NULL;
1185 int dirtytime;
1186
1187 trace_writeback_mark_inode_dirty(inode, flags);
1152 1188
1153 /* 1189 /*
1154 * Don't do this for I_DIRTY_PAGES - that doesn't actually 1190 * Don't do this for I_DIRTY_PAGES - that doesn't actually
1155 * dirty the inode itself 1191 * dirty the inode itself
1156 */ 1192 */
1157 if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { 1193 if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_TIME)) {
1158 trace_writeback_dirty_inode_start(inode, flags); 1194 trace_writeback_dirty_inode_start(inode, flags);
1159 1195
1160 if (sb->s_op->dirty_inode) 1196 if (sb->s_op->dirty_inode)
@@ -1162,6 +1198,9 @@ void __mark_inode_dirty(struct inode *inode, int flags)
1162 1198
1163 trace_writeback_dirty_inode(inode, flags); 1199 trace_writeback_dirty_inode(inode, flags);
1164 } 1200 }
1201 if (flags & I_DIRTY_INODE)
1202 flags &= ~I_DIRTY_TIME;
1203 dirtytime = flags & I_DIRTY_TIME;
1165 1204
1166 /* 1205 /*
1167 * Paired with smp_mb() in __writeback_single_inode() for the 1206 * Paired with smp_mb() in __writeback_single_inode() for the
@@ -1169,16 +1208,21 @@ void __mark_inode_dirty(struct inode *inode, int flags)
1169 */ 1208 */
1170 smp_mb(); 1209 smp_mb();
1171 1210
1172 if ((inode->i_state & flags) == flags) 1211 if (((inode->i_state & flags) == flags) ||
1212 (dirtytime && (inode->i_state & I_DIRTY_INODE)))
1173 return; 1213 return;
1174 1214
1175 if (unlikely(block_dump)) 1215 if (unlikely(block_dump))
1176 block_dump___mark_inode_dirty(inode); 1216 block_dump___mark_inode_dirty(inode);
1177 1217
1178 spin_lock(&inode->i_lock); 1218 spin_lock(&inode->i_lock);
1219 if (dirtytime && (inode->i_state & I_DIRTY_INODE))
1220 goto out_unlock_inode;
1179 if ((inode->i_state & flags) != flags) { 1221 if ((inode->i_state & flags) != flags) {
1180 const int was_dirty = inode->i_state & I_DIRTY; 1222 const int was_dirty = inode->i_state & I_DIRTY;
1181 1223
1224 if (flags & I_DIRTY_INODE)
1225 inode->i_state &= ~I_DIRTY_TIME;
1182 inode->i_state |= flags; 1226 inode->i_state |= flags;
1183 1227
1184 /* 1228 /*
@@ -1225,8 +1269,10 @@ void __mark_inode_dirty(struct inode *inode, int flags)
1225 } 1269 }
1226 1270
1227 inode->dirtied_when = jiffies; 1271 inode->dirtied_when = jiffies;
1228 list_move(&inode->i_wb_list, &bdi->wb.b_dirty); 1272 list_move(&inode->i_wb_list, dirtytime ?
1273 &bdi->wb.b_dirty_time : &bdi->wb.b_dirty);
1229 spin_unlock(&bdi->wb.list_lock); 1274 spin_unlock(&bdi->wb.list_lock);
1275 trace_writeback_dirty_inode_enqueue(inode);
1230 1276
1231 if (wakeup_bdi) 1277 if (wakeup_bdi)
1232 bdi_wakeup_thread_delayed(bdi); 1278 bdi_wakeup_thread_delayed(bdi);
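The fs-writeback hunks above are the lazytime plumbing: timestamp-only dirtiness (I_DIRTY_TIME) parks the inode on the new b_dirty_time list and is written back only once expired (a day, per the HZ * 86400 above) or once the inode goes properly dirty, while the fsync and eviction paths switch from I_DIRTY to I_DIRTY_ALL so they still notice it. A standalone sketch of the masking, using illustrative bit values rather than the kernel's:

#include <stdio.h>

#define I_DIRTY_SYNC		0x001
#define I_DIRTY_DATASYNC	0x002
#define I_DIRTY_PAGES		0x004
#define I_DIRTY_TIME		0x008

#define I_DIRTY		(I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
#define I_DIRTY_ALL	(I_DIRTY | I_DIRTY_TIME)

int main(void)
{
	int i_state = I_DIRTY_TIME;	/* only a timestamp changed */

	/* the old test: the inode looks clean and would be dropped */
	printf("I_DIRTY sees it:     %s\n", (i_state & I_DIRTY) ? "yes" : "no");
	/* the new test used by writeback_sb_inodes()/gfs2_fsync() above */
	printf("I_DIRTY_ALL sees it: %s\n", (i_state & I_DIRTY_ALL) ? "yes" : "no");
	return 0;
}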
diff --git a/fs/fs_pin.c b/fs/fs_pin.c
index 9368236ca100..b06c98796afb 100644
--- a/fs/fs_pin.c
+++ b/fs/fs_pin.c
@@ -1,78 +1,102 @@
1#include <linux/fs.h> 1#include <linux/fs.h>
2#include <linux/sched.h>
2#include <linux/slab.h> 3#include <linux/slab.h>
3#include <linux/fs_pin.h>
4#include "internal.h" 4#include "internal.h"
5#include "mount.h" 5#include "mount.h"
6 6
7static void pin_free_rcu(struct rcu_head *head)
8{
9 kfree(container_of(head, struct fs_pin, rcu));
10}
11
12static DEFINE_SPINLOCK(pin_lock); 7static DEFINE_SPINLOCK(pin_lock);
13 8
14void pin_put(struct fs_pin *p)
15{
16 if (atomic_long_dec_and_test(&p->count))
17 call_rcu(&p->rcu, pin_free_rcu);
18}
19
20void pin_remove(struct fs_pin *pin) 9void pin_remove(struct fs_pin *pin)
21{ 10{
22 spin_lock(&pin_lock); 11 spin_lock(&pin_lock);
23 hlist_del(&pin->m_list); 12 hlist_del(&pin->m_list);
24 hlist_del(&pin->s_list); 13 hlist_del(&pin->s_list);
25 spin_unlock(&pin_lock); 14 spin_unlock(&pin_lock);
15 spin_lock_irq(&pin->wait.lock);
16 pin->done = 1;
17 wake_up_locked(&pin->wait);
18 spin_unlock_irq(&pin->wait.lock);
26} 19}
27 20
28void pin_insert(struct fs_pin *pin, struct vfsmount *m) 21void pin_insert_group(struct fs_pin *pin, struct vfsmount *m, struct hlist_head *p)
29{ 22{
30 spin_lock(&pin_lock); 23 spin_lock(&pin_lock);
31 hlist_add_head(&pin->s_list, &m->mnt_sb->s_pins); 24 if (p)
25 hlist_add_head(&pin->s_list, p);
32 hlist_add_head(&pin->m_list, &real_mount(m)->mnt_pins); 26 hlist_add_head(&pin->m_list, &real_mount(m)->mnt_pins);
33 spin_unlock(&pin_lock); 27 spin_unlock(&pin_lock);
34} 28}
35 29
30void pin_insert(struct fs_pin *pin, struct vfsmount *m)
31{
32 pin_insert_group(pin, m, &m->mnt_sb->s_pins);
33}
34
35void pin_kill(struct fs_pin *p)
36{
37 wait_queue_t wait;
38
39 if (!p) {
40 rcu_read_unlock();
41 return;
42 }
43 init_wait(&wait);
44 spin_lock_irq(&p->wait.lock);
45 if (likely(!p->done)) {
46 p->done = -1;
47 spin_unlock_irq(&p->wait.lock);
48 rcu_read_unlock();
49 p->kill(p);
50 return;
51 }
52 if (p->done > 0) {
53 spin_unlock_irq(&p->wait.lock);
54 rcu_read_unlock();
55 return;
56 }
57 __add_wait_queue(&p->wait, &wait);
58 while (1) {
59 set_current_state(TASK_UNINTERRUPTIBLE);
60 spin_unlock_irq(&p->wait.lock);
61 rcu_read_unlock();
62 schedule();
63 rcu_read_lock();
64 if (likely(list_empty(&wait.task_list)))
65 break;
66 /* OK, we know p couldn't have been freed yet */
67 spin_lock_irq(&p->wait.lock);
68 if (p->done > 0) {
69 spin_unlock_irq(&p->wait.lock);
70 break;
71 }
72 }
73 rcu_read_unlock();
74}
75
36void mnt_pin_kill(struct mount *m) 76void mnt_pin_kill(struct mount *m)
37{ 77{
38 while (1) { 78 while (1) {
39 struct hlist_node *p; 79 struct hlist_node *p;
40 struct fs_pin *pin;
41 rcu_read_lock(); 80 rcu_read_lock();
42 p = ACCESS_ONCE(m->mnt_pins.first); 81 p = ACCESS_ONCE(m->mnt_pins.first);
43 if (!p) { 82 if (!p) {
44 rcu_read_unlock(); 83 rcu_read_unlock();
45 break; 84 break;
46 } 85 }
47 pin = hlist_entry(p, struct fs_pin, m_list); 86 pin_kill(hlist_entry(p, struct fs_pin, m_list));
48 if (!atomic_long_inc_not_zero(&pin->count)) {
49 rcu_read_unlock();
50 cpu_relax();
51 continue;
52 }
53 rcu_read_unlock();
54 pin->kill(pin);
55 } 87 }
56} 88}
57 89
58void sb_pin_kill(struct super_block *sb) 90void group_pin_kill(struct hlist_head *p)
59{ 91{
60 while (1) { 92 while (1) {
61 struct hlist_node *p; 93 struct hlist_node *q;
62 struct fs_pin *pin;
63 rcu_read_lock(); 94 rcu_read_lock();
64 p = ACCESS_ONCE(sb->s_pins.first); 95 q = ACCESS_ONCE(p->first);
65 if (!p) { 96 if (!q) {
66 rcu_read_unlock(); 97 rcu_read_unlock();
67 break; 98 break;
68 } 99 }
69 pin = hlist_entry(p, struct fs_pin, s_list); 100 pin_kill(hlist_entry(q, struct fs_pin, s_list));
70 if (!atomic_long_inc_not_zero(&pin->count)) {
71 rcu_read_unlock();
72 cpu_relax();
73 continue;
74 }
75 rcu_read_unlock();
76 pin->kill(pin);
77 } 101 }
78} 102}
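The fs_pin rework above trades the refcount-plus-RCU-free scheme for a waitqueue handshake: the first caller of pin_kill() marks done = -1 and runs ->kill(), which must end in pin_remove() setting done = 1, while any concurrent killer sleeps on pin->wait until that happens. A standalone user-space analogue of that state machine, substituting a mutex/condvar for the wait-queue lock and omitting the RCU lifetime handling entirely:

#include <pthread.h>
#include <stdio.h>

struct pin {
	pthread_mutex_t lock;
	pthread_cond_t wait;
	int done;	/* 0 = live, -1 = being killed, 1 = removed */
};

static void pin_remove(struct pin *p)
{
	pthread_mutex_lock(&p->lock);
	p->done = 1;
	pthread_cond_broadcast(&p->wait);	/* wake any waiting killers */
	pthread_mutex_unlock(&p->lock);
}

static void pin_kill(struct pin *p, void (*kill)(struct pin *))
{
	pthread_mutex_lock(&p->lock);
	if (p->done == 0) {
		p->done = -1;			/* we are the killer */
		pthread_mutex_unlock(&p->lock);
		kill(p);			/* must end in pin_remove() */
		return;
	}
	while (p->done <= 0)			/* someone else is killing it */
		pthread_cond_wait(&p->wait, &p->lock);
	pthread_mutex_unlock(&p->lock);
}

static void do_kill(struct pin *p)
{
	pin_remove(p);
}

int main(void)
{
	struct pin p = { PTHREAD_MUTEX_INITIALIZER,
			 PTHREAD_COND_INITIALIZER, 0 };

	pin_kill(&p, do_kill);
	printf("done = %d\n", p.done);		/* prints 1 */
	return 0;
}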
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 760b2c552197..c01ec3bdcfd8 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1159,7 +1159,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1159 mutex_lock(&inode->i_mutex); 1159 mutex_lock(&inode->i_mutex);
1160 1160
1161 /* We can write back this queue in page reclaim */ 1161 /* We can write back this queue in page reclaim */
1162 current->backing_dev_info = mapping->backing_dev_info; 1162 current->backing_dev_info = inode_to_bdi(inode);
1163 1163
1164 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); 1164 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
1165 if (err) 1165 if (err)
@@ -1464,7 +1464,7 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
1464{ 1464{
1465 struct inode *inode = req->inode; 1465 struct inode *inode = req->inode;
1466 struct fuse_inode *fi = get_fuse_inode(inode); 1466 struct fuse_inode *fi = get_fuse_inode(inode);
1467 struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info; 1467 struct backing_dev_info *bdi = inode_to_bdi(inode);
1468 int i; 1468 int i;
1469 1469
1470 list_del(&req->writepages_entry); 1470 list_del(&req->writepages_entry);
@@ -1658,7 +1658,7 @@ static int fuse_writepage_locked(struct page *page)
1658 req->end = fuse_writepage_end; 1658 req->end = fuse_writepage_end;
1659 req->inode = inode; 1659 req->inode = inode;
1660 1660
1661 inc_bdi_stat(mapping->backing_dev_info, BDI_WRITEBACK); 1661 inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK);
1662 inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); 1662 inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
1663 1663
1664 spin_lock(&fc->lock); 1664 spin_lock(&fc->lock);
@@ -1768,7 +1768,7 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req,
1768 1768
1769 if (old_req->num_pages == 1 && (old_req->state == FUSE_REQ_INIT || 1769 if (old_req->num_pages == 1 && (old_req->state == FUSE_REQ_INIT ||
1770 old_req->state == FUSE_REQ_PENDING)) { 1770 old_req->state == FUSE_REQ_PENDING)) {
1771 struct backing_dev_info *bdi = page->mapping->backing_dev_info; 1771 struct backing_dev_info *bdi = inode_to_bdi(page->mapping->host);
1772 1772
1773 copy_highpage(old_req->pages[0], page); 1773 copy_highpage(old_req->pages[0], page);
1774 spin_unlock(&fc->lock); 1774 spin_unlock(&fc->lock);
@@ -1872,7 +1872,7 @@ static int fuse_writepages_fill(struct page *page,
1872 req->page_descs[req->num_pages].offset = 0; 1872 req->page_descs[req->num_pages].offset = 0;
1873 req->page_descs[req->num_pages].length = PAGE_SIZE; 1873 req->page_descs[req->num_pages].length = PAGE_SIZE;
1874 1874
1875 inc_bdi_stat(page->mapping->backing_dev_info, BDI_WRITEBACK); 1875 inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK);
1876 inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); 1876 inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
1877 1877
1878 err = 0; 1878 err = 0;
@@ -2062,7 +2062,6 @@ static const struct vm_operations_struct fuse_file_vm_ops = {
2062 .fault = filemap_fault, 2062 .fault = filemap_fault,
2063 .map_pages = filemap_map_pages, 2063 .map_pages = filemap_map_pages,
2064 .page_mkwrite = fuse_page_mkwrite, 2064 .page_mkwrite = fuse_page_mkwrite,
2065 .remap_pages = generic_file_remap_pages,
2066}; 2065};
2067 2066
2068static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) 2067static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
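Every fuse hunk above is the same mechanical conversion: stop reading a bdi pointer cached in the address_space and derive it on demand with the inode_to_bdi() helper exported near the top of the fs-writeback diff. A standalone sketch of that helper's shape, with stand-in types (the real one also special-cases block-device inodes):

struct bdi_sketch { const char *name; };
struct sb_sketch { struct bdi_sketch *s_bdi; };
struct inode_sketch { struct sb_sketch *i_sb; };

static struct bdi_sketch noop_bdi = { "noop" };

static struct bdi_sketch *inode_to_bdi_sketch(struct inode_sketch *inode)
{
	if (!inode)
		return &noop_bdi;	/* mirrors the NULL check above */
	return inode->i_sb->s_bdi;
}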
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index f38256e4476e..e8799c11424b 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -308,7 +308,6 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
308 if (!fc->writeback_cache || !S_ISREG(attr->mode)) 308 if (!fc->writeback_cache || !S_ISREG(attr->mode))
309 inode->i_flags |= S_NOCMTIME; 309 inode->i_flags |= S_NOCMTIME;
310 inode->i_generation = generation; 310 inode->i_generation = generation;
311 inode->i_data.backing_dev_info = &fc->bdi;
312 fuse_init_inode(inode, attr); 311 fuse_init_inode(inode, attr);
313 unlock_new_inode(inode); 312 unlock_new_inode(inode);
314 } else if ((inode->i_mode ^ attr->mode) & S_IFMT) { 313 } else if ((inode->i_mode ^ attr->mode) & S_IFMT) {
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 3088e2a38e30..7b3143064af1 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -73,7 +73,7 @@ int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
73 73
74 BUG_ON(name == NULL); 74 BUG_ON(name == NULL);
75 75
76 if (acl->a_count > GFS2_ACL_MAX_ENTRIES(GFS2_SB(inode))) 76 if (acl && acl->a_count > GFS2_ACL_MAX_ENTRIES(GFS2_SB(inode)))
77 return -E2BIG; 77 return -E2BIG;
78 78
79 if (type == ACL_TYPE_ACCESS) { 79 if (type == ACL_TYPE_ACCESS) {
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 805b37fed638..4ad4f94edebe 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -289,7 +289,7 @@ continue_unlock:
289 if (!clear_page_dirty_for_io(page)) 289 if (!clear_page_dirty_for_io(page))
290 goto continue_unlock; 290 goto continue_unlock;
291 291
292 trace_wbc_writepage(wbc, mapping->backing_dev_info); 292 trace_wbc_writepage(wbc, inode_to_bdi(inode));
293 293
294 ret = __gfs2_jdata_writepage(page, wbc); 294 ret = __gfs2_jdata_writepage(page, wbc);
295 if (unlikely(ret)) { 295 if (unlikely(ret)) {
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index c5a34f09e228..6371192961e2 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1896,7 +1896,8 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
1896 1896
1897 ht = kzalloc(size, GFP_NOFS | __GFP_NOWARN); 1897 ht = kzalloc(size, GFP_NOFS | __GFP_NOWARN);
1898 if (ht == NULL) 1898 if (ht == NULL)
1899 ht = vzalloc(size); 1899 ht = __vmalloc(size, GFP_NOFS | __GFP_NOWARN | __GFP_ZERO,
1900 PAGE_KERNEL);
1900 if (!ht) 1901 if (!ht)
1901 return -ENOMEM; 1902 return -ENOMEM;
1902 1903
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 6e600abf694a..3e32bb8e2d7e 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -498,7 +498,6 @@ static const struct vm_operations_struct gfs2_vm_ops = {
498 .fault = filemap_fault, 498 .fault = filemap_fault,
499 .map_pages = filemap_map_pages, 499 .map_pages = filemap_map_pages,
500 .page_mkwrite = gfs2_page_mkwrite, 500 .page_mkwrite = gfs2_page_mkwrite,
501 .remap_pages = generic_file_remap_pages,
502}; 501};
503 502
504/** 503/**
@@ -655,7 +654,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
655{ 654{
656 struct address_space *mapping = file->f_mapping; 655 struct address_space *mapping = file->f_mapping;
657 struct inode *inode = mapping->host; 656 struct inode *inode = mapping->host;
658 int sync_state = inode->i_state & I_DIRTY; 657 int sync_state = inode->i_state & I_DIRTY_ALL;
659 struct gfs2_inode *ip = GFS2_I(inode); 658 struct gfs2_inode *ip = GFS2_I(inode);
660 int ret = 0, ret1 = 0; 659 int ret = 0, ret1 = 0;
661 660
@@ -668,7 +667,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
668 if (!gfs2_is_jdata(ip)) 667 if (!gfs2_is_jdata(ip))
669 sync_state &= ~I_DIRTY_PAGES; 668 sync_state &= ~I_DIRTY_PAGES;
670 if (datasync) 669 if (datasync)
671 sync_state &= ~I_DIRTY_SYNC; 670 sync_state &= ~(I_DIRTY_SYNC | I_DIRTY_TIME);
672 671
673 if (sync_state) { 672 if (sync_state) {
674 ret = sync_inode_metadata(inode, 1); 673 ret = sync_inode_metadata(inode, 1);
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index a23524aa3eac..f42dffba056a 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -173,19 +173,14 @@ void gfs2_glock_add_to_lru(struct gfs2_glock *gl)
173 spin_unlock(&lru_lock); 173 spin_unlock(&lru_lock);
174} 174}
175 175
176static void __gfs2_glock_remove_from_lru(struct gfs2_glock *gl) 176static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
177{ 177{
178 spin_lock(&lru_lock);
178 if (!list_empty(&gl->gl_lru)) { 179 if (!list_empty(&gl->gl_lru)) {
179 list_del_init(&gl->gl_lru); 180 list_del_init(&gl->gl_lru);
180 atomic_dec(&lru_count); 181 atomic_dec(&lru_count);
181 clear_bit(GLF_LRU, &gl->gl_flags); 182 clear_bit(GLF_LRU, &gl->gl_flags);
182 } 183 }
183}
184
185static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
186{
187 spin_lock(&lru_lock);
188 __gfs2_glock_remove_from_lru(gl);
189 spin_unlock(&lru_lock); 184 spin_unlock(&lru_lock);
190} 185}
191 186
@@ -205,9 +200,7 @@ void gfs2_glock_put(struct gfs2_glock *gl)
205 200
206 lockref_mark_dead(&gl->gl_lockref); 201 lockref_mark_dead(&gl->gl_lockref);
207 202
208 spin_lock(&lru_lock); 203 gfs2_glock_remove_from_lru(gl);
209 __gfs2_glock_remove_from_lru(gl);
210 spin_unlock(&lru_lock);
211 spin_unlock(&gl->gl_lockref.lock); 204 spin_unlock(&gl->gl_lockref.lock);
212 spin_lock_bucket(gl->gl_hash); 205 spin_lock_bucket(gl->gl_hash);
213 hlist_bl_del_rcu(&gl->gl_list); 206 hlist_bl_del_rcu(&gl->gl_list);
@@ -775,7 +768,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
775 mapping->flags = 0; 768 mapping->flags = 0;
776 mapping_set_gfp_mask(mapping, GFP_NOFS); 769 mapping_set_gfp_mask(mapping, GFP_NOFS);
777 mapping->private_data = NULL; 770 mapping->private_data = NULL;
778 mapping->backing_dev_info = s->s_bdi;
779 mapping->writeback_index = 0; 771 mapping->writeback_index = 0;
780 } 772 }
781 773
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 9054002ebe70..73c72253faac 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -543,10 +543,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
543 } 543 }
544 544
545 error = gfs2_dir_add(&dip->i_inode, name, ip, da); 545 error = gfs2_dir_add(&dip->i_inode, name, ip, da);
546 if (error)
547 goto fail_end_trans;
548 546
549fail_end_trans:
550 gfs2_trans_end(sdp); 547 gfs2_trans_end(sdp);
551fail_ipreserv: 548fail_ipreserv:
552 gfs2_inplace_release(dip); 549 gfs2_inplace_release(dip);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 8633ad328ee2..efc8e254787c 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -112,7 +112,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
112 mapping->flags = 0; 112 mapping->flags = 0;
113 mapping_set_gfp_mask(mapping, GFP_NOFS); 113 mapping_set_gfp_mask(mapping, GFP_NOFS);
114 mapping->private_data = NULL; 114 mapping->private_data = NULL;
115 mapping->backing_dev_info = sb->s_bdi;
116 mapping->writeback_index = 0; 115 mapping->writeback_index = 0;
117 116
118 spin_lock_init(&sdp->sd_log_lock); 117 spin_lock_init(&sdp->sd_log_lock);
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 3e193cb36996..3aa17d4d1cfc 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -145,7 +145,8 @@ static void gfs2_qd_dispose(struct list_head *list)
145} 145}
146 146
147 147
148static enum lru_status gfs2_qd_isolate(struct list_head *item, spinlock_t *lock, void *arg) 148static enum lru_status gfs2_qd_isolate(struct list_head *item,
149 struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
149{ 150{
150 struct list_head *dispose = arg; 151 struct list_head *dispose = arg;
151 struct gfs2_quota_data *qd = list_entry(item, struct gfs2_quota_data, qd_lru); 152 struct gfs2_quota_data *qd = list_entry(item, struct gfs2_quota_data, qd_lru);
@@ -155,7 +156,7 @@ static enum lru_status gfs2_qd_isolate(struct list_head *item, spinlock_t *lock,
155 156
156 if (qd->qd_lockref.count == 0) { 157 if (qd->qd_lockref.count == 0) {
157 lockref_mark_dead(&qd->qd_lockref); 158 lockref_mark_dead(&qd->qd_lockref);
158 list_move(&qd->qd_lru, dispose); 159 list_lru_isolate_move(lru, &qd->qd_lru, dispose);
159 } 160 }
160 161
161 spin_unlock(&qd->qd_lockref.lock); 162 spin_unlock(&qd->qd_lockref.lock);
@@ -171,8 +172,8 @@ static unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink,
171 if (!(sc->gfp_mask & __GFP_FS)) 172 if (!(sc->gfp_mask & __GFP_FS))
172 return SHRINK_STOP; 173 return SHRINK_STOP;
173 174
174 freed = list_lru_walk_node(&gfs2_qd_lru, sc->nid, gfs2_qd_isolate, 175 freed = list_lru_shrink_walk(&gfs2_qd_lru, sc,
175 &dispose, &sc->nr_to_scan); 176 gfs2_qd_isolate, &dispose);
176 177
177 gfs2_qd_dispose(&dispose); 178 gfs2_qd_dispose(&dispose);
178 179
@@ -182,7 +183,7 @@ static unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink,
182static unsigned long gfs2_qd_shrink_count(struct shrinker *shrink, 183static unsigned long gfs2_qd_shrink_count(struct shrinker *shrink,
183 struct shrink_control *sc) 184 struct shrink_control *sc)
184{ 185{
185 return vfs_pressure_ratio(list_lru_count_node(&gfs2_qd_lru, sc->nid)); 186 return vfs_pressure_ratio(list_lru_shrink_count(&gfs2_qd_lru, sc));
186} 187}
187 188
188struct shrinker gfs2_qd_shrinker = { 189struct shrinker gfs2_qd_shrinker = {
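Both the gfs2/quota.c hunks above and the fs/inode.c hunks below move to the new list_lru walker API: the isolate callback now receives the internal per-node list (struct list_lru_one) and must detach items through list_lru_isolate()/list_lru_isolate_move() so the lru's element counts stay consistent. A kernel-style sketch of a callback under the new signature; this is an illustrative fragment against kernel headers, not standalone code:

static enum lru_status demo_isolate(struct list_head *item,
		struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
{
	struct list_head *dispose = arg;

	/* hand removal to the lru code instead of a bare list_move() */
	list_lru_isolate_move(lru, item, dispose);
	return LRU_REMOVED;
}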
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 573bd3b758fa..1b645773c98e 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -439,7 +439,7 @@ static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
439 439
440 ls->ls_recover_jid_done = jid; 440 ls->ls_recover_jid_done = jid;
441 ls->ls_recover_jid_status = message; 441 ls->ls_recover_jid_status = message;
442 sprintf(env_jid, "JID=%d", jid); 442 sprintf(env_jid, "JID=%u", jid);
443 sprintf(env_status, "RECOVERY=%s", 443 sprintf(env_status, "RECOVERY=%s",
444 message == LM_RD_SUCCESS ? "Done" : "Failed"); 444 message == LM_RD_SUCCESS ? "Done" : "Failed");
445 kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp); 445 kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 5b327f837de7..1666382b198d 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -743,7 +743,7 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc)
743 struct gfs2_inode *ip = GFS2_I(inode); 743 struct gfs2_inode *ip = GFS2_I(inode);
744 struct gfs2_sbd *sdp = GFS2_SB(inode); 744 struct gfs2_sbd *sdp = GFS2_SB(inode);
745 struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl); 745 struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl);
746 struct backing_dev_info *bdi = metamapping->backing_dev_info; 746 struct backing_dev_info *bdi = inode_to_bdi(metamapping->host);
747 int ret = 0; 747 int ret = 0;
748 748
749 if (wbc->sync_mode == WB_SYNC_ALL) 749 if (wbc->sync_mode == WB_SYNC_ALL)
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 3ab566ba5696..ae8e8811f0e8 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -96,7 +96,7 @@ static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf)
96 struct super_block *sb = sdp->sd_vfs; 96 struct super_block *sb = sdp->sd_vfs;
97 int frozen = (sb->s_writers.frozen == SB_UNFROZEN) ? 0 : 1; 97 int frozen = (sb->s_writers.frozen == SB_UNFROZEN) ? 0 : 1;
98 98
99 return snprintf(buf, PAGE_SIZE, "%u\n", frozen); 99 return snprintf(buf, PAGE_SIZE, "%d\n", frozen);
100} 100}
101 101
102static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len) 102static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 5eba47f593f8..c274aca8e8dc 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -62,12 +62,6 @@ static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode)
62 return container_of(inode, struct hugetlbfs_inode_info, vfs_inode); 62 return container_of(inode, struct hugetlbfs_inode_info, vfs_inode);
63} 63}
64 64
65static struct backing_dev_info hugetlbfs_backing_dev_info = {
66 .name = "hugetlbfs",
67 .ra_pages = 0, /* No readahead */
68 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
69};
70
71int sysctl_hugetlb_shm_group; 65int sysctl_hugetlb_shm_group;
72 66
73enum { 67enum {
@@ -498,7 +492,6 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
498 lockdep_set_class(&inode->i_mapping->i_mmap_rwsem, 492 lockdep_set_class(&inode->i_mapping->i_mmap_rwsem,
499 &hugetlbfs_i_mmap_rwsem_key); 493 &hugetlbfs_i_mmap_rwsem_key);
500 inode->i_mapping->a_ops = &hugetlbfs_aops; 494 inode->i_mapping->a_ops = &hugetlbfs_aops;
501 inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
502 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 495 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
503 inode->i_mapping->private_data = resv_map; 496 inode->i_mapping->private_data = resv_map;
504 info = HUGETLBFS_I(inode); 497 info = HUGETLBFS_I(inode);
@@ -1032,10 +1025,6 @@ static int __init init_hugetlbfs_fs(void)
1032 return -ENOTSUPP; 1025 return -ENOTSUPP;
1033 } 1026 }
1034 1027
1035 error = bdi_init(&hugetlbfs_backing_dev_info);
1036 if (error)
1037 return error;
1038
1039 error = -ENOMEM; 1028 error = -ENOMEM;
1040 hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache", 1029 hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache",
1041 sizeof(struct hugetlbfs_inode_info), 1030 sizeof(struct hugetlbfs_inode_info),
@@ -1071,7 +1060,6 @@ static int __init init_hugetlbfs_fs(void)
1071 out: 1060 out:
1072 kmem_cache_destroy(hugetlbfs_inode_cachep); 1061 kmem_cache_destroy(hugetlbfs_inode_cachep);
1073 out2: 1062 out2:
1074 bdi_destroy(&hugetlbfs_backing_dev_info);
1075 return error; 1063 return error;
1076} 1064}
1077 1065
@@ -1091,7 +1079,6 @@ static void __exit exit_hugetlbfs_fs(void)
1091 for_each_hstate(h) 1079 for_each_hstate(h)
1092 kern_unmount(hugetlbfs_vfsmount[i++]); 1080 kern_unmount(hugetlbfs_vfsmount[i++]);
1093 unregister_filesystem(&hugetlbfs_fs_type); 1081 unregister_filesystem(&hugetlbfs_fs_type);
1094 bdi_destroy(&hugetlbfs_backing_dev_info);
1095} 1082}
1096 1083
1097module_init(init_hugetlbfs_fs) 1084module_init(init_hugetlbfs_fs)
diff --git a/fs/inode.c b/fs/inode.c
index aa149e7262ac..f00b16f45507 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -18,6 +18,7 @@
18#include <linux/buffer_head.h> /* for inode_has_buffers */ 18#include <linux/buffer_head.h> /* for inode_has_buffers */
19#include <linux/ratelimit.h> 19#include <linux/ratelimit.h>
20#include <linux/list_lru.h> 20#include <linux/list_lru.h>
21#include <trace/events/writeback.h>
21#include "internal.h" 22#include "internal.h"
22 23
23/* 24/*
@@ -30,7 +31,7 @@
30 * inode_sb_list_lock protects: 31 * inode_sb_list_lock protects:
31 * sb->s_inodes, inode->i_sb_list 32 * sb->s_inodes, inode->i_sb_list
32 * bdi->wb.list_lock protects: 33 * bdi->wb.list_lock protects:
33 * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list 34 * bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_wb_list
34 * inode_hash_lock protects: 35 * inode_hash_lock protects:
35 * inode_hashtable, inode->i_hash 36 * inode_hashtable, inode->i_hash
36 * 37 *
@@ -170,20 +171,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
170 atomic_set(&mapping->i_mmap_writable, 0); 171 atomic_set(&mapping->i_mmap_writable, 0);
171 mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); 172 mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
172 mapping->private_data = NULL; 173 mapping->private_data = NULL;
173 mapping->backing_dev_info = &default_backing_dev_info;
174 mapping->writeback_index = 0; 174 mapping->writeback_index = 0;
175
176 /*
177 * If the block_device provides a backing_dev_info for client
178 * inodes then use that. Otherwise the inode share the bdev's
179 * backing_dev_info.
180 */
181 if (sb->s_bdev) {
182 struct backing_dev_info *bdi;
183
184 bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
185 mapping->backing_dev_info = bdi;
186 }
187 inode->i_private = NULL; 175 inode->i_private = NULL;
188 inode->i_mapping = mapping; 176 inode->i_mapping = mapping;
189 INIT_HLIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */ 177 INIT_HLIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */
@@ -194,7 +182,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
194#ifdef CONFIG_FSNOTIFY 182#ifdef CONFIG_FSNOTIFY
195 inode->i_fsnotify_mask = 0; 183 inode->i_fsnotify_mask = 0;
196#endif 184#endif
197 185 inode->i_flctx = NULL;
198 this_cpu_inc(nr_inodes); 186 this_cpu_inc(nr_inodes);
199 187
200 return 0; 188 return 0;
@@ -237,6 +225,7 @@ void __destroy_inode(struct inode *inode)
237 BUG_ON(inode_has_buffers(inode)); 225 BUG_ON(inode_has_buffers(inode));
238 security_inode_free(inode); 226 security_inode_free(inode);
239 fsnotify_inode_delete(inode); 227 fsnotify_inode_delete(inode);
228 locks_free_lock_context(inode->i_flctx);
240 if (!inode->i_nlink) { 229 if (!inode->i_nlink) {
241 WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0); 230 WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0);
242 atomic_long_dec(&inode->i_sb->s_remove_count); 231 atomic_long_dec(&inode->i_sb->s_remove_count);
@@ -355,7 +344,6 @@ void address_space_init_once(struct address_space *mapping)
355 INIT_LIST_HEAD(&mapping->private_list); 344 INIT_LIST_HEAD(&mapping->private_list);
356 spin_lock_init(&mapping->private_lock); 345 spin_lock_init(&mapping->private_lock);
357 mapping->i_mmap = RB_ROOT; 346 mapping->i_mmap = RB_ROOT;
358 INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
359} 347}
360EXPORT_SYMBOL(address_space_init_once); 348EXPORT_SYMBOL(address_space_init_once);
361 349
@@ -416,7 +404,8 @@ static void inode_lru_list_add(struct inode *inode)
416 */ 404 */
417void inode_add_lru(struct inode *inode) 405void inode_add_lru(struct inode *inode)
418{ 406{
419 if (!(inode->i_state & (I_DIRTY | I_SYNC | I_FREEING | I_WILL_FREE)) && 407 if (!(inode->i_state & (I_DIRTY_ALL | I_SYNC |
408 I_FREEING | I_WILL_FREE)) &&
420 !atomic_read(&inode->i_count) && inode->i_sb->s_flags & MS_ACTIVE) 409 !atomic_read(&inode->i_count) && inode->i_sb->s_flags & MS_ACTIVE)
421 inode_lru_list_add(inode); 410 inode_lru_list_add(inode);
422} 411}
@@ -647,7 +636,7 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
647 spin_unlock(&inode->i_lock); 636 spin_unlock(&inode->i_lock);
648 continue; 637 continue;
649 } 638 }
650 if (inode->i_state & I_DIRTY && !kill_dirty) { 639 if (inode->i_state & I_DIRTY_ALL && !kill_dirty) {
651 spin_unlock(&inode->i_lock); 640 spin_unlock(&inode->i_lock);
652 busy = 1; 641 busy = 1;
653 continue; 642 continue;
@@ -685,8 +674,8 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
685 * LRU does not have strict ordering. Hence we don't want to reclaim inodes 674 * LRU does not have strict ordering. Hence we don't want to reclaim inodes
686 * with this flag set because they are the inodes that are out of order. 675 * with this flag set because they are the inodes that are out of order.
687 */ 676 */
688static enum lru_status 677static enum lru_status inode_lru_isolate(struct list_head *item,
689inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) 678 struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
690{ 679{
691 struct list_head *freeable = arg; 680 struct list_head *freeable = arg;
692 struct inode *inode = container_of(item, struct inode, i_lru); 681 struct inode *inode = container_of(item, struct inode, i_lru);
@@ -704,7 +693,7 @@ inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
704 */ 693 */
705 if (atomic_read(&inode->i_count) || 694 if (atomic_read(&inode->i_count) ||
706 (inode->i_state & ~I_REFERENCED)) { 695 (inode->i_state & ~I_REFERENCED)) {
707 list_del_init(&inode->i_lru); 696 list_lru_isolate(lru, &inode->i_lru);
708 spin_unlock(&inode->i_lock); 697 spin_unlock(&inode->i_lock);
709 this_cpu_dec(nr_unused); 698 this_cpu_dec(nr_unused);
710 return LRU_REMOVED; 699 return LRU_REMOVED;
@@ -738,7 +727,7 @@ inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
738 727
739 WARN_ON(inode->i_state & I_NEW); 728 WARN_ON(inode->i_state & I_NEW);
740 inode->i_state |= I_FREEING; 729 inode->i_state |= I_FREEING;
741 list_move(&inode->i_lru, freeable); 730 list_lru_isolate_move(lru, &inode->i_lru, freeable);
742 spin_unlock(&inode->i_lock); 731 spin_unlock(&inode->i_lock);
743 732
744 this_cpu_dec(nr_unused); 733 this_cpu_dec(nr_unused);
@@ -751,14 +740,13 @@ inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
751 * to trim from the LRU. Inodes to be freed are moved to a temporary list and 740 * to trim from the LRU. Inodes to be freed are moved to a temporary list and
752 * then are freed outside inode_lock by dispose_list(). 741 * then are freed outside inode_lock by dispose_list().
753 */ 742 */
754long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan, 743long prune_icache_sb(struct super_block *sb, struct shrink_control *sc)
755 int nid)
756{ 744{
757 LIST_HEAD(freeable); 745 LIST_HEAD(freeable);
758 long freed; 746 long freed;
759 747
760 freed = list_lru_walk_node(&sb->s_inode_lru, nid, inode_lru_isolate, 748 freed = list_lru_shrink_walk(&sb->s_inode_lru, sc,
761 &freeable, &nr_to_scan); 749 inode_lru_isolate, &freeable);
762 dispose_list(&freeable); 750 dispose_list(&freeable);
763 return freed; 751 return freed;
764} 752}
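[Editor's note] The dropped nid parameter is not lost information: struct shrink_control carries nr_to_scan and the NUMA node id together, so the shrinker core can hand the whole control structure through. A hypothetical caller shape (mirroring what super_cache_scan() in fs/super.c does elsewhere in this series; the names here are illustrative, not from this hunk):

	sc->nr_to_scan = inodes;	/* this sb's share of the scan budget */
	freed += prune_icache_sb(sb, sc);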
@@ -1282,6 +1270,56 @@ struct inode *ilookup(struct super_block *sb, unsigned long ino)
1282} 1270}
1283EXPORT_SYMBOL(ilookup); 1271EXPORT_SYMBOL(ilookup);
1284 1272
1273/**
1274 * find_inode_nowait - find an inode in the inode cache
1275 * @sb: super block of file system to search
1276 * @hashval: hash value (usually inode number) to search for
1277 * @match: callback used for comparisons between inodes
1278 * @data: opaque data pointer to pass to @match
1279 *
1280 * Search for the inode specified by @hashval and @data in the inode
1281 * cache, where the helper function @match will return 0 if the inode
1282 * does not match, 1 if the inode does match, and -1 if the search
1283 * should be stopped. The @match function must be responsible for
1284 * taking the i_lock spin_lock and checking i_state for an inode being
1285 * freed or being initialized, and incrementing the reference count
1286 * before returning 1. It also must not sleep, since it is called with
1287 * the inode_hash_lock spinlock held.
1288 *
 1289 * This is an even more generalized version of ilookup5() when the
 1290 * function must never block --- find_inode() can block in
 1291 * __wait_on_freeing_inode() --- or when the caller cannot increment
 1292 * the reference count because the resulting iput() might cause an
 1293 * inode eviction. The tradeoff is that the @match function must be
 1294 * very carefully implemented.
1295 */
1296struct inode *find_inode_nowait(struct super_block *sb,
1297 unsigned long hashval,
1298 int (*match)(struct inode *, unsigned long,
1299 void *),
1300 void *data)
1301{
1302 struct hlist_head *head = inode_hashtable + hash(sb, hashval);
1303 struct inode *inode, *ret_inode = NULL;
1304 int mval;
1305
1306 spin_lock(&inode_hash_lock);
1307 hlist_for_each_entry(inode, head, i_hash) {
1308 if (inode->i_sb != sb)
1309 continue;
1310 mval = match(inode, hashval, data);
1311 if (mval == 0)
1312 continue;
1313 if (mval == 1)
1314 ret_inode = inode;
1315 goto out;
1316 }
1317out:
1318 spin_unlock(&inode_hash_lock);
1319 return ret_inode;
1320}
1321EXPORT_SYMBOL(find_inode_nowait);
1322
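[Editor's note] A minimal sketch of a @match callback honoring the contract documented above; my_match is hypothetical and not part of this patch. It takes i_lock itself, skips inodes that are being freed or still being initialized, and bumps the reference count before returning 1, all without sleeping, since inode_hash_lock is held across the call:

static int my_match(struct inode *inode, unsigned long hashval, void *data)
{
	int ret = 0;	/* 0 = no match, keep scanning */

	spin_lock(&inode->i_lock);
	if (inode->i_ino == hashval &&	/* a real user would also consult @data */
	    !(inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW))) {
		atomic_inc(&inode->i_count);	/* reference taken under i_lock */
		ret = 1;
	}
	spin_unlock(&inode->i_lock);
	return ret;
}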
1285int insert_inode_locked(struct inode *inode) 1323int insert_inode_locked(struct inode *inode)
1286{ 1324{
1287 struct super_block *sb = inode->i_sb; 1325 struct super_block *sb = inode->i_sb;
@@ -1432,11 +1470,20 @@ static void iput_final(struct inode *inode)
1432 */ 1470 */
1433void iput(struct inode *inode) 1471void iput(struct inode *inode)
1434{ 1472{
1435 if (inode) { 1473 if (!inode)
1436 BUG_ON(inode->i_state & I_CLEAR); 1474 return;
1437 1475 BUG_ON(inode->i_state & I_CLEAR);
1438 if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) 1476retry:
1439 iput_final(inode); 1477 if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) {
1478 if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) {
1479 atomic_inc(&inode->i_count);
1480 inode->i_state &= ~I_DIRTY_TIME;
1481 spin_unlock(&inode->i_lock);
1482 trace_writeback_lazytime_iput(inode);
1483 mark_inode_dirty_sync(inode);
1484 goto retry;
1485 }
1486 iput_final(inode);
1440 } 1487 }
1441} 1488}
1442EXPORT_SYMBOL(iput); 1489EXPORT_SYMBOL(iput);
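[Editor's note] The retry loop above is the lazytime half of this series: if the inode still has a link count and carries only deferred timestamp updates (I_DIRTY_TIME), iput() briefly re-takes a reference, promotes the deferred state to a real dirty via mark_inode_dirty_sync(), and retries the final put, so cached timestamps reach disk before the inode can be evicted.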
@@ -1495,14 +1542,9 @@ static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
1495 return 0; 1542 return 0;
1496} 1543}
1497 1544
1498/* 1545int generic_update_time(struct inode *inode, struct timespec *time, int flags)
1499 * This does the actual work of updating an inodes time or version. Must have
1500 * had called mnt_want_write() before calling this.
1501 */
1502static int update_time(struct inode *inode, struct timespec *time, int flags)
1503{ 1546{
1504 if (inode->i_op->update_time) 1547 int iflags = I_DIRTY_TIME;
1505 return inode->i_op->update_time(inode, time, flags);
1506 1548
1507 if (flags & S_ATIME) 1549 if (flags & S_ATIME)
1508 inode->i_atime = *time; 1550 inode->i_atime = *time;
@@ -1512,9 +1554,27 @@ static int update_time(struct inode *inode, struct timespec *time, int flags)
1512 inode->i_ctime = *time; 1554 inode->i_ctime = *time;
1513 if (flags & S_MTIME) 1555 if (flags & S_MTIME)
1514 inode->i_mtime = *time; 1556 inode->i_mtime = *time;
1515 mark_inode_dirty_sync(inode); 1557
1558 if (!(inode->i_sb->s_flags & MS_LAZYTIME) || (flags & S_VERSION))
1559 iflags |= I_DIRTY_SYNC;
1560 __mark_inode_dirty(inode, iflags);
1516 return 0; 1561 return 0;
1517} 1562}
1563EXPORT_SYMBOL(generic_update_time);
1564
1565/*
 1566 * This does the actual work of updating an inode's time or version. The
 1567 * caller must have called mnt_want_write() before calling this.
1568 */
1569static int update_time(struct inode *inode, struct timespec *time, int flags)
1570{
1571 int (*update_time)(struct inode *, struct timespec *, int);
1572
1573 update_time = inode->i_op->update_time ? inode->i_op->update_time :
1574 generic_update_time;
1575
1576 return update_time(inode, time, flags);
1577}
1518 1578
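[Editor's note] A hypothetical consumer, to show why generic_update_time() is now exported: a filesystem's own ->update_time method can do its private bookkeeping and then defer the dirty-flag decision (including the new MS_LAZYTIME case) to the generic helper. myfs_update_time is an invented name; ext4's handler elsewhere in this series has roughly this shape:

static int myfs_update_time(struct inode *inode, struct timespec *time,
			    int flags)
{
	/* filesystem-specific work (e.g. starting a journal handle) here */
	return generic_update_time(inode, time, flags);
}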
1519/** 1579/**
1520 * touch_atime - update the access time 1580 * touch_atime - update the access time
diff --git a/fs/internal.h b/fs/internal.h
index e9a61fe67575..30459dab409d 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -14,6 +14,7 @@ struct file_system_type;
14struct linux_binprm; 14struct linux_binprm;
15struct path; 15struct path;
16struct mount; 16struct mount;
17struct shrink_control;
17 18
18/* 19/*
19 * block_dev.c 20 * block_dev.c
@@ -111,8 +112,7 @@ extern int open_check_o_direct(struct file *f);
111 * inode.c 112 * inode.c
112 */ 113 */
113extern spinlock_t inode_sb_list_lock; 114extern spinlock_t inode_sb_list_lock;
114extern long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan, 115extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc);
115 int nid);
116extern void inode_add_lru(struct inode *inode); 116extern void inode_add_lru(struct inode *inode);
117 117
118/* 118/*
@@ -129,8 +129,7 @@ extern int invalidate_inodes(struct super_block *, bool);
129 */ 129 */
130extern struct dentry *__d_alloc(struct super_block *, const struct qstr *); 130extern struct dentry *__d_alloc(struct super_block *, const struct qstr *);
131extern int d_set_mounted(struct dentry *dentry); 131extern int d_set_mounted(struct dentry *dentry);
132extern long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan, 132extern long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc);
133 int nid);
134 133
135/* 134/*
136 * read_write.c 135 * read_write.c
@@ -145,7 +144,7 @@ extern const struct file_operations pipefifo_fops;
145/* 144/*
146 * fs_pin.c 145 * fs_pin.c
147 */ 146 */
148extern void sb_pin_kill(struct super_block *sb); 147extern void group_pin_kill(struct hlist_head *p);
149extern void mnt_pin_kill(struct mount *m); 148extern void mnt_pin_kill(struct mount *m);
150 149
151/* 150/*
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 214c3c11fbc2..5d01d2638ca5 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -379,6 +379,11 @@ int __generic_block_fiemap(struct inode *inode,
379 past_eof = true; 379 past_eof = true;
380 } 380 }
381 cond_resched(); 381 cond_resched();
382 if (fatal_signal_pending(current)) {
383 ret = -EINTR;
384 break;
385 }
386
382 } while (1); 387 } while (1);
383 388
384 /* If ret is 1 then we just hit the end of the extent array */ 389 /* If ret is 1 then we just hit the end of the extent array */
diff --git a/fs/isofs/util.c b/fs/isofs/util.c
index 01e1ee7a998b..005a15cfd30a 100644
--- a/fs/isofs/util.c
+++ b/fs/isofs/util.c
@@ -2,6 +2,7 @@
2 * linux/fs/isofs/util.c 2 * linux/fs/isofs/util.c
3 */ 3 */
4 4
5#include <linux/time.h>
5#include "isofs.h" 6#include "isofs.h"
6 7
7/* 8/*
@@ -17,9 +18,9 @@
17int iso_date(char * p, int flag) 18int iso_date(char * p, int flag)
18{ 19{
19 int year, month, day, hour, minute, second, tz; 20 int year, month, day, hour, minute, second, tz;
20 int crtime, days, i; 21 int crtime;
21 22
22 year = p[0] - 70; 23 year = p[0];
23 month = p[1]; 24 month = p[1];
24 day = p[2]; 25 day = p[2];
25 hour = p[3]; 26 hour = p[3];
@@ -31,18 +32,7 @@ int iso_date(char * p, int flag)
31 if (year < 0) { 32 if (year < 0) {
32 crtime = 0; 33 crtime = 0;
33 } else { 34 } else {
34 int monlen[12] = {31,28,31,30,31,30,31,31,30,31,30,31}; 35 crtime = mktime64(year+1900, month, day, hour, minute, second);
35
36 days = year * 365;
37 if (year > 2)
38 days += (year+1) / 4;
39 for (i = 1; i < month; i++)
40 days += monlen[i-1];
41 if (((year+2) % 4) == 0 && month > 2)
42 days++;
43 days += day - 1;
44 crtime = ((((days * 24) + hour) * 60 + minute) * 60)
45 + second;
46 36
47 /* sign extend */ 37 /* sign extend */
48 if (tz & 0x80) 38 if (tz & 0x80)
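[Editor's note] Two changes are folded together in this hunk: ISO 9660 stores the year as an offset from 1900, and mktime64() expects a full calendar year, so the old `p[0] - 70` epoch offset becomes a plain `p[0]` with the +1900 applied at the call site; the removed monlen[] loop was hand-rolled leap-year arithmetic that mktime64() already performs. An illustrative call, assuming a disk field of 115:

	/* 17 Feb 2015 12:00:00 UTC; p[0] holds 115 (= 2015 - 1900) */
	time64_t t = mktime64(115 + 1900, 2, 17, 12, 0, 0);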
diff --git a/fs/jffs2/compr_rubin.c b/fs/jffs2/compr_rubin.c
index 92e0644bf867..556de100ebd5 100644
--- a/fs/jffs2/compr_rubin.c
+++ b/fs/jffs2/compr_rubin.c
@@ -84,11 +84,6 @@ static inline int pullbit(struct pushpull *pp)
84 return bit; 84 return bit;
85} 85}
86 86
87static inline int pulledbits(struct pushpull *pp)
88{
89 return pp->ofs;
90}
91
92 87
93static void init_rubin(struct rubin_state *rs, int div, int *bits) 88static void init_rubin(struct rubin_state *rs, int div, int *bits)
94{ 89{
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 7654e87b0428..9ad5ba4b299b 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -510,6 +510,10 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo
510 sumlen = c->sector_size - je32_to_cpu(sm->offset); 510 sumlen = c->sector_size - je32_to_cpu(sm->offset);
511 sumptr = buf + buf_size - sumlen; 511 sumptr = buf + buf_size - sumlen;
512 512
 513 /* sm->offset may be wrong but MAGIC may be right */
514 if (sumlen > c->sector_size)
515 goto full_scan;
516
513 /* Now, make sure the summary itself is available */ 517 /* Now, make sure the summary itself is available */
514 if (sumlen > buf_size) { 518 if (sumlen > buf_size) {
515 /* Need to kmalloc for this. */ 519 /* Need to kmalloc for this. */
@@ -544,6 +548,7 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo
544 } 548 }
545 } 549 }
546 550
551full_scan:
547 buf_ofs = jeb->offset; 552 buf_ofs = jeb->offset;
548 553
549 if (!buf_size) { 554 if (!buf_size) {
diff --git a/fs/jfs/endian24.h b/fs/jfs/endian24.h
deleted file mode 100644
index fa92f7f1d0d0..000000000000
--- a/fs/jfs/endian24.h
+++ /dev/null
@@ -1,49 +0,0 @@
1/*
2 * Copyright (C) International Business Machines Corp., 2001
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18#ifndef _H_ENDIAN24
19#define _H_ENDIAN24
20
21/*
22 * endian24.h:
23 *
24 * Endian conversion for 24-byte data
25 *
26 */
27#define __swab24(x) \
28({ \
29 __u32 __x = (x); \
30 ((__u32)( \
31 ((__x & (__u32)0x000000ffUL) << 16) | \
32 (__x & (__u32)0x0000ff00UL) | \
33 ((__x & (__u32)0x00ff0000UL) >> 16) )); \
34})
35
36#if (defined(__KERNEL__) && defined(__LITTLE_ENDIAN)) || (defined(__BYTE_ORDER) && (__BYTE_ORDER == __LITTLE_ENDIAN))
37 #define __cpu_to_le24(x) ((__u32)(x))
38 #define __le24_to_cpu(x) ((__u32)(x))
39#else
40 #define __cpu_to_le24(x) __swab24(x)
41 #define __le24_to_cpu(x) __swab24(x)
42#endif
43
44#ifdef __KERNEL__
45 #define cpu_to_le24 __cpu_to_le24
46 #define le24_to_cpu __le24_to_cpu
47#endif
48
49#endif /* !_H_ENDIAN24 */
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 33aa0cc1f8b8..10815f8dfd8b 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -39,7 +39,7 @@ int jfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
39 return rc; 39 return rc;
40 40
41 mutex_lock(&inode->i_mutex); 41 mutex_lock(&inode->i_mutex);
42 if (!(inode->i_state & I_DIRTY) || 42 if (!(inode->i_state & I_DIRTY_ALL) ||
43 (datasync && !(inode->i_state & I_DIRTY_DATASYNC))) { 43 (datasync && !(inode->i_state & I_DIRTY_DATASYNC))) {
44 /* Make sure committed changes hit the disk */ 44 /* Make sure committed changes hit the disk */
45 jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1); 45 jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1);
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index 984c2bbf4f61..d88576e23fe4 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -1040,8 +1040,8 @@ static int dtSplitUp(tid_t tid,
1040 pxdlist.maxnpxd = 1; 1040 pxdlist.maxnpxd = 1;
1041 pxdlist.npxd = 0; 1041 pxdlist.npxd = 0;
1042 pxd = &pxdlist.pxd[0]; 1042 pxd = &pxdlist.pxd[0];
1043 PXDaddress(pxd, nxaddr) 1043 PXDaddress(pxd, nxaddr);
1044 PXDlength(pxd, xlen + n); 1044 PXDlength(pxd, xlen + n);
1045 split->pxdlist = &pxdlist; 1045 split->pxdlist = &pxdlist;
1046 if ((rc = dtExtendPage(tid, ip, split, btstack))) { 1046 if ((rc = dtExtendPage(tid, ip, split, btstack))) {
1047 nxaddr = addressPXD(pxd); 1047 nxaddr = addressPXD(pxd);
diff --git a/fs/jfs/jfs_types.h b/fs/jfs/jfs_types.h
index 43ea3713c083..8f602dcb51fa 100644
--- a/fs/jfs/jfs_types.h
+++ b/fs/jfs/jfs_types.h
@@ -30,8 +30,6 @@
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/nls.h> 31#include <linux/nls.h>
32 32
33#include "endian24.h"
34
35/* 33/*
36 * transaction and lock id's 34 * transaction and lock id's
37 * 35 *
@@ -59,26 +57,42 @@ struct timestruc_t {
59 57
60/* 58/*
61 * physical xd (pxd) 59 * physical xd (pxd)
60 *
61 * The leftmost 24 bits of len_addr are the extent length.
 62 * The rightmost 8 bits of len_addr are the most significant bits of
 63 * the extent address.
62 */ 64 */
63typedef struct { 65typedef struct {
64 unsigned len:24; 66 __le32 len_addr;
65 unsigned addr1:8;
66 __le32 addr2; 67 __le32 addr2;
67} pxd_t; 68} pxd_t;
68 69
69/* xd_t field construction */ 70/* xd_t field construction */
70 71
71#define PXDlength(pxd, length32) ((pxd)->len = __cpu_to_le24(length32)) 72static inline void PXDlength(pxd_t *pxd, __u32 len)
72#define PXDaddress(pxd, address64)\ 73{
73{\ 74 pxd->len_addr = (pxd->len_addr & cpu_to_le32(~0xffffff)) |
74 (pxd)->addr1 = ((s64)address64) >> 32;\ 75 cpu_to_le32(len & 0xffffff);
75 (pxd)->addr2 = __cpu_to_le32((address64) & 0xffffffff);\ 76}
77
78static inline void PXDaddress(pxd_t *pxd, __u64 addr)
79{
80 pxd->len_addr = (pxd->len_addr & cpu_to_le32(0xffffff)) |
81 cpu_to_le32((addr >> 32)<<24);
82 pxd->addr2 = cpu_to_le32(addr & 0xffffffff);
76} 83}
77 84
78/* xd_t field extraction */ 85/* xd_t field extraction */
79#define lengthPXD(pxd) __le24_to_cpu((pxd)->len) 86static inline __u32 lengthPXD(pxd_t *pxd)
80#define addressPXD(pxd)\ 87{
81 ( ((s64)((pxd)->addr1)) << 32 | __le32_to_cpu((pxd)->addr2)) 88 return le32_to_cpu((pxd)->len_addr) & 0xffffff;
89}
90
91static inline __u64 addressPXD(pxd_t *pxd)
92{
93 __u64 n = le32_to_cpu(pxd->len_addr) & ~0xffffff;
94 return (n << 8) + le32_to_cpu(pxd->addr2);
95}
82 96
83#define MAXTREEHEIGHT 8 97#define MAXTREEHEIGHT 8
84/* pxd list */ 98/* pxd list */
@@ -93,12 +107,10 @@ struct pxdlist {
93 * data extent descriptor (dxd) 107 * data extent descriptor (dxd)
94 */ 108 */
95typedef struct { 109typedef struct {
96 unsigned flag:8; /* 1: flags */ 110 __u8 flag; /* 1: flags */
97 unsigned rsrvd:24; 111 __u8 rsrvd[3];
98 __le32 size; /* 4: size in byte */ 112 __le32 size; /* 4: size in byte */
99 unsigned len:24; /* 3: length in unit of fsblksize */ 113 pxd_t loc; /* 8: address and length in unit of fsblksize */
100 unsigned addr1:8; /* 1: address in unit of fsblksize */
101 __le32 addr2; /* 4: address in unit of fsblksize */
102} dxd_t; /* - 16 - */ 114} dxd_t; /* - 16 - */
103 115
104/* dxd_t flags */ 116/* dxd_t flags */
@@ -109,12 +121,11 @@ typedef struct {
109#define DXD_CORRUPT 0x08 /* Inconsistency detected */ 121#define DXD_CORRUPT 0x08 /* Inconsistency detected */
110 122
111/* dxd_t field construction 123/* dxd_t field construction
112 * Conveniently, the PXD macros work for DXD
113 */ 124 */
114#define DXDlength PXDlength 125#define DXDlength(dxd, len) PXDlength(&(dxd)->loc, len)
115#define DXDaddress PXDaddress 126#define DXDaddress(dxd, addr) PXDaddress(&(dxd)->loc, addr)
116#define lengthDXD lengthPXD 127#define lengthDXD(dxd) lengthPXD(&(dxd)->loc)
117#define addressDXD addressPXD 128#define addressDXD(dxd) addressPXD(&(dxd)->loc)
118#define DXDsize(dxd, size32) ((dxd)->size = cpu_to_le32(size32)) 129#define DXDsize(dxd, size32) ((dxd)->size = cpu_to_le32(size32))
119#define sizeDXD(dxd) le32_to_cpu((dxd)->size) 130#define sizeDXD(dxd) le32_to_cpu((dxd)->size)
120 131
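[Editor's note] A worked example of the new pxd_t packing, with made-up values. The on-disk layout is unchanged; only the C representation moves from bitfields (whose ordering is compiler- and endian-dependent) to explicit little-endian masking:

	/*
	 * addr = 0x0123456789 (40-bit block address), len = 0xabc:
	 *   len_addr = cpu_to_le32((0x01 << 24) | 0xabc)  high 8 addr bits | 24-bit len
	 *   addr2    = cpu_to_le32(0x23456789)            low 32 addr bits
	 * addressPXD(): (0x01000000ULL << 8) + 0x23456789 = 0x0123456789
	 * lengthPXD():  0x01000abc & 0xffffff             = 0xabc
	 */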
diff --git a/fs/jfs/jfs_xtree.h b/fs/jfs/jfs_xtree.h
index 08c0c749b986..1e0987986d5f 100644
--- a/fs/jfs/jfs_xtree.h
+++ b/fs/jfs/jfs_xtree.h
@@ -29,13 +29,11 @@
29 * extent allocation descriptor (xad) 29 * extent allocation descriptor (xad)
30 */ 30 */
31typedef struct xad { 31typedef struct xad {
32 unsigned flag:8; /* 1: flag */ 32 __u8 flag; /* 1: flag */
33 unsigned rsvrd:16; /* 2: reserved */ 33 __u8 rsvrd[2]; /* 2: reserved */
34 unsigned off1:8; /* 1: offset in unit of fsblksize */ 34 __u8 off1; /* 1: offset in unit of fsblksize */
35 __le32 off2; /* 4: offset in unit of fsblksize */ 35 __le32 off2; /* 4: offset in unit of fsblksize */
36 unsigned len:24; /* 3: length in unit of fsblksize */ 36 pxd_t loc; /* 8: length and address in unit of fsblksize */
37 unsigned addr1:8; /* 1: address in unit of fsblksize */
38 __le32 addr2; /* 4: address in unit of fsblksize */
39} xad_t; /* (16) */ 37} xad_t; /* (16) */
40 38
41#define MAXXLEN ((1 << 24) - 1) 39#define MAXXLEN ((1 << 24) - 1)
@@ -49,19 +47,14 @@ typedef struct xad {
49 (xad)->off1 = ((u64)offset64) >> 32;\ 47 (xad)->off1 = ((u64)offset64) >> 32;\
50 (xad)->off2 = __cpu_to_le32((offset64) & 0xffffffff);\ 48 (xad)->off2 = __cpu_to_le32((offset64) & 0xffffffff);\
51} 49}
52#define XADaddress(xad, address64)\ 50#define XADaddress(xad, address64) PXDaddress(&(xad)->loc, address64)
53{\ 51#define XADlength(xad, length32) PXDlength(&(xad)->loc, length32)
54 (xad)->addr1 = ((u64)address64) >> 32;\
55 (xad)->addr2 = __cpu_to_le32((address64) & 0xffffffff);\
56}
57#define XADlength(xad, length32) (xad)->len = __cpu_to_le24(length32)
58 52
59/* xad_t field extraction */ 53/* xad_t field extraction */
60#define offsetXAD(xad)\ 54#define offsetXAD(xad)\
61 ( ((s64)((xad)->off1)) << 32 | __le32_to_cpu((xad)->off2)) 55 ( ((s64)((xad)->off1)) << 32 | __le32_to_cpu((xad)->off2))
62#define addressXAD(xad)\ 56#define addressXAD(xad) addressPXD(&(xad)->loc)
63 ( ((s64)((xad)->addr1)) << 32 | __le32_to_cpu((xad)->addr2)) 57#define lengthXAD(xad) lengthPXD(&(xad)->loc)
64#define lengthXAD(xad) __le24_to_cpu((xad)->len)
65 58
66/* xad list */ 59/* xad list */
67struct xadlist { 60struct xadlist {
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 16c3a9556634..5d30c56ae075 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -619,8 +619,7 @@ out_mount_failed:
619 iput(sbi->direct_inode); 619 iput(sbi->direct_inode);
620 sbi->direct_inode = NULL; 620 sbi->direct_inode = NULL;
621out_unload: 621out_unload:
622 if (sbi->nls_tab) 622 unload_nls(sbi->nls_tab);
623 unload_nls(sbi->nls_tab);
624out_kfree: 623out_kfree:
625 kfree(sbi); 624 kfree(sbi);
626 return ret; 625 return ret;
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 2d881b381d2b..6acc9648f986 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -411,8 +411,9 @@ void kernfs_put(struct kernfs_node *kn)
411 411
412 if (kernfs_type(kn) == KERNFS_LINK) 412 if (kernfs_type(kn) == KERNFS_LINK)
413 kernfs_put(kn->symlink.target_kn); 413 kernfs_put(kn->symlink.target_kn);
414 if (!(kn->flags & KERNFS_STATIC_NAME)) 414
415 kfree(kn->name); 415 kfree_const(kn->name);
416
416 if (kn->iattr) { 417 if (kn->iattr) {
417 if (kn->iattr->ia_secdata) 418 if (kn->iattr->ia_secdata)
418 security_release_secctx(kn->iattr->ia_secdata, 419 security_release_secctx(kn->iattr->ia_secdata,
@@ -506,15 +507,12 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
506 const char *name, umode_t mode, 507 const char *name, umode_t mode,
507 unsigned flags) 508 unsigned flags)
508{ 509{
509 char *dup_name = NULL;
510 struct kernfs_node *kn; 510 struct kernfs_node *kn;
511 int ret; 511 int ret;
512 512
513 if (!(flags & KERNFS_STATIC_NAME)) { 513 name = kstrdup_const(name, GFP_KERNEL);
514 name = dup_name = kstrdup(name, GFP_KERNEL); 514 if (!name)
515 if (!name) 515 return NULL;
516 return NULL;
517 }
518 516
519 kn = kmem_cache_zalloc(kernfs_node_cache, GFP_KERNEL); 517 kn = kmem_cache_zalloc(kernfs_node_cache, GFP_KERNEL);
520 if (!kn) 518 if (!kn)
@@ -538,7 +536,7 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
538 err_out2: 536 err_out2:
539 kmem_cache_free(kernfs_node_cache, kn); 537 kmem_cache_free(kernfs_node_cache, kn);
540 err_out1: 538 err_out1:
541 kfree(dup_name); 539 kfree_const(name);
542 return NULL; 540 return NULL;
543} 541}
544 542
@@ -1264,7 +1262,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
1264 /* rename kernfs_node */ 1262 /* rename kernfs_node */
1265 if (strcmp(kn->name, new_name) != 0) { 1263 if (strcmp(kn->name, new_name) != 0) {
1266 error = -ENOMEM; 1264 error = -ENOMEM;
1267 new_name = kstrdup(new_name, GFP_KERNEL); 1265 new_name = kstrdup_const(new_name, GFP_KERNEL);
1268 if (!new_name) 1266 if (!new_name)
1269 goto out; 1267 goto out;
1270 } else { 1268 } else {
@@ -1285,9 +1283,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
1285 1283
1286 kn->ns = new_ns; 1284 kn->ns = new_ns;
1287 if (new_name) { 1285 if (new_name) {
1288 if (!(kn->flags & KERNFS_STATIC_NAME)) 1286 old_name = kn->name;
1289 old_name = kn->name;
1290 kn->flags &= ~KERNFS_STATIC_NAME;
1291 kn->name = new_name; 1287 kn->name = new_name;
1292 } 1288 }
1293 1289
@@ -1297,7 +1293,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
1297 kernfs_link_sibling(kn); 1293 kernfs_link_sibling(kn);
1298 1294
1299 kernfs_put(old_parent); 1295 kernfs_put(old_parent);
1300 kfree(old_name); 1296 kfree_const(old_name);
1301 1297
1302 error = 0; 1298 error = 0;
1303 out: 1299 out:
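[Editor's note] What makes dropping KERNFS_STATIC_NAME safe is that kstrdup_const()/kfree_const() (mm/util.c, added in the same cycle) detect .rodata strings at runtime, so callers no longer have to declare whether a name is static. Paraphrased from their implementation:

const char *kstrdup_const(const char *s, gfp_t gfp)
{
	if (is_kernel_rodata((unsigned long)s))
		return s;		/* string literal: share it, no copy */
	return kstrdup(s, gfp);
}

void kfree_const(const void *x)
{
	if (!is_kernel_rodata((unsigned long)x))
		kfree(x);		/* only free what was actually duplicated */
}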
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index ddc9f9612f16..b684e8a132e6 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -901,7 +901,6 @@ const struct file_operations kernfs_file_fops = {
901 * @ops: kernfs operations for the file 901 * @ops: kernfs operations for the file
902 * @priv: private data for the file 902 * @priv: private data for the file
903 * @ns: optional namespace tag of the file 903 * @ns: optional namespace tag of the file
904 * @name_is_static: don't copy file name
905 * @key: lockdep key for the file's active_ref, %NULL to disable lockdep 904 * @key: lockdep key for the file's active_ref, %NULL to disable lockdep
906 * 905 *
907 * Returns the created node on success, ERR_PTR() value on error. 906 * Returns the created node on success, ERR_PTR() value on error.
@@ -911,7 +910,6 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
911 umode_t mode, loff_t size, 910 umode_t mode, loff_t size,
912 const struct kernfs_ops *ops, 911 const struct kernfs_ops *ops,
913 void *priv, const void *ns, 912 void *priv, const void *ns,
914 bool name_is_static,
915 struct lock_class_key *key) 913 struct lock_class_key *key)
916{ 914{
917 struct kernfs_node *kn; 915 struct kernfs_node *kn;
@@ -919,8 +917,6 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
919 int rc; 917 int rc;
920 918
921 flags = KERNFS_FILE; 919 flags = KERNFS_FILE;
922 if (name_is_static)
923 flags |= KERNFS_STATIC_NAME;
924 920
925 kn = kernfs_new_node(parent, name, (mode & S_IALLUGO) | S_IFREG, flags); 921 kn = kernfs_new_node(parent, name, (mode & S_IALLUGO) | S_IFREG, flags);
926 if (!kn) 922 if (!kn)
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 985217626e66..9000874a945b 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -24,12 +24,6 @@ static const struct address_space_operations kernfs_aops = {
24 .write_end = simple_write_end, 24 .write_end = simple_write_end,
25}; 25};
26 26
27static struct backing_dev_info kernfs_bdi = {
28 .name = "kernfs",
29 .ra_pages = 0, /* No readahead */
30 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
31};
32
33static const struct inode_operations kernfs_iops = { 27static const struct inode_operations kernfs_iops = {
34 .permission = kernfs_iop_permission, 28 .permission = kernfs_iop_permission,
35 .setattr = kernfs_iop_setattr, 29 .setattr = kernfs_iop_setattr,
@@ -40,12 +34,6 @@ static const struct inode_operations kernfs_iops = {
40 .listxattr = kernfs_iop_listxattr, 34 .listxattr = kernfs_iop_listxattr,
41}; 35};
42 36
43void __init kernfs_inode_init(void)
44{
45 if (bdi_init(&kernfs_bdi))
46 panic("failed to init kernfs_bdi");
47}
48
49static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn) 37static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn)
50{ 38{
51 static DEFINE_MUTEX(iattr_mutex); 39 static DEFINE_MUTEX(iattr_mutex);
@@ -298,7 +286,6 @@ static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode)
298 kernfs_get(kn); 286 kernfs_get(kn);
299 inode->i_private = kn; 287 inode->i_private = kn;
300 inode->i_mapping->a_ops = &kernfs_aops; 288 inode->i_mapping->a_ops = &kernfs_aops;
301 inode->i_mapping->backing_dev_info = &kernfs_bdi;
302 inode->i_op = &kernfs_iops; 289 inode->i_op = &kernfs_iops;
303 290
304 set_default_inode_attr(inode, kn->mode); 291 set_default_inode_attr(inode, kn->mode);
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index dc84a3ef9ca2..af9fa7499919 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -88,7 +88,6 @@ int kernfs_iop_removexattr(struct dentry *dentry, const char *name);
88ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf, 88ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf,
89 size_t size); 89 size_t size);
90ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size); 90ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size);
91void kernfs_inode_init(void);
92 91
93/* 92/*
94 * dir.c 93 * dir.c
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index f973ae9b05f1..8eaf417187f1 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -246,5 +246,4 @@ void __init kernfs_init(void)
246 kernfs_node_cache = kmem_cache_create("kernfs_node_cache", 246 kernfs_node_cache = kmem_cache_create("kernfs_node_cache",
247 sizeof(struct kernfs_node), 247 sizeof(struct kernfs_node),
248 0, SLAB_PANIC, NULL); 248 0, SLAB_PANIC, NULL);
249 kernfs_inode_init();
250} 249}
diff --git a/fs/libfs.c b/fs/libfs.c
index 005843ce5dbd..b2ffdb045be4 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -948,7 +948,7 @@ int __generic_file_fsync(struct file *file, loff_t start, loff_t end,
948 948
949 mutex_lock(&inode->i_mutex); 949 mutex_lock(&inode->i_mutex);
950 ret = sync_mapping_buffers(inode->i_mapping); 950 ret = sync_mapping_buffers(inode->i_mapping);
951 if (!(inode->i_state & I_DIRTY)) 951 if (!(inode->i_state & I_DIRTY_ALL))
952 goto out; 952 goto out;
953 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) 953 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
954 goto out; 954 goto out;
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 1cc6ec51e6b1..47a32b6d9b90 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -65,7 +65,7 @@ static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm)
65 return (struct sockaddr *)&nsm->sm_addr; 65 return (struct sockaddr *)&nsm->sm_addr;
66} 66}
67 67
68static struct rpc_clnt *nsm_create(struct net *net) 68static struct rpc_clnt *nsm_create(struct net *net, const char *nodename)
69{ 69{
70 struct sockaddr_in sin = { 70 struct sockaddr_in sin = {
71 .sin_family = AF_INET, 71 .sin_family = AF_INET,
@@ -77,6 +77,7 @@ static struct rpc_clnt *nsm_create(struct net *net)
77 .address = (struct sockaddr *)&sin, 77 .address = (struct sockaddr *)&sin,
78 .addrsize = sizeof(sin), 78 .addrsize = sizeof(sin),
79 .servername = "rpc.statd", 79 .servername = "rpc.statd",
80 .nodename = nodename,
80 .program = &nsm_program, 81 .program = &nsm_program,
81 .version = NSM_VERSION, 82 .version = NSM_VERSION,
82 .authflavor = RPC_AUTH_NULL, 83 .authflavor = RPC_AUTH_NULL,
@@ -102,7 +103,7 @@ out:
102 return clnt; 103 return clnt;
103} 104}
104 105
105static struct rpc_clnt *nsm_client_get(struct net *net) 106static struct rpc_clnt *nsm_client_get(struct net *net, const char *nodename)
106{ 107{
107 struct rpc_clnt *clnt, *new; 108 struct rpc_clnt *clnt, *new;
108 struct lockd_net *ln = net_generic(net, lockd_net_id); 109 struct lockd_net *ln = net_generic(net, lockd_net_id);
@@ -111,7 +112,7 @@ static struct rpc_clnt *nsm_client_get(struct net *net)
111 if (clnt != NULL) 112 if (clnt != NULL)
112 goto out; 113 goto out;
113 114
114 clnt = new = nsm_create(net); 115 clnt = new = nsm_create(net, nodename);
115 if (IS_ERR(clnt)) 116 if (IS_ERR(clnt))
116 goto out; 117 goto out;
117 118
@@ -190,19 +191,23 @@ int nsm_monitor(const struct nlm_host *host)
190 struct nsm_res res; 191 struct nsm_res res;
191 int status; 192 int status;
192 struct rpc_clnt *clnt; 193 struct rpc_clnt *clnt;
194 const char *nodename = NULL;
193 195
194 dprintk("lockd: nsm_monitor(%s)\n", nsm->sm_name); 196 dprintk("lockd: nsm_monitor(%s)\n", nsm->sm_name);
195 197
196 if (nsm->sm_monitored) 198 if (nsm->sm_monitored)
197 return 0; 199 return 0;
198 200
201 if (host->h_rpcclnt)
202 nodename = host->h_rpcclnt->cl_nodename;
203
199 /* 204 /*
200 * Choose whether to record the caller_name or IP address of 205 * Choose whether to record the caller_name or IP address of
201 * this peer in the local rpc.statd's database. 206 * this peer in the local rpc.statd's database.
202 */ 207 */
203 nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf; 208 nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf;
204 209
205 clnt = nsm_client_get(host->net); 210 clnt = nsm_client_get(host->net, nodename);
206 if (IS_ERR(clnt)) { 211 if (IS_ERR(clnt)) {
207 status = PTR_ERR(clnt); 212 status = PTR_ERR(clnt);
208 dprintk("lockd: failed to create NSM upcall transport, " 213 dprintk("lockd: failed to create NSM upcall transport, "
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 56598742dde4..5581e020644b 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -57,8 +57,8 @@ static DEFINE_SPINLOCK(nlm_blocked_lock);
57static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie) 57static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie)
58{ 58{
59 /* 59 /*
60 * We can get away with a static buffer because we're only 60 * We can get away with a static buffer because this is only called
61 * called with BKL held. 61 * from lockd, which is single-threaded.
62 */ 62 */
63 static char buf[2*NLM_MAXCOOKIELEN+1]; 63 static char buf[2*NLM_MAXCOOKIELEN+1];
64 unsigned int i, len = sizeof(buf); 64 unsigned int i, len = sizeof(buf);
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index d12ff4e2dbe7..665ef5a05183 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -164,12 +164,15 @@ nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file,
164{ 164{
165 struct inode *inode = nlmsvc_file_inode(file); 165 struct inode *inode = nlmsvc_file_inode(file);
166 struct file_lock *fl; 166 struct file_lock *fl;
167 struct file_lock_context *flctx = inode->i_flctx;
167 struct nlm_host *lockhost; 168 struct nlm_host *lockhost;
168 169
170 if (!flctx || list_empty_careful(&flctx->flc_posix))
171 return 0;
169again: 172again:
170 file->f_locks = 0; 173 file->f_locks = 0;
171 spin_lock(&inode->i_lock); 174 spin_lock(&flctx->flc_lock);
172 for (fl = inode->i_flock; fl; fl = fl->fl_next) { 175 list_for_each_entry(fl, &flctx->flc_posix, fl_list) {
173 if (fl->fl_lmops != &nlmsvc_lock_operations) 176 if (fl->fl_lmops != &nlmsvc_lock_operations)
174 continue; 177 continue;
175 178
@@ -180,7 +183,7 @@ again:
180 if (match(lockhost, host)) { 183 if (match(lockhost, host)) {
181 struct file_lock lock = *fl; 184 struct file_lock lock = *fl;
182 185
183 spin_unlock(&inode->i_lock); 186 spin_unlock(&flctx->flc_lock);
184 lock.fl_type = F_UNLCK; 187 lock.fl_type = F_UNLCK;
185 lock.fl_start = 0; 188 lock.fl_start = 0;
186 lock.fl_end = OFFSET_MAX; 189 lock.fl_end = OFFSET_MAX;
@@ -192,7 +195,7 @@ again:
192 goto again; 195 goto again;
193 } 196 }
194 } 197 }
195 spin_unlock(&inode->i_lock); 198 spin_unlock(&flctx->flc_lock);
196 199
197 return 0; 200 return 0;
198} 201}
@@ -223,18 +226,21 @@ nlm_file_inuse(struct nlm_file *file)
223{ 226{
224 struct inode *inode = nlmsvc_file_inode(file); 227 struct inode *inode = nlmsvc_file_inode(file);
225 struct file_lock *fl; 228 struct file_lock *fl;
229 struct file_lock_context *flctx = inode->i_flctx;
226 230
227 if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares) 231 if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares)
228 return 1; 232 return 1;
229 233
230 spin_lock(&inode->i_lock); 234 if (flctx && !list_empty_careful(&flctx->flc_posix)) {
231 for (fl = inode->i_flock; fl; fl = fl->fl_next) { 235 spin_lock(&flctx->flc_lock);
232 if (fl->fl_lmops == &nlmsvc_lock_operations) { 236 list_for_each_entry(fl, &flctx->flc_posix, fl_list) {
233 spin_unlock(&inode->i_lock); 237 if (fl->fl_lmops == &nlmsvc_lock_operations) {
234 return 1; 238 spin_unlock(&flctx->flc_lock);
239 return 1;
240 }
235 } 241 }
242 spin_unlock(&flctx->flc_lock);
236 } 243 }
237 spin_unlock(&inode->i_lock);
238 file->f_locks = 0; 244 file->f_locks = 0;
239 return 0; 245 return 0;
240} 246}
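[Editor's note] Both hunks above follow the idiom this series introduces everywhere: check i_flctx and the relevant list locklessly before taking flc_lock. A condensed sketch of the reader side:

	struct file_lock_context *flctx = inode->i_flctx;
	struct file_lock *fl;

	if (!flctx || list_empty_careful(&flctx->flc_posix))
		return 0;			/* fast path: no POSIX locks */
	spin_lock(&flctx->flc_lock);
	list_for_each_entry(fl, &flctx->flc_posix, fl_list) {
		/* examine fl under flc_lock */
	}
	spin_unlock(&flctx->flc_lock);

The unlocked check is safe because i_flctx is assigned at most once per inode (see locks_get_lock_context() in the fs/locks.c hunks below) and list_empty_careful() tolerates concurrent updates; losing a race only means taking the spinlock and finding the list empty, or missing locks that are being added concurrently, which these callers already tolerate.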
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 9340e7e10ef6..5b651daad518 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -95,14 +95,6 @@ nlm_decode_fh(__be32 *p, struct nfs_fh *f)
95 return p + XDR_QUADLEN(NFS2_FHSIZE); 95 return p + XDR_QUADLEN(NFS2_FHSIZE);
96} 96}
97 97
98static inline __be32 *
99nlm_encode_fh(__be32 *p, struct nfs_fh *f)
100{
101 *p++ = htonl(NFS2_FHSIZE);
102 memcpy(p, f->data, NFS2_FHSIZE);
103 return p + XDR_QUADLEN(NFS2_FHSIZE);
104}
105
106/* 98/*
107 * Encode and decode owner handle 99 * Encode and decode owner handle
108 */ 100 */
diff --git a/fs/locks.c b/fs/locks.c
index 59e2f905e4ff..365c82e1b3a9 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -137,7 +137,7 @@
137 137
138#define IS_POSIX(fl) (fl->fl_flags & FL_POSIX) 138#define IS_POSIX(fl) (fl->fl_flags & FL_POSIX)
139#define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK) 139#define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK)
140#define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG)) 140#define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT))
141#define IS_OFDLCK(fl) (fl->fl_flags & FL_OFDLCK) 141#define IS_OFDLCK(fl) (fl->fl_flags & FL_OFDLCK)
142 142
143static bool lease_breaking(struct file_lock *fl) 143static bool lease_breaking(struct file_lock *fl)
@@ -157,14 +157,11 @@ static int target_leasetype(struct file_lock *fl)
157int leases_enable = 1; 157int leases_enable = 1;
158int lease_break_time = 45; 158int lease_break_time = 45;
159 159
160#define for_each_lock(inode, lockp) \
161 for (lockp = &inode->i_flock; *lockp != NULL; lockp = &(*lockp)->fl_next)
162
163/* 160/*
164 * The global file_lock_list is only used for displaying /proc/locks, so we 161 * The global file_lock_list is only used for displaying /proc/locks, so we
165 * keep a list on each CPU, with each list protected by its own spinlock via 162 * keep a list on each CPU, with each list protected by its own spinlock via
166 * the file_lock_lglock. Note that alterations to the list also require that 163 * the file_lock_lglock. Note that alterations to the list also require that
167 * the relevant i_lock is held. 164 * the relevant flc_lock is held.
168 */ 165 */
169DEFINE_STATIC_LGLOCK(file_lock_lglock); 166DEFINE_STATIC_LGLOCK(file_lock_lglock);
170static DEFINE_PER_CPU(struct hlist_head, file_lock_list); 167static DEFINE_PER_CPU(struct hlist_head, file_lock_list);
@@ -192,21 +189,68 @@ static DEFINE_HASHTABLE(blocked_hash, BLOCKED_HASH_BITS);
192 * contrast to those that are acting as records of acquired locks). 189 * contrast to those that are acting as records of acquired locks).
193 * 190 *
194 * Note that when we acquire this lock in order to change the above fields, 191 * Note that when we acquire this lock in order to change the above fields,
195 * we often hold the i_lock as well. In certain cases, when reading the fields 192 * we often hold the flc_lock as well. In certain cases, when reading the fields
196 * protected by this lock, we can skip acquiring it iff we already hold the 193 * protected by this lock, we can skip acquiring it iff we already hold the
197 * i_lock. 194 * flc_lock.
198 * 195 *
199 * In particular, adding an entry to the fl_block list requires that you hold 196 * In particular, adding an entry to the fl_block list requires that you hold
200 * both the i_lock and the blocked_lock_lock (acquired in that order). Deleting 197 * both the flc_lock and the blocked_lock_lock (acquired in that order).
201 * an entry from the list however only requires the file_lock_lock. 198 * Deleting an entry from the list however only requires the file_lock_lock.
202 */ 199 */
203static DEFINE_SPINLOCK(blocked_lock_lock); 200static DEFINE_SPINLOCK(blocked_lock_lock);
204 201
202static struct kmem_cache *flctx_cache __read_mostly;
205static struct kmem_cache *filelock_cache __read_mostly; 203static struct kmem_cache *filelock_cache __read_mostly;
206 204
205static struct file_lock_context *
206locks_get_lock_context(struct inode *inode)
207{
208 struct file_lock_context *new;
209
210 if (likely(inode->i_flctx))
211 goto out;
212
213 new = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
214 if (!new)
215 goto out;
216
217 spin_lock_init(&new->flc_lock);
218 INIT_LIST_HEAD(&new->flc_flock);
219 INIT_LIST_HEAD(&new->flc_posix);
220 INIT_LIST_HEAD(&new->flc_lease);
221
222 /*
223 * Assign the pointer if it's not already assigned. If it is, then
224 * free the context we just allocated.
225 */
226 spin_lock(&inode->i_lock);
227 if (likely(!inode->i_flctx)) {
228 inode->i_flctx = new;
229 new = NULL;
230 }
231 spin_unlock(&inode->i_lock);
232
233 if (new)
234 kmem_cache_free(flctx_cache, new);
235out:
236 return inode->i_flctx;
237}
238
239void
240locks_free_lock_context(struct file_lock_context *ctx)
241{
242 if (ctx) {
243 WARN_ON_ONCE(!list_empty(&ctx->flc_flock));
244 WARN_ON_ONCE(!list_empty(&ctx->flc_posix));
245 WARN_ON_ONCE(!list_empty(&ctx->flc_lease));
246 kmem_cache_free(flctx_cache, ctx);
247 }
248}
249
207static void locks_init_lock_heads(struct file_lock *fl) 250static void locks_init_lock_heads(struct file_lock *fl)
208{ 251{
209 INIT_HLIST_NODE(&fl->fl_link); 252 INIT_HLIST_NODE(&fl->fl_link);
253 INIT_LIST_HEAD(&fl->fl_list);
210 INIT_LIST_HEAD(&fl->fl_block); 254 INIT_LIST_HEAD(&fl->fl_block);
211 init_waitqueue_head(&fl->fl_wait); 255 init_waitqueue_head(&fl->fl_wait);
212} 256}
@@ -243,6 +287,7 @@ EXPORT_SYMBOL_GPL(locks_release_private);
243void locks_free_lock(struct file_lock *fl) 287void locks_free_lock(struct file_lock *fl)
244{ 288{
245 BUG_ON(waitqueue_active(&fl->fl_wait)); 289 BUG_ON(waitqueue_active(&fl->fl_wait));
290 BUG_ON(!list_empty(&fl->fl_list));
246 BUG_ON(!list_empty(&fl->fl_block)); 291 BUG_ON(!list_empty(&fl->fl_block));
247 BUG_ON(!hlist_unhashed(&fl->fl_link)); 292 BUG_ON(!hlist_unhashed(&fl->fl_link));
248 293
@@ -257,8 +302,8 @@ locks_dispose_list(struct list_head *dispose)
257 struct file_lock *fl; 302 struct file_lock *fl;
258 303
259 while (!list_empty(dispose)) { 304 while (!list_empty(dispose)) {
260 fl = list_first_entry(dispose, struct file_lock, fl_block); 305 fl = list_first_entry(dispose, struct file_lock, fl_list);
261 list_del_init(&fl->fl_block); 306 list_del_init(&fl->fl_list);
262 locks_free_lock(fl); 307 locks_free_lock(fl);
263 } 308 }
264} 309}
@@ -513,7 +558,7 @@ static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2)
513 return fl1->fl_owner == fl2->fl_owner; 558 return fl1->fl_owner == fl2->fl_owner;
514} 559}
515 560
516/* Must be called with the i_lock held! */ 561/* Must be called with the flc_lock held! */
517static void locks_insert_global_locks(struct file_lock *fl) 562static void locks_insert_global_locks(struct file_lock *fl)
518{ 563{
519 lg_local_lock(&file_lock_lglock); 564 lg_local_lock(&file_lock_lglock);
@@ -522,12 +567,12 @@ static void locks_insert_global_locks(struct file_lock *fl)
522 lg_local_unlock(&file_lock_lglock); 567 lg_local_unlock(&file_lock_lglock);
523} 568}
524 569
525/* Must be called with the i_lock held! */ 570/* Must be called with the flc_lock held! */
526static void locks_delete_global_locks(struct file_lock *fl) 571static void locks_delete_global_locks(struct file_lock *fl)
527{ 572{
528 /* 573 /*
529 * Avoid taking lock if already unhashed. This is safe since this check 574 * Avoid taking lock if already unhashed. This is safe since this check
530 * is done while holding the i_lock, and new insertions into the list 575 * is done while holding the flc_lock, and new insertions into the list
531 * also require that it be held. 576 * also require that it be held.
532 */ 577 */
533 if (hlist_unhashed(&fl->fl_link)) 578 if (hlist_unhashed(&fl->fl_link))
@@ -579,10 +624,10 @@ static void locks_delete_block(struct file_lock *waiter)
579 * the order they blocked. The documentation doesn't require this but 624 * the order they blocked. The documentation doesn't require this but
580 * it seems like the reasonable thing to do. 625 * it seems like the reasonable thing to do.
581 * 626 *
582 * Must be called with both the i_lock and blocked_lock_lock held. The fl_block 627 * Must be called with both the flc_lock and blocked_lock_lock held. The
583 * list itself is protected by the blocked_lock_lock, but by ensuring that the 628 * fl_block list itself is protected by the blocked_lock_lock, but by ensuring
584 * i_lock is also held on insertions we can avoid taking the blocked_lock_lock 629 * that the flc_lock is also held on insertions we can avoid taking the
585 * in some cases when we see that the fl_block list is empty. 630 * blocked_lock_lock in some cases when we see that the fl_block list is empty.
586 */ 631 */
587static void __locks_insert_block(struct file_lock *blocker, 632static void __locks_insert_block(struct file_lock *blocker,
588 struct file_lock *waiter) 633 struct file_lock *waiter)
@@ -594,7 +639,7 @@ static void __locks_insert_block(struct file_lock *blocker,
594 locks_insert_global_blocked(waiter); 639 locks_insert_global_blocked(waiter);
595} 640}
596 641
597/* Must be called with i_lock held. */ 642/* Must be called with flc_lock held. */
598static void locks_insert_block(struct file_lock *blocker, 643static void locks_insert_block(struct file_lock *blocker,
599 struct file_lock *waiter) 644 struct file_lock *waiter)
600{ 645{
@@ -606,15 +651,15 @@ static void locks_insert_block(struct file_lock *blocker,
606/* 651/*
607 * Wake up processes blocked waiting for blocker. 652 * Wake up processes blocked waiting for blocker.
608 * 653 *
609 * Must be called with the inode->i_lock held! 654 * Must be called with the inode->flc_lock held!
610 */ 655 */
611static void locks_wake_up_blocks(struct file_lock *blocker) 656static void locks_wake_up_blocks(struct file_lock *blocker)
612{ 657{
613 /* 658 /*
614 * Avoid taking global lock if list is empty. This is safe since new 659 * Avoid taking global lock if list is empty. This is safe since new
615 * blocked requests are only added to the list under the i_lock, and 660 * blocked requests are only added to the list under the flc_lock, and
616 * the i_lock is always held here. Note that removal from the fl_block 661 * the flc_lock is always held here. Note that removal from the fl_block
617 * list does not require the i_lock, so we must recheck list_empty() 662 * list does not require the flc_lock, so we must recheck list_empty()
618 * after acquiring the blocked_lock_lock. 663 * after acquiring the blocked_lock_lock.
619 */ 664 */
620 if (list_empty(&blocker->fl_block)) 665 if (list_empty(&blocker->fl_block))
@@ -635,63 +680,32 @@ static void locks_wake_up_blocks(struct file_lock *blocker)
635 spin_unlock(&blocked_lock_lock); 680 spin_unlock(&blocked_lock_lock);
636} 681}
637 682
638/* Insert file lock fl into an inode's lock list at the position indicated 683static void
639 * by pos. At the same time add the lock to the global file lock list. 684locks_insert_lock_ctx(struct file_lock *fl, struct list_head *before)
640 *
641 * Must be called with the i_lock held!
642 */
643static void locks_insert_lock(struct file_lock **pos, struct file_lock *fl)
644{ 685{
645 fl->fl_nspid = get_pid(task_tgid(current)); 686 fl->fl_nspid = get_pid(task_tgid(current));
646 687 list_add_tail(&fl->fl_list, before);
647 /* insert into file's list */
648 fl->fl_next = *pos;
649 *pos = fl;
650
651 locks_insert_global_locks(fl); 688 locks_insert_global_locks(fl);
652} 689}
653 690
654/** 691static void
655 * locks_delete_lock - Delete a lock and then free it. 692locks_unlink_lock_ctx(struct file_lock *fl)
656 * @thisfl_p: pointer that points to the fl_next field of the previous
657 * inode->i_flock list entry
658 *
659 * Unlink a lock from all lists and free the namespace reference, but don't
660 * free it yet. Wake up processes that are blocked waiting for this lock and
661 * notify the FS that the lock has been cleared.
662 *
663 * Must be called with the i_lock held!
664 */
665static void locks_unlink_lock(struct file_lock **thisfl_p)
666{ 693{
667 struct file_lock *fl = *thisfl_p;
668
669 locks_delete_global_locks(fl); 694 locks_delete_global_locks(fl);
670 695 list_del_init(&fl->fl_list);
671 *thisfl_p = fl->fl_next;
672 fl->fl_next = NULL;
673
674 if (fl->fl_nspid) { 696 if (fl->fl_nspid) {
675 put_pid(fl->fl_nspid); 697 put_pid(fl->fl_nspid);
676 fl->fl_nspid = NULL; 698 fl->fl_nspid = NULL;
677 } 699 }
678
679 locks_wake_up_blocks(fl); 700 locks_wake_up_blocks(fl);
680} 701}
681 702
682/* 703static void
683 * Unlink a lock from all lists and free it. 704locks_delete_lock_ctx(struct file_lock *fl, struct list_head *dispose)
684 *
685 * Must be called with i_lock held!
686 */
687static void locks_delete_lock(struct file_lock **thisfl_p,
688 struct list_head *dispose)
689{ 705{
690 struct file_lock *fl = *thisfl_p; 706 locks_unlink_lock_ctx(fl);
691
692 locks_unlink_lock(thisfl_p);
693 if (dispose) 707 if (dispose)
694 list_add(&fl->fl_block, dispose); 708 list_add(&fl->fl_list, dispose);
695 else 709 else
696 locks_free_lock(fl); 710 locks_free_lock(fl);
697} 711}
@@ -746,22 +760,27 @@ void
746posix_test_lock(struct file *filp, struct file_lock *fl) 760posix_test_lock(struct file *filp, struct file_lock *fl)
747{ 761{
748 struct file_lock *cfl; 762 struct file_lock *cfl;
763 struct file_lock_context *ctx;
749 struct inode *inode = file_inode(filp); 764 struct inode *inode = file_inode(filp);
750 765
751 spin_lock(&inode->i_lock); 766 ctx = inode->i_flctx;
752 for (cfl = file_inode(filp)->i_flock; cfl; cfl = cfl->fl_next) { 767 if (!ctx || list_empty_careful(&ctx->flc_posix)) {
753 if (!IS_POSIX(cfl))
754 continue;
755 if (posix_locks_conflict(fl, cfl))
756 break;
757 }
758 if (cfl) {
759 locks_copy_conflock(fl, cfl);
760 if (cfl->fl_nspid)
761 fl->fl_pid = pid_vnr(cfl->fl_nspid);
762 } else
763 fl->fl_type = F_UNLCK; 768 fl->fl_type = F_UNLCK;
764 spin_unlock(&inode->i_lock); 769 return;
770 }
771
772 spin_lock(&ctx->flc_lock);
773 list_for_each_entry(cfl, &ctx->flc_posix, fl_list) {
774 if (posix_locks_conflict(fl, cfl)) {
775 locks_copy_conflock(fl, cfl);
776 if (cfl->fl_nspid)
777 fl->fl_pid = pid_vnr(cfl->fl_nspid);
778 goto out;
779 }
780 }
781 fl->fl_type = F_UNLCK;
782out:
783 spin_unlock(&ctx->flc_lock);
765 return; 784 return;
766} 785}
767EXPORT_SYMBOL(posix_test_lock); 786EXPORT_SYMBOL(posix_test_lock);
@@ -845,34 +864,34 @@ static int posix_locks_deadlock(struct file_lock *caller_fl,
845static int flock_lock_file(struct file *filp, struct file_lock *request) 864static int flock_lock_file(struct file *filp, struct file_lock *request)
846{ 865{
847 struct file_lock *new_fl = NULL; 866 struct file_lock *new_fl = NULL;
848 struct file_lock **before; 867 struct file_lock *fl;
849 struct inode * inode = file_inode(filp); 868 struct file_lock_context *ctx;
869 struct inode *inode = file_inode(filp);
850 int error = 0; 870 int error = 0;
851 int found = 0; 871 bool found = false;
852 LIST_HEAD(dispose); 872 LIST_HEAD(dispose);
853 873
874 ctx = locks_get_lock_context(inode);
875 if (!ctx)
876 return -ENOMEM;
877
854 if (!(request->fl_flags & FL_ACCESS) && (request->fl_type != F_UNLCK)) { 878 if (!(request->fl_flags & FL_ACCESS) && (request->fl_type != F_UNLCK)) {
855 new_fl = locks_alloc_lock(); 879 new_fl = locks_alloc_lock();
856 if (!new_fl) 880 if (!new_fl)
857 return -ENOMEM; 881 return -ENOMEM;
858 } 882 }
859 883
860 spin_lock(&inode->i_lock); 884 spin_lock(&ctx->flc_lock);
861 if (request->fl_flags & FL_ACCESS) 885 if (request->fl_flags & FL_ACCESS)
862 goto find_conflict; 886 goto find_conflict;
863 887
864 for_each_lock(inode, before) { 888 list_for_each_entry(fl, &ctx->flc_flock, fl_list) {
865 struct file_lock *fl = *before;
866 if (IS_POSIX(fl))
867 break;
868 if (IS_LEASE(fl))
869 continue;
870 if (filp != fl->fl_file) 889 if (filp != fl->fl_file)
871 continue; 890 continue;
872 if (request->fl_type == fl->fl_type) 891 if (request->fl_type == fl->fl_type)
873 goto out; 892 goto out;
874 found = 1; 893 found = true;
875 locks_delete_lock(before, &dispose); 894 locks_delete_lock_ctx(fl, &dispose);
876 break; 895 break;
877 } 896 }
878 897
@@ -882,23 +901,8 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
882 goto out; 901 goto out;
883 } 902 }
884 903
885 /*
886 * If a higher-priority process was blocked on the old file lock,
887 * give it the opportunity to lock the file.
888 */
889 if (found) {
890 spin_unlock(&inode->i_lock);
891 cond_resched();
892 spin_lock(&inode->i_lock);
893 }
894
895find_conflict: 904find_conflict:
896 for_each_lock(inode, before) { 905 list_for_each_entry(fl, &ctx->flc_flock, fl_list) {
897 struct file_lock *fl = *before;
898 if (IS_POSIX(fl))
899 break;
900 if (IS_LEASE(fl))
901 continue;
902 if (!flock_locks_conflict(request, fl)) 906 if (!flock_locks_conflict(request, fl))
903 continue; 907 continue;
904 error = -EAGAIN; 908 error = -EAGAIN;
@@ -911,12 +915,12 @@ find_conflict:
911 if (request->fl_flags & FL_ACCESS) 915 if (request->fl_flags & FL_ACCESS)
912 goto out; 916 goto out;
913 locks_copy_lock(new_fl, request); 917 locks_copy_lock(new_fl, request);
914 locks_insert_lock(before, new_fl); 918 locks_insert_lock_ctx(new_fl, &ctx->flc_flock);
915 new_fl = NULL; 919 new_fl = NULL;
916 error = 0; 920 error = 0;
917 921
918out: 922out:
919 spin_unlock(&inode->i_lock); 923 spin_unlock(&ctx->flc_lock);
920 if (new_fl) 924 if (new_fl)
921 locks_free_lock(new_fl); 925 locks_free_lock(new_fl);
922 locks_dispose_list(&dispose); 926 locks_dispose_list(&dispose);
@@ -925,16 +929,20 @@ out:
925 929
926static int __posix_lock_file(struct inode *inode, struct file_lock *request, struct file_lock *conflock) 930static int __posix_lock_file(struct inode *inode, struct file_lock *request, struct file_lock *conflock)
927{ 931{
928 struct file_lock *fl; 932 struct file_lock *fl, *tmp;
929 struct file_lock *new_fl = NULL; 933 struct file_lock *new_fl = NULL;
930 struct file_lock *new_fl2 = NULL; 934 struct file_lock *new_fl2 = NULL;
931 struct file_lock *left = NULL; 935 struct file_lock *left = NULL;
932 struct file_lock *right = NULL; 936 struct file_lock *right = NULL;
933 struct file_lock **before; 937 struct file_lock_context *ctx;
934 int error; 938 int error;
935 bool added = false; 939 bool added = false;
936 LIST_HEAD(dispose); 940 LIST_HEAD(dispose);
937 941
942 ctx = locks_get_lock_context(inode);
943 if (!ctx)
944 return -ENOMEM;
945
938 /* 946 /*
939 * We may need two file_lock structures for this operation, 947 * We may need two file_lock structures for this operation,
940 * so we get them in advance to avoid races. 948 * so we get them in advance to avoid races.
@@ -948,15 +956,14 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
948 new_fl2 = locks_alloc_lock(); 956 new_fl2 = locks_alloc_lock();
949 } 957 }
950 958
951 spin_lock(&inode->i_lock); 959 spin_lock(&ctx->flc_lock);
952 /* 960 /*
953 * New lock request. Walk all POSIX locks and look for conflicts. If 961 * New lock request. Walk all POSIX locks and look for conflicts. If
954 * there are any, either return error or put the request on the 962 * there are any, either return error or put the request on the
955 * blocker's list of waiters and the global blocked_hash. 963 * blocker's list of waiters and the global blocked_hash.
956 */ 964 */
957 if (request->fl_type != F_UNLCK) { 965 if (request->fl_type != F_UNLCK) {
958 for_each_lock(inode, before) { 966 list_for_each_entry(fl, &ctx->flc_posix, fl_list) {
959 fl = *before;
960 if (!IS_POSIX(fl)) 967 if (!IS_POSIX(fl))
961 continue; 968 continue;
962 if (!posix_locks_conflict(request, fl)) 969 if (!posix_locks_conflict(request, fl))
@@ -986,29 +993,25 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
986 if (request->fl_flags & FL_ACCESS) 993 if (request->fl_flags & FL_ACCESS)
987 goto out; 994 goto out;
988 995
989 /* 996 /* Find the first old lock with the same owner as the new lock */
990 * Find the first old lock with the same owner as the new lock. 997 list_for_each_entry(fl, &ctx->flc_posix, fl_list) {
991 */ 998 if (posix_same_owner(request, fl))
992 999 break;
993 before = &inode->i_flock;
994
995 /* First skip locks owned by other processes. */
996 while ((fl = *before) && (!IS_POSIX(fl) ||
997 !posix_same_owner(request, fl))) {
998 before = &fl->fl_next;
999 } 1000 }
1000 1001
1001 /* Process locks with this owner. */ 1002 /* Process locks with this owner. */
1002 while ((fl = *before) && posix_same_owner(request, fl)) { 1003 list_for_each_entry_safe_from(fl, tmp, &ctx->flc_posix, fl_list) {
1003 /* Detect adjacent or overlapping regions (if same lock type) 1004 if (!posix_same_owner(request, fl))
1004 */ 1005 break;
1006
1007 /* Detect adjacent or overlapping regions (if same lock type) */
1005 if (request->fl_type == fl->fl_type) { 1008 if (request->fl_type == fl->fl_type) {
1006 /* In all comparisons of start vs end, use 1009 /* In all comparisons of start vs end, use
1007 * "start - 1" rather than "end + 1". If end 1010 * "start - 1" rather than "end + 1". If end
1008 * is OFFSET_MAX, end + 1 will become negative. 1011 * is OFFSET_MAX, end + 1 will become negative.
1009 */ 1012 */
1010 if (fl->fl_end < request->fl_start - 1) 1013 if (fl->fl_end < request->fl_start - 1)
1011 goto next_lock; 1014 continue;
1012 /* If the next lock in the list has entirely bigger 1015 /* If the next lock in the list has entirely bigger
1013 * addresses than the new one, insert the lock here. 1016 * addresses than the new one, insert the lock here.
1014 */ 1017 */
@@ -1029,18 +1032,17 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
1029 else 1032 else
1030 request->fl_end = fl->fl_end; 1033 request->fl_end = fl->fl_end;
1031 if (added) { 1034 if (added) {
1032 locks_delete_lock(before, &dispose); 1035 locks_delete_lock_ctx(fl, &dispose);
1033 continue; 1036 continue;
1034 } 1037 }
1035 request = fl; 1038 request = fl;
1036 added = true; 1039 added = true;
1037 } 1040 } else {
1038 else {
1039 /* Processing for different lock types is a bit 1041 /* Processing for different lock types is a bit
1040 * more complex. 1042 * more complex.
1041 */ 1043 */
1042 if (fl->fl_end < request->fl_start) 1044 if (fl->fl_end < request->fl_start)
1043 goto next_lock; 1045 continue;
1044 if (fl->fl_start > request->fl_end) 1046 if (fl->fl_start > request->fl_end)
1045 break; 1047 break;
1046 if (request->fl_type == F_UNLCK) 1048 if (request->fl_type == F_UNLCK)
@@ -1059,7 +1061,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
1059 * one (This may happen several times). 1061 * one (This may happen several times).
1060 */ 1062 */
1061 if (added) { 1063 if (added) {
1062 locks_delete_lock(before, &dispose); 1064 locks_delete_lock_ctx(fl, &dispose);
1063 continue; 1065 continue;
1064 } 1066 }
1065 /* 1067 /*
@@ -1075,15 +1077,11 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
1075 locks_copy_lock(new_fl, request); 1077 locks_copy_lock(new_fl, request);
1076 request = new_fl; 1078 request = new_fl;
1077 new_fl = NULL; 1079 new_fl = NULL;
1078 locks_delete_lock(before, &dispose); 1080 locks_insert_lock_ctx(request, &fl->fl_list);
1079 locks_insert_lock(before, request); 1081 locks_delete_lock_ctx(fl, &dispose);
1080 added = true; 1082 added = true;
1081 } 1083 }
1082 } 1084 }
1083 /* Go on to next lock.
1084 */
1085 next_lock:
1086 before = &fl->fl_next;
1087 } 1085 }
1088 1086
1089 /* 1087 /*
@@ -1108,7 +1106,8 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
1108 goto out; 1106 goto out;
1109 } 1107 }
1110 locks_copy_lock(new_fl, request); 1108 locks_copy_lock(new_fl, request);
1111 locks_insert_lock(before, new_fl); 1109 locks_insert_lock_ctx(new_fl, &fl->fl_list);
1110 fl = new_fl;
1112 new_fl = NULL; 1111 new_fl = NULL;
1113 } 1112 }
1114 if (right) { 1113 if (right) {
@@ -1119,7 +1118,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
1119 left = new_fl2; 1118 left = new_fl2;
1120 new_fl2 = NULL; 1119 new_fl2 = NULL;
1121 locks_copy_lock(left, right); 1120 locks_copy_lock(left, right);
1122 locks_insert_lock(before, left); 1121 locks_insert_lock_ctx(left, &fl->fl_list);
1123 } 1122 }
1124 right->fl_start = request->fl_end + 1; 1123 right->fl_start = request->fl_end + 1;
1125 locks_wake_up_blocks(right); 1124 locks_wake_up_blocks(right);
@@ -1129,7 +1128,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
1129 locks_wake_up_blocks(left); 1128 locks_wake_up_blocks(left);
1130 } 1129 }
1131 out: 1130 out:
1132 spin_unlock(&inode->i_lock); 1131 spin_unlock(&ctx->flc_lock);
1133 /* 1132 /*
1134 * Free any unused locks. 1133 * Free any unused locks.
1135 */ 1134 */
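
The hunks above swap the old fl_next pointer walk for list_for_each_entry_safe_from() over ctx->flc_posix, but the merging logic itself is unchanged: adjacent or overlapping same-type locks from one owner still coalesce into a single record. A minimal userspace sketch of that observable behaviour (assumptions: Linux, a local filesystem, and /tmp/lockdemo as a scratch path):

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>

static void set_wrlock(int fd, off_t start, off_t len)
{
    struct flock fl = {
        .l_type   = F_WRLCK,
        .l_whence = SEEK_SET,
        .l_start  = start,
        .l_len    = len,
    };

    if (fcntl(fd, F_SETLK, &fl) == -1) {
        perror("F_SETLK");
        exit(1);
    }
}

int main(void)
{
    int fd = open("/tmp/lockdemo", O_RDWR | O_CREAT, 0644);

    if (fd == -1) {
        perror("open");
        return 1;
    }
    set_wrlock(fd, 0, 100);     /* bytes 0-99 */
    set_wrlock(fd, 100, 100);   /* bytes 100-199, adjacent to the first */
    /* /proc/locks should now show one POSIX WRITE lock covering 0-199,
     * the result of the coalescing done in __posix_lock_file() */
    system("grep POSIX /proc/locks");
    return 0;
}
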
@@ -1199,22 +1198,29 @@ EXPORT_SYMBOL(posix_lock_file_wait);
1199 */ 1198 */
1200int locks_mandatory_locked(struct file *file) 1199int locks_mandatory_locked(struct file *file)
1201{ 1200{
1201 int ret;
1202 struct inode *inode = file_inode(file); 1202 struct inode *inode = file_inode(file);
1203 struct file_lock_context *ctx;
1203 struct file_lock *fl; 1204 struct file_lock *fl;
1204 1205
1206 ctx = inode->i_flctx;
1207 if (!ctx || list_empty_careful(&ctx->flc_posix))
1208 return 0;
1209
1205 /* 1210 /*
1206 * Search the lock list for this inode for any POSIX locks. 1211 * Search the lock list for this inode for any POSIX locks.
1207 */ 1212 */
1208 spin_lock(&inode->i_lock); 1213 spin_lock(&ctx->flc_lock);
1209 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { 1214 ret = 0;
1210 if (!IS_POSIX(fl)) 1215 list_for_each_entry(fl, &ctx->flc_posix, fl_list) {
1211 continue;
1212 if (fl->fl_owner != current->files && 1216 if (fl->fl_owner != current->files &&
1213 fl->fl_owner != file) 1217 fl->fl_owner != file) {
1218 ret = -EAGAIN;
1214 break; 1219 break;
1220 }
1215 } 1221 }
1216 spin_unlock(&inode->i_lock); 1222 spin_unlock(&ctx->flc_lock);
1217 return fl ? -EAGAIN : 0; 1223 return ret;
1218} 1224}
1219 1225
1220/** 1226/**
@@ -1294,9 +1300,8 @@ static void lease_clear_pending(struct file_lock *fl, int arg)
1294} 1300}
1295 1301
1296/* We already had a lease on this file; just change its type */ 1302/* We already had a lease on this file; just change its type */
1297int lease_modify(struct file_lock **before, int arg, struct list_head *dispose) 1303int lease_modify(struct file_lock *fl, int arg, struct list_head *dispose)
1298{ 1304{
1299 struct file_lock *fl = *before;
1300 int error = assign_type(fl, arg); 1305 int error = assign_type(fl, arg);
1301 1306
1302 if (error) 1307 if (error)
@@ -1313,7 +1318,7 @@ int lease_modify(struct file_lock **before, int arg, struct list_head *dispose)
1313 printk(KERN_ERR "locks_delete_lock: fasync == %p\n", fl->fl_fasync); 1318 printk(KERN_ERR "locks_delete_lock: fasync == %p\n", fl->fl_fasync);
1314 fl->fl_fasync = NULL; 1319 fl->fl_fasync = NULL;
1315 } 1320 }
1316 locks_delete_lock(before, dispose); 1321 locks_delete_lock_ctx(fl, dispose);
1317 } 1322 }
1318 return 0; 1323 return 0;
1319} 1324}
@@ -1329,25 +1334,24 @@ static bool past_time(unsigned long then)
1329 1334
1330static void time_out_leases(struct inode *inode, struct list_head *dispose) 1335static void time_out_leases(struct inode *inode, struct list_head *dispose)
1331{ 1336{
1332 struct file_lock **before; 1337 struct file_lock_context *ctx = inode->i_flctx;
1333 struct file_lock *fl; 1338 struct file_lock *fl, *tmp;
1334 1339
1335 lockdep_assert_held(&inode->i_lock); 1340 lockdep_assert_held(&ctx->flc_lock);
1336 1341
1337 before = &inode->i_flock; 1342 list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list) {
1338 while ((fl = *before) && IS_LEASE(fl) && lease_breaking(fl)) {
1339 trace_time_out_leases(inode, fl); 1343 trace_time_out_leases(inode, fl);
1340 if (past_time(fl->fl_downgrade_time)) 1344 if (past_time(fl->fl_downgrade_time))
1341 lease_modify(before, F_RDLCK, dispose); 1345 lease_modify(fl, F_RDLCK, dispose);
1342 if (past_time(fl->fl_break_time)) 1346 if (past_time(fl->fl_break_time))
1343 lease_modify(before, F_UNLCK, dispose); 1347 lease_modify(fl, F_UNLCK, dispose);
1344 if (fl == *before) /* lease_modify may have freed fl */
1345 before = &fl->fl_next;
1346 } 1348 }
1347} 1349}
1348 1350
1349static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker) 1351static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker)
1350{ 1352{
1353 if ((breaker->fl_flags & FL_LAYOUT) != (lease->fl_flags & FL_LAYOUT))
1354 return false;
1351 if ((breaker->fl_flags & FL_DELEG) && (lease->fl_flags & FL_LEASE)) 1355 if ((breaker->fl_flags & FL_DELEG) && (lease->fl_flags & FL_LEASE))
1352 return false; 1356 return false;
1353 return locks_conflict(breaker, lease); 1357 return locks_conflict(breaker, lease);
@@ -1356,11 +1360,12 @@ static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker)
1356static bool 1360static bool
1357any_leases_conflict(struct inode *inode, struct file_lock *breaker) 1361any_leases_conflict(struct inode *inode, struct file_lock *breaker)
1358{ 1362{
1363 struct file_lock_context *ctx = inode->i_flctx;
1359 struct file_lock *fl; 1364 struct file_lock *fl;
1360 1365
1361 lockdep_assert_held(&inode->i_lock); 1366 lockdep_assert_held(&ctx->flc_lock);
1362 1367
1363 for (fl = inode->i_flock ; fl && IS_LEASE(fl); fl = fl->fl_next) { 1368 list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
1364 if (leases_conflict(fl, breaker)) 1369 if (leases_conflict(fl, breaker))
1365 return true; 1370 return true;
1366 } 1371 }
@@ -1384,7 +1389,8 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
1384{ 1389{
1385 int error = 0; 1390 int error = 0;
1386 struct file_lock *new_fl; 1391 struct file_lock *new_fl;
1387 struct file_lock *fl, **before; 1392 struct file_lock_context *ctx = inode->i_flctx;
1393 struct file_lock *fl;
1388 unsigned long break_time; 1394 unsigned long break_time;
1389 int want_write = (mode & O_ACCMODE) != O_RDONLY; 1395 int want_write = (mode & O_ACCMODE) != O_RDONLY;
1390 LIST_HEAD(dispose); 1396 LIST_HEAD(dispose);
@@ -1394,7 +1400,13 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
1394 return PTR_ERR(new_fl); 1400 return PTR_ERR(new_fl);
1395 new_fl->fl_flags = type; 1401 new_fl->fl_flags = type;
1396 1402
1397 spin_lock(&inode->i_lock); 1403 /* typically we will check that ctx is non-NULL before calling */
1404 if (!ctx) {
1405 WARN_ON_ONCE(1);
1406 return error;
1407 }
1408
1409 spin_lock(&ctx->flc_lock);
1398 1410
1399 time_out_leases(inode, &dispose); 1411 time_out_leases(inode, &dispose);
1400 1412
@@ -1408,9 +1420,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
1408 break_time++; /* so that 0 means no break time */ 1420 break_time++; /* so that 0 means no break time */
1409 } 1421 }
1410 1422
1411 for (before = &inode->i_flock; 1423 list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
1412 ((fl = *before) != NULL) && IS_LEASE(fl);
1413 before = &fl->fl_next) {
1414 if (!leases_conflict(fl, new_fl)) 1424 if (!leases_conflict(fl, new_fl))
1415 continue; 1425 continue;
1416 if (want_write) { 1426 if (want_write) {
@@ -1419,17 +1429,16 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
1419 fl->fl_flags |= FL_UNLOCK_PENDING; 1429 fl->fl_flags |= FL_UNLOCK_PENDING;
1420 fl->fl_break_time = break_time; 1430 fl->fl_break_time = break_time;
1421 } else { 1431 } else {
1422 if (lease_breaking(inode->i_flock)) 1432 if (lease_breaking(fl))
1423 continue; 1433 continue;
1424 fl->fl_flags |= FL_DOWNGRADE_PENDING; 1434 fl->fl_flags |= FL_DOWNGRADE_PENDING;
1425 fl->fl_downgrade_time = break_time; 1435 fl->fl_downgrade_time = break_time;
1426 } 1436 }
1427 if (fl->fl_lmops->lm_break(fl)) 1437 if (fl->fl_lmops->lm_break(fl))
1428 locks_delete_lock(before, &dispose); 1438 locks_delete_lock_ctx(fl, &dispose);
1429 } 1439 }
1430 1440
1431 fl = inode->i_flock; 1441 if (list_empty(&ctx->flc_lease))
1432 if (!fl || !IS_LEASE(fl))
1433 goto out; 1442 goto out;
1434 1443
1435 if (mode & O_NONBLOCK) { 1444 if (mode & O_NONBLOCK) {
@@ -1439,18 +1448,19 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
1439 } 1448 }
1440 1449
1441restart: 1450restart:
1442 break_time = inode->i_flock->fl_break_time; 1451 fl = list_first_entry(&ctx->flc_lease, struct file_lock, fl_list);
1452 break_time = fl->fl_break_time;
1443 if (break_time != 0) 1453 if (break_time != 0)
1444 break_time -= jiffies; 1454 break_time -= jiffies;
1445 if (break_time == 0) 1455 if (break_time == 0)
1446 break_time++; 1456 break_time++;
1447 locks_insert_block(inode->i_flock, new_fl); 1457 locks_insert_block(fl, new_fl);
1448 trace_break_lease_block(inode, new_fl); 1458 trace_break_lease_block(inode, new_fl);
1449 spin_unlock(&inode->i_lock); 1459 spin_unlock(&ctx->flc_lock);
1450 locks_dispose_list(&dispose); 1460 locks_dispose_list(&dispose);
1451 error = wait_event_interruptible_timeout(new_fl->fl_wait, 1461 error = wait_event_interruptible_timeout(new_fl->fl_wait,
1452 !new_fl->fl_next, break_time); 1462 !new_fl->fl_next, break_time);
1453 spin_lock(&inode->i_lock); 1463 spin_lock(&ctx->flc_lock);
1454 trace_break_lease_unblock(inode, new_fl); 1464 trace_break_lease_unblock(inode, new_fl);
1455 locks_delete_block(new_fl); 1465 locks_delete_block(new_fl);
1456 if (error >= 0) { 1466 if (error >= 0) {
@@ -1462,12 +1472,10 @@ restart:
1462 time_out_leases(inode, &dispose); 1472 time_out_leases(inode, &dispose);
1463 if (any_leases_conflict(inode, new_fl)) 1473 if (any_leases_conflict(inode, new_fl))
1464 goto restart; 1474 goto restart;
1465
1466 error = 0; 1475 error = 0;
1467 } 1476 }
1468
1469out: 1477out:
1470 spin_unlock(&inode->i_lock); 1478 spin_unlock(&ctx->flc_lock);
1471 locks_dispose_list(&dispose); 1479 locks_dispose_list(&dispose);
1472 locks_free_lock(new_fl); 1480 locks_free_lock(new_fl);
1473 return error; 1481 return error;
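
__break_lease() now walks ctx->flc_lease under flc_lock, but the userspace contract is untouched: a conflicting open signals the lease holder, which must release or downgrade before the break timer (fs.lease-break-time seconds) fires, at which point time_out_leases() forces the issue. A hedged sketch of the holder's side (assumptions: Linux/glibc, and /tmp/leasedemo owned by the caller):

#define _GNU_SOURCE
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static volatile sig_atomic_t lease_broken;

static void on_sigio(int sig)
{
    (void)sig;
    lease_broken = 1;
}

int main(void)
{
    int fd = open("/tmp/leasedemo", O_RDWR | O_CREAT, 0644);

    if (fd == -1) {
        perror("open");
        return 1;
    }
    signal(SIGIO, on_sigio);
    if (fcntl(fd, F_SETLEASE, F_WRLCK) == -1) { /* take a write lease */
        perror("F_SETLEASE");
        return 1;
    }
    printf("write lease held; open the file from another process\n");
    while (!lease_broken)
        pause();
    /* release before fs.lease-break-time expires, or the kernel
     * times the lease out for us via time_out_leases() */
    fcntl(fd, F_SETLEASE, F_UNLCK);
    printf("lease released\n");
    return 0;
}
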
@@ -1487,14 +1495,18 @@ EXPORT_SYMBOL(__break_lease);
1487void lease_get_mtime(struct inode *inode, struct timespec *time) 1495void lease_get_mtime(struct inode *inode, struct timespec *time)
1488{ 1496{
1489 bool has_lease = false; 1497 bool has_lease = false;
1490 struct file_lock *flock; 1498 struct file_lock_context *ctx = inode->i_flctx;
1499 struct file_lock *fl;
1491 1500
1492 if (inode->i_flock) { 1501 if (ctx && !list_empty_careful(&ctx->flc_lease)) {
1493 spin_lock(&inode->i_lock); 1502 spin_lock(&ctx->flc_lock);
1494 flock = inode->i_flock; 1503 if (!list_empty(&ctx->flc_lease)) {
1495 if (flock && IS_LEASE(flock) && (flock->fl_type == F_WRLCK)) 1504 fl = list_first_entry(&ctx->flc_lease,
1496 has_lease = true; 1505 struct file_lock, fl_list);
1497 spin_unlock(&inode->i_lock); 1506 if (fl->fl_type == F_WRLCK)
1507 has_lease = true;
1508 }
1509 spin_unlock(&ctx->flc_lock);
1498 } 1510 }
1499 1511
1500 if (has_lease) 1512 if (has_lease)
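
lease_get_mtime() and the reworked fcntl_getlease() below both lean on the same two-step idiom: list_empty_careful() peeks at the list with no lock held, and only an apparently non-empty list is re-checked under flc_lock before anything is dereferenced. A generic pthread analogue of the idiom (assumption: lease_type is only ever written with the mutex held; the atomic flag stands in for the careful list peek):

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_bool has_lease;   /* stands in for !list_empty(&ctx->flc_lease) */
static int lease_type;          /* protected by lock */

int get_lease_type(void)
{
    int type = -1;                      /* "no lease held" */

    if (!atomic_load(&has_lease))       /* unlocked fast path */
        return type;
    pthread_mutex_lock(&lock);
    if (atomic_load(&has_lease))        /* re-check under the lock */
        type = lease_type;
    pthread_mutex_unlock(&lock);
    return type;
}

The fast path may race with a concurrent setter, exactly as in the kernel version; the worst case is a harmless extra trip through the locked section or a clean early return.
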
@@ -1532,20 +1544,22 @@ int fcntl_getlease(struct file *filp)
1532{ 1544{
1533 struct file_lock *fl; 1545 struct file_lock *fl;
1534 struct inode *inode = file_inode(filp); 1546 struct inode *inode = file_inode(filp);
1547 struct file_lock_context *ctx = inode->i_flctx;
1535 int type = F_UNLCK; 1548 int type = F_UNLCK;
1536 LIST_HEAD(dispose); 1549 LIST_HEAD(dispose);
1537 1550
1538 spin_lock(&inode->i_lock); 1551 if (ctx && !list_empty_careful(&ctx->flc_lease)) {
1539 time_out_leases(file_inode(filp), &dispose); 1552 spin_lock(&ctx->flc_lock);
1540 for (fl = file_inode(filp)->i_flock; fl && IS_LEASE(fl); 1553 time_out_leases(file_inode(filp), &dispose);
1541 fl = fl->fl_next) { 1554 list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
1542 if (fl->fl_file == filp) { 1555 if (fl->fl_file != filp)
1556 continue;
1543 type = target_leasetype(fl); 1557 type = target_leasetype(fl);
1544 break; 1558 break;
1545 } 1559 }
1560 spin_unlock(&ctx->flc_lock);
1561 locks_dispose_list(&dispose);
1546 } 1562 }
1547 spin_unlock(&inode->i_lock);
1548 locks_dispose_list(&dispose);
1549 return type; 1563 return type;
1550} 1564}
1551 1565
@@ -1560,11 +1574,14 @@ int fcntl_getlease(struct file *filp)
1560 * conflict with the lease we're trying to set. 1574 * conflict with the lease we're trying to set.
1561 */ 1575 */
1562static int 1576static int
1563check_conflicting_open(const struct dentry *dentry, const long arg) 1577check_conflicting_open(const struct dentry *dentry, const long arg, int flags)
1564{ 1578{
1565 int ret = 0; 1579 int ret = 0;
1566 struct inode *inode = dentry->d_inode; 1580 struct inode *inode = dentry->d_inode;
1567 1581
1582 if (flags & FL_LAYOUT)
1583 return 0;
1584
1568 if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) 1585 if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
1569 return -EAGAIN; 1586 return -EAGAIN;
1570 1587
@@ -1578,9 +1595,10 @@ check_conflicting_open(const struct dentry *dentry, const long arg)
1578static int 1595static int
1579generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **priv) 1596generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **priv)
1580{ 1597{
1581 struct file_lock *fl, **before, **my_before = NULL, *lease; 1598 struct file_lock *fl, *my_fl = NULL, *lease;
1582 struct dentry *dentry = filp->f_path.dentry; 1599 struct dentry *dentry = filp->f_path.dentry;
1583 struct inode *inode = dentry->d_inode; 1600 struct inode *inode = dentry->d_inode;
1601 struct file_lock_context *ctx;
1584 bool is_deleg = (*flp)->fl_flags & FL_DELEG; 1602 bool is_deleg = (*flp)->fl_flags & FL_DELEG;
1585 int error; 1603 int error;
1586 LIST_HEAD(dispose); 1604 LIST_HEAD(dispose);
@@ -1588,6 +1606,10 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
1588 lease = *flp; 1606 lease = *flp;
1589 trace_generic_add_lease(inode, lease); 1607 trace_generic_add_lease(inode, lease);
1590 1608
1609 ctx = locks_get_lock_context(inode);
1610 if (!ctx)
1611 return -ENOMEM;
1612
1591 /* 1613 /*
1592 * In the delegation case we need mutual exclusion with 1614 * In the delegation case we need mutual exclusion with
1593 * a number of operations that take the i_mutex. We trylock 1615 * a number of operations that take the i_mutex. We trylock
@@ -1606,9 +1628,9 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
1606 return -EINVAL; 1628 return -EINVAL;
1607 } 1629 }
1608 1630
1609 spin_lock(&inode->i_lock); 1631 spin_lock(&ctx->flc_lock);
1610 time_out_leases(inode, &dispose); 1632 time_out_leases(inode, &dispose);
1611 error = check_conflicting_open(dentry, arg); 1633 error = check_conflicting_open(dentry, arg, lease->fl_flags);
1612 if (error) 1634 if (error)
1613 goto out; 1635 goto out;
1614 1636
@@ -1621,13 +1643,13 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
1621 * except for this filp. 1643 * except for this filp.
1622 */ 1644 */
1623 error = -EAGAIN; 1645 error = -EAGAIN;
1624 for (before = &inode->i_flock; 1646 list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
1625 ((fl = *before) != NULL) && IS_LEASE(fl); 1647 if (fl->fl_file == filp &&
1626 before = &fl->fl_next) { 1648 fl->fl_owner == lease->fl_owner) {
1627 if (fl->fl_file == filp) { 1649 my_fl = fl;
1628 my_before = before;
1629 continue; 1650 continue;
1630 } 1651 }
1652
1631 /* 1653 /*
1632 * No exclusive leases if someone else has a lease on 1654 * No exclusive leases if someone else has a lease on
1633 * this file: 1655 * this file:
@@ -1642,9 +1664,8 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
1642 goto out; 1664 goto out;
1643 } 1665 }
1644 1666
1645 if (my_before != NULL) { 1667 if (my_fl != NULL) {
1646 lease = *my_before; 1668 error = lease->fl_lmops->lm_change(my_fl, arg, &dispose);
1647 error = lease->fl_lmops->lm_change(my_before, arg, &dispose);
1648 if (error) 1669 if (error)
1649 goto out; 1670 goto out;
1650 goto out_setup; 1671 goto out_setup;
@@ -1654,7 +1675,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
1654 if (!leases_enable) 1675 if (!leases_enable)
1655 goto out; 1676 goto out;
1656 1677
1657 locks_insert_lock(before, lease); 1678 locks_insert_lock_ctx(lease, &ctx->flc_lease);
1658 /* 1679 /*
1659 * The check in break_lease() is lockless. It's possible for another 1680 * The check in break_lease() is lockless. It's possible for another
1660 * open to race in after we did the earlier check for a conflicting 1681 * open to race in after we did the earlier check for a conflicting
@@ -1665,46 +1686,51 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
1665 * precedes these checks. 1686 * precedes these checks.
1666 */ 1687 */
1667 smp_mb(); 1688 smp_mb();
1668 error = check_conflicting_open(dentry, arg); 1689 error = check_conflicting_open(dentry, arg, lease->fl_flags);
1669 if (error) 1690 if (error) {
1670 goto out_unlink; 1691 locks_unlink_lock_ctx(lease);
1692 goto out;
1693 }
1671 1694
1672out_setup: 1695out_setup:
1673 if (lease->fl_lmops->lm_setup) 1696 if (lease->fl_lmops->lm_setup)
1674 lease->fl_lmops->lm_setup(lease, priv); 1697 lease->fl_lmops->lm_setup(lease, priv);
1675out: 1698out:
1676 spin_unlock(&inode->i_lock); 1699 spin_unlock(&ctx->flc_lock);
1677 locks_dispose_list(&dispose); 1700 locks_dispose_list(&dispose);
1678 if (is_deleg) 1701 if (is_deleg)
1679 mutex_unlock(&inode->i_mutex); 1702 mutex_unlock(&inode->i_mutex);
1680 if (!error && !my_before) 1703 if (!error && !my_fl)
1681 *flp = NULL; 1704 *flp = NULL;
1682 return error; 1705 return error;
1683out_unlink:
1684 locks_unlink_lock(before);
1685 goto out;
1686} 1706}
1687 1707
1688static int generic_delete_lease(struct file *filp) 1708static int generic_delete_lease(struct file *filp, void *owner)
1689{ 1709{
1690 int error = -EAGAIN; 1710 int error = -EAGAIN;
1691 struct file_lock *fl, **before; 1711 struct file_lock *fl, *victim = NULL;
1692 struct dentry *dentry = filp->f_path.dentry; 1712 struct dentry *dentry = filp->f_path.dentry;
1693 struct inode *inode = dentry->d_inode; 1713 struct inode *inode = dentry->d_inode;
1714 struct file_lock_context *ctx = inode->i_flctx;
1694 LIST_HEAD(dispose); 1715 LIST_HEAD(dispose);
1695 1716
1696 spin_lock(&inode->i_lock); 1717 if (!ctx) {
1697 time_out_leases(inode, &dispose); 1718 trace_generic_delete_lease(inode, NULL);
1698 for (before = &inode->i_flock; 1719 return error;
1699 ((fl = *before) != NULL) && IS_LEASE(fl); 1720 }
1700 before = &fl->fl_next) { 1721
1701 if (fl->fl_file == filp) 1722 spin_lock(&ctx->flc_lock);
1723 list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
1724 if (fl->fl_file == filp &&
1725 fl->fl_owner == owner) {
1726 victim = fl;
1702 break; 1727 break;
1728 }
1703 } 1729 }
1704 trace_generic_delete_lease(inode, fl); 1730 trace_generic_delete_lease(inode, fl);
1705 if (fl && IS_LEASE(fl)) 1731 if (victim)
1706 error = fl->fl_lmops->lm_change(before, F_UNLCK, &dispose); 1732 error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose);
1707 spin_unlock(&inode->i_lock); 1733 spin_unlock(&ctx->flc_lock);
1708 locks_dispose_list(&dispose); 1734 locks_dispose_list(&dispose);
1709 return error; 1735 return error;
1710} 1736}
@@ -1737,13 +1763,14 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp,
1737 1763
1738 switch (arg) { 1764 switch (arg) {
1739 case F_UNLCK: 1765 case F_UNLCK:
1740 return generic_delete_lease(filp); 1766 return generic_delete_lease(filp, *priv);
1741 case F_RDLCK: 1767 case F_RDLCK:
1742 case F_WRLCK: 1768 case F_WRLCK:
1743 if (!(*flp)->fl_lmops->lm_break) { 1769 if (!(*flp)->fl_lmops->lm_break) {
1744 WARN_ON_ONCE(1); 1770 WARN_ON_ONCE(1);
1745 return -ENOLCK; 1771 return -ENOLCK;
1746 } 1772 }
1773
1747 return generic_add_lease(filp, arg, flp, priv); 1774 return generic_add_lease(filp, arg, flp, priv);
1748 default: 1775 default:
1749 return -EINVAL; 1776 return -EINVAL;
@@ -1816,7 +1843,7 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
1816int fcntl_setlease(unsigned int fd, struct file *filp, long arg) 1843int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
1817{ 1844{
1818 if (arg == F_UNLCK) 1845 if (arg == F_UNLCK)
1819 return vfs_setlease(filp, F_UNLCK, NULL, NULL); 1846 return vfs_setlease(filp, F_UNLCK, NULL, (void **)&filp);
1820 return do_fcntl_add_lease(fd, filp, arg); 1847 return do_fcntl_add_lease(fd, filp, arg);
1821} 1848}
1822 1849
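
With generic_delete_lease() now matching on both fl_file and fl_owner, fcntl_setlease() passes the filp itself as owner, so F_UNLCK drops only the lease set through that open file description. A small sketch of the round trip (assumptions: /tmp/leasedemo2 is owned by the caller and is illustrative; a read lease requires a read-only descriptor):

#define _GNU_SOURCE
#include <assert.h>
#include <fcntl.h>
#include <stdio.h>

int main(void)
{
    int fd = open("/tmp/leasedemo2", O_RDONLY | O_CREAT, 0644);

    if (fd == -1) {
        perror("open");
        return 1;
    }
    if (fcntl(fd, F_SETLEASE, F_RDLCK) == -1) {
        perror("F_SETLEASE");
        return 1;
    }
    assert(fcntl(fd, F_GETLEASE) == F_RDLCK);
    fcntl(fd, F_SETLEASE, F_UNLCK);     /* unlock via the same description */
    assert(fcntl(fd, F_GETLEASE) == F_UNLCK);
    puts("lease taken and dropped on one open file description");
    return 0;
}
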
@@ -2171,7 +2198,7 @@ again:
2171 */ 2198 */
2172 /* 2199 /*
2173 * we need that spin_lock here - it prevents reordering between 2200 * we need that spin_lock here - it prevents reordering between
2174 * update of inode->i_flock and check for it done in close(). 2201 * update of i_flctx->flc_posix and check for it done in close().
2175 * rcu_read_lock() wouldn't do. 2202 * rcu_read_lock() wouldn't do.
2176 */ 2203 */
2177 spin_lock(&current->files->file_lock); 2204 spin_lock(&current->files->file_lock);
@@ -2331,13 +2358,14 @@ out:
2331void locks_remove_posix(struct file *filp, fl_owner_t owner) 2358void locks_remove_posix(struct file *filp, fl_owner_t owner)
2332{ 2359{
2333 struct file_lock lock; 2360 struct file_lock lock;
2361 struct file_lock_context *ctx = file_inode(filp)->i_flctx;
2334 2362
2335 /* 2363 /*
2336 * If there are no locks held on this file, we don't need to call 2364 * If there are no locks held on this file, we don't need to call
2337 * posix_lock_file(). Another process could be setting a lock on this 2365 * posix_lock_file(). Another process could be setting a lock on this
2338 * file at the same time, but we wouldn't remove that lock anyway. 2366 * file at the same time, but we wouldn't remove that lock anyway.
2339 */ 2367 */
2340 if (!file_inode(filp)->i_flock) 2368 if (!ctx || list_empty(&ctx->flc_posix))
2341 return; 2369 return;
2342 2370
2343 lock.fl_type = F_UNLCK; 2371 lock.fl_type = F_UNLCK;
@@ -2358,67 +2386,68 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner)
2358 2386
2359EXPORT_SYMBOL(locks_remove_posix); 2387EXPORT_SYMBOL(locks_remove_posix);
2360 2388
2389/* The i_flctx must be valid when calling into here */
2390static void
2391locks_remove_flock(struct file *filp)
2392{
2393 struct file_lock fl = {
2394 .fl_owner = filp,
2395 .fl_pid = current->tgid,
2396 .fl_file = filp,
2397 .fl_flags = FL_FLOCK,
2398 .fl_type = F_UNLCK,
2399 .fl_end = OFFSET_MAX,
2400 };
2401 struct file_lock_context *flctx = file_inode(filp)->i_flctx;
2402
2403 if (list_empty(&flctx->flc_flock))
2404 return;
2405
2406 if (filp->f_op->flock)
2407 filp->f_op->flock(filp, F_SETLKW, &fl);
2408 else
2409 flock_lock_file(filp, &fl);
2410
2411 if (fl.fl_ops && fl.fl_ops->fl_release_private)
2412 fl.fl_ops->fl_release_private(&fl);
2413}
2414
2415/* The i_flctx must be valid when calling into here */
2416static void
2417locks_remove_lease(struct file *filp)
2418{
2419 struct inode *inode = file_inode(filp);
2420 struct file_lock_context *ctx = inode->i_flctx;
2421 struct file_lock *fl, *tmp;
2422 LIST_HEAD(dispose);
2423
2424 if (list_empty(&ctx->flc_lease))
2425 return;
2426
2427 spin_lock(&ctx->flc_lock);
2428 list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list)
2429 if (filp == fl->fl_file)
2430 lease_modify(fl, F_UNLCK, &dispose);
2431 spin_unlock(&ctx->flc_lock);
2432 locks_dispose_list(&dispose);
2433}
2434
2361/* 2435/*
2362 * This function is called on the last close of an open file. 2436 * This function is called on the last close of an open file.
2363 */ 2437 */
2364void locks_remove_file(struct file *filp) 2438void locks_remove_file(struct file *filp)
2365{ 2439{
2366 struct inode * inode = file_inode(filp); 2440 if (!file_inode(filp)->i_flctx)
2367 struct file_lock *fl;
2368 struct file_lock **before;
2369 LIST_HEAD(dispose);
2370
2371 if (!inode->i_flock)
2372 return; 2441 return;
2373 2442
2443 /* remove any OFD locks */
2374 locks_remove_posix(filp, filp); 2444 locks_remove_posix(filp, filp);
2375 2445
2376 if (filp->f_op->flock) { 2446 /* remove flock locks */
2377 struct file_lock fl = { 2447 locks_remove_flock(filp);
2378 .fl_owner = filp,
2379 .fl_pid = current->tgid,
2380 .fl_file = filp,
2381 .fl_flags = FL_FLOCK,
2382 .fl_type = F_UNLCK,
2383 .fl_end = OFFSET_MAX,
2384 };
2385 filp->f_op->flock(filp, F_SETLKW, &fl);
2386 if (fl.fl_ops && fl.fl_ops->fl_release_private)
2387 fl.fl_ops->fl_release_private(&fl);
2388 }
2389
2390 spin_lock(&inode->i_lock);
2391 before = &inode->i_flock;
2392 2448
2393 while ((fl = *before) != NULL) { 2449 /* remove any leases */
2394 if (fl->fl_file == filp) { 2450 locks_remove_lease(filp);
2395 if (IS_LEASE(fl)) {
2396 lease_modify(before, F_UNLCK, &dispose);
2397 continue;
2398 }
2399
2400 /*
2401 * There's a leftover lock on the list of a type that
2402 * we didn't expect to see. Most likely a classic
2403 * POSIX lock that ended up not getting released
2404 * properly, or that raced onto the list somehow. Log
2405 * some info about it and then just remove it from
2406 * the list.
2407 */
2408 WARN(!IS_FLOCK(fl),
2409 "leftover lock: dev=%u:%u ino=%lu type=%hhd flags=0x%x start=%lld end=%lld\n",
2410 MAJOR(inode->i_sb->s_dev),
2411 MINOR(inode->i_sb->s_dev), inode->i_ino,
2412 fl->fl_type, fl->fl_flags,
2413 fl->fl_start, fl->fl_end);
2414
2415 locks_delete_lock(before, &dispose);
2416 continue;
2417 }
2418 before = &fl->fl_next;
2419 }
2420 spin_unlock(&inode->i_lock);
2421 locks_dispose_list(&dispose);
2422} 2451}
2423 2452
2424/** 2453/**
@@ -2621,6 +2650,9 @@ static int __init filelock_init(void)
2621{ 2650{
2622 int i; 2651 int i;
2623 2652
2653 flctx_cache = kmem_cache_create("file_lock_ctx",
2654 sizeof(struct file_lock_context), 0, SLAB_PANIC, NULL);
2655
2624 filelock_cache = kmem_cache_create("file_lock_cache", 2656 filelock_cache = kmem_cache_create("file_lock_cache",
2625 sizeof(struct file_lock), 0, SLAB_PANIC, NULL); 2657 sizeof(struct file_lock), 0, SLAB_PANIC, NULL);
2626 2658
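
filelock_init() now also sizes a slab for the per-inode context object everything above hangs off. The layout itself is not shown in this hunk; inferred from the call sites (the flc_lock spinlock plus the three flc_* lists used throughout locks.c), it is roughly:

/* sketch inferred from usage above, not copied from the header */
struct file_lock_context {
    spinlock_t          flc_lock;
    struct list_head    flc_flock;
    struct list_head    flc_posix;
    struct list_head    flc_lease;
};
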
diff --git a/fs/mount.h b/fs/mount.h
index 0ad6f760ce52..6a61c2b3e385 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -2,6 +2,7 @@
2#include <linux/seq_file.h> 2#include <linux/seq_file.h>
3#include <linux/poll.h> 3#include <linux/poll.h>
4#include <linux/ns_common.h> 4#include <linux/ns_common.h>
5#include <linux/fs_pin.h>
5 6
6struct mnt_namespace { 7struct mnt_namespace {
7 atomic_t count; 8 atomic_t count;
@@ -62,7 +63,8 @@ struct mount {
62 int mnt_group_id; /* peer group identifier */ 63 int mnt_group_id; /* peer group identifier */
63 int mnt_expiry_mark; /* true if marked for expiry */ 64 int mnt_expiry_mark; /* true if marked for expiry */
64 struct hlist_head mnt_pins; 65 struct hlist_head mnt_pins;
65 struct path mnt_ex_mountpoint; 66 struct fs_pin mnt_umount;
67 struct dentry *mnt_ex_mountpoint;
66}; 68};
67 69
68#define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */ 70#define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */
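
mnt_umount is the new hook by which a dying mount keeps a pin on its ex-mountpoint until namespace teardown. The structure lives in the new <linux/fs_pin.h>; a rough sketch of its shape, inferred from how drop_mountpoint() and init_fs_pin() are used in fs/namespace.c below:

/* approximate layout; see include/linux/fs_pin.h for the real one */
struct fs_pin {
    wait_queue_head_t   wait;
    int                 done;
    struct hlist_node   s_list;     /* per-group list */
    struct hlist_node   m_list;     /* per-mount list */
    void                (*kill)(struct fs_pin *);
};

static inline void init_fs_pin(struct fs_pin *p, void (*kill)(struct fs_pin *))
{
    init_waitqueue_head(&p->wait);
    INIT_HLIST_NODE(&p->s_list);
    INIT_HLIST_NODE(&p->m_list);
    p->kill = kill;
}
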
diff --git a/fs/namei.c b/fs/namei.c
index bc35b02883bb..96ca11dea4a2 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -118,15 +118,6 @@
118 * POSIX.1 2.4: an empty pathname is invalid (ENOENT). 118 * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
119 * PATH_MAX includes the nul terminator --RR. 119 * PATH_MAX includes the nul terminator --RR.
120 */ 120 */
121void final_putname(struct filename *name)
122{
123 if (name->separate) {
124 __putname(name->name);
125 kfree(name);
126 } else {
127 __putname(name);
128 }
129}
130 121
131#define EMBEDDED_NAME_MAX (PATH_MAX - sizeof(struct filename)) 122#define EMBEDDED_NAME_MAX (PATH_MAX - sizeof(struct filename))
132 123
@@ -145,6 +136,7 @@ getname_flags(const char __user *filename, int flags, int *empty)
145 result = __getname(); 136 result = __getname();
146 if (unlikely(!result)) 137 if (unlikely(!result))
147 return ERR_PTR(-ENOMEM); 138 return ERR_PTR(-ENOMEM);
139 result->refcnt = 1;
148 140
149 /* 141 /*
150 * First, try to embed the struct filename inside the names_cache 142 * First, try to embed the struct filename inside the names_cache
@@ -179,6 +171,7 @@ recopy:
179 } 171 }
180 result->name = kname; 172 result->name = kname;
181 result->separate = true; 173 result->separate = true;
174 result->refcnt = 1;
182 max = PATH_MAX; 175 max = PATH_MAX;
183 goto recopy; 176 goto recopy;
184 } 177 }
@@ -202,7 +195,7 @@ recopy:
202 return result; 195 return result;
203 196
204error: 197error:
205 final_putname(result); 198 putname(result);
206 return err; 199 return err;
207} 200}
208 201
@@ -212,43 +205,56 @@ getname(const char __user * filename)
212 return getname_flags(filename, 0, NULL); 205 return getname_flags(filename, 0, NULL);
213} 206}
214 207
215/*
216 * The "getname_kernel()" interface doesn't do pathnames longer
217 * than EMBEDDED_NAME_MAX. Deal with it - you're a kernel user.
218 */
219struct filename * 208struct filename *
220getname_kernel(const char * filename) 209getname_kernel(const char * filename)
221{ 210{
222 struct filename *result; 211 struct filename *result;
223 char *kname; 212 int len = strlen(filename) + 1;
224 int len;
225
226 len = strlen(filename);
227 if (len >= EMBEDDED_NAME_MAX)
228 return ERR_PTR(-ENAMETOOLONG);
229 213
230 result = __getname(); 214 result = __getname();
231 if (unlikely(!result)) 215 if (unlikely(!result))
232 return ERR_PTR(-ENOMEM); 216 return ERR_PTR(-ENOMEM);
233 217
234 kname = (char *)result + sizeof(*result); 218 if (len <= EMBEDDED_NAME_MAX) {
235 result->name = kname; 219 result->name = (char *)(result) + sizeof(*result);
220 result->separate = false;
221 } else if (len <= PATH_MAX) {
222 struct filename *tmp;
223
224 tmp = kmalloc(sizeof(*tmp), GFP_KERNEL);
225 if (unlikely(!tmp)) {
226 __putname(result);
227 return ERR_PTR(-ENOMEM);
228 }
229 tmp->name = (char *)result;
230 tmp->separate = true;
231 result = tmp;
232 } else {
233 __putname(result);
234 return ERR_PTR(-ENAMETOOLONG);
235 }
236 memcpy((char *)result->name, filename, len);
236 result->uptr = NULL; 237 result->uptr = NULL;
237 result->aname = NULL; 238 result->aname = NULL;
238 result->separate = false; 239 result->refcnt = 1;
240 audit_getname(result);
239 241
240 strlcpy(kname, filename, EMBEDDED_NAME_MAX);
241 return result; 242 return result;
242} 243}
243 244
244#ifdef CONFIG_AUDITSYSCALL
245void putname(struct filename *name) 245void putname(struct filename *name)
246{ 246{
247 if (unlikely(!audit_dummy_context())) 247 BUG_ON(name->refcnt <= 0);
248 return audit_putname(name); 248
249 final_putname(name); 249 if (--name->refcnt > 0)
250 return;
251
252 if (name->separate) {
253 __putname(name->name);
254 kfree(name);
255 } else
256 __putname(name);
250} 257}
251#endif
252 258
253static int check_acl(struct inode *inode, int mask) 259static int check_acl(struct inode *inode, int mask)
254{ 260{
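
putname() is now unconditional and pairs with the refcnt that getname(), getname_kernel(), and audit manage, so audit can hold a name past the end of the syscall by taking a reference instead of copying it. A minimal userspace analogue of that ownership model (assumption: single-threaded, as the kernel side relies on the name belonging to one task):

#include <stdlib.h>
#include <string.h>

struct name {
    char    *str;
    int     refcnt;
};

static struct name *name_new(const char *s)
{
    struct name *n = malloc(sizeof(*n));

    if (!n)
        return NULL;
    n->str = strdup(s);
    if (!n->str) {
        free(n);
        return NULL;
    }
    n->refcnt = 1;          /* the caller's reference */
    return n;
}

static struct name *name_get(struct name *n)
{
    n->refcnt++;            /* e.g. audit keeping the name alive */
    return n;
}

static void name_put(struct name *n)
{
    if (--n->refcnt > 0)
        return;
    free(n->str);
    free(n);
}
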
@@ -2036,31 +2042,47 @@ static int filename_lookup(int dfd, struct filename *name,
2036static int do_path_lookup(int dfd, const char *name, 2042static int do_path_lookup(int dfd, const char *name,
2037 unsigned int flags, struct nameidata *nd) 2043 unsigned int flags, struct nameidata *nd)
2038{ 2044{
2039 struct filename filename = { .name = name }; 2045 struct filename *filename = getname_kernel(name);
2046 int retval = PTR_ERR(filename);
2040 2047
2041 return filename_lookup(dfd, &filename, flags, nd); 2048 if (!IS_ERR(filename)) {
2049 retval = filename_lookup(dfd, filename, flags, nd);
2050 putname(filename);
2051 }
2052 return retval;
2042} 2053}
2043 2054
2044/* does lookup, returns the object with parent locked */ 2055/* does lookup, returns the object with parent locked */
2045struct dentry *kern_path_locked(const char *name, struct path *path) 2056struct dentry *kern_path_locked(const char *name, struct path *path)
2046{ 2057{
2058 struct filename *filename = getname_kernel(name);
2047 struct nameidata nd; 2059 struct nameidata nd;
2048 struct dentry *d; 2060 struct dentry *d;
2049 int err = do_path_lookup(AT_FDCWD, name, LOOKUP_PARENT, &nd); 2061 int err;
2050 if (err) 2062
2051 return ERR_PTR(err); 2063 if (IS_ERR(filename))
2064 return ERR_CAST(filename);
2065
2066 err = filename_lookup(AT_FDCWD, filename, LOOKUP_PARENT, &nd);
2067 if (err) {
2068 d = ERR_PTR(err);
2069 goto out;
2070 }
2052 if (nd.last_type != LAST_NORM) { 2071 if (nd.last_type != LAST_NORM) {
2053 path_put(&nd.path); 2072 path_put(&nd.path);
2054 return ERR_PTR(-EINVAL); 2073 d = ERR_PTR(-EINVAL);
2074 goto out;
2055 } 2075 }
2056 mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); 2076 mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
2057 d = __lookup_hash(&nd.last, nd.path.dentry, 0); 2077 d = __lookup_hash(&nd.last, nd.path.dentry, 0);
2058 if (IS_ERR(d)) { 2078 if (IS_ERR(d)) {
2059 mutex_unlock(&nd.path.dentry->d_inode->i_mutex); 2079 mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
2060 path_put(&nd.path); 2080 path_put(&nd.path);
2061 return d; 2081 goto out;
2062 } 2082 }
2063 *path = nd.path; 2083 *path = nd.path;
2084out:
2085 putname(filename);
2064 return d; 2086 return d;
2065} 2087}
2066 2088
@@ -2351,13 +2373,17 @@ static int
2351filename_mountpoint(int dfd, struct filename *s, struct path *path, 2373filename_mountpoint(int dfd, struct filename *s, struct path *path,
2352 unsigned int flags) 2374 unsigned int flags)
2353{ 2375{
2354 int error = path_mountpoint(dfd, s->name, path, flags | LOOKUP_RCU); 2376 int error;
2377 if (IS_ERR(s))
2378 return PTR_ERR(s);
2379 error = path_mountpoint(dfd, s->name, path, flags | LOOKUP_RCU);
2355 if (unlikely(error == -ECHILD)) 2380 if (unlikely(error == -ECHILD))
2356 error = path_mountpoint(dfd, s->name, path, flags); 2381 error = path_mountpoint(dfd, s->name, path, flags);
2357 if (unlikely(error == -ESTALE)) 2382 if (unlikely(error == -ESTALE))
2358 error = path_mountpoint(dfd, s->name, path, flags | LOOKUP_REVAL); 2383 error = path_mountpoint(dfd, s->name, path, flags | LOOKUP_REVAL);
2359 if (likely(!error)) 2384 if (likely(!error))
2360 audit_inode(s, path->dentry, 0); 2385 audit_inode(s, path->dentry, 0);
2386 putname(s);
2361 return error; 2387 return error;
2362} 2388}
2363 2389
@@ -2379,21 +2405,14 @@ int
2379user_path_mountpoint_at(int dfd, const char __user *name, unsigned int flags, 2405user_path_mountpoint_at(int dfd, const char __user *name, unsigned int flags,
2380 struct path *path) 2406 struct path *path)
2381{ 2407{
2382 struct filename *s = getname(name); 2408 return filename_mountpoint(dfd, getname(name), path, flags);
2383 int error;
2384 if (IS_ERR(s))
2385 return PTR_ERR(s);
2386 error = filename_mountpoint(dfd, s, path, flags);
2387 putname(s);
2388 return error;
2389} 2409}
2390 2410
2391int 2411int
2392kern_path_mountpoint(int dfd, const char *name, struct path *path, 2412kern_path_mountpoint(int dfd, const char *name, struct path *path,
2393 unsigned int flags) 2413 unsigned int flags)
2394{ 2414{
2395 struct filename s = {.name = name}; 2415 return filename_mountpoint(dfd, getname_kernel(name), path, flags);
2396 return filename_mountpoint(dfd, &s, path, flags);
2397} 2416}
2398EXPORT_SYMBOL(kern_path_mountpoint); 2417EXPORT_SYMBOL(kern_path_mountpoint);
2399 2418
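
user_path_mountpoint_at() and kern_path_mountpoint() can shrink to one line because filename_mountpoint() now owns both the IS_ERR() check and the final putname(). The error-pointer convention that makes this composition work, as a freestanding sketch:

#include <errno.h>
#include <stdio.h>

static inline void *ERR_PTR(long err)      { return (void *)err; }
static inline long  PTR_ERR(const void *p) { return (long)p; }
static inline int   IS_ERR(const void *p)
{
    return (unsigned long)p >= (unsigned long)-4095;
}

static int consume(void *obj)   /* stands in for filename_mountpoint() */
{
    if (IS_ERR(obj))
        return (int)PTR_ERR(obj);   /* the error travelled as the pointer */
    /* ... use obj, then release it ... */
    return 0;
}

int main(void)
{
    printf("%d\n", consume(ERR_PTR(-ENOMEM)));  /* prints -12 */
    return 0;
}
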
@@ -3273,7 +3292,7 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
3273{ 3292{
3274 struct nameidata nd; 3293 struct nameidata nd;
3275 struct file *file; 3294 struct file *file;
3276 struct filename filename = { .name = name }; 3295 struct filename *filename;
3277 int flags = op->lookup_flags | LOOKUP_ROOT; 3296 int flags = op->lookup_flags | LOOKUP_ROOT;
3278 3297
3279 nd.root.mnt = mnt; 3298 nd.root.mnt = mnt;
@@ -3282,15 +3301,20 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
3282 if (d_is_symlink(dentry) && op->intent & LOOKUP_OPEN) 3301 if (d_is_symlink(dentry) && op->intent & LOOKUP_OPEN)
3283 return ERR_PTR(-ELOOP); 3302 return ERR_PTR(-ELOOP);
3284 3303
3285 file = path_openat(-1, &filename, &nd, op, flags | LOOKUP_RCU); 3304 filename = getname_kernel(name);
3305 if (unlikely(IS_ERR(filename)))
3306 return ERR_CAST(filename);
3307
3308 file = path_openat(-1, filename, &nd, op, flags | LOOKUP_RCU);
3286 if (unlikely(file == ERR_PTR(-ECHILD))) 3309 if (unlikely(file == ERR_PTR(-ECHILD)))
3287 file = path_openat(-1, &filename, &nd, op, flags); 3310 file = path_openat(-1, filename, &nd, op, flags);
3288 if (unlikely(file == ERR_PTR(-ESTALE))) 3311 if (unlikely(file == ERR_PTR(-ESTALE)))
3289 file = path_openat(-1, &filename, &nd, op, flags | LOOKUP_REVAL); 3312 file = path_openat(-1, filename, &nd, op, flags | LOOKUP_REVAL);
3313 putname(filename);
3290 return file; 3314 return file;
3291} 3315}
3292 3316
3293struct dentry *kern_path_create(int dfd, const char *pathname, 3317static struct dentry *filename_create(int dfd, struct filename *name,
3294 struct path *path, unsigned int lookup_flags) 3318 struct path *path, unsigned int lookup_flags)
3295{ 3319{
3296 struct dentry *dentry = ERR_PTR(-EEXIST); 3320 struct dentry *dentry = ERR_PTR(-EEXIST);
@@ -3305,7 +3329,7 @@ struct dentry *kern_path_create(int dfd, const char *pathname,
3305 */ 3329 */
3306 lookup_flags &= LOOKUP_REVAL; 3330 lookup_flags &= LOOKUP_REVAL;
3307 3331
3308 error = do_path_lookup(dfd, pathname, LOOKUP_PARENT|lookup_flags, &nd); 3332 error = filename_lookup(dfd, name, LOOKUP_PARENT|lookup_flags, &nd);
3309 if (error) 3333 if (error)
3310 return ERR_PTR(error); 3334 return ERR_PTR(error);
3311 3335
@@ -3359,6 +3383,19 @@ out:
3359 path_put(&nd.path); 3383 path_put(&nd.path);
3360 return dentry; 3384 return dentry;
3361} 3385}
3386
3387struct dentry *kern_path_create(int dfd, const char *pathname,
3388 struct path *path, unsigned int lookup_flags)
3389{
3390 struct filename *filename = getname_kernel(pathname);
3391 struct dentry *res;
3392
3393 if (IS_ERR(filename))
3394 return ERR_CAST(filename);
3395 res = filename_create(dfd, filename, path, lookup_flags);
3396 putname(filename);
3397 return res;
3398}
3362EXPORT_SYMBOL(kern_path_create); 3399EXPORT_SYMBOL(kern_path_create);
3363 3400
3364void done_path_create(struct path *path, struct dentry *dentry) 3401void done_path_create(struct path *path, struct dentry *dentry)
@@ -3377,7 +3414,7 @@ struct dentry *user_path_create(int dfd, const char __user *pathname,
3377 struct dentry *res; 3414 struct dentry *res;
3378 if (IS_ERR(tmp)) 3415 if (IS_ERR(tmp))
3379 return ERR_CAST(tmp); 3416 return ERR_CAST(tmp);
3380 res = kern_path_create(dfd, tmp->name, path, lookup_flags); 3417 res = filename_create(dfd, tmp, path, lookup_flags);
3381 putname(tmp); 3418 putname(tmp);
3382 return res; 3419 return res;
3383} 3420}
diff --git a/fs/namespace.c b/fs/namespace.c
index cd1e9681a0cf..72a286e0d33e 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -190,6 +190,14 @@ unsigned int mnt_get_count(struct mount *mnt)
190#endif 190#endif
191} 191}
192 192
193static void drop_mountpoint(struct fs_pin *p)
194{
195 struct mount *m = container_of(p, struct mount, mnt_umount);
196 dput(m->mnt_ex_mountpoint);
197 pin_remove(p);
198 mntput(&m->mnt);
199}
200
193static struct mount *alloc_vfsmnt(const char *name) 201static struct mount *alloc_vfsmnt(const char *name)
194{ 202{
195 struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); 203 struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
@@ -201,7 +209,7 @@ static struct mount *alloc_vfsmnt(const char *name)
201 goto out_free_cache; 209 goto out_free_cache;
202 210
203 if (name) { 211 if (name) {
204 mnt->mnt_devname = kstrdup(name, GFP_KERNEL); 212 mnt->mnt_devname = kstrdup_const(name, GFP_KERNEL);
205 if (!mnt->mnt_devname) 213 if (!mnt->mnt_devname)
206 goto out_free_id; 214 goto out_free_id;
207 } 215 }
@@ -229,12 +237,13 @@ static struct mount *alloc_vfsmnt(const char *name)
229#ifdef CONFIG_FSNOTIFY 237#ifdef CONFIG_FSNOTIFY
230 INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks); 238 INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
231#endif 239#endif
240 init_fs_pin(&mnt->mnt_umount, drop_mountpoint);
232 } 241 }
233 return mnt; 242 return mnt;
234 243
235#ifdef CONFIG_SMP 244#ifdef CONFIG_SMP
236out_free_devname: 245out_free_devname:
237 kfree(mnt->mnt_devname); 246 kfree_const(mnt->mnt_devname);
238#endif 247#endif
239out_free_id: 248out_free_id:
240 mnt_free_id(mnt); 249 mnt_free_id(mnt);
@@ -568,7 +577,7 @@ int sb_prepare_remount_readonly(struct super_block *sb)
568 577
569static void free_vfsmnt(struct mount *mnt) 578static void free_vfsmnt(struct mount *mnt)
570{ 579{
571 kfree(mnt->mnt_devname); 580 kfree_const(mnt->mnt_devname);
572#ifdef CONFIG_SMP 581#ifdef CONFIG_SMP
573 free_percpu(mnt->mnt_pcp); 582 free_percpu(mnt->mnt_pcp);
574#endif 583#endif
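
alloc_vfsmnt() and free_vfsmnt() switch to the kstrdup_const()/kfree_const() pair, which skips the copy when the source string already sits in kernel rodata, as most device names passed as literals do. Their behaviour is roughly:

/* rough semantics; the real helpers live in mm/util.c */
const char *kstrdup_const(const char *s, gfp_t gfp)
{
    if (is_kernel_rodata((unsigned long)s))
        return s;               /* share the literal, no copy */
    return kstrdup(s, gfp);
}

void kfree_const(const void *x)
{
    if (!is_kernel_rodata((unsigned long)x))
        kfree(x);               /* only free what was duplicated */
}
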
@@ -1289,7 +1298,6 @@ static HLIST_HEAD(unmounted); /* protected by namespace_sem */
1289 1298
1290static void namespace_unlock(void) 1299static void namespace_unlock(void)
1291{ 1300{
1292 struct mount *mnt;
1293 struct hlist_head head = unmounted; 1301 struct hlist_head head = unmounted;
1294 1302
1295 if (likely(hlist_empty(&head))) { 1303 if (likely(hlist_empty(&head))) {
@@ -1299,23 +1307,11 @@ static void namespace_unlock(void)
1299 1307
1300 head.first->pprev = &head.first; 1308 head.first->pprev = &head.first;
1301 INIT_HLIST_HEAD(&unmounted); 1309 INIT_HLIST_HEAD(&unmounted);
1302
1303 /* undo decrements we'd done in umount_tree() */
1304 hlist_for_each_entry(mnt, &head, mnt_hash)
1305 if (mnt->mnt_ex_mountpoint.mnt)
1306 mntget(mnt->mnt_ex_mountpoint.mnt);
1307
1308 up_write(&namespace_sem); 1310 up_write(&namespace_sem);
1309 1311
1310 synchronize_rcu(); 1312 synchronize_rcu();
1311 1313
1312 while (!hlist_empty(&head)) { 1314 group_pin_kill(&head);
1313 mnt = hlist_entry(head.first, struct mount, mnt_hash);
1314 hlist_del_init(&mnt->mnt_hash);
1315 if (mnt->mnt_ex_mountpoint.mnt)
1316 path_put(&mnt->mnt_ex_mountpoint);
1317 mntput(&mnt->mnt);
1318 }
1319} 1315}
1320 1316
1321static inline void namespace_lock(void) 1317static inline void namespace_lock(void)
@@ -1334,7 +1330,6 @@ void umount_tree(struct mount *mnt, int how)
1334{ 1330{
1335 HLIST_HEAD(tmp_list); 1331 HLIST_HEAD(tmp_list);
1336 struct mount *p; 1332 struct mount *p;
1337 struct mount *last = NULL;
1338 1333
1339 for (p = mnt; p; p = next_mnt(p, mnt)) { 1334 for (p = mnt; p; p = next_mnt(p, mnt)) {
1340 hlist_del_init_rcu(&p->mnt_hash); 1335 hlist_del_init_rcu(&p->mnt_hash);
@@ -1347,33 +1342,28 @@ void umount_tree(struct mount *mnt, int how)
1347 if (how) 1342 if (how)
1348 propagate_umount(&tmp_list); 1343 propagate_umount(&tmp_list);
1349 1344
1350 hlist_for_each_entry(p, &tmp_list, mnt_hash) { 1345 while (!hlist_empty(&tmp_list)) {
1346 p = hlist_entry(tmp_list.first, struct mount, mnt_hash);
1347 hlist_del_init_rcu(&p->mnt_hash);
1351 list_del_init(&p->mnt_expire); 1348 list_del_init(&p->mnt_expire);
1352 list_del_init(&p->mnt_list); 1349 list_del_init(&p->mnt_list);
1353 __touch_mnt_namespace(p->mnt_ns); 1350 __touch_mnt_namespace(p->mnt_ns);
1354 p->mnt_ns = NULL; 1351 p->mnt_ns = NULL;
1355 if (how < 2) 1352 if (how < 2)
1356 p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; 1353 p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
1354
1355 pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt, &unmounted);
1357 if (mnt_has_parent(p)) { 1356 if (mnt_has_parent(p)) {
1358 hlist_del_init(&p->mnt_mp_list); 1357 hlist_del_init(&p->mnt_mp_list);
1359 put_mountpoint(p->mnt_mp); 1358 put_mountpoint(p->mnt_mp);
1360 mnt_add_count(p->mnt_parent, -1); 1359 mnt_add_count(p->mnt_parent, -1);
1361 /* move the reference to mountpoint into ->mnt_ex_mountpoint */ 1360 /* old mountpoint will be dropped when we can do that */
1362 p->mnt_ex_mountpoint.dentry = p->mnt_mountpoint; 1361 p->mnt_ex_mountpoint = p->mnt_mountpoint;
1363 p->mnt_ex_mountpoint.mnt = &p->mnt_parent->mnt;
1364 p->mnt_mountpoint = p->mnt.mnt_root; 1362 p->mnt_mountpoint = p->mnt.mnt_root;
1365 p->mnt_parent = p; 1363 p->mnt_parent = p;
1366 p->mnt_mp = NULL; 1364 p->mnt_mp = NULL;
1367 } 1365 }
1368 change_mnt_propagation(p, MS_PRIVATE); 1366 change_mnt_propagation(p, MS_PRIVATE);
1369 last = p;
1370 }
1371 if (last) {
1372 last->mnt_hash.next = unmounted.first;
1373 if (unmounted.first)
1374 unmounted.first->pprev = &last->mnt_hash.next;
1375 unmounted.first = tmp_list.first;
1376 unmounted.first->pprev = &unmounted.first;
1377 } 1367 }
1378} 1368}
1379 1369
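
umount_tree() now parks each victim on the shared unmounted list through its mnt_umount pin, and namespace_unlock() hands the whole batch to group_pin_kill() after the RCU grace period. Assuming fs/fs_pin.c keeps the obvious shape, the consumer side amounts to something like:

/* assumed sketch of the group kill loop in fs/fs_pin.c */
void group_pin_kill(struct hlist_head *p)
{
    while (!hlist_empty(p)) {
        struct fs_pin *pin;

        pin = hlist_entry(p->first, struct fs_pin, s_list);
        pin_kill(pin);  /* ends in pin->kill(pin), i.e. drop_mountpoint() */
    }
}
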
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 008960101520..e7ca827d7694 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -77,6 +77,7 @@ static int ncp_hash_dentry(const struct dentry *, struct qstr *);
77static int ncp_compare_dentry(const struct dentry *, const struct dentry *, 77static int ncp_compare_dentry(const struct dentry *, const struct dentry *,
78 unsigned int, const char *, const struct qstr *); 78 unsigned int, const char *, const struct qstr *);
79static int ncp_delete_dentry(const struct dentry *); 79static int ncp_delete_dentry(const struct dentry *);
80static void ncp_d_prune(struct dentry *dentry);
80 81
81const struct dentry_operations ncp_dentry_operations = 82const struct dentry_operations ncp_dentry_operations =
82{ 83{
@@ -84,6 +85,7 @@ const struct dentry_operations ncp_dentry_operations =
84 .d_hash = ncp_hash_dentry, 85 .d_hash = ncp_hash_dentry,
85 .d_compare = ncp_compare_dentry, 86 .d_compare = ncp_compare_dentry,
86 .d_delete = ncp_delete_dentry, 87 .d_delete = ncp_delete_dentry,
88 .d_prune = ncp_d_prune,
87}; 89};
88 90
89#define ncp_namespace(i) (NCP_SERVER(i)->name_space[NCP_FINFO(i)->volNumber]) 91#define ncp_namespace(i) (NCP_SERVER(i)->name_space[NCP_FINFO(i)->volNumber])
@@ -384,42 +386,6 @@ finished:
384 return val; 386 return val;
385} 387}
386 388
387static struct dentry *
388ncp_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos)
389{
390 struct dentry *dent = dentry;
391
392 if (d_validate(dent, parent)) {
393 if (dent->d_name.len <= NCP_MAXPATHLEN &&
394 (unsigned long)dent->d_fsdata == fpos) {
395 if (!dent->d_inode) {
396 dput(dent);
397 dent = NULL;
398 }
399 return dent;
400 }
401 dput(dent);
402 }
403
404 /* If a pointer is invalid, we search the dentry. */
405 spin_lock(&parent->d_lock);
406 list_for_each_entry(dent, &parent->d_subdirs, d_child) {
407 if ((unsigned long)dent->d_fsdata == fpos) {
408 if (dent->d_inode)
409 dget(dent);
410 else
411 dent = NULL;
412 spin_unlock(&parent->d_lock);
413 goto out;
414 }
415 }
416 spin_unlock(&parent->d_lock);
417 return NULL;
418
419out:
420 return dent;
421}
422
423static time_t ncp_obtain_mtime(struct dentry *dentry) 389static time_t ncp_obtain_mtime(struct dentry *dentry)
424{ 390{
425 struct inode *inode = dentry->d_inode; 391 struct inode *inode = dentry->d_inode;
@@ -435,6 +401,20 @@ static time_t ncp_obtain_mtime(struct dentry *dentry)
435 return ncp_date_dos2unix(i.modifyTime, i.modifyDate); 401 return ncp_date_dos2unix(i.modifyTime, i.modifyDate);
436} 402}
437 403
404static inline void
405ncp_invalidate_dircache_entries(struct dentry *parent)
406{
407 struct ncp_server *server = NCP_SERVER(parent->d_inode);
408 struct dentry *dentry;
409
410 spin_lock(&parent->d_lock);
411 list_for_each_entry(dentry, &parent->d_subdirs, d_child) {
412 dentry->d_fsdata = NULL;
413 ncp_age_dentry(server, dentry);
414 }
415 spin_unlock(&parent->d_lock);
416}
417
438static int ncp_readdir(struct file *file, struct dir_context *ctx) 418static int ncp_readdir(struct file *file, struct dir_context *ctx)
439{ 419{
440 struct dentry *dentry = file->f_path.dentry; 420 struct dentry *dentry = file->f_path.dentry;
@@ -500,10 +480,21 @@ static int ncp_readdir(struct file *file, struct dir_context *ctx)
500 struct dentry *dent; 480 struct dentry *dent;
501 bool over; 481 bool over;
502 482
503 dent = ncp_dget_fpos(ctl.cache->dentry[ctl.idx], 483 spin_lock(&dentry->d_lock);
504 dentry, ctx->pos); 484 if (!(NCP_FINFO(inode)->flags & NCPI_DIR_CACHE)) {
505 if (!dent) 485 spin_unlock(&dentry->d_lock);
486 goto invalid_cache;
487 }
488 dent = ctl.cache->dentry[ctl.idx];
489 if (unlikely(!lockref_get_not_dead(&dent->d_lockref))) {
490 spin_unlock(&dentry->d_lock);
491 goto invalid_cache;
492 }
493 spin_unlock(&dentry->d_lock);
494 if (!dent->d_inode) {
495 dput(dent);
506 goto invalid_cache; 496 goto invalid_cache;
497 }
507 over = !dir_emit(ctx, dent->d_name.name, 498 over = !dir_emit(ctx, dent->d_name.name,
508 dent->d_name.len, 499 dent->d_name.len,
509 dent->d_inode->i_ino, DT_UNKNOWN); 500 dent->d_inode->i_ino, DT_UNKNOWN);
@@ -548,6 +539,9 @@ init_cache:
548 ctl.filled = 0; 539 ctl.filled = 0;
549 ctl.valid = 1; 540 ctl.valid = 1;
550read_really: 541read_really:
542 spin_lock(&dentry->d_lock);
543 NCP_FINFO(inode)->flags |= NCPI_DIR_CACHE;
544 spin_unlock(&dentry->d_lock);
551 if (ncp_is_server_root(inode)) { 545 if (ncp_is_server_root(inode)) {
552 ncp_read_volume_list(file, ctx, &ctl); 546 ncp_read_volume_list(file, ctx, &ctl);
553 } else { 547 } else {
@@ -573,6 +567,13 @@ out:
573 return result; 567 return result;
574} 568}
575 569
570static void ncp_d_prune(struct dentry *dentry)
571{
572 if (!dentry->d_fsdata) /* not referenced from page cache */
573 return;
574 NCP_FINFO(dentry->d_parent->d_inode)->flags &= ~NCPI_DIR_CACHE;
575}
576
576static int 577static int
577ncp_fill_cache(struct file *file, struct dir_context *ctx, 578ncp_fill_cache(struct file *file, struct dir_context *ctx,
578 struct ncp_cache_control *ctrl, struct ncp_entry_info *entry, 579 struct ncp_cache_control *ctrl, struct ncp_entry_info *entry,
@@ -630,6 +631,10 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx,
630 d_instantiate(newdent, inode); 631 d_instantiate(newdent, inode);
631 if (!hashed) 632 if (!hashed)
632 d_rehash(newdent); 633 d_rehash(newdent);
634 } else {
635 spin_lock(&dentry->d_lock);
636 NCP_FINFO(inode)->flags &= ~NCPI_DIR_CACHE;
637 spin_unlock(&dentry->d_lock);
633 } 638 }
634 } else { 639 } else {
635 struct inode *inode = newdent->d_inode; 640 struct inode *inode = newdent->d_inode;
@@ -639,12 +644,6 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx,
639 mutex_unlock(&inode->i_mutex); 644 mutex_unlock(&inode->i_mutex);
640 } 645 }
641 646
642 if (newdent->d_inode) {
643 ino = newdent->d_inode->i_ino;
644 newdent->d_fsdata = (void *) ctl.fpos;
645 ncp_new_dentry(newdent);
646 }
647
648 if (ctl.idx >= NCP_DIRCACHE_SIZE) { 647 if (ctl.idx >= NCP_DIRCACHE_SIZE) {
649 if (ctl.page) { 648 if (ctl.page) {
650 kunmap(ctl.page); 649 kunmap(ctl.page);
@@ -660,8 +659,13 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx,
660 ctl.cache = kmap(ctl.page); 659 ctl.cache = kmap(ctl.page);
661 } 660 }
662 if (ctl.cache) { 661 if (ctl.cache) {
663 ctl.cache->dentry[ctl.idx] = newdent; 662 if (newdent->d_inode) {
664 valid = 1; 663 newdent->d_fsdata = newdent;
664 ctl.cache->dentry[ctl.idx] = newdent;
665 ino = newdent->d_inode->i_ino;
666 ncp_new_dentry(newdent);
667 }
668 valid = 1;
665 } 669 }
666 dput(newdent); 670 dput(newdent);
667end_advance: 671end_advance:
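
Taken together, the dir.c hunks replace d_validate() with a flag handshake: ncp_readdir() sets NCPI_DIR_CACHE while (re)filling the cache, ncp_d_prune() clears it when any cached child dentry is evicted, and the lookup side trusts a cached dentry pointer only after re-checking the flag and pinning the entry with lockref_get_not_dead() under d_lock. The same pattern in a generic userspace sketch (all names here are illustrative; one lock guards both the flag and the slots):

#include <pthread.h>
#include <stdbool.h>

#define SLOTS 64

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static bool cache_valid;
static void *slot[SLOTS];

void *lookup(int i)
{
    void *p = NULL;

    pthread_mutex_lock(&lock);
    if (cache_valid)            /* cleared by evict() below */
        p = slot[i];            /* a real cache would also pin p here */
    pthread_mutex_unlock(&lock);
    return p;                   /* NULL means: rebuild the cache */
}

void evict(int i)               /* plays the role of ->d_prune */
{
    pthread_mutex_lock(&lock);
    slot[i] = NULL;
    cache_valid = false;        /* one eviction invalidates the lot */
    pthread_mutex_unlock(&lock);
}
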
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index e31e589369a4..01a9e16e9782 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -267,7 +267,6 @@ ncp_iget(struct super_block *sb, struct ncp_entry_info *info)
267 if (inode) { 267 if (inode) {
268 atomic_set(&NCP_FINFO(inode)->opened, info->opened); 268 atomic_set(&NCP_FINFO(inode)->opened, info->opened);
269 269
270 inode->i_mapping->backing_dev_info = sb->s_bdi;
271 inode->i_ino = info->ino; 270 inode->i_ino = info->ino;
272 ncp_set_attr(inode, info); 271 ncp_set_attr(inode, info);
273 if (S_ISREG(inode->i_mode)) { 272 if (S_ISREG(inode->i_mode)) {
@@ -560,7 +559,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
560 server = NCP_SBP(sb); 559 server = NCP_SBP(sb);
561 memset(server, 0, sizeof(*server)); 560 memset(server, 0, sizeof(*server));
562 561
563 error = bdi_setup_and_register(&server->bdi, "ncpfs", BDI_CAP_MAP_COPY); 562 error = bdi_setup_and_register(&server->bdi, "ncpfs");
564 if (error) 563 if (error)
565 goto out_fput; 564 goto out_fput;
566 565
diff --git a/fs/ncpfs/ncp_fs_i.h b/fs/ncpfs/ncp_fs_i.h
index 4b0bec477846..c4794504f843 100644
--- a/fs/ncpfs/ncp_fs_i.h
+++ b/fs/ncpfs/ncp_fs_i.h
@@ -22,6 +22,7 @@ struct ncp_inode_info {
22 int access; 22 int access;
23 int flags; 23 int flags;
24#define NCPI_KLUDGE_SYMLINK 0x0001 24#define NCPI_KLUDGE_SYMLINK 0x0001
25#define NCPI_DIR_CACHE 0x0002
25 __u8 file_handle[6]; 26 __u8 file_handle[6];
26 struct inode vfs_inode; 27 struct inode vfs_inode;
27}; 28};
diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h
index b785f74bfe3c..250e443a07f3 100644
--- a/fs/ncpfs/ncplib_kernel.h
+++ b/fs/ncpfs/ncplib_kernel.h
@@ -184,36 +184,6 @@ ncp_new_dentry(struct dentry* dentry)
184 dentry->d_time = jiffies; 184 dentry->d_time = jiffies;
185} 185}
186 186
187static inline void
188ncp_renew_dentries(struct dentry *parent)
189{
190 struct ncp_server *server = NCP_SERVER(parent->d_inode);
191 struct dentry *dentry;
192
193 spin_lock(&parent->d_lock);
194 list_for_each_entry(dentry, &parent->d_subdirs, d_child) {
195 if (dentry->d_fsdata == NULL)
196 ncp_age_dentry(server, dentry);
197 else
198 ncp_new_dentry(dentry);
199 }
200 spin_unlock(&parent->d_lock);
201}
202
203static inline void
204ncp_invalidate_dircache_entries(struct dentry *parent)
205{
206 struct ncp_server *server = NCP_SERVER(parent->d_inode);
207 struct dentry *dentry;
208
209 spin_lock(&parent->d_lock);
210 list_for_each_entry(dentry, &parent->d_subdirs, d_child) {
211 dentry->d_fsdata = NULL;
212 ncp_age_dentry(server, dentry);
213 }
214 spin_unlock(&parent->d_lock);
215}
216
217struct ncp_cache_head { 187struct ncp_cache_head {
218 time_t mtime; 188 time_t mtime;
219 unsigned long time; /* cache age */ 189 unsigned long time; /* cache age */
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 3dece03f2fc8..c7abc10279af 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -128,6 +128,11 @@ config PNFS_OBJLAYOUT
128 depends on NFS_V4_1 && SCSI_OSD_ULD 128 depends on NFS_V4_1 && SCSI_OSD_ULD
129 default NFS_V4 129 default NFS_V4
130 130
131config PNFS_FLEXFILE_LAYOUT
132 tristate
133 depends on NFS_V4_1 && NFS_V3
134 default m
135
131config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN 136config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN
132 string "NFSv4.1 Implementation ID Domain" 137 string "NFSv4.1 Implementation ID Domain"
133 depends on NFS_V4_1 138 depends on NFS_V4_1
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 04cb830fa09f..1e987acf20c9 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -27,9 +27,10 @@ nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o
27 dns_resolve.o nfs4trace.o 27 dns_resolve.o nfs4trace.o
28nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o 28nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o
29nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o 29nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o
30nfsv4-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o 30nfsv4-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o pnfs_nfs.o
31nfsv4-$(CONFIG_NFS_V4_2) += nfs42proc.o 31nfsv4-$(CONFIG_NFS_V4_2) += nfs42proc.o
32 32
33obj-$(CONFIG_PNFS_FILE_LAYOUT) += filelayout/ 33obj-$(CONFIG_PNFS_FILE_LAYOUT) += filelayout/
34obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/ 34obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/
35obj-$(CONFIG_PNFS_BLOCK) += blocklayout/ 35obj-$(CONFIG_PNFS_BLOCK) += blocklayout/
36obj-$(CONFIG_PNFS_FLEXFILE_LAYOUT) += flexfilelayout/
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 77fec6a55f57..1cac3c175d18 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -860,12 +860,14 @@ static const struct nfs_pageio_ops bl_pg_read_ops = {
860 .pg_init = bl_pg_init_read, 860 .pg_init = bl_pg_init_read,
861 .pg_test = bl_pg_test_read, 861 .pg_test = bl_pg_test_read,
862 .pg_doio = pnfs_generic_pg_readpages, 862 .pg_doio = pnfs_generic_pg_readpages,
863 .pg_cleanup = pnfs_generic_pg_cleanup,
863}; 864};
864 865
865static const struct nfs_pageio_ops bl_pg_write_ops = { 866static const struct nfs_pageio_ops bl_pg_write_ops = {
866 .pg_init = bl_pg_init_write, 867 .pg_init = bl_pg_init_write,
867 .pg_test = bl_pg_test_write, 868 .pg_test = bl_pg_test_write,
868 .pg_doio = pnfs_generic_pg_writepages, 869 .pg_doio = pnfs_generic_pg_writepages,
870 .pg_cleanup = pnfs_generic_pg_cleanup,
869}; 871};
870 872
871static struct pnfs_layoutdriver_type blocklayout_type = { 873static struct pnfs_layoutdriver_type blocklayout_type = {
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index b8fb3a4ef649..351be9205bf8 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -128,22 +128,24 @@ nfs41_callback_svc(void *vrqstp)
128 if (try_to_freeze()) 128 if (try_to_freeze())
129 continue; 129 continue;
130 130
131 prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE); 131 prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_UNINTERRUPTIBLE);
132 spin_lock_bh(&serv->sv_cb_lock); 132 spin_lock_bh(&serv->sv_cb_lock);
133 if (!list_empty(&serv->sv_cb_list)) { 133 if (!list_empty(&serv->sv_cb_list)) {
134 req = list_first_entry(&serv->sv_cb_list, 134 req = list_first_entry(&serv->sv_cb_list,
135 struct rpc_rqst, rq_bc_list); 135 struct rpc_rqst, rq_bc_list);
136 list_del(&req->rq_bc_list); 136 list_del(&req->rq_bc_list);
137 spin_unlock_bh(&serv->sv_cb_lock); 137 spin_unlock_bh(&serv->sv_cb_lock);
138 finish_wait(&serv->sv_cb_waitq, &wq);
138 dprintk("Invoking bc_svc_process()\n"); 139 dprintk("Invoking bc_svc_process()\n");
139 error = bc_svc_process(serv, req, rqstp); 140 error = bc_svc_process(serv, req, rqstp);
140 dprintk("bc_svc_process() returned w/ error code= %d\n", 141 dprintk("bc_svc_process() returned w/ error code= %d\n",
141 error); 142 error);
142 } else { 143 } else {
143 spin_unlock_bh(&serv->sv_cb_lock); 144 spin_unlock_bh(&serv->sv_cb_lock);
144 schedule(); 145 /* schedule_timeout to game the hung task watchdog */
146 schedule_timeout(60 * HZ);
147 finish_wait(&serv->sv_cb_waitq, &wq);
145 } 148 }
146 finish_wait(&serv->sv_cb_waitq, &wq);
147 } 149 }
148 return 0; 150 return 0;
149} 151}
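The rework above switches the callback thread to TASK_UNINTERRUPTIBLE and bounds each sleep with schedule_timeout(60 * HZ); as the added comment notes, the finite timeout keeps the hung-task watchdog (which flags overly long uninterruptible sleeps) quiet, and a spurious timeout is harmless because the loop simply re-checks the request list. A rough userspace analogue of that bounded-wait loop, using a condition variable with illustrative names:

#include <pthread.h>
#include <stdbool.h>
#include <time.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool have_request;       /* !list_empty(&serv->sv_cb_list) analogue */

static void wait_for_request(void)
{
        struct timespec ts;

        pthread_mutex_lock(&lock);
        while (!have_request) {
                clock_gettime(CLOCK_REALTIME, &ts);
                ts.tv_sec += 60;        /* like schedule_timeout(60 * HZ) */
                /* timing out is fine: we just re-test the condition */
                pthread_cond_timedwait(&cond, &lock, &ts);
        }
        have_request = false;           /* consume one request */
        pthread_mutex_unlock(&lock);
}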
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 7f3f60641344..da5433230bb1 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -85,25 +85,30 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_
85{ 85{
86 struct inode *inode = state->inode; 86 struct inode *inode = state->inode;
87 struct file_lock *fl; 87 struct file_lock *fl;
88 struct file_lock_context *flctx = inode->i_flctx;
89 struct list_head *list;
88 int status = 0; 90 int status = 0;
89 91
90 if (inode->i_flock == NULL) 92 if (flctx == NULL)
91 goto out; 93 goto out;
92 94
93 /* Protect inode->i_flock using the i_lock */ 95 list = &flctx->flc_posix;
94 spin_lock(&inode->i_lock); 96 spin_lock(&flctx->flc_lock);
95 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { 97restart:
96 if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK))) 98 list_for_each_entry(fl, list, fl_list) {
97 continue;
98 if (nfs_file_open_context(fl->fl_file) != ctx) 99 if (nfs_file_open_context(fl->fl_file) != ctx)
99 continue; 100 continue;
100 spin_unlock(&inode->i_lock); 101 spin_unlock(&flctx->flc_lock);
101 status = nfs4_lock_delegation_recall(fl, state, stateid); 102 status = nfs4_lock_delegation_recall(fl, state, stateid);
102 if (status < 0) 103 if (status < 0)
103 goto out; 104 goto out;
104 spin_lock(&inode->i_lock); 105 spin_lock(&flctx->flc_lock);
105 } 106 }
106 spin_unlock(&inode->i_lock); 107 if (list == &flctx->flc_posix) {
108 list = &flctx->flc_flock;
109 goto restart;
110 }
111 spin_unlock(&flctx->flc_lock);
107out: 112out:
108 return status; 113 return status;
109} 114}
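The converted walk above visits flctx->flc_posix first, then re-runs the same loop body over flctx->flc_flock by swapping the list pointer and jumping back to the restart label, all under flc_lock. A standalone sketch of that two-list restart idiom, using a hypothetical node type rather than the kernel's file_lock:

struct node { struct node *next; int v; };

static int visit(struct node *n) { return n->v < 0 ? -1 : 0; }

static int walk_both(struct node *list_a, struct node *list_b)
{
        struct node *list = list_a, *n;
        int status = 0;

restart:
        for (n = list; n; n = n->next) {
                status = visit(n);
                if (status < 0)
                        return status;
        }
        if (list == list_a) {   /* first pass done, redo on second list */
                list = list_b;
                goto restart;
        }
        return status;
}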
@@ -301,6 +306,17 @@ nfs_inode_detach_delegation(struct inode *inode)
301 return nfs_detach_delegation(nfsi, delegation, server); 306 return nfs_detach_delegation(nfsi, delegation, server);
302} 307}
303 308
309static void
310nfs_update_inplace_delegation(struct nfs_delegation *delegation,
311 const struct nfs_delegation *update)
312{
313 if (nfs4_stateid_is_newer(&update->stateid, &delegation->stateid)) {
314 delegation->stateid.seqid = update->stateid.seqid;
315 smp_wmb();
316 delegation->type = update->type;
317 }
318}
319
304/** 320/**
305 * nfs_inode_set_delegation - set up a delegation on an inode 321 * nfs_inode_set_delegation - set up a delegation on an inode
306 * @inode: inode to which delegation applies 322 * @inode: inode to which delegation applies
@@ -334,9 +350,12 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
334 old_delegation = rcu_dereference_protected(nfsi->delegation, 350 old_delegation = rcu_dereference_protected(nfsi->delegation,
335 lockdep_is_held(&clp->cl_lock)); 351 lockdep_is_held(&clp->cl_lock));
336 if (old_delegation != NULL) { 352 if (old_delegation != NULL) {
337 if (nfs4_stateid_match(&delegation->stateid, 353 /* Is this an update of the existing delegation? */
338 &old_delegation->stateid) && 354 if (nfs4_stateid_match_other(&old_delegation->stateid,
339 delegation->type == old_delegation->type) { 355 &delegation->stateid)) {
356 nfs_update_inplace_delegation(old_delegation,
357 delegation);
358 nfsi->delegation_state = old_delegation->type;
340 goto out; 359 goto out;
341 } 360 }
342 /* 361 /*
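nfs_update_inplace_delegation above orders its two stores deliberately: the new seqid is written first, smp_wmb() publishes it, and only then is the delegation type updated, so a reader that observes the new type also sees the matching seqid (assuming the read side pairs with a read barrier). A C11 analogue of that store-store ordering using release/acquire semantics, with an illustrative struct rather than the kernel's:

#include <stdatomic.h>
#include <stdint.h>

struct delegation {
        uint32_t seqid;
        _Atomic uint32_t type;
};

/* Writer: the seqid must be visible before the new type is. */
static void update(struct delegation *d, uint32_t seqid, uint32_t type)
{
        d->seqid = seqid;
        /* release plays the role of smp_wmb() for the acquire below */
        atomic_store_explicit(&d->type, type, memory_order_release);
}

/* Reader: observing the new type guarantees seeing the new seqid. */
static uint32_t read_state(struct delegation *d, uint32_t *type)
{
        *type = atomic_load_explicit(&d->type, memory_order_acquire);
        return d->seqid;
}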
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 294692ff83b1..7077521acdf4 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -66,6 +66,10 @@ static struct kmem_cache *nfs_direct_cachep;
66/* 66/*
67 * This represents a set of asynchronous requests that we're waiting on 67 * This represents a set of asynchronous requests that we're waiting on
68 */ 68 */
69struct nfs_direct_mirror {
70 ssize_t count;
71};
72
69struct nfs_direct_req { 73struct nfs_direct_req {
70 struct kref kref; /* release manager */ 74 struct kref kref; /* release manager */
71 75
@@ -78,8 +82,13 @@ struct nfs_direct_req {
78 /* completion state */ 82 /* completion state */
79 atomic_t io_count; /* i/os we're waiting for */ 83 atomic_t io_count; /* i/os we're waiting for */
80 spinlock_t lock; /* protect completion state */ 84 spinlock_t lock; /* protect completion state */
85
86 struct nfs_direct_mirror mirrors[NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX];
87 int mirror_count;
88
81 ssize_t count, /* bytes actually processed */ 89 ssize_t count, /* bytes actually processed */
82 bytes_left, /* bytes left to be sent */ 90 bytes_left, /* bytes left to be sent */
91 io_start, /* start of IO */
83 error; /* any reported error */ 92 error; /* any reported error */
84 struct completion completion; /* wait for i/o completion */ 93 struct completion completion; /* wait for i/o completion */
85 94
@@ -108,26 +117,56 @@ static inline int put_dreq(struct nfs_direct_req *dreq)
108 return atomic_dec_and_test(&dreq->io_count); 117 return atomic_dec_and_test(&dreq->io_count);
109} 118}
110 119
120void nfs_direct_set_resched_writes(struct nfs_direct_req *dreq)
121{
122 dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
123}
124EXPORT_SYMBOL_GPL(nfs_direct_set_resched_writes);
125
126static void
127nfs_direct_good_bytes(struct nfs_direct_req *dreq, struct nfs_pgio_header *hdr)
128{
129 int i;
130 ssize_t count;
131
132 WARN_ON_ONCE(hdr->pgio_mirror_idx >= dreq->mirror_count);
133
134 count = dreq->mirrors[hdr->pgio_mirror_idx].count;
135 if (count + dreq->io_start < hdr->io_start + hdr->good_bytes) {
136 count = hdr->io_start + hdr->good_bytes - dreq->io_start;
137 dreq->mirrors[hdr->pgio_mirror_idx].count = count;
138 }
139
140 /* update the dreq->count by finding the minimum agreed count from all
141 * mirrors */
142 count = dreq->mirrors[0].count;
143
144 for (i = 1; i < dreq->mirror_count; i++)
145 count = min(count, dreq->mirrors[i].count);
146
147 dreq->count = count;
148}
149
111/* 150/*
112 * nfs_direct_select_verf - select the right verifier 151 * nfs_direct_select_verf - select the right verifier
113 * @dreq - direct request possibly spanning multiple servers 152 * @dreq - direct request possibly spanning multiple servers
114 * @ds_clp - nfs_client of data server or NULL if MDS / non-pnfs 153 * @ds_clp - nfs_client of data server or NULL if MDS / non-pnfs
115 * @ds_idx - index of data server in data server list, only valid if ds_clp set 154 * @commit_idx - commit bucket index for the DS
116 * 155 *
117 * returns the correct verifier to use given the role of the server 156 * returns the correct verifier to use given the role of the server
118 */ 157 */
119static struct nfs_writeverf * 158static struct nfs_writeverf *
120nfs_direct_select_verf(struct nfs_direct_req *dreq, 159nfs_direct_select_verf(struct nfs_direct_req *dreq,
121 struct nfs_client *ds_clp, 160 struct nfs_client *ds_clp,
122 int ds_idx) 161 int commit_idx)
123{ 162{
124 struct nfs_writeverf *verfp = &dreq->verf; 163 struct nfs_writeverf *verfp = &dreq->verf;
125 164
126#ifdef CONFIG_NFS_V4_1 165#ifdef CONFIG_NFS_V4_1
127 if (ds_clp) { 166 if (ds_clp) {
128 /* pNFS is in use, use the DS verf */ 167 /* pNFS is in use, use the DS verf */
129 if (ds_idx >= 0 && ds_idx < dreq->ds_cinfo.nbuckets) 168 if (commit_idx >= 0 && commit_idx < dreq->ds_cinfo.nbuckets)
130 verfp = &dreq->ds_cinfo.buckets[ds_idx].direct_verf; 169 verfp = &dreq->ds_cinfo.buckets[commit_idx].direct_verf;
131 else 170 else
132 WARN_ON_ONCE(1); 171 WARN_ON_ONCE(1);
133 } 172 }
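nfs_direct_good_bytes above is the heart of mirrored O_DIRECT accounting: each mirror's completed-byte count is advanced to cover the header's good bytes (measured from dreq->io_start), and the request's overall count becomes the minimum across all mirrors, since data is only durable once every mirror holds it. A simplified standalone version of the arithmetic:

struct mirror { long count; };

/* Record that hdr_good bytes starting at hdr_start completed on mirror
 * idx; return the number of bytes now complete on *all* mirrors. */
static long good_bytes(struct mirror *m, int nmirrors, int idx,
                       long io_start, long hdr_start, long hdr_good)
{
        long min;
        int i;

        if (m[idx].count + io_start < hdr_start + hdr_good)
                m[idx].count = hdr_start + hdr_good - io_start;

        min = m[0].count;
        for (i = 1; i < nmirrors; i++)
                if (m[i].count < min)
                        min = m[i].count;
        return min;
}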
@@ -148,8 +187,7 @@ static void nfs_direct_set_hdr_verf(struct nfs_direct_req *dreq,
148{ 187{
149 struct nfs_writeverf *verfp; 188 struct nfs_writeverf *verfp;
150 189
151 verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, 190 verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, hdr->ds_commit_idx);
152 hdr->ds_idx);
153 WARN_ON_ONCE(verfp->committed >= 0); 191 WARN_ON_ONCE(verfp->committed >= 0);
154 memcpy(verfp, &hdr->verf, sizeof(struct nfs_writeverf)); 192 memcpy(verfp, &hdr->verf, sizeof(struct nfs_writeverf));
155 WARN_ON_ONCE(verfp->committed < 0); 193 WARN_ON_ONCE(verfp->committed < 0);
@@ -169,8 +207,7 @@ static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq,
169{ 207{
170 struct nfs_writeverf *verfp; 208 struct nfs_writeverf *verfp;
171 209
172 verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, 210 verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, hdr->ds_commit_idx);
173 hdr->ds_idx);
174 if (verfp->committed < 0) { 211 if (verfp->committed < 0) {
175 nfs_direct_set_hdr_verf(dreq, hdr); 212 nfs_direct_set_hdr_verf(dreq, hdr);
176 return 0; 213 return 0;
@@ -193,7 +230,11 @@ static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq,
193 230
194 verfp = nfs_direct_select_verf(dreq, data->ds_clp, 231 verfp = nfs_direct_select_verf(dreq, data->ds_clp,
195 data->ds_commit_index); 232 data->ds_commit_index);
196 WARN_ON_ONCE(verfp->committed < 0); 233
234 /* verifier not set so always fail */
235 if (verfp->committed < 0)
236 return 1;
237
197 return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf)); 238 return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf));
198} 239}
199 240
@@ -249,6 +290,18 @@ void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
249 cinfo->completion_ops = &nfs_direct_commit_completion_ops; 290 cinfo->completion_ops = &nfs_direct_commit_completion_ops;
250} 291}
251 292
293static inline void nfs_direct_setup_mirroring(struct nfs_direct_req *dreq,
294 struct nfs_pageio_descriptor *pgio,
295 struct nfs_page *req)
296{
297 int mirror_count = 1;
298
299 if (pgio->pg_ops->pg_get_mirror_count)
300 mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req);
301
302 dreq->mirror_count = mirror_count;
303}
304
252static inline struct nfs_direct_req *nfs_direct_req_alloc(void) 305static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
253{ 306{
254 struct nfs_direct_req *dreq; 307 struct nfs_direct_req *dreq;
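nfs_direct_setup_mirroring above uses the usual optional-callback idiom: ask the pageio ops for a mirror count only when the layout supplies a pg_get_mirror_count hook, and default to a single mirror otherwise. A minimal sketch of the dispatch, with a hypothetical ops struct:

struct req;

struct pgio_ops {
        /* optional: layouts that do not mirror leave this NULL */
        int (*get_mirror_count)(struct req *req);
};

static int mirror_count(const struct pgio_ops *ops, struct req *req)
{
        if (ops && ops->get_mirror_count)
                return ops->get_mirror_count(req);
        return 1;       /* default: unmirrored I/O */
}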
@@ -263,6 +316,7 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
263 INIT_LIST_HEAD(&dreq->mds_cinfo.list); 316 INIT_LIST_HEAD(&dreq->mds_cinfo.list);
264 dreq->verf.committed = NFS_INVALID_STABLE_HOW; /* not set yet */ 317 dreq->verf.committed = NFS_INVALID_STABLE_HOW; /* not set yet */
265 INIT_WORK(&dreq->work, nfs_direct_write_schedule_work); 318 INIT_WORK(&dreq->work, nfs_direct_write_schedule_work);
319 dreq->mirror_count = 1;
266 spin_lock_init(&dreq->lock); 320 spin_lock_init(&dreq->lock);
267 321
268 return dreq; 322 return dreq;
@@ -369,7 +423,8 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
369 if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && (hdr->good_bytes == 0)) 423 if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && (hdr->good_bytes == 0))
370 dreq->error = hdr->error; 424 dreq->error = hdr->error;
371 else 425 else
372 dreq->count += hdr->good_bytes; 426 nfs_direct_good_bytes(dreq, hdr);
427
373 spin_unlock(&dreq->lock); 428 spin_unlock(&dreq->lock);
374 429
375 while (!list_empty(&hdr->pages)) { 430 while (!list_empty(&hdr->pages)) {
@@ -547,6 +602,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
547 602
548 dreq->inode = inode; 603 dreq->inode = inode;
549 dreq->bytes_left = count; 604 dreq->bytes_left = count;
605 dreq->io_start = pos;
550 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); 606 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
551 l_ctx = nfs_get_lock_context(dreq->ctx); 607 l_ctx = nfs_get_lock_context(dreq->ctx);
552 if (IS_ERR(l_ctx)) { 608 if (IS_ERR(l_ctx)) {
@@ -579,6 +635,20 @@ out:
579 return result; 635 return result;
580} 636}
581 637
638static void
639nfs_direct_write_scan_commit_list(struct inode *inode,
640 struct list_head *list,
641 struct nfs_commit_info *cinfo)
642{
643 spin_lock(cinfo->lock);
644#ifdef CONFIG_NFS_V4_1
645 if (cinfo->ds != NULL && cinfo->ds->nwritten != 0)
646 NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo);
647#endif
648 nfs_scan_commit_list(&cinfo->mds->list, list, cinfo, 0);
649 spin_unlock(cinfo->lock);
650}
651
582static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) 652static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
583{ 653{
584 struct nfs_pageio_descriptor desc; 654 struct nfs_pageio_descriptor desc;
@@ -586,20 +656,23 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
586 LIST_HEAD(reqs); 656 LIST_HEAD(reqs);
587 struct nfs_commit_info cinfo; 657 struct nfs_commit_info cinfo;
588 LIST_HEAD(failed); 658 LIST_HEAD(failed);
659 int i;
589 660
590 nfs_init_cinfo_from_dreq(&cinfo, dreq); 661 nfs_init_cinfo_from_dreq(&cinfo, dreq);
591 pnfs_recover_commit_reqs(dreq->inode, &reqs, &cinfo); 662 nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);
592 spin_lock(cinfo.lock);
593 nfs_scan_commit_list(&cinfo.mds->list, &reqs, &cinfo, 0);
594 spin_unlock(cinfo.lock);
595 663
596 dreq->count = 0; 664 dreq->count = 0;
665 for (i = 0; i < dreq->mirror_count; i++)
666 dreq->mirrors[i].count = 0;
597 get_dreq(dreq); 667 get_dreq(dreq);
598 668
599 nfs_pageio_init_write(&desc, dreq->inode, FLUSH_STABLE, false, 669 nfs_pageio_init_write(&desc, dreq->inode, FLUSH_STABLE, false,
600 &nfs_direct_write_completion_ops); 670 &nfs_direct_write_completion_ops);
601 desc.pg_dreq = dreq; 671 desc.pg_dreq = dreq;
602 672
673 req = nfs_list_entry(reqs.next);
674 nfs_direct_setup_mirroring(dreq, &desc, req);
675
603 list_for_each_entry_safe(req, tmp, &reqs, wb_list) { 676 list_for_each_entry_safe(req, tmp, &reqs, wb_list) {
604 if (!nfs_pageio_add_request(&desc, req)) { 677 if (!nfs_pageio_add_request(&desc, req)) {
605 nfs_list_remove_request(req); 678 nfs_list_remove_request(req);
@@ -646,7 +719,7 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data)
646 nfs_list_remove_request(req); 719 nfs_list_remove_request(req);
647 if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) { 720 if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) {
648 /* Note the rewrite will go through mds */ 721 /* Note the rewrite will go through mds */
649 nfs_mark_request_commit(req, NULL, &cinfo); 722 nfs_mark_request_commit(req, NULL, &cinfo, 0);
650 } else 723 } else
651 nfs_release_request(req); 724 nfs_release_request(req);
652 nfs_unlock_and_release_request(req); 725 nfs_unlock_and_release_request(req);
@@ -721,7 +794,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
721 dreq->error = hdr->error; 794 dreq->error = hdr->error;
722 } 795 }
723 if (dreq->error == 0) { 796 if (dreq->error == 0) {
724 dreq->count += hdr->good_bytes; 797 nfs_direct_good_bytes(dreq, hdr);
725 if (nfs_write_need_commit(hdr)) { 798 if (nfs_write_need_commit(hdr)) {
726 if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) 799 if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES)
727 request_commit = true; 800 request_commit = true;
@@ -745,7 +818,8 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
745 nfs_list_remove_request(req); 818 nfs_list_remove_request(req);
746 if (request_commit) { 819 if (request_commit) {
747 kref_get(&req->wb_kref); 820 kref_get(&req->wb_kref);
748 nfs_mark_request_commit(req, hdr->lseg, &cinfo); 821 nfs_mark_request_commit(req, hdr->lseg, &cinfo,
822 hdr->ds_commit_idx);
749 } 823 }
750 nfs_unlock_and_release_request(req); 824 nfs_unlock_and_release_request(req);
751 } 825 }
@@ -826,6 +900,9 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
826 result = PTR_ERR(req); 900 result = PTR_ERR(req);
827 break; 901 break;
828 } 902 }
903
904 nfs_direct_setup_mirroring(dreq, &desc, req);
905
829 nfs_lock_request(req); 906 nfs_lock_request(req);
830 req->wb_index = pos >> PAGE_SHIFT; 907 req->wb_index = pos >> PAGE_SHIFT;
831 req->wb_offset = pos & ~PAGE_MASK; 908 req->wb_offset = pos & ~PAGE_MASK;
@@ -934,6 +1011,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter,
934 1011
935 dreq->inode = inode; 1012 dreq->inode = inode;
936 dreq->bytes_left = count; 1013 dreq->bytes_left = count;
1014 dreq->io_start = pos;
937 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); 1015 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
938 l_ctx = nfs_get_lock_context(dreq->ctx); 1016 l_ctx = nfs_get_lock_context(dreq->ctx);
939 if (IS_ERR(l_ctx)) { 1017 if (IS_ERR(l_ctx)) {
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 2ab6f00dba5b..94712fc781fa 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -646,7 +646,6 @@ static const struct vm_operations_struct nfs_file_vm_ops = {
646 .fault = filemap_fault, 646 .fault = filemap_fault,
647 .map_pages = filemap_map_pages, 647 .map_pages = filemap_map_pages,
648 .page_mkwrite = nfs_vm_page_mkwrite, 648 .page_mkwrite = nfs_vm_page_mkwrite,
649 .remap_pages = generic_file_remap_pages,
650}; 649};
651 650
652static int nfs_need_sync_write(struct file *filp, struct inode *inode) 651static int nfs_need_sync_write(struct file *filp, struct inode *inode)
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 7afb52f6a25a..7ae1c263c5cf 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -118,13 +118,6 @@ static void filelayout_reset_read(struct nfs_pgio_header *hdr)
118 } 118 }
119} 119}
120 120
121static void filelayout_fenceme(struct inode *inode, struct pnfs_layout_hdr *lo)
122{
123 if (!test_and_clear_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
124 return;
125 pnfs_return_layout(inode);
126}
127
128static int filelayout_async_handle_error(struct rpc_task *task, 121static int filelayout_async_handle_error(struct rpc_task *task,
129 struct nfs4_state *state, 122 struct nfs4_state *state,
130 struct nfs_client *clp, 123 struct nfs_client *clp,
@@ -207,7 +200,7 @@ static int filelayout_async_handle_error(struct rpc_task *task,
207 dprintk("%s DS connection error %d\n", __func__, 200 dprintk("%s DS connection error %d\n", __func__,
208 task->tk_status); 201 task->tk_status);
209 nfs4_mark_deviceid_unavailable(devid); 202 nfs4_mark_deviceid_unavailable(devid);
210 set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags); 203 pnfs_error_mark_layout_for_return(inode, lseg);
211 rpc_wake_up(&tbl->slot_tbl_waitq); 204 rpc_wake_up(&tbl->slot_tbl_waitq);
212 /* fall through */ 205 /* fall through */
213 default: 206 default:
@@ -339,16 +332,6 @@ static void filelayout_read_count_stats(struct rpc_task *task, void *data)
339 rpc_count_iostats(task, NFS_SERVER(hdr->inode)->client->cl_metrics); 332 rpc_count_iostats(task, NFS_SERVER(hdr->inode)->client->cl_metrics);
340} 333}
341 334
342static void filelayout_read_release(void *data)
343{
344 struct nfs_pgio_header *hdr = data;
345 struct pnfs_layout_hdr *lo = hdr->lseg->pls_layout;
346
347 filelayout_fenceme(lo->plh_inode, lo);
348 nfs_put_client(hdr->ds_clp);
349 hdr->mds_ops->rpc_release(data);
350}
351
352static int filelayout_write_done_cb(struct rpc_task *task, 335static int filelayout_write_done_cb(struct rpc_task *task,
353 struct nfs_pgio_header *hdr) 336 struct nfs_pgio_header *hdr)
354{ 337{
@@ -371,17 +354,6 @@ static int filelayout_write_done_cb(struct rpc_task *task,
371 return 0; 354 return 0;
372} 355}
373 356
374/* Fake up some data that will cause nfs_commit_release to retry the writes. */
375static void prepare_to_resend_writes(struct nfs_commit_data *data)
376{
377 struct nfs_page *first = nfs_list_entry(data->pages.next);
378
379 data->task.tk_status = 0;
380 memcpy(&data->verf.verifier, &first->wb_verf,
381 sizeof(data->verf.verifier));
382 data->verf.verifier.data[0]++; /* ensure verifier mismatch */
383}
384
385static int filelayout_commit_done_cb(struct rpc_task *task, 357static int filelayout_commit_done_cb(struct rpc_task *task,
386 struct nfs_commit_data *data) 358 struct nfs_commit_data *data)
387{ 359{
@@ -393,7 +365,7 @@ static int filelayout_commit_done_cb(struct rpc_task *task,
393 365
394 switch (err) { 366 switch (err) {
395 case -NFS4ERR_RESET_TO_MDS: 367 case -NFS4ERR_RESET_TO_MDS:
396 prepare_to_resend_writes(data); 368 pnfs_generic_prepare_to_resend_writes(data);
397 return -EAGAIN; 369 return -EAGAIN;
398 case -EAGAIN: 370 case -EAGAIN:
399 rpc_restart_call_prepare(task); 371 rpc_restart_call_prepare(task);
@@ -451,16 +423,6 @@ static void filelayout_write_count_stats(struct rpc_task *task, void *data)
451 rpc_count_iostats(task, NFS_SERVER(hdr->inode)->client->cl_metrics); 423 rpc_count_iostats(task, NFS_SERVER(hdr->inode)->client->cl_metrics);
452} 424}
453 425
454static void filelayout_write_release(void *data)
455{
456 struct nfs_pgio_header *hdr = data;
457 struct pnfs_layout_hdr *lo = hdr->lseg->pls_layout;
458
459 filelayout_fenceme(lo->plh_inode, lo);
460 nfs_put_client(hdr->ds_clp);
461 hdr->mds_ops->rpc_release(data);
462}
463
464static void filelayout_commit_prepare(struct rpc_task *task, void *data) 426static void filelayout_commit_prepare(struct rpc_task *task, void *data)
465{ 427{
466 struct nfs_commit_data *wdata = data; 428 struct nfs_commit_data *wdata = data;
@@ -471,14 +433,6 @@ static void filelayout_commit_prepare(struct rpc_task *task, void *data)
471 task); 433 task);
472} 434}
473 435
474static void filelayout_write_commit_done(struct rpc_task *task, void *data)
475{
476 struct nfs_commit_data *wdata = data;
477
478 /* Note this may cause RPC to be resent */
479 wdata->mds_ops->rpc_call_done(task, data);
480}
481
482static void filelayout_commit_count_stats(struct rpc_task *task, void *data) 436static void filelayout_commit_count_stats(struct rpc_task *task, void *data)
483{ 437{
484 struct nfs_commit_data *cdata = data; 438 struct nfs_commit_data *cdata = data;
@@ -486,35 +440,25 @@ static void filelayout_commit_count_stats(struct rpc_task *task, void *data)
486 rpc_count_iostats(task, NFS_SERVER(cdata->inode)->client->cl_metrics); 440 rpc_count_iostats(task, NFS_SERVER(cdata->inode)->client->cl_metrics);
487} 441}
488 442
489static void filelayout_commit_release(void *calldata)
490{
491 struct nfs_commit_data *data = calldata;
492
493 data->completion_ops->completion(data);
494 pnfs_put_lseg(data->lseg);
495 nfs_put_client(data->ds_clp);
496 nfs_commitdata_release(data);
497}
498
499static const struct rpc_call_ops filelayout_read_call_ops = { 443static const struct rpc_call_ops filelayout_read_call_ops = {
500 .rpc_call_prepare = filelayout_read_prepare, 444 .rpc_call_prepare = filelayout_read_prepare,
501 .rpc_call_done = filelayout_read_call_done, 445 .rpc_call_done = filelayout_read_call_done,
502 .rpc_count_stats = filelayout_read_count_stats, 446 .rpc_count_stats = filelayout_read_count_stats,
503 .rpc_release = filelayout_read_release, 447 .rpc_release = pnfs_generic_rw_release,
504}; 448};
505 449
506static const struct rpc_call_ops filelayout_write_call_ops = { 450static const struct rpc_call_ops filelayout_write_call_ops = {
507 .rpc_call_prepare = filelayout_write_prepare, 451 .rpc_call_prepare = filelayout_write_prepare,
508 .rpc_call_done = filelayout_write_call_done, 452 .rpc_call_done = filelayout_write_call_done,
509 .rpc_count_stats = filelayout_write_count_stats, 453 .rpc_count_stats = filelayout_write_count_stats,
510 .rpc_release = filelayout_write_release, 454 .rpc_release = pnfs_generic_rw_release,
511}; 455};
512 456
513static const struct rpc_call_ops filelayout_commit_call_ops = { 457static const struct rpc_call_ops filelayout_commit_call_ops = {
514 .rpc_call_prepare = filelayout_commit_prepare, 458 .rpc_call_prepare = filelayout_commit_prepare,
515 .rpc_call_done = filelayout_write_commit_done, 459 .rpc_call_done = pnfs_generic_write_commit_done,
516 .rpc_count_stats = filelayout_commit_count_stats, 460 .rpc_count_stats = filelayout_commit_count_stats,
517 .rpc_release = filelayout_commit_release, 461 .rpc_release = pnfs_generic_commit_release,
518}; 462};
519 463
520static enum pnfs_try_status 464static enum pnfs_try_status
@@ -548,7 +492,7 @@ filelayout_read_pagelist(struct nfs_pgio_header *hdr)
548 /* No multipath support. Use first DS */ 492 /* No multipath support. Use first DS */
549 atomic_inc(&ds->ds_clp->cl_count); 493 atomic_inc(&ds->ds_clp->cl_count);
550 hdr->ds_clp = ds->ds_clp; 494 hdr->ds_clp = ds->ds_clp;
551 hdr->ds_idx = idx; 495 hdr->ds_commit_idx = idx;
552 fh = nfs4_fl_select_ds_fh(lseg, j); 496 fh = nfs4_fl_select_ds_fh(lseg, j);
553 if (fh) 497 if (fh)
554 hdr->args.fh = fh; 498 hdr->args.fh = fh;
@@ -557,8 +501,9 @@ filelayout_read_pagelist(struct nfs_pgio_header *hdr)
557 hdr->mds_offset = offset; 501 hdr->mds_offset = offset;
558 502
559 /* Perform an asynchronous read to ds */ 503 /* Perform an asynchronous read to ds */
560 nfs_initiate_pgio(ds_clnt, hdr, 504 nfs_initiate_pgio(ds_clnt, hdr, hdr->cred,
561 &filelayout_read_call_ops, 0, RPC_TASK_SOFTCONN); 505 NFS_PROTO(hdr->inode), &filelayout_read_call_ops,
506 0, RPC_TASK_SOFTCONN);
562 return PNFS_ATTEMPTED; 507 return PNFS_ATTEMPTED;
563} 508}
564 509
@@ -591,16 +536,16 @@ filelayout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
591 hdr->pgio_done_cb = filelayout_write_done_cb; 536 hdr->pgio_done_cb = filelayout_write_done_cb;
592 atomic_inc(&ds->ds_clp->cl_count); 537 atomic_inc(&ds->ds_clp->cl_count);
593 hdr->ds_clp = ds->ds_clp; 538 hdr->ds_clp = ds->ds_clp;
594 hdr->ds_idx = idx; 539 hdr->ds_commit_idx = idx;
595 fh = nfs4_fl_select_ds_fh(lseg, j); 540 fh = nfs4_fl_select_ds_fh(lseg, j);
596 if (fh) 541 if (fh)
597 hdr->args.fh = fh; 542 hdr->args.fh = fh;
598 hdr->args.offset = filelayout_get_dserver_offset(lseg, offset); 543 hdr->args.offset = filelayout_get_dserver_offset(lseg, offset);
599 544
600 /* Perform an asynchronous write */ 545 /* Perform an asynchronous write */
601 nfs_initiate_pgio(ds_clnt, hdr, 546 nfs_initiate_pgio(ds_clnt, hdr, hdr->cred,
602 &filelayout_write_call_ops, sync, 547 NFS_PROTO(hdr->inode), &filelayout_write_call_ops,
603 RPC_TASK_SOFTCONN); 548 sync, RPC_TASK_SOFTCONN);
604 return PNFS_ATTEMPTED; 549 return PNFS_ATTEMPTED;
605} 550}
606 551
@@ -988,12 +933,14 @@ static const struct nfs_pageio_ops filelayout_pg_read_ops = {
988 .pg_init = filelayout_pg_init_read, 933 .pg_init = filelayout_pg_init_read,
989 .pg_test = filelayout_pg_test, 934 .pg_test = filelayout_pg_test,
990 .pg_doio = pnfs_generic_pg_readpages, 935 .pg_doio = pnfs_generic_pg_readpages,
936 .pg_cleanup = pnfs_generic_pg_cleanup,
991}; 937};
992 938
993static const struct nfs_pageio_ops filelayout_pg_write_ops = { 939static const struct nfs_pageio_ops filelayout_pg_write_ops = {
994 .pg_init = filelayout_pg_init_write, 940 .pg_init = filelayout_pg_init_write,
995 .pg_test = filelayout_pg_test, 941 .pg_test = filelayout_pg_test,
996 .pg_doio = pnfs_generic_pg_writepages, 942 .pg_doio = pnfs_generic_pg_writepages,
943 .pg_cleanup = pnfs_generic_pg_cleanup,
997}; 944};
998 945
999static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j) 946static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j)
@@ -1004,37 +951,11 @@ static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j)
1004 return j; 951 return j;
1005} 952}
1006 953
1007/* The generic layer is about to remove the req from the commit list.
1008 * If this will make the bucket empty, it will need to put the lseg reference.
1009 * Note this must be called holding the inode (/cinfo) lock
1010 */
1011static void
1012filelayout_clear_request_commit(struct nfs_page *req,
1013 struct nfs_commit_info *cinfo)
1014{
1015 struct pnfs_layout_segment *freeme = NULL;
1016
1017 if (!test_and_clear_bit(PG_COMMIT_TO_DS, &req->wb_flags))
1018 goto out;
1019 cinfo->ds->nwritten--;
1020 if (list_is_singular(&req->wb_list)) {
1021 struct pnfs_commit_bucket *bucket;
1022
1023 bucket = list_first_entry(&req->wb_list,
1024 struct pnfs_commit_bucket,
1025 written);
1026 freeme = bucket->wlseg;
1027 bucket->wlseg = NULL;
1028 }
1029out:
1030 nfs_request_remove_commit_list(req, cinfo);
1031 pnfs_put_lseg_locked(freeme);
1032}
1033
1034static void 954static void
1035filelayout_mark_request_commit(struct nfs_page *req, 955filelayout_mark_request_commit(struct nfs_page *req,
1036 struct pnfs_layout_segment *lseg, 956 struct pnfs_layout_segment *lseg,
1037 struct nfs_commit_info *cinfo) 957 struct nfs_commit_info *cinfo,
958 u32 ds_commit_idx)
1038 959
1039{ 960{
1040 struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); 961 struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
@@ -1064,7 +985,7 @@ filelayout_mark_request_commit(struct nfs_page *req,
1064 * is normally transferred to the COMMIT call and released 985 * is normally transferred to the COMMIT call and released
1065 * there. It could also be released if the last req is pulled 986 * there. It could also be released if the last req is pulled
1066 * off due to a rewrite, in which case it will be done in 987 * off due to a rewrite, in which case it will be done in
1067 * filelayout_clear_request_commit 988 * pnfs_generic_clear_request_commit
1068 */ 989 */
1069 buckets[i].wlseg = pnfs_get_lseg(lseg); 990 buckets[i].wlseg = pnfs_get_lseg(lseg);
1070 } 991 }
@@ -1081,7 +1002,7 @@ mds_commit:
1081 spin_unlock(cinfo->lock); 1002 spin_unlock(cinfo->lock);
1082 if (!cinfo->dreq) { 1003 if (!cinfo->dreq) {
1083 inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); 1004 inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
1084 inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info, 1005 inc_bdi_stat(inode_to_bdi(page_file_mapping(req->wb_page)->host),
1085 BDI_RECLAIMABLE); 1006 BDI_RECLAIMABLE);
1086 __mark_inode_dirty(req->wb_context->dentry->d_inode, 1007 __mark_inode_dirty(req->wb_context->dentry->d_inode,
1087 I_DIRTY_DATASYNC); 1008 I_DIRTY_DATASYNC);
@@ -1138,101 +1059,15 @@ static int filelayout_initiate_commit(struct nfs_commit_data *data, int how)
1138 fh = select_ds_fh_from_commit(lseg, data->ds_commit_index); 1059 fh = select_ds_fh_from_commit(lseg, data->ds_commit_index);
1139 if (fh) 1060 if (fh)
1140 data->args.fh = fh; 1061 data->args.fh = fh;
1141 return nfs_initiate_commit(ds_clnt, data, 1062 return nfs_initiate_commit(ds_clnt, data, NFS_PROTO(data->inode),
1142 &filelayout_commit_call_ops, how, 1063 &filelayout_commit_call_ops, how,
1143 RPC_TASK_SOFTCONN); 1064 RPC_TASK_SOFTCONN);
1144out_err: 1065out_err:
1145 prepare_to_resend_writes(data); 1066 pnfs_generic_prepare_to_resend_writes(data);
1146 filelayout_commit_release(data); 1067 pnfs_generic_commit_release(data);
1147 return -EAGAIN; 1068 return -EAGAIN;
1148} 1069}
1149 1070
1150static int
1151transfer_commit_list(struct list_head *src, struct list_head *dst,
1152 struct nfs_commit_info *cinfo, int max)
1153{
1154 struct nfs_page *req, *tmp;
1155 int ret = 0;
1156
1157 list_for_each_entry_safe(req, tmp, src, wb_list) {
1158 if (!nfs_lock_request(req))
1159 continue;
1160 kref_get(&req->wb_kref);
1161 if (cond_resched_lock(cinfo->lock))
1162 list_safe_reset_next(req, tmp, wb_list);
1163 nfs_request_remove_commit_list(req, cinfo);
1164 clear_bit(PG_COMMIT_TO_DS, &req->wb_flags);
1165 nfs_list_add_request(req, dst);
1166 ret++;
1167 if ((ret == max) && !cinfo->dreq)
1168 break;
1169 }
1170 return ret;
1171}
1172
1173/* Note called with cinfo->lock held. */
1174static int
1175filelayout_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
1176 struct nfs_commit_info *cinfo,
1177 int max)
1178{
1179 struct list_head *src = &bucket->written;
1180 struct list_head *dst = &bucket->committing;
1181 int ret;
1182
1183 ret = transfer_commit_list(src, dst, cinfo, max);
1184 if (ret) {
1185 cinfo->ds->nwritten -= ret;
1186 cinfo->ds->ncommitting += ret;
1187 bucket->clseg = bucket->wlseg;
1188 if (list_empty(src))
1189 bucket->wlseg = NULL;
1190 else
1191 pnfs_get_lseg(bucket->clseg);
1192 }
1193 return ret;
1194}
1195
1196/* Move reqs from written to committing lists, returning the number moved.
1197 * Note called with cinfo->lock held.
1198 */
1199static int filelayout_scan_commit_lists(struct nfs_commit_info *cinfo,
1200 int max)
1201{
1202 int i, rv = 0, cnt;
1203
1204 for (i = 0; i < cinfo->ds->nbuckets && max != 0; i++) {
1205 cnt = filelayout_scan_ds_commit_list(&cinfo->ds->buckets[i],
1206 cinfo, max);
1207 max -= cnt;
1208 rv += cnt;
1209 }
1210 return rv;
1211}
1212
1213/* Pull everything off the committing lists and dump into @dst */
1214static void filelayout_recover_commit_reqs(struct list_head *dst,
1215 struct nfs_commit_info *cinfo)
1216{
1217 struct pnfs_commit_bucket *b;
1218 struct pnfs_layout_segment *freeme;
1219 int i;
1220
1221restart:
1222 spin_lock(cinfo->lock);
1223 for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
1224 if (transfer_commit_list(&b->written, dst, cinfo, 0)) {
1225 freeme = b->wlseg;
1226 b->wlseg = NULL;
1227 spin_unlock(cinfo->lock);
1228 pnfs_put_lseg(freeme);
1229 goto restart;
1230 }
1231 }
1232 cinfo->ds->nwritten = 0;
1233 spin_unlock(cinfo->lock);
1234}
1235
1236/* filelayout_search_commit_reqs - Search lists in @cinfo for the head request 1071
1237 * for @page 1072 * for @page
1238 * @cinfo - commit info for current inode 1073 * @cinfo - commit info for current inode
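Most of the commit-list machinery deleted above reappears in common pNFS code as the pnfs_generic_* helpers this patch wires in. Its core move, transfer_commit_list, shifts up to max unlocked requests from a bucket's written list to its committing list while holding cinfo->lock. A userspace sketch of that bounded, locked transfer (simplified: the kernel version additionally drops the lock periodically via cond_resched_lock() and revalidates its cursor with list_safe_reset_next()):

#include <pthread.h>
#include <stddef.h>

struct req { struct req *next; int locked; };

/* Move up to max unlocked reqs from *src to *dst; return the count. */
static int transfer(struct req **src, struct req **dst,
                    pthread_mutex_t *lock, int max)
{
        struct req **pp, *r;
        int moved = 0;

        pthread_mutex_lock(lock);
        pp = src;
        while ((r = *pp) != NULL && moved < max) {
                if (r->locked) {        /* skip reqs someone else holds */
                        pp = &r->next;
                        continue;
                }
                *pp = r->next;          /* unlink from the written list */
                r->next = *dst;         /* push onto the committing list */
                *dst = r;
                moved++;
        }
        pthread_mutex_unlock(lock);
        return moved;
}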
@@ -1263,108 +1098,14 @@ filelayout_search_commit_reqs(struct nfs_commit_info *cinfo, struct page *page)
1263 return NULL; 1098 return NULL;
1264} 1099}
1265 1100
1266static void filelayout_retry_commit(struct nfs_commit_info *cinfo, int idx)
1267{
1268 struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
1269 struct pnfs_commit_bucket *bucket;
1270 struct pnfs_layout_segment *freeme;
1271 int i;
1272
1273 for (i = idx; i < fl_cinfo->nbuckets; i++) {
1274 bucket = &fl_cinfo->buckets[i];
1275 if (list_empty(&bucket->committing))
1276 continue;
1277 nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo);
1278 spin_lock(cinfo->lock);
1279 freeme = bucket->clseg;
1280 bucket->clseg = NULL;
1281 spin_unlock(cinfo->lock);
1282 pnfs_put_lseg(freeme);
1283 }
1284}
1285
1286static unsigned int
1287alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list)
1288{
1289 struct pnfs_ds_commit_info *fl_cinfo;
1290 struct pnfs_commit_bucket *bucket;
1291 struct nfs_commit_data *data;
1292 int i;
1293 unsigned int nreq = 0;
1294
1295 fl_cinfo = cinfo->ds;
1296 bucket = fl_cinfo->buckets;
1297 for (i = 0; i < fl_cinfo->nbuckets; i++, bucket++) {
1298 if (list_empty(&bucket->committing))
1299 continue;
1300 data = nfs_commitdata_alloc();
1301 if (!data)
1302 break;
1303 data->ds_commit_index = i;
1304 spin_lock(cinfo->lock);
1305 data->lseg = bucket->clseg;
1306 bucket->clseg = NULL;
1307 spin_unlock(cinfo->lock);
1308 list_add(&data->pages, list);
1309 nreq++;
1310 }
1311
1312 /* Clean up on error */
1313 filelayout_retry_commit(cinfo, i);
1314 /* Caller will clean up entries put on list */
1315 return nreq;
1316}
1317
1318/* This follows nfs_commit_list pretty closely */
1319static int 1101static int
1320filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages, 1102filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
1321 int how, struct nfs_commit_info *cinfo) 1103 int how, struct nfs_commit_info *cinfo)
1322{ 1104{
1323 struct nfs_commit_data *data, *tmp; 1105 return pnfs_generic_commit_pagelist(inode, mds_pages, how, cinfo,
1324 LIST_HEAD(list); 1106 filelayout_initiate_commit);
1325 unsigned int nreq = 0;
1326
1327 if (!list_empty(mds_pages)) {
1328 data = nfs_commitdata_alloc();
1329 if (data != NULL) {
1330 data->lseg = NULL;
1331 list_add(&data->pages, &list);
1332 nreq++;
1333 } else {
1334 nfs_retry_commit(mds_pages, NULL, cinfo);
1335 filelayout_retry_commit(cinfo, 0);
1336 cinfo->completion_ops->error_cleanup(NFS_I(inode));
1337 return -ENOMEM;
1338 }
1339 }
1340
1341 nreq += alloc_ds_commits(cinfo, &list);
1342
1343 if (nreq == 0) {
1344 cinfo->completion_ops->error_cleanup(NFS_I(inode));
1345 goto out;
1346 }
1347
1348 atomic_add(nreq, &cinfo->mds->rpcs_out);
1349
1350 list_for_each_entry_safe(data, tmp, &list, pages) {
1351 list_del_init(&data->pages);
1352 if (!data->lseg) {
1353 nfs_init_commit(data, mds_pages, NULL, cinfo);
1354 nfs_initiate_commit(NFS_CLIENT(inode), data,
1355 data->mds_ops, how, 0);
1356 } else {
1357 struct pnfs_commit_bucket *buckets;
1358
1359 buckets = cinfo->ds->buckets;
1360 nfs_init_commit(data, &buckets[data->ds_commit_index].committing, data->lseg, cinfo);
1361 filelayout_initiate_commit(data, how);
1362 }
1363 }
1364out:
1365 cinfo->ds->ncommitting = 0;
1366 return PNFS_ATTEMPTED;
1367} 1107}
1108
1368static struct nfs4_deviceid_node * 1109static struct nfs4_deviceid_node *
1369filelayout_alloc_deviceid_node(struct nfs_server *server, 1110filelayout_alloc_deviceid_node(struct nfs_server *server,
1370 struct pnfs_device *pdev, gfp_t gfp_flags) 1111 struct pnfs_device *pdev, gfp_t gfp_flags)
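After the consolidation above, filelayout_commit_pagelist is a one-line wrapper: pnfs_generic_commit_pagelist owns allocation, bucket iteration, and error cleanup, and calls back through a function pointer for the one layout-specific step, issuing the commit RPC. That is the usual template-method shape in C; a hedged sketch with invented names:

struct commit_data { int bucket; };

typedef int (*initiate_fn)(struct commit_data *data, int how);

/* Generic driver: iterates the buckets, delegates the RPC step. */
static int generic_commit_pagelist(struct commit_data *bufs, int n,
                                   int how, initiate_fn initiate)
{
        int i, err;

        for (i = 0; i < n; i++) {
                err = initiate(&bufs[i], how);  /* layout-specific */
                if (err)
                        return err;
        }
        return 0;
}

/* Layout-specific step, analogous to filelayout_initiate_commit(). */
static int my_initiate_commit(struct commit_data *data, int how)
{
        (void)data; (void)how;
        return 0;
}

/* usage: generic_commit_pagelist(bufs, n, how, my_initiate_commit); */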
@@ -1421,9 +1162,9 @@ static struct pnfs_layoutdriver_type filelayout_type = {
1421 .pg_write_ops = &filelayout_pg_write_ops, 1162 .pg_write_ops = &filelayout_pg_write_ops,
1422 .get_ds_info = &filelayout_get_ds_info, 1163 .get_ds_info = &filelayout_get_ds_info,
1423 .mark_request_commit = filelayout_mark_request_commit, 1164 .mark_request_commit = filelayout_mark_request_commit,
1424 .clear_request_commit = filelayout_clear_request_commit, 1165 .clear_request_commit = pnfs_generic_clear_request_commit,
1425 .scan_commit_lists = filelayout_scan_commit_lists, 1166 .scan_commit_lists = pnfs_generic_scan_commit_lists,
1426 .recover_commit_reqs = filelayout_recover_commit_reqs, 1167 .recover_commit_reqs = pnfs_generic_recover_commit_reqs,
1427 .search_commit_reqs = filelayout_search_commit_reqs, 1168 .search_commit_reqs = filelayout_search_commit_reqs,
1428 .commit_pagelist = filelayout_commit_pagelist, 1169 .commit_pagelist = filelayout_commit_pagelist,
1429 .read_pagelist = filelayout_read_pagelist, 1170 .read_pagelist = filelayout_read_pagelist,
diff --git a/fs/nfs/filelayout/filelayout.h b/fs/nfs/filelayout/filelayout.h
index 7c9f800c49d7..2896cb833a11 100644
--- a/fs/nfs/filelayout/filelayout.h
+++ b/fs/nfs/filelayout/filelayout.h
@@ -33,13 +33,6 @@
33#include "../pnfs.h" 33#include "../pnfs.h"
34 34
35/* 35/*
36 * Default data server connection timeout and retrans values.
37 * Set by module parameters dataserver_timeo and dataserver_retrans.
38 */
39#define NFS4_DEF_DS_TIMEO 600 /* in tenths of a second */
40#define NFS4_DEF_DS_RETRANS 5
41
42/*
43 * Field testing shows we need to support up to 4096 stripe indices. 36 * Field testing shows we need to support up to 4096 stripe indices.
44 * We store each index as a u8 (u32 on the wire) to keep the memory footprint 37 * We store each index as a u8 (u32 on the wire) to keep the memory footprint
45 * reasonable. This in turn means we support a maximum of 256 38 * reasonable. This in turn means we support a maximum of 256
@@ -48,32 +41,11 @@
48#define NFS4_PNFS_MAX_STRIPE_CNT 4096 41#define NFS4_PNFS_MAX_STRIPE_CNT 4096
49#define NFS4_PNFS_MAX_MULTI_CNT 256 /* 256 fit into a u8 stripe_index */ 42#define NFS4_PNFS_MAX_MULTI_CNT 256 /* 256 fit into a u8 stripe_index */
50 43
51/* error codes for internal use */
52#define NFS4ERR_RESET_TO_MDS 12001
53
54enum stripetype4 { 44enum stripetype4 {
55 STRIPE_SPARSE = 1, 45 STRIPE_SPARSE = 1,
56 STRIPE_DENSE = 2 46 STRIPE_DENSE = 2
57}; 47};
58 48
59/* Individual ip address */
60struct nfs4_pnfs_ds_addr {
61 struct sockaddr_storage da_addr;
62 size_t da_addrlen;
63 struct list_head da_node; /* nfs4_pnfs_dev_hlist dev_dslist */
64 char *da_remotestr; /* human readable addr+port */
65};
66
67struct nfs4_pnfs_ds {
68 struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */
69 char *ds_remotestr; /* comma sep list of addrs */
70 struct list_head ds_addrs;
71 struct nfs_client *ds_clp;
72 atomic_t ds_count;
73 unsigned long ds_state;
74#define NFS4DS_CONNECTING 0 /* ds is establishing connection */
75};
76
77struct nfs4_file_layout_dsaddr { 49struct nfs4_file_layout_dsaddr {
78 struct nfs4_deviceid_node id_node; 50 struct nfs4_deviceid_node id_node;
79 u32 stripe_count; 51 u32 stripe_count;
@@ -119,17 +91,6 @@ FILELAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg)
119 return &FILELAYOUT_LSEG(lseg)->dsaddr->id_node; 91 return &FILELAYOUT_LSEG(lseg)->dsaddr->id_node;
120} 92}
121 93
122static inline void
123filelayout_mark_devid_invalid(struct nfs4_deviceid_node *node)
124{
125 u32 *p = (u32 *)&node->deviceid;
126
127 printk(KERN_WARNING "NFS: Deviceid [%x%x%x%x] marked out of use.\n",
128 p[0], p[1], p[2], p[3]);
129
130 set_bit(NFS_DEVICEID_INVALID, &node->flags);
131}
132
133static inline bool 94static inline bool
134filelayout_test_devid_invalid(struct nfs4_deviceid_node *node) 95filelayout_test_devid_invalid(struct nfs4_deviceid_node *node)
135{ 96{
@@ -142,7 +103,6 @@ filelayout_test_devid_unavailable(struct nfs4_deviceid_node *node);
142extern struct nfs_fh * 103extern struct nfs_fh *
143nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j); 104nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j);
144 105
145extern void print_ds(struct nfs4_pnfs_ds *ds);
146u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset); 106u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset);
147u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j); 107u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j);
148struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, 108struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c
index bfecac781f19..4f372e224603 100644
--- a/fs/nfs/filelayout/filelayoutdev.c
+++ b/fs/nfs/filelayout/filelayoutdev.c
@@ -31,7 +31,6 @@
31#include <linux/nfs_fs.h> 31#include <linux/nfs_fs.h>
32#include <linux/vmalloc.h> 32#include <linux/vmalloc.h>
33#include <linux/module.h> 33#include <linux/module.h>
34#include <linux/sunrpc/addr.h>
35 34
36#include "../internal.h" 35#include "../internal.h"
37#include "../nfs4session.h" 36#include "../nfs4session.h"
@@ -42,183 +41,6 @@
42static unsigned int dataserver_timeo = NFS4_DEF_DS_TIMEO; 41static unsigned int dataserver_timeo = NFS4_DEF_DS_TIMEO;
43static unsigned int dataserver_retrans = NFS4_DEF_DS_RETRANS; 42static unsigned int dataserver_retrans = NFS4_DEF_DS_RETRANS;
44 43
45/*
46 * Data server cache
47 *
48 * Data servers can be mapped to different device ids.
49 * nfs4_pnfs_ds reference counting
50 * - set to 1 on allocation
51 * - incremented when a device id maps a data server already in the cache.
52 * - decremented when deviceid is removed from the cache.
53 */
54static DEFINE_SPINLOCK(nfs4_ds_cache_lock);
55static LIST_HEAD(nfs4_data_server_cache);
56
57/* Debug routines */
58void
59print_ds(struct nfs4_pnfs_ds *ds)
60{
61 if (ds == NULL) {
62 printk("%s NULL device\n", __func__);
63 return;
64 }
65 printk(" ds %s\n"
66 " ref count %d\n"
67 " client %p\n"
68 " cl_exchange_flags %x\n",
69 ds->ds_remotestr,
70 atomic_read(&ds->ds_count), ds->ds_clp,
71 ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0);
72}
73
74static bool
75same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2)
76{
77 struct sockaddr_in *a, *b;
78 struct sockaddr_in6 *a6, *b6;
79
80 if (addr1->sa_family != addr2->sa_family)
81 return false;
82
83 switch (addr1->sa_family) {
84 case AF_INET:
85 a = (struct sockaddr_in *)addr1;
86 b = (struct sockaddr_in *)addr2;
87
88 if (a->sin_addr.s_addr == b->sin_addr.s_addr &&
89 a->sin_port == b->sin_port)
90 return true;
91 break;
92
93 case AF_INET6:
94 a6 = (struct sockaddr_in6 *)addr1;
95 b6 = (struct sockaddr_in6 *)addr2;
96
97 /* LINKLOCAL addresses must have matching scope_id */
98 if (ipv6_addr_src_scope(&a6->sin6_addr) ==
99 IPV6_ADDR_SCOPE_LINKLOCAL &&
100 a6->sin6_scope_id != b6->sin6_scope_id)
101 return false;
102
103 if (ipv6_addr_equal(&a6->sin6_addr, &b6->sin6_addr) &&
104 a6->sin6_port == b6->sin6_port)
105 return true;
106 break;
107
108 default:
109 dprintk("%s: unhandled address family: %u\n",
110 __func__, addr1->sa_family);
111 return false;
112 }
113
114 return false;
115}
116
117static bool
118_same_data_server_addrs_locked(const struct list_head *dsaddrs1,
119 const struct list_head *dsaddrs2)
120{
121 struct nfs4_pnfs_ds_addr *da1, *da2;
122
123 /* step through both lists, comparing as we go */
124 for (da1 = list_first_entry(dsaddrs1, typeof(*da1), da_node),
125 da2 = list_first_entry(dsaddrs2, typeof(*da2), da_node);
126 da1 != NULL && da2 != NULL;
127 da1 = list_entry(da1->da_node.next, typeof(*da1), da_node),
128 da2 = list_entry(da2->da_node.next, typeof(*da2), da_node)) {
129 if (!same_sockaddr((struct sockaddr *)&da1->da_addr,
130 (struct sockaddr *)&da2->da_addr))
131 return false;
132 }
133 if (da1 == NULL && da2 == NULL)
134 return true;
135
136 return false;
137}
138
139/*
140 * Lookup DS by addresses. nfs4_ds_cache_lock is held
141 */
142static struct nfs4_pnfs_ds *
143_data_server_lookup_locked(const struct list_head *dsaddrs)
144{
145 struct nfs4_pnfs_ds *ds;
146
147 list_for_each_entry(ds, &nfs4_data_server_cache, ds_node)
148 if (_same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs))
149 return ds;
150 return NULL;
151}
152
153/*
154 * Create an rpc connection to the nfs4_pnfs_ds data server
155 * Currently only supports IPv4 and IPv6 addresses
156 */
157static int
158nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)
159{
160 struct nfs_client *clp = ERR_PTR(-EIO);
161 struct nfs4_pnfs_ds_addr *da;
162 int status = 0;
163
164 dprintk("--> %s DS %s au_flavor %d\n", __func__, ds->ds_remotestr,
165 mds_srv->nfs_client->cl_rpcclient->cl_auth->au_flavor);
166
167 list_for_each_entry(da, &ds->ds_addrs, da_node) {
168 dprintk("%s: DS %s: trying address %s\n",
169 __func__, ds->ds_remotestr, da->da_remotestr);
170
171 clp = nfs4_set_ds_client(mds_srv->nfs_client,
172 (struct sockaddr *)&da->da_addr,
173 da->da_addrlen, IPPROTO_TCP,
174 dataserver_timeo, dataserver_retrans);
175 if (!IS_ERR(clp))
176 break;
177 }
178
179 if (IS_ERR(clp)) {
180 status = PTR_ERR(clp);
181 goto out;
182 }
183
184 status = nfs4_init_ds_session(clp, mds_srv->nfs_client->cl_lease_time);
185 if (status)
186 goto out_put;
187
188 smp_wmb();
189 ds->ds_clp = clp;
190 dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr);
191out:
192 return status;
193out_put:
194 nfs_put_client(clp);
195 goto out;
196}
197
198static void
199destroy_ds(struct nfs4_pnfs_ds *ds)
200{
201 struct nfs4_pnfs_ds_addr *da;
202
203 dprintk("--> %s\n", __func__);
204 ifdebug(FACILITY)
205 print_ds(ds);
206
207 nfs_put_client(ds->ds_clp);
208
209 while (!list_empty(&ds->ds_addrs)) {
210 da = list_first_entry(&ds->ds_addrs,
211 struct nfs4_pnfs_ds_addr,
212 da_node);
213 list_del_init(&da->da_node);
214 kfree(da->da_remotestr);
215 kfree(da);
216 }
217
218 kfree(ds->ds_remotestr);
219 kfree(ds);
220}
221
222void 44void
223nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) 45nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
224{ 46{
@@ -229,259 +51,13 @@ nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
229 51
230 for (i = 0; i < dsaddr->ds_num; i++) { 52 for (i = 0; i < dsaddr->ds_num; i++) {
231 ds = dsaddr->ds_list[i]; 53 ds = dsaddr->ds_list[i];
232 if (ds != NULL) { 54 if (ds != NULL)
233 if (atomic_dec_and_lock(&ds->ds_count, 55 nfs4_pnfs_ds_put(ds);
234 &nfs4_ds_cache_lock)) {
235 list_del_init(&ds->ds_node);
236 spin_unlock(&nfs4_ds_cache_lock);
237 destroy_ds(ds);
238 }
239 }
240 } 56 }
241 kfree(dsaddr->stripe_indices); 57 kfree(dsaddr->stripe_indices);
242 kfree(dsaddr); 58 kfree(dsaddr);
243} 59}
244 60
245/*
246 * Create a string with a human readable address and port to avoid
247 * complicated setup around many dprintks.
248 */
249static char *
250nfs4_pnfs_remotestr(struct list_head *dsaddrs, gfp_t gfp_flags)
251{
252 struct nfs4_pnfs_ds_addr *da;
253 char *remotestr;
254 size_t len;
255 char *p;
256
257 len = 3; /* '{', '}' and eol */
258 list_for_each_entry(da, dsaddrs, da_node) {
259 len += strlen(da->da_remotestr) + 1; /* string plus comma */
260 }
261
262 remotestr = kzalloc(len, gfp_flags);
263 if (!remotestr)
264 return NULL;
265
266 p = remotestr;
267 *(p++) = '{';
268 len--;
269 list_for_each_entry(da, dsaddrs, da_node) {
270 size_t ll = strlen(da->da_remotestr);
271
272 if (ll > len)
273 goto out_err;
274
275 memcpy(p, da->da_remotestr, ll);
276 p += ll;
277 len -= ll;
278
279 if (len < 1)
280 goto out_err;
281 (*p++) = ',';
282 len--;
283 }
284 if (len < 2)
285 goto out_err;
286 *(p++) = '}';
287 *p = '\0';
288 return remotestr;
289out_err:
290 kfree(remotestr);
291 return NULL;
292}
293
294static struct nfs4_pnfs_ds *
295nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags)
296{
297 struct nfs4_pnfs_ds *tmp_ds, *ds = NULL;
298 char *remotestr;
299
300 if (list_empty(dsaddrs)) {
301 dprintk("%s: no addresses defined\n", __func__);
302 goto out;
303 }
304
305 ds = kzalloc(sizeof(*ds), gfp_flags);
306 if (!ds)
307 goto out;
308
309 /* this is only used for debugging, so it's ok if it's NULL */
310 remotestr = nfs4_pnfs_remotestr(dsaddrs, gfp_flags);
311
312 spin_lock(&nfs4_ds_cache_lock);
313 tmp_ds = _data_server_lookup_locked(dsaddrs);
314 if (tmp_ds == NULL) {
315 INIT_LIST_HEAD(&ds->ds_addrs);
316 list_splice_init(dsaddrs, &ds->ds_addrs);
317 ds->ds_remotestr = remotestr;
318 atomic_set(&ds->ds_count, 1);
319 INIT_LIST_HEAD(&ds->ds_node);
320 ds->ds_clp = NULL;
321 list_add(&ds->ds_node, &nfs4_data_server_cache);
322 dprintk("%s add new data server %s\n", __func__,
323 ds->ds_remotestr);
324 } else {
325 kfree(remotestr);
326 kfree(ds);
327 atomic_inc(&tmp_ds->ds_count);
328 dprintk("%s data server %s found, inc'ed ds_count to %d\n",
329 __func__, tmp_ds->ds_remotestr,
330 atomic_read(&tmp_ds->ds_count));
331 ds = tmp_ds;
332 }
333 spin_unlock(&nfs4_ds_cache_lock);
334out:
335 return ds;
336}
337
338/*
339 * Currently only supports ipv4, ipv6 and one multi-path address.
340 */
341static struct nfs4_pnfs_ds_addr *
342decode_ds_addr(struct net *net, struct xdr_stream *streamp, gfp_t gfp_flags)
343{
344 struct nfs4_pnfs_ds_addr *da = NULL;
345 char *buf, *portstr;
346 __be16 port;
347 int nlen, rlen;
348 int tmp[2];
349 __be32 *p;
350 char *netid, *match_netid;
351 size_t len, match_netid_len;
352 char *startsep = "";
353 char *endsep = "";
354
355
356 /* r_netid */
357 p = xdr_inline_decode(streamp, 4);
358 if (unlikely(!p))
359 goto out_err;
360 nlen = be32_to_cpup(p++);
361
362 p = xdr_inline_decode(streamp, nlen);
363 if (unlikely(!p))
364 goto out_err;
365
366 netid = kmalloc(nlen+1, gfp_flags);
367 if (unlikely(!netid))
368 goto out_err;
369
370 netid[nlen] = '\0';
371 memcpy(netid, p, nlen);
372
373 /* r_addr: ip/ip6addr with port in dec octets - see RFC 5665 */
374 p = xdr_inline_decode(streamp, 4);
375 if (unlikely(!p))
376 goto out_free_netid;
377 rlen = be32_to_cpup(p);
378
379 p = xdr_inline_decode(streamp, rlen);
380 if (unlikely(!p))
381 goto out_free_netid;
382
383 /* port is ".ABC.DEF", 8 chars max */
384 if (rlen > INET6_ADDRSTRLEN + IPV6_SCOPE_ID_LEN + 8) {
385 dprintk("%s: Invalid address, length %d\n", __func__,
386 rlen);
387 goto out_free_netid;
388 }
389 buf = kmalloc(rlen + 1, gfp_flags);
390 if (!buf) {
391 dprintk("%s: Not enough memory\n", __func__);
392 goto out_free_netid;
393 }
394 buf[rlen] = '\0';
395 memcpy(buf, p, rlen);
396
397 /* replace port '.' with '-' */
398 portstr = strrchr(buf, '.');
399 if (!portstr) {
400 dprintk("%s: Failed finding expected dot in port\n",
401 __func__);
402 goto out_free_buf;
403 }
404 *portstr = '-';
405
406 /* find '.' between address and port */
407 portstr = strrchr(buf, '.');
408 if (!portstr) {
409 dprintk("%s: Failed finding expected dot between address and "
410 "port\n", __func__);
411 goto out_free_buf;
412 }
413 *portstr = '\0';
414
415 da = kzalloc(sizeof(*da), gfp_flags);
416 if (unlikely(!da))
417 goto out_free_buf;
418
419 INIT_LIST_HEAD(&da->da_node);
420
421 if (!rpc_pton(net, buf, portstr-buf, (struct sockaddr *)&da->da_addr,
422 sizeof(da->da_addr))) {
423 dprintk("%s: error parsing address %s\n", __func__, buf);
424 goto out_free_da;
425 }
426
427 portstr++;
428 sscanf(portstr, "%d-%d", &tmp[0], &tmp[1]);
429 port = htons((tmp[0] << 8) | (tmp[1]));
430
431 switch (da->da_addr.ss_family) {
432 case AF_INET:
433 ((struct sockaddr_in *)&da->da_addr)->sin_port = port;
434 da->da_addrlen = sizeof(struct sockaddr_in);
435 match_netid = "tcp";
436 match_netid_len = 3;
437 break;
438
439 case AF_INET6:
440 ((struct sockaddr_in6 *)&da->da_addr)->sin6_port = port;
441 da->da_addrlen = sizeof(struct sockaddr_in6);
442 match_netid = "tcp6";
443 match_netid_len = 4;
444 startsep = "[";
445 endsep = "]";
446 break;
447
448 default:
449 dprintk("%s: unsupported address family: %u\n",
450 __func__, da->da_addr.ss_family);
451 goto out_free_da;
452 }
453
454 if (nlen != match_netid_len || strncmp(netid, match_netid, nlen)) {
455 dprintk("%s: ERROR: r_netid \"%s\" != \"%s\"\n",
456 __func__, netid, match_netid);
457 goto out_free_da;
458 }
459
460 /* save human readable address */
461 len = strlen(startsep) + strlen(buf) + strlen(endsep) + 7;
462 da->da_remotestr = kzalloc(len, gfp_flags);
463
464 /* NULL is ok, only used for dprintk */
465 if (da->da_remotestr)
466 snprintf(da->da_remotestr, len, "%s%s%s:%u", startsep,
467 buf, endsep, ntohs(port));
468
469 dprintk("%s: Parsed DS addr %s\n", __func__, da->da_remotestr);
470 kfree(buf);
471 kfree(netid);
472 return da;
473
474out_free_da:
475 kfree(da);
476out_free_buf:
477 dprintk("%s: Error parsing DS addr: %s\n", __func__, buf);
478 kfree(buf);
479out_free_netid:
480 kfree(netid);
481out_err:
482 return NULL;
483}
484
485/* Decode opaque device data and return the result */ 61/* Decode opaque device data and return the result */
486struct nfs4_file_layout_dsaddr * 62struct nfs4_file_layout_dsaddr *
487nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, 63nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
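The data-server cache removed above (it moves to shared pNFS code, reached here through nfs4_pnfs_ds_put() and nfs4_decode_mp_ds_addr()) follows the classic alloc-outside, lookup-under-lock, insert-or-discard pattern: the candidate entry is allocated before nfs4_ds_cache_lock is taken; on a hit the cached entry's refcount is bumped and the candidate freed, on a miss the candidate is published. A userspace rendering of the same pattern:

#include <pthread.h>
#include <stdlib.h>
#include <string.h>

struct ds {
        struct ds *next;
        char addr[64];
        int refcount;
};

static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;
static struct ds *cache;

static struct ds *ds_get(const char *addr)
{
        struct ds *ds, *fresh = calloc(1, sizeof(*fresh));

        if (!fresh)
                return NULL;
        strncpy(fresh->addr, addr, sizeof(fresh->addr) - 1);
        fresh->refcount = 1;

        pthread_mutex_lock(&cache_lock);
        for (ds = cache; ds; ds = ds->next)
                if (!strcmp(ds->addr, addr))
                        break;
        if (ds) {
                ds->refcount++;         /* hit: reuse, drop our copy */
                free(fresh);
        } else {
                fresh->next = cache;    /* miss: publish the new entry */
                cache = fresh;
                ds = fresh;
        }
        pthread_mutex_unlock(&cache_lock);
        return ds;
}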
@@ -584,8 +160,8 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
584 160
585 mp_count = be32_to_cpup(p); /* multipath count */ 161 mp_count = be32_to_cpup(p); /* multipath count */
586 for (j = 0; j < mp_count; j++) { 162 for (j = 0; j < mp_count; j++) {
587 da = decode_ds_addr(server->nfs_client->cl_net, 163 da = nfs4_decode_mp_ds_addr(server->nfs_client->cl_net,
588 &stream, gfp_flags); 164 &stream, gfp_flags);
589 if (da) 165 if (da)
590 list_add_tail(&da->da_node, &dsaddrs); 166 list_add_tail(&da->da_node, &dsaddrs);
591 } 167 }
@@ -681,22 +257,7 @@ nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j)
 	return flseg->fh_array[i];
 }
 
-static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds)
-{
-	might_sleep();
-	wait_on_bit_action(&ds->ds_state, NFS4DS_CONNECTING,
-			   nfs_wait_bit_killable, TASK_KILLABLE);
-}
-
-static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds)
-{
-	smp_mb__before_atomic();
-	clear_bit(NFS4DS_CONNECTING, &ds->ds_state);
-	smp_mb__after_atomic();
-	wake_up_bit(&ds->ds_state, NFS4DS_CONNECTING);
-}
-
-
+/* Upon return, either ds is connected, or ds is NULL */
 struct nfs4_pnfs_ds *
 nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
 {
@@ -704,29 +265,23 @@ nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
 	struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx];
 	struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg);
 	struct nfs4_pnfs_ds *ret = ds;
+	struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode);
 
 	if (ds == NULL) {
 		printk(KERN_ERR "NFS: %s: No data server for offset index %d\n",
 			__func__, ds_idx);
-		filelayout_mark_devid_invalid(devid);
+		pnfs_generic_mark_devid_invalid(devid);
 		goto out;
 	}
 	smp_rmb();
 	if (ds->ds_clp)
 		goto out_test_devid;
 
-	if (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) == 0) {
-		struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode);
-		int err;
-
-		err = nfs4_ds_connect(s, ds);
-		if (err)
-			nfs4_mark_deviceid_unavailable(devid);
-		nfs4_clear_ds_conn_bit(ds);
-	} else {
-		/* Either ds is connected, or ds is NULL */
-		nfs4_wait_ds_connect(ds);
-	}
+	nfs4_pnfs_ds_connect(s, ds, devid, dataserver_timeo,
+			     dataserver_retrans, 4,
+			     s->nfs_client->cl_minorversion,
+			     s->nfs_client->cl_rpcclient->cl_auth->au_flavor);
+
 out_test_devid:
 	if (filelayout_test_devid_unavailable(devid))
 		ret = NULL;
diff --git a/fs/nfs/flexfilelayout/Makefile b/fs/nfs/flexfilelayout/Makefile
new file mode 100644
index 000000000000..1d2c9f6bbcd4
--- /dev/null
+++ b/fs/nfs/flexfilelayout/Makefile
@@ -0,0 +1,5 @@
1#
2# Makefile for the pNFS Flexfile Layout Driver kernel module
3#
4obj-$(CONFIG_PNFS_FLEXFILE_LAYOUT) += nfs_layout_flexfiles.o
5nfs_layout_flexfiles-y := flexfilelayout.o flexfilelayoutdev.o
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
new file mode 100644
index 000000000000..c22ecaa86c1c
--- /dev/null
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -0,0 +1,1574 @@
1/*
2 * Module for pnfs flexfile layout driver.
3 *
4 * Copyright (c) 2014, Primary Data, Inc. All rights reserved.
5 *
6 * Tao Peng <bergwolf@primarydata.com>
7 */
8
9#include <linux/nfs_fs.h>
10#include <linux/nfs_page.h>
11#include <linux/module.h>
12
13#include <linux/sunrpc/metrics.h>
14#include <linux/nfs_idmap.h>
15
16#include "flexfilelayout.h"
17#include "../nfs4session.h"
18#include "../internal.h"
19#include "../delegation.h"
20#include "../nfs4trace.h"
21#include "../iostat.h"
22#include "../nfs.h"
23
24#define NFSDBG_FACILITY NFSDBG_PNFS_LD
25
26#define FF_LAYOUT_POLL_RETRY_MAX (15*HZ)
27
28static struct pnfs_layout_hdr *
29ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
30{
31 struct nfs4_flexfile_layout *ffl;
32
33 ffl = kzalloc(sizeof(*ffl), gfp_flags);
34 if (ffl) {
35 INIT_LIST_HEAD(&ffl->error_list);
36 return &ffl->generic_hdr;
37 } else
38 return NULL;
39}
40
41static void
42ff_layout_free_layout_hdr(struct pnfs_layout_hdr *lo)
43{
44 struct nfs4_ff_layout_ds_err *err, *n;
45
46 list_for_each_entry_safe(err, n, &FF_LAYOUT_FROM_HDR(lo)->error_list,
47 list) {
48 list_del(&err->list);
49 kfree(err);
50 }
51 kfree(FF_LAYOUT_FROM_HDR(lo));
52}
53
54static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
55{
56 __be32 *p;
57
58 p = xdr_inline_decode(xdr, NFS4_STATEID_SIZE);
59 if (unlikely(p == NULL))
60 return -ENOBUFS;
61 memcpy(stateid, p, NFS4_STATEID_SIZE);
62 dprintk("%s: stateid id= [%x%x%x%x]\n", __func__,
63 p[0], p[1], p[2], p[3]);
64 return 0;
65}
66
67static int decode_deviceid(struct xdr_stream *xdr, struct nfs4_deviceid *devid)
68{
69 __be32 *p;
70
71 p = xdr_inline_decode(xdr, NFS4_DEVICEID4_SIZE);
72 if (unlikely(!p))
73 return -ENOBUFS;
74 memcpy(devid, p, NFS4_DEVICEID4_SIZE);
75 nfs4_print_deviceid(devid);
76 return 0;
77}
78
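/*
 * The filehandle is an XDR opaque<>: a 4-byte length followed by the
 * data, padded to a 4-byte boundary (xdr_inline_decode() rounds up to
 * quad alignment when advancing the stream).
 */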
79static int decode_nfs_fh(struct xdr_stream *xdr, struct nfs_fh *fh)
80{
81 __be32 *p;
82
83 p = xdr_inline_decode(xdr, 4);
84 if (unlikely(!p))
85 return -ENOBUFS;
86 fh->size = be32_to_cpup(p++);
87 if (fh->size > sizeof(struct nfs_fh)) {
88 printk(KERN_ERR "NFS flexfiles: Too big fh received %d\n",
89 fh->size);
90 return -EOVERFLOW;
91 }
92 /* fh.data */
93 p = xdr_inline_decode(xdr, fh->size);
94 if (unlikely(!p))
95 return -ENOBUFS;
96 memcpy(&fh->data, p, fh->size);
97 dprintk("%s: fh len %d\n", __func__, fh->size);
98
99 return 0;
100}
101
102/*
103 * Currently only stringified uids and gids are accepted.
104 * I.e., kerberos is not supported to the DSes, so no principals.
105 *
106 * That means that one common function will suffice, but when
107 * principals are added, this should be split to accommodate
108 * calls to both nfs_map_name_to_uid() and nfs_map_group_to_gid().
109 */
110static int
111decode_name(struct xdr_stream *xdr, u32 *id)
112{
113 __be32 *p;
114 int len;
115
116 /* opaque_length(4)*/
117 p = xdr_inline_decode(xdr, 4);
118 if (unlikely(!p))
119 return -ENOBUFS;
120 len = be32_to_cpup(p++);
121 if (len < 0)
122 return -EINVAL;
123
124 dprintk("%s: len %u\n", __func__, len);
125
126 /* opaque body */
127 p = xdr_inline_decode(xdr, len);
128 if (unlikely(!p))
129 return -ENOBUFS;
130
131 if (!nfs_map_string_to_numeric((char *)p, len, id))
132 return -EINVAL;
133
134 return 0;
135}
136
137static void ff_layout_free_mirror_array(struct nfs4_ff_layout_segment *fls)
138{
139 int i;
140
141 if (fls->mirror_array) {
142 for (i = 0; i < fls->mirror_array_cnt; i++) {
143 /* mirror_ds is normally freed in
144 * .free_deviceid_node, but we still do it here
145 * for the .alloc_lseg error path */
146 if (fls->mirror_array[i]) {
147 kfree(fls->mirror_array[i]->fh_versions);
148 nfs4_ff_layout_put_deviceid(fls->mirror_array[i]->mirror_ds);
149 kfree(fls->mirror_array[i]);
150 }
151 }
152 kfree(fls->mirror_array);
153 fls->mirror_array = NULL;
154 }
155}
156
157static int ff_layout_check_layout(struct nfs4_layoutget_res *lgr)
158{
159 int ret = 0;
160
161 dprintk("--> %s\n", __func__);
162
163 /* FIXME: remove this check when layout segment support is added */
164 if (lgr->range.offset != 0 ||
165 lgr->range.length != NFS4_MAX_UINT64) {
166 dprintk("%s Only whole file layouts supported. Use MDS i/o\n",
167 __func__);
168 ret = -EINVAL;
169 }
170
171 dprintk("--> %s returns %d\n", __func__, ret);
172 return ret;
173}
174
175static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls)
176{
177 if (fls) {
178 ff_layout_free_mirror_array(fls);
179 kfree(fls);
180 }
181}
182
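/*
 * Simple O(n^2) exchange sort, ordering the mirrors by descending
 * efficiency so readers can take the first usable entry; mirror
 * counts are expected to be small.
 */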
183static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls)
184{
185 struct nfs4_ff_layout_mirror *tmp;
186 int i, j;
187
188 for (i = 0; i < fls->mirror_array_cnt - 1; i++) {
189 for (j = i + 1; j < fls->mirror_array_cnt; j++)
190 if (fls->mirror_array[i]->efficiency <
191 fls->mirror_array[j]->efficiency) {
192 tmp = fls->mirror_array[i];
193 fls->mirror_array[i] = fls->mirror_array[j];
194 fls->mirror_array[j] = tmp;
195 }
196 }
197}
198
199static struct pnfs_layout_segment *
200ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
201 struct nfs4_layoutget_res *lgr,
202 gfp_t gfp_flags)
203{
204 struct pnfs_layout_segment *ret;
205 struct nfs4_ff_layout_segment *fls = NULL;
206 struct xdr_stream stream;
207 struct xdr_buf buf;
208 struct page *scratch;
209 u64 stripe_unit;
210 u32 mirror_array_cnt;
211 __be32 *p;
212 int i, rc;
213
214 dprintk("--> %s\n", __func__);
215 scratch = alloc_page(gfp_flags);
216 if (!scratch)
217 return ERR_PTR(-ENOMEM);
218
219 xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages,
220 lgr->layoutp->len);
221 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
222
223 /* stripe unit and mirror_array_cnt */
224 rc = -EIO;
225 p = xdr_inline_decode(&stream, 8 + 4);
226 if (!p)
227 goto out_err_free;
228
229 p = xdr_decode_hyper(p, &stripe_unit);
230 mirror_array_cnt = be32_to_cpup(p++);
231 dprintk("%s: stripe_unit=%llu mirror_array_cnt=%u\n", __func__,
232 stripe_unit, mirror_array_cnt);
233
234 if (mirror_array_cnt > NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT ||
235 mirror_array_cnt == 0)
236 goto out_err_free;
237
238 rc = -ENOMEM;
239 fls = kzalloc(sizeof(*fls), gfp_flags);
240 if (!fls)
241 goto out_err_free;
242
243 fls->mirror_array_cnt = mirror_array_cnt;
244 fls->stripe_unit = stripe_unit;
245 fls->mirror_array = kcalloc(fls->mirror_array_cnt,
246 sizeof(fls->mirror_array[0]), gfp_flags);
247 if (fls->mirror_array == NULL)
248 goto out_err_free;
249
250 for (i = 0; i < fls->mirror_array_cnt; i++) {
251 struct nfs4_deviceid devid;
252 struct nfs4_deviceid_node *idnode;
253 u32 ds_count;
254 u32 fh_count;
255 int j;
256
257 rc = -EIO;
258 p = xdr_inline_decode(&stream, 4);
259 if (!p)
260 goto out_err_free;
261 ds_count = be32_to_cpup(p);
262
263 /* FIXME: allow for striping? */
264 if (ds_count != 1)
265 goto out_err_free;
266
267 fls->mirror_array[i] =
268 kzalloc(sizeof(struct nfs4_ff_layout_mirror),
269 gfp_flags);
270 if (fls->mirror_array[i] == NULL) {
271 rc = -ENOMEM;
272 goto out_err_free;
273 }
274
275 spin_lock_init(&fls->mirror_array[i]->lock);
276 fls->mirror_array[i]->ds_count = ds_count;
277
278 /* deviceid */
279 rc = decode_deviceid(&stream, &devid);
280 if (rc)
281 goto out_err_free;
282
283 idnode = nfs4_find_get_deviceid(NFS_SERVER(lh->plh_inode),
284 &devid, lh->plh_lc_cred,
285 gfp_flags);
286 /*
287 * Upon success, mirror_ds was allocated either by a previous
288 * GETDEVICEINFO or newly by .alloc_deviceid_node;
289 * a nfs4_find_get_deviceid() failure is in fact a GETDEVICEINFO failure
290 */
291 if (idnode)
292 fls->mirror_array[i]->mirror_ds =
293 FF_LAYOUT_MIRROR_DS(idnode);
294 else
295 goto out_err_free;
296
297 /* efficiency */
298 rc = -EIO;
299 p = xdr_inline_decode(&stream, 4);
300 if (!p)
301 goto out_err_free;
302 fls->mirror_array[i]->efficiency = be32_to_cpup(p);
303
304 /* stateid */
305 rc = decode_stateid(&stream, &fls->mirror_array[i]->stateid);
306 if (rc)
307 goto out_err_free;
308
309 /* fh */
310 p = xdr_inline_decode(&stream, 4);
311 if (!p)
312 goto out_err_free;
313 fh_count = be32_to_cpup(p);
314
315 fls->mirror_array[i]->fh_versions =
316 kcalloc(fh_count, sizeof(struct nfs_fh),
317 gfp_flags);
318 if (fls->mirror_array[i]->fh_versions == NULL) {
319 rc = -ENOMEM;
320 goto out_err_free;
321 }
322
323 for (j = 0; j < fh_count; j++) {
324 rc = decode_nfs_fh(&stream,
325 &fls->mirror_array[i]->fh_versions[j]);
326 if (rc)
327 goto out_err_free;
328 }
329
330 fls->mirror_array[i]->fh_versions_cnt = fh_count;
331
332 /* user */
333 rc = decode_name(&stream, &fls->mirror_array[i]->uid);
334 if (rc)
335 goto out_err_free;
336
337 /* group */
338 rc = decode_name(&stream, &fls->mirror_array[i]->gid);
339 if (rc)
340 goto out_err_free;
341
342 dprintk("%s: uid %d gid %d\n", __func__,
343 fls->mirror_array[i]->uid,
344 fls->mirror_array[i]->gid);
345 }
346
347 ff_layout_sort_mirrors(fls);
348 rc = ff_layout_check_layout(lgr);
349 if (rc)
350 goto out_err_free;
351
352 ret = &fls->generic_hdr;
353 dprintk("<-- %s (success)\n", __func__);
354out_free_page:
355 __free_page(scratch);
356 return ret;
357out_err_free:
358 _ff_layout_free_lseg(fls);
359 ret = ERR_PTR(rc);
360 dprintk("<-- %s (%d)\n", __func__, rc);
361 goto out_free_page;
362}
363
364static bool ff_layout_has_rw_segments(struct pnfs_layout_hdr *layout)
365{
366 struct pnfs_layout_segment *lseg;
367
368 list_for_each_entry(lseg, &layout->plh_segs, pls_list)
369 if (lseg->pls_range.iomode == IOMODE_RW)
370 return true;
371
372 return false;
373}
374
375static void
376ff_layout_free_lseg(struct pnfs_layout_segment *lseg)
377{
378 struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
379 int i;
380
381 dprintk("--> %s\n", __func__);
382
383 for (i = 0; i < fls->mirror_array_cnt; i++) {
384 if (fls->mirror_array[i]) {
385 nfs4_ff_layout_put_deviceid(fls->mirror_array[i]->mirror_ds);
386 fls->mirror_array[i]->mirror_ds = NULL;
387 if (fls->mirror_array[i]->cred) {
388 put_rpccred(fls->mirror_array[i]->cred);
389 fls->mirror_array[i]->cred = NULL;
390 }
391 }
392 }
393
394 if (lseg->pls_range.iomode == IOMODE_RW) {
395 struct nfs4_flexfile_layout *ffl;
396 struct inode *inode;
397
398 ffl = FF_LAYOUT_FROM_HDR(lseg->pls_layout);
399 inode = ffl->generic_hdr.plh_inode;
400 spin_lock(&inode->i_lock);
401 if (!ff_layout_has_rw_segments(lseg->pls_layout)) {
402 ffl->commit_info.nbuckets = 0;
403 kfree(ffl->commit_info.buckets);
404 ffl->commit_info.buckets = NULL;
405 }
406 spin_unlock(&inode->i_lock);
407 }
408 _ff_layout_free_lseg(fls);
409}
410
411/* Return 1 until we support multiple lsegs */
412static int
413ff_layout_get_lseg_count(struct nfs4_ff_layout_segment *fls)
414{
415 return 1;
416}
417
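/*
 * Allocate the commit buckets outside cinfo->lock, then recheck
 * nbuckets under the lock: if another thread won the race, drop our
 * allocation and use theirs.
 */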
418static int
419ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg,
420 struct nfs_commit_info *cinfo,
421 gfp_t gfp_flags)
422{
423 struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
424 struct pnfs_commit_bucket *buckets;
425 int size;
426
427 if (cinfo->ds->nbuckets != 0) {
428 /* This assumes there is only one RW lseg per file.
429 * To support multiple lseg per file, we need to
430 * change struct pnfs_commit_bucket to allow dynamic
431 * increasing nbuckets.
432 */
433 return 0;
434 }
435
436 size = ff_layout_get_lseg_count(fls) * FF_LAYOUT_MIRROR_COUNT(lseg);
437
438 buckets = kcalloc(size, sizeof(struct pnfs_commit_bucket),
439 gfp_flags);
440 if (!buckets)
441 return -ENOMEM;
442 else {
443 int i;
444
445 spin_lock(cinfo->lock);
446 if (cinfo->ds->nbuckets != 0)
447 kfree(buckets);
448 else {
449 cinfo->ds->buckets = buckets;
450 cinfo->ds->nbuckets = size;
451 for (i = 0; i < size; i++) {
452 INIT_LIST_HEAD(&buckets[i].written);
453 INIT_LIST_HEAD(&buckets[i].committing);
454 /* mark direct verifier as unset */
455 buckets[i].direct_verf.committed =
456 NFS_INVALID_STABLE_HOW;
457 }
458 }
459 spin_unlock(cinfo->lock);
460 return 0;
461 }
462}
463
464static struct nfs4_pnfs_ds *
465ff_layout_choose_best_ds_for_read(struct nfs_pageio_descriptor *pgio,
466 int *best_idx)
467{
468 struct nfs4_ff_layout_segment *fls;
469 struct nfs4_pnfs_ds *ds;
470 int idx;
471
472 fls = FF_LAYOUT_LSEG(pgio->pg_lseg);
473 /* mirrors are sorted by efficiency */
474 for (idx = 0; idx < fls->mirror_array_cnt; idx++) {
475 ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, idx, false);
476 if (ds) {
477 *best_idx = idx;
478 return ds;
479 }
480 }
481
482 return NULL;
483}
484
485static void
486ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
487 struct nfs_page *req)
488{
489 struct nfs_pgio_mirror *pgm;
490 struct nfs4_ff_layout_mirror *mirror;
491 struct nfs4_pnfs_ds *ds;
492 int ds_idx;
493
494 /* Use full layout for now */
495 if (!pgio->pg_lseg)
496 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
497 req->wb_context,
498 0,
499 NFS4_MAX_UINT64,
500 IOMODE_READ,
501 GFP_KERNEL);
502 /* If no lseg, fall back to read through mds */
503 if (pgio->pg_lseg == NULL)
504 goto out_mds;
505
506 ds = ff_layout_choose_best_ds_for_read(pgio, &ds_idx);
507 if (!ds)
508 goto out_mds;
509 mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx);
510
511 pgio->pg_mirror_idx = ds_idx;
512
513 /* read always uses only one mirror - idx 0 for pgio layer */
514 pgm = &pgio->pg_mirrors[0];
515 pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].rsize;
516
517 return;
518out_mds:
519 pnfs_put_lseg(pgio->pg_lseg);
520 pgio->pg_lseg = NULL;
521 nfs_pageio_reset_read_mds(pgio);
522}
523
524static void
525ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
526 struct nfs_page *req)
527{
528 struct nfs4_ff_layout_mirror *mirror;
529 struct nfs_pgio_mirror *pgm;
530 struct nfs_commit_info cinfo;
531 struct nfs4_pnfs_ds *ds;
532 int i;
533 int status;
534
535 if (!pgio->pg_lseg)
536 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
537 req->wb_context,
538 0,
539 NFS4_MAX_UINT64,
540 IOMODE_RW,
541 GFP_NOFS);
542 /* If no lseg, fall back to write through mds */
543 if (pgio->pg_lseg == NULL)
544 goto out_mds;
545
546 nfs_init_cinfo(&cinfo, pgio->pg_inode, pgio->pg_dreq);
547 status = ff_layout_alloc_commit_info(pgio->pg_lseg, &cinfo, GFP_NOFS);
548 if (status < 0)
549 goto out_mds;
550
551 /* Use a direct mapping of ds_idx to pgio mirror_idx */
552 if (WARN_ON_ONCE(pgio->pg_mirror_count !=
553 FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg)))
554 goto out_mds;
555
556 for (i = 0; i < pgio->pg_mirror_count; i++) {
557 ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, i, true);
558 if (!ds)
559 goto out_mds;
560 pgm = &pgio->pg_mirrors[i];
561 mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i);
562 pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].wsize;
563 }
564
565 return;
566
567out_mds:
568 pnfs_put_lseg(pgio->pg_lseg);
569 pgio->pg_lseg = NULL;
570 nfs_pageio_reset_write_mds(pgio);
571}
572
573static unsigned int
574ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio,
575 struct nfs_page *req)
576{
577 if (!pgio->pg_lseg)
578 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
579 req->wb_context,
580 0,
581 NFS4_MAX_UINT64,
582 IOMODE_RW,
583 GFP_NOFS);
584 if (pgio->pg_lseg)
585 return FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg);
586
587 /* no lseg means that pnfs is not in use, so no mirroring here */
588 pnfs_put_lseg(pgio->pg_lseg);
589 pgio->pg_lseg = NULL;
590 nfs_pageio_reset_write_mds(pgio);
591 return 1;
592}
593
594static const struct nfs_pageio_ops ff_layout_pg_read_ops = {
595 .pg_init = ff_layout_pg_init_read,
596 .pg_test = pnfs_generic_pg_test,
597 .pg_doio = pnfs_generic_pg_readpages,
598 .pg_cleanup = pnfs_generic_pg_cleanup,
599};
600
601static const struct nfs_pageio_ops ff_layout_pg_write_ops = {
602 .pg_init = ff_layout_pg_init_write,
603 .pg_test = pnfs_generic_pg_test,
604 .pg_doio = pnfs_generic_pg_writepages,
605 .pg_get_mirror_count = ff_layout_pg_get_mirror_count_write,
606 .pg_cleanup = pnfs_generic_pg_cleanup,
607};
608
609static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs)
610{
611 struct rpc_task *task = &hdr->task;
612
613 pnfs_layoutcommit_inode(hdr->inode, false);
614
615 if (retry_pnfs) {
616 dprintk("%s Reset task %5u for i/o through pNFS "
617 "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
618 hdr->task.tk_pid,
619 hdr->inode->i_sb->s_id,
620 (unsigned long long)NFS_FILEID(hdr->inode),
621 hdr->args.count,
622 (unsigned long long)hdr->args.offset);
623
624 if (!hdr->dreq) {
625 struct nfs_open_context *ctx;
626
627 ctx = nfs_list_entry(hdr->pages.next)->wb_context;
628 set_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
629 hdr->completion_ops->error_cleanup(&hdr->pages);
630 } else {
631 nfs_direct_set_resched_writes(hdr->dreq);
632 /* fake unstable write to let common nfs resend pages */
633 hdr->verf.committed = NFS_UNSTABLE;
634 hdr->good_bytes = 0;
635 }
636 return;
637 }
638
639 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
640 dprintk("%s Reset task %5u for i/o through MDS "
641 "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
642 hdr->task.tk_pid,
643 hdr->inode->i_sb->s_id,
644 (unsigned long long)NFS_FILEID(hdr->inode),
645 hdr->args.count,
646 (unsigned long long)hdr->args.offset);
647
648 task->tk_status = pnfs_write_done_resend_to_mds(hdr);
649 }
650}
651
652static void ff_layout_reset_read(struct nfs_pgio_header *hdr)
653{
654 struct rpc_task *task = &hdr->task;
655
656 pnfs_layoutcommit_inode(hdr->inode, false);
657
658 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
659 dprintk("%s Reset task %5u for i/o through MDS "
660 "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
661 hdr->task.tk_pid,
662 hdr->inode->i_sb->s_id,
663 (unsigned long long)NFS_FILEID(hdr->inode),
664 hdr->args.count,
665 (unsigned long long)hdr->args.offset);
666
667 task->tk_status = pnfs_read_done_resend_to_mds(hdr);
668 }
669}
670
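/*
 * Returns 0 when the error is final (or already handled), -EAGAIN to
 * restart the RPC, and -NFS4ERR_RESET_TO_PNFS / -NFS4ERR_RESET_TO_MDS
 * to make the caller redrive the I/O through another DS or the MDS.
 */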
671static int ff_layout_async_handle_error_v4(struct rpc_task *task,
672 struct nfs4_state *state,
673 struct nfs_client *clp,
674 struct pnfs_layout_segment *lseg,
675 int idx)
676{
677 struct pnfs_layout_hdr *lo = lseg->pls_layout;
678 struct inode *inode = lo->plh_inode;
679 struct nfs_server *mds_server = NFS_SERVER(inode);
680
681 struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
682 struct nfs_client *mds_client = mds_server->nfs_client;
683 struct nfs4_slot_table *tbl = &clp->cl_session->fc_slot_table;
684
685 if (task->tk_status >= 0)
686 return 0;
687
688 switch (task->tk_status) {
689 /* MDS state errors */
690 case -NFS4ERR_DELEG_REVOKED:
691 case -NFS4ERR_ADMIN_REVOKED:
692 case -NFS4ERR_BAD_STATEID:
693 if (state == NULL)
694 break;
695 nfs_remove_bad_delegation(state->inode);
696 case -NFS4ERR_OPENMODE:
697 if (state == NULL)
698 break;
699 if (nfs4_schedule_stateid_recovery(mds_server, state) < 0)
700 goto out_bad_stateid;
701 goto wait_on_recovery;
702 case -NFS4ERR_EXPIRED:
703 if (state != NULL) {
704 if (nfs4_schedule_stateid_recovery(mds_server, state) < 0)
705 goto out_bad_stateid;
706 }
707 nfs4_schedule_lease_recovery(mds_client);
708 goto wait_on_recovery;
709 /* DS session errors */
710 case -NFS4ERR_BADSESSION:
711 case -NFS4ERR_BADSLOT:
712 case -NFS4ERR_BAD_HIGH_SLOT:
713 case -NFS4ERR_DEADSESSION:
714 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
715 case -NFS4ERR_SEQ_FALSE_RETRY:
716 case -NFS4ERR_SEQ_MISORDERED:
717 dprintk("%s ERROR %d, Reset session. Exchangeid "
718 "flags 0x%x\n", __func__, task->tk_status,
719 clp->cl_exchange_flags);
720 nfs4_schedule_session_recovery(clp->cl_session, task->tk_status);
721 break;
722 case -NFS4ERR_DELAY:
723 case -NFS4ERR_GRACE:
724 rpc_delay(task, FF_LAYOUT_POLL_RETRY_MAX);
725 break;
726 case -NFS4ERR_RETRY_UNCACHED_REP:
727 break;
728 /* Invalidate Layout errors */
729 case -NFS4ERR_PNFS_NO_LAYOUT:
730 case -ESTALE: /* mapped NFS4ERR_STALE */
731 case -EBADHANDLE: /* mapped NFS4ERR_BADHANDLE */
732 case -EISDIR: /* mapped NFS4ERR_ISDIR */
733 case -NFS4ERR_FHEXPIRED:
734 case -NFS4ERR_WRONG_TYPE:
735 dprintk("%s Invalid layout error %d\n", __func__,
736 task->tk_status);
737 /*
738 * Destroy layout so new i/o will get a new layout.
739 * Layout will not be destroyed until all current lseg
740 * references are put. Mark layout as invalid to resend failed
741 * i/o and all i/o waiting on the slot table to the MDS until
742 * layout is destroyed and a new valid layout is obtained.
743 */
744 pnfs_destroy_layout(NFS_I(inode));
745 rpc_wake_up(&tbl->slot_tbl_waitq);
746 goto reset;
747 /* RPC connection errors */
748 case -ECONNREFUSED:
749 case -EHOSTDOWN:
750 case -EHOSTUNREACH:
751 case -ENETUNREACH:
752 case -EIO:
753 case -ETIMEDOUT:
754 case -EPIPE:
755 dprintk("%s DS connection error %d\n", __func__,
756 task->tk_status);
757 nfs4_mark_deviceid_unavailable(devid);
758 rpc_wake_up(&tbl->slot_tbl_waitq);
759 /* fall through */
760 default:
761 if (ff_layout_has_available_ds(lseg))
762 return -NFS4ERR_RESET_TO_PNFS;
763reset:
764 dprintk("%s Retry through MDS. Error %d\n", __func__,
765 task->tk_status);
766 return -NFS4ERR_RESET_TO_MDS;
767 }
768out:
769 task->tk_status = 0;
770 return -EAGAIN;
771out_bad_stateid:
772 task->tk_status = -EIO;
773 return 0;
774wait_on_recovery:
775 rpc_sleep_on(&mds_client->cl_rpcwaitq, task, NULL);
776 if (test_bit(NFS4CLNT_MANAGER_RUNNING, &mds_client->cl_state) == 0)
777 rpc_wake_up_queued_task(&mds_client->cl_rpcwaitq, task);
778 goto out;
779}
780
781/* Retry all errors through either pNFS or MDS except for -EJUKEBOX */
782static int ff_layout_async_handle_error_v3(struct rpc_task *task,
783 struct pnfs_layout_segment *lseg,
784 int idx)
785{
786 struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
787
788 if (task->tk_status >= 0)
789 return 0;
790
791 if (task->tk_status != -EJUKEBOX) {
792 dprintk("%s DS connection error %d\n", __func__,
793 task->tk_status);
794 nfs4_mark_deviceid_unavailable(devid);
795 if (ff_layout_has_available_ds(lseg))
796 return -NFS4ERR_RESET_TO_PNFS;
797 else
798 return -NFS4ERR_RESET_TO_MDS;
799 }
800
801 if (task->tk_status == -EJUKEBOX)
802 nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY);
803 task->tk_status = 0;
804 rpc_restart_call(task);
805 rpc_delay(task, NFS_JUKEBOX_RETRY_TIME);
806 return -EAGAIN;
807}
808
809static int ff_layout_async_handle_error(struct rpc_task *task,
810 struct nfs4_state *state,
811 struct nfs_client *clp,
812 struct pnfs_layout_segment *lseg,
813 int idx)
814{
815 int vers = clp->cl_nfs_mod->rpc_vers->number;
816
817 switch (vers) {
818 case 3:
819 return ff_layout_async_handle_error_v3(task, lseg, idx);
820 case 4:
821 return ff_layout_async_handle_error_v4(task, state, clp,
822 lseg, idx);
823 default:
824 /* should never happen */
825 WARN_ON_ONCE(1);
826 return 0;
827 }
828}
829
830static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
831 int idx, u64 offset, u64 length,
832 u32 status, int opnum)
833{
834 struct nfs4_ff_layout_mirror *mirror;
835 int err;
836
837 mirror = FF_LAYOUT_COMP(lseg, idx);
838 err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
839 mirror, offset, length, status, opnum,
840 GFP_NOIO);
841 dprintk("%s: err %d op %d status %u\n", __func__, err, opnum, status);
842}
843
844/* NFS_PROTO call done callback routines */
845
846static int ff_layout_read_done_cb(struct rpc_task *task,
847 struct nfs_pgio_header *hdr)
848{
849 struct inode *inode;
850 int err;
851
852 trace_nfs4_pnfs_read(hdr, task->tk_status);
853 if (task->tk_status == -ETIMEDOUT && !hdr->res.op_status)
854 hdr->res.op_status = NFS4ERR_NXIO;
855 if (task->tk_status < 0 && hdr->res.op_status)
856 ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx,
857 hdr->args.offset, hdr->args.count,
858 hdr->res.op_status, OP_READ);
859 err = ff_layout_async_handle_error(task, hdr->args.context->state,
860 hdr->ds_clp, hdr->lseg,
861 hdr->pgio_mirror_idx);
862
863 switch (err) {
864 case -NFS4ERR_RESET_TO_PNFS:
865 set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
866 &hdr->lseg->pls_layout->plh_flags);
867 pnfs_read_resend_pnfs(hdr);
868 return task->tk_status;
869 case -NFS4ERR_RESET_TO_MDS:
870 inode = hdr->lseg->pls_layout->plh_inode;
871 pnfs_error_mark_layout_for_return(inode, hdr->lseg);
872 ff_layout_reset_read(hdr);
873 return task->tk_status;
874 case -EAGAIN:
875 rpc_restart_call_prepare(task);
876 return -EAGAIN;
877 }
878
879 return 0;
880}
881
882/*
883 * We reference the rpc_cred of the first WRITE that triggers the need for
884 * a LAYOUTCOMMIT, and use it to send the layoutcommit compound.
885 * rfc5661 is not clear about which credential should be used.
886 *
887 * A flexfiles client should treat a FILE_SYNC reply from the DS as
888 * DATA_SYNC, so per http://www.rfc-editor.org/errata_search.php?rfc=5661&eid=2751
889 * we always send a layoutcommit after DS writes.
890 */
891static void
892ff_layout_set_layoutcommit(struct nfs_pgio_header *hdr)
893{
894 pnfs_set_layoutcommit(hdr);
895 dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino,
896 (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb);
897}
898
899static bool
900ff_layout_reset_to_mds(struct pnfs_layout_segment *lseg, int idx)
901{
902 /* No mirroring for now */
903 struct nfs4_deviceid_node *node = FF_LAYOUT_DEVID_NODE(lseg, idx);
904
905 return ff_layout_test_devid_unavailable(node);
906}
907
908static int ff_layout_read_prepare_common(struct rpc_task *task,
909 struct nfs_pgio_header *hdr)
910{
911 if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
912 rpc_exit(task, -EIO);
913 return -EIO;
914 }
915 if (ff_layout_reset_to_mds(hdr->lseg, hdr->pgio_mirror_idx)) {
916 dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid);
917 if (ff_layout_has_available_ds(hdr->lseg))
918 pnfs_read_resend_pnfs(hdr);
919 else
920 ff_layout_reset_read(hdr);
921 rpc_exit(task, 0);
922 return -EAGAIN;
923 }
924 hdr->pgio_done_cb = ff_layout_read_done_cb;
925
926 return 0;
927}
928
929/*
930 * Call ops for the async read/write cases
931 * In the case of dense layouts, the offset needs to be reset to its
932 * original value.
933 */
934static void ff_layout_read_prepare_v3(struct rpc_task *task, void *data)
935{
936 struct nfs_pgio_header *hdr = data;
937
938 if (ff_layout_read_prepare_common(task, hdr))
939 return;
940
941 rpc_call_start(task);
942}
943
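/*
 * A DS may be NFSv4.0 (bare slot table) or NFSv4.1+ (full session);
 * choose the matching sequence setup.
 */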
944static int ff_layout_setup_sequence(struct nfs_client *ds_clp,
945 struct nfs4_sequence_args *args,
946 struct nfs4_sequence_res *res,
947 struct rpc_task *task)
948{
949 if (ds_clp->cl_session)
950 return nfs41_setup_sequence(ds_clp->cl_session,
951 args,
952 res,
953 task);
954 return nfs40_setup_sequence(ds_clp->cl_slot_tbl,
955 args,
956 res,
957 task);
958}
959
960static void ff_layout_read_prepare_v4(struct rpc_task *task, void *data)
961{
962 struct nfs_pgio_header *hdr = data;
963
964 if (ff_layout_read_prepare_common(task, hdr))
965 return;
966
967 if (ff_layout_setup_sequence(hdr->ds_clp,
968 &hdr->args.seq_args,
969 &hdr->res.seq_res,
970 task))
971 return;
972
973 if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
974 hdr->args.lock_context, FMODE_READ) == -EIO)
975 rpc_exit(task, -EIO); /* lost lock, terminate I/O */
976}
977
978static void ff_layout_read_call_done(struct rpc_task *task, void *data)
979{
980 struct nfs_pgio_header *hdr = data;
981
982 dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
983
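	/* I/O is being resent; only the session sequencing needs to finish */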
984 if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
985 task->tk_status == 0) {
986 nfs4_sequence_done(task, &hdr->res.seq_res);
987 return;
988 }
989
990 /* Note this may cause RPC to be resent */
991 hdr->mds_ops->rpc_call_done(task, hdr);
992}
993
994static void ff_layout_read_count_stats(struct rpc_task *task, void *data)
995{
996 struct nfs_pgio_header *hdr = data;
997
998 rpc_count_iostats_metrics(task,
999 &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_READ]);
1000}
1001
1002static int ff_layout_write_done_cb(struct rpc_task *task,
1003 struct nfs_pgio_header *hdr)
1004{
1005 struct inode *inode;
1006 int err;
1007
1008 trace_nfs4_pnfs_write(hdr, task->tk_status);
1009 if (task->tk_status == -ETIMEDOUT && !hdr->res.op_status)
1010 hdr->res.op_status = NFS4ERR_NXIO;
1011 if (task->tk_status < 0 && hdr->res.op_status)
1012 ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx,
1013 hdr->args.offset, hdr->args.count,
1014 hdr->res.op_status, OP_WRITE);
1015 err = ff_layout_async_handle_error(task, hdr->args.context->state,
1016 hdr->ds_clp, hdr->lseg,
1017 hdr->pgio_mirror_idx);
1018
1019 switch (err) {
1020 case -NFS4ERR_RESET_TO_PNFS:
1021 case -NFS4ERR_RESET_TO_MDS:
1022 inode = hdr->lseg->pls_layout->plh_inode;
1023 pnfs_error_mark_layout_for_return(inode, hdr->lseg);
1024 if (err == -NFS4ERR_RESET_TO_PNFS) {
1025 pnfs_set_retry_layoutget(hdr->lseg->pls_layout);
1026 ff_layout_reset_write(hdr, true);
1027 } else {
1028 pnfs_clear_retry_layoutget(hdr->lseg->pls_layout);
1029 ff_layout_reset_write(hdr, false);
1030 }
1031 return task->tk_status;
1032 case -EAGAIN:
1033 rpc_restart_call_prepare(task);
1034 return -EAGAIN;
1035 }
1036
1037 if (hdr->res.verf->committed == NFS_FILE_SYNC ||
1038 hdr->res.verf->committed == NFS_DATA_SYNC)
1039 ff_layout_set_layoutcommit(hdr);
1040
1041 return 0;
1042}
1043
1044static int ff_layout_commit_done_cb(struct rpc_task *task,
1045 struct nfs_commit_data *data)
1046{
1047 struct inode *inode;
1048 int err;
1049
1050 trace_nfs4_pnfs_commit_ds(data, task->tk_status);
1051 if (task->tk_status == -ETIMEDOUT && !data->res.op_status)
1052 data->res.op_status = NFS4ERR_NXIO;
1053 if (task->tk_status < 0 && data->res.op_status)
1054 ff_layout_io_track_ds_error(data->lseg, data->ds_commit_index,
1055 data->args.offset, data->args.count,
1056 data->res.op_status, OP_COMMIT);
1057 err = ff_layout_async_handle_error(task, NULL, data->ds_clp,
1058 data->lseg, data->ds_commit_index);
1059
1060 switch (err) {
1061 case -NFS4ERR_RESET_TO_PNFS:
1062 case -NFS4ERR_RESET_TO_MDS:
1063 inode = data->lseg->pls_layout->plh_inode;
1064 pnfs_error_mark_layout_for_return(inode, data->lseg);
1065 if (err == -NFS4ERR_RESET_TO_PNFS)
1066 pnfs_set_retry_layoutget(data->lseg->pls_layout);
1067 else
1068 pnfs_clear_retry_layoutget(data->lseg->pls_layout);
1069 pnfs_generic_prepare_to_resend_writes(data);
1070 return -EAGAIN;
1071 case -EAGAIN:
1072 rpc_restart_call_prepare(task);
1073 return -EAGAIN;
1074 }
1075
1076 if (data->verf.committed == NFS_UNSTABLE)
1077 pnfs_commit_set_layoutcommit(data);
1078
1079 return 0;
1080}
1081
1082static int ff_layout_write_prepare_common(struct rpc_task *task,
1083 struct nfs_pgio_header *hdr)
1084{
1085 if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
1086 rpc_exit(task, -EIO);
1087 return -EIO;
1088 }
1089
1090 if (ff_layout_reset_to_mds(hdr->lseg, hdr->pgio_mirror_idx)) {
1091 bool retry_pnfs;
1092
1093 retry_pnfs = ff_layout_has_available_ds(hdr->lseg);
1094 dprintk("%s task %u reset io to %s\n", __func__,
1095 task->tk_pid, retry_pnfs ? "pNFS" : "MDS");
1096 ff_layout_reset_write(hdr, retry_pnfs);
1097 rpc_exit(task, 0);
1098 return -EAGAIN;
1099 }
1100
1101 return 0;
1102}
1103
1104static void ff_layout_write_prepare_v3(struct rpc_task *task, void *data)
1105{
1106 struct nfs_pgio_header *hdr = data;
1107
1108 if (ff_layout_write_prepare_common(task, hdr))
1109 return;
1110
1111 rpc_call_start(task);
1112}
1113
1114static void ff_layout_write_prepare_v4(struct rpc_task *task, void *data)
1115{
1116 struct nfs_pgio_header *hdr = data;
1117
1118 if (ff_layout_write_prepare_common(task, hdr))
1119 return;
1120
1121 if (ff_layout_setup_sequence(hdr->ds_clp,
1122 &hdr->args.seq_args,
1123 &hdr->res.seq_res,
1124 task))
1125 return;
1126
1127 if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
1128 hdr->args.lock_context, FMODE_WRITE) == -EIO)
1129 rpc_exit(task, -EIO); /* lost lock, terminate I/O */
1130}
1131
1132static void ff_layout_write_call_done(struct rpc_task *task, void *data)
1133{
1134 struct nfs_pgio_header *hdr = data;
1135
1136 if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
1137 task->tk_status == 0) {
1138 nfs4_sequence_done(task, &hdr->res.seq_res);
1139 return;
1140 }
1141
1142 /* Note this may cause RPC to be resent */
1143 hdr->mds_ops->rpc_call_done(task, hdr);
1144}
1145
1146static void ff_layout_write_count_stats(struct rpc_task *task, void *data)
1147{
1148 struct nfs_pgio_header *hdr = data;
1149
1150 rpc_count_iostats_metrics(task,
1151 &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_WRITE]);
1152}
1153
1154static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data)
1155{
1156 rpc_call_start(task);
1157}
1158
1159static void ff_layout_commit_prepare_v4(struct rpc_task *task, void *data)
1160{
1161 struct nfs_commit_data *wdata = data;
1162
1163 ff_layout_setup_sequence(wdata->ds_clp,
1164 &wdata->args.seq_args,
1165 &wdata->res.seq_res,
1166 task);
1167}
1168
1169static void ff_layout_commit_count_stats(struct rpc_task *task, void *data)
1170{
1171 struct nfs_commit_data *cdata = data;
1172
1173 rpc_count_iostats_metrics(task,
1174 &NFS_CLIENT(cdata->inode)->cl_metrics[NFSPROC4_CLNT_COMMIT]);
1175}
1176
1177static const struct rpc_call_ops ff_layout_read_call_ops_v3 = {
1178 .rpc_call_prepare = ff_layout_read_prepare_v3,
1179 .rpc_call_done = ff_layout_read_call_done,
1180 .rpc_count_stats = ff_layout_read_count_stats,
1181 .rpc_release = pnfs_generic_rw_release,
1182};
1183
1184static const struct rpc_call_ops ff_layout_read_call_ops_v4 = {
1185 .rpc_call_prepare = ff_layout_read_prepare_v4,
1186 .rpc_call_done = ff_layout_read_call_done,
1187 .rpc_count_stats = ff_layout_read_count_stats,
1188 .rpc_release = pnfs_generic_rw_release,
1189};
1190
1191static const struct rpc_call_ops ff_layout_write_call_ops_v3 = {
1192 .rpc_call_prepare = ff_layout_write_prepare_v3,
1193 .rpc_call_done = ff_layout_write_call_done,
1194 .rpc_count_stats = ff_layout_write_count_stats,
1195 .rpc_release = pnfs_generic_rw_release,
1196};
1197
1198static const struct rpc_call_ops ff_layout_write_call_ops_v4 = {
1199 .rpc_call_prepare = ff_layout_write_prepare_v4,
1200 .rpc_call_done = ff_layout_write_call_done,
1201 .rpc_count_stats = ff_layout_write_count_stats,
1202 .rpc_release = pnfs_generic_rw_release,
1203};
1204
1205static const struct rpc_call_ops ff_layout_commit_call_ops_v3 = {
1206 .rpc_call_prepare = ff_layout_commit_prepare_v3,
1207 .rpc_call_done = pnfs_generic_write_commit_done,
1208 .rpc_count_stats = ff_layout_commit_count_stats,
1209 .rpc_release = pnfs_generic_commit_release,
1210};
1211
1212static const struct rpc_call_ops ff_layout_commit_call_ops_v4 = {
1213 .rpc_call_prepare = ff_layout_commit_prepare_v4,
1214 .rpc_call_done = pnfs_generic_write_commit_done,
1215 .rpc_count_stats = ff_layout_commit_count_stats,
1216 .rpc_release = pnfs_generic_commit_release,
1217};
1218
1219static enum pnfs_try_status
1220ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
1221{
1222 struct pnfs_layout_segment *lseg = hdr->lseg;
1223 struct nfs4_pnfs_ds *ds;
1224 struct rpc_clnt *ds_clnt;
1225 struct rpc_cred *ds_cred;
1226 loff_t offset = hdr->args.offset;
1227 u32 idx = hdr->pgio_mirror_idx;
1228 int vers;
1229 struct nfs_fh *fh;
1230
1231 dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n",
1232 __func__, hdr->inode->i_ino,
1233 hdr->args.pgbase, (size_t)hdr->args.count, offset);
1234
1235 ds = nfs4_ff_layout_prepare_ds(lseg, idx, false);
1236 if (!ds)
1237 goto out_failed;
1238
1239 ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp,
1240 hdr->inode);
1241 if (IS_ERR(ds_clnt))
1242 goto out_failed;
1243
1244 ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred);
1245 if (IS_ERR(ds_cred))
1246 goto out_failed;
1247
1248 vers = nfs4_ff_layout_ds_version(lseg, idx);
1249
1250 dprintk("%s USE DS: %s cl_count %d vers %d\n", __func__,
1251 ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count), vers);
1252
1253 atomic_inc(&ds->ds_clp->cl_count);
1254 hdr->ds_clp = ds->ds_clp;
1255 fh = nfs4_ff_layout_select_ds_fh(lseg, idx);
1256 if (fh)
1257 hdr->args.fh = fh;
1258
1259 /*
1260 * Note that if we ever decide to split across DSes,
1261 * then we may need to handle dense-like offsets.
1262 */
1263 hdr->args.offset = offset;
1264 hdr->mds_offset = offset;
1265
1266 /* Perform an asynchronous read to ds */
1267 nfs_initiate_pgio(ds_clnt, hdr, ds_cred, ds->ds_clp->rpc_ops,
1268 vers == 3 ? &ff_layout_read_call_ops_v3 :
1269 &ff_layout_read_call_ops_v4,
1270 0, RPC_TASK_SOFTCONN);
1271
1272 return PNFS_ATTEMPTED;
1273
1274out_failed:
1275 if (ff_layout_has_available_ds(lseg))
1276 return PNFS_TRY_AGAIN;
1277 return PNFS_NOT_ATTEMPTED;
1278}
1279
1280/* Perform async writes. */
1281static enum pnfs_try_status
1282ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
1283{
1284 struct pnfs_layout_segment *lseg = hdr->lseg;
1285 struct nfs4_pnfs_ds *ds;
1286 struct rpc_clnt *ds_clnt;
1287 struct rpc_cred *ds_cred;
1288 loff_t offset = hdr->args.offset;
1289 int vers;
1290 struct nfs_fh *fh;
1291 int idx = hdr->pgio_mirror_idx;
1292
1293 ds = nfs4_ff_layout_prepare_ds(lseg, idx, true);
1294 if (!ds)
1295 return PNFS_NOT_ATTEMPTED;
1296
1297 ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp,
1298 hdr->inode);
1299 if (IS_ERR(ds_clnt))
1300 return PNFS_NOT_ATTEMPTED;
1301
1302 ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred);
1303 if (IS_ERR(ds_cred))
1304 return PNFS_NOT_ATTEMPTED;
1305
1306 vers = nfs4_ff_layout_ds_version(lseg, idx);
1307
1308 dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s cl_count %d vers %d\n",
1309 __func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count,
1310 offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count),
1311 vers);
1312
1313 hdr->pgio_done_cb = ff_layout_write_done_cb;
1314 atomic_inc(&ds->ds_clp->cl_count);
1315 hdr->ds_clp = ds->ds_clp;
1316 hdr->ds_commit_idx = idx;
1317 fh = nfs4_ff_layout_select_ds_fh(lseg, idx);
1318 if (fh)
1319 hdr->args.fh = fh;
1320
1321 /*
1322 * Note that if we ever decide to split across DSes,
1323 * then we may need to handle dense-like offsets.
1324 */
1325 hdr->args.offset = offset;
1326
1327 /* Perform an asynchronous write */
1328 nfs_initiate_pgio(ds_clnt, hdr, ds_cred, ds->ds_clp->rpc_ops,
1329 vers == 3 ? &ff_layout_write_call_ops_v3 :
1330 &ff_layout_write_call_ops_v4,
1331 sync, RPC_TASK_SOFTCONN);
1332 return PNFS_ATTEMPTED;
1333}
1334
1335static void
1336ff_layout_mark_request_commit(struct nfs_page *req,
1337 struct pnfs_layout_segment *lseg,
1338 struct nfs_commit_info *cinfo,
1339 u32 ds_commit_idx)
1340{
1341 struct list_head *list;
1342 struct pnfs_commit_bucket *buckets;
1343
1344 spin_lock(cinfo->lock);
1345 buckets = cinfo->ds->buckets;
1346 list = &buckets[ds_commit_idx].written;
1347 if (list_empty(list)) {
1348 /* Non-empty buckets hold a reference on the lseg. That ref
1349 * is normally transferred to the COMMIT call and released
1350 * there. It could also be released if the last req is pulled
1351 * off due to a rewrite, in which case it will be done in
1352 * pnfs_common_clear_request_commit
1353 */
1354 WARN_ON_ONCE(buckets[ds_commit_idx].wlseg != NULL);
1355 buckets[ds_commit_idx].wlseg = pnfs_get_lseg(lseg);
1356 }
1357 set_bit(PG_COMMIT_TO_DS, &req->wb_flags);
1358 cinfo->ds->nwritten++;
1359
1360 /* Open-coded nfs_request_add_commit_list(): we need to add req to
1361 * the list without dropping the cinfo lock.
1362 */
1363 set_bit(PG_CLEAN, &(req)->wb_flags);
1364 nfs_list_add_request(req, list);
1365 cinfo->mds->ncommit++;
1366 spin_unlock(cinfo->lock);
1367 if (!cinfo->dreq) {
1368 inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
1369 inc_bdi_stat(inode_to_bdi(page_file_mapping(req->wb_page)->host),
1370 BDI_RECLAIMABLE);
1371 __mark_inode_dirty(req->wb_context->dentry->d_inode,
1372 I_DIRTY_DATASYNC);
1373 }
1374}
1375
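/* Commit buckets map 1:1 onto mirrors, so the bucket index is the DS index */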
1376static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i)
1377{
1378 return i;
1379}
1380
1381static struct nfs_fh *
1382select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i)
1383{
1384 struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg);
1385
1386 /* FIXME: Assume that there is only one NFS version available
1387 * for the DS.
1388 */
1389 return &flseg->mirror_array[i]->fh_versions[0];
1390}
1391
1392static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
1393{
1394 struct pnfs_layout_segment *lseg = data->lseg;
1395 struct nfs4_pnfs_ds *ds;
1396 struct rpc_clnt *ds_clnt;
1397 struct rpc_cred *ds_cred;
1398 u32 idx;
1399 int vers;
1400 struct nfs_fh *fh;
1401
1402 idx = calc_ds_index_from_commit(lseg, data->ds_commit_index);
1403 ds = nfs4_ff_layout_prepare_ds(lseg, idx, true);
1404 if (!ds)
1405 goto out_err;
1406
1407 ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp,
1408 data->inode);
1409 if (IS_ERR(ds_clnt))
1410 goto out_err;
1411
1412 ds_cred = ff_layout_get_ds_cred(lseg, idx, data->cred);
1413 if (IS_ERR(ds_cred))
1414 goto out_err;
1415
1416 vers = nfs4_ff_layout_ds_version(lseg, idx);
1417
1418 dprintk("%s ino %lu, how %d cl_count %d vers %d\n", __func__,
1419 data->inode->i_ino, how, atomic_read(&ds->ds_clp->cl_count),
1420 vers);
1421 data->commit_done_cb = ff_layout_commit_done_cb;
1422 data->cred = ds_cred;
1423 atomic_inc(&ds->ds_clp->cl_count);
1424 data->ds_clp = ds->ds_clp;
1425 fh = select_ds_fh_from_commit(lseg, data->ds_commit_index);
1426 if (fh)
1427 data->args.fh = fh;
1428 return nfs_initiate_commit(ds_clnt, data, ds->ds_clp->rpc_ops,
1429 vers == 3 ? &ff_layout_commit_call_ops_v3 :
1430 &ff_layout_commit_call_ops_v4,
1431 how, RPC_TASK_SOFTCONN);
1432out_err:
1433 pnfs_generic_prepare_to_resend_writes(data);
1434 pnfs_generic_commit_release(data);
1435 return -EAGAIN;
1436}
1437
1438static int
1439ff_layout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
1440 int how, struct nfs_commit_info *cinfo)
1441{
1442 return pnfs_generic_commit_pagelist(inode, mds_pages, how, cinfo,
1443 ff_layout_initiate_commit);
1444}
1445
1446static struct pnfs_ds_commit_info *
1447ff_layout_get_ds_info(struct inode *inode)
1448{
1449 struct pnfs_layout_hdr *layout = NFS_I(inode)->layout;
1450
1451 if (layout == NULL)
1452 return NULL;
1453
1454 return &FF_LAYOUT_FROM_HDR(layout)->commit_info;
1455}
1456
1457static void
1458ff_layout_free_deviceid_node(struct nfs4_deviceid_node *d)
1459{
1460 nfs4_ff_layout_free_deviceid(container_of(d, struct nfs4_ff_layout_ds,
1461 id_node));
1462}
1463
1464static int ff_layout_encode_ioerr(struct nfs4_flexfile_layout *flo,
1465 struct xdr_stream *xdr,
1466 const struct nfs4_layoutreturn_args *args)
1467{
1468 struct pnfs_layout_hdr *hdr = &flo->generic_hdr;
1469 __be32 *start;
1470 int count = 0, ret = 0;
1471
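	/* Reserve the 4-byte error-count slot now; backfill it after encoding */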
1472 start = xdr_reserve_space(xdr, 4);
1473 if (unlikely(!start))
1474 return -E2BIG;
1475
1476 /* This assumes we always return _ALL_ layouts */
1477 spin_lock(&hdr->plh_inode->i_lock);
1478 ret = ff_layout_encode_ds_ioerr(flo, xdr, &count, &args->range);
1479 spin_unlock(&hdr->plh_inode->i_lock);
1480
1481 *start = cpu_to_be32(count);
1482
1483 return ret;
1484}
1485
1486/* report nothing for now */
1487static void ff_layout_encode_iostats(struct nfs4_flexfile_layout *flo,
1488 struct xdr_stream *xdr,
1489 const struct nfs4_layoutreturn_args *args)
1490{
1491 __be32 *p;
1492
1493 p = xdr_reserve_space(xdr, 4);
1494 if (likely(p))
1495 *p = cpu_to_be32(0);
1496}
1497
1498static struct nfs4_deviceid_node *
1499ff_layout_alloc_deviceid_node(struct nfs_server *server,
1500 struct pnfs_device *pdev, gfp_t gfp_flags)
1501{
1502 struct nfs4_ff_layout_ds *dsaddr;
1503
1504 dsaddr = nfs4_ff_alloc_deviceid_node(server, pdev, gfp_flags);
1505 if (!dsaddr)
1506 return NULL;
1507 return &dsaddr->id_node;
1508}
1509
1510static void
1511ff_layout_encode_layoutreturn(struct pnfs_layout_hdr *lo,
1512 struct xdr_stream *xdr,
1513 const struct nfs4_layoutreturn_args *args)
1514{
1515 struct nfs4_flexfile_layout *flo = FF_LAYOUT_FROM_HDR(lo);
1516 __be32 *start;
1517
1518 dprintk("%s: Begin\n", __func__);
1519 start = xdr_reserve_space(xdr, 4);
1520 BUG_ON(!start);
1521
1522 if (ff_layout_encode_ioerr(flo, xdr, args))
1523 goto out;
1524
1525 ff_layout_encode_iostats(flo, xdr, args);
1526out:
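	/*
	 * Backfill the opaque body length: (xdr->p - start - 1) is the
	 * number of 32-bit words encoded after the reserved length slot.
	 */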
1527 *start = cpu_to_be32((xdr->p - start - 1) * 4);
1528 dprintk("%s: Return\n", __func__);
1529}
1530
1531static struct pnfs_layoutdriver_type flexfilelayout_type = {
1532 .id = LAYOUT_FLEX_FILES,
1533 .name = "LAYOUT_FLEX_FILES",
1534 .owner = THIS_MODULE,
1535 .alloc_layout_hdr = ff_layout_alloc_layout_hdr,
1536 .free_layout_hdr = ff_layout_free_layout_hdr,
1537 .alloc_lseg = ff_layout_alloc_lseg,
1538 .free_lseg = ff_layout_free_lseg,
1539 .pg_read_ops = &ff_layout_pg_read_ops,
1540 .pg_write_ops = &ff_layout_pg_write_ops,
1541 .get_ds_info = ff_layout_get_ds_info,
1542 .free_deviceid_node = ff_layout_free_deviceid_node,
1543 .mark_request_commit = ff_layout_mark_request_commit,
1544 .clear_request_commit = pnfs_generic_clear_request_commit,
1545 .scan_commit_lists = pnfs_generic_scan_commit_lists,
1546 .recover_commit_reqs = pnfs_generic_recover_commit_reqs,
1547 .commit_pagelist = ff_layout_commit_pagelist,
1548 .read_pagelist = ff_layout_read_pagelist,
1549 .write_pagelist = ff_layout_write_pagelist,
1550 .alloc_deviceid_node = ff_layout_alloc_deviceid_node,
1551 .encode_layoutreturn = ff_layout_encode_layoutreturn,
1552};
1553
1554static int __init nfs4flexfilelayout_init(void)
1555{
1556 printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Registering...\n",
1557 __func__);
1558 return pnfs_register_layoutdriver(&flexfilelayout_type);
1559}
1560
1561static void __exit nfs4flexfilelayout_exit(void)
1562{
1563 printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Unregistering...\n",
1564 __func__);
1565 pnfs_unregister_layoutdriver(&flexfilelayout_type);
1566}
1567
1568MODULE_ALIAS("nfs-layouttype4-4");
1569
1570MODULE_LICENSE("GPL");
1571MODULE_DESCRIPTION("The NFSv4 flexfile layout driver");
1572
1573module_init(nfs4flexfilelayout_init);
1574module_exit(nfs4flexfilelayout_exit);
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h
new file mode 100644
index 000000000000..070f20445b2d
--- /dev/null
+++ b/fs/nfs/flexfilelayout/flexfilelayout.h
@@ -0,0 +1,155 @@
1/*
2 * NFSv4 flexfile layout driver data structures.
3 *
4 * Copyright (c) 2014, Primary Data, Inc. All rights reserved.
5 *
6 * Tao Peng <bergwolf@primarydata.com>
7 */
8
9#ifndef FS_NFS_NFS4FLEXFILELAYOUT_H
10#define FS_NFS_NFS4FLEXFILELAYOUT_H
11
12#include "../pnfs.h"
13
14/* XXX: Filter out an insanely large mirror count for now, to avoid OOM
15 * due to network errors etc. */
16#define NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT 4096
17
18struct nfs4_ff_ds_version {
19 u32 version;
20 u32 minor_version;
21 u32 rsize;
22 u32 wsize;
23 bool tightly_coupled;
24};
25
26/* chained in global deviceid hlist */
27struct nfs4_ff_layout_ds {
28 struct nfs4_deviceid_node id_node;
29 u32 ds_versions_cnt;
30 struct nfs4_ff_ds_version *ds_versions;
31 struct nfs4_pnfs_ds *ds;
32};
33
34struct nfs4_ff_layout_ds_err {
35 struct list_head list; /* linked in mirror error_list */
36 u64 offset;
37 u64 length;
38 int status;
39 enum nfs_opnum4 opnum;
40 nfs4_stateid stateid;
41 struct nfs4_deviceid deviceid;
42};
43
44struct nfs4_ff_layout_mirror {
45 u32 ds_count;
46 u32 efficiency;
47 struct nfs4_ff_layout_ds *mirror_ds;
48 u32 fh_versions_cnt;
49 struct nfs_fh *fh_versions;
50 nfs4_stateid stateid;
51 struct nfs4_string user_name;
52 struct nfs4_string group_name;
53 u32 uid;
54 u32 gid;
55 struct rpc_cred *cred;
56 spinlock_t lock;
57};
58
59struct nfs4_ff_layout_segment {
60 struct pnfs_layout_segment generic_hdr;
61 u64 stripe_unit;
62 u32 mirror_array_cnt;
63 struct nfs4_ff_layout_mirror **mirror_array;
64};
65
66struct nfs4_flexfile_layout {
67 struct pnfs_layout_hdr generic_hdr;
68 struct pnfs_ds_commit_info commit_info;
69 struct list_head error_list; /* nfs4_ff_layout_ds_err */
70};
71
72static inline struct nfs4_flexfile_layout *
73FF_LAYOUT_FROM_HDR(struct pnfs_layout_hdr *lo)
74{
75 return container_of(lo, struct nfs4_flexfile_layout, generic_hdr);
76}
77
78static inline struct nfs4_ff_layout_segment *
79FF_LAYOUT_LSEG(struct pnfs_layout_segment *lseg)
80{
81 return container_of(lseg,
82 struct nfs4_ff_layout_segment,
83 generic_hdr);
84}
85
86static inline struct nfs4_deviceid_node *
87FF_LAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg, u32 idx)
88{
89 if (idx >= FF_LAYOUT_LSEG(lseg)->mirror_array_cnt ||
90 FF_LAYOUT_LSEG(lseg)->mirror_array[idx] == NULL ||
91 FF_LAYOUT_LSEG(lseg)->mirror_array[idx]->mirror_ds == NULL)
92 return NULL;
93 return &FF_LAYOUT_LSEG(lseg)->mirror_array[idx]->mirror_ds->id_node;
94}
95
96static inline struct nfs4_ff_layout_ds *
97FF_LAYOUT_MIRROR_DS(struct nfs4_deviceid_node *node)
98{
99 return container_of(node, struct nfs4_ff_layout_ds, id_node);
100}
101
102static inline struct nfs4_ff_layout_mirror *
103FF_LAYOUT_COMP(struct pnfs_layout_segment *lseg, u32 idx)
104{
105 if (idx >= FF_LAYOUT_LSEG(lseg)->mirror_array_cnt)
106 return NULL;
107 return FF_LAYOUT_LSEG(lseg)->mirror_array[idx];
108}
109
110static inline u32
111FF_LAYOUT_MIRROR_COUNT(struct pnfs_layout_segment *lseg)
112{
113 return FF_LAYOUT_LSEG(lseg)->mirror_array_cnt;
114}
115
116static inline bool
117ff_layout_test_devid_unavailable(struct nfs4_deviceid_node *node)
118{
119 return nfs4_test_deviceid_unavailable(node);
120}
121
122static inline int
123nfs4_ff_layout_ds_version(struct pnfs_layout_segment *lseg, u32 ds_idx)
124{
125 return FF_LAYOUT_COMP(lseg, ds_idx)->mirror_ds->ds_versions[0].version;
126}
127
128struct nfs4_ff_layout_ds *
129nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
130 gfp_t gfp_flags);
131void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds *mirror_ds);
132void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds *mirror_ds);
133int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
134 struct nfs4_ff_layout_mirror *mirror, u64 offset,
135 u64 length, int status, enum nfs_opnum4 opnum,
136 gfp_t gfp_flags);
137int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo,
138 struct xdr_stream *xdr, int *count,
139 const struct pnfs_layout_range *range);
140struct nfs_fh *
141nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx);
142
143struct nfs4_pnfs_ds *
144nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
145 bool fail_return);
146
147struct rpc_clnt *
148nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg,
149 u32 ds_idx,
150 struct nfs_client *ds_clp,
151 struct inode *inode);
152struct rpc_cred *ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg,
153 u32 ds_idx, struct rpc_cred *mdscred);
154bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg);
155#endif /* FS_NFS_NFS4FLEXFILELAYOUT_H */
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
new file mode 100644
index 000000000000..e2c01f204a95
--- /dev/null
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -0,0 +1,552 @@
1/*
2 * Device operations for the pnfs nfs4 file layout driver.
3 *
4 * Copyright (c) 2014, Primary Data, Inc. All rights reserved.
5 *
6 * Tao Peng <bergwolf@primarydata.com>
7 */
8
9#include <linux/nfs_fs.h>
10#include <linux/vmalloc.h>
11#include <linux/module.h>
12#include <linux/sunrpc/addr.h>
13
14#include "../internal.h"
15#include "../nfs4session.h"
16#include "flexfilelayout.h"
17
18#define NFSDBG_FACILITY NFSDBG_PNFS_LD
19
20static unsigned int dataserver_timeo = NFS4_DEF_DS_TIMEO;
21static unsigned int dataserver_retrans = NFS4_DEF_DS_RETRANS;
22
23void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds *mirror_ds)
24{
25 if (mirror_ds)
26 nfs4_put_deviceid_node(&mirror_ds->id_node);
27}
28
29void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds *mirror_ds)
30{
31 nfs4_print_deviceid(&mirror_ds->id_node.deviceid);
32 nfs4_pnfs_ds_put(mirror_ds->ds);
33 kfree(mirror_ds);
34}
35
36/* Decode opaque device data and construct new_ds using it */
37struct nfs4_ff_layout_ds *
38nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
39 gfp_t gfp_flags)
40{
41 struct xdr_stream stream;
42 struct xdr_buf buf;
43 struct page *scratch;
44 struct list_head dsaddrs;
45 struct nfs4_pnfs_ds_addr *da;
46 struct nfs4_ff_layout_ds *new_ds = NULL;
47 struct nfs4_ff_ds_version *ds_versions = NULL;
48 u32 mp_count;
49 u32 version_count;
50 __be32 *p;
51 int i, ret = -ENOMEM;
52
53 /* set up xdr stream */
54 scratch = alloc_page(gfp_flags);
55 if (!scratch)
56 goto out_err;
57
58 new_ds = kzalloc(sizeof(struct nfs4_ff_layout_ds), gfp_flags);
59 if (!new_ds)
60 goto out_scratch;
61
62 nfs4_init_deviceid_node(&new_ds->id_node,
63 server,
64 &pdev->dev_id);
65 INIT_LIST_HEAD(&dsaddrs);
66
67 xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen);
68 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
69
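	/*
	 * The device address blob starts with a multipath address list,
	 * followed by an array of the NFS versions the DS supports.
	 */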
70 /* multipath count */
71 p = xdr_inline_decode(&stream, 4);
72 if (unlikely(!p))
73 goto out_err_drain_dsaddrs;
74 mp_count = be32_to_cpup(p);
75 dprintk("%s: multipath ds count %d\n", __func__, mp_count);
76
77 for (i = 0; i < mp_count; i++) {
78 /* multipath ds */
79 da = nfs4_decode_mp_ds_addr(server->nfs_client->cl_net,
80 &stream, gfp_flags);
81 if (da)
82 list_add_tail(&da->da_node, &dsaddrs);
83 }
84 if (list_empty(&dsaddrs)) {
85 dprintk("%s: no suitable DS addresses found\n",
86 __func__);
87 ret = -ENOMEDIUM;
88 goto out_err_drain_dsaddrs;
89 }
90
91 /* version count */
92 p = xdr_inline_decode(&stream, 4);
93 if (unlikely(!p))
94 goto out_err_drain_dsaddrs;
95 version_count = be32_to_cpup(p);
96 dprintk("%s: version count %d\n", __func__, version_count);
97
98 ds_versions = kcalloc(version_count, sizeof(struct nfs4_ff_ds_version),
99 gfp_flags);
100 if (!ds_versions)
101 goto out_err_drain_dsaddrs;
102
103 for (i = 0; i < version_count; i++) {
104 /* 20 = version(4) + minor_version(4) + rsize(4) + wsize(4) +
105 * tightly_coupled(4) */
106 p = xdr_inline_decode(&stream, 20);
107 if (unlikely(!p))
108 goto out_err_drain_dsaddrs;
109 ds_versions[i].version = be32_to_cpup(p++);
110 ds_versions[i].minor_version = be32_to_cpup(p++);
111 ds_versions[i].rsize = nfs_block_size(be32_to_cpup(p++), NULL);
112 ds_versions[i].wsize = nfs_block_size(be32_to_cpup(p++), NULL);
113 ds_versions[i].tightly_coupled = be32_to_cpup(p);
114
115 if (ds_versions[i].rsize > NFS_MAX_FILE_IO_SIZE)
116 ds_versions[i].rsize = NFS_MAX_FILE_IO_SIZE;
117 if (ds_versions[i].wsize > NFS_MAX_FILE_IO_SIZE)
118 ds_versions[i].wsize = NFS_MAX_FILE_IO_SIZE;
119
120 if (ds_versions[i].version != 3 || ds_versions[i].minor_version != 0) {
121 dprintk("%s: [%d] unsupported ds version %d-%d\n", __func__,
122 i, ds_versions[i].version,
123 ds_versions[i].minor_version);
124 ret = -EPROTONOSUPPORT;
125 goto out_err_drain_dsaddrs;
126 }
127
128 dprintk("%s: [%d] vers %u minor_ver %u rsize %u wsize %u coupled %d\n",
129 __func__, i, ds_versions[i].version,
130 ds_versions[i].minor_version,
131 ds_versions[i].rsize,
132 ds_versions[i].wsize,
133 ds_versions[i].tightly_coupled);
134 }
135
136 new_ds->ds_versions = ds_versions;
137 new_ds->ds_versions_cnt = version_count;
138
139 new_ds->ds = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags);
140 if (!new_ds->ds)
141 goto out_err_drain_dsaddrs;
142
143 /* If DS was already in cache, free ds addrs */
144 while (!list_empty(&dsaddrs)) {
145 da = list_first_entry(&dsaddrs,
146 struct nfs4_pnfs_ds_addr,
147 da_node);
148 list_del_init(&da->da_node);
149 kfree(da->da_remotestr);
150 kfree(da);
151 }
152
153 __free_page(scratch);
154 return new_ds;
155
156out_err_drain_dsaddrs:
157 while (!list_empty(&dsaddrs)) {
158 da = list_first_entry(&dsaddrs, struct nfs4_pnfs_ds_addr,
159 da_node);
160 list_del_init(&da->da_node);
161 kfree(da->da_remotestr);
162 kfree(da);
163 }
164
165 kfree(ds_versions);
166out_scratch:
167 __free_page(scratch);
168out_err:
169 kfree(new_ds);
170
171 dprintk("%s ERROR: returning %d\n", __func__, ret);
172 return NULL;
173}
174
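For reference, the opaque body decoded above is the flexfiles ff_device_addr4: a counted array of netaddr4 entries (netid/address string pairs, per RFC 8435), then a counted array of (version, minor, rsize, wsize, tightly_coupled) tuples. Below is a minimal userspace sketch of that wire format; the helper names are hypothetical and the address entries are merely skipped rather than parsed into sockaddrs:

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>	/* ntohl() */

struct cursor { const uint8_t *p, *end; };

static int get_u32(struct cursor *c, uint32_t *v)
{
	if (c->end - c->p < 4)
		return -1;
	memcpy(v, c->p, 4);
	*v = ntohl(*v);
	c->p += 4;
	return 0;
}

/* XDR opaque/string: length word, payload, pad to a 4-byte boundary */
static int skip_opaque(struct cursor *c)
{
	uint32_t len;

	if (get_u32(c, &len))
		return -1;
	len = (len + 3) & ~3u;
	if ((size_t)(c->end - c->p) < len)
		return -1;
	c->p += len;
	return 0;
}

static int decode_ff_device(const uint8_t *buf, size_t buflen)
{
	struct cursor c = { buf, buf + buflen };
	uint32_t naddrs, nvers, i;

	if (get_u32(&c, &naddrs))
		return -1;
	for (i = 0; i < naddrs; i++)	/* netid string + universal address string */
		if (skip_opaque(&c) || skip_opaque(&c))
			return -1;
	if (get_u32(&c, &nvers))
		return -1;
	for (i = 0; i < nvers; i++) {
		uint32_t vers, minor, rsize, wsize, coupled;

		if (get_u32(&c, &vers) || get_u32(&c, &minor) ||
		    get_u32(&c, &rsize) || get_u32(&c, &wsize) ||
		    get_u32(&c, &coupled))
			return -1;
		printf("v%u.%u rsize=%u wsize=%u coupled=%u\n",
		       vers, minor, rsize, wsize, coupled);
	}
	return 0;
}

int main(void)
{
	/* one address entry ("tcp", "10.0.0.1.8.1"), one v3.0 version tuple */
	static const uint8_t buf[] = {
		0,0,0,1,			/* naddrs = 1 */
		0,0,0,3, 't','c','p',0,		/* netid "tcp" + pad */
		0,0,0,12, '1','0','.','0','.','0','.','1','.','8','.','1',
		0,0,0,1,			/* nvers = 1 */
		0,0,0,3, 0,0,0,0,		/* version 3.0 */
		0,0,1,0, 0,0,1,0,		/* rsize = wsize = 256 */
		0,0,0,0,			/* tightly_coupled = false */
	};
	return decode_ff_device(buf, sizeof(buf)) ? 1 : 0;
}
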
static u64
end_offset(u64 start, u64 len)
{
	u64 end;

	end = start + len;
	return end >= start ? end : NFS4_MAX_UINT64;
}

static void extend_ds_error(struct nfs4_ff_layout_ds_err *err,
			    u64 offset, u64 length)
{
	u64 end;

	end = max_t(u64, end_offset(err->offset, err->length),
		    end_offset(offset, length));
	err->offset = min_t(u64, err->offset, offset);
	err->length = end - err->offset;
}

static bool ds_error_can_merge(struct nfs4_ff_layout_ds_err *err, u64 offset,
			       u64 length, int status, enum nfs_opnum4 opnum,
			       nfs4_stateid *stateid,
			       struct nfs4_deviceid *deviceid)
{
	return err->status == status && err->opnum == opnum &&
	       nfs4_stateid_match(&err->stateid, stateid) &&
	       !memcmp(&err->deviceid, deviceid, sizeof(*deviceid)) &&
	       end_offset(err->offset, err->length) >= offset &&
	       err->offset <= end_offset(offset, length);
}

static bool merge_ds_error(struct nfs4_ff_layout_ds_err *old,
			   struct nfs4_ff_layout_ds_err *new)
{
	if (!ds_error_can_merge(old, new->offset, new->length, new->status,
				new->opnum, &new->stateid, &new->deviceid))
		return false;

	extend_ds_error(old, new->offset, new->length);
	return true;
}

static bool
ff_layout_add_ds_error_locked(struct nfs4_flexfile_layout *flo,
			      struct nfs4_ff_layout_ds_err *dserr)
{
	struct nfs4_ff_layout_ds_err *err;

	list_for_each_entry(err, &flo->error_list, list) {
		if (merge_ds_error(err, dserr)) {
			return true;
		}
	}

	list_add(&dserr->list, &flo->error_list);
	return false;
}

static bool
ff_layout_update_ds_error(struct nfs4_flexfile_layout *flo, u64 offset,
			  u64 length, int status, enum nfs_opnum4 opnum,
			  nfs4_stateid *stateid, struct nfs4_deviceid *deviceid)
{
	bool found = false;
	struct nfs4_ff_layout_ds_err *err;

	list_for_each_entry(err, &flo->error_list, list) {
		if (ds_error_can_merge(err, offset, length, status, opnum,
				       stateid, deviceid)) {
			found = true;
			extend_ds_error(err, offset, length);
			break;
		}
	}

	return found;
}

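The helpers above fold overlapping or touching error ranges into one record, and end_offset() clamps a sum that would wrap past 2^64 to NFS4_MAX_UINT64. A standalone illustration of those two rules (hypothetical test code, not part of the driver):

#include <assert.h>
#include <stdint.h>

static uint64_t end_off(uint64_t start, uint64_t len)
{
	uint64_t end = start + len;

	/* a wrapped sum means the range effectively runs to "infinity" */
	return end >= start ? end : UINT64_MAX;
}

int main(void)
{
	/* touching ranges merge: [0,10) and [10,5) extend to cover [0,15) */
	uint64_t off = 0, len = 10;
	uint64_t end = end_off(10, 5) > end_off(off, len) ?
			end_off(10, 5) : end_off(off, len);
	assert(end - off == 15);

	/* a length that would overflow clamps to UINT64_MAX */
	assert(end_off(UINT64_MAX - 4, 100) == UINT64_MAX);
	return 0;
}
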
int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
			     struct nfs4_ff_layout_mirror *mirror, u64 offset,
			     u64 length, int status, enum nfs_opnum4 opnum,
			     gfp_t gfp_flags)
{
	struct nfs4_ff_layout_ds_err *dserr;
	bool needfree;

	if (status == 0)
		return 0;

	if (mirror->mirror_ds == NULL)
		return -EINVAL;

	spin_lock(&flo->generic_hdr.plh_inode->i_lock);
	if (ff_layout_update_ds_error(flo, offset, length, status, opnum,
				      &mirror->stateid,
				      &mirror->mirror_ds->id_node.deviceid)) {
		spin_unlock(&flo->generic_hdr.plh_inode->i_lock);
		return 0;
	}
	spin_unlock(&flo->generic_hdr.plh_inode->i_lock);
	dserr = kmalloc(sizeof(*dserr), gfp_flags);
	if (!dserr)
		return -ENOMEM;

	INIT_LIST_HEAD(&dserr->list);
	dserr->offset = offset;
	dserr->length = length;
	dserr->status = status;
	dserr->opnum = opnum;
	nfs4_stateid_copy(&dserr->stateid, &mirror->stateid);
	memcpy(&dserr->deviceid, &mirror->mirror_ds->id_node.deviceid,
	       NFS4_DEVICEID4_SIZE);

	spin_lock(&flo->generic_hdr.plh_inode->i_lock);
	needfree = ff_layout_add_ds_error_locked(flo, dserr);
	spin_unlock(&flo->generic_hdr.plh_inode->i_lock);
	if (needfree)
		kfree(dserr);

	return 0;
}

/* currently we only support AUTH_NONE and AUTH_SYS */
static rpc_authflavor_t
nfs4_ff_layout_choose_authflavor(struct nfs4_ff_layout_mirror *mirror)
{
	if (mirror->uid == (u32)-1)
		return RPC_AUTH_NULL;
	return RPC_AUTH_UNIX;
}

/* fetch cred for NFSv3 DS */
static int ff_layout_update_mirror_cred(struct nfs4_ff_layout_mirror *mirror,
					struct nfs4_pnfs_ds *ds)
{
	if (ds->ds_clp && !mirror->cred &&
	    mirror->mirror_ds->ds_versions[0].version == 3) {
		struct rpc_auth *auth = ds->ds_clp->cl_rpcclient->cl_auth;
		struct rpc_cred *cred;
		struct auth_cred acred = {
			.uid = make_kuid(&init_user_ns, mirror->uid),
			.gid = make_kgid(&init_user_ns, mirror->gid),
		};

		/* AUTH_NULL ignores acred */
		cred = auth->au_ops->lookup_cred(auth, &acred, 0);
		if (IS_ERR(cred)) {
			dprintk("%s: lookup_cred failed with %ld\n",
				__func__, PTR_ERR(cred));
			return PTR_ERR(cred);
		} else {
			mirror->cred = cred;
		}
	}
	return 0;
}

struct nfs_fh *
nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx)
{
	struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, mirror_idx);
	struct nfs_fh *fh = NULL;
	struct nfs4_deviceid_node *devid;

	if (mirror == NULL || mirror->mirror_ds == NULL ||
	    mirror->mirror_ds->ds == NULL) {
		printk(KERN_ERR "NFS: %s: No data server for mirror offset index %d\n",
		       __func__, mirror_idx);
		if (mirror && mirror->mirror_ds) {
			devid = &mirror->mirror_ds->id_node;
			pnfs_generic_mark_devid_invalid(devid);
		}
		goto out;
	}

	/* FIXME: For now assume there is only 1 version available for the DS */
	fh = &mirror->fh_versions[0];
out:
	return fh;
}

/* Upon return, either ds is connected, or ds is NULL */
struct nfs4_pnfs_ds *
nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
			  bool fail_return)
{
	struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx);
	struct nfs4_pnfs_ds *ds = NULL;
	struct nfs4_deviceid_node *devid;
	struct inode *ino = lseg->pls_layout->plh_inode;
	struct nfs_server *s = NFS_SERVER(ino);
	unsigned int max_payload;
	rpc_authflavor_t flavor;

	if (mirror == NULL || mirror->mirror_ds == NULL ||
	    mirror->mirror_ds->ds == NULL) {
		printk(KERN_ERR "NFS: %s: No data server for offset index %d\n",
		       __func__, ds_idx);
		if (mirror && mirror->mirror_ds) {
			devid = &mirror->mirror_ds->id_node;
			pnfs_generic_mark_devid_invalid(devid);
		}
		goto out;
	}

	devid = &mirror->mirror_ds->id_node;
	if (ff_layout_test_devid_unavailable(devid))
		goto out;

	ds = mirror->mirror_ds->ds;
	/* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */
	smp_rmb();
	if (ds->ds_clp)
		goto out;

	flavor = nfs4_ff_layout_choose_authflavor(mirror);

	/* FIXME: For now we assume the server sent only one version of NFS
	 * to use for the DS.
	 */
	nfs4_pnfs_ds_connect(s, ds, devid, dataserver_timeo,
			     dataserver_retrans,
			     mirror->mirror_ds->ds_versions[0].version,
			     mirror->mirror_ds->ds_versions[0].minor_version,
			     flavor);

	/* connect success, check rsize/wsize limit */
	if (ds->ds_clp) {
		max_payload =
			nfs_block_size(rpc_max_payload(ds->ds_clp->cl_rpcclient),
				       NULL);
		if (mirror->mirror_ds->ds_versions[0].rsize > max_payload)
			mirror->mirror_ds->ds_versions[0].rsize = max_payload;
		if (mirror->mirror_ds->ds_versions[0].wsize > max_payload)
			mirror->mirror_ds->ds_versions[0].wsize = max_payload;
	} else {
		ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
					 mirror, lseg->pls_range.offset,
					 lseg->pls_range.length, NFS4ERR_NXIO,
					 OP_ILLEGAL, GFP_NOIO);
		if (fail_return) {
			pnfs_error_mark_layout_for_return(ino, lseg);
			if (ff_layout_has_available_ds(lseg))
				pnfs_set_retry_layoutget(lseg->pls_layout);
			else
				pnfs_clear_retry_layoutget(lseg->pls_layout);

		} else {
			if (ff_layout_has_available_ds(lseg))
				set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
					&lseg->pls_layout->plh_flags);
			else {
				pnfs_error_mark_layout_for_return(ino, lseg);
				pnfs_clear_retry_layoutget(lseg->pls_layout);
			}
		}
	}

	if (ff_layout_update_mirror_cred(mirror, ds))
		ds = NULL;
out:
	return ds;
}

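The unlocked ds->ds_clp test above is only safe because the connect path publishes the client pointer with smp_wmb() after fully initialising it, and the smp_rmb() here pairs with that. A self-contained C11 rendering of the same publish/consume pattern, with release/acquire standing in for the kernel barriers (illustrative names, not the actual connect code):

#include <stdatomic.h>
#include <stddef.h>

struct payload { int ready_data; };

struct ds {
	struct payload data;		/* initialised before publish */
	_Atomic(struct payload *) clp;	/* NULL until fully set up */
};

static void publish(struct ds *ds, int v)
{
	ds->data.ready_data = v;
	/* release: all writes above are visible before clp reads non-NULL */
	atomic_store_explicit(&ds->clp, &ds->data, memory_order_release);
}

static int consume(struct ds *ds, int *out)
{
	struct payload *p =
		atomic_load_explicit(&ds->clp, memory_order_acquire);

	if (!p)
		return 0;	/* not connected yet; caller must (re)connect */
	*out = p->ready_data;	/* safe: acquire pairs with the release above */
	return 1;
}
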
struct rpc_cred *
ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, u32 ds_idx,
		      struct rpc_cred *mdscred)
{
	struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx);
	struct rpc_cred *cred = ERR_PTR(-EINVAL);

	if (!nfs4_ff_layout_prepare_ds(lseg, ds_idx, true))
		goto out;

	if (mirror && mirror->cred)
		cred = mirror->cred;
	else
		cred = mdscred;
out:
	return cred;
}

/*
 * Find or create a DS rpc client with the MDS server rpc client auth flavor
 * in the nfs_client cl_ds_clients list.
 */
struct rpc_clnt *
nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg, u32 ds_idx,
				 struct nfs_client *ds_clp, struct inode *inode)
{
	struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx);

	switch (mirror->mirror_ds->ds_versions[0].version) {
	case 3:
		/* For NFSv3 DS, flavor is set when creating DS connections */
		return ds_clp->cl_rpcclient;
	case 4:
		return nfs4_find_or_create_ds_client(ds_clp, inode);
	default:
		BUG();
	}
}

static bool is_range_intersecting(u64 offset1, u64 length1,
				  u64 offset2, u64 length2)
{
	u64 end1 = end_offset(offset1, length1);
	u64 end2 = end_offset(offset2, length2);

	return (end1 == NFS4_MAX_UINT64 || end1 > offset2) &&
	       (end2 == NFS4_MAX_UINT64 || end2 > offset1);
}

/* called with inode i_lock held */
int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo,
			      struct xdr_stream *xdr, int *count,
			      const struct pnfs_layout_range *range)
{
	struct nfs4_ff_layout_ds_err *err, *n;
	__be32 *p;

	list_for_each_entry_safe(err, n, &flo->error_list, list) {
		if (!is_range_intersecting(err->offset, err->length,
					   range->offset, range->length))
			continue;
		/* offset(8) + length(8) + stateid(NFS4_STATEID_SIZE)
		 * + deviceid(NFS4_DEVICEID4_SIZE) + status(4) + opnum(4)
		 */
		p = xdr_reserve_space(xdr,
				24 + NFS4_STATEID_SIZE + NFS4_DEVICEID4_SIZE);
		if (unlikely(!p))
			return -ENOBUFS;
		p = xdr_encode_hyper(p, err->offset);
		p = xdr_encode_hyper(p, err->length);
		p = xdr_encode_opaque_fixed(p, &err->stateid,
					    NFS4_STATEID_SIZE);
		p = xdr_encode_opaque_fixed(p, &err->deviceid,
					    NFS4_DEVICEID4_SIZE);
		*p++ = cpu_to_be32(err->status);
		*p++ = cpu_to_be32(err->opnum);
		*count += 1;
		list_del(&err->list);
		dprintk("%s: offset %llu length %llu status %d op %d count %d\n",
			__func__, err->offset, err->length, err->status,
			err->opnum, *count);
		kfree(err);
	}

	return 0;
}

bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg)
{
	struct nfs4_ff_layout_mirror *mirror;
	struct nfs4_deviceid_node *devid;
	int idx;

	for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) {
		mirror = FF_LAYOUT_COMP(lseg, idx);
		if (mirror && mirror->mirror_ds) {
			devid = &mirror->mirror_ds->id_node;
			if (!ff_layout_test_devid_unavailable(devid))
				return true;
		}
	}

	return false;
}

module_param(dataserver_retrans, uint, 0644);
MODULE_PARM_DESC(dataserver_retrans, "The number of times the NFSv4.1 client "
		 "retries a request before it attempts further "
		 "recovery action.");
module_param(dataserver_timeo, uint, 0644);
MODULE_PARM_DESC(dataserver_timeo, "The time (in tenths of a second) the "
		 "NFSv4.1 client waits for a response from a "
		 "data server before it retries an NFS request.");
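Both knobs are declared with mode 0644, so they are tunable without a rebuild. Assuming the driver is built as the nfs_layout_flexfiles module (inferred from the flexfiles directory, so treat the name as an assumption), they can be set at load time, e.g. modprobe nfs_layout_flexfiles dataserver_timeo=600 dataserver_retrans=5, or adjusted afterwards under /sys/module/nfs_layout_flexfiles/parameters/.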
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 2f5db844c172..857e2a99acc8 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -152,7 +152,7 @@ void nfs_fattr_map_and_free_names(struct nfs_server *server, struct nfs_fattr *f
 	nfs_fattr_free_group_name(fattr);
 }
 
-static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res)
+int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res)
 {
 	unsigned long val;
 	char buf[16];
@@ -166,6 +166,7 @@ static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *re
 	*res = val;
 	return 1;
 }
+EXPORT_SYMBOL_GPL(nfs_map_string_to_numeric);
 
 static int nfs_map_numeric_to_string(__u32 id, char *buf, size_t buflen)
 {
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 2211f6ba8736..e4f0dcef8f54 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -388,7 +388,6 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
 	if (S_ISREG(inode->i_mode)) {
 		inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops;
 		inode->i_data.a_ops = &nfs_file_aops;
-		inode->i_data.backing_dev_info = &NFS_SB(sb)->backing_dev_info;
 	} else if (S_ISDIR(inode->i_mode)) {
 		inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops;
 		inode->i_fop = &nfs_dir_operations;
@@ -507,10 +506,15 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
 		attr->ia_valid &= ~ATTR_MODE;
 
 	if (attr->ia_valid & ATTR_SIZE) {
+		loff_t i_size;
+
 		BUG_ON(!S_ISREG(inode->i_mode));
 
-		if (attr->ia_size == i_size_read(inode))
+		i_size = i_size_read(inode);
+		if (attr->ia_size == i_size)
 			attr->ia_valid &= ~ATTR_SIZE;
+		else if (attr->ia_size < i_size && IS_SWAPFILE(inode))
+			return -ETXTBSY;
 	}
 
 	/* Optimization: if the end result is no change, don't RPC */
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index b6f34bfa6fe8..212b8c883d22 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -6,6 +6,7 @@
 #include <linux/mount.h>
 #include <linux/security.h>
 #include <linux/crc32.h>
+#include <linux/nfs_page.h>
 
 #define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS)
 
@@ -187,9 +188,15 @@ extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
 			const struct sockaddr *ds_addr,
 			int ds_addrlen, int ds_proto,
 			unsigned int ds_timeo,
-			unsigned int ds_retrans);
+			unsigned int ds_retrans,
+			u32 minor_version,
+			rpc_authflavor_t au_flavor);
 extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *,
 			struct inode *);
+extern struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp,
+			const struct sockaddr *ds_addr, int ds_addrlen,
+			int ds_proto, unsigned int ds_timeo,
+			unsigned int ds_retrans, rpc_authflavor_t au_flavor);
 #ifdef CONFIG_PROC_FS
 extern int __init nfs_fs_proc_init(void);
 extern void nfs_fs_proc_exit(void);
@@ -242,9 +249,12 @@ struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *);
 void nfs_pgio_header_free(struct nfs_pgio_header *);
 void nfs_pgio_data_destroy(struct nfs_pgio_header *);
 int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *);
-int nfs_initiate_pgio(struct rpc_clnt *, struct nfs_pgio_header *,
-		      const struct rpc_call_ops *, int, int);
+int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
+		      struct rpc_cred *cred, const struct nfs_rpc_ops *rpc_ops,
+		      const struct rpc_call_ops *call_ops, int how, int flags);
 void nfs_free_request(struct nfs_page *req);
+struct nfs_pgio_mirror *
+nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc);
 
 static inline void nfs_iocounter_init(struct nfs_io_counter *c)
 {
@@ -252,6 +262,12 @@ static inline void nfs_iocounter_init(struct nfs_io_counter *c)
 	atomic_set(&c->io_count, 0);
 }
 
+static inline bool nfs_pgio_has_mirroring(struct nfs_pageio_descriptor *desc)
+{
+	WARN_ON_ONCE(desc->pg_mirror_count < 1);
+	return desc->pg_mirror_count > 1;
+}
+
 /* nfs2xdr.c */
 extern struct rpc_procinfo nfs_procedures[];
 extern int nfs2_decode_dirent(struct xdr_stream *,
@@ -375,7 +391,7 @@ extern struct rpc_stat nfs_rpcstat;
 
 extern int __init register_nfs_fs(void);
 extern void __exit unregister_nfs_fs(void);
-extern void nfs_sb_active(struct super_block *sb);
+extern bool nfs_sb_active(struct super_block *sb);
 extern void nfs_sb_deactive(struct super_block *sb);
 
 /* namespace.c */
@@ -414,7 +430,6 @@ int nfs_show_options(struct seq_file *, struct dentry *);
 int nfs_show_devname(struct seq_file *, struct dentry *);
 int nfs_show_path(struct seq_file *, struct dentry *);
 int nfs_show_stats(struct seq_file *, struct dentry *);
-void nfs_put_super(struct super_block *);
 int nfs_remount(struct super_block *sb, int *flags, char *raw_data);
 
 /* write.c */
@@ -427,6 +442,7 @@ extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
 extern void nfs_commit_prepare(struct rpc_task *task, void *calldata);
 extern int nfs_initiate_commit(struct rpc_clnt *clnt,
 			       struct nfs_commit_data *data,
+			       const struct nfs_rpc_ops *nfs_ops,
 			       const struct rpc_call_ops *call_ops,
 			       int how, int flags);
 extern void nfs_init_commit(struct nfs_commit_data *data,
@@ -440,13 +456,15 @@ int nfs_scan_commit(struct inode *inode, struct list_head *dst,
 		    struct nfs_commit_info *cinfo);
 void nfs_mark_request_commit(struct nfs_page *req,
 			     struct pnfs_layout_segment *lseg,
-			     struct nfs_commit_info *cinfo);
+			     struct nfs_commit_info *cinfo,
+			     u32 ds_commit_idx);
 int nfs_write_need_commit(struct nfs_pgio_header *);
 int nfs_generic_commit_list(struct inode *inode, struct list_head *head,
 			    int how, struct nfs_commit_info *cinfo);
 void nfs_retry_commit(struct list_head *page_list,
 		      struct pnfs_layout_segment *lseg,
-		      struct nfs_commit_info *cinfo);
+		      struct nfs_commit_info *cinfo,
+		      u32 ds_commit_idx);
 void nfs_commitdata_release(struct nfs_commit_data *data);
 void nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst,
 				 struct nfs_commit_info *cinfo);
@@ -457,6 +475,7 @@ void nfs_init_cinfo(struct nfs_commit_info *cinfo,
 		    struct nfs_direct_req *dreq);
 int nfs_key_timeout_notify(struct file *filp, struct inode *inode);
 bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx);
+void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio);
 
 #ifdef CONFIG_MIGRATION
 extern int nfs_migrate_page(struct address_space *,
@@ -480,6 +499,7 @@ static inline void nfs_inode_dio_wait(struct inode *inode)
 	inode_dio_wait(inode);
 }
 extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
+extern void nfs_direct_set_resched_writes(struct nfs_direct_req *dreq);
 
 /* nfs4proc.c */
 extern void __nfs4_read_done_cb(struct nfs_pgio_header *);
@@ -493,6 +513,26 @@ extern int nfs41_walk_client_list(struct nfs_client *clp,
 				  struct nfs_client **result,
 				  struct rpc_cred *cred);
 
+static inline struct inode *nfs_igrab_and_active(struct inode *inode)
+{
+	inode = igrab(inode);
+	if (inode != NULL && !nfs_sb_active(inode->i_sb)) {
+		iput(inode);
+		inode = NULL;
+	}
+	return inode;
+}
+
+static inline void nfs_iput_and_deactive(struct inode *inode)
+{
+	if (inode != NULL) {
+		struct super_block *sb = inode->i_sb;
+
+		iput(inode);
+		nfs_sb_deactive(sb);
+	}
+}
+
 /*
  * Determine the device name as a string
  */
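nfs_igrab_and_active() and nfs_iput_and_deactive() bundle an inode reference with a superblock active count, so an asynchronous operation can outlive the last user-visible reference without racing an unmount. A sketch of the intended pairing (hypothetical caller and struct; the delegreturn path later in this series is the real user):

struct my_work {
	struct inode *inode;	/* pinned inode, or NULL */
};

static int my_work_start(struct my_work *w, struct inode *inode)
{
	w->inode = nfs_igrab_and_active(inode);
	if (w->inode == NULL)
		return -EAGAIN;	/* superblock is already being torn down */
	/* w->inode may now be dereferenced safely from async context */
	return 0;
}

static void my_work_finish(struct my_work *w)
{
	/* drops the inode first, then the superblock active reference */
	nfs_iput_and_deactive(w->inode);
}
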
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 5f61b83f4a1c..b4e03ed8599d 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -481,7 +481,8 @@ out_overflow:
  *		void;
  *	};
  */
-static int decode_attrstat(struct xdr_stream *xdr, struct nfs_fattr *result)
+static int decode_attrstat(struct xdr_stream *xdr, struct nfs_fattr *result,
+			   __u32 *op_status)
 {
 	enum nfs_stat status;
 	int error;
@@ -489,6 +490,8 @@ static int decode_attrstat(struct xdr_stream *xdr, struct nfs_fattr *result)
 	error = decode_stat(xdr, &status);
 	if (unlikely(error))
 		goto out;
+	if (op_status)
+		*op_status = status;
 	if (status != NFS_OK)
 		goto out_default;
 	error = decode_fattr(xdr, result);
@@ -808,7 +811,7 @@ out_default:
 static int nfs2_xdr_dec_attrstat(struct rpc_rqst *req, struct xdr_stream *xdr,
 				 struct nfs_fattr *result)
 {
-	return decode_attrstat(xdr, result);
+	return decode_attrstat(xdr, result, NULL);
 }
 
 static int nfs2_xdr_dec_diropres(struct rpc_rqst *req, struct xdr_stream *xdr,
@@ -865,6 +868,7 @@ static int nfs2_xdr_dec_readres(struct rpc_rqst *req, struct xdr_stream *xdr,
 	error = decode_stat(xdr, &status);
 	if (unlikely(error))
 		goto out;
+	result->op_status = status;
 	if (status != NFS_OK)
 		goto out_default;
 	error = decode_fattr(xdr, result->fattr);
@@ -882,7 +886,7 @@ static int nfs2_xdr_dec_writeres(struct rpc_rqst *req, struct xdr_stream *xdr,
 {
 	/* All NFSv2 writes are "file sync" writes */
 	result->verf->committed = NFS_FILE_SYNC;
-	return decode_attrstat(xdr, result->fattr);
+	return decode_attrstat(xdr, result->fattr, &result->op_status);
 }
 
 /**
diff --git a/fs/nfs/nfs3_fs.h b/fs/nfs/nfs3_fs.h
index 333ae4068506..e134d6548ab7 100644
--- a/fs/nfs/nfs3_fs.h
+++ b/fs/nfs/nfs3_fs.h
@@ -30,5 +30,7 @@ struct nfs_server *nfs3_create_server(struct nfs_mount_info *, struct nfs_subver
 struct nfs_server *nfs3_clone_server(struct nfs_server *, struct nfs_fh *,
 				     struct nfs_fattr *, rpc_authflavor_t);
 
+/* nfs3super.c */
+extern struct nfs_subversion nfs_v3;
 
 #endif /* __LINUX_FS_NFS_NFS3_FS_H */
diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c
index 8c1b437c5403..9e9fa347a948 100644
--- a/fs/nfs/nfs3client.c
+++ b/fs/nfs/nfs3client.c
@@ -1,5 +1,6 @@
 #include <linux/nfs_fs.h>
 #include <linux/nfs_mount.h>
+#include <linux/sunrpc/addr.h>
 #include "internal.h"
 #include "nfs3_fs.h"
 
@@ -64,3 +65,43 @@ struct nfs_server *nfs3_clone_server(struct nfs_server *source,
 	nfs_init_server_aclclient(server);
 	return server;
 }
+
+/*
+ * Set up a pNFS Data Server client over NFSv3.
+ *
+ * Return any existing nfs_client that matches server address,port,version
+ * and minorversion.
+ *
+ * For a new nfs_client, use a soft mount (default), a low retrans and a
+ * low timeout interval so that if a connection is lost, we retry through
+ * the MDS.
+ */
+struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp,
+		const struct sockaddr *ds_addr, int ds_addrlen,
+		int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans,
+		rpc_authflavor_t au_flavor)
+{
+	struct nfs_client_initdata cl_init = {
+		.addr = ds_addr,
+		.addrlen = ds_addrlen,
+		.nfs_mod = &nfs_v3,
+		.proto = ds_proto,
+		.net = mds_clp->cl_net,
+	};
+	struct rpc_timeout ds_timeout;
+	struct nfs_client *clp;
+	char buf[INET6_ADDRSTRLEN + 1];
+
+	/* fake a hostname because lockd wants it */
+	if (rpc_ntop(ds_addr, buf, sizeof(buf)) <= 0)
+		return ERR_PTR(-EINVAL);
+	cl_init.hostname = buf;
+
+	/* Use the MDS nfs_client cl_ipaddr. */
+	nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans);
+	clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr,
+			     au_flavor);
+
+	return clp;
+}
+EXPORT_SYMBOL_GPL(nfs3_set_ds_client);
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 524f9f837408..78e557c3ab87 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -800,6 +800,9 @@ static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
 {
 	struct inode *inode = hdr->inode;
 
+	if (hdr->pgio_done_cb != NULL)
+		return hdr->pgio_done_cb(task, hdr);
+
 	if (nfs3_async_handle_jukebox(task, inode))
 		return -EAGAIN;
 
@@ -825,6 +828,9 @@ static int nfs3_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
 {
 	struct inode *inode = hdr->inode;
 
+	if (hdr->pgio_done_cb != NULL)
+		return hdr->pgio_done_cb(task, hdr);
+
 	if (nfs3_async_handle_jukebox(task, inode))
 		return -EAGAIN;
 	if (task->tk_status >= 0)
@@ -845,6 +851,9 @@ static void nfs3_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commi
 
 static int nfs3_commit_done(struct rpc_task *task, struct nfs_commit_data *data)
 {
+	if (data->commit_done_cb != NULL)
+		return data->commit_done_cb(task, data);
+
 	if (nfs3_async_handle_jukebox(task, data->inode))
 		return -EAGAIN;
 	nfs_refresh_inode(data->inode, data->res.fattr);
diff --git a/fs/nfs/nfs3super.c b/fs/nfs/nfs3super.c
index 6af29c2da352..5c4394e4656b 100644
--- a/fs/nfs/nfs3super.c
+++ b/fs/nfs/nfs3super.c
@@ -7,7 +7,7 @@
 #include "nfs3_fs.h"
 #include "nfs.h"
 
-static struct nfs_subversion nfs_v3 = {
+struct nfs_subversion nfs_v3 = {
 	.owner = THIS_MODULE,
 	.nfs_fs = &nfs_fs_type,
 	.rpc_vers = &nfs_version3,
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 8f4cbe7f4aa8..2a932fdc57cb 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -1636,6 +1636,7 @@ static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr,
 	error = decode_post_op_attr(xdr, result->fattr);
 	if (unlikely(error))
 		goto out;
+	result->op_status = status;
 	if (status != NFS3_OK)
 		goto out_status;
 	error = decode_read3resok(xdr, result);
@@ -1708,6 +1709,7 @@ static int nfs3_xdr_dec_write3res(struct rpc_rqst *req, struct xdr_stream *xdr,
 	error = decode_wcc_data(xdr, result->fattr);
 	if (unlikely(error))
 		goto out;
+	result->op_status = status;
 	if (status != NFS3_OK)
 		goto out_status;
 	error = decode_write3resok(xdr, result);
@@ -2323,6 +2325,7 @@ static int nfs3_xdr_dec_commit3res(struct rpc_rqst *req,
 	error = decode_wcc_data(xdr, result->fattr);
 	if (unlikely(error))
 		goto out;
+	result->op_status = status;
 	if (status != NFS3_OK)
 		goto out_status;
 	error = decode_writeverf3(xdr, &result->verf->verifier);
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index a08178764cf9..fdef424b0cd3 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -44,6 +44,7 @@ enum nfs4_client_state {
 #define NFS4_RENEW_TIMEOUT		0x01
 #define NFS4_RENEW_DELEGATION_CB	0x02
 
+struct nfs_seqid_counter;
 struct nfs4_minor_version_ops {
 	u32	minor_version;
 	unsigned init_caps;
@@ -56,6 +57,8 @@ struct nfs4_minor_version_ops {
 			struct nfs_fsinfo *);
 	void	(*free_lock_state)(struct nfs_server *,
 			struct nfs4_lock_state *);
+	struct nfs_seqid *
+		(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
 	const struct rpc_call_ops *call_sync_ops;
 	const struct nfs4_state_recovery_ops *reboot_recovery_ops;
 	const struct nfs4_state_recovery_ops *nograce_recovery_ops;
@@ -443,6 +446,12 @@ extern void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid);
 extern void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid);
 extern void nfs_release_seqid(struct nfs_seqid *seqid);
 extern void nfs_free_seqid(struct nfs_seqid *seqid);
+extern int nfs40_setup_sequence(struct nfs4_slot_table *tbl,
+				struct nfs4_sequence_args *args,
+				struct nfs4_sequence_res *res,
+				struct rpc_task *task);
+extern int nfs4_sequence_done(struct rpc_task *task,
+			      struct nfs4_sequence_res *res);
 
 extern void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp);
 
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 706ad10b8186..8646af9b11d2 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -849,14 +849,15 @@ error:
  */
 struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
 		const struct sockaddr *ds_addr, int ds_addrlen,
-		int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans)
+		int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans,
+		u32 minor_version, rpc_authflavor_t au_flavor)
 {
 	struct nfs_client_initdata cl_init = {
 		.addr = ds_addr,
 		.addrlen = ds_addrlen,
 		.nfs_mod = &nfs_v4,
 		.proto = ds_proto,
-		.minorversion = mds_clp->cl_minorversion,
+		.minorversion = minor_version,
 		.net = mds_clp->cl_net,
 	};
 	struct rpc_timeout ds_timeout;
@@ -874,7 +875,7 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
 	 */
 	nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans);
 	clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr,
-			     mds_clp->cl_rpcclient->cl_auth->au_flavor);
+			     au_flavor);
 
 	dprintk("<-- %s %p\n", __func__, clp);
 	return clp;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index c347705b0161..2e7c9f7a6f7c 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -495,12 +495,11 @@ static void nfs4_set_sequence_privileged(struct nfs4_sequence_args *args)
 	args->sa_privileged = 1;
 }
 
-static int nfs40_setup_sequence(const struct nfs_server *server,
+int nfs40_setup_sequence(struct nfs4_slot_table *tbl,
 				struct nfs4_sequence_args *args,
 				struct nfs4_sequence_res *res,
 				struct rpc_task *task)
 {
-	struct nfs4_slot_table *tbl = server->nfs_client->cl_slot_tbl;
 	struct nfs4_slot *slot;
 
 	/* slot already allocated? */
@@ -535,6 +534,7 @@ out_sleep:
 	spin_unlock(&tbl->slot_tbl_lock);
 	return -EAGAIN;
 }
+EXPORT_SYMBOL_GPL(nfs40_setup_sequence);
 
 static int nfs40_sequence_done(struct rpc_task *task,
 			       struct nfs4_sequence_res *res)
@@ -694,8 +694,7 @@ out_retry:
 }
 EXPORT_SYMBOL_GPL(nfs41_sequence_done);
 
-static int nfs4_sequence_done(struct rpc_task *task,
-			      struct nfs4_sequence_res *res)
+int nfs4_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
 {
 	if (res->sr_slot == NULL)
 		return 1;
@@ -703,6 +702,7 @@ static int nfs4_sequence_done(struct rpc_task *task,
 		return nfs40_sequence_done(task, res);
 	return nfs41_sequence_done(task, res);
 }
+EXPORT_SYMBOL_GPL(nfs4_sequence_done);
 
 int nfs41_setup_sequence(struct nfs4_session *session,
 			 struct nfs4_sequence_args *args,
@@ -777,7 +777,8 @@ static int nfs4_setup_sequence(const struct nfs_server *server,
 	int ret = 0;
 
 	if (!session)
-		return nfs40_setup_sequence(server, args, res, task);
+		return nfs40_setup_sequence(server->nfs_client->cl_slot_tbl,
+					    args, res, task);
 
 	dprintk("--> %s clp %p session %p sr_slot %u\n",
 		__func__, session->clp, session, res->sr_slot ?
@@ -818,14 +819,16 @@ static int nfs4_setup_sequence(const struct nfs_server *server,
 				struct nfs4_sequence_res *res,
 				struct rpc_task *task)
 {
-	return nfs40_setup_sequence(server, args, res, task);
+	return nfs40_setup_sequence(server->nfs_client->cl_slot_tbl,
+				    args, res, task);
 }
 
-static int nfs4_sequence_done(struct rpc_task *task,
+int nfs4_sequence_done(struct rpc_task *task,
 			      struct nfs4_sequence_res *res)
 {
 	return nfs40_sequence_done(task, res);
 }
+EXPORT_SYMBOL_GPL(nfs4_sequence_done);
 
 #endif /* !CONFIG_NFS_V4_1 */
 
@@ -937,6 +940,31 @@ static bool nfs4_clear_cap_atomic_open_v1(struct nfs_server *server,
 	return true;
 }
 
+static u32
+nfs4_map_atomic_open_share(struct nfs_server *server,
+		fmode_t fmode, int openflags)
+{
+	u32 res = 0;
+
+	switch (fmode & (FMODE_READ | FMODE_WRITE)) {
+	case FMODE_READ:
+		res = NFS4_SHARE_ACCESS_READ;
+		break;
+	case FMODE_WRITE:
+		res = NFS4_SHARE_ACCESS_WRITE;
+		break;
+	case FMODE_READ|FMODE_WRITE:
+		res = NFS4_SHARE_ACCESS_BOTH;
+	}
+	if (!(server->caps & NFS_CAP_ATOMIC_OPEN_V1))
+		goto out;
+	/* Want no delegation if we're using O_DIRECT */
+	if (openflags & O_DIRECT)
+		res |= NFS4_SHARE_WANT_NO_DELEG;
+out:
+	return res;
+}
+
 static enum open_claim_type4
 nfs4_map_atomic_open_claim(struct nfs_server *server,
 			   enum open_claim_type4 claim)
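The new helper maps the VFS fmode to OPEN share_access bits and, for servers that advertise atomic-open v1 support, adds a no-delegation hint for O_DIRECT opens. A standalone check of that mapping; the constant values below are assumed from RFC 5661 (READ=1, WRITE=2, BOTH=3, WANT_NO_DELEG=0x0400) and are not taken from this patch:

#define _GNU_SOURCE	/* for O_DIRECT */
#include <assert.h>
#include <fcntl.h>
#include <stdint.h>

#define SHARE_ACCESS_READ	0x0001
#define SHARE_ACCESS_WRITE	0x0002
#define SHARE_ACCESS_BOTH	0x0003
#define SHARE_WANT_NO_DELEG	0x0400

#define FMODE_READ	0x1u
#define FMODE_WRITE	0x2u

static uint32_t map_share(unsigned fmode, int openflags, int atomic_open_v1)
{
	uint32_t res = 0;

	switch (fmode & (FMODE_READ | FMODE_WRITE)) {
	case FMODE_READ:
		res = SHARE_ACCESS_READ;
		break;
	case FMODE_WRITE:
		res = SHARE_ACCESS_WRITE;
		break;
	case FMODE_READ | FMODE_WRITE:
		res = SHARE_ACCESS_BOTH;
	}
	if (atomic_open_v1 && (openflags & O_DIRECT))
		res |= SHARE_WANT_NO_DELEG;
	return res;
}

int main(void)
{
	assert(map_share(FMODE_READ, 0, 1) == SHARE_ACCESS_READ);
	assert(map_share(FMODE_READ | FMODE_WRITE, O_DIRECT, 1) ==
	       (SHARE_ACCESS_BOTH | SHARE_WANT_NO_DELEG));
	/* without atomic-open v1 the hint is never added */
	assert(map_share(FMODE_WRITE, O_DIRECT, 0) == SHARE_ACCESS_WRITE);
	return 0;
}
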
@@ -977,6 +1005,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
 	struct dentry *parent = dget_parent(dentry);
 	struct inode *dir = parent->d_inode;
 	struct nfs_server *server = NFS_SERVER(dir);
+	struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
 	struct nfs4_opendata *p;
 
 	p = kzalloc(sizeof(*p), gfp_mask);
@@ -987,8 +1016,9 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
 	if (IS_ERR(p->f_label))
 		goto err_free_p;
 
-	p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid, gfp_mask);
-	if (p->o_arg.seqid == NULL)
+	alloc_seqid = server->nfs_client->cl_mvops->alloc_seqid;
+	p->o_arg.seqid = alloc_seqid(&sp->so_seqid, gfp_mask);
+	if (IS_ERR(p->o_arg.seqid))
 		goto err_free_label;
 	nfs_sb_active(dentry->d_sb);
 	p->dentry = dget(dentry);
@@ -997,6 +1027,8 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
 	atomic_inc(&sp->so_count);
 	p->o_arg.open_flags = flags;
 	p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE);
+	p->o_arg.share_access = nfs4_map_atomic_open_share(server,
+							fmode, flags);
 	/* don't put an ACCESS op in OPEN compound if O_EXCL, because ACCESS
 	 * will return permission denied for all bits until close */
 	if (!(flags & O_EXCL)) {
@@ -1167,6 +1199,16 @@ static bool nfs_need_update_open_stateid(struct nfs4_state *state,
 	return false;
 }
 
+static void nfs_resync_open_stateid_locked(struct nfs4_state *state)
+{
+	if (state->n_wronly)
+		set_bit(NFS_O_WRONLY_STATE, &state->flags);
+	if (state->n_rdonly)
+		set_bit(NFS_O_RDONLY_STATE, &state->flags);
+	if (state->n_rdwr)
+		set_bit(NFS_O_RDWR_STATE, &state->flags);
+}
+
 static void nfs_clear_open_stateid_locked(struct nfs4_state *state,
 		nfs4_stateid *stateid, fmode_t fmode)
 {
@@ -1185,8 +1227,12 @@ static void nfs_clear_open_stateid_locked(struct nfs4_state *state,
 	}
 	if (stateid == NULL)
 		return;
-	if (!nfs_need_update_open_stateid(state, stateid))
+	/* Handle races with OPEN */
+	if (!nfs4_stateid_match_other(stateid, &state->open_stateid) ||
+	    !nfs4_stateid_is_newer(stateid, &state->open_stateid)) {
+		nfs_resync_open_stateid_locked(state);
 		return;
+	}
 	if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
 		nfs4_stateid_copy(&state->stateid, stateid);
 	nfs4_stateid_copy(&state->open_stateid, stateid);
@@ -1281,6 +1327,23 @@ no_delegation:
 	return ret;
 }
 
+static bool nfs4_update_lock_stateid(struct nfs4_lock_state *lsp,
+		const nfs4_stateid *stateid)
+{
+	struct nfs4_state *state = lsp->ls_state;
+	bool ret = false;
+
+	spin_lock(&state->state_lock);
+	if (!nfs4_stateid_match_other(stateid, &lsp->ls_stateid))
+		goto out_noupdate;
+	if (!nfs4_stateid_is_newer(stateid, &lsp->ls_stateid))
+		goto out_noupdate;
+	nfs4_stateid_copy(&lsp->ls_stateid, stateid);
+	ret = true;
+out_noupdate:
+	spin_unlock(&state->state_lock);
+	return ret;
+}
 
 static void nfs4_return_incompatible_delegation(struct inode *inode, fmode_t fmode)
 {
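nfs4_update_lock_stateid() only accepts a stateid whose seqid is strictly newer, so replies that race with one another cannot roll the lock stateid backwards. The newer-than test is assumed to be the usual serial-number comparison modulo 2^32 (nfs4_stateid_is_newer() itself is not shown in this hunk); a standalone rendering:

#include <assert.h>
#include <stdint.h>

static int seqid_is_newer(uint32_t s1, uint32_t s2)
{
	/* signed difference handles wraparound correctly */
	return (int32_t)(s1 - s2) > 0;
}

int main(void)
{
	assert(seqid_is_newer(2, 1));		/* plain increment */
	assert(!seqid_is_newer(1, 2));
	assert(seqid_is_newer(0, UINT32_MAX));	/* survives wraparound */
	return 0;
}
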
@@ -1679,8 +1742,8 @@ static void nfs4_open_confirm_prepare(struct rpc_task *task, void *calldata)
1679{ 1742{
1680 struct nfs4_opendata *data = calldata; 1743 struct nfs4_opendata *data = calldata;
1681 1744
1682 nfs40_setup_sequence(data->o_arg.server, &data->c_arg.seq_args, 1745 nfs40_setup_sequence(data->o_arg.server->nfs_client->cl_slot_tbl,
1683 &data->c_res.seq_res, task); 1746 &data->c_arg.seq_args, &data->c_res.seq_res, task);
1684} 1747}
1685 1748
1686static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata) 1749static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata)
@@ -2587,6 +2650,11 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
2587 case -NFS4ERR_OLD_STATEID: 2650 case -NFS4ERR_OLD_STATEID:
2588 case -NFS4ERR_BAD_STATEID: 2651 case -NFS4ERR_BAD_STATEID:
2589 case -NFS4ERR_EXPIRED: 2652 case -NFS4ERR_EXPIRED:
2653 if (!nfs4_stateid_match(&calldata->arg.stateid,
2654 &state->stateid)) {
2655 rpc_restart_call_prepare(task);
2656 goto out_release;
2657 }
2590 if (calldata->arg.fmode == 0) 2658 if (calldata->arg.fmode == 0)
2591 break; 2659 break;
2592 default: 2660 default:
@@ -2619,6 +2687,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
2619 is_rdwr = test_bit(NFS_O_RDWR_STATE, &state->flags); 2687 is_rdwr = test_bit(NFS_O_RDWR_STATE, &state->flags);
2620 is_rdonly = test_bit(NFS_O_RDONLY_STATE, &state->flags); 2688 is_rdonly = test_bit(NFS_O_RDONLY_STATE, &state->flags);
2621 is_wronly = test_bit(NFS_O_WRONLY_STATE, &state->flags); 2689 is_wronly = test_bit(NFS_O_WRONLY_STATE, &state->flags);
2690 nfs4_stateid_copy(&calldata->arg.stateid, &state->stateid);
2622 /* Calculate the change in open mode */ 2691 /* Calculate the change in open mode */
2623 calldata->arg.fmode = 0; 2692 calldata->arg.fmode = 0;
2624 if (state->n_rdwr == 0) { 2693 if (state->n_rdwr == 0) {
@@ -2653,6 +2722,9 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
2653 goto out_wait; 2722 goto out_wait;
2654 } 2723 }
2655 } 2724 }
2725 calldata->arg.share_access =
2726 nfs4_map_atomic_open_share(NFS_SERVER(inode),
2727 calldata->arg.fmode, 0);
2656 2728
2657 nfs_fattr_init(calldata->res.fattr); 2729 nfs_fattr_init(calldata->res.fattr);
2658 calldata->timestamp = jiffies; 2730 calldata->timestamp = jiffies;
@@ -2675,45 +2747,10 @@ static const struct rpc_call_ops nfs4_close_ops = {
2675 .rpc_release = nfs4_free_closedata, 2747 .rpc_release = nfs4_free_closedata,
2676}; 2748};
2677 2749
2678static bool nfs4_state_has_opener(struct nfs4_state *state)
2679{
2680 /* first check existing openers */
2681 if (test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0 &&
2682 state->n_rdonly != 0)
2683 return true;
2684
2685 if (test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0 &&
2686 state->n_wronly != 0)
2687 return true;
2688
2689 if (test_bit(NFS_O_RDWR_STATE, &state->flags) != 0 &&
2690 state->n_rdwr != 0)
2691 return true;
2692
2693 return false;
2694}
2695
2696static bool nfs4_roc(struct inode *inode) 2750static bool nfs4_roc(struct inode *inode)
2697{ 2751{
2698 struct nfs_inode *nfsi = NFS_I(inode); 2752 if (!nfs_have_layout(inode))
2699 struct nfs_open_context *ctx;
2700 struct nfs4_state *state;
2701
2702 spin_lock(&inode->i_lock);
2703 list_for_each_entry(ctx, &nfsi->open_files, list) {
2704 state = ctx->state;
2705 if (state == NULL)
2706 continue;
2707 if (nfs4_state_has_opener(state)) {
2708 spin_unlock(&inode->i_lock);
2709 return false;
2710 }
2711 }
2712 spin_unlock(&inode->i_lock);
2713
2714 if (nfs4_check_delegation(inode, FMODE_READ))
2715 return false; 2753 return false;
2716
2717 return pnfs_roc(inode); 2754 return pnfs_roc(inode);
2718} 2755}
2719 2756
@@ -2731,6 +2768,7 @@ static bool nfs4_roc(struct inode *inode)
2731int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait) 2768int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait)
2732{ 2769{
2733 struct nfs_server *server = NFS_SERVER(state->inode); 2770 struct nfs_server *server = NFS_SERVER(state->inode);
2771 struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
2734 struct nfs4_closedata *calldata; 2772 struct nfs4_closedata *calldata;
2735 struct nfs4_state_owner *sp = state->owner; 2773 struct nfs4_state_owner *sp = state->owner;
2736 struct rpc_task *task; 2774 struct rpc_task *task;
@@ -2757,10 +2795,10 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait)
2757 calldata->inode = state->inode; 2795 calldata->inode = state->inode;
2758 calldata->state = state; 2796 calldata->state = state;
2759 calldata->arg.fh = NFS_FH(state->inode); 2797 calldata->arg.fh = NFS_FH(state->inode);
2760 calldata->arg.stateid = &state->open_stateid;
2761 /* Serialization for the sequence id */ 2798 /* Serialization for the sequence id */
2762 calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid, gfp_mask); 2799 alloc_seqid = server->nfs_client->cl_mvops->alloc_seqid;
2763 if (calldata->arg.seqid == NULL) 2800 calldata->arg.seqid = alloc_seqid(&state->owner->so_seqid, gfp_mask);
2801 if (IS_ERR(calldata->arg.seqid))
2764 goto out_free_calldata; 2802 goto out_free_calldata;
2765 calldata->arg.fmode = 0; 2803 calldata->arg.fmode = 0;
2766 calldata->arg.bitmask = server->cache_consistency_bitmask; 2804 calldata->arg.bitmask = server->cache_consistency_bitmask;
@@ -5137,9 +5175,13 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
5137static void nfs4_delegreturn_release(void *calldata) 5175static void nfs4_delegreturn_release(void *calldata)
5138{ 5176{
5139 struct nfs4_delegreturndata *data = calldata; 5177 struct nfs4_delegreturndata *data = calldata;
5178 struct inode *inode = data->inode;
5140 5179
5141 if (data->roc) 5180 if (inode) {
5142 pnfs_roc_release(data->inode); 5181 if (data->roc)
5182 pnfs_roc_release(inode);
5183 nfs_iput_and_deactive(inode);
5184 }
5143 kfree(calldata); 5185 kfree(calldata);
5144} 5186}
5145 5187
@@ -5196,9 +5238,9 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
5196 nfs_fattr_init(data->res.fattr); 5238 nfs_fattr_init(data->res.fattr);
5197 data->timestamp = jiffies; 5239 data->timestamp = jiffies;
5198 data->rpc_status = 0; 5240 data->rpc_status = 0;
5199 data->inode = inode; 5241 data->inode = nfs_igrab_and_active(inode);
5200 data->roc = list_empty(&NFS_I(inode)->open_files) ? 5242 if (data->inode)
5201 pnfs_roc(inode) : false; 5243 data->roc = nfs4_roc(inode);
5202 5244
5203 task_setup_data.callback_data = data; 5245 task_setup_data.callback_data = data;
5204 msg.rpc_argp = &data->args; 5246 msg.rpc_argp = &data->args;
@@ -5353,7 +5395,6 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
5353 p->arg.fl = &p->fl; 5395 p->arg.fl = &p->fl;
5354 p->arg.seqid = seqid; 5396 p->arg.seqid = seqid;
5355 p->res.seqid = seqid; 5397 p->res.seqid = seqid;
5356 p->arg.stateid = &lsp->ls_stateid;
5357 p->lsp = lsp; 5398 p->lsp = lsp;
5358 atomic_inc(&lsp->ls_count); 5399 atomic_inc(&lsp->ls_count);
5359 /* Ensure we don't close file until we're done freeing locks! */ 5400 /* Ensure we don't close file until we're done freeing locks! */
@@ -5380,14 +5421,18 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
5380 return; 5421 return;
5381 switch (task->tk_status) { 5422 switch (task->tk_status) {
5382 case 0: 5423 case 0:
5383 nfs4_stateid_copy(&calldata->lsp->ls_stateid,
5384 &calldata->res.stateid);
5385 renew_lease(calldata->server, calldata->timestamp); 5424 renew_lease(calldata->server, calldata->timestamp);
5386 break; 5425 do_vfs_lock(calldata->fl.fl_file, &calldata->fl);
5426 if (nfs4_update_lock_stateid(calldata->lsp,
5427 &calldata->res.stateid))
5428 break;
5387 case -NFS4ERR_BAD_STATEID: 5429 case -NFS4ERR_BAD_STATEID:
5388 case -NFS4ERR_OLD_STATEID: 5430 case -NFS4ERR_OLD_STATEID:
5389 case -NFS4ERR_STALE_STATEID: 5431 case -NFS4ERR_STALE_STATEID:
5390 case -NFS4ERR_EXPIRED: 5432 case -NFS4ERR_EXPIRED:
5433 if (!nfs4_stateid_match(&calldata->arg.stateid,
5434 &calldata->lsp->ls_stateid))
5435 rpc_restart_call_prepare(task);
5391 break; 5436 break;
5392 default: 5437 default:
5393 if (nfs4_async_handle_error(task, calldata->server, 5438 if (nfs4_async_handle_error(task, calldata->server,
@@ -5403,6 +5448,7 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data)
5403 5448
5404 if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) 5449 if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0)
5405 goto out_wait; 5450 goto out_wait;
5451 nfs4_stateid_copy(&calldata->arg.stateid, &calldata->lsp->ls_stateid);
5406 if (test_bit(NFS_LOCK_INITIALIZED, &calldata->lsp->ls_flags) == 0) { 5452 if (test_bit(NFS_LOCK_INITIALIZED, &calldata->lsp->ls_flags) == 0) {
5407 /* Note: exit _without_ running nfs4_locku_done */ 5453 /* Note: exit _without_ running nfs4_locku_done */
5408 goto out_no_action; 5454 goto out_no_action;
@@ -5473,6 +5519,7 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
5473 struct nfs_seqid *seqid; 5519 struct nfs_seqid *seqid;
5474 struct nfs4_lock_state *lsp; 5520 struct nfs4_lock_state *lsp;
5475 struct rpc_task *task; 5521 struct rpc_task *task;
5522 struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
5476 int status = 0; 5523 int status = 0;
5477 unsigned char fl_flags = request->fl_flags; 5524 unsigned char fl_flags = request->fl_flags;
5478 5525
@@ -5496,9 +5543,10 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
5496 lsp = request->fl_u.nfs4_fl.owner; 5543 lsp = request->fl_u.nfs4_fl.owner;
5497 if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) == 0) 5544 if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) == 0)
5498 goto out; 5545 goto out;
5499 seqid = nfs_alloc_seqid(&lsp->ls_seqid, GFP_KERNEL); 5546 alloc_seqid = NFS_SERVER(inode)->nfs_client->cl_mvops->alloc_seqid;
5547 seqid = alloc_seqid(&lsp->ls_seqid, GFP_KERNEL);
5500 status = -ENOMEM; 5548 status = -ENOMEM;
5501 if (seqid == NULL) 5549 if (IS_ERR(seqid))
5502 goto out; 5550 goto out;
5503 task = nfs4_do_unlck(request, nfs_file_open_context(request->fl_file), lsp, seqid); 5551 task = nfs4_do_unlck(request, nfs_file_open_context(request->fl_file), lsp, seqid);
5504 status = PTR_ERR(task); 5552 status = PTR_ERR(task);
@@ -5531,6 +5579,7 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
5531 struct nfs4_lockdata *p; 5579 struct nfs4_lockdata *p;
5532 struct inode *inode = lsp->ls_state->inode; 5580 struct inode *inode = lsp->ls_state->inode;
5533 struct nfs_server *server = NFS_SERVER(inode); 5581 struct nfs_server *server = NFS_SERVER(inode);
5582 struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
5534 5583
5535 p = kzalloc(sizeof(*p), gfp_mask); 5584 p = kzalloc(sizeof(*p), gfp_mask);
5536 if (p == NULL) 5585 if (p == NULL)
@@ -5539,12 +5588,12 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
5539 p->arg.fh = NFS_FH(inode); 5588 p->arg.fh = NFS_FH(inode);
5540 p->arg.fl = &p->fl; 5589 p->arg.fl = &p->fl;
5541 p->arg.open_seqid = nfs_alloc_seqid(&lsp->ls_state->owner->so_seqid, gfp_mask); 5590 p->arg.open_seqid = nfs_alloc_seqid(&lsp->ls_state->owner->so_seqid, gfp_mask);
5542 if (p->arg.open_seqid == NULL) 5591 if (IS_ERR(p->arg.open_seqid))
5543 goto out_free; 5592 goto out_free;
5544 p->arg.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid, gfp_mask); 5593 alloc_seqid = server->nfs_client->cl_mvops->alloc_seqid;
5545 if (p->arg.lock_seqid == NULL) 5594 p->arg.lock_seqid = alloc_seqid(&lsp->ls_seqid, gfp_mask);
5595 if (IS_ERR(p->arg.lock_seqid))
5546 goto out_free_seqid; 5596 goto out_free_seqid;
5547 p->arg.lock_stateid = &lsp->ls_stateid;
5548 p->arg.lock_owner.clientid = server->nfs_client->cl_clientid; 5597 p->arg.lock_owner.clientid = server->nfs_client->cl_clientid;
5549 p->arg.lock_owner.id = lsp->ls_seqid.owner_id; 5598 p->arg.lock_owner.id = lsp->ls_seqid.owner_id;
5550 p->arg.lock_owner.s_dev = server->s_dev; 5599 p->arg.lock_owner.s_dev = server->s_dev;
@@ -5571,15 +5620,19 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
5571 if (nfs_wait_on_sequence(data->arg.lock_seqid, task) != 0) 5620 if (nfs_wait_on_sequence(data->arg.lock_seqid, task) != 0)
5572 goto out_wait; 5621 goto out_wait;
5573 /* Do we need to do an open_to_lock_owner? */ 5622 /* Do we need to do an open_to_lock_owner? */
5574 if (!(data->arg.lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED)) { 5623 if (!test_bit(NFS_LOCK_INITIALIZED, &data->lsp->ls_flags)) {
5575 if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0) { 5624 if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0) {
5576 goto out_release_lock_seqid; 5625 goto out_release_lock_seqid;
5577 } 5626 }
5578 data->arg.open_stateid = &state->open_stateid; 5627 nfs4_stateid_copy(&data->arg.open_stateid,
5628 &state->open_stateid);
5579 data->arg.new_lock_owner = 1; 5629 data->arg.new_lock_owner = 1;
5580 data->res.open_seqid = data->arg.open_seqid; 5630 data->res.open_seqid = data->arg.open_seqid;
5581 } else 5631 } else {
5582 data->arg.new_lock_owner = 0; 5632 data->arg.new_lock_owner = 0;
5633 nfs4_stateid_copy(&data->arg.lock_stateid,
5634 &data->lsp->ls_stateid);
5635 }
5583 if (!nfs4_valid_open_stateid(state)) { 5636 if (!nfs4_valid_open_stateid(state)) {
5584 data->rpc_status = -EBADF; 5637 data->rpc_status = -EBADF;
5585 task->tk_action = NULL; 5638 task->tk_action = NULL;
@@ -5603,6 +5656,7 @@ out_wait:
5603static void nfs4_lock_done(struct rpc_task *task, void *calldata) 5656static void nfs4_lock_done(struct rpc_task *task, void *calldata)
5604{ 5657{
5605 struct nfs4_lockdata *data = calldata; 5658 struct nfs4_lockdata *data = calldata;
5659 struct nfs4_lock_state *lsp = data->lsp;
5606 5660
5607 dprintk("%s: begin!\n", __func__); 5661 dprintk("%s: begin!\n", __func__);
5608 5662
@@ -5610,18 +5664,36 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
5610 return; 5664 return;
5611 5665
5612 data->rpc_status = task->tk_status; 5666 data->rpc_status = task->tk_status;
5613 if (data->arg.new_lock_owner != 0) { 5667 switch (task->tk_status) {
5614 if (data->rpc_status == 0) 5668 case 0:
5615 nfs_confirm_seqid(&data->lsp->ls_seqid, 0); 5669 renew_lease(NFS_SERVER(data->ctx->dentry->d_inode),
5616 else 5670 data->timestamp);
5617 goto out; 5671 if (data->arg.new_lock) {
5618 } 5672 data->fl.fl_flags &= ~(FL_SLEEP | FL_ACCESS);
5619 if (data->rpc_status == 0) { 5673 if (do_vfs_lock(data->fl.fl_file, &data->fl) < 0) {
5620 nfs4_stateid_copy(&data->lsp->ls_stateid, &data->res.stateid); 5674 rpc_restart_call_prepare(task);
5621 set_bit(NFS_LOCK_INITIALIZED, &data->lsp->ls_flags); 5675 break;
5622 renew_lease(NFS_SERVER(data->ctx->dentry->d_inode), data->timestamp); 5676 }
5677 }
5678 if (data->arg.new_lock_owner != 0) {
5679 nfs_confirm_seqid(&lsp->ls_seqid, 0);
5680 nfs4_stateid_copy(&lsp->ls_stateid, &data->res.stateid);
5681 set_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags);
5682 } else if (!nfs4_update_lock_stateid(lsp, &data->res.stateid))
5683 rpc_restart_call_prepare(task);
5684 break;
5685 case -NFS4ERR_BAD_STATEID:
5686 case -NFS4ERR_OLD_STATEID:
5687 case -NFS4ERR_STALE_STATEID:
5688 case -NFS4ERR_EXPIRED:
5689 if (data->arg.new_lock_owner != 0) {
5690 if (!nfs4_stateid_match(&data->arg.open_stateid,
5691 &lsp->ls_state->open_stateid))
5692 rpc_restart_call_prepare(task);
5693 } else if (!nfs4_stateid_match(&data->arg.lock_stateid,
5694 &lsp->ls_stateid))
5695 rpc_restart_call_prepare(task);
5623 } 5696 }
5624out:
5625 dprintk("%s: done, ret = %d!\n", __func__, data->rpc_status); 5697 dprintk("%s: done, ret = %d!\n", __func__, data->rpc_status);
5626} 5698}
5627 5699
@@ -5702,7 +5774,8 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
5702 if (recovery_type == NFS_LOCK_RECLAIM) 5774 if (recovery_type == NFS_LOCK_RECLAIM)
5703 data->arg.reclaim = NFS_LOCK_RECLAIM; 5775 data->arg.reclaim = NFS_LOCK_RECLAIM;
5704 nfs4_set_sequence_privileged(&data->arg.seq_args); 5776 nfs4_set_sequence_privileged(&data->arg.seq_args);
5705 } 5777 } else
5778 data->arg.new_lock = 1;
5706 task = rpc_run_task(&task_setup_data); 5779 task = rpc_run_task(&task_setup_data);
5707 if (IS_ERR(task)) 5780 if (IS_ERR(task))
5708 return PTR_ERR(task); 5781 return PTR_ERR(task);
@@ -5826,10 +5899,8 @@ static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *reques
5826 5899
5827static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) 5900static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
5828{ 5901{
5829 struct nfs4_state_owner *sp = state->owner;
5830 struct nfs_inode *nfsi = NFS_I(state->inode); 5902 struct nfs_inode *nfsi = NFS_I(state->inode);
5831 unsigned char fl_flags = request->fl_flags; 5903 unsigned char fl_flags = request->fl_flags;
5832 unsigned int seq;
5833 int status = -ENOLCK; 5904 int status = -ENOLCK;
5834 5905
5835 if ((fl_flags & FL_POSIX) && 5906 if ((fl_flags & FL_POSIX) &&
@@ -5849,25 +5920,11 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
5849 /* ...but avoid races with delegation recall... */ 5920 /* ...but avoid races with delegation recall... */
5850 request->fl_flags = fl_flags & ~FL_SLEEP; 5921 request->fl_flags = fl_flags & ~FL_SLEEP;
5851 status = do_vfs_lock(request->fl_file, request); 5922 status = do_vfs_lock(request->fl_file, request);
5852 goto out_unlock; 5923 up_read(&nfsi->rwsem);
5853 }
5854 seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
5855 up_read(&nfsi->rwsem);
5856 status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW);
5857 if (status != 0)
5858 goto out; 5924 goto out;
5859 down_read(&nfsi->rwsem);
5860 if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) {
5861 status = -NFS4ERR_DELAY;
5862 goto out_unlock;
5863 } 5925 }
5864 /* Note: we always want to sleep here! */
5865 request->fl_flags = fl_flags | FL_SLEEP;
5866 if (do_vfs_lock(request->fl_file, request) < 0)
5867 printk(KERN_WARNING "NFS: %s: VFS is out of sync with lock "
5868 "manager!\n", __func__);
5869out_unlock:
5870 up_read(&nfsi->rwsem); 5926 up_read(&nfsi->rwsem);
5927 status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW);
5871out: 5928out:
5872 request->fl_flags = fl_flags; 5929 request->fl_flags = fl_flags;
5873 return status; 5930 return status;
@@ -5974,8 +6031,8 @@ static void nfs4_release_lockowner_prepare(struct rpc_task *task, void *calldata
5974{ 6031{
5975 struct nfs_release_lockowner_data *data = calldata; 6032 struct nfs_release_lockowner_data *data = calldata;
5976 struct nfs_server *server = data->server; 6033 struct nfs_server *server = data->server;
5977 nfs40_setup_sequence(server, &data->args.seq_args, 6034 nfs40_setup_sequence(server->nfs_client->cl_slot_tbl,
5978 &data->res.seq_res, task); 6035 &data->args.seq_args, &data->res.seq_res, task);
5979 data->args.lock_owner.clientid = server->nfs_client->cl_clientid; 6036 data->args.lock_owner.clientid = server->nfs_client->cl_clientid;
5980 data->timestamp = jiffies; 6037 data->timestamp = jiffies;
5981} 6038}
@@ -7537,6 +7594,7 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
7537 return; 7594 return;
7538 if (pnfs_choose_layoutget_stateid(&lgp->args.stateid, 7595 if (pnfs_choose_layoutget_stateid(&lgp->args.stateid,
7539 NFS_I(lgp->args.inode)->layout, 7596 NFS_I(lgp->args.inode)->layout,
7597 &lgp->args.range,
7540 lgp->args.ctx->state)) { 7598 lgp->args.ctx->state)) {
7541 rpc_exit(task, NFS4_OK); 7599 rpc_exit(task, NFS4_OK);
7542 } 7600 }
@@ -7792,9 +7850,13 @@ static void nfs4_layoutreturn_release(void *calldata)
7792 spin_lock(&lo->plh_inode->i_lock); 7850 spin_lock(&lo->plh_inode->i_lock);
7793 if (lrp->res.lrs_present) 7851 if (lrp->res.lrs_present)
7794 pnfs_set_layout_stateid(lo, &lrp->res.stateid, true); 7852 pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
7853 pnfs_clear_layoutreturn_waitbit(lo);
7854 clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lo->plh_flags);
7855 rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq);
7795 lo->plh_block_lgets--; 7856 lo->plh_block_lgets--;
7796 spin_unlock(&lo->plh_inode->i_lock); 7857 spin_unlock(&lo->plh_inode->i_lock);
7797 pnfs_put_layout_hdr(lrp->args.layout); 7858 pnfs_put_layout_hdr(lrp->args.layout);
7859 nfs_iput_and_deactive(lrp->inode);
7798 kfree(calldata); 7860 kfree(calldata);
7799 dprintk("<-- %s\n", __func__); 7861 dprintk("<-- %s\n", __func__);
7800} 7862}
@@ -7805,7 +7867,7 @@ static const struct rpc_call_ops nfs4_layoutreturn_call_ops = {
7805 .rpc_release = nfs4_layoutreturn_release, 7867 .rpc_release = nfs4_layoutreturn_release,
7806}; 7868};
7807 7869
7808int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp) 7870int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync)
7809{ 7871{
7810 struct rpc_task *task; 7872 struct rpc_task *task;
7811 struct rpc_message msg = { 7873 struct rpc_message msg = {
@@ -7820,14 +7882,23 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)
7820 .callback_ops = &nfs4_layoutreturn_call_ops, 7882 .callback_ops = &nfs4_layoutreturn_call_ops,
7821 .callback_data = lrp, 7883 .callback_data = lrp,
7822 }; 7884 };
7823 int status; 7885 int status = 0;
7824 7886
7825 dprintk("--> %s\n", __func__); 7887 dprintk("--> %s\n", __func__);
7888 if (!sync) {
7889 lrp->inode = nfs_igrab_and_active(lrp->args.inode);
7890 if (!lrp->inode) {
7891 nfs4_layoutreturn_release(lrp);
7892 return -EAGAIN;
7893 }
7894 task_setup_data.flags |= RPC_TASK_ASYNC;
7895 }
7826 nfs4_init_sequence(&lrp->args.seq_args, &lrp->res.seq_res, 1); 7896 nfs4_init_sequence(&lrp->args.seq_args, &lrp->res.seq_res, 1);
7827 task = rpc_run_task(&task_setup_data); 7897 task = rpc_run_task(&task_setup_data);
7828 if (IS_ERR(task)) 7898 if (IS_ERR(task))
7829 return PTR_ERR(task); 7899 return PTR_ERR(task);
7830 status = task->tk_status; 7900 if (sync)
7901 status = task->tk_status;
7831 trace_nfs4_layoutreturn(lrp->args.inode, status); 7902 trace_nfs4_layoutreturn(lrp->args.inode, status);
7832 dprintk("<-- %s status=%d\n", __func__, status); 7903 dprintk("<-- %s status=%d\n", __func__, status);
7833 rpc_put_task(task); 7904 rpc_put_task(task);
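
Two details in the hunk above are easy to miss: status must now start at 0 because the asynchronous path never assigns it, and an async submission first pins the inode with nfs_igrab_and_active() so that nfs4_layoutreturn_release(), which now calls nfs_iput_and_deactive(), can run safely after the submitter has returned. A stripped-down sketch of the status half (run_op() and task_status are stand-ins, not kernel API):

#include <stdio.h>

/* Only a synchronous caller may read the task's final status; an
 * asynchronous caller returns 0 and the completion callback finishes
 * the work later. */
static int run_op(int sync, int task_status)
{
	int status = 0;	/* the async path falls through with this value */

	if (sync)
		status = task_status;
	return status;
}

int main(void)
{
	printf("sync: %d, async: %d\n", run_op(1, -11), run_op(0, -11));
	return 0;
}
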
@@ -7921,6 +7992,7 @@ static void nfs4_layoutcommit_release(void *calldata)
7921 nfs_post_op_update_inode_force_wcc(data->args.inode, 7992 nfs_post_op_update_inode_force_wcc(data->args.inode,
7922 data->res.fattr); 7993 data->res.fattr);
7923 put_rpccred(data->cred); 7994 put_rpccred(data->cred);
7995 nfs_iput_and_deactive(data->inode);
7924 kfree(data); 7996 kfree(data);
7925} 7997}
7926 7998
@@ -7945,7 +8017,6 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync)
7945 .rpc_message = &msg, 8017 .rpc_message = &msg,
7946 .callback_ops = &nfs4_layoutcommit_ops, 8018 .callback_ops = &nfs4_layoutcommit_ops,
7947 .callback_data = data, 8019 .callback_data = data,
7948 .flags = RPC_TASK_ASYNC,
7949 }; 8020 };
7950 struct rpc_task *task; 8021 struct rpc_task *task;
7951 int status = 0; 8022 int status = 0;
@@ -7956,18 +8027,21 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync)
7956 data->args.lastbytewritten, 8027 data->args.lastbytewritten,
7957 data->args.inode->i_ino); 8028 data->args.inode->i_ino);
7958 8029
8030 if (!sync) {
8031 data->inode = nfs_igrab_and_active(data->args.inode);
8032 if (data->inode == NULL) {
8033 nfs4_layoutcommit_release(data);
8034 return -EAGAIN;
8035 }
8036 task_setup_data.flags = RPC_TASK_ASYNC;
8037 }
7959 nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); 8038 nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
7960 task = rpc_run_task(&task_setup_data); 8039 task = rpc_run_task(&task_setup_data);
7961 if (IS_ERR(task)) 8040 if (IS_ERR(task))
7962 return PTR_ERR(task); 8041 return PTR_ERR(task);
7963 if (sync == false) 8042 if (sync)
7964 goto out; 8043 status = task->tk_status;
7965 status = nfs4_wait_for_completion_rpc_task(task);
7966 if (status != 0)
7967 goto out;
7968 status = task->tk_status;
7969 trace_nfs4_layoutcommit(data->args.inode, status); 8044 trace_nfs4_layoutcommit(data->args.inode, status);
7970out:
7971 dprintk("%s: status %d\n", __func__, status); 8045 dprintk("%s: status %d\n", __func__, status);
7972 rpc_put_task(task); 8046 rpc_put_task(task);
7973 return status; 8047 return status;
@@ -8395,6 +8469,7 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
8395 .match_stateid = nfs4_match_stateid, 8469 .match_stateid = nfs4_match_stateid,
8396 .find_root_sec = nfs4_find_root_sec, 8470 .find_root_sec = nfs4_find_root_sec,
8397 .free_lock_state = nfs4_release_lockowner, 8471 .free_lock_state = nfs4_release_lockowner,
8472 .alloc_seqid = nfs_alloc_seqid,
8398 .call_sync_ops = &nfs40_call_sync_ops, 8473 .call_sync_ops = &nfs40_call_sync_ops,
8399 .reboot_recovery_ops = &nfs40_reboot_recovery_ops, 8474 .reboot_recovery_ops = &nfs40_reboot_recovery_ops,
8400 .nograce_recovery_ops = &nfs40_nograce_recovery_ops, 8475 .nograce_recovery_ops = &nfs40_nograce_recovery_ops,
@@ -8403,6 +8478,12 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
8403}; 8478};
8404 8479
8405#if defined(CONFIG_NFS_V4_1) 8480#if defined(CONFIG_NFS_V4_1)
8481static struct nfs_seqid *
8482nfs_alloc_no_seqid(struct nfs_seqid_counter *arg1, gfp_t arg2)
8483{
8484 return NULL;
8485}
8486
8406static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { 8487static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
8407 .minor_version = 1, 8488 .minor_version = 1,
8408 .init_caps = NFS_CAP_READDIRPLUS 8489 .init_caps = NFS_CAP_READDIRPLUS
@@ -8416,6 +8497,7 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
8416 .match_stateid = nfs41_match_stateid, 8497 .match_stateid = nfs41_match_stateid,
8417 .find_root_sec = nfs41_find_root_sec, 8498 .find_root_sec = nfs41_find_root_sec,
8418 .free_lock_state = nfs41_free_lock_state, 8499 .free_lock_state = nfs41_free_lock_state,
8500 .alloc_seqid = nfs_alloc_no_seqid,
8419 .call_sync_ops = &nfs41_call_sync_ops, 8501 .call_sync_ops = &nfs41_call_sync_ops,
8420 .reboot_recovery_ops = &nfs41_reboot_recovery_ops, 8502 .reboot_recovery_ops = &nfs41_reboot_recovery_ops,
8421 .nograce_recovery_ops = &nfs41_nograce_recovery_ops, 8503 .nograce_recovery_ops = &nfs41_nograce_recovery_ops,
@@ -8442,6 +8524,7 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
8442 .find_root_sec = nfs41_find_root_sec, 8524 .find_root_sec = nfs41_find_root_sec,
8443 .free_lock_state = nfs41_free_lock_state, 8525 .free_lock_state = nfs41_free_lock_state,
8444 .call_sync_ops = &nfs41_call_sync_ops, 8526 .call_sync_ops = &nfs41_call_sync_ops,
8527 .alloc_seqid = nfs_alloc_no_seqid,
8445 .reboot_recovery_ops = &nfs41_reboot_recovery_ops, 8528 .reboot_recovery_ops = &nfs41_reboot_recovery_ops,
8446 .nograce_recovery_ops = &nfs41_nograce_recovery_ops, 8529 .nograce_recovery_ops = &nfs41_nograce_recovery_ops,
8447 .state_renewal_ops = &nfs41_state_renewal_ops, 8530 .state_renewal_ops = &nfs41_state_renewal_ops,
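
All three minor-version tables now carry an .alloc_seqid hook: v4.0 wires in the real nfs_alloc_seqid(), while v4.1 and v4.2 use nfs_alloc_no_seqid(), whose NULL return means "sessions are in use, no seqid is needed". A user-space sketch of that function-pointer dispatch (struct and function names here are illustrative, not the kernel's):

#include <stdio.h>

struct seqid { int counter; };

struct minor_ops {
	struct seqid *(*alloc_seqid)(void);
};

/* v4.0: a real seqid object is required for open/lock serialization */
static struct seqid *alloc_real_seqid(void)
{
	static struct seqid s;
	return &s;
}

/* v4.1+: sessions order operations, so no seqid is allocated at all */
static struct seqid *alloc_no_seqid(void)
{
	return NULL;
}

static const struct minor_ops v40_ops = { .alloc_seqid = alloc_real_seqid };
static const struct minor_ops v41_ops = { .alloc_seqid = alloc_no_seqid };

int main(void)
{
	const struct minor_ops *ops = &v41_ops;	/* chosen at mount time */
	struct seqid *s = ops->alloc_seqid();

	printf("seqid %s\n", s ? "allocated" : "skipped (sessions)");
	return 0;
}
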
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 5194933ed419..5ad908e9ce9c 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1003,11 +1003,11 @@ struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_m
1003 struct nfs_seqid *new; 1003 struct nfs_seqid *new;
1004 1004
1005 new = kmalloc(sizeof(*new), gfp_mask); 1005 new = kmalloc(sizeof(*new), gfp_mask);
1006 if (new != NULL) { 1006 if (new == NULL)
1007 new->sequence = counter; 1007 return ERR_PTR(-ENOMEM);
1008 INIT_LIST_HEAD(&new->list); 1008 new->sequence = counter;
1009 new->task = NULL; 1009 INIT_LIST_HEAD(&new->list);
1010 } 1010 new->task = NULL;
1011 return new; 1011 return new;
1012} 1012}
1013 1013
@@ -1015,7 +1015,7 @@ void nfs_release_seqid(struct nfs_seqid *seqid)
1015{ 1015{
1016 struct nfs_seqid_counter *sequence; 1016 struct nfs_seqid_counter *sequence;
1017 1017
1018 if (list_empty(&seqid->list)) 1018 if (seqid == NULL || list_empty(&seqid->list))
1019 return; 1019 return;
1020 sequence = seqid->sequence; 1020 sequence = seqid->sequence;
1021 spin_lock(&sequence->lock); 1021 spin_lock(&sequence->lock);
@@ -1071,13 +1071,15 @@ static void nfs_increment_seqid(int status, struct nfs_seqid *seqid)
1071 1071
1072void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid) 1072void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid)
1073{ 1073{
1074 struct nfs4_state_owner *sp = container_of(seqid->sequence, 1074 struct nfs4_state_owner *sp;
1075 struct nfs4_state_owner, so_seqid); 1075
1076 struct nfs_server *server = sp->so_server; 1076 if (seqid == NULL)
1077 return;
1077 1078
1079 sp = container_of(seqid->sequence, struct nfs4_state_owner, so_seqid);
1078 if (status == -NFS4ERR_BAD_SEQID) 1080 if (status == -NFS4ERR_BAD_SEQID)
1079 nfs4_drop_state_owner(sp); 1081 nfs4_drop_state_owner(sp);
1080 if (!nfs4_has_session(server->nfs_client)) 1082 if (!nfs4_has_session(sp->so_server->nfs_client))
1081 nfs_increment_seqid(status, seqid); 1083 nfs_increment_seqid(status, seqid);
1082} 1084}
1083 1085
@@ -1088,14 +1090,18 @@ void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid)
1088 */ 1090 */
1089void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid) 1091void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid)
1090{ 1092{
1091 nfs_increment_seqid(status, seqid); 1093 if (seqid != NULL)
1094 nfs_increment_seqid(status, seqid);
1092} 1095}
1093 1096
1094int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task) 1097int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task)
1095{ 1098{
1096 struct nfs_seqid_counter *sequence = seqid->sequence; 1099 struct nfs_seqid_counter *sequence;
1097 int status = 0; 1100 int status = 0;
1098 1101
1102 if (seqid == NULL)
1103 goto out;
1104 sequence = seqid->sequence;
1099 spin_lock(&sequence->lock); 1105 spin_lock(&sequence->lock);
1100 seqid->task = task; 1106 seqid->task = task;
1101 if (list_empty(&seqid->list)) 1107 if (list_empty(&seqid->list))
@@ -1106,6 +1112,7 @@ int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task)
1106 status = -EAGAIN; 1112 status = -EAGAIN;
1107unlock: 1113unlock:
1108 spin_unlock(&sequence->lock); 1114 spin_unlock(&sequence->lock);
1115out:
1109 return status; 1116 return status;
1110} 1117}
1111 1118
@@ -1366,49 +1373,55 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
1366 struct nfs_inode *nfsi = NFS_I(inode); 1373 struct nfs_inode *nfsi = NFS_I(inode);
1367 struct file_lock *fl; 1374 struct file_lock *fl;
1368 int status = 0; 1375 int status = 0;
1376 struct file_lock_context *flctx = inode->i_flctx;
1377 struct list_head *list;
1369 1378
1370 if (inode->i_flock == NULL) 1379 if (flctx == NULL)
1371 return 0; 1380 return 0;
1372 1381
1382 list = &flctx->flc_posix;
1383
1373 /* Guard against delegation returns and new lock/unlock calls */ 1384 /* Guard against delegation returns and new lock/unlock calls */
1374 down_write(&nfsi->rwsem); 1385 down_write(&nfsi->rwsem);
1375 /* Protect inode->i_flock using the BKL */ 1386 spin_lock(&flctx->flc_lock);
1376 spin_lock(&inode->i_lock); 1387restart:
1377 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { 1388 list_for_each_entry(fl, list, fl_list) {
1378 if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
1379 continue;
1380 if (nfs_file_open_context(fl->fl_file)->state != state) 1389 if (nfs_file_open_context(fl->fl_file)->state != state)
1381 continue; 1390 continue;
1382 spin_unlock(&inode->i_lock); 1391 spin_unlock(&flctx->flc_lock);
1383 status = ops->recover_lock(state, fl); 1392 status = ops->recover_lock(state, fl);
1384 switch (status) { 1393 switch (status) {
1385 case 0: 1394 case 0:
1386 break; 1395 break;
1387 case -ESTALE: 1396 case -ESTALE:
1388 case -NFS4ERR_ADMIN_REVOKED: 1397 case -NFS4ERR_ADMIN_REVOKED:
1389 case -NFS4ERR_STALE_STATEID: 1398 case -NFS4ERR_STALE_STATEID:
1390 case -NFS4ERR_BAD_STATEID: 1399 case -NFS4ERR_BAD_STATEID:
1391 case -NFS4ERR_EXPIRED: 1400 case -NFS4ERR_EXPIRED:
1392 case -NFS4ERR_NO_GRACE: 1401 case -NFS4ERR_NO_GRACE:
1393 case -NFS4ERR_STALE_CLIENTID: 1402 case -NFS4ERR_STALE_CLIENTID:
1394 case -NFS4ERR_BADSESSION: 1403 case -NFS4ERR_BADSESSION:
1395 case -NFS4ERR_BADSLOT: 1404 case -NFS4ERR_BADSLOT:
1396 case -NFS4ERR_BAD_HIGH_SLOT: 1405 case -NFS4ERR_BAD_HIGH_SLOT:
1397 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: 1406 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
1398 goto out; 1407 goto out;
1399 default: 1408 default:
1400 printk(KERN_ERR "NFS: %s: unhandled error %d\n", 1409 pr_err("NFS: %s: unhandled error %d\n",
1401 __func__, status); 1410 __func__, status);
1402 case -ENOMEM: 1411 case -ENOMEM:
1403 case -NFS4ERR_DENIED: 1412 case -NFS4ERR_DENIED:
1404 case -NFS4ERR_RECLAIM_BAD: 1413 case -NFS4ERR_RECLAIM_BAD:
1405 case -NFS4ERR_RECLAIM_CONFLICT: 1414 case -NFS4ERR_RECLAIM_CONFLICT:
1406 /* kill_proc(fl->fl_pid, SIGLOST, 1); */ 1415 /* kill_proc(fl->fl_pid, SIGLOST, 1); */
1407 status = 0; 1416 status = 0;
1408 } 1417 }
1409 spin_lock(&inode->i_lock); 1418 spin_lock(&flctx->flc_lock);
1410 } 1419 }
1411 spin_unlock(&inode->i_lock); 1420 if (list == &flctx->flc_posix) {
1421 list = &flctx->flc_flock;
1422 goto restart;
1423 }
1424 spin_unlock(&flctx->flc_lock);
1412out: 1425out:
1413 up_write(&nfsi->rwsem); 1426 up_write(&nfsi->rwsem);
1414 return status; 1427 return status;
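
The reclaim walk above replaces the old single i_flock chain with the two lists hanging off struct file_lock_context, and covers both by restarting the same loop body: POSIX locks first, flock locks second, all under flc_lock (dropped and retaken around each recover_lock() call). A compact sketch of that restart idiom (plain arrays stand in for the locked list_heads):

#include <stdio.h>

int main(void)
{
	const char *posix[] = { "P1", "P2", NULL };
	const char *flock[] = { "F1", NULL };
	const char **list = posix;

restart:
	for (const char **fl = list; *fl != NULL; fl++)
		printf("recover %s\n", *fl);
	if (list == posix) {	/* finished POSIX: rerun over flock */
		list = flock;
		goto restart;
	}
	return 0;
}
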
diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c
index 6f340f02f2ba..75090feeafad 100644
--- a/fs/nfs/nfs4super.c
+++ b/fs/nfs/nfs4super.c
@@ -53,7 +53,6 @@ static const struct super_operations nfs4_sops = {
53 .destroy_inode = nfs_destroy_inode, 53 .destroy_inode = nfs_destroy_inode,
54 .write_inode = nfs4_write_inode, 54 .write_inode = nfs4_write_inode,
55 .drop_inode = nfs_drop_inode, 55 .drop_inode = nfs_drop_inode,
56 .put_super = nfs_put_super,
57 .statfs = nfs_statfs, 56 .statfs = nfs_statfs,
58 .evict_inode = nfs4_evict_inode, 57 .evict_inode = nfs4_evict_inode,
59 .umount_begin = nfs_umount_begin, 58 .umount_begin = nfs_umount_begin,
@@ -346,6 +345,9 @@ out:
346 345
347static void __exit exit_nfs_v4(void) 346static void __exit exit_nfs_v4(void)
348{ 347{
348 /* Not called in the _init(), conditionally loaded */
349 nfs4_pnfs_v3_ds_connect_unload();
350
349 unregister_nfs_version(&nfs_v4); 351 unregister_nfs_version(&nfs_v4);
350 nfs4_unregister_sysctl(); 352 nfs4_unregister_sysctl();
351 nfs_idmap_quit(); 353 nfs_idmap_quit();
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index cb4376b78ed9..e23a0a664e12 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -946,7 +946,10 @@ static void encode_uint64(struct xdr_stream *xdr, u64 n)
946static void encode_nfs4_seqid(struct xdr_stream *xdr, 946static void encode_nfs4_seqid(struct xdr_stream *xdr,
947 const struct nfs_seqid *seqid) 947 const struct nfs_seqid *seqid)
948{ 948{
949 encode_uint32(xdr, seqid->sequence->counter); 949 if (seqid != NULL)
950 encode_uint32(xdr, seqid->sequence->counter);
951 else
952 encode_uint32(xdr, 0);
950} 953}
951 954
952static void encode_compound_hdr(struct xdr_stream *xdr, 955static void encode_compound_hdr(struct xdr_stream *xdr,
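
Making encode_nfs4_seqid() NULL-tolerant is the XDR half of the nfs_alloc_no_seqid() design introduced earlier in this patch: when NFSv4.1 passes no seqid, a zero goes on the wire, which a session-based server ignores anyway. A sketch of the degrade-to-zero encode (encode_seqid() is a toy, not the kernel function):

#include <stdint.h>
#include <stdio.h>

struct seqid { uint32_t counter; };

/* NULL means "v4.1, sessions in use": encode 0, the server ignores it */
static void encode_seqid(const struct seqid *seqid)
{
	uint32_t wire = (seqid != NULL) ? seqid->counter : 0;

	printf("encoded %u\n", wire);
}

int main(void)
{
	struct seqid s = { .counter = 7 };

	encode_seqid(&s);	/* v4.0 path: real counter */
	encode_seqid(NULL);	/* v4.1 path: zero placeholder */
	return 0;
}
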
@@ -1125,7 +1128,7 @@ static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg
1125{ 1128{
1126 encode_op_hdr(xdr, OP_CLOSE, decode_close_maxsz, hdr); 1129 encode_op_hdr(xdr, OP_CLOSE, decode_close_maxsz, hdr);
1127 encode_nfs4_seqid(xdr, arg->seqid); 1130 encode_nfs4_seqid(xdr, arg->seqid);
1128 encode_nfs4_stateid(xdr, arg->stateid); 1131 encode_nfs4_stateid(xdr, &arg->stateid);
1129} 1132}
1130 1133
1131static void encode_commit(struct xdr_stream *xdr, const struct nfs_commitargs *args, struct compound_hdr *hdr) 1134static void encode_commit(struct xdr_stream *xdr, const struct nfs_commitargs *args, struct compound_hdr *hdr)
@@ -1301,12 +1304,12 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args
1301 *p = cpu_to_be32(args->new_lock_owner); 1304 *p = cpu_to_be32(args->new_lock_owner);
1302 if (args->new_lock_owner){ 1305 if (args->new_lock_owner){
1303 encode_nfs4_seqid(xdr, args->open_seqid); 1306 encode_nfs4_seqid(xdr, args->open_seqid);
1304 encode_nfs4_stateid(xdr, args->open_stateid); 1307 encode_nfs4_stateid(xdr, &args->open_stateid);
1305 encode_nfs4_seqid(xdr, args->lock_seqid); 1308 encode_nfs4_seqid(xdr, args->lock_seqid);
1306 encode_lockowner(xdr, &args->lock_owner); 1309 encode_lockowner(xdr, &args->lock_owner);
1307 } 1310 }
1308 else { 1311 else {
1309 encode_nfs4_stateid(xdr, args->lock_stateid); 1312 encode_nfs4_stateid(xdr, &args->lock_stateid);
1310 encode_nfs4_seqid(xdr, args->lock_seqid); 1313 encode_nfs4_seqid(xdr, args->lock_seqid);
1311 } 1314 }
1312} 1315}
@@ -1330,7 +1333,7 @@ static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *ar
1330 encode_op_hdr(xdr, OP_LOCKU, decode_locku_maxsz, hdr); 1333 encode_op_hdr(xdr, OP_LOCKU, decode_locku_maxsz, hdr);
1331 encode_uint32(xdr, nfs4_lock_type(args->fl, 0)); 1334 encode_uint32(xdr, nfs4_lock_type(args->fl, 0));
1332 encode_nfs4_seqid(xdr, args->seqid); 1335 encode_nfs4_seqid(xdr, args->seqid);
1333 encode_nfs4_stateid(xdr, args->stateid); 1336 encode_nfs4_stateid(xdr, &args->stateid);
1334 p = reserve_space(xdr, 16); 1337 p = reserve_space(xdr, 16);
1335 p = xdr_encode_hyper(p, args->fl->fl_start); 1338 p = xdr_encode_hyper(p, args->fl->fl_start);
1336 xdr_encode_hyper(p, nfs4_lock_length(args->fl)); 1339 xdr_encode_hyper(p, nfs4_lock_length(args->fl));
@@ -1348,24 +1351,12 @@ static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struc
1348 encode_string(xdr, name->len, name->name); 1351 encode_string(xdr, name->len, name->name);
1349} 1352}
1350 1353
1351static void encode_share_access(struct xdr_stream *xdr, fmode_t fmode) 1354static void encode_share_access(struct xdr_stream *xdr, u32 share_access)
1352{ 1355{
1353 __be32 *p; 1356 __be32 *p;
1354 1357
1355 p = reserve_space(xdr, 8); 1358 p = reserve_space(xdr, 8);
1356 switch (fmode & (FMODE_READ|FMODE_WRITE)) { 1359 *p++ = cpu_to_be32(share_access);
1357 case FMODE_READ:
1358 *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_READ);
1359 break;
1360 case FMODE_WRITE:
1361 *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_WRITE);
1362 break;
1363 case FMODE_READ|FMODE_WRITE:
1364 *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_BOTH);
1365 break;
1366 default:
1367 *p++ = cpu_to_be32(0);
1368 }
1369 *p = cpu_to_be32(0); /* for linux, share_deny = 0 always */ 1360 *p = cpu_to_be32(0); /* for linux, share_deny = 0 always */
1370} 1361}
1371 1362
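
With the switch gone, encode_share_access() writes whatever u32 it is handed, so the fmode-to-share-access translation must now happen once in the callers before encoding (note the arg->fmode parameters becoming arg->share_access below). A hypothetical caller-side helper preserving the removed mapping (the NFS4_SHARE_ACCESS_* values match RFC 5661; the helper name is illustrative):

#include <stdio.h>

#define FMODE_READ		0x1u
#define FMODE_WRITE		0x2u
#define NFS4_SHARE_ACCESS_READ	1u
#define NFS4_SHARE_ACCESS_WRITE	2u
#define NFS4_SHARE_ACCESS_BOTH	3u

static unsigned int fmode_to_share_access(unsigned int fmode)
{
	switch (fmode & (FMODE_READ | FMODE_WRITE)) {
	case FMODE_READ:
		return NFS4_SHARE_ACCESS_READ;
	case FMODE_WRITE:
		return NFS4_SHARE_ACCESS_WRITE;
	case FMODE_READ | FMODE_WRITE:
		return NFS4_SHARE_ACCESS_BOTH;
	}
	return 0;
}

int main(void)
{
	printf("%u\n", fmode_to_share_access(FMODE_READ | FMODE_WRITE));
	return 0;
}
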
@@ -1377,7 +1368,7 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
1377 * owner 4 = 32 1368 * owner 4 = 32
1378 */ 1369 */
1379 encode_nfs4_seqid(xdr, arg->seqid); 1370 encode_nfs4_seqid(xdr, arg->seqid);
1380 encode_share_access(xdr, arg->fmode); 1371 encode_share_access(xdr, arg->share_access);
1381 p = reserve_space(xdr, 36); 1372 p = reserve_space(xdr, 36);
1382 p = xdr_encode_hyper(p, arg->clientid); 1373 p = xdr_encode_hyper(p, arg->clientid);
1383 *p++ = cpu_to_be32(24); 1374 *p++ = cpu_to_be32(24);
@@ -1530,9 +1521,9 @@ static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_co
1530static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr) 1521static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr)
1531{ 1522{
1532 encode_op_hdr(xdr, OP_OPEN_DOWNGRADE, decode_open_downgrade_maxsz, hdr); 1523 encode_op_hdr(xdr, OP_OPEN_DOWNGRADE, decode_open_downgrade_maxsz, hdr);
1533 encode_nfs4_stateid(xdr, arg->stateid); 1524 encode_nfs4_stateid(xdr, &arg->stateid);
1534 encode_nfs4_seqid(xdr, arg->seqid); 1525 encode_nfs4_seqid(xdr, arg->seqid);
1535 encode_share_access(xdr, arg->fmode); 1526 encode_share_access(xdr, arg->share_access);
1536} 1527}
1537 1528
1538static void 1529static void
@@ -1801,9 +1792,8 @@ static void encode_create_session(struct xdr_stream *xdr,
1801 struct compound_hdr *hdr) 1792 struct compound_hdr *hdr)
1802{ 1793{
1803 __be32 *p; 1794 __be32 *p;
1804 char machine_name[NFS4_MAX_MACHINE_NAME_LEN];
1805 uint32_t len;
1806 struct nfs_client *clp = args->client; 1795 struct nfs_client *clp = args->client;
1796 struct rpc_clnt *clnt = clp->cl_rpcclient;
1807 struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); 1797 struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
1808 u32 max_resp_sz_cached; 1798 u32 max_resp_sz_cached;
1809 1799
@@ -1814,11 +1804,8 @@ static void encode_create_session(struct xdr_stream *xdr,
1814 max_resp_sz_cached = (NFS4_dec_open_sz + RPC_REPHDRSIZE + 1804 max_resp_sz_cached = (NFS4_dec_open_sz + RPC_REPHDRSIZE +
1815 RPC_MAX_AUTH_SIZE + 2) * XDR_UNIT; 1805 RPC_MAX_AUTH_SIZE + 2) * XDR_UNIT;
1816 1806
1817 len = scnprintf(machine_name, sizeof(machine_name), "%s",
1818 clp->cl_ipaddr);
1819
1820 encode_op_hdr(xdr, OP_CREATE_SESSION, decode_create_session_maxsz, hdr); 1807 encode_op_hdr(xdr, OP_CREATE_SESSION, decode_create_session_maxsz, hdr);
1821 p = reserve_space(xdr, 16 + 2*28 + 20 + len + 12); 1808 p = reserve_space(xdr, 16 + 2*28 + 20 + clnt->cl_nodelen + 12);
1822 p = xdr_encode_hyper(p, clp->cl_clientid); 1809 p = xdr_encode_hyper(p, clp->cl_clientid);
1823 *p++ = cpu_to_be32(clp->cl_seqid); /*Sequence id */ 1810 *p++ = cpu_to_be32(clp->cl_seqid); /*Sequence id */
1824 *p++ = cpu_to_be32(args->flags); /*flags */ 1811 *p++ = cpu_to_be32(args->flags); /*flags */
@@ -1847,7 +1834,7 @@ static void encode_create_session(struct xdr_stream *xdr,
1847 1834
1848 /* authsys_parms rfc1831 */ 1835 /* authsys_parms rfc1831 */
1849 *p++ = cpu_to_be32(nn->boot_time.tv_nsec); /* stamp */ 1836 *p++ = cpu_to_be32(nn->boot_time.tv_nsec); /* stamp */
1850 p = xdr_encode_opaque(p, machine_name, len); 1837 p = xdr_encode_array(p, clnt->cl_nodename, clnt->cl_nodelen);
1851 *p++ = cpu_to_be32(0); /* UID */ 1838 *p++ = cpu_to_be32(0); /* UID */
1852 *p++ = cpu_to_be32(0); /* GID */ 1839 *p++ = cpu_to_be32(0); /* GID */
1853 *p = cpu_to_be32(0); /* No more gids */ 1840 *p = cpu_to_be32(0); /* No more gids */
@@ -2012,11 +1999,11 @@ encode_layoutreturn(struct xdr_stream *xdr,
2012 p = reserve_space(xdr, 16); 1999 p = reserve_space(xdr, 16);
2013 *p++ = cpu_to_be32(0); /* reclaim. always 0 for now */ 2000 *p++ = cpu_to_be32(0); /* reclaim. always 0 for now */
2014 *p++ = cpu_to_be32(args->layout_type); 2001 *p++ = cpu_to_be32(args->layout_type);
2015 *p++ = cpu_to_be32(IOMODE_ANY); 2002 *p++ = cpu_to_be32(args->range.iomode);
2016 *p = cpu_to_be32(RETURN_FILE); 2003 *p = cpu_to_be32(RETURN_FILE);
2017 p = reserve_space(xdr, 16); 2004 p = reserve_space(xdr, 16);
2018 p = xdr_encode_hyper(p, 0); 2005 p = xdr_encode_hyper(p, args->range.offset);
2019 p = xdr_encode_hyper(p, NFS4_MAX_UINT64); 2006 p = xdr_encode_hyper(p, args->range.length);
2020 spin_lock(&args->inode->i_lock); 2007 spin_lock(&args->inode->i_lock);
2021 encode_nfs4_stateid(xdr, &args->stateid); 2008 encode_nfs4_stateid(xdr, &args->stateid);
2022 spin_unlock(&args->inode->i_lock); 2009 spin_unlock(&args->inode->i_lock);
@@ -4936,20 +4923,13 @@ out_overflow:
4936 return -EIO; 4923 return -EIO;
4937} 4924}
4938 4925
4939static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res) 4926static int decode_rw_delegation(struct xdr_stream *xdr,
4927 uint32_t delegation_type,
4928 struct nfs_openres *res)
4940{ 4929{
4941 __be32 *p; 4930 __be32 *p;
4942 uint32_t delegation_type;
4943 int status; 4931 int status;
4944 4932
4945 p = xdr_inline_decode(xdr, 4);
4946 if (unlikely(!p))
4947 goto out_overflow;
4948 delegation_type = be32_to_cpup(p);
4949 if (delegation_type == NFS4_OPEN_DELEGATE_NONE) {
4950 res->delegation_type = 0;
4951 return 0;
4952 }
4953 status = decode_stateid(xdr, &res->delegation); 4933 status = decode_stateid(xdr, &res->delegation);
4954 if (unlikely(status)) 4934 if (unlikely(status))
4955 return status; 4935 return status;
@@ -4973,6 +4953,52 @@ out_overflow:
4973 return -EIO; 4953 return -EIO;
4974} 4954}
4975 4955
4956static int decode_no_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
4957{
4958 __be32 *p;
4959 uint32_t why_no_delegation;
4960
4961 p = xdr_inline_decode(xdr, 4);
4962 if (unlikely(!p))
4963 goto out_overflow;
4964 why_no_delegation = be32_to_cpup(p);
4965 switch (why_no_delegation) {
4966 case WND4_CONTENTION:
4967 case WND4_RESOURCE:
4968 xdr_inline_decode(xdr, 4);
4969 /* Ignore for now */
4970 }
4971 return 0;
4972out_overflow:
4973 print_overflow_msg(__func__, xdr);
4974 return -EIO;
4975}
4976
4977static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
4978{
4979 __be32 *p;
4980 uint32_t delegation_type;
4981
4982 p = xdr_inline_decode(xdr, 4);
4983 if (unlikely(!p))
4984 goto out_overflow;
4985 delegation_type = be32_to_cpup(p);
4986 res->delegation_type = 0;
4987 switch (delegation_type) {
4988 case NFS4_OPEN_DELEGATE_NONE:
4989 return 0;
4990 case NFS4_OPEN_DELEGATE_READ:
4991 case NFS4_OPEN_DELEGATE_WRITE:
4992 return decode_rw_delegation(xdr, delegation_type, res);
4993 case NFS4_OPEN_DELEGATE_NONE_EXT:
4994 return decode_no_delegation(xdr, res);
4995 }
4996 return -EIO;
4997out_overflow:
4998 print_overflow_msg(__func__, xdr);
4999 return -EIO;
5000}
5001
4976static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) 5002static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
4977{ 5003{
4978 __be32 *p; 5004 __be32 *p;
@@ -6567,6 +6593,7 @@ static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
6567 int status; 6593 int status;
6568 6594
6569 status = decode_compound_hdr(xdr, &hdr); 6595 status = decode_compound_hdr(xdr, &hdr);
6596 res->op_status = hdr.status;
6570 if (status) 6597 if (status)
6571 goto out; 6598 goto out;
6572 status = decode_sequence(xdr, &res->seq_res, rqstp); 6599 status = decode_sequence(xdr, &res->seq_res, rqstp);
@@ -6592,6 +6619,7 @@ static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
6592 int status; 6619 int status;
6593 6620
6594 status = decode_compound_hdr(xdr, &hdr); 6621 status = decode_compound_hdr(xdr, &hdr);
6622 res->op_status = hdr.status;
6595 if (status) 6623 if (status)
6596 goto out; 6624 goto out;
6597 status = decode_sequence(xdr, &res->seq_res, rqstp); 6625 status = decode_sequence(xdr, &res->seq_res, rqstp);
@@ -6621,6 +6649,7 @@ static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
6621 int status; 6649 int status;
6622 6650
6623 status = decode_compound_hdr(xdr, &hdr); 6651 status = decode_compound_hdr(xdr, &hdr);
6652 res->op_status = hdr.status;
6624 if (status) 6653 if (status)
6625 goto out; 6654 goto out;
6626 status = decode_sequence(xdr, &res->seq_res, rqstp); 6655 status = decode_sequence(xdr, &res->seq_res, rqstp);
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index cd3c910d2d12..9bc9f04fb7f6 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -261,11 +261,11 @@ static int __init root_nfs_data(char *cmdline)
261 */ 261 */
262 len = snprintf(nfs_export_path, sizeof(nfs_export_path), 262 len = snprintf(nfs_export_path, sizeof(nfs_export_path),
263 tmp, utsname()->nodename); 263 tmp, utsname()->nodename);
264 if (len > (int)sizeof(nfs_export_path)) 264 if (len >= (int)sizeof(nfs_export_path))
265 goto out_devnametoolong; 265 goto out_devnametoolong;
266 len = snprintf(nfs_root_device, sizeof(nfs_root_device), 266 len = snprintf(nfs_root_device, sizeof(nfs_root_device),
267 "%pI4:%s", &servaddr, nfs_export_path); 267 "%pI4:%s", &servaddr, nfs_export_path);
268 if (len > (int)sizeof(nfs_root_device)) 268 if (len >= (int)sizeof(nfs_root_device))
269 goto out_devnametoolong; 269 goto out_devnametoolong;
270 270
271 retval = 0; 271 retval = 0;
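
The switch from > to >= is a real off-by-one fix: snprintf() returns the length the formatted string would have had without truncation, so a return value exactly equal to the buffer size already means the final character was dropped to make room for the terminating NUL, a case the old > comparison silently accepted. A self-contained demonstration:

#include <stdio.h>

int main(void)
{
	char buf[8];
	/* 8 characters into an 8-byte buffer: snprintf() returns 8,
	 * but only "exactly" (7 chars + NUL) was actually stored */
	int len = snprintf(buf, sizeof(buf), "%s", "exactly8");

	if (len >= (int)sizeof(buf))
		printf("truncated: wanted %d bytes, kept \"%s\"\n", len, buf);
	return 0;
}
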
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 9e5bc42180e4..24e1d7403c0b 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -537,11 +537,12 @@ int objio_write_pagelist(struct nfs_pgio_header *hdr, int how)
537static size_t objio_pg_test(struct nfs_pageio_descriptor *pgio, 537static size_t objio_pg_test(struct nfs_pageio_descriptor *pgio,
538 struct nfs_page *prev, struct nfs_page *req) 538 struct nfs_page *prev, struct nfs_page *req)
539{ 539{
540 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(pgio);
540 unsigned int size; 541 unsigned int size;
541 542
542 size = pnfs_generic_pg_test(pgio, prev, req); 543 size = pnfs_generic_pg_test(pgio, prev, req);
543 544
544 if (!size || pgio->pg_count + req->wb_bytes > 545 if (!size || mirror->pg_count + req->wb_bytes >
545 (unsigned long)pgio->pg_layout_private) 546 (unsigned long)pgio->pg_layout_private)
546 return 0; 547 return 0;
547 548
@@ -607,12 +608,14 @@ static const struct nfs_pageio_ops objio_pg_read_ops = {
607 .pg_init = objio_init_read, 608 .pg_init = objio_init_read,
608 .pg_test = objio_pg_test, 609 .pg_test = objio_pg_test,
609 .pg_doio = pnfs_generic_pg_readpages, 610 .pg_doio = pnfs_generic_pg_readpages,
611 .pg_cleanup = pnfs_generic_pg_cleanup,
610}; 612};
611 613
612static const struct nfs_pageio_ops objio_pg_write_ops = { 614static const struct nfs_pageio_ops objio_pg_write_ops = {
613 .pg_init = objio_init_write, 615 .pg_init = objio_init_write,
614 .pg_test = objio_pg_test, 616 .pg_test = objio_pg_test,
615 .pg_doio = pnfs_generic_pg_writepages, 617 .pg_doio = pnfs_generic_pg_writepages,
618 .pg_cleanup = pnfs_generic_pg_cleanup,
616}; 619};
617 620
618static struct pnfs_layoutdriver_type objlayout_type = { 621static struct pnfs_layoutdriver_type objlayout_type = {
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 2b5e769beb16..d57190a0d533 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -42,21 +42,35 @@ static bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount)
42 return p->pagevec != NULL; 42 return p->pagevec != NULL;
43} 43}
44 44
45struct nfs_pgio_mirror *
46nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc)
47{
48 return nfs_pgio_has_mirroring(desc) ?
49 &desc->pg_mirrors[desc->pg_mirror_idx] :
50 &desc->pg_mirrors[0];
51}
52EXPORT_SYMBOL_GPL(nfs_pgio_current_mirror);
53
45void nfs_pgheader_init(struct nfs_pageio_descriptor *desc, 54void nfs_pgheader_init(struct nfs_pageio_descriptor *desc,
46 struct nfs_pgio_header *hdr, 55 struct nfs_pgio_header *hdr,
47 void (*release)(struct nfs_pgio_header *hdr)) 56 void (*release)(struct nfs_pgio_header *hdr))
48{ 57{
49 hdr->req = nfs_list_entry(desc->pg_list.next); 58 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
59
60
61 hdr->req = nfs_list_entry(mirror->pg_list.next);
50 hdr->inode = desc->pg_inode; 62 hdr->inode = desc->pg_inode;
51 hdr->cred = hdr->req->wb_context->cred; 63 hdr->cred = hdr->req->wb_context->cred;
52 hdr->io_start = req_offset(hdr->req); 64 hdr->io_start = req_offset(hdr->req);
53 hdr->good_bytes = desc->pg_count; 65 hdr->good_bytes = mirror->pg_count;
54 hdr->dreq = desc->pg_dreq; 66 hdr->dreq = desc->pg_dreq;
55 hdr->layout_private = desc->pg_layout_private; 67 hdr->layout_private = desc->pg_layout_private;
56 hdr->release = release; 68 hdr->release = release;
57 hdr->completion_ops = desc->pg_completion_ops; 69 hdr->completion_ops = desc->pg_completion_ops;
58 if (hdr->completion_ops->init_hdr) 70 if (hdr->completion_ops->init_hdr)
59 hdr->completion_ops->init_hdr(hdr); 71 hdr->completion_ops->init_hdr(hdr);
72
73 hdr->pgio_mirror_idx = desc->pg_mirror_idx;
60} 74}
61EXPORT_SYMBOL_GPL(nfs_pgheader_init); 75EXPORT_SYMBOL_GPL(nfs_pgheader_init);
62 76
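
nfs_pgio_current_mirror() is the accessor the rest of the pagelist.c changes lean on: pg_count, pg_list, pg_base and friends move from the descriptor into a pg_mirrors[] array, and every former desc->pg_* access is rewritten as mirror->pg_* through this helper, which falls back to slot 0 when mirroring is off. A simplified model (struct and field names are illustrative):

#include <stdio.h>

struct mirror { unsigned long pg_count; };

struct desc {
	struct mirror pg_mirrors[4];
	unsigned int pg_mirror_count;
	unsigned int pg_mirror_idx;
};

static int has_mirroring(const struct desc *d)
{
	return d->pg_mirror_count > 1;
}

/* mirrored I/O uses the currently selected slot, plain I/O uses slot 0 */
static struct mirror *current_mirror(struct desc *d)
{
	return has_mirroring(d) ? &d->pg_mirrors[d->pg_mirror_idx]
				: &d->pg_mirrors[0];
}

int main(void)
{
	struct desc d = { .pg_mirror_count = 2, .pg_mirror_idx = 1 };

	current_mirror(&d)->pg_count += 4096;
	printf("mirror 1 holds %lu bytes\n", d.pg_mirrors[1].pg_count);
	return 0;
}
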
@@ -480,7 +494,10 @@ nfs_wait_on_request(struct nfs_page *req)
480size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, 494size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
481 struct nfs_page *prev, struct nfs_page *req) 495 struct nfs_page *prev, struct nfs_page *req)
482{ 496{
483 if (desc->pg_count > desc->pg_bsize) { 497 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
498
499
500 if (mirror->pg_count > mirror->pg_bsize) {
484 /* should never happen */ 501 /* should never happen */
485 WARN_ON_ONCE(1); 502 WARN_ON_ONCE(1);
486 return 0; 503 return 0;
@@ -490,11 +507,11 @@ size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
490 * Limit the request size so that we can still allocate a page array 507 * Limit the request size so that we can still allocate a page array
491 * for it without upsetting the slab allocator. 508 * for it without upsetting the slab allocator.
492 */ 509 */
493 if (((desc->pg_count + req->wb_bytes) >> PAGE_SHIFT) * 510 if (((mirror->pg_count + req->wb_bytes) >> PAGE_SHIFT) *
494 sizeof(struct page) > PAGE_SIZE) 511 sizeof(struct page) > PAGE_SIZE)
495 return 0; 512 return 0;
496 513
497 return min(desc->pg_bsize - desc->pg_count, (size_t)req->wb_bytes); 514 return min(mirror->pg_bsize - mirror->pg_count, (size_t)req->wb_bytes);
498} 515}
499EXPORT_SYMBOL_GPL(nfs_generic_pg_test); 516EXPORT_SYMBOL_GPL(nfs_generic_pg_test);
500 517
@@ -597,13 +614,14 @@ static void nfs_pgio_prepare(struct rpc_task *task, void *calldata)
597} 614}
598 615
599int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr, 616int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
617 struct rpc_cred *cred, const struct nfs_rpc_ops *rpc_ops,
600 const struct rpc_call_ops *call_ops, int how, int flags) 618 const struct rpc_call_ops *call_ops, int how, int flags)
601{ 619{
602 struct rpc_task *task; 620 struct rpc_task *task;
603 struct rpc_message msg = { 621 struct rpc_message msg = {
604 .rpc_argp = &hdr->args, 622 .rpc_argp = &hdr->args,
605 .rpc_resp = &hdr->res, 623 .rpc_resp = &hdr->res,
606 .rpc_cred = hdr->cred, 624 .rpc_cred = cred,
607 }; 625 };
608 struct rpc_task_setup task_setup_data = { 626 struct rpc_task_setup task_setup_data = {
609 .rpc_client = clnt, 627 .rpc_client = clnt,
@@ -616,7 +634,7 @@ int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
616 }; 634 };
617 int ret = 0; 635 int ret = 0;
618 636
619 hdr->rw_ops->rw_initiate(hdr, &msg, &task_setup_data, how); 637 hdr->rw_ops->rw_initiate(hdr, &msg, rpc_ops, &task_setup_data, how);
620 638
621 dprintk("NFS: %5u initiated pgio call " 639 dprintk("NFS: %5u initiated pgio call "
622 "(req %s/%llu, %u bytes @ offset %llu)\n", 640 "(req %s/%llu, %u bytes @ offset %llu)\n",
@@ -650,10 +668,18 @@ EXPORT_SYMBOL_GPL(nfs_initiate_pgio);
650static int nfs_pgio_error(struct nfs_pageio_descriptor *desc, 668static int nfs_pgio_error(struct nfs_pageio_descriptor *desc,
651 struct nfs_pgio_header *hdr) 669 struct nfs_pgio_header *hdr)
652{ 670{
671 struct nfs_pgio_mirror *mirror;
672 u32 midx;
673
653 set_bit(NFS_IOHDR_REDO, &hdr->flags); 674 set_bit(NFS_IOHDR_REDO, &hdr->flags);
654 nfs_pgio_data_destroy(hdr); 675 nfs_pgio_data_destroy(hdr);
655 hdr->completion_ops->completion(hdr); 676 hdr->completion_ops->completion(hdr);
656 desc->pg_completion_ops->error_cleanup(&desc->pg_list); 677 /* TODO: Make sure it's right to clean up all mirrors here
678 * and not just hdr->pgio_mirror_idx */
679 for (midx = 0; midx < desc->pg_mirror_count; midx++) {
680 mirror = &desc->pg_mirrors[midx];
681 desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
682 }
657 return -ENOMEM; 683 return -ENOMEM;
658} 684}
659 685
@@ -670,6 +696,17 @@ static void nfs_pgio_release(void *calldata)
670 hdr->completion_ops->completion(hdr); 696 hdr->completion_ops->completion(hdr);
671} 697}
672 698
699static void nfs_pageio_mirror_init(struct nfs_pgio_mirror *mirror,
700 unsigned int bsize)
701{
702 INIT_LIST_HEAD(&mirror->pg_list);
703 mirror->pg_bytes_written = 0;
704 mirror->pg_count = 0;
705 mirror->pg_bsize = bsize;
706 mirror->pg_base = 0;
707 mirror->pg_recoalesce = 0;
708}
709
673/** 710/**
674 * nfs_pageio_init - initialise a page io descriptor 711 * nfs_pageio_init - initialise a page io descriptor
675 * @desc: pointer to descriptor 712 * @desc: pointer to descriptor
@@ -686,13 +723,10 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
686 size_t bsize, 723 size_t bsize,
687 int io_flags) 724 int io_flags)
688{ 725{
689 INIT_LIST_HEAD(&desc->pg_list); 726 struct nfs_pgio_mirror *new;
690 desc->pg_bytes_written = 0; 727 int i;
691 desc->pg_count = 0; 728
692 desc->pg_bsize = bsize;
693 desc->pg_base = 0;
694 desc->pg_moreio = 0; 729 desc->pg_moreio = 0;
695 desc->pg_recoalesce = 0;
696 desc->pg_inode = inode; 730 desc->pg_inode = inode;
697 desc->pg_ops = pg_ops; 731 desc->pg_ops = pg_ops;
698 desc->pg_completion_ops = compl_ops; 732 desc->pg_completion_ops = compl_ops;
@@ -702,6 +736,26 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
702 desc->pg_lseg = NULL; 736 desc->pg_lseg = NULL;
703 desc->pg_dreq = NULL; 737 desc->pg_dreq = NULL;
704 desc->pg_layout_private = NULL; 738 desc->pg_layout_private = NULL;
739 desc->pg_bsize = bsize;
740
741 desc->pg_mirror_count = 1;
742 desc->pg_mirror_idx = 0;
743
744 if (pg_ops->pg_get_mirror_count) {
745 /* until we have a request, we don't have an lseg and no
746 * idea how many mirrors there will be */
747 new = kcalloc(NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX,
748 sizeof(struct nfs_pgio_mirror), GFP_KERNEL);
749 desc->pg_mirrors_dynamic = new;
750 desc->pg_mirrors = new;
751
752 for (i = 0; i < NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX; i++)
753 nfs_pageio_mirror_init(&desc->pg_mirrors[i], bsize);
754 } else {
755 desc->pg_mirrors_dynamic = NULL;
756 desc->pg_mirrors = desc->pg_mirrors_static;
757 nfs_pageio_mirror_init(&desc->pg_mirrors[0], bsize);
758 }
705} 759}
706EXPORT_SYMBOL_GPL(nfs_pageio_init); 760EXPORT_SYMBOL_GPL(nfs_pageio_init);
707 761
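
nfs_pageio_init() now chooses between two storages for the mirror array: layout drivers that implement pg_get_mirror_count get a heap allocation sized for the maximum (the real count is unknown until the first request arrives), everyone else points pg_mirrors at the embedded single slot. A sketch of that static-versus-dynamic choice (MIRROR_MAX and the field names are illustrative):

#include <stdio.h>
#include <stdlib.h>

#define MIRROR_MAX 4

struct mirror { unsigned long pg_count; };

struct desc {
	struct mirror *pg_mirrors;
	struct mirror pg_mirrors_static[1];
	struct mirror *pg_mirrors_dynamic;
};

static int desc_init(struct desc *d, int may_mirror)
{
	if (may_mirror) {
		/* size for the worst case; the count is learned later */
		d->pg_mirrors_dynamic =
			calloc(MIRROR_MAX, sizeof(struct mirror));
		if (d->pg_mirrors_dynamic == NULL)
			return -1;
		d->pg_mirrors = d->pg_mirrors_dynamic;
	} else {
		d->pg_mirrors_dynamic = NULL;
		d->pg_mirrors = d->pg_mirrors_static;
	}
	return 0;
}

int main(void)
{
	struct desc d;

	if (desc_init(&d, 1) == 0) {
		printf("using %s storage\n",
		       d.pg_mirrors_dynamic ? "dynamic" : "static");
		free(d.pg_mirrors_dynamic);
	}
	return 0;
}
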
@@ -737,14 +791,16 @@ static void nfs_pgio_result(struct rpc_task *task, void *calldata)
737int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, 791int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
738 struct nfs_pgio_header *hdr) 792 struct nfs_pgio_header *hdr)
739{ 793{
794 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
795
740 struct nfs_page *req; 796 struct nfs_page *req;
741 struct page **pages, 797 struct page **pages,
742 *last_page; 798 *last_page;
743 struct list_head *head = &desc->pg_list; 799 struct list_head *head = &mirror->pg_list;
744 struct nfs_commit_info cinfo; 800 struct nfs_commit_info cinfo;
745 unsigned int pagecount, pageused; 801 unsigned int pagecount, pageused;
746 802
747 pagecount = nfs_page_array_len(desc->pg_base, desc->pg_count); 803 pagecount = nfs_page_array_len(mirror->pg_base, mirror->pg_count);
748 if (!nfs_pgarray_set(&hdr->page_array, pagecount)) 804 if (!nfs_pgarray_set(&hdr->page_array, pagecount))
749 return nfs_pgio_error(desc, hdr); 805 return nfs_pgio_error(desc, hdr);
750 806
@@ -772,7 +828,7 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
772 desc->pg_ioflags &= ~FLUSH_COND_STABLE; 828 desc->pg_ioflags &= ~FLUSH_COND_STABLE;
773 829
774 /* Set up the argument struct */ 830 /* Set up the argument struct */
775 nfs_pgio_rpcsetup(hdr, desc->pg_count, 0, desc->pg_ioflags, &cinfo); 831 nfs_pgio_rpcsetup(hdr, mirror->pg_count, 0, desc->pg_ioflags, &cinfo);
776 desc->pg_rpc_callops = &nfs_pgio_common_ops; 832 desc->pg_rpc_callops = &nfs_pgio_common_ops;
777 return 0; 833 return 0;
778} 834}
@@ -780,23 +836,74 @@ EXPORT_SYMBOL_GPL(nfs_generic_pgio);
780 836
781static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc) 837static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc)
782{ 838{
839 struct nfs_pgio_mirror *mirror;
783 struct nfs_pgio_header *hdr; 840 struct nfs_pgio_header *hdr;
784 int ret; 841 int ret;
785 842
843 mirror = nfs_pgio_current_mirror(desc);
844
786 hdr = nfs_pgio_header_alloc(desc->pg_rw_ops); 845 hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
787 if (!hdr) { 846 if (!hdr) {
788 desc->pg_completion_ops->error_cleanup(&desc->pg_list); 847 /* TODO: make sure this is right with mirroring - or
848 * should it back out all mirrors? */
849 desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
789 return -ENOMEM; 850 return -ENOMEM;
790 } 851 }
791 nfs_pgheader_init(desc, hdr, nfs_pgio_header_free); 852 nfs_pgheader_init(desc, hdr, nfs_pgio_header_free);
792 ret = nfs_generic_pgio(desc, hdr); 853 ret = nfs_generic_pgio(desc, hdr);
793 if (ret == 0) 854 if (ret == 0)
794 ret = nfs_initiate_pgio(NFS_CLIENT(hdr->inode), 855 ret = nfs_initiate_pgio(NFS_CLIENT(hdr->inode),
795 hdr, desc->pg_rpc_callops, 856 hdr,
857 hdr->cred,
858 NFS_PROTO(hdr->inode),
859 desc->pg_rpc_callops,
796 desc->pg_ioflags, 0); 860 desc->pg_ioflags, 0);
797 return ret; 861 return ret;
798} 862}
799 863
864/*
865 * nfs_pageio_setup_mirroring - determine if mirroring is to be used
866 * by calling the pg_get_mirror_count op
867 */
868static int nfs_pageio_setup_mirroring(struct nfs_pageio_descriptor *pgio,
869 struct nfs_page *req)
870{
871 int mirror_count = 1;
872
873 if (!pgio->pg_ops->pg_get_mirror_count)
874 return 0;
875
876 mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req);
877
878 if (!mirror_count || mirror_count > NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX)
879 return -EINVAL;
880
881 if (WARN_ON_ONCE(!pgio->pg_mirrors_dynamic))
882 return -EINVAL;
883
884 pgio->pg_mirror_count = mirror_count;
885
886 return 0;
887}
888
889/*
890 * nfs_pageio_stop_mirroring - stop using mirroring (set mirror count to 1)
891 */
892void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio)
893{
894 pgio->pg_mirror_count = 1;
895 pgio->pg_mirror_idx = 0;
896}
897
898static void nfs_pageio_cleanup_mirroring(struct nfs_pageio_descriptor *pgio)
899{
900 pgio->pg_mirror_count = 1;
901 pgio->pg_mirror_idx = 0;
902 pgio->pg_mirrors = pgio->pg_mirrors_static;
903 kfree(pgio->pg_mirrors_dynamic);
904 pgio->pg_mirrors_dynamic = NULL;
905}
906
800static bool nfs_match_open_context(const struct nfs_open_context *ctx1, 907static bool nfs_match_open_context(const struct nfs_open_context *ctx1,
801 const struct nfs_open_context *ctx2) 908 const struct nfs_open_context *ctx2)
802{ 909{
@@ -826,11 +933,15 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev,
826 struct nfs_pageio_descriptor *pgio) 933 struct nfs_pageio_descriptor *pgio)
827{ 934{
828 size_t size; 935 size_t size;
936 struct file_lock_context *flctx;
829 937
830 if (prev) { 938 if (prev) {
831 if (!nfs_match_open_context(req->wb_context, prev->wb_context)) 939 if (!nfs_match_open_context(req->wb_context, prev->wb_context))
832 return false; 940 return false;
833 if (req->wb_context->dentry->d_inode->i_flock != NULL && 941 flctx = req->wb_context->dentry->d_inode->i_flctx;
942 if (flctx != NULL &&
943 !(list_empty_careful(&flctx->flc_posix) &&
944 list_empty_careful(&flctx->flc_flock)) &&
834 !nfs_match_lock_context(req->wb_lock_context, 945 !nfs_match_lock_context(req->wb_lock_context,
835 prev->wb_lock_context)) 946 prev->wb_lock_context))
836 return false; 947 return false;
@@ -863,19 +974,22 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev,
863static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc, 974static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
864 struct nfs_page *req) 975 struct nfs_page *req)
865{ 976{
977 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
978
866 struct nfs_page *prev = NULL; 979 struct nfs_page *prev = NULL;
867 if (desc->pg_count != 0) { 980
868 prev = nfs_list_entry(desc->pg_list.prev); 981 if (mirror->pg_count != 0) {
982 prev = nfs_list_entry(mirror->pg_list.prev);
869 } else { 983 } else {
870 if (desc->pg_ops->pg_init) 984 if (desc->pg_ops->pg_init)
871 desc->pg_ops->pg_init(desc, req); 985 desc->pg_ops->pg_init(desc, req);
872 desc->pg_base = req->wb_pgbase; 986 mirror->pg_base = req->wb_pgbase;
873 } 987 }
874 if (!nfs_can_coalesce_requests(prev, req, desc)) 988 if (!nfs_can_coalesce_requests(prev, req, desc))
875 return 0; 989 return 0;
876 nfs_list_remove_request(req); 990 nfs_list_remove_request(req);
877 nfs_list_add_request(req, &desc->pg_list); 991 nfs_list_add_request(req, &mirror->pg_list);
878 desc->pg_count += req->wb_bytes; 992 mirror->pg_count += req->wb_bytes;
879 return 1; 993 return 1;
880} 994}
881 995
@@ -884,16 +998,19 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
884 */ 998 */
885static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc) 999static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
886{ 1000{
887 if (!list_empty(&desc->pg_list)) { 1001 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
1002
1003
1004 if (!list_empty(&mirror->pg_list)) {
888 int error = desc->pg_ops->pg_doio(desc); 1005 int error = desc->pg_ops->pg_doio(desc);
889 if (error < 0) 1006 if (error < 0)
890 desc->pg_error = error; 1007 desc->pg_error = error;
891 else 1008 else
892 desc->pg_bytes_written += desc->pg_count; 1009 mirror->pg_bytes_written += mirror->pg_count;
893 } 1010 }
894 if (list_empty(&desc->pg_list)) { 1011 if (list_empty(&mirror->pg_list)) {
895 desc->pg_count = 0; 1012 mirror->pg_count = 0;
896 desc->pg_base = 0; 1013 mirror->pg_base = 0;
897 } 1014 }
898} 1015}
899 1016
@@ -911,6 +1028,8 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
911static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, 1028static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
912 struct nfs_page *req) 1029 struct nfs_page *req)
913{ 1030{
1031 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
1032
914 struct nfs_page *subreq; 1033 struct nfs_page *subreq;
915 unsigned int bytes_left = 0; 1034 unsigned int bytes_left = 0;
916 unsigned int offset, pgbase; 1035 unsigned int offset, pgbase;
@@ -934,7 +1053,7 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
934 nfs_pageio_doio(desc); 1053 nfs_pageio_doio(desc);
935 if (desc->pg_error < 0) 1054 if (desc->pg_error < 0)
936 return 0; 1055 return 0;
937 if (desc->pg_recoalesce) 1056 if (mirror->pg_recoalesce)
938 return 0; 1057 return 0;
939 /* retry add_request for this subreq */ 1058 /* retry add_request for this subreq */
940 nfs_page_group_lock(req, false); 1059 nfs_page_group_lock(req, false);
@@ -972,14 +1091,16 @@ err_ptr:
972 1091
973static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc) 1092static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc)
974{ 1093{
1094 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
975 LIST_HEAD(head); 1095 LIST_HEAD(head);
976 1096
977 do { 1097 do {
978 list_splice_init(&desc->pg_list, &head); 1098 list_splice_init(&mirror->pg_list, &head);
979 desc->pg_bytes_written -= desc->pg_count; 1099 mirror->pg_bytes_written -= mirror->pg_count;
980 desc->pg_count = 0; 1100 mirror->pg_count = 0;
981 desc->pg_base = 0; 1101 mirror->pg_base = 0;
982 desc->pg_recoalesce = 0; 1102 mirror->pg_recoalesce = 0;
1103
983 desc->pg_moreio = 0; 1104 desc->pg_moreio = 0;
984 1105
985 while (!list_empty(&head)) { 1106 while (!list_empty(&head)) {
@@ -993,11 +1114,11 @@ static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc)
993 return 0; 1114 return 0;
994 break; 1115 break;
995 } 1116 }
996 } while (desc->pg_recoalesce); 1117 } while (mirror->pg_recoalesce);
997 return 1; 1118 return 1;
998} 1119}
999 1120
1000int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, 1121static int nfs_pageio_add_request_mirror(struct nfs_pageio_descriptor *desc,
1001 struct nfs_page *req) 1122 struct nfs_page *req)
1002{ 1123{
1003 int ret; 1124 int ret;
@@ -1010,9 +1131,80 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
1010 break; 1131 break;
1011 ret = nfs_do_recoalesce(desc); 1132 ret = nfs_do_recoalesce(desc);
1012 } while (ret); 1133 } while (ret);
1134
1013 return ret; 1135 return ret;
1014} 1136}
1015 1137
1138int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
1139 struct nfs_page *req)
1140{
1141 u32 midx;
1142 unsigned int pgbase, offset, bytes;
1143 struct nfs_page *dupreq, *lastreq;
1144
1145 pgbase = req->wb_pgbase;
1146 offset = req->wb_offset;
1147 bytes = req->wb_bytes;
1148
1149 nfs_pageio_setup_mirroring(desc, req);
1150
1151 for (midx = 0; midx < desc->pg_mirror_count; midx++) {
1152 if (midx) {
1153 nfs_page_group_lock(req, false);
1154
1155 /* find the last request */
1156 for (lastreq = req->wb_head;
1157 lastreq->wb_this_page != req->wb_head;
1158 lastreq = lastreq->wb_this_page)
1159 ;
1160
1161 dupreq = nfs_create_request(req->wb_context,
1162 req->wb_page, lastreq, pgbase, bytes);
1163
1164 if (IS_ERR(dupreq)) {
1165 nfs_page_group_unlock(req);
1166 return 0;
1167 }
1168
1169 nfs_lock_request(dupreq);
1170 nfs_page_group_unlock(req);
1171 dupreq->wb_offset = offset;
1172 dupreq->wb_index = req->wb_index;
1173 } else
1174 dupreq = req;
1175
1176 if (nfs_pgio_has_mirroring(desc))
1177 desc->pg_mirror_idx = midx;
1178 if (!nfs_pageio_add_request_mirror(desc, dupreq))
1179 return 0;
1180 }
1181
1182 return 1;
1183}
1184
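The new nfs_pageio_add_request() above fans each request out to every mirror: mirror 0 queues the original nfs_page, and each further mirror queues a duplicate created via nfs_create_request(). A rough userspace model of the fan-out loop (the request type and add helper are stand-ins, not the kernel API):

/* userspace sketch, not kernel code */
#include <stdio.h>

#define MIRROR_COUNT 3

struct req { long offset; int bytes; };

static int add_request_mirror(int midx, const struct req *r)
{
	printf("mirror %d: queued offset=%ld bytes=%d\n",
	       midx, r->offset, r->bytes);
	return 1;	/* 0 would mean "caller must bail out" */
}

int main(void)
{
	struct req orig = { 4096, 512 };

	for (int midx = 0; midx < MIRROR_COUNT; midx++) {
		struct req dup = orig;	/* plays nfs_create_request() */
		if (!add_request_mirror(midx, midx ? &dup : &orig))
			return 1;	/* abort on first failure, as above */
	}
	return 0;
}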
1185/*
1186 * nfs_pageio_complete_mirror - Complete I/O on the current mirror of an
1187 * nfs_pageio_descriptor
1188 * @desc: pointer to io descriptor
1189 */
1190static void nfs_pageio_complete_mirror(struct nfs_pageio_descriptor *desc,
1191 u32 mirror_idx)
1192{
1193 struct nfs_pgio_mirror *mirror = &desc->pg_mirrors[mirror_idx];
1194 u32 restore_idx = desc->pg_mirror_idx;
1195
1196 if (nfs_pgio_has_mirroring(desc))
1197 desc->pg_mirror_idx = mirror_idx;
1198 for (;;) {
1199 nfs_pageio_doio(desc);
1200 if (!mirror->pg_recoalesce)
1201 break;
1202 if (!nfs_do_recoalesce(desc))
1203 break;
1204 }
1205 desc->pg_mirror_idx = restore_idx;
1206}
1207
1016/* 1208/*
1017 * nfs_pageio_resend - Transfer requests to new descriptor and resend 1209 * nfs_pageio_resend - Transfer requests to new descriptor and resend
1018 * @hdr - the pgio header to move request from 1210 * @hdr - the pgio header to move request from
@@ -1046,18 +1238,19 @@ int nfs_pageio_resend(struct nfs_pageio_descriptor *desc,
1046EXPORT_SYMBOL_GPL(nfs_pageio_resend); 1238EXPORT_SYMBOL_GPL(nfs_pageio_resend);
1047 1239
1048/** 1240/**
1049 * nfs_pageio_complete - Complete I/O on an nfs_pageio_descriptor
 1241 * nfs_pageio_complete - Complete I/O then cleanup an nfs_pageio_descriptor
1050 * @desc: pointer to io descriptor 1242 * @desc: pointer to io descriptor
1051 */ 1243 */
1052void nfs_pageio_complete(struct nfs_pageio_descriptor *desc) 1244void nfs_pageio_complete(struct nfs_pageio_descriptor *desc)
1053{ 1245{
1054	for (;;) {
1055		nfs_pageio_doio(desc);
1056		if (!desc->pg_recoalesce)
1057			break;
1058		if (!nfs_do_recoalesce(desc))
1059			break;
1060	}
 1246	u32 midx;
 1247
 1248	for (midx = 0; midx < desc->pg_mirror_count; midx++)
 1249		nfs_pageio_complete_mirror(desc, midx);
 1250
 1251	if (desc->pg_ops->pg_cleanup)
 1252		desc->pg_ops->pg_cleanup(desc);
1253 nfs_pageio_cleanup_mirroring(desc);
1061} 1254}
1062 1255
1063/** 1256/**
@@ -1073,10 +1266,17 @@ void nfs_pageio_complete(struct nfs_pageio_descriptor *desc)
1073 */ 1266 */
1074void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index) 1267void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index)
1075{ 1268{
1076	if (!list_empty(&desc->pg_list)) {
1077		struct nfs_page *prev = nfs_list_entry(desc->pg_list.prev);
1078		if (index != prev->wb_index + 1)
1079			nfs_pageio_complete(desc);
 1269	struct nfs_pgio_mirror *mirror;
 1270	struct nfs_page *prev;
 1271	u32 midx;
 1272
1273 for (midx = 0; midx < desc->pg_mirror_count; midx++) {
1274 mirror = &desc->pg_mirrors[midx];
1275 if (!list_empty(&mirror->pg_list)) {
1276 prev = nfs_list_entry(mirror->pg_list.prev);
1277 if (index != prev->wb_index + 1)
1278 nfs_pageio_complete_mirror(desc, midx);
1279 }
1080 } 1280 }
1081} 1281}
1082 1282
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 0a5dda4d85c2..4f802b02fbb9 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -34,6 +34,7 @@
34#include "pnfs.h" 34#include "pnfs.h"
35#include "iostat.h" 35#include "iostat.h"
36#include "nfs4trace.h" 36#include "nfs4trace.h"
37#include "delegation.h"
37 38
38#define NFSDBG_FACILITY NFSDBG_PNFS 39#define NFSDBG_FACILITY NFSDBG_PNFS
39#define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ) 40#define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ)
@@ -50,6 +51,10 @@ static DEFINE_SPINLOCK(pnfs_spinlock);
50 */ 51 */
51static LIST_HEAD(pnfs_modules_tbl); 52static LIST_HEAD(pnfs_modules_tbl);
52 53
54static int
55pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid,
56 enum pnfs_iomode iomode, bool sync);
57
53/* Return the registered pnfs layout driver module matching given id */ 58/* Return the registered pnfs layout driver module matching given id */
54static struct pnfs_layoutdriver_type * 59static struct pnfs_layoutdriver_type *
55find_pnfs_driver_locked(u32 id) 60find_pnfs_driver_locked(u32 id)
@@ -238,6 +243,8 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
238 struct inode *inode = lo->plh_inode; 243 struct inode *inode = lo->plh_inode;
239 244
240 if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) { 245 if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
246 if (!list_empty(&lo->plh_segs))
247 WARN_ONCE(1, "NFS: BUG unfreed layout segments.\n");
241 pnfs_detach_layout_hdr(lo); 248 pnfs_detach_layout_hdr(lo);
242 spin_unlock(&inode->i_lock); 249 spin_unlock(&inode->i_lock);
243 pnfs_free_layout_hdr(lo); 250 pnfs_free_layout_hdr(lo);
@@ -337,6 +344,48 @@ pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo,
337 rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq); 344 rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq);
338} 345}
339 346
347/* Return true if layoutreturn is needed */
348static bool
349pnfs_layout_need_return(struct pnfs_layout_hdr *lo,
350 struct pnfs_layout_segment *lseg)
351{
352 struct pnfs_layout_segment *s;
353
354 if (!test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
355 return false;
356
357 list_for_each_entry(s, &lo->plh_segs, pls_list)
358 if (s != lseg && test_bit(NFS_LSEG_LAYOUTRETURN, &s->pls_flags))
359 return false;
360
361 return true;
362}
363
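pnfs_layout_need_return() above fires only when the segment being dropped carries NFS_LSEG_LAYOUTRETURN and no other segment in the layout still does, i.e. the last flagged segment is what triggers the LAYOUTRETURN. A small sketch of that predicate (illustrative types only):

/* userspace sketch, not kernel code */
#include <stdbool.h>
#include <stdio.h>

struct lseg { bool layoutreturn; };

static bool need_return(const struct lseg *segs, int n, int dropping)
{
	if (!segs[dropping].layoutreturn)
		return false;
	for (int i = 0; i < n; i++)
		if (i != dropping && segs[i].layoutreturn)
			return false;	/* another flagged segment still live */
	return true;
}

int main(void)
{
	struct lseg segs[3] = { {true}, {false}, {true} };

	printf("%d\n", need_return(segs, 3, 0));	/* 0: seg 2 still flagged */
	segs[2].layoutreturn = false;
	printf("%d\n", need_return(segs, 3, 0));	/* 1: last flagged segment */
	return 0;
}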
364static void pnfs_layoutreturn_before_put_lseg(struct pnfs_layout_segment *lseg,
365 struct pnfs_layout_hdr *lo, struct inode *inode)
366{
367 lo = lseg->pls_layout;
368 inode = lo->plh_inode;
369
370 spin_lock(&inode->i_lock);
371 if (pnfs_layout_need_return(lo, lseg)) {
372 nfs4_stateid stateid;
373 enum pnfs_iomode iomode;
374
375 stateid = lo->plh_stateid;
376 iomode = lo->plh_return_iomode;
377 /* decreased in pnfs_send_layoutreturn() */
378 lo->plh_block_lgets++;
379 lo->plh_return_iomode = 0;
380 spin_unlock(&inode->i_lock);
381 pnfs_get_layout_hdr(lo);
382
 383		/* Send an async layoutreturn so we don't deadlock */
384 pnfs_send_layoutreturn(lo, stateid, iomode, false);
385 } else
386 spin_unlock(&inode->i_lock);
387}
388
340void 389void
341pnfs_put_lseg(struct pnfs_layout_segment *lseg) 390pnfs_put_lseg(struct pnfs_layout_segment *lseg)
342{ 391{
@@ -349,8 +398,17 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg)
349 dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, 398 dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
350 atomic_read(&lseg->pls_refcount), 399 atomic_read(&lseg->pls_refcount),
351 test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); 400 test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
401
402 /* Handle the case where refcount != 1 */
403 if (atomic_add_unless(&lseg->pls_refcount, -1, 1))
404 return;
405
352 lo = lseg->pls_layout; 406 lo = lseg->pls_layout;
353 inode = lo->plh_inode; 407 inode = lo->plh_inode;
408 /* Do we need a layoutreturn? */
409 if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
410 pnfs_layoutreturn_before_put_lseg(lseg, lo, inode);
411
354 if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) { 412 if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) {
355 pnfs_get_layout_hdr(lo); 413 pnfs_get_layout_hdr(lo);
356 pnfs_layout_remove_lseg(lo, lseg); 414 pnfs_layout_remove_lseg(lo, lseg);
@@ -543,6 +601,7 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
543 pnfs_get_layout_hdr(lo); 601 pnfs_get_layout_hdr(lo);
544 pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED); 602 pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED);
545 pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED); 603 pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED);
604 pnfs_clear_retry_layoutget(lo);
546 spin_unlock(&nfsi->vfs_inode.i_lock); 605 spin_unlock(&nfsi->vfs_inode.i_lock);
547 pnfs_free_lseg_list(&tmp_list); 606 pnfs_free_lseg_list(&tmp_list);
548 pnfs_put_layout_hdr(lo); 607 pnfs_put_layout_hdr(lo);
@@ -740,25 +799,37 @@ pnfs_layout_stateid_blocked(const struct pnfs_layout_hdr *lo,
740 return !pnfs_seqid_is_newer(seqid, lo->plh_barrier); 799 return !pnfs_seqid_is_newer(seqid, lo->plh_barrier);
741} 800}
742 801
802static bool
803pnfs_layout_returning(const struct pnfs_layout_hdr *lo,
804 struct pnfs_layout_range *range)
805{
806 return test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags) &&
807 (lo->plh_return_iomode == IOMODE_ANY ||
808 lo->plh_return_iomode == range->iomode);
809}
810
743/* lget is set to 1 if called from inside send_layoutget call chain */ 811/* lget is set to 1 if called from inside send_layoutget call chain */
744static bool 812static bool
745pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo, int lget)
 813pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo,
 814			struct pnfs_layout_range *range, int lget)
746{ 815{
747 return lo->plh_block_lgets || 816 return lo->plh_block_lgets ||
748 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || 817 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
749 (list_empty(&lo->plh_segs) && 818 (list_empty(&lo->plh_segs) &&
750		(atomic_read(&lo->plh_outstanding) > lget));
 819		(atomic_read(&lo->plh_outstanding) > lget)) ||
820 pnfs_layout_returning(lo, range);
751} 821}
752 822
753int 823int
754pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, 824pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
825 struct pnfs_layout_range *range,
755 struct nfs4_state *open_state) 826 struct nfs4_state *open_state)
756{ 827{
757 int status = 0; 828 int status = 0;
758 829
759 dprintk("--> %s\n", __func__); 830 dprintk("--> %s\n", __func__);
760 spin_lock(&lo->plh_inode->i_lock); 831 spin_lock(&lo->plh_inode->i_lock);
761	if (pnfs_layoutgets_blocked(lo, 1)) {
 832	if (pnfs_layoutgets_blocked(lo, range, 1)) {
762 status = -EAGAIN; 833 status = -EAGAIN;
763 } else if (!nfs4_valid_open_stateid(open_state)) { 834 } else if (!nfs4_valid_open_stateid(open_state)) {
764 status = -EBADF; 835 status = -EBADF;
@@ -825,7 +896,9 @@ send_layoutget(struct pnfs_layout_hdr *lo,
825 pnfs_layout_io_set_failed(lo, range->iomode); 896 pnfs_layout_io_set_failed(lo, range->iomode);
826 } 897 }
827 return NULL; 898 return NULL;
828	}
 899	} else
900 pnfs_layout_clear_fail_bit(lo,
901 pnfs_iomode_to_fail_bit(range->iomode));
829 902
830 return lseg; 903 return lseg;
831} 904}
@@ -845,6 +918,49 @@ static void pnfs_clear_layoutcommit(struct inode *inode,
845 } 918 }
846} 919}
847 920
921void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo)
922{
923 clear_bit_unlock(NFS_LAYOUT_RETURN, &lo->plh_flags);
924 smp_mb__after_atomic();
925 wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN);
926}
927
928static int
929pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid,
930 enum pnfs_iomode iomode, bool sync)
931{
932 struct inode *ino = lo->plh_inode;
933 struct nfs4_layoutreturn *lrp;
934 int status = 0;
935
936 lrp = kzalloc(sizeof(*lrp), GFP_NOFS);
937 if (unlikely(lrp == NULL)) {
938 status = -ENOMEM;
939 spin_lock(&ino->i_lock);
940 lo->plh_block_lgets--;
941 pnfs_clear_layoutreturn_waitbit(lo);
942 rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq);
943 spin_unlock(&ino->i_lock);
944 pnfs_put_layout_hdr(lo);
945 goto out;
946 }
947
948 lrp->args.stateid = stateid;
949 lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id;
950 lrp->args.inode = ino;
951 lrp->args.range.iomode = iomode;
952 lrp->args.range.offset = 0;
953 lrp->args.range.length = NFS4_MAX_UINT64;
954 lrp->args.layout = lo;
955 lrp->clp = NFS_SERVER(ino)->nfs_client;
956 lrp->cred = lo->plh_lc_cred;
957
958 status = nfs4_proc_layoutreturn(lrp, sync);
959out:
960 dprintk("<-- %s status: %d\n", __func__, status);
961 return status;
962}
963
848/* 964/*
849 * Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr 965 * Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr
850 * when the layout segment list is empty. 966 * when the layout segment list is empty.
@@ -859,7 +975,6 @@ _pnfs_return_layout(struct inode *ino)
859 struct pnfs_layout_hdr *lo = NULL; 975 struct pnfs_layout_hdr *lo = NULL;
860 struct nfs_inode *nfsi = NFS_I(ino); 976 struct nfs_inode *nfsi = NFS_I(ino);
861 LIST_HEAD(tmp_list); 977 LIST_HEAD(tmp_list);
862 struct nfs4_layoutreturn *lrp;
863 nfs4_stateid stateid; 978 nfs4_stateid stateid;
864 int status = 0, empty; 979 int status = 0, empty;
865 980
@@ -901,24 +1016,7 @@ _pnfs_return_layout(struct inode *ino)
901 spin_unlock(&ino->i_lock); 1016 spin_unlock(&ino->i_lock);
902 pnfs_free_lseg_list(&tmp_list); 1017 pnfs_free_lseg_list(&tmp_list);
903 1018
904	lrp = kzalloc(sizeof(*lrp), GFP_KERNEL);
 1019	status = pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true);
905 if (unlikely(lrp == NULL)) {
906 status = -ENOMEM;
907 spin_lock(&ino->i_lock);
908 lo->plh_block_lgets--;
909 spin_unlock(&ino->i_lock);
910 pnfs_put_layout_hdr(lo);
911 goto out;
912 }
913
914 lrp->args.stateid = stateid;
915 lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id;
916 lrp->args.inode = ino;
917 lrp->args.layout = lo;
918 lrp->clp = NFS_SERVER(ino)->nfs_client;
919 lrp->cred = lo->plh_lc_cred;
920
921 status = nfs4_proc_layoutreturn(lrp);
922out: 1020out:
923 dprintk("<-- %s status: %d\n", __func__, status); 1021 dprintk("<-- %s status: %d\n", __func__, status);
924 return status; 1022 return status;
@@ -954,31 +1052,60 @@ pnfs_commit_and_return_layout(struct inode *inode)
954 1052
955bool pnfs_roc(struct inode *ino) 1053bool pnfs_roc(struct inode *ino)
956{ 1054{
1055 struct nfs_inode *nfsi = NFS_I(ino);
1056 struct nfs_open_context *ctx;
1057 struct nfs4_state *state;
957 struct pnfs_layout_hdr *lo; 1058 struct pnfs_layout_hdr *lo;
958 struct pnfs_layout_segment *lseg, *tmp; 1059 struct pnfs_layout_segment *lseg, *tmp;
1060 nfs4_stateid stateid;
959 LIST_HEAD(tmp_list); 1061 LIST_HEAD(tmp_list);
960	bool found = false;
 1062	bool found = false, layoutreturn = false;
961 1063
962 spin_lock(&ino->i_lock); 1064 spin_lock(&ino->i_lock);
963	lo = NFS_I(ino)->layout;
 1065	lo = nfsi->layout;
964 if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, &lo->plh_flags) || 1066 if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, &lo->plh_flags) ||
965 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) 1067 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
966		goto out_nolayout;
 1068		goto out_noroc;
1069
1070 /* Don't return layout if we hold a delegation */
1071 if (nfs4_check_delegation(ino, FMODE_READ))
1072 goto out_noroc;
1073
1074 list_for_each_entry(ctx, &nfsi->open_files, list) {
1075 state = ctx->state;
1076 /* Don't return layout if there is open file state */
1077 if (state != NULL && state->state != 0)
1078 goto out_noroc;
1079 }
1080
1081 pnfs_clear_retry_layoutget(lo);
967 list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list) 1082 list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list)
968 if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { 1083 if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
969 mark_lseg_invalid(lseg, &tmp_list); 1084 mark_lseg_invalid(lseg, &tmp_list);
970 found = true; 1085 found = true;
971 } 1086 }
972 if (!found) 1087 if (!found)
973		goto out_nolayout;
 1088		goto out_noroc;
974 lo->plh_block_lgets++; 1089 lo->plh_block_lgets++;
975 pnfs_get_layout_hdr(lo); /* matched in pnfs_roc_release */ 1090 pnfs_get_layout_hdr(lo); /* matched in pnfs_roc_release */
976 spin_unlock(&ino->i_lock); 1091 spin_unlock(&ino->i_lock);
977 pnfs_free_lseg_list(&tmp_list); 1092 pnfs_free_lseg_list(&tmp_list);
978 return true; 1093 return true;
979 1094
980out_nolayout:
 1095out_noroc:
1096 if (lo) {
1097 stateid = lo->plh_stateid;
1098 layoutreturn =
1099 test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
1100 &lo->plh_flags);
1101 if (layoutreturn) {
1102 lo->plh_block_lgets++;
1103 pnfs_get_layout_hdr(lo);
1104 }
1105 }
981 spin_unlock(&ino->i_lock); 1106 spin_unlock(&ino->i_lock);
1107 if (layoutreturn)
1108 pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true);
982 return false; 1109 return false;
983} 1110}
984 1111
@@ -1013,8 +1140,9 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task)
1013 struct nfs_inode *nfsi = NFS_I(ino); 1140 struct nfs_inode *nfsi = NFS_I(ino);
1014 struct pnfs_layout_hdr *lo; 1141 struct pnfs_layout_hdr *lo;
1015 struct pnfs_layout_segment *lseg; 1142 struct pnfs_layout_segment *lseg;
1143 nfs4_stateid stateid;
1016 u32 current_seqid; 1144 u32 current_seqid;
1017	bool found = false;
 1145	bool found = false, layoutreturn = false;
1018 1146
1019 spin_lock(&ino->i_lock); 1147 spin_lock(&ino->i_lock);
1020 list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list) 1148 list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list)
@@ -1031,7 +1159,21 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task)
1031 */ 1159 */
1032 *barrier = current_seqid + atomic_read(&lo->plh_outstanding); 1160 *barrier = current_seqid + atomic_read(&lo->plh_outstanding);
1033out: 1161out:
1162 if (!found) {
1163 stateid = lo->plh_stateid;
1164 layoutreturn =
1165 test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
1166 &lo->plh_flags);
1167 if (layoutreturn) {
1168 lo->plh_block_lgets++;
1169 pnfs_get_layout_hdr(lo);
1170 }
1171 }
1034 spin_unlock(&ino->i_lock); 1172 spin_unlock(&ino->i_lock);
1173 if (layoutreturn) {
1174 rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);
1175 pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, false);
1176 }
1035 return found; 1177 return found;
1036} 1178}
1037 1179
@@ -1178,6 +1320,7 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo,
1178 1320
1179 list_for_each_entry(lseg, &lo->plh_segs, pls_list) { 1321 list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
1180 if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) && 1322 if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
1323 !test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags) &&
1181 pnfs_lseg_range_match(&lseg->pls_range, range)) { 1324 pnfs_lseg_range_match(&lseg->pls_range, range)) {
1182 ret = pnfs_get_lseg(lseg); 1325 ret = pnfs_get_lseg(lseg);
1183 break; 1326 break;
@@ -1266,6 +1409,35 @@ static bool pnfs_within_mdsthreshold(struct nfs_open_context *ctx,
1266 return ret; 1409 return ret;
1267} 1410}
1268 1411
1412/* stop waiting if someone clears NFS_LAYOUT_RETRY_LAYOUTGET bit. */
1413static int pnfs_layoutget_retry_bit_wait(struct wait_bit_key *key)
1414{
1415 if (!test_bit(NFS_LAYOUT_RETRY_LAYOUTGET, key->flags))
1416 return 1;
1417 return nfs_wait_bit_killable(key);
1418}
1419
1420static bool pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo)
1421{
1422 /*
1423 * send layoutcommit as it can hold up layoutreturn due to lseg
1424 * reference
1425 */
1426 pnfs_layoutcommit_inode(lo->plh_inode, false);
1427 return !wait_on_bit_action(&lo->plh_flags, NFS_LAYOUT_RETURN,
1428 pnfs_layoutget_retry_bit_wait,
1429 TASK_UNINTERRUPTIBLE);
1430}
1431
1432static void pnfs_clear_first_layoutget(struct pnfs_layout_hdr *lo)
1433{
1434 unsigned long *bitlock = &lo->plh_flags;
1435
1436 clear_bit_unlock(NFS_LAYOUT_FIRST_LAYOUTGET, bitlock);
1437 smp_mb__after_atomic();
1438 wake_up_bit(bitlock, NFS_LAYOUT_FIRST_LAYOUTGET);
1439}
1440
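pnfs_clear_first_layoutget() above pairs clear_bit_unlock() with wake_up_bit() so that tasks parked in wait_on_bit() resume once the first LAYOUTGET completes. The same handshake modeled in userspace with a mutex and condition variable (an analogy only; the kernel wait-bit machinery does not take a lock like this):

/* userspace sketch, not kernel code; link with -lpthread */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t c = PTHREAD_COND_INITIALIZER;
static unsigned long flags = 1UL;	/* bit 0 plays NFS_LAYOUT_FIRST_LAYOUTGET */

static void *waiter(void *arg)
{
	pthread_mutex_lock(&m);
	while (flags & 1UL)		/* wait_on_bit(..., TASK_UNINTERRUPTIBLE) */
		pthread_cond_wait(&c, &m);
	pthread_mutex_unlock(&m);
	puts("waiter: first LAYOUTGET finished, retrying lookup");
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, waiter, NULL);
	pthread_mutex_lock(&m);
	flags &= ~1UL;			/* clear_bit_unlock() */
	pthread_cond_broadcast(&c);	/* wake_up_bit() */
	pthread_mutex_unlock(&m);
	pthread_join(t, NULL);
	return 0;
}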
1269/* 1441/*
1270 * Layout segment is retrieved from the server if not cached. 1442 * Layout segment is retrieved from the server if not cached.
1271 * The appropriate layout segment is referenced and returned to the caller. 1443 * The appropriate layout segment is referenced and returned to the caller.
@@ -1296,6 +1468,8 @@ pnfs_update_layout(struct inode *ino,
1296 if (pnfs_within_mdsthreshold(ctx, ino, iomode)) 1468 if (pnfs_within_mdsthreshold(ctx, ino, iomode))
1297 goto out; 1469 goto out;
1298 1470
1471lookup_again:
1472 first = false;
1299 spin_lock(&ino->i_lock); 1473 spin_lock(&ino->i_lock);
1300 lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags); 1474 lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
1301 if (lo == NULL) { 1475 if (lo == NULL) {
@@ -1310,27 +1484,62 @@ pnfs_update_layout(struct inode *ino,
1310 } 1484 }
1311 1485
1312 /* if LAYOUTGET already failed once we don't try again */ 1486 /* if LAYOUTGET already failed once we don't try again */
1313	if (pnfs_layout_io_test_failed(lo, iomode))
 1487	if (pnfs_layout_io_test_failed(lo, iomode) &&
1488 !pnfs_should_retry_layoutget(lo))
1314 goto out_unlock; 1489 goto out_unlock;
1315 1490
1316	/* Check to see if the layout for the given range already exists */
1317	lseg = pnfs_find_lseg(lo, &arg);
1318	if (lseg)
1319		goto out_unlock;
 1491	first = list_empty(&lo->plh_segs);
 1492	if (first) {
 1493		/* The first layoutget for the file. Need to serialize per
 1494		 * RFC 5661 Errata 3208.
1495 */
1496 if (test_and_set_bit(NFS_LAYOUT_FIRST_LAYOUTGET,
1497 &lo->plh_flags)) {
1498 spin_unlock(&ino->i_lock);
1499 wait_on_bit(&lo->plh_flags, NFS_LAYOUT_FIRST_LAYOUTGET,
1500 TASK_UNINTERRUPTIBLE);
1501 pnfs_put_layout_hdr(lo);
1502 goto lookup_again;
1503 }
1504 } else {
1505 /* Check to see if the layout for the given range
1506 * already exists
1507 */
1508 lseg = pnfs_find_lseg(lo, &arg);
1509 if (lseg)
1510 goto out_unlock;
1511 }
1512
1513 /*
1514 * Because we free lsegs before sending LAYOUTRETURN, we need to wait
1515 * for LAYOUTRETURN even if first is true.
1516 */
1517 if (!lseg && pnfs_should_retry_layoutget(lo) &&
1518 test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
1519 spin_unlock(&ino->i_lock);
1520 dprintk("%s wait for layoutreturn\n", __func__);
1521 if (pnfs_prepare_to_retry_layoutget(lo)) {
1522 if (first)
1523 pnfs_clear_first_layoutget(lo);
1524 pnfs_put_layout_hdr(lo);
1525 dprintk("%s retrying\n", __func__);
1526 goto lookup_again;
1527 }
1528 goto out_put_layout_hdr;
1529 }
1320 1530
1321	if (pnfs_layoutgets_blocked(lo, 0))
 1531	if (pnfs_layoutgets_blocked(lo, &arg, 0))
1322 goto out_unlock; 1532 goto out_unlock;
1323 atomic_inc(&lo->plh_outstanding); 1533 atomic_inc(&lo->plh_outstanding);
1324
1325 first = list_empty(&lo->plh_layouts) ? true : false;
1326 spin_unlock(&ino->i_lock); 1534 spin_unlock(&ino->i_lock);
1327 1535
1328	if (first) {
 1536	if (list_empty(&lo->plh_layouts)) {
1329 /* The lo must be on the clp list if there is any 1537 /* The lo must be on the clp list if there is any
1330 * chance of a CB_LAYOUTRECALL(FILE) coming in. 1538 * chance of a CB_LAYOUTRECALL(FILE) coming in.
1331 */ 1539 */
1332 spin_lock(&clp->cl_lock); 1540 spin_lock(&clp->cl_lock);
1333		list_add_tail(&lo->plh_layouts, &server->layouts);
 1541		if (list_empty(&lo->plh_layouts))
1542 list_add_tail(&lo->plh_layouts, &server->layouts);
1334 spin_unlock(&clp->cl_lock); 1543 spin_unlock(&clp->cl_lock);
1335 } 1544 }
1336 1545
@@ -1343,8 +1552,11 @@ pnfs_update_layout(struct inode *ino,
1343 arg.length = PAGE_CACHE_ALIGN(arg.length); 1552 arg.length = PAGE_CACHE_ALIGN(arg.length);
1344 1553
1345 lseg = send_layoutget(lo, ctx, &arg, gfp_flags); 1554 lseg = send_layoutget(lo, ctx, &arg, gfp_flags);
1555 pnfs_clear_retry_layoutget(lo);
1346 atomic_dec(&lo->plh_outstanding); 1556 atomic_dec(&lo->plh_outstanding);
1347out_put_layout_hdr: 1557out_put_layout_hdr:
1558 if (first)
1559 pnfs_clear_first_layoutget(lo);
1348 pnfs_put_layout_hdr(lo); 1560 pnfs_put_layout_hdr(lo);
1349out: 1561out:
1350 dprintk("%s: inode %s/%llu pNFS layout segment %s for " 1562 dprintk("%s: inode %s/%llu pNFS layout segment %s for "
@@ -1393,7 +1605,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
1393 goto out_forget_reply; 1605 goto out_forget_reply;
1394 } 1606 }
1395 1607
1396	if (pnfs_layoutgets_blocked(lo, 1)) {
 1608	if (pnfs_layoutgets_blocked(lo, &lgp->args.range, 1)) {
1397 dprintk("%s forget reply due to state\n", __func__); 1609 dprintk("%s forget reply due to state\n", __func__);
1398 goto out_forget_reply; 1610 goto out_forget_reply;
1399 } 1611 }
@@ -1440,24 +1652,79 @@ out_forget_reply:
1440 goto out; 1652 goto out;
1441} 1653}
1442 1654
1655static void
1656pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
1657 struct list_head *tmp_list,
1658 struct pnfs_layout_range *return_range)
1659{
1660 struct pnfs_layout_segment *lseg, *next;
1661
1662 dprintk("%s:Begin lo %p\n", __func__, lo);
1663
1664 if (list_empty(&lo->plh_segs))
1665 return;
1666
1667 list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
1668 if (should_free_lseg(&lseg->pls_range, return_range)) {
1669 dprintk("%s: marking lseg %p iomode %d "
1670 "offset %llu length %llu\n", __func__,
1671 lseg, lseg->pls_range.iomode,
1672 lseg->pls_range.offset,
1673 lseg->pls_range.length);
1674 set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
1675 mark_lseg_invalid(lseg, tmp_list);
1676 }
1677}
1678
1679void pnfs_error_mark_layout_for_return(struct inode *inode,
1680 struct pnfs_layout_segment *lseg)
1681{
1682 struct pnfs_layout_hdr *lo = NFS_I(inode)->layout;
1683 int iomode = pnfs_iomode_to_fail_bit(lseg->pls_range.iomode);
1684 struct pnfs_layout_range range = {
1685 .iomode = lseg->pls_range.iomode,
1686 .offset = 0,
1687 .length = NFS4_MAX_UINT64,
1688 };
1689 LIST_HEAD(free_me);
1690
1691 spin_lock(&inode->i_lock);
1692 /* set failure bit so that pnfs path will be retried later */
1693 pnfs_layout_set_fail_bit(lo, iomode);
1694 set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags);
1695 if (lo->plh_return_iomode == 0)
1696 lo->plh_return_iomode = range.iomode;
1697 else if (lo->plh_return_iomode != range.iomode)
1698 lo->plh_return_iomode = IOMODE_ANY;
1699 /*
1700 * mark all matching lsegs so that we are sure to have no live
1701 * segments at hand when sending layoutreturn. See pnfs_put_lseg()
1702 * for how it works.
1703 */
1704 pnfs_mark_matching_lsegs_return(lo, &free_me, &range);
1705 spin_unlock(&inode->i_lock);
1706 pnfs_free_lseg_list(&free_me);
1707}
1708EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return);
1709
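pnfs_error_mark_layout_for_return() above widens plh_return_iomode as errors accumulate: the first failing segment records its iomode, and a later failure with a different iomode escalates to IOMODE_ANY. A sketch of that merging rule (the non-zero enum values mirror enum pnfs_iomode; IOMODE_NONE here stands in for the kernel's bare 0):

/* userspace sketch, not kernel code */
#include <stdio.h>

enum iomode { IOMODE_NONE = 0, IOMODE_READ = 1, IOMODE_RW = 2, IOMODE_ANY = 3 };

static enum iomode merge_return_iomode(enum iomode cur, enum iomode failed)
{
	if (cur == IOMODE_NONE)
		return failed;		/* first error: remember its mode */
	if (cur != failed)
		return IOMODE_ANY;	/* mixed read/write errors: return all */
	return cur;
}

int main(void)
{
	enum iomode m = IOMODE_NONE;

	m = merge_return_iomode(m, IOMODE_READ);	/* -> IOMODE_READ */
	m = merge_return_iomode(m, IOMODE_RW);		/* -> IOMODE_ANY */
	printf("plh_return_iomode = %d\n", m);
	return 0;
}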
1443void 1710void
1444pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) 1711pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
1445{ 1712{
1446 u64 rd_size = req->wb_bytes; 1713 u64 rd_size = req->wb_bytes;
1447 1714
1448	WARN_ON_ONCE(pgio->pg_lseg != NULL);
1449
1450	if (pgio->pg_dreq == NULL)
1451		rd_size = i_size_read(pgio->pg_inode) - req_offset(req);
1452	else
1453		rd_size = nfs_dreq_bytes_left(pgio->pg_dreq);
1454
1455	pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
1456					   req->wb_context,
1457					   req_offset(req),
1458					   rd_size,
1459					   IOMODE_READ,
1460					   GFP_KERNEL);
 1715	if (pgio->pg_lseg == NULL) {
 1716		if (pgio->pg_dreq == NULL)
 1717			rd_size = i_size_read(pgio->pg_inode) - req_offset(req);
 1718		else
 1719			rd_size = nfs_dreq_bytes_left(pgio->pg_dreq);
 1720
 1721		pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
 1722						   req->wb_context,
 1723						   req_offset(req),
 1724						   rd_size,
 1725						   IOMODE_READ,
 1726						   GFP_KERNEL);
 1727	}
1461 /* If no lseg, fall back to read through mds */ 1728 /* If no lseg, fall back to read through mds */
1462 if (pgio->pg_lseg == NULL) 1729 if (pgio->pg_lseg == NULL)
1463 nfs_pageio_reset_read_mds(pgio); 1730 nfs_pageio_reset_read_mds(pgio);
@@ -1469,27 +1736,36 @@ void
1469pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, 1736pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
1470 struct nfs_page *req, u64 wb_size) 1737 struct nfs_page *req, u64 wb_size)
1471{ 1738{
1472 WARN_ON_ONCE(pgio->pg_lseg != NULL); 1739 if (pgio->pg_lseg == NULL)
1473 1740 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
1474 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, 1741 req->wb_context,
1475 req->wb_context, 1742 req_offset(req),
1476 req_offset(req), 1743 wb_size,
1477 wb_size, 1744 IOMODE_RW,
1478 IOMODE_RW, 1745 GFP_NOFS);
1479 GFP_NOFS);
1480 /* If no lseg, fall back to write through mds */ 1746 /* If no lseg, fall back to write through mds */
1481 if (pgio->pg_lseg == NULL) 1747 if (pgio->pg_lseg == NULL)
1482 nfs_pageio_reset_write_mds(pgio); 1748 nfs_pageio_reset_write_mds(pgio);
1483} 1749}
1484EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write); 1750EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write);
1485 1751
1752void
1753pnfs_generic_pg_cleanup(struct nfs_pageio_descriptor *desc)
1754{
1755 if (desc->pg_lseg) {
1756 pnfs_put_lseg(desc->pg_lseg);
1757 desc->pg_lseg = NULL;
1758 }
1759}
1760EXPORT_SYMBOL_GPL(pnfs_generic_pg_cleanup);
1761
1486/* 1762/*
1487 * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number 1763 * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
1488 * of bytes (maximum @req->wb_bytes) that can be coalesced. 1764 * of bytes (maximum @req->wb_bytes) that can be coalesced.
1489 */ 1765 */
1490size_t 1766size_t
1491pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
1492		     struct nfs_page *req)
 1767pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio,
 1768		     struct nfs_page *prev, struct nfs_page *req)
1493{ 1769{
1494 unsigned int size; 1770 unsigned int size;
1495 u64 seg_end, req_start, seg_left; 1771 u64 seg_end, req_start, seg_left;
@@ -1513,10 +1789,16 @@ pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
1513 seg_end = end_offset(pgio->pg_lseg->pls_range.offset, 1789 seg_end = end_offset(pgio->pg_lseg->pls_range.offset,
1514 pgio->pg_lseg->pls_range.length); 1790 pgio->pg_lseg->pls_range.length);
1515 req_start = req_offset(req); 1791 req_start = req_offset(req);
1516	WARN_ON_ONCE(req_start > seg_end);
 1792	WARN_ON_ONCE(req_start >= seg_end);
1517 /* start of request is past the last byte of this segment */ 1793 /* start of request is past the last byte of this segment */
1518	if (req_start >= seg_end)
 1794	if (req_start >= seg_end) {
1795 /* reference the new lseg */
1796 if (pgio->pg_ops->pg_cleanup)
1797 pgio->pg_ops->pg_cleanup(pgio);
1798 if (pgio->pg_ops->pg_init)
1799 pgio->pg_ops->pg_init(pgio, req);
1519 return 0; 1800 return 0;
1801 }
1520 1802
1521 /* adjust 'size' iff there are fewer bytes left in the 1803 /* adjust 'size' iff there are fewer bytes left in the
1522 * segment than what nfs_generic_pg_test returned */ 1804 * segment than what nfs_generic_pg_test returned */
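The boundary check above now returns 0 and re-runs pg_init when the request starts at or past the segment end, so the descriptor picks up a fresh lseg; otherwise the coalescable size is clamped to the bytes left in the segment. A worked example with made-up numbers:

/* userspace sketch, not kernel code */
#include <stdio.h>

typedef unsigned long long u64;

int main(void)
{
	u64 seg_offset = 0, seg_length = 1048576;	/* 1 MiB segment */
	u64 seg_end = seg_offset + seg_length;		/* end_offset() */
	u64 req_start = 1040384;			/* 8 KiB before the end */
	unsigned int size = 16384;			/* generic test allowed 16 KiB */

	if (req_start >= seg_end) {
		puts("past the segment: pg_cleanup + pg_init for a new lseg");
		return 0;
	}
	if (seg_end - req_start < size)
		size = (unsigned int)(seg_end - req_start);
	printf("coalesce up to %u bytes\n", size);	/* prints 8192 */
	return 0;
}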
@@ -1571,10 +1853,12 @@ static void
1571pnfs_write_through_mds(struct nfs_pageio_descriptor *desc, 1853pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,
1572 struct nfs_pgio_header *hdr) 1854 struct nfs_pgio_header *hdr)
1573{ 1855{
1856 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
1857
1574 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { 1858 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
1575		list_splice_tail_init(&hdr->pages, &desc->pg_list);
 1859		list_splice_tail_init(&hdr->pages, &mirror->pg_list);
1576 nfs_pageio_reset_write_mds(desc); 1860 nfs_pageio_reset_write_mds(desc);
1577		desc->pg_recoalesce = 1;
 1861		mirror->pg_recoalesce = 1;
1578 } 1862 }
1579 nfs_pgio_data_destroy(hdr); 1863 nfs_pgio_data_destroy(hdr);
1580} 1864}
@@ -1608,11 +1892,9 @@ pnfs_do_write(struct nfs_pageio_descriptor *desc,
1608 struct pnfs_layout_segment *lseg = desc->pg_lseg; 1892 struct pnfs_layout_segment *lseg = desc->pg_lseg;
1609 enum pnfs_try_status trypnfs; 1893 enum pnfs_try_status trypnfs;
1610 1894
1611 desc->pg_lseg = NULL;
1612 trypnfs = pnfs_try_to_write_data(hdr, call_ops, lseg, how); 1895 trypnfs = pnfs_try_to_write_data(hdr, call_ops, lseg, how);
1613 if (trypnfs == PNFS_NOT_ATTEMPTED) 1896 if (trypnfs == PNFS_NOT_ATTEMPTED)
1614 pnfs_write_through_mds(desc, hdr); 1897 pnfs_write_through_mds(desc, hdr);
1615 pnfs_put_lseg(lseg);
1616} 1898}
1617 1899
1618static void pnfs_writehdr_free(struct nfs_pgio_header *hdr) 1900static void pnfs_writehdr_free(struct nfs_pgio_header *hdr)
@@ -1625,24 +1907,23 @@ EXPORT_SYMBOL_GPL(pnfs_writehdr_free);
1625int 1907int
1626pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) 1908pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
1627{ 1909{
1910 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
1911
1628 struct nfs_pgio_header *hdr; 1912 struct nfs_pgio_header *hdr;
1629 int ret; 1913 int ret;
1630 1914
1631 hdr = nfs_pgio_header_alloc(desc->pg_rw_ops); 1915 hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
1632 if (!hdr) { 1916 if (!hdr) {
1633		desc->pg_completion_ops->error_cleanup(&desc->pg_list);
1634		pnfs_put_lseg(desc->pg_lseg);
1635		desc->pg_lseg = NULL;
 1917		desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
1636 return -ENOMEM; 1918 return -ENOMEM;
1637 } 1919 }
1638 nfs_pgheader_init(desc, hdr, pnfs_writehdr_free); 1920 nfs_pgheader_init(desc, hdr, pnfs_writehdr_free);
1921
1639 hdr->lseg = pnfs_get_lseg(desc->pg_lseg); 1922 hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
1640 ret = nfs_generic_pgio(desc, hdr); 1923 ret = nfs_generic_pgio(desc, hdr);
1641	if (ret != 0) {
1642		pnfs_put_lseg(desc->pg_lseg);
1643		desc->pg_lseg = NULL;
1644	} else
 1924	if (!ret)
1645 pnfs_do_write(desc, hdr, desc->pg_ioflags); 1925 pnfs_do_write(desc, hdr, desc->pg_ioflags);
1926
1646 return ret; 1927 return ret;
1647} 1928}
1648EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages); 1929EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);
@@ -1687,10 +1968,12 @@ static void
1687pnfs_read_through_mds(struct nfs_pageio_descriptor *desc, 1968pnfs_read_through_mds(struct nfs_pageio_descriptor *desc,
1688 struct nfs_pgio_header *hdr) 1969 struct nfs_pgio_header *hdr)
1689{ 1970{
1971 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
1972
1690 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { 1973 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
1691		list_splice_tail_init(&hdr->pages, &desc->pg_list);
 1974		list_splice_tail_init(&hdr->pages, &mirror->pg_list);
1692 nfs_pageio_reset_read_mds(desc); 1975 nfs_pageio_reset_read_mds(desc);
1693		desc->pg_recoalesce = 1;
 1976		mirror->pg_recoalesce = 1;
1694 } 1977 }
1695 nfs_pgio_data_destroy(hdr); 1978 nfs_pgio_data_destroy(hdr);
1696} 1979}
@@ -1719,18 +2002,29 @@ pnfs_try_to_read_data(struct nfs_pgio_header *hdr,
1719 return trypnfs; 2002 return trypnfs;
1720} 2003}
1721 2004
2005/* Resend all requests through pnfs. */
2006int pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr)
2007{
2008 struct nfs_pageio_descriptor pgio;
2009
2010 nfs_pageio_init_read(&pgio, hdr->inode, false, hdr->completion_ops);
2011 return nfs_pageio_resend(&pgio, hdr);
2012}
2013EXPORT_SYMBOL_GPL(pnfs_read_resend_pnfs);
2014
1722static void 2015static void
1723pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr) 2016pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr)
1724{ 2017{
1725 const struct rpc_call_ops *call_ops = desc->pg_rpc_callops; 2018 const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
1726 struct pnfs_layout_segment *lseg = desc->pg_lseg; 2019 struct pnfs_layout_segment *lseg = desc->pg_lseg;
1727 enum pnfs_try_status trypnfs; 2020 enum pnfs_try_status trypnfs;
2021 int err = 0;
1728 2022
1729 desc->pg_lseg = NULL;
1730 trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg); 2023 trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg);
1731	if (trypnfs == PNFS_NOT_ATTEMPTED)
 2024	if (trypnfs == PNFS_TRY_AGAIN)
2025 err = pnfs_read_resend_pnfs(hdr);
2026 if (trypnfs == PNFS_NOT_ATTEMPTED || err)
1732 pnfs_read_through_mds(desc, hdr); 2027 pnfs_read_through_mds(desc, hdr);
1733 pnfs_put_lseg(lseg);
1734} 2028}
1735 2029
1736static void pnfs_readhdr_free(struct nfs_pgio_header *hdr) 2030static void pnfs_readhdr_free(struct nfs_pgio_header *hdr)
@@ -1743,24 +2037,20 @@ EXPORT_SYMBOL_GPL(pnfs_readhdr_free);
1743int 2037int
1744pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) 2038pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
1745{ 2039{
2040 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
2041
1746 struct nfs_pgio_header *hdr; 2042 struct nfs_pgio_header *hdr;
1747 int ret; 2043 int ret;
1748 2044
1749 hdr = nfs_pgio_header_alloc(desc->pg_rw_ops); 2045 hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
1750 if (!hdr) { 2046 if (!hdr) {
1751		desc->pg_completion_ops->error_cleanup(&desc->pg_list);
1752		ret = -ENOMEM;
1753		pnfs_put_lseg(desc->pg_lseg);
1754		desc->pg_lseg = NULL;
1755		return ret;
 2047		desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
 2048		return -ENOMEM;
1756 } 2049 }
1757 nfs_pgheader_init(desc, hdr, pnfs_readhdr_free); 2050 nfs_pgheader_init(desc, hdr, pnfs_readhdr_free);
1758 hdr->lseg = pnfs_get_lseg(desc->pg_lseg); 2051 hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
1759 ret = nfs_generic_pgio(desc, hdr); 2052 ret = nfs_generic_pgio(desc, hdr);
1760	if (ret != 0) {
1761		pnfs_put_lseg(desc->pg_lseg);
1762		desc->pg_lseg = NULL;
1763	} else
 2053	if (!ret)
1764 pnfs_do_read(desc, hdr); 2054 pnfs_do_read(desc, hdr);
1765 return ret; 2055 return ret;
1766} 2056}
@@ -1966,6 +2256,7 @@ clear_layoutcommitting:
1966 pnfs_clear_layoutcommitting(inode); 2256 pnfs_clear_layoutcommitting(inode);
1967 goto out; 2257 goto out;
1968} 2258}
2259EXPORT_SYMBOL_GPL(pnfs_layoutcommit_inode);
1969 2260
1970struct nfs4_threshold *pnfs_mdsthreshold_alloc(void) 2261struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
1971{ 2262{
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 9ae5b765b073..797cd6253adf 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -38,6 +38,25 @@ enum {
38 NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */ 38 NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */
39 NFS_LSEG_ROC, /* roc bit received from server */ 39 NFS_LSEG_ROC, /* roc bit received from server */
40 NFS_LSEG_LAYOUTCOMMIT, /* layoutcommit bit set for layoutcommit */ 40 NFS_LSEG_LAYOUTCOMMIT, /* layoutcommit bit set for layoutcommit */
41 NFS_LSEG_LAYOUTRETURN, /* layoutreturn bit set for layoutreturn */
42};
43
44/* Individual ip address */
45struct nfs4_pnfs_ds_addr {
46 struct sockaddr_storage da_addr;
47 size_t da_addrlen;
48 struct list_head da_node; /* nfs4_pnfs_dev_hlist dev_dslist */
49 char *da_remotestr; /* human readable addr+port */
50};
51
52struct nfs4_pnfs_ds {
53 struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */
54 char *ds_remotestr; /* comma sep list of addrs */
55 struct list_head ds_addrs;
56 struct nfs_client *ds_clp;
57 atomic_t ds_count;
58 unsigned long ds_state;
59#define NFS4DS_CONNECTING 0 /* ds is establishing connection */
41}; 60};
42 61
43struct pnfs_layout_segment { 62struct pnfs_layout_segment {
@@ -53,19 +72,34 @@ struct pnfs_layout_segment {
53enum pnfs_try_status { 72enum pnfs_try_status {
54 PNFS_ATTEMPTED = 0, 73 PNFS_ATTEMPTED = 0,
55 PNFS_NOT_ATTEMPTED = 1, 74 PNFS_NOT_ATTEMPTED = 1,
75 PNFS_TRY_AGAIN = 2,
56}; 76};
57 77
58#ifdef CONFIG_NFS_V4_1 78#ifdef CONFIG_NFS_V4_1
59 79
60#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4" 80#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4"
61 81
82/*
 83 * Default data server connection timeout and retrans values.
84 * Set by module parameters dataserver_timeo and dataserver_retrans.
85 */
86#define NFS4_DEF_DS_TIMEO 600 /* in tenths of a second */
87#define NFS4_DEF_DS_RETRANS 5
88
89/* error codes for internal use */
90#define NFS4ERR_RESET_TO_MDS 12001
91#define NFS4ERR_RESET_TO_PNFS 12002
92
62enum { 93enum {
63 NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */ 94 NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */
64 NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */ 95 NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */
65 NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */ 96 NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */
66 NFS_LAYOUT_ROC, /* some lseg had roc bit set */ 97 NFS_LAYOUT_ROC, /* some lseg had roc bit set */
67 NFS_LAYOUT_RETURN, /* Return this layout ASAP */ 98 NFS_LAYOUT_RETURN, /* Return this layout ASAP */
99 NFS_LAYOUT_RETURN_BEFORE_CLOSE, /* Return this layout before close */
68 NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */ 100 NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */
101 NFS_LAYOUT_FIRST_LAYOUTGET, /* Serialize first layoutget */
102 NFS_LAYOUT_RETRY_LAYOUTGET, /* Retry layoutget */
69}; 103};
70 104
71enum layoutdriver_policy_flags { 105enum layoutdriver_policy_flags {
@@ -106,7 +140,8 @@ struct pnfs_layoutdriver_type {
106 struct pnfs_ds_commit_info *(*get_ds_info) (struct inode *inode); 140 struct pnfs_ds_commit_info *(*get_ds_info) (struct inode *inode);
107 void (*mark_request_commit) (struct nfs_page *req, 141 void (*mark_request_commit) (struct nfs_page *req,
108 struct pnfs_layout_segment *lseg, 142 struct pnfs_layout_segment *lseg,
109				     struct nfs_commit_info *cinfo);
 143				     struct nfs_commit_info *cinfo,
144 u32 ds_commit_idx);
110 void (*clear_request_commit) (struct nfs_page *req, 145 void (*clear_request_commit) (struct nfs_page *req,
111 struct nfs_commit_info *cinfo); 146 struct nfs_commit_info *cinfo);
112 int (*scan_commit_lists) (struct nfs_commit_info *cinfo, 147 int (*scan_commit_lists) (struct nfs_commit_info *cinfo,
@@ -154,6 +189,7 @@ struct pnfs_layout_hdr {
154 u32 plh_barrier; /* ignore lower seqids */ 189 u32 plh_barrier; /* ignore lower seqids */
155 unsigned long plh_retry_timestamp; 190 unsigned long plh_retry_timestamp;
156 unsigned long plh_flags; 191 unsigned long plh_flags;
192 enum pnfs_iomode plh_return_iomode;
157 loff_t plh_lwb; /* last write byte for layoutcommit */ 193 loff_t plh_lwb; /* last write byte for layoutcommit */
158 struct rpc_cred *plh_lc_cred; /* layoutcommit cred */ 194 struct rpc_cred *plh_lc_cred; /* layoutcommit cred */
159 struct inode *plh_inode; 195 struct inode *plh_inode;
@@ -185,7 +221,7 @@ extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
185 struct pnfs_device *dev, 221 struct pnfs_device *dev,
186 struct rpc_cred *cred); 222 struct rpc_cred *cred);
187extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags); 223extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags);
188extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp);
 224extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync);
189 225
190/* pnfs.c */ 226/* pnfs.c */
191void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo); 227void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo);
@@ -198,6 +234,7 @@ void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *
198int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc); 234int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc);
199void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, 235void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
200 struct nfs_page *req, u64 wb_size); 236 struct nfs_page *req, u64 wb_size);
237void pnfs_generic_pg_cleanup(struct nfs_pageio_descriptor *);
201int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc); 238int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc);
202size_t pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, 239size_t pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio,
203 struct nfs_page *prev, struct nfs_page *req); 240 struct nfs_page *prev, struct nfs_page *req);
@@ -217,6 +254,7 @@ void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
217 bool update_barrier); 254 bool update_barrier);
218int pnfs_choose_layoutget_stateid(nfs4_stateid *dst, 255int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
219 struct pnfs_layout_hdr *lo, 256 struct pnfs_layout_hdr *lo,
257 struct pnfs_layout_range *range,
220 struct nfs4_state *open_state); 258 struct nfs4_state *open_state);
221int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, 259int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
222 struct list_head *tmp_list, 260 struct list_head *tmp_list,
@@ -233,17 +271,21 @@ int _pnfs_return_layout(struct inode *);
233int pnfs_commit_and_return_layout(struct inode *); 271int pnfs_commit_and_return_layout(struct inode *);
234void pnfs_ld_write_done(struct nfs_pgio_header *); 272void pnfs_ld_write_done(struct nfs_pgio_header *);
235void pnfs_ld_read_done(struct nfs_pgio_header *); 273void pnfs_ld_read_done(struct nfs_pgio_header *);
274int pnfs_read_resend_pnfs(struct nfs_pgio_header *);
236struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, 275struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
237 struct nfs_open_context *ctx, 276 struct nfs_open_context *ctx,
238 loff_t pos, 277 loff_t pos,
239 u64 count, 278 u64 count,
240 enum pnfs_iomode iomode, 279 enum pnfs_iomode iomode,
241 gfp_t gfp_flags); 280 gfp_t gfp_flags);
281void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo);
242 282
243void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp); 283void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp);
244int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *); 284int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *);
245int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *); 285int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *);
246struct nfs4_threshold *pnfs_mdsthreshold_alloc(void); 286struct nfs4_threshold *pnfs_mdsthreshold_alloc(void);
287void pnfs_error_mark_layout_for_return(struct inode *inode,
288 struct pnfs_layout_segment *lseg);
247 289
248/* nfs4_deviceid_flags */ 290/* nfs4_deviceid_flags */
249enum { 291enum {
@@ -275,6 +317,39 @@ void nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node);
275bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node); 317bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node);
276void nfs4_deviceid_purge_client(const struct nfs_client *); 318void nfs4_deviceid_purge_client(const struct nfs_client *);
277 319
320/* pnfs_nfs.c */
321void pnfs_generic_clear_request_commit(struct nfs_page *req,
322 struct nfs_commit_info *cinfo);
323void pnfs_generic_commit_release(void *calldata);
324void pnfs_generic_prepare_to_resend_writes(struct nfs_commit_data *data);
325void pnfs_generic_rw_release(void *data);
326void pnfs_generic_recover_commit_reqs(struct list_head *dst,
327 struct nfs_commit_info *cinfo);
328int pnfs_generic_commit_pagelist(struct inode *inode,
329 struct list_head *mds_pages,
330 int how,
331 struct nfs_commit_info *cinfo,
332 int (*initiate_commit)(struct nfs_commit_data *data,
333 int how));
334int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo, int max);
335void pnfs_generic_write_commit_done(struct rpc_task *task, void *data);
336void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds);
337struct nfs4_pnfs_ds *nfs4_pnfs_ds_add(struct list_head *dsaddrs,
338 gfp_t gfp_flags);
339void nfs4_pnfs_v3_ds_connect_unload(void);
340void nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds,
341 struct nfs4_deviceid_node *devid, unsigned int timeo,
342 unsigned int retrans, u32 version, u32 minor_version,
343 rpc_authflavor_t au_flavor);
344struct nfs4_pnfs_ds_addr *nfs4_decode_mp_ds_addr(struct net *net,
345 struct xdr_stream *xdr,
346 gfp_t gfp_flags);
347
348static inline bool nfs_have_layout(struct inode *inode)
349{
350 return NFS_I(inode)->layout != NULL;
351}
352
278static inline struct nfs4_deviceid_node * 353static inline struct nfs4_deviceid_node *
279nfs4_get_deviceid(struct nfs4_deviceid_node *d) 354nfs4_get_deviceid(struct nfs4_deviceid_node *d)
280{ 355{
@@ -282,6 +357,26 @@ nfs4_get_deviceid(struct nfs4_deviceid_node *d)
282 return d; 357 return d;
283} 358}
284 359
360static inline void pnfs_set_retry_layoutget(struct pnfs_layout_hdr *lo)
361{
362 if (!test_and_set_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags))
363 atomic_inc(&lo->plh_refcount);
364}
365
366static inline void pnfs_clear_retry_layoutget(struct pnfs_layout_hdr *lo)
367{
368 if (test_and_clear_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags)) {
369 atomic_dec(&lo->plh_refcount);
370 /* wake up waiters for LAYOUTRETURN as that is not needed */
371 wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN);
372 }
373}
374
375static inline bool pnfs_should_retry_layoutget(struct pnfs_layout_hdr *lo)
376{
377 return test_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags);
378}
379
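The pnfs_set_retry_layoutget()/pnfs_clear_retry_layoutget() helpers above pair a flag bit with a header reference: only the transition into the flagged state takes the reference, and only the transition out drops it, so repeated calls stay idempotent. A userspace sketch with C11 atomics (the bit constant and names are illustrative):

/* userspace sketch, not kernel code */
#include <stdatomic.h>
#include <stdio.h>

#define RETRY_BIT 1u

static atomic_uint flags = 0;
static atomic_int refcount = 1;		/* header's base reference */

static void set_retry(void)
{
	/* test_and_set_bit(): fetch_or reports the previous value */
	if (!(atomic_fetch_or(&flags, RETRY_BIT) & RETRY_BIT))
		atomic_fetch_add(&refcount, 1);		/* pin the header */
}

static void clear_retry(void)
{
	/* test_and_clear_bit() */
	if (atomic_fetch_and(&flags, ~RETRY_BIT) & RETRY_BIT)
		atomic_fetch_sub(&refcount, 1);		/* drop the pin */
}

int main(void)
{
	set_retry();
	set_retry();	/* idempotent: only the first call takes a ref */
	printf("refcount = %d\n", atomic_load(&refcount));	/* 2 */
	clear_retry();
	printf("refcount = %d\n", atomic_load(&refcount));	/* 1 */
	return 0;
}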
285static inline struct pnfs_layout_segment * 380static inline struct pnfs_layout_segment *
286pnfs_get_lseg(struct pnfs_layout_segment *lseg) 381pnfs_get_lseg(struct pnfs_layout_segment *lseg)
287{ 382{
@@ -317,16 +412,22 @@ pnfs_get_ds_info(struct inode *inode)
317 return ld->get_ds_info(inode); 412 return ld->get_ds_info(inode);
318} 413}
319 414
415static inline void
416pnfs_generic_mark_devid_invalid(struct nfs4_deviceid_node *node)
417{
418 set_bit(NFS_DEVICEID_INVALID, &node->flags);
419}
420
320static inline bool 421static inline bool
321pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, 422pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
322			 struct nfs_commit_info *cinfo)
 423			 struct nfs_commit_info *cinfo, u32 ds_commit_idx)
323{ 424{
324 struct inode *inode = req->wb_context->dentry->d_inode; 425 struct inode *inode = req->wb_context->dentry->d_inode;
325 struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; 426 struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
326 427
327 if (lseg == NULL || ld->mark_request_commit == NULL) 428 if (lseg == NULL || ld->mark_request_commit == NULL)
328 return false; 429 return false;
329	ld->mark_request_commit(req, lseg, cinfo);
 430	ld->mark_request_commit(req, lseg, cinfo, ds_commit_idx);
330 return true; 431 return true;
331} 432}
332 433
@@ -352,15 +453,6 @@ pnfs_scan_commit_lists(struct inode *inode, struct nfs_commit_info *cinfo,
352 return NFS_SERVER(inode)->pnfs_curr_ld->scan_commit_lists(cinfo, max); 453 return NFS_SERVER(inode)->pnfs_curr_ld->scan_commit_lists(cinfo, max);
353} 454}
354 455
355static inline void
356pnfs_recover_commit_reqs(struct inode *inode, struct list_head *list,
357 struct nfs_commit_info *cinfo)
358{
359 if (cinfo->ds == NULL || cinfo->ds->nwritten == 0)
360 return;
361 NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo);
362}
363
364static inline struct nfs_page * 456static inline struct nfs_page *
365pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo, 457pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo,
366 struct page *page) 458 struct page *page)
@@ -427,6 +519,11 @@ static inline void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id)
427#endif /* NFS_DEBUG */ 519#endif /* NFS_DEBUG */
428#else /* CONFIG_NFS_V4_1 */ 520#else /* CONFIG_NFS_V4_1 */
429 521
522static inline bool nfs_have_layout(struct inode *inode)
523{
524 return false;
525}
526
430static inline void pnfs_destroy_all_layouts(struct nfs_client *clp) 527static inline void pnfs_destroy_all_layouts(struct nfs_client *clp)
431{ 528{
432} 529}
@@ -513,7 +610,7 @@ pnfs_get_ds_info(struct inode *inode)
513 610
514static inline bool 611static inline bool
515pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, 612pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
516			 struct nfs_commit_info *cinfo)
 613			 struct nfs_commit_info *cinfo, u32 ds_commit_idx)
517{ 614{
518 return false; 615 return false;
519} 616}
@@ -531,12 +628,6 @@ pnfs_scan_commit_lists(struct inode *inode, struct nfs_commit_info *cinfo,
531 return 0; 628 return 0;
532} 629}
533 630
534static inline void
535pnfs_recover_commit_reqs(struct inode *inode, struct list_head *list,
536 struct nfs_commit_info *cinfo)
537{
538}
539
540static inline struct nfs_page * 631static inline struct nfs_page *
541pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo, 632pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo,
542 struct page *page) 633 struct page *page)
@@ -568,6 +659,10 @@ static inline struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
568 return NULL; 659 return NULL;
569} 660}
570 661
662static inline void nfs4_pnfs_v3_ds_connect_unload(void)
663{
664}
665
571#endif /* CONFIG_NFS_V4_1 */ 666#endif /* CONFIG_NFS_V4_1 */
572 667
573#endif /* FS_NFS_PNFS_H */ 668#endif /* FS_NFS_PNFS_H */
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
new file mode 100644
index 000000000000..fdc4f6562bb7
--- /dev/null
+++ b/fs/nfs/pnfs_nfs.c
@@ -0,0 +1,840 @@
1/*
2 * Common NFS I/O operations for the pnfs file based
3 * layout drivers.
4 *
5 * Copyright (c) 2014, Primary Data, Inc. All rights reserved.
6 *
7 * Tom Haynes <loghyr@primarydata.com>
8 */
9
10#include <linux/nfs_fs.h>
11#include <linux/nfs_page.h>
12#include <linux/sunrpc/addr.h>
13#include <linux/module.h>
14
15#include "nfs4session.h"
16#include "internal.h"
17#include "pnfs.h"
18
19#define NFSDBG_FACILITY NFSDBG_PNFS
20
21void pnfs_generic_rw_release(void *data)
22{
23 struct nfs_pgio_header *hdr = data;
24
25 nfs_put_client(hdr->ds_clp);
26 hdr->mds_ops->rpc_release(data);
27}
28EXPORT_SYMBOL_GPL(pnfs_generic_rw_release);
29
30/* Fake up some data that will cause nfs_commit_release to retry the writes. */
31void pnfs_generic_prepare_to_resend_writes(struct nfs_commit_data *data)
32{
33 struct nfs_page *first = nfs_list_entry(data->pages.next);
34
35 data->task.tk_status = 0;
36 memcpy(&data->verf.verifier, &first->wb_verf,
37 sizeof(data->verf.verifier));
38 data->verf.verifier.data[0]++; /* ensure verifier mismatch */
39}
40EXPORT_SYMBOL_GPL(pnfs_generic_prepare_to_resend_writes);
41
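pnfs_generic_prepare_to_resend_writes() above copies the first request's write verifier and then increments one byte of it, guaranteeing the commit-side comparison fails and the writes are re-sent. The trick in isolation (struct layout and contents are illustrative):

/* userspace sketch, not kernel code */
#include <stdio.h>
#include <string.h>

struct verf { unsigned char data[8]; };

int main(void)
{
	struct verf written = { .data = "12345678" };	/* plays wb_verf */
	struct verf commit;

	memcpy(&commit, &written, sizeof(commit));
	commit.data[0]++;		/* ensure verifier mismatch */

	if (memcmp(&commit, &written, sizeof(commit)) != 0)
		puts("verifier mismatch: commit path re-sends the writes");
	return 0;
}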
42void pnfs_generic_write_commit_done(struct rpc_task *task, void *data)
43{
44 struct nfs_commit_data *wdata = data;
45
46 /* Note this may cause RPC to be resent */
47 wdata->mds_ops->rpc_call_done(task, data);
48}
49EXPORT_SYMBOL_GPL(pnfs_generic_write_commit_done);
50
51void pnfs_generic_commit_release(void *calldata)
52{
53 struct nfs_commit_data *data = calldata;
54
55 data->completion_ops->completion(data);
56 pnfs_put_lseg(data->lseg);
57 nfs_put_client(data->ds_clp);
58 nfs_commitdata_release(data);
59}
60EXPORT_SYMBOL_GPL(pnfs_generic_commit_release);
61
62/* The generic layer is about to remove the req from the commit list.
63 * If this will make the bucket empty, it will need to put the lseg reference.
64 * Note this must be called holding the inode (/cinfo) lock
65 */
66void
67pnfs_generic_clear_request_commit(struct nfs_page *req,
68 struct nfs_commit_info *cinfo)
69{
70 struct pnfs_layout_segment *freeme = NULL;
71
72 if (!test_and_clear_bit(PG_COMMIT_TO_DS, &req->wb_flags))
73 goto out;
74 cinfo->ds->nwritten--;
75 if (list_is_singular(&req->wb_list)) {
76 struct pnfs_commit_bucket *bucket;
77
78 bucket = list_first_entry(&req->wb_list,
79 struct pnfs_commit_bucket,
80 written);
81 freeme = bucket->wlseg;
82 bucket->wlseg = NULL;
83 }
84out:
85 nfs_request_remove_commit_list(req, cinfo);
86 pnfs_put_lseg_locked(freeme);
87}
88EXPORT_SYMBOL_GPL(pnfs_generic_clear_request_commit);
89
90static int
91pnfs_generic_transfer_commit_list(struct list_head *src, struct list_head *dst,
92 struct nfs_commit_info *cinfo, int max)
93{
94 struct nfs_page *req, *tmp;
95 int ret = 0;
96
97 list_for_each_entry_safe(req, tmp, src, wb_list) {
98 if (!nfs_lock_request(req))
99 continue;
100 kref_get(&req->wb_kref);
101 if (cond_resched_lock(cinfo->lock))
102 list_safe_reset_next(req, tmp, wb_list);
103 nfs_request_remove_commit_list(req, cinfo);
104 clear_bit(PG_COMMIT_TO_DS, &req->wb_flags);
105 nfs_list_add_request(req, dst);
106 ret++;
107 if ((ret == max) && !cinfo->dreq)
108 break;
109 }
110 return ret;
111}
112
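pnfs_generic_transfer_commit_list() above walks the source list, skips requests it cannot lock, and moves up to max lockable requests onto the destination, returning the count. A userspace model with a singly linked list (the try-lock is reduced to a flag; not the kernel's list API):

/* userspace sketch, not kernel code */
#include <stdio.h>

struct req { int id; int locked; struct req *next; };

static int transfer(struct req **src, struct req **dst, int max)
{
	int moved = 0;
	struct req **pp = src;

	while (*pp && moved < max) {
		struct req *r = *pp;

		if (r->locked) {	/* nfs_lock_request() failed: skip */
			pp = &r->next;
			continue;
		}
		*pp = r->next;		/* unlink from src */
		r->next = *dst;		/* push onto dst */
		*dst = r;
		moved++;
	}
	return moved;
}

int main(void)
{
	struct req c = { 3, 0, NULL }, b = { 2, 1, &c }, a = { 1, 0, &b };
	struct req *written = &a, *committing = NULL;

	printf("moved %d requests\n", transfer(&written, &committing, 10));
	return 0;	/* moves 1 and 3, skips the locked request 2 */
}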
113static int
114pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
115 struct nfs_commit_info *cinfo,
116 int max)
117{
118 struct list_head *src = &bucket->written;
119 struct list_head *dst = &bucket->committing;
120 int ret;
121
122 lockdep_assert_held(cinfo->lock);
123 ret = pnfs_generic_transfer_commit_list(src, dst, cinfo, max);
124 if (ret) {
125 cinfo->ds->nwritten -= ret;
126 cinfo->ds->ncommitting += ret;
127 bucket->clseg = bucket->wlseg;
128 if (list_empty(src))
129 bucket->wlseg = NULL;
130 else
131 pnfs_get_lseg(bucket->clseg);
132 }
133 return ret;
134}
135
136/* Move reqs from written to committing lists, returning the
137 * number of requests moved.
138 */
139int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo,
140 int max)
141{
142 int i, rv = 0, cnt;
143
144 lockdep_assert_held(cinfo->lock);
145 for (i = 0; i < cinfo->ds->nbuckets && max != 0; i++) {
146 cnt = pnfs_generic_scan_ds_commit_list(&cinfo->ds->buckets[i],
147 cinfo, max);
148 max -= cnt;
149 rv += cnt;
150 }
151 return rv;
152}
153EXPORT_SYMBOL_GPL(pnfs_generic_scan_commit_lists);
154
155/* Pull everything off the committing lists and dump into @dst. */
156void pnfs_generic_recover_commit_reqs(struct list_head *dst,
157 struct nfs_commit_info *cinfo)
158{
159 struct pnfs_commit_bucket *b;
160 struct pnfs_layout_segment *freeme;
161 int i;
162
163 lockdep_assert_held(cinfo->lock);
164restart:
165 for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
166 if (pnfs_generic_transfer_commit_list(&b->written, dst,
167 cinfo, 0)) {
168 freeme = b->wlseg;
169 b->wlseg = NULL;
170 spin_unlock(cinfo->lock);
171 pnfs_put_lseg(freeme);
172 spin_lock(cinfo->lock);
173 goto restart;
174 }
175 }
176 cinfo->ds->nwritten = 0;
177}
178EXPORT_SYMBOL_GPL(pnfs_generic_recover_commit_reqs);
179
180static void pnfs_generic_retry_commit(struct nfs_commit_info *cinfo, int idx)
181{
182 struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
183 struct pnfs_commit_bucket *bucket;
184 struct pnfs_layout_segment *freeme;
185 int i;
186
187 for (i = idx; i < fl_cinfo->nbuckets; i++) {
188 bucket = &fl_cinfo->buckets[i];
189 if (list_empty(&bucket->committing))
190 continue;
191 nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo, i);
192 spin_lock(cinfo->lock);
193 freeme = bucket->clseg;
194 bucket->clseg = NULL;
195 spin_unlock(cinfo->lock);
196 pnfs_put_lseg(freeme);
197 }
198}
199
200static unsigned int
201pnfs_generic_alloc_ds_commits(struct nfs_commit_info *cinfo,
202 struct list_head *list)
203{
204 struct pnfs_ds_commit_info *fl_cinfo;
205 struct pnfs_commit_bucket *bucket;
206 struct nfs_commit_data *data;
207 int i;
208 unsigned int nreq = 0;
209
210 fl_cinfo = cinfo->ds;
211 bucket = fl_cinfo->buckets;
212 for (i = 0; i < fl_cinfo->nbuckets; i++, bucket++) {
213 if (list_empty(&bucket->committing))
214 continue;
215 data = nfs_commitdata_alloc();
216 if (!data)
217 break;
218 data->ds_commit_index = i;
219 spin_lock(cinfo->lock);
220 data->lseg = bucket->clseg;
221 bucket->clseg = NULL;
222 spin_unlock(cinfo->lock);
223 list_add(&data->pages, list);
224 nreq++;
225 }
226
227 /* Clean up on error */
228 pnfs_generic_retry_commit(cinfo, i);
229 return nreq;
230}
231
232/* This follows nfs_commit_list pretty closely */
233int
234pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
235 int how, struct nfs_commit_info *cinfo,
236 int (*initiate_commit)(struct nfs_commit_data *data,
237 int how))
238{
239 struct nfs_commit_data *data, *tmp;
240 LIST_HEAD(list);
241 unsigned int nreq = 0;
242
243 if (!list_empty(mds_pages)) {
244 data = nfs_commitdata_alloc();
245 if (data != NULL) {
246 data->lseg = NULL;
247 list_add(&data->pages, &list);
248 nreq++;
249 } else {
250 nfs_retry_commit(mds_pages, NULL, cinfo, 0);
251 pnfs_generic_retry_commit(cinfo, 0);
252 cinfo->completion_ops->error_cleanup(NFS_I(inode));
253 return -ENOMEM;
254 }
255 }
256
257 nreq += pnfs_generic_alloc_ds_commits(cinfo, &list);
258
259 if (nreq == 0) {
260 cinfo->completion_ops->error_cleanup(NFS_I(inode));
261 goto out;
262 }
263
264 atomic_add(nreq, &cinfo->mds->rpcs_out);
265
266 list_for_each_entry_safe(data, tmp, &list, pages) {
267 list_del_init(&data->pages);
268 if (!data->lseg) {
269 nfs_init_commit(data, mds_pages, NULL, cinfo);
270 nfs_initiate_commit(NFS_CLIENT(inode), data,
271 NFS_PROTO(data->inode),
272 data->mds_ops, how, 0);
273 } else {
274 struct pnfs_commit_bucket *buckets;
275
276 buckets = cinfo->ds->buckets;
277 nfs_init_commit(data,
278 &buckets[data->ds_commit_index].committing,
279 data->lseg,
280 cinfo);
281 initiate_commit(data, how);
282 }
283 }
284out:
285 cinfo->ds->ncommitting = 0;
286 return PNFS_ATTEMPTED;
287}
288EXPORT_SYMBOL_GPL(pnfs_generic_commit_pagelist);
289
290/*
291 * Data server cache
292 *
293 * Data servers can be mapped to different device ids.
294 * nfs4_pnfs_ds reference counting
295 * - set to 1 on allocation
296 * - incremented when a device id maps a data server already in the cache.
297 * - decremented when deviceid is removed from the cache.
298 */
299static DEFINE_SPINLOCK(nfs4_ds_cache_lock);
300static LIST_HEAD(nfs4_data_server_cache);
301
302/* Debug routines */
303static void
304print_ds(struct nfs4_pnfs_ds *ds)
305{
306 if (ds == NULL) {
307 printk(KERN_WARNING "%s NULL device\n", __func__);
308 return;
309 }
310 printk(KERN_WARNING " ds %s\n"
311 " ref count %d\n"
312 " client %p\n"
313 " cl_exchange_flags %x\n",
314 ds->ds_remotestr,
315 atomic_read(&ds->ds_count), ds->ds_clp,
316 ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0);
317}
318
319static bool
320same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2)
321{
322 struct sockaddr_in *a, *b;
323 struct sockaddr_in6 *a6, *b6;
324
325 if (addr1->sa_family != addr2->sa_family)
326 return false;
327
328 switch (addr1->sa_family) {
329 case AF_INET:
330 a = (struct sockaddr_in *)addr1;
331 b = (struct sockaddr_in *)addr2;
332
333 if (a->sin_addr.s_addr == b->sin_addr.s_addr &&
334 a->sin_port == b->sin_port)
335 return true;
336 break;
337
338 case AF_INET6:
339 a6 = (struct sockaddr_in6 *)addr1;
340 b6 = (struct sockaddr_in6 *)addr2;
341
342 /* LINKLOCAL addresses must have matching scope_id */
343 if (ipv6_addr_src_scope(&a6->sin6_addr) ==
344 IPV6_ADDR_SCOPE_LINKLOCAL &&
345 a6->sin6_scope_id != b6->sin6_scope_id)
346 return false;
347
348 if (ipv6_addr_equal(&a6->sin6_addr, &b6->sin6_addr) &&
349 a6->sin6_port == b6->sin6_port)
350 return true;
351 break;
352
353 default:
354 dprintk("%s: unhandled address family: %u\n",
355 __func__, addr1->sa_family);
356 return false;
357 }
358
359 return false;
360}
361
362static bool
363_same_data_server_addrs_locked(const struct list_head *dsaddrs1,
364 const struct list_head *dsaddrs2)
365{
366 struct nfs4_pnfs_ds_addr *da1, *da2;
367
368 /* step through both lists, comparing as we go */
369 for (da1 = list_first_entry(dsaddrs1, typeof(*da1), da_node),
370 da2 = list_first_entry(dsaddrs2, typeof(*da2), da_node);
371 da1 != NULL && da2 != NULL;
372 da1 = list_entry(da1->da_node.next, typeof(*da1), da_node),
373 da2 = list_entry(da2->da_node.next, typeof(*da2), da_node)) {
374 if (!same_sockaddr((struct sockaddr *)&da1->da_addr,
375 (struct sockaddr *)&da2->da_addr))
376 return false;
377 }
378 if (da1 == NULL && da2 == NULL)
379 return true;
380
381 return false;
382}
383
384/*
385 * Look up a DS by its addresses. nfs4_ds_cache_lock must be held.
386 */
387static struct nfs4_pnfs_ds *
388_data_server_lookup_locked(const struct list_head *dsaddrs)
389{
390 struct nfs4_pnfs_ds *ds;
391
392 list_for_each_entry(ds, &nfs4_data_server_cache, ds_node)
393 if (_same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs))
394 return ds;
395 return NULL;
396}
397
398static void destroy_ds(struct nfs4_pnfs_ds *ds)
399{
400 struct nfs4_pnfs_ds_addr *da;
401
402 dprintk("--> %s\n", __func__);
403 ifdebug(FACILITY)
404 print_ds(ds);
405
406 nfs_put_client(ds->ds_clp);
407
408 while (!list_empty(&ds->ds_addrs)) {
409 da = list_first_entry(&ds->ds_addrs,
410 struct nfs4_pnfs_ds_addr,
411 da_node);
412 list_del_init(&da->da_node);
413 kfree(da->da_remotestr);
414 kfree(da);
415 }
416
417 kfree(ds->ds_remotestr);
418 kfree(ds);
419}
420
421void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds)
422{
423 if (atomic_dec_and_lock(&ds->ds_count,
424 &nfs4_ds_cache_lock)) {
425 list_del_init(&ds->ds_node);
426 spin_unlock(&nfs4_ds_cache_lock);
427 destroy_ds(ds);
428 }
429}
430EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_put);
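/*
 * Editorial sketch (not part of the original commit): typical pairing of
 * the cache calls above from a layout driver.  example_use_ds() is
 * hypothetical and only illustrates the reference-counting rules
 * documented above.
 */
#if 0
static void example_use_ds(struct list_head *dsaddrs)
{
	struct nfs4_pnfs_ds *ds;

	/* a new entry starts with ds_count == 1; a cache hit bumps it */
	ds = nfs4_pnfs_ds_add(dsaddrs, GFP_KERNEL);
	if (!ds)
		return;

	/* ... connect and perform I/O through ds->ds_clp ... */

	/* the last put unhashes the entry and frees it via destroy_ds() */
	nfs4_pnfs_ds_put(ds);
}
#endif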
431
432/*
433 * Create a string with a human-readable address and port to avoid
434 * complicated setup around many dprintks.
435 */
436static char *
437nfs4_pnfs_remotestr(struct list_head *dsaddrs, gfp_t gfp_flags)
438{
439 struct nfs4_pnfs_ds_addr *da;
440 char *remotestr;
441 size_t len;
442 char *p;
443
444 len = 3; /* '{', '}' and eol */
445 list_for_each_entry(da, dsaddrs, da_node) {
446 len += strlen(da->da_remotestr) + 1; /* string plus comma */
447 }
448
449 remotestr = kzalloc(len, gfp_flags);
450 if (!remotestr)
451 return NULL;
452
453 p = remotestr;
454 *(p++) = '{';
455 len--;
456 list_for_each_entry(da, dsaddrs, da_node) {
457 size_t ll = strlen(da->da_remotestr);
458
459 if (ll > len)
460 goto out_err;
461
462 memcpy(p, da->da_remotestr, ll);
463 p += ll;
464 len -= ll;
465
466 if (len < 1)
467 goto out_err;
468 (*p++) = ',';
469 len--;
470 }
471 if (len < 2)
472 goto out_err;
473 *(p++) = '}';
474 *p = '\0';
475 return remotestr;
476out_err:
477 kfree(remotestr);
478 return NULL;
479}
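/*
 * Editorial note: for two addresses "10.0.0.1:2049" and "10.0.0.2:2049"
 * the resulting string is "{10.0.0.1:2049,10.0.0.2:2049,}" - each entry
 * keeps its trailing comma before the closing brace.
 */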
480
481/*
482 * Given a list of multipath struct nfs4_pnfs_ds_addr, add it to the ds
483 * cache if uncached, and return the cached struct nfs4_pnfs_ds.
484 */
485struct nfs4_pnfs_ds *
486nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags)
487{
488 struct nfs4_pnfs_ds *tmp_ds, *ds = NULL;
489 char *remotestr;
490
491 if (list_empty(dsaddrs)) {
492 dprintk("%s: no addresses defined\n", __func__);
493 goto out;
494 }
495
496 ds = kzalloc(sizeof(*ds), gfp_flags);
497 if (!ds)
498 goto out;
499
500 /* this is only used for debugging, so it's OK if it's NULL */
501 remotestr = nfs4_pnfs_remotestr(dsaddrs, gfp_flags);
502
503 spin_lock(&nfs4_ds_cache_lock);
504 tmp_ds = _data_server_lookup_locked(dsaddrs);
505 if (tmp_ds == NULL) {
506 INIT_LIST_HEAD(&ds->ds_addrs);
507 list_splice_init(dsaddrs, &ds->ds_addrs);
508 ds->ds_remotestr = remotestr;
509 atomic_set(&ds->ds_count, 1);
510 INIT_LIST_HEAD(&ds->ds_node);
511 ds->ds_clp = NULL;
512 list_add(&ds->ds_node, &nfs4_data_server_cache);
513 dprintk("%s add new data server %s\n", __func__,
514 ds->ds_remotestr);
515 } else {
516 kfree(remotestr);
517 kfree(ds);
518 atomic_inc(&tmp_ds->ds_count);
519 dprintk("%s data server %s found, inc'ed ds_count to %d\n",
520 __func__, tmp_ds->ds_remotestr,
521 atomic_read(&tmp_ds->ds_count));
522 ds = tmp_ds;
523 }
524 spin_unlock(&nfs4_ds_cache_lock);
525out:
526 return ds;
527}
528EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_add);
529
530static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds)
531{
532 might_sleep();
533 wait_on_bit(&ds->ds_state, NFS4DS_CONNECTING,
534 TASK_KILLABLE);
535}
536
537static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds)
538{
539 smp_mb__before_atomic();
540 clear_bit(NFS4DS_CONNECTING, &ds->ds_state);
541 smp_mb__after_atomic();
542 wake_up_bit(&ds->ds_state, NFS4DS_CONNECTING);
543}
544
545static struct nfs_client *(*get_v3_ds_connect)(
546 struct nfs_client *mds_clp,
547 const struct sockaddr *ds_addr,
548 int ds_addrlen,
549 int ds_proto,
550 unsigned int ds_timeo,
551 unsigned int ds_retrans,
552 rpc_authflavor_t au_flavor);
553
554static bool load_v3_ds_connect(void)
555{
556 if (!get_v3_ds_connect) {
557 get_v3_ds_connect = symbol_request(nfs3_set_ds_client);
558 WARN_ON_ONCE(!get_v3_ds_connect);
559 }
560
561 return get_v3_ds_connect != NULL;
562}
563
564void __exit nfs4_pnfs_v3_ds_connect_unload(void)
565{
566 if (get_v3_ds_connect) {
567 symbol_put(nfs3_set_ds_client);
568 get_v3_ds_connect = NULL;
569 }
570}
571EXPORT_SYMBOL_GPL(nfs4_pnfs_v3_ds_connect_unload);
572
573static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv,
574 struct nfs4_pnfs_ds *ds,
575 unsigned int timeo,
576 unsigned int retrans,
577 rpc_authflavor_t au_flavor)
578{
579 struct nfs_client *clp = ERR_PTR(-EIO);
580 struct nfs4_pnfs_ds_addr *da;
581 int status = 0;
582
583 dprintk("--> %s DS %s au_flavor %d\n", __func__,
584 ds->ds_remotestr, au_flavor);
585
586 if (!load_v3_ds_connect())
587 goto out;
588
589 list_for_each_entry(da, &ds->ds_addrs, da_node) {
590 dprintk("%s: DS %s: trying address %s\n",
591 __func__, ds->ds_remotestr, da->da_remotestr);
592
593 clp = get_v3_ds_connect(mds_srv->nfs_client,
594 (struct sockaddr *)&da->da_addr,
595 da->da_addrlen, IPPROTO_TCP,
596 timeo, retrans, au_flavor);
597 if (!IS_ERR(clp))
598 break;
599 }
600
601 if (IS_ERR(clp)) {
602 status = PTR_ERR(clp);
603 goto out;
604 }
605
606 smp_wmb();
607 ds->ds_clp = clp;
608 dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr);
609out:
610 return status;
611}
612
613static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
614 struct nfs4_pnfs_ds *ds,
615 unsigned int timeo,
616 unsigned int retrans,
617 u32 minor_version,
618 rpc_authflavor_t au_flavor)
619{
620 struct nfs_client *clp = ERR_PTR(-EIO);
621 struct nfs4_pnfs_ds_addr *da;
622 int status = 0;
623
624 dprintk("--> %s DS %s au_flavor %d\n", __func__, ds->ds_remotestr,
625 au_flavor);
626
627 list_for_each_entry(da, &ds->ds_addrs, da_node) {
628 dprintk("%s: DS %s: trying address %s\n",
629 __func__, ds->ds_remotestr, da->da_remotestr);
630
631 clp = nfs4_set_ds_client(mds_srv->nfs_client,
632 (struct sockaddr *)&da->da_addr,
633 da->da_addrlen, IPPROTO_TCP,
634 timeo, retrans, minor_version,
635 au_flavor);
636 if (!IS_ERR(clp))
637 break;
638 }
639
640 if (IS_ERR(clp)) {
641 status = PTR_ERR(clp);
642 goto out;
643 }
644
645 status = nfs4_init_ds_session(clp, mds_srv->nfs_client->cl_lease_time);
646 if (status)
647 goto out_put;
648
649 smp_wmb();
650 ds->ds_clp = clp;
651 dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr);
652out:
653 return status;
654out_put:
655 nfs_put_client(clp);
656 goto out;
657}
658
659/*
660 * Create an rpc connection to the nfs4_pnfs_ds data server.
661 * Currently only supports IPv4 and IPv6 addresses.
662 * If connection fails, make devid unavailable.
663 */
664void nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds,
665 struct nfs4_deviceid_node *devid, unsigned int timeo,
666 unsigned int retrans, u32 version,
667 u32 minor_version, rpc_authflavor_t au_flavor)
668{
669 if (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) == 0) {
670 int err = 0;
671
672 if (version == 3) {
673 err = _nfs4_pnfs_v3_ds_connect(mds_srv, ds, timeo,
674 retrans, au_flavor);
675 } else if (version == 4) {
676 err = _nfs4_pnfs_v4_ds_connect(mds_srv, ds, timeo,
677 retrans, minor_version,
678 au_flavor);
679 } else {
680 dprintk("%s: unsupported DS version %d\n", __func__,
681 version);
682 err = -EPROTONOSUPPORT;
683 }
684
685 if (err)
686 nfs4_mark_deviceid_unavailable(devid);
687 nfs4_clear_ds_conn_bit(ds);
688 } else {
689 nfs4_wait_ds_connect(ds);
690 }
691}
692EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_connect);
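/*
 * Editorial sketch (not part of the original commit): how a layout driver
 * might drive the connect path above.  The function name and the
 * timeo/retrans values are hypothetical.
 */
#if 0
static struct nfs_client *example_connect_ds(struct nfs_server *mds_srv,
					     struct nfs4_pnfs_ds *ds,
					     struct nfs4_deviceid_node *devid)
{
	/* the first caller connects; racing callers sleep in
	 * nfs4_wait_ds_connect() until NFS4DS_CONNECTING is cleared */
	nfs4_pnfs_ds_connect(mds_srv, ds, devid, 600, 2,
			     4 /* NFS version */, 1 /* minor version */,
			     RPC_AUTH_UNIX);

	/* pairs with the smp_wmb() before ds->ds_clp is published */
	smp_rmb();
	return ds->ds_clp;	/* still NULL if the connect failed */
}
#endif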
693
694/*
695 * Currently only supports IPv4, IPv6 and one multipath address.
696 */
697struct nfs4_pnfs_ds_addr *
698nfs4_decode_mp_ds_addr(struct net *net, struct xdr_stream *xdr, gfp_t gfp_flags)
699{
700 struct nfs4_pnfs_ds_addr *da = NULL;
701 char *buf, *portstr;
702 __be16 port;
703 int nlen, rlen;
704 int tmp[2];
705 __be32 *p;
706 char *netid, *match_netid;
707 size_t len, match_netid_len;
708 char *startsep = "";
709 char *endsep = "";
710
711
712 /* r_netid */
713 p = xdr_inline_decode(xdr, 4);
714 if (unlikely(!p))
715 goto out_err;
716 nlen = be32_to_cpup(p++);
717
718 p = xdr_inline_decode(xdr, nlen);
719 if (unlikely(!p))
720 goto out_err;
721
722 netid = kmalloc(nlen+1, gfp_flags);
723 if (unlikely(!netid))
724 goto out_err;
725
726 netid[nlen] = '\0';
727 memcpy(netid, p, nlen);
728
729 /* r_addr: ip/ip6addr with port in dec octets - see RFC 5665 */
730 p = xdr_inline_decode(xdr, 4);
731 if (unlikely(!p))
732 goto out_free_netid;
733 rlen = be32_to_cpup(p);
734
735 p = xdr_inline_decode(xdr, rlen);
736 if (unlikely(!p))
737 goto out_free_netid;
738
739 /* port is ".ABC.DEF", 8 chars max */
740 if (rlen > INET6_ADDRSTRLEN + IPV6_SCOPE_ID_LEN + 8) {
741 dprintk("%s: Invalid address, length %d\n", __func__,
742 rlen);
743 goto out_free_netid;
744 }
745 buf = kmalloc(rlen + 1, gfp_flags);
746 if (!buf) {
747 dprintk("%s: Not enough memory\n", __func__);
748 goto out_free_netid;
749 }
750 buf[rlen] = '\0';
751 memcpy(buf, p, rlen);
752
753 /* replace port '.' with '-' */
754 portstr = strrchr(buf, '.');
755 if (!portstr) {
756 dprintk("%s: Failed finding expected dot in port\n",
757 __func__);
758 goto out_free_buf;
759 }
760 *portstr = '-';
761
762 /* find '.' between address and port */
763 portstr = strrchr(buf, '.');
764 if (!portstr) {
765 dprintk("%s: Failed finding expected dot between address and "
766 "port\n", __func__);
767 goto out_free_buf;
768 }
769 *portstr = '\0';
770
771 da = kzalloc(sizeof(*da), gfp_flags);
772 if (unlikely(!da))
773 goto out_free_buf;
774
775 INIT_LIST_HEAD(&da->da_node);
776
777 if (!rpc_pton(net, buf, portstr-buf, (struct sockaddr *)&da->da_addr,
778 sizeof(da->da_addr))) {
779 dprintk("%s: error parsing address %s\n", __func__, buf);
780 goto out_free_da;
781 }
782
783 portstr++;
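	/* the two trailing octets give the port: e.g. "8-1" is (8 << 8) | 1 == 2049 */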
784 sscanf(portstr, "%d-%d", &tmp[0], &tmp[1]);
785 port = htons((tmp[0] << 8) | (tmp[1]));
786
787 switch (da->da_addr.ss_family) {
788 case AF_INET:
789 ((struct sockaddr_in *)&da->da_addr)->sin_port = port;
790 da->da_addrlen = sizeof(struct sockaddr_in);
791 match_netid = "tcp";
792 match_netid_len = 3;
793 break;
794
795 case AF_INET6:
796 ((struct sockaddr_in6 *)&da->da_addr)->sin6_port = port;
797 da->da_addrlen = sizeof(struct sockaddr_in6);
798 match_netid = "tcp6";
799 match_netid_len = 4;
800 startsep = "[";
801 endsep = "]";
802 break;
803
804 default:
805 dprintk("%s: unsupported address family: %u\n",
806 __func__, da->da_addr.ss_family);
807 goto out_free_da;
808 }
809
810 if (nlen != match_netid_len || strncmp(netid, match_netid, nlen)) {
811 dprintk("%s: ERROR: r_netid \"%s\" != \"%s\"\n",
812 __func__, netid, match_netid);
813 goto out_free_da;
814 }
815
816 /* save a human-readable address */
817 len = strlen(startsep) + strlen(buf) + strlen(endsep) + 7;
818 da->da_remotestr = kzalloc(len, gfp_flags);
819
820 /* NULL is ok, only used for dprintk */
821 if (da->da_remotestr)
822 snprintf(da->da_remotestr, len, "%s%s%s:%u", startsep,
823 buf, endsep, ntohs(port));
824
825 dprintk("%s: Parsed DS addr %s\n", __func__, da->da_remotestr);
826 kfree(buf);
827 kfree(netid);
828 return da;
829
830out_free_da:
831 kfree(da);
832out_free_buf:
833 dprintk("%s: Error parsing DS addr: %s\n", __func__, buf);
834 kfree(buf);
835out_free_netid:
836 kfree(netid);
837out_err:
838 return NULL;
839}
840EXPORT_SYMBOL_GPL(nfs4_decode_mp_ds_addr);
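/*
 * Editorial worked example: netid "tcp" with r_addr "192.168.1.1.8.1"
 * (an RFC 5665 universal address) parses to 192.168.1.1 with port
 * (8 << 8) | 1 == 2049, and da_remotestr becomes "192.168.1.1:2049".
 */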
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index c91a4799c562..568ecf0a880f 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -70,8 +70,15 @@ EXPORT_SYMBOL_GPL(nfs_pageio_init_read);
 
 void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio)
 {
+	struct nfs_pgio_mirror *mirror;
+
 	pgio->pg_ops = &nfs_pgio_rw_ops;
-	pgio->pg_bsize = NFS_SERVER(pgio->pg_inode)->rsize;
+
+	/* read path should never have more than one mirror */
+	WARN_ON_ONCE(pgio->pg_mirror_count != 1);
+
+	mirror = &pgio->pg_mirrors[0];
+	mirror->pg_bsize = NFS_SERVER(pgio->pg_inode)->rsize;
 }
 EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds);
 
@@ -81,6 +88,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
 	struct nfs_page	*new;
 	unsigned int len;
 	struct nfs_pageio_descriptor pgio;
+	struct nfs_pgio_mirror *pgm;
 
 	len = nfs_page_length(page);
 	if (len == 0)
@@ -97,7 +105,13 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
 			     &nfs_async_read_completion_ops);
 	nfs_pageio_add_request(&pgio, new);
 	nfs_pageio_complete(&pgio);
-	NFS_I(inode)->read_io += pgio.pg_bytes_written;
+
+	/* It doesn't make sense to do mirrored reads! */
+	WARN_ON_ONCE(pgio.pg_mirror_count != 1);
+
+	pgm = &pgio.pg_mirrors[0];
+	NFS_I(inode)->read_io += pgm->pg_bytes_written;
+
 	return 0;
 }
 
@@ -168,13 +182,14 @@ out:
 
 static void nfs_initiate_read(struct nfs_pgio_header *hdr,
 			      struct rpc_message *msg,
+			      const struct nfs_rpc_ops *rpc_ops,
 			      struct rpc_task_setup *task_setup_data, int how)
 {
 	struct inode *inode = hdr->inode;
 	int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0;
 
 	task_setup_data->flags |= swap_flags;
-	NFS_PROTO(inode)->read_setup(hdr, msg);
+	rpc_ops->read_setup(hdr, msg);
 }
 
 static void
@@ -351,6 +366,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
 		struct list_head *pages, unsigned nr_pages)
 {
 	struct nfs_pageio_descriptor pgio;
+	struct nfs_pgio_mirror *pgm;
 	struct nfs_readdesc desc = {
 		.pgio = &pgio,
 	};
@@ -386,10 +402,15 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
 			     &nfs_async_read_completion_ops);
 
 	ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc);
-
 	nfs_pageio_complete(&pgio);
-	NFS_I(inode)->read_io += pgio.pg_bytes_written;
-	npages = (pgio.pg_bytes_written + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+
+	/* It doesn't make sense to do mirrored reads! */
+	WARN_ON_ONCE(pgio.pg_mirror_count != 1);
+
+	pgm = &pgio.pg_mirrors[0];
+	NFS_I(inode)->read_io += pgm->pg_bytes_written;
+	npages = (pgm->pg_bytes_written + PAGE_CACHE_SIZE - 1) >>
+		 PAGE_CACHE_SHIFT;
 	nfs_add_stats(inode, NFSIOS_READPAGES, npages);
 read_complete:
 	put_nfs_open_context(desc.ctx);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 31a11b0e885d..322b2de02988 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -311,7 +311,6 @@ const struct super_operations nfs_sops = {
 	.destroy_inode	= nfs_destroy_inode,
 	.write_inode	= nfs_write_inode,
 	.drop_inode	= nfs_drop_inode,
-	.put_super	= nfs_put_super,
 	.statfs		= nfs_statfs,
 	.evict_inode	= nfs_evict_inode,
 	.umount_begin	= nfs_umount_begin,
@@ -405,12 +404,15 @@ void __exit unregister_nfs_fs(void)
 	unregister_filesystem(&nfs_fs_type);
 }
 
-void nfs_sb_active(struct super_block *sb)
+bool nfs_sb_active(struct super_block *sb)
 {
 	struct nfs_server *server = NFS_SB(sb);
 
-	if (atomic_inc_return(&server->active) == 1)
-		atomic_inc(&sb->s_active);
+	if (!atomic_inc_not_zero(&sb->s_active))
+		return false;
+	if (atomic_inc_return(&server->active) != 1)
+		atomic_dec(&sb->s_active);
+	return true;
 }
 EXPORT_SYMBOL_GPL(nfs_sb_active);
 
@@ -2569,7 +2571,7 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server,
 		error = nfs_bdi_register(server);
 		if (error) {
 			mntroot = ERR_PTR(error);
-			goto error_splat_bdi;
+			goto error_splat_super;
 		}
 		server->super = s;
 	}
@@ -2601,9 +2603,6 @@ error_splat_root:
 	dput(mntroot);
 	mntroot = ERR_PTR(error);
 error_splat_super:
-	if (server && !s->s_root)
-		bdi_unregister(&server->backing_dev_info);
-error_splat_bdi:
 	deactivate_locked_super(s);
 	goto out;
 }
@@ -2651,27 +2650,19 @@ out:
 EXPORT_SYMBOL_GPL(nfs_fs_mount);
 
 /*
- * Ensure that we unregister the bdi before kill_anon_super
- * releases the device name
- */
-void nfs_put_super(struct super_block *s)
-{
-	struct nfs_server *server = NFS_SB(s);
-
-	bdi_unregister(&server->backing_dev_info);
-}
-EXPORT_SYMBOL_GPL(nfs_put_super);
-
-/*
  * Destroy an NFS2/3 superblock
  */
 void nfs_kill_super(struct super_block *s)
 {
 	struct nfs_server *server = NFS_SB(s);
+	dev_t dev = s->s_dev;
+
+	generic_shutdown_super(s);
 
-	kill_anon_super(s);
 	nfs_fscache_release_super_cookie(s);
+
 	nfs_free_server(server);
+	free_anon_bdev(dev);
 }
 EXPORT_SYMBOL_GPL(nfs_kill_super);
 
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index af3af685a9e3..88a6d2196ece 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -473,13 +473,18 @@ try_again:
 	do {
 		/*
 		 * Subrequests are always contiguous, non overlapping
-		 * and in order. If not, it's a programming error.
+		 * and in order - but may be repeated (mirrored writes).
 		 */
-		WARN_ON_ONCE(subreq->wb_offset !=
-		     (head->wb_offset + total_bytes));
-
-		/* keep track of how many bytes this group covers */
-		total_bytes += subreq->wb_bytes;
+		if (subreq->wb_offset == (head->wb_offset + total_bytes)) {
+			/* keep track of how many bytes this group covers */
+			total_bytes += subreq->wb_bytes;
+		} else if (WARN_ON_ONCE(subreq->wb_offset < head->wb_offset ||
+			    ((subreq->wb_offset + subreq->wb_bytes) >
+			     (head->wb_offset + total_bytes)))) {
+			nfs_page_group_unlock(head);
+			spin_unlock(&inode->i_lock);
+			return ERR_PTR(-EIO);
+		}
 
 		if (!nfs_lock_request(subreq)) {
 			/* releases page group bit lock and
@@ -786,7 +791,7 @@ nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst,
 	spin_unlock(cinfo->lock);
 	if (!cinfo->dreq) {
 		inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
-		inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info,
+		inc_bdi_stat(inode_to_bdi(page_file_mapping(req->wb_page)->host),
 			     BDI_RECLAIMABLE);
 		__mark_inode_dirty(req->wb_context->dentry->d_inode,
 				   I_DIRTY_DATASYNC);
@@ -842,9 +847,9 @@ EXPORT_SYMBOL_GPL(nfs_init_cinfo);
  */
 void
 nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
-			struct nfs_commit_info *cinfo)
+			struct nfs_commit_info *cinfo, u32 ds_commit_idx)
 {
-	if (pnfs_mark_request_commit(req, lseg, cinfo))
+	if (pnfs_mark_request_commit(req, lseg, cinfo, ds_commit_idx))
 		return;
 	nfs_request_add_commit_list(req, &cinfo->mds->list, cinfo);
 }
@@ -853,7 +858,7 @@ static void
 nfs_clear_page_commit(struct page *page)
 {
 	dec_zone_page_state(page, NR_UNSTABLE_NFS);
-	dec_bdi_stat(page_file_mapping(page)->backing_dev_info, BDI_RECLAIMABLE);
+	dec_bdi_stat(inode_to_bdi(page_file_mapping(page)->host), BDI_RECLAIMABLE);
 }
 
 /* Called holding inode (/cinfo) lock */
@@ -900,7 +905,8 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr)
 		}
 		if (nfs_write_need_commit(hdr)) {
 			memcpy(&req->wb_verf, &hdr->verf.verifier, sizeof(req->wb_verf));
-			nfs_mark_request_commit(req, hdr->lseg, &cinfo);
+			nfs_mark_request_commit(req, hdr->lseg, &cinfo,
+				hdr->pgio_mirror_idx);
 			goto next;
 		}
 remove_req:
@@ -1091,6 +1097,7 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
 {
 	struct nfs_open_context *ctx = nfs_file_open_context(file);
 	struct nfs_lock_context *l_ctx;
+	struct file_lock_context *flctx = file_inode(file)->i_flctx;
 	struct nfs_page	*req;
 	int do_flush, status;
 	/*
@@ -1109,7 +1116,9 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
 		do_flush = req->wb_page != page || req->wb_context != ctx;
 		/* for now, flush if more than 1 request in page_group */
 		do_flush |= req->wb_this_page != req;
-		if (l_ctx && ctx->dentry->d_inode->i_flock != NULL) {
+		if (l_ctx && flctx &&
+		    !(list_empty_careful(&flctx->flc_posix) &&
+		      list_empty_careful(&flctx->flc_flock))) {
 			do_flush |= l_ctx->lockowner.l_owner != current->files
 				|| l_ctx->lockowner.l_pid != current->tgid;
 		}
@@ -1170,6 +1179,13 @@ out:
 	return PageUptodate(page) != 0;
 }
 
+static bool
+is_whole_file_wrlock(struct file_lock *fl)
+{
+	return fl->fl_start == 0 && fl->fl_end == OFFSET_MAX &&
+			fl->fl_type == F_WRLCK;
+}
+
 /* If we know the page is up to date, and we're not using byte range locks (or
  * if we have the whole file locked for writing), it may be more efficient to
  * extend the write to cover the entire page in order to avoid fragmentation
@@ -1180,17 +1196,36 @@ out:
  */
 static int nfs_can_extend_write(struct file *file, struct page *page, struct inode *inode)
 {
+	int ret;
+	struct file_lock_context *flctx = inode->i_flctx;
+	struct file_lock *fl;
+
 	if (file->f_flags & O_DSYNC)
 		return 0;
 	if (!nfs_write_pageuptodate(page, inode))
 		return 0;
 	if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE))
 		return 1;
-	if (inode->i_flock == NULL || (inode->i_flock->fl_start == 0 &&
-			inode->i_flock->fl_end == OFFSET_MAX &&
-			inode->i_flock->fl_type != F_RDLCK))
-		return 1;
-	return 0;
+	if (!flctx || (list_empty_careful(&flctx->flc_flock) &&
+		       list_empty_careful(&flctx->flc_posix)))
+		return 0;
+
+	/* Check to see if there are whole file write locks */
+	ret = 0;
+	spin_lock(&flctx->flc_lock);
+	if (!list_empty(&flctx->flc_posix)) {
+		fl = list_first_entry(&flctx->flc_posix, struct file_lock,
+					fl_list);
+		if (is_whole_file_wrlock(fl))
+			ret = 1;
+	} else if (!list_empty(&flctx->flc_flock)) {
+		fl = list_first_entry(&flctx->flc_flock, struct file_lock,
+					fl_list);
+		if (fl->fl_type == F_WRLCK)
+			ret = 1;
+	}
+	spin_unlock(&flctx->flc_lock);
+	return ret;
 }
 
 /*
@@ -1240,15 +1275,15 @@ static int flush_task_priority(int how)
 
 static void nfs_initiate_write(struct nfs_pgio_header *hdr,
 			       struct rpc_message *msg,
+			       const struct nfs_rpc_ops *rpc_ops,
 			       struct rpc_task_setup *task_setup_data, int how)
 {
-	struct inode *inode = hdr->inode;
 	int priority = flush_task_priority(how);
 
 	task_setup_data->priority = priority;
-	NFS_PROTO(inode)->write_setup(hdr, msg);
+	rpc_ops->write_setup(hdr, msg);
 
-	nfs4_state_protect_write(NFS_SERVER(inode)->nfs_client,
+	nfs4_state_protect_write(NFS_SERVER(hdr->inode)->nfs_client,
 				&task_setup_data->rpc_client, msg, hdr);
 }
 
@@ -1298,8 +1333,14 @@ EXPORT_SYMBOL_GPL(nfs_pageio_init_write);
 
 void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio)
 {
+	struct nfs_pgio_mirror *mirror;
+
 	pgio->pg_ops = &nfs_pgio_rw_ops;
-	pgio->pg_bsize = NFS_SERVER(pgio->pg_inode)->wsize;
+
+	nfs_pageio_stop_mirroring(pgio);
+
+	mirror = &pgio->pg_mirrors[0];
+	mirror->pg_bsize = NFS_SERVER(pgio->pg_inode)->wsize;
 }
 EXPORT_SYMBOL_GPL(nfs_pageio_reset_write_mds);
 
@@ -1465,6 +1506,7 @@ void nfs_commitdata_release(struct nfs_commit_data *data)
 EXPORT_SYMBOL_GPL(nfs_commitdata_release);
 
 int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data,
+			const struct nfs_rpc_ops *nfs_ops,
 			const struct rpc_call_ops *call_ops,
 			int how, int flags)
 {
@@ -1486,7 +1528,7 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data,
 		.priority = priority,
 	};
 	/* Set up the initial task struct.  */
-	NFS_PROTO(data->inode)->commit_setup(data, &msg);
+	nfs_ops->commit_setup(data, &msg);
 
 	dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
 
@@ -1554,17 +1596,18 @@ EXPORT_SYMBOL_GPL(nfs_init_commit);
 
 void nfs_retry_commit(struct list_head *page_list,
 		      struct pnfs_layout_segment *lseg,
-		      struct nfs_commit_info *cinfo)
+		      struct nfs_commit_info *cinfo,
+		      u32 ds_commit_idx)
 {
 	struct nfs_page	*req;
 
 	while (!list_empty(page_list)) {
 		req = nfs_list_entry(page_list->next);
 		nfs_list_remove_request(req);
-		nfs_mark_request_commit(req, lseg, cinfo);
+		nfs_mark_request_commit(req, lseg, cinfo, ds_commit_idx);
 		if (!cinfo->dreq) {
 			dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
-			dec_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info,
+			dec_bdi_stat(inode_to_bdi(page_file_mapping(req->wb_page)->host),
 				     BDI_RECLAIMABLE);
 		}
 		nfs_unlock_and_release_request(req);
@@ -1589,10 +1632,10 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how,
 	/* Set up the argument struct */
 	nfs_init_commit(data, head, NULL, cinfo);
 	atomic_inc(&cinfo->mds->rpcs_out);
-	return nfs_initiate_commit(NFS_CLIENT(inode), data, data->mds_ops,
-				   how, 0);
+	return nfs_initiate_commit(NFS_CLIENT(inode), data, NFS_PROTO(inode),
+				   data->mds_ops, how, 0);
  out_bad:
-	nfs_retry_commit(head, NULL, cinfo);
+	nfs_retry_commit(head, NULL, cinfo, 0);
 	cinfo->completion_ops->error_cleanup(NFS_I(inode));
 	return -ENOMEM;
 }
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 73395156bdb4..683bf718aead 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -82,6 +82,16 @@ config NFSD_V4
 
 	  If unsure, say N.
 
+config NFSD_PNFS
+	bool "NFSv4.1 server support for Parallel NFS (pNFS)"
+	depends on NFSD_V4
+	help
+	  This option enables support for the parallel NFS features of the
+	  minor version 1 of the NFSv4 protocol (RFC5661) in the kernel's NFS
+	  server.
+
+	  If unsure, say N.
+
 config NFSD_V4_SECURITY_LABEL
 	bool "Provide Security Label support for NFSv4 server"
 	depends on NFSD_V4 && SECURITY
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
index af32ef06b4fe..9a6028e120c6 100644
--- a/fs/nfsd/Makefile
+++ b/fs/nfsd/Makefile
@@ -2,9 +2,14 @@
 # Makefile for the Linux nfs server
 #
 
+ccflags-y += -I$(src)			# needed for trace events
+
 obj-$(CONFIG_NFSD)	+= nfsd.o
 
-nfsd-y			:= nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \
+# this one should be compiled first, as the tracing macros can easily blow up
+nfsd-y			+= trace.o
+
+nfsd-y			+= nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \
 			   export.o auth.o lockd.o nfscache.o nfsxdr.o stats.o
 nfsd-$(CONFIG_NFSD_FAULT_INJECTION) += fault_inject.o
 nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o
@@ -12,3 +17,4 @@ nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o
 nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o
 nfsd-$(CONFIG_NFSD_V4)	+= nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \
 			   nfs4acl.o nfs4callback.o nfs4recover.o
+nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o blocklayout.o blocklayoutxdr.o
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
new file mode 100644
index 000000000000..cdbc78c72542
--- /dev/null
+++ b/fs/nfsd/blocklayout.c
@@ -0,0 +1,189 @@
1/*
2 * Copyright (c) 2014 Christoph Hellwig.
3 */
4#include <linux/exportfs.h>
5#include <linux/genhd.h>
6#include <linux/slab.h>
7
8#include <linux/nfsd/debug.h>
9
10#include "blocklayoutxdr.h"
11#include "pnfs.h"
12
13#define NFSDDBG_FACILITY NFSDDBG_PNFS
14
15
16static int
17nfsd4_block_get_device_info_simple(struct super_block *sb,
18 struct nfsd4_getdeviceinfo *gdp)
19{
20 struct pnfs_block_deviceaddr *dev;
21 struct pnfs_block_volume *b;
22
23 dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) +
24 sizeof(struct pnfs_block_volume), GFP_KERNEL);
25 if (!dev)
26 return -ENOMEM;
27 gdp->gd_device = dev;
28
29 dev->nr_volumes = 1;
30 b = &dev->volumes[0];
31
32 b->type = PNFS_BLOCK_VOLUME_SIMPLE;
33 b->simple.sig_len = PNFS_BLOCK_UUID_LEN;
34 return sb->s_export_op->get_uuid(sb, b->simple.sig, &b->simple.sig_len,
35 &b->simple.offset);
36}
37
38static __be32
39nfsd4_block_proc_getdeviceinfo(struct super_block *sb,
40 struct nfsd4_getdeviceinfo *gdp)
41{
42 if (sb->s_bdev != sb->s_bdev->bd_contains)
43 return nfserr_inval;
44 return nfserrno(nfsd4_block_get_device_info_simple(sb, gdp));
45}
46
47static __be32
48nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp,
49 struct nfsd4_layoutget *args)
50{
51 struct nfsd4_layout_seg *seg = &args->lg_seg;
52 struct super_block *sb = inode->i_sb;
53 u32 block_size = (1 << inode->i_blkbits);
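	/* e.g. i_blkbits == 12 gives a 4096-byte filesystem block size */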
54 struct pnfs_block_extent *bex;
55 struct iomap iomap;
56 u32 device_generation = 0;
57 int error;
58
59 /*
60 * We do not attempt to support I/O smaller than the fs block size,
61 * or not aligned to it.
62 */
63 if (args->lg_minlength < block_size) {
64 dprintk("pnfsd: I/O too small\n");
65 goto out_layoutunavailable;
66 }
67 if (seg->offset & (block_size - 1)) {
68 dprintk("pnfsd: I/O misaligned\n");
69 goto out_layoutunavailable;
70 }
71
72 /*
73 * Some clients barf on non-zero block numbers for NONE or INVALID
74 * layouts, so make sure to zero the whole structure.
75 */
76 error = -ENOMEM;
77 bex = kzalloc(sizeof(*bex), GFP_KERNEL);
78 if (!bex)
79 goto out_error;
80 args->lg_content = bex;
81
82 error = sb->s_export_op->map_blocks(inode, seg->offset, seg->length,
83 &iomap, seg->iomode != IOMODE_READ,
84 &device_generation);
85 if (error) {
86 if (error == -ENXIO)
87 goto out_layoutunavailable;
88 goto out_error;
89 }
90
91 if (iomap.length < args->lg_minlength) {
92 dprintk("pnfsd: extent smaller than minlength\n");
93 goto out_layoutunavailable;
94 }
95
96 switch (iomap.type) {
97 case IOMAP_MAPPED:
98 if (seg->iomode == IOMODE_READ)
99 bex->es = PNFS_BLOCK_READ_DATA;
100 else
101 bex->es = PNFS_BLOCK_READWRITE_DATA;
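		/* iomap.blkno is in 512-byte sectors; shift it to a byte offset */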
102 bex->soff = (iomap.blkno << 9);
103 break;
104 case IOMAP_UNWRITTEN:
105 if (seg->iomode & IOMODE_RW) {
106 /*
107 * Crack monkey special case from section 2.3.1.
108 */
109 if (args->lg_minlength == 0) {
110 dprintk("pnfsd: no soup for you!\n");
111 goto out_layoutunavailable;
112 }
113
114 bex->es = PNFS_BLOCK_INVALID_DATA;
115 bex->soff = (iomap.blkno << 9);
116 break;
117 }
118 /*FALLTHRU*/
119 case IOMAP_HOLE:
120 if (seg->iomode == IOMODE_READ) {
121 bex->es = PNFS_BLOCK_NONE_DATA;
122 break;
123 }
124 /*FALLTHRU*/
125 case IOMAP_DELALLOC:
126 default:
127 WARN(1, "pnfsd: filesystem returned %d extent\n", iomap.type);
128 goto out_layoutunavailable;
129 }
130
131 error = nfsd4_set_deviceid(&bex->vol_id, fhp, device_generation);
132 if (error)
133 goto out_error;
134 bex->foff = iomap.offset;
135 bex->len = iomap.length;
136
137 seg->offset = iomap.offset;
138 seg->length = iomap.length;
139
140 dprintk("GET: %lld:%lld %d\n", bex->foff, bex->len, bex->es);
141 return 0;
142
143out_error:
144 seg->length = 0;
145 return nfserrno(error);
146out_layoutunavailable:
147 seg->length = 0;
148 return nfserr_layoutunavailable;
149}
150
151static __be32
152nfsd4_block_proc_layoutcommit(struct inode *inode,
153 struct nfsd4_layoutcommit *lcp)
154{
155 loff_t new_size = lcp->lc_last_wr + 1;
156 struct iattr iattr = { .ia_valid = 0 };
157 struct iomap *iomaps;
158 int nr_iomaps;
159 int error;
160
161 nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout,
162 lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits);
163 if (nr_iomaps < 0)
164 return nfserrno(nr_iomaps);
165
166 if (lcp->lc_mtime.tv_nsec == UTIME_NOW ||
167 timespec_compare(&lcp->lc_mtime, &inode->i_mtime) < 0)
168 lcp->lc_mtime = current_fs_time(inode->i_sb);
169 iattr.ia_valid |= ATTR_ATIME | ATTR_CTIME | ATTR_MTIME;
170 iattr.ia_atime = iattr.ia_ctime = iattr.ia_mtime = lcp->lc_mtime;
171
172 if (new_size > i_size_read(inode)) {
173 iattr.ia_valid |= ATTR_SIZE;
174 iattr.ia_size = new_size;
175 }
176
177 error = inode->i_sb->s_export_op->commit_blocks(inode, iomaps,
178 nr_iomaps, &iattr);
179 kfree(iomaps);
180 return nfserrno(error);
181}
182
183const struct nfsd4_layout_ops bl_layout_ops = {
184 .proc_getdeviceinfo = nfsd4_block_proc_getdeviceinfo,
185 .encode_getdeviceinfo = nfsd4_block_encode_getdeviceinfo,
186 .proc_layoutget = nfsd4_block_proc_layoutget,
187 .encode_layoutget = nfsd4_block_encode_layoutget,
188 .proc_layoutcommit = nfsd4_block_proc_layoutcommit,
189};
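/*
 * Editorial sketch (not part of the original commit): the export_operations
 * hooks an exporting filesystem must provide before nfsd4_setup_layout_type()
 * will offer the block layout for it.  The example_* names are hypothetical.
 */
#if 0
static const struct export_operations example_export_ops = {
	/* ... the usual filehandle/dentry methods ... */
	.get_uuid	= example_get_uuid,	/* volume signature for GETDEVICEINFO */
	.map_blocks	= example_map_blocks,	/* extent lookup for LAYOUTGET */
	.commit_blocks	= example_commit_blocks, /* extent conversion for LAYOUTCOMMIT */
};
#endif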
diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
new file mode 100644
index 000000000000..9da89fddab33
--- /dev/null
+++ b/fs/nfsd/blocklayoutxdr.c
@@ -0,0 +1,157 @@
1/*
2 * Copyright (c) 2014 Christoph Hellwig.
3 */
4#include <linux/sunrpc/svc.h>
5#include <linux/exportfs.h>
6#include <linux/nfs4.h>
7
8#include "nfsd.h"
9#include "blocklayoutxdr.h"
10
11#define NFSDDBG_FACILITY NFSDDBG_PNFS
12
13
14__be32
15nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
16 struct nfsd4_layoutget *lgp)
17{
18 struct pnfs_block_extent *b = lgp->lg_content;
19 int len = sizeof(__be32) + 5 * sizeof(__be64) + sizeof(__be32);
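	/* one extent-count word + deviceid (16 bytes) + foff/len/soff hypers
	 * (24 bytes) + one extent-state word: 4 + 40 + 4 == 48 bytes */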
20 __be32 *p;
21
22 p = xdr_reserve_space(xdr, sizeof(__be32) + len);
23 if (!p)
24 return nfserr_toosmall;
25
26 *p++ = cpu_to_be32(len);
27 *p++ = cpu_to_be32(1); /* we always return a single extent */
28
29 p = xdr_encode_opaque_fixed(p, &b->vol_id,
30 sizeof(struct nfsd4_deviceid));
31 p = xdr_encode_hyper(p, b->foff);
32 p = xdr_encode_hyper(p, b->len);
33 p = xdr_encode_hyper(p, b->soff);
34 *p++ = cpu_to_be32(b->es);
35 return 0;
36}
37
38static int
39nfsd4_block_encode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
40{
41 __be32 *p;
42 int len;
43
44 switch (b->type) {
45 case PNFS_BLOCK_VOLUME_SIMPLE:
46 len = 4 + 4 + 8 + 4 + b->simple.sig_len;
47 p = xdr_reserve_space(xdr, len);
48 if (!p)
49 return -ETOOSMALL;
50
51 *p++ = cpu_to_be32(b->type);
52 *p++ = cpu_to_be32(1); /* single signature */
53 p = xdr_encode_hyper(p, b->simple.offset);
54 p = xdr_encode_opaque(p, b->simple.sig, b->simple.sig_len);
55 break;
56 default:
57 return -ENOTSUPP;
58 }
59
60 return len;
61}
62
63__be32
64nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr,
65 struct nfsd4_getdeviceinfo *gdp)
66{
67 struct pnfs_block_deviceaddr *dev = gdp->gd_device;
68 int len = sizeof(__be32), ret, i;
69 __be32 *p;
70
71 p = xdr_reserve_space(xdr, len + sizeof(__be32));
72 if (!p)
73 return nfserr_resource;
74
75 for (i = 0; i < dev->nr_volumes; i++) {
76 ret = nfsd4_block_encode_volume(xdr, &dev->volumes[i]);
77 if (ret < 0)
78 return nfserrno(ret);
79 len += ret;
80 }
81
82 /*
83 * Fill in the overall length and number of volumes at the beginning
84 * of the layout.
85 */
86 *p++ = cpu_to_be32(len);
87 *p++ = cpu_to_be32(dev->nr_volumes);
88 return 0;
89}
90
91int
92nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
93 u32 block_size)
94{
95 struct iomap *iomaps;
96 u32 nr_iomaps, expected, i;
97
98 if (len < sizeof(u32)) {
99 dprintk("%s: extent array too small: %u\n", __func__, len);
100 return -EINVAL;
101 }
102
103 nr_iomaps = be32_to_cpup(p++);
104 expected = sizeof(__be32) + nr_iomaps * NFS4_BLOCK_EXTENT_SIZE;
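	/* e.g. a single 44-byte extent makes expected == 4 + 44 == 48 */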
105 if (len != expected) {
106 dprintk("%s: extent array size mismatch: %u/%u\n",
107 __func__, len, expected);
108 return -EINVAL;
109 }
110
111 iomaps = kcalloc(nr_iomaps, sizeof(*iomaps), GFP_KERNEL);
112 if (!iomaps) {
113 dprintk("%s: failed to allocate extent array\n", __func__);
114 return -ENOMEM;
115 }
116
117 for (i = 0; i < nr_iomaps; i++) {
118 struct pnfs_block_extent bex;
119
120 memcpy(&bex.vol_id, p, sizeof(struct nfsd4_deviceid));
121 p += XDR_QUADLEN(sizeof(struct nfsd4_deviceid));
122
123 p = xdr_decode_hyper(p, &bex.foff);
124 if (bex.foff & (block_size - 1)) {
125 dprintk("%s: unaligned offset %lld\n",
126 __func__, bex.foff);
127 goto fail;
128 }
129 p = xdr_decode_hyper(p, &bex.len);
130 if (bex.len & (block_size - 1)) {
131 dprintk("%s: unaligned length %lld\n",
132 __func__, bex.len);
133 goto fail;
134 }
135 p = xdr_decode_hyper(p, &bex.soff);
136 if (bex.soff & (block_size - 1)) {
137 dprintk("%s: unaligned disk offset %lld\n",
138 __func__, bex.soff);
139 goto fail;
140 }
141 bex.es = be32_to_cpup(p++);
142 if (bex.es != PNFS_BLOCK_READWRITE_DATA) {
143 dprintk("%s: incorrect extent state %d\n",
144 __func__, bex.es);
145 goto fail;
146 }
147
148 iomaps[i].offset = bex.foff;
149 iomaps[i].length = bex.len;
150 }
151
152 *iomapp = iomaps;
153 return nr_iomaps;
154fail:
155 kfree(iomaps);
156 return -EINVAL;
157}
diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h
new file mode 100644
index 000000000000..fdc79037c0e7
--- /dev/null
+++ b/fs/nfsd/blocklayoutxdr.h
@@ -0,0 +1,62 @@
1#ifndef _NFSD_BLOCKLAYOUTXDR_H
2#define _NFSD_BLOCKLAYOUTXDR_H 1
3
4#include <linux/blkdev.h>
5#include "xdr4.h"
6
7struct iomap;
8struct xdr_stream;
9
10enum pnfs_block_extent_state {
11 PNFS_BLOCK_READWRITE_DATA = 0,
12 PNFS_BLOCK_READ_DATA = 1,
13 PNFS_BLOCK_INVALID_DATA = 2,
14 PNFS_BLOCK_NONE_DATA = 3,
15};
16
17struct pnfs_block_extent {
18 struct nfsd4_deviceid vol_id;
19 u64 foff;
20 u64 len;
21 u64 soff;
22 enum pnfs_block_extent_state es;
23};
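/* 44 == deviceid (16) + foff (8) + len (8) + soff (8) + extent state (4) */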
24#define NFS4_BLOCK_EXTENT_SIZE 44
25
26enum pnfs_block_volume_type {
27 PNFS_BLOCK_VOLUME_SIMPLE = 0,
28 PNFS_BLOCK_VOLUME_SLICE = 1,
29 PNFS_BLOCK_VOLUME_CONCAT = 2,
30 PNFS_BLOCK_VOLUME_STRIPE = 3,
31};
32
33/*
34 * Arbitrary upper cap on the uuid length to avoid unbounded allocation.
35 * Not actually limited by the protocol.
36 */
37#define PNFS_BLOCK_UUID_LEN 128
38
39struct pnfs_block_volume {
40 enum pnfs_block_volume_type type;
41 union {
42 struct {
43 u64 offset;
44 u32 sig_len;
45 u8 sig[PNFS_BLOCK_UUID_LEN];
46 } simple;
47 };
48};
49
50struct pnfs_block_deviceaddr {
51 u32 nr_volumes;
52 struct pnfs_block_volume volumes[];
53};
54
55__be32 nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr,
56 struct nfsd4_getdeviceinfo *gdp);
57__be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
58 struct nfsd4_layoutget *lgp);
59int nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
60 u32 block_size);
61
62#endif /* _NFSD_BLOCKLAYOUTXDR_H */
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 30a739d896ff..c3e3b6e55ae2 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -20,6 +20,7 @@
 #include "nfsd.h"
 #include "nfsfh.h"
 #include "netns.h"
+#include "pnfs.h"
 
 #define NFSDDBG_FACILITY	NFSDDBG_EXPORT
 
@@ -545,6 +546,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 
 	exp.ex_client = dom;
 	exp.cd = cd;
+	exp.ex_devid_map = NULL;
 
 	/* expiry */
 	err = -EINVAL;
@@ -621,6 +623,8 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 		if (!gid_valid(exp.ex_anon_gid))
 			goto out4;
 		err = 0;
+
+		nfsd4_setup_layout_type(&exp);
 	}
 
 	expp = svc_export_lookup(&exp);
@@ -703,6 +707,7 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem)
 	new->ex_fslocs.locations = NULL;
 	new->ex_fslocs.locations_count = 0;
 	new->ex_fslocs.migrated = 0;
+	new->ex_layout_type = 0;
 	new->ex_uuid = NULL;
 	new->cd = item->cd;
 }
@@ -717,6 +722,8 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem)
 	new->ex_anon_uid = item->ex_anon_uid;
 	new->ex_anon_gid = item->ex_anon_gid;
 	new->ex_fsid = item->ex_fsid;
+	new->ex_devid_map = item->ex_devid_map;
+	item->ex_devid_map = NULL;
 	new->ex_uuid = item->ex_uuid;
 	item->ex_uuid = NULL;
 	new->ex_fslocs.locations = item->ex_fslocs.locations;
@@ -725,6 +732,7 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem)
 	item->ex_fslocs.locations_count = 0;
 	new->ex_fslocs.migrated = item->ex_fslocs.migrated;
 	item->ex_fslocs.migrated = 0;
+	new->ex_layout_type = item->ex_layout_type;
 	new->ex_nflavors = item->ex_nflavors;
 	for (i = 0; i < MAX_SECINFO_LIST; i++) {
 		new->ex_flavors[i] = item->ex_flavors[i];
diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h
index 04dc8c167b0c..1f52bfcc436f 100644
--- a/fs/nfsd/export.h
+++ b/fs/nfsd/export.h
@@ -56,6 +56,8 @@ struct svc_export {
 	struct nfsd4_fs_locations ex_fslocs;
 	uint32_t		ex_nflavors;
 	struct exp_flavor_info	ex_flavors[MAX_SECINFO_LIST];
+	enum pnfs_layouttype	ex_layout_type;
+	struct nfsd4_deviceid_map *ex_devid_map;
 	struct cache_detail	*cd;
 };
 
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 7cbdf1b2e4ab..58277859a467 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -546,6 +546,102 @@ out:
 	return status;
 }
 
+#ifdef CONFIG_NFSD_PNFS
+/*
+ * CB_LAYOUTRECALL4args
+ *
+ *	struct layoutrecall_file4 {
+ *		nfs_fh4			lor_fh;
+ *		offset4			lor_offset;
+ *		length4			lor_length;
+ *		stateid4		lor_stateid;
+ *	};
+ *
+ *	union layoutrecall4 switch(layoutrecall_type4 lor_recalltype) {
+ *	case LAYOUTRECALL4_FILE:
+ *		layoutrecall_file4 lor_layout;
+ *	case LAYOUTRECALL4_FSID:
+ *		fsid4              lor_fsid;
+ *	case LAYOUTRECALL4_ALL:
+ *		void;
+ *	};
+ *
+ *	struct CB_LAYOUTRECALL4args {
+ *		layouttype4		clora_type;
+ *		layoutiomode4		clora_iomode;
+ *		bool			clora_changed;
+ *		layoutrecall4		clora_recall;
+ *	};
+ */
+static void encode_cb_layout4args(struct xdr_stream *xdr,
+				  const struct nfs4_layout_stateid *ls,
+				  struct nfs4_cb_compound_hdr *hdr)
+{
+	__be32 *p;
+
+	BUG_ON(hdr->minorversion == 0);
+
+	p = xdr_reserve_space(xdr, 5 * 4);
+	*p++ = cpu_to_be32(OP_CB_LAYOUTRECALL);
+	*p++ = cpu_to_be32(ls->ls_layout_type);
+	*p++ = cpu_to_be32(IOMODE_ANY);
+	*p++ = cpu_to_be32(1);
+	*p = cpu_to_be32(RETURN_FILE);
+
+	encode_nfs_fh4(xdr, &ls->ls_stid.sc_file->fi_fhandle);
+
+	p = xdr_reserve_space(xdr, 2 * 8);
+	p = xdr_encode_hyper(p, 0);
+	xdr_encode_hyper(p, NFS4_MAX_UINT64);
+
+	encode_stateid4(xdr, &ls->ls_recall_sid);
+
+	hdr->nops++;
+}
+
+static void nfs4_xdr_enc_cb_layout(struct rpc_rqst *req,
+				   struct xdr_stream *xdr,
+				   const struct nfsd4_callback *cb)
+{
+	const struct nfs4_layout_stateid *ls =
+		container_of(cb, struct nfs4_layout_stateid, ls_recall);
+	struct nfs4_cb_compound_hdr hdr = {
+		.ident = 0,
+		.minorversion = cb->cb_minorversion,
+	};
+
+	encode_cb_compound4args(xdr, &hdr);
+	encode_cb_sequence4args(xdr, cb, &hdr);
+	encode_cb_layout4args(xdr, ls, &hdr);
+	encode_cb_nops(&hdr);
+}
+
+static int nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp,
+				  struct xdr_stream *xdr,
+				  struct nfsd4_callback *cb)
+{
+	struct nfs4_cb_compound_hdr hdr;
+	enum nfsstat4 nfserr;
+	int status;
+
+	status = decode_cb_compound4res(xdr, &hdr);
+	if (unlikely(status))
+		goto out;
+	if (cb) {
+		status = decode_cb_sequence4res(xdr, cb);
+		if (unlikely(status))
+			goto out;
+	}
+	status = decode_cb_op_status(xdr, OP_CB_LAYOUTRECALL, &nfserr);
+	if (unlikely(status))
+		goto out;
+	if (unlikely(nfserr != NFS4_OK))
+		status = nfs_cb_stat_to_errno(nfserr);
+out:
+	return status;
+}
+#endif /* CONFIG_NFSD_PNFS */
+
 /*
  * RPC procedure tables
  */
@@ -563,6 +659,9 @@ out:
 static struct rpc_procinfo nfs4_cb_procedures[] = {
 	PROC(CB_NULL,	NULL,		cb_null,	cb_null),
 	PROC(CB_RECALL,	COMPOUND,	cb_recall,	cb_recall),
+#ifdef CONFIG_NFSD_PNFS
+	PROC(CB_LAYOUT,	COMPOUND,	cb_layout,	cb_layout),
+#endif
 };
 
 static struct rpc_version nfs_cb_version4 = {
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
new file mode 100644
index 000000000000..3c1bfa155571
--- /dev/null
+++ b/fs/nfsd/nfs4layouts.c
@@ -0,0 +1,721 @@
1/*
2 * Copyright (c) 2014 Christoph Hellwig.
3 */
4#include <linux/kmod.h>
5#include <linux/file.h>
6#include <linux/jhash.h>
7#include <linux/sched.h>
8#include <linux/sunrpc/addr.h>
9
10#include "pnfs.h"
11#include "netns.h"
12#include "trace.h"
13
14#define NFSDDBG_FACILITY NFSDDBG_PNFS
15
16struct nfs4_layout {
17 struct list_head lo_perstate;
18 struct nfs4_layout_stateid *lo_state;
19 struct nfsd4_layout_seg lo_seg;
20};
21
22static struct kmem_cache *nfs4_layout_cache;
23static struct kmem_cache *nfs4_layout_stateid_cache;
24
25static struct nfsd4_callback_ops nfsd4_cb_layout_ops;
26static const struct lock_manager_operations nfsd4_layouts_lm_ops;
27
28const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] = {
29 [LAYOUT_BLOCK_VOLUME] = &bl_layout_ops,
30};
31
32/* pNFS device ID to export fsid mapping */
33#define DEVID_HASH_BITS 8
34#define DEVID_HASH_SIZE (1 << DEVID_HASH_BITS)
35#define DEVID_HASH_MASK (DEVID_HASH_SIZE - 1)
36static u64 nfsd_devid_seq = 1;
37static struct list_head nfsd_devid_hash[DEVID_HASH_SIZE];
38static DEFINE_SPINLOCK(nfsd_devid_lock);
39
40static inline u32 devid_hashfn(u64 idx)
41{
42 return jhash_2words(idx, idx >> 32, 0) & DEVID_HASH_MASK;
43}
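
A worked example of the bucket computation, as a sketch:

	/*
	 * idx = 0x100000002ULL
	 *   -> jhash_2words(0x00000002, 0x00000001, 0) & DEVID_HASH_MASK
	 * i.e. the u64 index is hashed as its two 32-bit halves, then
	 * masked down to one of DEVID_HASH_SIZE (256) buckets.
	 */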
44
45static void
46nfsd4_alloc_devid_map(const struct svc_fh *fhp)
47{
48 const struct knfsd_fh *fh = &fhp->fh_handle;
49 size_t fsid_len = key_len(fh->fh_fsid_type);
50 struct nfsd4_deviceid_map *map, *old;
51 int i;
52
53 map = kzalloc(sizeof(*map) + fsid_len, GFP_KERNEL);
54 if (!map)
55 return;
56
57 map->fsid_type = fh->fh_fsid_type;
58 memcpy(&map->fsid, fh->fh_fsid, fsid_len);
59
60 spin_lock(&nfsd_devid_lock);
61 if (fhp->fh_export->ex_devid_map)
62 goto out_unlock;
63
64 for (i = 0; i < DEVID_HASH_SIZE; i++) {
65 list_for_each_entry(old, &nfsd_devid_hash[i], hash) {
66 if (old->fsid_type != fh->fh_fsid_type)
67 continue;
68 if (memcmp(old->fsid, fh->fh_fsid,
69 key_len(old->fsid_type)))
70 continue;
71
72 fhp->fh_export->ex_devid_map = old;
73 goto out_unlock;
74 }
75 }
76
77 map->idx = nfsd_devid_seq++;
78 list_add_tail_rcu(&map->hash, &nfsd_devid_hash[devid_hashfn(map->idx)]);
79 fhp->fh_export->ex_devid_map = map;
80 map = NULL;
81
82out_unlock:
83 spin_unlock(&nfsd_devid_lock);
84 kfree(map);
85}
86
87struct nfsd4_deviceid_map *
88nfsd4_find_devid_map(int idx)
89{
90 struct nfsd4_deviceid_map *map, *ret = NULL;
91
92 rcu_read_lock();
93 list_for_each_entry_rcu(map, &nfsd_devid_hash[devid_hashfn(idx)], hash)
94 if (map->idx == idx)
95 ret = map;
96 rcu_read_unlock();
97
98 return ret;
99}
100
101int
102nfsd4_set_deviceid(struct nfsd4_deviceid *id, const struct svc_fh *fhp,
103 u32 device_generation)
104{
105 if (!fhp->fh_export->ex_devid_map) {
106 nfsd4_alloc_devid_map(fhp);
107 if (!fhp->fh_export->ex_devid_map)
108 return -ENOMEM;
109 }
110
111 id->fsid_idx = fhp->fh_export->ex_devid_map->idx;
112 id->generation = device_generation;
113 id->pad = 0;
114 return 0;
115}
116
117void nfsd4_setup_layout_type(struct svc_export *exp)
118{
119 struct super_block *sb = exp->ex_path.mnt->mnt_sb;
120
121 if (exp->ex_flags & NFSEXP_NOPNFS)
122 return;
123
124 if (sb->s_export_op->get_uuid &&
125 sb->s_export_op->map_blocks &&
126 sb->s_export_op->commit_blocks)
127 exp->ex_layout_type = LAYOUT_BLOCK_VOLUME;
128}
129
130static void
131nfsd4_free_layout_stateid(struct nfs4_stid *stid)
132{
133 struct nfs4_layout_stateid *ls = layoutstateid(stid);
134 struct nfs4_client *clp = ls->ls_stid.sc_client;
135 struct nfs4_file *fp = ls->ls_stid.sc_file;
136
137 trace_layoutstate_free(&ls->ls_stid.sc_stateid);
138
139 spin_lock(&clp->cl_lock);
140 list_del_init(&ls->ls_perclnt);
141 spin_unlock(&clp->cl_lock);
142
143 spin_lock(&fp->fi_lock);
144 list_del_init(&ls->ls_perfile);
145 spin_unlock(&fp->fi_lock);
146
147 vfs_setlease(ls->ls_file, F_UNLCK, NULL, (void **)&ls);
148 fput(ls->ls_file);
149
150 if (ls->ls_recalled)
151 atomic_dec(&ls->ls_stid.sc_file->fi_lo_recalls);
152
153 kmem_cache_free(nfs4_layout_stateid_cache, ls);
154}
155
156static int
157nfsd4_layout_setlease(struct nfs4_layout_stateid *ls)
158{
159 struct file_lock *fl;
160 int status;
161
162 fl = locks_alloc_lock();
163 if (!fl)
164 return -ENOMEM;
165 locks_init_lock(fl);
166 fl->fl_lmops = &nfsd4_layouts_lm_ops;
167 fl->fl_flags = FL_LAYOUT;
168 fl->fl_type = F_RDLCK;
169 fl->fl_end = OFFSET_MAX;
170 fl->fl_owner = ls;
171 fl->fl_pid = current->tgid;
172 fl->fl_file = ls->ls_file;
173
174 status = vfs_setlease(fl->fl_file, fl->fl_type, &fl, NULL);
175 if (status) {
176 locks_free_lock(fl);
177 return status;
178 }
179 BUG_ON(fl != NULL);
180 return 0;
181}
182
183static struct nfs4_layout_stateid *
184nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate,
185 struct nfs4_stid *parent, u32 layout_type)
186{
187 struct nfs4_client *clp = cstate->clp;
188 struct nfs4_file *fp = parent->sc_file;
189 struct nfs4_layout_stateid *ls;
190 struct nfs4_stid *stp;
191
192 stp = nfs4_alloc_stid(cstate->clp, nfs4_layout_stateid_cache);
193 if (!stp)
194 return NULL;
195 stp->sc_free = nfsd4_free_layout_stateid;
196 get_nfs4_file(fp);
197 stp->sc_file = fp;
198
199 ls = layoutstateid(stp);
200 INIT_LIST_HEAD(&ls->ls_perclnt);
201 INIT_LIST_HEAD(&ls->ls_perfile);
202 spin_lock_init(&ls->ls_lock);
203 INIT_LIST_HEAD(&ls->ls_layouts);
204 ls->ls_layout_type = layout_type;
205 nfsd4_init_cb(&ls->ls_recall, clp, &nfsd4_cb_layout_ops,
206 NFSPROC4_CLNT_CB_LAYOUT);
207
208 if (parent->sc_type == NFS4_DELEG_STID)
209 ls->ls_file = get_file(fp->fi_deleg_file);
210 else
211 ls->ls_file = find_any_file(fp);
212 BUG_ON(!ls->ls_file);
213
214 if (nfsd4_layout_setlease(ls)) {
215 fput(ls->ls_file); /* drop the file reference taken above */
216 put_nfs4_file(fp);
217 kmem_cache_free(nfs4_layout_stateid_cache, ls);
218 return NULL;
219 }
219
220 spin_lock(&clp->cl_lock);
221 stp->sc_type = NFS4_LAYOUT_STID;
222 list_add(&ls->ls_perclnt, &clp->cl_lo_states);
223 spin_unlock(&clp->cl_lock);
224
225 spin_lock(&fp->fi_lock);
226 list_add(&ls->ls_perfile, &fp->fi_lo_states);
227 spin_unlock(&fp->fi_lock);
228
229 trace_layoutstate_alloc(&ls->ls_stid.sc_stateid);
230 return ls;
231}
232
233__be32
234nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
235 struct nfsd4_compound_state *cstate, stateid_t *stateid,
236 bool create, u32 layout_type, struct nfs4_layout_stateid **lsp)
237{
238 struct nfs4_layout_stateid *ls;
239 struct nfs4_stid *stid;
240 unsigned char typemask = NFS4_LAYOUT_STID;
241 __be32 status;
242
243 if (create)
244 typemask |= (NFS4_OPEN_STID | NFS4_LOCK_STID | NFS4_DELEG_STID);
245
246 status = nfsd4_lookup_stateid(cstate, stateid, typemask, &stid,
247 net_generic(SVC_NET(rqstp), nfsd_net_id));
248 if (status)
249 goto out;
250
251 if (!fh_match(&cstate->current_fh.fh_handle,
252 &stid->sc_file->fi_fhandle)) {
253 status = nfserr_bad_stateid;
254 goto out_put_stid;
255 }
256
257 if (stid->sc_type != NFS4_LAYOUT_STID) {
258 ls = nfsd4_alloc_layout_stateid(cstate, stid, layout_type);
259 nfs4_put_stid(stid);
260
261 status = nfserr_jukebox;
262 if (!ls)
263 goto out;
264 } else {
265 ls = container_of(stid, struct nfs4_layout_stateid, ls_stid);
266
267 status = nfserr_bad_stateid;
268 if (stateid->si_generation > stid->sc_stateid.si_generation)
269 goto out_put_stid;
270 if (layout_type != ls->ls_layout_type)
271 goto out_put_stid;
272 }
273
274 *lsp = ls;
275 return 0;
276
277out_put_stid:
278 nfs4_put_stid(stid);
279out:
280 return status;
281}
282
283static void
284nfsd4_recall_file_layout(struct nfs4_layout_stateid *ls)
285{
286 spin_lock(&ls->ls_lock);
287 if (ls->ls_recalled)
288 goto out_unlock;
289
290 ls->ls_recalled = true;
291 atomic_inc(&ls->ls_stid.sc_file->fi_lo_recalls);
292 if (list_empty(&ls->ls_layouts))
293 goto out_unlock;
294
295 trace_layout_recall(&ls->ls_stid.sc_stateid);
296
297 atomic_inc(&ls->ls_stid.sc_count);
298 update_stateid(&ls->ls_stid.sc_stateid);
299 memcpy(&ls->ls_recall_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t));
300 nfsd4_run_cb(&ls->ls_recall);
301
302out_unlock:
303 spin_unlock(&ls->ls_lock);
304}
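
Notes on the bookkeeping above:

	/*
	 * ls_recalled bounds each layout stateid to at most one outstanding
	 * recall, and fi_lo_recalls makes new LAYOUTGETs on the file fail
	 * with NFS4ERR_RECALLCONFLICT until the recall finishes. The extra
	 * sc_count reference is owned by the callback and dropped in
	 * nfsd4_cb_layout_release(); the stateid is bumped before the copy
	 * so ls_recall_sid carries the post-recall seqid.
	 */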
305
306static inline u64
307layout_end(struct nfsd4_layout_seg *seg)
308{
309 u64 end = seg->offset + seg->length;
310 return end >= seg->offset ? end : NFS4_MAX_UINT64;
311}
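
A worked example of the saturating arithmetic, as a sketch:

	/*
	 * layout_end() saturates instead of wrapping:
	 *   offset = 0x10, length = NFS4_MAX_UINT64:
	 *	the u64 sum wraps below offset, so the helper returns
	 *	NFS4_MAX_UINT64, i.e. "to end of file";
	 *   offset = 0x10, length = 0x20: end = 0x30, returned as-is.
	 */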
312
313static void
314layout_update_len(struct nfsd4_layout_seg *lo, u64 end)
315{
316 if (end == NFS4_MAX_UINT64)
317 lo->length = NFS4_MAX_UINT64;
318 else
319 lo->length = end - lo->offset;
320}
321
322static bool
323layouts_overlapping(struct nfs4_layout *lo, struct nfsd4_layout_seg *s)
324{
325 if (s->iomode != IOMODE_ANY && s->iomode != lo->lo_seg.iomode)
326 return false;
327 if (layout_end(&lo->lo_seg) <= s->offset)
328 return false;
329 if (layout_end(s) <= lo->lo_seg.offset)
330 return false;
331 return true;
332}
333
334static bool
335layouts_try_merge(struct nfsd4_layout_seg *lo, struct nfsd4_layout_seg *new)
336{
337 if (lo->iomode != new->iomode)
338 return false;
339 if (layout_end(new) < lo->offset)
340 return false;
341 if (layout_end(lo) < new->offset)
342 return false;
343
344 lo->offset = min(lo->offset, new->offset);
345 layout_update_len(lo, max(layout_end(lo), layout_end(new)));
346 return true;
347}
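
Sketch of how the merge behaves for two segments of the same iomode, using exclusive end offsets:

	/*
	 *   [0,100) + [50,200)  -> [0,200)  (overlapping ranges)
	 *   [0,100) + [100,200) -> [0,200)  (adjacent ranges merge too,
	 *					since both comparisons above
	 *					are non-strict)
	 *   [0,100) + [200,300) -> no merge (layout_end(lo) < new->offset)
	 */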
348
349static __be32
350nfsd4_recall_conflict(struct nfs4_layout_stateid *ls)
351{
352 struct nfs4_file *fp = ls->ls_stid.sc_file;
353 struct nfs4_layout_stateid *l, *n;
354 __be32 nfserr = nfs_ok;
355
356 assert_spin_locked(&fp->fi_lock);
357
358 list_for_each_entry_safe(l, n, &fp->fi_lo_states, ls_perfile) {
359 if (l != ls) {
360 nfsd4_recall_file_layout(l);
361 nfserr = nfserr_recallconflict;
362 }
363 }
364
365 return nfserr;
366}
367
368__be32
369nfsd4_insert_layout(struct nfsd4_layoutget *lgp, struct nfs4_layout_stateid *ls)
370{
371 struct nfsd4_layout_seg *seg = &lgp->lg_seg;
372 struct nfs4_file *fp = ls->ls_stid.sc_file;
373 struct nfs4_layout *lp, *new = NULL;
374 __be32 nfserr;
375
376 spin_lock(&fp->fi_lock);
377 nfserr = nfsd4_recall_conflict(ls);
378 if (nfserr)
379 goto out;
380 spin_lock(&ls->ls_lock);
381 list_for_each_entry(lp, &ls->ls_layouts, lo_perstate) {
382 if (layouts_try_merge(&lp->lo_seg, seg))
383 goto done;
384 }
385 spin_unlock(&ls->ls_lock);
386 spin_unlock(&fp->fi_lock);
387
388 new = kmem_cache_alloc(nfs4_layout_cache, GFP_KERNEL);
389 if (!new)
390 return nfserr_jukebox;
391 memcpy(&new->lo_seg, seg, sizeof(new->lo_seg));
392 new->lo_state = ls;
393
394 spin_lock(&fp->fi_lock);
395 nfserr = nfsd4_recall_conflict(ls);
396 if (nfserr)
397 goto out;
398 spin_lock(&ls->ls_lock);
399 list_for_each_entry(lp, &ls->ls_layouts, lo_perstate) {
400 if (layouts_try_merge(&lp->lo_seg, seg))
401 goto done;
402 }
403
404 atomic_inc(&ls->ls_stid.sc_count);
405 list_add_tail(&new->lo_perstate, &ls->ls_layouts);
406 new = NULL;
407done:
408 update_stateid(&ls->ls_stid.sc_stateid);
409 memcpy(&lgp->lg_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t));
410 spin_unlock(&ls->ls_lock);
411out:
412 spin_unlock(&fp->fi_lock);
413 if (new)
414 kmem_cache_free(nfs4_layout_cache, new);
415 return nfserr;
416}
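
The duplicated conflict/merge check above is the usual drop-locks-to-allocate shape; a minimal sketch of the pattern, with generic names rather than nfsd API:

	/*
	 *	lock();
	 *	if (fast_path())		// conflict or mergeable entry
	 *		goto done;
	 *	unlock();
	 *	new = alloc(GFP_KERNEL);	// may sleep: no spinlocks held
	 *	lock();
	 *	if (fast_path())		// state may have changed
	 *		goto done;
	 *	link(new); new = NULL;
	 * done:
	 *	unlock();
	 *	if (new)			// still set only if not linked
	 *		free(new);
	 */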
417
418static void
419nfsd4_free_layouts(struct list_head *reaplist)
420{
421 while (!list_empty(reaplist)) {
422 struct nfs4_layout *lp = list_first_entry(reaplist,
423 struct nfs4_layout, lo_perstate);
424
425 list_del(&lp->lo_perstate);
426 nfs4_put_stid(&lp->lo_state->ls_stid);
427 kmem_cache_free(nfs4_layout_cache, lp);
428 }
429}
430
431static void
432nfsd4_return_file_layout(struct nfs4_layout *lp, struct nfsd4_layout_seg *seg,
433 struct list_head *reaplist)
434{
435 struct nfsd4_layout_seg *lo = &lp->lo_seg;
436 u64 end = layout_end(lo);
437
438 if (seg->offset <= lo->offset) {
439 if (layout_end(seg) >= end) {
440 list_move_tail(&lp->lo_perstate, reaplist);
441 return;
442 }
443 lo->offset = layout_end(seg);
444 } else {
445 /* retain the whole layout segment on a split. */
446 if (layout_end(seg) < end) {
447 dprintk("%s: split not supported\n", __func__);
448 return;
449 }
450
451 end = seg->offset;
452 }
453
454 layout_update_len(lo, end);
455}
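
The trim logic above works out as follows for a held segment [100,200) (end exclusive), as a sketch:

	/*
	 *   return [ 50,250): covers the whole segment, moved to reaplist;
	 *   return [ 50,150): head trim, held segment becomes [150,200);
	 *   return [150,250): tail trim, held segment becomes [100,150);
	 *   return [120,180): interior split, unsupported, kept whole.
	 */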
456
457__be32
458nfsd4_return_file_layouts(struct svc_rqst *rqstp,
459 struct nfsd4_compound_state *cstate,
460 struct nfsd4_layoutreturn *lrp)
461{
462 struct nfs4_layout_stateid *ls;
463 struct nfs4_layout *lp, *n;
464 LIST_HEAD(reaplist);
465 __be32 nfserr;
466 int found = 0;
467
468 nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lrp->lr_sid,
469 false, lrp->lr_layout_type,
470 &ls);
471 if (nfserr) {
472 trace_layout_return_lookup_fail(&lrp->lr_sid);
473 return nfserr;
474 }
475
476 spin_lock(&ls->ls_lock);
477 list_for_each_entry_safe(lp, n, &ls->ls_layouts, lo_perstate) {
478 if (layouts_overlapping(lp, &lrp->lr_seg)) {
479 nfsd4_return_file_layout(lp, &lrp->lr_seg, &reaplist);
480 found++;
481 }
482 }
483 if (!list_empty(&ls->ls_layouts)) {
484 if (found) {
485 update_stateid(&ls->ls_stid.sc_stateid);
486 memcpy(&lrp->lr_sid, &ls->ls_stid.sc_stateid,
487 sizeof(stateid_t));
488 }
489 lrp->lrs_present = 1;
490 } else {
491 trace_layoutstate_unhash(&ls->ls_stid.sc_stateid);
492 nfs4_unhash_stid(&ls->ls_stid);
493 lrp->lrs_present = 0;
494 }
495 spin_unlock(&ls->ls_lock);
496
497 nfs4_put_stid(&ls->ls_stid);
498 nfsd4_free_layouts(&reaplist);
499 return nfs_ok;
500}
501
502__be32
503nfsd4_return_client_layouts(struct svc_rqst *rqstp,
504 struct nfsd4_compound_state *cstate,
505 struct nfsd4_layoutreturn *lrp)
506{
507 struct nfs4_layout_stateid *ls, *n;
508 struct nfs4_client *clp = cstate->clp;
509 struct nfs4_layout *lp, *t;
510 LIST_HEAD(reaplist);
511
512 lrp->lrs_present = 0;
513
514 spin_lock(&clp->cl_lock);
515 list_for_each_entry_safe(ls, n, &clp->cl_lo_states, ls_perclnt) {
516 if (lrp->lr_return_type == RETURN_FSID &&
517 !fh_fsid_match(&ls->ls_stid.sc_file->fi_fhandle,
518 &cstate->current_fh.fh_handle))
519 continue;
520
521 spin_lock(&ls->ls_lock);
522 list_for_each_entry_safe(lp, t, &ls->ls_layouts, lo_perstate) {
523 if (lrp->lr_seg.iomode == IOMODE_ANY ||
524 lrp->lr_seg.iomode == lp->lo_seg.iomode)
525 list_move_tail(&lp->lo_perstate, &reaplist);
526 }
527 spin_unlock(&ls->ls_lock);
528 }
529 spin_unlock(&clp->cl_lock);
530
531 nfsd4_free_layouts(&reaplist);
532 return 0;
533}
534
535static void
536nfsd4_return_all_layouts(struct nfs4_layout_stateid *ls,
537 struct list_head *reaplist)
538{
539 spin_lock(&ls->ls_lock);
540 list_splice_init(&ls->ls_layouts, reaplist);
541 spin_unlock(&ls->ls_lock);
542}
543
544void
545nfsd4_return_all_client_layouts(struct nfs4_client *clp)
546{
547 struct nfs4_layout_stateid *ls, *n;
548 LIST_HEAD(reaplist);
549
550 spin_lock(&clp->cl_lock);
551 list_for_each_entry_safe(ls, n, &clp->cl_lo_states, ls_perclnt)
552 nfsd4_return_all_layouts(ls, &reaplist);
553 spin_unlock(&clp->cl_lock);
554
555 nfsd4_free_layouts(&reaplist);
556}
557
558void
559nfsd4_return_all_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp)
560{
561 struct nfs4_layout_stateid *ls, *n;
562 LIST_HEAD(reaplist);
563
564 spin_lock(&fp->fi_lock);
565 list_for_each_entry_safe(ls, n, &fp->fi_lo_states, ls_perfile) {
566 if (ls->ls_stid.sc_client == clp)
567 nfsd4_return_all_layouts(ls, &reaplist);
568 }
569 spin_unlock(&fp->fi_lock);
570
571 nfsd4_free_layouts(&reaplist);
572}
573
574static void
575nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls)
576{
577 struct nfs4_client *clp = ls->ls_stid.sc_client;
578 char addr_str[INET6_ADDRSTRLEN];
579 static char *envp[] = {
580 "HOME=/",
581 "TERM=linux",
582 "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
583 NULL
584 };
585 char *argv[8];
586 int error;
587
588 rpc_ntop((struct sockaddr *)&clp->cl_addr, addr_str, sizeof(addr_str));
589
592 printk(KERN_WARNING
593 "nfsd: client %s failed to respond to layout recall. "
594 "Fencing...\n", addr_str);
595
596 argv[0] = "/sbin/nfsd-recall-failed";
597 argv[1] = addr_str;
598 argv[2] = ls->ls_file->f_path.mnt->mnt_sb->s_id;
599 argv[3] = NULL;
600
601 error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
602 if (error) {
603 printk(KERN_ERR "nfsd: fence failed for client %s: %d!\n",
604 addr_str, error);
605 }
606}
607
608static int
609nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
610{
611 struct nfs4_layout_stateid *ls =
612 container_of(cb, struct nfs4_layout_stateid, ls_recall);
613 LIST_HEAD(reaplist);
614
615 switch (task->tk_status) {
616 case 0:
617 return 1;
618 case -NFS4ERR_NOMATCHING_LAYOUT:
619 trace_layout_recall_done(&ls->ls_stid.sc_stateid);
620 task->tk_status = 0;
621 return 1;
622 case -NFS4ERR_DELAY:
623 /* Poll the client until it's done with the layout */
624 /* FIXME: cap the number of retries.
625 * The pNFS standard states that we need to only expire
626 * the client after at least "lease time", e.g. lease-time * 2,
627 * when failing to communicate a recall.
628 */
629 rpc_delay(task, HZ/100); /* 10 milliseconds */
630 return 0;
631 default:
632 /*
633 * Unknown error or non-responding client, we'll need to fence.
634 */
635 nfsd4_cb_layout_fail(ls);
636 return -1;
637 }
638}
639
640static void
641nfsd4_cb_layout_release(struct nfsd4_callback *cb)
642{
643 struct nfs4_layout_stateid *ls =
644 container_of(cb, struct nfs4_layout_stateid, ls_recall);
645 LIST_HEAD(reaplist);
646
647 trace_layout_recall_release(&ls->ls_stid.sc_stateid);
648
649 nfsd4_return_all_layouts(ls, &reaplist);
650 nfsd4_free_layouts(&reaplist);
651 nfs4_put_stid(&ls->ls_stid);
652}
653
654static struct nfsd4_callback_ops nfsd4_cb_layout_ops = {
655 .done = nfsd4_cb_layout_done,
656 .release = nfsd4_cb_layout_release,
657};
658
659static bool
660nfsd4_layout_lm_break(struct file_lock *fl)
661{
662 /*
663 * We don't want the locks code to time out the lease for us;
664 * we'll remove it ourselves if a layout isn't returned
665 * in time:
666 */
667 fl->fl_break_time = 0;
668 nfsd4_recall_file_layout(fl->fl_owner);
669 return false;
670}
671
672static int
673nfsd4_layout_lm_change(struct file_lock *onlist, int arg,
674 struct list_head *dispose)
675{
676 BUG_ON(!(arg & F_UNLCK));
677 return lease_modify(onlist, arg, dispose);
678}
679
680static const struct lock_manager_operations nfsd4_layouts_lm_ops = {
681 .lm_break = nfsd4_layout_lm_break,
682 .lm_change = nfsd4_layout_lm_change,
683};
684
685int
686nfsd4_init_pnfs(void)
687{
688 int i;
689
690 for (i = 0; i < DEVID_HASH_SIZE; i++)
691 INIT_LIST_HEAD(&nfsd_devid_hash[i]);
692
693 nfs4_layout_cache = kmem_cache_create("nfs4_layout",
694 sizeof(struct nfs4_layout), 0, 0, NULL);
695 if (!nfs4_layout_cache)
696 return -ENOMEM;
697
698 nfs4_layout_stateid_cache = kmem_cache_create("nfs4_layout_stateid",
699 sizeof(struct nfs4_layout_stateid), 0, 0, NULL);
700 if (!nfs4_layout_stateid_cache) {
701 kmem_cache_destroy(nfs4_layout_cache);
702 return -ENOMEM;
703 }
704 return 0;
705}
706
707void
708nfsd4_exit_pnfs(void)
709{
710 int i;
711
712 kmem_cache_destroy(nfs4_layout_cache);
713 kmem_cache_destroy(nfs4_layout_stateid_cache);
714
715 for (i = 0; i < DEVID_HASH_SIZE; i++) {
716 struct nfsd4_deviceid_map *map, *n;
717
718 list_for_each_entry_safe(map, n, &nfsd_devid_hash[i], hash)
719 kfree(map);
720 }
721}
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index ac71d13c69ef..d30bea8d0277 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -43,6 +43,8 @@
43#include "current_stateid.h" 43#include "current_stateid.h"
44#include "netns.h" 44#include "netns.h"
45#include "acl.h" 45#include "acl.h"
46#include "pnfs.h"
47#include "trace.h"
46 48
47#ifdef CONFIG_NFSD_V4_SECURITY_LABEL 49#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
48#include <linux/security.h> 50#include <linux/security.h>
@@ -1178,6 +1180,259 @@ nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1178 return status == nfserr_same ? nfs_ok : status; 1180 return status == nfserr_same ? nfs_ok : status;
1179} 1181}
1180 1182
1183#ifdef CONFIG_NFSD_PNFS
1184static const struct nfsd4_layout_ops *
1185nfsd4_layout_verify(struct svc_export *exp, unsigned int layout_type)
1186{
1187 if (!exp->ex_layout_type) {
1188 dprintk("%s: export does not support pNFS\n", __func__);
1189 return NULL;
1190 }
1191
1192 if (exp->ex_layout_type != layout_type) {
1193 dprintk("%s: layout type %d not supported\n",
1194 __func__, layout_type);
1195 return NULL;
1196 }
1197
1198 return nfsd4_layout_ops[layout_type];
1199}
1200
1201static __be32
1202nfsd4_getdeviceinfo(struct svc_rqst *rqstp,
1203 struct nfsd4_compound_state *cstate,
1204 struct nfsd4_getdeviceinfo *gdp)
1205{
1206 const struct nfsd4_layout_ops *ops;
1207 struct nfsd4_deviceid_map *map;
1208 struct svc_export *exp;
1209 __be32 nfserr;
1210
1211 dprintk("%s: layout_type %u dev_id [0x%llx:0x%x] maxcnt %u\n",
1212 __func__,
1213 gdp->gd_layout_type,
1214 gdp->gd_devid.fsid_idx, gdp->gd_devid.generation,
1215 gdp->gd_maxcount);
1216
1217 map = nfsd4_find_devid_map(gdp->gd_devid.fsid_idx);
1218 if (!map) {
1219 dprintk("%s: couldn't find device ID to export mapping!\n",
1220 __func__);
1221 return nfserr_noent;
1222 }
1223
1224 exp = rqst_exp_find(rqstp, map->fsid_type, map->fsid);
1225 if (IS_ERR(exp)) {
1226 dprintk("%s: could not find device id\n", __func__);
1227 return nfserr_noent;
1228 }
1229
1230 nfserr = nfserr_layoutunavailable;
1231 ops = nfsd4_layout_verify(exp, gdp->gd_layout_type);
1232 if (!ops)
1233 goto out;
1234
1235 nfserr = nfs_ok;
1236 if (gdp->gd_maxcount != 0)
1237 nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb, gdp);
1238
1239 gdp->gd_notify_types &= ops->notify_types;
1240 exp_put(exp);
1241out:
1242 return nfserr;
1243}
1244
1245static __be32
1246nfsd4_layoutget(struct svc_rqst *rqstp,
1247 struct nfsd4_compound_state *cstate,
1248 struct nfsd4_layoutget *lgp)
1249{
1250 struct svc_fh *current_fh = &cstate->current_fh;
1251 const struct nfsd4_layout_ops *ops;
1252 struct nfs4_layout_stateid *ls;
1253 __be32 nfserr;
1254 int accmode;
1255
1256 switch (lgp->lg_seg.iomode) {
1257 case IOMODE_READ:
1258 accmode = NFSD_MAY_READ;
1259 break;
1260 case IOMODE_RW:
1261 accmode = NFSD_MAY_READ | NFSD_MAY_WRITE;
1262 break;
1263 default:
1264 dprintk("%s: invalid iomode %d\n",
1265 __func__, lgp->lg_seg.iomode);
1266 nfserr = nfserr_badiomode;
1267 goto out;
1268 }
1269
1270 nfserr = fh_verify(rqstp, current_fh, 0, accmode);
1271 if (nfserr)
1272 goto out;
1273
1274 nfserr = nfserr_layoutunavailable;
1275 ops = nfsd4_layout_verify(current_fh->fh_export, lgp->lg_layout_type);
1276 if (!ops)
1277 goto out;
1278
1279 /*
1280 * Verify minlength and range as per RFC5661:
1281 * o If loga_length is less than loga_minlength,
1282 * the metadata server MUST return NFS4ERR_INVAL.
1283 * o If the sum of loga_offset and loga_minlength exceeds
1284 * NFS4_UINT64_MAX, and loga_minlength is not
1285 * NFS4_UINT64_MAX, the error NFS4ERR_INVAL MUST result.
1286 * o If the sum of loga_offset and loga_length exceeds
1287 * NFS4_UINT64_MAX, and loga_length is not NFS4_UINT64_MAX,
1288 * the error NFS4ERR_INVAL MUST result.
1289 */
1290 nfserr = nfserr_inval;
1291 if (lgp->lg_seg.length < lgp->lg_minlength ||
1292 (lgp->lg_minlength != NFS4_MAX_UINT64 &&
1293 lgp->lg_minlength > NFS4_MAX_UINT64 - lgp->lg_seg.offset) ||
1294 (lgp->lg_seg.length != NFS4_MAX_UINT64 &&
1295 lgp->lg_seg.length > NFS4_MAX_UINT64 - lgp->lg_seg.offset))
1296 goto out;
1297 if (lgp->lg_seg.length == 0)
1298 goto out;
1299
1300 nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lgp->lg_sid,
1301 true, lgp->lg_layout_type, &ls);
1302 if (nfserr) {
1303 trace_layout_get_lookup_fail(&lgp->lg_sid);
1304 goto out;
1305 }
1306
1307 nfserr = nfserr_recallconflict;
1308 if (atomic_read(&ls->ls_stid.sc_file->fi_lo_recalls))
1309 goto out_put_stid;
1310
1311 nfserr = ops->proc_layoutget(current_fh->fh_dentry->d_inode,
1312 current_fh, lgp);
1313 if (nfserr)
1314 goto out_put_stid;
1315
1316 nfserr = nfsd4_insert_layout(lgp, ls);
1317
1318out_put_stid:
1319 nfs4_put_stid(&ls->ls_stid);
1320out:
1321 return nfserr;
1322}
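
Worked examples of the RFC 5661 range validation above, as a sketch:

	/*
	 *   offset = NFS4_MAX_UINT64 - 0xf, minlength = 0x20, length = 0x40:
	 *	minlength exceeds NFS4_MAX_UINT64 - offset -> NFS4ERR_INVAL;
	 *   length = 0: always rejected, a layout must cover something;
	 *   offset = 0, minlength = 4096, length = NFS4_MAX_UINT64:
	 *	valid "whole file" request; the all-ones length is exempt
	 *	from the overflow check by definition.
	 */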
1323
1324static __be32
1325nfsd4_layoutcommit(struct svc_rqst *rqstp,
1326 struct nfsd4_compound_state *cstate,
1327 struct nfsd4_layoutcommit *lcp)
1328{
1329 const struct nfsd4_layout_seg *seg = &lcp->lc_seg;
1330 struct svc_fh *current_fh = &cstate->current_fh;
1331 const struct nfsd4_layout_ops *ops;
1332 loff_t new_size = lcp->lc_last_wr + 1;
1333 struct inode *inode;
1334 struct nfs4_layout_stateid *ls;
1335 __be32 nfserr;
1336
1337 nfserr = fh_verify(rqstp, current_fh, 0, NFSD_MAY_WRITE);
1338 if (nfserr)
1339 goto out;
1340
1341 nfserr = nfserr_layoutunavailable;
1342 ops = nfsd4_layout_verify(current_fh->fh_export, lcp->lc_layout_type);
1343 if (!ops)
1344 goto out;
1345 inode = current_fh->fh_dentry->d_inode;
1346
1347 nfserr = nfserr_inval;
1348 if (new_size <= seg->offset) {
1349 dprintk("pnfsd: last write before layout segment\n");
1350 goto out;
1351 }
1352 if (new_size > seg->offset + seg->length) {
1353 dprintk("pnfsd: last write beyond layout segment\n");
1354 goto out;
1355 }
1356 if (!lcp->lc_newoffset && new_size > i_size_read(inode)) {
1357 dprintk("pnfsd: layoutcommit beyond EOF\n");
1358 goto out;
1359 }
1360
1361 nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lcp->lc_sid,
1362 false, lcp->lc_layout_type,
1363 &ls);
1364 if (nfserr) {
1365 trace_layout_commit_lookup_fail(&lcp->lc_sid);
1366 /* fixup error code as per RFC5661 */
1367 if (nfserr == nfserr_bad_stateid)
1368 nfserr = nfserr_badlayout;
1369 goto out;
1370 }
1371
1372 nfserr = ops->proc_layoutcommit(inode, lcp);
1373 if (nfserr)
1374 goto out_put_stid;
1375
1376 if (new_size > i_size_read(inode)) {
1377 lcp->lc_size_chg = 1;
1378 lcp->lc_newsize = new_size;
1379 } else {
1380 lcp->lc_size_chg = 0;
1381 }
1382
1383out_put_stid:
1384 nfs4_put_stid(&ls->ls_stid);
1385out:
1386 return nfserr;
1387}
1388
1389static __be32
1390nfsd4_layoutreturn(struct svc_rqst *rqstp,
1391 struct nfsd4_compound_state *cstate,
1392 struct nfsd4_layoutreturn *lrp)
1393{
1394 struct svc_fh *current_fh = &cstate->current_fh;
1395 __be32 nfserr;
1396
1397 nfserr = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP);
1398 if (nfserr)
1399 goto out;
1400
1401 nfserr = nfserr_layoutunavailable;
1402 if (!nfsd4_layout_verify(current_fh->fh_export, lrp->lr_layout_type))
1403 goto out;
1404
1405 switch (lrp->lr_seg.iomode) {
1406 case IOMODE_READ:
1407 case IOMODE_RW:
1408 case IOMODE_ANY:
1409 break;
1410 default:
1411 dprintk("%s: invalid iomode %d\n", __func__,
1412 lrp->lr_seg.iomode);
1413 nfserr = nfserr_inval;
1414 goto out;
1415 }
1416
1417 switch (lrp->lr_return_type) {
1418 case RETURN_FILE:
1419 nfserr = nfsd4_return_file_layouts(rqstp, cstate, lrp);
1420 break;
1421 case RETURN_FSID:
1422 case RETURN_ALL:
1423 nfserr = nfsd4_return_client_layouts(rqstp, cstate, lrp);
1424 break;
1425 default:
1426 dprintk("%s: invalid return_type %d\n", __func__,
1427 lrp->lr_return_type);
1428 nfserr = nfserr_inval;
1429 break;
1430 }
1431out:
1432 return nfserr;
1433}
1434#endif /* CONFIG_NFSD_PNFS */
1435
1181/* 1436/*
1182 * NULL call. 1437 * NULL call.
1183 */ 1438 */
@@ -1679,6 +1934,36 @@ static inline u32 nfsd4_create_session_rsize(struct svc_rqst *rqstp, struct nfsd
1679 op_encode_channel_attrs_maxsz) * sizeof(__be32); 1934 op_encode_channel_attrs_maxsz) * sizeof(__be32);
1680} 1935}
1681 1936
1937#ifdef CONFIG_NFSD_PNFS
1938/*
1939 * At this stage we don't really know what layout driver will handle the request,
1940 * so we need to define an arbitrary upper bound here.
1941 */
1942#define MAX_LAYOUT_SIZE 128
1943static inline u32 nfsd4_layoutget_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
1944{
1945 return (op_encode_hdr_size +
1946 1 /* logr_return_on_close */ +
1947 op_encode_stateid_maxsz +
1948 1 /* nr of layouts */ +
1949 MAX_LAYOUT_SIZE) * sizeof(__be32);
1950}
1951
1952static inline u32 nfsd4_layoutcommit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
1953{
1954 return (op_encode_hdr_size +
1955 1 /* locr_newsize */ +
1956 2 /* ns_size */) * sizeof(__be32);
1957}
1958
1959static inline u32 nfsd4_layoutreturn_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
1960{
1961 return (op_encode_hdr_size +
1962 1 /* lrs_stateid */ +
1963 op_encode_stateid_maxsz) * sizeof(__be32);
1964}
1965#endif /* CONFIG_NFSD_PNFS */
1966
1682static struct nfsd4_operation nfsd4_ops[] = { 1967static struct nfsd4_operation nfsd4_ops[] = {
1683 [OP_ACCESS] = { 1968 [OP_ACCESS] = {
1684 .op_func = (nfsd4op_func)nfsd4_access, 1969 .op_func = (nfsd4op_func)nfsd4_access,
@@ -1966,6 +2251,31 @@ static struct nfsd4_operation nfsd4_ops[] = {
1966 .op_get_currentstateid = (stateid_getter)nfsd4_get_freestateid, 2251 .op_get_currentstateid = (stateid_getter)nfsd4_get_freestateid,
1967 .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, 2252 .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
1968 }, 2253 },
2254#ifdef CONFIG_NFSD_PNFS
2255 [OP_GETDEVICEINFO] = {
2256 .op_func = (nfsd4op_func)nfsd4_getdeviceinfo,
2257 .op_flags = ALLOWED_WITHOUT_FH,
2258 .op_name = "OP_GETDEVICEINFO",
2259 },
2260 [OP_LAYOUTGET] = {
2261 .op_func = (nfsd4op_func)nfsd4_layoutget,
2262 .op_flags = OP_MODIFIES_SOMETHING,
2263 .op_name = "OP_LAYOUTGET",
2264 .op_rsize_bop = (nfsd4op_rsize)nfsd4_layoutget_rsize,
2265 },
2266 [OP_LAYOUTCOMMIT] = {
2267 .op_func = (nfsd4op_func)nfsd4_layoutcommit,
2268 .op_flags = OP_MODIFIES_SOMETHING,
2269 .op_name = "OP_LAYOUTCOMMIT",
2270 .op_rsize_bop = (nfsd4op_rsize)nfsd4_layoutcommit_rsize,
2271 },
2272 [OP_LAYOUTRETURN] = {
2273 .op_func = (nfsd4op_func)nfsd4_layoutreturn,
2274 .op_flags = OP_MODIFIES_SOMETHING,
2275 .op_name = "OP_LAYOUTRETURN",
2276 .op_rsize_bop = (nfsd4op_rsize)nfsd4_layoutreturn_rsize,
2277 },
2278#endif /* CONFIG_NFSD_PNFS */
1969 2279
1970 /* NFSv4.2 operations */ 2280 /* NFSv4.2 operations */
1971 [OP_ALLOCATE] = { 2281 [OP_ALLOCATE] = {
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index c06a1ba80d73..f6b2a09f793f 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -48,6 +48,7 @@
48#include "current_stateid.h" 48#include "current_stateid.h"
49 49
50#include "netns.h" 50#include "netns.h"
51#include "pnfs.h"
51 52
52#define NFSDDBG_FACILITY NFSDDBG_PROC 53#define NFSDDBG_FACILITY NFSDDBG_PROC
53 54
@@ -150,16 +151,6 @@ renew_client_locked(struct nfs4_client *clp)
150 clp->cl_time = get_seconds(); 151 clp->cl_time = get_seconds();
151} 152}
152 153
153static inline void
154renew_client(struct nfs4_client *clp)
155{
156 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
157
158 spin_lock(&nn->client_lock);
159 renew_client_locked(clp);
160 spin_unlock(&nn->client_lock);
161}
162
163static void put_client_renew_locked(struct nfs4_client *clp) 154static void put_client_renew_locked(struct nfs4_client *clp)
164{ 155{
165 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); 156 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
@@ -282,7 +273,7 @@ static void nfsd4_free_file_rcu(struct rcu_head *rcu)
282 kmem_cache_free(file_slab, fp); 273 kmem_cache_free(file_slab, fp);
283} 274}
284 275
285static inline void 276void
286put_nfs4_file(struct nfs4_file *fi) 277put_nfs4_file(struct nfs4_file *fi)
287{ 278{
288 might_lock(&state_lock); 279 might_lock(&state_lock);
@@ -295,12 +286,6 @@ put_nfs4_file(struct nfs4_file *fi)
295 } 286 }
296} 287}
297 288
298static inline void
299get_nfs4_file(struct nfs4_file *fi)
300{
301 atomic_inc(&fi->fi_ref);
302}
303
304static struct file * 289static struct file *
305__nfs4_get_fd(struct nfs4_file *f, int oflag) 290__nfs4_get_fd(struct nfs4_file *f, int oflag)
306{ 291{
@@ -358,7 +343,7 @@ find_readable_file(struct nfs4_file *f)
358 return ret; 343 return ret;
359} 344}
360 345
361static struct file * 346struct file *
362find_any_file(struct nfs4_file *f) 347find_any_file(struct nfs4_file *f)
363{ 348{
364 struct file *ret; 349 struct file *ret;
@@ -408,14 +393,6 @@ static unsigned int file_hashval(struct knfsd_fh *fh)
408 return nfsd_fh_hashval(fh) & (FILE_HASH_SIZE - 1); 393 return nfsd_fh_hashval(fh) & (FILE_HASH_SIZE - 1);
409} 394}
410 395
411static bool nfsd_fh_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
412{
413 return fh1->fh_size == fh2->fh_size &&
414 !memcmp(fh1->fh_base.fh_pad,
415 fh2->fh_base.fh_pad,
416 fh1->fh_size);
417}
418
419static struct hlist_head file_hashtbl[FILE_HASH_SIZE]; 396static struct hlist_head file_hashtbl[FILE_HASH_SIZE];
420 397
421static void 398static void
@@ -494,7 +471,7 @@ static void nfs4_file_put_access(struct nfs4_file *fp, u32 access)
494 __nfs4_file_put_access(fp, O_RDONLY); 471 __nfs4_file_put_access(fp, O_RDONLY);
495} 472}
496 473
497static struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, 474struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl,
498 struct kmem_cache *slab) 475 struct kmem_cache *slab)
499{ 476{
500 struct nfs4_stid *stid; 477 struct nfs4_stid *stid;
@@ -688,17 +665,17 @@ static void nfs4_put_deleg_lease(struct nfs4_file *fp)
688 struct file *filp = NULL; 665 struct file *filp = NULL;
689 666
690 spin_lock(&fp->fi_lock); 667 spin_lock(&fp->fi_lock);
691 if (fp->fi_deleg_file && atomic_dec_and_test(&fp->fi_delegees)) 668 if (fp->fi_deleg_file && --fp->fi_delegees == 0)
692 swap(filp, fp->fi_deleg_file); 669 swap(filp, fp->fi_deleg_file);
693 spin_unlock(&fp->fi_lock); 670 spin_unlock(&fp->fi_lock);
694 671
695 if (filp) { 672 if (filp) {
696 vfs_setlease(filp, F_UNLCK, NULL, NULL); 673 vfs_setlease(filp, F_UNLCK, NULL, (void **)&fp);
697 fput(filp); 674 fput(filp);
698 } 675 }
699} 676}
700 677
701static void unhash_stid(struct nfs4_stid *s) 678void nfs4_unhash_stid(struct nfs4_stid *s)
702{ 679{
703 s->sc_type = 0; 680 s->sc_type = 0;
704} 681}
@@ -1006,7 +983,7 @@ static void unhash_lock_stateid(struct nfs4_ol_stateid *stp)
1006 983
1007 list_del_init(&stp->st_locks); 984 list_del_init(&stp->st_locks);
1008 unhash_ol_stateid(stp); 985 unhash_ol_stateid(stp);
1009 unhash_stid(&stp->st_stid); 986 nfs4_unhash_stid(&stp->st_stid);
1010} 987}
1011 988
1012static void release_lock_stateid(struct nfs4_ol_stateid *stp) 989static void release_lock_stateid(struct nfs4_ol_stateid *stp)
@@ -1518,7 +1495,12 @@ unhash_session(struct nfsd4_session *ses)
1518static int 1495static int
1519STALE_CLIENTID(clientid_t *clid, struct nfsd_net *nn) 1496STALE_CLIENTID(clientid_t *clid, struct nfsd_net *nn)
1520{ 1497{
1521 if (clid->cl_boot == nn->boot_time) 1498 /*
1499 * We're assuming the clid was not given out from a boot
1500 * precisely 2^32 (about 136 years) before this one. That seems
1501 * a safe assumption:
1502 */
1503 if (clid->cl_boot == (u32)nn->boot_time)
1522 return 0; 1504 return 0;
1523 dprintk("NFSD stale clientid (%08x/%08x) boot_time %08lx\n", 1505 dprintk("NFSD stale clientid (%08x/%08x) boot_time %08lx\n",
1524 clid->cl_boot, clid->cl_id, nn->boot_time); 1506 clid->cl_boot, clid->cl_id, nn->boot_time);
@@ -1558,6 +1540,9 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
1558 INIT_LIST_HEAD(&clp->cl_lru); 1540 INIT_LIST_HEAD(&clp->cl_lru);
1559 INIT_LIST_HEAD(&clp->cl_callbacks); 1541 INIT_LIST_HEAD(&clp->cl_callbacks);
1560 INIT_LIST_HEAD(&clp->cl_revoked); 1542 INIT_LIST_HEAD(&clp->cl_revoked);
1543#ifdef CONFIG_NFSD_PNFS
1544 INIT_LIST_HEAD(&clp->cl_lo_states);
1545#endif
1561 spin_lock_init(&clp->cl_lock); 1546 spin_lock_init(&clp->cl_lock);
1562 rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table"); 1547 rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
1563 return clp; 1548 return clp;
@@ -1662,6 +1647,7 @@ __destroy_client(struct nfs4_client *clp)
1662 nfs4_get_stateowner(&oo->oo_owner); 1647 nfs4_get_stateowner(&oo->oo_owner);
1663 release_openowner(oo); 1648 release_openowner(oo);
1664 } 1649 }
1650 nfsd4_return_all_client_layouts(clp);
1665 nfsd4_shutdown_callback(clp); 1651 nfsd4_shutdown_callback(clp);
1666 if (clp->cl_cb_conn.cb_xprt) 1652 if (clp->cl_cb_conn.cb_xprt)
1667 svc_xprt_put(clp->cl_cb_conn.cb_xprt); 1653 svc_xprt_put(clp->cl_cb_conn.cb_xprt);
@@ -2145,8 +2131,11 @@ nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
2145static void 2131static void
2146nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid) 2132nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid)
2147{ 2133{
2148 /* pNFS is not supported */ 2134#ifdef CONFIG_NFSD_PNFS
2135 new->cl_exchange_flags |= EXCHGID4_FLAG_USE_PNFS_MDS;
2136#else
2149 new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS; 2137 new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS;
2138#endif
2150 2139
2151 /* Referrals are supported, Migration is not. */ 2140 /* Referrals are supported, Migration is not. */
2152 new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER; 2141 new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER;
@@ -3074,6 +3063,10 @@ static void nfsd4_init_file(struct knfsd_fh *fh, unsigned int hashval,
3074 fp->fi_share_deny = 0; 3063 fp->fi_share_deny = 0;
3075 memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); 3064 memset(fp->fi_fds, 0, sizeof(fp->fi_fds));
3076 memset(fp->fi_access, 0, sizeof(fp->fi_access)); 3065 memset(fp->fi_access, 0, sizeof(fp->fi_access));
3066#ifdef CONFIG_NFSD_PNFS
3067 INIT_LIST_HEAD(&fp->fi_lo_states);
3068 atomic_set(&fp->fi_lo_recalls, 0);
3069#endif
3077 hlist_add_head_rcu(&fp->fi_hash, &file_hashtbl[hashval]); 3070 hlist_add_head_rcu(&fp->fi_hash, &file_hashtbl[hashval]);
3078} 3071}
3079 3072
@@ -3300,7 +3293,7 @@ find_file_locked(struct knfsd_fh *fh, unsigned int hashval)
3300 struct nfs4_file *fp; 3293 struct nfs4_file *fp;
3301 3294
3302 hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash) { 3295 hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash) {
3303 if (nfsd_fh_match(&fp->fi_fhandle, fh)) { 3296 if (fh_match(&fp->fi_fhandle, fh)) {
3304 if (atomic_inc_not_zero(&fp->fi_ref)) 3297 if (atomic_inc_not_zero(&fp->fi_ref))
3305 return fp; 3298 return fp;
3306 } 3299 }
@@ -3308,7 +3301,7 @@ find_file_locked(struct knfsd_fh *fh, unsigned int hashval)
3308 return NULL; 3301 return NULL;
3309} 3302}
3310 3303
3311static struct nfs4_file * 3304struct nfs4_file *
3312find_file(struct knfsd_fh *fh) 3305find_file(struct knfsd_fh *fh)
3313{ 3306{
3314 struct nfs4_file *fp; 3307 struct nfs4_file *fp;
@@ -3477,7 +3470,8 @@ nfsd_break_deleg_cb(struct file_lock *fl)
3477} 3470}
3478 3471
3479static int 3472static int
3480nfsd_change_deleg_cb(struct file_lock **onlist, int arg, struct list_head *dispose) 3473nfsd_change_deleg_cb(struct file_lock *onlist, int arg,
3474 struct list_head *dispose)
3481{ 3475{
3482 if (arg & F_UNLCK) 3476 if (arg & F_UNLCK)
3483 return lease_modify(onlist, arg, dispose); 3477 return lease_modify(onlist, arg, dispose);
@@ -3855,12 +3849,12 @@ static int nfs4_setlease(struct nfs4_delegation *dp)
3855 /* Race breaker */ 3849 /* Race breaker */
3856 if (fp->fi_deleg_file) { 3850 if (fp->fi_deleg_file) {
3857 status = 0; 3851 status = 0;
3858 atomic_inc(&fp->fi_delegees); 3852 ++fp->fi_delegees;
3859 hash_delegation_locked(dp, fp); 3853 hash_delegation_locked(dp, fp);
3860 goto out_unlock; 3854 goto out_unlock;
3861 } 3855 }
3862 fp->fi_deleg_file = filp; 3856 fp->fi_deleg_file = filp;
3863 atomic_set(&fp->fi_delegees, 1); 3857 fp->fi_delegees = 1;
3864 hash_delegation_locked(dp, fp); 3858 hash_delegation_locked(dp, fp);
3865 spin_unlock(&fp->fi_lock); 3859 spin_unlock(&fp->fi_lock);
3866 spin_unlock(&state_lock); 3860 spin_unlock(&state_lock);
@@ -3901,7 +3895,7 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
3901 status = -EAGAIN; 3895 status = -EAGAIN;
3902 goto out_unlock; 3896 goto out_unlock;
3903 } 3897 }
3904 atomic_inc(&fp->fi_delegees); 3898 ++fp->fi_delegees;
3905 hash_delegation_locked(dp, fp); 3899 hash_delegation_locked(dp, fp);
3906 status = 0; 3900 status = 0;
3907out_unlock: 3901out_unlock:
@@ -4294,7 +4288,7 @@ laundromat_main(struct work_struct *laundry)
4294 4288
4295static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp) 4289static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp)
4296{ 4290{
4297 if (!nfsd_fh_match(&fhp->fh_handle, &stp->st_stid.sc_file->fi_fhandle)) 4291 if (!fh_match(&fhp->fh_handle, &stp->st_stid.sc_file->fi_fhandle))
4298 return nfserr_bad_stateid; 4292 return nfserr_bad_stateid;
4299 return nfs_ok; 4293 return nfs_ok;
4300} 4294}
@@ -4445,7 +4439,7 @@ out_unlock:
4445 return status; 4439 return status;
4446} 4440}
4447 4441
4448static __be32 4442__be32
4449nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, 4443nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
4450 stateid_t *stateid, unsigned char typemask, 4444 stateid_t *stateid, unsigned char typemask,
4451 struct nfs4_stid **s, struct nfsd_net *nn) 4445 struct nfs4_stid **s, struct nfsd_net *nn)
@@ -4859,6 +4853,9 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4859 update_stateid(&stp->st_stid.sc_stateid); 4853 update_stateid(&stp->st_stid.sc_stateid);
4860 memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); 4854 memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
4861 4855
4856 nfsd4_return_all_file_layouts(stp->st_stateowner->so_client,
4857 stp->st_stid.sc_file);
4858
4862 nfsd4_close_open_stateid(stp); 4859 nfsd4_close_open_stateid(stp);
4863 4860
4864 /* put reference from nfs4_preprocess_seqid_op */ 4861 /* put reference from nfs4_preprocess_seqid_op */
@@ -5556,10 +5553,11 @@ out_nfserr:
5556static bool 5553static bool
5557check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) 5554check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner)
5558{ 5555{
5559 struct file_lock **flpp; 5556 struct file_lock *fl;
5560 int status = false; 5557 int status = false;
5561 struct file *filp = find_any_file(fp); 5558 struct file *filp = find_any_file(fp);
5562 struct inode *inode; 5559 struct inode *inode;
5560 struct file_lock_context *flctx;
5563 5561
5564 if (!filp) { 5562 if (!filp) {
5565 /* Any valid lock stateid should have some sort of access */ 5563 /* Any valid lock stateid should have some sort of access */
@@ -5568,15 +5566,18 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner)
5568 } 5566 }
5569 5567
5570 inode = file_inode(filp); 5568 inode = file_inode(filp);
5569 flctx = inode->i_flctx;
5571 5570
5572 spin_lock(&inode->i_lock); 5571 if (flctx && !list_empty_careful(&flctx->flc_posix)) {
5573 for (flpp = &inode->i_flock; *flpp != NULL; flpp = &(*flpp)->fl_next) { 5572 spin_lock(&flctx->flc_lock);
5574 if ((*flpp)->fl_owner == (fl_owner_t)lowner) { 5573 list_for_each_entry(fl, &flctx->flc_posix, fl_list) {
5575 status = true; 5574 if (fl->fl_owner == (fl_owner_t)lowner) {
5576 break; 5575 status = true;
5576 break;
5577 }
5577 } 5578 }
5579 spin_unlock(&flctx->flc_lock);
5578 } 5580 }
5579 spin_unlock(&inode->i_lock);
5580 fput(filp); 5581 fput(filp);
5581 return status; 5582 return status;
5582} 5583}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 15f7b73e0c0f..df5e66caf100 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -47,6 +47,7 @@
47#include "state.h" 47#include "state.h"
48#include "cache.h" 48#include "cache.h"
49#include "netns.h" 49#include "netns.h"
50#include "pnfs.h"
50 51
51#ifdef CONFIG_NFSD_V4_SECURITY_LABEL 52#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
52#include <linux/security.h> 53#include <linux/security.h>
@@ -234,6 +235,26 @@ static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes)
234 return ret; 235 return ret;
235} 236}
236 237
238/*
239 * The high 32 bits of 'seconds' should be 0 per the protocol; 'nseconds'
240 * is validated and must be below 10^9 or we return NFS4ERR_INVAL.
241 */
242static __be32
243nfsd4_decode_time(struct nfsd4_compoundargs *argp, struct timespec *tv)
244{
245 DECODE_HEAD;
246 u64 sec;
247
248 READ_BUF(12);
249 p = xdr_decode_hyper(p, &sec);
250 tv->tv_sec = sec;
251 tv->tv_nsec = be32_to_cpup(p++);
252 if (tv->tv_nsec >= (u32)1000000000)
253 return nfserr_inval;
254
255 DECODE_TAIL;
256}
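
For reference, the nfstime4 wire format this helper consumes (12 bytes), as a sketch:

	/*
	 *	bytes 0..7  : seconds   (XDR hyper)
	 *	bytes 8..11 : nseconds  (XDR uint32)
	 * READ_BUF(12) guarantees both fields are present before decoding,
	 * and nseconds >= 10^9 fails the operation with NFS4ERR_INVAL.
	 */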
257
237static __be32 258static __be32
238nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval) 259nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
239{ 260{
@@ -267,7 +288,6 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
267{ 288{
268 int expected_len, len = 0; 289 int expected_len, len = 0;
269 u32 dummy32; 290 u32 dummy32;
270 u64 sec;
271 char *buf; 291 char *buf;
272 292
273 DECODE_HEAD; 293 DECODE_HEAD;
@@ -358,15 +378,10 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
358 dummy32 = be32_to_cpup(p++); 378 dummy32 = be32_to_cpup(p++);
359 switch (dummy32) { 379 switch (dummy32) {
360 case NFS4_SET_TO_CLIENT_TIME: 380 case NFS4_SET_TO_CLIENT_TIME:
361 /* We require the high 32 bits of 'seconds' to be 0, and we ignore
362 all 32 bits of 'nseconds'. */
363 READ_BUF(12);
364 len += 12; 381 len += 12;
365 p = xdr_decode_hyper(p, &sec); 382 status = nfsd4_decode_time(argp, &iattr->ia_atime);
366 iattr->ia_atime.tv_sec = (time_t)sec; 383 if (status)
367 iattr->ia_atime.tv_nsec = be32_to_cpup(p++); 384 return status;
368 if (iattr->ia_atime.tv_nsec >= (u32)1000000000)
369 return nfserr_inval;
370 iattr->ia_valid |= (ATTR_ATIME | ATTR_ATIME_SET); 385 iattr->ia_valid |= (ATTR_ATIME | ATTR_ATIME_SET);
371 break; 386 break;
372 case NFS4_SET_TO_SERVER_TIME: 387 case NFS4_SET_TO_SERVER_TIME:
@@ -382,15 +397,10 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
382 dummy32 = be32_to_cpup(p++); 397 dummy32 = be32_to_cpup(p++);
383 switch (dummy32) { 398 switch (dummy32) {
384 case NFS4_SET_TO_CLIENT_TIME: 399 case NFS4_SET_TO_CLIENT_TIME:
385 /* We require the high 32 bits of 'seconds' to be 0, and we ignore
386 all 32 bits of 'nseconds'. */
387 READ_BUF(12);
388 len += 12; 400 len += 12;
389 p = xdr_decode_hyper(p, &sec); 401 status = nfsd4_decode_time(argp, &iattr->ia_mtime);
390 iattr->ia_mtime.tv_sec = sec; 402 if (status)
391 iattr->ia_mtime.tv_nsec = be32_to_cpup(p++); 403 return status;
392 if (iattr->ia_mtime.tv_nsec >= (u32)1000000000)
393 return nfserr_inval;
394 iattr->ia_valid |= (ATTR_MTIME | ATTR_MTIME_SET); 404 iattr->ia_valid |= (ATTR_MTIME | ATTR_MTIME_SET);
395 break; 405 break;
396 case NFS4_SET_TO_SERVER_TIME: 406 case NFS4_SET_TO_SERVER_TIME:
@@ -1513,6 +1523,127 @@ static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, str
1513 DECODE_TAIL; 1523 DECODE_TAIL;
1514} 1524}
1515 1525
1526#ifdef CONFIG_NFSD_PNFS
1527static __be32
1528nfsd4_decode_getdeviceinfo(struct nfsd4_compoundargs *argp,
1529 struct nfsd4_getdeviceinfo *gdev)
1530{
1531 DECODE_HEAD;
1532 u32 num, i;
1533
1534 READ_BUF(sizeof(struct nfsd4_deviceid) + 3 * 4);
1535 COPYMEM(&gdev->gd_devid, sizeof(struct nfsd4_deviceid));
1536 gdev->gd_layout_type = be32_to_cpup(p++);
1537 gdev->gd_maxcount = be32_to_cpup(p++);
1538 num = be32_to_cpup(p++);
1539 if (num) {
1540 READ_BUF(4 * num);
1541 gdev->gd_notify_types = be32_to_cpup(p++);
1542 for (i = 1; i < num; i++) {
1543 if (be32_to_cpup(p++)) {
1544 status = nfserr_inval;
1545 goto out;
1546 }
1547 }
1548 }
1549 DECODE_TAIL;
1550}
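
A note on the notification bitmap handling above, with the field name taken from RFC 5661:

	/*
	 * gdia_notify_types is an XDR bitmap4, i.e. a counted array of
	 * uint32 words. Only word 0 is meaningful to this server, so it is
	 * saved in gd_notify_types; any set bit in words 1..num-1 makes the
	 * request fail with NFS4ERR_INVAL.
	 */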
1551
1552static __be32
1553nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp,
1554 struct nfsd4_layoutget *lgp)
1555{
1556 DECODE_HEAD;
1557
1558 READ_BUF(36);
1559 lgp->lg_signal = be32_to_cpup(p++);
1560 lgp->lg_layout_type = be32_to_cpup(p++);
1561 lgp->lg_seg.iomode = be32_to_cpup(p++);
1562 p = xdr_decode_hyper(p, &lgp->lg_seg.offset);
1563 p = xdr_decode_hyper(p, &lgp->lg_seg.length);
1564 p = xdr_decode_hyper(p, &lgp->lg_minlength);
1565 status = nfsd4_decode_stateid(argp, &lgp->lg_sid);
1566 if (status)
1567 return status;
1568 READ_BUF(4);
1569 lgp->lg_maxcount = be32_to_cpup(p++);
1570
1571 DECODE_TAIL;
1572}
1571
1572static __be32
1573nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp,
1574 struct nfsd4_layoutcommit *lcp)
1575{
1576 DECODE_HEAD;
1577 u32 timechange;
1578
1579 READ_BUF(20);
1580 p = xdr_decode_hyper(p, &lcp->lc_seg.offset);
1581 p = xdr_decode_hyper(p, &lcp->lc_seg.length);
1582 lcp->lc_reclaim = be32_to_cpup(p++);
1583 status = nfsd4_decode_stateid(argp, &lcp->lc_sid);
1584 if (status)
1585 return status;
1586 READ_BUF(4);
1587 lcp->lc_newoffset = be32_to_cpup(p++);
1588 if (lcp->lc_newoffset) {
1589 READ_BUF(8);
1590 p = xdr_decode_hyper(p, &lcp->lc_last_wr);
1591 } else
1592 lcp->lc_last_wr = 0;
1593 READ_BUF(4);
1594 timechange = be32_to_cpup(p++);
1595 if (timechange) {
1596 status = nfsd4_decode_time(argp, &lcp->lc_mtime);
1597 if (status)
1598 return status;
1599 } else {
1600 lcp->lc_mtime.tv_nsec = UTIME_NOW;
1601 }
1602 READ_BUF(8);
1603 lcp->lc_layout_type = be32_to_cpup(p++);
1604
1605 /*
1606 * Save the layout update in XDR format and let the layout driver deal
1607 * with it later.
1608 */
1609 lcp->lc_up_len = be32_to_cpup(p++);
1610 if (lcp->lc_up_len > 0) {
1611 READ_BUF(lcp->lc_up_len);
1612 READMEM(lcp->lc_up_layout, lcp->lc_up_len);
1613 }
1614
1615 DECODE_TAIL;
1616}
1615
1616static __be32
1617nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp,
1618 struct nfsd4_layoutreturn *lrp)
1619{
1620 DECODE_HEAD;
1621
1622 READ_BUF(16);
1623 lrp->lr_reclaim = be32_to_cpup(p++);
1624 lrp->lr_layout_type = be32_to_cpup(p++);
1625 lrp->lr_seg.iomode = be32_to_cpup(p++);
1626 lrp->lr_return_type = be32_to_cpup(p++);
1627 if (lrp->lr_return_type == RETURN_FILE) {
1628 READ_BUF(16);
1629 p = xdr_decode_hyper(p, &lrp->lr_seg.offset);
1630 p = xdr_decode_hyper(p, &lrp->lr_seg.length);
1631 status = nfsd4_decode_stateid(argp, &lrp->lr_sid);
1632 if (status)
1633 return status;
1634 READ_BUF(4);
1635 lrp->lrf_body_len = be32_to_cpup(p++);
1636 if (lrp->lrf_body_len > 0) {
1637 READ_BUF(lrp->lrf_body_len);
1638 READMEM(lrp->lrf_body, lrp->lrf_body_len);
1639 }
1640 } else {
1641 lrp->lr_seg.offset = 0;
1642 lrp->lr_seg.length = NFS4_MAX_UINT64;
1643 }
1644
1645 DECODE_TAIL;
1646}
1645#endif /* CONFIG_NFSD_PNFS */
1646
1516static __be32 1647static __be32
1517nfsd4_decode_fallocate(struct nfsd4_compoundargs *argp, 1648nfsd4_decode_fallocate(struct nfsd4_compoundargs *argp,
1518 struct nfsd4_fallocate *fallocate) 1649 struct nfsd4_fallocate *fallocate)
@@ -1607,11 +1738,19 @@ static nfsd4_dec nfsd4_dec_ops[] = {
1607 [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session, 1738 [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session,
1608 [OP_FREE_STATEID] = (nfsd4_dec)nfsd4_decode_free_stateid, 1739 [OP_FREE_STATEID] = (nfsd4_dec)nfsd4_decode_free_stateid,
1609 [OP_GET_DIR_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, 1740 [OP_GET_DIR_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp,
1741#ifdef CONFIG_NFSD_PNFS
1742 [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_getdeviceinfo,
1743 [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp,
1744 [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_layoutcommit,
1745 [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_layoutget,
1746 [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_layoutreturn,
1747#else
1610 [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_notsupp, 1748 [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_notsupp,
1611 [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp, 1749 [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp,
1612 [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp, 1750 [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp,
1613 [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp, 1751 [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp,
1614 [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp, 1752 [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp,
1753#endif
1615 [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_secinfo_no_name, 1754 [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_secinfo_no_name,
1616 [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence, 1755 [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence,
1617 [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp, 1756 [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp,
@@ -2539,6 +2678,30 @@ out_acl:
2539 get_parent_attributes(exp, &stat); 2678 get_parent_attributes(exp, &stat);
2540 p = xdr_encode_hyper(p, stat.ino); 2679 p = xdr_encode_hyper(p, stat.ino);
2541 } 2680 }
2681#ifdef CONFIG_NFSD_PNFS
2682 if ((bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) ||
2683 (bmval2 & FATTR4_WORD2_LAYOUT_TYPES)) {
2684 if (exp->ex_layout_type) {
2685 p = xdr_reserve_space(xdr, 8);
2686 if (!p)
2687 goto out_resource;
2688 *p++ = cpu_to_be32(1);
2689 *p++ = cpu_to_be32(exp->ex_layout_type);
2690 } else {
2691 p = xdr_reserve_space(xdr, 4);
2692 if (!p)
2693 goto out_resource;
2694 *p++ = cpu_to_be32(0);
2695 }
2696 }
2697
2698 if (bmval2 & FATTR4_WORD2_LAYOUT_BLKSIZE) {
2699 p = xdr_reserve_space(xdr, 4);
2700 if (!p)
2701 goto out_resource;
2702 *p++ = cpu_to_be32(stat.blksize);
2703 }
2704#endif /* CONFIG_NFSD_PNFS */
2542 if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) { 2705 if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) {
2543 status = nfsd4_encode_security_label(xdr, rqstp, context, 2706 status = nfsd4_encode_security_label(xdr, rqstp, context,
2544 contextlen); 2707 contextlen);
@@ -2768,16 +2931,17 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
2768 if (entry_bytes > cd->rd_maxcount) 2931 if (entry_bytes > cd->rd_maxcount)
2769 goto fail; 2932 goto fail;
2770 cd->rd_maxcount -= entry_bytes; 2933 cd->rd_maxcount -= entry_bytes;
2771 if (!cd->rd_dircount)
2772 goto fail;
2773 /* 2934 /*
2774 * RFC 3530 14.2.24 describes rd_dircount as only a "hint", so 2935 * RFC 3530 14.2.24 describes rd_dircount as only a "hint", so
2775 * let's always let through the first entry, at least: 2936 * let's always let through the first entry, at least:
2776 */ 2937 */
2777 name_and_cookie = 4 * XDR_QUADLEN(namlen) + 8; 2938 if (!cd->rd_dircount)
2939 goto fail;
2940 name_and_cookie = 4 + 4 * XDR_QUADLEN(namlen) + 8;
2778 if (name_and_cookie > cd->rd_dircount && cd->cookie_offset) 2941 if (name_and_cookie > cd->rd_dircount && cd->cookie_offset)
2779 goto fail; 2942 goto fail;
2780 cd->rd_dircount -= min(cd->rd_dircount, name_and_cookie); 2943 cd->rd_dircount -= min(cd->rd_dircount, name_and_cookie);
2944
2781 cd->cookie_offset = cookie_offset; 2945 cd->cookie_offset = cookie_offset;
2782skip_entry: 2946skip_entry:
2783 cd->common.err = nfs_ok; 2947 cd->common.err = nfs_ok;
@@ -3814,6 +3978,156 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
3814 return nfserr; 3978 return nfserr;
3815} 3979}
3816 3980
3981#ifdef CONFIG_NFSD_PNFS
3982static __be32
3983nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
3984 struct nfsd4_getdeviceinfo *gdev)
3985{
3986 struct xdr_stream *xdr = &resp->xdr;
3987 const struct nfsd4_layout_ops *ops =
3988 nfsd4_layout_ops[gdev->gd_layout_type];
3989 u32 starting_len = xdr->buf->len, needed_len;
3990 __be32 *p;
3991
3992 dprintk("%s: err %d\n", __func__, nfserr);
3993 if (nfserr)
3994 goto out;
3995
3996 nfserr = nfserr_resource;
3997 p = xdr_reserve_space(xdr, 4);
3998 if (!p)
3999 goto out;
4000
4001 *p++ = cpu_to_be32(gdev->gd_layout_type);
4002
4003 /* If maxcount is 0 then just update notifications */
4004 if (gdev->gd_maxcount != 0) {
4005 nfserr = ops->encode_getdeviceinfo(xdr, gdev);
4006 if (nfserr) {
4007 /*
4008 * We don't bother to burden the layout drivers with
4009 * enforcing gd_maxcount, just tell the client to
4010 * come back with a bigger buffer if it's not enough.
4011 */
4012 if (xdr->buf->len + 4 > gdev->gd_maxcount)
4013 goto toosmall;
4014 goto out;
4015 }
4016 }
4017
4018 nfserr = nfserr_resource;
4019 if (gdev->gd_notify_types) {
4020 p = xdr_reserve_space(xdr, 4 + 4);
4021 if (!p)
4022 goto out;
4023 *p++ = cpu_to_be32(1); /* bitmap length */
4024 *p++ = cpu_to_be32(gdev->gd_notify_types);
4025 } else {
4026 p = xdr_reserve_space(xdr, 4);
4027 if (!p)
4028 goto out;
4029 *p++ = 0;
4030 }
4031
4032 nfserr = 0;
4033out:
4034 kfree(gdev->gd_device);
4035 dprintk("%s: done: %d\n", __func__, be32_to_cpu(nfserr));
4036 return nfserr;
4037
4038toosmall:
4039 dprintk("%s: maxcount too small\n", __func__);
4040 needed_len = xdr->buf->len + 4 /* notifications */;
4041 xdr_truncate_encode(xdr, starting_len);
4042 p = xdr_reserve_space(xdr, 4);
4043 if (!p) {
4044 nfserr = nfserr_resource;
4045 } else {
4046 *p++ = cpu_to_be32(needed_len);
4047 nfserr = nfserr_toosmall;
4048 }
4049 goto out;
4050}
4051
4052static __be32
4053nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr,
4054 struct nfsd4_layoutget *lgp)
4055{
4056 struct xdr_stream *xdr = &resp->xdr;
4057 const struct nfsd4_layout_ops *ops =
4058 nfsd4_layout_ops[lgp->lg_layout_type];
4059 __be32 *p;
4060
4061 dprintk("%s: err %d\n", __func__, nfserr);
4062 if (nfserr)
4063 goto out;
4064
4065 nfserr = nfserr_resource;
4066 p = xdr_reserve_space(xdr, 36 + sizeof(stateid_opaque_t));
4067 if (!p)
4068 goto out;
4069
4070 *p++ = cpu_to_be32(1); /* we always set return-on-close */
4071 *p++ = cpu_to_be32(lgp->lg_sid.si_generation);
4072 p = xdr_encode_opaque_fixed(p, &lgp->lg_sid.si_opaque,
4073 sizeof(stateid_opaque_t));
4074
4075 *p++ = cpu_to_be32(1); /* we always return a single layout */
4076 p = xdr_encode_hyper(p, lgp->lg_seg.offset);
4077 p = xdr_encode_hyper(p, lgp->lg_seg.length);
4078 *p++ = cpu_to_be32(lgp->lg_seg.iomode);
4079 *p++ = cpu_to_be32(lgp->lg_layout_type);
4080
4081 nfserr = ops->encode_layoutget(xdr, lgp);
4082out:
4083 kfree(lgp->lg_content);
4084 return nfserr;
4085}
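
The 36-byte reservation above covers every fixed-size field of LAYOUTGET4resok except the stateid's opaque part; a sketch of the arithmetic, assuming stateid_opaque_t is the usual 12 bytes:

	/*
	 *   4   logr_return_on_close
	 *   4   stateid seqid
	 *  12   stateid opaque (sizeof(stateid_opaque_t))
	 *   4   layout array count (always 1 here)
	 *   8   lo_offset
	 *   8   lo_length
	 *   4   lo_iomode
	 *   4   layout content type
	 *  ----
	 *  48  = 36 + sizeof(stateid_opaque_t); the opaque layout body
	 *	itself is appended by the layout driver's encode_layoutget.
	 */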
4086
4087static __be32
4088nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr,
4089 struct nfsd4_layoutcommit *lcp)
4090{
4091 struct xdr_stream *xdr = &resp->xdr;
4092 __be32 *p;
4093
4094 if (nfserr)
4095 return nfserr;
4096
4097 p = xdr_reserve_space(xdr, 4);
4098 if (!p)
4099 return nfserr_resource;
4100 *p++ = cpu_to_be32(lcp->lc_size_chg);
4101 if (lcp->lc_size_chg) {
4102 p = xdr_reserve_space(xdr, 8);
4103 if (!p)
4104 return nfserr_resource;
4105 p = xdr_encode_hyper(p, lcp->lc_newsize);
4106 }
4107
4108 return nfs_ok;
4109}
4110
4111static __be32
4112nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr,
4113 struct nfsd4_layoutreturn *lrp)
4114{
4115 struct xdr_stream *xdr = &resp->xdr;
4116 __be32 *p;
4117
4118 if (nfserr)
4119 return nfserr;
4120
4121 p = xdr_reserve_space(xdr, 4);
4122 if (!p)
4123 return nfserr_resource;
4124 *p++ = cpu_to_be32(lrp->lrs_present);
4125 if (lrp->lrs_present)
4126 nfsd4_encode_stateid(xdr, &lrp->lr_sid);
4127 return nfs_ok;
4128}
4129#endif /* CONFIG_NFSD_PNFS */
4130
3817static __be32 4131static __be32
3818nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr, 4132nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr,
3819 struct nfsd4_seek *seek) 4133 struct nfsd4_seek *seek)
@@ -3890,11 +4204,19 @@ static nfsd4_enc nfsd4_enc_ops[] = {
3890 [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_noop, 4204 [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_noop,
3891 [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop, 4205 [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop,
3892 [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, 4206 [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop,
4207#ifdef CONFIG_NFSD_PNFS
4208 [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_getdeviceinfo,
4209 [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop,
4210 [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_layoutcommit,
4211 [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_layoutget,
4212 [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_layoutreturn,
4213#else
3893 [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop, 4214 [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop,
3894 [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop, 4215 [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop,
3895 [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop, 4216 [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop,
3896 [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop, 4217 [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop,
3897 [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop, 4218 [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop,
4219#endif
3898 [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_secinfo_no_name, 4220 [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_secinfo_no_name,
3899 [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence, 4221 [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence,
3900 [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop, 4222 [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop,
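
For context, nfsd4_encode_operation() dispatches through this table by opcode, roughly (a simplification, not a quote from the surrounding file):

	nfsd4_enc encoder = nfsd4_enc_ops[op->opnum];
	nfserr = encoder(resp, op->status, &op->u);

so with CONFIG_NFSD_PNFS unset the five pNFS opcodes still resolve, but to nfsd4_encode_noop.
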
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 19ace74d35f6..aa47d75ddb26 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -21,6 +21,7 @@
21#include "cache.h" 21#include "cache.h"
22#include "state.h" 22#include "state.h"
23#include "netns.h" 23#include "netns.h"
24#include "pnfs.h"
24 25
25/* 26/*
26 * We have a single directory with several nodes in it. 27 * We have a single directory with several nodes in it.
@@ -1258,9 +1259,12 @@ static int __init init_nfsd(void)
1258 retval = nfsd4_init_slabs(); 1259 retval = nfsd4_init_slabs();
1259 if (retval) 1260 if (retval)
1260 goto out_unregister_pernet; 1261 goto out_unregister_pernet;
1261 retval = nfsd_fault_inject_init(); /* nfsd fault injection controls */ 1262 retval = nfsd4_init_pnfs();
1262 if (retval) 1263 if (retval)
1263 goto out_free_slabs; 1264 goto out_free_slabs;
1265 retval = nfsd_fault_inject_init(); /* nfsd fault injection controls */
1266 if (retval)
1267 goto out_exit_pnfs;
1264 nfsd_stat_init(); /* Statistics */ 1268 nfsd_stat_init(); /* Statistics */
1265 retval = nfsd_reply_cache_init(); 1269 retval = nfsd_reply_cache_init();
1266 if (retval) 1270 if (retval)
@@ -1282,6 +1286,8 @@ out_free_lockd:
1282out_free_stat: 1286out_free_stat:
1283 nfsd_stat_shutdown(); 1287 nfsd_stat_shutdown();
1284 nfsd_fault_inject_cleanup(); 1288 nfsd_fault_inject_cleanup();
1289out_exit_pnfs:
1290 nfsd4_exit_pnfs();
1285out_free_slabs: 1291out_free_slabs:
1286 nfsd4_free_slabs(); 1292 nfsd4_free_slabs();
1287out_unregister_pernet: 1293out_unregister_pernet:
@@ -1299,6 +1305,7 @@ static void __exit exit_nfsd(void)
1299 nfsd_stat_shutdown(); 1305 nfsd_stat_shutdown();
1300 nfsd_lockd_shutdown(); 1306 nfsd_lockd_shutdown();
1301 nfsd4_free_slabs(); 1307 nfsd4_free_slabs();
1308 nfsd4_exit_pnfs();
1302 nfsd_fault_inject_cleanup(); 1309 nfsd_fault_inject_cleanup();
1303 unregister_filesystem(&nfsd_fs_type); 1310 unregister_filesystem(&nfsd_fs_type);
1304 unregister_pernet_subsys(&nfsd_net_ops); 1311 unregister_pernet_subsys(&nfsd_net_ops);
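
Note how the unwind labels keep strict LIFO order with the init calls; reduced to the pattern this hunk preserves (same lines as the diff above, numbering stripped):

	retval = nfsd4_init_pnfs();
	if (retval)
		goto out_free_slabs;	/* undo only earlier steps */
	retval = nfsd_fault_inject_init();
	if (retval)
		goto out_exit_pnfs;	/* pnfs now needs teardown too */
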
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 33a46a8dfaf7..565c4da1a9eb 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -325,15 +325,27 @@ void nfsd_lockd_shutdown(void);
325 325
326#define NFSD4_SUPPORTED_ATTRS_WORD2 0 326#define NFSD4_SUPPORTED_ATTRS_WORD2 0
327 327
328/* 4.1 */
329#ifdef CONFIG_NFSD_PNFS
330#define PNFSD_SUPPORTED_ATTRS_WORD1 FATTR4_WORD1_FS_LAYOUT_TYPES
331#define PNFSD_SUPPORTED_ATTRS_WORD2 \
332(FATTR4_WORD2_LAYOUT_BLKSIZE | FATTR4_WORD2_LAYOUT_TYPES)
333#else
334#define PNFSD_SUPPORTED_ATTRS_WORD1 0
335#define PNFSD_SUPPORTED_ATTRS_WORD2 0
336#endif /* CONFIG_NFSD_PNFS */
337
328#define NFSD4_1_SUPPORTED_ATTRS_WORD0 \ 338#define NFSD4_1_SUPPORTED_ATTRS_WORD0 \
329 NFSD4_SUPPORTED_ATTRS_WORD0 339 NFSD4_SUPPORTED_ATTRS_WORD0
330 340
331#define NFSD4_1_SUPPORTED_ATTRS_WORD1 \ 341#define NFSD4_1_SUPPORTED_ATTRS_WORD1 \
332 NFSD4_SUPPORTED_ATTRS_WORD1 342 (NFSD4_SUPPORTED_ATTRS_WORD1 | PNFSD_SUPPORTED_ATTRS_WORD1)
333 343
334#define NFSD4_1_SUPPORTED_ATTRS_WORD2 \ 344#define NFSD4_1_SUPPORTED_ATTRS_WORD2 \
335 (NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT) 345 (NFSD4_SUPPORTED_ATTRS_WORD2 | PNFSD_SUPPORTED_ATTRS_WORD2 | \
346 FATTR4_WORD2_SUPPATTR_EXCLCREAT)
336 347
348/* 4.2 */
337#ifdef CONFIG_NFSD_V4_SECURITY_LABEL 349#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
338#define NFSD4_2_SECURITY_ATTRS FATTR4_WORD2_SECURITY_LABEL 350#define NFSD4_2_SECURITY_ATTRS FATTR4_WORD2_SECURITY_LABEL
339#else 351#else
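
With CONFIG_NFSD_PNFS=y the 4.1 attribute words above therefore expand to

	NFSD4_1_SUPPORTED_ATTRS_WORD1
		= NFSD4_SUPPORTED_ATTRS_WORD1 | FATTR4_WORD1_FS_LAYOUT_TYPES
	NFSD4_1_SUPPORTED_ATTRS_WORD2
		= NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_LAYOUT_BLKSIZE |
		  FATTR4_WORD2_LAYOUT_TYPES | FATTR4_WORD2_SUPPATTR_EXCLCREAT

and collapse back to the plain 4.0 values when it is unset.
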
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index 08236d70c667..f22920442172 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -187,6 +187,24 @@ fh_init(struct svc_fh *fhp, int maxsize)
187 return fhp; 187 return fhp;
188} 188}
189 189
190static inline bool fh_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
191{
192 if (fh1->fh_size != fh2->fh_size)
193 return false;
194 if (memcmp(fh1->fh_base.fh_pad, fh2->fh_base.fh_pad, fh1->fh_size) != 0)
195 return false;
196 return true;
197}
198
199static inline bool fh_fsid_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
200{
201 if (fh1->fh_fsid_type != fh2->fh_fsid_type)
202 return false;
203 if (memcmp(fh1->fh_fsid, fh2->fh_fsid, key_len(fh1->fh_fsid_type)) != 0)
204 return false;
205 return true;
206}
207
190#ifdef CONFIG_NFSD_V3 208#ifdef CONFIG_NFSD_V3
191/* 209/*
192 * The wcc data stored in current_fh should be cleared 210 * The wcc data stored in current_fh should be cleared
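
A hedged sketch of how fh_match() is meant to be used (fp and fh are assumed locals in a hash-bucket walk, mirroring find_file() in nfs4state.c):

	/* fh is the struct knfsd_fh * we are looking for */
	if (fh_match(&fp->fi_fhandle, fh)) {
		get_nfs4_file(fp);
		return fp;
	}
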
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 314f5c8f8f1a..9277cc91c21b 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -119,6 +119,7 @@ struct svc_program nfsd_program = {
119static bool nfsd_supported_minorversions[NFSD_SUPPORTED_MINOR_VERSION + 1] = { 119static bool nfsd_supported_minorversions[NFSD_SUPPORTED_MINOR_VERSION + 1] = {
120 [0] = 1, 120 [0] = 1,
121 [1] = 1, 121 [1] = 1,
122 [2] = 1,
122}; 123};
123 124
124int nfsd_vers(int vers, enum vers_op change) 125int nfsd_vers(int vers, enum vers_op change)
diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h
new file mode 100644
index 000000000000..d4c4453674c6
--- /dev/null
+++ b/fs/nfsd/pnfs.h
@@ -0,0 +1,86 @@
1#ifndef _FS_NFSD_PNFS_H
2#define _FS_NFSD_PNFS_H 1
3
4#ifdef CONFIG_NFSD_V4
5#include <linux/exportfs.h>
6#include <linux/nfsd/export.h>
7
8#include "state.h"
9#include "xdr4.h"
10
11struct xdr_stream;
12
13struct nfsd4_deviceid_map {
14 struct list_head hash;
15 u64 idx;
16 int fsid_type;
17 u32 fsid[];
18};
19
20struct nfsd4_layout_ops {
21 u32 notify_types;
22
23 __be32 (*proc_getdeviceinfo)(struct super_block *sb,
24 struct nfsd4_getdeviceinfo *gdevp);
25 __be32 (*encode_getdeviceinfo)(struct xdr_stream *xdr,
26 struct nfsd4_getdeviceinfo *gdevp);
27
28 __be32 (*proc_layoutget)(struct inode *, const struct svc_fh *fhp,
29 struct nfsd4_layoutget *lgp);
30 __be32 (*encode_layoutget)(struct xdr_stream *,
31 struct nfsd4_layoutget *lgp);
32
33 __be32 (*proc_layoutcommit)(struct inode *inode,
34 struct nfsd4_layoutcommit *lcp);
35};
36
37extern const struct nfsd4_layout_ops *nfsd4_layout_ops[];
38extern const struct nfsd4_layout_ops bl_layout_ops;
39
40__be32 nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
41 struct nfsd4_compound_state *cstate, stateid_t *stateid,
42 bool create, u32 layout_type, struct nfs4_layout_stateid **lsp);
43__be32 nfsd4_insert_layout(struct nfsd4_layoutget *lgp,
44 struct nfs4_layout_stateid *ls);
45__be32 nfsd4_return_file_layouts(struct svc_rqst *rqstp,
46 struct nfsd4_compound_state *cstate,
47 struct nfsd4_layoutreturn *lrp);
48__be32 nfsd4_return_client_layouts(struct svc_rqst *rqstp,
49 struct nfsd4_compound_state *cstate,
50 struct nfsd4_layoutreturn *lrp);
51int nfsd4_set_deviceid(struct nfsd4_deviceid *id, const struct svc_fh *fhp,
52 u32 device_generation);
53struct nfsd4_deviceid_map *nfsd4_find_devid_map(int idx);
54#endif /* CONFIG_NFSD_V4 */
55
56#ifdef CONFIG_NFSD_PNFS
57void nfsd4_setup_layout_type(struct svc_export *exp);
58void nfsd4_return_all_client_layouts(struct nfs4_client *);
59void nfsd4_return_all_file_layouts(struct nfs4_client *clp,
60 struct nfs4_file *fp);
61int nfsd4_init_pnfs(void);
62void nfsd4_exit_pnfs(void);
63#else
64struct nfs4_client;
65struct nfs4_file;
66
67static inline void nfsd4_setup_layout_type(struct svc_export *exp)
68{
69}
70
71static inline void nfsd4_return_all_client_layouts(struct nfs4_client *clp)
72{
73}
74static inline void nfsd4_return_all_file_layouts(struct nfs4_client *clp,
75 struct nfs4_file *fp)
76{
77}
78static inline void nfsd4_exit_pnfs(void)
79{
80}
81static inline int nfsd4_init_pnfs(void)
82{
83 return 0;
84}
85#endif /* CONFIG_NFSD_PNFS */
86#endif /* _FS_NFSD_PNFS_H */
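
A layout driver plugs in by filling one of these ops tables; the block layout driver added elsewhere in this series does so for its layout type. An illustrative initializer (member values are a sketch, not quoted from the driver):

const struct nfsd4_layout_ops bl_layout_ops = {
	.notify_types		= NOTIFY_DEVICEID4_DELETE |
				  NOTIFY_DEVICEID4_CHANGE,
	.proc_getdeviceinfo	= nfsd4_block_proc_getdeviceinfo,
	.encode_getdeviceinfo	= nfsd4_block_encode_getdeviceinfo,
	.proc_layoutget		= nfsd4_block_proc_layoutget,
	.encode_layoutget	= nfsd4_block_encode_layoutget,
	.proc_layoutcommit	= nfsd4_block_proc_layoutcommit,
};
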
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 9d3be371240a..4f3bfeb11766 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -92,6 +92,7 @@ struct nfs4_stid {
92/* For a deleg stateid kept around only to process free_stateid's: */ 92/* For a deleg stateid kept around only to process free_stateid's: */
93#define NFS4_REVOKED_DELEG_STID 16 93#define NFS4_REVOKED_DELEG_STID 16
94#define NFS4_CLOSED_DELEG_STID 32 94#define NFS4_CLOSED_DELEG_STID 32
95#define NFS4_LAYOUT_STID 64
95 unsigned char sc_type; 96 unsigned char sc_type;
96 stateid_t sc_stateid; 97 stateid_t sc_stateid;
97 struct nfs4_client *sc_client; 98 struct nfs4_client *sc_client;
@@ -297,6 +298,9 @@ struct nfs4_client {
297 struct list_head cl_delegations; 298 struct list_head cl_delegations;
298 struct list_head cl_revoked; /* unacknowledged, revoked 4.1 state */ 299 struct list_head cl_revoked; /* unacknowledged, revoked 4.1 state */
299 struct list_head cl_lru; /* tail queue */ 300 struct list_head cl_lru; /* tail queue */
301#ifdef CONFIG_NFSD_PNFS
302 struct list_head cl_lo_states; /* outstanding layout states */
303#endif
300 struct xdr_netobj cl_name; /* id generated by client */ 304 struct xdr_netobj cl_name; /* id generated by client */
301 nfs4_verifier cl_verifier; /* generated by client */ 305 nfs4_verifier cl_verifier; /* generated by client */
302 time_t cl_time; /* time of last lease renewal */ 306 time_t cl_time; /* time of last lease renewal */
@@ -493,9 +497,13 @@ struct nfs4_file {
493 atomic_t fi_access[2]; 497 atomic_t fi_access[2];
494 u32 fi_share_deny; 498 u32 fi_share_deny;
495 struct file *fi_deleg_file; 499 struct file *fi_deleg_file;
496 atomic_t fi_delegees; 500 int fi_delegees;
497 struct knfsd_fh fi_fhandle; 501 struct knfsd_fh fi_fhandle;
498 bool fi_had_conflict; 502 bool fi_had_conflict;
503#ifdef CONFIG_NFSD_PNFS
504 struct list_head fi_lo_states;
505 atomic_t fi_lo_recalls;
506#endif
499}; 507};
500 508
501/* 509/*
@@ -528,6 +536,24 @@ static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s)
528 return container_of(s, struct nfs4_ol_stateid, st_stid); 536 return container_of(s, struct nfs4_ol_stateid, st_stid);
529} 537}
530 538
539struct nfs4_layout_stateid {
540 struct nfs4_stid ls_stid;
541 struct list_head ls_perclnt;
542 struct list_head ls_perfile;
543 spinlock_t ls_lock;
544 struct list_head ls_layouts;
545 u32 ls_layout_type;
546 struct file *ls_file;
547 struct nfsd4_callback ls_recall;
548 stateid_t ls_recall_sid;
549 bool ls_recalled;
550};
551
552static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s)
553{
554 return container_of(s, struct nfs4_layout_stateid, ls_stid);
555}
556
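
layoutstateid() mirrors openlockstateid() above; a typical call site (hypothetical, but matching how typemask lookups are used):

	struct nfs4_stid *s;
	struct nfs4_layout_stateid *ls;

	/* nfsd4_lookup_stateid(..., NFS4_LAYOUT_STID, &s, nn) checked sc_type */
	ls = layoutstateid(s);
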
531/* flags for preprocess_seqid_op() */ 557/* flags for preprocess_seqid_op() */
532#define RD_STATE 0x00000010 558#define RD_STATE 0x00000010
533#define WR_STATE 0x00000020 559#define WR_STATE 0x00000020
@@ -535,6 +561,7 @@ static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s)
535enum nfsd4_cb_op { 561enum nfsd4_cb_op {
536 NFSPROC4_CLNT_CB_NULL = 0, 562 NFSPROC4_CLNT_CB_NULL = 0,
537 NFSPROC4_CLNT_CB_RECALL, 563 NFSPROC4_CLNT_CB_RECALL,
564 NFSPROC4_CLNT_CB_LAYOUT,
538 NFSPROC4_CLNT_CB_SEQUENCE, 565 NFSPROC4_CLNT_CB_SEQUENCE,
539}; 566};
540 567
@@ -545,6 +572,12 @@ struct nfsd_net;
545extern __be32 nfs4_preprocess_stateid_op(struct net *net, 572extern __be32 nfs4_preprocess_stateid_op(struct net *net,
546 struct nfsd4_compound_state *cstate, 573 struct nfsd4_compound_state *cstate,
547 stateid_t *stateid, int flags, struct file **filp); 574 stateid_t *stateid, int flags, struct file **filp);
575__be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
576 stateid_t *stateid, unsigned char typemask,
577 struct nfs4_stid **s, struct nfsd_net *nn);
578struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl,
579 struct kmem_cache *slab);
580void nfs4_unhash_stid(struct nfs4_stid *s);
548void nfs4_put_stid(struct nfs4_stid *s); 581void nfs4_put_stid(struct nfs4_stid *s);
549void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *); 582void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *);
550extern void nfs4_release_reclaim(struct nfsd_net *); 583extern void nfs4_release_reclaim(struct nfsd_net *);
@@ -567,6 +600,14 @@ extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name,
567 struct nfsd_net *nn); 600 struct nfsd_net *nn);
568extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn); 601extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn);
569 602
603struct nfs4_file *find_file(struct knfsd_fh *fh);
604void put_nfs4_file(struct nfs4_file *fi);
605static inline void get_nfs4_file(struct nfs4_file *fi)
606{
607 atomic_inc(&fi->fi_ref);
608}
609struct file *find_any_file(struct nfs4_file *f);
610
570/* grace period management */ 611/* grace period management */
571void nfsd4_end_grace(struct nfsd_net *nn); 612void nfsd4_end_grace(struct nfsd_net *nn);
572 613
diff --git a/fs/nfsd/trace.c b/fs/nfsd/trace.c
new file mode 100644
index 000000000000..82f89070594c
--- /dev/null
+++ b/fs/nfsd/trace.c
@@ -0,0 +1,5 @@
1
2#include "state.h"
3
4#define CREATE_TRACE_POINTS
5#include "trace.h"
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
new file mode 100644
index 000000000000..c668520c344b
--- /dev/null
+++ b/fs/nfsd/trace.h
@@ -0,0 +1,54 @@
1/*
2 * Copyright (c) 2014 Christoph Hellwig.
3 */
4#undef TRACE_SYSTEM
5#define TRACE_SYSTEM nfsd
6
7#if !defined(_NFSD_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
8#define _NFSD_TRACE_H
9
10#include <linux/tracepoint.h>
11
12DECLARE_EVENT_CLASS(nfsd_stateid_class,
13 TP_PROTO(stateid_t *stp),
14 TP_ARGS(stp),
15 TP_STRUCT__entry(
16 __field(u32, cl_boot)
17 __field(u32, cl_id)
18 __field(u32, si_id)
19 __field(u32, si_generation)
20 ),
21 TP_fast_assign(
22 __entry->cl_boot = stp->si_opaque.so_clid.cl_boot;
23 __entry->cl_id = stp->si_opaque.so_clid.cl_id;
24 __entry->si_id = stp->si_opaque.so_id;
25 __entry->si_generation = stp->si_generation;
26 ),
27 TP_printk("client %08x:%08x stateid %08x:%08x",
28 __entry->cl_boot,
29 __entry->cl_id,
30 __entry->si_id,
31 __entry->si_generation)
32)
33
34#define DEFINE_STATEID_EVENT(name) \
35DEFINE_EVENT(nfsd_stateid_class, name, \
36 TP_PROTO(stateid_t *stp), \
37 TP_ARGS(stp))
38DEFINE_STATEID_EVENT(layoutstate_alloc);
39DEFINE_STATEID_EVENT(layoutstate_unhash);
40DEFINE_STATEID_EVENT(layoutstate_free);
41DEFINE_STATEID_EVENT(layout_get_lookup_fail);
42DEFINE_STATEID_EVENT(layout_commit_lookup_fail);
43DEFINE_STATEID_EVENT(layout_return_lookup_fail);
44DEFINE_STATEID_EVENT(layout_recall);
45DEFINE_STATEID_EVENT(layout_recall_done);
46DEFINE_STATEID_EVENT(layout_recall_fail);
47DEFINE_STATEID_EVENT(layout_recall_release);
48
49#endif /* _NFSD_TRACE_H */
50
51#undef TRACE_INCLUDE_PATH
52#define TRACE_INCLUDE_PATH .
53#define TRACE_INCLUDE_FILE trace
54#include <trace/define_trace.h>
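
Each DEFINE_STATEID_EVENT above generates a trace_<name>() helper taking a stateid_t *, so the layout code can fire, for example:

	trace_layoutstate_alloc(&ls->ls_stid.sc_stateid);
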
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 90a5925bd6ab..0bda93e58e1b 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -428,6 +428,61 @@ struct nfsd4_reclaim_complete {
428 u32 rca_one_fs; 428 u32 rca_one_fs;
429}; 429};
430 430
431struct nfsd4_deviceid {
432 u64 fsid_idx;
433 u32 generation;
434 u32 pad;
435};
436
437struct nfsd4_layout_seg {
438 u32 iomode;
439 u64 offset;
440 u64 length;
441};
442
443struct nfsd4_getdeviceinfo {
444 struct nfsd4_deviceid gd_devid; /* request */
445 u32 gd_layout_type; /* request */
446 u32 gd_maxcount; /* request */
447 u32 gd_notify_types;/* request - response */
448 void *gd_device; /* response */
449};
450
451struct nfsd4_layoutget {
452 u64 lg_minlength; /* request */
453 u32 lg_signal; /* request */
454 u32 lg_layout_type; /* request */
455 u32 lg_maxcount; /* request */
456 stateid_t lg_sid; /* request/response */
457 struct nfsd4_layout_seg lg_seg; /* request/response */
458 void *lg_content; /* response */
459};
460
461struct nfsd4_layoutcommit {
462 stateid_t lc_sid; /* request */
463 struct nfsd4_layout_seg lc_seg; /* request */
464 u32 lc_reclaim; /* request */
465 u32 lc_newoffset; /* request */
466 u64 lc_last_wr; /* request */
467 struct timespec lc_mtime; /* request */
468 u32 lc_layout_type; /* request */
469 u32 lc_up_len; /* layout length */
470 void *lc_up_layout; /* decoded by callback */
471 u32 lc_size_chg; /* boolean for response */
472 u64 lc_newsize; /* response */
473};
474
475struct nfsd4_layoutreturn {
476 u32 lr_return_type; /* request */
477 u32 lr_layout_type; /* request */
478 struct nfsd4_layout_seg lr_seg; /* request */
479 u32 lr_reclaim; /* request */
480 u32 lrf_body_len; /* request */
481 void *lrf_body; /* request */
482 stateid_t lr_sid; /* request/response */
483 u32 lrs_present; /* response */
484};
485
431struct nfsd4_fallocate { 486struct nfsd4_fallocate {
432 /* request */ 487 /* request */
433 stateid_t falloc_stateid; 488 stateid_t falloc_stateid;
@@ -491,6 +546,10 @@ struct nfsd4_op {
491 struct nfsd4_reclaim_complete reclaim_complete; 546 struct nfsd4_reclaim_complete reclaim_complete;
492 struct nfsd4_test_stateid test_stateid; 547 struct nfsd4_test_stateid test_stateid;
493 struct nfsd4_free_stateid free_stateid; 548 struct nfsd4_free_stateid free_stateid;
549 struct nfsd4_getdeviceinfo getdeviceinfo;
550 struct nfsd4_layoutget layoutget;
551 struct nfsd4_layoutcommit layoutcommit;
552 struct nfsd4_layoutreturn layoutreturn;
494 553
495 /* NFSv4.2 */ 554 /* NFSv4.2 */
496 struct nfsd4_fallocate allocate; 555 struct nfsd4_fallocate allocate;
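
Note the wire layout implied by struct nfsd4_deviceid: 8 + 4 + 4 bytes, exactly the 16-byte deviceid4 of RFC 5661, so it can be copied to and from the XDR buffer whole. A compile-time guard one could add (assuming the natural, padding-free layout):

	BUILD_BUG_ON(sizeof(struct nfsd4_deviceid) != 16);
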
diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h
index c5c55dfb91a9..c47f6fdb111a 100644
--- a/fs/nfsd/xdr4cb.h
+++ b/fs/nfsd/xdr4cb.h
@@ -21,3 +21,10 @@
21#define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \ 21#define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \
22 cb_sequence_dec_sz + \ 22 cb_sequence_dec_sz + \
23 op_dec_sz) 23 op_dec_sz)
24#define NFS4_enc_cb_layout_sz (cb_compound_enc_hdr_sz + \
25 cb_sequence_enc_sz + \
26 1 + 3 + \
27 enc_nfs4_fh_sz + 4)
28#define NFS4_dec_cb_layout_sz (cb_compound_dec_hdr_sz + \
29 cb_sequence_dec_sz + \
30 op_dec_sz)
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 3a03e0aea1fb..a8c728acb7a8 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -128,7 +128,6 @@ static const struct vm_operations_struct nilfs_file_vm_ops = {
128 .fault = filemap_fault, 128 .fault = filemap_fault,
129 .map_pages = filemap_map_pages, 129 .map_pages = filemap_map_pages,
130 .page_mkwrite = nilfs_page_mkwrite, 130 .page_mkwrite = nilfs_page_mkwrite,
131 .remap_pages = generic_file_remap_pages,
132}; 131};
133 132
134static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma) 133static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma)
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 57ceaf33d177..748ca238915a 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -172,7 +172,6 @@ int nilfs_init_gcinode(struct inode *inode)
172 inode->i_mode = S_IFREG; 172 inode->i_mode = S_IFREG;
173 mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); 173 mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
174 inode->i_mapping->a_ops = &empty_aops; 174 inode->i_mapping->a_ops = &empty_aops;
175 inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi;
176 175
177 ii->i_flags = 0; 176 ii->i_flags = 0;
178 nilfs_bmap_init_gc(ii->i_bmap); 177 nilfs_bmap_init_gc(ii->i_bmap);
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index c4dcd1db57ee..892cf5ffdb8e 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -429,7 +429,6 @@ int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz)
429 429
430 inode->i_mode = S_IFREG; 430 inode->i_mode = S_IFREG;
431 mapping_set_gfp_mask(inode->i_mapping, gfp_mask); 431 mapping_set_gfp_mask(inode->i_mapping, gfp_mask);
432 inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi;
433 432
434 inode->i_op = &def_mdt_iops; 433 inode->i_op = &def_mdt_iops;
435 inode->i_fop = &def_mdt_fops; 434 inode->i_fop = &def_mdt_fops;
@@ -457,13 +456,12 @@ int nilfs_mdt_setup_shadow_map(struct inode *inode,
457 struct nilfs_shadow_map *shadow) 456 struct nilfs_shadow_map *shadow)
458{ 457{
459 struct nilfs_mdt_info *mi = NILFS_MDT(inode); 458 struct nilfs_mdt_info *mi = NILFS_MDT(inode);
460 struct backing_dev_info *bdi = inode->i_sb->s_bdi;
461 459
462 INIT_LIST_HEAD(&shadow->frozen_buffers); 460 INIT_LIST_HEAD(&shadow->frozen_buffers);
463 address_space_init_once(&shadow->frozen_data); 461 address_space_init_once(&shadow->frozen_data);
464 nilfs_mapping_init(&shadow->frozen_data, inode, bdi); 462 nilfs_mapping_init(&shadow->frozen_data, inode);
465 address_space_init_once(&shadow->frozen_btnodes); 463 address_space_init_once(&shadow->frozen_btnodes);
466 nilfs_mapping_init(&shadow->frozen_btnodes, inode, bdi); 464 nilfs_mapping_init(&shadow->frozen_btnodes, inode);
467 mi->mi_shadow = shadow; 465 mi->mi_shadow = shadow;
468 return 0; 466 return 0;
469} 467}
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index da276640f776..700ecbcca55d 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -461,14 +461,12 @@ unsigned nilfs_page_count_clean_buffers(struct page *page,
461 return nc; 461 return nc;
462} 462}
463 463
464void nilfs_mapping_init(struct address_space *mapping, struct inode *inode, 464void nilfs_mapping_init(struct address_space *mapping, struct inode *inode)
465 struct backing_dev_info *bdi)
466{ 465{
467 mapping->host = inode; 466 mapping->host = inode;
468 mapping->flags = 0; 467 mapping->flags = 0;
469 mapping_set_gfp_mask(mapping, GFP_NOFS); 468 mapping_set_gfp_mask(mapping, GFP_NOFS);
470 mapping->private_data = NULL; 469 mapping->private_data = NULL;
471 mapping->backing_dev_info = bdi;
472 mapping->a_ops = &empty_aops; 470 mapping->a_ops = &empty_aops;
473} 471}
474 472
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index ef30c5c2426f..a43b8287d012 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -57,8 +57,7 @@ int nilfs_copy_dirty_pages(struct address_space *, struct address_space *);
57void nilfs_copy_back_pages(struct address_space *, struct address_space *); 57void nilfs_copy_back_pages(struct address_space *, struct address_space *);
58void nilfs_clear_dirty_page(struct page *, bool); 58void nilfs_clear_dirty_page(struct page *, bool);
59void nilfs_clear_dirty_pages(struct address_space *, bool); 59void nilfs_clear_dirty_pages(struct address_space *, bool);
60void nilfs_mapping_init(struct address_space *mapping, struct inode *inode, 60void nilfs_mapping_init(struct address_space *mapping, struct inode *inode);
61 struct backing_dev_info *bdi);
62unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned); 61unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned);
63unsigned long nilfs_find_uncommitted_extent(struct inode *inode, 62unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
64 sector_t start_blk, 63 sector_t start_blk,
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 2e5b3ec85b8f..5bc2a1cf73c3 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -166,7 +166,7 @@ struct inode *nilfs_alloc_inode(struct super_block *sb)
166 ii->i_state = 0; 166 ii->i_state = 0;
167 ii->i_cno = 0; 167 ii->i_cno = 0;
168 ii->vfs_inode.i_version = 1; 168 ii->vfs_inode.i_version = 1;
169 nilfs_mapping_init(&ii->i_btnode_cache, &ii->vfs_inode, sb->s_bdi); 169 nilfs_mapping_init(&ii->i_btnode_cache, &ii->vfs_inode);
170 return &ii->vfs_inode; 170 return &ii->vfs_inode;
171} 171}
172 172
@@ -1057,7 +1057,6 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
1057{ 1057{
1058 struct the_nilfs *nilfs; 1058 struct the_nilfs *nilfs;
1059 struct nilfs_root *fsroot; 1059 struct nilfs_root *fsroot;
1060 struct backing_dev_info *bdi;
1061 __u64 cno; 1060 __u64 cno;
1062 int err; 1061 int err;
1063 1062
@@ -1077,8 +1076,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
1077 sb->s_time_gran = 1; 1076 sb->s_time_gran = 1;
1078 sb->s_max_links = NILFS_LINK_MAX; 1077 sb->s_max_links = NILFS_LINK_MAX;
1079 1078
1080 bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; 1079 sb->s_bdi = &bdev_get_queue(sb->s_bdev)->backing_dev_info;
1081 sb->s_bdi = bdi ? : &default_backing_dev_info;
1082 1080
1083 err = load_nilfs(nilfs, sb); 1081 err = load_nilfs(nilfs, sb);
1084 if (err) 1082 if (err)
diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig
index 22c629eedd82..2a24249b30af 100644
--- a/fs/notify/Kconfig
+++ b/fs/notify/Kconfig
@@ -1,5 +1,6 @@
1config FSNOTIFY 1config FSNOTIFY
2 def_bool n 2 def_bool n
3 select SRCU
3 4
4source "fs/notify/dnotify/Kconfig" 5source "fs/notify/dnotify/Kconfig"
5source "fs/notify/inotify/Kconfig" 6source "fs/notify/inotify/Kconfig"
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 30d3addfad75..51ceb8107284 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -140,7 +140,7 @@ static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark,
140 } 140 }
141 141
142 if (S_ISDIR(path->dentry->d_inode->i_mode) && 142 if (S_ISDIR(path->dentry->d_inode->i_mode) &&
143 (marks_ignored_mask & FS_ISDIR)) 143 !(marks_mask & FS_ISDIR & ~marks_ignored_mask))
144 return false; 144 return false;
145 145
146 if (event_mask & marks_mask & ~marks_ignored_mask) 146 if (event_mask & marks_mask & ~marks_ignored_mask)
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index bff8567aa42d..cf275500a665 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -487,20 +487,27 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
487 unsigned int flags, 487 unsigned int flags,
488 int *destroy) 488 int *destroy)
489{ 489{
490 __u32 oldmask; 490 __u32 oldmask = 0;
491 491
492 spin_lock(&fsn_mark->lock); 492 spin_lock(&fsn_mark->lock);
493 if (!(flags & FAN_MARK_IGNORED_MASK)) { 493 if (!(flags & FAN_MARK_IGNORED_MASK)) {
494 __u32 tmask = fsn_mark->mask & ~mask;
495
496 if (flags & FAN_MARK_ONDIR)
497 tmask &= ~FAN_ONDIR;
498
494 oldmask = fsn_mark->mask; 499 oldmask = fsn_mark->mask;
495 fsnotify_set_mark_mask_locked(fsn_mark, (oldmask & ~mask)); 500 fsnotify_set_mark_mask_locked(fsn_mark, tmask);
496 } else { 501 } else {
497 oldmask = fsn_mark->ignored_mask; 502 __u32 tmask = fsn_mark->ignored_mask & ~mask;
498 fsnotify_set_mark_ignored_mask_locked(fsn_mark, (oldmask & ~mask)); 503 if (flags & FAN_MARK_ONDIR)
504 tmask &= ~FAN_ONDIR;
505
506 fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask);
499 } 507 }
508 *destroy = !(fsn_mark->mask | fsn_mark->ignored_mask);
500 spin_unlock(&fsn_mark->lock); 509 spin_unlock(&fsn_mark->lock);
501 510
502 *destroy = !(oldmask & ~mask);
503
504 return mask & oldmask; 511 return mask & oldmask;
505} 512}
506 513
@@ -569,20 +576,22 @@ static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
569 576
570 spin_lock(&fsn_mark->lock); 577 spin_lock(&fsn_mark->lock);
571 if (!(flags & FAN_MARK_IGNORED_MASK)) { 578 if (!(flags & FAN_MARK_IGNORED_MASK)) {
579 __u32 tmask = fsn_mark->mask | mask;
580
581 if (flags & FAN_MARK_ONDIR)
582 tmask |= FAN_ONDIR;
583
572 oldmask = fsn_mark->mask; 584 oldmask = fsn_mark->mask;
573 fsnotify_set_mark_mask_locked(fsn_mark, (oldmask | mask)); 585 fsnotify_set_mark_mask_locked(fsn_mark, tmask);
574 } else { 586 } else {
575 __u32 tmask = fsn_mark->ignored_mask | mask; 587 __u32 tmask = fsn_mark->ignored_mask | mask;
588 if (flags & FAN_MARK_ONDIR)
589 tmask |= FAN_ONDIR;
590
576 fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask); 591 fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask);
577 if (flags & FAN_MARK_IGNORED_SURV_MODIFY) 592 if (flags & FAN_MARK_IGNORED_SURV_MODIFY)
578 fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY; 593 fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
579 } 594 }
580
581 if (!(flags & FAN_MARK_ONDIR)) {
582 __u32 tmask = fsn_mark->ignored_mask | FAN_ONDIR;
583 fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask);
584 }
585
586 spin_unlock(&fsn_mark->lock); 595 spin_unlock(&fsn_mark->lock);
587 596
588 return mask & ~oldmask; 597 return mask & ~oldmask;
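
With this change FAN_ONDIR travels with whichever mask the caller is updating, instead of being forced into the ignored mask. From userspace the switch is simply the event-mask bit (a sketch; fd is an already-initialized fanotify descriptor):

	/* also deliver open events for the directory itself */
	fanotify_mark(fd, FAN_MARK_ADD, FAN_OPEN | FAN_ONDIR,
		      AT_FDCWD, "/tmp");
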
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 643faa44f22b..1da9b2d184dc 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -19,6 +19,7 @@
19 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 19 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */ 20 */
21 21
22#include <linux/backing-dev.h>
22#include <linux/buffer_head.h> 23#include <linux/buffer_head.h>
23#include <linux/gfp.h> 24#include <linux/gfp.h>
24#include <linux/pagemap.h> 25#include <linux/pagemap.h>
@@ -2091,7 +2092,7 @@ static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb,
2091 count = iov_length(iov, nr_segs); 2092 count = iov_length(iov, nr_segs);
2092 pos = *ppos; 2093 pos = *ppos;
2093 /* We can write back this queue in page reclaim. */ 2094 /* We can write back this queue in page reclaim. */
2094 current->backing_dev_info = mapping->backing_dev_info; 2095 current->backing_dev_info = inode_to_bdi(inode);
2095 written = 0; 2096 written = 0;
2096 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); 2097 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
2097 if (err) 2098 if (err)
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 7e8282dcea2a..c58a1bcfda0f 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -245,16 +245,14 @@ int ocfs2_set_acl(handle_t *handle,
245 ret = posix_acl_equiv_mode(acl, &mode); 245 ret = posix_acl_equiv_mode(acl, &mode);
246 if (ret < 0) 246 if (ret < 0)
247 return ret; 247 return ret;
248 else {
249 if (ret == 0)
250 acl = NULL;
251 248
252 ret = ocfs2_acl_set_mode(inode, di_bh, 249 if (ret == 0)
253 handle, mode); 250 acl = NULL;
254 if (ret)
255 return ret;
256 251
257 } 252 ret = ocfs2_acl_set_mode(inode, di_bh,
253 handle, mode);
254 if (ret)
255 return ret;
258 } 256 }
259 break; 257 break;
260 case ACL_TYPE_DEFAULT: 258 case ACL_TYPE_DEFAULT:
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index fcae9ef1a328..044158bd22be 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -6873,7 +6873,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
6873 if (IS_ERR(handle)) { 6873 if (IS_ERR(handle)) {
6874 ret = PTR_ERR(handle); 6874 ret = PTR_ERR(handle);
6875 mlog_errno(ret); 6875 mlog_errno(ret);
6876 goto out_unlock; 6876 goto out;
6877 } 6877 }
6878 6878
6879 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, 6879 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
@@ -6931,7 +6931,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
6931 if (ret) { 6931 if (ret) {
6932 mlog_errno(ret); 6932 mlog_errno(ret);
6933 need_free = 1; 6933 need_free = 1;
6934 goto out_commit; 6934 goto out_unlock;
6935 } 6935 }
6936 6936
6937 page_end = PAGE_CACHE_SIZE; 6937 page_end = PAGE_CACHE_SIZE;
@@ -6964,12 +6964,16 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
6964 if (ret) { 6964 if (ret) {
6965 mlog_errno(ret); 6965 mlog_errno(ret);
6966 need_free = 1; 6966 need_free = 1;
6967 goto out_commit; 6967 goto out_unlock;
6968 } 6968 }
6969 6969
6970 inode->i_blocks = ocfs2_inode_sector_count(inode); 6970 inode->i_blocks = ocfs2_inode_sector_count(inode);
6971 } 6971 }
6972 6972
6973out_unlock:
6974 if (pages)
6975 ocfs2_unlock_and_free_pages(pages, num_pages);
6976
6973out_commit: 6977out_commit:
6974 if (ret < 0 && did_quota) 6978 if (ret < 0 && did_quota)
6975 dquot_free_space_nodirty(inode, 6979 dquot_free_space_nodirty(inode,
@@ -6989,15 +6993,11 @@ out_commit:
6989 6993
6990 ocfs2_commit_trans(osb, handle); 6994 ocfs2_commit_trans(osb, handle);
6991 6995
6992out_unlock: 6996out:
6993 if (data_ac) 6997 if (data_ac)
6994 ocfs2_free_alloc_context(data_ac); 6998 ocfs2_free_alloc_context(data_ac);
6995 6999 if (pages)
6996out:
6997 if (pages) {
6998 ocfs2_unlock_and_free_pages(pages, num_pages);
6999 kfree(pages); 7000 kfree(pages);
7000 }
7001 7001
7002 return ret; 7002 return ret;
7003} 7003}
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 46d93e941f3d..44db1808cdb5 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -28,6 +28,7 @@
28#include <linux/pipe_fs_i.h> 28#include <linux/pipe_fs_i.h>
29#include <linux/mpage.h> 29#include <linux/mpage.h>
30#include <linux/quotaops.h> 30#include <linux/quotaops.h>
31#include <linux/blkdev.h>
31 32
32#include <cluster/masklog.h> 33#include <cluster/masklog.h>
33 34
@@ -47,6 +48,9 @@
47#include "ocfs2_trace.h" 48#include "ocfs2_trace.h"
48 49
49#include "buffer_head_io.h" 50#include "buffer_head_io.h"
51#include "dir.h"
52#include "namei.h"
53#include "sysfile.h"
50 54
51static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock, 55static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
52 struct buffer_head *bh_result, int create) 56 struct buffer_head *bh_result, int create)
@@ -506,18 +510,21 @@ bail:
506 * 510 *
507 * called like this: dio->get_blocks(dio->inode, fs_startblk, 511 * called like this: dio->get_blocks(dio->inode, fs_startblk,
508 * fs_count, map_bh, dio->rw == WRITE); 512 * fs_count, map_bh, dio->rw == WRITE);
509 *
510 * Note that we never bother to allocate blocks here, and thus ignore the
511 * create argument.
512 */ 513 */
513static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, 514static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
514 struct buffer_head *bh_result, int create) 515 struct buffer_head *bh_result, int create)
515{ 516{
516 int ret; 517 int ret;
518 u32 cpos = 0;
519 int alloc_locked = 0;
517 u64 p_blkno, inode_blocks, contig_blocks; 520 u64 p_blkno, inode_blocks, contig_blocks;
518 unsigned int ext_flags; 521 unsigned int ext_flags;
519 unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; 522 unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
520 unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; 523 unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
524 unsigned long len = bh_result->b_size;
525 unsigned int clusters_to_alloc = 0;
526
527 cpos = ocfs2_blocks_to_clusters(inode->i_sb, iblock);
521 528
522 /* This function won't even be called if the request isn't all 529 /* This function won't even be called if the request isn't all
523 * nicely aligned and of the right size, so there's no need 530 * nicely aligned and of the right size, so there's no need
@@ -539,6 +546,40 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
539 /* We should already CoW the refcounted extent in case of create. */ 546 /* We should already CoW the refcounted extent in case of create. */
540 BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED)); 547 BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED));
541 548
549 /* allocate blocks if no p_blkno is found, and create == 1 */
550 if (!p_blkno && create) {
551 ret = ocfs2_inode_lock(inode, NULL, 1);
552 if (ret < 0) {
553 mlog_errno(ret);
554 goto bail;
555 }
556
557 alloc_locked = 1;
558
559 /* when filling a hole, the allocation can't be larger than
560 * the size of the hole */
561 clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len);
562 if (clusters_to_alloc > contig_blocks)
563 clusters_to_alloc = contig_blocks;
564
565 /* allocate extent and insert them into the extent tree */
566 ret = ocfs2_extend_allocation(inode, cpos,
567 clusters_to_alloc, 0);
568 if (ret < 0) {
569 mlog_errno(ret);
570 goto bail;
571 }
572
573 ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
574 &contig_blocks, &ext_flags);
575 if (ret < 0) {
576 mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
577 (unsigned long long)iblock);
578 ret = -EIO;
579 goto bail;
580 }
581 }
582
542 /* 583 /*
543 * get_more_blocks() expects us to describe a hole by clearing 584 * get_more_blocks() expects us to describe a hole by clearing
544 * the mapped bit on bh_result(). 585 * the mapped bit on bh_result().
@@ -556,6 +597,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
556 contig_blocks = max_blocks; 597 contig_blocks = max_blocks;
557 bh_result->b_size = contig_blocks << blocksize_bits; 598 bh_result->b_size = contig_blocks << blocksize_bits;
558bail: 599bail:
600 if (alloc_locked)
601 ocfs2_inode_unlock(inode, 1);
559 return ret; 602 return ret;
560} 603}
561 604
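
A worked example of the hole-filling cap above: for a 1 MB request (len = 1048576) on a volume with 4 KB clusters, ocfs2_clusters_for_bytes() yields 256, which is then clamped to contig_blocks so the new allocation never overshoots the mapped hole.
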
@@ -597,6 +640,184 @@ static int ocfs2_releasepage(struct page *page, gfp_t wait)
597 return try_to_free_buffers(page); 640 return try_to_free_buffers(page);
598} 641}
599 642
643static int ocfs2_is_overwrite(struct ocfs2_super *osb,
644 struct inode *inode, loff_t offset)
645{
646 int ret = 0;
647 u32 v_cpos = 0;
648 u32 p_cpos = 0;
649 unsigned int num_clusters = 0;
650 unsigned int ext_flags = 0;
651
652 v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset);
653 ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos,
654 &num_clusters, &ext_flags);
655 if (ret < 0) {
656 mlog_errno(ret);
657 return ret;
658 }
659
660 if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN))
661 return 1;
662
663 return 0;
664}
665
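
ocfs2_is_overwrite() reduces to a single cluster lookup: with s_clustersize_bits == 12 (4 KB clusters), offset 0x3000 maps to v_cpos = 0x3000 >> 12 = 3, and the result is 1 only if cluster 3 is both mapped (p_cpos != 0) and not flagged OCFS2_EXT_UNWRITTEN.
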
666static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
667 struct iov_iter *iter,
668 loff_t offset)
669{
670 ssize_t ret = 0;
671 ssize_t written = 0;
672 bool orphaned = false;
673 int is_overwrite = 0;
674 struct file *file = iocb->ki_filp;
675 struct inode *inode = file_inode(file)->i_mapping->host;
676 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
677 struct buffer_head *di_bh = NULL;
678 size_t count = iter->count;
679 journal_t *journal = osb->journal->j_journal;
680 u32 zero_len;
681 int cluster_align;
682 loff_t final_size = offset + count;
683 int append_write = offset >= i_size_read(inode) ? 1 : 0;
684 unsigned int num_clusters = 0;
685 unsigned int ext_flags = 0;
686
687 {
688 u64 o = offset;
689
690 zero_len = do_div(o, 1 << osb->s_clustersize_bits);
691 cluster_align = !zero_len;
692 }
693
694 /*
695 * When final_size > inode->i_size, i_size is only updated
696 * after the direct write completes; add the inode to the
697 * orphan dir first so a crash mid-write is cleaned up.
698 */
699 if (final_size > i_size_read(inode)) {
700 ret = ocfs2_add_inode_to_orphan(osb, inode);
701 if (ret < 0) {
702 mlog_errno(ret);
703 goto out;
704 }
705 orphaned = true;
706 }
707
708 if (append_write) {
709 ret = ocfs2_inode_lock(inode, &di_bh, 1);
710 if (ret < 0) {
711 mlog_errno(ret);
712 goto clean_orphan;
713 }
714
715 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
716 ret = ocfs2_zero_extend(inode, di_bh, offset);
717 else
718 ret = ocfs2_extend_no_holes(inode, di_bh, offset,
719 offset);
720 if (ret < 0) {
721 mlog_errno(ret);
722 ocfs2_inode_unlock(inode, 1);
723 brelse(di_bh);
724 goto clean_orphan;
725 }
726
727 is_overwrite = ocfs2_is_overwrite(osb, inode, offset);
728 if (is_overwrite < 0) {
729 mlog_errno(is_overwrite);
730 ocfs2_inode_unlock(inode, 1);
731 brelse(di_bh);
732 goto clean_orphan;
733 }
734
735 ocfs2_inode_unlock(inode, 1);
736 brelse(di_bh);
737 di_bh = NULL;
738 }
739
740 written = __blockdev_direct_IO(WRITE, iocb, inode, inode->i_sb->s_bdev,
741 iter, offset,
742 ocfs2_direct_IO_get_blocks,
743 ocfs2_dio_end_io, NULL, 0);
744 if (unlikely(written < 0)) {
745 loff_t i_size = i_size_read(inode);
746
747 if (offset + count > i_size) {
748 ret = ocfs2_inode_lock(inode, &di_bh, 1);
749 if (ret < 0) {
750 mlog_errno(ret);
751 goto clean_orphan;
752 }
753
754 if (i_size == i_size_read(inode)) {
755 ret = ocfs2_truncate_file(inode, di_bh,
756 i_size);
757 if (ret < 0) {
758 if (ret != -ENOSPC)
759 mlog_errno(ret);
760
761 ocfs2_inode_unlock(inode, 1);
762 brelse(di_bh);
763 goto clean_orphan;
764 }
765 }
766
767 ocfs2_inode_unlock(inode, 1);
768 brelse(di_bh);
769
770 ret = jbd2_journal_force_commit(journal);
771 if (ret < 0)
772 mlog_errno(ret);
773 }
774 } else if (written > 0 && append_write && !is_overwrite &&
775 !cluster_align) {
776 u32 p_cpos = 0;
777 u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset);
778
779 ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos,
780 &num_clusters, &ext_flags);
781 if (ret < 0) {
782 mlog_errno(ret);
783 goto clean_orphan;
784 }
785
786 BUG_ON(!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN));
787
788 ret = blkdev_issue_zeroout(osb->sb->s_bdev,
789 p_cpos << (osb->s_clustersize_bits - 9),
790 zero_len >> 9, GFP_KERNEL, false);
791 if (ret < 0)
792 mlog_errno(ret);
793 }
794
795clean_orphan:
796 if (orphaned) {
797 int tmp_ret;
798 int update_isize = written > 0 ? 1 : 0;
799 loff_t end = update_isize ? offset + written : 0;
800
801 tmp_ret = ocfs2_del_inode_from_orphan(osb, inode,
802 update_isize, end);
803 if (tmp_ret < 0) {
804 ret = tmp_ret;
805 goto out;
806 }
807
808 tmp_ret = jbd2_journal_force_commit(journal);
809 if (tmp_ret < 0) {
810 ret = tmp_ret;
811 mlog_errno(tmp_ret);
812 }
813 }
814
815out:
816 if (ret >= 0)
817 ret = written;
818 return ret;
819}
820
600static ssize_t ocfs2_direct_IO(int rw, 821static ssize_t ocfs2_direct_IO(int rw,
601 struct kiocb *iocb, 822 struct kiocb *iocb,
602 struct iov_iter *iter, 823 struct iov_iter *iter,
@@ -604,6 +825,9 @@ static ssize_t ocfs2_direct_IO(int rw,
604{ 825{
605 struct file *file = iocb->ki_filp; 826 struct file *file = iocb->ki_filp;
606 struct inode *inode = file_inode(file)->i_mapping->host; 827 struct inode *inode = file_inode(file)->i_mapping->host;
828 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
829 int full_coherency = !(osb->s_mount_opt &
830 OCFS2_MOUNT_COHERENCY_BUFFERED);
607 831
608 /* 832 /*
609 * Fallback to buffered I/O if we see an inode without 833 * Fallback to buffered I/O if we see an inode without
@@ -612,14 +836,20 @@ static ssize_t ocfs2_direct_IO(int rw,
612 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 836 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
613 return 0; 837 return 0;
614 838
615 /* Fallback to buffered I/O if we are appending. */ 839 /* Fallback to buffered I/O if we are appending and
616 if (i_size_read(inode) <= offset) 840 * concurrent O_DIRECT writes are allowed.
841 */
842 if (i_size_read(inode) <= offset && !full_coherency)
617 return 0; 843 return 0;
618 844
619 return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, 845 if (rw == READ)
846 return __blockdev_direct_IO(rw, iocb, inode,
847 inode->i_sb->s_bdev,
620 iter, offset, 848 iter, offset,
621 ocfs2_direct_IO_get_blocks, 849 ocfs2_direct_IO_get_blocks,
622 ocfs2_dio_end_io, NULL, 0); 850 ocfs2_dio_end_io, NULL, 0);
851 else
852 return ocfs2_direct_IO_write(iocb, iter, offset);
623} 853}
624 854
625static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb, 855static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 2e355e0f8335..56c403a563bc 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -1016,7 +1016,8 @@ void o2net_fill_node_map(unsigned long *map, unsigned bytes)
1016 1016
1017 memset(map, 0, bytes); 1017 memset(map, 0, bytes);
1018 for (node = 0; node < O2NM_MAX_NODES; ++node) { 1018 for (node = 0; node < O2NM_MAX_NODES; ++node) {
1019 o2net_tx_can_proceed(o2net_nn_from_num(node), &sc, &ret); 1019 if (!o2net_tx_can_proceed(o2net_nn_from_num(node), &sc, &ret))
1020 continue;
1020 if (!ret) { 1021 if (!ret) {
1021 set_bit(node, map); 1022 set_bit(node, map);
1022 sc_put(sc); 1023 sc_put(sc);
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index dc024367110a..b95e7df5b76a 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -107,12 +107,12 @@ struct o2net_node {
107 struct list_head nn_status_list; 107 struct list_head nn_status_list;
108 108
109 /* connects are attempted from when heartbeat comes up until either hb 109 /* connects are attempted from when heartbeat comes up until either hb
110 * goes down, the node is unconfigured, no connect attempts succeed 110 * goes down, the node is unconfigured, or a connect succeeds.
111 * before O2NET_CONN_IDLE_DELAY, or a connect succeeds. connect_work 111 * connect_work is queued from set_nn_state both from hb up and from
112 * is queued from set_nn_state both from hb up and from itself if a 112 * itself if a connect attempt fails and so can be self-arming.
113 * connect attempt fails and so can be self-arming. shutdown is 113 * shutdown is careful to first mark the nn such that no connects will
114 * careful to first mark the nn such that no connects will be attempted 114 * be attempted before canceling delayed connect work and flushing the
115 * before canceling delayed connect work and flushing the queue. */ 115 * queue. */
116 struct delayed_work nn_connect_work; 116 struct delayed_work nn_connect_work;
117 unsigned long nn_last_connect_attempt; 117 unsigned long nn_last_connect_attempt;
118 118
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 319e786175af..b08050bd3f2e 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -3456,10 +3456,8 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
3456 int blocksize = dir->i_sb->s_blocksize; 3456 int blocksize = dir->i_sb->s_blocksize;
3457 3457
3458 status = ocfs2_read_dir_block(dir, 0, &bh, 0); 3458 status = ocfs2_read_dir_block(dir, 0, &bh, 0);
3459 if (status) { 3459 if (status)
3460 mlog_errno(status);
3461 goto bail; 3460 goto bail;
3462 }
3463 3461
3464 rec_len = OCFS2_DIR_REC_LEN(namelen); 3462 rec_len = OCFS2_DIR_REC_LEN(namelen);
3465 offset = 0; 3463 offset = 0;
@@ -3480,10 +3478,9 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
3480 status = ocfs2_read_dir_block(dir, 3478 status = ocfs2_read_dir_block(dir,
3481 offset >> sb->s_blocksize_bits, 3479 offset >> sb->s_blocksize_bits,
3482 &bh, 0); 3480 &bh, 0);
3483 if (status) { 3481 if (status)
3484 mlog_errno(status);
3485 goto bail; 3482 goto bail;
3486 } 3483
3487 /* move to next block */ 3484 /* move to next block */
3488 de = (struct ocfs2_dir_entry *) bh->b_data; 3485 de = (struct ocfs2_dir_entry *) bh->b_data;
3489 } 3486 }
@@ -3513,7 +3510,6 @@ next:
3513 de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len)); 3510 de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len));
3514 } 3511 }
3515 3512
3516 status = 0;
3517bail: 3513bail:
3518 brelse(bh); 3514 brelse(bh);
3519 if (status) 3515 if (status)
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index b46278f9ae44..fd6bbbbd7d78 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -385,8 +385,12 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
385 head = &res->granted; 385 head = &res->granted;
386 386
387 list_for_each_entry(lock, head, list) { 387 list_for_each_entry(lock, head, list) {
388 if (lock->ml.cookie == cookie) 388 /* if the lock is found but an unlock is pending, ignore the bast */
389 if (lock->ml.cookie == cookie) {
390 if (lock->unlock_pending)
391 break;
389 goto do_ast; 392 goto do_ast;
393 }
390 } 394 }
391 395
392 mlog(0, "Got %sast for unknown lock! cookie=%u:%llu, name=%.*s, " 396 mlog(0, "Got %sast for unknown lock! cookie=%u:%llu, name=%.*s, "
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 149eb556b8c6..825136070d2c 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -406,7 +406,7 @@ static int debug_purgelist_print(struct dlm_ctxt *dlm, char *buf, int len)
406 } 406 }
407 spin_unlock(&dlm->spinlock); 407 spin_unlock(&dlm->spinlock);
408 408
409 out += snprintf(buf + out, len - out, "Total on list: %ld\n", total); 409 out += snprintf(buf + out, len - out, "Total on list: %lu\n", total);
410 410
411 return out; 411 return out;
412} 412}
@@ -464,7 +464,7 @@ static int debug_mle_print(struct dlm_ctxt *dlm, char *buf, int len)
464 spin_unlock(&dlm->master_lock); 464 spin_unlock(&dlm->master_lock);
465 465
466 out += snprintf(buf + out, len - out, 466 out += snprintf(buf + out, len - out,
467 "Total: %ld, Longest: %ld\n", total, longest); 467 "Total: %lu, Longest: %lu\n", total, longest);
468 return out; 468 return out;
469} 469}
470 470
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 50a59d2337b2..7df88a6dd626 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -674,20 +674,6 @@ static void dlm_leave_domain(struct dlm_ctxt *dlm)
674 spin_unlock(&dlm->spinlock); 674 spin_unlock(&dlm->spinlock);
675} 675}
676 676
677int dlm_joined(struct dlm_ctxt *dlm)
678{
679 int ret = 0;
680
681 spin_lock(&dlm_domain_lock);
682
683 if (dlm->dlm_state == DLM_CTXT_JOINED)
684 ret = 1;
685
686 spin_unlock(&dlm_domain_lock);
687
688 return ret;
689}
690
691int dlm_shutting_down(struct dlm_ctxt *dlm) 677int dlm_shutting_down(struct dlm_ctxt *dlm)
692{ 678{
693 int ret = 0; 679 int ret = 0;
diff --git a/fs/ocfs2/dlm/dlmdomain.h b/fs/ocfs2/dlm/dlmdomain.h
index 2f7f60bfeb3b..fd6122a38dbd 100644
--- a/fs/ocfs2/dlm/dlmdomain.h
+++ b/fs/ocfs2/dlm/dlmdomain.h
@@ -28,7 +28,6 @@
28extern spinlock_t dlm_domain_lock; 28extern spinlock_t dlm_domain_lock;
29extern struct list_head dlm_domains; 29extern struct list_head dlm_domains;
30 30
31int dlm_joined(struct dlm_ctxt *dlm);
32int dlm_shutting_down(struct dlm_ctxt *dlm); 31int dlm_shutting_down(struct dlm_ctxt *dlm);
33void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm, 32void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm,
34 int node_num); 33 int node_num);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index cecd875653e4..ce12e0b1a31f 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1070,6 +1070,9 @@ static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
1070 dead_node, dlm->name); 1070 dead_node, dlm->name);
1071 list_del_init(&lock->list); 1071 list_del_init(&lock->list);
1072 dlm_lock_put(lock); 1072 dlm_lock_put(lock);
1073 /* Can't schedule DLM_UNLOCK_FREE_LOCK
1074 * - do manually */
1075 dlm_lock_put(lock);
1073 break; 1076 break;
1074 } 1077 }
1075 } 1078 }
@@ -2346,6 +2349,10 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
2346 dead_node, dlm->name); 2349 dead_node, dlm->name);
2347 list_del_init(&lock->list); 2350 list_del_init(&lock->list);
2348 dlm_lock_put(lock); 2351 dlm_lock_put(lock);
2352 /* Can't schedule
2353 * DLM_UNLOCK_FREE_LOCK
2354 * - do manually */
2355 dlm_lock_put(lock);
2349 break; 2356 break;
2350 } 2357 }
2351 } 2358 }
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 57c40e34f56f..061ba6a91bf2 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -390,12 +390,6 @@ clear_fields:
390 ip->ip_conn = NULL; 390 ip->ip_conn = NULL;
391} 391}
392 392
393static struct backing_dev_info dlmfs_backing_dev_info = {
394 .name = "ocfs2-dlmfs",
395 .ra_pages = 0, /* No readahead */
396 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
397};
398
399static struct inode *dlmfs_get_root_inode(struct super_block *sb) 393static struct inode *dlmfs_get_root_inode(struct super_block *sb)
400{ 394{
401 struct inode *inode = new_inode(sb); 395 struct inode *inode = new_inode(sb);
@@ -404,7 +398,6 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb)
404 if (inode) { 398 if (inode) {
405 inode->i_ino = get_next_ino(); 399 inode->i_ino = get_next_ino();
406 inode_init_owner(inode, NULL, mode); 400 inode_init_owner(inode, NULL, mode);
407 inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
408 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 401 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
409 inc_nlink(inode); 402 inc_nlink(inode);
410 403
@@ -428,7 +421,6 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
428 421
429 inode->i_ino = get_next_ino(); 422 inode->i_ino = get_next_ino();
430 inode_init_owner(inode, parent, mode); 423 inode_init_owner(inode, parent, mode);
431 inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
432 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 424 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
433 425
434 ip = DLMFS_I(inode); 426 ip = DLMFS_I(inode);
@@ -643,10 +635,6 @@ static int __init init_dlmfs_fs(void)
643 int status; 635 int status;
644 int cleanup_inode = 0, cleanup_worker = 0; 636 int cleanup_inode = 0, cleanup_worker = 0;
645 637
646 status = bdi_init(&dlmfs_backing_dev_info);
647 if (status)
648 return status;
649
650 dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache", 638 dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache",
651 sizeof(struct dlmfs_inode_private), 639 sizeof(struct dlmfs_inode_private),
652 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| 640 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
@@ -673,7 +661,6 @@ bail:
673 kmem_cache_destroy(dlmfs_inode_cache); 661 kmem_cache_destroy(dlmfs_inode_cache);
674 if (cleanup_worker) 662 if (cleanup_worker)
675 destroy_workqueue(user_dlm_worker); 663 destroy_workqueue(user_dlm_worker);
676 bdi_destroy(&dlmfs_backing_dev_info);
677 } else 664 } else
678 printk("OCFS2 User DLM kernel interface loaded\n"); 665 printk("OCFS2 User DLM kernel interface loaded\n");
679 return status; 666 return status;
@@ -693,7 +680,6 @@ static void __exit exit_dlmfs_fs(void)
693 rcu_barrier(); 680 rcu_barrier();
694 kmem_cache_destroy(dlmfs_inode_cache); 681 kmem_cache_destroy(dlmfs_inode_cache);
695 682
696 bdi_destroy(&dlmfs_backing_dev_info);
697} 683}
698 684
699MODULE_AUTHOR("Oracle"); 685MODULE_AUTHOR("Oracle");
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 1c423af04c69..11849a44dc5a 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -3750,6 +3750,9 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
3750 break; 3750 break;
3751 spin_unlock(&dentry_attach_lock); 3751 spin_unlock(&dentry_attach_lock);
3752 3752
3753 if (S_ISDIR(dl->dl_inode->i_mode))
3754 shrink_dcache_parent(dentry);
3755
3753 mlog(0, "d_delete(%pd);\n", dentry); 3756 mlog(0, "d_delete(%pd);\n", dentry);
3754 3757
3755 /* 3758 /*
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 3950693dd0f6..46e0d4e857c7 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -295,7 +295,7 @@ out:
295 return ret; 295 return ret;
296} 296}
297 297
298static int ocfs2_set_inode_size(handle_t *handle, 298int ocfs2_set_inode_size(handle_t *handle,
299 struct inode *inode, 299 struct inode *inode,
300 struct buffer_head *fe_bh, 300 struct buffer_head *fe_bh,
301 u64 new_i_size) 301 u64 new_i_size)
@@ -441,7 +441,7 @@ out:
441 return status; 441 return status;
442} 442}
443 443
444static int ocfs2_truncate_file(struct inode *inode, 444int ocfs2_truncate_file(struct inode *inode,
445 struct buffer_head *di_bh, 445 struct buffer_head *di_bh,
446 u64 new_i_size) 446 u64 new_i_size)
447{ 447{
@@ -569,7 +569,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
569 handle_t *handle = NULL; 569 handle_t *handle = NULL;
570 struct ocfs2_alloc_context *data_ac = NULL; 570 struct ocfs2_alloc_context *data_ac = NULL;
571 struct ocfs2_alloc_context *meta_ac = NULL; 571 struct ocfs2_alloc_context *meta_ac = NULL;
572 enum ocfs2_alloc_restarted why; 572 enum ocfs2_alloc_restarted why = RESTART_NONE;
573 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 573 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
574 struct ocfs2_extent_tree et; 574 struct ocfs2_extent_tree et;
575 int did_quota = 0; 575 int did_quota = 0;
@@ -709,6 +709,13 @@ leave:
709 return status; 709 return status;
710} 710}
711 711
712int ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
713 u32 clusters_to_add, int mark_unwritten)
714{
715 return __ocfs2_extend_allocation(inode, logical_start,
716 clusters_to_add, mark_unwritten);
717}
718
712/* 719/*
713 * While a write will already be ordering the data, a truncate will not. 720 * While a write will already be ordering the data, a truncate will not.
714 * Thus, we need to explicitly order the zeroed pages. 721 * Thus, we need to explicitly order the zeroed pages.
@@ -2109,6 +2116,9 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
2109 struct dentry *dentry = file->f_path.dentry; 2116 struct dentry *dentry = file->f_path.dentry;
2110 struct inode *inode = dentry->d_inode; 2117 struct inode *inode = dentry->d_inode;
2111 loff_t saved_pos = 0, end; 2118 loff_t saved_pos = 0, end;
2119 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2120 int full_coherency = !(osb->s_mount_opt &
2121 OCFS2_MOUNT_COHERENCY_BUFFERED);
2112 2122
2113 /* 2123 /*
2114 * We start with a read level meta lock and only jump to an ex 2124 * We start with a read level meta lock and only jump to an ex
@@ -2197,7 +2207,16 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
2197 * one node could wind up truncating another 2207 * one node could wind up truncating another
2198 * node's writes. 2208 * node's writes.
2199 */ 2209 */
2200 if (end > i_size_read(inode)) { 2210 if (end > i_size_read(inode) && !full_coherency) {
2211 *direct_io = 0;
2212 break;
2213 }
2214
2215 /*
2216 * Fall back to the old way if the feature bit is not set.
2217 */
2218 if (end > i_size_read(inode) &&
2219 !ocfs2_supports_append_dio(osb)) {
2201 *direct_io = 0; 2220 *direct_io = 0;
2202 break; 2221 break;
2203 } 2222 }
@@ -2210,7 +2229,13 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
2210 */ 2229 */
2211 ret = ocfs2_check_range_for_holes(inode, saved_pos, count); 2230 ret = ocfs2_check_range_for_holes(inode, saved_pos, count);
2212 if (ret == 1) { 2231 if (ret == 1) {
2213 *direct_io = 0; 2232 /*
2233 * Fall back to the old way if the feature bit is not set.
2234 * Otherwise try dio first and then complete the rest of the
2235 * request through buffered io.
2236 */
2237 if (!ocfs2_supports_append_dio(osb))
2238 *direct_io = 0;
2214 ret = 0; 2239 ret = 0;
2215 } else if (ret < 0) 2240 } else if (ret < 0)
2216 mlog_errno(ret); 2241 mlog_errno(ret);
@@ -2243,6 +2268,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
2243 u32 old_clusters; 2268 u32 old_clusters;
2244 struct file *file = iocb->ki_filp; 2269 struct file *file = iocb->ki_filp;
2245 struct inode *inode = file_inode(file); 2270 struct inode *inode = file_inode(file);
2271 struct address_space *mapping = file->f_mapping;
2246 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2272 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2247 int full_coherency = !(osb->s_mount_opt & 2273 int full_coherency = !(osb->s_mount_opt &
2248 OCFS2_MOUNT_COHERENCY_BUFFERED); 2274 OCFS2_MOUNT_COHERENCY_BUFFERED);
@@ -2357,13 +2383,53 @@ relock:
2357 2383
2358 iov_iter_truncate(from, count); 2384 iov_iter_truncate(from, count);
2359 if (direct_io) { 2385 if (direct_io) {
2386 loff_t endbyte;
2387 ssize_t written_buffered;
2360 written = generic_file_direct_write(iocb, from, *ppos); 2388 written = generic_file_direct_write(iocb, from, *ppos);
2361 if (written < 0) { 2389 if (written < 0 || written == count) {
2362 ret = written; 2390 ret = written;
2363 goto out_dio; 2391 goto out_dio;
2364 } 2392 }
2393
2394 /*
2395 * Fall back to buffered io to complete the rest of the request.
2396 */
2397 *ppos += written;
2398 count -= written;
2399 written_buffered = generic_perform_write(file, from, *ppos);
2400 /*
2401 * If generic_perform_write() returned a synchronous error
2402 * then we want to return the number of bytes which were
2403 * direct-written, or the error code if that was zero. Note
2404 * that this differs from normal direct-io semantics, which
2405 * will return -EFOO even if some bytes were written.
2406 */
2407 if (written_buffered < 0) {
2408 ret = written_buffered;
2409 goto out_dio;
2410 }
2411
2412 iocb->ki_pos = *ppos + written_buffered;
2413 /* We need to ensure that the page cache pages are written to
2414 * disk and invalidated to preserve the expected O_DIRECT
2415 * semantics.
2416 */
2417 endbyte = *ppos + written_buffered - 1;
2418 ret = filemap_write_and_wait_range(file->f_mapping, *ppos,
2419 endbyte);
2420 if (ret == 0) {
2421 written += written_buffered;
2422 invalidate_mapping_pages(mapping,
2423 *ppos >> PAGE_CACHE_SHIFT,
2424 endbyte >> PAGE_CACHE_SHIFT);
2425 } else {
2426 /*
2427 * We don't know how much we wrote, so just return
2428 * the number of bytes which were direct-written
2429 */
2430 }
2365 } else { 2431 } else {
2366 current->backing_dev_info = file->f_mapping->backing_dev_info; 2432 current->backing_dev_info = inode_to_bdi(inode);
2367 written = generic_perform_write(file, from, *ppos); 2433 written = generic_perform_write(file, from, *ppos);
2368 if (likely(written >= 0)) 2434 if (likely(written >= 0))
2369 iocb->ki_pos = *ppos + written; 2435 iocb->ki_pos = *ppos + written;
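Note: the new direct-io path in ocfs2_file_write_iter() above is the
usual direct-then-buffered fallback; condensed (same calls as the
hunk, locking and cleanup elided):

	written = generic_file_direct_write(iocb, from, *ppos);
	if (written < 0 || written == count)
		return written;			/* done, or hard error */

	/* complete the tail through the page cache */
	*ppos += written;
	count -= written;
	written_buffered = generic_perform_write(file, from, *ppos);
	if (written_buffered < 0)
		return written_buffered;

	/* flush and invalidate the buffered range to preserve the
	 * expected O_DIRECT semantics */
	endbyte = *ppos + written_buffered - 1;
	if (!filemap_write_and_wait_range(mapping, *ppos, endbyte)) {
		written += written_buffered;
		invalidate_mapping_pages(mapping, *ppos >> PAGE_CACHE_SHIFT,
					 endbyte >> PAGE_CACHE_SHIFT);
	}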
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 97bf761c9e7c..e8c62f22215c 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -51,13 +51,22 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb,
51 struct ocfs2_alloc_context *data_ac, 51 struct ocfs2_alloc_context *data_ac,
52 struct ocfs2_alloc_context *meta_ac, 52 struct ocfs2_alloc_context *meta_ac,
53 enum ocfs2_alloc_restarted *reason_ret); 53 enum ocfs2_alloc_restarted *reason_ret);
54int ocfs2_set_inode_size(handle_t *handle,
55 struct inode *inode,
56 struct buffer_head *fe_bh,
57 u64 new_i_size);
54int ocfs2_simple_size_update(struct inode *inode, 58int ocfs2_simple_size_update(struct inode *inode,
55 struct buffer_head *di_bh, 59 struct buffer_head *di_bh,
56 u64 new_i_size); 60 u64 new_i_size);
61int ocfs2_truncate_file(struct inode *inode,
62 struct buffer_head *di_bh,
63 u64 new_i_size);
57int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh, 64int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
58 u64 new_i_size, u64 zero_to); 65 u64 new_i_size, u64 zero_to);
59int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh, 66int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
60 loff_t zero_to); 67 loff_t zero_to);
68int ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
69 u32 clusters_to_add, int mark_unwritten);
61int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); 70int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
62int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, 71int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
63 struct kstat *stat); 72 struct kstat *stat);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index c8b25de9efbb..3025c0da6b8a 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -648,7 +648,7 @@ static int ocfs2_remove_inode(struct inode *inode,
648 648
649 if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) { 649 if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) {
650 status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode, 650 status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode,
651 orphan_dir_bh); 651 orphan_dir_bh, false);
652 if (status < 0) { 652 if (status < 0) {
653 mlog_errno(status); 653 mlog_errno(status);
654 goto bail_commit; 654 goto bail_commit;
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index ca3431ee7f24..5e86b247c821 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -81,6 +81,8 @@ struct ocfs2_inode_info
81 tid_t i_sync_tid; 81 tid_t i_sync_tid;
82 tid_t i_datasync_tid; 82 tid_t i_datasync_tid;
83 83
84 wait_queue_head_t append_dio_wq;
85
84 struct dquot *i_dquot[MAXQUOTAS]; 86 struct dquot *i_dquot[MAXQUOTAS];
85}; 87};
86 88
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 4f502382180f..ff531928269e 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -50,6 +50,8 @@
50#include "sysfile.h" 50#include "sysfile.h"
51#include "uptodate.h" 51#include "uptodate.h"
52#include "quota.h" 52#include "quota.h"
53#include "file.h"
54#include "namei.h"
53 55
54#include "buffer_head_io.h" 56#include "buffer_head_io.h"
55#include "ocfs2_trace.h" 57#include "ocfs2_trace.h"
@@ -69,13 +71,15 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
69static int ocfs2_trylock_journal(struct ocfs2_super *osb, 71static int ocfs2_trylock_journal(struct ocfs2_super *osb,
70 int slot_num); 72 int slot_num);
71static int ocfs2_recover_orphans(struct ocfs2_super *osb, 73static int ocfs2_recover_orphans(struct ocfs2_super *osb,
72 int slot); 74 int slot,
75 enum ocfs2_orphan_reco_type orphan_reco_type);
73static int ocfs2_commit_thread(void *arg); 76static int ocfs2_commit_thread(void *arg);
74static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, 77static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
75 int slot_num, 78 int slot_num,
76 struct ocfs2_dinode *la_dinode, 79 struct ocfs2_dinode *la_dinode,
77 struct ocfs2_dinode *tl_dinode, 80 struct ocfs2_dinode *tl_dinode,
78 struct ocfs2_quota_recovery *qrec); 81 struct ocfs2_quota_recovery *qrec,
82 enum ocfs2_orphan_reco_type orphan_reco_type);
79 83
80static inline int ocfs2_wait_on_mount(struct ocfs2_super *osb) 84static inline int ocfs2_wait_on_mount(struct ocfs2_super *osb)
81{ 85{
@@ -149,7 +153,8 @@ int ocfs2_compute_replay_slots(struct ocfs2_super *osb)
149 return 0; 153 return 0;
150} 154}
151 155
152void ocfs2_queue_replay_slots(struct ocfs2_super *osb) 156void ocfs2_queue_replay_slots(struct ocfs2_super *osb,
157 enum ocfs2_orphan_reco_type orphan_reco_type)
153{ 158{
154 struct ocfs2_replay_map *replay_map = osb->replay_map; 159 struct ocfs2_replay_map *replay_map = osb->replay_map;
155 int i; 160 int i;
@@ -163,7 +168,8 @@ void ocfs2_queue_replay_slots(struct ocfs2_super *osb)
163 for (i = 0; i < replay_map->rm_slots; i++) 168 for (i = 0; i < replay_map->rm_slots; i++)
164 if (replay_map->rm_replay_slots[i]) 169 if (replay_map->rm_replay_slots[i])
165 ocfs2_queue_recovery_completion(osb->journal, i, NULL, 170 ocfs2_queue_recovery_completion(osb->journal, i, NULL,
166 NULL, NULL); 171 NULL, NULL,
172 orphan_reco_type);
167 replay_map->rm_state = REPLAY_DONE; 173 replay_map->rm_state = REPLAY_DONE;
168} 174}
169 175
@@ -1174,6 +1180,7 @@ struct ocfs2_la_recovery_item {
1174 struct ocfs2_dinode *lri_la_dinode; 1180 struct ocfs2_dinode *lri_la_dinode;
1175 struct ocfs2_dinode *lri_tl_dinode; 1181 struct ocfs2_dinode *lri_tl_dinode;
1176 struct ocfs2_quota_recovery *lri_qrec; 1182 struct ocfs2_quota_recovery *lri_qrec;
1183 enum ocfs2_orphan_reco_type lri_orphan_reco_type;
1177}; 1184};
1178 1185
1179/* Does the second half of the recovery process. By this point, the 1186/* Does the second half of the recovery process. By this point, the
@@ -1195,6 +1202,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
1195 struct ocfs2_dinode *la_dinode, *tl_dinode; 1202 struct ocfs2_dinode *la_dinode, *tl_dinode;
1196 struct ocfs2_la_recovery_item *item, *n; 1203 struct ocfs2_la_recovery_item *item, *n;
1197 struct ocfs2_quota_recovery *qrec; 1204 struct ocfs2_quota_recovery *qrec;
1205 enum ocfs2_orphan_reco_type orphan_reco_type;
1198 LIST_HEAD(tmp_la_list); 1206 LIST_HEAD(tmp_la_list);
1199 1207
1200 trace_ocfs2_complete_recovery( 1208 trace_ocfs2_complete_recovery(
@@ -1212,6 +1220,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
1212 la_dinode = item->lri_la_dinode; 1220 la_dinode = item->lri_la_dinode;
1213 tl_dinode = item->lri_tl_dinode; 1221 tl_dinode = item->lri_tl_dinode;
1214 qrec = item->lri_qrec; 1222 qrec = item->lri_qrec;
1223 orphan_reco_type = item->lri_orphan_reco_type;
1215 1224
1216 trace_ocfs2_complete_recovery_slot(item->lri_slot, 1225 trace_ocfs2_complete_recovery_slot(item->lri_slot,
1217 la_dinode ? le64_to_cpu(la_dinode->i_blkno) : 0, 1226 la_dinode ? le64_to_cpu(la_dinode->i_blkno) : 0,
@@ -1236,7 +1245,8 @@ void ocfs2_complete_recovery(struct work_struct *work)
1236 kfree(tl_dinode); 1245 kfree(tl_dinode);
1237 } 1246 }
1238 1247
1239 ret = ocfs2_recover_orphans(osb, item->lri_slot); 1248 ret = ocfs2_recover_orphans(osb, item->lri_slot,
1249 orphan_reco_type);
1240 if (ret < 0) 1250 if (ret < 0)
1241 mlog_errno(ret); 1251 mlog_errno(ret);
1242 1252
@@ -1261,7 +1271,8 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
1261 int slot_num, 1271 int slot_num,
1262 struct ocfs2_dinode *la_dinode, 1272 struct ocfs2_dinode *la_dinode,
1263 struct ocfs2_dinode *tl_dinode, 1273 struct ocfs2_dinode *tl_dinode,
1264 struct ocfs2_quota_recovery *qrec) 1274 struct ocfs2_quota_recovery *qrec,
1275 enum ocfs2_orphan_reco_type orphan_reco_type)
1265{ 1276{
1266 struct ocfs2_la_recovery_item *item; 1277 struct ocfs2_la_recovery_item *item;
1267 1278
@@ -1285,6 +1296,7 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
1285 item->lri_slot = slot_num; 1296 item->lri_slot = slot_num;
1286 item->lri_tl_dinode = tl_dinode; 1297 item->lri_tl_dinode = tl_dinode;
1287 item->lri_qrec = qrec; 1298 item->lri_qrec = qrec;
1299 item->lri_orphan_reco_type = orphan_reco_type;
1288 1300
1289 spin_lock(&journal->j_lock); 1301 spin_lock(&journal->j_lock);
1290 list_add_tail(&item->lri_list, &journal->j_la_cleanups); 1302 list_add_tail(&item->lri_list, &journal->j_la_cleanups);
@@ -1304,7 +1316,8 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
1304 /* No need to queue up our truncate_log as regular cleanup will catch 1316 /* No need to queue up our truncate_log as regular cleanup will catch
1305 * that */ 1317 * that */
1306 ocfs2_queue_recovery_completion(journal, osb->slot_num, 1318 ocfs2_queue_recovery_completion(journal, osb->slot_num,
1307 osb->local_alloc_copy, NULL, NULL); 1319 osb->local_alloc_copy, NULL, NULL,
1320 ORPHAN_NEED_TRUNCATE);
1308 ocfs2_schedule_truncate_log_flush(osb, 0); 1321 ocfs2_schedule_truncate_log_flush(osb, 0);
1309 1322
1310 osb->local_alloc_copy = NULL; 1323 osb->local_alloc_copy = NULL;
@@ -1312,7 +1325,7 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
1312 1325
1313 /* queue to recover orphan slots for all offline slots */ 1326 /* queue to recover orphan slots for all offline slots */
1314 ocfs2_replay_map_set_state(osb, REPLAY_NEEDED); 1327 ocfs2_replay_map_set_state(osb, REPLAY_NEEDED);
1315 ocfs2_queue_replay_slots(osb); 1328 ocfs2_queue_replay_slots(osb, ORPHAN_NEED_TRUNCATE);
1316 ocfs2_free_replay_slots(osb); 1329 ocfs2_free_replay_slots(osb);
1317} 1330}
1318 1331
@@ -1323,7 +1336,8 @@ void ocfs2_complete_quota_recovery(struct ocfs2_super *osb)
1323 osb->slot_num, 1336 osb->slot_num,
1324 NULL, 1337 NULL,
1325 NULL, 1338 NULL,
1326 osb->quota_rec); 1339 osb->quota_rec,
1340 ORPHAN_NEED_TRUNCATE);
1327 osb->quota_rec = NULL; 1341 osb->quota_rec = NULL;
1328 } 1342 }
1329} 1343}
@@ -1360,7 +1374,7 @@ restart:
1360 1374
1361 /* queue recovery for our own slot */ 1375 /* queue recovery for our own slot */
1362 ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL, 1376 ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
1363 NULL, NULL); 1377 NULL, NULL, ORPHAN_NO_NEED_TRUNCATE);
1364 1378
1365 spin_lock(&osb->osb_lock); 1379 spin_lock(&osb->osb_lock);
1366 while (rm->rm_used) { 1380 while (rm->rm_used) {
@@ -1419,13 +1433,14 @@ skip_recovery:
1419 continue; 1433 continue;
1420 } 1434 }
1421 ocfs2_queue_recovery_completion(osb->journal, rm_quota[i], 1435 ocfs2_queue_recovery_completion(osb->journal, rm_quota[i],
1422 NULL, NULL, qrec); 1436 NULL, NULL, qrec,
1437 ORPHAN_NEED_TRUNCATE);
1423 } 1438 }
1424 1439
1425 ocfs2_super_unlock(osb, 1); 1440 ocfs2_super_unlock(osb, 1);
1426 1441
1427 /* queue recovery for offline slots */ 1442 /* queue recovery for offline slots */
1428 ocfs2_queue_replay_slots(osb); 1443 ocfs2_queue_replay_slots(osb, ORPHAN_NEED_TRUNCATE);
1429 1444
1430bail: 1445bail:
1431 mutex_lock(&osb->recovery_lock); 1446 mutex_lock(&osb->recovery_lock);
@@ -1447,7 +1462,6 @@ bail:
1447 * requires that we call do_exit(). And it isn't exported, but 1462 * requires that we call do_exit(). And it isn't exported, but
1448 * complete_and_exit() seems to be a minimal wrapper around it. */ 1463 * complete_and_exit() seems to be a minimal wrapper around it. */
1449 complete_and_exit(NULL, status); 1464 complete_and_exit(NULL, status);
1450 return status;
1451} 1465}
1452 1466
1453void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num) 1467void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
@@ -1712,7 +1726,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
1712 1726
1713 /* This will kfree the memory pointed to by la_copy and tl_copy */ 1727 /* This will kfree the memory pointed to by la_copy and tl_copy */
1714 ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy, 1728 ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy,
1715 tl_copy, NULL); 1729 tl_copy, NULL, ORPHAN_NEED_TRUNCATE);
1716 1730
1717 status = 0; 1731 status = 0;
1718done: 1732done:
@@ -1902,7 +1916,7 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
1902 1916
1903 for (i = 0; i < osb->max_slots; i++) 1917 for (i = 0; i < osb->max_slots; i++)
1904 ocfs2_queue_recovery_completion(osb->journal, i, NULL, NULL, 1918 ocfs2_queue_recovery_completion(osb->journal, i, NULL, NULL,
1905 NULL); 1919 NULL, ORPHAN_NO_NEED_TRUNCATE);
1906 /* 1920 /*
1907 * We queued a recovery on orphan slots, increment the sequence 1921 * We queued a recovery on orphan slots, increment the sequence
1908 * number and update LVB so other node will skip the scan for a while 1922 * number and update LVB so other node will skip the scan for a while
@@ -2001,6 +2015,13 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name,
2001 if (IS_ERR(iter)) 2015 if (IS_ERR(iter))
2002 return 0; 2016 return 0;
2003 2017
2018 /* Skip inodes which have already been added to the recovery list,
2019 * since dio may happen concurrently with unlink/rename */
2020 if (OCFS2_I(iter)->ip_next_orphan) {
2021 iput(iter);
2022 return 0;
2023 }
2024
2004 trace_ocfs2_orphan_filldir((unsigned long long)OCFS2_I(iter)->ip_blkno); 2025 trace_ocfs2_orphan_filldir((unsigned long long)OCFS2_I(iter)->ip_blkno);
2005 /* No locking is required for the next_orphan queue as there 2026 /* No locking is required for the next_orphan queue as there
2006 * is only ever a single process doing orphan recovery. */ 2027 * is only ever a single process doing orphan recovery. */
@@ -2109,7 +2130,8 @@ static void ocfs2_clear_recovering_orphan_dir(struct ocfs2_super *osb,
2109 * advertising our state to ocfs2_delete_inode(). 2130 * advertising our state to ocfs2_delete_inode().
2110 */ 2131 */
2111static int ocfs2_recover_orphans(struct ocfs2_super *osb, 2132static int ocfs2_recover_orphans(struct ocfs2_super *osb,
2112 int slot) 2133 int slot,
2134 enum ocfs2_orphan_reco_type orphan_reco_type)
2113{ 2135{
2114 int ret = 0; 2136 int ret = 0;
2115 struct inode *inode = NULL; 2137 struct inode *inode = NULL;
@@ -2133,13 +2155,60 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
2133 (unsigned long long)oi->ip_blkno); 2155 (unsigned long long)oi->ip_blkno);
2134 2156
2135 iter = oi->ip_next_orphan; 2157 iter = oi->ip_next_orphan;
2158 oi->ip_next_orphan = NULL;
2159
2160 /*
2161 * We need to take and drop the inode lock to force
2162 * the inode to be re-read from disk.
2163 */
2164 ret = ocfs2_inode_lock(inode, NULL, 0);
2165 if (ret) {
2166 mlog_errno(ret);
2167 goto next;
2168 }
2169 ocfs2_inode_unlock(inode, 0);
2170
2171 if (inode->i_nlink == 0) {
2172 spin_lock(&oi->ip_lock);
2173 /* Set the proper information to get us going into
2174 * ocfs2_delete_inode. */
2175 oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
2176 spin_unlock(&oi->ip_lock);
2177 } else if (orphan_reco_type == ORPHAN_NEED_TRUNCATE) {
2178 struct buffer_head *di_bh = NULL;
2179
2180 ret = ocfs2_rw_lock(inode, 1);
2181 if (ret) {
2182 mlog_errno(ret);
2183 goto next;
2184 }
2185
2186 ret = ocfs2_inode_lock(inode, &di_bh, 1);
2187 if (ret < 0) {
2188 ocfs2_rw_unlock(inode, 1);
2189 mlog_errno(ret);
2190 goto next;
2191 }
2192
2193 ret = ocfs2_truncate_file(inode, di_bh,
2194 i_size_read(inode));
2195 ocfs2_inode_unlock(inode, 1);
2196 ocfs2_rw_unlock(inode, 1);
2197 brelse(di_bh);
2198 if (ret < 0) {
2199 if (ret != -ENOSPC)
2200 mlog_errno(ret);
2201 goto next;
2202 }
2203
2204 ret = ocfs2_del_inode_from_orphan(osb, inode, 0, 0);
2205 if (ret)
2206 mlog_errno(ret);
2136 2207
2137 spin_lock(&oi->ip_lock); 2208 wake_up(&OCFS2_I(inode)->append_dio_wq);
2138 /* Set the proper information to get us going into 2209 } /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */
2139 * ocfs2_delete_inode. */
2140 oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
2141 spin_unlock(&oi->ip_lock);
2142 2210
2211next:
2143 iput(inode); 2212 iput(inode);
2144 2213
2145 inode = iter; 2214 inode = iter;
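Note: orphan recovery now distinguishes the two cases via
ocfs2_orphan_reco_type; the handshake with an appending writer set up
above reduces to (both sides taken from this series):

	/* writer side (ocfs2_add_inode_to_orphan(), namei.c below):
	 * park until recovery clears OCFS2_DIO_ORPHANED_FL */
	wait_event_interruptible_timeout(OCFS2_I(inode)->append_dio_wq,
			ocfs2_dio_orphan_recovered(inode),
			msecs_to_jiffies(OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL));

	/* recovery side (ocfs2_recover_orphans() above): truncate the
	 * half-written tail, drop the orphan entry, then */
	wake_up(&OCFS2_I(inode)->append_dio_wq);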
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 7f8cde94abfe..f4cd3c3e9fb7 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -472,6 +472,11 @@ static inline int ocfs2_unlink_credits(struct super_block *sb)
472 * orphan dir index leaf */ 472 * orphan dir index leaf */
473#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 4) 473#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 4)
474 474
475/* dinode + orphan dir dinode + extent tree leaf block + orphan dir entry +
476 * orphan dir index root + orphan dir index leaf */
477#define OCFS2_INODE_ADD_TO_ORPHAN_CREDITS (2 * OCFS2_INODE_UPDATE_CREDITS + 4)
478#define OCFS2_INODE_DEL_FROM_ORPHAN_CREDITS OCFS2_INODE_ADD_TO_ORPHAN_CREDITS
479
475/* dinode update, old dir dinode update, new dir dinode update, old 480/* dinode update, old dir dinode update, new dir dinode update, old
476 * dir dir entry, new dir dir entry, dir entry update for renaming 481 * dir dir entry, new dir dir entry, dir entry update for renaming
477 * directory + target unlink + 3 x dir index leaves */ 482 * directory + target unlink + 3 x dir index leaves */
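Note: the new credit macros cover the block updates listed in the
comment; a usage sketch (mirrors ocfs2_add_inode_to_orphan() in the
namei.c diff below):

	handle_t *handle = ocfs2_start_trans(osb,
				OCFS2_INODE_ADD_TO_ORPHAN_CREDITS);
	if (IS_ERR(handle))
		return PTR_ERR(handle);
	/* ... journaled orphan-dir and dinode updates ... */
	ocfs2_commit_trans(osb, handle);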
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 10d66c75cecb..9581d190f6e1 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -173,7 +173,6 @@ out:
173static const struct vm_operations_struct ocfs2_file_vm_ops = { 173static const struct vm_operations_struct ocfs2_file_vm_ops = {
174 .fault = ocfs2_fault, 174 .fault = ocfs2_fault,
175 .page_mkwrite = ocfs2_page_mkwrite, 175 .page_mkwrite = ocfs2_page_mkwrite,
176 .remap_pages = generic_file_remap_pages,
177}; 176};
178 177
179int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) 178int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 914c121ec890..b5c3a5ea3ee6 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -79,7 +79,8 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
79 struct inode **ret_orphan_dir, 79 struct inode **ret_orphan_dir,
80 u64 blkno, 80 u64 blkno,
81 char *name, 81 char *name,
82 struct ocfs2_dir_lookup_result *lookup); 82 struct ocfs2_dir_lookup_result *lookup,
83 bool dio);
83 84
84static int ocfs2_orphan_add(struct ocfs2_super *osb, 85static int ocfs2_orphan_add(struct ocfs2_super *osb,
85 handle_t *handle, 86 handle_t *handle,
@@ -87,7 +88,8 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
87 struct buffer_head *fe_bh, 88 struct buffer_head *fe_bh,
88 char *name, 89 char *name,
89 struct ocfs2_dir_lookup_result *lookup, 90 struct ocfs2_dir_lookup_result *lookup,
90 struct inode *orphan_dir_inode); 91 struct inode *orphan_dir_inode,
92 bool dio);
91 93
92static int ocfs2_create_symlink_data(struct ocfs2_super *osb, 94static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
93 handle_t *handle, 95 handle_t *handle,
@@ -104,6 +106,8 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
104static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2); 106static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2);
105/* An orphan dir name is an 8 byte value, printed as a hex string */ 107/* An orphan dir name is an 8 byte value, printed as a hex string */
106#define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64))) 108#define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64)))
109#define OCFS2_DIO_ORPHAN_PREFIX "dio-"
110#define OCFS2_DIO_ORPHAN_PREFIX_LEN 4
107 111
108static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry, 112static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
109 unsigned int flags) 113 unsigned int flags)
@@ -952,7 +956,8 @@ static int ocfs2_unlink(struct inode *dir,
952 if (ocfs2_inode_is_unlinkable(inode)) { 956 if (ocfs2_inode_is_unlinkable(inode)) {
953 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, 957 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
954 OCFS2_I(inode)->ip_blkno, 958 OCFS2_I(inode)->ip_blkno,
955 orphan_name, &orphan_insert); 959 orphan_name, &orphan_insert,
960 false);
956 if (status < 0) { 961 if (status < 0) {
957 mlog_errno(status); 962 mlog_errno(status);
958 goto leave; 963 goto leave;
@@ -1004,7 +1009,7 @@ static int ocfs2_unlink(struct inode *dir,
1004 1009
1005 if (is_unlinkable) { 1010 if (is_unlinkable) {
1006 status = ocfs2_orphan_add(osb, handle, inode, fe_bh, 1011 status = ocfs2_orphan_add(osb, handle, inode, fe_bh,
1007 orphan_name, &orphan_insert, orphan_dir); 1012 orphan_name, &orphan_insert, orphan_dir, false);
1008 if (status < 0) 1013 if (status < 0)
1009 mlog_errno(status); 1014 mlog_errno(status);
1010 } 1015 }
@@ -1440,7 +1445,8 @@ static int ocfs2_rename(struct inode *old_dir,
1440 if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) { 1445 if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) {
1441 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, 1446 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
1442 OCFS2_I(new_inode)->ip_blkno, 1447 OCFS2_I(new_inode)->ip_blkno,
1443 orphan_name, &orphan_insert); 1448 orphan_name, &orphan_insert,
1449 false);
1444 if (status < 0) { 1450 if (status < 0) {
1445 mlog_errno(status); 1451 mlog_errno(status);
1446 goto bail; 1452 goto bail;
@@ -1507,7 +1513,7 @@ static int ocfs2_rename(struct inode *old_dir,
1507 if (should_add_orphan) { 1513 if (should_add_orphan) {
1508 status = ocfs2_orphan_add(osb, handle, new_inode, 1514 status = ocfs2_orphan_add(osb, handle, new_inode,
1509 newfe_bh, orphan_name, 1515 newfe_bh, orphan_name,
1510 &orphan_insert, orphan_dir); 1516 &orphan_insert, orphan_dir, false);
1511 if (status < 0) { 1517 if (status < 0) {
1512 mlog_errno(status); 1518 mlog_errno(status);
1513 goto bail; 1519 goto bail;
@@ -2088,12 +2094,28 @@ static int __ocfs2_prepare_orphan_dir(struct inode *orphan_dir_inode,
2088 struct buffer_head *orphan_dir_bh, 2094 struct buffer_head *orphan_dir_bh,
2089 u64 blkno, 2095 u64 blkno,
2090 char *name, 2096 char *name,
2091 struct ocfs2_dir_lookup_result *lookup) 2097 struct ocfs2_dir_lookup_result *lookup,
2098 bool dio)
2092{ 2099{
2093 int ret; 2100 int ret;
2094 struct ocfs2_super *osb = OCFS2_SB(orphan_dir_inode->i_sb); 2101 struct ocfs2_super *osb = OCFS2_SB(orphan_dir_inode->i_sb);
2102 int namelen = dio ?
2103 (OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN) :
2104 OCFS2_ORPHAN_NAMELEN;
2105
2106 if (dio) {
2107 ret = snprintf(name, OCFS2_DIO_ORPHAN_PREFIX_LEN + 1, "%s",
2108 OCFS2_DIO_ORPHAN_PREFIX);
2109 if (ret != OCFS2_DIO_ORPHAN_PREFIX_LEN) {
2110 ret = -EINVAL;
2111 mlog_errno(ret);
2112 return ret;
2113 }
2095 2114
2096 ret = ocfs2_blkno_stringify(blkno, name); 2115 ret = ocfs2_blkno_stringify(blkno,
2116 name + OCFS2_DIO_ORPHAN_PREFIX_LEN);
2117 } else
2118 ret = ocfs2_blkno_stringify(blkno, name);
2097 if (ret < 0) { 2119 if (ret < 0) {
2098 mlog_errno(ret); 2120 mlog_errno(ret);
2099 return ret; 2121 return ret;
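Note: for illustration, the entry name an append-dio orphan gets; a
sketch assuming ocfs2_blkno_stringify() prints the block number as
OCFS2_ORPHAN_NAMELEN (16) lower-case hex digits, as the namelen
arithmetic above implies:

	/* e.g. blkno 0x1234 -> "dio-0000000000001234" */
	char name[OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN + 1];

	snprintf(name, sizeof(name), "%s%016llx", OCFS2_DIO_ORPHAN_PREFIX,
		 (unsigned long long)blkno);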
@@ -2101,7 +2123,7 @@ static int __ocfs2_prepare_orphan_dir(struct inode *orphan_dir_inode,
2101 2123
2102 ret = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode, 2124 ret = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode,
2103 orphan_dir_bh, name, 2125 orphan_dir_bh, name,
2104 OCFS2_ORPHAN_NAMELEN, lookup); 2126 namelen, lookup);
2105 if (ret < 0) { 2127 if (ret < 0) {
2106 mlog_errno(ret); 2128 mlog_errno(ret);
2107 return ret; 2129 return ret;
@@ -2128,7 +2150,8 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
2128 struct inode **ret_orphan_dir, 2150 struct inode **ret_orphan_dir,
2129 u64 blkno, 2151 u64 blkno,
2130 char *name, 2152 char *name,
2131 struct ocfs2_dir_lookup_result *lookup) 2153 struct ocfs2_dir_lookup_result *lookup,
2154 bool dio)
2132{ 2155{
2133 struct inode *orphan_dir_inode = NULL; 2156 struct inode *orphan_dir_inode = NULL;
2134 struct buffer_head *orphan_dir_bh = NULL; 2157 struct buffer_head *orphan_dir_bh = NULL;
@@ -2142,7 +2165,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
2142 } 2165 }
2143 2166
2144 ret = __ocfs2_prepare_orphan_dir(orphan_dir_inode, orphan_dir_bh, 2167 ret = __ocfs2_prepare_orphan_dir(orphan_dir_inode, orphan_dir_bh,
2145 blkno, name, lookup); 2168 blkno, name, lookup, dio);
2146 if (ret < 0) { 2169 if (ret < 0) {
2147 mlog_errno(ret); 2170 mlog_errno(ret);
2148 goto out; 2171 goto out;
@@ -2170,12 +2193,16 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
2170 struct buffer_head *fe_bh, 2193 struct buffer_head *fe_bh,
2171 char *name, 2194 char *name,
2172 struct ocfs2_dir_lookup_result *lookup, 2195 struct ocfs2_dir_lookup_result *lookup,
2173 struct inode *orphan_dir_inode) 2196 struct inode *orphan_dir_inode,
2197 bool dio)
2174{ 2198{
2175 struct buffer_head *orphan_dir_bh = NULL; 2199 struct buffer_head *orphan_dir_bh = NULL;
2176 int status = 0; 2200 int status = 0;
2177 struct ocfs2_dinode *orphan_fe; 2201 struct ocfs2_dinode *orphan_fe;
2178 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; 2202 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
2203 int namelen = dio ?
2204 (OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN) :
2205 OCFS2_ORPHAN_NAMELEN;
2179 2206
2180 trace_ocfs2_orphan_add_begin( 2207 trace_ocfs2_orphan_add_begin(
2181 (unsigned long long)OCFS2_I(inode)->ip_blkno); 2208 (unsigned long long)OCFS2_I(inode)->ip_blkno);
@@ -2219,7 +2246,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
2219 ocfs2_journal_dirty(handle, orphan_dir_bh); 2246 ocfs2_journal_dirty(handle, orphan_dir_bh);
2220 2247
2221 status = __ocfs2_add_entry(handle, orphan_dir_inode, name, 2248 status = __ocfs2_add_entry(handle, orphan_dir_inode, name,
2222 OCFS2_ORPHAN_NAMELEN, inode, 2249 namelen, inode,
2223 OCFS2_I(inode)->ip_blkno, 2250 OCFS2_I(inode)->ip_blkno,
2224 orphan_dir_bh, lookup); 2251 orphan_dir_bh, lookup);
2225 if (status < 0) { 2252 if (status < 0) {
@@ -2227,13 +2254,21 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
2227 goto rollback; 2254 goto rollback;
2228 } 2255 }
2229 2256
2230 fe->i_flags |= cpu_to_le32(OCFS2_ORPHANED_FL); 2257 if (dio) {
2231 OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR; 2258 /* Update flag OCFS2_DIO_ORPHANED_FL and record the orphan
2259 * slot.
2260 */
2261 fe->i_flags |= cpu_to_le32(OCFS2_DIO_ORPHANED_FL);
2262 fe->i_dio_orphaned_slot = cpu_to_le16(osb->slot_num);
2263 } else {
2264 fe->i_flags |= cpu_to_le32(OCFS2_ORPHANED_FL);
2265 OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR;
2232 2266
2233 /* Record which orphan dir our inode now resides 2267 /* Record which orphan dir our inode now resides
2234 * in. delete_inode will use this to determine which orphan 2268 * in. delete_inode will use this to determine which orphan
2235 * dir to lock. */ 2269 * dir to lock. */
2236 fe->i_orphaned_slot = cpu_to_le16(osb->slot_num); 2270 fe->i_orphaned_slot = cpu_to_le16(osb->slot_num);
2271 }
2237 2272
2238 ocfs2_journal_dirty(handle, fe_bh); 2273 ocfs2_journal_dirty(handle, fe_bh);
2239 2274
@@ -2258,14 +2293,28 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
2258 handle_t *handle, 2293 handle_t *handle,
2259 struct inode *orphan_dir_inode, 2294 struct inode *orphan_dir_inode,
2260 struct inode *inode, 2295 struct inode *inode,
2261 struct buffer_head *orphan_dir_bh) 2296 struct buffer_head *orphan_dir_bh,
2297 bool dio)
2262{ 2298{
2263 char name[OCFS2_ORPHAN_NAMELEN + 1]; 2299 const int namelen = OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN;
2300 char name[namelen + 1];
2264 struct ocfs2_dinode *orphan_fe; 2301 struct ocfs2_dinode *orphan_fe;
2265 int status = 0; 2302 int status = 0;
2266 struct ocfs2_dir_lookup_result lookup = { NULL, }; 2303 struct ocfs2_dir_lookup_result lookup = { NULL, };
2267 2304
2268 status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name); 2305 if (dio) {
2306 status = snprintf(name, OCFS2_DIO_ORPHAN_PREFIX_LEN + 1, "%s",
2307 OCFS2_DIO_ORPHAN_PREFIX);
2308 if (status != OCFS2_DIO_ORPHAN_PREFIX_LEN) {
2309 status = -EINVAL;
2310 mlog_errno(status);
2311 return status;
2312 }
2313
2314 status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno,
2315 name + OCFS2_DIO_ORPHAN_PREFIX_LEN);
2316 } else
2317 status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name);
2269 if (status < 0) { 2318 if (status < 0) {
2270 mlog_errno(status); 2319 mlog_errno(status);
2271 goto leave; 2320 goto leave;
@@ -2273,10 +2322,10 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
2273 2322
2274 trace_ocfs2_orphan_del( 2323 trace_ocfs2_orphan_del(
2275 (unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno, 2324 (unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno,
2276 name, OCFS2_ORPHAN_NAMELEN); 2325 name, namelen);
2277 2326
2278 /* find its spot in the orphan directory */ 2327 /* find its spot in the orphan directory */
2279 status = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN, orphan_dir_inode, 2328 status = ocfs2_find_entry(name, namelen, orphan_dir_inode,
2280 &lookup); 2329 &lookup);
2281 if (status) { 2330 if (status) {
2282 mlog_errno(status); 2331 mlog_errno(status);
@@ -2376,7 +2425,8 @@ static int ocfs2_prep_new_orphaned_file(struct inode *dir,
2376 } 2425 }
2377 2426
2378 ret = __ocfs2_prepare_orphan_dir(orphan_dir, orphan_dir_bh, 2427 ret = __ocfs2_prepare_orphan_dir(orphan_dir, orphan_dir_bh,
2379 di_blkno, orphan_name, orphan_insert); 2428 di_blkno, orphan_name, orphan_insert,
2429 false);
2380 if (ret < 0) { 2430 if (ret < 0) {
2381 mlog_errno(ret); 2431 mlog_errno(ret);
2382 goto out; 2432 goto out;
@@ -2482,7 +2532,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
2482 2532
2483 di = (struct ocfs2_dinode *)new_di_bh->b_data; 2533 di = (struct ocfs2_dinode *)new_di_bh->b_data;
2484 status = ocfs2_orphan_add(osb, handle, inode, new_di_bh, orphan_name, 2534 status = ocfs2_orphan_add(osb, handle, inode, new_di_bh, orphan_name,
2485 &orphan_insert, orphan_dir); 2535 &orphan_insert, orphan_dir, false);
2486 if (status < 0) { 2536 if (status < 0) {
2487 mlog_errno(status); 2537 mlog_errno(status);
2488 goto leave; 2538 goto leave;
@@ -2527,6 +2577,186 @@ leave:
2527 return status; 2577 return status;
2528} 2578}
2529 2579
2580static int ocfs2_dio_orphan_recovered(struct inode *inode)
2581{
2582 int ret;
2583 struct buffer_head *di_bh = NULL;
2584 struct ocfs2_dinode *di = NULL;
2585
2586 ret = ocfs2_inode_lock(inode, &di_bh, 1);
2587 if (ret < 0) {
2588 mlog_errno(ret);
2589 return 0;
2590 }
2591
2592 di = (struct ocfs2_dinode *) di_bh->b_data;
2593 ret = !(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL));
2594 ocfs2_inode_unlock(inode, 1);
2595 brelse(di_bh);
2596
2597 return ret;
2598}
2599
2600#define OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL 10000
2601int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
2602 struct inode *inode)
2603{
2604 char orphan_name[OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN + 1];
2605 struct inode *orphan_dir_inode = NULL;
2606 struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
2607 struct buffer_head *di_bh = NULL;
2608 int status = 0;
2609 handle_t *handle = NULL;
2610 struct ocfs2_dinode *di = NULL;
2611
2612restart:
2613 status = ocfs2_inode_lock(inode, &di_bh, 1);
2614 if (status < 0) {
2615 mlog_errno(status);
2616 goto bail;
2617 }
2618
2619 di = (struct ocfs2_dinode *) di_bh->b_data;
2620 /*
2621 * Another append dio crashed?
2622 * If so, wait for recovery first.
2623 */
2624 if (unlikely(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) {
2625 ocfs2_inode_unlock(inode, 1);
2626 brelse(di_bh);
2627 wait_event_interruptible_timeout(OCFS2_I(inode)->append_dio_wq,
2628 ocfs2_dio_orphan_recovered(inode),
2629 msecs_to_jiffies(OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL));
2630 goto restart;
2631 }
2632
2633 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir_inode,
2634 OCFS2_I(inode)->ip_blkno,
2635 orphan_name,
2636 &orphan_insert,
2637 true);
2638 if (status < 0) {
2639 mlog_errno(status);
2640 goto bail_unlock_inode;
2641 }
2642
2643 handle = ocfs2_start_trans(osb,
2644 OCFS2_INODE_ADD_TO_ORPHAN_CREDITS);
2645 if (IS_ERR(handle)) {
2646 status = PTR_ERR(handle);
2647 goto bail_unlock_orphan;
2648 }
2649
2650 status = ocfs2_orphan_add(osb, handle, inode, di_bh, orphan_name,
2651 &orphan_insert, orphan_dir_inode, true);
2652 if (status)
2653 mlog_errno(status);
2654
2655 ocfs2_commit_trans(osb, handle);
2656
2657bail_unlock_orphan:
2658 ocfs2_inode_unlock(orphan_dir_inode, 1);
2659 mutex_unlock(&orphan_dir_inode->i_mutex);
2660 iput(orphan_dir_inode);
2661
2662 ocfs2_free_dir_lookup_result(&orphan_insert);
2663
2664bail_unlock_inode:
2665 ocfs2_inode_unlock(inode, 1);
2666 brelse(di_bh);
2667
2668bail:
2669 return status;
2670}
2671
2672int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
2673 struct inode *inode, int update_isize,
2674 loff_t end)
2675{
2676 struct inode *orphan_dir_inode = NULL;
2677 struct buffer_head *orphan_dir_bh = NULL;
2678 struct buffer_head *di_bh = NULL;
2679 struct ocfs2_dinode *di = NULL;
2680 handle_t *handle = NULL;
2681 int status = 0;
2682
2683 status = ocfs2_inode_lock(inode, &di_bh, 1);
2684 if (status < 0) {
2685 mlog_errno(status);
2686 goto bail;
2687 }
2688 di = (struct ocfs2_dinode *) di_bh->b_data;
2689
2690 orphan_dir_inode = ocfs2_get_system_file_inode(osb,
2691 ORPHAN_DIR_SYSTEM_INODE,
2692 le16_to_cpu(di->i_dio_orphaned_slot));
2693 if (!orphan_dir_inode) {
2694 status = -ENOENT;
2695 mlog_errno(status);
2696 goto bail_unlock_inode;
2697 }
2698
2699 mutex_lock(&orphan_dir_inode->i_mutex);
2700 status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
2701 if (status < 0) {
2702 mutex_unlock(&orphan_dir_inode->i_mutex);
2703 iput(orphan_dir_inode);
2704 mlog_errno(status);
2705 goto bail_unlock_inode;
2706 }
2707
2708 handle = ocfs2_start_trans(osb,
2709 OCFS2_INODE_DEL_FROM_ORPHAN_CREDITS);
2710 if (IS_ERR(handle)) {
2711 status = PTR_ERR(handle);
2712 goto bail_unlock_orphan;
2713 }
2714
2715 BUG_ON(!(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL)));
2716
2717 status = ocfs2_orphan_del(osb, handle, orphan_dir_inode,
2718 inode, orphan_dir_bh, true);
2719 if (status < 0) {
2720 mlog_errno(status);
2721 goto bail_commit;
2722 }
2723
2724 status = ocfs2_journal_access_di(handle,
2725 INODE_CACHE(inode),
2726 di_bh,
2727 OCFS2_JOURNAL_ACCESS_WRITE);
2728 if (status < 0) {
2729 mlog_errno(status);
2730 goto bail_commit;
2731 }
2732
2733 di->i_flags &= ~cpu_to_le32(OCFS2_DIO_ORPHANED_FL);
2734 di->i_dio_orphaned_slot = 0;
2735
2736 if (update_isize) {
2737 status = ocfs2_set_inode_size(handle, inode, di_bh, end);
2738 if (status)
2739 mlog_errno(status);
2740 } else
2741 ocfs2_journal_dirty(handle, di_bh);
2742
2743bail_commit:
2744 ocfs2_commit_trans(osb, handle);
2745
2746bail_unlock_orphan:
2747 ocfs2_inode_unlock(orphan_dir_inode, 1);
2748 mutex_unlock(&orphan_dir_inode->i_mutex);
2749 brelse(orphan_dir_bh);
2750 iput(orphan_dir_inode);
2751
2752bail_unlock_inode:
2753 ocfs2_inode_unlock(inode, 1);
2754 brelse(di_bh);
2755
2756bail:
2757 return status;
2758}
2759
2530int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, 2760int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
2531 struct inode *inode, 2761 struct inode *inode,
2532 struct dentry *dentry) 2762 struct dentry *dentry)
@@ -2615,7 +2845,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
2615 } 2845 }
2616 2846
2617 status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode, 2847 status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode,
2618 orphan_dir_bh); 2848 orphan_dir_bh, false);
2619 if (status < 0) { 2849 if (status < 0) {
2620 mlog_errno(status); 2850 mlog_errno(status);
2621 goto out_commit; 2851 goto out_commit;
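Note: putting the two new entry points together, the intended
append-dio sequence is roughly the following (hypothetical caller;
the helpers are the ones added above):

	/* orphan the inode before extending it with O_DIRECT, so a
	 * crash mid-write leaves something recovery can truncate */
	ret = ocfs2_add_inode_to_orphan(osb, inode);
	if (ret < 0)
		return ret;

	written = generic_file_direct_write(iocb, from, *ppos);

	/* un-orphan; on success also push i_size out to the new end */
	ret = ocfs2_del_inode_from_orphan(osb, inode,
					  written > 0 /* update_isize */,
					  *ppos + written);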
diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h
index e5d059d4f115..5ddecce172fa 100644
--- a/fs/ocfs2/namei.h
+++ b/fs/ocfs2/namei.h
@@ -34,10 +34,16 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
34 handle_t *handle, 34 handle_t *handle,
35 struct inode *orphan_dir_inode, 35 struct inode *orphan_dir_inode,
36 struct inode *inode, 36 struct inode *inode,
37 struct buffer_head *orphan_dir_bh); 37 struct buffer_head *orphan_dir_bh,
38 bool dio);
38int ocfs2_create_inode_in_orphan(struct inode *dir, 39int ocfs2_create_inode_in_orphan(struct inode *dir,
39 int mode, 40 int mode,
40 struct inode **new_inode); 41 struct inode **new_inode);
42int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
43 struct inode *inode);
44int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
45 struct inode *inode, int update_isize,
46 loff_t end);
41int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, 47int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
42 struct inode *new_inode, 48 struct inode *new_inode,
43 struct dentry *new_dentry); 49 struct dentry *new_dentry);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 7d6b7d090452..8490c64d34fe 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -209,6 +209,11 @@ struct ocfs2_lock_res {
209#endif 209#endif
210}; 210};
211 211
212enum ocfs2_orphan_reco_type {
213 ORPHAN_NO_NEED_TRUNCATE = 0,
214 ORPHAN_NEED_TRUNCATE,
215};
216
212enum ocfs2_orphan_scan_state { 217enum ocfs2_orphan_scan_state {
213 ORPHAN_SCAN_ACTIVE, 218 ORPHAN_SCAN_ACTIVE,
214 ORPHAN_SCAN_INACTIVE 219 ORPHAN_SCAN_INACTIVE
@@ -279,6 +284,8 @@ enum ocfs2_mount_options
279 writes */ 284 writes */
280 OCFS2_MOUNT_HB_NONE = 1 << 13, /* No heartbeat */ 285 OCFS2_MOUNT_HB_NONE = 1 << 13, /* No heartbeat */
281 OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */ 286 OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */
287
288 OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT = 1 << 15, /* Journal Async Commit */
282}; 289};
283 290
284#define OCFS2_OSB_SOFT_RO 0x0001 291#define OCFS2_OSB_SOFT_RO 0x0001
@@ -493,6 +500,14 @@ static inline int ocfs2_writes_unwritten_extents(struct ocfs2_super *osb)
493 return 0; 500 return 0;
494} 501}
495 502
503static inline int ocfs2_supports_append_dio(struct ocfs2_super *osb)
504{
505 if (osb->s_feature_ro_compat & OCFS2_FEATURE_RO_COMPAT_APPEND_DIO)
506 return 1;
507 return 0;
508}
509
510
496static inline int ocfs2_supports_inline_data(struct ocfs2_super *osb) 511static inline int ocfs2_supports_inline_data(struct ocfs2_super *osb)
497{ 512{
498 if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_INLINE_DATA) 513 if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_INLINE_DATA)
@@ -724,6 +739,16 @@ static inline unsigned int ocfs2_clusters_for_bytes(struct super_block *sb,
724 return clusters; 739 return clusters;
725} 740}
726 741
742static inline unsigned int ocfs2_bytes_to_clusters(struct super_block *sb,
743 u64 bytes)
744{
745 int cl_bits = OCFS2_SB(sb)->s_clustersize_bits;
746 unsigned int clusters;
747
748 clusters = (unsigned int)(bytes >> cl_bits);
749 return clusters;
750}
751
727static inline u64 ocfs2_blocks_for_bytes(struct super_block *sb, 752static inline u64 ocfs2_blocks_for_bytes(struct super_block *sb,
728 u64 bytes) 753 u64 bytes)
729{ 754{
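Note: unlike the existing ocfs2_clusters_for_bytes(), which rounds up,
the new helper truncates; a worked example (sb hypothetical):

	/* with s_clustersize_bits == 20, i.e. 1 MiB clusters:
	 *   ocfs2_bytes_to_clusters(sb, (5ULL << 20) + 1)  == 5
	 *   ocfs2_clusters_for_bytes(sb, (5ULL << 20) + 1) == 6
	 */
	unsigned int c = ocfs2_bytes_to_clusters(sb, (5ULL << 20) + 1);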
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 938387a10d5d..20e37a3ed26f 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -105,7 +105,8 @@
105 | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO) 105 | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO)
106#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ 106#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
107 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ 107 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
108 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) 108 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA \
109 | OCFS2_FEATURE_RO_COMPAT_APPEND_DIO)
109 110
110/* 111/*
111 * Heartbeat-only devices are missing journals and other files. The 112 * Heartbeat-only devices are missing journals and other files. The
@@ -199,6 +200,11 @@
199#define OCFS2_FEATURE_RO_COMPAT_USRQUOTA 0x0002 200#define OCFS2_FEATURE_RO_COMPAT_USRQUOTA 0x0002
200#define OCFS2_FEATURE_RO_COMPAT_GRPQUOTA 0x0004 201#define OCFS2_FEATURE_RO_COMPAT_GRPQUOTA 0x0004
201 202
203/*
204 * Append Direct IO support
205 */
206#define OCFS2_FEATURE_RO_COMPAT_APPEND_DIO 0x0008
207
202/* The byte offset of the first backup block will be 1G. 208/* The byte offset of the first backup block will be 1G.
203 * The following will be 4G, 16G, 64G, 256G and 1T. 209 * The following will be 4G, 16G, 64G, 256G and 1T.
204 */ 210 */
@@ -229,6 +235,8 @@
229#define OCFS2_CHAIN_FL (0x00000400) /* Chain allocator */ 235#define OCFS2_CHAIN_FL (0x00000400) /* Chain allocator */
230#define OCFS2_DEALLOC_FL (0x00000800) /* Truncate log */ 236#define OCFS2_DEALLOC_FL (0x00000800) /* Truncate log */
231#define OCFS2_QUOTA_FL (0x00001000) /* Quota file */ 237#define OCFS2_QUOTA_FL (0x00001000) /* Quota file */
238#define OCFS2_DIO_ORPHANED_FL (0x00002000) /* On the orphan list specifically
239 * for dio */
232 240
233/* 241/*
234 * Flags on ocfs2_dinode.i_dyn_features 242 * Flags on ocfs2_dinode.i_dyn_features
@@ -729,7 +737,9 @@ struct ocfs2_dinode {
729 inode belongs to. Only valid 737 inode belongs to. Only valid
730 if allocated from a 738 if allocated from a
731 discontiguous block group */ 739 discontiguous block group */
732/*A0*/ __le64 i_reserved2[3]; 740/*A0*/ __le16 i_dio_orphaned_slot; /* only used for append dio write */
741 __le16 i_reserved1[3];
742 __le64 i_reserved2[2];
733/*B8*/ union { 743/*B8*/ union {
734 __le64 i_pad1; /* Generic way to refer to this 744 __le64 i_pad1; /* Generic way to refer to this
735 64bit union */ 745 64bit union */
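Note: the reworked reserved area at offset 0xA0 stays 24 bytes, so the
on-disk dinode layout is unchanged; a compile-time check (sketch, to
be placed inside any function):

	/* old: 3 * sizeof(__le64)                        = 24
	 * new: 4 * sizeof(__le16) + 2 * sizeof(__le64)
	 *      = 8 + 16                                  = 24 */
	BUILD_BUG_ON(4 * sizeof(__le16) + 2 * sizeof(__le64) !=
		     3 * sizeof(__le64));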
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index 1eae330193a6..b6d51333ad02 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -48,6 +48,7 @@ struct ocfs2_quota_recovery {
48/* In-memory structure with quota header information */ 48/* In-memory structure with quota header information */
49struct ocfs2_mem_dqinfo { 49struct ocfs2_mem_dqinfo {
50 unsigned int dqi_type; /* Quota type this structure describes */ 50 unsigned int dqi_type; /* Quota type this structure describes */
51 unsigned int dqi_flags; /* Flags OLQF_* */
51 unsigned int dqi_chunks; /* Number of chunks in local quota file */ 52 unsigned int dqi_chunks; /* Number of chunks in local quota file */
52 unsigned int dqi_blocks; /* Number of blocks allocated for local quota file */ 53 unsigned int dqi_blocks; /* Number of blocks allocated for local quota file */
53 unsigned int dqi_syncms; /* How often should we sync with other nodes */ 54 unsigned int dqi_syncms; /* How often should we sync with other nodes */
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 10b653930ee2..3d0b63d34225 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -73,12 +73,6 @@ static loff_t ol_dqblk_off(struct super_block *sb, int c, int off)
73 ol_dqblk_block_off(sb, c, off); 73 ol_dqblk_block_off(sb, c, off);
74} 74}
75 75
76/* Compute block number from given offset */
77static inline unsigned int ol_dqblk_file_block(struct super_block *sb, loff_t off)
78{
79 return off >> sb->s_blocksize_bits;
80}
81
82static inline unsigned int ol_dqblk_block_offset(struct super_block *sb, loff_t off) 76static inline unsigned int ol_dqblk_block_offset(struct super_block *sb, loff_t off)
83{ 77{
84 return off & ((1 << sb->s_blocksize_bits) - 1); 78 return off & ((1 << sb->s_blocksize_bits) - 1);
@@ -292,7 +286,7 @@ static void olq_update_info(struct buffer_head *bh, void *private)
292 ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data + 286 ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
293 OCFS2_LOCAL_INFO_OFF); 287 OCFS2_LOCAL_INFO_OFF);
294 spin_lock(&dq_data_lock); 288 spin_lock(&dq_data_lock);
295 ldinfo->dqi_flags = cpu_to_le32(info->dqi_flags & DQF_MASK); 289 ldinfo->dqi_flags = cpu_to_le32(oinfo->dqi_flags);
296 ldinfo->dqi_chunks = cpu_to_le32(oinfo->dqi_chunks); 290 ldinfo->dqi_chunks = cpu_to_le32(oinfo->dqi_chunks);
297 ldinfo->dqi_blocks = cpu_to_le32(oinfo->dqi_blocks); 291 ldinfo->dqi_blocks = cpu_to_le32(oinfo->dqi_blocks);
298 spin_unlock(&dq_data_lock); 292 spin_unlock(&dq_data_lock);
@@ -701,8 +695,8 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
701 /* We don't need the lock and we have to acquire quota file locks 695 /* We don't need the lock and we have to acquire quota file locks
702 * which will later depend on this lock */ 696 * which will later depend on this lock */
703 mutex_unlock(&sb_dqopt(sb)->dqio_mutex); 697 mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
704 info->dqi_maxblimit = 0x7fffffffffffffffLL; 698 info->dqi_max_spc_limit = 0x7fffffffffffffffLL;
705 info->dqi_maxilimit = 0x7fffffffffffffffLL; 699 info->dqi_max_ino_limit = 0x7fffffffffffffffLL;
706 oinfo = kmalloc(sizeof(struct ocfs2_mem_dqinfo), GFP_NOFS); 700 oinfo = kmalloc(sizeof(struct ocfs2_mem_dqinfo), GFP_NOFS);
707 if (!oinfo) { 701 if (!oinfo) {
708 mlog(ML_ERROR, "failed to allocate memory for ocfs2 quota" 702 mlog(ML_ERROR, "failed to allocate memory for ocfs2 quota"
@@ -737,13 +731,13 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
737 } 731 }
738 ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data + 732 ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
739 OCFS2_LOCAL_INFO_OFF); 733 OCFS2_LOCAL_INFO_OFF);
740 info->dqi_flags = le32_to_cpu(ldinfo->dqi_flags); 734 oinfo->dqi_flags = le32_to_cpu(ldinfo->dqi_flags);
741 oinfo->dqi_chunks = le32_to_cpu(ldinfo->dqi_chunks); 735 oinfo->dqi_chunks = le32_to_cpu(ldinfo->dqi_chunks);
742 oinfo->dqi_blocks = le32_to_cpu(ldinfo->dqi_blocks); 736 oinfo->dqi_blocks = le32_to_cpu(ldinfo->dqi_blocks);
743 oinfo->dqi_libh = bh; 737 oinfo->dqi_libh = bh;
744 738
745 /* Did we crash while using the local quota file? */ 739 /* Did we crash while using the local quota file? */
746 if (!(info->dqi_flags & OLQF_CLEAN)) { 740 if (!(oinfo->dqi_flags & OLQF_CLEAN)) {
747 rec = OCFS2_SB(sb)->quota_rec; 741 rec = OCFS2_SB(sb)->quota_rec;
748 if (!rec) { 742 if (!rec) {
749 rec = ocfs2_alloc_quota_recovery(); 743 rec = ocfs2_alloc_quota_recovery();
@@ -772,7 +766,7 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
772 } 766 }
773 767
774 /* Now mark quota file as used */ 768 /* Now mark quota file as used */
775 info->dqi_flags &= ~OLQF_CLEAN; 769 oinfo->dqi_flags &= ~OLQF_CLEAN;
776 status = ocfs2_modify_bh(lqinode, bh, olq_update_info, info); 770 status = ocfs2_modify_bh(lqinode, bh, olq_update_info, info);
777 if (status < 0) { 771 if (status < 0) {
778 mlog_errno(status); 772 mlog_errno(status);
@@ -857,7 +851,7 @@ static int ocfs2_local_free_info(struct super_block *sb, int type)
857 goto out; 851 goto out;
858 852
859 /* Mark local file as clean */ 853 /* Mark local file as clean */
860 info->dqi_flags |= OLQF_CLEAN; 854 oinfo->dqi_flags |= OLQF_CLEAN;
861 status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], 855 status = ocfs2_modify_bh(sb_dqopt(sb)->files[type],
862 oinfo->dqi_libh, 856 oinfo->dqi_libh,
863 olq_update_info, 857 olq_update_info,
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index d81f6e2a97f5..ee541f92dab4 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2428,8 +2428,6 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
2428 get_bh(prev_bh); 2428 get_bh(prev_bh);
2429 } 2429 }
2430 2430
2431 rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
2432
2433 trace_ocfs2_calc_refcount_meta_credits_iterate( 2431 trace_ocfs2_calc_refcount_meta_credits_iterate(
2434 recs_add, (unsigned long long)cpos, clusters, 2432 recs_add, (unsigned long long)cpos, clusters,
2435 (unsigned long long)le64_to_cpu(rec.r_cpos), 2433 (unsigned long long)le64_to_cpu(rec.r_cpos),
diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c
index 41ffd36c689c..6a348b0294ab 100644
--- a/fs/ocfs2/reservations.c
+++ b/fs/ocfs2/reservations.c
@@ -39,7 +39,7 @@
39#define OCFS2_CHECK_RESERVATIONS 39#define OCFS2_CHECK_RESERVATIONS
40#endif 40#endif
41 41
42DEFINE_SPINLOCK(resv_lock); 42static DEFINE_SPINLOCK(resv_lock);
43 43
44#define OCFS2_MIN_RESV_WINDOW_BITS 8 44#define OCFS2_MIN_RESV_WINDOW_BITS 8
45#define OCFS2_MAX_RESV_WINDOW_BITS 1024 45#define OCFS2_MAX_RESV_WINDOW_BITS 1024
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 83723179e1ec..26675185b886 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -191,6 +191,7 @@ enum {
191 Opt_coherency_full, 191 Opt_coherency_full,
192 Opt_resv_level, 192 Opt_resv_level,
193 Opt_dir_resv_level, 193 Opt_dir_resv_level,
194 Opt_journal_async_commit,
194 Opt_err, 195 Opt_err,
195}; 196};
196 197
@@ -222,6 +223,7 @@ static const match_table_t tokens = {
222 {Opt_coherency_full, "coherency=full"}, 223 {Opt_coherency_full, "coherency=full"},
223 {Opt_resv_level, "resv_level=%u"}, 224 {Opt_resv_level, "resv_level=%u"},
224 {Opt_dir_resv_level, "dir_resv_level=%u"}, 225 {Opt_dir_resv_level, "dir_resv_level=%u"},
226 {Opt_journal_async_commit, "journal_async_commit"},
225 {Opt_err, NULL} 227 {Opt_err, NULL}
226}; 228};
227 229
@@ -1000,36 +1002,6 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
1000 } 1002 }
1001} 1003}
1002 1004
1003/* Handle quota on quotactl */
1004static int ocfs2_quota_on(struct super_block *sb, int type, int format_id)
1005{
1006 unsigned int feature[OCFS2_MAXQUOTAS] = {
1007 OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
1008 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
1009
1010 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
1011 return -EINVAL;
1012
1013 return dquot_enable(sb_dqopt(sb)->files[type], type,
1014 format_id, DQUOT_LIMITS_ENABLED);
1015}
1016
1017/* Handle quota off quotactl */
1018static int ocfs2_quota_off(struct super_block *sb, int type)
1019{
1020 return dquot_disable(sb, type, DQUOT_LIMITS_ENABLED);
1021}
1022
1023static const struct quotactl_ops ocfs2_quotactl_ops = {
1024 .quota_on_meta = ocfs2_quota_on,
1025 .quota_off = ocfs2_quota_off,
1026 .quota_sync = dquot_quota_sync,
1027 .get_info = dquot_get_dqinfo,
1028 .set_info = dquot_set_dqinfo,
1029 .get_dqblk = dquot_get_dqblk,
1030 .set_dqblk = dquot_set_dqblk,
1031};
1032
1033static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) 1005static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
1034{ 1006{
1035 struct dentry *root; 1007 struct dentry *root;
@@ -1500,6 +1472,9 @@ static int ocfs2_parse_options(struct super_block *sb,
1500 option < OCFS2_MAX_RESV_LEVEL) 1472 option < OCFS2_MAX_RESV_LEVEL)
1501 mopt->dir_resv_level = option; 1473 mopt->dir_resv_level = option;
1502 break; 1474 break;
1475 case Opt_journal_async_commit:
1476 mopt->mount_opt |= OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT;
1477 break;
1503 default: 1478 default:
1504 mlog(ML_ERROR, 1479 mlog(ML_ERROR,
1505 "Unrecognized mount option \"%s\" " 1480 "Unrecognized mount option \"%s\" "
@@ -1606,6 +1581,9 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root)
1606 if (osb->osb_dir_resv_level != osb->osb_resv_level) 1581 if (osb->osb_dir_resv_level != osb->osb_resv_level)
1607 seq_printf(s, ",dir_resv_level=%d", osb->osb_resv_level); 1582 seq_printf(s, ",dir_resv_level=%d", osb->osb_resv_level);
1608 1583
1584 if (opts & OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT)
1585 seq_printf(s, ",journal_async_commit");
1586
1609 return 0; 1587 return 0;
1610} 1588}
1611 1589
@@ -1768,6 +1746,8 @@ static void ocfs2_inode_init_once(void *data)
1768 ocfs2_lock_res_init_once(&oi->ip_inode_lockres); 1746 ocfs2_lock_res_init_once(&oi->ip_inode_lockres);
1769 ocfs2_lock_res_init_once(&oi->ip_open_lockres); 1747 ocfs2_lock_res_init_once(&oi->ip_open_lockres);
1770 1748
1749 init_waitqueue_head(&oi->append_dio_wq);
1750
1771 ocfs2_metadata_cache_init(INODE_CACHE(&oi->vfs_inode), 1751 ocfs2_metadata_cache_init(INODE_CACHE(&oi->vfs_inode),
1772 &ocfs2_inode_caching_ops); 1752 &ocfs2_inode_caching_ops);
1773 1753
@@ -2079,7 +2059,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
2079 sb->s_op = &ocfs2_sops; 2059 sb->s_op = &ocfs2_sops;
2080 sb->s_d_op = &ocfs2_dentry_ops; 2060 sb->s_d_op = &ocfs2_dentry_ops;
2081 sb->s_export_op = &ocfs2_export_ops; 2061 sb->s_export_op = &ocfs2_export_ops;
2082 sb->s_qcop = &ocfs2_quotactl_ops; 2062 sb->s_qcop = &dquot_quotactl_sysfile_ops;
2083 sb->dq_op = &ocfs2_quota_operations; 2063 sb->dq_op = &ocfs2_quota_operations;
2084 sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; 2064 sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
2085 sb->s_xattr = ocfs2_xattr_handlers; 2065 sb->s_xattr = ocfs2_xattr_handlers;
@@ -2475,6 +2455,15 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
2475 goto finally; 2455 goto finally;
2476 } 2456 }
2477 2457
2458 if (osb->s_mount_opt & OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT)
2459 jbd2_journal_set_features(osb->journal->j_journal,
2460 JBD2_FEATURE_COMPAT_CHECKSUM, 0,
2461 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
2462 else
2463 jbd2_journal_clear_features(osb->journal->j_journal,
2464 JBD2_FEATURE_COMPAT_CHECKSUM, 0,
2465 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
2466
2478 if (dirty) { 2467 if (dirty) {
2479 /* recover my local alloc if we didn't unmount cleanly. */ 2468 /* recover my local alloc if we didn't unmount cleanly. */
2480 status = ocfs2_begin_local_alloc_recovery(osb, 2469 status = ocfs2_begin_local_alloc_recovery(osb,
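
Note on the ocfs2/super.c hunks above: the new journal_async_commit option maps directly onto jbd2's async-commit feature. JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT lets the commit block be submitted without waiting for the rest of the transaction to reach stable storage, relying on the paired JBD2_FEATURE_COMPAT_CHECKSUM to detect a torn transaction at recovery, which is why the two are set and cleared together in ocfs2_check_volume(). (Incidentally, the unchanged context at old line 1607 prints osb->osb_resv_level under the dir_resv_level= key; that looks like a pre-existing typo in ocfs2_show_options(), not something this patch introduces.) A minimal userspace illustration of the new option; device and mountpoint are placeholders:

    #include <stdio.h>
    #include <sys/mount.h>

    int main(void)
    {
            /* The option string is the one matched by the tokens table
             * added above; paths are illustrative only. */
            if (mount("/dev/sdb1", "/mnt/ocfs2", "ocfs2", 0,
                      "journal_async_commit") != 0)
                    perror("mount");
            return 0;
    }
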
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 662f8dee149f..85b190dc132f 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -5334,16 +5334,6 @@ out:
5334 return ret; 5334 return ret;
5335} 5335}
5336 5336
5337static inline char *ocfs2_xattr_bucket_get_val(struct inode *inode,
5338 struct ocfs2_xattr_bucket *bucket,
5339 int offs)
5340{
5341 int block_off = offs >> inode->i_sb->s_blocksize_bits;
5342
5343 offs = offs % inode->i_sb->s_blocksize;
5344 return bucket_block(bucket, block_off) + offs;
5345}
5346
5347/* 5337/*
5348 * Truncate the specified xe_off entry in xattr bucket. 5338 * Truncate the specified xe_off entry in xattr bucket.
5349 * bucket is indicated by header_bh and len is the new length. 5339 * bucket is indicated by header_bh and len is the new length.
diff --git a/fs/open.c b/fs/open.c
index 813be037b412..33f9cbf2610b 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -667,11 +667,8 @@ int open_check_o_direct(struct file *f)
667{ 667{
668 /* NB: we're sure to have correct a_ops only after f_op->open */ 668 /* NB: we're sure to have correct a_ops only after f_op->open */
669 if (f->f_flags & O_DIRECT) { 669 if (f->f_flags & O_DIRECT) {
670 if (!f->f_mapping->a_ops || 670 if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO)
671 ((!f->f_mapping->a_ops->direct_IO) &&
672 (!f->f_mapping->a_ops->get_xip_mem))) {
673 return -EINVAL; 671 return -EINVAL;
674 }
675 } 672 }
676 return 0; 673 return 0;
677} 674}
@@ -971,8 +968,14 @@ struct file *file_open_name(struct filename *name, int flags, umode_t mode)
971 */ 968 */
972struct file *filp_open(const char *filename, int flags, umode_t mode) 969struct file *filp_open(const char *filename, int flags, umode_t mode)
973{ 970{
974 struct filename name = {.name = filename}; 971 struct filename *name = getname_kernel(filename);
975 return file_open_name(&name, flags, mode); 972 struct file *file = ERR_CAST(name);
973
974 if (!IS_ERR(name)) {
975 file = file_open_name(name, flags, mode);
976 putname(name);
977 }
978 return file;
976} 979}
977EXPORT_SYMBOL(filp_open); 980EXPORT_SYMBOL(filp_open);
978 981
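
The filp_open() rewrite above exists because struct filename is no longer a thin wrapper around a string: routing through getname_kernel() means every struct filename is allocated and released by the same getname()/putname() pair, which the audit code relies on, whereas the old stack-local struct filename bypassed that. The caller-visible contract is unchanged; a sketch of a kernel-side caller, with an illustrative path:

    /* Illustrative fragment; only filp_open()'s internals changed. */
    struct file *filp = filp_open("/etc/example.conf", O_RDONLY, 0);

    if (IS_ERR(filp))
            return PTR_ERR(filp);
    /* ... use filp, e.g. via kernel_read() ... */
    fput(filp);
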
diff --git a/fs/proc/array.c b/fs/proc/array.c
index bd117d065b82..1295a00ca316 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -81,6 +81,7 @@
81#include <linux/pid_namespace.h> 81#include <linux/pid_namespace.h>
82#include <linux/ptrace.h> 82#include <linux/ptrace.h>
83#include <linux/tracehook.h> 83#include <linux/tracehook.h>
84#include <linux/string_helpers.h>
84#include <linux/user_namespace.h> 85#include <linux/user_namespace.h>
85 86
86#include <asm/pgtable.h> 87#include <asm/pgtable.h>
@@ -89,39 +90,18 @@
89 90
90static inline void task_name(struct seq_file *m, struct task_struct *p) 91static inline void task_name(struct seq_file *m, struct task_struct *p)
91{ 92{
92 int i; 93 char *buf;
93 char *buf, *end;
94 char *name;
95 char tcomm[sizeof(p->comm)]; 94 char tcomm[sizeof(p->comm)];
96 95
97 get_task_comm(tcomm, p); 96 get_task_comm(tcomm, p);
98 97
99 seq_puts(m, "Name:\t"); 98 seq_puts(m, "Name:\t");
100 end = m->buf + m->size;
101 buf = m->buf + m->count; 99 buf = m->buf + m->count;
102 name = tcomm; 100
103 i = sizeof(tcomm); 101 /* Ignore error for now */
104 while (i && (buf < end)) { 102 string_escape_str(tcomm, &buf, m->size - m->count,
105 unsigned char c = *name; 103 ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
106 name++; 104
107 i--;
108 *buf = c;
109 if (!c)
110 break;
111 if (c == '\\') {
112 buf++;
113 if (buf < end)
114 *buf++ = c;
115 continue;
116 }
117 if (c == '\n') {
118 *buf++ = '\\';
119 if (buf < end)
120 *buf++ = 'n';
121 continue;
122 }
123 buf++;
124 }
125 m->count = buf - m->buf; 105 m->count = buf - m->buf;
126 seq_putc(m, '\n'); 106 seq_putc(m, '\n');
127} 107}
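
The hand-rolled loop above is replaced by string_escape_str() from <linux/string_helpers.h>. With ESCAPE_SPACE | ESCAPE_SPECIAL restricted by the only-list "\n\\", exactly two characters are rewritten: a newline becomes the two bytes '\' 'n' and a backslash is doubled. A userspace model of that behavior, assuming the only-list semantics described in lib/string_helpers.c:

    #include <stdio.h>

    /* Model of string_escape_str(tcomm, ..., ESCAPE_SPACE | ESCAPE_SPECIAL,
     * "\n\\"): only '\n' and '\\' are transformed, all else is copied. */
    static void escape_comm(const char *src, char *dst)
    {
            for (; *src; src++) {
                    if (*src == '\n') {
                            *dst++ = '\\';
                            *dst++ = 'n';
                    } else if (*src == '\\') {
                            *dst++ = '\\';
                            *dst++ = '\\';
                    } else {
                            *dst++ = *src;
                    }
            }
            *dst = '\0';
    }

    int main(void)
    {
            char out[64];   /* worst case doubles the input; comm is <= 16 bytes */

            escape_comm("evil\ncomm", out);
            printf("%s\n", out);    /* prints: evil\ncomm, with a literal backslash-n */
            return 0;
    }
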
@@ -336,12 +316,10 @@ static inline void task_context_switch_counts(struct seq_file *m,
336 316
337static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) 317static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
338{ 318{
339 seq_puts(m, "Cpus_allowed:\t"); 319 seq_printf(m, "Cpus_allowed:\t%*pb\n",
340 seq_cpumask(m, &task->cpus_allowed); 320 cpumask_pr_args(&task->cpus_allowed));
341 seq_putc(m, '\n'); 321 seq_printf(m, "Cpus_allowed_list:\t%*pbl\n",
342 seq_puts(m, "Cpus_allowed_list:\t"); 322 cpumask_pr_args(&task->cpus_allowed));
343 seq_cpumask_list(m, &task->cpus_allowed);
344 seq_putc(m, '\n');
345} 323}
346 324
347int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, 325int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
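
%*pb and %*pbl are the vsnprintf() bitmap extensions added in this cycle; cpumask_pr_args(mask) expands to the pair "nr_cpu_ids, cpumask_bits(mask)", supplying the field width and pointer those specifiers expect. The same specifiers work anywhere kernel printf formatting does, for example:

    /* Sketch: hex-bitmap vs. range-list rendering of the online cpumask. */
    pr_info("online: %*pb\n",  cpumask_pr_args(cpu_online_mask));  /* e.g. "ff"  */
    pr_info("online: %*pbl\n", cpumask_pr_args(cpu_online_mask));  /* e.g. "0-7" */
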
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 7fea13229f33..3309f59d421b 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -122,7 +122,7 @@ static int proc_getattr(struct vfsmount *mnt, struct dentry *dentry,
122 struct kstat *stat) 122 struct kstat *stat)
123{ 123{
124 struct inode *inode = dentry->d_inode; 124 struct inode *inode = dentry->d_inode;
125 struct proc_dir_entry *de = PROC_I(inode)->pde; 125 struct proc_dir_entry *de = PDE(inode);
126 if (de && de->nlink) 126 if (de && de->nlink)
127 set_nlink(inode, de->nlink); 127 set_nlink(inode, de->nlink);
128 128
@@ -350,29 +350,12 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
350 if (ret) 350 if (ret)
351 return ret; 351 return ret;
352 352
353 if (S_ISDIR(dp->mode)) {
354 dp->proc_fops = &proc_dir_operations;
355 dp->proc_iops = &proc_dir_inode_operations;
356 dir->nlink++;
357 } else if (S_ISLNK(dp->mode)) {
358 dp->proc_iops = &proc_link_inode_operations;
359 } else if (S_ISREG(dp->mode)) {
360 BUG_ON(dp->proc_fops == NULL);
361 dp->proc_iops = &proc_file_inode_operations;
362 } else {
363 WARN_ON(1);
364 proc_free_inum(dp->low_ino);
365 return -EINVAL;
366 }
367
368 spin_lock(&proc_subdir_lock); 353 spin_lock(&proc_subdir_lock);
369 dp->parent = dir; 354 dp->parent = dir;
370 if (pde_subdir_insert(dir, dp) == false) { 355 if (pde_subdir_insert(dir, dp) == false) {
371 WARN(1, "proc_dir_entry '%s/%s' already registered\n", 356 WARN(1, "proc_dir_entry '%s/%s' already registered\n",
372 dir->name, dp->name); 357 dir->name, dp->name);
373 spin_unlock(&proc_subdir_lock); 358 spin_unlock(&proc_subdir_lock);
374 if (S_ISDIR(dp->mode))
375 dir->nlink--;
376 proc_free_inum(dp->low_ino); 359 proc_free_inum(dp->low_ino);
377 return -EEXIST; 360 return -EEXIST;
378 } 361 }
@@ -431,6 +414,7 @@ struct proc_dir_entry *proc_symlink(const char *name,
431 ent->data = kmalloc((ent->size=strlen(dest))+1, GFP_KERNEL); 414 ent->data = kmalloc((ent->size=strlen(dest))+1, GFP_KERNEL);
432 if (ent->data) { 415 if (ent->data) {
433 strcpy((char*)ent->data,dest); 416 strcpy((char*)ent->data,dest);
417 ent->proc_iops = &proc_link_inode_operations;
434 if (proc_register(parent, ent) < 0) { 418 if (proc_register(parent, ent) < 0) {
435 kfree(ent->data); 419 kfree(ent->data);
436 kfree(ent); 420 kfree(ent);
@@ -456,8 +440,12 @@ struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode,
456 ent = __proc_create(&parent, name, S_IFDIR | mode, 2); 440 ent = __proc_create(&parent, name, S_IFDIR | mode, 2);
457 if (ent) { 441 if (ent) {
458 ent->data = data; 442 ent->data = data;
443 ent->proc_fops = &proc_dir_operations;
444 ent->proc_iops = &proc_dir_inode_operations;
445 parent->nlink++;
459 if (proc_register(parent, ent) < 0) { 446 if (proc_register(parent, ent) < 0) {
460 kfree(ent); 447 kfree(ent);
448 parent->nlink--;
461 ent = NULL; 449 ent = NULL;
462 } 450 }
463 } 451 }
@@ -493,6 +481,8 @@ struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
493 return NULL; 481 return NULL;
494 } 482 }
495 483
484 BUG_ON(proc_fops == NULL);
485
496 if ((mode & S_IALLUGO) == 0) 486 if ((mode & S_IALLUGO) == 0)
497 mode |= S_IRUGO; 487 mode |= S_IRUGO;
498 pde = __proc_create(&parent, name, mode, 1); 488 pde = __proc_create(&parent, name, mode, 1);
@@ -500,6 +490,7 @@ struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
500 goto out; 490 goto out;
501 pde->proc_fops = proc_fops; 491 pde->proc_fops = proc_fops;
502 pde->data = data; 492 pde->data = data;
493 pde->proc_iops = &proc_file_inode_operations;
503 if (proc_register(parent, pde) < 0) 494 if (proc_register(parent, pde) < 0)
504 goto out_free; 495 goto out_free;
505 return pde; 496 return pde;
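
After this refactor the inode and file operations are attached by the specific creator (proc_mkdir_data(), proc_symlink(), proc_create_data()) instead of being inferred from the mode bits inside proc_register(), and a NULL proc_fops now trips the BUG_ON() above rather than being accepted silently. The usual creation pattern is unchanged; a minimal sketch against the 3.x-era procfs API, with hypothetical names:

    #include <linux/module.h>
    #include <linux/proc_fs.h>
    #include <linux/seq_file.h>

    static int example_show(struct seq_file *m, void *v)
    {
            seq_puts(m, "hello\n");
            return 0;
    }

    static int example_open(struct inode *inode, struct file *file)
    {
            return single_open(file, example_show, NULL);
    }

    static const struct file_operations example_fops = {
            .owner   = THIS_MODULE,
            .open    = example_open,
            .read    = seq_read,
            .llseek  = seq_lseek,
            .release = single_release,
    };

    static int __init example_init(void)
    {
            /* proc_create() forwards to proc_create_data(); the fops
             * pointer must not be NULL after this patch. */
            return proc_create("example", 0444, NULL, &example_fops) ? 0 : -ENOMEM;
    }
    module_init(example_init);
    MODULE_LICENSE("GPL");
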
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 8420a2f80811..13a50a32652d 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -40,7 +40,7 @@ static void proc_evict_inode(struct inode *inode)
40 put_pid(PROC_I(inode)->pid); 40 put_pid(PROC_I(inode)->pid);
41 41
42 /* Let go of any associated proc directory entry */ 42 /* Let go of any associated proc directory entry */
43 de = PROC_I(inode)->pde; 43 de = PDE(inode);
44 if (de) 44 if (de)
45 pde_put(de); 45 pde_put(de);
46 head = PROC_I(inode)->sysctl; 46 head = PROC_I(inode)->sysctl;
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 1e3187da1fed..7eee2d8b97d9 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -5,6 +5,7 @@
5#include <linux/ksm.h> 5#include <linux/ksm.h>
6#include <linux/mm.h> 6#include <linux/mm.h>
7#include <linux/mmzone.h> 7#include <linux/mmzone.h>
8#include <linux/huge_mm.h>
8#include <linux/proc_fs.h> 9#include <linux/proc_fs.h>
9#include <linux/seq_file.h> 10#include <linux/seq_file.h>
10#include <linux/hugetlb.h> 11#include <linux/hugetlb.h>
@@ -121,9 +122,18 @@ u64 stable_page_flags(struct page *page)
121 * just checks PG_head/PG_tail, so we need to check PageLRU/PageAnon 122 * just checks PG_head/PG_tail, so we need to check PageLRU/PageAnon
122 * to make sure a given page is a thp, not a non-huge compound page. 123 * to make sure a given page is a thp, not a non-huge compound page.
123 */ 124 */
124 else if (PageTransCompound(page) && (PageLRU(compound_head(page)) || 125 else if (PageTransCompound(page)) {
125 PageAnon(compound_head(page)))) 126 struct page *head = compound_head(page);
126 u |= 1 << KPF_THP; 127
128 if (PageLRU(head) || PageAnon(head))
129 u |= 1 << KPF_THP;
130 else if (is_huge_zero_page(head)) {
131 u |= 1 << KPF_ZERO_PAGE;
132 u |= 1 << KPF_THP;
133 }
134 } else if (is_zero_pfn(page_to_pfn(page)))
135 u |= 1 << KPF_ZERO_PAGE;
136
127 137
128 /* 138 /*
129 * Caveats on high order pages: page->_count will only be set 139 * Caveats on high order pages: page->_count will only be set
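
With this hunk /proc/kpageflags can distinguish the huge zero page (KPF_THP | KPF_ZERO_PAGE) from the ordinary 4k zero page (KPF_ZERO_PAGE alone). A small reader, assuming the bit values exported in include/uapi/linux/kernel-page-flags.h at this point (KPF_THP = 22, KPF_ZERO_PAGE = 24):

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>

    #define KPF_THP        22
    #define KPF_ZERO_PAGE  24

    int main(int argc, char **argv)
    {
            uint64_t pfn = argc > 1 ? strtoull(argv[1], NULL, 0) : 0;
            uint64_t flags;
            int fd = open("/proc/kpageflags", O_RDONLY);

            /* one u64 of flags per pfn */
            if (fd < 0 || pread(fd, &flags, sizeof(flags), pfn * 8) != sizeof(flags))
                    return 1;
            printf("pfn %llu: thp=%d zero=%d\n", (unsigned long long)pfn,
                   !!(flags & (1ULL << KPF_THP)),
                   !!(flags & (1ULL << KPF_ZERO_PAGE)));
            return 0;
    }
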
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 246eae84b13b..956b75d61809 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -21,7 +21,7 @@
21 21
22void task_mem(struct seq_file *m, struct mm_struct *mm) 22void task_mem(struct seq_file *m, struct mm_struct *mm)
23{ 23{
24 unsigned long data, text, lib, swap; 24 unsigned long data, text, lib, swap, ptes, pmds;
25 unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; 25 unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
26 26
27 /* 27 /*
@@ -42,6 +42,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
42 text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; 42 text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
43 lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; 43 lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
44 swap = get_mm_counter(mm, MM_SWAPENTS); 44 swap = get_mm_counter(mm, MM_SWAPENTS);
45 ptes = PTRS_PER_PTE * sizeof(pte_t) * atomic_long_read(&mm->nr_ptes);
46 pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm);
45 seq_printf(m, 47 seq_printf(m,
46 "VmPeak:\t%8lu kB\n" 48 "VmPeak:\t%8lu kB\n"
47 "VmSize:\t%8lu kB\n" 49 "VmSize:\t%8lu kB\n"
@@ -54,6 +56,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
54 "VmExe:\t%8lu kB\n" 56 "VmExe:\t%8lu kB\n"
55 "VmLib:\t%8lu kB\n" 57 "VmLib:\t%8lu kB\n"
56 "VmPTE:\t%8lu kB\n" 58 "VmPTE:\t%8lu kB\n"
59 "VmPMD:\t%8lu kB\n"
57 "VmSwap:\t%8lu kB\n", 60 "VmSwap:\t%8lu kB\n",
58 hiwater_vm << (PAGE_SHIFT-10), 61 hiwater_vm << (PAGE_SHIFT-10),
59 total_vm << (PAGE_SHIFT-10), 62 total_vm << (PAGE_SHIFT-10),
@@ -63,8 +66,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
63 total_rss << (PAGE_SHIFT-10), 66 total_rss << (PAGE_SHIFT-10),
64 data << (PAGE_SHIFT-10), 67 data << (PAGE_SHIFT-10),
65 mm->stack_vm << (PAGE_SHIFT-10), text, lib, 68 mm->stack_vm << (PAGE_SHIFT-10), text, lib,
66 (PTRS_PER_PTE * sizeof(pte_t) * 69 ptes >> 10,
67 atomic_long_read(&mm->nr_ptes)) >> 10, 70 pmds >> 10,
68 swap << (PAGE_SHIFT-10)); 71 swap << (PAGE_SHIFT-10));
69} 72}
70 73
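
The new VmPMD line is computed the same way VmPTE always was, one page-table level up. A worked example, assuming x86-64 geometry (PTRS_PER_PMD = 512, sizeof(pmd_t) = 8):

    /* One PMD page accounts for 512 * 8 = 4096 bytes. A process whose
     * mm_nr_pmds(mm) is 3 therefore reports 3 * 4096 >> 10 = 12, shown
     * as "VmPMD:        12 kB" in /proc/<pid>/status. */
    unsigned long pmds = PTRS_PER_PMD * sizeof(pmd_t) * 3;   /* 12288 bytes */
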
@@ -433,7 +436,6 @@ const struct file_operations proc_tid_maps_operations = {
433 436
434#ifdef CONFIG_PROC_PAGE_MONITOR 437#ifdef CONFIG_PROC_PAGE_MONITOR
435struct mem_size_stats { 438struct mem_size_stats {
436 struct vm_area_struct *vma;
437 unsigned long resident; 439 unsigned long resident;
438 unsigned long shared_clean; 440 unsigned long shared_clean;
439 unsigned long shared_dirty; 441 unsigned long shared_dirty;
@@ -443,7 +445,6 @@ struct mem_size_stats {
443 unsigned long anonymous; 445 unsigned long anonymous;
444 unsigned long anonymous_thp; 446 unsigned long anonymous_thp;
445 unsigned long swap; 447 unsigned long swap;
446 unsigned long nonlinear;
447 u64 pss; 448 u64 pss;
448}; 449};
449 450
@@ -483,8 +484,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
483 struct mm_walk *walk) 484 struct mm_walk *walk)
484{ 485{
485 struct mem_size_stats *mss = walk->private; 486 struct mem_size_stats *mss = walk->private;
486 struct vm_area_struct *vma = mss->vma; 487 struct vm_area_struct *vma = walk->vma;
487 pgoff_t pgoff = linear_page_index(vma, addr);
488 struct page *page = NULL; 488 struct page *page = NULL;
489 489
490 if (pte_present(*pte)) { 490 if (pte_present(*pte)) {
@@ -496,17 +496,10 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
496 mss->swap += PAGE_SIZE; 496 mss->swap += PAGE_SIZE;
497 else if (is_migration_entry(swpent)) 497 else if (is_migration_entry(swpent))
498 page = migration_entry_to_page(swpent); 498 page = migration_entry_to_page(swpent);
499 } else if (pte_file(*pte)) {
500 if (pte_to_pgoff(*pte) != pgoff)
501 mss->nonlinear += PAGE_SIZE;
502 } 499 }
503 500
504 if (!page) 501 if (!page)
505 return; 502 return;
506
507 if (page->index != pgoff)
508 mss->nonlinear += PAGE_SIZE;
509
510 smaps_account(mss, page, PAGE_SIZE, pte_young(*pte), pte_dirty(*pte)); 503 smaps_account(mss, page, PAGE_SIZE, pte_young(*pte), pte_dirty(*pte));
511} 504}
512 505
@@ -515,7 +508,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
515 struct mm_walk *walk) 508 struct mm_walk *walk)
516{ 509{
517 struct mem_size_stats *mss = walk->private; 510 struct mem_size_stats *mss = walk->private;
518 struct vm_area_struct *vma = mss->vma; 511 struct vm_area_struct *vma = walk->vma;
519 struct page *page; 512 struct page *page;
520 513
521 /* FOLL_DUMP will return -EFAULT on huge zero page */ 514 /* FOLL_DUMP will return -EFAULT on huge zero page */
@@ -536,8 +529,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
536static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, 529static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
537 struct mm_walk *walk) 530 struct mm_walk *walk)
538{ 531{
539 struct mem_size_stats *mss = walk->private; 532 struct vm_area_struct *vma = walk->vma;
540 struct vm_area_struct *vma = mss->vma;
541 pte_t *pte; 533 pte_t *pte;
542 spinlock_t *ptl; 534 spinlock_t *ptl;
543 535
@@ -596,7 +588,6 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
596 [ilog2(VM_ACCOUNT)] = "ac", 588 [ilog2(VM_ACCOUNT)] = "ac",
597 [ilog2(VM_NORESERVE)] = "nr", 589 [ilog2(VM_NORESERVE)] = "nr",
598 [ilog2(VM_HUGETLB)] = "ht", 590 [ilog2(VM_HUGETLB)] = "ht",
599 [ilog2(VM_NONLINEAR)] = "nl",
600 [ilog2(VM_ARCH_1)] = "ar", 591 [ilog2(VM_ARCH_1)] = "ar",
601 [ilog2(VM_DONTDUMP)] = "dd", 592 [ilog2(VM_DONTDUMP)] = "dd",
602#ifdef CONFIG_MEM_SOFT_DIRTY 593#ifdef CONFIG_MEM_SOFT_DIRTY
@@ -630,10 +621,8 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
630 }; 621 };
631 622
632 memset(&mss, 0, sizeof mss); 623 memset(&mss, 0, sizeof mss);
633 mss.vma = vma;
634 /* mmap_sem is held in m_start */ 624 /* mmap_sem is held in m_start */
635 if (vma->vm_mm && !is_vm_hugetlb_page(vma)) 625 walk_page_vma(vma, &smaps_walk);
636 walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk);
637 626
638 show_map_vma(m, vma, is_pid); 627 show_map_vma(m, vma, is_pid);
639 628
@@ -668,10 +657,6 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
668 (vma->vm_flags & VM_LOCKED) ? 657 (vma->vm_flags & VM_LOCKED) ?
669 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0); 658 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
670 659
671 if (vma->vm_flags & VM_NONLINEAR)
672 seq_printf(m, "Nonlinear: %8lu kB\n",
673 mss.nonlinear >> 10);
674
675 show_smap_vma_flags(m, vma); 660 show_smap_vma_flags(m, vma);
676 m_cache_vma(m, vma); 661 m_cache_vma(m, vma);
677 return 0; 662 return 0;
@@ -747,18 +732,18 @@ enum clear_refs_types {
747 CLEAR_REFS_ANON, 732 CLEAR_REFS_ANON,
748 CLEAR_REFS_MAPPED, 733 CLEAR_REFS_MAPPED,
749 CLEAR_REFS_SOFT_DIRTY, 734 CLEAR_REFS_SOFT_DIRTY,
735 CLEAR_REFS_MM_HIWATER_RSS,
750 CLEAR_REFS_LAST, 736 CLEAR_REFS_LAST,
751}; 737};
752 738
753struct clear_refs_private { 739struct clear_refs_private {
754 struct vm_area_struct *vma;
755 enum clear_refs_types type; 740 enum clear_refs_types type;
756}; 741};
757 742
743#ifdef CONFIG_MEM_SOFT_DIRTY
758static inline void clear_soft_dirty(struct vm_area_struct *vma, 744static inline void clear_soft_dirty(struct vm_area_struct *vma,
759 unsigned long addr, pte_t *pte) 745 unsigned long addr, pte_t *pte)
760{ 746{
761#ifdef CONFIG_MEM_SOFT_DIRTY
762 /* 747 /*
763 * The soft-dirty tracker uses #PF-s to catch writes 748 * The soft-dirty tracker uses #PF-s to catch writes
764 * to pages, so write-protect the pte as well. See the 749 * to pages, so write-protect the pte as well. See the
@@ -772,24 +757,63 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
772 ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); 757 ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY);
773 } else if (is_swap_pte(ptent)) { 758 } else if (is_swap_pte(ptent)) {
774 ptent = pte_swp_clear_soft_dirty(ptent); 759 ptent = pte_swp_clear_soft_dirty(ptent);
775 } else if (pte_file(ptent)) {
776 ptent = pte_file_clear_soft_dirty(ptent);
777 } 760 }
778 761
779 set_pte_at(vma->vm_mm, addr, pte, ptent); 762 set_pte_at(vma->vm_mm, addr, pte, ptent);
780#endif
781} 763}
782 764
765static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
766 unsigned long addr, pmd_t *pmdp)
767{
768 pmd_t pmd = *pmdp;
769
770 pmd = pmd_wrprotect(pmd);
771 pmd = pmd_clear_flags(pmd, _PAGE_SOFT_DIRTY);
772
773 if (vma->vm_flags & VM_SOFTDIRTY)
774 vma->vm_flags &= ~VM_SOFTDIRTY;
775
776 set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
777}
778
779#else
780
781static inline void clear_soft_dirty(struct vm_area_struct *vma,
782 unsigned long addr, pte_t *pte)
783{
784}
785
786static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
787 unsigned long addr, pmd_t *pmdp)
788{
789}
790#endif
791
783static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, 792static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
784 unsigned long end, struct mm_walk *walk) 793 unsigned long end, struct mm_walk *walk)
785{ 794{
786 struct clear_refs_private *cp = walk->private; 795 struct clear_refs_private *cp = walk->private;
787 struct vm_area_struct *vma = cp->vma; 796 struct vm_area_struct *vma = walk->vma;
788 pte_t *pte, ptent; 797 pte_t *pte, ptent;
789 spinlock_t *ptl; 798 spinlock_t *ptl;
790 struct page *page; 799 struct page *page;
791 800
792 split_huge_page_pmd(vma, addr, pmd); 801 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
802 if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
803 clear_soft_dirty_pmd(vma, addr, pmd);
804 goto out;
805 }
806
807 page = pmd_page(*pmd);
808
809 /* Clear accessed and referenced bits. */
810 pmdp_test_and_clear_young(vma, addr, pmd);
811 ClearPageReferenced(page);
812out:
813 spin_unlock(ptl);
814 return 0;
815 }
816
793 if (pmd_trans_unstable(pmd)) 817 if (pmd_trans_unstable(pmd))
794 return 0; 818 return 0;
795 819
@@ -818,6 +842,28 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
818 return 0; 842 return 0;
819} 843}
820 844
845static int clear_refs_test_walk(unsigned long start, unsigned long end,
846 struct mm_walk *walk)
847{
848 struct clear_refs_private *cp = walk->private;
849 struct vm_area_struct *vma = walk->vma;
850
851 if (vma->vm_flags & VM_PFNMAP)
852 return 1;
853
854 /*
855 * Writing 1 to /proc/pid/clear_refs affects all pages.
856 * Writing 2 to /proc/pid/clear_refs only affects anonymous pages.
857 * Writing 3 to /proc/pid/clear_refs only affects file mapped pages.
858 * Writing 4 to /proc/pid/clear_refs affects all pages.
859 */
860 if (cp->type == CLEAR_REFS_ANON && vma->vm_file)
861 return 1;
862 if (cp->type == CLEAR_REFS_MAPPED && !vma->vm_file)
863 return 1;
864 return 0;
865}
866
821static ssize_t clear_refs_write(struct file *file, const char __user *buf, 867static ssize_t clear_refs_write(struct file *file, const char __user *buf,
822 size_t count, loff_t *ppos) 868 size_t count, loff_t *ppos)
823{ 869{
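
clear_refs_test_walk() is written against the page-walker rework landing in the same series: walk->vma now points at the VMA being visited, and a ->test_walk() callback returning 1 skips that VMA, 0 descends into its page tables, and a negative value aborts the whole walk. A minimal sketch of that contract, with hypothetical names:

    static int skip_io_test(unsigned long start, unsigned long end,
                            struct mm_walk *walk)
    {
            /* 1 = skip this vma, 0 = walk it, <0 = abort the walk */
            if (walk->vma->vm_flags & (VM_PFNMAP | VM_IO))
                    return 1;
            return 0;
    }

    static struct mm_walk example_walk = {
            .pmd_entry = example_pmd_entry,   /* hypothetical pmd callback */
            .test_walk = skip_io_test,
    };
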
@@ -858,9 +904,22 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
858 }; 904 };
859 struct mm_walk clear_refs_walk = { 905 struct mm_walk clear_refs_walk = {
860 .pmd_entry = clear_refs_pte_range, 906 .pmd_entry = clear_refs_pte_range,
907 .test_walk = clear_refs_test_walk,
861 .mm = mm, 908 .mm = mm,
862 .private = &cp, 909 .private = &cp,
863 }; 910 };
911
912 if (type == CLEAR_REFS_MM_HIWATER_RSS) {
913 /*
914 * Writing 5 to /proc/pid/clear_refs resets the peak
915 * resident set size to this mm's current rss value.
916 */
917 down_write(&mm->mmap_sem);
918 reset_mm_hiwater_rss(mm);
919 up_write(&mm->mmap_sem);
920 goto out_mm;
921 }
922
864 down_read(&mm->mmap_sem); 923 down_read(&mm->mmap_sem);
865 if (type == CLEAR_REFS_SOFT_DIRTY) { 924 if (type == CLEAR_REFS_SOFT_DIRTY) {
866 for (vma = mm->mmap; vma; vma = vma->vm_next) { 925 for (vma = mm->mmap; vma; vma = vma->vm_next) {
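
Writing "5" is the new CLEAR_REFS_MM_HIWATER_RSS case: it snaps the peak-RSS watermark (VmHWM in /proc/<pid>/status) down to the current RSS, letting a monitor measure per-phase peaks instead of a single lifetime maximum. For example:

    #include <fcntl.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/proc/self/clear_refs", O_WRONLY);

            if (fd < 0)
                    return 1;
            write(fd, "5", 1);      /* VmHWM now equals current VmRSS */
            close(fd);
            return 0;
    }
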
@@ -877,32 +936,12 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
877 } 936 }
878 mmu_notifier_invalidate_range_start(mm, 0, -1); 937 mmu_notifier_invalidate_range_start(mm, 0, -1);
879 } 938 }
880 for (vma = mm->mmap; vma; vma = vma->vm_next) { 939 walk_page_range(0, ~0UL, &clear_refs_walk);
881 cp.vma = vma;
882 if (is_vm_hugetlb_page(vma))
883 continue;
884 /*
885 * Writing 1 to /proc/pid/clear_refs affects all pages.
886 *
887 * Writing 2 to /proc/pid/clear_refs only affects
888 * Anonymous pages.
889 *
890 * Writing 3 to /proc/pid/clear_refs only affects file
891 * mapped pages.
892 *
893 * Writing 4 to /proc/pid/clear_refs affects all pages.
894 */
895 if (type == CLEAR_REFS_ANON && vma->vm_file)
896 continue;
897 if (type == CLEAR_REFS_MAPPED && !vma->vm_file)
898 continue;
899 walk_page_range(vma->vm_start, vma->vm_end,
900 &clear_refs_walk);
901 }
902 if (type == CLEAR_REFS_SOFT_DIRTY) 940 if (type == CLEAR_REFS_SOFT_DIRTY)
903 mmu_notifier_invalidate_range_end(mm, 0, -1); 941 mmu_notifier_invalidate_range_end(mm, 0, -1);
904 flush_tlb_mm(mm); 942 flush_tlb_mm(mm);
905 up_read(&mm->mmap_sem); 943 up_read(&mm->mmap_sem);
944out_mm:
906 mmput(mm); 945 mmput(mm);
907 } 946 }
908 put_task_struct(task); 947 put_task_struct(task);
@@ -1066,15 +1105,13 @@ static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemap
1066static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, 1105static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
1067 struct mm_walk *walk) 1106 struct mm_walk *walk)
1068{ 1107{
1069 struct vm_area_struct *vma; 1108 struct vm_area_struct *vma = walk->vma;
1070 struct pagemapread *pm = walk->private; 1109 struct pagemapread *pm = walk->private;
1071 spinlock_t *ptl; 1110 spinlock_t *ptl;
1072 pte_t *pte; 1111 pte_t *pte, *orig_pte;
1073 int err = 0; 1112 int err = 0;
1074 1113
1075 /* find the first VMA at or above 'addr' */ 1114 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
1076 vma = find_vma(walk->mm, addr);
1077 if (vma && pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
1078 int pmd_flags2; 1115 int pmd_flags2;
1079 1116
1080 if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd)) 1117 if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd))
@@ -1100,51 +1137,20 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
1100 if (pmd_trans_unstable(pmd)) 1137 if (pmd_trans_unstable(pmd))
1101 return 0; 1138 return 0;
1102 1139
1103 while (1) { 1140 /*
1104 /* End of address space hole, which we mark as non-present. */ 1141 * We can assume that @vma always points to a valid one and @end never
1105 unsigned long hole_end; 1142 * goes beyond vma->vm_end.
1106 1143 */
1107 if (vma) 1144 orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
1108 hole_end = min(end, vma->vm_start); 1145 for (; addr < end; pte++, addr += PAGE_SIZE) {
1109 else 1146 pagemap_entry_t pme;
1110 hole_end = end;
1111
1112 for (; addr < hole_end; addr += PAGE_SIZE) {
1113 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
1114
1115 err = add_to_pagemap(addr, &pme, pm);
1116 if (err)
1117 return err;
1118 }
1119
1120 if (!vma || vma->vm_start >= end)
1121 break;
1122 /*
1123 * We can't possibly be in a hugetlb VMA. In general,
1124 * for a mm_walk with a pmd_entry and a hugetlb_entry,
1125 * the pmd_entry can only be called on addresses in a
1126 * hugetlb if the walk starts in a non-hugetlb VMA and
1127 * spans a hugepage VMA. Since pagemap_read walks are
1128 * PMD-sized and PMD-aligned, this will never be true.
1129 */
1130 BUG_ON(is_vm_hugetlb_page(vma));
1131
1132 /* Addresses in the VMA. */
1133 for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
1134 pagemap_entry_t pme;
1135 pte = pte_offset_map(pmd, addr);
1136 pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);
1137 pte_unmap(pte);
1138 err = add_to_pagemap(addr, &pme, pm);
1139 if (err)
1140 return err;
1141 }
1142 1147
1143 if (addr == end) 1148 pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);
1149 err = add_to_pagemap(addr, &pme, pm);
1150 if (err)
1144 break; 1151 break;
1145
1146 vma = find_vma(walk->mm, addr);
1147 } 1152 }
1153 pte_unmap_unlock(orig_pte, ptl);
1148 1154
1149 cond_resched(); 1155 cond_resched();
1150 1156
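
The rewritten loop above is the canonical locked PTE scan: because ->test_walk() and walk->vma now guarantee the range lies inside one valid VMA, the hole-skipping and find_vma() re-lookups disappear, and the page-table lock is taken once via pte_offset_map_lock() for the whole PMD range. The skeleton, as an illustrative fragment (mm, pmd, addr and end come from the walker):

    pte_t *pte, *orig_pte;
    spinlock_t *ptl;

    orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
    for (; addr < end; pte++, addr += PAGE_SIZE) {
            /* examine *pte; the page-table lock is held throughout */
    }
    pte_unmap_unlock(orig_pte, ptl);
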
@@ -1170,15 +1176,12 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
1170 struct mm_walk *walk) 1176 struct mm_walk *walk)
1171{ 1177{
1172 struct pagemapread *pm = walk->private; 1178 struct pagemapread *pm = walk->private;
1173 struct vm_area_struct *vma; 1179 struct vm_area_struct *vma = walk->vma;
1174 int err = 0; 1180 int err = 0;
1175 int flags2; 1181 int flags2;
1176 pagemap_entry_t pme; 1182 pagemap_entry_t pme;
1177 1183
1178 vma = find_vma(walk->mm, addr); 1184 if (vma->vm_flags & VM_SOFTDIRTY)
1179 WARN_ON_ONCE(!vma);
1180
1181 if (vma && (vma->vm_flags & VM_SOFTDIRTY))
1182 flags2 = __PM_SOFT_DIRTY; 1185 flags2 = __PM_SOFT_DIRTY;
1183 else 1186 else
1184 flags2 = 0; 1187 flags2 = 0;
@@ -1338,7 +1341,6 @@ const struct file_operations proc_pagemap_operations = {
1338#ifdef CONFIG_NUMA 1341#ifdef CONFIG_NUMA
1339 1342
1340struct numa_maps { 1343struct numa_maps {
1341 struct vm_area_struct *vma;
1342 unsigned long pages; 1344 unsigned long pages;
1343 unsigned long anon; 1345 unsigned long anon;
1344 unsigned long active; 1346 unsigned long active;
@@ -1407,18 +1409,17 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
1407static int gather_pte_stats(pmd_t *pmd, unsigned long addr, 1409static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
1408 unsigned long end, struct mm_walk *walk) 1410 unsigned long end, struct mm_walk *walk)
1409{ 1411{
1410 struct numa_maps *md; 1412 struct numa_maps *md = walk->private;
1413 struct vm_area_struct *vma = walk->vma;
1411 spinlock_t *ptl; 1414 spinlock_t *ptl;
1412 pte_t *orig_pte; 1415 pte_t *orig_pte;
1413 pte_t *pte; 1416 pte_t *pte;
1414 1417
1415 md = walk->private; 1418 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
1416
1417 if (pmd_trans_huge_lock(pmd, md->vma, &ptl) == 1) {
1418 pte_t huge_pte = *(pte_t *)pmd; 1419 pte_t huge_pte = *(pte_t *)pmd;
1419 struct page *page; 1420 struct page *page;
1420 1421
1421 page = can_gather_numa_stats(huge_pte, md->vma, addr); 1422 page = can_gather_numa_stats(huge_pte, vma, addr);
1422 if (page) 1423 if (page)
1423 gather_stats(page, md, pte_dirty(huge_pte), 1424 gather_stats(page, md, pte_dirty(huge_pte),
1424 HPAGE_PMD_SIZE/PAGE_SIZE); 1425 HPAGE_PMD_SIZE/PAGE_SIZE);
@@ -1430,7 +1431,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
1430 return 0; 1431 return 0;
1431 orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); 1432 orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
1432 do { 1433 do {
1433 struct page *page = can_gather_numa_stats(*pte, md->vma, addr); 1434 struct page *page = can_gather_numa_stats(*pte, vma, addr);
1434 if (!page) 1435 if (!page)
1435 continue; 1436 continue;
1436 gather_stats(page, md, pte_dirty(*pte), 1); 1437 gather_stats(page, md, pte_dirty(*pte), 1);
@@ -1440,7 +1441,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
1440 return 0; 1441 return 0;
1441} 1442}
1442#ifdef CONFIG_HUGETLB_PAGE 1443#ifdef CONFIG_HUGETLB_PAGE
1443static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, 1444static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
1444 unsigned long addr, unsigned long end, struct mm_walk *walk) 1445 unsigned long addr, unsigned long end, struct mm_walk *walk)
1445{ 1446{
1446 struct numa_maps *md; 1447 struct numa_maps *md;
@@ -1459,7 +1460,7 @@ static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
1459} 1460}
1460 1461
1461#else 1462#else
1462static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, 1463static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
1463 unsigned long addr, unsigned long end, struct mm_walk *walk) 1464 unsigned long addr, unsigned long end, struct mm_walk *walk)
1464{ 1465{
1465 return 0; 1466 return 0;
@@ -1477,7 +1478,12 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1477 struct numa_maps *md = &numa_priv->md; 1478 struct numa_maps *md = &numa_priv->md;
1478 struct file *file = vma->vm_file; 1479 struct file *file = vma->vm_file;
1479 struct mm_struct *mm = vma->vm_mm; 1480 struct mm_struct *mm = vma->vm_mm;
1480 struct mm_walk walk = {}; 1481 struct mm_walk walk = {
1482 .hugetlb_entry = gather_hugetlb_stats,
1483 .pmd_entry = gather_pte_stats,
1484 .private = md,
1485 .mm = mm,
1486 };
1481 struct mempolicy *pol; 1487 struct mempolicy *pol;
1482 char buffer[64]; 1488 char buffer[64];
1483 int nid; 1489 int nid;
@@ -1488,13 +1494,6 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1488 /* Ensure we start with an empty set of numa_maps statistics. */ 1494 /* Ensure we start with an empty set of numa_maps statistics. */
1489 memset(md, 0, sizeof(*md)); 1495 memset(md, 0, sizeof(*md));
1490 1496
1491 md->vma = vma;
1492
1493 walk.hugetlb_entry = gather_hugetbl_stats;
1494 walk.pmd_entry = gather_pte_stats;
1495 walk.private = md;
1496 walk.mm = mm;
1497
1498 pol = __get_vma_policy(vma, vma->vm_start); 1497 pol = __get_vma_policy(vma, vma->vm_start);
1499 if (pol) { 1498 if (pol) {
1500 mpol_to_str(buffer, sizeof(buffer), pol); 1499 mpol_to_str(buffer, sizeof(buffer), pol);
@@ -1528,7 +1527,8 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1528 if (is_vm_hugetlb_page(vma)) 1527 if (is_vm_hugetlb_page(vma))
1529 seq_puts(m, " huge"); 1528 seq_puts(m, " huge");
1530 1529
1531 walk_page_range(vma->vm_start, vma->vm_end, &walk); 1530 /* mmap_sem is held by m_start */
1531 walk_page_vma(vma, &walk);
1532 1532
1533 if (!md->pages) 1533 if (!md->pages)
1534 goto out; 1534 goto out;
@@ -1557,6 +1557,8 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1557 for_each_node_state(nid, N_MEMORY) 1557 for_each_node_state(nid, N_MEMORY)
1558 if (md->node[nid]) 1558 if (md->node[nid])
1559 seq_printf(m, " N%d=%lu", nid, md->node[nid]); 1559 seq_printf(m, " N%d=%lu", nid, md->node[nid]);
1560
1561 seq_printf(m, " kernelpagesize_kB=%lu", vma_kernel_pagesize(vma) >> 10);
1560out: 1562out:
1561 seq_putc(m, '\n'); 1563 seq_putc(m, '\n');
1562 m_cache_vma(m, vma); 1564 m_cache_vma(m, vma);
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index a90d6d354199..4e61388ec03d 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -546,8 +546,8 @@ static int __init update_note_header_size_elf64(const Elf64_Ehdr *ehdr_ptr)
546 nhdr_ptr = notes_section; 546 nhdr_ptr = notes_section;
547 while (nhdr_ptr->n_namesz != 0) { 547 while (nhdr_ptr->n_namesz != 0) {
548 sz = sizeof(Elf64_Nhdr) + 548 sz = sizeof(Elf64_Nhdr) +
549 ((nhdr_ptr->n_namesz + 3) & ~3) + 549 (((u64)nhdr_ptr->n_namesz + 3) & ~3) +
550 ((nhdr_ptr->n_descsz + 3) & ~3); 550 (((u64)nhdr_ptr->n_descsz + 3) & ~3);
551 if ((real_sz + sz) > max_sz) { 551 if ((real_sz + sz) > max_sz) {
552 pr_warn("Warning: Exceeded p_memsz, dropping PT_NOTE entry n_namesz=0x%x, n_descsz=0x%x\n", 552 pr_warn("Warning: Exceeded p_memsz, dropping PT_NOTE entry n_namesz=0x%x, n_descsz=0x%x\n",
553 nhdr_ptr->n_namesz, nhdr_ptr->n_descsz); 553 nhdr_ptr->n_namesz, nhdr_ptr->n_descsz);
@@ -732,8 +732,8 @@ static int __init update_note_header_size_elf32(const Elf32_Ehdr *ehdr_ptr)
732 nhdr_ptr = notes_section; 732 nhdr_ptr = notes_section;
733 while (nhdr_ptr->n_namesz != 0) { 733 while (nhdr_ptr->n_namesz != 0) {
734 sz = sizeof(Elf32_Nhdr) + 734 sz = sizeof(Elf32_Nhdr) +
735 ((nhdr_ptr->n_namesz + 3) & ~3) + 735 (((u64)nhdr_ptr->n_namesz + 3) & ~3) +
736 ((nhdr_ptr->n_descsz + 3) & ~3); 736 (((u64)nhdr_ptr->n_descsz + 3) & ~3);
737 if ((real_sz + sz) > max_sz) { 737 if ((real_sz + sz) > max_sz) {
738 pr_warn("Warning: Exceeded p_memsz, dropping PT_NOTE entry n_namesz=0x%x, n_descsz=0x%x\n", 738 pr_warn("Warning: Exceeded p_memsz, dropping PT_NOTE entry n_namesz=0x%x, n_descsz=0x%x\n",
739 nhdr_ptr->n_namesz, nhdr_ptr->n_descsz); 739 nhdr_ptr->n_namesz, nhdr_ptr->n_descsz);
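
The (u64) casts close an integer-overflow hole: n_namesz and n_descsz are 32-bit fields read from the crash dump, so a corrupted note with n_namesz = 0xfffffffe made (n_namesz + 3) & ~3 wrap to 0 in 32-bit arithmetic, letting the "Exceeded p_memsz" check pass with a bogus sz. A demonstration of the wrap:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint32_t n_namesz = 0xfffffffeU;

            /* 32-bit: 0xfffffffe + 3 wraps to 1, and 1 & ~3 == 0 */
            printf("u32: %u\n", (n_namesz + 3) & ~3U);
            /* 64-bit: the true rounded size, 0x100000000 */
            printf("u64: %llu\n",
                   (unsigned long long)(((uint64_t)n_namesz + 3) & ~(uint64_t)3));
            return 0;
    }
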
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 0f96f71ab32b..8db932da4009 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -44,6 +44,7 @@ static int show_sb_opts(struct seq_file *m, struct super_block *sb)
44 { MS_SYNCHRONOUS, ",sync" }, 44 { MS_SYNCHRONOUS, ",sync" },
45 { MS_DIRSYNC, ",dirsync" }, 45 { MS_DIRSYNC, ",dirsync" },
46 { MS_MANDLOCK, ",mand" }, 46 { MS_MANDLOCK, ",mand" },
47 { MS_LAZYTIME, ",lazytime" },
47 { 0, NULL } 48 { 0, NULL }
48 }; 49 };
49 const struct proc_fs_info *fs_infop; 50 const struct proc_fs_info *fs_infop;
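
MS_LAZYTIME, new in this cycle, defers most inode timestamp updates to in-memory state; this hunk merely teaches /proc/mounts to report it. A remount example, hedging the constant for older userspace headers (the value here is the one assumed from uapi/linux/fs.h in this cycle):

    #include <stdio.h>
    #include <sys/mount.h>

    #ifndef MS_LAZYTIME
    #define MS_LAZYTIME (1 << 25)   /* assumption: uapi value from this cycle */
    #endif

    int main(void)
    {
            /* On success the entry in /proc/mounts gains ",lazytime". */
            if (mount(NULL, "/home", NULL, MS_REMOUNT | MS_LAZYTIME, NULL) != 0)
                    perror("mount");
            return 0;
    }
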
diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig
index 983d9510becc..916b8e23d968 100644
--- a/fs/pstore/Kconfig
+++ b/fs/pstore/Kconfig
@@ -21,6 +21,16 @@ config PSTORE_CONSOLE
21 When the option is enabled, pstore will log all kernel 21 When the option is enabled, pstore will log all kernel
22 messages, even if no oops or panic happened. 22 messages, even if no oops or panic happened.
23 23
24config PSTORE_PMSG
25 bool "Log user space messages"
26 depends on PSTORE
27 help
28 When the option is enabled, pstore will export a character
29 interface /dev/pmsg0 to log user space messages. On reboot
30 data can be retrieved from /sys/fs/pstore/pmsg-ramoops-[ID].
31
32 If unsure, say N.
33
24config PSTORE_FTRACE 34config PSTORE_FTRACE
25 bool "Persistent function tracer" 35 bool "Persistent function tracer"
26 depends on PSTORE 36 depends on PSTORE
diff --git a/fs/pstore/Makefile b/fs/pstore/Makefile
index 4c9095c2781e..e647d8e81712 100644
--- a/fs/pstore/Makefile
+++ b/fs/pstore/Makefile
@@ -7,5 +7,7 @@ obj-y += pstore.o
7pstore-objs += inode.o platform.o 7pstore-objs += inode.o platform.o
8obj-$(CONFIG_PSTORE_FTRACE) += ftrace.o 8obj-$(CONFIG_PSTORE_FTRACE) += ftrace.o
9 9
10obj-$(CONFIG_PSTORE_PMSG) += pmsg.o
11
10ramoops-objs += ram.o ram_core.o 12ramoops-objs += ram.o ram_core.o
11obj-$(CONFIG_PSTORE_RAM) += ramoops.o 13obj-$(CONFIG_PSTORE_RAM) += ramoops.o
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 50416602774d..b32ce53d24ee 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -338,32 +338,38 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
338 338
339 switch (type) { 339 switch (type) {
340 case PSTORE_TYPE_DMESG: 340 case PSTORE_TYPE_DMESG:
341 sprintf(name, "dmesg-%s-%lld%s", psname, id, 341 scnprintf(name, sizeof(name), "dmesg-%s-%lld%s",
342 compressed ? ".enc.z" : ""); 342 psname, id, compressed ? ".enc.z" : "");
343 break; 343 break;
344 case PSTORE_TYPE_CONSOLE: 344 case PSTORE_TYPE_CONSOLE:
345 sprintf(name, "console-%s-%lld", psname, id); 345 scnprintf(name, sizeof(name), "console-%s-%lld", psname, id);
346 break; 346 break;
347 case PSTORE_TYPE_FTRACE: 347 case PSTORE_TYPE_FTRACE:
348 sprintf(name, "ftrace-%s-%lld", psname, id); 348 scnprintf(name, sizeof(name), "ftrace-%s-%lld", psname, id);
349 break; 349 break;
350 case PSTORE_TYPE_MCE: 350 case PSTORE_TYPE_MCE:
351 sprintf(name, "mce-%s-%lld", psname, id); 351 scnprintf(name, sizeof(name), "mce-%s-%lld", psname, id);
352 break; 352 break;
353 case PSTORE_TYPE_PPC_RTAS: 353 case PSTORE_TYPE_PPC_RTAS:
354 sprintf(name, "rtas-%s-%lld", psname, id); 354 scnprintf(name, sizeof(name), "rtas-%s-%lld", psname, id);
355 break; 355 break;
356 case PSTORE_TYPE_PPC_OF: 356 case PSTORE_TYPE_PPC_OF:
357 sprintf(name, "powerpc-ofw-%s-%lld", psname, id); 357 scnprintf(name, sizeof(name), "powerpc-ofw-%s-%lld",
358 psname, id);
358 break; 359 break;
359 case PSTORE_TYPE_PPC_COMMON: 360 case PSTORE_TYPE_PPC_COMMON:
360 sprintf(name, "powerpc-common-%s-%lld", psname, id); 361 scnprintf(name, sizeof(name), "powerpc-common-%s-%lld",
362 psname, id);
363 break;
364 case PSTORE_TYPE_PMSG:
365 scnprintf(name, sizeof(name), "pmsg-%s-%lld", psname, id);
361 break; 366 break;
362 case PSTORE_TYPE_UNKNOWN: 367 case PSTORE_TYPE_UNKNOWN:
363 sprintf(name, "unknown-%s-%lld", psname, id); 368 scnprintf(name, sizeof(name), "unknown-%s-%lld", psname, id);
364 break; 369 break;
365 default: 370 default:
366 sprintf(name, "type%d-%s-%lld", type, psname, id); 371 scnprintf(name, sizeof(name), "type%d-%s-%lld",
372 type, psname, id);
367 break; 373 break;
368 } 374 }
369 375
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
index 3b3d305277c4..c36ba2cd0b5d 100644
--- a/fs/pstore/internal.h
+++ b/fs/pstore/internal.h
@@ -45,6 +45,12 @@ extern void pstore_register_ftrace(void);
45static inline void pstore_register_ftrace(void) {} 45static inline void pstore_register_ftrace(void) {}
46#endif 46#endif
47 47
48#ifdef CONFIG_PSTORE_PMSG
49extern void pstore_register_pmsg(void);
50#else
51static inline void pstore_register_pmsg(void) {}
52#endif
53
48extern struct pstore_info *psinfo; 54extern struct pstore_info *psinfo;
49 55
50extern void pstore_set_kmsg_bytes(int); 56extern void pstore_set_kmsg_bytes(int);
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 0a9b72cdfeca..c4c9a10c5760 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -301,7 +301,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
301 301
302 if (big_oops_buf) { 302 if (big_oops_buf) {
303 dst = big_oops_buf; 303 dst = big_oops_buf;
304 hsize = sprintf(dst, "%s#%d Part%d\n", why, 304 hsize = sprintf(dst, "%s#%d Part%u\n", why,
305 oopscount, part); 305 oopscount, part);
306 size = big_oops_buf_sz - hsize; 306 size = big_oops_buf_sz - hsize;
307 307
@@ -321,7 +321,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
321 } 321 }
322 } else { 322 } else {
323 dst = psinfo->buf; 323 dst = psinfo->buf;
324 hsize = sprintf(dst, "%s#%d Part%d\n", why, oopscount, 324 hsize = sprintf(dst, "%s#%d Part%u\n", why, oopscount,
325 part); 325 part);
326 size = psinfo->bufsize - hsize; 326 size = psinfo->bufsize - hsize;
327 dst += hsize; 327 dst += hsize;
@@ -447,6 +447,7 @@ int pstore_register(struct pstore_info *psi)
447 if ((psi->flags & PSTORE_FLAGS_FRAGILE) == 0) { 447 if ((psi->flags & PSTORE_FLAGS_FRAGILE) == 0) {
448 pstore_register_console(); 448 pstore_register_console();
449 pstore_register_ftrace(); 449 pstore_register_ftrace();
450 pstore_register_pmsg();
450 } 451 }
451 452
452 if (pstore_update_ms >= 0) { 453 if (pstore_update_ms >= 0) {
diff --git a/fs/pstore/pmsg.c b/fs/pstore/pmsg.c
new file mode 100644
index 000000000000..feb5dd2948b4
--- /dev/null
+++ b/fs/pstore/pmsg.c
@@ -0,0 +1,114 @@
1/*
2 * Copyright 2014 Google, Inc.
3 *
4 * This software is licensed under the terms of the GNU General Public
5 * License version 2, as published by the Free Software Foundation, and
6 * may be copied, distributed, and modified under those terms.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 */
13
14#include <linux/cdev.h>
15#include <linux/device.h>
16#include <linux/fs.h>
17#include <linux/uaccess.h>
18#include <linux/vmalloc.h>
19#include "internal.h"
20
21static DEFINE_MUTEX(pmsg_lock);
22#define PMSG_MAX_BOUNCE_BUFFER_SIZE (2*PAGE_SIZE)
23
24static ssize_t write_pmsg(struct file *file, const char __user *buf,
25 size_t count, loff_t *ppos)
26{
27 size_t i, buffer_size;
28 char *buffer;
29
30 if (!count)
31 return 0;
32
33 if (!access_ok(VERIFY_READ, buf, count))
34 return -EFAULT;
35
36 buffer_size = count;
37 if (buffer_size > PMSG_MAX_BOUNCE_BUFFER_SIZE)
38 buffer_size = PMSG_MAX_BOUNCE_BUFFER_SIZE;
39 buffer = vmalloc(buffer_size);
40
41 mutex_lock(&pmsg_lock);
42 for (i = 0; i < count; ) {
43 size_t c = min(count - i, buffer_size);
44 u64 id;
45 long ret;
46
47 ret = __copy_from_user(buffer, buf + i, c);
48 if (unlikely(ret != 0)) {
49 mutex_unlock(&pmsg_lock);
50 vfree(buffer);
51 return -EFAULT;
52 }
53 psinfo->write_buf(PSTORE_TYPE_PMSG, 0, &id, 0, buffer, 0, c,
54 psinfo);
55
56 i += c;
57 }
58
59 mutex_unlock(&pmsg_lock);
60 vfree(buffer);
61 return count;
62}
63
64static const struct file_operations pmsg_fops = {
65 .owner = THIS_MODULE,
66 .llseek = noop_llseek,
67 .write = write_pmsg,
68};
69
70static struct class *pmsg_class;
71static int pmsg_major;
72#define PMSG_NAME "pmsg"
73#undef pr_fmt
74#define pr_fmt(fmt) PMSG_NAME ": " fmt
75
76static char *pmsg_devnode(struct device *dev, umode_t *mode)
77{
78 if (mode)
79 *mode = 0220;
80 return NULL;
81}
82
83void pstore_register_pmsg(void)
84{
85 struct device *pmsg_device;
86
87 pmsg_major = register_chrdev(0, PMSG_NAME, &pmsg_fops);
88 if (pmsg_major < 0) {
89 pr_err("register_chrdev failed\n");
90 goto err;
91 }
92
93 pmsg_class = class_create(THIS_MODULE, PMSG_NAME);
94 if (IS_ERR(pmsg_class)) {
95 pr_err("device class file already in use\n");
96 goto err_class;
97 }
98 pmsg_class->devnode = pmsg_devnode;
99
100 pmsg_device = device_create(pmsg_class, NULL, MKDEV(pmsg_major, 0),
101 NULL, "%s%d", PMSG_NAME, 0);
102 if (IS_ERR(pmsg_device)) {
103 pr_err("failed to create device\n");
104 goto err_device;
105 }
106 return;
107
108err_device:
109 class_destroy(pmsg_class);
110err_class:
111 unregister_chrdev(pmsg_major, PMSG_NAME);
112err:
113 return;
114}
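
/dev/pmsg0 is write-only (mode 0220 via the devnode callback) and has no read path: whatever userspace writes is streamed into a PSTORE_TYPE_PMSG record, chunked through a bounce buffer of at most two pages, and reappears after a reboot as /sys/fs/pstore/pmsg-<backend>-<id>. (Worth noting: the vmalloc() result in write_pmsg() is used without a NULL check.) Typical use, e.g. recording a shutdown reason from an init helper:

    #include <fcntl.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
            const char msg[] = "shutdown: thermal limit reached";
            int fd = open("/dev/pmsg0", O_WRONLY);

            if (fd < 0)
                    return 1;
            write(fd, msg, sizeof(msg) - 1);
            close(fd);
            return 0;
    }
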
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 8613e5b35c22..39d1373128e9 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -51,6 +51,10 @@ static ulong ramoops_ftrace_size = MIN_MEM_SIZE;
51module_param_named(ftrace_size, ramoops_ftrace_size, ulong, 0400); 51module_param_named(ftrace_size, ramoops_ftrace_size, ulong, 0400);
52MODULE_PARM_DESC(ftrace_size, "size of ftrace log"); 52MODULE_PARM_DESC(ftrace_size, "size of ftrace log");
53 53
54static ulong ramoops_pmsg_size = MIN_MEM_SIZE;
55module_param_named(pmsg_size, ramoops_pmsg_size, ulong, 0400);
56MODULE_PARM_DESC(pmsg_size, "size of user space message log");
57
54static ulong mem_address; 58static ulong mem_address;
55module_param(mem_address, ulong, 0400); 59module_param(mem_address, ulong, 0400);
56MODULE_PARM_DESC(mem_address, 60MODULE_PARM_DESC(mem_address,
@@ -82,12 +86,14 @@ struct ramoops_context {
82 struct persistent_ram_zone **przs; 86 struct persistent_ram_zone **przs;
83 struct persistent_ram_zone *cprz; 87 struct persistent_ram_zone *cprz;
84 struct persistent_ram_zone *fprz; 88 struct persistent_ram_zone *fprz;
89 struct persistent_ram_zone *mprz;
85 phys_addr_t phys_addr; 90 phys_addr_t phys_addr;
86 unsigned long size; 91 unsigned long size;
87 unsigned int memtype; 92 unsigned int memtype;
88 size_t record_size; 93 size_t record_size;
89 size_t console_size; 94 size_t console_size;
90 size_t ftrace_size; 95 size_t ftrace_size;
96 size_t pmsg_size;
91 int dump_oops; 97 int dump_oops;
92 struct persistent_ram_ecc_info ecc_info; 98 struct persistent_ram_ecc_info ecc_info;
93 unsigned int max_dump_cnt; 99 unsigned int max_dump_cnt;
@@ -96,6 +102,7 @@ struct ramoops_context {
96 unsigned int dump_read_cnt; 102 unsigned int dump_read_cnt;
97 unsigned int console_read_cnt; 103 unsigned int console_read_cnt;
98 unsigned int ftrace_read_cnt; 104 unsigned int ftrace_read_cnt;
105 unsigned int pmsg_read_cnt;
99 struct pstore_info pstore; 106 struct pstore_info pstore;
100}; 107};
101 108
@@ -109,6 +116,7 @@ static int ramoops_pstore_open(struct pstore_info *psi)
109 cxt->dump_read_cnt = 0; 116 cxt->dump_read_cnt = 0;
110 cxt->console_read_cnt = 0; 117 cxt->console_read_cnt = 0;
111 cxt->ftrace_read_cnt = 0; 118 cxt->ftrace_read_cnt = 0;
119 cxt->pmsg_read_cnt = 0;
112 return 0; 120 return 0;
113} 121}
114 122
@@ -164,6 +172,12 @@ static int ramoops_read_kmsg_hdr(char *buffer, struct timespec *time,
164 return header_length; 172 return header_length;
165} 173}
166 174
175static bool prz_ok(struct persistent_ram_zone *prz)
176{
177 return !!prz && !!(persistent_ram_old_size(prz) +
178 persistent_ram_ecc_string(prz, NULL, 0));
179}
180
167static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, 181static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
168 int *count, struct timespec *time, 182 int *count, struct timespec *time,
169 char **buf, bool *compressed, 183 char **buf, bool *compressed,
@@ -178,13 +192,16 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
178 prz = ramoops_get_next_prz(cxt->przs, &cxt->dump_read_cnt, 192 prz = ramoops_get_next_prz(cxt->przs, &cxt->dump_read_cnt,
179 cxt->max_dump_cnt, id, type, 193 cxt->max_dump_cnt, id, type,
180 PSTORE_TYPE_DMESG, 1); 194 PSTORE_TYPE_DMESG, 1);
181 if (!prz) 195 if (!prz_ok(prz))
182 prz = ramoops_get_next_prz(&cxt->cprz, &cxt->console_read_cnt, 196 prz = ramoops_get_next_prz(&cxt->cprz, &cxt->console_read_cnt,
183 1, id, type, PSTORE_TYPE_CONSOLE, 0); 197 1, id, type, PSTORE_TYPE_CONSOLE, 0);
184 if (!prz) 198 if (!prz_ok(prz))
185 prz = ramoops_get_next_prz(&cxt->fprz, &cxt->ftrace_read_cnt, 199 prz = ramoops_get_next_prz(&cxt->fprz, &cxt->ftrace_read_cnt,
186 1, id, type, PSTORE_TYPE_FTRACE, 0); 200 1, id, type, PSTORE_TYPE_FTRACE, 0);
187 if (!prz) 201 if (!prz_ok(prz))
202 prz = ramoops_get_next_prz(&cxt->mprz, &cxt->pmsg_read_cnt,
203 1, id, type, PSTORE_TYPE_PMSG, 0);
204 if (!prz_ok(prz))
188 return 0; 205 return 0;
189 206
190 if (!persistent_ram_old(prz)) 207 if (!persistent_ram_old(prz))
@@ -252,6 +269,11 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
252 return -ENOMEM; 269 return -ENOMEM;
253 persistent_ram_write(cxt->fprz, buf, size); 270 persistent_ram_write(cxt->fprz, buf, size);
254 return 0; 271 return 0;
272 } else if (type == PSTORE_TYPE_PMSG) {
273 if (!cxt->mprz)
274 return -ENOMEM;
275 persistent_ram_write(cxt->mprz, buf, size);
276 return 0;
255 } 277 }
256 278
257 if (type != PSTORE_TYPE_DMESG) 279 if (type != PSTORE_TYPE_DMESG)
@@ -309,6 +331,9 @@ static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, int count,
309 case PSTORE_TYPE_FTRACE: 331 case PSTORE_TYPE_FTRACE:
310 prz = cxt->fprz; 332 prz = cxt->fprz;
311 break; 333 break;
334 case PSTORE_TYPE_PMSG:
335 prz = cxt->mprz;
336 break;
312 default: 337 default:
313 return -EINVAL; 338 return -EINVAL;
314 } 339 }
@@ -435,7 +460,7 @@ static int ramoops_probe(struct platform_device *pdev)
435 goto fail_out; 460 goto fail_out;
436 461
437 if (!pdata->mem_size || (!pdata->record_size && !pdata->console_size && 462 if (!pdata->mem_size || (!pdata->record_size && !pdata->console_size &&
438 !pdata->ftrace_size)) { 463 !pdata->ftrace_size && !pdata->pmsg_size)) {
439 pr_err("The memory size and the record/console size must be " 464 pr_err("The memory size and the record/console size must be "
440 "non-zero\n"); 465 "non-zero\n");
441 goto fail_out; 466 goto fail_out;
@@ -447,6 +472,8 @@ static int ramoops_probe(struct platform_device *pdev)
447 pdata->console_size = rounddown_pow_of_two(pdata->console_size); 472 pdata->console_size = rounddown_pow_of_two(pdata->console_size);
448 if (pdata->ftrace_size && !is_power_of_2(pdata->ftrace_size)) 473 if (pdata->ftrace_size && !is_power_of_2(pdata->ftrace_size))
449 pdata->ftrace_size = rounddown_pow_of_two(pdata->ftrace_size); 474 pdata->ftrace_size = rounddown_pow_of_two(pdata->ftrace_size);
475 if (pdata->pmsg_size && !is_power_of_2(pdata->pmsg_size))
476 pdata->pmsg_size = rounddown_pow_of_two(pdata->pmsg_size);
450 477
451 cxt->size = pdata->mem_size; 478 cxt->size = pdata->mem_size;
452 cxt->phys_addr = pdata->mem_address; 479 cxt->phys_addr = pdata->mem_address;
@@ -454,12 +481,14 @@ static int ramoops_probe(struct platform_device *pdev)
454 cxt->record_size = pdata->record_size; 481 cxt->record_size = pdata->record_size;
455 cxt->console_size = pdata->console_size; 482 cxt->console_size = pdata->console_size;
456 cxt->ftrace_size = pdata->ftrace_size; 483 cxt->ftrace_size = pdata->ftrace_size;
484 cxt->pmsg_size = pdata->pmsg_size;
457 cxt->dump_oops = pdata->dump_oops; 485 cxt->dump_oops = pdata->dump_oops;
458 cxt->ecc_info = pdata->ecc_info; 486 cxt->ecc_info = pdata->ecc_info;
459 487
460 paddr = cxt->phys_addr; 488 paddr = cxt->phys_addr;
461 489
462 dump_mem_sz = cxt->size - cxt->console_size - cxt->ftrace_size; 490 dump_mem_sz = cxt->size - cxt->console_size - cxt->ftrace_size
491 - cxt->pmsg_size;
463 err = ramoops_init_przs(dev, cxt, &paddr, dump_mem_sz); 492 err = ramoops_init_przs(dev, cxt, &paddr, dump_mem_sz);
464 if (err) 493 if (err)
465 goto fail_out; 494 goto fail_out;
@@ -474,13 +503,9 @@ static int ramoops_probe(struct platform_device *pdev)
474 if (err) 503 if (err)
475 goto fail_init_fprz; 504 goto fail_init_fprz;
476 505
477 if (!cxt->przs && !cxt->cprz && !cxt->fprz) { 506 err = ramoops_init_prz(dev, cxt, &cxt->mprz, &paddr, cxt->pmsg_size, 0);
478 pr_err("memory size too small, minimum is %zu\n", 507 if (err)
479 cxt->console_size + cxt->record_size + 508 goto fail_init_mprz;
480 cxt->ftrace_size);
481 err = -EINVAL;
482 goto fail_cnt;
483 }
484 509
485 cxt->pstore.data = cxt; 510 cxt->pstore.data = cxt;
486 /* 511 /*
@@ -525,7 +550,8 @@ fail_buf:
525 kfree(cxt->pstore.buf); 550 kfree(cxt->pstore.buf);
526fail_clear: 551fail_clear:
527 cxt->pstore.bufsize = 0; 552 cxt->pstore.bufsize = 0;
528fail_cnt: 553 kfree(cxt->mprz);
554fail_init_mprz:
529 kfree(cxt->fprz); 555 kfree(cxt->fprz);
530fail_init_fprz: 556fail_init_fprz:
531 kfree(cxt->cprz); 557 kfree(cxt->cprz);
@@ -583,6 +609,7 @@ static void ramoops_register_dummy(void)
583 dummy_data->record_size = record_size; 609 dummy_data->record_size = record_size;
584 dummy_data->console_size = ramoops_console_size; 610 dummy_data->console_size = ramoops_console_size;
585 dummy_data->ftrace_size = ramoops_ftrace_size; 611 dummy_data->ftrace_size = ramoops_ftrace_size;
612 dummy_data->pmsg_size = ramoops_pmsg_size;
586 dummy_data->dump_oops = dump_oops; 613 dummy_data->dump_oops = dump_oops;
587 /* 614 /*
588 * For backwards compatibility ramoops.ecc=1 means 16 bytes ECC 615 * For backwards compatibility ramoops.ecc=1 means 16 bytes ECC
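
With the ramoops backend, the new region can be sized either with the module parameter (ramoops.pmsg_size=) or through board platform data; like the other regions it must be a power of two (the probe rounds it down otherwise), and it is carved out of mem_size before the dmesg dump zones are laid out. A board-file sketch with placeholder numbers:

    #include <linux/pstore_ram.h>

    static struct ramoops_platform_data ramoops_data = {
            .mem_address  = 0x8f000000,   /* placeholder reserved-RAM address */
            .mem_size     = 0x200000,     /* 2 MiB total */
            .record_size  = 0x20000,
            .console_size = 0x20000,
            .ftrace_size  = 0x20000,
            .pmsg_size    = 0x20000,      /* new member from this patch */
            .dump_oops    = 1,
    };
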
diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
index c51df1dd237e..4a09975aac90 100644
--- a/fs/quota/Kconfig
+++ b/fs/quota/Kconfig
@@ -5,6 +5,7 @@
5config QUOTA 5config QUOTA
6 bool "Quota support" 6 bool "Quota support"
7 select QUOTACTL 7 select QUOTACTL
8 select SRCU
8 help 9 help
9 If you say Y here, you will be able to set per user limits for disk 10 If you say Y here, you will be able to set per user limits for disk
10 usage (also called disk quotas). Currently, it works for the 11 usage (also called disk quotas). Currently, it works for the
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 69df5b239844..0ccd4ba3a246 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -1248,7 +1248,7 @@ static int ignore_hardlimit(struct dquot *dquot)
1248 1248
1249 return capable(CAP_SYS_RESOURCE) && 1249 return capable(CAP_SYS_RESOURCE) &&
1250 (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD || 1250 (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD ||
1251 !(info->dqi_flags & V1_DQF_RSQUASH)); 1251 !(info->dqi_flags & DQF_ROOT_SQUASH));
1252} 1252}
1253 1253
1254/* needs dq_data_lock */ 1254/* needs dq_data_lock */
@@ -2385,14 +2385,84 @@ out:
 }
 EXPORT_SYMBOL(dquot_quota_on_mount);
 
-static inline qsize_t qbtos(qsize_t blocks)
+static int dquot_quota_enable(struct super_block *sb, unsigned int flags)
 {
-	return blocks << QIF_DQBLKSIZE_BITS;
+	int ret;
+	int type;
+	struct quota_info *dqopt = sb_dqopt(sb);
+
+	if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE))
+		return -ENOSYS;
+	/* Accounting cannot be turned on while fs is mounted */
+	flags &= ~(FS_QUOTA_UDQ_ACCT | FS_QUOTA_GDQ_ACCT | FS_QUOTA_PDQ_ACCT);
+	if (!flags)
+		return -EINVAL;
+	for (type = 0; type < MAXQUOTAS; type++) {
+		if (!(flags & qtype_enforce_flag(type)))
+			continue;
+		/* Can't enforce without accounting */
+		if (!sb_has_quota_usage_enabled(sb, type))
+			return -EINVAL;
+		ret = dquot_enable(dqopt->files[type], type,
+				   dqopt->info[type].dqi_fmt_id,
+				   DQUOT_LIMITS_ENABLED);
+		if (ret < 0)
+			goto out_err;
+	}
+	return 0;
+out_err:
+	/* Backout enforcement enablement we already did */
+	for (type--; type >= 0; type--) {
+		if (flags & qtype_enforce_flag(type))
+			dquot_disable(sb, type, DQUOT_LIMITS_ENABLED);
+	}
+	/* Error code translation for better compatibility with XFS */
+	if (ret == -EBUSY)
+		ret = -EEXIST;
+	return ret;
 }
 
-static inline qsize_t stoqb(qsize_t space)
+static int dquot_quota_disable(struct super_block *sb, unsigned int flags)
 {
-	return (space + QIF_DQBLKSIZE - 1) >> QIF_DQBLKSIZE_BITS;
+	int ret;
+	int type;
+	struct quota_info *dqopt = sb_dqopt(sb);
+
+	if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE))
+		return -ENOSYS;
+	/*
+	 * We don't support turning off accounting via quotactl. In principle
+	 * quota infrastructure can do this but filesystems don't expect
+	 * userspace to be able to do it.
+	 */
+	if (flags &
+	    (FS_QUOTA_UDQ_ACCT | FS_QUOTA_GDQ_ACCT | FS_QUOTA_PDQ_ACCT))
+		return -EOPNOTSUPP;
+
+	/* Filter out limits not enabled */
+	for (type = 0; type < MAXQUOTAS; type++)
+		if (!sb_has_quota_limits_enabled(sb, type))
+			flags &= ~qtype_enforce_flag(type);
+	/* Nothing left? */
+	if (!flags)
+		return -EEXIST;
+	for (type = 0; type < MAXQUOTAS; type++) {
+		if (flags & qtype_enforce_flag(type)) {
+			ret = dquot_disable(sb, type, DQUOT_LIMITS_ENABLED);
+			if (ret < 0)
+				goto out_err;
+		}
+	}
+	return 0;
+out_err:
+	/* Backout enforcement disabling we already did */
+	for (type--; type >= 0; type--) {
+		if (flags & qtype_enforce_flag(type))
+			dquot_enable(dqopt->files[type], type,
+				     dqopt->info[type].dqi_fmt_id,
+				     DQUOT_LIMITS_ENABLED);
+	}
+	return ret;
 }
 
 /* Generic routine for getting common part of quota structure */
@@ -2444,13 +2514,13 @@ static int do_set_dqblk(struct dquot *dquot, struct qc_dqblk *di)
 		return -EINVAL;
 
 	if (((di->d_fieldmask & QC_SPC_SOFT) &&
-	     stoqb(di->d_spc_softlimit) > dqi->dqi_maxblimit) ||
+	     di->d_spc_softlimit > dqi->dqi_max_spc_limit) ||
 	    ((di->d_fieldmask & QC_SPC_HARD) &&
-	     stoqb(di->d_spc_hardlimit) > dqi->dqi_maxblimit) ||
+	     di->d_spc_hardlimit > dqi->dqi_max_spc_limit) ||
 	    ((di->d_fieldmask & QC_INO_SOFT) &&
-	     (di->d_ino_softlimit > dqi->dqi_maxilimit)) ||
+	     (di->d_ino_softlimit > dqi->dqi_max_ino_limit)) ||
 	    ((di->d_fieldmask & QC_INO_HARD) &&
-	     (di->d_ino_hardlimit > dqi->dqi_maxilimit)))
+	     (di->d_ino_hardlimit > dqi->dqi_max_ino_limit)))
 		return -ERANGE;
 
 	spin_lock(&dq_data_lock);
@@ -2577,6 +2647,14 @@ int dquot_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
 		goto out;
 	}
 	mi = sb_dqopt(sb)->info + type;
+	if (ii->dqi_valid & IIF_FLAGS) {
+		if (ii->dqi_flags & ~DQF_SETINFO_MASK ||
+		    (ii->dqi_flags & DQF_ROOT_SQUASH &&
+		     mi->dqi_format->qf_fmt_id != QFMT_VFS_OLD)) {
+			err = -EINVAL;
+			goto out;
+		}
+	}
 	spin_lock(&dq_data_lock);
 	if (ii->dqi_valid & IIF_BGRACE)
 		mi->dqi_bgrace = ii->dqi_bgrace;
@@ -2606,6 +2684,17 @@ const struct quotactl_ops dquot_quotactl_ops = {
 };
 EXPORT_SYMBOL(dquot_quotactl_ops);
 
+const struct quotactl_ops dquot_quotactl_sysfile_ops = {
+	.quota_enable	= dquot_quota_enable,
+	.quota_disable	= dquot_quota_disable,
+	.quota_sync	= dquot_quota_sync,
+	.get_info	= dquot_get_dqinfo,
+	.set_info	= dquot_set_dqinfo,
+	.get_dqblk	= dquot_get_dqblk,
+	.set_dqblk	= dquot_set_dqblk
+};
+EXPORT_SYMBOL(dquot_quotactl_sysfile_ops);
+
 static int do_proc_dqstats(struct ctl_table *table, int write,
 			   void __user *buffer, size_t *lenp, loff_t *ppos)
 {
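
The new dquot_quotactl_sysfile_ops table targets filesystems that keep quota metadata in hidden system inodes rather than user-visible quota files. A minimal sketch of opting in at mount time follows; it is not part of this patch, "myfs" is a hypothetical filesystem, and the rest of the superblock setup is elided.

	#include <linux/fs.h>
	#include <linux/quotaops.h>

	static int myfs_fill_super(struct super_block *sb, void *data, int silent)
	{
		/* ... usual superblock setup elided ... */

		sb->s_qcop = &dquot_quotactl_sysfile_ops;
		/*
		 * Assumption: both quota_enable() and quota_disable() above
		 * return -ENOSYS unless this flag is set on the sb's dqopt.
		 */
		sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE;
		return 0;
	}
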
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 6f3856328eea..d14a799c7785 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -66,18 +66,40 @@ static int quota_sync_all(int type)
 	return ret;
 }
 
+unsigned int qtype_enforce_flag(int type)
+{
+	switch (type) {
+	case USRQUOTA:
+		return FS_QUOTA_UDQ_ENFD;
+	case GRPQUOTA:
+		return FS_QUOTA_GDQ_ENFD;
+	case PRJQUOTA:
+		return FS_QUOTA_PDQ_ENFD;
+	}
+	return 0;
+}
+
 static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id,
 		         struct path *path)
 {
-	if (!sb->s_qcop->quota_on && !sb->s_qcop->quota_on_meta)
+	if (!sb->s_qcop->quota_on && !sb->s_qcop->quota_enable)
 		return -ENOSYS;
-	if (sb->s_qcop->quota_on_meta)
-		return sb->s_qcop->quota_on_meta(sb, type, id);
+	if (sb->s_qcop->quota_enable)
+		return sb->s_qcop->quota_enable(sb, qtype_enforce_flag(type));
 	if (IS_ERR(path))
 		return PTR_ERR(path);
 	return sb->s_qcop->quota_on(sb, type, id, path);
 }
 
+static int quota_quotaoff(struct super_block *sb, int type)
+{
+	if (!sb->s_qcop->quota_off && !sb->s_qcop->quota_disable)
+		return -ENOSYS;
+	if (sb->s_qcop->quota_disable)
+		return sb->s_qcop->quota_disable(sb, qtype_enforce_flag(type));
+	return sb->s_qcop->quota_off(sb, type);
+}
+
 static int quota_getfmt(struct super_block *sb, int type, void __user *addr)
 {
 	__u32 fmt;
@@ -208,15 +230,26 @@ static int quota_setquota(struct super_block *sb, int type, qid_t id,
 	return sb->s_qcop->set_dqblk(sb, qid, &fdq);
 }
 
-static int quota_setxstate(struct super_block *sb, int cmd, void __user *addr)
+static int quota_enable(struct super_block *sb, void __user *addr)
 {
 	__u32 flags;
 
 	if (copy_from_user(&flags, addr, sizeof(flags)))
 		return -EFAULT;
-	if (!sb->s_qcop->set_xstate)
+	if (!sb->s_qcop->quota_enable)
 		return -ENOSYS;
-	return sb->s_qcop->set_xstate(sb, flags, cmd);
+	return sb->s_qcop->quota_enable(sb, flags);
+}
+
+static int quota_disable(struct super_block *sb, void __user *addr)
+{
+	__u32 flags;
+
+	if (copy_from_user(&flags, addr, sizeof(flags)))
+		return -EFAULT;
+	if (!sb->s_qcop->quota_disable)
+		return -ENOSYS;
+	return sb->s_qcop->quota_disable(sb, flags);
 }
 
 static int quota_getxstate(struct super_block *sb, void __user *addr)
@@ -429,9 +462,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
 	case Q_QUOTAON:
 		return quota_quotaon(sb, type, cmd, id, path);
 	case Q_QUOTAOFF:
-		if (!sb->s_qcop->quota_off)
-			return -ENOSYS;
-		return sb->s_qcop->quota_off(sb, type);
+		return quota_quotaoff(sb, type);
 	case Q_GETFMT:
 		return quota_getfmt(sb, type, addr);
 	case Q_GETINFO:
@@ -447,8 +478,9 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
 			return -ENOSYS;
 		return sb->s_qcop->quota_sync(sb, type);
 	case Q_XQUOTAON:
+		return quota_enable(sb, addr);
 	case Q_XQUOTAOFF:
-		return quota_setxstate(sb, cmd, addr);
+		return quota_disable(sb, addr);
 	case Q_XQUOTARM:
 		return quota_rmxquota(sb, addr);
 	case Q_XGETQSTAT:
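
With quota_enable()/quota_disable() wired into Q_XQUOTAON/Q_XQUOTAOFF, the XFS-style quotactl interface now reaches VFS-quota filesystems too. A hedged userspace sketch of turning on user-quota enforcement; the device path is a placeholder and the exact header providing FS_QUOTA_UDQ_ENFD may vary by libc/distribution.

	#include <stdio.h>
	#include <string.h>
	#include <errno.h>
	#include <sys/quota.h>
	#include <linux/dqblk_xfs.h>	/* Q_XQUOTAON, FS_QUOTA_UDQ_ENFD */

	int main(void)
	{
		/* Enforcement only: accounting cannot be toggled this way. */
		unsigned int flags = FS_QUOTA_UDQ_ENFD;

		if (quotactl(QCMD(Q_XQUOTAON, USRQUOTA), "/dev/sda1", 0,
			     (void *)&flags) < 0)
			fprintf(stderr, "Q_XQUOTAON: %s\n", strerror(errno));
		return 0;
	}
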
diff --git a/fs/quota/quota_v1.c b/fs/quota/quota_v1.c
index 469c6848b322..8fe79beced5c 100644
--- a/fs/quota/quota_v1.c
+++ b/fs/quota/quota_v1.c
@@ -169,8 +169,8 @@ static int v1_read_file_info(struct super_block *sb, int type)
 	}
 	ret = 0;
 	/* limits are stored as unsigned 32-bit data */
-	dqopt->info[type].dqi_maxblimit = 0xffffffff;
-	dqopt->info[type].dqi_maxilimit = 0xffffffff;
+	dqopt->info[type].dqi_max_spc_limit = 0xffffffffULL << QUOTABLOCK_BITS;
+	dqopt->info[type].dqi_max_ino_limit = 0xffffffff;
 	dqopt->info[type].dqi_igrace =
 			dqblk.dqb_itime ? dqblk.dqb_itime : MAX_IQ_TIME;
 	dqopt->info[type].dqi_bgrace =
diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c
index 02751ec695c5..9cb10d7197f7 100644
--- a/fs/quota/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -117,16 +117,17 @@ static int v2_read_file_info(struct super_block *sb, int type)
 	qinfo = info->dqi_priv;
 	if (version == 0) {
 		/* limits are stored as unsigned 32-bit data */
-		info->dqi_maxblimit = 0xffffffff;
-		info->dqi_maxilimit = 0xffffffff;
+		info->dqi_max_spc_limit = 0xffffffffULL << QUOTABLOCK_BITS;
+		info->dqi_max_ino_limit = 0xffffffff;
 	} else {
-		/* used space is stored as unsigned 64-bit value */
-		info->dqi_maxblimit = 0xffffffffffffffffULL;	/* 2^64-1 */
-		info->dqi_maxilimit = 0xffffffffffffffffULL;
+		/* used space is stored as unsigned 64-bit value in bytes */
+		info->dqi_max_spc_limit = 0xffffffffffffffffULL;	/* 2^64-1 */
+		info->dqi_max_ino_limit = 0xffffffffffffffffULL;
 	}
 	info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace);
 	info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace);
-	info->dqi_flags = le32_to_cpu(dinfo.dqi_flags);
+	/* No flags currently supported */
+	info->dqi_flags = 0;
 	qinfo->dqi_sb = sb;
 	qinfo->dqi_type = type;
 	qinfo->dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
@@ -157,7 +158,8 @@ static int v2_write_file_info(struct super_block *sb, int type)
 	info->dqi_flags &= ~DQF_INFO_DIRTY;
 	dinfo.dqi_bgrace = cpu_to_le32(info->dqi_bgrace);
 	dinfo.dqi_igrace = cpu_to_le32(info->dqi_igrace);
-	dinfo.dqi_flags = cpu_to_le32(info->dqi_flags & DQF_MASK);
+	/* No flags currently supported */
+	dinfo.dqi_flags = cpu_to_le32(0);
 	spin_unlock(&dq_data_lock);
 	dinfo.dqi_blocks = cpu_to_le32(qinfo->dqi_blocks);
 	dinfo.dqi_free_blk = cpu_to_le32(qinfo->dqi_free_blk);
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index bbafbde3471a..f6ab41b39612 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -34,7 +34,14 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
 						   unsigned long flags);
 static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma);
 
+static unsigned ramfs_mmap_capabilities(struct file *file)
+{
+	return NOMMU_MAP_DIRECT | NOMMU_MAP_COPY | NOMMU_MAP_READ |
+		NOMMU_MAP_WRITE | NOMMU_MAP_EXEC;
+}
+
 const struct file_operations ramfs_file_operations = {
+	.mmap_capabilities	= ramfs_mmap_capabilities,
 	.mmap			= ramfs_nommu_mmap,
 	.get_unmapped_area	= ramfs_nommu_get_unmapped_area,
 	.read			= new_sync_read,
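
Under !CONFIG_MMU the capability bits that used to live on a backing_dev_info now come from the file itself via ->mmap_capabilities(). A sketch of what a driver that can only satisfy private mappings by copying might report; "mydrv" is hypothetical.

	#include <linux/fs.h>

	static unsigned mydrv_mmap_capabilities(struct file *file)
	{
		return NOMMU_MAP_COPY;	/* MAP_PRIVATE via copy only */
	}
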
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index d365b1c4eb3c..889d558b4e05 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -50,14 +50,6 @@ static const struct address_space_operations ramfs_aops = {
 	.set_page_dirty	= __set_page_dirty_no_writeback,
 };
 
-static struct backing_dev_info ramfs_backing_dev_info = {
-	.name		= "ramfs",
-	.ra_pages	= 0,	/* No readahead */
-	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK |
-			  BDI_CAP_MAP_DIRECT | BDI_CAP_MAP_COPY |
-			  BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP,
-};
-
 struct inode *ramfs_get_inode(struct super_block *sb,
 				const struct inode *dir, umode_t mode, dev_t dev)
 {
@@ -67,7 +59,6 @@ struct inode *ramfs_get_inode(struct super_block *sb,
 		inode->i_ino = get_next_ino();
 		inode_init_owner(inode, dir, mode);
 		inode->i_mapping->a_ops = &ramfs_aops;
-		inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
 		mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
 		mapping_set_unevictable(inode->i_mapping);
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
@@ -267,19 +258,9 @@ static struct file_system_type ramfs_fs_type = {
 int __init init_ramfs_fs(void)
 {
 	static unsigned long once;
-	int err;
 
 	if (test_and_set_bit(0, &once))
 		return 0;
-
-	err = bdi_init(&ramfs_backing_dev_info);
-	if (err)
-		return err;
-
-	err = register_filesystem(&ramfs_fs_type);
-	if (err)
-		bdi_destroy(&ramfs_backing_dev_info);
-
-	return err;
+	return register_filesystem(&ramfs_fs_type);
 }
 fs_initcall(init_ramfs_fs);
diff --git a/fs/read_write.c b/fs/read_write.c
index c0805c93b6fa..8e1b68786d66 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -333,6 +333,52 @@ out_putf:
 }
 #endif
 
+ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos)
+{
+	struct kiocb kiocb;
+	ssize_t ret;
+
+	if (!file->f_op->read_iter)
+		return -EINVAL;
+
+	init_sync_kiocb(&kiocb, file);
+	kiocb.ki_pos = *ppos;
+	kiocb.ki_nbytes = iov_iter_count(iter);
+
+	iter->type |= READ;
+	ret = file->f_op->read_iter(&kiocb, iter);
+	if (ret == -EIOCBQUEUED)
+		ret = wait_on_sync_kiocb(&kiocb);
+
+	if (ret > 0)
+		*ppos = kiocb.ki_pos;
+	return ret;
+}
+EXPORT_SYMBOL(vfs_iter_read);
+
+ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos)
+{
+	struct kiocb kiocb;
+	ssize_t ret;
+
+	if (!file->f_op->write_iter)
+		return -EINVAL;
+
+	init_sync_kiocb(&kiocb, file);
+	kiocb.ki_pos = *ppos;
+	kiocb.ki_nbytes = iov_iter_count(iter);
+
+	iter->type |= WRITE;
+	ret = file->f_op->write_iter(&kiocb, iter);
+	if (ret == -EIOCBQUEUED)
+		ret = wait_on_sync_kiocb(&kiocb);
+
+	if (ret > 0)
+		*ppos = kiocb.ki_pos;
+	return ret;
+}
+EXPORT_SYMBOL(vfs_iter_write);
+
 /*
  * rw_verify_area doesn't like huge counts. We limit
  * them to something that fits in "int" so that others
@@ -358,7 +404,7 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t
 		return retval;
 	}
 
-	if (unlikely(inode->i_flock && mandatory_lock(inode))) {
+	if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
 		retval = locks_mandatory_area(
 			read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
 			inode, file, pos, count);
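
vfs_iter_read()/vfs_iter_write() give in-kernel callers a single entry point for ->read_iter/->write_iter. A hedged sketch of reading into a kernel buffer; the caller and the iov_iter_kvec() signature of this era are assumptions.

	#include <linux/fs.h>
	#include <linux/uio.h>

	static ssize_t read_kernel_buf(struct file *filp, void *buf, size_t len,
				       loff_t *pos)
	{
		struct kvec kv = { .iov_base = buf, .iov_len = len };
		struct iov_iter iter;

		iov_iter_kvec(&iter, READ | ITER_KVEC, &kv, 1, len);
		/* on success *pos has been advanced by the helper */
		return vfs_iter_read(filp, &iter, pos);
	}
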
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index a7eec9888f10..e72401e1f995 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -2766,7 +2766,7 @@ static int reiserfs_write_begin(struct file *file,
 	int old_ref = 0;
 
 	inode = mapping->host;
-	*fsdata = 0;
+	*fsdata = NULL;
 	if (flags & AOP_FLAG_CONT_EXPAND &&
 	    (pos & (inode->i_sb->s_blocksize - 1)) == 0) {
 		pos ++;
diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c
index ea06c7554860..7da9e2153953 100644
--- a/fs/romfs/mmap-nommu.c
+++ b/fs/romfs/mmap-nommu.c
@@ -70,6 +70,15 @@ static int romfs_mmap(struct file *file, struct vm_area_struct *vma)
 	return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -ENOSYS;
 }
 
+static unsigned romfs_mmap_capabilities(struct file *file)
+{
+	struct mtd_info *mtd = file_inode(file)->i_sb->s_mtd;
+
+	if (!mtd)
+		return NOMMU_MAP_COPY;
+	return mtd_mmap_capabilities(mtd);
+}
+
 const struct file_operations romfs_ro_fops = {
 	.llseek			= generic_file_llseek,
 	.read			= new_sync_read,
@@ -77,4 +86,5 @@ const struct file_operations romfs_ro_fops = {
 	.splice_read		= generic_file_splice_read,
 	.mmap			= romfs_mmap,
 	.get_unmapped_area	= romfs_get_unmapped_area,
+	.mmap_capabilities	= romfs_mmap_capabilities,
 };
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index e98dd88197d5..268733cda397 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -355,9 +355,6 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos)
 	case ROMFH_REG:
 		i->i_fop = &romfs_ro_fops;
 		i->i_data.a_ops = &romfs_aops;
-		if (i->i_sb->s_mtd)
-			i->i_data.backing_dev_info =
-				i->i_sb->s_mtd->backing_dev_info;
 		if (nextfh & ROMFH_EXEC)
 			mode |= S_IXUGO;
 		break;
diff --git a/fs/select.c b/fs/select.c
index 467bb1cb3ea5..f684c750e08a 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -971,7 +971,7 @@ SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
 	if (ret == -EINTR) {
 		struct restart_block *restart_block;
 
-		restart_block = &current_thread_info()->restart_block;
+		restart_block = &current->restart_block;
 		restart_block->fn = do_restart_poll;
 		restart_block->poll.ufds = ufds;
 		restart_block->poll.nfds = nfds;
diff --git a/fs/seq_file.c b/fs/seq_file.c
index dbf3a59c86bb..555f82155be8 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -539,38 +539,6 @@ int seq_dentry(struct seq_file *m, struct dentry *dentry, const char *esc)
 	return res;
 }
 
-int seq_bitmap(struct seq_file *m, const unsigned long *bits,
-				   unsigned int nr_bits)
-{
-	if (m->count < m->size) {
-		int len = bitmap_scnprintf(m->buf + m->count,
-				m->size - m->count, bits, nr_bits);
-		if (m->count + len < m->size) {
-			m->count += len;
-			return 0;
-		}
-	}
-	seq_set_overflow(m);
-	return -1;
-}
-EXPORT_SYMBOL(seq_bitmap);
-
-int seq_bitmap_list(struct seq_file *m, const unsigned long *bits,
-		unsigned int nr_bits)
-{
-	if (m->count < m->size) {
-		int len = bitmap_scnlistprintf(m->buf + m->count,
-				m->size - m->count, bits, nr_bits);
-		if (m->count + len < m->size) {
-			m->count += len;
-			return 0;
-		}
-	}
-	seq_set_overflow(m);
-	return -1;
-}
-EXPORT_SYMBOL(seq_bitmap_list);
-
 static void *single_start(struct seq_file *p, loff_t *pos)
 {
 	return NULL + (*pos == 0);
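
Callers of the removed helpers can presumably get the same output through the %*pb and %*pbl vsprintf extensions from the same development cycle; a short sketch, assuming a seq_file context:

	#include <linux/seq_file.h>

	static void show_mask(struct seq_file *m, const unsigned long *mask, int nbits)
	{
		seq_printf(m, "mask=%*pb\n", nbits, mask);	/* hex bitmap */
		seq_printf(m, "list=%*pbl\n", nbits, mask);	/* range list */
	}
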
diff --git a/fs/splice.c b/fs/splice.c
index 75c6058eabf2..7968da96bebb 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -961,7 +961,6 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 	splice_from_pipe_begin(&sd);
 	while (sd.total_len) {
 		struct iov_iter from;
-		struct kiocb kiocb;
 		size_t left;
 		int n, idx;
 
@@ -1005,29 +1004,15 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 			left -= this_len;
 		}
 
-		/* ... iov_iter */
-		from.type = ITER_BVEC | WRITE;
-		from.bvec = array;
-		from.nr_segs = n;
-		from.count = sd.total_len - left;
-		from.iov_offset = 0;
-
-		/* ... and iocb */
-		init_sync_kiocb(&kiocb, out);
-		kiocb.ki_pos = sd.pos;
-		kiocb.ki_nbytes = sd.total_len - left;
-
-		/* now, send it */
-		ret = out->f_op->write_iter(&kiocb, &from);
-		if (-EIOCBQUEUED == ret)
-			ret = wait_on_sync_kiocb(&kiocb);
-
+		iov_iter_bvec(&from, ITER_BVEC | WRITE, array, n,
+			      sd.total_len - left);
+		ret = vfs_iter_write(out, &from, &sd.pos);
 		if (ret <= 0)
 			break;
 
 		sd.num_spliced += ret;
 		sd.total_len -= ret;
-		*ppos = sd.pos = kiocb.ki_pos;
+		*ppos = sd.pos;
 
 		/* dismiss the fully eaten buffers, adjust the partial one */
 		while (ret) {
diff --git a/fs/super.c b/fs/super.c
index eae088f6aaae..65a53efc1cf4 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -36,8 +36,8 @@
 #include "internal.h"
 
 
-LIST_HEAD(super_blocks);
-DEFINE_SPINLOCK(sb_lock);
+static LIST_HEAD(super_blocks);
+static DEFINE_SPINLOCK(sb_lock);
 
 static char *sb_writers_name[SB_FREEZE_LEVELS] = {
 	"sb_writers",
@@ -75,10 +75,10 @@ static unsigned long super_cache_scan(struct shrinker *shrink,
 		return SHRINK_STOP;
 
 	if (sb->s_op->nr_cached_objects)
-		fs_objects = sb->s_op->nr_cached_objects(sb, sc->nid);
+		fs_objects = sb->s_op->nr_cached_objects(sb, sc);
 
-	inodes = list_lru_count_node(&sb->s_inode_lru, sc->nid);
-	dentries = list_lru_count_node(&sb->s_dentry_lru, sc->nid);
+	inodes = list_lru_shrink_count(&sb->s_inode_lru, sc);
+	dentries = list_lru_shrink_count(&sb->s_dentry_lru, sc);
 	total_objects = dentries + inodes + fs_objects + 1;
 	if (!total_objects)
 		total_objects = 1;
@@ -86,19 +86,23 @@ static unsigned long super_cache_scan(struct shrinker *shrink,
 	/* proportion the scan between the caches */
 	dentries = mult_frac(sc->nr_to_scan, dentries, total_objects);
 	inodes = mult_frac(sc->nr_to_scan, inodes, total_objects);
+	fs_objects = mult_frac(sc->nr_to_scan, fs_objects, total_objects);
 
 	/*
 	 * prune the dcache first as the icache is pinned by it, then
 	 * prune the icache, followed by the filesystem specific caches
+	 *
+	 * Ensure that we always scan at least one object - memcg kmem
+	 * accounting uses this to fully empty the caches.
 	 */
-	freed = prune_dcache_sb(sb, dentries, sc->nid);
-	freed += prune_icache_sb(sb, inodes, sc->nid);
+	sc->nr_to_scan = dentries + 1;
+	freed = prune_dcache_sb(sb, sc);
+	sc->nr_to_scan = inodes + 1;
+	freed += prune_icache_sb(sb, sc);
 
 	if (fs_objects) {
-		fs_objects = mult_frac(sc->nr_to_scan, fs_objects,
-								total_objects);
-		freed += sb->s_op->free_cached_objects(sb, fs_objects,
-						       sc->nid);
+		sc->nr_to_scan = fs_objects + 1;
+		freed += sb->s_op->free_cached_objects(sb, sc);
 	}
 
 	drop_super(sb);
@@ -118,17 +122,14 @@ static unsigned long super_cache_count(struct shrinker *shrink,
 	 * scalability bottleneck. The counts could get updated
 	 * between super_cache_count and super_cache_scan anyway.
 	 * Call to super_cache_count with shrinker_rwsem held
-	 * ensures the safety of call to list_lru_count_node() and
+	 * ensures the safety of call to list_lru_shrink_count() and
 	 * s_op->nr_cached_objects().
 	 */
 	if (sb->s_op && sb->s_op->nr_cached_objects)
-		total_objects = sb->s_op->nr_cached_objects(sb,
-						 sc->nid);
+		total_objects = sb->s_op->nr_cached_objects(sb, sc);
 
-	total_objects += list_lru_count_node(&sb->s_dentry_lru,
-						 sc->nid);
-	total_objects += list_lru_count_node(&sb->s_inode_lru,
-						 sc->nid);
+	total_objects += list_lru_shrink_count(&sb->s_dentry_lru, sc);
+	total_objects += list_lru_shrink_count(&sb->s_inode_lru, sc);
 
 	total_objects = vfs_pressure_ratio(total_objects);
 	return total_objects;
@@ -185,15 +186,15 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
 	}
 	init_waitqueue_head(&s->s_writers.wait);
 	init_waitqueue_head(&s->s_writers.wait_unfrozen);
+	s->s_bdi = &noop_backing_dev_info;
 	s->s_flags = flags;
-	s->s_bdi = &default_backing_dev_info;
 	INIT_HLIST_NODE(&s->s_instances);
 	INIT_HLIST_BL_HEAD(&s->s_anon);
 	INIT_LIST_HEAD(&s->s_inodes);
 
-	if (list_lru_init(&s->s_dentry_lru))
+	if (list_lru_init_memcg(&s->s_dentry_lru))
 		goto fail;
-	if (list_lru_init(&s->s_inode_lru))
+	if (list_lru_init_memcg(&s->s_inode_lru))
 		goto fail;
 
 	init_rwsem(&s->s_umount);
@@ -229,7 +230,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
 	s->s_shrink.scan_objects = super_cache_scan;
 	s->s_shrink.count_objects = super_cache_count;
 	s->s_shrink.batch = 1024;
-	s->s_shrink.flags = SHRINKER_NUMA_AWARE;
+	s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE;
 	return s;
 
 fail:
@@ -284,6 +285,14 @@ void deactivate_locked_super(struct super_block *s)
 		unregister_shrinker(&s->s_shrink);
 		fs->kill_sb(s);
 
+		/*
+		 * Since list_lru_destroy() may sleep, we cannot call it from
+		 * put_super(), where we hold the sb_lock. Therefore we destroy
+		 * the lru lists right now.
+		 */
+		list_lru_destroy(&s->s_dentry_lru);
+		list_lru_destroy(&s->s_inode_lru);
+
 		put_filesystem(fs);
 		put_super(s);
 	} else {
@@ -706,9 +715,9 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 	remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
 
 	if (remount_ro) {
-		if (sb->s_pins.first) {
+		if (!hlist_empty(&sb->s_pins)) {
 			up_write(&sb->s_umount);
-			sb_pin_kill(sb);
+			group_pin_kill(&sb->s_pins);
 			down_write(&sb->s_umount);
 			if (!sb->s_root)
 				return 0;
@@ -863,10 +872,7 @@ EXPORT_SYMBOL(free_anon_bdev);
 
 int set_anon_super(struct super_block *s, void *data)
 {
-	int error = get_anon_bdev(&s->s_dev);
-	if (!error)
-		s->s_bdi = &noop_backing_dev_info;
-	return error;
+	return get_anon_bdev(&s->s_dev);
 }
 
 EXPORT_SYMBOL(set_anon_super);
@@ -1111,7 +1117,6 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
 	sb = root->d_sb;
 	BUG_ON(!sb);
 	WARN_ON(!sb->s_bdi);
-	WARN_ON(sb->s_bdi == &default_backing_dev_info);
 	sb->s_flags |= MS_BORN;
 
 	error = security_sb_kern_mount(sb, flags, secdata);
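
A worked example of the proportional split in super_cache_scan(), as a standalone userspace demo (numbers are made up; mult_frac is copied from include/linux/kernel.h and relies on GCC statement expressions):

	#include <stdio.h>

	#define mult_frac(x, numer, denom) ({			\
		typeof(x) quot = (x) / (denom);			\
		typeof(x) rem  = (x) % (denom);			\
		(quot * (numer)) + ((rem * (numer)) / (denom));	\
	})

	int main(void)
	{
		unsigned long nr_to_scan = 128, dentries = 600, inodes = 300,
			      fs_objects = 100;
		unsigned long total = dentries + inodes + fs_objects + 1;

		/* each cache gets its share, plus one so memcg can drain it */
		printf("dentries: %lu\n", mult_frac(nr_to_scan, dentries, total) + 1);
		printf("inodes:   %lu\n", mult_frac(nr_to_scan, inodes, total) + 1);
		printf("fs objs:  %lu\n", mult_frac(nr_to_scan, fs_objects, total) + 1);
		return 0;
	}
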
diff --git a/fs/sync.c b/fs/sync.c
index 01d9f18a70b5..fbc98ee62044 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -177,8 +177,16 @@ SYSCALL_DEFINE1(syncfs, int, fd)
  */
 int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
 {
+	struct inode *inode = file->f_mapping->host;
+
 	if (!file->f_op->fsync)
 		return -EINVAL;
+	if (!datasync && (inode->i_state & I_DIRTY_TIME)) {
+		spin_lock(&inode->i_lock);
+		inode->i_state &= ~I_DIRTY_TIME;
+		spin_unlock(&inode->i_lock);
+		mark_inode_dirty_sync(inode);
+	}
 	return file->f_op->fsync(file, start, end, datasync);
 }
 EXPORT_SYMBOL(vfs_fsync_range);
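
The I_DIRTY_TIME handling above is what makes an explicit fsync() (datasync == 0) reliable on lazytime mounts: deferred timestamp updates are promoted to real inode dirtiness before ->fsync runs. A hedged userspace sketch, assuming a lazytime mount at a placeholder path:

	#include <fcntl.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/mnt/lazytime/file", O_WRONLY);

		if (fd < 0)
			return 1;
		if (write(fd, "x", 1) == 1)	/* mtime updated in memory only */
			fsync(fd);		/* clears I_DIRTY_TIME, writes inode */
		close(fd);
		return 0;
	}
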
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index dfe928a9540f..7c2867b44141 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -295,7 +295,7 @@ int sysfs_add_file_mode_ns(struct kernfs_node *parent,
 	key = attr->key ?: (struct lock_class_key *)&attr->skey;
 #endif
 	kn = __kernfs_create_file(parent, attr->name, mode & 0777, size, ops,
-				  (void *)attr, ns, true, key);
+				  (void *)attr, ns, key);
 	if (IS_ERR(kn)) {
 		if (PTR_ERR(kn) == -EEXIST)
 			sysfs_warn_dup(parent, attr->name);
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 7d2a860ba788..2554d8835b48 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -99,7 +99,7 @@ static int internal_create_group(struct kobject *kobj, int update,
 		return -EINVAL;
 	if (!grp->attrs && !grp->bin_attrs) {
 		WARN(1, "sysfs: (bin_)attrs not set by subsystem for group: %s/%s\n",
-			kobj->name, grp->name ? "" : grp->name);
+			kobj->name, grp->name ?: "");
 		return -EINVAL;
 	}
 	if (grp->name) {
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 7ed13e1e216a..4cfb3e82c56f 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -2032,6 +2032,8 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr,
 		long long blk_offs;
 		struct ubifs_data_node *dn = node;
 
+		ubifs_assert(zbr->len >= UBIFS_DATA_NODE_SZ);
+
 		/*
 		 * Search the inode node this data node belongs to and insert
 		 * it to the RB-tree of inodes.
@@ -2060,6 +2062,8 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr,
 		struct ubifs_dent_node *dent = node;
 		struct fsck_inode *fscki1;
 
+		ubifs_assert(zbr->len >= UBIFS_DENT_NODE_SZ);
+
 		err = ubifs_validate_entry(c, dent);
 		if (err)
 			goto out_dump;
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index ea41649e4ca5..0fa6c803992e 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -108,8 +108,6 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
 	inode->i_mtime = inode->i_atime = inode->i_ctime =
 			 ubifs_current_time(inode);
 	inode->i_mapping->nrpages = 0;
-	/* Disable readahead */
-	inode->i_mapping->backing_dev_info = &c->bdi;
 
 	switch (mode & S_IFMT) {
 	case S_IFREG:
@@ -272,6 +270,10 @@ static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 		goto out_budg;
 	}
 
+	err = ubifs_init_security(dir, inode, &dentry->d_name);
+	if (err)
+		goto out_cancel;
+
 	mutex_lock(&dir_ui->ui_mutex);
 	dir->i_size += sz_change;
 	dir_ui->ui_size = dir->i_size;
@@ -728,6 +730,10 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 		goto out_budg;
 	}
 
+	err = ubifs_init_security(dir, inode, &dentry->d_name);
+	if (err)
+		goto out_cancel;
+
 	mutex_lock(&dir_ui->ui_mutex);
 	insert_inode_hash(inode);
 	inc_nlink(inode);
@@ -808,6 +814,10 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry,
 	ui->data = dev;
 	ui->data_len = devlen;
 
+	err = ubifs_init_security(dir, inode, &dentry->d_name);
+	if (err)
+		goto out_cancel;
+
 	mutex_lock(&dir_ui->ui_mutex);
 	dir->i_size += sz_change;
 	dir_ui->ui_size = dir->i_size;
@@ -884,6 +894,10 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry,
 	ui->data_len = len;
 	inode->i_size = ubifs_inode(inode)->ui_size = len;
 
+	err = ubifs_init_security(dir, inode, &dentry->d_name);
+	if (err)
+		goto out_cancel;
+
 	mutex_lock(&dir_ui->ui_mutex);
 	dir->i_size += sz_change;
 	dir_ui->ui_size = dir->i_size;
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 538519ee37d9..e627c0acf626 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1536,7 +1536,6 @@ static const struct vm_operations_struct ubifs_file_vm_ops = {
 	.fault        = filemap_fault,
 	.map_pages = filemap_map_pages,
 	.page_mkwrite = ubifs_vm_page_mkwrite,
-	.remap_pages = generic_file_remap_pages,
 };
 
 static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma)
@@ -1574,6 +1573,10 @@ const struct inode_operations ubifs_symlink_inode_operations = {
 	.follow_link = ubifs_follow_link,
 	.setattr     = ubifs_setattr,
 	.getattr     = ubifs_getattr,
+	.setxattr    = ubifs_setxattr,
+	.getxattr    = ubifs_getxattr,
+	.listxattr   = ubifs_listxattr,
+	.removexattr = ubifs_removexattr,
 };
 
 const struct file_operations ubifs_file_operations = {
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 3187925e9879..9b40a1c5e160 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -1028,9 +1028,22 @@ int ubifs_replay_journal(struct ubifs_info *c)
 
 	do {
 		err = replay_log_leb(c, lnum, 0, c->sbuf);
-		if (err == 1)
-			/* We hit the end of the log */
-			break;
+		if (err == 1) {
+			if (lnum != c->lhead_lnum)
+				/* We hit the end of the log */
+				break;
+
+			/*
+			 * The head of the log must always start with the
+			 * "commit start" node on a properly formatted UBIFS.
+			 * But we found no nodes at all, which means that
+			 * something went wrong and we cannot proceed mounting
+			 * the file-system.
+			 */
+			ubifs_err("no UBIFS nodes found at the log head LEB %d:%d, possibly corrupted",
+				  lnum, 0);
+			err = -EINVAL;
+		}
 		if (err)
 			goto out;
 		lnum = ubifs_next_log_lnum(c, lnum);
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 106bf20629ce..93e946561c5c 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -156,9 +156,6 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum)
 	if (err)
 		goto out_invalid;
 
-	/* Disable read-ahead */
-	inode->i_mapping->backing_dev_info = &c->bdi;
-
 	switch (inode->i_mode & S_IFMT) {
 	case S_IFREG:
 		inode->i_mapping->a_ops = &ubifs_file_address_operations;
@@ -2017,7 +2014,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
 	 * Read-ahead will be disabled because @c->bdi.ra_pages is 0.
 	 */
 	c->bdi.name = "ubifs",
-	c->bdi.capabilities = BDI_CAP_MAP_COPY;
+	c->bdi.capabilities = 0;
 	err = bdi_init(&c->bdi);
 	if (err)
 		goto out_close;
@@ -2039,6 +2036,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
 	if (c->max_inode_sz > MAX_LFS_FILESIZE)
 		sb->s_maxbytes = c->max_inode_sz = MAX_LFS_FILESIZE;
 	sb->s_op = &ubifs_super_operations;
+	sb->s_xattr = ubifs_xattr_handlers;
 
 	mutex_lock(&c->umount_mutex);
 	err = mount_ubifs(c);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index c4fe900c67ab..bc04b9c69891 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -36,6 +36,7 @@
 #include <linux/mtd/ubi.h>
 #include <linux/pagemap.h>
 #include <linux/backing-dev.h>
+#include <linux/security.h>
 #include "ubifs-media.h"
 
 /* Version of this UBIFS implementation */
@@ -1465,6 +1466,7 @@ extern spinlock_t ubifs_infos_lock;
 extern atomic_long_t ubifs_clean_zn_cnt;
 extern struct kmem_cache *ubifs_inode_slab;
 extern const struct super_operations ubifs_super_operations;
+extern const struct xattr_handler *ubifs_xattr_handlers[];
 extern const struct address_space_operations ubifs_file_address_operations;
 extern const struct file_operations ubifs_file_operations;
 extern const struct inode_operations ubifs_file_inode_operations;
@@ -1754,6 +1756,8 @@ ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf,
 		       size_t size);
 ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size);
 int ubifs_removexattr(struct dentry *dentry, const char *name);
+int ubifs_init_security(struct inode *dentry, struct inode *inode,
+			const struct qstr *qstr);
 
 /* super.c */
 struct inode *ubifs_iget(struct super_block *sb, unsigned long inum);
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 5e0a63b1b0d5..a92be244a6fb 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -100,24 +100,30 @@ static const struct file_operations empty_fops;
 static int create_xattr(struct ubifs_info *c, struct inode *host,
 			const struct qstr *nm, const void *value, int size)
 {
-	int err;
+	int err, names_len;
 	struct inode *inode;
 	struct ubifs_inode *ui, *host_ui = ubifs_inode(host);
 	struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
 				.new_ino_d = ALIGN(size, 8), .dirtied_ino = 1,
 				.dirtied_ino_d = ALIGN(host_ui->data_len, 8) };
 
-	if (host_ui->xattr_cnt >= MAX_XATTRS_PER_INODE)
+	if (host_ui->xattr_cnt >= MAX_XATTRS_PER_INODE) {
+		ubifs_err("inode %lu already has too many xattrs (%d), cannot create more",
+			  host->i_ino, host_ui->xattr_cnt);
 		return -ENOSPC;
+	}
 	/*
 	 * Linux limits the maximum size of the extended attribute names list
 	 * to %XATTR_LIST_MAX. This means we should not allow creating more
 	 * extended attributes if the name list becomes larger. This limitation
 	 * is artificial for UBIFS, though.
 	 */
-	if (host_ui->xattr_names + host_ui->xattr_cnt +
-					nm->len + 1 > XATTR_LIST_MAX)
+	names_len = host_ui->xattr_names + host_ui->xattr_cnt + nm->len + 1;
+	if (names_len > XATTR_LIST_MAX) {
+		ubifs_err("cannot add one more xattr name to inode %lu, total names length would become %d, max. is %d",
+			  host->i_ino, names_len, XATTR_LIST_MAX);
 		return -ENOSPC;
+	}
 
 	err = ubifs_budget_space(c, &req);
 	if (err)
@@ -293,18 +299,16 @@ static struct inode *iget_xattr(struct ubifs_info *c, ino_t inum)
 	return ERR_PTR(-EINVAL);
 }
 
-int ubifs_setxattr(struct dentry *dentry, const char *name,
-		   const void *value, size_t size, int flags)
+static int setxattr(struct inode *host, const char *name, const void *value,
+		    size_t size, int flags)
 {
-	struct inode *inode, *host = dentry->d_inode;
+	struct inode *inode;
 	struct ubifs_info *c = host->i_sb->s_fs_info;
 	struct qstr nm = QSTR_INIT(name, strlen(name));
 	struct ubifs_dent_node *xent;
 	union ubifs_key key;
 	int err, type;
 
-	dbg_gen("xattr '%s', host ino %lu ('%pd'), size %zd", name,
-		host->i_ino, dentry, size);
 	ubifs_assert(mutex_is_locked(&host->i_mutex));
 
 	if (size > UBIFS_MAX_INO_DATA)
@@ -356,6 +360,15 @@ out_free:
 	return err;
 }
 
+int ubifs_setxattr(struct dentry *dentry, const char *name,
+		   const void *value, size_t size, int flags)
+{
+	dbg_gen("xattr '%s', host ino %lu ('%pd'), size %zd",
+		name, dentry->d_inode->i_ino, dentry, size);
+
+	return setxattr(dentry->d_inode, name, value, size, flags);
+}
+
 ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf,
 		       size_t size)
 {
@@ -568,3 +581,84 @@ out_free:
 	kfree(xent);
 	return err;
 }
+
+static size_t security_listxattr(struct dentry *d, char *list, size_t list_size,
+				 const char *name, size_t name_len, int flags)
+{
+	const int prefix_len = XATTR_SECURITY_PREFIX_LEN;
+	const size_t total_len = prefix_len + name_len + 1;
+
+	if (list && total_len <= list_size) {
+		memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
+		memcpy(list + prefix_len, name, name_len);
+		list[prefix_len + name_len] = '\0';
+	}
+
+	return total_len;
+}
+
+static int security_getxattr(struct dentry *d, const char *name, void *buffer,
+			     size_t size, int flags)
+{
+	return ubifs_getxattr(d, name, buffer, size);
+}
+
+static int security_setxattr(struct dentry *d, const char *name,
+			     const void *value, size_t size, int flags,
+			     int handler_flags)
+{
+	return ubifs_setxattr(d, name, value, size, flags);
+}
+
+static const struct xattr_handler ubifs_xattr_security_handler = {
+	.prefix = XATTR_SECURITY_PREFIX,
+	.list   = security_listxattr,
+	.get    = security_getxattr,
+	.set    = security_setxattr,
+};
+
+const struct xattr_handler *ubifs_xattr_handlers[] = {
+	&ubifs_xattr_security_handler,
+	NULL,
+};
+
+static int init_xattrs(struct inode *inode, const struct xattr *xattr_array,
+		       void *fs_info)
+{
+	const struct xattr *xattr;
+	char *name;
+	int err = 0;
+
+	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+		name = kmalloc(XATTR_SECURITY_PREFIX_LEN +
+			       strlen(xattr->name) + 1, GFP_NOFS);
+		if (!name) {
+			err = -ENOMEM;
+			break;
+		}
+		strcpy(name, XATTR_SECURITY_PREFIX);
+		strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name);
+		err = setxattr(inode, name, xattr->value, xattr->value_len, 0);
+		kfree(name);
+		if (err < 0)
+			break;
+	}
+
+	return err;
+}
+
+int ubifs_init_security(struct inode *dentry, struct inode *inode,
+			const struct qstr *qstr)
+{
+	int err;
+
+	mutex_lock(&inode->i_mutex);
+	err = security_inode_init_security(inode, dentry, qstr,
+					   &init_xattrs, 0);
+	mutex_unlock(&inode->i_mutex);
+
+	if (err)
+		ubifs_err("cannot initialize security for inode %lu, error %d",
+			  inode->i_ino, err);
+	return err;
+}
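
Once the security.* handler is registered and ubifs_init_security() runs at create time, LSM labels become visible to userspace through the regular xattr calls. A small sketch; the mount point and attribute name are placeholders:

	#include <stdio.h>
	#include <sys/xattr.h>

	int main(void)
	{
		char label[256];
		ssize_t n = getxattr("/mnt/ubifs/newfile", "security.selinux",
				     label, sizeof(label) - 1);

		if (n > 0) {
			label[n] = '\0';
			printf("label: %s\n", label);
		}
		return 0;
	}
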
diff --git a/fs/udf/Kconfig b/fs/udf/Kconfig
index 0e0e99bd6bce..c6e17a744c3b 100644
--- a/fs/udf/Kconfig
+++ b/fs/udf/Kconfig
@@ -2,10 +2,12 @@ config UDF_FS
 	tristate "UDF file system support"
 	select CRC_ITU_T
 	help
-	  This is the new file system used on some CD-ROMs and DVDs. Say Y if
-	  you intend to mount DVD discs or CDRW's written in packet mode, or
-	  if written to by other UDF utilities, such as DirectCD.
-	  Please read <file:Documentation/filesystems/udf.txt>.
+	  This is a file system used on some CD-ROMs and DVDs. Since the
+	  file system is supported by multiple operating systems and is more
+	  compatible with standard unix file systems, it is also suitable for
+	  removable USB disks. Say Y if you intend to mount DVD discs or CDRW's
+	  written in packet mode, or if you want to use UDF for removable USB
+	  disks. Please read <file:Documentation/filesystems/udf.txt>.
 
 	  To compile this file system support as a module, choose M here: the
 	  module will be called udf.
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 5bc71d9a674a..a445d599098d 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -750,7 +750,7 @@ static sector_t inode_getblk(struct inode *inode, sector_t block,
 	/* Are we beyond EOF? */
 	if (etype == -1) {
 		int ret;
-		isBeyondEOF = 1;
+		isBeyondEOF = true;
 		if (count) {
 			if (c)
 				laarr[0] = laarr[1];
@@ -792,7 +792,7 @@ static sector_t inode_getblk(struct inode *inode, sector_t block,
 		endnum = c + 1;
 		lastblock = 1;
 	} else {
-		isBeyondEOF = 0;
+		isBeyondEOF = false;
 		endnum = startnum = ((count > 2) ? 2 : count);
 
 		/* if the current extent is in position 0,
@@ -1288,6 +1288,7 @@ static int udf_read_inode(struct inode *inode, bool hidden_inode)
 	struct kernel_lb_addr *iloc = &iinfo->i_location;
 	unsigned int link_count;
 	unsigned int indirections = 0;
+	int bs = inode->i_sb->s_blocksize;
 	int ret = -EIO;
 
 reread:
@@ -1374,38 +1375,35 @@ reread:
 	if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_EFE)) {
 		iinfo->i_efe = 1;
 		iinfo->i_use = 0;
-		ret = udf_alloc_i_data(inode, inode->i_sb->s_blocksize -
+		ret = udf_alloc_i_data(inode, bs -
 					sizeof(struct extendedFileEntry));
 		if (ret)
 			goto out;
 		memcpy(iinfo->i_ext.i_data,
 		       bh->b_data + sizeof(struct extendedFileEntry),
-		       inode->i_sb->s_blocksize -
-					sizeof(struct extendedFileEntry));
+		       bs - sizeof(struct extendedFileEntry));
 	} else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_FE)) {
 		iinfo->i_efe = 0;
 		iinfo->i_use = 0;
-		ret = udf_alloc_i_data(inode, inode->i_sb->s_blocksize -
-						sizeof(struct fileEntry));
+		ret = udf_alloc_i_data(inode, bs - sizeof(struct fileEntry));
 		if (ret)
 			goto out;
 		memcpy(iinfo->i_ext.i_data,
 		       bh->b_data + sizeof(struct fileEntry),
-		       inode->i_sb->s_blocksize - sizeof(struct fileEntry));
+		       bs - sizeof(struct fileEntry));
 	} else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_USE)) {
 		iinfo->i_efe = 0;
 		iinfo->i_use = 1;
 		iinfo->i_lenAlloc = le32_to_cpu(
 				((struct unallocSpaceEntry *)bh->b_data)->
 				 lengthAllocDescs);
-		ret = udf_alloc_i_data(inode, inode->i_sb->s_blocksize -
+		ret = udf_alloc_i_data(inode, bs -
 					sizeof(struct unallocSpaceEntry));
 		if (ret)
 			goto out;
 		memcpy(iinfo->i_ext.i_data,
 		       bh->b_data + sizeof(struct unallocSpaceEntry),
-		       inode->i_sb->s_blocksize -
-					sizeof(struct unallocSpaceEntry));
+		       bs - sizeof(struct unallocSpaceEntry));
 		return 0;
 	}
 
@@ -1489,6 +1487,15 @@ reread:
 	}
 	inode->i_generation = iinfo->i_unique;
 
+	/*
+	 * Sanity check length of allocation descriptors and extended attrs to
+	 * avoid integer overflows
+	 */
+	if (iinfo->i_lenEAttr > bs || iinfo->i_lenAlloc > bs)
+		goto out;
+	/* Now do exact checks */
+	if (udf_file_entry_alloc_offset(inode) + iinfo->i_lenAlloc > bs)
+		goto out;
 	/* Sanity checks for files in ICB so that we don't get confused later */
 	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
 		/*
@@ -1498,8 +1505,7 @@ reread:
 		if (iinfo->i_lenAlloc != inode->i_size)
 			goto out;
 		/* File in ICB has to fit in there... */
-		if (inode->i_size > inode->i_sb->s_blocksize -
-					udf_file_entry_alloc_offset(inode))
+		if (inode->i_size > bs - udf_file_entry_alloc_offset(inode))
 			goto out;
 	}
 
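
The two-step check exists because the exact comparison alone can wrap: with 32-bit lengths read from disk, offset + i_lenAlloc may overflow before being compared against the block size. A standalone demo with made-up values:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint32_t bs = 2048, alloc_off = 176;
		uint32_t len_alloc = UINT32_MAX - 100;	/* hostile on-disk value */

		/* exact check alone: the sum wraps to 75 and slips through */
		printf("naive:  %s\n",
		       alloc_off + len_alloc > bs ? "reject" : "ACCEPT (bug)");
		/* staged check from the patch rejects it up front */
		printf("staged: %s\n", len_alloc > bs ? "reject" : "accept");
		return 0;
	}
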
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 3ccb2f11fc76..f169411c4ea0 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -1599,7 +1599,7 @@ static noinline int udf_process_sequence(
 	struct udf_vds_record *curr;
 	struct generic_desc *gd;
 	struct volDescPtr *vdp;
-	int done = 0;
+	bool done = false;
 	uint32_t vdsn;
 	uint16_t ident;
 	long next_s = 0, next_e = 0;
@@ -1680,7 +1680,7 @@ static noinline int udf_process_sequence(
 			lastblock = next_e;
 			next_s = next_e = 0;
 		} else
-			done = 1;
+			done = true;
 		break;
 	}
 	brelse(bh);
@@ -2300,6 +2300,7 @@ static void udf_put_super(struct super_block *sb)
 	udf_close_lvid(sb);
 	brelse(sbi->s_lvid_bh);
 	udf_sb_free_partitions(sb);
+	mutex_destroy(&sbi->s_alloc_mutex);
 	kfree(sb->s_fs_info);
 	sb->s_fs_info = NULL;
 }
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index da73801301d5..8092d3759a5e 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -95,22 +95,18 @@
 
 void lock_ufs(struct super_block *sb)
 {
-#if defined(CONFIG_SMP) || defined (CONFIG_PREEMPT)
 	struct ufs_sb_info *sbi = UFS_SB(sb);
 
 	mutex_lock(&sbi->mutex);
 	sbi->mutex_owner = current;
-#endif
 }
 
 void unlock_ufs(struct super_block *sb)
 {
-#if defined(CONFIG_SMP) || defined (CONFIG_PREEMPT)
 	struct ufs_sb_info *sbi = UFS_SB(sb);
 
 	sbi->mutex_owner = NULL;
 	mutex_unlock(&sbi->mutex);
-#endif
 }
 
 static struct inode *ufs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation)
@@ -1415,9 +1411,11 @@ static struct kmem_cache * ufs_inode_cachep;
 static struct inode *ufs_alloc_inode(struct super_block *sb)
 {
 	struct ufs_inode_info *ei;
-	ei = (struct ufs_inode_info *)kmem_cache_alloc(ufs_inode_cachep, GFP_NOFS);
+
+	ei = kmem_cache_alloc(ufs_inode_cachep, GFP_NOFS);
 	if (!ei)
 		return NULL;
+
 	ei->vfs_inode.i_version = 1;
 	return &ei->vfs_inode;
 }
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index 53e95b2a1369..a7a3a63bb360 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -91,16 +91,6 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
91 return ptr; 91 return ptr;
92} 92}
93 93
94void
95kmem_free(const void *ptr)
96{
97 if (!is_vmalloc_addr(ptr)) {
98 kfree(ptr);
99 } else {
100 vfree(ptr);
101 }
102}
103
104void * 94void *
105kmem_realloc(const void *ptr, size_t newsize, size_t oldsize, 95kmem_realloc(const void *ptr, size_t newsize, size_t oldsize,
106 xfs_km_flags_t flags) 96 xfs_km_flags_t flags)
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index 64db0e53edea..cc6b768fc068 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -63,7 +63,10 @@ kmem_flags_convert(xfs_km_flags_t flags)
63extern void *kmem_alloc(size_t, xfs_km_flags_t); 63extern void *kmem_alloc(size_t, xfs_km_flags_t);
64extern void *kmem_zalloc_large(size_t size, xfs_km_flags_t); 64extern void *kmem_zalloc_large(size_t size, xfs_km_flags_t);
65extern void *kmem_realloc(const void *, size_t, size_t, xfs_km_flags_t); 65extern void *kmem_realloc(const void *, size_t, size_t, xfs_km_flags_t);
66extern void kmem_free(const void *); 66static inline void kmem_free(const void *ptr)
67{
68 kvfree(ptr);
69}
67 70
68 71
69extern void *kmem_zalloc_greedy(size_t *, size_t, size_t); 72extern void *kmem_zalloc_greedy(size_t *, size_t, size_t);
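
In the two kmem hunks, the open-coded vmalloc/kmalloc dispatch is deleted and kmem_free() becomes a static inline around kvfree(), which centralises exactly the same is_vmalloc_addr() test; inlining also saves callers an extra function call. For comparison with the removed body, this is the dispatch kvfree() performs, restated as a sketch:

static inline void kmem_free_equivalent(const void *ptr)
{
	if (is_vmalloc_addr(ptr))	/* came from vmalloc() */
		vfree(ptr);
	else				/* came from kmalloc(), or NULL */
		kfree(ptr);
}
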
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 5d38e8b8a913..15105dbc9e28 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -403,7 +403,7 @@ xfs_sbversion_add_attr2(xfs_mount_t *mp, xfs_trans_t *tp)
403 if (!xfs_sb_version_hasattr2(&mp->m_sb)) { 403 if (!xfs_sb_version_hasattr2(&mp->m_sb)) {
404 xfs_sb_version_addattr2(&mp->m_sb); 404 xfs_sb_version_addattr2(&mp->m_sb);
405 spin_unlock(&mp->m_sb_lock); 405 spin_unlock(&mp->m_sb_lock);
406 xfs_mod_sb(tp, XFS_SB_VERSIONNUM | XFS_SB_FEATURES2); 406 xfs_log_sb(tp);
407 } else 407 } else
408 spin_unlock(&mp->m_sb_lock); 408 spin_unlock(&mp->m_sb_lock);
409 } 409 }
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index b5eb4743f75a..61ec015dca16 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -973,7 +973,11 @@ xfs_bmap_local_to_extents(
973 *firstblock = args.fsbno; 973 *firstblock = args.fsbno;
974 bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0); 974 bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
975 975
976 /* initialise the block and copy the data */ 976 /*
977 * Initialise the block and copy the data
978 *
979 * Note: init_fn must set the buffer log item type correctly!
980 */
977 init_fn(tp, bp, ip, ifp); 981 init_fn(tp, bp, ip, ifp);
978 982
979 /* account for the change in fork size and log everything */ 983 /* account for the change in fork size and log everything */
@@ -1221,22 +1225,20 @@ xfs_bmap_add_attrfork(
1221 goto bmap_cancel; 1225 goto bmap_cancel;
1222 if (!xfs_sb_version_hasattr(&mp->m_sb) || 1226 if (!xfs_sb_version_hasattr(&mp->m_sb) ||
1223 (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2)) { 1227 (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2)) {
1224 __int64_t sbfields = 0; 1228 bool log_sb = false;
1225 1229
1226 spin_lock(&mp->m_sb_lock); 1230 spin_lock(&mp->m_sb_lock);
1227 if (!xfs_sb_version_hasattr(&mp->m_sb)) { 1231 if (!xfs_sb_version_hasattr(&mp->m_sb)) {
1228 xfs_sb_version_addattr(&mp->m_sb); 1232 xfs_sb_version_addattr(&mp->m_sb);
1229 sbfields |= XFS_SB_VERSIONNUM; 1233 log_sb = true;
1230 } 1234 }
1231 if (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2) { 1235 if (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2) {
1232 xfs_sb_version_addattr2(&mp->m_sb); 1236 xfs_sb_version_addattr2(&mp->m_sb);
1233 sbfields |= (XFS_SB_VERSIONNUM | XFS_SB_FEATURES2); 1237 log_sb = true;
1234 } 1238 }
1235 if (sbfields) { 1239 spin_unlock(&mp->m_sb_lock);
1236 spin_unlock(&mp->m_sb_lock); 1240 if (log_sb)
1237 xfs_mod_sb(tp, sbfields); 1241 xfs_log_sb(tp);
1238 } else
1239 spin_unlock(&mp->m_sb_lock);
1240 } 1242 }
1241 1243
1242 error = xfs_bmap_finish(&tp, &flist, &committed); 1244 error = xfs_bmap_finish(&tp, &flist, &committed);
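
Because xfs_log_sb() (introduced later in this diff) always logs the entire superblock, callers no longer accumulate a per-field dirty mask; a single flag recording whether any feature bit changed is enough. The pattern this hunk converges on, condensed:

	bool		log_sb = false;

	spin_lock(&mp->m_sb_lock);
	/* ...update feature bits in mp->m_sb, setting log_sb = true... */
	spin_unlock(&mp->m_sb_lock);
	if (log_sb)
		xfs_log_sb(tp);		/* logs the whole incore superblock */
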
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 44db6db86402..b9d8a499d2c4 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -28,6 +28,37 @@ struct xfs_trans;
28extern kmem_zone_t *xfs_bmap_free_item_zone; 28extern kmem_zone_t *xfs_bmap_free_item_zone;
29 29
30/* 30/*
31 * Argument structure for xfs_bmap_alloc.
32 */
33struct xfs_bmalloca {
34 xfs_fsblock_t *firstblock; /* i/o first block allocated */
35 struct xfs_bmap_free *flist; /* bmap freelist */
36 struct xfs_trans *tp; /* transaction pointer */
37 struct xfs_inode *ip; /* incore inode pointer */
38 struct xfs_bmbt_irec prev; /* extent before the new one */
39 struct xfs_bmbt_irec got; /* extent after, or delayed */
40
41 xfs_fileoff_t offset; /* offset in file filling in */
42 xfs_extlen_t length; /* i/o length asked/allocated */
43 xfs_fsblock_t blkno; /* starting block of new extent */
44
45 struct xfs_btree_cur *cur; /* btree cursor */
46 xfs_extnum_t idx; /* current extent index */
47 int nallocs;/* number of extents alloc'd */
48 int logflags;/* flags for transaction logging */
49
50 xfs_extlen_t total; /* total blocks needed for xaction */
51 xfs_extlen_t minlen; /* minimum allocation size (blocks) */
52 xfs_extlen_t minleft; /* amount must be left after alloc */
53 bool eof; /* set if allocating past last extent */
54 bool wasdel; /* replacing a delayed allocation */
55 bool userdata;/* set if is user data */
56 bool aeof; /* allocated space at eof */
57 bool conv; /* overwriting unwritten extents */
58 int flags;
59};
60
61/*
31 * List of extents to be free "later". 62 * List of extents to be free "later".
32 * The list is kept sorted on xbf_startblock. 63 * The list is kept sorted on xbf_startblock.
33 */ 64 */
@@ -149,6 +180,8 @@ void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
149void xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len, 180void xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len,
150 struct xfs_bmap_free *flist, struct xfs_mount *mp); 181 struct xfs_bmap_free *flist, struct xfs_mount *mp);
151void xfs_bmap_cancel(struct xfs_bmap_free *flist); 182void xfs_bmap_cancel(struct xfs_bmap_free *flist);
183int xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist,
184 int *committed);
152void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork); 185void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork);
153int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip, 186int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip,
154 xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork); 187 xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork);
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index fbd6da263571..8eb718979383 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -151,10 +151,13 @@ typedef struct xfs_sb {
151 __uint32_t sb_features2; /* additional feature bits */ 151 __uint32_t sb_features2; /* additional feature bits */
152 152
153 /* 153 /*
154 * bad features2 field as a result of failing to pad the sb 154 * bad features2 field as a result of failing to pad the sb structure to
155 * structure to 64 bits. Some machines will be using this field 155 * 64 bits. Some machines will be using this field for features2 bits.
156 * for features2 bits. Easiest just to mark it bad and not use 156 * Easiest just to mark it bad and not use it for anything else.
157 * it for anything else. 157 *
158 * This is not kept up to date in memory; it is always overwritten by
159 * the value in sb_features2 when formatting the incore superblock to
160 * the disk buffer.
158 */ 161 */
159 __uint32_t sb_bad_features2; 162 __uint32_t sb_bad_features2;
160 163
@@ -304,8 +307,8 @@ typedef enum {
304#define XFS_SB_ICOUNT XFS_SB_MVAL(ICOUNT) 307#define XFS_SB_ICOUNT XFS_SB_MVAL(ICOUNT)
305#define XFS_SB_IFREE XFS_SB_MVAL(IFREE) 308#define XFS_SB_IFREE XFS_SB_MVAL(IFREE)
306#define XFS_SB_FDBLOCKS XFS_SB_MVAL(FDBLOCKS) 309#define XFS_SB_FDBLOCKS XFS_SB_MVAL(FDBLOCKS)
307#define XFS_SB_FEATURES2 XFS_SB_MVAL(FEATURES2) 310#define XFS_SB_FEATURES2 (XFS_SB_MVAL(FEATURES2) | \
308#define XFS_SB_BAD_FEATURES2 XFS_SB_MVAL(BAD_FEATURES2) 311 XFS_SB_MVAL(BAD_FEATURES2))
309#define XFS_SB_FEATURES_COMPAT XFS_SB_MVAL(FEATURES_COMPAT) 312#define XFS_SB_FEATURES_COMPAT XFS_SB_MVAL(FEATURES_COMPAT)
310#define XFS_SB_FEATURES_RO_COMPAT XFS_SB_MVAL(FEATURES_RO_COMPAT) 313#define XFS_SB_FEATURES_RO_COMPAT XFS_SB_MVAL(FEATURES_RO_COMPAT)
311#define XFS_SB_FEATURES_INCOMPAT XFS_SB_MVAL(FEATURES_INCOMPAT) 314#define XFS_SB_FEATURES_INCOMPAT XFS_SB_MVAL(FEATURES_INCOMPAT)
@@ -319,9 +322,9 @@ typedef enum {
319 XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | \ 322 XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | \
320 XFS_SB_QFLAGS | XFS_SB_SHARED_VN | XFS_SB_UNIT | XFS_SB_WIDTH | \ 323 XFS_SB_QFLAGS | XFS_SB_SHARED_VN | XFS_SB_UNIT | XFS_SB_WIDTH | \
321 XFS_SB_ICOUNT | XFS_SB_IFREE | XFS_SB_FDBLOCKS | XFS_SB_FEATURES2 | \ 324 XFS_SB_ICOUNT | XFS_SB_IFREE | XFS_SB_FDBLOCKS | XFS_SB_FEATURES2 | \
322 XFS_SB_BAD_FEATURES2 | XFS_SB_FEATURES_COMPAT | \ 325 XFS_SB_FEATURES_COMPAT | XFS_SB_FEATURES_RO_COMPAT | \
323 XFS_SB_FEATURES_RO_COMPAT | XFS_SB_FEATURES_INCOMPAT | \ 326 XFS_SB_FEATURES_INCOMPAT | XFS_SB_FEATURES_LOG_INCOMPAT | \
324 XFS_SB_FEATURES_LOG_INCOMPAT | XFS_SB_PQUOTINO) 327 XFS_SB_PQUOTINO)
325 328
326 329
327/* 330/*
@@ -453,13 +456,11 @@ static inline void xfs_sb_version_addattr2(struct xfs_sb *sbp)
453{ 456{
454 sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT; 457 sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
455 sbp->sb_features2 |= XFS_SB_VERSION2_ATTR2BIT; 458 sbp->sb_features2 |= XFS_SB_VERSION2_ATTR2BIT;
456 sbp->sb_bad_features2 |= XFS_SB_VERSION2_ATTR2BIT;
457} 459}
458 460
459static inline void xfs_sb_version_removeattr2(struct xfs_sb *sbp) 461static inline void xfs_sb_version_removeattr2(struct xfs_sb *sbp)
460{ 462{
461 sbp->sb_features2 &= ~XFS_SB_VERSION2_ATTR2BIT; 463 sbp->sb_features2 &= ~XFS_SB_VERSION2_ATTR2BIT;
462 sbp->sb_bad_features2 &= ~XFS_SB_VERSION2_ATTR2BIT;
463 if (!sbp->sb_features2) 464 if (!sbp->sb_features2)
464 sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT; 465 sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT;
465} 466}
@@ -475,7 +476,6 @@ static inline void xfs_sb_version_addprojid32bit(struct xfs_sb *sbp)
475{ 476{
476 sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT; 477 sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
477 sbp->sb_features2 |= XFS_SB_VERSION2_PROJID32BIT; 478 sbp->sb_features2 |= XFS_SB_VERSION2_PROJID32BIT;
478 sbp->sb_bad_features2 |= XFS_SB_VERSION2_PROJID32BIT;
479} 479}
480 480
481/* 481/*
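
With xfs_sb_to_disk() now forcing sb_bad_features2 to mirror sb_features2 at write-out time (see the xfs_sb.c hunk later in this diff), the per-call-site "|=" updates of sb_bad_features2 become redundant, and folding XFS_SB_MVAL(BAD_FEATURES2) into XFS_SB_FEATURES2 makes the two fields log as a unit. The invariant this establishes for every superblock that reaches disk, written as a hypothetical check (not kernel code):

	/* After xfs_sb_to_disk(): the padded and unpadded copies agree. */
	ASSERT(dsb->sb_features2 == dsb->sb_bad_features2);
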
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 18dc721ca19f..18dc721ca19f 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 752915fa775a..b0a5fe95a3e2 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -40,69 +40,6 @@
40 * Physical superblock buffer manipulations. Shared with libxfs in userspace. 40 * Physical superblock buffer manipulations. Shared with libxfs in userspace.
41 */ 41 */
42 42
43static const struct {
44 short offset;
45 short type; /* 0 = integer
46 * 1 = binary / string (no translation)
47 */
48} xfs_sb_info[] = {
49 { offsetof(xfs_sb_t, sb_magicnum), 0 },
50 { offsetof(xfs_sb_t, sb_blocksize), 0 },
51 { offsetof(xfs_sb_t, sb_dblocks), 0 },
52 { offsetof(xfs_sb_t, sb_rblocks), 0 },
53 { offsetof(xfs_sb_t, sb_rextents), 0 },
54 { offsetof(xfs_sb_t, sb_uuid), 1 },
55 { offsetof(xfs_sb_t, sb_logstart), 0 },
56 { offsetof(xfs_sb_t, sb_rootino), 0 },
57 { offsetof(xfs_sb_t, sb_rbmino), 0 },
58 { offsetof(xfs_sb_t, sb_rsumino), 0 },
59 { offsetof(xfs_sb_t, sb_rextsize), 0 },
60 { offsetof(xfs_sb_t, sb_agblocks), 0 },
61 { offsetof(xfs_sb_t, sb_agcount), 0 },
62 { offsetof(xfs_sb_t, sb_rbmblocks), 0 },
63 { offsetof(xfs_sb_t, sb_logblocks), 0 },
64 { offsetof(xfs_sb_t, sb_versionnum), 0 },
65 { offsetof(xfs_sb_t, sb_sectsize), 0 },
66 { offsetof(xfs_sb_t, sb_inodesize), 0 },
67 { offsetof(xfs_sb_t, sb_inopblock), 0 },
68 { offsetof(xfs_sb_t, sb_fname[0]), 1 },
69 { offsetof(xfs_sb_t, sb_blocklog), 0 },
70 { offsetof(xfs_sb_t, sb_sectlog), 0 },
71 { offsetof(xfs_sb_t, sb_inodelog), 0 },
72 { offsetof(xfs_sb_t, sb_inopblog), 0 },
73 { offsetof(xfs_sb_t, sb_agblklog), 0 },
74 { offsetof(xfs_sb_t, sb_rextslog), 0 },
75 { offsetof(xfs_sb_t, sb_inprogress), 0 },
76 { offsetof(xfs_sb_t, sb_imax_pct), 0 },
77 { offsetof(xfs_sb_t, sb_icount), 0 },
78 { offsetof(xfs_sb_t, sb_ifree), 0 },
79 { offsetof(xfs_sb_t, sb_fdblocks), 0 },
80 { offsetof(xfs_sb_t, sb_frextents), 0 },
81 { offsetof(xfs_sb_t, sb_uquotino), 0 },
82 { offsetof(xfs_sb_t, sb_gquotino), 0 },
83 { offsetof(xfs_sb_t, sb_qflags), 0 },
84 { offsetof(xfs_sb_t, sb_flags), 0 },
85 { offsetof(xfs_sb_t, sb_shared_vn), 0 },
86 { offsetof(xfs_sb_t, sb_inoalignmt), 0 },
87 { offsetof(xfs_sb_t, sb_unit), 0 },
88 { offsetof(xfs_sb_t, sb_width), 0 },
89 { offsetof(xfs_sb_t, sb_dirblklog), 0 },
90 { offsetof(xfs_sb_t, sb_logsectlog), 0 },
91 { offsetof(xfs_sb_t, sb_logsectsize), 0 },
92 { offsetof(xfs_sb_t, sb_logsunit), 0 },
93 { offsetof(xfs_sb_t, sb_features2), 0 },
94 { offsetof(xfs_sb_t, sb_bad_features2), 0 },
95 { offsetof(xfs_sb_t, sb_features_compat), 0 },
96 { offsetof(xfs_sb_t, sb_features_ro_compat), 0 },
97 { offsetof(xfs_sb_t, sb_features_incompat), 0 },
98 { offsetof(xfs_sb_t, sb_features_log_incompat), 0 },
99 { offsetof(xfs_sb_t, sb_crc), 0 },
100 { offsetof(xfs_sb_t, sb_pad), 0 },
101 { offsetof(xfs_sb_t, sb_pquotino), 0 },
102 { offsetof(xfs_sb_t, sb_lsn), 0 },
103 { sizeof(xfs_sb_t), 0 }
104};
105
106/* 43/*
107 * Reference counting access wrappers to the perag structures. 44 * Reference counting access wrappers to the perag structures.
108 * Because we never free per-ag structures, the only thing we 45 * Because we never free per-ag structures, the only thing we
@@ -461,58 +398,49 @@ xfs_sb_from_disk(
461 __xfs_sb_from_disk(to, from, true); 398 __xfs_sb_from_disk(to, from, true);
462} 399}
463 400
464static inline void 401static void
465xfs_sb_quota_to_disk( 402xfs_sb_quota_to_disk(
466 xfs_dsb_t *to, 403 struct xfs_dsb *to,
467 xfs_sb_t *from, 404 struct xfs_sb *from)
468 __int64_t *fields)
469{ 405{
470 __uint16_t qflags = from->sb_qflags; 406 __uint16_t qflags = from->sb_qflags;
471 407
408 to->sb_uquotino = cpu_to_be64(from->sb_uquotino);
409 if (xfs_sb_version_has_pquotino(from)) {
410 to->sb_qflags = cpu_to_be16(from->sb_qflags);
411 to->sb_gquotino = cpu_to_be64(from->sb_gquotino);
412 to->sb_pquotino = cpu_to_be64(from->sb_pquotino);
413 return;
414 }
415
472 /* 416 /*
473 * We need to do these manipulations only if we are working 417 * The in-core version of sb_qflags does not have XFS_OQUOTA_*
474 * with an older version of on-disk superblock. 418 * flags, whereas the on-disk version does. So, convert incore
419 * XFS_{PG}QUOTA_* flags to on-disk XFS_OQUOTA_* flags.
475 */ 420 */
476 if (xfs_sb_version_has_pquotino(from)) 421 qflags &= ~(XFS_PQUOTA_ENFD | XFS_PQUOTA_CHKD |
477 return; 422 XFS_GQUOTA_ENFD | XFS_GQUOTA_CHKD);
478 423
479 if (*fields & XFS_SB_QFLAGS) { 424 if (from->sb_qflags &
480 /* 425 (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD))
481 * The in-core version of sb_qflags do not have 426 qflags |= XFS_OQUOTA_ENFD;
482 * XFS_OQUOTA_* flags, whereas the on-disk version 427 if (from->sb_qflags &
483 * does. So, convert incore XFS_{PG}QUOTA_* flags 428 (XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD))
484 * to on-disk XFS_OQUOTA_* flags. 429 qflags |= XFS_OQUOTA_CHKD;
485 */ 430 to->sb_qflags = cpu_to_be16(qflags);
486 qflags &= ~(XFS_PQUOTA_ENFD | XFS_PQUOTA_CHKD |
487 XFS_GQUOTA_ENFD | XFS_GQUOTA_CHKD);
488
489 if (from->sb_qflags &
490 (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD))
491 qflags |= XFS_OQUOTA_ENFD;
492 if (from->sb_qflags &
493 (XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD))
494 qflags |= XFS_OQUOTA_CHKD;
495 to->sb_qflags = cpu_to_be16(qflags);
496 *fields &= ~XFS_SB_QFLAGS;
497 }
498 431
499 /* 432 /*
500 * GQUOTINO and PQUOTINO cannot be used together in versions of 433 * GQUOTINO and PQUOTINO cannot be used together in versions
501 * superblock that do not have pquotino. from->sb_flags tells us which 434 * of superblock that do not have pquotino. from->sb_flags
502 * quota is active and should be copied to disk. If neither are active, 435 * tells us which quota is active and should be copied to
503 * make sure we write NULLFSINO to the sb_gquotino field as a quota 436 * disk. If neither are active, we should NULL the inode.
504 * inode value of "0" is invalid when the XFS_SB_VERSION_QUOTA feature
505 * bit is set.
506 * 437 *
507 * Note that we don't need to handle the sb_uquotino or sb_pquotino here 438 * In all cases, the separate pquotino must remain 0 because it
508 * as they do not require any translation. Hence the main sb field loop 439 * is beyond the "end" of the valid non-pquotino superblock.
509 * will write them appropriately from the in-core superblock.
510 */ 440 */
511 if ((*fields & XFS_SB_GQUOTINO) && 441 if (from->sb_qflags & XFS_GQUOTA_ACCT)
512 (from->sb_qflags & XFS_GQUOTA_ACCT))
513 to->sb_gquotino = cpu_to_be64(from->sb_gquotino); 442 to->sb_gquotino = cpu_to_be64(from->sb_gquotino);
514 else if ((*fields & XFS_SB_PQUOTINO) && 443 else if (from->sb_qflags & XFS_PQUOTA_ACCT)
515 (from->sb_qflags & XFS_PQUOTA_ACCT))
516 to->sb_gquotino = cpu_to_be64(from->sb_pquotino); 444 to->sb_gquotino = cpu_to_be64(from->sb_pquotino);
517 else { 445 else {
518 /* 446 /*
@@ -526,63 +454,78 @@ xfs_sb_quota_to_disk(
526 to->sb_gquotino = cpu_to_be64(NULLFSINO); 454 to->sb_gquotino = cpu_to_be64(NULLFSINO);
527 } 455 }
528 456
529 *fields &= ~(XFS_SB_PQUOTINO | XFS_SB_GQUOTINO); 457 to->sb_pquotino = 0;
530} 458}
531 459
532/*
533 * Copy in core superblock to ondisk one.
534 *
535 * The fields argument is mask of superblock fields to copy.
536 */
537void 460void
538xfs_sb_to_disk( 461xfs_sb_to_disk(
539 xfs_dsb_t *to, 462 struct xfs_dsb *to,
540 xfs_sb_t *from, 463 struct xfs_sb *from)
541 __int64_t fields)
542{ 464{
543 xfs_caddr_t to_ptr = (xfs_caddr_t)to; 465 xfs_sb_quota_to_disk(to, from);
544 xfs_caddr_t from_ptr = (xfs_caddr_t)from;
545 xfs_sb_field_t f;
546 int first;
547 int size;
548
549 ASSERT(fields);
550 if (!fields)
551 return;
552 466
553 /* We should never write the crc here, it's updated in the IO path */ 467 to->sb_magicnum = cpu_to_be32(from->sb_magicnum);
554 fields &= ~XFS_SB_CRC; 468 to->sb_blocksize = cpu_to_be32(from->sb_blocksize);
555 469 to->sb_dblocks = cpu_to_be64(from->sb_dblocks);
556 xfs_sb_quota_to_disk(to, from, &fields); 470 to->sb_rblocks = cpu_to_be64(from->sb_rblocks);
557 while (fields) { 471 to->sb_rextents = cpu_to_be64(from->sb_rextents);
558 f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields); 472 memcpy(&to->sb_uuid, &from->sb_uuid, sizeof(to->sb_uuid));
559 first = xfs_sb_info[f].offset; 473 to->sb_logstart = cpu_to_be64(from->sb_logstart);
560 size = xfs_sb_info[f + 1].offset - first; 474 to->sb_rootino = cpu_to_be64(from->sb_rootino);
561 475 to->sb_rbmino = cpu_to_be64(from->sb_rbmino);
562 ASSERT(xfs_sb_info[f].type == 0 || xfs_sb_info[f].type == 1); 476 to->sb_rsumino = cpu_to_be64(from->sb_rsumino);
563 477 to->sb_rextsize = cpu_to_be32(from->sb_rextsize);
564 if (size == 1 || xfs_sb_info[f].type == 1) { 478 to->sb_agblocks = cpu_to_be32(from->sb_agblocks);
565 memcpy(to_ptr + first, from_ptr + first, size); 479 to->sb_agcount = cpu_to_be32(from->sb_agcount);
566 } else { 480 to->sb_rbmblocks = cpu_to_be32(from->sb_rbmblocks);
567 switch (size) { 481 to->sb_logblocks = cpu_to_be32(from->sb_logblocks);
568 case 2: 482 to->sb_versionnum = cpu_to_be16(from->sb_versionnum);
569 *(__be16 *)(to_ptr + first) = 483 to->sb_sectsize = cpu_to_be16(from->sb_sectsize);
570 cpu_to_be16(*(__u16 *)(from_ptr + first)); 484 to->sb_inodesize = cpu_to_be16(from->sb_inodesize);
571 break; 485 to->sb_inopblock = cpu_to_be16(from->sb_inopblock);
572 case 4: 486 memcpy(&to->sb_fname, &from->sb_fname, sizeof(to->sb_fname));
573 *(__be32 *)(to_ptr + first) = 487 to->sb_blocklog = from->sb_blocklog;
574 cpu_to_be32(*(__u32 *)(from_ptr + first)); 488 to->sb_sectlog = from->sb_sectlog;
575 break; 489 to->sb_inodelog = from->sb_inodelog;
576 case 8: 490 to->sb_inopblog = from->sb_inopblog;
577 *(__be64 *)(to_ptr + first) = 491 to->sb_agblklog = from->sb_agblklog;
578 cpu_to_be64(*(__u64 *)(from_ptr + first)); 492 to->sb_rextslog = from->sb_rextslog;
579 break; 493 to->sb_inprogress = from->sb_inprogress;
580 default: 494 to->sb_imax_pct = from->sb_imax_pct;
581 ASSERT(0); 495 to->sb_icount = cpu_to_be64(from->sb_icount);
582 } 496 to->sb_ifree = cpu_to_be64(from->sb_ifree);
583 } 497 to->sb_fdblocks = cpu_to_be64(from->sb_fdblocks);
498 to->sb_frextents = cpu_to_be64(from->sb_frextents);
584 499
585 fields &= ~(1LL << f); 500 to->sb_flags = from->sb_flags;
501 to->sb_shared_vn = from->sb_shared_vn;
502 to->sb_inoalignmt = cpu_to_be32(from->sb_inoalignmt);
503 to->sb_unit = cpu_to_be32(from->sb_unit);
504 to->sb_width = cpu_to_be32(from->sb_width);
505 to->sb_dirblklog = from->sb_dirblklog;
506 to->sb_logsectlog = from->sb_logsectlog;
507 to->sb_logsectsize = cpu_to_be16(from->sb_logsectsize);
508 to->sb_logsunit = cpu_to_be32(from->sb_logsunit);
509
510 /*
511 * We need to ensure that bad_features2 always matches features2.
512 * Hence we enforce that here rather than having to remember to do it
513 * everywhere else that updates features2.
514 */
515 from->sb_bad_features2 = from->sb_features2;
516 to->sb_features2 = cpu_to_be32(from->sb_features2);
517 to->sb_bad_features2 = cpu_to_be32(from->sb_bad_features2);
518
519 if (xfs_sb_version_hascrc(from)) {
520 to->sb_features_compat = cpu_to_be32(from->sb_features_compat);
521 to->sb_features_ro_compat =
522 cpu_to_be32(from->sb_features_ro_compat);
523 to->sb_features_incompat =
524 cpu_to_be32(from->sb_features_incompat);
525 to->sb_features_log_incompat =
526 cpu_to_be32(from->sb_features_log_incompat);
527 to->sb_pad = 0;
528 to->sb_lsn = cpu_to_be64(from->sb_lsn);
586 } 529 }
587} 530}
588 531
@@ -816,42 +759,51 @@ xfs_initialize_perag_data(
816} 759}
817 760
818/* 761/*
819 * xfs_mod_sb() can be used to copy arbitrary changes to the 762 * xfs_log_sb() can be used to copy arbitrary changes to the in-core superblock
820 * in-core superblock into the superblock buffer to be logged. 763 * into the superblock buffer to be logged. It does not provide the higher
821 * It does not provide the higher level of locking that is 764 * level of locking that is needed to protect the in-core superblock from
822 * needed to protect the in-core superblock from concurrent 765 * concurrent access.
823 * access.
824 */ 766 */
825void 767void
826xfs_mod_sb(xfs_trans_t *tp, __int64_t fields) 768xfs_log_sb(
769 struct xfs_trans *tp)
827{ 770{
828 xfs_buf_t *bp; 771 struct xfs_mount *mp = tp->t_mountp;
829 int first; 772 struct xfs_buf *bp = xfs_trans_getsb(tp, mp, 0);
830 int last;
831 xfs_mount_t *mp;
832 xfs_sb_field_t f;
833
834 ASSERT(fields);
835 if (!fields)
836 return;
837 mp = tp->t_mountp;
838 bp = xfs_trans_getsb(tp, mp, 0);
839 first = sizeof(xfs_sb_t);
840 last = 0;
841
842 /* translate/copy */
843 773
844 xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, fields); 774 xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb);
775 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
776 xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb));
777}
845 778
846 /* find modified range */ 779/*
847 f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields); 780 * xfs_sync_sb
848 ASSERT((1LL << f) & XFS_SB_MOD_BITS); 781 *
849 last = xfs_sb_info[f + 1].offset - 1; 782 * Sync the superblock to disk.
783 *
784 * Note that the caller is responsible for checking the frozen state of the
785 * filesystem. This procedure uses the non-blocking transaction allocator and
786 * thus will allow modifications to a frozen fs. This is required because this
787 * code can be called during the process of freezing where use of the high-level
788 * allocator would deadlock.
789 */
790int
791xfs_sync_sb(
792 struct xfs_mount *mp,
793 bool wait)
794{
795 struct xfs_trans *tp;
796 int error;
850 797
851 f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields); 798 tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_CHANGE, KM_SLEEP);
852 ASSERT((1LL << f) & XFS_SB_MOD_BITS); 799 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0);
853 first = xfs_sb_info[f].offset; 800 if (error) {
801 xfs_trans_cancel(tp, 0);
802 return error;
803 }
854 804
855 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF); 805 xfs_log_sb(tp);
856 xfs_trans_log_buf(tp, bp, first, last); 806 if (wait)
807 xfs_trans_set_sync(tp);
808 return xfs_trans_commit(tp, 0);
857} 809}
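
The xfs_sb.c rewrite replaces xfs_mod_sb()'s field-mask machinery (and the xfs_sb_info[] offset table it depended on) with two simpler entry points: xfs_log_sb() formats the whole incore superblock into the superblock buffer of an existing transaction and logs it, while xfs_sync_sb() wraps that in its own XFS_TRANS_SB_CHANGE transaction for callers that have none. A hedged usage sketch of the new pair, assuming kernel context:

	/* Inside an existing transaction: log all superblock changes. */
	xfs_log_sb(tp);

	/* Standalone: write the superblock out, optionally synchronously. */
	error = xfs_sync_sb(mp, true);	/* true => xfs_trans_set_sync() */
	if (error)
		return error;
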
diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h
index 8eb1c54bafbf..b25bb9a343f3 100644
--- a/fs/xfs/libxfs/xfs_sb.h
+++ b/fs/xfs/libxfs/xfs_sb.h
@@ -27,11 +27,12 @@ extern struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *, xfs_agnumber_t,
27extern void xfs_perag_put(struct xfs_perag *pag); 27extern void xfs_perag_put(struct xfs_perag *pag);
28extern int xfs_initialize_perag_data(struct xfs_mount *, xfs_agnumber_t); 28extern int xfs_initialize_perag_data(struct xfs_mount *, xfs_agnumber_t);
29 29
30extern void xfs_sb_calc_crc(struct xfs_buf *); 30extern void xfs_sb_calc_crc(struct xfs_buf *bp);
31extern void xfs_mod_sb(struct xfs_trans *, __int64_t); 31extern void xfs_log_sb(struct xfs_trans *tp);
32extern void xfs_sb_mount_common(struct xfs_mount *, struct xfs_sb *); 32extern int xfs_sync_sb(struct xfs_mount *mp, bool wait);
33extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *); 33extern void xfs_sb_mount_common(struct xfs_mount *mp, struct xfs_sb *sbp);
34extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t); 34extern void xfs_sb_from_disk(struct xfs_sb *to, struct xfs_dsb *from);
35extern void xfs_sb_to_disk(struct xfs_dsb *to, struct xfs_sb *from);
35extern void xfs_sb_quota_from_disk(struct xfs_sb *sbp); 36extern void xfs_sb_quota_from_disk(struct xfs_sb *sbp);
36 37
37#endif /* __XFS_SB_H__ */ 38#endif /* __XFS_SB_H__ */
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 82404da2ca67..8dda4b321343 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -82,7 +82,7 @@ extern const struct xfs_buf_ops xfs_symlink_buf_ops;
82#define XFS_TRANS_ATTR_RM 23 82#define XFS_TRANS_ATTR_RM 23
83#define XFS_TRANS_ATTR_FLAG 24 83#define XFS_TRANS_ATTR_FLAG 24
84#define XFS_TRANS_CLEAR_AGI_BUCKET 25 84#define XFS_TRANS_CLEAR_AGI_BUCKET 25
85#define XFS_TRANS_QM_SBCHANGE 26 85#define XFS_TRANS_SB_CHANGE 26
86/* 86/*
87 * Dummy entries since we use the transaction type to index into the 87 * Dummy entries since we use the transaction type to index into the
88 * trans_type[] in xlog_recover_print_trans_head() 88 * trans_type[] in xlog_recover_print_trans_head()
@@ -95,17 +95,15 @@ extern const struct xfs_buf_ops xfs_symlink_buf_ops;
95#define XFS_TRANS_QM_DQCLUSTER 32 95#define XFS_TRANS_QM_DQCLUSTER 32
96#define XFS_TRANS_QM_QINOCREATE 33 96#define XFS_TRANS_QM_QINOCREATE 33
97#define XFS_TRANS_QM_QUOTAOFF_END 34 97#define XFS_TRANS_QM_QUOTAOFF_END 34
98#define XFS_TRANS_SB_UNIT 35 98#define XFS_TRANS_FSYNC_TS 35
99#define XFS_TRANS_FSYNC_TS 36 99#define XFS_TRANS_GROWFSRT_ALLOC 36
100#define XFS_TRANS_GROWFSRT_ALLOC 37 100#define XFS_TRANS_GROWFSRT_ZERO 37
101#define XFS_TRANS_GROWFSRT_ZERO 38 101#define XFS_TRANS_GROWFSRT_FREE 38
102#define XFS_TRANS_GROWFSRT_FREE 39 102#define XFS_TRANS_SWAPEXT 39
103#define XFS_TRANS_SWAPEXT 40 103#define XFS_TRANS_CHECKPOINT 40
104#define XFS_TRANS_SB_COUNT 41 104#define XFS_TRANS_ICREATE 41
105#define XFS_TRANS_CHECKPOINT 42 105#define XFS_TRANS_CREATE_TMPFILE 42
106#define XFS_TRANS_ICREATE 43 106#define XFS_TRANS_TYPE_MAX 43
107#define XFS_TRANS_CREATE_TMPFILE 44
108#define XFS_TRANS_TYPE_MAX 44
109/* new transaction types need to be reflected in xfs_logprint(8) */ 107/* new transaction types need to be reflected in xfs_logprint(8) */
110 108
111#define XFS_TRANS_TYPES \ 109#define XFS_TRANS_TYPES \
@@ -113,7 +111,6 @@ extern const struct xfs_buf_ops xfs_symlink_buf_ops;
113 { XFS_TRANS_SETATTR_SIZE, "SETATTR_SIZE" }, \ 111 { XFS_TRANS_SETATTR_SIZE, "SETATTR_SIZE" }, \
114 { XFS_TRANS_INACTIVE, "INACTIVE" }, \ 112 { XFS_TRANS_INACTIVE, "INACTIVE" }, \
115 { XFS_TRANS_CREATE, "CREATE" }, \ 113 { XFS_TRANS_CREATE, "CREATE" }, \
116 { XFS_TRANS_CREATE_TMPFILE, "CREATE_TMPFILE" }, \
117 { XFS_TRANS_CREATE_TRUNC, "CREATE_TRUNC" }, \ 114 { XFS_TRANS_CREATE_TRUNC, "CREATE_TRUNC" }, \
118 { XFS_TRANS_TRUNCATE_FILE, "TRUNCATE_FILE" }, \ 115 { XFS_TRANS_TRUNCATE_FILE, "TRUNCATE_FILE" }, \
119 { XFS_TRANS_REMOVE, "REMOVE" }, \ 116 { XFS_TRANS_REMOVE, "REMOVE" }, \
@@ -134,23 +131,23 @@ extern const struct xfs_buf_ops xfs_symlink_buf_ops;
134 { XFS_TRANS_ATTR_RM, "ATTR_RM" }, \ 131 { XFS_TRANS_ATTR_RM, "ATTR_RM" }, \
135 { XFS_TRANS_ATTR_FLAG, "ATTR_FLAG" }, \ 132 { XFS_TRANS_ATTR_FLAG, "ATTR_FLAG" }, \
136 { XFS_TRANS_CLEAR_AGI_BUCKET, "CLEAR_AGI_BUCKET" }, \ 133 { XFS_TRANS_CLEAR_AGI_BUCKET, "CLEAR_AGI_BUCKET" }, \
137 { XFS_TRANS_QM_SBCHANGE, "QM_SBCHANGE" }, \ 134 { XFS_TRANS_SB_CHANGE, "SBCHANGE" }, \
135 { XFS_TRANS_DUMMY1, "DUMMY1" }, \
136 { XFS_TRANS_DUMMY2, "DUMMY2" }, \
138 { XFS_TRANS_QM_QUOTAOFF, "QM_QUOTAOFF" }, \ 137 { XFS_TRANS_QM_QUOTAOFF, "QM_QUOTAOFF" }, \
139 { XFS_TRANS_QM_DQALLOC, "QM_DQALLOC" }, \ 138 { XFS_TRANS_QM_DQALLOC, "QM_DQALLOC" }, \
140 { XFS_TRANS_QM_SETQLIM, "QM_SETQLIM" }, \ 139 { XFS_TRANS_QM_SETQLIM, "QM_SETQLIM" }, \
141 { XFS_TRANS_QM_DQCLUSTER, "QM_DQCLUSTER" }, \ 140 { XFS_TRANS_QM_DQCLUSTER, "QM_DQCLUSTER" }, \
142 { XFS_TRANS_QM_QINOCREATE, "QM_QINOCREATE" }, \ 141 { XFS_TRANS_QM_QINOCREATE, "QM_QINOCREATE" }, \
143 { XFS_TRANS_QM_QUOTAOFF_END, "QM_QOFF_END" }, \ 142 { XFS_TRANS_QM_QUOTAOFF_END, "QM_QOFF_END" }, \
144 { XFS_TRANS_SB_UNIT, "SB_UNIT" }, \
145 { XFS_TRANS_FSYNC_TS, "FSYNC_TS" }, \ 143 { XFS_TRANS_FSYNC_TS, "FSYNC_TS" }, \
146 { XFS_TRANS_GROWFSRT_ALLOC, "GROWFSRT_ALLOC" }, \ 144 { XFS_TRANS_GROWFSRT_ALLOC, "GROWFSRT_ALLOC" }, \
147 { XFS_TRANS_GROWFSRT_ZERO, "GROWFSRT_ZERO" }, \ 145 { XFS_TRANS_GROWFSRT_ZERO, "GROWFSRT_ZERO" }, \
148 { XFS_TRANS_GROWFSRT_FREE, "GROWFSRT_FREE" }, \ 146 { XFS_TRANS_GROWFSRT_FREE, "GROWFSRT_FREE" }, \
149 { XFS_TRANS_SWAPEXT, "SWAPEXT" }, \ 147 { XFS_TRANS_SWAPEXT, "SWAPEXT" }, \
150 { XFS_TRANS_SB_COUNT, "SB_COUNT" }, \
151 { XFS_TRANS_CHECKPOINT, "CHECKPOINT" }, \ 148 { XFS_TRANS_CHECKPOINT, "CHECKPOINT" }, \
152 { XFS_TRANS_DUMMY1, "DUMMY1" }, \ 149 { XFS_TRANS_ICREATE, "ICREATE" }, \
153 { XFS_TRANS_DUMMY2, "DUMMY2" }, \ 150 { XFS_TRANS_CREATE_TMPFILE, "CREATE_TMPFILE" }, \
154 { XLOG_UNMOUNT_REC_TYPE, "UNMOUNT" } 151 { XLOG_UNMOUNT_REC_TYPE, "UNMOUNT" }
155 152
156/* 153/*
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c
index c80c5236c3da..e7e26bd6468f 100644
--- a/fs/xfs/libxfs/xfs_symlink_remote.c
+++ b/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -178,6 +178,8 @@ xfs_symlink_local_to_remote(
178 struct xfs_mount *mp = ip->i_mount; 178 struct xfs_mount *mp = ip->i_mount;
179 char *buf; 179 char *buf;
180 180
181 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SYMLINK_BUF);
182
181 if (!xfs_sb_version_hascrc(&mp->m_sb)) { 183 if (!xfs_sb_version_hascrc(&mp->m_sb)) {
182 bp->b_ops = NULL; 184 bp->b_ops = NULL;
183 memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); 185 memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
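
This symlink hunk is what the earlier xfs_bmap_local_to_extents() note ("init_fn must set the buffer log item type correctly!") refers to: each local-to-remote init callback now stamps the buffer log format type before touching the data, which is what lets the new assert in xfs_buf_item_format() (later in this diff) insist that every logged buffer carries a valid type. A skeleton of such a callback, modeled on the symlink case above:

	static void example_local_to_remote(
		struct xfs_trans	*tp,
		struct xfs_buf		*bp,
		struct xfs_inode	*ip,
		struct xfs_ifork	*ifp)
	{
		/* First: tell the log what kind of buffer this is. */
		xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SYMLINK_BUF);
		/* Then populate it from the inode fork. */
		memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
	}
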
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index 6c1330f29050..68cb1e7bf2bb 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -716,17 +716,6 @@ xfs_calc_clear_agi_bucket_reservation(
716} 716}
717 717
718/* 718/*
719 * Clearing the quotaflags in the superblock.
720 * the super block for changing quota flags: sector size
721 */
722STATIC uint
723xfs_calc_qm_sbchange_reservation(
724 struct xfs_mount *mp)
725{
726 return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
727}
728
729/*
730 * Adjusting quota limits. 719 * Adjusting quota limits.
731 * the xfs_disk_dquot_t: sizeof(struct xfs_disk_dquot) 720 * the xfs_disk_dquot_t: sizeof(struct xfs_disk_dquot)
732 */ 721 */
@@ -864,9 +853,6 @@ xfs_trans_resv_calc(
864 * The following transactions are logged in logical format with 853 * The following transactions are logged in logical format with
865 * a default log count. 854 * a default log count.
866 */ 855 */
867 resp->tr_qm_sbchange.tr_logres = xfs_calc_qm_sbchange_reservation(mp);
868 resp->tr_qm_sbchange.tr_logcount = XFS_DEFAULT_LOG_COUNT;
869
870 resp->tr_qm_setqlim.tr_logres = xfs_calc_qm_setqlim_reservation(mp); 856 resp->tr_qm_setqlim.tr_logres = xfs_calc_qm_setqlim_reservation(mp);
871 resp->tr_qm_setqlim.tr_logcount = XFS_DEFAULT_LOG_COUNT; 857 resp->tr_qm_setqlim.tr_logcount = XFS_DEFAULT_LOG_COUNT;
872 858
diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h
index 1097d14cd583..2d5bdfce6d8f 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.h
+++ b/fs/xfs/libxfs/xfs_trans_resv.h
@@ -56,7 +56,6 @@ struct xfs_trans_resv {
56 struct xfs_trans_res tr_growrtalloc; /* grow realtime allocations */ 56 struct xfs_trans_res tr_growrtalloc; /* grow realtime allocations */
57 struct xfs_trans_res tr_growrtzero; /* grow realtime zeroing */ 57 struct xfs_trans_res tr_growrtzero; /* grow realtime zeroing */
58 struct xfs_trans_res tr_growrtfree; /* grow realtime freeing */ 58 struct xfs_trans_res tr_growrtfree; /* grow realtime freeing */
59 struct xfs_trans_res tr_qm_sbchange; /* change quota flags */
60 struct xfs_trans_res tr_qm_setqlim; /* adjust quota limits */ 59 struct xfs_trans_res tr_qm_setqlim; /* adjust quota limits */
61 struct xfs_trans_res tr_qm_dqalloc; /* allocate quota on disk */ 60 struct xfs_trans_res tr_qm_dqalloc; /* allocate quota on disk */
62 struct xfs_trans_res tr_qm_quotaoff; /* turn quota off */ 61 struct xfs_trans_res tr_qm_quotaoff; /* turn quota off */
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index b79dc66b2ecd..b79dc66b2ecd 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 18e2f3bbae5e..3a9b7a1b8704 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -135,30 +135,22 @@ xfs_setfilesize_trans_alloc(
135 */ 135 */
136STATIC int 136STATIC int
137xfs_setfilesize( 137xfs_setfilesize(
138 struct xfs_ioend *ioend) 138 struct xfs_inode *ip,
139 struct xfs_trans *tp,
140 xfs_off_t offset,
141 size_t size)
139{ 142{
140 struct xfs_inode *ip = XFS_I(ioend->io_inode);
141 struct xfs_trans *tp = ioend->io_append_trans;
142 xfs_fsize_t isize; 143 xfs_fsize_t isize;
143 144
144 /*
145 * The transaction may have been allocated in the I/O submission thread,
146 * thus we need to mark ourselves as beeing in a transaction manually.
147 * Similarly for freeze protection.
148 */
149 current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
150 rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
151 0, 1, _THIS_IP_);
152
153 xfs_ilock(ip, XFS_ILOCK_EXCL); 145 xfs_ilock(ip, XFS_ILOCK_EXCL);
154 isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size); 146 isize = xfs_new_eof(ip, offset + size);
155 if (!isize) { 147 if (!isize) {
156 xfs_iunlock(ip, XFS_ILOCK_EXCL); 148 xfs_iunlock(ip, XFS_ILOCK_EXCL);
157 xfs_trans_cancel(tp, 0); 149 xfs_trans_cancel(tp, 0);
158 return 0; 150 return 0;
159 } 151 }
160 152
161 trace_xfs_setfilesize(ip, ioend->io_offset, ioend->io_size); 153 trace_xfs_setfilesize(ip, offset, size);
162 154
163 ip->i_d.di_size = isize; 155 ip->i_d.di_size = isize;
164 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); 156 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
@@ -167,6 +159,25 @@ xfs_setfilesize(
167 return xfs_trans_commit(tp, 0); 159 return xfs_trans_commit(tp, 0);
168} 160}
169 161
162STATIC int
163xfs_setfilesize_ioend(
164 struct xfs_ioend *ioend)
165{
166 struct xfs_inode *ip = XFS_I(ioend->io_inode);
167 struct xfs_trans *tp = ioend->io_append_trans;
168
169 /*
170 * The transaction may have been allocated in the I/O submission thread,
171 * thus we need to mark ourselves as being in a transaction manually.
172 * Similarly for freeze protection.
173 */
174 current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
175 rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
176 0, 1, _THIS_IP_);
177
178 return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
179}
180
170/* 181/*
171 * Schedule IO completion handling on the final put of an ioend. 182 * Schedule IO completion handling on the final put of an ioend.
172 * 183 *
@@ -182,8 +193,7 @@ xfs_finish_ioend(
182 193
183 if (ioend->io_type == XFS_IO_UNWRITTEN) 194 if (ioend->io_type == XFS_IO_UNWRITTEN)
184 queue_work(mp->m_unwritten_workqueue, &ioend->io_work); 195 queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
185 else if (ioend->io_append_trans || 196 else if (ioend->io_append_trans)
186 (ioend->io_isdirect && xfs_ioend_is_append(ioend)))
187 queue_work(mp->m_data_workqueue, &ioend->io_work); 197 queue_work(mp->m_data_workqueue, &ioend->io_work);
188 else 198 else
189 xfs_destroy_ioend(ioend); 199 xfs_destroy_ioend(ioend);
@@ -215,22 +225,8 @@ xfs_end_io(
215 if (ioend->io_type == XFS_IO_UNWRITTEN) { 225 if (ioend->io_type == XFS_IO_UNWRITTEN) {
216 error = xfs_iomap_write_unwritten(ip, ioend->io_offset, 226 error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
217 ioend->io_size); 227 ioend->io_size);
218 } else if (ioend->io_isdirect && xfs_ioend_is_append(ioend)) {
219 /*
220 * For direct I/O we do not know if we need to allocate blocks
221 * or not so we can't preallocate an append transaction as that
222 * results in nested reservations and log space deadlocks. Hence
223 * allocate the transaction here. While this is sub-optimal and
224 * can block IO completion for some time, we're stuck with doing
225 * it this way until we can pass the ioend to the direct IO
226 * allocation callbacks and avoid nesting that way.
227 */
228 error = xfs_setfilesize_trans_alloc(ioend);
229 if (error)
230 goto done;
231 error = xfs_setfilesize(ioend);
232 } else if (ioend->io_append_trans) { 228 } else if (ioend->io_append_trans) {
233 error = xfs_setfilesize(ioend); 229 error = xfs_setfilesize_ioend(ioend);
234 } else { 230 } else {
235 ASSERT(!xfs_ioend_is_append(ioend)); 231 ASSERT(!xfs_ioend_is_append(ioend));
236 } 232 }
@@ -242,17 +238,6 @@ done:
242} 238}
243 239
244/* 240/*
245 * Call IO completion handling in caller context on the final put of an ioend.
246 */
247STATIC void
248xfs_finish_ioend_sync(
249 struct xfs_ioend *ioend)
250{
251 if (atomic_dec_and_test(&ioend->io_remaining))
252 xfs_end_io(&ioend->io_work);
253}
254
255/*
256 * Allocate and initialise an IO completion structure. 241 * Allocate and initialise an IO completion structure.
257 * We need to track unwritten extent write completion here initially. 242 * We need to track unwritten extent write completion here initially.
258 * We'll need to extend this for updating the ondisk inode size later 243 * We'll need to extend this for updating the ondisk inode size later
@@ -273,7 +258,6 @@ xfs_alloc_ioend(
273 * all the I/O from calling the completion routine too early. 258 * all the I/O from calling the completion routine too early.
274 */ 259 */
275 atomic_set(&ioend->io_remaining, 1); 260 atomic_set(&ioend->io_remaining, 1);
276 ioend->io_isdirect = 0;
277 ioend->io_error = 0; 261 ioend->io_error = 0;
278 ioend->io_list = NULL; 262 ioend->io_list = NULL;
279 ioend->io_type = type; 263 ioend->io_type = type;
@@ -1459,11 +1443,7 @@ xfs_get_blocks_direct(
1459 * 1443 *
1460 * If the private argument is non-NULL __xfs_get_blocks signals us that we 1444 * If the private argument is non-NULL __xfs_get_blocks signals us that we
1461 * need to issue a transaction to convert the range from unwritten to written 1445 * need to issue a transaction to convert the range from unwritten to written
1462 * extents. In case this is regular synchronous I/O we just call xfs_end_io 1446 * extents.
1463 * to do this and we are done. But in case this was a successful AIO
1464 * request this handler is called from interrupt context, from which we
1465 * can't start transactions. In that case offload the I/O completion to
1466 * the workqueues we also use for buffered I/O completion.
1467 */ 1447 */
1468STATIC void 1448STATIC void
1469xfs_end_io_direct_write( 1449xfs_end_io_direct_write(
@@ -1472,7 +1452,12 @@ xfs_end_io_direct_write(
1472 ssize_t size, 1452 ssize_t size,
1473 void *private) 1453 void *private)
1474{ 1454{
1475 struct xfs_ioend *ioend = iocb->private; 1455 struct inode *inode = file_inode(iocb->ki_filp);
1456 struct xfs_inode *ip = XFS_I(inode);
1457 struct xfs_mount *mp = ip->i_mount;
1458
1459 if (XFS_FORCED_SHUTDOWN(mp))
1460 return;
1476 1461
1477 /* 1462 /*
1478 * While the generic direct I/O code updates the inode size, it does 1463 * While the generic direct I/O code updates the inode size, it does
@@ -1480,22 +1465,33 @@ xfs_end_io_direct_write(
1480 * end_io handler thinks the on-disk size is outside the in-core 1465 * end_io handler thinks the on-disk size is outside the in-core
1481 * size. To prevent this just update it a little bit earlier here. 1466 * size. To prevent this just update it a little bit earlier here.
1482 */ 1467 */
1483 if (offset + size > i_size_read(ioend->io_inode)) 1468 if (offset + size > i_size_read(inode))
1484 i_size_write(ioend->io_inode, offset + size); 1469 i_size_write(inode, offset + size);
1485 1470
1486 /* 1471 /*
1487 * blockdev_direct_IO can return an error even after the I/O 1472 * For direct I/O we do not know if we need to allocate blocks or not,
1488 * completion handler was called. Thus we need to protect 1473 * so we can't preallocate an append transaction, as that results in
1489 * against double-freeing. 1474 * nested reservations and log space deadlocks. Hence allocate the
1475 * transaction here. While this is sub-optimal and can block IO
1476 * completion for some time, we're stuck with doing it this way until
1477 * we can pass the ioend to the direct IO allocation callbacks and
1478 * avoid nesting that way.
1490 */ 1479 */
1491 iocb->private = NULL; 1480 if (private && size > 0) {
1492 1481 xfs_iomap_write_unwritten(ip, offset, size);
1493 ioend->io_offset = offset; 1482 } else if (offset + size > ip->i_d.di_size) {
1494 ioend->io_size = size; 1483 struct xfs_trans *tp;
1495 if (private && size > 0) 1484 int error;
1496 ioend->io_type = XFS_IO_UNWRITTEN; 1485
1486 tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
1487 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
1488 if (error) {
1489 xfs_trans_cancel(tp, 0);
1490 return;
1491 }
1497 1492
1498 xfs_finish_ioend_sync(ioend); 1493 xfs_setfilesize(ip, tp, offset, size);
1494 }
1499} 1495}
1500 1496
1501STATIC ssize_t 1497STATIC ssize_t
@@ -1507,39 +1503,16 @@ xfs_vm_direct_IO(
1507{ 1503{
1508 struct inode *inode = iocb->ki_filp->f_mapping->host; 1504 struct inode *inode = iocb->ki_filp->f_mapping->host;
1509 struct block_device *bdev = xfs_find_bdev_for_inode(inode); 1505 struct block_device *bdev = xfs_find_bdev_for_inode(inode);
1510 struct xfs_ioend *ioend = NULL;
1511 ssize_t ret;
1512 1506
1513 if (rw & WRITE) { 1507 if (rw & WRITE) {
1514 size_t size = iov_iter_count(iter); 1508 return __blockdev_direct_IO(rw, iocb, inode, bdev, iter,
1515
1516 /*
1517 * We cannot preallocate a size update transaction here as we
1518 * don't know whether allocation is necessary or not. Hence we
1519 * can only tell IO completion that one is necessary if we are
1520 * not doing unwritten extent conversion.
1521 */
1522 iocb->private = ioend = xfs_alloc_ioend(inode, XFS_IO_DIRECT);
1523 if (offset + size > XFS_I(inode)->i_d.di_size)
1524 ioend->io_isdirect = 1;
1525
1526 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iter,
1527 offset, xfs_get_blocks_direct, 1509 offset, xfs_get_blocks_direct,
1528 xfs_end_io_direct_write, NULL, 1510 xfs_end_io_direct_write, NULL,
1529 DIO_ASYNC_EXTEND); 1511 DIO_ASYNC_EXTEND);
1530 if (ret != -EIOCBQUEUED && iocb->private)
1531 goto out_destroy_ioend;
1532 } else {
1533 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iter,
1534 offset, xfs_get_blocks_direct,
1535 NULL, NULL, 0);
1536 } 1512 }
1537 1513 return __blockdev_direct_IO(rw, iocb, inode, bdev, iter,
1538 return ret; 1514 offset, xfs_get_blocks_direct,
1539 1515 NULL, NULL, 0);
1540out_destroy_ioend:
1541 xfs_destroy_ioend(ioend);
1542 return ret;
1543} 1516}
1544 1517
1545/* 1518/*
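
Net effect of the xfs_aops.c changes: direct I/O no longer allocates an ioend at submission time. xfs_setfilesize() becomes a generic (ip, tp, offset, size) helper, the ioend-based entry survives as xfs_setfilesize_ioend() for buffered writeback completion, and direct I/O completion now runs inline from xfs_end_io_direct_write(). Its decision logic, condensed into a sketch:

	if (XFS_FORCED_SHUTDOWN(mp))
		return;
	if (private && size > 0) {
		/* Unwritten extents were mapped: convert them in place. */
		xfs_iomap_write_unwritten(ip, offset, size);
	} else if (offset + size > ip->i_d.di_size) {
		/* Size extension: allocate an FSYNC_TS transaction, */
		/* then update the on-disk inode size. */
		xfs_setfilesize(ip, tp, offset, size);
	}
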
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index f94dd459dff9..ac644e0137a4 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -24,14 +24,12 @@ extern mempool_t *xfs_ioend_pool;
24 * Types of I/O for bmap clustering and I/O completion tracking. 24 * Types of I/O for bmap clustering and I/O completion tracking.
25 */ 25 */
26enum { 26enum {
27 XFS_IO_DIRECT = 0, /* special case for direct I/O ioends */
28 XFS_IO_DELALLOC, /* covers delalloc region */ 27 XFS_IO_DELALLOC, /* covers delalloc region */
29 XFS_IO_UNWRITTEN, /* covers allocated but uninitialized data */ 28 XFS_IO_UNWRITTEN, /* covers allocated but uninitialized data */
30 XFS_IO_OVERWRITE, /* covers already allocated extent */ 29 XFS_IO_OVERWRITE, /* covers already allocated extent */
31}; 30};
32 31
33#define XFS_IO_TYPES \ 32#define XFS_IO_TYPES \
34 { 0, "" }, \
35 { XFS_IO_DELALLOC, "delalloc" }, \ 33 { XFS_IO_DELALLOC, "delalloc" }, \
36 { XFS_IO_UNWRITTEN, "unwritten" }, \ 34 { XFS_IO_UNWRITTEN, "unwritten" }, \
37 { XFS_IO_OVERWRITE, "overwrite" } 35 { XFS_IO_OVERWRITE, "overwrite" }
@@ -45,7 +43,6 @@ typedef struct xfs_ioend {
45 unsigned int io_type; /* delalloc / unwritten */ 43 unsigned int io_type; /* delalloc / unwritten */
46 int io_error; /* I/O error code */ 44 int io_error; /* I/O error code */
47 atomic_t io_remaining; /* hold count */ 45 atomic_t io_remaining; /* hold count */
48 unsigned int io_isdirect : 1;/* direct I/O */
49 struct inode *io_inode; /* file being written to */ 46 struct inode *io_inode; /* file being written to */
50 struct buffer_head *io_buffer_head;/* buffer linked list head */ 47 struct buffer_head *io_buffer_head;/* buffer linked list head */
51 struct buffer_head *io_buffer_tail;/* buffer linked list tail */ 48 struct buffer_head *io_buffer_tail;/* buffer linked list tail */
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 2fdb72d2c908..736429a72a12 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -26,43 +26,8 @@ struct xfs_ifork;
26struct xfs_inode; 26struct xfs_inode;
27struct xfs_mount; 27struct xfs_mount;
28struct xfs_trans; 28struct xfs_trans;
29struct xfs_bmalloca;
29 30
30/*
31 * Argument structure for xfs_bmap_alloc.
32 */
33struct xfs_bmalloca {
34 xfs_fsblock_t *firstblock; /* i/o first block allocated */
35 struct xfs_bmap_free *flist; /* bmap freelist */
36 struct xfs_trans *tp; /* transaction pointer */
37 struct xfs_inode *ip; /* incore inode pointer */
38 struct xfs_bmbt_irec prev; /* extent before the new one */
39 struct xfs_bmbt_irec got; /* extent after, or delayed */
40
41 xfs_fileoff_t offset; /* offset in file filling in */
42 xfs_extlen_t length; /* i/o length asked/allocated */
43 xfs_fsblock_t blkno; /* starting block of new extent */
44
45 struct xfs_btree_cur *cur; /* btree cursor */
46 xfs_extnum_t idx; /* current extent index */
47 int nallocs;/* number of extents alloc'd */
48 int logflags;/* flags for transaction logging */
49
50 xfs_extlen_t total; /* total blocks needed for xaction */
51 xfs_extlen_t minlen; /* minimum allocation size (blocks) */
52 xfs_extlen_t minleft; /* amount must be left after alloc */
53 bool eof; /* set if allocating past last extent */
54 bool wasdel; /* replacing a delayed allocation */
55 bool userdata;/* set if is user data */
56 bool aeof; /* allocated space at eof */
57 bool conv; /* overwriting unwritten extents */
58 int flags;
59 struct completion *done;
60 struct work_struct work;
61 int result;
62};
63
64int xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist,
65 int *committed);
66int xfs_bmap_rtalloc(struct xfs_bmalloca *ap); 31int xfs_bmap_rtalloc(struct xfs_bmalloca *ap);
67int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff, 32int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff,
68 int whichfork, int *eof); 33 int whichfork, int *eof);
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index bb502a391792..1790b00bea7a 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1488,6 +1488,7 @@ xfs_buf_iomove(
1488static enum lru_status 1488static enum lru_status
1489xfs_buftarg_wait_rele( 1489xfs_buftarg_wait_rele(
1490 struct list_head *item, 1490 struct list_head *item,
1491 struct list_lru_one *lru,
1491 spinlock_t *lru_lock, 1492 spinlock_t *lru_lock,
1492 void *arg) 1493 void *arg)
1493 1494
@@ -1509,7 +1510,7 @@ xfs_buftarg_wait_rele(
1509 */ 1510 */
1510 atomic_set(&bp->b_lru_ref, 0); 1511 atomic_set(&bp->b_lru_ref, 0);
1511 bp->b_state |= XFS_BSTATE_DISPOSE; 1512 bp->b_state |= XFS_BSTATE_DISPOSE;
1512 list_move(item, dispose); 1513 list_lru_isolate_move(lru, item, dispose);
1513 spin_unlock(&bp->b_lock); 1514 spin_unlock(&bp->b_lock);
1514 return LRU_REMOVED; 1515 return LRU_REMOVED;
1515} 1516}
@@ -1546,6 +1547,7 @@ xfs_wait_buftarg(
1546static enum lru_status 1547static enum lru_status
1547xfs_buftarg_isolate( 1548xfs_buftarg_isolate(
1548 struct list_head *item, 1549 struct list_head *item,
1550 struct list_lru_one *lru,
1549 spinlock_t *lru_lock, 1551 spinlock_t *lru_lock,
1550 void *arg) 1552 void *arg)
1551{ 1553{
@@ -1569,7 +1571,7 @@ xfs_buftarg_isolate(
1569 } 1571 }
1570 1572
1571 bp->b_state |= XFS_BSTATE_DISPOSE; 1573 bp->b_state |= XFS_BSTATE_DISPOSE;
1572 list_move(item, dispose); 1574 list_lru_isolate_move(lru, item, dispose);
1573 spin_unlock(&bp->b_lock); 1575 spin_unlock(&bp->b_lock);
1574 return LRU_REMOVED; 1576 return LRU_REMOVED;
1575} 1577}
@@ -1583,10 +1585,9 @@ xfs_buftarg_shrink_scan(
1583 struct xfs_buftarg, bt_shrinker); 1585 struct xfs_buftarg, bt_shrinker);
1584 LIST_HEAD(dispose); 1586 LIST_HEAD(dispose);
1585 unsigned long freed; 1587 unsigned long freed;
1586 unsigned long nr_to_scan = sc->nr_to_scan;
1587 1588
1588 freed = list_lru_walk_node(&btp->bt_lru, sc->nid, xfs_buftarg_isolate, 1589 freed = list_lru_shrink_walk(&btp->bt_lru, sc,
1589 &dispose, &nr_to_scan); 1590 xfs_buftarg_isolate, &dispose);
1590 1591
1591 while (!list_empty(&dispose)) { 1592 while (!list_empty(&dispose)) {
1592 struct xfs_buf *bp; 1593 struct xfs_buf *bp;
@@ -1605,7 +1606,7 @@ xfs_buftarg_shrink_count(
1605{ 1606{
1606 struct xfs_buftarg *btp = container_of(shrink, 1607 struct xfs_buftarg *btp = container_of(shrink,
1607 struct xfs_buftarg, bt_shrinker); 1608 struct xfs_buftarg, bt_shrinker);
1608 return list_lru_count_node(&btp->bt_lru, sc->nid); 1609 return list_lru_shrink_count(&btp->bt_lru, sc);
1609} 1610}
1610 1611
1611void 1612void
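
The buffer-cache hunks track a VFS-side list_lru interface change: walk callbacks now receive the struct list_lru_one being iterated and must use list_lru_isolate_move() rather than a bare list_move() so the list's internal item accounting stays correct, and shrinkers switch to the list_lru_shrink_count()/list_lru_shrink_walk() wrappers that take the shrink_control directly. A skeleton of an isolate callback in the new shape (a sketch, not a complete shrinker):

	static enum lru_status
	example_isolate(
		struct list_head	*item,
		struct list_lru_one	*lru,
		spinlock_t		*lru_lock,
		void			*arg)
	{
		struct list_head	*dispose = arg;

		/* Moves the item and keeps the lru's item count in sync. */
		list_lru_isolate_move(lru, item, dispose);
		return LRU_REMOVED;
	}
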
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 3f9bd58edec7..507d96a57ac7 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -319,6 +319,10 @@ xfs_buf_item_format(
319 ASSERT(atomic_read(&bip->bli_refcount) > 0); 319 ASSERT(atomic_read(&bip->bli_refcount) > 0);
320 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || 320 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
321 (bip->bli_flags & XFS_BLI_STALE)); 321 (bip->bli_flags & XFS_BLI_STALE));
322 ASSERT((bip->bli_flags & XFS_BLI_STALE) ||
323 (xfs_blft_from_flags(&bip->__bli_format) > XFS_BLFT_UNKNOWN_BUF
324 && xfs_blft_from_flags(&bip->__bli_format) < XFS_BLFT_MAX_BUF));
325
322 326
323 /* 327 /*
324 * If it is an inode buffer, transfer the in-memory state to the 328 * If it is an inode buffer, transfer the in-memory state to the
@@ -535,7 +539,7 @@ xfs_buf_item_push(
535 if ((bp->b_flags & XBF_WRITE_FAIL) && 539 if ((bp->b_flags & XBF_WRITE_FAIL) &&
536 ___ratelimit(&xfs_buf_write_fail_rl_state, "XFS:")) { 540 ___ratelimit(&xfs_buf_write_fail_rl_state, "XFS:")) {
537 xfs_warn(bp->b_target->bt_mount, 541 xfs_warn(bp->b_target->bt_mount,
538"Detected failing async write on buffer block 0x%llx. Retrying async write.\n", 542"Detected failing async write on buffer block 0x%llx. Retrying async write.",
539 (long long)bp->b_bn); 543 (long long)bp->b_bn);
540 } 544 }
541 545
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index c24c67e22a2a..2f536f33cd26 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -86,7 +86,7 @@ static inline void xfs_dqflock(xfs_dquot_t *dqp)
86 wait_for_completion(&dqp->q_flush); 86 wait_for_completion(&dqp->q_flush);
87} 87}
88 88
89static inline int xfs_dqflock_nowait(xfs_dquot_t *dqp) 89static inline bool xfs_dqflock_nowait(xfs_dquot_t *dqp)
90{ 90{
91 return try_wait_for_completion(&dqp->q_flush); 91 return try_wait_for_completion(&dqp->q_flush);
92} 92}
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 13e974e6a889..1cdba95c78cb 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -127,6 +127,42 @@ xfs_iozero(
127 return (-status); 127 return (-status);
128} 128}
129 129
130int
131xfs_update_prealloc_flags(
132 struct xfs_inode *ip,
133 enum xfs_prealloc_flags flags)
134{
135 struct xfs_trans *tp;
136 int error;
137
138 tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID);
139 error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0);
140 if (error) {
141 xfs_trans_cancel(tp, 0);
142 return error;
143 }
144
145 xfs_ilock(ip, XFS_ILOCK_EXCL);
146 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
147
148 if (!(flags & XFS_PREALLOC_INVISIBLE)) {
149 ip->i_d.di_mode &= ~S_ISUID;
150 if (ip->i_d.di_mode & S_IXGRP)
151 ip->i_d.di_mode &= ~S_ISGID;
152 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
153 }
154
155 if (flags & XFS_PREALLOC_SET)
156 ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
157 if (flags & XFS_PREALLOC_CLEAR)
158 ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
159
160 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
161 if (flags & XFS_PREALLOC_SYNC)
162 xfs_trans_set_sync(tp);
163 return xfs_trans_commit(tp, 0);
164}
165
130/* 166/*
131 * Fsync operations on directories are much simpler than on regular files, 167 * Fsync operations on directories are much simpler than on regular files,
132 * as there is no file data to flush, and thus also no need for explicit 168 * as there is no file data to flush, and thus also no need for explicit
@@ -699,7 +735,7 @@ xfs_file_buffered_aio_write(
699 735
700 iov_iter_truncate(from, count); 736 iov_iter_truncate(from, count);
701 /* We can write back this queue in page reclaim */ 737 /* We can write back this queue in page reclaim */
702 current->backing_dev_info = mapping->backing_dev_info; 738 current->backing_dev_info = inode_to_bdi(inode);
703 739
704write_retry: 740write_retry:
705 trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0); 741 trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
@@ -784,8 +820,8 @@ xfs_file_fallocate(
784{ 820{
785 struct inode *inode = file_inode(file); 821 struct inode *inode = file_inode(file);
786 struct xfs_inode *ip = XFS_I(inode); 822 struct xfs_inode *ip = XFS_I(inode);
787 struct xfs_trans *tp;
788 long error; 823 long error;
824 enum xfs_prealloc_flags flags = 0;
789 loff_t new_size = 0; 825 loff_t new_size = 0;
790 826
791 if (!S_ISREG(inode->i_mode)) 827 if (!S_ISREG(inode->i_mode))
@@ -822,6 +858,8 @@ xfs_file_fallocate(
822 if (error) 858 if (error)
823 goto out_unlock; 859 goto out_unlock;
824 } else { 860 } else {
861 flags |= XFS_PREALLOC_SET;
862
825 if (!(mode & FALLOC_FL_KEEP_SIZE) && 863 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
826 offset + len > i_size_read(inode)) { 864 offset + len > i_size_read(inode)) {
827 new_size = offset + len; 865 new_size = offset + len;
@@ -839,28 +877,10 @@ xfs_file_fallocate(
839 goto out_unlock; 877 goto out_unlock;
840 } 878 }
841 879
842 tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID);
843 error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0);
844 if (error) {
845 xfs_trans_cancel(tp, 0);
846 goto out_unlock;
847 }
848
849 xfs_ilock(ip, XFS_ILOCK_EXCL);
850 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
851 ip->i_d.di_mode &= ~S_ISUID;
852 if (ip->i_d.di_mode & S_IXGRP)
853 ip->i_d.di_mode &= ~S_ISGID;
854
855 if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE)))
856 ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
857
858 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
859 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
860
861 if (file->f_flags & O_DSYNC) 880 if (file->f_flags & O_DSYNC)
862 xfs_trans_set_sync(tp); 881 flags |= XFS_PREALLOC_SYNC;
863 error = xfs_trans_commit(tp, 0); 882
883 error = xfs_update_prealloc_flags(ip, flags);
864 if (error) 884 if (error)
865 goto out_unlock; 885 goto out_unlock;
866 886
@@ -1384,5 +1404,4 @@ static const struct vm_operations_struct xfs_file_vm_ops = {
1384 .fault = filemap_fault, 1404 .fault = filemap_fault,
1385 .map_pages = filemap_map_pages, 1405 .map_pages = filemap_map_pages,
1386 .page_mkwrite = xfs_vm_page_mkwrite, 1406 .page_mkwrite = xfs_vm_page_mkwrite,
1387 .remap_pages = generic_file_remap_pages,
1388}; 1407};
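
The hunks above move XFS's open-coded WRITEID transaction (clear setuid/setgid, maintain XFS_DIFLAG_PREALLOC, optionally commit synchronously) into the new xfs_update_prealloc_flags() helper, which the fallocate path now calls with an accumulated flag mask. A minimal userspace sketch of the path this serves, assuming Linux fallocate(2) on an XFS mount; the file path is hypothetical:

/* Sketch: exercise the XFS preallocation path from userspace.
 * Assumes Linux with fallocate(2); the path below is hypothetical. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/xfs/prealloc-demo", O_CREAT | O_RDWR, 0644);
	if (fd < 0) {
		perror("open");
		return EXIT_FAILURE;
	}

	/* Reserve 16 MiB without changing i_size; on XFS this lands in
	 * xfs_file_fallocate() -> xfs_update_prealloc_flags() with
	 * XFS_PREALLOC_SET (plus XFS_PREALLOC_SYNC if opened O_DSYNC). */
	if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 16 << 20) < 0)
		perror("fallocate");

	close(fd);
	return 0;
}
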
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index fdc64220fcb0..fba6532efba4 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -488,6 +488,7 @@ xfs_growfs_data_private(
488 xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, nfree); 488 xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, nfree);
489 if (dpct) 489 if (dpct)
490 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct); 490 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct);
491 xfs_trans_set_sync(tp);
491 error = xfs_trans_commit(tp, 0); 492 error = xfs_trans_commit(tp, 0);
492 if (error) 493 if (error)
493 return error; 494 return error;
@@ -541,7 +542,7 @@ xfs_growfs_data_private(
541 saved_error = error; 542 saved_error = error;
542 continue; 543 continue;
543 } 544 }
544 xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, XFS_SB_ALL_BITS); 545 xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb);
545 546
546 error = xfs_bwrite(bp); 547 error = xfs_bwrite(bp);
547 xfs_buf_relse(bp); 548 xfs_buf_relse(bp);
@@ -756,37 +757,6 @@ out:
756 return 0; 757 return 0;
757} 758}
758 759
759/*
760 * Dump a transaction into the log that contains no real change. This is needed
761 * to be able to make the log dirty or stamp the current tail LSN into the log
762 * during the covering operation.
763 *
764 * We cannot use an inode here for this - that will push dirty state back up
765 * into the VFS and then periodic inode flushing will prevent log covering from
766 * making progress. Hence we log a field in the superblock instead and use a
767 * synchronous transaction to ensure the superblock is immediately unpinned
768 * and can be written back.
769 */
770int
771xfs_fs_log_dummy(
772 xfs_mount_t *mp)
773{
774 xfs_trans_t *tp;
775 int error;
776
777 tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP);
778 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0);
779 if (error) {
780 xfs_trans_cancel(tp, 0);
781 return error;
782 }
783
784 /* log the UUID because it is an unchanging field */
785 xfs_mod_sb(tp, XFS_SB_UUID);
786 xfs_trans_set_sync(tp);
787 return xfs_trans_commit(tp, 0);
788}
789
790int 760int
791xfs_fs_goingdown( 761xfs_fs_goingdown(
792 xfs_mount_t *mp, 762 xfs_mount_t *mp,
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 41f804e740d7..daafa1f6d260 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1995,6 +1995,7 @@ xfs_iunlink(
1995 agi->agi_unlinked[bucket_index] = cpu_to_be32(agino); 1995 agi->agi_unlinked[bucket_index] = cpu_to_be32(agino);
1996 offset = offsetof(xfs_agi_t, agi_unlinked) + 1996 offset = offsetof(xfs_agi_t, agi_unlinked) +
1997 (sizeof(xfs_agino_t) * bucket_index); 1997 (sizeof(xfs_agino_t) * bucket_index);
1998 xfs_trans_buf_set_type(tp, agibp, XFS_BLFT_AGI_BUF);
1998 xfs_trans_log_buf(tp, agibp, offset, 1999 xfs_trans_log_buf(tp, agibp, offset,
1999 (offset + sizeof(xfs_agino_t) - 1)); 2000 (offset + sizeof(xfs_agino_t) - 1));
2000 return 0; 2001 return 0;
@@ -2086,6 +2087,7 @@ xfs_iunlink_remove(
2086 agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino); 2087 agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino);
2087 offset = offsetof(xfs_agi_t, agi_unlinked) + 2088 offset = offsetof(xfs_agi_t, agi_unlinked) +
2088 (sizeof(xfs_agino_t) * bucket_index); 2089 (sizeof(xfs_agino_t) * bucket_index);
2090 xfs_trans_buf_set_type(tp, agibp, XFS_BLFT_AGI_BUF);
2089 xfs_trans_log_buf(tp, agibp, offset, 2091 xfs_trans_log_buf(tp, agibp, offset,
2090 (offset + sizeof(xfs_agino_t) - 1)); 2092 (offset + sizeof(xfs_agino_t) - 1));
2091 } else { 2093 } else {
@@ -2656,6 +2658,124 @@ xfs_sort_for_rename(
2656} 2658}
2657 2659
2658/* 2660/*
2661 * xfs_cross_rename()
2662 *
 2663 * Responsible for handling the RENAME_EXCHANGE flag in the renameat2() syscall
2664 */
2665STATIC int
2666xfs_cross_rename(
2667 struct xfs_trans *tp,
2668 struct xfs_inode *dp1,
2669 struct xfs_name *name1,
2670 struct xfs_inode *ip1,
2671 struct xfs_inode *dp2,
2672 struct xfs_name *name2,
2673 struct xfs_inode *ip2,
2674 struct xfs_bmap_free *free_list,
2675 xfs_fsblock_t *first_block,
2676 int spaceres)
2677{
2678 int error = 0;
2679 int ip1_flags = 0;
2680 int ip2_flags = 0;
2681 int dp2_flags = 0;
2682
2683 /* Swap inode number for dirent in first parent */
2684 error = xfs_dir_replace(tp, dp1, name1,
2685 ip2->i_ino,
2686 first_block, free_list, spaceres);
2687 if (error)
2688 goto out;
2689
2690 /* Swap inode number for dirent in second parent */
2691 error = xfs_dir_replace(tp, dp2, name2,
2692 ip1->i_ino,
2693 first_block, free_list, spaceres);
2694 if (error)
2695 goto out;
2696
2697 /*
2698 * If we're renaming one or more directories across different parents,
2699 * update the respective ".." entries (and link counts) to match the new
2700 * parents.
2701 */
2702 if (dp1 != dp2) {
2703 dp2_flags = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
2704
2705 if (S_ISDIR(ip2->i_d.di_mode)) {
2706 error = xfs_dir_replace(tp, ip2, &xfs_name_dotdot,
2707 dp1->i_ino, first_block,
2708 free_list, spaceres);
2709 if (error)
2710 goto out;
2711
2712 /* transfer ip2 ".." reference to dp1 */
2713 if (!S_ISDIR(ip1->i_d.di_mode)) {
2714 error = xfs_droplink(tp, dp2);
2715 if (error)
2716 goto out;
2717 error = xfs_bumplink(tp, dp1);
2718 if (error)
2719 goto out;
2720 }
2721
2722 /*
 2723 * Although ip1 isn't changed here, userspace needs
 2724 * to be notified of the change, so that applications
 2725 * relying on it (such as backup tools) can properly
 2726 * react to the change
2727 */
2728 ip1_flags |= XFS_ICHGTIME_CHG;
2729 ip2_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
2730 }
2731
2732 if (S_ISDIR(ip1->i_d.di_mode)) {
2733 error = xfs_dir_replace(tp, ip1, &xfs_name_dotdot,
2734 dp2->i_ino, first_block,
2735 free_list, spaceres);
2736 if (error)
2737 goto out;
2738
2739 /* transfer ip1 ".." reference to dp2 */
2740 if (!S_ISDIR(ip2->i_d.di_mode)) {
2741 error = xfs_droplink(tp, dp1);
2742 if (error)
2743 goto out;
2744 error = xfs_bumplink(tp, dp2);
2745 if (error)
2746 goto out;
2747 }
2748
2749 /*
 2750 * Although ip2 isn't changed here, userspace needs
 2751 * to be notified of the change, so that applications
 2752 * relying on it (such as backup tools) can properly
 2753 * react to the change
2754 */
2755 ip1_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
2756 ip2_flags |= XFS_ICHGTIME_CHG;
2757 }
2758 }
2759
2760 if (ip1_flags) {
2761 xfs_trans_ichgtime(tp, ip1, ip1_flags);
2762 xfs_trans_log_inode(tp, ip1, XFS_ILOG_CORE);
2763 }
2764 if (ip2_flags) {
2765 xfs_trans_ichgtime(tp, ip2, ip2_flags);
2766 xfs_trans_log_inode(tp, ip2, XFS_ILOG_CORE);
2767 }
2768 if (dp2_flags) {
2769 xfs_trans_ichgtime(tp, dp2, dp2_flags);
2770 xfs_trans_log_inode(tp, dp2, XFS_ILOG_CORE);
2771 }
2772 xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2773 xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE);
2774out:
2775 return error;
2776}
2777
2778/*
2659 * xfs_rename 2779 * xfs_rename
2660 */ 2780 */
2661int 2781int
@@ -2665,7 +2785,8 @@ xfs_rename(
2665 xfs_inode_t *src_ip, 2785 xfs_inode_t *src_ip,
2666 xfs_inode_t *target_dp, 2786 xfs_inode_t *target_dp,
2667 struct xfs_name *target_name, 2787 struct xfs_name *target_name,
2668 xfs_inode_t *target_ip) 2788 xfs_inode_t *target_ip,
2789 unsigned int flags)
2669{ 2790{
2670 xfs_trans_t *tp = NULL; 2791 xfs_trans_t *tp = NULL;
2671 xfs_mount_t *mp = src_dp->i_mount; 2792 xfs_mount_t *mp = src_dp->i_mount;
@@ -2743,6 +2864,18 @@ xfs_rename(
2743 } 2864 }
2744 2865
2745 /* 2866 /*
 2867 * Handle the RENAME_EXCHANGE flag
2868 */
2869 if (flags & RENAME_EXCHANGE) {
2870 error = xfs_cross_rename(tp, src_dp, src_name, src_ip,
2871 target_dp, target_name, target_ip,
2872 &free_list, &first_block, spaceres);
2873 if (error)
2874 goto abort_return;
2875 goto finish_rename;
2876 }
2877
2878 /*
2746 * Set up the target. 2879 * Set up the target.
2747 */ 2880 */
2748 if (target_ip == NULL) { 2881 if (target_ip == NULL) {
@@ -2881,6 +3014,7 @@ xfs_rename(
2881 if (new_parent) 3014 if (new_parent)
2882 xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); 3015 xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
2883 3016
3017finish_rename:
2884 /* 3018 /*
2885 * If this is a synchronous mount, make sure that the 3019 * If this is a synchronous mount, make sure that the
2886 * rename transaction goes to disk before returning to 3020 * rename transaction goes to disk before returning to
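
xfs_cross_rename() above implements the kernel side of RENAME_EXCHANGE: both dirents are replaced in one transaction, and ".." entries plus link counts are fixed up when directories change parents. A hedged userspace sketch of triggering it, calling the real renameat2(2) syscall directly since older glibc has no wrapper; the two paths are hypothetical:

/* Sketch: atomically swap two paths with RENAME_EXCHANGE, the
 * operation xfs_cross_rename() serves. Requires Linux 3.15+ and a
 * filesystem implementing .rename2 (XFS gains it in this series). */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef RENAME_EXCHANGE
#define RENAME_EXCHANGE (1 << 1)
#endif

int main(void)
{
	/* glibc gained a renameat2() wrapper only in 2.28, so invoke
	 * the syscall directly for portability. */
	long ret = syscall(SYS_renameat2, AT_FDCWD, "a.conf",
			   AT_FDCWD, "b.conf", RENAME_EXCHANGE);
	if (ret < 0)
		perror("renameat2");
	else
		printf("a.conf and b.conf exchanged atomically\n");
	return ret < 0;
}
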
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 4ed2ba9342dc..86cd6b39bed7 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -338,7 +338,7 @@ int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
338int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name, 338int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name,
339 struct xfs_inode *src_ip, struct xfs_inode *target_dp, 339 struct xfs_inode *src_ip, struct xfs_inode *target_dp,
340 struct xfs_name *target_name, 340 struct xfs_name *target_name,
341 struct xfs_inode *target_ip); 341 struct xfs_inode *target_ip, unsigned int flags);
342 342
343void xfs_ilock(xfs_inode_t *, uint); 343void xfs_ilock(xfs_inode_t *, uint);
344int xfs_ilock_nowait(xfs_inode_t *, uint); 344int xfs_ilock_nowait(xfs_inode_t *, uint);
@@ -377,6 +377,15 @@ int xfs_droplink(struct xfs_trans *, struct xfs_inode *);
377int xfs_bumplink(struct xfs_trans *, struct xfs_inode *); 377int xfs_bumplink(struct xfs_trans *, struct xfs_inode *);
378 378
379/* from xfs_file.c */ 379/* from xfs_file.c */
380enum xfs_prealloc_flags {
381 XFS_PREALLOC_SET = (1 << 1),
382 XFS_PREALLOC_CLEAR = (1 << 2),
383 XFS_PREALLOC_SYNC = (1 << 3),
384 XFS_PREALLOC_INVISIBLE = (1 << 4),
385};
386
387int xfs_update_prealloc_flags(struct xfs_inode *,
388 enum xfs_prealloc_flags);
380int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t); 389int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
381int xfs_iozero(struct xfs_inode *, loff_t, size_t); 390int xfs_iozero(struct xfs_inode *, loff_t, size_t);
382 391
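
The new enum xfs_prealloc_flags lets each caller accumulate a flag mask and hand it to a single helper instead of duplicating the transaction logic. A standalone model of that idiom, with hypothetical names mirroring the XFS ones:

/* Sketch: the flag-accumulation idiom the callers of
 * xfs_update_prealloc_flags() use. Standalone userspace model;
 * all names are illustrative stand-ins. */
#include <stdio.h>

enum prealloc_flags {
	PREALLOC_SET       = (1 << 1),
	PREALLOC_CLEAR     = (1 << 2),
	PREALLOC_SYNC      = (1 << 3),
	PREALLOC_INVISIBLE = (1 << 4),
};

static void update_prealloc(unsigned int flags)
{
	if (!(flags & PREALLOC_INVISIBLE))
		printf("would clear suid/sgid and bump timestamps\n");
	if (flags & PREALLOC_SET)
		printf("would set the PREALLOC inode flag\n");
	if (flags & PREALLOC_CLEAR)
		printf("would clear the PREALLOC inode flag\n");
	if (flags & PREALLOC_SYNC)
		printf("would commit the transaction synchronously\n");
}

int main(void)
{
	unsigned int flags = 0;

	flags |= PREALLOC_SET;	/* e.g. an XFS_IOC_RESVSP request */
	flags |= PREALLOC_SYNC;	/* e.g. the file was opened O_DSYNC */
	update_prealloc(flags);
	return 0;
}
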
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index a1831980a68e..f7afb86c9148 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -606,11 +606,8 @@ xfs_ioc_space(
606 unsigned int cmd, 606 unsigned int cmd,
607 xfs_flock64_t *bf) 607 xfs_flock64_t *bf)
608{ 608{
609 struct xfs_mount *mp = ip->i_mount;
610 struct xfs_trans *tp;
611 struct iattr iattr; 609 struct iattr iattr;
612 bool setprealloc = false; 610 enum xfs_prealloc_flags flags = 0;
613 bool clrprealloc = false;
614 int error; 611 int error;
615 612
616 /* 613 /*
@@ -630,6 +627,11 @@ xfs_ioc_space(
630 if (!S_ISREG(inode->i_mode)) 627 if (!S_ISREG(inode->i_mode))
631 return -EINVAL; 628 return -EINVAL;
632 629
630 if (filp->f_flags & O_DSYNC)
631 flags |= XFS_PREALLOC_SYNC;
632 if (ioflags & XFS_IO_INVIS)
633 flags |= XFS_PREALLOC_INVISIBLE;
634
633 error = mnt_want_write_file(filp); 635 error = mnt_want_write_file(filp);
634 if (error) 636 if (error)
635 return error; 637 return error;
@@ -673,25 +675,23 @@ xfs_ioc_space(
673 } 675 }
674 676
675 if (bf->l_start < 0 || 677 if (bf->l_start < 0 ||
676 bf->l_start > mp->m_super->s_maxbytes || 678 bf->l_start > inode->i_sb->s_maxbytes ||
677 bf->l_start + bf->l_len < 0 || 679 bf->l_start + bf->l_len < 0 ||
678 bf->l_start + bf->l_len >= mp->m_super->s_maxbytes) { 680 bf->l_start + bf->l_len >= inode->i_sb->s_maxbytes) {
679 error = -EINVAL; 681 error = -EINVAL;
680 goto out_unlock; 682 goto out_unlock;
681 } 683 }
682 684
683 switch (cmd) { 685 switch (cmd) {
684 case XFS_IOC_ZERO_RANGE: 686 case XFS_IOC_ZERO_RANGE:
687 flags |= XFS_PREALLOC_SET;
685 error = xfs_zero_file_space(ip, bf->l_start, bf->l_len); 688 error = xfs_zero_file_space(ip, bf->l_start, bf->l_len);
686 if (!error)
687 setprealloc = true;
688 break; 689 break;
689 case XFS_IOC_RESVSP: 690 case XFS_IOC_RESVSP:
690 case XFS_IOC_RESVSP64: 691 case XFS_IOC_RESVSP64:
692 flags |= XFS_PREALLOC_SET;
691 error = xfs_alloc_file_space(ip, bf->l_start, bf->l_len, 693 error = xfs_alloc_file_space(ip, bf->l_start, bf->l_len,
692 XFS_BMAPI_PREALLOC); 694 XFS_BMAPI_PREALLOC);
693 if (!error)
694 setprealloc = true;
695 break; 695 break;
696 case XFS_IOC_UNRESVSP: 696 case XFS_IOC_UNRESVSP:
697 case XFS_IOC_UNRESVSP64: 697 case XFS_IOC_UNRESVSP64:
@@ -701,6 +701,7 @@ xfs_ioc_space(
701 case XFS_IOC_ALLOCSP64: 701 case XFS_IOC_ALLOCSP64:
702 case XFS_IOC_FREESP: 702 case XFS_IOC_FREESP:
703 case XFS_IOC_FREESP64: 703 case XFS_IOC_FREESP64:
704 flags |= XFS_PREALLOC_CLEAR;
704 if (bf->l_start > XFS_ISIZE(ip)) { 705 if (bf->l_start > XFS_ISIZE(ip)) {
705 error = xfs_alloc_file_space(ip, XFS_ISIZE(ip), 706 error = xfs_alloc_file_space(ip, XFS_ISIZE(ip),
706 bf->l_start - XFS_ISIZE(ip), 0); 707 bf->l_start - XFS_ISIZE(ip), 0);
@@ -712,8 +713,6 @@ xfs_ioc_space(
712 iattr.ia_size = bf->l_start; 713 iattr.ia_size = bf->l_start;
713 714
714 error = xfs_setattr_size(ip, &iattr); 715 error = xfs_setattr_size(ip, &iattr);
715 if (!error)
716 clrprealloc = true;
717 break; 716 break;
718 default: 717 default:
719 ASSERT(0); 718 ASSERT(0);
@@ -723,32 +722,7 @@ xfs_ioc_space(
723 if (error) 722 if (error)
724 goto out_unlock; 723 goto out_unlock;
725 724
726 tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID); 725 error = xfs_update_prealloc_flags(ip, flags);
727 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_writeid, 0, 0);
728 if (error) {
729 xfs_trans_cancel(tp, 0);
730 goto out_unlock;
731 }
732
733 xfs_ilock(ip, XFS_ILOCK_EXCL);
734 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
735
736 if (!(ioflags & XFS_IO_INVIS)) {
737 ip->i_d.di_mode &= ~S_ISUID;
738 if (ip->i_d.di_mode & S_IXGRP)
739 ip->i_d.di_mode &= ~S_ISGID;
740 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
741 }
742
743 if (setprealloc)
744 ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
745 else if (clrprealloc)
746 ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
747
748 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
749 if (filp->f_flags & O_DSYNC)
750 xfs_trans_set_sync(tp);
751 error = xfs_trans_commit(tp, 0);
752 726
753out_unlock: 727out_unlock:
754 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 728 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
@@ -1013,20 +987,182 @@ xfs_diflags_to_linux(
1013 inode->i_flags &= ~S_NOATIME; 987 inode->i_flags &= ~S_NOATIME;
1014} 988}
1015 989
1016#define FSX_PROJID 1 990static int
1017#define FSX_EXTSIZE 2 991xfs_ioctl_setattr_xflags(
1018#define FSX_XFLAGS 4 992 struct xfs_trans *tp,
1019#define FSX_NONBLOCK 8 993 struct xfs_inode *ip,
994 struct fsxattr *fa)
995{
996 struct xfs_mount *mp = ip->i_mount;
997
998 /* Can't change realtime flag if any extents are allocated. */
999 if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
1000 XFS_IS_REALTIME_INODE(ip) != (fa->fsx_xflags & XFS_XFLAG_REALTIME))
1001 return -EINVAL;
1002
1003 /* If realtime flag is set then must have realtime device */
1004 if (fa->fsx_xflags & XFS_XFLAG_REALTIME) {
1005 if (mp->m_sb.sb_rblocks == 0 || mp->m_sb.sb_rextsize == 0 ||
1006 (ip->i_d.di_extsize % mp->m_sb.sb_rextsize))
1007 return -EINVAL;
1008 }
1009
1010 /*
1011 * Can't modify an immutable/append-only file unless
1012 * we have appropriate permission.
1013 */
1014 if (((ip->i_d.di_flags & (XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND)) ||
1015 (fa->fsx_xflags & (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) &&
1016 !capable(CAP_LINUX_IMMUTABLE))
1017 return -EPERM;
1018
1019 xfs_set_diflags(ip, fa->fsx_xflags);
1020 xfs_diflags_to_linux(ip);
1021 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
1022 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1023 XFS_STATS_INC(xs_ig_attrchg);
1024 return 0;
1025}
1026
1027/*
1028 * Set up the transaction structure for the setattr operation, checking that we
1029 * have permission to do so. On success, return a clean transaction and the
1030 * inode locked exclusively ready for further operation specific checks. On
1031 * failure, return an error without modifying or locking the inode.
1032 */
1033static struct xfs_trans *
1034xfs_ioctl_setattr_get_trans(
1035 struct xfs_inode *ip)
1036{
1037 struct xfs_mount *mp = ip->i_mount;
1038 struct xfs_trans *tp;
1039 int error;
1040
1041 if (mp->m_flags & XFS_MOUNT_RDONLY)
1042 return ERR_PTR(-EROFS);
1043 if (XFS_FORCED_SHUTDOWN(mp))
1044 return ERR_PTR(-EIO);
1045
1046 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
1047 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
1048 if (error)
1049 goto out_cancel;
1050
1051 xfs_ilock(ip, XFS_ILOCK_EXCL);
1052 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1053
1054 /*
1055 * CAP_FOWNER overrides the following restrictions:
1056 *
1057 * The user ID of the calling process must be equal to the file owner
1058 * ID, except in cases where the CAP_FSETID capability is applicable.
1059 */
1060 if (!inode_owner_or_capable(VFS_I(ip))) {
1061 error = -EPERM;
1062 goto out_cancel;
1063 }
1064
1065 if (mp->m_flags & XFS_MOUNT_WSYNC)
1066 xfs_trans_set_sync(tp);
1067
1068 return tp;
1069
1070out_cancel:
1071 xfs_trans_cancel(tp, 0);
1072 return ERR_PTR(error);
1073}
1074
1075/*
 1076 * Extent size hint validation is somewhat cumbersome. The rules are:
1077 *
1078 * 1. extent size hint is only valid for directories and regular files
1079 * 2. XFS_XFLAG_EXTSIZE is only valid for regular files
1080 * 3. XFS_XFLAG_EXTSZINHERIT is only valid for directories.
1081 * 4. can only be changed on regular files if no extents are allocated
1082 * 5. can be changed on directories at any time
1083 * 6. extsize hint of 0 turns off hints, clears inode flags.
1084 * 7. Extent size must be a multiple of the appropriate block size.
1085 * 8. for non-realtime files, the extent size hint must be limited
1086 * to half the AG size to avoid alignment extending the extent beyond the
1087 * limits of the AG.
1088 */
1089static int
1090xfs_ioctl_setattr_check_extsize(
1091 struct xfs_inode *ip,
1092 struct fsxattr *fa)
1093{
1094 struct xfs_mount *mp = ip->i_mount;
1095
1096 if ((fa->fsx_xflags & XFS_XFLAG_EXTSIZE) && !S_ISREG(ip->i_d.di_mode))
1097 return -EINVAL;
1098
1099 if ((fa->fsx_xflags & XFS_XFLAG_EXTSZINHERIT) &&
1100 !S_ISDIR(ip->i_d.di_mode))
1101 return -EINVAL;
1102
1103 if (S_ISREG(ip->i_d.di_mode) && ip->i_d.di_nextents &&
1104 ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) != fa->fsx_extsize))
1105 return -EINVAL;
1106
1107 if (fa->fsx_extsize != 0) {
1108 xfs_extlen_t size;
1109 xfs_fsblock_t extsize_fsb;
1110
1111 extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize);
1112 if (extsize_fsb > MAXEXTLEN)
1113 return -EINVAL;
1114
1115 if (XFS_IS_REALTIME_INODE(ip) ||
1116 (fa->fsx_xflags & XFS_XFLAG_REALTIME)) {
1117 size = mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog;
1118 } else {
1119 size = mp->m_sb.sb_blocksize;
1120 if (extsize_fsb > mp->m_sb.sb_agblocks / 2)
1121 return -EINVAL;
1122 }
1123
1124 if (fa->fsx_extsize % size)
1125 return -EINVAL;
1126 } else
1127 fa->fsx_xflags &= ~(XFS_XFLAG_EXTSIZE | XFS_XFLAG_EXTSZINHERIT);
1128
1129 return 0;
1130}
1131
1132static int
1133xfs_ioctl_setattr_check_projid(
1134 struct xfs_inode *ip,
1135 struct fsxattr *fa)
1136{
1137 /* Disallow 32bit project ids if projid32bit feature is not enabled. */
1138 if (fa->fsx_projid > (__uint16_t)-1 &&
1139 !xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb))
1140 return -EINVAL;
1141
1142 /*
1143 * Project Quota ID state is only allowed to change from within the init
1144 * namespace. Enforce that restriction only if we are trying to change
1145 * the quota ID state. Everything else is allowed in user namespaces.
1146 */
1147 if (current_user_ns() == &init_user_ns)
1148 return 0;
1149
1150 if (xfs_get_projid(ip) != fa->fsx_projid)
1151 return -EINVAL;
1152 if ((fa->fsx_xflags & XFS_XFLAG_PROJINHERIT) !=
1153 (ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT))
1154 return -EINVAL;
1155
1156 return 0;
1157}
1020 1158
1021STATIC int 1159STATIC int
1022xfs_ioctl_setattr( 1160xfs_ioctl_setattr(
1023 xfs_inode_t *ip, 1161 xfs_inode_t *ip,
1024 struct fsxattr *fa, 1162 struct fsxattr *fa)
1025 int mask)
1026{ 1163{
1027 struct xfs_mount *mp = ip->i_mount; 1164 struct xfs_mount *mp = ip->i_mount;
1028 struct xfs_trans *tp; 1165 struct xfs_trans *tp;
1029 unsigned int lock_flags = 0;
1030 struct xfs_dquot *udqp = NULL; 1166 struct xfs_dquot *udqp = NULL;
1031 struct xfs_dquot *pdqp = NULL; 1167 struct xfs_dquot *pdqp = NULL;
1032 struct xfs_dquot *olddquot = NULL; 1168 struct xfs_dquot *olddquot = NULL;
@@ -1034,17 +1170,9 @@ xfs_ioctl_setattr(
1034 1170
1035 trace_xfs_ioctl_setattr(ip); 1171 trace_xfs_ioctl_setattr(ip);
1036 1172
1037 if (mp->m_flags & XFS_MOUNT_RDONLY) 1173 code = xfs_ioctl_setattr_check_projid(ip, fa);
1038 return -EROFS; 1174 if (code)
1039 if (XFS_FORCED_SHUTDOWN(mp)) 1175 return code;
1040 return -EIO;
1041
1042 /*
1043 * Disallow 32bit project ids when projid32bit feature is not enabled.
1044 */
1045 if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1) &&
1046 !xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb))
1047 return -EINVAL;
1048 1176
1049 /* 1177 /*
1050 * If disk quotas is on, we make sure that the dquots do exist on disk, 1178 * If disk quotas is on, we make sure that the dquots do exist on disk,
@@ -1054,7 +1182,7 @@ xfs_ioctl_setattr(
1054 * If the IDs do change before we take the ilock, we're covered 1182 * If the IDs do change before we take the ilock, we're covered
1055 * because the i_*dquot fields will get updated anyway. 1183 * because the i_*dquot fields will get updated anyway.
1056 */ 1184 */
1057 if (XFS_IS_QUOTA_ON(mp) && (mask & FSX_PROJID)) { 1185 if (XFS_IS_QUOTA_ON(mp)) {
1058 code = xfs_qm_vop_dqalloc(ip, ip->i_d.di_uid, 1186 code = xfs_qm_vop_dqalloc(ip, ip->i_d.di_uid,
1059 ip->i_d.di_gid, fa->fsx_projid, 1187 ip->i_d.di_gid, fa->fsx_projid,
1060 XFS_QMOPT_PQUOTA, &udqp, NULL, &pdqp); 1188 XFS_QMOPT_PQUOTA, &udqp, NULL, &pdqp);
@@ -1062,175 +1190,49 @@ xfs_ioctl_setattr(
1062 return code; 1190 return code;
1063 } 1191 }
1064 1192
1065 /* 1193 tp = xfs_ioctl_setattr_get_trans(ip);
1066 * For the other attributes, we acquire the inode lock and 1194 if (IS_ERR(tp)) {
1067 * first do an error checking pass. 1195 code = PTR_ERR(tp);
1068 */ 1196 goto error_free_dquots;
1069 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
1070 code = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
1071 if (code)
1072 goto error_return;
1073
1074 lock_flags = XFS_ILOCK_EXCL;
1075 xfs_ilock(ip, lock_flags);
1076
1077 /*
1078 * CAP_FOWNER overrides the following restrictions:
1079 *
1080 * The user ID of the calling process must be equal
1081 * to the file owner ID, except in cases where the
1082 * CAP_FSETID capability is applicable.
1083 */
1084 if (!inode_owner_or_capable(VFS_I(ip))) {
1085 code = -EPERM;
1086 goto error_return;
1087 }
1088
1089 /*
1090 * Do a quota reservation only if projid is actually going to change.
1091 * Only allow changing of projid from init_user_ns since it is a
1092 * non user namespace aware identifier.
1093 */
1094 if (mask & FSX_PROJID) {
1095 if (current_user_ns() != &init_user_ns) {
1096 code = -EINVAL;
1097 goto error_return;
1098 }
1099
1100 if (XFS_IS_QUOTA_RUNNING(mp) &&
1101 XFS_IS_PQUOTA_ON(mp) &&
1102 xfs_get_projid(ip) != fa->fsx_projid) {
1103 ASSERT(tp);
1104 code = xfs_qm_vop_chown_reserve(tp, ip, udqp, NULL,
1105 pdqp, capable(CAP_FOWNER) ?
1106 XFS_QMOPT_FORCE_RES : 0);
1107 if (code) /* out of quota */
1108 goto error_return;
1109 }
1110 } 1197 }
1111 1198
1112 if (mask & FSX_EXTSIZE) {
1113 /*
1114 * Can't change extent size if any extents are allocated.
1115 */
1116 if (ip->i_d.di_nextents &&
1117 ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) !=
1118 fa->fsx_extsize)) {
1119 code = -EINVAL; /* EFBIG? */
1120 goto error_return;
1121 }
1122 1199
1123 /* 1200 if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp) &&
1124 * Extent size must be a multiple of the appropriate block 1201 xfs_get_projid(ip) != fa->fsx_projid) {
1125 * size, if set at all. It must also be smaller than the 1202 code = xfs_qm_vop_chown_reserve(tp, ip, udqp, NULL, pdqp,
1126 * maximum extent size supported by the filesystem. 1203 capable(CAP_FOWNER) ? XFS_QMOPT_FORCE_RES : 0);
1127 * 1204 if (code) /* out of quota */
1128 * Also, for non-realtime files, limit the extent size hint to 1205 goto error_trans_cancel;
1129 * half the size of the AGs in the filesystem so alignment
1130 * doesn't result in extents larger than an AG.
1131 */
1132 if (fa->fsx_extsize != 0) {
1133 xfs_extlen_t size;
1134 xfs_fsblock_t extsize_fsb;
1135
1136 extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize);
1137 if (extsize_fsb > MAXEXTLEN) {
1138 code = -EINVAL;
1139 goto error_return;
1140 }
1141
1142 if (XFS_IS_REALTIME_INODE(ip) ||
1143 ((mask & FSX_XFLAGS) &&
1144 (fa->fsx_xflags & XFS_XFLAG_REALTIME))) {
1145 size = mp->m_sb.sb_rextsize <<
1146 mp->m_sb.sb_blocklog;
1147 } else {
1148 size = mp->m_sb.sb_blocksize;
1149 if (extsize_fsb > mp->m_sb.sb_agblocks / 2) {
1150 code = -EINVAL;
1151 goto error_return;
1152 }
1153 }
1154
1155 if (fa->fsx_extsize % size) {
1156 code = -EINVAL;
1157 goto error_return;
1158 }
1159 }
1160 } 1206 }
1161 1207
1208 code = xfs_ioctl_setattr_check_extsize(ip, fa);
1209 if (code)
1210 goto error_trans_cancel;
1162 1211
1163 if (mask & FSX_XFLAGS) { 1212 code = xfs_ioctl_setattr_xflags(tp, ip, fa);
1164 /* 1213 if (code)
1165 * Can't change realtime flag if any extents are allocated. 1214 goto error_trans_cancel;
1166 */
1167 if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
1168 (XFS_IS_REALTIME_INODE(ip)) !=
1169 (fa->fsx_xflags & XFS_XFLAG_REALTIME)) {
1170 code = -EINVAL; /* EFBIG? */
1171 goto error_return;
1172 }
1173
1174 /*
1175 * If realtime flag is set then must have realtime data.
1176 */
1177 if ((fa->fsx_xflags & XFS_XFLAG_REALTIME)) {
1178 if ((mp->m_sb.sb_rblocks == 0) ||
1179 (mp->m_sb.sb_rextsize == 0) ||
1180 (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) {
1181 code = -EINVAL;
1182 goto error_return;
1183 }
1184 }
1185
1186 /*
1187 * Can't modify an immutable/append-only file unless
1188 * we have appropriate permission.
1189 */
1190 if ((ip->i_d.di_flags &
1191 (XFS_DIFLAG_IMMUTABLE|XFS_DIFLAG_APPEND) ||
1192 (fa->fsx_xflags &
1193 (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) &&
1194 !capable(CAP_LINUX_IMMUTABLE)) {
1195 code = -EPERM;
1196 goto error_return;
1197 }
1198 }
1199
1200 xfs_trans_ijoin(tp, ip, 0);
1201 1215
1202 /* 1216 /*
1203 * Change file ownership. Must be the owner or privileged. 1217 * Change file ownership. Must be the owner or privileged. CAP_FSETID
1218 * overrides the following restrictions:
1219 *
1220 * The set-user-ID and set-group-ID bits of a file will be cleared upon
1221 * successful return from chown()
1204 */ 1222 */
1205 if (mask & FSX_PROJID) {
1206 /*
1207 * CAP_FSETID overrides the following restrictions:
1208 *
1209 * The set-user-ID and set-group-ID bits of a file will be
1210 * cleared upon successful return from chown()
1211 */
1212 if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
1213 !capable_wrt_inode_uidgid(VFS_I(ip), CAP_FSETID))
1214 ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
1215
1216 /*
1217 * Change the ownerships and register quota modifications
1218 * in the transaction.
1219 */
1220 if (xfs_get_projid(ip) != fa->fsx_projid) {
1221 if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) {
1222 olddquot = xfs_qm_vop_chown(tp, ip,
1223 &ip->i_pdquot, pdqp);
1224 }
1225 ASSERT(ip->i_d.di_version > 1);
1226 xfs_set_projid(ip, fa->fsx_projid);
1227 }
1228 1223
1229 } 1224 if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
1225 !capable_wrt_inode_uidgid(VFS_I(ip), CAP_FSETID))
1226 ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
1230 1227
1231 if (mask & FSX_XFLAGS) { 1228 /* Change the ownerships and register project quota modifications */
1232 xfs_set_diflags(ip, fa->fsx_xflags); 1229 if (xfs_get_projid(ip) != fa->fsx_projid) {
1233 xfs_diflags_to_linux(ip); 1230 if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) {
1231 olddquot = xfs_qm_vop_chown(tp, ip,
1232 &ip->i_pdquot, pdqp);
1233 }
1234 ASSERT(ip->i_d.di_version > 1);
1235 xfs_set_projid(ip, fa->fsx_projid);
1234 } 1236 }
1235 1237
1236 /* 1238 /*
@@ -1238,34 +1240,12 @@ xfs_ioctl_setattr(
1238 * extent size hint should be set on the inode. If no extent size flags 1240 * extent size hint should be set on the inode. If no extent size flags
1239 * are set on the inode then unconditionally clear the extent size hint. 1241 * are set on the inode then unconditionally clear the extent size hint.
1240 */ 1242 */
1241 if (mask & FSX_EXTSIZE) { 1243 if (ip->i_d.di_flags & (XFS_DIFLAG_EXTSIZE | XFS_DIFLAG_EXTSZINHERIT))
1242 int extsize = 0; 1244 ip->i_d.di_extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog;
1243 1245 else
1244 if (ip->i_d.di_flags & 1246 ip->i_d.di_extsize = 0;
1245 (XFS_DIFLAG_EXTSIZE | XFS_DIFLAG_EXTSZINHERIT))
1246 extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog;
1247 ip->i_d.di_extsize = extsize;
1248 }
1249
1250 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
1251 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1252
1253 XFS_STATS_INC(xs_ig_attrchg);
1254 1247
1255 /*
1256 * If this is a synchronous mount, make sure that the
1257 * transaction goes to disk before returning to the user.
1258 * This is slightly sub-optimal in that truncates require
1259 * two sync transactions instead of one for wsync filesystems.
1260 * One for the truncate and one for the timestamps since we
1261 * don't want to change the timestamps unless we're sure the
1262 * truncate worked. Truncates are less than 1% of the laddis
1263 * mix so this probably isn't worth the trouble to optimize.
1264 */
1265 if (mp->m_flags & XFS_MOUNT_WSYNC)
1266 xfs_trans_set_sync(tp);
1267 code = xfs_trans_commit(tp, 0); 1248 code = xfs_trans_commit(tp, 0);
1268 xfs_iunlock(ip, lock_flags);
1269 1249
1270 /* 1250 /*
1271 * Release any dquot(s) the inode had kept before chown. 1251 * Release any dquot(s) the inode had kept before chown.
@@ -1276,12 +1256,11 @@ xfs_ioctl_setattr(
1276 1256
1277 return code; 1257 return code;
1278 1258
1279 error_return: 1259error_trans_cancel:
1260 xfs_trans_cancel(tp, 0);
1261error_free_dquots:
1280 xfs_qm_dqrele(udqp); 1262 xfs_qm_dqrele(udqp);
1281 xfs_qm_dqrele(pdqp); 1263 xfs_qm_dqrele(pdqp);
1282 xfs_trans_cancel(tp, 0);
1283 if (lock_flags)
1284 xfs_iunlock(ip, lock_flags);
1285 return code; 1264 return code;
1286} 1265}
1287 1266
@@ -1292,20 +1271,15 @@ xfs_ioc_fssetxattr(
1292 void __user *arg) 1271 void __user *arg)
1293{ 1272{
1294 struct fsxattr fa; 1273 struct fsxattr fa;
1295 unsigned int mask;
1296 int error; 1274 int error;
1297 1275
1298 if (copy_from_user(&fa, arg, sizeof(fa))) 1276 if (copy_from_user(&fa, arg, sizeof(fa)))
1299 return -EFAULT; 1277 return -EFAULT;
1300 1278
1301 mask = FSX_XFLAGS | FSX_EXTSIZE | FSX_PROJID;
1302 if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
1303 mask |= FSX_NONBLOCK;
1304
1305 error = mnt_want_write_file(filp); 1279 error = mnt_want_write_file(filp);
1306 if (error) 1280 if (error)
1307 return error; 1281 return error;
1308 error = xfs_ioctl_setattr(ip, &fa, mask); 1282 error = xfs_ioctl_setattr(ip, &fa);
1309 mnt_drop_write_file(filp); 1283 mnt_drop_write_file(filp);
1310 return error; 1284 return error;
1311} 1285}
@@ -1325,14 +1299,14 @@ xfs_ioc_getxflags(
1325 1299
1326STATIC int 1300STATIC int
1327xfs_ioc_setxflags( 1301xfs_ioc_setxflags(
1328 xfs_inode_t *ip, 1302 struct xfs_inode *ip,
1329 struct file *filp, 1303 struct file *filp,
1330 void __user *arg) 1304 void __user *arg)
1331{ 1305{
1306 struct xfs_trans *tp;
1332 struct fsxattr fa; 1307 struct fsxattr fa;
1333 unsigned int flags; 1308 unsigned int flags;
1334 unsigned int mask; 1309 int error;
1335 int error;
1336 1310
1337 if (copy_from_user(&flags, arg, sizeof(flags))) 1311 if (copy_from_user(&flags, arg, sizeof(flags)))
1338 return -EFAULT; 1312 return -EFAULT;
@@ -1342,15 +1316,26 @@ xfs_ioc_setxflags(
1342 FS_SYNC_FL)) 1316 FS_SYNC_FL))
1343 return -EOPNOTSUPP; 1317 return -EOPNOTSUPP;
1344 1318
1345 mask = FSX_XFLAGS;
1346 if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
1347 mask |= FSX_NONBLOCK;
1348 fa.fsx_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip)); 1319 fa.fsx_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip));
1349 1320
1350 error = mnt_want_write_file(filp); 1321 error = mnt_want_write_file(filp);
1351 if (error) 1322 if (error)
1352 return error; 1323 return error;
1353 error = xfs_ioctl_setattr(ip, &fa, mask); 1324
1325 tp = xfs_ioctl_setattr_get_trans(ip);
1326 if (IS_ERR(tp)) {
1327 error = PTR_ERR(tp);
1328 goto out_drop_write;
1329 }
1330
1331 error = xfs_ioctl_setattr_xflags(tp, ip, &fa);
1332 if (error) {
1333 xfs_trans_cancel(tp, 0);
1334 goto out_drop_write;
1335 }
1336
1337 error = xfs_trans_commit(tp, 0);
1338out_drop_write:
1354 mnt_drop_write_file(filp); 1339 mnt_drop_write_file(filp);
1355 return error; 1340 return error;
1356} 1341}
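
The refactored ioctl code above has xfs_ioctl_setattr_get_trans() return either a locked, reserved transaction or an errno encoded in the pointer via ERR_PTR(). A self-contained userspace re-implementation of that convention, for illustration only (the real helpers live in the kernel's err.h):

/* Sketch: the ERR_PTR/IS_ERR convention used by
 * xfs_ioctl_setattr_get_trans() to return either an object or an
 * errno through one pointer. Userspace model of the kernel helpers. */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	/* the top MAX_ERRNO addresses are reserved for error codes */
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

struct trans { int id; };

static struct trans *get_trans(int fail)
{
	if (fail)
		return ERR_PTR(-EROFS);	/* errno travels in the pointer */

	struct trans *tp = malloc(sizeof(*tp));
	if (!tp)
		return ERR_PTR(-ENOMEM);
	tp->id = 1;
	return tp;
}

int main(void)
{
	struct trans *tp = get_trans(1);

	if (IS_ERR(tp)) {
		printf("get_trans failed: %ld\n", PTR_ERR(tp));
		return 1;
	}
	free(tp);
	return 0;
}
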
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index ec6772866f3d..bfc7c7c8a0c8 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -423,7 +423,7 @@ xfs_compat_attrmulti_by_handle(
423 423
424 ops = memdup_user(compat_ptr(am_hreq.ops), size); 424 ops = memdup_user(compat_ptr(am_hreq.ops), size);
425 if (IS_ERR(ops)) { 425 if (IS_ERR(ops)) {
426 error = -PTR_ERR(ops); 426 error = PTR_ERR(ops);
427 goto out_dput; 427 goto out_dput;
428 } 428 }
429 429
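
The one-character fix above drops a double negation: memdup_user() already returns a negative errno through PTR_ERR(), so negating it produced a positive value that downstream "error < 0" checks missed. A standalone model of the bug:

/* Sketch of the sign bug fixed above. PTR_ERR() yields a negative
 * errno, so negating it flips the sign and breaks error checks. */
#include <errno.h>
#include <stdio.h>

static inline long PTR_ERR(const void *ptr) { return (long)ptr; }

int main(void)
{
	void *ops = (void *)(long)-EFAULT;	/* a memdup_user() failure */

	long buggy = -PTR_ERR(ops);	/* +14: wrong sign for an errno */
	long fixed = PTR_ERR(ops);	/* -14: a proper negative errno */

	printf("buggy=%ld fixed=%ld\n", buggy, fixed);
	return 0;
}
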
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index c980e2a5086b..ccb1dd0d509e 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -802,7 +802,7 @@ int
802xfs_iomap_write_unwritten( 802xfs_iomap_write_unwritten(
803 xfs_inode_t *ip, 803 xfs_inode_t *ip,
804 xfs_off_t offset, 804 xfs_off_t offset,
805 size_t count) 805 xfs_off_t count)
806{ 806{
807 xfs_mount_t *mp = ip->i_mount; 807 xfs_mount_t *mp = ip->i_mount;
808 xfs_fileoff_t offset_fsb; 808 xfs_fileoff_t offset_fsb;
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 411fbb8919ef..8688e663d744 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -27,6 +27,6 @@ int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t,
27 struct xfs_bmbt_irec *); 27 struct xfs_bmbt_irec *);
28int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, 28int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t,
29 struct xfs_bmbt_irec *); 29 struct xfs_bmbt_irec *);
30int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t); 30int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t);
31 31
32#endif /* __XFS_IOMAP_H__*/ 32#endif /* __XFS_IOMAP_H__*/
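
The count argument of xfs_iomap_write_unwritten() widens from size_t to xfs_off_t because size_t is 32 bits on 32-bit builds, so an unwritten-extent conversion spanning more than 4 GiB would silently truncate. A small demonstration, modelling a 32-bit size_t with uint32_t:

/* Sketch of the truncation the prototype change avoids. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int64_t count = 5LL << 30;		/* a 5 GiB range */
	uint32_t as_size_t32 = (uint32_t)count;	/* 32-bit size_t */
	int64_t as_off_t = count;		/* 64-bit xfs_off_t */

	printf("requested:        %" PRId64 " bytes\n", count);
	printf("as 32-bit size_t: %" PRIu32 " bytes (truncated)\n",
	       as_size_t32);
	printf("as xfs_off_t:     %" PRId64 " bytes (intact)\n", as_off_t);
	return 0;
}
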
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index c50311cae1b1..ce80eeb8faa4 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -380,18 +380,27 @@ xfs_vn_rename(
380 struct inode *odir, 380 struct inode *odir,
381 struct dentry *odentry, 381 struct dentry *odentry,
382 struct inode *ndir, 382 struct inode *ndir,
383 struct dentry *ndentry) 383 struct dentry *ndentry,
384 unsigned int flags)
384{ 385{
385 struct inode *new_inode = ndentry->d_inode; 386 struct inode *new_inode = ndentry->d_inode;
387 int omode = 0;
386 struct xfs_name oname; 388 struct xfs_name oname;
387 struct xfs_name nname; 389 struct xfs_name nname;
388 390
389 xfs_dentry_to_name(&oname, odentry, 0); 391 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
392 return -EINVAL;
393
394 /* if we are exchanging files, we need to set i_mode of both files */
395 if (flags & RENAME_EXCHANGE)
396 omode = ndentry->d_inode->i_mode;
397
398 xfs_dentry_to_name(&oname, odentry, omode);
390 xfs_dentry_to_name(&nname, ndentry, odentry->d_inode->i_mode); 399 xfs_dentry_to_name(&nname, ndentry, odentry->d_inode->i_mode);
391 400
392 return xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode), 401 return xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode),
393 XFS_I(ndir), &nname, new_inode ? 402 XFS_I(ndir), &nname,
394 XFS_I(new_inode) : NULL); 403 new_inode ? XFS_I(new_inode) : NULL, flags);
395} 404}
396 405
397/* 406/*
@@ -1144,7 +1153,7 @@ static const struct inode_operations xfs_dir_inode_operations = {
1144 */ 1153 */
1145 .rmdir = xfs_vn_unlink, 1154 .rmdir = xfs_vn_unlink,
1146 .mknod = xfs_vn_mknod, 1155 .mknod = xfs_vn_mknod,
1147 .rename = xfs_vn_rename, 1156 .rename2 = xfs_vn_rename,
1148 .get_acl = xfs_get_acl, 1157 .get_acl = xfs_get_acl,
1149 .set_acl = xfs_set_acl, 1158 .set_acl = xfs_set_acl,
1150 .getattr = xfs_vn_getattr, 1159 .getattr = xfs_vn_getattr,
@@ -1172,7 +1181,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = {
1172 */ 1181 */
1173 .rmdir = xfs_vn_unlink, 1182 .rmdir = xfs_vn_unlink,
1174 .mknod = xfs_vn_mknod, 1183 .mknod = xfs_vn_mknod,
1175 .rename = xfs_vn_rename, 1184 .rename2 = xfs_vn_rename,
1176 .get_acl = xfs_get_acl, 1185 .get_acl = xfs_get_acl,
1177 .set_acl = xfs_set_acl, 1186 .set_acl = xfs_set_acl,
1178 .getattr = xfs_vn_getattr, 1187 .getattr = xfs_vn_getattr,
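
xfs_vn_rename() now validates its flags argument up front, rejecting any bit outside RENAME_NOREPLACE | RENAME_EXCHANGE so that flags added by future kernels fail with -EINVAL instead of being silently ignored. A standalone model of that guard; the flag values match the uapi definitions:

/* Sketch: the reject-unknown-flags idiom adopted above. */
#include <errno.h>
#include <stdio.h>

#define RENAME_NOREPLACE (1 << 0)
#define RENAME_EXCHANGE  (1 << 1)
#define RENAME_WHITEOUT  (1 << 2)	/* not supported in this sketch */

static int do_rename(unsigned int flags)
{
	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
		return -EINVAL;
	return 0;
}

int main(void)
{
	printf("EXCHANGE: %d\n", do_rename(RENAME_EXCHANGE));	/* 0 */
	printf("WHITEOUT: %d\n", do_rename(RENAME_WHITEOUT));	/* -22 */
	return 0;
}
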
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index e408bf5a3ff7..bcc7cfabb787 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -33,6 +33,7 @@
33#include "xfs_fsops.h" 33#include "xfs_fsops.h"
34#include "xfs_cksum.h" 34#include "xfs_cksum.h"
35#include "xfs_sysfs.h" 35#include "xfs_sysfs.h"
36#include "xfs_sb.h"
36 37
37kmem_zone_t *xfs_log_ticket_zone; 38kmem_zone_t *xfs_log_ticket_zone;
38 39
@@ -1290,9 +1291,20 @@ xfs_log_worker(
1290 struct xfs_mount *mp = log->l_mp; 1291 struct xfs_mount *mp = log->l_mp;
1291 1292
1292 /* dgc: errors ignored - not fatal and nowhere to report them */ 1293 /* dgc: errors ignored - not fatal and nowhere to report them */
1293 if (xfs_log_need_covered(mp)) 1294 if (xfs_log_need_covered(mp)) {
1294 xfs_fs_log_dummy(mp); 1295 /*
1295 else 1296 * Dump a transaction into the log that contains no real change.
1297 * This is needed to stamp the current tail LSN into the log
1298 * during the covering operation.
1299 *
1300 * We cannot use an inode here for this - that will push dirty
1301 * state back up into the VFS and then periodic inode flushing
1302 * will prevent log covering from making progress. Hence we
1303 * synchronously log the superblock instead to ensure the
1304 * superblock is immediately unpinned and can be written back.
1305 */
1306 xfs_sync_sb(mp, true);
1307 } else
1296 xfs_log_force(mp, 0); 1308 xfs_log_force(mp, 0);
1297 1309
1298 /* start pushing all the metadata that is currently dirty */ 1310 /* start pushing all the metadata that is currently dirty */
@@ -1395,6 +1407,8 @@ xlog_alloc_log(
1395 ASSERT(xfs_buf_islocked(bp)); 1407 ASSERT(xfs_buf_islocked(bp));
1396 xfs_buf_unlock(bp); 1408 xfs_buf_unlock(bp);
1397 1409
1410 /* use high priority wq for log I/O completion */
1411 bp->b_ioend_wq = mp->m_log_workqueue;
1398 bp->b_iodone = xlog_iodone; 1412 bp->b_iodone = xlog_iodone;
1399 log->l_xbuf = bp; 1413 log->l_xbuf = bp;
1400 1414
@@ -1427,6 +1441,8 @@ xlog_alloc_log(
1427 ASSERT(xfs_buf_islocked(bp)); 1441 ASSERT(xfs_buf_islocked(bp));
1428 xfs_buf_unlock(bp); 1442 xfs_buf_unlock(bp);
1429 1443
1444 /* use high priority wq for log I/O completion */
1445 bp->b_ioend_wq = mp->m_log_workqueue;
1430 bp->b_iodone = xlog_iodone; 1446 bp->b_iodone = xlog_iodone;
1431 iclog->ic_bp = bp; 1447 iclog->ic_bp = bp;
1432 iclog->ic_data = bp->b_addr; 1448 iclog->ic_data = bp->b_addr;
@@ -1806,8 +1822,6 @@ xlog_sync(
1806 XFS_BUF_ZEROFLAGS(bp); 1822 XFS_BUF_ZEROFLAGS(bp);
1807 XFS_BUF_ASYNC(bp); 1823 XFS_BUF_ASYNC(bp);
1808 bp->b_flags |= XBF_SYNCIO; 1824 bp->b_flags |= XBF_SYNCIO;
1809 /* use high priority completion wq */
1810 bp->b_ioend_wq = log->l_mp->m_log_workqueue;
1811 1825
1812 if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) { 1826 if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) {
1813 bp->b_flags |= XBF_FUA; 1827 bp->b_flags |= XBF_FUA;
@@ -1856,8 +1870,6 @@ xlog_sync(
1856 bp->b_flags |= XBF_SYNCIO; 1870 bp->b_flags |= XBF_SYNCIO;
1857 if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) 1871 if (log->l_mp->m_flags & XFS_MOUNT_BARRIER)
1858 bp->b_flags |= XBF_FUA; 1872 bp->b_flags |= XBF_FUA;
1859 /* use high priority completion wq */
1860 bp->b_ioend_wq = log->l_mp->m_log_workqueue;
1861 1873
1862 ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); 1874 ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
1863 ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); 1875 ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
@@ -2027,7 +2039,7 @@ xlog_print_tic_res(
2027 " total reg = %u bytes (o/flow = %u bytes)\n" 2039 " total reg = %u bytes (o/flow = %u bytes)\n"
2028 " ophdrs = %u (ophdr space = %u bytes)\n" 2040 " ophdrs = %u (ophdr space = %u bytes)\n"
2029 " ophdr + reg = %u bytes\n" 2041 " ophdr + reg = %u bytes\n"
2030 " num regions = %u\n", 2042 " num regions = %u",
2031 ((ticket->t_trans_type <= 0 || 2043 ((ticket->t_trans_type <= 0 ||
2032 ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ? 2044 ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ?
2033 "bad-trans-type" : trans_type_str[ticket->t_trans_type-1]), 2045 "bad-trans-type" : trans_type_str[ticket->t_trans_type-1]),
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index d3d38836f87f..4fa80e63eea2 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -408,11 +408,11 @@ xfs_update_alignment(xfs_mount_t *mp)
408 if (xfs_sb_version_hasdalign(sbp)) { 408 if (xfs_sb_version_hasdalign(sbp)) {
409 if (sbp->sb_unit != mp->m_dalign) { 409 if (sbp->sb_unit != mp->m_dalign) {
410 sbp->sb_unit = mp->m_dalign; 410 sbp->sb_unit = mp->m_dalign;
411 mp->m_update_flags |= XFS_SB_UNIT; 411 mp->m_update_sb = true;
412 } 412 }
413 if (sbp->sb_width != mp->m_swidth) { 413 if (sbp->sb_width != mp->m_swidth) {
414 sbp->sb_width = mp->m_swidth; 414 sbp->sb_width = mp->m_swidth;
415 mp->m_update_flags |= XFS_SB_WIDTH; 415 mp->m_update_sb = true;
416 } 416 }
417 } else { 417 } else {
418 xfs_warn(mp, 418 xfs_warn(mp,
@@ -583,38 +583,19 @@ int
583xfs_mount_reset_sbqflags( 583xfs_mount_reset_sbqflags(
584 struct xfs_mount *mp) 584 struct xfs_mount *mp)
585{ 585{
586 int error;
587 struct xfs_trans *tp;
588
589 mp->m_qflags = 0; 586 mp->m_qflags = 0;
590 587
591 /* 588 /* It is OK to look at sb_qflags in the mount path without m_sb_lock. */
592 * It is OK to look at sb_qflags here in mount path,
593 * without m_sb_lock.
594 */
595 if (mp->m_sb.sb_qflags == 0) 589 if (mp->m_sb.sb_qflags == 0)
596 return 0; 590 return 0;
597 spin_lock(&mp->m_sb_lock); 591 spin_lock(&mp->m_sb_lock);
598 mp->m_sb.sb_qflags = 0; 592 mp->m_sb.sb_qflags = 0;
599 spin_unlock(&mp->m_sb_lock); 593 spin_unlock(&mp->m_sb_lock);
600 594
601 /* 595 if (!xfs_fs_writable(mp, SB_FREEZE_WRITE))
602 * If the fs is readonly, let the incore superblock run
603 * with quotas off but don't flush the update out to disk
604 */
605 if (mp->m_flags & XFS_MOUNT_RDONLY)
606 return 0; 596 return 0;
607 597
608 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); 598 return xfs_sync_sb(mp, false);
609 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_sbchange, 0, 0);
610 if (error) {
611 xfs_trans_cancel(tp, 0);
612 xfs_alert(mp, "%s: Superblock update failed!", __func__);
613 return error;
614 }
615
616 xfs_mod_sb(tp, XFS_SB_QFLAGS);
617 return xfs_trans_commit(tp, 0);
618} 599}
619 600
620__uint64_t 601__uint64_t
@@ -659,26 +640,25 @@ xfs_mountfs(
659 xfs_sb_mount_common(mp, sbp); 640 xfs_sb_mount_common(mp, sbp);
660 641
661 /* 642 /*
662 * Check for a mismatched features2 values. Older kernels 643 * Check for a mismatched features2 values. Older kernels read & wrote
663 * read & wrote into the wrong sb offset for sb_features2 644 * into the wrong sb offset for sb_features2 on some platforms due to
664 * on some platforms due to xfs_sb_t not being 64bit size aligned 645 * xfs_sb_t not being 64bit size aligned when sb_features2 was added,
665 * when sb_features2 was added, which made older superblock 646 * which made older superblock reading/writing routines swap it as a
666 * reading/writing routines swap it as a 64-bit value. 647 * 64-bit value.
667 * 648 *
668 * For backwards compatibility, we make both slots equal. 649 * For backwards compatibility, we make both slots equal.
669 * 650 *
670 * If we detect a mismatched field, we OR the set bits into the 651 * If we detect a mismatched field, we OR the set bits into the existing
671 * existing features2 field in case it has already been modified; we 652 * features2 field in case it has already been modified; we don't want
672 * don't want to lose any features. We then update the bad location 653 * to lose any features. We then update the bad location with the ORed
673 * with the ORed value so that older kernels will see any features2 654 * value so that older kernels will see any features2 flags. The
674 * flags, and mark the two fields as needing updates once the 655 * superblock writeback code ensures the new sb_features2 is copied to
675 * transaction subsystem is online. 656 * sb_bad_features2 before it is logged or written to disk.
676 */ 657 */
677 if (xfs_sb_has_mismatched_features2(sbp)) { 658 if (xfs_sb_has_mismatched_features2(sbp)) {
678 xfs_warn(mp, "correcting sb_features alignment problem"); 659 xfs_warn(mp, "correcting sb_features alignment problem");
679 sbp->sb_features2 |= sbp->sb_bad_features2; 660 sbp->sb_features2 |= sbp->sb_bad_features2;
680 sbp->sb_bad_features2 = sbp->sb_features2; 661 mp->m_update_sb = true;
681 mp->m_update_flags |= XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2;
682 662
683 /* 663 /*
684 * Re-check for ATTR2 in case it was found in bad_features2 664 * Re-check for ATTR2 in case it was found in bad_features2
@@ -692,17 +672,17 @@ xfs_mountfs(
692 if (xfs_sb_version_hasattr2(&mp->m_sb) && 672 if (xfs_sb_version_hasattr2(&mp->m_sb) &&
693 (mp->m_flags & XFS_MOUNT_NOATTR2)) { 673 (mp->m_flags & XFS_MOUNT_NOATTR2)) {
694 xfs_sb_version_removeattr2(&mp->m_sb); 674 xfs_sb_version_removeattr2(&mp->m_sb);
695 mp->m_update_flags |= XFS_SB_FEATURES2; 675 mp->m_update_sb = true;
696 676
697 /* update sb_versionnum for the clearing of the morebits */ 677 /* update sb_versionnum for the clearing of the morebits */
698 if (!sbp->sb_features2) 678 if (!sbp->sb_features2)
699 mp->m_update_flags |= XFS_SB_VERSIONNUM; 679 mp->m_update_sb = true;
700 } 680 }
701 681
702 /* always use v2 inodes by default now */ 682 /* always use v2 inodes by default now */
703 if (!(mp->m_sb.sb_versionnum & XFS_SB_VERSION_NLINKBIT)) { 683 if (!(mp->m_sb.sb_versionnum & XFS_SB_VERSION_NLINKBIT)) {
704 mp->m_sb.sb_versionnum |= XFS_SB_VERSION_NLINKBIT; 684 mp->m_sb.sb_versionnum |= XFS_SB_VERSION_NLINKBIT;
705 mp->m_update_flags |= XFS_SB_VERSIONNUM; 685 mp->m_update_sb = true;
706 } 686 }
707 687
708 /* 688 /*
@@ -895,8 +875,8 @@ xfs_mountfs(
895 * the next remount into writeable mode. Otherwise we would never 875 * the next remount into writeable mode. Otherwise we would never
896 * perform the update e.g. for the root filesystem. 876 * perform the update e.g. for the root filesystem.
897 */ 877 */
898 if (mp->m_update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY)) { 878 if (mp->m_update_sb && !(mp->m_flags & XFS_MOUNT_RDONLY)) {
899 error = xfs_mount_log_sb(mp, mp->m_update_flags); 879 error = xfs_sync_sb(mp, false);
900 if (error) { 880 if (error) {
901 xfs_warn(mp, "failed to write sb changes"); 881 xfs_warn(mp, "failed to write sb changes");
902 goto out_rtunmount; 882 goto out_rtunmount;
@@ -1103,9 +1083,6 @@ xfs_fs_writable(
1103int 1083int
1104xfs_log_sbcount(xfs_mount_t *mp) 1084xfs_log_sbcount(xfs_mount_t *mp)
1105{ 1085{
1106 xfs_trans_t *tp;
1107 int error;
1108
1109 /* allow this to proceed during the freeze sequence... */ 1086 /* allow this to proceed during the freeze sequence... */
1110 if (!xfs_fs_writable(mp, SB_FREEZE_COMPLETE)) 1087 if (!xfs_fs_writable(mp, SB_FREEZE_COMPLETE))
1111 return 0; 1088 return 0;
@@ -1119,17 +1096,7 @@ xfs_log_sbcount(xfs_mount_t *mp)
1119 if (!xfs_sb_version_haslazysbcount(&mp->m_sb)) 1096 if (!xfs_sb_version_haslazysbcount(&mp->m_sb))
1120 return 0; 1097 return 0;
1121 1098
1122 tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT, KM_SLEEP); 1099 return xfs_sync_sb(mp, true);
1123 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0);
1124 if (error) {
1125 xfs_trans_cancel(tp, 0);
1126 return error;
1127 }
1128
1129 xfs_mod_sb(tp, XFS_SB_IFREE | XFS_SB_ICOUNT | XFS_SB_FDBLOCKS);
1130 xfs_trans_set_sync(tp);
1131 error = xfs_trans_commit(tp, 0);
1132 return error;
1133} 1100}
1134 1101
1135/* 1102/*
@@ -1423,34 +1390,6 @@ xfs_freesb(
1423} 1390}
1424 1391
1425/* 1392/*
1426 * Used to log changes to the superblock unit and width fields which could
1427 * be altered by the mount options, as well as any potential sb_features2
1428 * fixup. Only the first superblock is updated.
1429 */
1430int
1431xfs_mount_log_sb(
1432 xfs_mount_t *mp,
1433 __int64_t fields)
1434{
1435 xfs_trans_t *tp;
1436 int error;
1437
1438 ASSERT(fields & (XFS_SB_UNIT | XFS_SB_WIDTH | XFS_SB_UUID |
1439 XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2 |
1440 XFS_SB_VERSIONNUM));
1441
1442 tp = xfs_trans_alloc(mp, XFS_TRANS_SB_UNIT);
1443 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0);
1444 if (error) {
1445 xfs_trans_cancel(tp, 0);
1446 return error;
1447 }
1448 xfs_mod_sb(tp, fields);
1449 error = xfs_trans_commit(tp, 0);
1450 return error;
1451}
1452
1453/*
1454 * If the underlying (data/log/rt) device is readonly, there are some 1393 * If the underlying (data/log/rt) device is readonly, there are some
1455 * operations that cannot proceed. 1394 * operations that cannot proceed.
1456 */ 1395 */
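
With the superblock now always logged whole via xfs_sync_sb(), the per-field dirty mask m_update_flags collapses into the single boolean m_update_sb throughout xfs_mount.c. A standalone model of the before/after bookkeeping, using hypothetical names:

/* Sketch: a single dirty bit replacing a per-field dirty mask,
 * as in the m_update_flags -> m_update_sb conversion above. */
#include <stdbool.h>
#include <stdio.h>

struct mount {
	bool update_sb;	/* was: a __int64_t bitmask of changed fields */
	int sb_unit, sb_width;
};

static void set_alignment(struct mount *mp, int unit, int width)
{
	if (mp->sb_unit != unit) {
		mp->sb_unit = unit;
		mp->update_sb = true;	/* was: |= XFS_SB_UNIT */
	}
	if (mp->sb_width != width) {
		mp->sb_width = width;
		mp->update_sb = true;	/* was: |= XFS_SB_WIDTH */
	}
}

static void sync_sb(struct mount *mp)
{
	/* always writes the whole superblock, so no mask is needed */
	printf("syncing sb: unit=%d width=%d\n", mp->sb_unit, mp->sb_width);
	mp->update_sb = false;
}

int main(void)
{
	struct mount mp = { .sb_unit = 0, .sb_width = 0 };

	set_alignment(&mp, 64, 256);
	if (mp.update_sb)
		sync_sb(&mp);
	return 0;
}
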
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 22ccf69d4d3c..a5b2ff822653 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -162,8 +162,7 @@ typedef struct xfs_mount {
162 struct delayed_work m_reclaim_work; /* background inode reclaim */ 162 struct delayed_work m_reclaim_work; /* background inode reclaim */
163 struct delayed_work m_eofblocks_work; /* background eof blocks 163 struct delayed_work m_eofblocks_work; /* background eof blocks
164 trimming */ 164 trimming */
165 __int64_t m_update_flags; /* sb flags we need to update 165 bool m_update_sb; /* sb needs update in mount */
166 on the next remount,rw */
167 int64_t m_low_space[XFS_LOWSP_MAX]; 166 int64_t m_low_space[XFS_LOWSP_MAX];
168 /* low free space thresholds */ 167 /* low free space thresholds */
169 struct xfs_kobj m_kobj; 168 struct xfs_kobj m_kobj;
@@ -378,7 +377,7 @@ extern void xfs_unmountfs(xfs_mount_t *);
378extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int); 377extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int);
379extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *, 378extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *,
380 uint, int); 379 uint, int);
381extern int xfs_mount_log_sb(xfs_mount_t *, __int64_t); 380extern int xfs_mount_log_sb(xfs_mount_t *);
382extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int); 381extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int);
383extern int xfs_readsb(xfs_mount_t *, int); 382extern int xfs_readsb(xfs_mount_t *, int);
384extern void xfs_freesb(xfs_mount_t *); 383extern void xfs_freesb(xfs_mount_t *);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 79fb19dd9c83..53cc2aaf8d2b 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -430,6 +430,7 @@ struct xfs_qm_isolate {
430static enum lru_status 430static enum lru_status
431xfs_qm_dquot_isolate( 431xfs_qm_dquot_isolate(
432 struct list_head *item, 432 struct list_head *item,
433 struct list_lru_one *lru,
433 spinlock_t *lru_lock, 434 spinlock_t *lru_lock,
434 void *arg) 435 void *arg)
435 __releases(lru_lock) __acquires(lru_lock) 436 __releases(lru_lock) __acquires(lru_lock)
@@ -450,7 +451,7 @@ xfs_qm_dquot_isolate(
450 XFS_STATS_INC(xs_qm_dqwants); 451 XFS_STATS_INC(xs_qm_dqwants);
451 452
452 trace_xfs_dqreclaim_want(dqp); 453 trace_xfs_dqreclaim_want(dqp);
453 list_del_init(&dqp->q_lru); 454 list_lru_isolate(lru, &dqp->q_lru);
454 XFS_STATS_DEC(xs_qm_dquot_unused); 455 XFS_STATS_DEC(xs_qm_dquot_unused);
455 return LRU_REMOVED; 456 return LRU_REMOVED;
456 } 457 }
@@ -494,7 +495,7 @@ xfs_qm_dquot_isolate(
494 xfs_dqunlock(dqp); 495 xfs_dqunlock(dqp);
495 496
496 ASSERT(dqp->q_nrefs == 0); 497 ASSERT(dqp->q_nrefs == 0);
497 list_move_tail(&dqp->q_lru, &isol->dispose); 498 list_lru_isolate_move(lru, &dqp->q_lru, &isol->dispose);
498 XFS_STATS_DEC(xs_qm_dquot_unused); 499 XFS_STATS_DEC(xs_qm_dquot_unused);
499 trace_xfs_dqreclaim_done(dqp); 500 trace_xfs_dqreclaim_done(dqp);
500 XFS_STATS_INC(xs_qm_dqreclaims); 501 XFS_STATS_INC(xs_qm_dqreclaims);
@@ -523,7 +524,6 @@ xfs_qm_shrink_scan(
523 struct xfs_qm_isolate isol; 524 struct xfs_qm_isolate isol;
524 unsigned long freed; 525 unsigned long freed;
525 int error; 526 int error;
526 unsigned long nr_to_scan = sc->nr_to_scan;
527 527
528 if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT)) 528 if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT))
529 return 0; 529 return 0;
@@ -531,8 +531,8 @@ xfs_qm_shrink_scan(
531 INIT_LIST_HEAD(&isol.buffers); 531 INIT_LIST_HEAD(&isol.buffers);
532 INIT_LIST_HEAD(&isol.dispose); 532 INIT_LIST_HEAD(&isol.dispose);
533 533
534 freed = list_lru_walk_node(&qi->qi_lru, sc->nid, xfs_qm_dquot_isolate, &isol, 534 freed = list_lru_shrink_walk(&qi->qi_lru, sc,
535 &nr_to_scan); 535 xfs_qm_dquot_isolate, &isol);
536 536
537 error = xfs_buf_delwri_submit(&isol.buffers); 537 error = xfs_buf_delwri_submit(&isol.buffers);
538 if (error) 538 if (error)
@@ -557,7 +557,7 @@ xfs_qm_shrink_count(
557 struct xfs_quotainfo *qi = container_of(shrink, 557 struct xfs_quotainfo *qi = container_of(shrink,
558 struct xfs_quotainfo, qi_shrinker); 558 struct xfs_quotainfo, qi_shrinker);
559 559
560 return list_lru_count_node(&qi->qi_lru, sc->nid); 560 return list_lru_shrink_count(&qi->qi_lru, sc);
561} 561}
562 562
563/* 563/*
@@ -714,7 +714,6 @@ STATIC int
714xfs_qm_qino_alloc( 714xfs_qm_qino_alloc(
715 xfs_mount_t *mp, 715 xfs_mount_t *mp,
716 xfs_inode_t **ip, 716 xfs_inode_t **ip,
717 __int64_t sbfields,
718 uint flags) 717 uint flags)
719{ 718{
720 xfs_trans_t *tp; 719 xfs_trans_t *tp;
@@ -777,11 +776,6 @@ xfs_qm_qino_alloc(
777 spin_lock(&mp->m_sb_lock); 776 spin_lock(&mp->m_sb_lock);
778 if (flags & XFS_QMOPT_SBVERSION) { 777 if (flags & XFS_QMOPT_SBVERSION) {
779 ASSERT(!xfs_sb_version_hasquota(&mp->m_sb)); 778 ASSERT(!xfs_sb_version_hasquota(&mp->m_sb));
780 ASSERT((sbfields & (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
781 XFS_SB_GQUOTINO | XFS_SB_PQUOTINO | XFS_SB_QFLAGS)) ==
782 (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
783 XFS_SB_GQUOTINO | XFS_SB_PQUOTINO |
784 XFS_SB_QFLAGS));
785 779
786 xfs_sb_version_addquota(&mp->m_sb); 780 xfs_sb_version_addquota(&mp->m_sb);
787 mp->m_sb.sb_uquotino = NULLFSINO; 781 mp->m_sb.sb_uquotino = NULLFSINO;
@@ -798,7 +792,7 @@ xfs_qm_qino_alloc(
798 else 792 else
799 mp->m_sb.sb_pquotino = (*ip)->i_ino; 793 mp->m_sb.sb_pquotino = (*ip)->i_ino;
800 spin_unlock(&mp->m_sb_lock); 794 spin_unlock(&mp->m_sb_lock);
801 xfs_mod_sb(tp, sbfields); 795 xfs_log_sb(tp);
802 796
803 if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) { 797 if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) {
804 xfs_alert(mp, "%s failed (error %d)!", __func__, error); 798 xfs_alert(mp, "%s failed (error %d)!", __func__, error);
@@ -1451,7 +1445,7 @@ xfs_qm_mount_quotas(
1451 spin_unlock(&mp->m_sb_lock); 1445 spin_unlock(&mp->m_sb_lock);
1452 1446
1453 if (sbf != (mp->m_qflags & XFS_MOUNT_QUOTA_ALL)) { 1447 if (sbf != (mp->m_qflags & XFS_MOUNT_QUOTA_ALL)) {
1454 if (xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS)) { 1448 if (xfs_sync_sb(mp, false)) {
1455 /* 1449 /*
1456 * We could only have been turning quotas off. 1450 * We could only have been turning quotas off.
1457 * We aren't in very good shape actually because 1451 * We aren't in very good shape actually because
@@ -1482,7 +1476,6 @@ xfs_qm_init_quotainos(
1482 struct xfs_inode *gip = NULL; 1476 struct xfs_inode *gip = NULL;
1483 struct xfs_inode *pip = NULL; 1477 struct xfs_inode *pip = NULL;
1484 int error; 1478 int error;
1485 __int64_t sbflags = 0;
1486 uint flags = 0; 1479 uint flags = 0;
1487 1480
1488 ASSERT(mp->m_quotainfo); 1481 ASSERT(mp->m_quotainfo);
@@ -1517,9 +1510,6 @@ xfs_qm_init_quotainos(
 		}
 	} else {
 		flags |= XFS_QMOPT_SBVERSION;
-		sbflags |= (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
-			    XFS_SB_GQUOTINO | XFS_SB_PQUOTINO |
-			    XFS_SB_QFLAGS);
 	}
 
 	/*
@@ -1530,7 +1520,6 @@ xfs_qm_init_quotainos(
 	 */
 	if (XFS_IS_UQUOTA_ON(mp) && uip == NULL) {
 		error = xfs_qm_qino_alloc(mp, &uip,
-					  sbflags | XFS_SB_UQUOTINO,
 					  flags | XFS_QMOPT_UQUOTA);
 		if (error)
 			goto error_rele;
@@ -1539,7 +1528,6 @@ xfs_qm_init_quotainos(
 	}
 	if (XFS_IS_GQUOTA_ON(mp) && gip == NULL) {
 		error = xfs_qm_qino_alloc(mp, &gip,
-					  sbflags | XFS_SB_GQUOTINO,
 					  flags | XFS_QMOPT_GQUOTA);
 		if (error)
 			goto error_rele;
@@ -1548,7 +1536,6 @@ xfs_qm_init_quotainos(
 	}
 	if (XFS_IS_PQUOTA_ON(mp) && pip == NULL) {
 		error = xfs_qm_qino_alloc(mp, &pip,
-					  sbflags | XFS_SB_PQUOTINO,
 					  flags | XFS_QMOPT_PQUOTA);
 		if (error)
 			goto error_rele;
@@ -1587,32 +1574,6 @@ xfs_qm_dqfree_one(
 	xfs_qm_dqdestroy(dqp);
 }
 
-/*
- * Start a transaction and write the incore superblock changes to
- * disk. flags parameter indicates which fields have changed.
- */
-int
-xfs_qm_write_sb_changes(
-	xfs_mount_t	*mp,
-	__int64_t	flags)
-{
-	xfs_trans_t	*tp;
-	int		error;
-
-	tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
-	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_sbchange, 0, 0);
-	if (error) {
-		xfs_trans_cancel(tp, 0);
-		return error;
-	}
-
-	xfs_mod_sb(tp, flags);
-	error = xfs_trans_commit(tp, 0);
-
-	return error;
-}
-
-
 /* --------------- utility functions for vnodeops ---------------- */
 
 
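The deleted xfs_qm_write_sb_changes() is subsumed by the generic superblock logging path. A hedged sketch of the replacement, modeled on the removed body above; the transaction type and reservation names (XFS_TRANS_SB_CHANGE, tr_sb) are assumptions, only the shape is asserted: the whole superblock is logged via xfs_log_sb() instead of a bitmask of changed fields, and the commit can optionally be made synchronous.

int
xfs_sync_sb(
	struct xfs_mount	*mp,
	bool			wait)
{
	xfs_trans_t	*tp;
	int		error;

	/* assumed names: XFS_TRANS_SB_CHANGE transaction, tr_sb reservation */
	tp = xfs_trans_alloc(mp, XFS_TRANS_SB_CHANGE);
	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0);
	if (error) {
		xfs_trans_cancel(tp, 0);
		return error;
	}

	xfs_log_sb(tp);			/* log the whole sb, no field bitmask */
	if (wait)
		xfs_trans_set_sync(tp);	/* force the commit to disk */
	return xfs_trans_commit(tp, 0);
}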
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 41f6c0b9d51c..0d4d3590cf85 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -157,7 +157,6 @@ struct xfs_dquot_acct {
 #define XFS_QM_RTBWARNLIMIT	5
 
 extern void		xfs_qm_destroy_quotainfo(struct xfs_mount *);
-extern int		xfs_qm_write_sb_changes(struct xfs_mount *, __int64_t);
 
 /* dquot stuff */
 extern void		xfs_qm_dqpurge_all(struct xfs_mount *, uint);
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index cb6168ec92c9..9b965db45800 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -91,8 +91,7 @@ xfs_qm_scall_quotaoff(
 		mutex_unlock(&q->qi_quotaofflock);
 
 		/* XXX what to do if error ? Revert back to old vals incore ? */
-		error = xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS);
-		return error;
+		return xfs_sync_sb(mp, false);
 	}
 
 	dqtype = 0;
@@ -313,7 +312,6 @@ xfs_qm_scall_quotaon(
 {
 	int		error;
 	uint		qf;
-	__int64_t	sbflags;
 
 	flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD);
 	/*
@@ -321,30 +319,22 @@ xfs_qm_scall_quotaon(
 	 */
 	flags &= ~(XFS_ALL_QUOTA_ACCT);
 
-	sbflags = 0;
-
 	if (flags == 0) {
 		xfs_debug(mp, "%s: zero flags, m_qflags=%x",
 			__func__, mp->m_qflags);
 		return -EINVAL;
 	}
 
-	/* No fs can turn on quotas with a delayed effect */
-	ASSERT((flags & XFS_ALL_QUOTA_ACCT) == 0);
-
 	/*
 	 * Can't enforce without accounting. We check the superblock
 	 * qflags here instead of m_qflags because rootfs can have
 	 * quota acct on ondisk without m_qflags' knowing.
 	 */
-	if (((flags & XFS_UQUOTA_ACCT) == 0 &&
-	     (mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 &&
-	     (flags & XFS_UQUOTA_ENFD)) ||
-	    ((flags & XFS_GQUOTA_ACCT) == 0 &&
-	     (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 &&
-	     (flags & XFS_GQUOTA_ENFD)) ||
-	    ((flags & XFS_PQUOTA_ACCT) == 0 &&
-	     (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 &&
-	     (flags & XFS_PQUOTA_ENFD))) {
+	if (((mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 &&
+	     (flags & XFS_UQUOTA_ENFD)) ||
+	    ((mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 &&
+	     (flags & XFS_GQUOTA_ENFD)) ||
+	    ((mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 &&
+	     (flags & XFS_PQUOTA_ENFD))) {
 		xfs_debug(mp,
 			"%s: Can't enforce without acct, flags=%x sbflags=%x",
@@ -369,11 +359,11 @@ xfs_qm_scall_quotaon(
 	/*
 	 * There's nothing to change if it's the same.
 	 */
-	if ((qf & flags) == flags && sbflags == 0)
+	if ((qf & flags) == flags)
 		return -EEXIST;
-	sbflags |= XFS_SB_QFLAGS;
 
-	if ((error = xfs_qm_write_sb_changes(mp, sbflags)))
+	error = xfs_sync_sb(mp, false);
+	if (error)
 		return error;
 	/*
 	 * If we aren't trying to switch on quota enforcement, we are done.
@@ -383,8 +373,7 @@ xfs_qm_scall_quotaon(
 	    ((mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) !=
 	     (mp->m_qflags & XFS_PQUOTA_ACCT)) ||
 	    ((mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) !=
-	     (mp->m_qflags & XFS_GQUOTA_ACCT)) ||
-	    (flags & XFS_ALL_QUOTA_ENFD) == 0)
+	     (mp->m_qflags & XFS_GQUOTA_ACCT)))
 		return 0;
 
 	if (! XFS_IS_QUOTA_RUNNING(mp))
@@ -421,20 +410,12 @@ xfs_qm_scall_getqstat(
 	memset(out, 0, sizeof(fs_quota_stat_t));
 
 	out->qs_version = FS_QSTAT_VERSION;
-	if (!xfs_sb_version_hasquota(&mp->m_sb)) {
-		out->qs_uquota.qfs_ino = NULLFSINO;
-		out->qs_gquota.qfs_ino = NULLFSINO;
-		return 0;
-	}
-
 	out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags &
 			(XFS_ALL_QUOTA_ACCT|
 			 XFS_ALL_QUOTA_ENFD));
-	if (q) {
-		uip = q->qi_uquotaip;
-		gip = q->qi_gquotaip;
-		pip = q->qi_pquotaip;
-	}
+	uip = q->qi_uquotaip;
+	gip = q->qi_gquotaip;
+	pip = q->qi_pquotaip;
 	if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) {
 		if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
 					0, 0, &uip) == 0)
@@ -480,14 +461,13 @@ xfs_qm_scall_getqstat(
 		if (temppqip)
 			IRELE(pip);
 	}
-	if (q) {
-		out->qs_incoredqs = q->qi_dquots;
-		out->qs_btimelimit = q->qi_btimelimit;
-		out->qs_itimelimit = q->qi_itimelimit;
-		out->qs_rtbtimelimit = q->qi_rtbtimelimit;
-		out->qs_bwarnlimit = q->qi_bwarnlimit;
-		out->qs_iwarnlimit = q->qi_iwarnlimit;
-	}
+	out->qs_incoredqs = q->qi_dquots;
+	out->qs_btimelimit = q->qi_btimelimit;
+	out->qs_itimelimit = q->qi_itimelimit;
+	out->qs_rtbtimelimit = q->qi_rtbtimelimit;
+	out->qs_bwarnlimit = q->qi_bwarnlimit;
+	out->qs_iwarnlimit = q->qi_iwarnlimit;
+
 	return 0;
 }
 
@@ -508,13 +488,6 @@ xfs_qm_scall_getqstatv(
 	bool tempgqip = false;
 	bool temppqip = false;
 
-	if (!xfs_sb_version_hasquota(&mp->m_sb)) {
-		out->qs_uquota.qfs_ino = NULLFSINO;
-		out->qs_gquota.qfs_ino = NULLFSINO;
-		out->qs_pquota.qfs_ino = NULLFSINO;
-		return 0;
-	}
-
 	out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags &
 			(XFS_ALL_QUOTA_ACCT|
 			 XFS_ALL_QUOTA_ENFD));
@@ -522,11 +495,9 @@ xfs_qm_scall_getqstatv(
 	out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino;
 	out->qs_pquota.qfs_ino = mp->m_sb.sb_pquotino;
 
-	if (q) {
-		uip = q->qi_uquotaip;
-		gip = q->qi_gquotaip;
-		pip = q->qi_pquotaip;
-	}
+	uip = q->qi_uquotaip;
+	gip = q->qi_gquotaip;
+	pip = q->qi_pquotaip;
 	if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) {
 		if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
 					0, 0, &uip) == 0)
@@ -561,14 +532,13 @@ xfs_qm_scall_getqstatv(
 		if (temppqip)
 			IRELE(pip);
 	}
-	if (q) {
-		out->qs_incoredqs = q->qi_dquots;
-		out->qs_btimelimit = q->qi_btimelimit;
-		out->qs_itimelimit = q->qi_itimelimit;
-		out->qs_rtbtimelimit = q->qi_rtbtimelimit;
-		out->qs_bwarnlimit = q->qi_bwarnlimit;
-		out->qs_iwarnlimit = q->qi_iwarnlimit;
-	}
+	out->qs_incoredqs = q->qi_dquots;
+	out->qs_btimelimit = q->qi_btimelimit;
+	out->qs_itimelimit = q->qi_itimelimit;
+	out->qs_rtbtimelimit = q->qi_rtbtimelimit;
+	out->qs_bwarnlimit = q->qi_bwarnlimit;
+	out->qs_iwarnlimit = q->qi_iwarnlimit;
+
 	return 0;
 }
 
@@ -800,7 +770,7 @@ xfs_qm_log_quotaoff(
 	mp->m_sb.sb_qflags = (mp->m_qflags & ~(flags)) & XFS_MOUNT_QUOTA_ALL;
 	spin_unlock(&mp->m_sb_lock);
 
-	xfs_mod_sb(tp, XFS_SB_QFLAGS);
+	xfs_log_sb(tp);
 
 	/*
 	 * We have to make sure that the transaction is secure on disk before we
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index 801a84c1cdc3..6923905ab33d 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -64,19 +64,10 @@ xfs_fs_get_xstatev(
 	return xfs_qm_scall_getqstatv(mp, fqs);
 }
 
-STATIC int
-xfs_fs_set_xstate(
-	struct super_block	*sb,
-	unsigned int		uflags,
-	int			op)
+static unsigned int
+xfs_quota_flags(unsigned int uflags)
 {
-	struct xfs_mount	*mp = XFS_M(sb);
-	unsigned int		flags = 0;
-
-	if (sb->s_flags & MS_RDONLY)
-		return -EROFS;
-	if (op != Q_XQUOTARM && !XFS_IS_QUOTA_RUNNING(mp))
-		return -ENOSYS;
+	unsigned int		flags = 0;
 
 	if (uflags & FS_QUOTA_UDQ_ACCT)
 		flags |= XFS_UQUOTA_ACCT;
@@ -91,16 +82,39 @@ xfs_fs_set_xstate(
 	if (uflags & FS_QUOTA_PDQ_ENFD)
 		flags |= XFS_PQUOTA_ENFD;
 
-	switch (op) {
-	case Q_XQUOTAON:
-		return xfs_qm_scall_quotaon(mp, flags);
-	case Q_XQUOTAOFF:
-		if (!XFS_IS_QUOTA_ON(mp))
-			return -EINVAL;
-		return xfs_qm_scall_quotaoff(mp, flags);
-	}
+	return flags;
+}
+
+STATIC int
+xfs_quota_enable(
+	struct super_block	*sb,
+	unsigned int		uflags)
+{
+	struct xfs_mount	*mp = XFS_M(sb);
+
+	if (sb->s_flags & MS_RDONLY)
+		return -EROFS;
+	if (!XFS_IS_QUOTA_RUNNING(mp))
+		return -ENOSYS;
+
+	return xfs_qm_scall_quotaon(mp, xfs_quota_flags(uflags));
+}
+
+STATIC int
+xfs_quota_disable(
+	struct super_block	*sb,
+	unsigned int		uflags)
+{
+	struct xfs_mount	*mp = XFS_M(sb);
+
+	if (sb->s_flags & MS_RDONLY)
+		return -EROFS;
+	if (!XFS_IS_QUOTA_RUNNING(mp))
+		return -ENOSYS;
+	if (!XFS_IS_QUOTA_ON(mp))
+		return -EINVAL;
 
-	return -EINVAL;
+	return xfs_qm_scall_quotaoff(mp, xfs_quota_flags(uflags));
 }
 
 STATIC int
@@ -166,7 +180,8 @@ xfs_fs_set_dqblk(
 const struct quotactl_ops xfs_quotactl_operations = {
 	.get_xstatev		= xfs_fs_get_xstatev,
 	.get_xstate		= xfs_fs_get_xstate,
-	.set_xstate		= xfs_fs_set_xstate,
+	.quota_enable		= xfs_quota_enable,
+	.quota_disable		= xfs_quota_disable,
 	.rm_xquota		= xfs_fs_rm_xquota,
 	.get_dqblk		= xfs_fs_get_dqblk,
 	.set_dqblk		= xfs_fs_set_dqblk,
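
With .set_xstate split into .quota_enable and .quota_disable, the VFS can dispatch Q_XQUOTAON and Q_XQUOTAOFF to the new callbacks directly, while Q_XQUOTARM stays on .rm_xquota. A hypothetical userspace illustration of the enable path (the device path, helper name, and error handling are placeholders):

#include <sys/quota.h>
#include <linux/dqblk_xfs.h>	/* Q_XQUOTAON, FS_QUOTA_UDQ_ENFD */

/* Turn on user quota enforcement; reaches xfs_quota_enable() ->
 * xfs_qm_scall_quotaon() via the quotactl_ops table above. */
static int enable_user_enforcement(const char *dev)
{
	unsigned int flags = FS_QUOTA_UDQ_ENFD;

	return quotactl(QCMD(Q_XQUOTAON, USRQUOTA), dev, 0,
			(void *)&flags);
}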
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 19cbda196369..8fcc4ccc5c79 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -685,7 +685,7 @@ xfs_blkdev_get(
 			    mp);
 	if (IS_ERR(*bdevp)) {
 		error = PTR_ERR(*bdevp);
-		xfs_warn(mp, "Invalid device [%s], error=%d\n", name, error);
+		xfs_warn(mp, "Invalid device [%s], error=%d", name, error);
 	}
 
 	return error;
@@ -1111,6 +1111,11 @@ xfs_fs_statfs(
 					statp->f_files,
 					mp->m_maxicount);
 
+	/* If sb_icount overshot maxicount, report actual allocation */
+	statp->f_files = max_t(typeof(statp->f_files),
+			       statp->f_files,
+			       sbp->sb_icount);
+
 	/* make sure statp->f_ffree does not underflow */
 	ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree);
 	statp->f_ffree = max_t(__int64_t, ffree, 0);
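
Worked example for the f_files clamp above (values are illustrative): if inode allocation overshot the configured cap so that sb_icount = 1,100,000 while the computed f_files is 1,000,000 and sb_ifree = 50,000, then f_files - (sb_icount - sb_ifree) = 1,000,000 - 1,050,000 is negative, and the underflow clamp that follows would report zero free inodes. Raising f_files to sb_icount yields 1,100,000 - 1,050,000 = 50,000 free, which matches the real allocation.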
@@ -1257,13 +1262,13 @@ xfs_fs_remount(
 		 * If this is the first remount to writeable state we
 		 * might have some superblock changes to update.
 		 */
-		if (mp->m_update_flags) {
-			error = xfs_mount_log_sb(mp, mp->m_update_flags);
+		if (mp->m_update_sb) {
+			error = xfs_sync_sb(mp, false);
 			if (error) {
 				xfs_warn(mp, "failed to write sb changes");
 				return error;
 			}
-			mp->m_update_flags = 0;
+			mp->m_update_sb = false;
 		}
 
 		/*
@@ -1293,8 +1298,9 @@ xfs_fs_remount(
 
 /*
  * Second stage of a freeze. The data is already frozen so we only
- * need to take care of the metadata. Once that's done write a dummy
- * record to dirty the log in case of a crash while frozen.
+ * need to take care of the metadata. Once that's done sync the superblock
+ * to the log to dirty it in case of a crash while frozen. This ensures that we
+ * will recover the unlinked inode lists on the next mount.
  */
 STATIC int
 xfs_fs_freeze(
@@ -1304,7 +1310,7 @@ xfs_fs_freeze(
 
 	xfs_save_resvblks(mp);
 	xfs_quiesce_attr(mp);
-	return xfs_fs_log_dummy(mp);
+	return xfs_sync_sb(mp, true);
 }
 
 STATIC int
@@ -1531,7 +1537,7 @@ xfs_fs_mount(
 static long
 xfs_fs_nr_cached_objects(
 	struct super_block	*sb,
-	int			nid)
+	struct shrink_control	*sc)
 {
 	return xfs_reclaim_inodes_count(XFS_M(sb));
 }
@@ -1539,10 +1545,9 @@ xfs_fs_nr_cached_objects(
 static long
 xfs_fs_free_cached_objects(
 	struct super_block	*sb,
-	long			nr_to_scan,
-	int			nid)
+	struct shrink_control	*sc)
 {
-	return xfs_reclaim_inodes_nr(XFS_M(sb), nr_to_scan);
+	return xfs_reclaim_inodes_nr(XFS_M(sb), sc->nr_to_scan);
 }
 
 static const struct super_operations xfs_super_operations = {
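
The last two hunks track the VFS change that hands the per-superblock cache callbacks a struct shrink_control instead of separate nr_to_scan/nid arguments. A generic sketch of the converted contract; the myfs_* names, MYFS_SB(), and the reclaim helpers are hypothetical:

/* hypothetical helpers standing in for a filesystem's own reclaim code */
extern long myfs_count_reclaimable(struct myfs_sb_info *sbi);
extern long myfs_reclaim(struct myfs_sb_info *sbi, unsigned long nr);

/* count phase: how many objects could be freed (per sc->nid if tracked) */
static long myfs_nr_cached_objects(struct super_block *sb,
				   struct shrink_control *sc)
{
	return myfs_count_reclaimable(MYFS_SB(sb));
}

/* scan phase: free up to sc->nr_to_scan objects, return the number freed */
static long myfs_free_cached_objects(struct super_block *sb,
				     struct shrink_control *sc)
{
	return myfs_reclaim(MYFS_SB(sb), sc->nr_to_scan);
}

static const struct super_operations myfs_super_operations = {
	.nr_cached_objects	= myfs_nr_cached_objects,
	.free_cached_objects	= myfs_free_cached_objects,
};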
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index 1743b9f8e23d..a0c8067cea6f 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -149,24 +149,6 @@ static struct ctl_table xfs_table[] = {
 		.extra2		= &xfs_params.inherit_noatim.max
 	},
 	{
-		.procname	= "xfsbufd_centisecs",
-		.data		= &xfs_params.xfs_buf_timer.val,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &xfs_params.xfs_buf_timer.min,
-		.extra2		= &xfs_params.xfs_buf_timer.max
-	},
-	{
-		.procname	= "age_buffer_centisecs",
-		.data		= &xfs_params.xfs_buf_age.val,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &xfs_params.xfs_buf_age.min,
-		.extra2		= &xfs_params.xfs_buf_age.max
-	},
-	{
 		.procname	= "inherit_nosymlinks",
 		.data		= &xfs_params.inherit_nosym.val,
 		.maxlen		= sizeof(int),
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index fa3135b9bf04..eb90cd59a0ec 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -472,6 +472,7 @@ xfs_trans_apply_sb_deltas(
 		whole = 1;
 	}
 
+	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
 	if (whole)
 		/*
 		 * Log the whole thing, the fields are noncontiguous.
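
The one-line addition tags the superblock buffer with its buffer-log-format type before it is logged, so that log recovery can attach the matching verifier when replaying the change on CRC-enabled filesystems. The usage pattern, sketched; xfs_trans_getsb() is a real helper, but the surrounding dirty-range logging here is illustrative:

	struct xfs_buf	*bp;

	bp = xfs_trans_getsb(tp, mp, 0);		/* sb buffer, joined to tp */
	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);	/* recovery verifier hint */
	xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb) - 1);	/* dirty range: whole sb */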
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 0a4d4ab6d9a9..75798412859a 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -327,9 +327,10 @@ xfs_trans_read_buf_map(
 		return -EIO;
 	}
 
-	if (tp)
+	if (tp) {
 		_xfs_trans_bjoin(tp, bp, 1);
-	trace_xfs_trans_read_buf(bp->b_fspriv);
+		trace_xfs_trans_read_buf(bp->b_fspriv);
+	}
 	*bpp = bp;
 	return 0;
 