Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/v9fs.c | 2
-rw-r--r--  fs/9p/vfs_file.c | 2
-rw-r--r--  fs/Kconfig | 24
-rw-r--r--  fs/Kconfig.binfmt | 7
-rw-r--r--  fs/Makefile | 2
-rw-r--r--  fs/affs/affs.h | 2
-rw-r--r--  fs/affs/amigaffs.c | 13
-rw-r--r--  fs/affs/bitmap.c | 1
-rw-r--r--  fs/affs/dir.c | 11
-rw-r--r--  fs/affs/file.c | 49
-rw-r--r--  fs/affs/inode.c | 7
-rw-r--r--  fs/affs/namei.c | 47
-rw-r--r--  fs/affs/super.c | 69
-rw-r--r--  fs/afs/rxrpc.c | 14
-rw-r--r--  fs/afs/volume.c | 2
-rw-r--r--  fs/aio.c | 20
-rw-r--r--  fs/befs/linuxvfs.c | 6
-rw-r--r--  fs/binfmt_elf.c | 5
-rw-r--r--  fs/binfmt_som.c | 299
-rw-r--r--  fs/block_dev.c | 77
-rw-r--r--  fs/btrfs/Kconfig | 1
-rw-r--r--  fs/btrfs/backref.c | 41
-rw-r--r--  fs/btrfs/backref.h | 3
-rw-r--r--  fs/btrfs/btrfs_inode.h | 3
-rw-r--r--  fs/btrfs/ctree.c | 55
-rw-r--r--  fs/btrfs/ctree.h | 40
-rw-r--r--  fs/btrfs/delayed-inode.c | 46
-rw-r--r--  fs/btrfs/dev-replace.c | 25
-rw-r--r--  fs/btrfs/disk-io.c | 108
-rw-r--r--  fs/btrfs/disk-io.h | 6
-rw-r--r--  fs/btrfs/extent-tree.c | 264
-rw-r--r--  fs/btrfs/extent_io.c | 91
-rw-r--r--  fs/btrfs/extent_io.h | 65
-rw-r--r--  fs/btrfs/file.c | 3
-rw-r--r--  fs/btrfs/free-space-cache.c | 13
-rw-r--r--  fs/btrfs/inode-item.c | 9
-rw-r--r--  fs/btrfs/inode.c | 166
-rw-r--r--  fs/btrfs/qgroup.c | 3
-rw-r--r--  fs/btrfs/raid56.c | 103
-rw-r--r--  fs/btrfs/raid56.h | 11
-rw-r--r--  fs/btrfs/reada.c | 19
-rw-r--r--  fs/btrfs/relocation.c | 12
-rw-r--r--  fs/btrfs/scrub.c | 315
-rw-r--r--  fs/btrfs/send.c | 9
-rw-r--r--  fs/btrfs/super.c | 20
-rw-r--r--  fs/btrfs/sysfs.c | 10
-rw-r--r--  fs/btrfs/tests/extent-buffer-tests.c | 2
-rw-r--r--  fs/btrfs/tests/extent-io-tests.c | 3
-rw-r--r--  fs/btrfs/tests/inode-tests.c | 4
-rw-r--r--  fs/btrfs/tests/qgroup-tests.c | 23
-rw-r--r--  fs/btrfs/transaction.c | 29
-rw-r--r--  fs/btrfs/transaction.h | 7
-rw-r--r--  fs/btrfs/tree-log.c | 235
-rw-r--r--  fs/btrfs/volumes.c | 242
-rw-r--r--  fs/btrfs/volumes.h | 18
-rw-r--r--  fs/ceph/acl.c | 14
-rw-r--r--  fs/ceph/addr.c | 22
-rw-r--r--  fs/ceph/caps.c | 127
-rw-r--r--  fs/ceph/dir.c | 33
-rw-r--r--  fs/ceph/file.c | 39
-rw-r--r--  fs/ceph/inode.c | 43
-rw-r--r--  fs/ceph/locks.c | 63
-rw-r--r--  fs/ceph/mds_client.c | 131
-rw-r--r--  fs/ceph/mds_client.h | 2
-rw-r--r--  fs/ceph/snap.c | 54
-rw-r--r--  fs/ceph/super.c | 24
-rw-r--r--  fs/ceph/super.h | 5
-rw-r--r--  fs/char_dev.c | 24
-rw-r--r--  fs/cifs/cifs_debug.c | 6
-rw-r--r--  fs/cifs/cifsglob.h | 6
-rw-r--r--  fs/cifs/connect.c | 2
-rw-r--r--  fs/cifs/file.c | 41
-rw-r--r--  fs/cifs/inode.c | 2
-rw-r--r--  fs/cifs/ioctl.c | 21
-rw-r--r--  fs/cifs/netmisc.c | 12
-rw-r--r--  fs/cifs/readdir.c | 10
-rw-r--r--  fs/cifs/smb2misc.c | 12
-rw-r--r--  fs/cifs/smb2ops.c | 3
-rw-r--r--  fs/cifs/smb2pdu.h | 2
-rw-r--r--  fs/cifs/smb2transport.c | 2
-rw-r--r--  fs/cifs/smbencrypt.c | 2
-rw-r--r--  fs/coda/dir.c | 138
-rw-r--r--  fs/coda/inode.c | 2
-rw-r--r--  fs/configfs/configfs_internal.h | 2
-rw-r--r--  fs/configfs/inode.c | 17
-rw-r--r--  fs/configfs/mount.c | 11
-rw-r--r--  fs/dax.c | 534
-rw-r--r--  fs/dcache.c | 189
-rw-r--r--  fs/debugfs/inode.c | 291
-rw-r--r--  fs/dlm/netlink.c | 7
-rw-r--r--  fs/drop_caches.c | 14
-rw-r--r--  fs/ecryptfs/inode.c | 1
-rw-r--r--  fs/ecryptfs/main.c | 2
-rw-r--r--  fs/efivarfs/Kconfig | 1
-rw-r--r--  fs/efivarfs/super.c | 2
-rw-r--r--  fs/eventfd.c | 12
-rw-r--r--  fs/eventpoll.c | 4
-rw-r--r--  fs/exec.c | 10
-rw-r--r--  fs/exofs/inode.c | 3
-rw-r--r--  fs/exofs/super.c | 2
-rw-r--r--  fs/ext2/Kconfig | 11
-rw-r--r--  fs/ext2/Makefile | 1
-rw-r--r--  fs/ext2/ext2.h | 10
-rw-r--r--  fs/ext2/file.c | 44
-rw-r--r--  fs/ext2/ialloc.c | 2
-rw-r--r--  fs/ext2/inode.c | 38
-rw-r--r--  fs/ext2/namei.c | 13
-rw-r--r--  fs/ext2/super.c | 53
-rw-r--r--  fs/ext2/xip.c | 91
-rw-r--r--  fs/ext2/xip.h | 26
-rw-r--r--  fs/ext3/super.c | 2
-rw-r--r--  fs/ext4/ext4.h | 6
-rw-r--r--  fs/ext4/extents.c | 4
-rw-r--r--  fs/ext4/file.c | 270
-rw-r--r--  fs/ext4/indirect.c | 18
-rw-r--r--  fs/ext4/inode.c | 159
-rw-r--r--  fs/ext4/namei.c | 10
-rw-r--r--  fs/ext4/resize.c | 24
-rw-r--r--  fs/ext4/super.c | 95
-rw-r--r--  fs/f2fs/Kconfig | 10
-rw-r--r--  fs/f2fs/Makefile | 1
-rw-r--r--  fs/f2fs/acl.c | 6
-rw-r--r--  fs/f2fs/checkpoint.c | 95
-rw-r--r--  fs/f2fs/data.c | 218
-rw-r--r--  fs/f2fs/debug.c | 59
-rw-r--r--  fs/f2fs/dir.c | 3
-rw-r--r--  fs/f2fs/f2fs.h | 120
-rw-r--r--  fs/f2fs/file.c | 101
-rw-r--r--  fs/f2fs/gc.c | 38
-rw-r--r--  fs/f2fs/gc.h | 33
-rw-r--r--  fs/f2fs/inline.c | 32
-rw-r--r--  fs/f2fs/inode.c | 37
-rw-r--r--  fs/f2fs/namei.c | 2
-rw-r--r--  fs/f2fs/node.c | 154
-rw-r--r--  fs/f2fs/node.h | 45
-rw-r--r--  fs/f2fs/recovery.c | 11
-rw-r--r--  fs/f2fs/segment.c | 194
-rw-r--r--  fs/f2fs/segment.h | 29
-rw-r--r--  fs/f2fs/super.c | 75
-rw-r--r--  fs/f2fs/trace.c | 159
-rw-r--r--  fs/f2fs/trace.h | 46
-rw-r--r--  fs/fat/inode.c | 2
-rw-r--r--  fs/fcntl.c | 5
-rw-r--r--  fs/fs-writeback.c | 76
-rw-r--r--  fs/fs_pin.c | 96
-rw-r--r--  fs/fuse/dev.c | 51
-rw-r--r--  fs/fuse/dir.c | 31
-rw-r--r--  fs/fuse/file.c | 11
-rw-r--r--  fs/fuse/fuse_i.h | 2
-rw-r--r--  fs/fuse/inode.c | 6
-rw-r--r--  fs/gfs2/acl.c | 2
-rw-r--r--  fs/gfs2/aops.c | 2
-rw-r--r--  fs/gfs2/dir.c | 3
-rw-r--r--  fs/gfs2/file.c | 5
-rw-r--r--  fs/gfs2/glock.c | 14
-rw-r--r--  fs/gfs2/inode.c | 3
-rw-r--r--  fs/gfs2/ops_fstype.c | 1
-rw-r--r--  fs/gfs2/quota.c | 60
-rw-r--r--  fs/gfs2/recovery.c | 2
-rw-r--r--  fs/gfs2/super.c | 2
-rw-r--r--  fs/gfs2/sys.c | 2
-rw-r--r--  fs/hugetlbfs/inode.c | 13
-rw-r--r--  fs/inode.c | 138
-rw-r--r--  fs/internal.h | 9
-rw-r--r--  fs/ioctl.c | 5
-rw-r--r--  fs/isofs/rock.c | 3
-rw-r--r--  fs/isofs/util.c | 18
-rw-r--r--  fs/jffs2/compr_rubin.c | 5
-rw-r--r--  fs/jffs2/scan.c | 5
-rw-r--r--  fs/jfs/endian24.h | 49
-rw-r--r--  fs/jfs/file.c | 2
-rw-r--r--  fs/jfs/jfs_dtree.c | 4
-rw-r--r--  fs/jfs/jfs_types.h | 55
-rw-r--r--  fs/jfs/jfs_xtree.h | 25
-rw-r--r--  fs/jfs/super.c | 3
-rw-r--r--  fs/kernfs/dir.c | 36
-rw-r--r--  fs/kernfs/file.c | 4
-rw-r--r--  fs/kernfs/inode.c | 13
-rw-r--r--  fs/kernfs/kernfs-internal.h | 1
-rw-r--r--  fs/kernfs/mount.c | 1
-rw-r--r--  fs/libfs.c | 2
-rw-r--r--  fs/lockd/mon.c | 13
-rw-r--r--  fs/lockd/svc.c | 8
-rw-r--r--  fs/lockd/svclock.c | 4
-rw-r--r--  fs/lockd/svcsubs.c | 26
-rw-r--r--  fs/lockd/xdr.c | 8
-rw-r--r--  fs/locks.c | 588
-rw-r--r--  fs/mount.h | 4
-rw-r--r--  fs/namei.c | 143
-rw-r--r--  fs/namespace.c | 50
-rw-r--r--  fs/ncpfs/dir.c | 98
-rw-r--r--  fs/ncpfs/inode.c | 3
-rw-r--r--  fs/ncpfs/ncp_fs_i.h | 1
-rw-r--r--  fs/ncpfs/ncplib_kernel.h | 30
-rw-r--r--  fs/nfs/Kconfig | 5
-rw-r--r--  fs/nfs/Makefile | 3
-rw-r--r--  fs/nfs/blocklayout/blocklayout.c | 2
-rw-r--r--  fs/nfs/callback.c | 8
-rw-r--r--  fs/nfs/callback_proc.c | 2
-rw-r--r--  fs/nfs/callback_xdr.c | 8
-rw-r--r--  fs/nfs/delegation.c | 45
-rw-r--r--  fs/nfs/direct.c | 120
-rw-r--r--  fs/nfs/file.c | 1
-rw-r--r--  fs/nfs/filelayout/filelayout.c | 366
-rw-r--r--  fs/nfs/filelayout/filelayout.h | 40
-rw-r--r--  fs/nfs/filelayout/filelayoutdev.c | 469
-rw-r--r--  fs/nfs/flexfilelayout/Makefile | 5
-rw-r--r--  fs/nfs/flexfilelayout/flexfilelayout.c | 1533
-rw-r--r--  fs/nfs/flexfilelayout/flexfilelayout.h | 155
-rw-r--r--  fs/nfs/flexfilelayout/flexfilelayoutdev.c | 552
-rw-r--r--  fs/nfs/idmap.c | 3
-rw-r--r--  fs/nfs/inode.c | 14
-rw-r--r--  fs/nfs/internal.h | 69
-rw-r--r--  fs/nfs/nfs2xdr.c | 10
-rw-r--r--  fs/nfs/nfs3_fs.h | 2
-rw-r--r--  fs/nfs/nfs3client.c | 41
-rw-r--r--  fs/nfs/nfs3proc.c | 9
-rw-r--r--  fs/nfs/nfs3super.c | 2
-rw-r--r--  fs/nfs/nfs3xdr.c | 3
-rw-r--r--  fs/nfs/nfs4_fs.h | 9
-rw-r--r--  fs/nfs/nfs4client.c | 51
-rw-r--r--  fs/nfs/nfs4proc.c | 411
-rw-r--r--  fs/nfs/nfs4session.c | 2
-rw-r--r--  fs/nfs/nfs4session.h | 6
-rw-r--r--  fs/nfs/nfs4state.c | 101
-rw-r--r--  fs/nfs/nfs4super.c | 4
-rw-r--r--  fs/nfs/nfs4xdr.c | 145
-rw-r--r--  fs/nfs/nfsroot.c | 4
-rw-r--r--  fs/nfs/objlayout/objio_osd.c | 5
-rw-r--r--  fs/nfs/pagelist.c | 300
-rw-r--r--  fs/nfs/pnfs.c | 471
-rw-r--r--  fs/nfs/pnfs.h | 139
-rw-r--r--  fs/nfs/pnfs_nfs.c | 870
-rw-r--r--  fs/nfs/read.c | 33
-rw-r--r--  fs/nfs/super.c | 33
-rw-r--r--  fs/nfs/write.c | 111
-rw-r--r--  fs/nfsd/Kconfig | 10
-rw-r--r--  fs/nfsd/Makefile | 8
-rw-r--r--  fs/nfsd/blocklayout.c | 189
-rw-r--r--  fs/nfsd/blocklayoutxdr.c | 157
-rw-r--r--  fs/nfsd/blocklayoutxdr.h | 62
-rw-r--r--  fs/nfsd/export.c | 8
-rw-r--r--  fs/nfsd/export.h | 2
-rw-r--r--  fs/nfsd/nfs4callback.c | 99
-rw-r--r--  fs/nfsd/nfs4layouts.c | 721
-rw-r--r--  fs/nfsd/nfs4proc.c | 310
-rw-r--r--  fs/nfsd/nfs4state.c | 97
-rw-r--r--  fs/nfsd/nfs4xdr.c | 362
-rw-r--r--  fs/nfsd/nfsctl.c | 9
-rw-r--r--  fs/nfsd/nfsd.h | 16
-rw-r--r--  fs/nfsd/nfsfh.h | 18
-rw-r--r--  fs/nfsd/nfssvc.c | 1
-rw-r--r--  fs/nfsd/pnfs.h | 86
-rw-r--r--  fs/nfsd/state.h | 43
-rw-r--r--  fs/nfsd/trace.c | 5
-rw-r--r--  fs/nfsd/trace.h | 54
-rw-r--r--  fs/nfsd/xdr4.h | 59
-rw-r--r--  fs/nfsd/xdr4cb.h | 7
-rw-r--r--  fs/nilfs2/file.c | 1
-rw-r--r--  fs/nilfs2/gcinode.c | 1
-rw-r--r--  fs/nilfs2/mdt.c | 6
-rw-r--r--  fs/nilfs2/nilfs.h | 2
-rw-r--r--  fs/nilfs2/page.c | 4
-rw-r--r--  fs/nilfs2/page.h | 3
-rw-r--r--  fs/nilfs2/segment.c | 44
-rw-r--r--  fs/nilfs2/segment.h | 5
-rw-r--r--  fs/nilfs2/super.c | 6
-rw-r--r--  fs/notify/Kconfig | 1
-rw-r--r--  fs/notify/fanotify/fanotify.c | 2
-rw-r--r--  fs/notify/fanotify/fanotify_user.c | 45
-rw-r--r--  fs/ntfs/file.c | 3
-rw-r--r--  fs/ocfs2/acl.c | 14
-rw-r--r--  fs/ocfs2/alloc.c | 18
-rw-r--r--  fs/ocfs2/aops.c | 242
-rw-r--r--  fs/ocfs2/cluster/tcp.c | 3
-rw-r--r--  fs/ocfs2/cluster/tcp_internal.h | 12
-rw-r--r--  fs/ocfs2/dir.c | 10
-rw-r--r--  fs/ocfs2/dlm/dlmast.c | 6
-rw-r--r--  fs/ocfs2/dlm/dlmdebug.c | 4
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.c | 14
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.h | 1
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c | 12
-rw-r--r--  fs/ocfs2/dlmfs/dlmfs.c | 14
-rw-r--r--  fs/ocfs2/dlmglue.c | 3
-rw-r--r--  fs/ocfs2/file.c | 80
-rw-r--r--  fs/ocfs2/file.h | 9
-rw-r--r--  fs/ocfs2/inode.c | 2
-rw-r--r--  fs/ocfs2/inode.h | 2
-rw-r--r--  fs/ocfs2/journal.c | 111
-rw-r--r--  fs/ocfs2/journal.h | 5
-rw-r--r--  fs/ocfs2/mmap.c | 1
-rw-r--r--  fs/ocfs2/namei.c | 327
-rw-r--r--  fs/ocfs2/namei.h | 8
-rw-r--r--  fs/ocfs2/ocfs2.h | 25
-rw-r--r--  fs/ocfs2/ocfs2_fs.h | 14
-rw-r--r--  fs/ocfs2/quota.h | 1
-rw-r--r--  fs/ocfs2/quota_local.c | 20
-rw-r--r--  fs/ocfs2/refcounttree.c | 2
-rw-r--r--  fs/ocfs2/reservations.c | 2
-rw-r--r--  fs/ocfs2/super.c | 51
-rw-r--r--  fs/ocfs2/xattr.c | 10
-rw-r--r--  fs/open.c | 15
-rw-r--r--  fs/proc/array.c | 44
-rw-r--r--  fs/proc/generic.c | 27
-rw-r--r--  fs/proc/inode.c | 2
-rw-r--r--  fs/proc/page.c | 16
-rw-r--r--  fs/proc/task_mmu.c | 250
-rw-r--r--  fs/proc/vmcore.c | 8
-rw-r--r--  fs/proc_namespace.c | 1
-rw-r--r--  fs/pstore/Kconfig | 10
-rw-r--r--  fs/pstore/Makefile | 2
-rw-r--r--  fs/pstore/inode.c | 26
-rw-r--r--  fs/pstore/internal.h | 6
-rw-r--r--  fs/pstore/platform.c | 5
-rw-r--r--  fs/pstore/pmsg.c | 114
-rw-r--r--  fs/pstore/ram.c | 53
-rw-r--r--  fs/quota/Kconfig | 1
-rw-r--r--  fs/quota/dquot.c | 186
-rw-r--r--  fs/quota/quota.c | 214
-rw-r--r--  fs/quota/quota_v1.c | 4
-rw-r--r--  fs/quota/quota_v2.c | 16
-rw-r--r--  fs/ramfs/file-nommu.c | 7
-rw-r--r--  fs/ramfs/inode.c | 21
-rw-r--r--  fs/read_write.c | 48
-rw-r--r--  fs/reiserfs/inode.c | 2
-rw-r--r--  fs/romfs/mmap-nommu.c | 10
-rw-r--r--  fs/romfs/super.c | 3
-rw-r--r--  fs/select.c | 2
-rw-r--r--  fs/seq_file.c | 32
-rw-r--r--  fs/splice.c | 23
-rw-r--r--  fs/super.c | 63
-rw-r--r--  fs/sync.c | 8
-rw-r--r--  fs/sysfs/file.c | 2
-rw-r--r--  fs/sysfs/group.c | 2
-rw-r--r--  fs/ubifs/debug.c | 4
-rw-r--r--  fs/ubifs/dir.c | 18
-rw-r--r--  fs/ubifs/file.c | 5
-rw-r--r--  fs/ubifs/replay.c | 19
-rw-r--r--  fs/ubifs/super.c | 6
-rw-r--r--  fs/ubifs/ubifs.h | 4
-rw-r--r--  fs/ubifs/xattr.c | 112
-rw-r--r--  fs/udf/Kconfig | 10
-rw-r--r--  fs/udf/dir.c | 31
-rw-r--r--  fs/udf/file.c | 2
-rw-r--r--  fs/udf/inode.c | 42
-rw-r--r--  fs/udf/namei.c | 17
-rw-r--r--  fs/udf/super.c | 5
-rw-r--r--  fs/udf/symlink.c | 57
-rw-r--r--  fs/udf/udfdecl.h | 3
-rw-r--r--  fs/udf/unicode.c | 28
-rw-r--r--  fs/ufs/super.c | 8
-rw-r--r--  fs/xfs/Makefile | 1
-rw-r--r--  fs/xfs/kmem.c | 10
-rw-r--r--  fs/xfs/kmem.h | 5
-rw-r--r--  fs/xfs/libxfs/xfs_attr_leaf.c | 2
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.c | 20
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.h | 33
-rw-r--r--  fs/xfs/libxfs/xfs_format.h | 24
-rw-r--r--  fs/xfs/libxfs/xfs_fs.h (renamed from fs/xfs/xfs_fs.h) | 0
-rw-r--r--  fs/xfs/libxfs/xfs_sb.c | 320
-rw-r--r--  fs/xfs/libxfs/xfs_sb.h | 11
-rw-r--r--  fs/xfs/libxfs/xfs_shared.h | 33
-rw-r--r--  fs/xfs/libxfs/xfs_symlink_remote.c | 2
-rw-r--r--  fs/xfs/libxfs/xfs_trans_resv.c | 14
-rw-r--r--  fs/xfs/libxfs/xfs_trans_resv.h | 1
-rw-r--r--  fs/xfs/libxfs/xfs_types.h (renamed from fs/xfs/xfs_types.h) | 0
-rw-r--r--  fs/xfs/xfs_aops.c | 149
-rw-r--r--  fs/xfs/xfs_aops.h | 3
-rw-r--r--  fs/xfs/xfs_bmap_util.h | 37
-rw-r--r--  fs/xfs/xfs_buf.c | 13
-rw-r--r--  fs/xfs/xfs_buf_item.c | 6
-rw-r--r--  fs/xfs/xfs_dquot.h | 2
-rw-r--r--  fs/xfs/xfs_export.c | 6
-rw-r--r--  fs/xfs/xfs_file.c | 81
-rw-r--r--  fs/xfs/xfs_fsops.c | 40
-rw-r--r--  fs/xfs/xfs_inode.c | 136
-rw-r--r--  fs/xfs/xfs_inode.h | 11
-rw-r--r--  fs/xfs/xfs_ioctl.c | 510
-rw-r--r--  fs/xfs/xfs_ioctl32.c | 2
-rw-r--r--  fs/xfs/xfs_iomap.c | 2
-rw-r--r--  fs/xfs/xfs_iomap.h | 2
-rw-r--r--  fs/xfs/xfs_iops.c | 34
-rw-r--r--  fs/xfs/xfs_iops.h | 1
-rw-r--r--  fs/xfs/xfs_log.c | 28
-rw-r--r--  fs/xfs/xfs_mount.c | 107
-rw-r--r--  fs/xfs/xfs_mount.h | 16
-rw-r--r--  fs/xfs/xfs_pnfs.c | 322
-rw-r--r--  fs/xfs/xfs_pnfs.h | 18
-rw-r--r--  fs/xfs/xfs_qm.c | 55
-rw-r--r--  fs/xfs/xfs_qm.h | 5
-rw-r--r--  fs/xfs/xfs_qm_syscalls.c | 244
-rw-r--r--  fs/xfs/xfs_quotaops.c | 67
-rw-r--r--  fs/xfs/xfs_super.c | 27
-rw-r--r--  fs/xfs/xfs_sysctl.c | 18
-rw-r--r--  fs/xfs/xfs_trans.c | 1
-rw-r--r--  fs/xfs/xfs_trans_buf.c | 5
396 files changed, 15379 insertions, 6965 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 6894b085f0ee..620d93489539 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -335,7 +335,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 	}
 	init_rwsem(&v9ses->rename_sem);
 
-	rc = bdi_setup_and_register(&v9ses->bdi, "9p", BDI_CAP_MAP_COPY);
+	rc = bdi_setup_and_register(&v9ses->bdi, "9p");
 	if (rc) {
 		kfree(v9ses->aname);
 		kfree(v9ses->uname);
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 5594505e6e73..b40133796b87 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -831,7 +831,6 @@ static const struct vm_operations_struct v9fs_file_vm_ops = {
 	.fault = filemap_fault,
 	.map_pages = filemap_map_pages,
 	.page_mkwrite = v9fs_vm_page_mkwrite,
-	.remap_pages = generic_file_remap_pages,
 };
 
 static const struct vm_operations_struct v9fs_mmap_file_vm_ops = {
@@ -839,7 +838,6 @@ static const struct vm_operations_struct v9fs_mmap_file_vm_ops = {
 	.fault = filemap_fault,
 	.map_pages = filemap_map_pages,
 	.page_mkwrite = v9fs_vm_page_mkwrite,
-	.remap_pages = generic_file_remap_pages,
 };
 
 
diff --git a/fs/Kconfig b/fs/Kconfig
index 664991afe0c0..ec35851e5b71 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -13,13 +13,6 @@ if BLOCK
 source "fs/ext2/Kconfig"
 source "fs/ext3/Kconfig"
 source "fs/ext4/Kconfig"
-
-config FS_XIP
-# execute in place
-	bool
-	depends on EXT2_FS_XIP
-	default y
-
 source "fs/jbd/Kconfig"
 source "fs/jbd2/Kconfig"
 
@@ -40,6 +33,21 @@ source "fs/ocfs2/Kconfig"
 source "fs/btrfs/Kconfig"
 source "fs/nilfs2/Kconfig"
 
+config FS_DAX
+	bool "Direct Access (DAX) support"
+	depends on MMU
+	depends on !(ARM || MIPS || SPARC)
+	help
+	  Direct Access (DAX) can be used on memory-backed block devices.
+	  If the block device supports DAX and the filesystem supports DAX,
+	  then you can avoid using the pagecache to buffer I/Os.  Turning
+	  on this option will compile in support for DAX; you will need to
+	  mount the filesystem using the -o dax option.
+
+	  If you do not have a block device that is capable of using this,
+	  or if unsure, say N.  Saying Y will increase the size of the kernel
+	  by about 5kB.
+
 endif # BLOCK
 
 # Posix ACL utility routines
@@ -165,6 +173,7 @@ config HUGETLB_PAGE
 	def_bool HUGETLBFS
 
 source "fs/configfs/Kconfig"
+source "fs/efivarfs/Kconfig"
 
 endmenu
 
@@ -209,7 +218,6 @@ source "fs/sysv/Kconfig"
 source "fs/ufs/Kconfig"
 source "fs/exofs/Kconfig"
 source "fs/f2fs/Kconfig"
-source "fs/efivarfs/Kconfig"
 
 endif # MISC_FILESYSTEMS
 
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index c055d56ec63d..270c48148f79 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -149,13 +149,6 @@ config BINFMT_EM86
 	  later load the module when you want to use a Linux/Intel binary. The
 	  module will be called binfmt_em86. If unsure, say Y.
 
-config BINFMT_SOM
-	tristate "Kernel support for SOM binaries"
-	depends on PARISC && HPUX
-	help
-	  SOM is a binary executable format inherited from HP/UX. Say
-	  Y here to be able to load and execute SOM binaries directly.
-
 config BINFMT_MISC
 	tristate "Kernel support for MISC binaries"
 	---help---
diff --git a/fs/Makefile b/fs/Makefile
index bedff48e8fdc..a88ac4838c9e 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -28,6 +28,7 @@ obj-$(CONFIG_SIGNALFD) += signalfd.o
 obj-$(CONFIG_TIMERFD)		+= timerfd.o
 obj-$(CONFIG_EVENTFD)		+= eventfd.o
 obj-$(CONFIG_AIO)		+= aio.o
+obj-$(CONFIG_FS_DAX)		+= dax.o
 obj-$(CONFIG_FILE_LOCKING)	+= locks.o
 obj-$(CONFIG_COMPAT)		+= compat.o compat_ioctl.o
 obj-$(CONFIG_BINFMT_AOUT)	+= binfmt_aout.o
@@ -37,7 +38,6 @@ obj-$(CONFIG_BINFMT_SCRIPT) += binfmt_script.o
 obj-$(CONFIG_BINFMT_ELF)	+= binfmt_elf.o
 obj-$(CONFIG_COMPAT_BINFMT_ELF)	+= compat_binfmt_elf.o
 obj-$(CONFIG_BINFMT_ELF_FDPIC)	+= binfmt_elf_fdpic.o
-obj-$(CONFIG_BINFMT_SOM)	+= binfmt_som.o
 obj-$(CONFIG_BINFMT_FLAT)	+= binfmt_flat.o
 
 obj-$(CONFIG_FS_MBCACHE)	+= mbcache.o
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index ff44ff3ff015..c8764bd7497d 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -30,6 +30,8 @@
 #define AFFS_AC_SIZE	(AFFS_CACHE_SIZE/sizeof(struct affs_ext_key)/2)
 #define AFFS_AC_MASK	(AFFS_AC_SIZE-1)
 
+#define AFFSNAMEMAX 30U
+
 struct affs_ext_key {
 	u32	ext;	/* idx of the extended block */
 	u32	key;	/* block number */
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index c852f2fa1710..388da1ea815d 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -30,7 +30,7 @@ affs_insert_hash(struct inode *dir, struct buffer_head *bh)
 	ino = bh->b_blocknr;
 	offset = affs_hash_name(sb, AFFS_TAIL(sb, bh)->name + 1, AFFS_TAIL(sb, bh)->name[0]);
 
-	pr_debug("%s(dir=%u, ino=%d)\n", __func__, (u32)dir->i_ino, ino);
+	pr_debug("%s(dir=%lu, ino=%d)\n", __func__, dir->i_ino, ino);
 
 	dir_bh = affs_bread(sb, dir->i_ino);
 	if (!dir_bh)
@@ -80,8 +80,8 @@ affs_remove_hash(struct inode *dir, struct buffer_head *rem_bh)
 	sb = dir->i_sb;
 	rem_ino = rem_bh->b_blocknr;
 	offset = affs_hash_name(sb, AFFS_TAIL(sb, rem_bh)->name+1, AFFS_TAIL(sb, rem_bh)->name[0]);
-	pr_debug("%s(dir=%d, ino=%d, hashval=%d)\n",
-		__func__, (u32)dir->i_ino, rem_ino, offset);
+	pr_debug("%s(dir=%lu, ino=%d, hashval=%d)\n", __func__, dir->i_ino,
+		 rem_ino, offset);
 
 	bh = affs_bread(sb, dir->i_ino);
 	if (!bh)
@@ -483,11 +483,10 @@ affs_check_name(const unsigned char *name, int len, bool notruncate)
 {
 	int	 i;
 
-	if (len > 30) {
+	if (len > AFFSNAMEMAX) {
 		if (notruncate)
 			return -ENAMETOOLONG;
-		else
-			len = 30;
+		len = AFFSNAMEMAX;
 	}
 	for (i = 0; i < len; i++) {
 		if (name[i] < ' ' || name[i] == ':'
@@ -508,7 +507,7 @@ affs_check_name(const unsigned char *name, int len, bool notruncate)
 int
 affs_copy_name(unsigned char *bstr, struct dentry *dentry)
 {
-	int len = min(dentry->d_name.len, 30u);
+	u32 len = min(dentry->d_name.len, AFFSNAMEMAX);
 
 	*bstr++ = len;
 	memcpy(bstr, dentry->d_name.name, len);
diff --git a/fs/affs/bitmap.c b/fs/affs/bitmap.c
index c8de51185c23..675148950fed 100644
--- a/fs/affs/bitmap.c
+++ b/fs/affs/bitmap.c
@@ -99,7 +99,6 @@ err_bh_read:
 
 err_range:
 	affs_error(sb, "affs_free_block","Block %u outside partition", block);
-	return;
 }
 
 /*
diff --git a/fs/affs/dir.c b/fs/affs/dir.c
index 59f07bec92a6..ac4f318aafba 100644
--- a/fs/affs/dir.c
+++ b/fs/affs/dir.c
@@ -54,8 +54,7 @@ affs_readdir(struct file *file, struct dir_context *ctx)
 	u32			 ino;
 	int			 error = 0;
 
-	pr_debug("%s(ino=%lu,f_pos=%lx)\n",
-		 __func__, inode->i_ino, (unsigned long)ctx->pos);
+	pr_debug("%s(ino=%lu,f_pos=%llx)\n", __func__, inode->i_ino, ctx->pos);
 
 	if (ctx->pos < 2) {
 		file->private_data = (void *)0;
@@ -115,11 +114,11 @@ inside:
 			break;
 		}
 
-		namelen = min(AFFS_TAIL(sb, fh_bh)->name[0], (u8)30);
+		namelen = min(AFFS_TAIL(sb, fh_bh)->name[0],
+			      (u8)AFFSNAMEMAX);
 		name = AFFS_TAIL(sb, fh_bh)->name + 1;
-		pr_debug("readdir(): dir_emit(\"%.*s\", "
-			 "ino=%u), hash=%d, f_pos=%x\n",
-			 namelen, name, ino, hash_pos, (u32)ctx->pos);
+		pr_debug("readdir(): dir_emit(\"%.*s\", ino=%u), hash=%d, f_pos=%llx\n",
+			 namelen, name, ino, hash_pos, ctx->pos);
 
 		if (!dir_emit(ctx, name, namelen, ino, DT_UNKNOWN))
 			goto done;
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 8faa6593ca6d..d2468bf95669 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -180,8 +180,7 @@ affs_get_extblock_slow(struct inode *inode, u32 ext)
 		ext_key = be32_to_cpu(AFFS_TAIL(sb, bh)->extension);
 		if (ext < AFFS_I(inode)->i_extcnt)
 			goto read_ext;
-		if (ext > AFFS_I(inode)->i_extcnt)
-			BUG();
+		BUG_ON(ext > AFFS_I(inode)->i_extcnt);
 		bh = affs_alloc_extblock(inode, bh, ext);
 		if (IS_ERR(bh))
 			return bh;
@@ -198,8 +197,7 @@ affs_get_extblock_slow(struct inode *inode, u32 ext)
 		struct buffer_head *prev_bh;
 
 		/* allocate a new extended block */
-		if (ext > AFFS_I(inode)->i_extcnt)
-			BUG();
+		BUG_ON(ext > AFFS_I(inode)->i_extcnt);
 
 		/* get previous extended block */
 		prev_bh = affs_get_extblock(inode, ext - 1);
@@ -299,8 +297,8 @@ affs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_resul
 	struct buffer_head	*ext_bh;
 	u32			 ext;
 
-	pr_debug("%s(%u, %lu)\n",
-		 __func__, (u32)inode->i_ino, (unsigned long)block);
+	pr_debug("%s(%lu, %llu)\n", __func__, inode->i_ino,
+		 (unsigned long long)block);
 
 	BUG_ON(block > (sector_t)0x7fffffffUL);
 
@@ -330,8 +328,9 @@ affs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_resul
 
 		/* store new block */
 		if (bh_result->b_blocknr)
-			affs_warning(sb, "get_block", "block already set (%lx)",
-				     (unsigned long)bh_result->b_blocknr);
+			affs_warning(sb, "get_block",
+				     "block already set (%llx)",
+				     (unsigned long long)bh_result->b_blocknr);
 		AFFS_BLOCK(sb, ext_bh, block) = cpu_to_be32(blocknr);
 		AFFS_HEAD(ext_bh)->block_count = cpu_to_be32(block + 1);
 		affs_adjust_checksum(ext_bh, blocknr - bh_result->b_blocknr + 1);
@@ -353,8 +352,8 @@ affs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_resul
 	return 0;
 
 err_big:
-	affs_error(inode->i_sb, "get_block", "strange block request %d",
-		   (int)block);
+	affs_error(inode->i_sb, "get_block", "strange block request %llu",
+		   (unsigned long long)block);
 	return -EIO;
 err_ext:
 	// unlock cache
@@ -399,6 +398,13 @@ affs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
 	size_t count = iov_iter_count(iter);
 	ssize_t ret;
 
+	if (rw == WRITE) {
+		loff_t size = offset + count;
+
+		if (AFFS_I(inode)->mmu_private < size)
+			return 0;
+	}
+
 	ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, affs_get_block);
 	if (ret < 0 && (rw & WRITE))
 		affs_write_failed(mapping, offset + count);
@@ -503,7 +509,7 @@ affs_do_readpage_ofs(struct page *page, unsigned to)
 	u32 bidx, boff, bsize;
 	u32 tmp;
 
-	pr_debug("%s(%u, %ld, 0, %d)\n", __func__, (u32)inode->i_ino,
+	pr_debug("%s(%lu, %ld, 0, %d)\n", __func__, inode->i_ino,
 		 page->index, to);
 	BUG_ON(to > PAGE_CACHE_SIZE);
 	kmap(page);
@@ -539,7 +545,7 @@ affs_extent_file_ofs(struct inode *inode, u32 newsize)
 	u32 size, bsize;
 	u32 tmp;
 
-	pr_debug("%s(%u, %d)\n", __func__, (u32)inode->i_ino, newsize);
+	pr_debug("%s(%lu, %d)\n", __func__, inode->i_ino, newsize);
 	bsize = AFFS_SB(sb)->s_data_blksize;
 	bh = NULL;
 	size = AFFS_I(inode)->mmu_private;
@@ -608,7 +614,7 @@ affs_readpage_ofs(struct file *file, struct page *page)
 	u32 to;
 	int err;
 
-	pr_debug("%s(%u, %ld)\n", __func__, (u32)inode->i_ino, page->index);
+	pr_debug("%s(%lu, %ld)\n", __func__, inode->i_ino, page->index);
 	to = PAGE_CACHE_SIZE;
 	if (((page->index + 1) << PAGE_CACHE_SHIFT) > inode->i_size) {
 		to = inode->i_size & ~PAGE_CACHE_MASK;
@@ -631,8 +637,8 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping
 	pgoff_t index;
 	int err = 0;
 
-	pr_debug("%s(%u, %llu, %llu)\n", __func__, (u32)inode->i_ino,
-		 (unsigned long long)pos, (unsigned long long)pos + len);
+	pr_debug("%s(%lu, %llu, %llu)\n", __func__, inode->i_ino, pos,
+		 pos + len);
 	if (pos > AFFS_I(inode)->mmu_private) {
 		/* XXX: this probably leaves a too-big i_size in case of
 		 * failure. Should really be updating i_size at write_end time
@@ -681,9 +687,8 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
 	 * due to write_begin.
 	 */
 
-	pr_debug("%s(%u, %llu, %llu)\n",
-		 __func__, (u32)inode->i_ino, (unsigned long long)pos,
-		 (unsigned long long)pos + len);
+	pr_debug("%s(%lu, %llu, %llu)\n", __func__, inode->i_ino, pos,
+		 pos + len);
 	bsize = AFFS_SB(sb)->s_data_blksize;
 	data = page_address(page);
 
@@ -831,8 +836,8 @@ affs_truncate(struct inode *inode)
 	struct buffer_head *ext_bh;
 	int i;
 
-	pr_debug("truncate(inode=%d, oldsize=%u, newsize=%u)\n",
-		 (u32)inode->i_ino, (u32)AFFS_I(inode)->mmu_private, (u32)inode->i_size);
+	pr_debug("truncate(inode=%lu, oldsize=%llu, newsize=%llu)\n",
+		 inode->i_ino, AFFS_I(inode)->mmu_private, inode->i_size);
 
 	last_blk = 0;
 	ext = 0;
@@ -863,7 +868,7 @@ affs_truncate(struct inode *inode)
 	if (IS_ERR(ext_bh)) {
 		affs_warning(sb, "truncate",
 			     "unexpected read error for ext block %u (%ld)",
-			     (unsigned int)ext, PTR_ERR(ext_bh));
+			     ext, PTR_ERR(ext_bh));
 		return;
 	}
 	if (AFFS_I(inode)->i_lc) {
@@ -911,7 +916,7 @@ affs_truncate(struct inode *inode)
 	if (IS_ERR(bh)) {
 		affs_warning(sb, "truncate",
 			     "unexpected read error for last block %u (%ld)",
-			     (unsigned int)ext, PTR_ERR(bh));
+			     ext, PTR_ERR(bh));
 		return;
 	}
 	tmp = be32_to_cpu(AFFS_DATA_HEAD(bh)->next);
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index d0609a282e1d..6f34510449e8 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -13,8 +13,6 @@
 #include <linux/gfp.h>
 #include "affs.h"
 
-extern const struct inode_operations affs_symlink_inode_operations;
-
 struct inode *affs_iget(struct super_block *sb, unsigned long ino)
 {
 	struct affs_sb_info	*sbi = AFFS_SB(sb);
@@ -348,9 +346,8 @@ affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s3
 	u32 block = 0;
 	int retval;
 
-	pr_debug("%s(dir=%u, inode=%u, \"%pd\", type=%d)\n",
-		 __func__, (u32)dir->i_ino,
-		 (u32)inode->i_ino, dentry, type);
+	pr_debug("%s(dir=%lu, inode=%lu, \"%pd\", type=%d)\n", __func__,
+		 dir->i_ino, inode->i_ino, dentry, type);
 
 	retval = -EIO;
 	bh = affs_bread(sb, inode->i_ino);
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index bbc38530e924..ffb7bd82c2a5 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -64,15 +64,16 @@ __affs_hash_dentry(struct qstr *qstr, toupper_t toupper, bool notruncate)
 {
 	const u8 *name = qstr->name;
 	unsigned long hash;
-	int i;
+	int retval;
+	u32 len;
 
-	i = affs_check_name(qstr->name, qstr->len, notruncate);
-	if (i)
-		return i;
+	retval = affs_check_name(qstr->name, qstr->len, notruncate);
+	if (retval)
+		return retval;
 
 	hash = init_name_hash();
-	i = min(qstr->len, 30u);
-	for (; i > 0; name++, i--)
+	len = min(qstr->len, AFFSNAMEMAX);
+	for (; len > 0; name++, len--)
 		hash = partial_name_hash(toupper(*name), hash);
 	qstr->hash = end_name_hash(hash);
 
@@ -114,10 +115,10 @@ static inline int __affs_compare_dentry(unsigned int len,
 	 * If the names are longer than the allowed 30 chars,
 	 * the excess is ignored, so their length may differ.
 	 */
-	if (len >= 30) {
-		if (name->len < 30)
+	if (len >= AFFSNAMEMAX) {
+		if (name->len < AFFSNAMEMAX)
 			return 1;
-		len = 30;
+		len = AFFSNAMEMAX;
 	} else if (len != name->len)
 		return 1;
 
@@ -156,10 +157,10 @@ affs_match(struct dentry *dentry, const u8 *name2, toupper_t toupper)
 	const u8 *name = dentry->d_name.name;
 	int len = dentry->d_name.len;
 
-	if (len >= 30) {
-		if (*name2 < 30)
+	if (len >= AFFSNAMEMAX) {
+		if (*name2 < AFFSNAMEMAX)
 			return 0;
-		len = 30;
+		len = AFFSNAMEMAX;
 	} else if (len != *name2)
 		return 0;
 
@@ -173,9 +174,9 @@ int
 affs_hash_name(struct super_block *sb, const u8 *name, unsigned int len)
 {
 	toupper_t toupper = affs_get_toupper(sb);
-	int hash;
+	u32 hash;
 
-	hash = len = min(len, 30u);
+	hash = len = min(len, AFFSNAMEMAX);
 	for (; len > 0; len--)
 		hash = (hash * 13 + toupper(*name++)) & 0x7ff;
 
@@ -248,9 +249,8 @@ affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
 int
 affs_unlink(struct inode *dir, struct dentry *dentry)
 {
-	pr_debug("%s(dir=%d, %lu \"%pd\")\n",
-		 __func__, (u32)dir->i_ino, dentry->d_inode->i_ino,
-		 dentry);
+	pr_debug("%s(dir=%lu, %lu \"%pd\")\n", __func__, dir->i_ino,
+		 dentry->d_inode->i_ino, dentry);
 
 	return affs_remove_header(dentry);
 }
@@ -317,9 +317,8 @@ affs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 int
 affs_rmdir(struct inode *dir, struct dentry *dentry)
 {
-	pr_debug("%s(dir=%u, %lu \"%pd\")\n",
-		 __func__, (u32)dir->i_ino, dentry->d_inode->i_ino,
-		 dentry);
+	pr_debug("%s(dir=%lu, %lu \"%pd\")\n", __func__, dir->i_ino,
+		 dentry->d_inode->i_ino, dentry);
 
 	return affs_remove_header(dentry);
 }
@@ -404,8 +403,7 @@ affs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
 {
 	struct inode *inode = old_dentry->d_inode;
 
-	pr_debug("%s(%u, %u, \"%pd\")\n",
-		 __func__, (u32)inode->i_ino, (u32)dir->i_ino,
+	pr_debug("%s(%lu, %lu, \"%pd\")\n", __func__, inode->i_ino, dir->i_ino,
 		 dentry);
 
 	return affs_add_entry(dir, inode, dentry, ST_LINKFILE);
@@ -419,9 +417,8 @@ affs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct buffer_head *bh = NULL;
 	int retval;
 
-	pr_debug("%s(old=%u,\"%pd\" to new=%u,\"%pd\")\n",
-		 __func__, (u32)old_dir->i_ino, old_dentry,
-		 (u32)new_dir->i_ino, new_dentry);
+	pr_debug("%s(old=%lu,\"%pd\" to new=%lu,\"%pd\")\n", __func__,
+		 old_dir->i_ino, old_dentry, new_dir->i_ino, new_dentry);
 
 	retval = affs_check_name(new_dentry->d_name.name,
 				 new_dentry->d_name.len,
diff --git a/fs/affs/super.c b/fs/affs/super.c
index f754ab68a840..4cf0e9113fb6 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -432,39 +432,39 @@ got_root:
 		sb->s_flags |= MS_RDONLY;
 	}
 	switch (chksum) {
-		case MUFS_FS:
-		case MUFS_INTLFFS:
-		case MUFS_DCFFS:
-			sbi->s_flags |= SF_MUFS;
-			/* fall thru */
-		case FS_INTLFFS:
-		case FS_DCFFS:
-			sbi->s_flags |= SF_INTL;
-			break;
-		case MUFS_FFS:
-			sbi->s_flags |= SF_MUFS;
-			break;
-		case FS_FFS:
-			break;
-		case MUFS_OFS:
-			sbi->s_flags |= SF_MUFS;
-			/* fall thru */
-		case FS_OFS:
-			sbi->s_flags |= SF_OFS;
-			sb->s_flags |= MS_NOEXEC;
-			break;
-		case MUFS_DCOFS:
-		case MUFS_INTLOFS:
-			sbi->s_flags |= SF_MUFS;
-		case FS_DCOFS:
-		case FS_INTLOFS:
-			sbi->s_flags |= SF_INTL | SF_OFS;
-			sb->s_flags |= MS_NOEXEC;
-			break;
-		default:
-			pr_err("Unknown filesystem on device %s: %08X\n",
-			       sb->s_id, chksum);
-			return -EINVAL;
+	case MUFS_FS:
+	case MUFS_INTLFFS:
+	case MUFS_DCFFS:
+		sbi->s_flags |= SF_MUFS;
+		/* fall thru */
+	case FS_INTLFFS:
+	case FS_DCFFS:
+		sbi->s_flags |= SF_INTL;
+		break;
+	case MUFS_FFS:
+		sbi->s_flags |= SF_MUFS;
+		break;
+	case FS_FFS:
+		break;
+	case MUFS_OFS:
+		sbi->s_flags |= SF_MUFS;
+		/* fall thru */
+	case FS_OFS:
+		sbi->s_flags |= SF_OFS;
+		sb->s_flags |= MS_NOEXEC;
+		break;
+	case MUFS_DCOFS:
+	case MUFS_INTLOFS:
+		sbi->s_flags |= SF_MUFS;
+	case FS_DCOFS:
+	case FS_INTLOFS:
+		sbi->s_flags |= SF_INTL | SF_OFS;
+		sb->s_flags |= MS_NOEXEC;
+		break;
+	default:
+		pr_err("Unknown filesystem on device %s: %08X\n",
+		       sb->s_id, chksum);
+		return -EINVAL;
 	}
 
 	if (mount_flags & SF_VERBOSE) {
@@ -584,7 +584,7 @@ affs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_bavail  = free;
 	buf->f_fsid.val[0] = (u32)id;
 	buf->f_fsid.val[1] = (u32)(id >> 32);
-	buf->f_namelen = 30;
+	buf->f_namelen = AFFSNAMEMAX;
 	return 0;
 }
 
@@ -602,6 +602,7 @@ static void affs_kill_sb(struct super_block *sb)
 		affs_free_bitmap(sb);
 		affs_brelse(sbi->s_root_bh);
 		kfree(sbi->s_prefix);
+		mutex_destroy(&sbi->s_bmlock);
 		kfree(sbi);
 	}
 }
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 06e14bfb3496..dbc732e9a5c0 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -306,8 +306,8 @@ static int afs_send_pages(struct afs_call *call, struct msghdr *msg,
 
 		_debug("- range %u-%u%s",
 		       offset, to, msg->msg_flags ? " [more]" : "");
-		iov_iter_init(&msg->msg_iter, WRITE,
-			      (struct iovec *) iov, 1, to - offset);
+		iov_iter_kvec(&msg->msg_iter, WRITE | ITER_KVEC,
+			      iov, 1, to - offset);
 
 		/* have to change the state *before* sending the last
 		 * packet as RxRPC might give us the reply before it
@@ -384,7 +384,7 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
 
 	msg.msg_name		= NULL;
 	msg.msg_namelen		= 0;
-	iov_iter_init(&msg.msg_iter, WRITE, (struct iovec *)iov, 1,
+	iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, iov, 1,
 		      call->request_size);
 	msg.msg_control		= NULL;
 	msg.msg_controllen	= 0;
@@ -770,7 +770,7 @@ static int afs_deliver_cm_op_id(struct afs_call *call, struct sk_buff *skb,
 void afs_send_empty_reply(struct afs_call *call)
 {
 	struct msghdr msg;
-	struct iovec iov[1];
+	struct kvec iov[1];
 
 	_enter("");
 
@@ -778,7 +778,7 @@ void afs_send_empty_reply(struct afs_call *call)
 	iov[0].iov_len		= 0;
 	msg.msg_name		= NULL;
 	msg.msg_namelen		= 0;
-	iov_iter_init(&msg.msg_iter, WRITE, iov, 0, 0);	/* WTF? */
+	iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, iov, 0, 0);	/* WTF? */
 	msg.msg_control		= NULL;
 	msg.msg_controllen	= 0;
 	msg.msg_flags		= 0;
@@ -805,7 +805,7 @@ void afs_send_empty_reply(struct afs_call *call)
 void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
 {
 	struct msghdr msg;
-	struct iovec iov[1];
+	struct kvec iov[1];
 	int n;
 
 	_enter("");
@@ -814,7 +814,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
 	iov[0].iov_len		= len;
 	msg.msg_name		= NULL;
 	msg.msg_namelen		= 0;
-	iov_iter_init(&msg.msg_iter, WRITE, iov, 1, len);
+	iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, iov, 1, len);
 	msg.msg_control		= NULL;
 	msg.msg_controllen	= 0;
 	msg.msg_flags		= 0;
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 2b607257820c..d142a2449e65 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -106,7 +106,7 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params)
 	volume->cell		= params->cell;
 	volume->vid		= vlocation->vldb.vid[params->type];
 
-	ret = bdi_setup_and_register(&volume->bdi, "afs", BDI_CAP_MAP_COPY);
+	ret = bdi_setup_and_register(&volume->bdi, "afs");
 	if (ret)
 		goto error_bdi;
 
diff --git a/fs/aio.c b/fs/aio.c
index 1b7893ecc296..118a2e0088d8 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -165,15 +165,6 @@ static struct vfsmount *aio_mnt;
 static const struct file_operations aio_ring_fops;
 static const struct address_space_operations aio_ctx_aops;
 
-/* Backing dev info for aio fs.
- * -no dirty page accounting or writeback happens
- */
-static struct backing_dev_info aio_fs_backing_dev_info = {
-	.name		= "aiofs",
-	.state		= 0,
-	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_MAP_COPY,
-};
-
 static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
 {
 	struct qstr this = QSTR_INIT("[aio]", 5);
@@ -185,7 +176,6 @@ static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
 
 	inode->i_mapping->a_ops = &aio_ctx_aops;
 	inode->i_mapping->private_data = ctx;
-	inode->i_mapping->backing_dev_info = &aio_fs_backing_dev_info;
 	inode->i_size = PAGE_SIZE * nr_pages;
 
 	path.dentry = d_alloc_pseudo(aio_mnt->mnt_sb, &this);
@@ -230,9 +220,6 @@ static int __init aio_setup(void)
 	if (IS_ERR(aio_mnt))
 		panic("Failed to create aio fs mount.");
 
-	if (bdi_init(&aio_fs_backing_dev_info))
-		panic("Failed to init aio fs backing dev info.");
-
 	kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
 	kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
 
@@ -1140,6 +1127,13 @@ static long aio_read_events_ring(struct kioctx *ctx,
 	long ret = 0;
 	int copy_ret;
 
+	/*
+	 * The mutex can block and wake us up and that will cause
+	 * wait_event_interruptible_hrtimeout() to schedule without sleeping
+	 * and repeat. This should be rare enough that it doesn't cause
+	 * peformance issues. See the comment in read_events() for more detail.
+	 */
+	sched_annotate_sleep();
 	mutex_lock(&ctx->ring_lock);
 
 	/* Access to ->ring_pages here is protected by ctx->ring_lock. */
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index edf47774b03d..e089f1985fca 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -274,9 +274,9 @@ more:
 static struct inode *
 befs_alloc_inode(struct super_block *sb)
 {
 	struct befs_inode_info *bi;
-	bi = (struct befs_inode_info *)kmem_cache_alloc(befs_inode_cachep,
-							GFP_KERNEL);
+
+	bi = kmem_cache_alloc(befs_inode_cachep, GFP_KERNEL);
 	if (!bi)
 		return NULL;
 	return &bi->vfs_inode;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 02b16910f4c9..995986b8e36b 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -645,11 +645,12 @@ out:
 
 static unsigned long randomize_stack_top(unsigned long stack_top)
 {
-	unsigned int random_variable = 0;
+	unsigned long random_variable = 0;
 
 	if ((current->flags & PF_RANDOMIZE) &&
 		!(current->personality & ADDR_NO_RANDOMIZE)) {
-		random_variable = get_random_int() & STACK_RND_MASK;
+		random_variable = (unsigned long) get_random_int();
+		random_variable &= STACK_RND_MASK;
 		random_variable <<= PAGE_SHIFT;
 	}
 #ifdef CONFIG_STACK_GROWSUP
diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c
deleted file mode 100644
index 4e00ed68d4a6..000000000000
--- a/fs/binfmt_som.c
+++ /dev/null
@@ -1,299 +0,0 @@
-/*
- * linux/fs/binfmt_som.c
- *
- * These are the functions used to load SOM format executables as used
- * by HP-UX.
- *
- * Copyright 1999 Matthew Wilcox <willy@bofh.ai>
- * based on binfmt_elf which is
- * Copyright 1993, 1994: Eric Youngdale (ericy@cais.com).
- */
-
-#include <linux/module.h>
-
-#include <linux/fs.h>
-#include <linux/stat.h>
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/mman.h>
-#include <linux/errno.h>
-#include <linux/signal.h>
-#include <linux/binfmts.h>
-#include <linux/som.h>
-#include <linux/string.h>
-#include <linux/file.h>
-#include <linux/fcntl.h>
-#include <linux/ptrace.h>
-#include <linux/slab.h>
-#include <linux/shm.h>
-#include <linux/personality.h>
-#include <linux/init.h>
-
-#include <asm/uaccess.h>
-#include <asm/pgtable.h>
-
-
-#include <linux/elf.h>
-
-static int load_som_binary(struct linux_binprm * bprm);
-static int load_som_library(struct file *);
-
-/*
- * If we don't support core dumping, then supply a NULL so we
- * don't even try.
- */
-#if 0
-static int som_core_dump(struct coredump_params *cprm);
-#else
-#define som_core_dump	NULL
-#endif
-
-#define SOM_PAGESTART(_v) ((_v) & ~(unsigned long)(SOM_PAGESIZE-1))
-#define SOM_PAGEOFFSET(_v) ((_v) & (SOM_PAGESIZE-1))
-#define SOM_PAGEALIGN(_v) (((_v) + SOM_PAGESIZE - 1) & ~(SOM_PAGESIZE - 1))
-
-static struct linux_binfmt som_format = {
-	.module		= THIS_MODULE,
-	.load_binary	= load_som_binary,
-	.load_shlib	= load_som_library,
-	.core_dump	= som_core_dump,
-	.min_coredump	= SOM_PAGESIZE
-};
-
-/*
- * create_som_tables() parses the env- and arg-strings in new user
- * memory and creates the pointer tables from them, and puts their
- * addresses on the "stack", returning the new stack pointer value.
- */
-static void create_som_tables(struct linux_binprm *bprm)
-{
-	char **argv, **envp;
-	int argc = bprm->argc;
-	int envc = bprm->envc;
-	unsigned long p;
-	unsigned long *sp;
-
-	/* Word-align the stack pointer */
-	sp = (unsigned long *)((bprm->p + 3) & ~3);
-
-	envp = (char **) sp;
-	sp += envc + 1;
-	argv = (char **) sp;
-	sp += argc + 1;
-
-	__put_user((unsigned long) envp,++sp);
-	__put_user((unsigned long) argv,++sp);
-
-	__put_user(argc, ++sp);
-
-	bprm->p = (unsigned long) sp;
-
-	p = current->mm->arg_start;
-	while (argc-- > 0) {
-		__put_user((char *)p,argv++);
-		p += strlen_user((char *)p);
-	}
-	__put_user(NULL, argv);
-	current->mm->arg_end = current->mm->env_start = p;
-	while (envc-- > 0) {
-		__put_user((char *)p,envp++);
-		p += strlen_user((char *)p);
-	}
-	__put_user(NULL, envp);
-	current->mm->env_end = p;
-}
-
-static int check_som_header(struct som_hdr *som_ex)
-{
-	int *buf = (int *)som_ex;
-	int i, ck;
-
-	if (som_ex->system_id != SOM_SID_PARISC_1_0 &&
-	    som_ex->system_id != SOM_SID_PARISC_1_1 &&
-	    som_ex->system_id != SOM_SID_PARISC_2_0)
-		return -ENOEXEC;
-
-	if (som_ex->a_magic != SOM_EXEC_NONSHARE &&
-	    som_ex->a_magic != SOM_EXEC_SHARE &&
-	    som_ex->a_magic != SOM_EXEC_DEMAND)
-		return -ENOEXEC;
-
-	if (som_ex->version_id != SOM_ID_OLD &&
-	    som_ex->version_id != SOM_ID_NEW)
-		return -ENOEXEC;
-
-	ck = 0;
-	for (i=0; i<32; i++)
-		ck ^= buf[i];
-	if (ck != 0)
-		return -ENOEXEC;
-
-	return 0;
-}
-
-static int map_som_binary(struct file *file,
-	const struct som_exec_auxhdr *hpuxhdr)
-{
-	unsigned long code_start, code_size, data_start, data_size;
-	unsigned long bss_start, som_brk;
-	int retval;
-	int prot = PROT_READ | PROT_EXEC;
-	int flags = MAP_FIXED|MAP_PRIVATE|MAP_DENYWRITE|MAP_EXECUTABLE;
-
-	mm_segment_t old_fs = get_fs();
-	set_fs(get_ds());
-
-	code_start = SOM_PAGESTART(hpuxhdr->exec_tmem);
-	code_size = SOM_PAGEALIGN(hpuxhdr->exec_tsize);
-	current->mm->start_code = code_start;
-	current->mm->end_code = code_start + code_size;
-	retval = vm_mmap(file, code_start, code_size, prot,
-			flags, SOM_PAGESTART(hpuxhdr->exec_tfile));
-	if (retval < 0 && retval > -1024)
-		goto out;
-
-	data_start = SOM_PAGESTART(hpuxhdr->exec_dmem);
-	data_size = SOM_PAGEALIGN(hpuxhdr->exec_dsize);
-	current->mm->start_data = data_start;
-	current->mm->end_data = bss_start = data_start + data_size;
-	retval = vm_mmap(file, data_start, data_size,
-			prot | PROT_WRITE, flags,
-			SOM_PAGESTART(hpuxhdr->exec_dfile));
-	if (retval < 0 && retval > -1024)
-		goto out;
-
-	som_brk = bss_start + SOM_PAGEALIGN(hpuxhdr->exec_bsize);
-	current->mm->start_brk = current->mm->brk = som_brk;
-	retval = vm_mmap(NULL, bss_start, som_brk - bss_start,
-			prot | PROT_WRITE, MAP_FIXED | MAP_PRIVATE, 0);
-	if (retval > 0 || retval < -1024)
-		retval = 0;
-out:
-	set_fs(old_fs);
-	return retval;
-}
-
-
-/*
- * These are the functions used to load SOM executables and shared
- * libraries.  There is no binary dependent code anywhere else.
- */
-
-static int
-load_som_binary(struct linux_binprm * bprm)
-{
-	int retval;
-	unsigned int size;
-	unsigned long som_entry;
-	struct som_hdr *som_ex;
-	struct som_exec_auxhdr *hpuxhdr;
-	struct pt_regs *regs = current_pt_regs();
-
-	/* Get the exec-header */
-	som_ex = (struct som_hdr *) bprm->buf;
-
-	retval = check_som_header(som_ex);
-	if (retval != 0)
-		goto out;
-
-	/* Now read in the auxiliary header information */
-
-	retval = -ENOMEM;
-	size = som_ex->aux_header_size;
-	if (size > SOM_PAGESIZE)
-		goto out;
-	hpuxhdr = kmalloc(size, GFP_KERNEL);
-	if (!hpuxhdr)
-		goto out;
-
-	retval = kernel_read(bprm->file, som_ex->aux_header_location,
-			(char *) hpuxhdr, size);
-	if (retval != size) {
-		if (retval >= 0)
-			retval = -EIO;
-		goto out_free;
-	}
-
-	/* Flush all traces of the currently running executable */
-	retval = flush_old_exec(bprm);
-	if (retval)
-		goto out_free;
-
-	/* OK, This is the point of no return */
-	current->personality = PER_HPUX;
-	setup_new_exec(bprm);
-
-	/* Set the task size for HP-UX processes such that
-	 * the gateway page is outside the address space.
-	 * This can be fixed later, but for now, this is much
-	 * easier.
-	 */
-
-	current->thread.task_size = 0xc0000000;
-
-	/* Set map base to allow enough room for hp-ux heap growth */
-
-	current->thread.map_base = 0x80000000;
-
-	retval = map_som_binary(bprm->file, hpuxhdr);
-	if (retval < 0)
-		goto out_free;
-
-	som_entry = hpuxhdr->exec_entry;
-	kfree(hpuxhdr);
-
-	set_binfmt(&som_format);
-	install_exec_creds(bprm);
-	setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT);
-
-	create_som_tables(bprm);
-
-	current->mm->start_stack = bprm->p;
-
-#if 0
-	printk("(start_brk) %08lx\n" , (unsigned long) current->mm->start_brk);
-	printk("(end_code) %08lx\n" , (unsigned long) current->mm->end_code);
-	printk("(start_code) %08lx\n" , (unsigned long) current->mm->start_code);
-	printk("(end_data) %08lx\n" , (unsigned long) current->mm->end_data);
-	printk("(start_stack) %08lx\n" , (unsigned long) current->mm->start_stack);
-	printk("(brk) %08lx\n" , (unsigned long) current->mm->brk);
-#endif
-
-	map_hpux_gateway_page(current,current->mm);
-
-	start_thread_som(regs, som_entry, bprm->p);
-	return 0;
-
-	/* error cleanup */
-out_free:
-	kfree(hpuxhdr);
-out:
-	return retval;
-}
-
-static int load_som_library(struct file *f)
-{
-/* No lib support in SOM yet.  gizza chance.. */
-	return -ENOEXEC;
-}
-	/* Install the SOM loader.
-	 * N.B. We *rely* on the table being the right size with the
-	 * right number of free slots...
-	 */
-
-static int __init init_som_binfmt(void)
-{
-	register_binfmt(&som_format);
-	return 0;
-}
-
-static void __exit exit_som_binfmt(void)
-{
-	/* Remove the SOM loader. */
-	unregister_binfmt(&som_format);
-}
-
-core_initcall(init_som_binfmt);
-module_exit(exit_som_binfmt);
-
-MODULE_LICENSE("GPL");
diff --git a/fs/block_dev.c b/fs/block_dev.c
index b48c41bf0f86..975266be67d3 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -49,23 +49,15 @@ inline struct block_device *I_BDEV(struct inode *inode)
49} 49}
50EXPORT_SYMBOL(I_BDEV); 50EXPORT_SYMBOL(I_BDEV);
51 51
52/* 52static void bdev_write_inode(struct inode *inode)
53 * Move the inode from its current bdi to a new bdi. Make sure the inode
54 * is clean before moving so that it doesn't linger on the old bdi.
55 */
56static void bdev_inode_switch_bdi(struct inode *inode,
57 struct backing_dev_info *dst)
58{ 53{
59 while (true) { 54 spin_lock(&inode->i_lock);
60 spin_lock(&inode->i_lock); 55 while (inode->i_state & I_DIRTY) {
61 if (!(inode->i_state & I_DIRTY)) {
62 inode->i_data.backing_dev_info = dst;
63 spin_unlock(&inode->i_lock);
64 return;
65 }
66 spin_unlock(&inode->i_lock); 56 spin_unlock(&inode->i_lock);
67 WARN_ON_ONCE(write_inode_now(inode, true)); 57 WARN_ON_ONCE(write_inode_now(inode, true));
58 spin_lock(&inode->i_lock);
68 } 59 }
60 spin_unlock(&inode->i_lock);
69} 61}
70 62
71/* Kill _all_ buffers and pagecache , dirty or not.. */ 63/* Kill _all_ buffers and pagecache , dirty or not.. */
@@ -429,6 +421,46 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
429} 421}
430EXPORT_SYMBOL_GPL(bdev_write_page); 422EXPORT_SYMBOL_GPL(bdev_write_page);
431 423
424/**
425 * bdev_direct_access() - Get the address for directly-accessibly memory
426 * @bdev: The device containing the memory
427 * @sector: The offset within the device
428 * @addr: Where to put the address of the memory
429 * @pfn: The Page Frame Number for the memory
430 * @size: The number of bytes requested
431 *
432 * If a block device is made up of directly addressable memory, this function
433 * will tell the caller the PFN and the address of the memory. The address
434 * may be directly dereferenced within the kernel without the need to call
435 * ioremap(), kmap() or similar. The PFN is suitable for inserting into
436 * page tables.
437 *
438 * Return: negative errno if an error occurs, otherwise the number of bytes
439 * accessible at this address.
440 */
441long bdev_direct_access(struct block_device *bdev, sector_t sector,
442 void **addr, unsigned long *pfn, long size)
443{
444 long avail;
445 const struct block_device_operations *ops = bdev->bd_disk->fops;
446
447 if (size < 0)
448 return size;
449 if (!ops->direct_access)
450 return -EOPNOTSUPP;
451 if ((sector + DIV_ROUND_UP(size, 512)) >
452 part_nr_sects_read(bdev->bd_part))
453 return -ERANGE;
454 sector += get_start_sect(bdev);
455 if (sector % (PAGE_SIZE / 512))
456 return -EINVAL;
457 avail = ops->direct_access(bdev, sector, addr, pfn, size);
458 if (!avail)
459 return -ERANGE;
460 return min(avail, size);
461}
462EXPORT_SYMBOL_GPL(bdev_direct_access);
463
432/* 464/*
433 * pseudo-fs 465 * pseudo-fs
434 */ 466 */
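
A sketch of how a caller might use the new helper, loosely modelled on the DAX code this series introduces; the function name and the buffer_head-based addressing are assumptions for illustration:

#include <linux/blkdev.h>
#include <linux/buffer_head.h>

static long example_get_dax_addr(struct buffer_head *bh, void **addr,
				 unsigned blkbits)
{
	unsigned long pfn;
	sector_t sector = bh->b_blocknr << (blkbits - 9);

	/* Returns the number of directly addressable bytes, or -errno. */
	return bdev_direct_access(bh->b_bdev, sector, addr, &pfn, bh->b_size);
}
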
@@ -584,7 +616,6 @@ struct block_device *bdget(dev_t dev)
584 inode->i_bdev = bdev; 616 inode->i_bdev = bdev;
585 inode->i_data.a_ops = &def_blk_aops; 617 inode->i_data.a_ops = &def_blk_aops;
586 mapping_set_gfp_mask(&inode->i_data, GFP_USER); 618 mapping_set_gfp_mask(&inode->i_data, GFP_USER);
587 inode->i_data.backing_dev_info = &default_backing_dev_info;
588 spin_lock(&bdev_lock); 619 spin_lock(&bdev_lock);
589 list_add(&bdev->bd_list, &all_bdevs); 620 list_add(&bdev->bd_list, &all_bdevs);
590 spin_unlock(&bdev_lock); 621 spin_unlock(&bdev_lock);
@@ -1145,8 +1176,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1145 bdev->bd_queue = disk->queue; 1176 bdev->bd_queue = disk->queue;
1146 bdev->bd_contains = bdev; 1177 bdev->bd_contains = bdev;
1147 if (!partno) { 1178 if (!partno) {
1148 struct backing_dev_info *bdi;
1149
1150 ret = -ENXIO; 1179 ret = -ENXIO;
1151 bdev->bd_part = disk_get_part(disk, partno); 1180 bdev->bd_part = disk_get_part(disk, partno);
1152 if (!bdev->bd_part) 1181 if (!bdev->bd_part)
@@ -1172,11 +1201,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1172 } 1201 }
1173 } 1202 }
1174 1203
1175 if (!ret) { 1204 if (!ret)
1176 bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); 1205 bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
1177 bdi = blk_get_backing_dev_info(bdev);
1178 bdev_inode_switch_bdi(bdev->bd_inode, bdi);
1179 }
1180 1206
1181 /* 1207 /*
1182 * If the device is invalidated, rescan partition 1208 * If the device is invalidated, rescan partition
@@ -1203,8 +1229,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1203 if (ret) 1229 if (ret)
1204 goto out_clear; 1230 goto out_clear;
1205 bdev->bd_contains = whole; 1231 bdev->bd_contains = whole;
1206 bdev_inode_switch_bdi(bdev->bd_inode,
1207 whole->bd_inode->i_data.backing_dev_info);
1208 bdev->bd_part = disk_get_part(disk, partno); 1232 bdev->bd_part = disk_get_part(disk, partno);
1209 if (!(disk->flags & GENHD_FL_UP) || 1233 if (!(disk->flags & GENHD_FL_UP) ||
1210 !bdev->bd_part || !bdev->bd_part->nr_sects) { 1234 !bdev->bd_part || !bdev->bd_part->nr_sects) {
@@ -1244,7 +1268,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1244 bdev->bd_disk = NULL; 1268 bdev->bd_disk = NULL;
1245 bdev->bd_part = NULL; 1269 bdev->bd_part = NULL;
1246 bdev->bd_queue = NULL; 1270 bdev->bd_queue = NULL;
1247 bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info);
1248 if (bdev != bdev->bd_contains) 1271 if (bdev != bdev->bd_contains)
1249 __blkdev_put(bdev->bd_contains, mode, 1); 1272 __blkdev_put(bdev->bd_contains, mode, 1);
1250 bdev->bd_contains = NULL; 1273 bdev->bd_contains = NULL;
@@ -1464,11 +1487,11 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
1464 WARN_ON_ONCE(bdev->bd_holders); 1487 WARN_ON_ONCE(bdev->bd_holders);
1465 sync_blockdev(bdev); 1488 sync_blockdev(bdev);
1466 kill_bdev(bdev); 1489 kill_bdev(bdev);
1467 /* ->release can cause the old bdi to disappear, 1490 /*
1468 * so must switch it out first 1491 * ->release can cause the queue to disappear, so flush all
 1492 * dirty data beforehand.
1469 */ 1493 */
1470 bdev_inode_switch_bdi(bdev->bd_inode, 1494 bdev_write_inode(bdev->bd_inode);
1471 &default_backing_dev_info);
1472 } 1495 }
1473 if (bdev->bd_contains == bdev) { 1496 if (bdev->bd_contains == bdev) {
1474 if (disk->fops->release) 1497 if (disk->fops->release)
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index a66768ebc8d1..80e9c18ea64f 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -8,6 +8,7 @@ config BTRFS_FS
8 select LZO_DECOMPRESS 8 select LZO_DECOMPRESS
9 select RAID6_PQ 9 select RAID6_PQ
10 select XOR_BLOCKS 10 select XOR_BLOCKS
11 select SRCU
11 12
12 help 13 help
13 Btrfs is a general purpose copy-on-write filesystem with extents, 14 Btrfs is a general purpose copy-on-write filesystem with extents,
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 2d3e32ebfd15..f55721ff9385 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1246,25 +1246,6 @@ int btrfs_check_shared(struct btrfs_trans_handle *trans,
1246 return ret; 1246 return ret;
1247} 1247}
1248 1248
1249/*
1250 * this makes the path point to (inum INODE_ITEM ioff)
1251 */
1252int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
1253 struct btrfs_path *path)
1254{
1255 struct btrfs_key key;
1256 return btrfs_find_item(fs_root, path, inum, ioff,
1257 BTRFS_INODE_ITEM_KEY, &key);
1258}
1259
1260static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
1261 struct btrfs_path *path,
1262 struct btrfs_key *found_key)
1263{
1264 return btrfs_find_item(fs_root, path, inum, ioff,
1265 BTRFS_INODE_REF_KEY, found_key);
1266}
1267
1268int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, 1249int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
1269 u64 start_off, struct btrfs_path *path, 1250 u64 start_off, struct btrfs_path *path,
1270 struct btrfs_inode_extref **ret_extref, 1251 struct btrfs_inode_extref **ret_extref,
@@ -1374,7 +1355,8 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
1374 btrfs_tree_read_unlock_blocking(eb); 1355 btrfs_tree_read_unlock_blocking(eb);
1375 free_extent_buffer(eb); 1356 free_extent_buffer(eb);
1376 } 1357 }
1377 ret = inode_ref_info(parent, 0, fs_root, path, &found_key); 1358 ret = btrfs_find_item(fs_root, path, parent, 0,
1359 BTRFS_INODE_REF_KEY, &found_key);
1378 if (ret > 0) 1360 if (ret > 0)
1379 ret = -ENOENT; 1361 ret = -ENOENT;
1380 if (ret) 1362 if (ret)
@@ -1552,7 +1534,6 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
1552{ 1534{
1553 int ret; 1535 int ret;
1554 int type; 1536 int type;
1555 struct btrfs_tree_block_info *info;
1556 struct btrfs_extent_inline_ref *eiref; 1537 struct btrfs_extent_inline_ref *eiref;
1557 1538
1558 if (*ptr == (unsigned long)-1) 1539 if (*ptr == (unsigned long)-1)
@@ -1573,9 +1554,17 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
1573 } 1554 }
1574 1555
1575 /* we can treat both ref types equally here */ 1556 /* we can treat both ref types equally here */
1576 info = (struct btrfs_tree_block_info *)(ei + 1);
1577 *out_root = btrfs_extent_inline_ref_offset(eb, eiref); 1557 *out_root = btrfs_extent_inline_ref_offset(eb, eiref);
1578 *out_level = btrfs_tree_block_level(eb, info); 1558
1559 if (key->type == BTRFS_EXTENT_ITEM_KEY) {
1560 struct btrfs_tree_block_info *info;
1561
1562 info = (struct btrfs_tree_block_info *)(ei + 1);
1563 *out_level = btrfs_tree_block_level(eb, info);
1564 } else {
1565 ASSERT(key->type == BTRFS_METADATA_ITEM_KEY);
1566 *out_level = (u8)key->offset;
1567 }
1579 1568
1580 if (ret == 1) 1569 if (ret == 1)
1581 *ptr = (unsigned long)-1; 1570 *ptr = (unsigned long)-1;
@@ -1720,8 +1709,10 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
1720 struct btrfs_key found_key; 1709 struct btrfs_key found_key;
1721 1710
1722 while (!ret) { 1711 while (!ret) {
1723 ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path, 1712 ret = btrfs_find_item(fs_root, path, inum,
1724 &found_key); 1713 parent ? parent + 1 : 0, BTRFS_INODE_REF_KEY,
1714 &found_key);
1715
1725 if (ret < 0) 1716 if (ret < 0)
1726 break; 1717 break;
1727 if (ret) { 1718 if (ret) {
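
With the one-line wrappers removed, callers search for the key type they want directly. A sketch of the old inode_item_info() behaviour expressed through btrfs_find_item() (the example_ name is illustrative); note that the caller must now supply the path, since btrfs_find_item() no longer allocates one:

static int example_inode_item_info(struct btrfs_root *fs_root,
				   struct btrfs_path *path,
				   u64 inum, u64 ioff)
{
	struct btrfs_key key;

	/* Positions 'path' at (inum, BTRFS_INODE_ITEM_KEY, ioff). */
	return btrfs_find_item(fs_root, path, inum, ioff,
			       BTRFS_INODE_ITEM_KEY, &key);
}
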
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 2a1ac6bfc724..9c41fbac3009 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -32,9 +32,6 @@ struct inode_fs_paths {
32typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root, 32typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,
33 void *ctx); 33 void *ctx);
34 34
35int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
36 struct btrfs_path *path);
37
38int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, 35int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
39 struct btrfs_path *path, struct btrfs_key *found_key, 36 struct btrfs_path *path, struct btrfs_key *found_key,
40 u64 *flags); 37 u64 *flags);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 4aadadcfab20..de5e4f2adfea 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -185,6 +185,9 @@ struct btrfs_inode {
185 185
186 struct btrfs_delayed_node *delayed_node; 186 struct btrfs_delayed_node *delayed_node;
187 187
188 /* File creation time. */
189 struct timespec i_otime;
190
188 struct inode vfs_inode; 191 struct inode vfs_inode;
189}; 192};
190 193
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 14a72ed14ef7..993642199326 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -213,11 +213,19 @@ static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
213 */ 213 */
214static void add_root_to_dirty_list(struct btrfs_root *root) 214static void add_root_to_dirty_list(struct btrfs_root *root)
215{ 215{
216 if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
217 !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
218 return;
219
216 spin_lock(&root->fs_info->trans_lock); 220 spin_lock(&root->fs_info->trans_lock);
217 if (test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state) && 221 if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
218 list_empty(&root->dirty_list)) { 222 /* Want the extent tree to be the last on the list */
219 list_add(&root->dirty_list, 223 if (root->objectid == BTRFS_EXTENT_TREE_OBJECTID)
220 &root->fs_info->dirty_cowonly_roots); 224 list_move_tail(&root->dirty_list,
225 &root->fs_info->dirty_cowonly_roots);
226 else
227 list_move(&root->dirty_list,
228 &root->fs_info->dirty_cowonly_roots);
221 } 229 }
222 spin_unlock(&root->fs_info->trans_lock); 230 spin_unlock(&root->fs_info->trans_lock);
223} 231}
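
The reworked helper pairs a lock-free test_bit() fast path with test_and_set_bit() under the lock, so repeated calls stay cheap and the list insertion happens exactly once. A self-contained sketch of the idiom, with illustrative names:

#include <linux/bitops.h>
#include <linux/list.h>
#include <linux/spinlock.h>

#define EXAMPLE_DIRTY 0

struct example_root {
	unsigned long state;
	struct list_head dirty_list;	/* INIT_LIST_HEAD() before first use */
};

static void example_mark_dirty(struct example_root *root,
			       struct list_head *dirty_roots,
			       spinlock_t *lock)
{
	/* Fast path: bit already set means the root is already queued. */
	if (test_bit(EXAMPLE_DIRTY, &root->state))
		return;

	spin_lock(lock);
	if (!test_and_set_bit(EXAMPLE_DIRTY, &root->state))
		list_move(&root->dirty_list, dirty_roots);
	spin_unlock(lock);
}
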
@@ -1363,8 +1371,7 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
1363 1371
1364 if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) { 1372 if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1365 BUG_ON(tm->slot != 0); 1373 BUG_ON(tm->slot != 0);
1366 eb_rewin = alloc_dummy_extent_buffer(eb->start, 1374 eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
1367 fs_info->tree_root->nodesize);
1368 if (!eb_rewin) { 1375 if (!eb_rewin) {
1369 btrfs_tree_read_unlock_blocking(eb); 1376 btrfs_tree_read_unlock_blocking(eb);
1370 free_extent_buffer(eb); 1377 free_extent_buffer(eb);
@@ -1444,7 +1451,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
1444 } else if (old_root) { 1451 } else if (old_root) {
1445 btrfs_tree_read_unlock(eb_root); 1452 btrfs_tree_read_unlock(eb_root);
1446 free_extent_buffer(eb_root); 1453 free_extent_buffer(eb_root);
1447 eb = alloc_dummy_extent_buffer(logical, root->nodesize); 1454 eb = alloc_dummy_extent_buffer(root->fs_info, logical);
1448 } else { 1455 } else {
1449 btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK); 1456 btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
1450 eb = btrfs_clone_extent_buffer(eb_root); 1457 eb = btrfs_clone_extent_buffer(eb_root);
@@ -2282,7 +2289,7 @@ static void reada_for_search(struct btrfs_root *root,
2282 if ((search <= target && target - search <= 65536) || 2289 if ((search <= target && target - search <= 65536) ||
2283 (search > target && search - target <= 65536)) { 2290 (search > target && search - target <= 65536)) {
2284 gen = btrfs_node_ptr_generation(node, nr); 2291 gen = btrfs_node_ptr_generation(node, nr);
2285 readahead_tree_block(root, search, blocksize); 2292 readahead_tree_block(root, search);
2286 nread += blocksize; 2293 nread += blocksize;
2287 } 2294 }
2288 nscan++; 2295 nscan++;
@@ -2301,7 +2308,6 @@ static noinline void reada_for_balance(struct btrfs_root *root,
2301 u64 gen; 2308 u64 gen;
2302 u64 block1 = 0; 2309 u64 block1 = 0;
2303 u64 block2 = 0; 2310 u64 block2 = 0;
2304 int blocksize;
2305 2311
2306 parent = path->nodes[level + 1]; 2312 parent = path->nodes[level + 1];
2307 if (!parent) 2313 if (!parent)
@@ -2309,7 +2315,6 @@ static noinline void reada_for_balance(struct btrfs_root *root,
2309 2315
2310 nritems = btrfs_header_nritems(parent); 2316 nritems = btrfs_header_nritems(parent);
2311 slot = path->slots[level + 1]; 2317 slot = path->slots[level + 1];
2312 blocksize = root->nodesize;
2313 2318
2314 if (slot > 0) { 2319 if (slot > 0) {
2315 block1 = btrfs_node_blockptr(parent, slot - 1); 2320 block1 = btrfs_node_blockptr(parent, slot - 1);
@@ -2334,9 +2339,9 @@ static noinline void reada_for_balance(struct btrfs_root *root,
2334 } 2339 }
2335 2340
2336 if (block1) 2341 if (block1)
2337 readahead_tree_block(root, block1, blocksize); 2342 readahead_tree_block(root, block1);
2338 if (block2) 2343 if (block2)
2339 readahead_tree_block(root, block2, blocksize); 2344 readahead_tree_block(root, block2);
2340} 2345}
2341 2346
2342 2347
@@ -2609,32 +2614,24 @@ static int key_search(struct extent_buffer *b, struct btrfs_key *key,
2609 return 0; 2614 return 0;
2610} 2615}
2611 2616
2612int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *found_path, 2617int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
2613 u64 iobjectid, u64 ioff, u8 key_type, 2618 u64 iobjectid, u64 ioff, u8 key_type,
2614 struct btrfs_key *found_key) 2619 struct btrfs_key *found_key)
2615{ 2620{
2616 int ret; 2621 int ret;
2617 struct btrfs_key key; 2622 struct btrfs_key key;
2618 struct extent_buffer *eb; 2623 struct extent_buffer *eb;
2619 struct btrfs_path *path; 2624
2625 ASSERT(path);
2626 ASSERT(found_key);
2620 2627
2621 key.type = key_type; 2628 key.type = key_type;
2622 key.objectid = iobjectid; 2629 key.objectid = iobjectid;
2623 key.offset = ioff; 2630 key.offset = ioff;
2624 2631
2625 if (found_path == NULL) {
2626 path = btrfs_alloc_path();
2627 if (!path)
2628 return -ENOMEM;
2629 } else
2630 path = found_path;
2631
2632 ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0); 2632 ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
2633 if ((ret < 0) || (found_key == NULL)) { 2633 if (ret < 0)
2634 if (path != found_path)
2635 btrfs_free_path(path);
2636 return ret; 2634 return ret;
2637 }
2638 2635
2639 eb = path->nodes[0]; 2636 eb = path->nodes[0];
2640 if (ret && path->slots[0] >= btrfs_header_nritems(eb)) { 2637 if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
@@ -3383,7 +3380,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
3383 add_root_to_dirty_list(root); 3380 add_root_to_dirty_list(root);
3384 extent_buffer_get(c); 3381 extent_buffer_get(c);
3385 path->nodes[level] = c; 3382 path->nodes[level] = c;
3386 path->locks[level] = BTRFS_WRITE_LOCK; 3383 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
3387 path->slots[level] = 0; 3384 path->slots[level] = 0;
3388 return 0; 3385 return 0;
3389} 3386}
@@ -4356,13 +4353,15 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
4356 path->search_for_split = 1; 4353 path->search_for_split = 1;
4357 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 4354 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
4358 path->search_for_split = 0; 4355 path->search_for_split = 0;
4356 if (ret > 0)
4357 ret = -EAGAIN;
4359 if (ret < 0) 4358 if (ret < 0)
4360 goto err; 4359 goto err;
4361 4360
4362 ret = -EAGAIN; 4361 ret = -EAGAIN;
4363 leaf = path->nodes[0]; 4362 leaf = path->nodes[0];
4364 /* if our item isn't there or got smaller, return now */ 4363 /* if our item isn't there, return now */
4365 if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0])) 4364 if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
4366 goto err; 4365 goto err;
4367 4366
4368 /* the leaf has changed, it now has room. return now */ 4367 /* the leaf has changed, it now has room. return now */
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 7e607416755a..84c3b00f3de8 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -198,6 +198,8 @@ static int btrfs_csum_sizes[] = { 4, 0 };
198 198
199#define BTRFS_DIRTY_METADATA_THRESH (32 * 1024 * 1024) 199#define BTRFS_DIRTY_METADATA_THRESH (32 * 1024 * 1024)
200 200
201#define BTRFS_MAX_EXTENT_SIZE (128 * 1024 * 1024)
202
201/* 203/*
202 * The key defines the order in the tree, and so it also defines (optimal) 204 * The key defines the order in the tree, and so it also defines (optimal)
203 * block layout. 205 * block layout.
@@ -1020,6 +1022,9 @@ enum btrfs_raid_types {
1020 BTRFS_BLOCK_GROUP_RAID6 | \ 1022 BTRFS_BLOCK_GROUP_RAID6 | \
1021 BTRFS_BLOCK_GROUP_DUP | \ 1023 BTRFS_BLOCK_GROUP_DUP | \
1022 BTRFS_BLOCK_GROUP_RAID10) 1024 BTRFS_BLOCK_GROUP_RAID10)
1025#define BTRFS_BLOCK_GROUP_RAID56_MASK (BTRFS_BLOCK_GROUP_RAID5 | \
1026 BTRFS_BLOCK_GROUP_RAID6)
1027
1023/* 1028/*
1024 * We need a bit for restriper to be able to tell when chunks of type 1029 * We need a bit for restriper to be able to tell when chunks of type
1025 * SINGLE are available. This "extended" profile format is used in 1030 * SINGLE are available. This "extended" profile format is used in
@@ -1171,6 +1176,7 @@ struct btrfs_space_info {
1171 struct percpu_counter total_bytes_pinned; 1176 struct percpu_counter total_bytes_pinned;
1172 1177
1173 struct list_head list; 1178 struct list_head list;
1179 /* Protected by the spinlock 'lock'. */
1174 struct list_head ro_bgs; 1180 struct list_head ro_bgs;
1175 1181
1176 struct rw_semaphore groups_sem; 1182 struct rw_semaphore groups_sem;
@@ -1238,7 +1244,6 @@ enum btrfs_disk_cache_state {
1238 BTRFS_DC_ERROR = 1, 1244 BTRFS_DC_ERROR = 1,
1239 BTRFS_DC_CLEAR = 2, 1245 BTRFS_DC_CLEAR = 2,
1240 BTRFS_DC_SETUP = 3, 1246 BTRFS_DC_SETUP = 3,
1241 BTRFS_DC_NEED_WRITE = 4,
1242}; 1247};
1243 1248
1244struct btrfs_caching_control { 1249struct btrfs_caching_control {
@@ -1276,7 +1281,6 @@ struct btrfs_block_group_cache {
1276 unsigned long full_stripe_len; 1281 unsigned long full_stripe_len;
1277 1282
1278 unsigned int ro:1; 1283 unsigned int ro:1;
1279 unsigned int dirty:1;
1280 unsigned int iref:1; 1284 unsigned int iref:1;
1281 unsigned int has_caching_ctl:1; 1285 unsigned int has_caching_ctl:1;
1282 unsigned int removed:1; 1286 unsigned int removed:1;
@@ -1314,6 +1318,9 @@ struct btrfs_block_group_cache {
1314 struct list_head ro_list; 1318 struct list_head ro_list;
1315 1319
1316 atomic_t trimming; 1320 atomic_t trimming;
1321
1322 /* For dirty block groups */
1323 struct list_head dirty_list;
1317}; 1324};
1318 1325
1319/* delayed seq elem */ 1326/* delayed seq elem */
@@ -1740,6 +1747,7 @@ struct btrfs_fs_info {
1740 1747
1741 spinlock_t unused_bgs_lock; 1748 spinlock_t unused_bgs_lock;
1742 struct list_head unused_bgs; 1749 struct list_head unused_bgs;
1750 struct mutex unused_bg_unpin_mutex;
1743 1751
1744 /* For btrfs to record security options */ 1752 /* For btrfs to record security options */
1745 struct security_mnt_opts security_opts; 1753 struct security_mnt_opts security_opts;
@@ -1775,6 +1783,7 @@ struct btrfs_subvolume_writers {
1775#define BTRFS_ROOT_DEFRAG_RUNNING 6 1783#define BTRFS_ROOT_DEFRAG_RUNNING 6
1776#define BTRFS_ROOT_FORCE_COW 7 1784#define BTRFS_ROOT_FORCE_COW 7
1777#define BTRFS_ROOT_MULTI_LOG_TASKS 8 1785#define BTRFS_ROOT_MULTI_LOG_TASKS 8
1786#define BTRFS_ROOT_DIRTY 9
1778 1787
1779/* 1788/*
1780 * in ram representation of the tree. extent_root is used for all allocations 1789 * in ram representation of the tree. extent_root is used for all allocations
@@ -1793,8 +1802,6 @@ struct btrfs_root {
1793 struct btrfs_fs_info *fs_info; 1802 struct btrfs_fs_info *fs_info;
1794 struct extent_io_tree dirty_log_pages; 1803 struct extent_io_tree dirty_log_pages;
1795 1804
1796 struct kobject root_kobj;
1797 struct completion kobj_unregister;
1798 struct mutex objectid_mutex; 1805 struct mutex objectid_mutex;
1799 1806
1800 spinlock_t accounting_lock; 1807 spinlock_t accounting_lock;
@@ -2464,31 +2471,6 @@ BTRFS_SETGET_STACK_FUNCS(stack_inode_gid, struct btrfs_inode_item, gid, 32);
2464BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32); 2471BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32);
2465BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64); 2472BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64);
2466BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64); 2473BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64);
2467
2468static inline struct btrfs_timespec *
2469btrfs_inode_atime(struct btrfs_inode_item *inode_item)
2470{
2471 unsigned long ptr = (unsigned long)inode_item;
2472 ptr += offsetof(struct btrfs_inode_item, atime);
2473 return (struct btrfs_timespec *)ptr;
2474}
2475
2476static inline struct btrfs_timespec *
2477btrfs_inode_mtime(struct btrfs_inode_item *inode_item)
2478{
2479 unsigned long ptr = (unsigned long)inode_item;
2480 ptr += offsetof(struct btrfs_inode_item, mtime);
2481 return (struct btrfs_timespec *)ptr;
2482}
2483
2484static inline struct btrfs_timespec *
2485btrfs_inode_ctime(struct btrfs_inode_item *inode_item)
2486{
2487 unsigned long ptr = (unsigned long)inode_item;
2488 ptr += offsetof(struct btrfs_inode_item, ctime);
2489 return (struct btrfs_timespec *)ptr;
2490}
2491
2492BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64); 2474BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64);
2493BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32); 2475BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
2494BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64); 2476BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64);
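
The deleted accessors only recomputed addresses the compiler already knows: offsetof()-based pointer math into an embedded member is identical to taking the member's address. A small sketch of the equivalence, mirrored by the delayed-inode hunk below (the example_ name is illustrative):

static void example_equivalence(struct btrfs_inode_item *item)
{
	struct btrfs_timespec *a, *b;

	a = (struct btrfs_timespec *)((unsigned long)item +
			offsetof(struct btrfs_inode_item, atime));
	b = &item->atime;	/* identical pointer; helper not needed */
	WARN_ON(a != b);
}
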
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 054577bddaf2..82f0c7c95474 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1755,27 +1755,31 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
1755 btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags); 1755 btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags);
1756 btrfs_set_stack_inode_block_group(inode_item, 0); 1756 btrfs_set_stack_inode_block_group(inode_item, 0);
1757 1757
1758 btrfs_set_stack_timespec_sec(btrfs_inode_atime(inode_item), 1758 btrfs_set_stack_timespec_sec(&inode_item->atime,
1759 inode->i_atime.tv_sec); 1759 inode->i_atime.tv_sec);
1760 btrfs_set_stack_timespec_nsec(btrfs_inode_atime(inode_item), 1760 btrfs_set_stack_timespec_nsec(&inode_item->atime,
1761 inode->i_atime.tv_nsec); 1761 inode->i_atime.tv_nsec);
1762 1762
1763 btrfs_set_stack_timespec_sec(btrfs_inode_mtime(inode_item), 1763 btrfs_set_stack_timespec_sec(&inode_item->mtime,
1764 inode->i_mtime.tv_sec); 1764 inode->i_mtime.tv_sec);
1765 btrfs_set_stack_timespec_nsec(btrfs_inode_mtime(inode_item), 1765 btrfs_set_stack_timespec_nsec(&inode_item->mtime,
1766 inode->i_mtime.tv_nsec); 1766 inode->i_mtime.tv_nsec);
1767 1767
1768 btrfs_set_stack_timespec_sec(btrfs_inode_ctime(inode_item), 1768 btrfs_set_stack_timespec_sec(&inode_item->ctime,
1769 inode->i_ctime.tv_sec); 1769 inode->i_ctime.tv_sec);
1770 btrfs_set_stack_timespec_nsec(btrfs_inode_ctime(inode_item), 1770 btrfs_set_stack_timespec_nsec(&inode_item->ctime,
1771 inode->i_ctime.tv_nsec); 1771 inode->i_ctime.tv_nsec);
1772
1773 btrfs_set_stack_timespec_sec(&inode_item->otime,
1774 BTRFS_I(inode)->i_otime.tv_sec);
1775 btrfs_set_stack_timespec_nsec(&inode_item->otime,
1776 BTRFS_I(inode)->i_otime.tv_nsec);
1772} 1777}
1773 1778
1774int btrfs_fill_inode(struct inode *inode, u32 *rdev) 1779int btrfs_fill_inode(struct inode *inode, u32 *rdev)
1775{ 1780{
1776 struct btrfs_delayed_node *delayed_node; 1781 struct btrfs_delayed_node *delayed_node;
1777 struct btrfs_inode_item *inode_item; 1782 struct btrfs_inode_item *inode_item;
1778 struct btrfs_timespec *tspec;
1779 1783
1780 delayed_node = btrfs_get_delayed_node(inode); 1784 delayed_node = btrfs_get_delayed_node(inode);
1781 if (!delayed_node) 1785 if (!delayed_node)
@@ -1802,17 +1806,19 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
1802 *rdev = btrfs_stack_inode_rdev(inode_item); 1806 *rdev = btrfs_stack_inode_rdev(inode_item);
1803 BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item); 1807 BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item);
1804 1808
1805 tspec = btrfs_inode_atime(inode_item); 1809 inode->i_atime.tv_sec = btrfs_stack_timespec_sec(&inode_item->atime);
1806 inode->i_atime.tv_sec = btrfs_stack_timespec_sec(tspec); 1810 inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->atime);
1807 inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(tspec); 1811
1812 inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(&inode_item->mtime);
1813 inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->mtime);
1808 1814
1809 tspec = btrfs_inode_mtime(inode_item); 1815 inode->i_ctime.tv_sec = btrfs_stack_timespec_sec(&inode_item->ctime);
1810 inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(tspec); 1816 inode->i_ctime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->ctime);
1811 inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(tspec);
1812 1817
1813 tspec = btrfs_inode_ctime(inode_item); 1818 BTRFS_I(inode)->i_otime.tv_sec =
1814 inode->i_ctime.tv_sec = btrfs_stack_timespec_sec(tspec); 1819 btrfs_stack_timespec_sec(&inode_item->otime);
1815 inode->i_ctime.tv_nsec = btrfs_stack_timespec_nsec(tspec); 1820 BTRFS_I(inode)->i_otime.tv_nsec =
1821 btrfs_stack_timespec_nsec(&inode_item->otime);
1816 1822
1817 inode->i_generation = BTRFS_I(inode)->generation; 1823 inode->i_generation = BTRFS_I(inode)->generation;
1818 BTRFS_I(inode)->index_cnt = (u64)-1; 1824 BTRFS_I(inode)->index_cnt = (u64)-1;
@@ -1857,6 +1863,14 @@ int btrfs_delayed_delete_inode_ref(struct inode *inode)
1857{ 1863{
1858 struct btrfs_delayed_node *delayed_node; 1864 struct btrfs_delayed_node *delayed_node;
1859 1865
1866 /*
 1867 * We don't do delayed inode updates during log recovery because it
 1868 * leads to ENOSPC problems. This means we also can't do delayed
 1869 * inode refs.
1870 */
1871 if (BTRFS_I(inode)->root->fs_info->log_root_recovering)
1872 return -EAGAIN;
1873
1860 delayed_node = btrfs_get_or_create_delayed_node(inode); 1874 delayed_node = btrfs_get_or_create_delayed_node(inode);
1861 if (IS_ERR(delayed_node)) 1875 if (IS_ERR(delayed_node))
1862 return PTR_ERR(delayed_node); 1876 return PTR_ERR(delayed_node);
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index ca6a3a3b6b6c..5ec03d999c37 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -440,18 +440,9 @@ leave:
440 */ 440 */
441static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info) 441static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
442{ 442{
443 s64 writers;
444 DEFINE_WAIT(wait);
445
446 set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state); 443 set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
447 do { 444 wait_event(fs_info->replace_wait, !percpu_counter_sum(
448 prepare_to_wait(&fs_info->replace_wait, &wait, 445 &fs_info->bio_counter));
449 TASK_UNINTERRUPTIBLE);
450 writers = percpu_counter_sum(&fs_info->bio_counter);
451 if (writers)
452 schedule();
453 finish_wait(&fs_info->replace_wait, &wait);
454 } while (writers);
455} 446}
456 447
457/* 448/*
@@ -932,15 +923,15 @@ void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount)
932 923
933void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info) 924void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info)
934{ 925{
935 DEFINE_WAIT(wait); 926 while (1) {
936again: 927 percpu_counter_inc(&fs_info->bio_counter);
937 percpu_counter_inc(&fs_info->bio_counter); 928 if (likely(!test_bit(BTRFS_FS_STATE_DEV_REPLACING,
938 if (test_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state)) { 929 &fs_info->fs_state)))
930 break;
931
939 btrfs_bio_counter_dec(fs_info); 932 btrfs_bio_counter_dec(fs_info);
940 wait_event(fs_info->replace_wait, 933 wait_event(fs_info->replace_wait,
941 !test_bit(BTRFS_FS_STATE_DEV_REPLACING, 934 !test_bit(BTRFS_FS_STATE_DEV_REPLACING,
942 &fs_info->fs_state)); 935 &fs_info->fs_state));
943 goto again;
944 } 936 }
945
946} 937}
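
Both hunks replace open-coded prepare_to_wait()/schedule()/finish_wait() loops with wait_event(), which expresses the same sleep-until-condition logic in one call. A minimal self-contained sketch of the waiter/waker pairing (names are illustrative):

#include <linux/wait.h>
#include <linux/atomic.h>

static DECLARE_WAIT_QUEUE_HEAD(example_wait);
static atomic_t example_writers = ATOMIC_INIT(0);

static void example_wait_until_idle(void)
{
	/* Sleeps in TASK_UNINTERRUPTIBLE; recheck happens on each wakeup. */
	wait_event(example_wait, !atomic_read(&example_writers));
}

static void example_writer_done(void)
{
	if (atomic_dec_and_test(&example_writers))
		wake_up(&example_wait);
}
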
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8c63419a7f70..f79f38542a73 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -318,7 +318,7 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
318 memcpy(&found, result, csum_size); 318 memcpy(&found, result, csum_size);
319 319
320 read_extent_buffer(buf, &val, 0, csum_size); 320 read_extent_buffer(buf, &val, 0, csum_size);
321 printk_ratelimited(KERN_INFO 321 printk_ratelimited(KERN_WARNING
322 "BTRFS: %s checksum verify failed on %llu wanted %X found %X " 322 "BTRFS: %s checksum verify failed on %llu wanted %X found %X "
323 "level %d\n", 323 "level %d\n",
324 root->fs_info->sb->s_id, buf->start, 324 root->fs_info->sb->s_id, buf->start,
@@ -367,7 +367,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
367 ret = 0; 367 ret = 0;
368 goto out; 368 goto out;
369 } 369 }
370 printk_ratelimited(KERN_INFO "BTRFS (device %s): parent transid verify failed on %llu wanted %llu found %llu\n", 370 printk_ratelimited(KERN_ERR
371 "BTRFS (device %s): parent transid verify failed on %llu wanted %llu found %llu\n",
371 eb->fs_info->sb->s_id, eb->start, 372 eb->fs_info->sb->s_id, eb->start,
372 parent_transid, btrfs_header_generation(eb)); 373 parent_transid, btrfs_header_generation(eb));
373 ret = 1; 374 ret = 1;
@@ -633,21 +634,21 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
633 634
634 found_start = btrfs_header_bytenr(eb); 635 found_start = btrfs_header_bytenr(eb);
635 if (found_start != eb->start) { 636 if (found_start != eb->start) {
636 printk_ratelimited(KERN_INFO "BTRFS (device %s): bad tree block start " 637 printk_ratelimited(KERN_ERR "BTRFS (device %s): bad tree block start "
637 "%llu %llu\n", 638 "%llu %llu\n",
638 eb->fs_info->sb->s_id, found_start, eb->start); 639 eb->fs_info->sb->s_id, found_start, eb->start);
639 ret = -EIO; 640 ret = -EIO;
640 goto err; 641 goto err;
641 } 642 }
642 if (check_tree_block_fsid(root, eb)) { 643 if (check_tree_block_fsid(root, eb)) {
643 printk_ratelimited(KERN_INFO "BTRFS (device %s): bad fsid on block %llu\n", 644 printk_ratelimited(KERN_ERR "BTRFS (device %s): bad fsid on block %llu\n",
644 eb->fs_info->sb->s_id, eb->start); 645 eb->fs_info->sb->s_id, eb->start);
645 ret = -EIO; 646 ret = -EIO;
646 goto err; 647 goto err;
647 } 648 }
648 found_level = btrfs_header_level(eb); 649 found_level = btrfs_header_level(eb);
649 if (found_level >= BTRFS_MAX_LEVEL) { 650 if (found_level >= BTRFS_MAX_LEVEL) {
650 btrfs_info(root->fs_info, "bad tree block level %d", 651 btrfs_err(root->fs_info, "bad tree block level %d",
651 (int)btrfs_header_level(eb)); 652 (int)btrfs_header_level(eb));
652 ret = -EIO; 653 ret = -EIO;
653 goto err; 654 goto err;
@@ -1073,12 +1074,12 @@ static const struct address_space_operations btree_aops = {
1073 .set_page_dirty = btree_set_page_dirty, 1074 .set_page_dirty = btree_set_page_dirty,
1074}; 1075};
1075 1076
1076void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize) 1077void readahead_tree_block(struct btrfs_root *root, u64 bytenr)
1077{ 1078{
1078 struct extent_buffer *buf = NULL; 1079 struct extent_buffer *buf = NULL;
1079 struct inode *btree_inode = root->fs_info->btree_inode; 1080 struct inode *btree_inode = root->fs_info->btree_inode;
1080 1081
1081 buf = btrfs_find_create_tree_block(root, bytenr, blocksize); 1082 buf = btrfs_find_create_tree_block(root, bytenr);
1082 if (!buf) 1083 if (!buf)
1083 return; 1084 return;
1084 read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, 1085 read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
@@ -1086,7 +1087,7 @@ void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize)
1086 free_extent_buffer(buf); 1087 free_extent_buffer(buf);
1087} 1088}
1088 1089
1089int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, 1090int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
1090 int mirror_num, struct extent_buffer **eb) 1091 int mirror_num, struct extent_buffer **eb)
1091{ 1092{
1092 struct extent_buffer *buf = NULL; 1093 struct extent_buffer *buf = NULL;
@@ -1094,7 +1095,7 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
1094 struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree; 1095 struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree;
1095 int ret; 1096 int ret;
1096 1097
1097 buf = btrfs_find_create_tree_block(root, bytenr, blocksize); 1098 buf = btrfs_find_create_tree_block(root, bytenr);
1098 if (!buf) 1099 if (!buf)
1099 return 0; 1100 return 0;
1100 1101
@@ -1125,12 +1126,11 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
1125} 1126}
1126 1127
1127struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, 1128struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
1128 u64 bytenr, u32 blocksize) 1129 u64 bytenr)
1129{ 1130{
1130 if (btrfs_test_is_dummy_root(root)) 1131 if (btrfs_test_is_dummy_root(root))
1131 return alloc_test_extent_buffer(root->fs_info, bytenr, 1132 return alloc_test_extent_buffer(root->fs_info, bytenr);
1132 blocksize); 1133 return alloc_extent_buffer(root->fs_info, bytenr);
1133 return alloc_extent_buffer(root->fs_info, bytenr, blocksize);
1134} 1134}
1135 1135
1136 1136
@@ -1152,7 +1152,7 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
1152 struct extent_buffer *buf = NULL; 1152 struct extent_buffer *buf = NULL;
1153 int ret; 1153 int ret;
1154 1154
1155 buf = btrfs_find_create_tree_block(root, bytenr, root->nodesize); 1155 buf = btrfs_find_create_tree_block(root, bytenr);
1156 if (!buf) 1156 if (!buf)
1157 return NULL; 1157 return NULL;
1158 1158
@@ -1275,12 +1275,10 @@ static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize,
1275 memset(&root->root_key, 0, sizeof(root->root_key)); 1275 memset(&root->root_key, 0, sizeof(root->root_key));
1276 memset(&root->root_item, 0, sizeof(root->root_item)); 1276 memset(&root->root_item, 0, sizeof(root->root_item));
1277 memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); 1277 memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
1278 memset(&root->root_kobj, 0, sizeof(root->root_kobj));
1279 if (fs_info) 1278 if (fs_info)
1280 root->defrag_trans_start = fs_info->generation; 1279 root->defrag_trans_start = fs_info->generation;
1281 else 1280 else
1282 root->defrag_trans_start = 0; 1281 root->defrag_trans_start = 0;
1283 init_completion(&root->kobj_unregister);
1284 root->root_key.objectid = objectid; 1282 root->root_key.objectid = objectid;
1285 root->anon_dev = 0; 1283 root->anon_dev = 0;
1286 1284
@@ -1630,6 +1628,8 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
1630 bool check_ref) 1628 bool check_ref)
1631{ 1629{
1632 struct btrfs_root *root; 1630 struct btrfs_root *root;
1631 struct btrfs_path *path;
1632 struct btrfs_key key;
1633 int ret; 1633 int ret;
1634 1634
1635 if (location->objectid == BTRFS_ROOT_TREE_OBJECTID) 1635 if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
@@ -1669,8 +1669,17 @@ again:
1669 if (ret) 1669 if (ret)
1670 goto fail; 1670 goto fail;
1671 1671
1672 ret = btrfs_find_item(fs_info->tree_root, NULL, BTRFS_ORPHAN_OBJECTID, 1672 path = btrfs_alloc_path();
1673 location->objectid, BTRFS_ORPHAN_ITEM_KEY, NULL); 1673 if (!path) {
1674 ret = -ENOMEM;
1675 goto fail;
1676 }
1677 key.objectid = BTRFS_ORPHAN_OBJECTID;
1678 key.type = BTRFS_ORPHAN_ITEM_KEY;
1679 key.offset = location->objectid;
1680
1681 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
1682 btrfs_free_path(path);
1674 if (ret < 0) 1683 if (ret < 0)
1675 goto fail; 1684 goto fail;
1676 if (ret == 0) 1685 if (ret == 0)
@@ -1715,12 +1724,11 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
1715{ 1724{
1716 int err; 1725 int err;
1717 1726
1718 bdi->capabilities = BDI_CAP_MAP_COPY; 1727 err = bdi_setup_and_register(bdi, "btrfs");
1719 err = bdi_setup_and_register(bdi, "btrfs", BDI_CAP_MAP_COPY);
1720 if (err) 1728 if (err)
1721 return err; 1729 return err;
1722 1730
1723 bdi->ra_pages = default_backing_dev_info.ra_pages; 1731 bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE;
1724 bdi->congested_fn = btrfs_congested_fn; 1732 bdi->congested_fn = btrfs_congested_fn;
1725 bdi->congested_data = info; 1733 bdi->congested_data = info;
1726 return 0; 1734 return 0;
@@ -2233,6 +2241,7 @@ int open_ctree(struct super_block *sb,
2233 spin_lock_init(&fs_info->qgroup_op_lock); 2241 spin_lock_init(&fs_info->qgroup_op_lock);
2234 spin_lock_init(&fs_info->buffer_lock); 2242 spin_lock_init(&fs_info->buffer_lock);
2235 spin_lock_init(&fs_info->unused_bgs_lock); 2243 spin_lock_init(&fs_info->unused_bgs_lock);
2244 mutex_init(&fs_info->unused_bg_unpin_mutex);
2236 rwlock_init(&fs_info->tree_mod_log_lock); 2245 rwlock_init(&fs_info->tree_mod_log_lock);
2237 mutex_init(&fs_info->reloc_mutex); 2246 mutex_init(&fs_info->reloc_mutex);
2238 mutex_init(&fs_info->delalloc_root_mutex); 2247 mutex_init(&fs_info->delalloc_root_mutex);
@@ -2319,7 +2328,6 @@ int open_ctree(struct super_block *sb,
2319 */ 2328 */
2320 fs_info->btree_inode->i_size = OFFSET_MAX; 2329 fs_info->btree_inode->i_size = OFFSET_MAX;
2321 fs_info->btree_inode->i_mapping->a_ops = &btree_aops; 2330 fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
2322 fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi;
2323 2331
2324 RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node); 2332 RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
2325 extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree, 2333 extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
@@ -2498,7 +2506,7 @@ int open_ctree(struct super_block *sb,
2498 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; 2506 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
2499 2507
2500 if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA) 2508 if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
2501 printk(KERN_ERR "BTRFS: has skinny extents\n"); 2509 printk(KERN_INFO "BTRFS: has skinny extents\n");
2502 2510
2503 /* 2511 /*
2504 * flag our filesystem as having big metadata blocks if 2512 * flag our filesystem as having big metadata blocks if
@@ -2522,7 +2530,7 @@ int open_ctree(struct super_block *sb,
2522 */ 2530 */
2523 if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) && 2531 if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
2524 (sectorsize != nodesize)) { 2532 (sectorsize != nodesize)) {
2525 printk(KERN_WARNING "BTRFS: unequal leaf/node/sector sizes " 2533 printk(KERN_ERR "BTRFS: unequal leaf/node/sector sizes "
2526 "are not allowed for mixed block groups on %s\n", 2534 "are not allowed for mixed block groups on %s\n",
2527 sb->s_id); 2535 sb->s_id);
2528 goto fail_alloc; 2536 goto fail_alloc;
@@ -2630,12 +2638,12 @@ int open_ctree(struct super_block *sb,
2630 sb->s_blocksize_bits = blksize_bits(sectorsize); 2638 sb->s_blocksize_bits = blksize_bits(sectorsize);
2631 2639
2632 if (btrfs_super_magic(disk_super) != BTRFS_MAGIC) { 2640 if (btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
2633 printk(KERN_INFO "BTRFS: valid FS not found on %s\n", sb->s_id); 2641 printk(KERN_ERR "BTRFS: valid FS not found on %s\n", sb->s_id);
2634 goto fail_sb_buffer; 2642 goto fail_sb_buffer;
2635 } 2643 }
2636 2644
2637 if (sectorsize != PAGE_SIZE) { 2645 if (sectorsize != PAGE_SIZE) {
2638 printk(KERN_WARNING "BTRFS: Incompatible sector size(%lu) " 2646 printk(KERN_ERR "BTRFS: incompatible sector size (%lu) "
2639 "found on %s\n", (unsigned long)sectorsize, sb->s_id); 2647 "found on %s\n", (unsigned long)sectorsize, sb->s_id);
2640 goto fail_sb_buffer; 2648 goto fail_sb_buffer;
2641 } 2649 }
@@ -2644,7 +2652,7 @@ int open_ctree(struct super_block *sb,
2644 ret = btrfs_read_sys_array(tree_root); 2652 ret = btrfs_read_sys_array(tree_root);
2645 mutex_unlock(&fs_info->chunk_mutex); 2653 mutex_unlock(&fs_info->chunk_mutex);
2646 if (ret) { 2654 if (ret) {
2647 printk(KERN_WARNING "BTRFS: failed to read the system " 2655 printk(KERN_ERR "BTRFS: failed to read the system "
2648 "array on %s\n", sb->s_id); 2656 "array on %s\n", sb->s_id);
2649 goto fail_sb_buffer; 2657 goto fail_sb_buffer;
2650 } 2658 }
@@ -2659,7 +2667,7 @@ int open_ctree(struct super_block *sb,
2659 generation); 2667 generation);
2660 if (!chunk_root->node || 2668 if (!chunk_root->node ||
2661 !test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { 2669 !test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
2662 printk(KERN_WARNING "BTRFS: failed to read chunk root on %s\n", 2670 printk(KERN_ERR "BTRFS: failed to read chunk root on %s\n",
2663 sb->s_id); 2671 sb->s_id);
2664 goto fail_tree_roots; 2672 goto fail_tree_roots;
2665 } 2673 }
@@ -2671,7 +2679,7 @@ int open_ctree(struct super_block *sb,
2671 2679
2672 ret = btrfs_read_chunk_tree(chunk_root); 2680 ret = btrfs_read_chunk_tree(chunk_root);
2673 if (ret) { 2681 if (ret) {
2674 printk(KERN_WARNING "BTRFS: failed to read chunk tree on %s\n", 2682 printk(KERN_ERR "BTRFS: failed to read chunk tree on %s\n",
2675 sb->s_id); 2683 sb->s_id);
2676 goto fail_tree_roots; 2684 goto fail_tree_roots;
2677 } 2685 }
@@ -2683,7 +2691,7 @@ int open_ctree(struct super_block *sb,
2683 btrfs_close_extra_devices(fs_info, fs_devices, 0); 2691 btrfs_close_extra_devices(fs_info, fs_devices, 0);
2684 2692
2685 if (!fs_devices->latest_bdev) { 2693 if (!fs_devices->latest_bdev) {
2686 printk(KERN_CRIT "BTRFS: failed to read devices on %s\n", 2694 printk(KERN_ERR "BTRFS: failed to read devices on %s\n",
2687 sb->s_id); 2695 sb->s_id);
2688 goto fail_tree_roots; 2696 goto fail_tree_roots;
2689 } 2697 }
@@ -2767,7 +2775,7 @@ retry_root_backup:
2767 2775
2768 ret = btrfs_recover_balance(fs_info); 2776 ret = btrfs_recover_balance(fs_info);
2769 if (ret) { 2777 if (ret) {
2770 printk(KERN_WARNING "BTRFS: failed to recover balance\n"); 2778 printk(KERN_ERR "BTRFS: failed to recover balance\n");
2771 goto fail_block_groups; 2779 goto fail_block_groups;
2772 } 2780 }
2773 2781
@@ -3862,6 +3870,21 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
3862 printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n", 3870 printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n",
3863 btrfs_super_log_root(sb)); 3871 btrfs_super_log_root(sb));
3864 3872
3873 /*
 3874 * Check the lower bound; the alignment and other constraints are
3875 * checked later.
3876 */
3877 if (btrfs_super_nodesize(sb) < 4096) {
3878 printk(KERN_ERR "BTRFS: nodesize too small: %u < 4096\n",
3879 btrfs_super_nodesize(sb));
3880 ret = -EINVAL;
3881 }
3882 if (btrfs_super_sectorsize(sb) < 4096) {
3883 printk(KERN_ERR "BTRFS: sectorsize too small: %u < 4096\n",
3884 btrfs_super_sectorsize(sb));
3885 ret = -EINVAL;
3886 }
3887
3865 if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) { 3888 if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) {
3866 printk(KERN_ERR "BTRFS: dev_item UUID does not match fsid: %pU != %pU\n", 3889 printk(KERN_ERR "BTRFS: dev_item UUID does not match fsid: %pU != %pU\n",
3867 fs_info->fsid, sb->dev_item.fsid); 3890 fs_info->fsid, sb->dev_item.fsid);
@@ -3875,6 +3898,10 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
3875 if (btrfs_super_num_devices(sb) > (1UL << 31)) 3898 if (btrfs_super_num_devices(sb) > (1UL << 31))
3876 printk(KERN_WARNING "BTRFS: suspicious number of devices: %llu\n", 3899 printk(KERN_WARNING "BTRFS: suspicious number of devices: %llu\n",
3877 btrfs_super_num_devices(sb)); 3900 btrfs_super_num_devices(sb));
3901 if (btrfs_super_num_devices(sb) == 0) {
3902 printk(KERN_ERR "BTRFS: number of devices is 0\n");
3903 ret = -EINVAL;
3904 }
3878 3905
3879 if (btrfs_super_bytenr(sb) != BTRFS_SUPER_INFO_OFFSET) { 3906 if (btrfs_super_bytenr(sb) != BTRFS_SUPER_INFO_OFFSET) {
3880 printk(KERN_ERR "BTRFS: super offset mismatch %llu != %u\n", 3907 printk(KERN_ERR "BTRFS: super offset mismatch %llu != %u\n",
@@ -3883,6 +3910,25 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
3883 } 3910 }
3884 3911
3885 /* 3912 /*
 3913 * Obvious sys_chunk_array corruption: it must hold at least one key
3914 * and one chunk
3915 */
3916 if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
3917 printk(KERN_ERR "BTRFS: system chunk array too big %u > %u\n",
3918 btrfs_super_sys_array_size(sb),
3919 BTRFS_SYSTEM_CHUNK_ARRAY_SIZE);
3920 ret = -EINVAL;
3921 }
3922 if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key)
3923 + sizeof(struct btrfs_chunk)) {
3924 printk(KERN_ERR "BTRFS: system chunk array too small %u < %lu\n",
3925 btrfs_super_sys_array_size(sb),
3926 sizeof(struct btrfs_disk_key)
3927 + sizeof(struct btrfs_chunk));
3928 ret = -EINVAL;
3929 }
3930
3931 /*
3886 * The generation is a global counter, we'll trust it more than the others 3932 * The generation is a global counter, we'll trust it more than the others
3887 * but it's still possible that it's the one that's wrong. 3933 * but it's still possible that it's the one that's wrong.
3888 */ 3934 */
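
The open-coded orphan lookup earlier in this file relies on btrfs_search_slot()'s return convention: negative on error, 0 when the exact key exists, positive when it does not. A sketch of a boolean-style wrapper built on that convention (the example_ name is illustrative):

static int example_key_exists(struct btrfs_root *root, struct btrfs_key *key)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* <0: error; 0: exact match; >0: key absent, slot points past it. */
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	btrfs_free_path(path);
	if (ret < 0)
		return ret;
	return ret == 0;
}
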
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 414651821fb3..27d44c0fd236 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -46,11 +46,11 @@ struct btrfs_fs_devices;
46 46
47struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, 47struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
48 u64 parent_transid); 48 u64 parent_transid);
49void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize); 49void readahead_tree_block(struct btrfs_root *root, u64 bytenr);
50int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, 50int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
51 int mirror_num, struct extent_buffer **eb); 51 int mirror_num, struct extent_buffer **eb);
52struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, 52struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
53 u64 bytenr, u32 blocksize); 53 u64 bytenr);
54void clean_tree_block(struct btrfs_trans_handle *trans, 54void clean_tree_block(struct btrfs_trans_handle *trans,
55 struct btrfs_root *root, struct extent_buffer *buf); 55 struct btrfs_root *root, struct extent_buffer *buf);
56int open_ctree(struct super_block *sb, 56int open_ctree(struct super_block *sb,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a80b97100d90..571f402d3fc4 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -74,8 +74,9 @@ enum {
74 RESERVE_ALLOC_NO_ACCOUNT = 2, 74 RESERVE_ALLOC_NO_ACCOUNT = 2,
75}; 75};
76 76
77static int update_block_group(struct btrfs_root *root, 77static int update_block_group(struct btrfs_trans_handle *trans,
78 u64 bytenr, u64 num_bytes, int alloc); 78 struct btrfs_root *root, u64 bytenr,
79 u64 num_bytes, int alloc);
79static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 80static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
80 struct btrfs_root *root, 81 struct btrfs_root *root,
81 u64 bytenr, u64 num_bytes, u64 parent, 82 u64 bytenr, u64 num_bytes, u64 parent,
@@ -1925,7 +1926,7 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1925 */ 1926 */
1926 ret = 0; 1927 ret = 0;
1927 } 1928 }
1928 kfree(bbio); 1929 btrfs_put_bbio(bbio);
1929 } 1930 }
1930 1931
1931 if (actual_bytes) 1932 if (actual_bytes)
@@ -2768,7 +2769,6 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2768 struct btrfs_delayed_ref_head *head; 2769 struct btrfs_delayed_ref_head *head;
2769 int ret; 2770 int ret;
2770 int run_all = count == (unsigned long)-1; 2771 int run_all = count == (unsigned long)-1;
2771 int run_most = 0;
2772 2772
2773 /* We'll clean this up in btrfs_cleanup_transaction */ 2773 /* We'll clean this up in btrfs_cleanup_transaction */
2774 if (trans->aborted) 2774 if (trans->aborted)
@@ -2778,10 +2778,8 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2778 root = root->fs_info->tree_root; 2778 root = root->fs_info->tree_root;
2779 2779
2780 delayed_refs = &trans->transaction->delayed_refs; 2780 delayed_refs = &trans->transaction->delayed_refs;
2781 if (count == 0) { 2781 if (count == 0)
2782 count = atomic_read(&delayed_refs->num_entries) * 2; 2782 count = atomic_read(&delayed_refs->num_entries) * 2;
2783 run_most = 1;
2784 }
2785 2783
2786again: 2784again:
2787#ifdef SCRAMBLE_DELAYED_REFS 2785#ifdef SCRAMBLE_DELAYED_REFS
@@ -3139,9 +3137,11 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
3139 struct extent_buffer *leaf; 3137 struct extent_buffer *leaf;
3140 3138
3141 ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1); 3139 ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
3142 if (ret < 0) 3140 if (ret) {
3141 if (ret > 0)
3142 ret = -ENOENT;
3143 goto fail; 3143 goto fail;
3144 BUG_ON(ret); /* Corruption */ 3144 }
3145 3145
3146 leaf = path->nodes[0]; 3146 leaf = path->nodes[0];
3147 bi = btrfs_item_ptr_offset(leaf, path->slots[0]); 3147 bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
@@ -3149,11 +3149,9 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
3149 btrfs_mark_buffer_dirty(leaf); 3149 btrfs_mark_buffer_dirty(leaf);
3150 btrfs_release_path(path); 3150 btrfs_release_path(path);
3151fail: 3151fail:
3152 if (ret) { 3152 if (ret)
3153 btrfs_abort_transaction(trans, root, ret); 3153 btrfs_abort_transaction(trans, root, ret);
3154 return ret; 3154 return ret;
3155 }
3156 return 0;
3157 3155
3158} 3156}
3159 3157
@@ -3315,120 +3313,42 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
3315 struct btrfs_root *root) 3313 struct btrfs_root *root)
3316{ 3314{
3317 struct btrfs_block_group_cache *cache; 3315 struct btrfs_block_group_cache *cache;
3318 int err = 0; 3316 struct btrfs_transaction *cur_trans = trans->transaction;
3317 int ret = 0;
3319 struct btrfs_path *path; 3318 struct btrfs_path *path;
3320 u64 last = 0; 3319
3320 if (list_empty(&cur_trans->dirty_bgs))
3321 return 0;
3321 3322
3322 path = btrfs_alloc_path(); 3323 path = btrfs_alloc_path();
3323 if (!path) 3324 if (!path)
3324 return -ENOMEM; 3325 return -ENOMEM;
3325 3326
3326again: 3327 /*
3327 while (1) { 3328 * We don't need the lock here since we are protected by the transaction
3328 cache = btrfs_lookup_first_block_group(root->fs_info, last); 3329 * commit. We want to do the cache_save_setup first and then run the
3329 while (cache) { 3330 * delayed refs to make sure we have the best chance at doing this all
3330 if (cache->disk_cache_state == BTRFS_DC_CLEAR) 3331 * in one shot.
3331 break; 3332 */
3332 cache = next_block_group(root, cache); 3333 while (!list_empty(&cur_trans->dirty_bgs)) {
3333 } 3334 cache = list_first_entry(&cur_trans->dirty_bgs,
3334 if (!cache) { 3335 struct btrfs_block_group_cache,
3335 if (last == 0) 3336 dirty_list);
3336 break; 3337 list_del_init(&cache->dirty_list);
3337 last = 0; 3338 if (cache->disk_cache_state == BTRFS_DC_CLEAR)
3338 continue; 3339 cache_save_setup(cache, trans, path);
3339 } 3340 if (!ret)
3340 err = cache_save_setup(cache, trans, path); 3341 ret = btrfs_run_delayed_refs(trans, root,
3341 last = cache->key.objectid + cache->key.offset; 3342 (unsigned long) -1);
3342 btrfs_put_block_group(cache); 3343 if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP)
3343 } 3344 btrfs_write_out_cache(root, trans, cache, path);
3344 3345 if (!ret)
3345 while (1) { 3346 ret = write_one_cache_group(trans, root, path, cache);
3346 if (last == 0) {
3347 err = btrfs_run_delayed_refs(trans, root,
3348 (unsigned long)-1);
3349 if (err) /* File system offline */
3350 goto out;
3351 }
3352
3353 cache = btrfs_lookup_first_block_group(root->fs_info, last);
3354 while (cache) {
3355 if (cache->disk_cache_state == BTRFS_DC_CLEAR) {
3356 btrfs_put_block_group(cache);
3357 goto again;
3358 }
3359
3360 if (cache->dirty)
3361 break;
3362 cache = next_block_group(root, cache);
3363 }
3364 if (!cache) {
3365 if (last == 0)
3366 break;
3367 last = 0;
3368 continue;
3369 }
3370
3371 if (cache->disk_cache_state == BTRFS_DC_SETUP)
3372 cache->disk_cache_state = BTRFS_DC_NEED_WRITE;
3373 cache->dirty = 0;
3374 last = cache->key.objectid + cache->key.offset;
3375
3376 err = write_one_cache_group(trans, root, path, cache);
3377 btrfs_put_block_group(cache);
3378 if (err) /* File system offline */
3379 goto out;
3380 }
3381
3382 while (1) {
3383 /*
3384 * I don't think this is needed since we're just marking our
3385 * preallocated extent as written, but just in case it can't
3386 * hurt.
3387 */
3388 if (last == 0) {
3389 err = btrfs_run_delayed_refs(trans, root,
3390 (unsigned long)-1);
3391 if (err) /* File system offline */
3392 goto out;
3393 }
3394
3395 cache = btrfs_lookup_first_block_group(root->fs_info, last);
3396 while (cache) {
3397 /*
3398 * Really this shouldn't happen, but it could if we
3399 * couldn't write the entire preallocated extent and
3400 * splitting the extent resulted in a new block.
3401 */
3402 if (cache->dirty) {
3403 btrfs_put_block_group(cache);
3404 goto again;
3405 }
3406 if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
3407 break;
3408 cache = next_block_group(root, cache);
3409 }
3410 if (!cache) {
3411 if (last == 0)
3412 break;
3413 last = 0;
3414 continue;
3415 }
3416
3417 err = btrfs_write_out_cache(root, trans, cache, path);
3418
3419 /*
3420 * If we didn't have an error then the cache state is still
3421 * NEED_WRITE, so we can set it to WRITTEN.
3422 */
3423 if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
3424 cache->disk_cache_state = BTRFS_DC_WRITTEN;
3425 last = cache->key.objectid + cache->key.offset;
3426 btrfs_put_block_group(cache); 3347 btrfs_put_block_group(cache);
3427 } 3348 }
3428out:
3429 3349
3430 btrfs_free_path(path); 3350 btrfs_free_path(path);
3431 return err; 3351 return ret;
3432} 3352}
3433 3353
3434int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) 3354int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
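
The rewrite turns three full scans of every block group into a single drain of the per-transaction dirty list; update_block_group() (further down) queues a group once and takes a reference that the drain loop drops. A minimal sketch of the drain idiom, with illustrative types:

#include <linux/list.h>

struct example_group {
	struct list_head dirty_list;
};

static void example_drain_dirty(struct list_head *dirty)
{
	struct example_group *g;

	while (!list_empty(dirty)) {
		g = list_first_entry(dirty, struct example_group, dirty_list);
		/* list_del_init() leaves the node safe to re-queue later. */
		list_del_init(&g->dirty_list);
		/* ... write the group out, then drop the list's reference. */
	}
}
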
@@ -5043,19 +4963,25 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root,
5043/** 4963/**
5044 * drop_outstanding_extent - drop an outstanding extent 4964 * drop_outstanding_extent - drop an outstanding extent
5045 * @inode: the inode we're dropping the extent for 4965 * @inode: the inode we're dropping the extent for
 4966 * @num_bytes: the number of bytes we're releasing.
5046 * 4967 *
5047 * This is called when we are freeing up an outstanding extent, either called 4968 * This is called when we are freeing up an outstanding extent, either called
5048 * after an error or after an extent is written. This will return the number of 4969 * after an error or after an extent is written. This will return the number of
5049 * reserved extents that need to be freed. This must be called with 4970 * reserved extents that need to be freed. This must be called with
5050 * BTRFS_I(inode)->lock held. 4971 * BTRFS_I(inode)->lock held.
5051 */ 4972 */
5052static unsigned drop_outstanding_extent(struct inode *inode) 4973static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes)
5053{ 4974{
5054 unsigned drop_inode_space = 0; 4975 unsigned drop_inode_space = 0;
5055 unsigned dropped_extents = 0; 4976 unsigned dropped_extents = 0;
4977 unsigned num_extents = 0;
5056 4978
5057 BUG_ON(!BTRFS_I(inode)->outstanding_extents); 4979 num_extents = (unsigned)div64_u64(num_bytes +
5058 BTRFS_I(inode)->outstanding_extents--; 4980 BTRFS_MAX_EXTENT_SIZE - 1,
4981 BTRFS_MAX_EXTENT_SIZE);
4982 ASSERT(num_extents);
4983 ASSERT(BTRFS_I(inode)->outstanding_extents >= num_extents);
4984 BTRFS_I(inode)->outstanding_extents -= num_extents;
5059 4985
5060 if (BTRFS_I(inode)->outstanding_extents == 0 && 4986 if (BTRFS_I(inode)->outstanding_extents == 0 &&
5061 test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, 4987 test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
@@ -5226,7 +5152,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
5226 5152
5227out_fail: 5153out_fail:
5228 spin_lock(&BTRFS_I(inode)->lock); 5154 spin_lock(&BTRFS_I(inode)->lock);
5229 dropped = drop_outstanding_extent(inode); 5155 dropped = drop_outstanding_extent(inode, num_bytes);
5230 /* 5156 /*
5231 * If the inodes csum_bytes is the same as the original 5157 * If the inodes csum_bytes is the same as the original
5232 * csum_bytes then we know we haven't raced with any free()ers 5158 * csum_bytes then we know we haven't raced with any free()ers
@@ -5305,7 +5231,7 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
5305 5231
5306 num_bytes = ALIGN(num_bytes, root->sectorsize); 5232 num_bytes = ALIGN(num_bytes, root->sectorsize);
5307 spin_lock(&BTRFS_I(inode)->lock); 5233 spin_lock(&BTRFS_I(inode)->lock);
5308 dropped = drop_outstanding_extent(inode); 5234 dropped = drop_outstanding_extent(inode, num_bytes);
5309 5235
5310 if (num_bytes) 5236 if (num_bytes)
5311 to_free = calc_csum_metadata_size(inode, num_bytes, 0); 5237 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
@@ -5375,8 +5301,9 @@ void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
5375 btrfs_free_reserved_data_space(inode, num_bytes); 5301 btrfs_free_reserved_data_space(inode, num_bytes);
5376} 5302}
5377 5303
5378static int update_block_group(struct btrfs_root *root, 5304static int update_block_group(struct btrfs_trans_handle *trans,
5379 u64 bytenr, u64 num_bytes, int alloc) 5305 struct btrfs_root *root, u64 bytenr,
5306 u64 num_bytes, int alloc)
5380{ 5307{
5381 struct btrfs_block_group_cache *cache = NULL; 5308 struct btrfs_block_group_cache *cache = NULL;
5382 struct btrfs_fs_info *info = root->fs_info; 5309 struct btrfs_fs_info *info = root->fs_info;
@@ -5414,6 +5341,14 @@ static int update_block_group(struct btrfs_root *root,
5414 if (!alloc && cache->cached == BTRFS_CACHE_NO) 5341 if (!alloc && cache->cached == BTRFS_CACHE_NO)
5415 cache_block_group(cache, 1); 5342 cache_block_group(cache, 1);
5416 5343
5344 spin_lock(&trans->transaction->dirty_bgs_lock);
5345 if (list_empty(&cache->dirty_list)) {
5346 list_add_tail(&cache->dirty_list,
5347 &trans->transaction->dirty_bgs);
5348 btrfs_get_block_group(cache);
5349 }
5350 spin_unlock(&trans->transaction->dirty_bgs_lock);
5351
5417 byte_in_group = bytenr - cache->key.objectid; 5352 byte_in_group = bytenr - cache->key.objectid;
5418 WARN_ON(byte_in_group > cache->key.offset); 5353 WARN_ON(byte_in_group > cache->key.offset);
5419 5354
@@ -5424,7 +5359,6 @@ static int update_block_group(struct btrfs_root *root,
5424 cache->disk_cache_state < BTRFS_DC_CLEAR) 5359 cache->disk_cache_state < BTRFS_DC_CLEAR)
5425 cache->disk_cache_state = BTRFS_DC_CLEAR; 5360 cache->disk_cache_state = BTRFS_DC_CLEAR;
5426 5361
5427 cache->dirty = 1;
5428 old_val = btrfs_block_group_used(&cache->item); 5362 old_val = btrfs_block_group_used(&cache->item);
5429 num_bytes = min(total, cache->key.offset - byte_in_group); 5363 num_bytes = min(total, cache->key.offset - byte_in_group);
5430 if (alloc) { 5364 if (alloc) {
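
This hunk drops the cache->dirty flag (removed a few lines above) in favor of membership on the transaction's dirty_bgs list: a block group is queued at most once, guarded by the list_empty() check, and pinned with an extra reference while queued. A rough userspace model of that idempotent enqueue, with a pthread mutex standing in for dirty_bgs_lock and a plain counter for the block group reference:

#include <pthread.h>
#include <stdio.h>

struct node {
        struct node *prev, *next;   /* self-linked when off-list */
        int refs;
};

static pthread_mutex_t dirty_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node dirty_bgs = { &dirty_bgs, &dirty_bgs, 0 };

static int list_empty(const struct node *n) { return n->next == n; }

static void list_add_tail(struct node *n, struct node *head)
{
        n->prev = head->prev;
        n->next = head;
        head->prev->next = n;
        head->prev = n;
}

/* Mirrors the update_block_group() hunk: enqueue once, take a ref. */
static void mark_dirty(struct node *cache)
{
        pthread_mutex_lock(&dirty_lock);
        if (list_empty(cache)) {
                list_add_tail(cache, &dirty_bgs);
                cache->refs++;          /* btrfs_get_block_group() */
        }
        pthread_mutex_unlock(&dirty_lock);
}

int main(void)
{
        struct node bg = { &bg, &bg, 1 };
        mark_dirty(&bg);
        mark_dirty(&bg);                /* second call is a no-op */
        printf("refs=%d queued=%d\n", bg.refs, !list_empty(&bg)); /* 2 1 */
        return 0;
}
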
@@ -5807,10 +5741,13 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
5807 unpin = &fs_info->freed_extents[0]; 5741 unpin = &fs_info->freed_extents[0];
5808 5742
5809 while (1) { 5743 while (1) {
5744 mutex_lock(&fs_info->unused_bg_unpin_mutex);
5810 ret = find_first_extent_bit(unpin, 0, &start, &end, 5745 ret = find_first_extent_bit(unpin, 0, &start, &end,
5811 EXTENT_DIRTY, NULL); 5746 EXTENT_DIRTY, NULL);
5812 if (ret) 5747 if (ret) {
5748 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
5813 break; 5749 break;
5750 }
5814 5751
5815 if (btrfs_test_opt(root, DISCARD)) 5752 if (btrfs_test_opt(root, DISCARD))
5816 ret = btrfs_discard_extent(root, start, 5753 ret = btrfs_discard_extent(root, start,
@@ -5818,6 +5755,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
5818 5755
5819 clear_extent_dirty(unpin, start, end, GFP_NOFS); 5756 clear_extent_dirty(unpin, start, end, GFP_NOFS);
5820 unpin_extent_range(root, start, end, true); 5757 unpin_extent_range(root, start, end, true);
5758 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
5821 cond_resched(); 5759 cond_resched();
5822 } 5760 }
5823 5761
@@ -6103,7 +6041,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
6103 } 6041 }
6104 } 6042 }
6105 6043
6106 ret = update_block_group(root, bytenr, num_bytes, 0); 6044 ret = update_block_group(trans, root, bytenr, num_bytes, 0);
6107 if (ret) { 6045 if (ret) {
6108 btrfs_abort_transaction(trans, extent_root, ret); 6046 btrfs_abort_transaction(trans, extent_root, ret);
6109 goto out; 6047 goto out;
@@ -6205,7 +6143,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
6205 struct extent_buffer *buf, 6143 struct extent_buffer *buf,
6206 u64 parent, int last_ref) 6144 u64 parent, int last_ref)
6207{ 6145{
6208 struct btrfs_block_group_cache *cache = NULL;
6209 int pin = 1; 6146 int pin = 1;
6210 int ret; 6147 int ret;
6211 6148
@@ -6221,17 +6158,20 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
6221 if (!last_ref) 6158 if (!last_ref)
6222 return; 6159 return;
6223 6160
6224 cache = btrfs_lookup_block_group(root->fs_info, buf->start);
6225
6226 if (btrfs_header_generation(buf) == trans->transid) { 6161 if (btrfs_header_generation(buf) == trans->transid) {
6162 struct btrfs_block_group_cache *cache;
6163
6227 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 6164 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
6228 ret = check_ref_cleanup(trans, root, buf->start); 6165 ret = check_ref_cleanup(trans, root, buf->start);
6229 if (!ret) 6166 if (!ret)
6230 goto out; 6167 goto out;
6231 } 6168 }
6232 6169
6170 cache = btrfs_lookup_block_group(root->fs_info, buf->start);
6171
6233 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 6172 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
6234 pin_down_extent(root, cache, buf->start, buf->len, 1); 6173 pin_down_extent(root, cache, buf->start, buf->len, 1);
6174 btrfs_put_block_group(cache);
6235 goto out; 6175 goto out;
6236 } 6176 }
6237 6177
@@ -6239,6 +6179,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
6239 6179
6240 btrfs_add_free_space(cache, buf->start, buf->len); 6180 btrfs_add_free_space(cache, buf->start, buf->len);
6241 btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0); 6181 btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0);
6182 btrfs_put_block_group(cache);
6242 trace_btrfs_reserved_extent_free(root, buf->start, buf->len); 6183 trace_btrfs_reserved_extent_free(root, buf->start, buf->len);
6243 pin = 0; 6184 pin = 0;
6244 } 6185 }
@@ -6253,7 +6194,6 @@ out:
6253 * anymore. 6194 * anymore.
6254 */ 6195 */
6255 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); 6196 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
6256 btrfs_put_block_group(cache);
6257} 6197}
6258 6198
6259/* Can return -ENOMEM */ 6199/* Can return -ENOMEM */
@@ -7063,7 +7003,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
7063 if (ret) 7003 if (ret)
7064 return ret; 7004 return ret;
7065 7005
7066 ret = update_block_group(root, ins->objectid, ins->offset, 1); 7006 ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
7067 if (ret) { /* -ENOENT, logic error */ 7007 if (ret) { /* -ENOENT, logic error */
7068 btrfs_err(fs_info, "update block group failed for %llu %llu", 7008 btrfs_err(fs_info, "update block group failed for %llu %llu",
7069 ins->objectid, ins->offset); 7009 ins->objectid, ins->offset);
@@ -7152,7 +7092,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
7152 return ret; 7092 return ret;
7153 } 7093 }
7154 7094
7155 ret = update_block_group(root, ins->objectid, root->nodesize, 1); 7095 ret = update_block_group(trans, root, ins->objectid, root->nodesize,
7096 1);
7156 if (ret) { /* -ENOENT, logic error */ 7097 if (ret) { /* -ENOENT, logic error */
7157 btrfs_err(fs_info, "update block group failed for %llu %llu", 7098 btrfs_err(fs_info, "update block group failed for %llu %llu",
7158 ins->objectid, ins->offset); 7099 ins->objectid, ins->offset);
@@ -7217,11 +7158,11 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
7217 7158
7218static struct extent_buffer * 7159static struct extent_buffer *
7219btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, 7160btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
7220 u64 bytenr, u32 blocksize, int level) 7161 u64 bytenr, int level)
7221{ 7162{
7222 struct extent_buffer *buf; 7163 struct extent_buffer *buf;
7223 7164
7224 buf = btrfs_find_create_tree_block(root, bytenr, blocksize); 7165 buf = btrfs_find_create_tree_block(root, bytenr);
7225 if (!buf) 7166 if (!buf)
7226 return ERR_PTR(-ENOMEM); 7167 return ERR_PTR(-ENOMEM);
7227 btrfs_set_header_generation(buf, trans->transid); 7168 btrfs_set_header_generation(buf, trans->transid);
@@ -7340,7 +7281,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
7340 7281
7341 if (btrfs_test_is_dummy_root(root)) { 7282 if (btrfs_test_is_dummy_root(root)) {
7342 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, 7283 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
7343 blocksize, level); 7284 level);
7344 if (!IS_ERR(buf)) 7285 if (!IS_ERR(buf))
7345 root->alloc_bytenr += blocksize; 7286 root->alloc_bytenr += blocksize;
7346 return buf; 7287 return buf;
@@ -7357,8 +7298,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
7357 return ERR_PTR(ret); 7298 return ERR_PTR(ret);
7358 } 7299 }
7359 7300
7360 buf = btrfs_init_new_buffer(trans, root, ins.objectid, 7301 buf = btrfs_init_new_buffer(trans, root, ins.objectid, level);
7361 blocksize, level);
7362 BUG_ON(IS_ERR(buf)); /* -ENOMEM */ 7302 BUG_ON(IS_ERR(buf)); /* -ENOMEM */
7363 7303
7364 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { 7304 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
@@ -7487,7 +7427,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
7487 continue; 7427 continue;
7488 } 7428 }
7489reada: 7429reada:
7490 readahead_tree_block(root, bytenr, blocksize); 7430 readahead_tree_block(root, bytenr);
7491 nread++; 7431 nread++;
7492 } 7432 }
7493 wc->reada_slot = slot; 7433 wc->reada_slot = slot;
@@ -7828,7 +7768,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
7828 7768
7829 next = btrfs_find_tree_block(root, bytenr); 7769 next = btrfs_find_tree_block(root, bytenr);
7830 if (!next) { 7770 if (!next) {
7831 next = btrfs_find_create_tree_block(root, bytenr, blocksize); 7771 next = btrfs_find_create_tree_block(root, bytenr);
7832 if (!next) 7772 if (!next)
7833 return -ENOMEM; 7773 return -ENOMEM;
7834 btrfs_set_buffer_lockdep_class(root->root_key.objectid, next, 7774 btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
@@ -8548,14 +8488,6 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
8548 if (IS_ERR(trans)) 8488 if (IS_ERR(trans))
8549 return PTR_ERR(trans); 8489 return PTR_ERR(trans);
8550 8490
8551 alloc_flags = update_block_group_flags(root, cache->flags);
8552 if (alloc_flags != cache->flags) {
8553 ret = do_chunk_alloc(trans, root, alloc_flags,
8554 CHUNK_ALLOC_FORCE);
8555 if (ret < 0)
8556 goto out;
8557 }
8558
8559 ret = set_block_group_ro(cache, 0); 8491 ret = set_block_group_ro(cache, 0);
8560 if (!ret) 8492 if (!ret)
8561 goto out; 8493 goto out;
@@ -8566,6 +8498,11 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
8566 goto out; 8498 goto out;
8567 ret = set_block_group_ro(cache, 0); 8499 ret = set_block_group_ro(cache, 0);
8568out: 8500out:
8501 if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
8502 alloc_flags = update_block_group_flags(root, cache->flags);
8503 check_system_chunk(trans, root, alloc_flags);
8504 }
8505
8569 btrfs_end_transaction(trans, root); 8506 btrfs_end_transaction(trans, root);
8570 return ret; 8507 return ret;
8571} 8508}
@@ -9005,6 +8942,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
9005 INIT_LIST_HEAD(&cache->cluster_list); 8942 INIT_LIST_HEAD(&cache->cluster_list);
9006 INIT_LIST_HEAD(&cache->bg_list); 8943 INIT_LIST_HEAD(&cache->bg_list);
9007 INIT_LIST_HEAD(&cache->ro_list); 8944 INIT_LIST_HEAD(&cache->ro_list);
8945 INIT_LIST_HEAD(&cache->dirty_list);
9008 btrfs_init_free_space_ctl(cache); 8946 btrfs_init_free_space_ctl(cache);
9009 atomic_set(&cache->trimming, 0); 8947 atomic_set(&cache->trimming, 0);
9010 8948
@@ -9068,9 +9006,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
9068 * b) Setting 'dirty flag' makes sure that we flush 9006 * b) Setting 'dirty flag' makes sure that we flush
9069 * the new space cache info onto disk. 9007 * the new space cache info onto disk.
9070 */ 9008 */
9071 cache->disk_cache_state = BTRFS_DC_CLEAR;
9072 if (btrfs_test_opt(root, SPACE_CACHE)) 9009 if (btrfs_test_opt(root, SPACE_CACHE))
9073 cache->dirty = 1; 9010 cache->disk_cache_state = BTRFS_DC_CLEAR;
9074 } 9011 }
9075 9012
9076 read_extent_buffer(leaf, &cache->item, 9013 read_extent_buffer(leaf, &cache->item,
@@ -9422,7 +9359,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
9422 * are still on the list after taking the semaphore 9359 * are still on the list after taking the semaphore
9423 */ 9360 */
9424 list_del_init(&block_group->list); 9361 list_del_init(&block_group->list);
9425 list_del_init(&block_group->ro_list);
9426 if (list_empty(&block_group->space_info->block_groups[index])) { 9362 if (list_empty(&block_group->space_info->block_groups[index])) {
9427 kobj = block_group->space_info->block_group_kobjs[index]; 9363 kobj = block_group->space_info->block_group_kobjs[index];
9428 block_group->space_info->block_group_kobjs[index] = NULL; 9364 block_group->space_info->block_group_kobjs[index] = NULL;
@@ -9461,9 +9397,17 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
9461 } 9397 }
9462 } 9398 }
9463 9399
9400 spin_lock(&trans->transaction->dirty_bgs_lock);
9401 if (!list_empty(&block_group->dirty_list)) {
9402 list_del_init(&block_group->dirty_list);
9403 btrfs_put_block_group(block_group);
9404 }
9405 spin_unlock(&trans->transaction->dirty_bgs_lock);
9406
9464 btrfs_remove_free_space_cache(block_group); 9407 btrfs_remove_free_space_cache(block_group);
9465 9408
9466 spin_lock(&block_group->space_info->lock); 9409 spin_lock(&block_group->space_info->lock);
9410 list_del_init(&block_group->ro_list);
9467 block_group->space_info->total_bytes -= block_group->key.offset; 9411 block_group->space_info->total_bytes -= block_group->key.offset;
9468 block_group->space_info->bytes_readonly -= block_group->key.offset; 9412 block_group->space_info->bytes_readonly -= block_group->key.offset;
9469 block_group->space_info->disk_total -= block_group->key.offset * factor; 9413 block_group->space_info->disk_total -= block_group->key.offset * factor;
@@ -9611,7 +9555,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
9611 * Want to do this before we do anything else so we can recover 9555 * Want to do this before we do anything else so we can recover
9612 * properly if we fail to join the transaction. 9556 * properly if we fail to join the transaction.
9613 */ 9557 */
9614 trans = btrfs_join_transaction(root); 9558 /* 1 for btrfs_orphan_reserve_metadata() */
9559 trans = btrfs_start_transaction(root, 1);
9615 if (IS_ERR(trans)) { 9560 if (IS_ERR(trans)) {
9616 btrfs_set_block_group_rw(root, block_group); 9561 btrfs_set_block_group_rw(root, block_group);
9617 ret = PTR_ERR(trans); 9562 ret = PTR_ERR(trans);
@@ -9624,18 +9569,33 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
9624 */ 9569 */
9625 start = block_group->key.objectid; 9570 start = block_group->key.objectid;
9626 end = start + block_group->key.offset - 1; 9571 end = start + block_group->key.offset - 1;
9572 /*
9573 * Hold the unused_bg_unpin_mutex lock to avoid racing with
9574 * btrfs_finish_extent_commit(). If we are at transaction N,
9575 * another task might be running finish_extent_commit() for the
9576 * previous transaction N - 1, and have seen a range belonging
9577 * to the block group in freed_extents[] before we were able to
9578 * clear the whole block group range from freed_extents[]. This
9579 * means that task can look up the block group after we
9580 * unpinned it from freed_extents[] and removed it, leading to
9581 * a BUG_ON() at btrfs_unpin_extent_range().
9582 */
9583 mutex_lock(&fs_info->unused_bg_unpin_mutex);
9627 ret = clear_extent_bits(&fs_info->freed_extents[0], start, end, 9584 ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
9628 EXTENT_DIRTY, GFP_NOFS); 9585 EXTENT_DIRTY, GFP_NOFS);
9629 if (ret) { 9586 if (ret) {
9587 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
9630 btrfs_set_block_group_rw(root, block_group); 9588 btrfs_set_block_group_rw(root, block_group);
9631 goto end_trans; 9589 goto end_trans;
9632 } 9590 }
9633 ret = clear_extent_bits(&fs_info->freed_extents[1], start, end, 9591 ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
9634 EXTENT_DIRTY, GFP_NOFS); 9592 EXTENT_DIRTY, GFP_NOFS);
9635 if (ret) { 9593 if (ret) {
9594 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
9636 btrfs_set_block_group_rw(root, block_group); 9595 btrfs_set_block_group_rw(root, block_group);
9637 goto end_trans; 9596 goto end_trans;
9638 } 9597 }
9598 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
9639 9599
9640 /* Reset pinned so btrfs_put_block_group doesn't complain */ 9600 /* Reset pinned so btrfs_put_block_group doesn't complain */
9641 block_group->pinned = 0; 9601 block_group->pinned = 0;
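
Per the comment above, both paths that touch freed_extents[] now serialize on unused_bg_unpin_mutex: the commit path unpins one range per lock hold, while the cleaner clears a whole block group's range in one critical section, so the cleaner can never observe a half-cleared group. A compressed two-thread sketch of just the locking shape (everything inside the critical sections is a placeholder):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t unused_bg_unpin_mutex = PTHREAD_MUTEX_INITIALIZER;

/* btrfs_finish_extent_commit(): one unpinned range per lock hold */
static void *finish_extent_commit(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&unused_bg_unpin_mutex);
        puts("commit: find_first_extent_bit + unpin_extent_range");
        pthread_mutex_unlock(&unused_bg_unpin_mutex);
        return NULL;
}

/* btrfs_delete_unused_bgs(): clear the whole group's range atomically */
static void *delete_unused_bg(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&unused_bg_unpin_mutex);
        puts("cleaner: clear_extent_bits on freed_extents[0] and [1]");
        pthread_mutex_unlock(&unused_bg_unpin_mutex);
        return NULL;
}

int main(void)
{
        pthread_t a, b;
        pthread_create(&a, NULL, finish_extent_commit, NULL);
        pthread_create(&b, NULL, delete_unused_bg, NULL);
        pthread_join(a, NULL);
        pthread_join(b, NULL);
        return 0;
}
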
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 4ebabd237153..c7233ff1d533 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -64,7 +64,7 @@ void btrfs_leak_debug_check(void)
64 64
65 while (!list_empty(&states)) { 65 while (!list_empty(&states)) {
66 state = list_entry(states.next, struct extent_state, leak_list); 66 state = list_entry(states.next, struct extent_state, leak_list);
67 pr_err("BTRFS: state leak: start %llu end %llu state %lu in tree %d refs %d\n", 67 pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
68 state->start, state->end, state->state, 68 state->start, state->end, state->state,
69 extent_state_in_tree(state), 69 extent_state_in_tree(state),
70 atomic_read(&state->refs)); 70 atomic_read(&state->refs));
@@ -396,21 +396,21 @@ static void merge_state(struct extent_io_tree *tree,
396} 396}
397 397
398static void set_state_cb(struct extent_io_tree *tree, 398static void set_state_cb(struct extent_io_tree *tree,
399 struct extent_state *state, unsigned long *bits) 399 struct extent_state *state, unsigned *bits)
400{ 400{
401 if (tree->ops && tree->ops->set_bit_hook) 401 if (tree->ops && tree->ops->set_bit_hook)
402 tree->ops->set_bit_hook(tree->mapping->host, state, bits); 402 tree->ops->set_bit_hook(tree->mapping->host, state, bits);
403} 403}
404 404
405static void clear_state_cb(struct extent_io_tree *tree, 405static void clear_state_cb(struct extent_io_tree *tree,
406 struct extent_state *state, unsigned long *bits) 406 struct extent_state *state, unsigned *bits)
407{ 407{
408 if (tree->ops && tree->ops->clear_bit_hook) 408 if (tree->ops && tree->ops->clear_bit_hook)
409 tree->ops->clear_bit_hook(tree->mapping->host, state, bits); 409 tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
410} 410}
411 411
412static void set_state_bits(struct extent_io_tree *tree, 412static void set_state_bits(struct extent_io_tree *tree,
413 struct extent_state *state, unsigned long *bits); 413 struct extent_state *state, unsigned *bits);
414 414
415/* 415/*
416 * insert an extent_state struct into the tree. 'bits' are set on the 416 * insert an extent_state struct into the tree. 'bits' are set on the
@@ -426,7 +426,7 @@ static int insert_state(struct extent_io_tree *tree,
426 struct extent_state *state, u64 start, u64 end, 426 struct extent_state *state, u64 start, u64 end,
427 struct rb_node ***p, 427 struct rb_node ***p,
428 struct rb_node **parent, 428 struct rb_node **parent,
429 unsigned long *bits) 429 unsigned *bits)
430{ 430{
431 struct rb_node *node; 431 struct rb_node *node;
432 432
@@ -511,10 +511,10 @@ static struct extent_state *next_state(struct extent_state *state)
511 */ 511 */
512static struct extent_state *clear_state_bit(struct extent_io_tree *tree, 512static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
513 struct extent_state *state, 513 struct extent_state *state,
514 unsigned long *bits, int wake) 514 unsigned *bits, int wake)
515{ 515{
516 struct extent_state *next; 516 struct extent_state *next;
517 unsigned long bits_to_clear = *bits & ~EXTENT_CTLBITS; 517 unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS;
518 518
519 if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { 519 if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
520 u64 range = state->end - state->start + 1; 520 u64 range = state->end - state->start + 1;
@@ -570,7 +570,7 @@ static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
570 * This takes the tree lock, and returns 0 on success and < 0 on error. 570 * This takes the tree lock, and returns 0 on success and < 0 on error.
571 */ 571 */
572int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 572int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
573 unsigned long bits, int wake, int delete, 573 unsigned bits, int wake, int delete,
574 struct extent_state **cached_state, 574 struct extent_state **cached_state,
575 gfp_t mask) 575 gfp_t mask)
576{ 576{
@@ -789,9 +789,9 @@ out:
789 789
790static void set_state_bits(struct extent_io_tree *tree, 790static void set_state_bits(struct extent_io_tree *tree,
791 struct extent_state *state, 791 struct extent_state *state,
792 unsigned long *bits) 792 unsigned *bits)
793{ 793{
794 unsigned long bits_to_set = *bits & ~EXTENT_CTLBITS; 794 unsigned bits_to_set = *bits & ~EXTENT_CTLBITS;
795 795
796 set_state_cb(tree, state, bits); 796 set_state_cb(tree, state, bits);
797 if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { 797 if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
@@ -803,7 +803,7 @@ static void set_state_bits(struct extent_io_tree *tree,
803 803
804static void cache_state_if_flags(struct extent_state *state, 804static void cache_state_if_flags(struct extent_state *state,
805 struct extent_state **cached_ptr, 805 struct extent_state **cached_ptr,
806 const u64 flags) 806 unsigned flags)
807{ 807{
808 if (cached_ptr && !(*cached_ptr)) { 808 if (cached_ptr && !(*cached_ptr)) {
809 if (!flags || (state->state & flags)) { 809 if (!flags || (state->state & flags)) {
@@ -833,7 +833,7 @@ static void cache_state(struct extent_state *state,
833 833
834static int __must_check 834static int __must_check
835__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 835__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
836 unsigned long bits, unsigned long exclusive_bits, 836 unsigned bits, unsigned exclusive_bits,
837 u64 *failed_start, struct extent_state **cached_state, 837 u64 *failed_start, struct extent_state **cached_state,
838 gfp_t mask) 838 gfp_t mask)
839{ 839{
@@ -1034,7 +1034,7 @@ search_again:
1034} 1034}
1035 1035
1036int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 1036int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
1037 unsigned long bits, u64 * failed_start, 1037 unsigned bits, u64 * failed_start,
1038 struct extent_state **cached_state, gfp_t mask) 1038 struct extent_state **cached_state, gfp_t mask)
1039{ 1039{
1040 return __set_extent_bit(tree, start, end, bits, 0, failed_start, 1040 return __set_extent_bit(tree, start, end, bits, 0, failed_start,
@@ -1060,7 +1060,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
1060 * boundary bits like LOCK. 1060 * boundary bits like LOCK.
1061 */ 1061 */
1062int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 1062int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
1063 unsigned long bits, unsigned long clear_bits, 1063 unsigned bits, unsigned clear_bits,
1064 struct extent_state **cached_state, gfp_t mask) 1064 struct extent_state **cached_state, gfp_t mask)
1065{ 1065{
1066 struct extent_state *state; 1066 struct extent_state *state;
@@ -1268,14 +1268,14 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
1268} 1268}
1269 1269
1270int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 1270int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1271 unsigned long bits, gfp_t mask) 1271 unsigned bits, gfp_t mask)
1272{ 1272{
1273 return set_extent_bit(tree, start, end, bits, NULL, 1273 return set_extent_bit(tree, start, end, bits, NULL,
1274 NULL, mask); 1274 NULL, mask);
1275} 1275}
1276 1276
1277int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 1277int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1278 unsigned long bits, gfp_t mask) 1278 unsigned bits, gfp_t mask)
1279{ 1279{
1280 return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask); 1280 return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
1281} 1281}
@@ -1330,10 +1330,11 @@ int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
1330 * us if waiting is desired. 1330 * us if waiting is desired.
1331 */ 1331 */
1332int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 1332int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1333 unsigned long bits, struct extent_state **cached_state) 1333 unsigned bits, struct extent_state **cached_state)
1334{ 1334{
1335 int err; 1335 int err;
1336 u64 failed_start; 1336 u64 failed_start;
1337
1337 while (1) { 1338 while (1) {
1338 err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, 1339 err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
1339 EXTENT_LOCKED, &failed_start, 1340 EXTENT_LOCKED, &failed_start,
@@ -1407,8 +1408,8 @@ int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
1407 while (index <= end_index) { 1408 while (index <= end_index) {
1408 page = find_get_page(inode->i_mapping, index); 1409 page = find_get_page(inode->i_mapping, index);
1409 BUG_ON(!page); /* Pages should be in the extent_io_tree */ 1410 BUG_ON(!page); /* Pages should be in the extent_io_tree */
1410 account_page_redirty(page);
1411 __set_page_dirty_nobuffers(page); 1411 __set_page_dirty_nobuffers(page);
1412 account_page_redirty(page);
1412 page_cache_release(page); 1413 page_cache_release(page);
1413 index++; 1414 index++;
1414 } 1415 }
@@ -1440,7 +1441,7 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
1440 */ 1441 */
1441static struct extent_state * 1442static struct extent_state *
1442find_first_extent_bit_state(struct extent_io_tree *tree, 1443find_first_extent_bit_state(struct extent_io_tree *tree,
1443 u64 start, unsigned long bits) 1444 u64 start, unsigned bits)
1444{ 1445{
1445 struct rb_node *node; 1446 struct rb_node *node;
1446 struct extent_state *state; 1447 struct extent_state *state;
@@ -1474,7 +1475,7 @@ out:
1474 * If nothing was found, 1 is returned. If found something, return 0. 1475 * If nothing was found, 1 is returned. If found something, return 0.
1475 */ 1476 */
1476int find_first_extent_bit(struct extent_io_tree *tree, u64 start, 1477int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1477 u64 *start_ret, u64 *end_ret, unsigned long bits, 1478 u64 *start_ret, u64 *end_ret, unsigned bits,
1478 struct extent_state **cached_state) 1479 struct extent_state **cached_state)
1479{ 1480{
1480 struct extent_state *state; 1481 struct extent_state *state;
@@ -1753,7 +1754,7 @@ out_failed:
1753 1754
1754int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, 1755int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
1755 struct page *locked_page, 1756 struct page *locked_page,
1756 unsigned long clear_bits, 1757 unsigned clear_bits,
1757 unsigned long page_ops) 1758 unsigned long page_ops)
1758{ 1759{
1759 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 1760 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
@@ -1810,7 +1811,7 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
1810 */ 1811 */
1811u64 count_range_bits(struct extent_io_tree *tree, 1812u64 count_range_bits(struct extent_io_tree *tree,
1812 u64 *start, u64 search_end, u64 max_bytes, 1813 u64 *start, u64 search_end, u64 max_bytes,
1813 unsigned long bits, int contig) 1814 unsigned bits, int contig)
1814{ 1815{
1815 struct rb_node *node; 1816 struct rb_node *node;
1816 struct extent_state *state; 1817 struct extent_state *state;
@@ -1928,7 +1929,7 @@ out:
1928 * range is found set. 1929 * range is found set.
1929 */ 1930 */
1930int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, 1931int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1931 unsigned long bits, int filled, struct extent_state *cached) 1932 unsigned bits, int filled, struct extent_state *cached)
1932{ 1933{
1933 struct extent_state *state = NULL; 1934 struct extent_state *state = NULL;
1934 struct rb_node *node; 1935 struct rb_node *node;
@@ -2057,7 +2058,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
2057 sector = bbio->stripes[mirror_num-1].physical >> 9; 2058 sector = bbio->stripes[mirror_num-1].physical >> 9;
2058 bio->bi_iter.bi_sector = sector; 2059 bio->bi_iter.bi_sector = sector;
2059 dev = bbio->stripes[mirror_num-1].dev; 2060 dev = bbio->stripes[mirror_num-1].dev;
2060 kfree(bbio); 2061 btrfs_put_bbio(bbio);
2061 if (!dev || !dev->bdev || !dev->writeable) { 2062 if (!dev || !dev->bdev || !dev->writeable) {
2062 bio_put(bio); 2063 bio_put(bio);
2063 return -EIO; 2064 return -EIO;
@@ -2190,7 +2191,7 @@ void btrfs_free_io_failure_record(struct inode *inode, u64 start, u64 end)
2190 2191
2191 next = next_state(state); 2192 next = next_state(state);
2192 2193
2193 failrec = (struct io_failure_record *)state->private; 2194 failrec = (struct io_failure_record *)(unsigned long)state->private;
2194 free_extent_state(state); 2195 free_extent_state(state);
2195 kfree(failrec); 2196 kfree(failrec);
2196 2197
@@ -2816,8 +2817,10 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
2816 bio_add_page(bio, page, page_size, offset) < page_size) { 2817 bio_add_page(bio, page, page_size, offset) < page_size) {
2817 ret = submit_one_bio(rw, bio, mirror_num, 2818 ret = submit_one_bio(rw, bio, mirror_num,
2818 prev_bio_flags); 2819 prev_bio_flags);
2819 if (ret < 0) 2820 if (ret < 0) {
2821 *bio_ret = NULL;
2820 return ret; 2822 return ret;
2823 }
2821 bio = NULL; 2824 bio = NULL;
2822 } else { 2825 } else {
2823 return 0; 2826 return 0;
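
Nulling *bio_ret on the failure path matters because submit_one_bio() consumes the bio either way; leaving the caller's cached pointer intact would invite a use-after-free on retry. A self-contained sketch of that out-parameter hygiene (names are illustrative, not btrfs APIs):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct bio { int dummy; };

/* Consumes the bio on success *and* failure, like submit_one_bio(). */
static int submit_one(struct bio *bio, int fail)
{
        free(bio);
        return fail ? -EIO : 0;
}

static int submit_page(struct bio **bio_ret, int fail)
{
        if (*bio_ret) {
                int ret = submit_one(*bio_ret, fail);
                *bio_ret = NULL;        /* never leave it dangling */
                if (ret < 0)
                        return ret;
        }
        *bio_ret = malloc(sizeof(**bio_ret));   /* start a fresh bio */
        return *bio_ret ? 0 : -ENOMEM;
}

int main(void)
{
        struct bio *cached = malloc(sizeof(*cached));
        int ret = submit_page(&cached, 1);
        printf("ret=%d cached=%p\n", ret, (void *)cached); /* cached == NULL */
        free(cached);   /* safe: free(NULL) is a no-op */
        return 0;
}
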
@@ -3239,7 +3242,7 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode,
3239 page, 3242 page,
3240 &delalloc_start, 3243 &delalloc_start,
3241 &delalloc_end, 3244 &delalloc_end,
3242 128 * 1024 * 1024); 3245 BTRFS_MAX_EXTENT_SIZE);
3243 if (nr_delalloc == 0) { 3246 if (nr_delalloc == 0) {
3244 delalloc_start = delalloc_end + 1; 3247 delalloc_start = delalloc_end + 1;
3245 continue; 3248 continue;
@@ -4598,11 +4601,11 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
4598 4601
4599static struct extent_buffer * 4602static struct extent_buffer *
4600__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, 4603__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
4601 unsigned long len, gfp_t mask) 4604 unsigned long len)
4602{ 4605{
4603 struct extent_buffer *eb = NULL; 4606 struct extent_buffer *eb = NULL;
4604 4607
4605 eb = kmem_cache_zalloc(extent_buffer_cache, mask); 4608 eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS);
4606 if (eb == NULL) 4609 if (eb == NULL)
4607 return NULL; 4610 return NULL;
4608 eb->start = start; 4611 eb->start = start;
@@ -4643,7 +4646,7 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
4643 struct extent_buffer *new; 4646 struct extent_buffer *new;
4644 unsigned long num_pages = num_extent_pages(src->start, src->len); 4647 unsigned long num_pages = num_extent_pages(src->start, src->len);
4645 4648
4646 new = __alloc_extent_buffer(NULL, src->start, src->len, GFP_NOFS); 4649 new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
4647 if (new == NULL) 4650 if (new == NULL)
4648 return NULL; 4651 return NULL;
4649 4652
@@ -4666,13 +4669,26 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
4666 return new; 4669 return new;
4667} 4670}
4668 4671
4669struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len) 4672struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
4673 u64 start)
4670{ 4674{
4671 struct extent_buffer *eb; 4675 struct extent_buffer *eb;
4672 unsigned long num_pages = num_extent_pages(0, len); 4676 unsigned long len;
4677 unsigned long num_pages;
4673 unsigned long i; 4678 unsigned long i;
4674 4679
4675 eb = __alloc_extent_buffer(NULL, start, len, GFP_NOFS); 4680 if (!fs_info) {
4681 /*
4682 * Called only from tests that don't always have a fs_info
4683 * available, but we know that nodesize is 4096
4684 */
4685 len = 4096;
4686 } else {
4687 len = fs_info->tree_root->nodesize;
4688 }
4689 num_pages = num_extent_pages(0, len);
4690
4691 eb = __alloc_extent_buffer(fs_info, start, len);
4676 if (!eb) 4692 if (!eb)
4677 return NULL; 4693 return NULL;
4678 4694
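
The length argument is gone: the buffer size now comes from the nodesize, with a hardcoded 4096 fallback for the sanity tests that pass a NULL fs_info. The shape of that fallback, collapsing fs_info->tree_root->nodesize to a single field for brevity:

#include <stdio.h>
#include <stdint.h>

struct fs_info { uint32_t nodesize; };

/* Length selection from alloc_dummy_extent_buffer(): tests may pass
 * a NULL fs_info, in which case a 4K nodesize is assumed. */
static uint32_t dummy_eb_len(const struct fs_info *fs_info)
{
        return fs_info ? fs_info->nodesize : 4096;
}

int main(void)
{
        struct fs_info fi = { .nodesize = 16384 };
        printf("%u %u\n", dummy_eb_len(&fi), dummy_eb_len(NULL)); /* 16384 4096 */
        return 0;
}
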
@@ -4762,7 +4778,7 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
4762 4778
4763#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 4779#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
4764struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, 4780struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
4765 u64 start, unsigned long len) 4781 u64 start)
4766{ 4782{
4767 struct extent_buffer *eb, *exists = NULL; 4783 struct extent_buffer *eb, *exists = NULL;
4768 int ret; 4784 int ret;
@@ -4770,7 +4786,7 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
4770 eb = find_extent_buffer(fs_info, start); 4786 eb = find_extent_buffer(fs_info, start);
4771 if (eb) 4787 if (eb)
4772 return eb; 4788 return eb;
4773 eb = alloc_dummy_extent_buffer(start, len); 4789 eb = alloc_dummy_extent_buffer(fs_info, start);
4774 if (!eb) 4790 if (!eb)
4775 return NULL; 4791 return NULL;
4776 eb->fs_info = fs_info; 4792 eb->fs_info = fs_info;
@@ -4808,8 +4824,9 @@ free_eb:
4808#endif 4824#endif
4809 4825
4810struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, 4826struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
4811 u64 start, unsigned long len) 4827 u64 start)
4812{ 4828{
4829 unsigned long len = fs_info->tree_root->nodesize;
4813 unsigned long num_pages = num_extent_pages(start, len); 4830 unsigned long num_pages = num_extent_pages(start, len);
4814 unsigned long i; 4831 unsigned long i;
4815 unsigned long index = start >> PAGE_CACHE_SHIFT; 4832 unsigned long index = start >> PAGE_CACHE_SHIFT;
@@ -4824,7 +4841,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
4824 if (eb) 4841 if (eb)
4825 return eb; 4842 return eb;
4826 4843
4827 eb = __alloc_extent_buffer(fs_info, start, len, GFP_NOFS); 4844 eb = __alloc_extent_buffer(fs_info, start, len);
4828 if (!eb) 4845 if (!eb)
4829 return NULL; 4846 return NULL;
4830 4847
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index ece9ce87edff..695b0ccfb755 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -4,22 +4,22 @@
4#include <linux/rbtree.h> 4#include <linux/rbtree.h>
5 5
6/* bits for the extent state */ 6/* bits for the extent state */
7#define EXTENT_DIRTY 1 7#define EXTENT_DIRTY (1U << 0)
8#define EXTENT_WRITEBACK (1 << 1) 8#define EXTENT_WRITEBACK (1U << 1)
9#define EXTENT_UPTODATE (1 << 2) 9#define EXTENT_UPTODATE (1U << 2)
10#define EXTENT_LOCKED (1 << 3) 10#define EXTENT_LOCKED (1U << 3)
11#define EXTENT_NEW (1 << 4) 11#define EXTENT_NEW (1U << 4)
12#define EXTENT_DELALLOC (1 << 5) 12#define EXTENT_DELALLOC (1U << 5)
13#define EXTENT_DEFRAG (1 << 6) 13#define EXTENT_DEFRAG (1U << 6)
14#define EXTENT_BOUNDARY (1 << 9) 14#define EXTENT_BOUNDARY (1U << 9)
15#define EXTENT_NODATASUM (1 << 10) 15#define EXTENT_NODATASUM (1U << 10)
16#define EXTENT_DO_ACCOUNTING (1 << 11) 16#define EXTENT_DO_ACCOUNTING (1U << 11)
17#define EXTENT_FIRST_DELALLOC (1 << 12) 17#define EXTENT_FIRST_DELALLOC (1U << 12)
18#define EXTENT_NEED_WAIT (1 << 13) 18#define EXTENT_NEED_WAIT (1U << 13)
19#define EXTENT_DAMAGED (1 << 14) 19#define EXTENT_DAMAGED (1U << 14)
20#define EXTENT_NORESERVE (1 << 15) 20#define EXTENT_NORESERVE (1U << 15)
21#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) 21#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
22#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) 22#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
23 23
24/* 24/*
25 * flags for bio submission. The high bits indicate the compression 25 * flags for bio submission. The high bits indicate the compression
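
Restating the masks as (1U << n) matches the state fields narrowing from unsigned long to unsigned elsewhere in this patch; an explicitly unsigned 32-bit mask also cannot sign-extend if a future bit lands on position 31. A short illustration of the pitfall being ruled out, assuming a typical LP64 target:

#include <stdio.h>

int main(void)
{
        /* An int carrying bit 31 sign-extends when widened
         * (the int conversion is implementation-defined, but wraps
         * on the usual targets)... */
        long bad  = (long)(int)(1U << 31);
        /* ...an unsigned 1U << 31 widens with zero fill. */
        long good = (long)(1U << 31);

        printf("%lx\n%lx\n", (unsigned long)bad, (unsigned long)good);
        /* ffffffff80000000 vs 80000000 */
        return 0;
}
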
@@ -81,9 +81,9 @@ struct extent_io_ops {
81 int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, 81 int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
82 struct extent_state *state, int uptodate); 82 struct extent_state *state, int uptodate);
83 void (*set_bit_hook)(struct inode *inode, struct extent_state *state, 83 void (*set_bit_hook)(struct inode *inode, struct extent_state *state,
84 unsigned long *bits); 84 unsigned *bits);
85 void (*clear_bit_hook)(struct inode *inode, struct extent_state *state, 85 void (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
86 unsigned long *bits); 86 unsigned *bits);
87 void (*merge_extent_hook)(struct inode *inode, 87 void (*merge_extent_hook)(struct inode *inode,
88 struct extent_state *new, 88 struct extent_state *new,
89 struct extent_state *other); 89 struct extent_state *other);
@@ -108,7 +108,7 @@ struct extent_state {
108 /* ADD NEW ELEMENTS AFTER THIS */ 108 /* ADD NEW ELEMENTS AFTER THIS */
109 wait_queue_head_t wq; 109 wait_queue_head_t wq;
110 atomic_t refs; 110 atomic_t refs;
111 unsigned long state; 111 unsigned state;
112 112
113 /* for use by the FS */ 113 /* for use by the FS */
114 u64 private; 114 u64 private;
@@ -188,7 +188,7 @@ int try_release_extent_mapping(struct extent_map_tree *map,
188int try_release_extent_buffer(struct page *page); 188int try_release_extent_buffer(struct page *page);
189int lock_extent(struct extent_io_tree *tree, u64 start, u64 end); 189int lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
190int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 190int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
191 unsigned long bits, struct extent_state **cached); 191 unsigned bits, struct extent_state **cached);
192int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end); 192int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end);
193int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, 193int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
194 struct extent_state **cached, gfp_t mask); 194 struct extent_state **cached, gfp_t mask);
@@ -202,21 +202,21 @@ void extent_io_exit(void);
202 202
203u64 count_range_bits(struct extent_io_tree *tree, 203u64 count_range_bits(struct extent_io_tree *tree,
204 u64 *start, u64 search_end, 204 u64 *start, u64 search_end,
205 u64 max_bytes, unsigned long bits, int contig); 205 u64 max_bytes, unsigned bits, int contig);
206 206
207void free_extent_state(struct extent_state *state); 207void free_extent_state(struct extent_state *state);
208int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, 208int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
209 unsigned long bits, int filled, 209 unsigned bits, int filled,
210 struct extent_state *cached_state); 210 struct extent_state *cached_state);
211int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 211int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
212 unsigned long bits, gfp_t mask); 212 unsigned bits, gfp_t mask);
213int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 213int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
214 unsigned long bits, int wake, int delete, 214 unsigned bits, int wake, int delete,
215 struct extent_state **cached, gfp_t mask); 215 struct extent_state **cached, gfp_t mask);
216int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 216int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
217 unsigned long bits, gfp_t mask); 217 unsigned bits, gfp_t mask);
218int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 218int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
219 unsigned long bits, u64 *failed_start, 219 unsigned bits, u64 *failed_start,
220 struct extent_state **cached_state, gfp_t mask); 220 struct extent_state **cached_state, gfp_t mask);
221int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, 221int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
222 struct extent_state **cached_state, gfp_t mask); 222 struct extent_state **cached_state, gfp_t mask);
@@ -229,14 +229,14 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
229int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 229int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
230 gfp_t mask); 230 gfp_t mask);
231int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 231int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
232 unsigned long bits, unsigned long clear_bits, 232 unsigned bits, unsigned clear_bits,
233 struct extent_state **cached_state, gfp_t mask); 233 struct extent_state **cached_state, gfp_t mask);
234int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, 234int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
235 struct extent_state **cached_state, gfp_t mask); 235 struct extent_state **cached_state, gfp_t mask);
236int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end, 236int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
237 struct extent_state **cached_state, gfp_t mask); 237 struct extent_state **cached_state, gfp_t mask);
238int find_first_extent_bit(struct extent_io_tree *tree, u64 start, 238int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
239 u64 *start_ret, u64 *end_ret, unsigned long bits, 239 u64 *start_ret, u64 *end_ret, unsigned bits,
240 struct extent_state **cached_state); 240 struct extent_state **cached_state);
241int extent_invalidatepage(struct extent_io_tree *tree, 241int extent_invalidatepage(struct extent_io_tree *tree,
242 struct page *page, unsigned long offset); 242 struct page *page, unsigned long offset);
@@ -262,8 +262,9 @@ int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
262void set_page_extent_mapped(struct page *page); 262void set_page_extent_mapped(struct page *page);
263 263
264struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, 264struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
265 u64 start, unsigned long len); 265 u64 start);
266struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len); 266struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
267 u64 start);
267struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src); 268struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src);
268struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, 269struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
269 u64 start); 270 u64 start);
@@ -322,7 +323,7 @@ int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end);
322int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end); 323int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
323int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, 324int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
324 struct page *locked_page, 325 struct page *locked_page,
325 unsigned long bits_to_clear, 326 unsigned bits_to_clear,
326 unsigned long page_ops); 327 unsigned long page_ops);
327struct bio * 328struct bio *
328btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, 329btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
@@ -377,5 +378,5 @@ noinline u64 find_lock_delalloc_range(struct inode *inode,
377 u64 *end, u64 max_bytes); 378 u64 *end, u64 max_bytes);
378#endif 379#endif
379struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, 380struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
380 u64 start, unsigned long len); 381 u64 start);
381#endif 382#endif
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index e4090259569b..b78bbbac900d 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1746,7 +1746,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
1746 1746
1747 mutex_lock(&inode->i_mutex); 1747 mutex_lock(&inode->i_mutex);
1748 1748
1749 current->backing_dev_info = inode->i_mapping->backing_dev_info; 1749 current->backing_dev_info = inode_to_bdi(inode);
1750 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); 1750 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
1751 if (err) { 1751 if (err) {
1752 mutex_unlock(&inode->i_mutex); 1752 mutex_unlock(&inode->i_mutex);
@@ -2081,7 +2081,6 @@ static const struct vm_operations_struct btrfs_file_vm_ops = {
2081 .fault = filemap_fault, 2081 .fault = filemap_fault,
2082 .map_pages = filemap_map_pages, 2082 .map_pages = filemap_map_pages,
2083 .page_mkwrite = btrfs_page_mkwrite, 2083 .page_mkwrite = btrfs_page_mkwrite,
2084 .remap_pages = generic_file_remap_pages,
2085}; 2084};
2086 2085
2087static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) 2086static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index d6c03f7f136b..a71978578fa7 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -651,15 +651,13 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
651 struct io_ctl io_ctl; 651 struct io_ctl io_ctl;
652 struct btrfs_key key; 652 struct btrfs_key key;
653 struct btrfs_free_space *e, *n; 653 struct btrfs_free_space *e, *n;
654 struct list_head bitmaps; 654 LIST_HEAD(bitmaps);
655 u64 num_entries; 655 u64 num_entries;
656 u64 num_bitmaps; 656 u64 num_bitmaps;
657 u64 generation; 657 u64 generation;
658 u8 type; 658 u8 type;
659 int ret = 0; 659 int ret = 0;
660 660
661 INIT_LIST_HEAD(&bitmaps);
662
663 /* Nothing in the space cache, goodbye */ 661 /* Nothing in the space cache, goodbye */
664 if (!i_size_read(inode)) 662 if (!i_size_read(inode))
665 return 0; 663 return 0;
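
LIST_HEAD(bitmaps) initializes the list at its definition, so the runtime INIT_LIST_HEAD() call removed above was redundant (as, presumably, is the one dropped from btrfs_find_space_cluster() below, where the declaration already used LIST_HEAD). The two forms side by side, with the kernel macros re-created for a standalone build:

#include <stdio.h>

struct list_head { struct list_head *prev, *next; };

/* Kernel-style: initialized at the point of definition... */
#define LIST_HEAD_INIT(name) { &(name), &(name) }
#define LIST_HEAD(name) struct list_head name = LIST_HEAD_INIT(name)

/* ...versus runtime initialization. */
static void INIT_LIST_HEAD(struct list_head *h) { h->prev = h->next = h; }

int main(void)
{
        LIST_HEAD(a);                   /* usable immediately */
        struct list_head b;
        INIT_LIST_HEAD(&b);             /* needs the explicit call */
        printf("%d %d\n", a.next == &a, b.next == &b);  /* 1 1 */
        return 0;
}
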
@@ -1243,6 +1241,7 @@ int btrfs_write_out_cache(struct btrfs_root *root,
1243 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 1241 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
1244 struct inode *inode; 1242 struct inode *inode;
1245 int ret = 0; 1243 int ret = 0;
1244 enum btrfs_disk_cache_state dcs = BTRFS_DC_WRITTEN;
1246 1245
1247 root = root->fs_info->tree_root; 1246 root = root->fs_info->tree_root;
1248 1247
@@ -1266,9 +1265,7 @@ int btrfs_write_out_cache(struct btrfs_root *root,
1266 ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans, 1265 ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,
1267 path, block_group->key.objectid); 1266 path, block_group->key.objectid);
1268 if (ret) { 1267 if (ret) {
1269 spin_lock(&block_group->lock); 1268 dcs = BTRFS_DC_ERROR;
1270 block_group->disk_cache_state = BTRFS_DC_ERROR;
1271 spin_unlock(&block_group->lock);
1272 ret = 0; 1269 ret = 0;
1273#ifdef DEBUG 1270#ifdef DEBUG
1274 btrfs_err(root->fs_info, 1271 btrfs_err(root->fs_info,
@@ -1277,6 +1274,9 @@ int btrfs_write_out_cache(struct btrfs_root *root,
1277#endif 1274#endif
1278 } 1275 }
1279 1276
1277 spin_lock(&block_group->lock);
1278 block_group->disk_cache_state = dcs;
1279 spin_unlock(&block_group->lock);
1280 iput(inode); 1280 iput(inode);
1281 return ret; 1281 return ret;
1282} 1282}
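
The rewrite computes the final cache state in a local (dcs starts as BTRFS_DC_WRITTEN and flips to BTRFS_DC_ERROR on failure) and publishes it under block_group->lock exactly once on the way out, instead of locking only on the error path. The publish-once shape in miniature:

#include <pthread.h>
#include <stdio.h>

enum disk_cache_state { BTRFS_DC_WRITTEN, BTRFS_DC_ERROR };

static pthread_mutex_t bg_lock = PTHREAD_MUTEX_INITIALIZER;
static enum disk_cache_state disk_cache_state;

static int write_out_cache(int io_fails)
{
        enum disk_cache_state dcs = BTRFS_DC_WRITTEN;   /* assume success */

        if (io_fails)
                dcs = BTRFS_DC_ERROR;   /* record, don't publish yet */

        pthread_mutex_lock(&bg_lock);   /* single publish point */
        disk_cache_state = dcs;
        pthread_mutex_unlock(&bg_lock);
        return 0;
}

int main(void)
{
        write_out_cache(1);
        printf("state=%d\n", disk_cache_state); /* 1 == BTRFS_DC_ERROR */
        return 0;
}
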
@@ -2903,7 +2903,6 @@ int btrfs_find_space_cluster(struct btrfs_root *root,
2903 trace_btrfs_find_cluster(block_group, offset, bytes, empty_size, 2903 trace_btrfs_find_cluster(block_group, offset, bytes, empty_size,
2904 min_bytes); 2904 min_bytes);
2905 2905
2906 INIT_LIST_HEAD(&bitmaps);
2907 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, 2906 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
2908 bytes + empty_size, 2907 bytes + empty_size,
2909 cont1_bytes, min_bytes); 2908 cont1_bytes, min_bytes);
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 8ffa4783cbf4..265e03c73f4d 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -344,6 +344,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
344 return -ENOMEM; 344 return -ENOMEM;
345 345
346 path->leave_spinning = 1; 346 path->leave_spinning = 1;
347 path->skip_release_on_error = 1;
347 ret = btrfs_insert_empty_item(trans, root, path, &key, 348 ret = btrfs_insert_empty_item(trans, root, path, &key,
348 ins_len); 349 ins_len);
349 if (ret == -EEXIST) { 350 if (ret == -EEXIST) {
@@ -362,8 +363,12 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
362 ptr = (unsigned long)(ref + 1); 363 ptr = (unsigned long)(ref + 1);
363 ret = 0; 364 ret = 0;
364 } else if (ret < 0) { 365 } else if (ret < 0) {
365 if (ret == -EOVERFLOW) 366 if (ret == -EOVERFLOW) {
366 ret = -EMLINK; 367 if (find_name_in_backref(path, name, name_len, &ref))
368 ret = -EEXIST;
369 else
370 ret = -EMLINK;
371 }
367 goto out; 372 goto out;
368 } else { 373 } else {
369 ref = btrfs_item_ptr(path->nodes[0], path->slots[0], 374 ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
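
With path->skip_release_on_error set, a failed insert leaves the path positioned, so the -EOVERFLOW case can ask whether the ref being inserted already exists: if find_name_in_backref() finds the name, the caller gets -EEXIST (the link is already there) instead of -EMLINK (no room for another). The error mapping distilled, with the backref lookup stubbed out:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Stand-in for find_name_in_backref(): is this name already present
 * in the inode ref item? (hypothetical fixed answer here) */
static bool name_already_in_backref(void) { return true; }

/* Error mapping from the btrfs_insert_inode_ref() hunk. */
static int map_insert_error(int ret)
{
        if (ret == -EOVERFLOW)
                ret = name_already_in_backref() ? -EEXIST : -EMLINK;
        return ret;
}

int main(void)
{
        printf("%d %d\n", map_insert_error(-EOVERFLOW),
               map_insert_error(-ENOSPC));      /* -EEXIST, -ENOSPC */
        return 0;
}
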
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e687bb0dc73a..a85c23dfcddb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1530,10 +1530,45 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1530static void btrfs_split_extent_hook(struct inode *inode, 1530static void btrfs_split_extent_hook(struct inode *inode,
1531 struct extent_state *orig, u64 split) 1531 struct extent_state *orig, u64 split)
1532{ 1532{
1533 u64 size;
1534
1533 /* not delalloc, ignore it */ 1535 /* not delalloc, ignore it */
1534 if (!(orig->state & EXTENT_DELALLOC)) 1536 if (!(orig->state & EXTENT_DELALLOC))
1535 return; 1537 return;
1536 1538
1539 size = orig->end - orig->start + 1;
1540 if (size > BTRFS_MAX_EXTENT_SIZE) {
1541 u64 num_extents;
1542 u64 new_size;
1543
1544 /*
1545 * We need the largest size of the remaining extent to see if we
1546 * need to add a new outstanding extent. Think of the following
1547 * case
1548 *
1549 * [MAX_EXTENT_SIZE x 2 - 4k][4k]
1550 *
1551 * The new_size would just be 4k and we'd think we had enough
1552 * outstanding extents for this if we only took one side of the
1553 * split, same goes for the other direction. We need to see if
1554 * the larger size still amounts to the same number of extents as the
1555 * original size, because if it is we need to add a new
1556 * outstanding extent. But if we split up and the larger size
1557 * is less than the original then we are good to go since we've
1558 * already accounted for the extra extent in our original
1559 * accounting.
1560 */
1561 new_size = orig->end - split + 1;
1562 if ((split - orig->start) > new_size)
1563 new_size = split - orig->start;
1564
1565 num_extents = div64_u64(size + BTRFS_MAX_EXTENT_SIZE - 1,
1566 BTRFS_MAX_EXTENT_SIZE);
1567 if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
1568 BTRFS_MAX_EXTENT_SIZE) < num_extents)
1569 return;
1570 }
1571
1537 spin_lock(&BTRFS_I(inode)->lock); 1572 spin_lock(&BTRFS_I(inode)->lock);
1538 BTRFS_I(inode)->outstanding_extents++; 1573 BTRFS_I(inode)->outstanding_extents++;
1539 spin_unlock(&BTRFS_I(inode)->lock); 1574 spin_unlock(&BTRFS_I(inode)->lock);
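
Plugging the comment's own example into the math: an extent of size 2 * BTRFS_MAX_EXTENT_SIZE split 4k from one end leaves a largest piece that still rounds up to two extents, the same as the whole, so the small piece costs one additional outstanding extent. Checked numerically under the 128 MiB assumption:

#include <stdio.h>
#include <stdint.h>

#define MAX_EXTENT (128ULL * 1024 * 1024)  /* BTRFS_MAX_EXTENT_SIZE, assumed */

static uint64_t extents(uint64_t size)
{
        return (size + MAX_EXTENT - 1) / MAX_EXTENT;
}

int main(void)
{
        uint64_t whole   = 2 * MAX_EXTENT;      /* original delalloc extent */
        uint64_t largest = whole - 4096;        /* bigger piece after split */

        printf("whole: %llu extents\n", (unsigned long long)extents(whole));
        printf("largest piece: %llu extents\n",
               (unsigned long long)extents(largest));
        /* Both print 2: the larger piece alone needs as many extents as
         * the whole did, so the split must add one outstanding extent. */
        return 0;
}
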
@@ -1549,10 +1584,34 @@ static void btrfs_merge_extent_hook(struct inode *inode,
1549 struct extent_state *new, 1584 struct extent_state *new,
1550 struct extent_state *other) 1585 struct extent_state *other)
1551{ 1586{
1587 u64 new_size, old_size;
1588 u64 num_extents;
1589
1552 /* not delalloc, ignore it */ 1590 /* not delalloc, ignore it */
1553 if (!(other->state & EXTENT_DELALLOC)) 1591 if (!(other->state & EXTENT_DELALLOC))
1554 return; 1592 return;
1555 1593
1594 old_size = other->end - other->start + 1;
1595 new_size = old_size + (new->end - new->start + 1);
1596
1597 /* we're not bigger than the max, unreserve the space and go */
1598 if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
1599 spin_lock(&BTRFS_I(inode)->lock);
1600 BTRFS_I(inode)->outstanding_extents--;
1601 spin_unlock(&BTRFS_I(inode)->lock);
1602 return;
1603 }
1604
1605 /*
1606 * If we grew by another max_extent, just return, we want to keep that
1607 * reserved amount.
1608 */
1609 num_extents = div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1,
1610 BTRFS_MAX_EXTENT_SIZE);
1611 if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
1612 BTRFS_MAX_EXTENT_SIZE) > num_extents)
1613 return;
1614
1556 spin_lock(&BTRFS_I(inode)->lock); 1615 spin_lock(&BTRFS_I(inode)->lock);
1557 BTRFS_I(inode)->outstanding_extents--; 1616 BTRFS_I(inode)->outstanding_extents--;
1558 spin_unlock(&BTRFS_I(inode)->lock); 1617 spin_unlock(&BTRFS_I(inode)->lock);
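
The merge hook is the mirror image: merging frees one reservation when the combined range fits in a single BTRFS_MAX_EXTENT_SIZE, or when the merged size does not round up to more extents than the existing piece already carried; otherwise the extra reservation is kept. The decision as a predicate, under the same 128 MiB assumption:

#include <stdio.h>
#include <stdint.h>

#define MAX_EXTENT (128ULL * 1024 * 1024)  /* BTRFS_MAX_EXTENT_SIZE, assumed */

static uint64_t extents(uint64_t size)
{
        return (size + MAX_EXTENT - 1) / MAX_EXTENT;
}

/* Logic of btrfs_merge_extent_hook(): does merging `added` bytes into
 * an existing `old_size` delalloc extent free one reservation? */
static int merge_frees_one(uint64_t old_size, uint64_t added)
{
        uint64_t new_size = old_size + added;

        if (new_size <= MAX_EXTENT)
                return 1;               /* still a single extent */
        return extents(new_size) <= extents(old_size);
}

int main(void)
{
        printf("%d\n", merge_frees_one(4096, 4096));             /* 1 */
        printf("%d\n", merge_frees_one(MAX_EXTENT, MAX_EXTENT)); /* 0 */
        return 0;
}
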
@@ -1604,7 +1663,7 @@ static void btrfs_del_delalloc_inode(struct btrfs_root *root,
1604 * have pending delalloc work to be done. 1663 * have pending delalloc work to be done.
1605 */ 1664 */
1606static void btrfs_set_bit_hook(struct inode *inode, 1665static void btrfs_set_bit_hook(struct inode *inode,
1607 struct extent_state *state, unsigned long *bits) 1666 struct extent_state *state, unsigned *bits)
1608{ 1667{
1609 1668
1610 if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC)) 1669 if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
@@ -1645,9 +1704,11 @@ static void btrfs_set_bit_hook(struct inode *inode,
1645 */ 1704 */
1646static void btrfs_clear_bit_hook(struct inode *inode, 1705static void btrfs_clear_bit_hook(struct inode *inode,
1647 struct extent_state *state, 1706 struct extent_state *state,
1648 unsigned long *bits) 1707 unsigned *bits)
1649{ 1708{
1650 u64 len = state->end + 1 - state->start; 1709 u64 len = state->end + 1 - state->start;
1710 u64 num_extents = div64_u64(len + BTRFS_MAX_EXTENT_SIZE - 1,
1711 BTRFS_MAX_EXTENT_SIZE);
1651 1712
1652 spin_lock(&BTRFS_I(inode)->lock); 1713 spin_lock(&BTRFS_I(inode)->lock);
1653 if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) 1714 if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG))
@@ -1667,7 +1728,7 @@ static void btrfs_clear_bit_hook(struct inode *inode,
1667 *bits &= ~EXTENT_FIRST_DELALLOC; 1728 *bits &= ~EXTENT_FIRST_DELALLOC;
1668 } else if (!(*bits & EXTENT_DO_ACCOUNTING)) { 1729 } else if (!(*bits & EXTENT_DO_ACCOUNTING)) {
1669 spin_lock(&BTRFS_I(inode)->lock); 1730 spin_lock(&BTRFS_I(inode)->lock);
1670 BTRFS_I(inode)->outstanding_extents--; 1731 BTRFS_I(inode)->outstanding_extents -= num_extents;
1671 spin_unlock(&BTRFS_I(inode)->lock); 1732 spin_unlock(&BTRFS_I(inode)->lock);
1672 } 1733 }
1673 1734
@@ -2945,7 +3006,7 @@ static int __readpage_endio_check(struct inode *inode,
2945 return 0; 3006 return 0;
2946zeroit: 3007zeroit:
2947 if (__ratelimit(&_rs)) 3008 if (__ratelimit(&_rs))
2948 btrfs_info(BTRFS_I(inode)->root->fs_info, 3009 btrfs_warn(BTRFS_I(inode)->root->fs_info,
2949 "csum failed ino %llu off %llu csum %u expected csum %u", 3010 "csum failed ino %llu off %llu csum %u expected csum %u",
2950 btrfs_ino(inode), start, csum, csum_expected); 3011 btrfs_ino(inode), start, csum, csum_expected);
2951 memset(kaddr + pgoff, 1, len); 3012 memset(kaddr + pgoff, 1, len);
@@ -3407,7 +3468,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
3407 3468
3408out: 3469out:
3409 if (ret) 3470 if (ret)
3410 btrfs_crit(root->fs_info, 3471 btrfs_err(root->fs_info,
3411 "could not do orphan cleanup %d", ret); 3472 "could not do orphan cleanup %d", ret);
3412 btrfs_free_path(path); 3473 btrfs_free_path(path);
3413 return ret; 3474 return ret;
@@ -3490,7 +3551,6 @@ static void btrfs_read_locked_inode(struct inode *inode)
3490 struct btrfs_path *path; 3551 struct btrfs_path *path;
3491 struct extent_buffer *leaf; 3552 struct extent_buffer *leaf;
3492 struct btrfs_inode_item *inode_item; 3553 struct btrfs_inode_item *inode_item;
3493 struct btrfs_timespec *tspec;
3494 struct btrfs_root *root = BTRFS_I(inode)->root; 3554 struct btrfs_root *root = BTRFS_I(inode)->root;
3495 struct btrfs_key location; 3555 struct btrfs_key location;
3496 unsigned long ptr; 3556 unsigned long ptr;
@@ -3527,17 +3587,19 @@ static void btrfs_read_locked_inode(struct inode *inode)
3527 i_gid_write(inode, btrfs_inode_gid(leaf, inode_item)); 3587 i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
3528 btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); 3588 btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
3529 3589
3530 tspec = btrfs_inode_atime(inode_item); 3590 inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime);
3531 inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec); 3591 inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime);
3532 inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); 3592
3593 inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime);
3594 inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime);
3533 3595
3534 tspec = btrfs_inode_mtime(inode_item); 3596 inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->ctime);
3535 inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec); 3597 inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->ctime);
3536 inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
3537 3598
3538 tspec = btrfs_inode_ctime(inode_item); 3599 BTRFS_I(inode)->i_otime.tv_sec =
3539 inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec); 3600 btrfs_timespec_sec(leaf, &inode_item->otime);
3540 inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); 3601 BTRFS_I(inode)->i_otime.tv_nsec =
3602 btrfs_timespec_nsec(leaf, &inode_item->otime);
3541 3603
3542 inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); 3604 inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
3543 BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); 3605 BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
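
The timestamp rework reads each on-disk pair directly through the item's embedded fields and additionally loads the new otime (creation time). A rough userspace sketch of decoding such a little-endian sec/nsec pair, assuming the btrfs_timespec layout of a 64-bit seconds field followed by a 32-bit nanoseconds field (the struct and helper names here are illustrative, not the kernel's):

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    /* illustrative mirror of the on-disk layout: __le64 sec, __le32 nsec */
    struct disk_timespec {
            uint8_t sec[8];
            uint8_t nsec[4];
    };

    static uint64_t le64_to_host(const uint8_t *p)
    {
            uint64_t v = 0;
            for (int i = 7; i >= 0; i--)
                    v = (v << 8) | p[i];
            return v;
    }

    static uint32_t le32_to_host(const uint8_t *p)
    {
            return (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
                   ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
    }

    int main(void)
    {
            struct disk_timespec otime;

            memset(&otime, 0, sizeof(otime));
            otime.sec[0] = 0x40;    /* 64 seconds, little endian */
            otime.nsec[0] = 0x07;   /* 7 nanoseconds */

            printf("otime: %llu.%09u\n",
                   (unsigned long long)le64_to_host(otime.sec),
                   le32_to_host(otime.nsec));
            return 0;
    }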
@@ -3608,7 +3670,6 @@ cache_acl:
3608 switch (inode->i_mode & S_IFMT) { 3670 switch (inode->i_mode & S_IFMT) {
3609 case S_IFREG: 3671 case S_IFREG:
3610 inode->i_mapping->a_ops = &btrfs_aops; 3672 inode->i_mapping->a_ops = &btrfs_aops;
3611 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
3612 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 3673 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
3613 inode->i_fop = &btrfs_file_operations; 3674 inode->i_fop = &btrfs_file_operations;
3614 inode->i_op = &btrfs_file_inode_operations; 3675 inode->i_op = &btrfs_file_inode_operations;
@@ -3623,7 +3684,6 @@ cache_acl:
3623 case S_IFLNK: 3684 case S_IFLNK:
3624 inode->i_op = &btrfs_symlink_inode_operations; 3685 inode->i_op = &btrfs_symlink_inode_operations;
3625 inode->i_mapping->a_ops = &btrfs_symlink_aops; 3686 inode->i_mapping->a_ops = &btrfs_symlink_aops;
3626 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
3627 break; 3687 break;
3628 default: 3688 default:
3629 inode->i_op = &btrfs_special_inode_operations; 3689 inode->i_op = &btrfs_special_inode_operations;
@@ -3658,21 +3718,26 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
3658 btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); 3718 btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
3659 btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); 3719 btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
3660 3720
3661 btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item), 3721 btrfs_set_token_timespec_sec(leaf, &item->atime,
3662 inode->i_atime.tv_sec, &token); 3722 inode->i_atime.tv_sec, &token);
3663 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item), 3723 btrfs_set_token_timespec_nsec(leaf, &item->atime,
3664 inode->i_atime.tv_nsec, &token); 3724 inode->i_atime.tv_nsec, &token);
3665 3725
3666 btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item), 3726 btrfs_set_token_timespec_sec(leaf, &item->mtime,
3667 inode->i_mtime.tv_sec, &token); 3727 inode->i_mtime.tv_sec, &token);
3668 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item), 3728 btrfs_set_token_timespec_nsec(leaf, &item->mtime,
3669 inode->i_mtime.tv_nsec, &token); 3729 inode->i_mtime.tv_nsec, &token);
3670 3730
3671 btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item), 3731 btrfs_set_token_timespec_sec(leaf, &item->ctime,
3672 inode->i_ctime.tv_sec, &token); 3732 inode->i_ctime.tv_sec, &token);
3673 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item), 3733 btrfs_set_token_timespec_nsec(leaf, &item->ctime,
3674 inode->i_ctime.tv_nsec, &token); 3734 inode->i_ctime.tv_nsec, &token);
3675 3735
3736 btrfs_set_token_timespec_sec(leaf, &item->otime,
3737 BTRFS_I(inode)->i_otime.tv_sec, &token);
3738 btrfs_set_token_timespec_nsec(leaf, &item->otime,
3739 BTRFS_I(inode)->i_otime.tv_nsec, &token);
3740
3676 btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), 3741 btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
3677 &token); 3742 &token);
3678 btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation, 3743 btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation,
@@ -5009,6 +5074,7 @@ static int fixup_tree_root_location(struct btrfs_root *root,
5009 struct btrfs_root *new_root; 5074 struct btrfs_root *new_root;
5010 struct btrfs_root_ref *ref; 5075 struct btrfs_root_ref *ref;
5011 struct extent_buffer *leaf; 5076 struct extent_buffer *leaf;
5077 struct btrfs_key key;
5012 int ret; 5078 int ret;
5013 int err = 0; 5079 int err = 0;
5014 5080
@@ -5019,9 +5085,12 @@ static int fixup_tree_root_location(struct btrfs_root *root,
5019 } 5085 }
5020 5086
5021 err = -ENOENT; 5087 err = -ENOENT;
5022 ret = btrfs_find_item(root->fs_info->tree_root, path, 5088 key.objectid = BTRFS_I(dir)->root->root_key.objectid;
5023 BTRFS_I(dir)->root->root_key.objectid, 5089 key.type = BTRFS_ROOT_REF_KEY;
5024 location->objectid, BTRFS_ROOT_REF_KEY, NULL); 5090 key.offset = location->objectid;
5091
5092 ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key, path,
5093 0, 0);
5025 if (ret) { 5094 if (ret) {
5026 if (ret < 0) 5095 if (ret < 0)
5027 err = ret; 5096 err = ret;
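
The replacement spells the search key out instead of going through the btrfs_find_item() wrapper: a btrfs key is the (objectid, type, offset) triple, ordered field by field. A small sketch of that ordering, with stand-in types rather than the kernel's struct btrfs_key, and an illustrative value standing in for BTRFS_ROOT_REF_KEY:

    #include <stdio.h>
    #include <stdint.h>

    /* stand-in for struct btrfs_key: compared field by field, in this order */
    struct key {
            uint64_t objectid;
            uint8_t  type;
            uint64_t offset;
    };

    static int key_cmp(const struct key *a, const struct key *b)
    {
            if (a->objectid != b->objectid)
                    return a->objectid < b->objectid ? -1 : 1;
            if (a->type != b->type)
                    return a->type < b->type ? -1 : 1;
            if (a->offset != b->offset)
                    return a->offset < b->offset ? -1 : 1;
            return 0;
    }

    int main(void)
    {
            /* the hunk above builds: (subvol root id, ROOT_REF, target objectid) */
            struct key a = { 5, 156, 257 };     /* 156 stands in for BTRFS_ROOT_REF_KEY */
            struct key b = { 5, 156, 258 };

            printf("cmp = %d\n", key_cmp(&a, &b));  /* -1: a sorts first */
            return 0;
    }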
@@ -5260,7 +5329,10 @@ static struct inode *new_simple_dir(struct super_block *s,
5260 inode->i_op = &btrfs_dir_ro_inode_operations; 5329 inode->i_op = &btrfs_dir_ro_inode_operations;
5261 inode->i_fop = &simple_dir_operations; 5330 inode->i_fop = &simple_dir_operations;
5262 inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; 5331 inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
5263 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 5332 inode->i_mtime = CURRENT_TIME;
5333 inode->i_atime = inode->i_mtime;
5334 inode->i_ctime = inode->i_mtime;
5335 BTRFS_I(inode)->i_otime = inode->i_mtime;
5264 5336
5265 return inode; 5337 return inode;
5266} 5338}
@@ -5828,7 +5900,12 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
5828 5900
5829 inode_init_owner(inode, dir, mode); 5901 inode_init_owner(inode, dir, mode);
5830 inode_set_bytes(inode, 0); 5902 inode_set_bytes(inode, 0);
5831 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 5903
5904 inode->i_mtime = CURRENT_TIME;
5905 inode->i_atime = inode->i_mtime;
5906 inode->i_ctime = inode->i_mtime;
5907 BTRFS_I(inode)->i_otime = inode->i_mtime;
5908
5832 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 5909 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
5833 struct btrfs_inode_item); 5910 struct btrfs_inode_item);
5834 memset_extent_buffer(path->nodes[0], 0, (unsigned long)inode_item, 5911 memset_extent_buffer(path->nodes[0], 0, (unsigned long)inode_item,
@@ -6088,7 +6165,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
6088 inode->i_fop = &btrfs_file_operations; 6165 inode->i_fop = &btrfs_file_operations;
6089 inode->i_op = &btrfs_file_inode_operations; 6166 inode->i_op = &btrfs_file_inode_operations;
6090 inode->i_mapping->a_ops = &btrfs_aops; 6167 inode->i_mapping->a_ops = &btrfs_aops;
6091 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
6092 6168
6093 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 6169 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
6094 if (err) 6170 if (err)
@@ -6255,8 +6331,10 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
6255 6331
6256out_fail: 6332out_fail:
6257 btrfs_end_transaction(trans, root); 6333 btrfs_end_transaction(trans, root);
6258 if (drop_on_err) 6334 if (drop_on_err) {
6335 inode_dec_link_count(inode);
6259 iput(inode); 6336 iput(inode);
6337 }
6260 btrfs_balance_delayed_items(root); 6338 btrfs_balance_delayed_items(root);
6261 btrfs_btree_balance_dirty(root); 6339 btrfs_btree_balance_dirty(root);
6262 return err; 6340 return err;
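
The mkdir fix pairs the final iput() with an inode_dec_link_count(), so a failed mkdir does not evict the never-linked inode while it still carries a stale link count. A toy sketch of why the pairing matters, with hypothetical stand-ins for the inode and its helpers:

    #include <stdio.h>
    #include <stdlib.h>

    struct toy_inode {
            int i_nlink;    /* directory entries pointing at the inode */
            int i_count;    /* in-memory references */
    };

    static void toy_iput(struct toy_inode *inode)
    {
            if (--inode->i_count == 0) {
                    /* eviction: a never-linked inode must be back at nlink 0 */
                    if (inode->i_nlink != 0)
                            fprintf(stderr, "evicting inode with nlink %d\n",
                                    inode->i_nlink);
                    free(inode);
            }
    }

    int main(void)
    {
            struct toy_inode *inode = calloc(1, sizeof(*inode));

            if (!inode)
                    return 1;
            inode->i_count = 1;
            inode->i_nlink = 1;     /* inode creation starts the link count */

            /* mkdir failed before a directory entry was created, so undo the
             * provisional link before dropping the last reference */
            inode->i_nlink--;
            toy_iput(inode);
            return 0;
    }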
@@ -7135,11 +7213,12 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
7135 u64 start = iblock << inode->i_blkbits; 7213 u64 start = iblock << inode->i_blkbits;
7136 u64 lockstart, lockend; 7214 u64 lockstart, lockend;
7137 u64 len = bh_result->b_size; 7215 u64 len = bh_result->b_size;
7216 u64 orig_len = len;
7138 int unlock_bits = EXTENT_LOCKED; 7217 int unlock_bits = EXTENT_LOCKED;
7139 int ret = 0; 7218 int ret = 0;
7140 7219
7141 if (create) 7220 if (create)
7142 unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY; 7221 unlock_bits |= EXTENT_DIRTY;
7143 else 7222 else
7144 len = min_t(u64, len, root->sectorsize); 7223 len = min_t(u64, len, root->sectorsize);
7145 7224
@@ -7270,14 +7349,12 @@ unlock:
7270 if (start + len > i_size_read(inode)) 7349 if (start + len > i_size_read(inode))
7271 i_size_write(inode, start + len); 7350 i_size_write(inode, start + len);
7272 7351
7273 spin_lock(&BTRFS_I(inode)->lock); 7352 if (len < orig_len) {
7274 BTRFS_I(inode)->outstanding_extents++; 7353 spin_lock(&BTRFS_I(inode)->lock);
7275 spin_unlock(&BTRFS_I(inode)->lock); 7354 BTRFS_I(inode)->outstanding_extents++;
7276 7355 spin_unlock(&BTRFS_I(inode)->lock);
7277 ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, 7356 }
7278 lockstart + len - 1, EXTENT_DELALLOC, NULL, 7357 btrfs_free_reserved_data_space(inode, len);
7279 &cached_state, GFP_NOFS);
7280 BUG_ON(ret);
7281 } 7358 }
7282 7359
7283 /* 7360 /*
@@ -7806,8 +7883,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
7806 } 7883 }
7807 7884
7808 /* async crcs make it difficult to collect full stripe writes. */ 7885 /* async crcs make it difficult to collect full stripe writes. */
7809 if (btrfs_get_alloc_profile(root, 1) & 7886 if (btrfs_get_alloc_profile(root, 1) & BTRFS_BLOCK_GROUP_RAID56_MASK)
7810 (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))
7811 async_submit = 0; 7887 async_submit = 0;
7812 else 7888 else
7813 async_submit = 1; 7889 async_submit = 1;
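
The test above collapses the RAID5|RAID6 pair into the BTRFS_BLOCK_GROUP_RAID56_MASK helper, a pattern several later hunks reuse. A sketch of the idea with illustrative flag values (the real block-group flag bits differ):

    #include <stdio.h>

    /* illustrative flag bits, not the kernel's block group values */
    #define BG_RAID5        (1u << 7)
    #define BG_RAID6        (1u << 8)
    #define BG_RAID56_MASK  (BG_RAID5 | BG_RAID6)

    int main(void)
    {
            unsigned profile = BG_RAID6;

            /* one mask test replaces checking the two flags separately */
            if (profile & BG_RAID56_MASK)
                    printf("parity raid: avoid async checksum submission\n");
            else
                    printf("async checksum submission is fine\n");
            return 0;
    }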
@@ -8054,8 +8130,6 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
8054 else if (ret >= 0 && (size_t)ret < count) 8130 else if (ret >= 0 && (size_t)ret < count)
8055 btrfs_delalloc_release_space(inode, 8131 btrfs_delalloc_release_space(inode,
8056 count - (size_t)ret); 8132 count - (size_t)ret);
8057 else
8058 btrfs_delalloc_release_metadata(inode, 0);
8059 } 8133 }
8060out: 8134out:
8061 if (wakeup) 8135 if (wakeup)
@@ -8576,6 +8650,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
8576 8650
8577 ei->delayed_node = NULL; 8651 ei->delayed_node = NULL;
8578 8652
8653 ei->i_otime.tv_sec = 0;
8654 ei->i_otime.tv_nsec = 0;
8655
8579 inode = &ei->vfs_inode; 8656 inode = &ei->vfs_inode;
8580 extent_map_tree_init(&ei->extent_tree); 8657 extent_map_tree_init(&ei->extent_tree);
8581 extent_io_tree_init(&ei->io_tree, &inode->i_data); 8658 extent_io_tree_init(&ei->io_tree, &inode->i_data);
@@ -9201,7 +9278,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
9201 inode->i_fop = &btrfs_file_operations; 9278 inode->i_fop = &btrfs_file_operations;
9202 inode->i_op = &btrfs_file_inode_operations; 9279 inode->i_op = &btrfs_file_inode_operations;
9203 inode->i_mapping->a_ops = &btrfs_aops; 9280 inode->i_mapping->a_ops = &btrfs_aops;
9204 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
9205 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 9281 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
9206 9282
9207 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 9283 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
@@ -9245,7 +9321,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
9245 9321
9246 inode->i_op = &btrfs_symlink_inode_operations; 9322 inode->i_op = &btrfs_symlink_inode_operations;
9247 inode->i_mapping->a_ops = &btrfs_symlink_aops; 9323 inode->i_mapping->a_ops = &btrfs_symlink_aops;
9248 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
9249 inode_set_bytes(inode, name_len); 9324 inode_set_bytes(inode, name_len);
9250 btrfs_i_size_write(inode, name_len); 9325 btrfs_i_size_write(inode, name_len);
9251 err = btrfs_update_inode(trans, root, inode); 9326 err = btrfs_update_inode(trans, root, inode);
@@ -9457,7 +9532,6 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
9457 inode->i_op = &btrfs_file_inode_operations; 9532 inode->i_op = &btrfs_file_inode_operations;
9458 9533
9459 inode->i_mapping->a_ops = &btrfs_aops; 9534 inode->i_mapping->a_ops = &btrfs_aops;
9460 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
9461 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 9535 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
9462 9536
9463 ret = btrfs_init_inode_security(trans, inode, dir, NULL); 9537 ret = btrfs_init_inode_security(trans, inode, dir, NULL);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 48b60dbf807f..97159a8e91d4 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1431,9 +1431,8 @@ static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
1431 qgroup = u64_to_ptr(unode->aux); 1431 qgroup = u64_to_ptr(unode->aux);
1432 qgroup->rfer += sign * oper->num_bytes; 1432 qgroup->rfer += sign * oper->num_bytes;
1433 qgroup->rfer_cmpr += sign * oper->num_bytes; 1433 qgroup->rfer_cmpr += sign * oper->num_bytes;
1434 WARN_ON(sign < 0 && qgroup->excl < oper->num_bytes);
1434 qgroup->excl += sign * oper->num_bytes; 1435 qgroup->excl += sign * oper->num_bytes;
1435 if (sign < 0)
1436 WARN_ON(qgroup->excl < oper->num_bytes);
1437 qgroup->excl_cmpr += sign * oper->num_bytes; 1436 qgroup->excl_cmpr += sign * oper->num_bytes;
1438 qgroup_dirty(fs_info, qgroup); 1437 qgroup_dirty(fs_info, qgroup);
1439 1438
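
The qgroup change folds the two-line underflow check into a single WARN_ON placed before the update, so the invariant is tested against the value about to be decremented rather than the already-modified one. A standalone sketch of the difference, using assert() in place of WARN_ON:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    static uint64_t excl = 4096;

    static void account(int sign, uint64_t num_bytes)
    {
            /* check before the update, as the patched code does: a removal
             * must never take the exclusive count below zero */
            assert(!(sign < 0 && excl < num_bytes));
            excl += (int64_t)sign * (int64_t)num_bytes;
    }

    int main(void)
    {
            account(-1, 4096);      /* fine: excl drops exactly to 0 */
            printf("excl = %llu\n", (unsigned long long)excl);
            /* checking only after the update compares the wrapped-around
             * unsigned value and can miss the underflow it is meant to catch */
            return 0;
    }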
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 8ab2a17bbba8..5264858ed768 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -58,15 +58,6 @@
58 */ 58 */
59#define RBIO_CACHE_READY_BIT 3 59#define RBIO_CACHE_READY_BIT 3
60 60
61/*
62 * bbio and raid_map are managed by the caller, so we shouldn't free
63 * them here. Besides that, rbios with this flag set should not be
64 * cached, because we need raid_map to check whether two rbios cover
65 * the same stripe, but it is very likely that the caller has already
66 * freed the raid_map, so don't cache those rbios.
67 */
68#define RBIO_HOLD_BBIO_MAP_BIT 4
69
70#define RBIO_CACHE_SIZE 1024 61#define RBIO_CACHE_SIZE 1024
71 62
72enum btrfs_rbio_ops { 63enum btrfs_rbio_ops {
@@ -79,13 +70,6 @@ struct btrfs_raid_bio {
79 struct btrfs_fs_info *fs_info; 70 struct btrfs_fs_info *fs_info;
80 struct btrfs_bio *bbio; 71 struct btrfs_bio *bbio;
81 72
82 /*
83 * logical block numbers for the start of each stripe
84 * The last one or two are p/q. These are sorted,
85 * so raid_map[0] is the start of our full stripe
86 */
87 u64 *raid_map;
88
89 /* while we're doing rmw on a stripe 73 /* while we're doing rmw on a stripe
90 * we put it into a hash table so we can 74 * we put it into a hash table so we can
91 * lock the stripe and merge more rbios 75 * lock the stripe and merge more rbios
@@ -303,7 +287,7 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
303 */ 287 */
304static int rbio_bucket(struct btrfs_raid_bio *rbio) 288static int rbio_bucket(struct btrfs_raid_bio *rbio)
305{ 289{
306 u64 num = rbio->raid_map[0]; 290 u64 num = rbio->bbio->raid_map[0];
307 291
308 /* 292 /*
309 * we shift down quite a bit. We're using byte 293 * we shift down quite a bit. We're using byte
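
With raid_map now embedded in the btrfs_bio, the bucket is still derived from raid_map[0], the logical start of the full stripe. A sketch of bucketing on that start address, assuming a power-of-two table and a shift that drops the low, always-aligned offset bits (the real function uses a kernel hash helper and the kernel's table size):

    #include <stdio.h>
    #include <stdint.h>

    #define TABLE_BITS 8
    #define TABLE_SIZE (1u << TABLE_BITS)

    static unsigned bucket_for(uint64_t full_stripe_start)
    {
            /* stripe starts are large, aligned values; shift the low bits
             * away before folding into the table */
            return (unsigned)((full_stripe_start >> 16) & (TABLE_SIZE - 1));
    }

    int main(void)
    {
            uint64_t raid_map0 = 123ULL << 30;  /* hypothetical stripe start */

            printf("bucket %u of %u\n", bucket_for(raid_map0), TABLE_SIZE);
            return 0;
    }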
@@ -606,8 +590,8 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
606 test_bit(RBIO_CACHE_BIT, &cur->flags)) 590 test_bit(RBIO_CACHE_BIT, &cur->flags))
607 return 0; 591 return 0;
608 592
609 if (last->raid_map[0] != 593 if (last->bbio->raid_map[0] !=
610 cur->raid_map[0]) 594 cur->bbio->raid_map[0])
611 return 0; 595 return 0;
612 596
613 /* we can't merge with different operations */ 597 /* we can't merge with different operations */
@@ -689,7 +673,7 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
689 spin_lock_irqsave(&h->lock, flags); 673 spin_lock_irqsave(&h->lock, flags);
690 list_for_each_entry(cur, &h->hash_list, hash_list) { 674 list_for_each_entry(cur, &h->hash_list, hash_list) {
691 walk++; 675 walk++;
692 if (cur->raid_map[0] == rbio->raid_map[0]) { 676 if (cur->bbio->raid_map[0] == rbio->bbio->raid_map[0]) {
693 spin_lock(&cur->bio_list_lock); 677 spin_lock(&cur->bio_list_lock);
694 678
695 /* can we steal this cached rbio's pages? */ 679 /* can we steal this cached rbio's pages? */
@@ -841,21 +825,6 @@ done_nolock:
841 remove_rbio_from_cache(rbio); 825 remove_rbio_from_cache(rbio);
842} 826}
843 827
844static inline void
845__free_bbio_and_raid_map(struct btrfs_bio *bbio, u64 *raid_map, int need)
846{
847 if (need) {
848 kfree(raid_map);
849 kfree(bbio);
850 }
851}
852
853static inline void free_bbio_and_raid_map(struct btrfs_raid_bio *rbio)
854{
855 __free_bbio_and_raid_map(rbio->bbio, rbio->raid_map,
856 !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags));
857}
858
859static void __free_raid_bio(struct btrfs_raid_bio *rbio) 828static void __free_raid_bio(struct btrfs_raid_bio *rbio)
860{ 829{
861 int i; 830 int i;
@@ -875,8 +844,7 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio)
875 } 844 }
876 } 845 }
877 846
878 free_bbio_and_raid_map(rbio); 847 btrfs_put_bbio(rbio->bbio);
879
880 kfree(rbio); 848 kfree(rbio);
881} 849}
882 850
@@ -985,8 +953,7 @@ static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
985 * this does not allocate any pages for rbio->pages. 953 * this does not allocate any pages for rbio->pages.
986 */ 954 */
987static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root, 955static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
988 struct btrfs_bio *bbio, u64 *raid_map, 956 struct btrfs_bio *bbio, u64 stripe_len)
989 u64 stripe_len)
990{ 957{
991 struct btrfs_raid_bio *rbio; 958 struct btrfs_raid_bio *rbio;
992 int nr_data = 0; 959 int nr_data = 0;
@@ -1007,7 +974,6 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
1007 INIT_LIST_HEAD(&rbio->stripe_cache); 974 INIT_LIST_HEAD(&rbio->stripe_cache);
1008 INIT_LIST_HEAD(&rbio->hash_list); 975 INIT_LIST_HEAD(&rbio->hash_list);
1009 rbio->bbio = bbio; 976 rbio->bbio = bbio;
1010 rbio->raid_map = raid_map;
1011 rbio->fs_info = root->fs_info; 977 rbio->fs_info = root->fs_info;
1012 rbio->stripe_len = stripe_len; 978 rbio->stripe_len = stripe_len;
1013 rbio->nr_pages = num_pages; 979 rbio->nr_pages = num_pages;
@@ -1028,10 +994,12 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
1028 rbio->bio_pages = p + sizeof(struct page *) * num_pages; 994 rbio->bio_pages = p + sizeof(struct page *) * num_pages;
1029 rbio->dbitmap = p + sizeof(struct page *) * num_pages * 2; 995 rbio->dbitmap = p + sizeof(struct page *) * num_pages * 2;
1030 996
1031 if (raid_map[real_stripes - 1] == RAID6_Q_STRIPE) 997 if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
998 nr_data = real_stripes - 1;
999 else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
1032 nr_data = real_stripes - 2; 1000 nr_data = real_stripes - 2;
1033 else 1001 else
1034 nr_data = real_stripes - 1; 1002 BUG();
1035 1003
1036 rbio->nr_data = nr_data; 1004 rbio->nr_data = nr_data;
1037 return rbio; 1005 return rbio;
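
Dropping raid_map's Q-stripe sentinel means the data stripe count is now derived from the block group type: RAID5 reserves one parity stripe, RAID6 two. A compact sketch of that branch, again with illustrative flag values:

    #include <stdio.h>
    #include <stdlib.h>

    #define BG_RAID5 (1u << 7)  /* illustrative, not the kernel bits */
    #define BG_RAID6 (1u << 8)

    static int nr_data_stripes(unsigned map_type, int real_stripes)
    {
            if (map_type & BG_RAID5)
                    return real_stripes - 1;    /* P only */
            if (map_type & BG_RAID6)
                    return real_stripes - 2;    /* P and Q */
            abort();                            /* caller guarantees RAID56 */
    }

    int main(void)
    {
            printf("raid6, 6 stripes -> %d data\n", nr_data_stripes(BG_RAID6, 6));
            return 0;
    }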
@@ -1182,7 +1150,7 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio)
1182 spin_lock_irq(&rbio->bio_list_lock); 1150 spin_lock_irq(&rbio->bio_list_lock);
1183 bio_list_for_each(bio, &rbio->bio_list) { 1151 bio_list_for_each(bio, &rbio->bio_list) {
1184 start = (u64)bio->bi_iter.bi_sector << 9; 1152 start = (u64)bio->bi_iter.bi_sector << 9;
1185 stripe_offset = start - rbio->raid_map[0]; 1153 stripe_offset = start - rbio->bbio->raid_map[0];
1186 page_index = stripe_offset >> PAGE_CACHE_SHIFT; 1154 page_index = stripe_offset >> PAGE_CACHE_SHIFT;
1187 1155
1188 for (i = 0; i < bio->bi_vcnt; i++) { 1156 for (i = 0; i < bio->bi_vcnt; i++) {
@@ -1402,7 +1370,7 @@ static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
1402 logical <<= 9; 1370 logical <<= 9;
1403 1371
1404 for (i = 0; i < rbio->nr_data; i++) { 1372 for (i = 0; i < rbio->nr_data; i++) {
1405 stripe_start = rbio->raid_map[i]; 1373 stripe_start = rbio->bbio->raid_map[i];
1406 if (logical >= stripe_start && 1374 if (logical >= stripe_start &&
1407 logical < stripe_start + rbio->stripe_len) { 1375 logical < stripe_start + rbio->stripe_len) {
1408 return i; 1376 return i;
@@ -1776,17 +1744,16 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1776 * our main entry point for writes from the rest of the FS. 1744 * our main entry point for writes from the rest of the FS.
1777 */ 1745 */
1778int raid56_parity_write(struct btrfs_root *root, struct bio *bio, 1746int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
1779 struct btrfs_bio *bbio, u64 *raid_map, 1747 struct btrfs_bio *bbio, u64 stripe_len)
1780 u64 stripe_len)
1781{ 1748{
1782 struct btrfs_raid_bio *rbio; 1749 struct btrfs_raid_bio *rbio;
1783 struct btrfs_plug_cb *plug = NULL; 1750 struct btrfs_plug_cb *plug = NULL;
1784 struct blk_plug_cb *cb; 1751 struct blk_plug_cb *cb;
1785 int ret; 1752 int ret;
1786 1753
1787 rbio = alloc_rbio(root, bbio, raid_map, stripe_len); 1754 rbio = alloc_rbio(root, bbio, stripe_len);
1788 if (IS_ERR(rbio)) { 1755 if (IS_ERR(rbio)) {
1789 __free_bbio_and_raid_map(bbio, raid_map, 1); 1756 btrfs_put_bbio(bbio);
1790 return PTR_ERR(rbio); 1757 return PTR_ERR(rbio);
1791 } 1758 }
1792 bio_list_add(&rbio->bio_list, bio); 1759 bio_list_add(&rbio->bio_list, bio);
@@ -1885,9 +1852,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1885 } 1852 }
1886 1853
1887 /* all raid6 handling here */ 1854 /* all raid6 handling here */
1888 if (rbio->raid_map[rbio->real_stripes - 1] == 1855 if (rbio->bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) {
1889 RAID6_Q_STRIPE) {
1890
1891 /* 1856 /*
1892 * single failure, rebuild from parity raid5 1857 * single failure, rebuild from parity raid5
1893 * style 1858 * style
@@ -1922,8 +1887,9 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1922 * here due to a crc mismatch and we can't give them the 1887 * here due to a crc mismatch and we can't give them the
1923 * data they want 1888 * data they want
1924 */ 1889 */
1925 if (rbio->raid_map[failb] == RAID6_Q_STRIPE) { 1890 if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) {
1926 if (rbio->raid_map[faila] == RAID5_P_STRIPE) { 1891 if (rbio->bbio->raid_map[faila] ==
1892 RAID5_P_STRIPE) {
1927 err = -EIO; 1893 err = -EIO;
1928 goto cleanup; 1894 goto cleanup;
1929 } 1895 }
@@ -1934,7 +1900,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1934 goto pstripe; 1900 goto pstripe;
1935 } 1901 }
1936 1902
1937 if (rbio->raid_map[failb] == RAID5_P_STRIPE) { 1903 if (rbio->bbio->raid_map[failb] == RAID5_P_STRIPE) {
1938 raid6_datap_recov(rbio->real_stripes, 1904 raid6_datap_recov(rbio->real_stripes,
1939 PAGE_SIZE, faila, pointers); 1905 PAGE_SIZE, faila, pointers);
1940 } else { 1906 } else {
@@ -2001,8 +1967,7 @@ cleanup:
2001 1967
2002cleanup_io: 1968cleanup_io:
2003 if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { 1969 if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
2004 if (err == 0 && 1970 if (err == 0)
2005 !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags))
2006 cache_rbio_pages(rbio); 1971 cache_rbio_pages(rbio);
2007 else 1972 else
2008 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 1973 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
@@ -2156,15 +2121,16 @@ cleanup:
2156 * of the drive. 2121 * of the drive.
2157 */ 2122 */
2158int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, 2123int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
2159 struct btrfs_bio *bbio, u64 *raid_map, 2124 struct btrfs_bio *bbio, u64 stripe_len,
2160 u64 stripe_len, int mirror_num, int generic_io) 2125 int mirror_num, int generic_io)
2161{ 2126{
2162 struct btrfs_raid_bio *rbio; 2127 struct btrfs_raid_bio *rbio;
2163 int ret; 2128 int ret;
2164 2129
2165 rbio = alloc_rbio(root, bbio, raid_map, stripe_len); 2130 rbio = alloc_rbio(root, bbio, stripe_len);
2166 if (IS_ERR(rbio)) { 2131 if (IS_ERR(rbio)) {
2167 __free_bbio_and_raid_map(bbio, raid_map, generic_io); 2132 if (generic_io)
2133 btrfs_put_bbio(bbio);
2168 return PTR_ERR(rbio); 2134 return PTR_ERR(rbio);
2169 } 2135 }
2170 2136
@@ -2175,7 +2141,8 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
2175 rbio->faila = find_logical_bio_stripe(rbio, bio); 2141 rbio->faila = find_logical_bio_stripe(rbio, bio);
2176 if (rbio->faila == -1) { 2142 if (rbio->faila == -1) {
2177 BUG(); 2143 BUG();
2178 __free_bbio_and_raid_map(bbio, raid_map, generic_io); 2144 if (generic_io)
2145 btrfs_put_bbio(bbio);
2179 kfree(rbio); 2146 kfree(rbio);
2180 return -EIO; 2147 return -EIO;
2181 } 2148 }
@@ -2184,7 +2151,7 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
2184 btrfs_bio_counter_inc_noblocked(root->fs_info); 2151 btrfs_bio_counter_inc_noblocked(root->fs_info);
2185 rbio->generic_bio_cnt = 1; 2152 rbio->generic_bio_cnt = 1;
2186 } else { 2153 } else {
2187 set_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags); 2154 btrfs_get_bbio(bbio);
2188 } 2155 }
2189 2156
2190 /* 2157 /*
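
Replacing the RBIO_HOLD_BBIO_MAP_BIT bookkeeping with btrfs_get_bbio()/btrfs_put_bbio() turns ownership into plain reference counting: whoever may still touch the bbio holds a reference, and the last put frees it. A self-contained sketch of that pattern with C11 atomics (the struct and helpers are stand-ins, not the kernel API):

    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct bbio {
            atomic_int refs;
            /* ... mapping data, raid_map, etc. ... */
    };

    static struct bbio *bbio_alloc(void)
    {
            struct bbio *b = calloc(1, sizeof(*b));

            if (b)
                    atomic_init(&b->refs, 1);   /* caller owns one reference */
            return b;
    }

    static void bbio_get(struct bbio *b)
    {
            atomic_fetch_add(&b->refs, 1);
    }

    static void bbio_put(struct bbio *b)
    {
            /* fetch_sub returns the old value: 1 means we were the last user */
            if (b && atomic_fetch_sub(&b->refs, 1) == 1)
                    free(b);
    }

    int main(void)
    {
            struct bbio *b = bbio_alloc();

            if (!b)
                    return 1;
            bbio_get(b);    /* e.g. a non-generic rbio keeping the bbio alive */
            bbio_put(b);    /* rbio done */
            bbio_put(b);    /* original owner done: freed here */
            printf("bbio lifetime handled by refcount\n");
            return 0;
    }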
@@ -2240,14 +2207,14 @@ static void read_rebuild_work(struct btrfs_work *work)
2240 2207
2241struct btrfs_raid_bio * 2208struct btrfs_raid_bio *
2242raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio, 2209raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
2243 struct btrfs_bio *bbio, u64 *raid_map, 2210 struct btrfs_bio *bbio, u64 stripe_len,
2244 u64 stripe_len, struct btrfs_device *scrub_dev, 2211 struct btrfs_device *scrub_dev,
2245 unsigned long *dbitmap, int stripe_nsectors) 2212 unsigned long *dbitmap, int stripe_nsectors)
2246{ 2213{
2247 struct btrfs_raid_bio *rbio; 2214 struct btrfs_raid_bio *rbio;
2248 int i; 2215 int i;
2249 2216
2250 rbio = alloc_rbio(root, bbio, raid_map, stripe_len); 2217 rbio = alloc_rbio(root, bbio, stripe_len);
2251 if (IS_ERR(rbio)) 2218 if (IS_ERR(rbio))
2252 return NULL; 2219 return NULL;
2253 bio_list_add(&rbio->bio_list, bio); 2220 bio_list_add(&rbio->bio_list, bio);
@@ -2279,10 +2246,10 @@ void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio,
2279 int stripe_offset; 2246 int stripe_offset;
2280 int index; 2247 int index;
2281 2248
2282 ASSERT(logical >= rbio->raid_map[0]); 2249 ASSERT(logical >= rbio->bbio->raid_map[0]);
2283 ASSERT(logical + PAGE_SIZE <= rbio->raid_map[0] + 2250 ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] +
2284 rbio->stripe_len * rbio->nr_data); 2251 rbio->stripe_len * rbio->nr_data);
2285 stripe_offset = (int)(logical - rbio->raid_map[0]); 2252 stripe_offset = (int)(logical - rbio->bbio->raid_map[0]);
2286 index = stripe_offset >> PAGE_CACHE_SHIFT; 2253 index = stripe_offset >> PAGE_CACHE_SHIFT;
2287 rbio->bio_pages[index] = page; 2254 rbio->bio_pages[index] = page;
2288} 2255}
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index 31d4a157b5e3..2b5d7977d83b 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -43,16 +43,15 @@ struct btrfs_raid_bio;
43struct btrfs_device; 43struct btrfs_device;
44 44
45int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, 45int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
46 struct btrfs_bio *bbio, u64 *raid_map, 46 struct btrfs_bio *bbio, u64 stripe_len,
47 u64 stripe_len, int mirror_num, int generic_io); 47 int mirror_num, int generic_io);
48int raid56_parity_write(struct btrfs_root *root, struct bio *bio, 48int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
49 struct btrfs_bio *bbio, u64 *raid_map, 49 struct btrfs_bio *bbio, u64 stripe_len);
50 u64 stripe_len);
51 50
52struct btrfs_raid_bio * 51struct btrfs_raid_bio *
53raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio, 52raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
54 struct btrfs_bio *bbio, u64 *raid_map, 53 struct btrfs_bio *bbio, u64 stripe_len,
55 u64 stripe_len, struct btrfs_device *scrub_dev, 54 struct btrfs_device *scrub_dev,
56 unsigned long *dbitmap, int stripe_nsectors); 55 unsigned long *dbitmap, int stripe_nsectors);
57void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio, 56void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio,
58 struct page *page, u64 logical); 57 struct page *page, u64 logical);
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index b63ae20618fb..0e7beea92b4c 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -66,7 +66,6 @@ struct reada_extctl {
66struct reada_extent { 66struct reada_extent {
67 u64 logical; 67 u64 logical;
68 struct btrfs_key top; 68 struct btrfs_key top;
69 u32 blocksize;
70 int err; 69 int err;
71 struct list_head extctl; 70 struct list_head extctl;
72 int refcnt; 71 int refcnt;
@@ -349,7 +348,6 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
349 348
350 blocksize = root->nodesize; 349 blocksize = root->nodesize;
351 re->logical = logical; 350 re->logical = logical;
352 re->blocksize = blocksize;
353 re->top = *top; 351 re->top = *top;
354 INIT_LIST_HEAD(&re->extctl); 352 INIT_LIST_HEAD(&re->extctl);
355 spin_lock_init(&re->lock); 353 spin_lock_init(&re->lock);
@@ -463,7 +461,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
463 spin_unlock(&fs_info->reada_lock); 461 spin_unlock(&fs_info->reada_lock);
464 btrfs_dev_replace_unlock(&fs_info->dev_replace); 462 btrfs_dev_replace_unlock(&fs_info->dev_replace);
465 463
466 kfree(bbio); 464 btrfs_put_bbio(bbio);
467 return re; 465 return re;
468 466
469error: 467error:
@@ -488,7 +486,7 @@ error:
488 kref_put(&zone->refcnt, reada_zone_release); 486 kref_put(&zone->refcnt, reada_zone_release);
489 spin_unlock(&fs_info->reada_lock); 487 spin_unlock(&fs_info->reada_lock);
490 } 488 }
491 kfree(bbio); 489 btrfs_put_bbio(bbio);
492 kfree(re); 490 kfree(re);
493 return re_exist; 491 return re_exist;
494} 492}
@@ -660,7 +658,6 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
660 int mirror_num = 0; 658 int mirror_num = 0;
661 struct extent_buffer *eb = NULL; 659 struct extent_buffer *eb = NULL;
662 u64 logical; 660 u64 logical;
663 u32 blocksize;
664 int ret; 661 int ret;
665 int i; 662 int i;
666 int need_kick = 0; 663 int need_kick = 0;
@@ -694,7 +691,7 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
694 spin_unlock(&fs_info->reada_lock); 691 spin_unlock(&fs_info->reada_lock);
695 return 0; 692 return 0;
696 } 693 }
697 dev->reada_next = re->logical + re->blocksize; 694 dev->reada_next = re->logical + fs_info->tree_root->nodesize;
698 re->refcnt++; 695 re->refcnt++;
699 696
700 spin_unlock(&fs_info->reada_lock); 697 spin_unlock(&fs_info->reada_lock);
@@ -709,7 +706,6 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
709 } 706 }
710 } 707 }
711 logical = re->logical; 708 logical = re->logical;
712 blocksize = re->blocksize;
713 709
714 spin_lock(&re->lock); 710 spin_lock(&re->lock);
715 if (re->scheduled_for == NULL) { 711 if (re->scheduled_for == NULL) {
@@ -724,8 +720,8 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
724 return 0; 720 return 0;
725 721
726 atomic_inc(&dev->reada_in_flight); 722 atomic_inc(&dev->reada_in_flight);
727 ret = reada_tree_block_flagged(fs_info->extent_root, logical, blocksize, 723 ret = reada_tree_block_flagged(fs_info->extent_root, logical,
728 mirror_num, &eb); 724 mirror_num, &eb);
729 if (ret) 725 if (ret)
730 __readahead_hook(fs_info->extent_root, NULL, logical, ret); 726 __readahead_hook(fs_info->extent_root, NULL, logical, ret);
731 else if (eb) 727 else if (eb)
@@ -851,7 +847,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
851 break; 847 break;
852 printk(KERN_DEBUG 848 printk(KERN_DEBUG
853 " re: logical %llu size %u empty %d for %lld", 849 " re: logical %llu size %u empty %d for %lld",
854 re->logical, re->blocksize, 850 re->logical, fs_info->tree_root->nodesize,
855 list_empty(&re->extctl), re->scheduled_for ? 851 list_empty(&re->extctl), re->scheduled_for ?
856 re->scheduled_for->devid : -1); 852 re->scheduled_for->devid : -1);
857 853
@@ -886,7 +882,8 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
886 } 882 }
887 printk(KERN_DEBUG 883 printk(KERN_DEBUG
888 "re: logical %llu size %u list empty %d for %lld", 884 "re: logical %llu size %u list empty %d for %lld",
889 re->logical, re->blocksize, list_empty(&re->extctl), 885 re->logical, fs_info->tree_root->nodesize,
886 list_empty(&re->extctl),
890 re->scheduled_for ? re->scheduled_for->devid : -1); 887 re->scheduled_for ? re->scheduled_for->devid : -1);
891 for (i = 0; i < re->nzones; ++i) { 888 for (i = 0; i < re->nzones; ++i) {
892 printk(KERN_CONT " zone %llu-%llu devs", 889 printk(KERN_CONT " zone %llu-%llu devs",
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 74257d6436ad..d83085381bcc 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2855,9 +2855,10 @@ static void update_processed_blocks(struct reloc_control *rc,
2855 } 2855 }
2856} 2856}
2857 2857
2858static int tree_block_processed(u64 bytenr, u32 blocksize, 2858static int tree_block_processed(u64 bytenr, struct reloc_control *rc)
2859 struct reloc_control *rc)
2860{ 2859{
2860 u32 blocksize = rc->extent_root->nodesize;
2861
2861 if (test_range_bit(&rc->processed_blocks, bytenr, 2862 if (test_range_bit(&rc->processed_blocks, bytenr,
2862 bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL)) 2863 bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
2863 return 1; 2864 return 1;
@@ -2965,8 +2966,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2965 while (rb_node) { 2966 while (rb_node) {
2966 block = rb_entry(rb_node, struct tree_block, rb_node); 2967 block = rb_entry(rb_node, struct tree_block, rb_node);
2967 if (!block->key_ready) 2968 if (!block->key_ready)
2968 readahead_tree_block(rc->extent_root, block->bytenr, 2969 readahead_tree_block(rc->extent_root, block->bytenr);
2969 block->key.objectid);
2970 rb_node = rb_next(rb_node); 2970 rb_node = rb_next(rb_node);
2971 } 2971 }
2972 2972
@@ -3353,7 +3353,7 @@ static int __add_tree_block(struct reloc_control *rc,
3353 bool skinny = btrfs_fs_incompat(rc->extent_root->fs_info, 3353 bool skinny = btrfs_fs_incompat(rc->extent_root->fs_info,
3354 SKINNY_METADATA); 3354 SKINNY_METADATA);
3355 3355
3356 if (tree_block_processed(bytenr, blocksize, rc)) 3356 if (tree_block_processed(bytenr, rc))
3357 return 0; 3357 return 0;
3358 3358
3359 if (tree_search(blocks, bytenr)) 3359 if (tree_search(blocks, bytenr))
@@ -3611,7 +3611,7 @@ static int find_data_references(struct reloc_control *rc,
3611 if (added) 3611 if (added)
3612 goto next; 3612 goto next;
3613 3613
3614 if (!tree_block_processed(leaf->start, leaf->len, rc)) { 3614 if (!tree_block_processed(leaf->start, rc)) {
3615 block = kmalloc(sizeof(*block), GFP_NOFS); 3615 block = kmalloc(sizeof(*block), GFP_NOFS);
3616 if (!block) { 3616 if (!block) {
3617 err = -ENOMEM; 3617 err = -ENOMEM;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index f2bb13a23f86..ec57687c9a4d 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -66,7 +66,6 @@ struct scrub_ctx;
66struct scrub_recover { 66struct scrub_recover {
67 atomic_t refs; 67 atomic_t refs;
68 struct btrfs_bio *bbio; 68 struct btrfs_bio *bbio;
69 u64 *raid_map;
70 u64 map_length; 69 u64 map_length;
71}; 70};
72 71
@@ -80,7 +79,7 @@ struct scrub_page {
80 u64 logical; 79 u64 logical;
81 u64 physical; 80 u64 physical;
82 u64 physical_for_dev_replace; 81 u64 physical_for_dev_replace;
83 atomic_t ref_count; 82 atomic_t refs;
84 struct { 83 struct {
85 unsigned int mirror_num:8; 84 unsigned int mirror_num:8;
86 unsigned int have_csum:1; 85 unsigned int have_csum:1;
@@ -113,7 +112,7 @@ struct scrub_block {
113 struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK]; 112 struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
114 int page_count; 113 int page_count;
115 atomic_t outstanding_pages; 114 atomic_t outstanding_pages;
116 atomic_t ref_count; /* free mem on transition to zero */ 115 atomic_t refs; /* free mem on transition to zero */
117 struct scrub_ctx *sctx; 116 struct scrub_ctx *sctx;
118 struct scrub_parity *sparity; 117 struct scrub_parity *sparity;
119 struct { 118 struct {
@@ -142,7 +141,7 @@ struct scrub_parity {
142 141
143 int stripe_len; 142 int stripe_len;
144 143
145 atomic_t ref_count; 144 atomic_t refs;
146 145
147 struct list_head spages; 146 struct list_head spages;
148 147
@@ -194,6 +193,15 @@ struct scrub_ctx {
194 */ 193 */
195 struct btrfs_scrub_progress stat; 194 struct btrfs_scrub_progress stat;
196 spinlock_t stat_lock; 195 spinlock_t stat_lock;
196
197 /*
198 * Use a ref counter to avoid use-after-free issues. Scrub workers
199 * decrement bios_in_flight and workers_pending and then do a wakeup
200 * on the list_wait wait queue. We must ensure the main scrub task
201 * doesn't free the scrub context before or while the workers are
202 * doing the wakeup() call.
203 */
204 atomic_t refs;
197}; 205};
198 206
199struct scrub_fixup_nodatasum { 207struct scrub_fixup_nodatasum {
@@ -236,10 +244,7 @@ static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
236static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx); 244static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
237static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx); 245static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
238static int scrub_handle_errored_block(struct scrub_block *sblock_to_check); 246static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
239static int scrub_setup_recheck_block(struct scrub_ctx *sctx, 247static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
240 struct btrfs_fs_info *fs_info,
241 struct scrub_block *original_sblock,
242 u64 length, u64 logical,
243 struct scrub_block *sblocks_for_recheck); 248 struct scrub_block *sblocks_for_recheck);
244static void scrub_recheck_block(struct btrfs_fs_info *fs_info, 249static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
245 struct scrub_block *sblock, int is_metadata, 250 struct scrub_block *sblock, int is_metadata,
@@ -251,8 +256,7 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
251 const u8 *csum, u64 generation, 256 const u8 *csum, u64 generation,
252 u16 csum_size); 257 u16 csum_size);
253static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, 258static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
254 struct scrub_block *sblock_good, 259 struct scrub_block *sblock_good);
255 int force_write);
256static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, 260static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
257 struct scrub_block *sblock_good, 261 struct scrub_block *sblock_good,
258 int page_num, int force_write); 262 int page_num, int force_write);
@@ -302,10 +306,12 @@ static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
302static void copy_nocow_pages_worker(struct btrfs_work *work); 306static void copy_nocow_pages_worker(struct btrfs_work *work);
303static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info); 307static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
304static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info); 308static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
309static void scrub_put_ctx(struct scrub_ctx *sctx);
305 310
306 311
307static void scrub_pending_bio_inc(struct scrub_ctx *sctx) 312static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
308{ 313{
314 atomic_inc(&sctx->refs);
309 atomic_inc(&sctx->bios_in_flight); 315 atomic_inc(&sctx->bios_in_flight);
310} 316}
311 317
@@ -313,6 +319,7 @@ static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
313{ 319{
314 atomic_dec(&sctx->bios_in_flight); 320 atomic_dec(&sctx->bios_in_flight);
315 wake_up(&sctx->list_wait); 321 wake_up(&sctx->list_wait);
322 scrub_put_ctx(sctx);
316} 323}
317 324
318static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) 325static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
@@ -346,6 +353,7 @@ static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
346{ 353{
347 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; 354 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
348 355
356 atomic_inc(&sctx->refs);
349 /* 357 /*
350 * increment scrubs_running to prevent cancel requests from 358 * increment scrubs_running to prevent cancel requests from
351 * completing as long as a worker is running. we must also 359 * completing as long as a worker is running. we must also
@@ -388,6 +396,7 @@ static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
388 atomic_dec(&sctx->workers_pending); 396 atomic_dec(&sctx->workers_pending);
389 wake_up(&fs_info->scrub_pause_wait); 397 wake_up(&fs_info->scrub_pause_wait);
390 wake_up(&sctx->list_wait); 398 wake_up(&sctx->list_wait);
399 scrub_put_ctx(sctx);
391} 400}
392 401
393static void scrub_free_csums(struct scrub_ctx *sctx) 402static void scrub_free_csums(struct scrub_ctx *sctx)
@@ -433,6 +442,12 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
433 kfree(sctx); 442 kfree(sctx);
434} 443}
435 444
445static void scrub_put_ctx(struct scrub_ctx *sctx)
446{
447 if (atomic_dec_and_test(&sctx->refs))
448 scrub_free_ctx(sctx);
449}
450
436static noinline_for_stack 451static noinline_for_stack
437struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) 452struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
438{ 453{
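
scrub_put_ctx() above gives the scrub context the same treatment: the next hunk initializes the count to one for the main task, and the inc/dec paths shown earlier pin it per in-flight bio and per pending worker, which is what makes the wake_up() in the dec paths safe. A condensed sketch of the dec-wake-put ordering (names are stand-ins):

    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct sctx {
            atomic_int refs;
            atomic_int bios_in_flight;
    };

    static void sctx_put(struct sctx *s)
    {
            if (atomic_fetch_sub(&s->refs, 1) == 1)
                    free(s);
    }

    static void pending_bio_inc(struct sctx *s)
    {
            atomic_fetch_add(&s->refs, 1);          /* pin the context */
            atomic_fetch_add(&s->bios_in_flight, 1);
    }

    static void pending_bio_dec(struct sctx *s)
    {
            atomic_fetch_sub(&s->bios_in_flight, 1);
            /* wake_up(&s->list_wait) would go here; the context is still
             * guaranteed alive because this path owns a reference */
            sctx_put(s);                            /* may free, but only now */
    }

    int main(void)
    {
            struct sctx *s = calloc(1, sizeof(*s));

            if (!s)
                    return 1;
            atomic_init(&s->refs, 1);               /* main scrub task */
            pending_bio_inc(s);
            pending_bio_dec(s);                     /* worker finishes */
            sctx_put(s);                            /* main task done: freed */
            printf("no use-after-free window around the wakeup\n");
            return 0;
    }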
@@ -457,6 +472,7 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
457 sctx = kzalloc(sizeof(*sctx), GFP_NOFS); 472 sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
458 if (!sctx) 473 if (!sctx)
459 goto nomem; 474 goto nomem;
475 atomic_set(&sctx->refs, 1);
460 sctx->is_dev_replace = is_dev_replace; 476 sctx->is_dev_replace = is_dev_replace;
461 sctx->pages_per_rd_bio = pages_per_rd_bio; 477 sctx->pages_per_rd_bio = pages_per_rd_bio;
462 sctx->curr = -1; 478 sctx->curr = -1;
@@ -520,6 +536,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
520 struct inode_fs_paths *ipath = NULL; 536 struct inode_fs_paths *ipath = NULL;
521 struct btrfs_root *local_root; 537 struct btrfs_root *local_root;
522 struct btrfs_key root_key; 538 struct btrfs_key root_key;
539 struct btrfs_key key;
523 540
524 root_key.objectid = root; 541 root_key.objectid = root;
525 root_key.type = BTRFS_ROOT_ITEM_KEY; 542 root_key.type = BTRFS_ROOT_ITEM_KEY;
@@ -530,7 +547,14 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
530 goto err; 547 goto err;
531 } 548 }
532 549
533 ret = inode_item_info(inum, 0, local_root, swarn->path); 550 /*
551 * this makes the path point to (inum INODE_ITEM ioff)
552 */
553 key.objectid = inum;
554 key.type = BTRFS_INODE_ITEM_KEY;
555 key.offset = 0;
556
557 ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
534 if (ret) { 558 if (ret) {
535 btrfs_release_path(swarn->path); 559 btrfs_release_path(swarn->path);
536 goto err; 560 goto err;
@@ -848,8 +872,7 @@ static inline void scrub_get_recover(struct scrub_recover *recover)
848static inline void scrub_put_recover(struct scrub_recover *recover) 872static inline void scrub_put_recover(struct scrub_recover *recover)
849{ 873{
850 if (atomic_dec_and_test(&recover->refs)) { 874 if (atomic_dec_and_test(&recover->refs)) {
851 kfree(recover->bbio); 875 btrfs_put_bbio(recover->bbio);
852 kfree(recover->raid_map);
853 kfree(recover); 876 kfree(recover);
854 } 877 }
855} 878}
@@ -955,8 +978,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
955 } 978 }
956 979
957 /* setup the context, map the logical blocks and alloc the pages */ 980 /* setup the context, map the logical blocks and alloc the pages */
958 ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length, 981 ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
959 logical, sblocks_for_recheck);
960 if (ret) { 982 if (ret) {
961 spin_lock(&sctx->stat_lock); 983 spin_lock(&sctx->stat_lock);
962 sctx->stat.read_errors++; 984 sctx->stat.read_errors++;
@@ -1030,9 +1052,10 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
1030 if (!is_metadata && !have_csum) { 1052 if (!is_metadata && !have_csum) {
1031 struct scrub_fixup_nodatasum *fixup_nodatasum; 1053 struct scrub_fixup_nodatasum *fixup_nodatasum;
1032 1054
1033nodatasum_case:
1034 WARN_ON(sctx->is_dev_replace); 1055 WARN_ON(sctx->is_dev_replace);
1035 1056
1057nodatasum_case:
1058
1036 /* 1059 /*
1037 * !is_metadata and !have_csum, this means that the data 1060 * !is_metadata and !have_csum, this means that the data
1038 * might not be COW'ed, that it might be modified 1061 * might not be COW'ed, that it might be modified
@@ -1091,76 +1114,20 @@ nodatasum_case:
1091 sblock_other->no_io_error_seen) { 1114 sblock_other->no_io_error_seen) {
1092 if (sctx->is_dev_replace) { 1115 if (sctx->is_dev_replace) {
1093 scrub_write_block_to_dev_replace(sblock_other); 1116 scrub_write_block_to_dev_replace(sblock_other);
1117 goto corrected_error;
1094 } else { 1118 } else {
1095 int force_write = is_metadata || have_csum;
1096
1097 ret = scrub_repair_block_from_good_copy( 1119 ret = scrub_repair_block_from_good_copy(
1098 sblock_bad, sblock_other, 1120 sblock_bad, sblock_other);
1099 force_write); 1121 if (!ret)
1122 goto corrected_error;
1100 } 1123 }
1101 if (0 == ret)
1102 goto corrected_error;
1103 } 1124 }
1104 } 1125 }
1105 1126
1106 /* 1127 if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
1107 * for dev_replace, pick good pages and write to the target device. 1128 goto did_not_correct_error;
1108 */
1109 if (sctx->is_dev_replace) {
1110 success = 1;
1111 for (page_num = 0; page_num < sblock_bad->page_count;
1112 page_num++) {
1113 int sub_success;
1114
1115 sub_success = 0;
1116 for (mirror_index = 0;
1117 mirror_index < BTRFS_MAX_MIRRORS &&
1118 sblocks_for_recheck[mirror_index].page_count > 0;
1119 mirror_index++) {
1120 struct scrub_block *sblock_other =
1121 sblocks_for_recheck + mirror_index;
1122 struct scrub_page *page_other =
1123 sblock_other->pagev[page_num];
1124
1125 if (!page_other->io_error) {
1126 ret = scrub_write_page_to_dev_replace(
1127 sblock_other, page_num);
1128 if (ret == 0) {
1129 /* succeeded for this page */
1130 sub_success = 1;
1131 break;
1132 } else {
1133 btrfs_dev_replace_stats_inc(
1134 &sctx->dev_root->
1135 fs_info->dev_replace.
1136 num_write_errors);
1137 }
1138 }
1139 }
1140
1141 if (!sub_success) {
1142 /*
1143 * did not find a mirror to fetch the page
1144 * from. scrub_write_page_to_dev_replace()
1145 * handles this case (page->io_error), by
1146 * filling the block with zeros before
1147 * submitting the write request
1148 */
1149 success = 0;
1150 ret = scrub_write_page_to_dev_replace(
1151 sblock_bad, page_num);
1152 if (ret)
1153 btrfs_dev_replace_stats_inc(
1154 &sctx->dev_root->fs_info->
1155 dev_replace.num_write_errors);
1156 }
1157 }
1158
1159 goto out;
1160 }
1161 1129
1162 /* 1130 /*
1163 * for regular scrub, repair those pages that are errored.
1164 * In case of I/O errors in the area that is supposed to be 1131 * In case of I/O errors in the area that is supposed to be
1165 * repaired, continue by picking good copies of those pages. 1132 * repaired, continue by picking good copies of those pages.
1166 * Select the good pages from mirrors to rewrite bad pages from 1133 * Select the good pages from mirrors to rewrite bad pages from
@@ -1184,44 +1151,64 @@ nodatasum_case:
1184 * mirror, even if other 512 byte sectors in the same PAGE_SIZE 1151 * mirror, even if other 512 byte sectors in the same PAGE_SIZE
1185 * area are unreadable. 1152 * area are unreadable.
1186 */ 1153 */
1187
1188 /* can only fix I/O errors from here on */
1189 if (sblock_bad->no_io_error_seen)
1190 goto did_not_correct_error;
1191
1192 success = 1; 1154 success = 1;
1193 for (page_num = 0; page_num < sblock_bad->page_count; page_num++) { 1155 for (page_num = 0; page_num < sblock_bad->page_count;
1156 page_num++) {
1194 struct scrub_page *page_bad = sblock_bad->pagev[page_num]; 1157 struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1158 struct scrub_block *sblock_other = NULL;
1195 1159
1196 if (!page_bad->io_error) 1160 /* skip no-io-error page in scrub */
1161 if (!page_bad->io_error && !sctx->is_dev_replace)
1197 continue; 1162 continue;
1198 1163
1199 for (mirror_index = 0; 1164 /* try to find no-io-error page in mirrors */
1200 mirror_index < BTRFS_MAX_MIRRORS && 1165 if (page_bad->io_error) {
1201 sblocks_for_recheck[mirror_index].page_count > 0; 1166 for (mirror_index = 0;
1202 mirror_index++) { 1167 mirror_index < BTRFS_MAX_MIRRORS &&
1203 struct scrub_block *sblock_other = sblocks_for_recheck + 1168 sblocks_for_recheck[mirror_index].page_count > 0;
1204 mirror_index; 1169 mirror_index++) {
1205 struct scrub_page *page_other = sblock_other->pagev[ 1170 if (!sblocks_for_recheck[mirror_index].
1206 page_num]; 1171 pagev[page_num]->io_error) {
1207 1172 sblock_other = sblocks_for_recheck +
1208 if (!page_other->io_error) { 1173 mirror_index;
1209 ret = scrub_repair_page_from_good_copy( 1174 break;
1210 sblock_bad, sblock_other, page_num, 0);
1211 if (0 == ret) {
1212 page_bad->io_error = 0;
1213 break; /* succeeded for this page */
1214 } 1175 }
1215 } 1176 }
1177 if (!sblock_other)
1178 success = 0;
1216 } 1179 }
1217 1180
1218 if (page_bad->io_error) { 1181 if (sctx->is_dev_replace) {
1219 /* did not find a mirror to copy the page from */ 1182 /*
1220 success = 0; 1183 * did not find a mirror to fetch the page
1184 * from. scrub_write_page_to_dev_replace()
1185 * handles this case (page->io_error), by
1186 * filling the block with zeros before
1187 * submitting the write request
1188 */
1189 if (!sblock_other)
1190 sblock_other = sblock_bad;
1191
1192 if (scrub_write_page_to_dev_replace(sblock_other,
1193 page_num) != 0) {
1194 btrfs_dev_replace_stats_inc(
1195 &sctx->dev_root->
1196 fs_info->dev_replace.
1197 num_write_errors);
1198 success = 0;
1199 }
1200 } else if (sblock_other) {
1201 ret = scrub_repair_page_from_good_copy(sblock_bad,
1202 sblock_other,
1203 page_num, 0);
1204 if (0 == ret)
1205 page_bad->io_error = 0;
1206 else
1207 success = 0;
1221 } 1208 }
1222 } 1209 }
1223 1210
1224 if (success) { 1211 if (success && !sctx->is_dev_replace) {
1225 if (is_metadata || have_csum) { 1212 if (is_metadata || have_csum) {
1226 /* 1213 /*
1227 * need to verify the checksum now that all 1214 * need to verify the checksum now that all
@@ -1288,19 +1275,18 @@ out:
1288 return 0; 1275 return 0;
1289} 1276}
1290 1277
1291static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio, u64 *raid_map) 1278static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio)
1292{ 1279{
1293 if (raid_map) { 1280 if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
1294 if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE) 1281 return 2;
1295 return 3; 1282 else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
1296 else 1283 return 3;
1297 return 2; 1284 else
1298 } else {
1299 return (int)bbio->num_stripes; 1285 return (int)bbio->num_stripes;
1300 }
1301} 1286}
1302 1287
1303static inline void scrub_stripe_index_and_offset(u64 logical, u64 *raid_map, 1288static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
1289 u64 *raid_map,
1304 u64 mapped_length, 1290 u64 mapped_length,
1305 int nstripes, int mirror, 1291 int nstripes, int mirror,
1306 int *stripe_index, 1292 int *stripe_index,
@@ -1308,7 +1294,7 @@ static inline void scrub_stripe_index_and_offset(u64 logical, u64 *raid_map,
1308{ 1294{
1309 int i; 1295 int i;
1310 1296
1311 if (raid_map) { 1297 if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
1312 /* RAID5/6 */ 1298 /* RAID5/6 */
1313 for (i = 0; i < nstripes; i++) { 1299 for (i = 0; i < nstripes; i++) {
1314 if (raid_map[i] == RAID6_Q_STRIPE || 1300 if (raid_map[i] == RAID6_Q_STRIPE ||
@@ -1329,72 +1315,65 @@ static inline void scrub_stripe_index_and_offset(u64 logical, u64 *raid_map,
1329 } 1315 }
1330} 1316}
1331 1317
1332static int scrub_setup_recheck_block(struct scrub_ctx *sctx, 1318static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
1333 struct btrfs_fs_info *fs_info,
1334 struct scrub_block *original_sblock,
1335 u64 length, u64 logical,
1336 struct scrub_block *sblocks_for_recheck) 1319 struct scrub_block *sblocks_for_recheck)
1337{ 1320{
1321 struct scrub_ctx *sctx = original_sblock->sctx;
1322 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
1323 u64 length = original_sblock->page_count * PAGE_SIZE;
1324 u64 logical = original_sblock->pagev[0]->logical;
1338 struct scrub_recover *recover; 1325 struct scrub_recover *recover;
1339 struct btrfs_bio *bbio; 1326 struct btrfs_bio *bbio;
1340 u64 *raid_map;
1341 u64 sublen; 1327 u64 sublen;
1342 u64 mapped_length; 1328 u64 mapped_length;
1343 u64 stripe_offset; 1329 u64 stripe_offset;
1344 int stripe_index; 1330 int stripe_index;
1345 int page_index; 1331 int page_index = 0;
1346 int mirror_index; 1332 int mirror_index;
1347 int nmirrors; 1333 int nmirrors;
1348 int ret; 1334 int ret;
1349 1335
1350 /* 1336 /*
1351 * note: the two members ref_count and outstanding_pages 1337 * note: the two members refs and outstanding_pages
1352 * are not used (and not set) in the blocks that are used for 1338 * are not used (and not set) in the blocks that are used for
1353 * the recheck procedure 1339 * the recheck procedure
1354 */ 1340 */
1355 1341
1356 page_index = 0;
1357 while (length > 0) { 1342 while (length > 0) {
1358 sublen = min_t(u64, length, PAGE_SIZE); 1343 sublen = min_t(u64, length, PAGE_SIZE);
1359 mapped_length = sublen; 1344 mapped_length = sublen;
1360 bbio = NULL; 1345 bbio = NULL;
1361 raid_map = NULL;
1362 1346
1363 /* 1347 /*
1364 * with a length of PAGE_SIZE, each returned stripe 1348 * with a length of PAGE_SIZE, each returned stripe
1365 * represents one mirror 1349 * represents one mirror
1366 */ 1350 */
1367 ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical, 1351 ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical,
1368 &mapped_length, &bbio, 0, &raid_map); 1352 &mapped_length, &bbio, 0, 1);
1369 if (ret || !bbio || mapped_length < sublen) { 1353 if (ret || !bbio || mapped_length < sublen) {
1370 kfree(bbio); 1354 btrfs_put_bbio(bbio);
1371 kfree(raid_map);
1372 return -EIO; 1355 return -EIO;
1373 } 1356 }
1374 1357
1375 recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS); 1358 recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
1376 if (!recover) { 1359 if (!recover) {
1377 kfree(bbio); 1360 btrfs_put_bbio(bbio);
1378 kfree(raid_map);
1379 return -ENOMEM; 1361 return -ENOMEM;
1380 } 1362 }
1381 1363
1382 atomic_set(&recover->refs, 1); 1364 atomic_set(&recover->refs, 1);
1383 recover->bbio = bbio; 1365 recover->bbio = bbio;
1384 recover->raid_map = raid_map;
1385 recover->map_length = mapped_length; 1366 recover->map_length = mapped_length;
1386 1367
1387 BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO); 1368 BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
1388 1369
1389 nmirrors = scrub_nr_raid_mirrors(bbio, raid_map); 1370 nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS);
1371
1390 for (mirror_index = 0; mirror_index < nmirrors; 1372 for (mirror_index = 0; mirror_index < nmirrors;
1391 mirror_index++) { 1373 mirror_index++) {
1392 struct scrub_block *sblock; 1374 struct scrub_block *sblock;
1393 struct scrub_page *page; 1375 struct scrub_page *page;
1394 1376
1395 if (mirror_index >= BTRFS_MAX_MIRRORS)
1396 continue;
1397
1398 sblock = sblocks_for_recheck + mirror_index; 1377 sblock = sblocks_for_recheck + mirror_index;
1399 sblock->sctx = sctx; 1378 sblock->sctx = sctx;
1400 page = kzalloc(sizeof(*page), GFP_NOFS); 1379 page = kzalloc(sizeof(*page), GFP_NOFS);
@@ -1410,9 +1389,12 @@ leave_nomem:
1410 sblock->pagev[page_index] = page; 1389 sblock->pagev[page_index] = page;
1411 page->logical = logical; 1390 page->logical = logical;
1412 1391
1413 scrub_stripe_index_and_offset(logical, raid_map, 1392 scrub_stripe_index_and_offset(logical,
1393 bbio->map_type,
1394 bbio->raid_map,
1414 mapped_length, 1395 mapped_length,
1415 bbio->num_stripes, 1396 bbio->num_stripes -
1397 bbio->num_tgtdevs,
1416 mirror_index, 1398 mirror_index,
1417 &stripe_index, 1399 &stripe_index,
1418 &stripe_offset); 1400 &stripe_offset);
@@ -1458,7 +1440,8 @@ static void scrub_bio_wait_endio(struct bio *bio, int error)
1458 1440
1459static inline int scrub_is_page_on_raid56(struct scrub_page *page) 1441static inline int scrub_is_page_on_raid56(struct scrub_page *page)
1460{ 1442{
1461 return page->recover && page->recover->raid_map; 1443 return page->recover &&
1444 (page->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
1462} 1445}
1463 1446
1464static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, 1447static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
@@ -1475,7 +1458,6 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
1475 bio->bi_end_io = scrub_bio_wait_endio; 1458 bio->bi_end_io = scrub_bio_wait_endio;
1476 1459
1477 ret = raid56_parity_recover(fs_info->fs_root, bio, page->recover->bbio, 1460 ret = raid56_parity_recover(fs_info->fs_root, bio, page->recover->bbio,
1478 page->recover->raid_map,
1479 page->recover->map_length, 1461 page->recover->map_length,
1480 page->mirror_num, 0); 1462 page->mirror_num, 0);
1481 if (ret) 1463 if (ret)
@@ -1615,8 +1597,7 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
1615} 1597}
1616 1598
1617static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, 1599static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1618 struct scrub_block *sblock_good, 1600 struct scrub_block *sblock_good)
1619 int force_write)
1620{ 1601{
1621 int page_num; 1602 int page_num;
1622 int ret = 0; 1603 int ret = 0;
@@ -1626,8 +1607,7 @@ static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1626 1607
1627 ret_sub = scrub_repair_page_from_good_copy(sblock_bad, 1608 ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
1628 sblock_good, 1609 sblock_good,
1629 page_num, 1610 page_num, 1);
1630 force_write);
1631 if (ret_sub) 1611 if (ret_sub)
1632 ret = ret_sub; 1612 ret = ret_sub;
1633 } 1613 }
@@ -2067,12 +2047,12 @@ static int scrub_checksum_super(struct scrub_block *sblock)
2067 2047
2068static void scrub_block_get(struct scrub_block *sblock) 2048static void scrub_block_get(struct scrub_block *sblock)
2069{ 2049{
2070 atomic_inc(&sblock->ref_count); 2050 atomic_inc(&sblock->refs);
2071} 2051}
2072 2052
2073static void scrub_block_put(struct scrub_block *sblock) 2053static void scrub_block_put(struct scrub_block *sblock)
2074{ 2054{
2075 if (atomic_dec_and_test(&sblock->ref_count)) { 2055 if (atomic_dec_and_test(&sblock->refs)) {
2076 int i; 2056 int i;
2077 2057
2078 if (sblock->sparity) 2058 if (sblock->sparity)
@@ -2086,12 +2066,12 @@ static void scrub_block_put(struct scrub_block *sblock)
2086 2066
2087static void scrub_page_get(struct scrub_page *spage) 2067static void scrub_page_get(struct scrub_page *spage)
2088{ 2068{
2089 atomic_inc(&spage->ref_count); 2069 atomic_inc(&spage->refs);
2090} 2070}
2091 2071
2092static void scrub_page_put(struct scrub_page *spage) 2072static void scrub_page_put(struct scrub_page *spage)
2093{ 2073{
2094 if (atomic_dec_and_test(&spage->ref_count)) { 2074 if (atomic_dec_and_test(&spage->refs)) {
2095 if (spage->page) 2075 if (spage->page)
2096 __free_page(spage->page); 2076 __free_page(spage->page);
2097 kfree(spage); 2077 kfree(spage);
@@ -2217,7 +2197,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
2217 2197
2218 /* one ref inside this function, plus one for each page added to 2198 /* one ref inside this function, plus one for each page added to
2219 * a bio later on */ 2199 * a bio later on */
2220 atomic_set(&sblock->ref_count, 1); 2200 atomic_set(&sblock->refs, 1);
2221 sblock->sctx = sctx; 2201 sblock->sctx = sctx;
2222 sblock->no_io_error_seen = 1; 2202 sblock->no_io_error_seen = 1;
2223 2203
@@ -2510,7 +2490,7 @@ static int scrub_pages_for_parity(struct scrub_parity *sparity,
2510 2490
2511 /* one ref inside this function, plus one for each page added to 2491 /* one ref inside this function, plus one for each page added to
2512 * a bio later on */ 2492 * a bio later on */
2513 atomic_set(&sblock->ref_count, 1); 2493 atomic_set(&sblock->refs, 1);
2514 sblock->sctx = sctx; 2494 sblock->sctx = sctx;
2515 sblock->no_io_error_seen = 1; 2495 sblock->no_io_error_seen = 1;
2516 sblock->sparity = sparity; 2496 sblock->sparity = sparity;
@@ -2607,9 +2587,9 @@ static int scrub_extent_for_parity(struct scrub_parity *sparity,
2607 ret = scrub_pages_for_parity(sparity, logical, l, physical, dev, 2587 ret = scrub_pages_for_parity(sparity, logical, l, physical, dev,
2608 flags, gen, mirror_num, 2588 flags, gen, mirror_num,
2609 have_csum ? csum : NULL); 2589 have_csum ? csum : NULL);
2610skip:
2611 if (ret) 2590 if (ret)
2612 return ret; 2591 return ret;
2592skip:
2613 len -= l; 2593 len -= l;
2614 logical += l; 2594 logical += l;
2615 physical += l; 2595 physical += l;
@@ -2705,7 +2685,6 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
2705 struct btrfs_raid_bio *rbio; 2685 struct btrfs_raid_bio *rbio;
2706 struct scrub_page *spage; 2686 struct scrub_page *spage;
2707 struct btrfs_bio *bbio = NULL; 2687 struct btrfs_bio *bbio = NULL;
2708 u64 *raid_map = NULL;
2709 u64 length; 2688 u64 length;
2710 int ret; 2689 int ret;
2711 2690
@@ -2716,8 +2695,8 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
2716 length = sparity->logic_end - sparity->logic_start + 1; 2695 length = sparity->logic_end - sparity->logic_start + 1;
2717 ret = btrfs_map_sblock(sctx->dev_root->fs_info, WRITE, 2696 ret = btrfs_map_sblock(sctx->dev_root->fs_info, WRITE,
2718 sparity->logic_start, 2697 sparity->logic_start,
2719 &length, &bbio, 0, &raid_map); 2698 &length, &bbio, 0, 1);
2720 if (ret || !bbio || !raid_map) 2699 if (ret || !bbio || !bbio->raid_map)
2721 goto bbio_out; 2700 goto bbio_out;
2722 2701
2723 bio = btrfs_io_bio_alloc(GFP_NOFS, 0); 2702 bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
@@ -2729,8 +2708,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
2729 bio->bi_end_io = scrub_parity_bio_endio; 2708 bio->bi_end_io = scrub_parity_bio_endio;
2730 2709
2731 rbio = raid56_parity_alloc_scrub_rbio(sctx->dev_root, bio, bbio, 2710 rbio = raid56_parity_alloc_scrub_rbio(sctx->dev_root, bio, bbio,
2732 raid_map, length, 2711 length, sparity->scrub_dev,
2733 sparity->scrub_dev,
2734 sparity->dbitmap, 2712 sparity->dbitmap,
2735 sparity->nsectors); 2713 sparity->nsectors);
2736 if (!rbio) 2714 if (!rbio)
@@ -2747,8 +2725,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
2747rbio_out: 2725rbio_out:
2748 bio_put(bio); 2726 bio_put(bio);
2749bbio_out: 2727bbio_out:
2750 kfree(bbio); 2728 btrfs_put_bbio(bbio);
2751 kfree(raid_map);
2752 bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, 2729 bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2753 sparity->nsectors); 2730 sparity->nsectors);
2754 spin_lock(&sctx->stat_lock); 2731 spin_lock(&sctx->stat_lock);
@@ -2765,12 +2742,12 @@ static inline int scrub_calc_parity_bitmap_len(int nsectors)
2765 2742
2766static void scrub_parity_get(struct scrub_parity *sparity) 2743static void scrub_parity_get(struct scrub_parity *sparity)
2767{ 2744{
2768 atomic_inc(&sparity->ref_count); 2745 atomic_inc(&sparity->refs);
2769} 2746}
2770 2747
2771static void scrub_parity_put(struct scrub_parity *sparity) 2748static void scrub_parity_put(struct scrub_parity *sparity)
2772{ 2749{
2773 if (!atomic_dec_and_test(&sparity->ref_count)) 2750 if (!atomic_dec_and_test(&sparity->refs))
2774 return; 2751 return;
2775 2752
2776 scrub_parity_check_and_repair(sparity); 2753 scrub_parity_check_and_repair(sparity);
@@ -2820,7 +2797,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
2820 sparity->scrub_dev = sdev; 2797 sparity->scrub_dev = sdev;
2821 sparity->logic_start = logic_start; 2798 sparity->logic_start = logic_start;
2822 sparity->logic_end = logic_end; 2799 sparity->logic_end = logic_end;
2823 atomic_set(&sparity->ref_count, 1); 2800 atomic_set(&sparity->refs, 1);
2824 INIT_LIST_HEAD(&sparity->spages); 2801 INIT_LIST_HEAD(&sparity->spages);
2825 sparity->dbitmap = sparity->bitmap; 2802 sparity->dbitmap = sparity->bitmap;
2826 sparity->ebitmap = (void *)sparity->bitmap + bitmap_len; 2803 sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
@@ -3037,8 +3014,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
3037 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 3014 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
3038 increment = map->stripe_len; 3015 increment = map->stripe_len;
3039 mirror_num = num % map->num_stripes + 1; 3016 mirror_num = num % map->num_stripes + 1;
3040 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 3017 } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3041 BTRFS_BLOCK_GROUP_RAID6)) {
3042 get_raid56_logic_offset(physical, num, map, &offset, NULL); 3018 get_raid56_logic_offset(physical, num, map, &offset, NULL);
3043 increment = map->stripe_len * nr_data_stripes(map); 3019 increment = map->stripe_len * nr_data_stripes(map);
3044 mirror_num = 1; 3020 mirror_num = 1;
@@ -3053,7 +3029,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
3053 3029
3054 ppath = btrfs_alloc_path(); 3030 ppath = btrfs_alloc_path();
3055 if (!ppath) { 3031 if (!ppath) {
3056 btrfs_free_path(ppath); 3032 btrfs_free_path(path);
3057 return -ENOMEM; 3033 return -ENOMEM;
3058 } 3034 }
3059 3035
@@ -3065,6 +3041,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
3065 path->search_commit_root = 1; 3041 path->search_commit_root = 1;
3066 path->skip_locking = 1; 3042 path->skip_locking = 1;
3067 3043
3044 ppath->search_commit_root = 1;
3045 ppath->skip_locking = 1;
3068 /* 3046 /*
3069 * trigger the readahead for extent tree csum tree and wait for 3047 * trigger the readahead for extent tree csum tree and wait for
3070 * completion. During readahead, the scrub is officially paused 3048 * completion. During readahead, the scrub is officially paused
@@ -3072,8 +3050,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
3072 */ 3050 */
3073 logical = base + offset; 3051 logical = base + offset;
3074 physical_end = physical + nstripes * map->stripe_len; 3052 physical_end = physical + nstripes * map->stripe_len;
3075 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 3053 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3076 BTRFS_BLOCK_GROUP_RAID6)) {
3077 get_raid56_logic_offset(physical_end, num, 3054 get_raid56_logic_offset(physical_end, num,
3078 map, &logic_end, NULL); 3055 map, &logic_end, NULL);
3079 logic_end += base; 3056 logic_end += base;
@@ -3119,8 +3096,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
3119 ret = 0; 3096 ret = 0;
3120 while (physical < physical_end) { 3097 while (physical < physical_end) {
3121 /* for raid56, we skip parity stripe */ 3098 /* for raid56, we skip parity stripe */
3122 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 3099 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3123 BTRFS_BLOCK_GROUP_RAID6)) {
3124 ret = get_raid56_logic_offset(physical, num, 3100 ret = get_raid56_logic_offset(physical, num,
3125 map, &logical, &stripe_logical); 3101 map, &logical, &stripe_logical);
3126 logical += base; 3102 logical += base;
@@ -3278,8 +3254,7 @@ again:
3278 scrub_free_csums(sctx); 3254 scrub_free_csums(sctx);
3279 if (extent_logical + extent_len < 3255 if (extent_logical + extent_len <
3280 key.objectid + bytes) { 3256 key.objectid + bytes) {
3281 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 3257 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3282 BTRFS_BLOCK_GROUP_RAID6)) {
3283 /* 3258 /*
3284 * loop until we find next data stripe 3259 * loop until we find next data stripe
3285 * or we have finished all stripes. 3260 * or we have finished all stripes.
@@ -3773,7 +3748,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
3773 scrub_workers_put(fs_info); 3748 scrub_workers_put(fs_info);
3774 mutex_unlock(&fs_info->scrub_lock); 3749 mutex_unlock(&fs_info->scrub_lock);
3775 3750
3776 scrub_free_ctx(sctx); 3751 scrub_put_ctx(sctx);
3777 3752
3778 return ret; 3753 return ret;
3779} 3754}
@@ -3879,14 +3854,14 @@ static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
3879 &mapped_length, &bbio, 0); 3854 &mapped_length, &bbio, 0);
3880 if (ret || !bbio || mapped_length < extent_len || 3855 if (ret || !bbio || mapped_length < extent_len ||
3881 !bbio->stripes[0].dev->bdev) { 3856 !bbio->stripes[0].dev->bdev) {
3882 kfree(bbio); 3857 btrfs_put_bbio(bbio);
3883 return; 3858 return;
3884 } 3859 }
3885 3860
3886 *extent_physical = bbio->stripes[0].physical; 3861 *extent_physical = bbio->stripes[0].physical;
3887 *extent_mirror_num = bbio->mirror_num; 3862 *extent_mirror_num = bbio->mirror_num;
3888 *extent_dev = bbio->stripes[0].dev; 3863 *extent_dev = bbio->stripes[0].dev;
3889 kfree(bbio); 3864 btrfs_put_bbio(bbio);
3890} 3865}
3891 3866
3892static int scrub_setup_wr_ctx(struct scrub_ctx *sctx, 3867static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
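
For context on the error paths above: with the raid map now embedded in
struct btrfs_bio, every bail-out in the scrub code releases a single
refcounted object instead of a kfree(bbio)/kfree(raid_map) pair. A minimal
sketch of that discipline, assuming the btrfs_map_sblock() signature used
in this patch (map_one_mirror_page() is a hypothetical helper, not part of
the patch):

	static int map_one_mirror_page(struct btrfs_fs_info *fs_info, u64 logical,
				       u64 *mapped_length, struct btrfs_bio **bbio)
	{
		int ret;

		*bbio = NULL;
		*mapped_length = PAGE_SIZE;
		/* with a length of PAGE_SIZE, each returned stripe is one mirror */
		ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical,
				       mapped_length, bbio, 0, 1);
		if (ret || !*bbio || *mapped_length < PAGE_SIZE) {
			btrfs_put_bbio(*bbio);	/* NULL-safe; frees the whole blob */
			*bbio = NULL;
			return -EIO;
		}
		return 0;
	}
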
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 804432dbc351..fe5857223515 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -2471,12 +2471,9 @@ verbose_printk("btrfs: send_utimes %llu\n", ino);
2471 if (ret < 0) 2471 if (ret < 0)
2472 goto out; 2472 goto out;
2473 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); 2473 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2474 TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, 2474 TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, &ii->atime);
2475 btrfs_inode_atime(ii)); 2475 TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, &ii->mtime);
2476 TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, 2476 TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb, &ii->ctime);
2477 btrfs_inode_mtime(ii));
2478 TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb,
2479 btrfs_inode_ctime(ii));
2480 /* TODO Add otime support when the otime patches get into upstream */ 2477 /* TODO Add otime support when the otime patches get into upstream */
2481 2478
2482 ret = send_cmd(sctx); 2479 ret = send_cmd(sctx);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 60f7cbe815e9..05fef198ff94 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1000,10 +1000,20 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
1000 */ 1000 */
1001 if (fs_info->pending_changes == 0) 1001 if (fs_info->pending_changes == 0)
1002 return 0; 1002 return 0;
1003 /*
1004 * A non-blocking test if the fs is frozen. We must not
1005 * start a new transaction here otherwise a deadlock
1006 * happens. The pending operations are delayed to the
1007 * next commit after thawing.
1008 */
1009 if (__sb_start_write(sb, SB_FREEZE_WRITE, false))
1010 __sb_end_write(sb, SB_FREEZE_WRITE);
1011 else
1012 return 0;
1003 trans = btrfs_start_transaction(root, 0); 1013 trans = btrfs_start_transaction(root, 0);
1004 } else {
1005 return PTR_ERR(trans);
1006 } 1014 }
1015 if (IS_ERR(trans))
1016 return PTR_ERR(trans);
1007 } 1017 }
1008 return btrfs_commit_transaction(trans, root); 1018 return btrfs_commit_transaction(trans, root);
1009} 1019}
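
The added hunk is a trylock-style freeze test: __sb_start_write() with
wait == false returns nonzero only when the superblock is not frozen, and
on success it takes a freeze reference that must be dropped immediately.
Condensed, using only the calls shown above:

	/* non-blocking: never start a transaction on a frozen fs */
	if (__sb_start_write(sb, SB_FREEZE_WRITE, false))
		__sb_end_write(sb, SB_FREEZE_WRITE);	/* not frozen; drop the ref */
	else
		return 0;	/* frozen; the pending changes wait for thaw */
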
@@ -1948,11 +1958,6 @@ static int btrfs_freeze(struct super_block *sb)
1948 return btrfs_commit_transaction(trans, root); 1958 return btrfs_commit_transaction(trans, root);
1949} 1959}
1950 1960
1951static int btrfs_unfreeze(struct super_block *sb)
1952{
1953 return 0;
1954}
1955
1956static int btrfs_show_devname(struct seq_file *m, struct dentry *root) 1961static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
1957{ 1962{
1958 struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb); 1963 struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb);
@@ -2001,7 +2006,6 @@ static const struct super_operations btrfs_super_ops = {
2001 .statfs = btrfs_statfs, 2006 .statfs = btrfs_statfs,
2002 .remount_fs = btrfs_remount, 2007 .remount_fs = btrfs_remount,
2003 .freeze_fs = btrfs_freeze, 2008 .freeze_fs = btrfs_freeze,
2004 .unfreeze_fs = btrfs_unfreeze,
2005}; 2009};
2006 2010
2007static const struct file_operations btrfs_ctl_fops = { 2011static const struct file_operations btrfs_ctl_fops = {
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 92db3f648df4..94edb0a2a026 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -733,10 +733,18 @@ int btrfs_init_sysfs(void)
733 733
734 ret = btrfs_init_debugfs(); 734 ret = btrfs_init_debugfs();
735 if (ret) 735 if (ret)
736 return ret; 736 goto out1;
737 737
738 init_feature_attrs(); 738 init_feature_attrs();
739 ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); 739 ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
740 if (ret)
741 goto out2;
742
743 return 0;
744out2:
745 debugfs_remove_recursive(btrfs_debugfs_root_dentry);
746out1:
747 kset_unregister(btrfs_kset);
740 748
741 return ret; 749 return ret;
742} 750}
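
The sysfs init path now unwinds on failure instead of leaking the kset or
the debugfs tree. A minimal sketch of the label-per-stage idiom it adopts
(setup_a()/teardown_a() and friends are hypothetical stand-ins for
kset_create_and_add(), btrfs_init_debugfs() and sysfs_create_group()):

	int demo_init(void)
	{
		int ret;

		ret = setup_a();
		if (ret)
			return ret;	/* nothing to undo yet */
		ret = setup_b();
		if (ret)
			goto undo_a;
		ret = setup_c();
		if (ret)
			goto undo_b;
		return 0;

	undo_b:
		teardown_b();
	undo_a:
		teardown_a();
		return ret;	/* each label undoes everything set up before it */
	}
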
diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c
index cc286ce97d1e..f51963a8f929 100644
--- a/fs/btrfs/tests/extent-buffer-tests.c
+++ b/fs/btrfs/tests/extent-buffer-tests.c
@@ -53,7 +53,7 @@ static int test_btrfs_split_item(void)
53 return -ENOMEM; 53 return -ENOMEM;
54 } 54 }
55 55
56 path->nodes[0] = eb = alloc_dummy_extent_buffer(0, 4096); 56 path->nodes[0] = eb = alloc_dummy_extent_buffer(NULL, 4096);
57 if (!eb) { 57 if (!eb) {
58 test_msg("Could not allocate dummy buffer\n"); 58 test_msg("Could not allocate dummy buffer\n");
59 ret = -ENOMEM; 59 ret = -ENOMEM;
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index 7e99c2f98dd0..9e9f2368177d 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -258,8 +258,7 @@ static int test_find_delalloc(void)
258 } 258 }
259 ret = 0; 259 ret = 0;
260out_bits: 260out_bits:
261 clear_extent_bits(&tmp, 0, total_dirty - 1, 261 clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1, GFP_NOFS);
262 (unsigned long)-1, GFP_NOFS);
263out: 262out:
264 if (locked_page) 263 if (locked_page)
265 page_cache_release(locked_page); 264 page_cache_release(locked_page);
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 3ae0f5b8bb80..a116b55ce788 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -255,7 +255,7 @@ static noinline int test_btrfs_get_extent(void)
255 goto out; 255 goto out;
256 } 256 }
257 257
258 root->node = alloc_dummy_extent_buffer(0, 4096); 258 root->node = alloc_dummy_extent_buffer(NULL, 4096);
259 if (!root->node) { 259 if (!root->node) {
260 test_msg("Couldn't allocate dummy buffer\n"); 260 test_msg("Couldn't allocate dummy buffer\n");
261 goto out; 261 goto out;
@@ -843,7 +843,7 @@ static int test_hole_first(void)
843 goto out; 843 goto out;
844 } 844 }
845 845
846 root->node = alloc_dummy_extent_buffer(0, 4096); 846 root->node = alloc_dummy_extent_buffer(NULL, 4096);
847 if (!root->node) { 847 if (!root->node) {
848 test_msg("Couldn't allocate dummy buffer\n"); 848 test_msg("Couldn't allocate dummy buffer\n");
849 goto out; 849 goto out;
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index ec3dcb202357..73f299ebdabb 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -404,12 +404,22 @@ int btrfs_test_qgroups(void)
404 ret = -ENOMEM; 404 ret = -ENOMEM;
405 goto out; 405 goto out;
406 } 406 }
407 /* We are using this root as our extent root */
408 root->fs_info->extent_root = root;
409
410 /*
411 * Some of the paths we test assume we have a filled out fs_info, so we
412 * just need to add the root in there so we don't panic.
413 */
414 root->fs_info->tree_root = root;
415 root->fs_info->quota_root = root;
416 root->fs_info->quota_enabled = 1;
407 417
408 /* 418 /*
409 * Can't use bytenr 0, some things freak out 419 * Can't use bytenr 0, some things freak out
410 * *cough*backref walking code*cough* 420 * *cough*backref walking code*cough*
411 */ 421 */
412 root->node = alloc_test_extent_buffer(root->fs_info, 4096, 4096); 422 root->node = alloc_test_extent_buffer(root->fs_info, 4096);
413 if (!root->node) { 423 if (!root->node) {
414 test_msg("Couldn't allocate dummy buffer\n"); 424 test_msg("Couldn't allocate dummy buffer\n");
415 ret = -ENOMEM; 425 ret = -ENOMEM;
@@ -448,17 +458,6 @@ int btrfs_test_qgroups(void)
448 goto out; 458 goto out;
449 } 459 }
450 460
451 /* We are using this root as our extent root */
452 root->fs_info->extent_root = root;
453
454 /*
455 * Some of the paths we test assume we have a filled out fs_info, so we
456 * just need to addt he root in there so we don't panic.
457 */
458 root->fs_info->tree_root = root;
459 root->fs_info->quota_root = root;
460 root->fs_info->quota_enabled = 1;
461
462 test_msg("Running qgroup tests\n"); 461 test_msg("Running qgroup tests\n");
463 ret = test_no_shared_qgroup(root); 462 ret = test_no_shared_qgroup(root);
464 if (ret) 463 if (ret)
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index a605d4e2f2bc..7e80f32550a6 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -220,6 +220,7 @@ loop:
220 * commit the transaction. 220 * commit the transaction.
221 */ 221 */
222 atomic_set(&cur_trans->use_count, 2); 222 atomic_set(&cur_trans->use_count, 2);
223 cur_trans->have_free_bgs = 0;
223 cur_trans->start_time = get_seconds(); 224 cur_trans->start_time = get_seconds();
224 225
225 cur_trans->delayed_refs.href_root = RB_ROOT; 226 cur_trans->delayed_refs.href_root = RB_ROOT;
@@ -248,6 +249,8 @@ loop:
248 INIT_LIST_HEAD(&cur_trans->pending_chunks); 249 INIT_LIST_HEAD(&cur_trans->pending_chunks);
249 INIT_LIST_HEAD(&cur_trans->switch_commits); 250 INIT_LIST_HEAD(&cur_trans->switch_commits);
250 INIT_LIST_HEAD(&cur_trans->pending_ordered); 251 INIT_LIST_HEAD(&cur_trans->pending_ordered);
252 INIT_LIST_HEAD(&cur_trans->dirty_bgs);
253 spin_lock_init(&cur_trans->dirty_bgs_lock);
251 list_add_tail(&cur_trans->list, &fs_info->trans_list); 254 list_add_tail(&cur_trans->list, &fs_info->trans_list);
252 extent_io_tree_init(&cur_trans->dirty_pages, 255 extent_io_tree_init(&cur_trans->dirty_pages,
253 fs_info->btree_inode->i_mapping); 256 fs_info->btree_inode->i_mapping);
@@ -1020,6 +1023,7 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
1020 u64 old_root_bytenr; 1023 u64 old_root_bytenr;
1021 u64 old_root_used; 1024 u64 old_root_used;
1022 struct btrfs_root *tree_root = root->fs_info->tree_root; 1025 struct btrfs_root *tree_root = root->fs_info->tree_root;
1026 bool extent_root = (root->objectid == BTRFS_EXTENT_TREE_OBJECTID);
1023 1027
1024 old_root_used = btrfs_root_used(&root->root_item); 1028 old_root_used = btrfs_root_used(&root->root_item);
1025 btrfs_write_dirty_block_groups(trans, root); 1029 btrfs_write_dirty_block_groups(trans, root);
@@ -1027,7 +1031,9 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
1027 while (1) { 1031 while (1) {
1028 old_root_bytenr = btrfs_root_bytenr(&root->root_item); 1032 old_root_bytenr = btrfs_root_bytenr(&root->root_item);
1029 if (old_root_bytenr == root->node->start && 1033 if (old_root_bytenr == root->node->start &&
1030 old_root_used == btrfs_root_used(&root->root_item)) 1034 old_root_used == btrfs_root_used(&root->root_item) &&
1035 (!extent_root ||
1036 list_empty(&trans->transaction->dirty_bgs)))
1031 break; 1037 break;
1032 1038
1033 btrfs_set_root_node(&root->root_item, root->node); 1039 btrfs_set_root_node(&root->root_item, root->node);
@@ -1038,7 +1044,15 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
1038 return ret; 1044 return ret;
1039 1045
1040 old_root_used = btrfs_root_used(&root->root_item); 1046 old_root_used = btrfs_root_used(&root->root_item);
1041 ret = btrfs_write_dirty_block_groups(trans, root); 1047 if (extent_root) {
1048 ret = btrfs_write_dirty_block_groups(trans, root);
1049 if (ret)
1050 return ret;
1051 }
1052 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
1053 if (ret)
1054 return ret;
1055 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
1042 if (ret) 1056 if (ret)
1043 return ret; 1057 return ret;
1044 } 1058 }
@@ -1061,10 +1075,6 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
1061 struct extent_buffer *eb; 1075 struct extent_buffer *eb;
1062 int ret; 1076 int ret;
1063 1077
1064 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
1065 if (ret)
1066 return ret;
1067
1068 eb = btrfs_lock_root_node(fs_info->tree_root); 1078 eb = btrfs_lock_root_node(fs_info->tree_root);
1069 ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 1079 ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL,
1070 0, &eb); 1080 0, &eb);
@@ -1097,6 +1107,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
1097 next = fs_info->dirty_cowonly_roots.next; 1107 next = fs_info->dirty_cowonly_roots.next;
1098 list_del_init(next); 1108 list_del_init(next);
1099 root = list_entry(next, struct btrfs_root, dirty_list); 1109 root = list_entry(next, struct btrfs_root, dirty_list);
1110 clear_bit(BTRFS_ROOT_DIRTY, &root->state);
1100 1111
1101 if (root != fs_info->extent_root) 1112 if (root != fs_info->extent_root)
1102 list_add_tail(&root->dirty_list, 1113 list_add_tail(&root->dirty_list,
@@ -1983,6 +1994,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1983 switch_commit_roots(cur_trans, root->fs_info); 1994 switch_commit_roots(cur_trans, root->fs_info);
1984 1995
1985 assert_qgroups_uptodate(trans); 1996 assert_qgroups_uptodate(trans);
1997 ASSERT(list_empty(&cur_trans->dirty_bgs));
1986 update_super_roots(root); 1998 update_super_roots(root);
1987 1999
1988 btrfs_set_super_log_root(root->fs_info->super_copy, 0); 2000 btrfs_set_super_log_root(root->fs_info->super_copy, 0);
@@ -2026,6 +2038,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
2026 2038
2027 btrfs_finish_extent_commit(trans, root); 2039 btrfs_finish_extent_commit(trans, root);
2028 2040
2041 if (cur_trans->have_free_bgs)
2042 btrfs_clear_space_info_full(root->fs_info);
2043
2029 root->fs_info->last_trans_committed = cur_trans->transid; 2044 root->fs_info->last_trans_committed = cur_trans->transid;
2030 /* 2045 /*
2031 * We needn't acquire the lock here because there is no other task 2046 * We needn't acquire the lock here because there is no other task
@@ -2118,7 +2133,7 @@ void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info)
2118 unsigned long prev; 2133 unsigned long prev;
2119 unsigned long bit; 2134 unsigned long bit;
2120 2135
2121 prev = cmpxchg(&fs_info->pending_changes, 0, 0); 2136 prev = xchg(&fs_info->pending_changes, 0);
2122 if (!prev) 2137 if (!prev)
2123 return; 2138 return;
2124 2139
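
The cmpxchg() to xchg() switch is a behavior fix, not a cleanup:
cmpxchg(&fs_info->pending_changes, 0, 0) returns the old value but only
stores when that value was already 0, so non-zero pending bits were read
yet never cleared. xchg(..., 0) atomically fetches the bits and resets the
word. A stand-alone sketch of the distinction, with C11 atomics standing
in for the kernel primitives:

	#include <stdatomic.h>

	/* like the fixed code: consume all pending bits in one step */
	static unsigned long take_pending(atomic_ulong *pending)
	{
		return atomic_exchange(pending, 0);
	}

	/* like the old code: returns the bits, but a non-zero word is
	 * never cleared, so the same bits are reported forever */
	static unsigned long peek_pending(atomic_ulong *pending)
	{
		unsigned long expected = 0;

		atomic_compare_exchange_strong(pending, &expected, 0);
		return expected;
	}
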
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 00ed29c4b3f9..937050a2b68e 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -47,6 +47,11 @@ struct btrfs_transaction {
47 atomic_t num_writers; 47 atomic_t num_writers;
48 atomic_t use_count; 48 atomic_t use_count;
49 49
50 /*
51 * true if there are free block group operations in this transaction
52 */
53 int have_free_bgs;
54
50 /* Be protected by fs_info->trans_lock when we want to change it. */ 55 /* Be protected by fs_info->trans_lock when we want to change it. */
51 enum btrfs_trans_state state; 56 enum btrfs_trans_state state;
52 struct list_head list; 57 struct list_head list;
@@ -58,6 +63,8 @@ struct btrfs_transaction {
58 struct list_head pending_chunks; 63 struct list_head pending_chunks;
59 struct list_head pending_ordered; 64 struct list_head pending_ordered;
60 struct list_head switch_commits; 65 struct list_head switch_commits;
66 struct list_head dirty_bgs;
67 spinlock_t dirty_bgs_lock;
61 struct btrfs_delayed_ref_root delayed_refs; 68 struct btrfs_delayed_ref_root delayed_refs;
62 int aborted; 69 int aborted;
63}; 70};
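
The new dirty_bgs list is per-transaction and guarded by dirty_bgs_lock;
the transaction.c hunks above drain it before update_cowonly_root() is
allowed to converge and assert it empty at commit time. A hedged sketch of
the expected enqueue pattern (cache is a hypothetical block-group cache
with a dirty_list link; the actual caller sits in extent-tree.c, outside
this section):

	spin_lock(&trans->transaction->dirty_bgs_lock);
	if (list_empty(&cache->dirty_list))
		list_add_tail(&cache->dirty_list,
			      &trans->transaction->dirty_bgs);
	spin_unlock(&trans->transaction->dirty_bgs_lock);
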
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 9a02da16f2be..9a37f8b39bae 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -453,11 +453,13 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans,
453insert: 453insert:
454 btrfs_release_path(path); 454 btrfs_release_path(path);
455 /* try to insert the key into the destination tree */ 455 /* try to insert the key into the destination tree */
456 path->skip_release_on_error = 1;
456 ret = btrfs_insert_empty_item(trans, root, path, 457 ret = btrfs_insert_empty_item(trans, root, path,
457 key, item_size); 458 key, item_size);
459 path->skip_release_on_error = 0;
458 460
459 /* make sure any existing item is the correct size */ 461 /* make sure any existing item is the correct size */
460 if (ret == -EEXIST) { 462 if (ret == -EEXIST || ret == -EOVERFLOW) {
461 u32 found_size; 463 u32 found_size;
462 found_size = btrfs_item_size_nr(path->nodes[0], 464 found_size = btrfs_item_size_nr(path->nodes[0],
463 path->slots[0]); 465 path->slots[0]);
@@ -488,8 +490,20 @@ insert:
488 src_item = (struct btrfs_inode_item *)src_ptr; 490 src_item = (struct btrfs_inode_item *)src_ptr;
489 dst_item = (struct btrfs_inode_item *)dst_ptr; 491 dst_item = (struct btrfs_inode_item *)dst_ptr;
490 492
491 if (btrfs_inode_generation(eb, src_item) == 0) 493 if (btrfs_inode_generation(eb, src_item) == 0) {
494 struct extent_buffer *dst_eb = path->nodes[0];
495
496 if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
497 S_ISREG(btrfs_inode_mode(dst_eb, dst_item))) {
498 struct btrfs_map_token token;
499 u64 ino_size = btrfs_inode_size(eb, src_item);
500
501 btrfs_init_map_token(&token);
502 btrfs_set_token_inode_size(dst_eb, dst_item,
503 ino_size, &token);
504 }
492 goto no_copy; 505 goto no_copy;
506 }
493 507
494 if (overwrite_root && 508 if (overwrite_root &&
495 S_ISDIR(btrfs_inode_mode(eb, src_item)) && 509 S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
@@ -844,7 +858,7 @@ out:
844static noinline int backref_in_log(struct btrfs_root *log, 858static noinline int backref_in_log(struct btrfs_root *log,
845 struct btrfs_key *key, 859 struct btrfs_key *key,
846 u64 ref_objectid, 860 u64 ref_objectid,
847 char *name, int namelen) 861 const char *name, int namelen)
848{ 862{
849 struct btrfs_path *path; 863 struct btrfs_path *path;
850 struct btrfs_inode_ref *ref; 864 struct btrfs_inode_ref *ref;
@@ -1254,13 +1268,14 @@ out:
1254} 1268}
1255 1269
1256static int insert_orphan_item(struct btrfs_trans_handle *trans, 1270static int insert_orphan_item(struct btrfs_trans_handle *trans,
1257 struct btrfs_root *root, u64 offset) 1271 struct btrfs_root *root, u64 ino)
1258{ 1272{
1259 int ret; 1273 int ret;
1260 ret = btrfs_find_item(root, NULL, BTRFS_ORPHAN_OBJECTID, 1274
1261 offset, BTRFS_ORPHAN_ITEM_KEY, NULL); 1275 ret = btrfs_insert_orphan_item(trans, root, ino);
1262 if (ret > 0) 1276 if (ret == -EEXIST)
1263 ret = btrfs_insert_orphan_item(trans, root, offset); 1277 ret = 0;
1278
1264 return ret; 1279 return ret;
1265} 1280}
1266 1281
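
The rewritten helper replaces two tree searches (btrfs_find_item() followed
by a conditional insert) with a single try-insert that folds -EEXIST into
success; besides saving a search, there is no longer a window between the
lookup and the insert. The idiom in isolation:

	ret = btrfs_insert_orphan_item(trans, root, ino);
	if (ret == -EEXIST)
		ret = 0;	/* already recorded as orphan; not an error */
	return ret;
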
@@ -1287,6 +1302,7 @@ static int count_inode_extrefs(struct btrfs_root *root,
1287 leaf = path->nodes[0]; 1302 leaf = path->nodes[0];
1288 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1303 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1289 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 1304 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1305 cur_offset = 0;
1290 1306
1291 while (cur_offset < item_size) { 1307 while (cur_offset < item_size) {
1292 extref = (struct btrfs_inode_extref *) (ptr + cur_offset); 1308 extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
@@ -1302,7 +1318,7 @@ static int count_inode_extrefs(struct btrfs_root *root,
1302 } 1318 }
1303 btrfs_release_path(path); 1319 btrfs_release_path(path);
1304 1320
1305 if (ret < 0) 1321 if (ret < 0 && ret != -ENOENT)
1306 return ret; 1322 return ret;
1307 return nlink; 1323 return nlink;
1308} 1324}
@@ -1394,9 +1410,6 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1394 nlink = ret; 1410 nlink = ret;
1395 1411
1396 ret = count_inode_extrefs(root, inode, path); 1412 ret = count_inode_extrefs(root, inode, path);
1397 if (ret == -ENOENT)
1398 ret = 0;
1399
1400 if (ret < 0) 1413 if (ret < 0)
1401 goto out; 1414 goto out;
1402 1415
@@ -1557,6 +1570,30 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans,
1557} 1570}
1558 1571
1559/* 1572/*
1573 * Return true if an inode reference exists in the log for the given name,
1574 * inode and parent inode.
1575 */
1576static bool name_in_log_ref(struct btrfs_root *log_root,
1577 const char *name, const int name_len,
1578 const u64 dirid, const u64 ino)
1579{
1580 struct btrfs_key search_key;
1581
1582 search_key.objectid = ino;
1583 search_key.type = BTRFS_INODE_REF_KEY;
1584 search_key.offset = dirid;
1585 if (backref_in_log(log_root, &search_key, dirid, name, name_len))
1586 return true;
1587
1588 search_key.type = BTRFS_INODE_EXTREF_KEY;
1589 search_key.offset = btrfs_extref_hash(dirid, name, name_len);
1590 if (backref_in_log(log_root, &search_key, dirid, name, name_len))
1591 return true;
1592
1593 return false;
1594}
1595
1596/*
1560 * take a single entry in a log directory item and replay it into 1597 * take a single entry in a log directory item and replay it into
1561 * the subvolume. 1598 * the subvolume.
1562 * 1599 *
@@ -1666,10 +1703,17 @@ out:
1666 return ret; 1703 return ret;
1667 1704
1668insert: 1705insert:
1706 if (name_in_log_ref(root->log_root, name, name_len,
1707 key->objectid, log_key.objectid)) {
1708 /* The dentry will be added later. */
1709 ret = 0;
1710 update_size = false;
1711 goto out;
1712 }
1669 btrfs_release_path(path); 1713 btrfs_release_path(path);
1670 ret = insert_one_name(trans, root, path, key->objectid, key->offset, 1714 ret = insert_one_name(trans, root, path, key->objectid, key->offset,
1671 name, name_len, log_type, &log_key); 1715 name, name_len, log_type, &log_key);
1672 if (ret && ret != -ENOENT) 1716 if (ret && ret != -ENOENT && ret != -EEXIST)
1673 goto out; 1717 goto out;
1674 update_size = false; 1718 update_size = false;
1675 ret = 0; 1719 ret = 0;
@@ -2164,7 +2208,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
2164 parent = path->nodes[*level]; 2208 parent = path->nodes[*level];
2165 root_owner = btrfs_header_owner(parent); 2209 root_owner = btrfs_header_owner(parent);
2166 2210
2167 next = btrfs_find_create_tree_block(root, bytenr, blocksize); 2211 next = btrfs_find_create_tree_block(root, bytenr);
2168 if (!next) 2212 if (!next)
2169 return -ENOMEM; 2213 return -ENOMEM;
2170 2214
@@ -2416,8 +2460,8 @@ static void wait_for_writer(struct btrfs_trans_handle *trans,
2416 mutex_unlock(&root->log_mutex); 2460 mutex_unlock(&root->log_mutex);
2417 if (atomic_read(&root->log_writers)) 2461 if (atomic_read(&root->log_writers))
2418 schedule(); 2462 schedule();
2419 mutex_lock(&root->log_mutex);
2420 finish_wait(&root->log_writer_wait, &wait); 2463 finish_wait(&root->log_writer_wait, &wait);
2464 mutex_lock(&root->log_mutex);
2421 } 2465 }
2422} 2466}
2423 2467
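
The reorder above matters: finish_wait() restores the task state and
unlinks the on-stack wait entry, and doing that before re-taking log_mutex
means the task never blocks on the mutex while still queued in
TASK_UNINTERRUPTIBLE. The full corrected sequence, reconstructed from this
hunk plus the prepare_to_wait() call earlier in wait_for_writer() (not
shown in the hunk):

	prepare_to_wait(&root->log_writer_wait, &wait,
			TASK_UNINTERRUPTIBLE);
	mutex_unlock(&root->log_mutex);
	if (atomic_read(&root->log_writers))
		schedule();
	finish_wait(&root->log_writer_wait, &wait);
	mutex_lock(&root->log_mutex);
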
@@ -2591,6 +2635,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2591 } 2635 }
2592 2636
2593 if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { 2637 if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
2638 blk_finish_plug(&plug);
2594 mutex_unlock(&log_root_tree->log_mutex); 2639 mutex_unlock(&log_root_tree->log_mutex);
2595 ret = root_log_ctx.log_ret; 2640 ret = root_log_ctx.log_ret;
2596 goto out; 2641 goto out;
@@ -3218,7 +3263,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
3218static void fill_inode_item(struct btrfs_trans_handle *trans, 3263static void fill_inode_item(struct btrfs_trans_handle *trans,
3219 struct extent_buffer *leaf, 3264 struct extent_buffer *leaf,
3220 struct btrfs_inode_item *item, 3265 struct btrfs_inode_item *item,
3221 struct inode *inode, int log_inode_only) 3266 struct inode *inode, int log_inode_only,
3267 u64 logged_isize)
3222{ 3268{
3223 struct btrfs_map_token token; 3269 struct btrfs_map_token token;
3224 3270
@@ -3231,7 +3277,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
3231 * to say 'update this inode with these values' 3277 * to say 'update this inode with these values'
3232 */ 3278 */
3233 btrfs_set_token_inode_generation(leaf, item, 0, &token); 3279 btrfs_set_token_inode_generation(leaf, item, 0, &token);
3234 btrfs_set_token_inode_size(leaf, item, 0, &token); 3280 btrfs_set_token_inode_size(leaf, item, logged_isize, &token);
3235 } else { 3281 } else {
3236 btrfs_set_token_inode_generation(leaf, item, 3282 btrfs_set_token_inode_generation(leaf, item,
3237 BTRFS_I(inode)->generation, 3283 BTRFS_I(inode)->generation,
@@ -3244,19 +3290,19 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
3244 btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); 3290 btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
3245 btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); 3291 btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
3246 3292
3247 btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item), 3293 btrfs_set_token_timespec_sec(leaf, &item->atime,
3248 inode->i_atime.tv_sec, &token); 3294 inode->i_atime.tv_sec, &token);
3249 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item), 3295 btrfs_set_token_timespec_nsec(leaf, &item->atime,
3250 inode->i_atime.tv_nsec, &token); 3296 inode->i_atime.tv_nsec, &token);
3251 3297
3252 btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item), 3298 btrfs_set_token_timespec_sec(leaf, &item->mtime,
3253 inode->i_mtime.tv_sec, &token); 3299 inode->i_mtime.tv_sec, &token);
3254 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item), 3300 btrfs_set_token_timespec_nsec(leaf, &item->mtime,
3255 inode->i_mtime.tv_nsec, &token); 3301 inode->i_mtime.tv_nsec, &token);
3256 3302
3257 btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item), 3303 btrfs_set_token_timespec_sec(leaf, &item->ctime,
3258 inode->i_ctime.tv_sec, &token); 3304 inode->i_ctime.tv_sec, &token);
3259 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item), 3305 btrfs_set_token_timespec_nsec(leaf, &item->ctime,
3260 inode->i_ctime.tv_nsec, &token); 3306 inode->i_ctime.tv_nsec, &token);
3261 3307
3262 btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), 3308 btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
@@ -3283,7 +3329,7 @@ static int log_inode_item(struct btrfs_trans_handle *trans,
3283 return ret; 3329 return ret;
3284 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3330 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3285 struct btrfs_inode_item); 3331 struct btrfs_inode_item);
3286 fill_inode_item(trans, path->nodes[0], inode_item, inode, 0); 3332 fill_inode_item(trans, path->nodes[0], inode_item, inode, 0, 0);
3287 btrfs_release_path(path); 3333 btrfs_release_path(path);
3288 return 0; 3334 return 0;
3289} 3335}
@@ -3292,7 +3338,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
3292 struct inode *inode, 3338 struct inode *inode,
3293 struct btrfs_path *dst_path, 3339 struct btrfs_path *dst_path,
3294 struct btrfs_path *src_path, u64 *last_extent, 3340 struct btrfs_path *src_path, u64 *last_extent,
3295 int start_slot, int nr, int inode_only) 3341 int start_slot, int nr, int inode_only,
3342 u64 logged_isize)
3296{ 3343{
3297 unsigned long src_offset; 3344 unsigned long src_offset;
3298 unsigned long dst_offset; 3345 unsigned long dst_offset;
@@ -3349,7 +3396,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
3349 dst_path->slots[0], 3396 dst_path->slots[0],
3350 struct btrfs_inode_item); 3397 struct btrfs_inode_item);
3351 fill_inode_item(trans, dst_path->nodes[0], inode_item, 3398 fill_inode_item(trans, dst_path->nodes[0], inode_item,
3352 inode, inode_only == LOG_INODE_EXISTS); 3399 inode, inode_only == LOG_INODE_EXISTS,
3400 logged_isize);
3353 } else { 3401 } else {
3354 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 3402 copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
3355 src_offset, ins_sizes[i]); 3403 src_offset, ins_sizes[i]);
@@ -3901,6 +3949,33 @@ process:
3901 return ret; 3949 return ret;
3902} 3950}
3903 3951
3952static int logged_inode_size(struct btrfs_root *log, struct inode *inode,
3953 struct btrfs_path *path, u64 *size_ret)
3954{
3955 struct btrfs_key key;
3956 int ret;
3957
3958 key.objectid = btrfs_ino(inode);
3959 key.type = BTRFS_INODE_ITEM_KEY;
3960 key.offset = 0;
3961
3962 ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
3963 if (ret < 0) {
3964 return ret;
3965 } else if (ret > 0) {
3966 *size_ret = i_size_read(inode);
3967 } else {
3968 struct btrfs_inode_item *item;
3969
3970 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3971 struct btrfs_inode_item);
3972 *size_ret = btrfs_inode_size(path->nodes[0], item);
3973 }
3974
3975 btrfs_release_path(path);
3976 return 0;
3977}
3978
3904/* log a single inode in the tree log. 3979/* log a single inode in the tree log.
3905 * At least one parent directory for this inode must exist in the tree 3980 * At least one parent directory for this inode must exist in the tree
3906 * or be logged already. 3981 * or be logged already.
@@ -3938,6 +4013,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3938 bool fast_search = false; 4013 bool fast_search = false;
3939 u64 ino = btrfs_ino(inode); 4014 u64 ino = btrfs_ino(inode);
3940 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 4015 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
4016 u64 logged_isize = 0;
3941 4017
3942 path = btrfs_alloc_path(); 4018 path = btrfs_alloc_path();
3943 if (!path) 4019 if (!path)
@@ -3965,15 +4041,22 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3965 max_key.type = (u8)-1; 4041 max_key.type = (u8)-1;
3966 max_key.offset = (u64)-1; 4042 max_key.offset = (u64)-1;
3967 4043
3968 /* Only run delayed items if we are a dir or a new file */ 4044 /*
4045 * Only run delayed items if we are a dir or a new file.
4046 * Otherwise commit the delayed inode only, which is needed in
4047 * order for the log replay code to mark inodes for link count
4048 * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items).
4049 */
3969 if (S_ISDIR(inode->i_mode) || 4050 if (S_ISDIR(inode->i_mode) ||
3970 BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) { 4051 BTRFS_I(inode)->generation > root->fs_info->last_trans_committed)
3971 ret = btrfs_commit_inode_delayed_items(trans, inode); 4052 ret = btrfs_commit_inode_delayed_items(trans, inode);
3972 if (ret) { 4053 else
3973 btrfs_free_path(path); 4054 ret = btrfs_commit_inode_delayed_inode(inode);
3974 btrfs_free_path(dst_path); 4055
3975 return ret; 4056 if (ret) {
3976 } 4057 btrfs_free_path(path);
4058 btrfs_free_path(dst_path);
4059 return ret;
3977 } 4060 }
3978 4061
3979 mutex_lock(&BTRFS_I(inode)->log_mutex); 4062 mutex_lock(&BTRFS_I(inode)->log_mutex);
@@ -3987,22 +4070,56 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3987 if (S_ISDIR(inode->i_mode)) { 4070 if (S_ISDIR(inode->i_mode)) {
3988 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; 4071 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
3989 4072
3990 if (inode_only == LOG_INODE_EXISTS) 4073 if (inode_only == LOG_INODE_EXISTS) {
3991 max_key_type = BTRFS_XATTR_ITEM_KEY; 4074 max_key_type = BTRFS_INODE_EXTREF_KEY;
4075 max_key.type = max_key_type;
4076 }
3992 ret = drop_objectid_items(trans, log, path, ino, max_key_type); 4077 ret = drop_objectid_items(trans, log, path, ino, max_key_type);
3993 } else { 4078 } else {
3994 if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 4079 if (inode_only == LOG_INODE_EXISTS) {
3995 &BTRFS_I(inode)->runtime_flags)) { 4080 /*
3996 clear_bit(BTRFS_INODE_COPY_EVERYTHING, 4081 * Make sure the new inode item we write to the log has
3997 &BTRFS_I(inode)->runtime_flags); 4082 * the same isize as the current one (if it exists).
3998 ret = btrfs_truncate_inode_items(trans, log, 4083 * This is necessary to prevent data loss after log
3999 inode, 0, 0); 4084 * replay, and also to prevent doing a wrong expanding
4000 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, 4085 * truncate - for e.g. create file, write 4K into offset
4001 &BTRFS_I(inode)->runtime_flags) || 4086 * 0, fsync, write 4K into offset 4096, add hard link,
4087 * fsync some other file (to sync log), power fail - if
4088 * we use the inode's current i_size, after log replay
4089 * we get a 8Kb file, with the last 4Kb extent as a hole
4090 * (zeroes), as if an expanding truncate happened,
4091 * instead of getting a file of 4Kb only.
4092 */
4093 err = logged_inode_size(log, inode, path,
4094 &logged_isize);
4095 if (err)
4096 goto out_unlock;
4097 }
4098 if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
4099 &BTRFS_I(inode)->runtime_flags)) {
4100 if (inode_only == LOG_INODE_EXISTS) {
4101 max_key.type = BTRFS_INODE_EXTREF_KEY;
4102 ret = drop_objectid_items(trans, log, path, ino,
4103 max_key.type);
4104 } else {
4105 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
4106 &BTRFS_I(inode)->runtime_flags);
4107 clear_bit(BTRFS_INODE_COPY_EVERYTHING,
4108 &BTRFS_I(inode)->runtime_flags);
4109 ret = btrfs_truncate_inode_items(trans, log,
4110 inode, 0, 0);
4111 }
4112 } else if (test_bit(BTRFS_INODE_COPY_EVERYTHING,
4113 &BTRFS_I(inode)->runtime_flags) ||
4002 inode_only == LOG_INODE_EXISTS) { 4114 inode_only == LOG_INODE_EXISTS) {
4003 if (inode_only == LOG_INODE_ALL) 4115 if (inode_only == LOG_INODE_ALL) {
4116 clear_bit(BTRFS_INODE_COPY_EVERYTHING,
4117 &BTRFS_I(inode)->runtime_flags);
4004 fast_search = true; 4118 fast_search = true;
4005 max_key.type = BTRFS_XATTR_ITEM_KEY; 4119 max_key.type = BTRFS_XATTR_ITEM_KEY;
4120 } else {
4121 max_key.type = BTRFS_INODE_EXTREF_KEY;
4122 }
4006 ret = drop_objectid_items(trans, log, path, ino, 4123 ret = drop_objectid_items(trans, log, path, ino,
4007 max_key.type); 4124 max_key.type);
4008 } else { 4125 } else {
@@ -4046,7 +4163,8 @@ again:
4046 } 4163 }
4047 4164
4048 ret = copy_items(trans, inode, dst_path, path, &last_extent, 4165 ret = copy_items(trans, inode, dst_path, path, &last_extent,
4049 ins_start_slot, ins_nr, inode_only); 4166 ins_start_slot, ins_nr, inode_only,
4167 logged_isize);
4050 if (ret < 0) { 4168 if (ret < 0) {
4051 err = ret; 4169 err = ret;
4052 goto out_unlock; 4170 goto out_unlock;
@@ -4070,7 +4188,7 @@ next_slot:
4070 if (ins_nr) { 4188 if (ins_nr) {
4071 ret = copy_items(trans, inode, dst_path, path, 4189 ret = copy_items(trans, inode, dst_path, path,
4072 &last_extent, ins_start_slot, 4190 &last_extent, ins_start_slot,
4073 ins_nr, inode_only); 4191 ins_nr, inode_only, logged_isize);
4074 if (ret < 0) { 4192 if (ret < 0) {
4075 err = ret; 4193 err = ret;
4076 goto out_unlock; 4194 goto out_unlock;
@@ -4091,7 +4209,8 @@ next_slot:
4091 } 4209 }
4092 if (ins_nr) { 4210 if (ins_nr) {
4093 ret = copy_items(trans, inode, dst_path, path, &last_extent, 4211 ret = copy_items(trans, inode, dst_path, path, &last_extent,
4094 ins_start_slot, ins_nr, inode_only); 4212 ins_start_slot, ins_nr, inode_only,
4213 logged_isize);
4095 if (ret < 0) { 4214 if (ret < 0) {
4096 err = ret; 4215 err = ret;
4097 goto out_unlock; 4216 goto out_unlock;
@@ -4272,6 +4391,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
4272 struct dentry *old_parent = NULL; 4391 struct dentry *old_parent = NULL;
4273 int ret = 0; 4392 int ret = 0;
4274 u64 last_committed = root->fs_info->last_trans_committed; 4393 u64 last_committed = root->fs_info->last_trans_committed;
4394 const struct dentry * const first_parent = parent;
4395 const bool did_unlink = (BTRFS_I(inode)->last_unlink_trans >
4396 last_committed);
4275 4397
4276 sb = inode->i_sb; 4398 sb = inode->i_sb;
4277 4399
@@ -4327,7 +4449,6 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
4327 goto end_trans; 4449 goto end_trans;
4328 } 4450 }
4329 4451
4330 inode_only = LOG_INODE_EXISTS;
4331 while (1) { 4452 while (1) {
4332 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) 4453 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
4333 break; 4454 break;
@@ -4336,8 +4457,22 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
4336 if (root != BTRFS_I(inode)->root) 4457 if (root != BTRFS_I(inode)->root)
4337 break; 4458 break;
4338 4459
4460 /*
4461 * On unlink we must make sure our immediate parent directory
4462 * inode is fully logged. This is to prevent leaving dangling
4463 * directory index entries and a wrong directory inode's i_size.
4464 * Not doing so can result in a directory being impossible to
4465 * delete after log replay (rmdir will always fail with error
4466 * -ENOTEMPTY).
4467 */
4468 if (did_unlink && parent == first_parent)
4469 inode_only = LOG_INODE_ALL;
4470 else
4471 inode_only = LOG_INODE_EXISTS;
4472
4339 if (BTRFS_I(inode)->generation > 4473 if (BTRFS_I(inode)->generation >
4340 root->fs_info->last_trans_committed) { 4474 root->fs_info->last_trans_committed ||
4475 inode_only == LOG_INODE_ALL) {
4341 ret = btrfs_log_inode(trans, root, inode, inode_only, 4476 ret = btrfs_log_inode(trans, root, inode, inode_only,
4342 0, LLONG_MAX, ctx); 4477 0, LLONG_MAX, ctx);
4343 if (ret) 4478 if (ret)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 50c5a8762aed..cd4d1315aaa9 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1310,6 +1310,8 @@ again:
1310 if (ret) { 1310 if (ret) {
1311 btrfs_error(root->fs_info, ret, 1311 btrfs_error(root->fs_info, ret,
1312 "Failed to remove dev extent item"); 1312 "Failed to remove dev extent item");
1313 } else {
1314 trans->transaction->have_free_bgs = 1;
1313 } 1315 }
1314out: 1316out:
1315 btrfs_free_path(path); 1317 btrfs_free_path(path);
@@ -4196,7 +4198,7 @@ static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target)
4196 4198
4197static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) 4199static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
4198{ 4200{
4199 if (!(type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))) 4201 if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
4200 return; 4202 return;
4201 4203
4202 btrfs_set_fs_incompat(info, RAID56); 4204 btrfs_set_fs_incompat(info, RAID56);
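
Every two-flag test collapsed throughout this patch relies on the same
definition: BTRFS_BLOCK_GROUP_RAID56_MASK is just the OR of the two profile
bits, so a single "type & mask" matches either RAID5 or RAID6. The define
lives in ctree.h, outside this diff, presumably as:

	#define BTRFS_BLOCK_GROUP_RAID56_MASK	(BTRFS_BLOCK_GROUP_RAID5 |	\
						 BTRFS_BLOCK_GROUP_RAID6)
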
@@ -4803,10 +4805,8 @@ unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
4803 4805
4804 BUG_ON(em->start > logical || em->start + em->len < logical); 4806 BUG_ON(em->start > logical || em->start + em->len < logical);
4805 map = (struct map_lookup *)em->bdev; 4807 map = (struct map_lookup *)em->bdev;
4806 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 4808 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
4807 BTRFS_BLOCK_GROUP_RAID6)) {
4808 len = map->stripe_len * nr_data_stripes(map); 4809 len = map->stripe_len * nr_data_stripes(map);
4809 }
4810 free_extent_map(em); 4810 free_extent_map(em);
4811 return len; 4811 return len;
4812} 4812}
@@ -4826,8 +4826,7 @@ int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
4826 4826
4827 BUG_ON(em->start > logical || em->start + em->len < logical); 4827 BUG_ON(em->start > logical || em->start + em->len < logical);
4828 map = (struct map_lookup *)em->bdev; 4828 map = (struct map_lookup *)em->bdev;
4829 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 4829 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
4830 BTRFS_BLOCK_GROUP_RAID6))
4831 ret = 1; 4830 ret = 1;
4832 free_extent_map(em); 4831 free_extent_map(em);
4833 return ret; 4832 return ret;
@@ -4876,32 +4875,24 @@ static inline int parity_smaller(u64 a, u64 b)
4876} 4875}
4877 4876
4878/* Bubble-sort the stripe set to put the parity/syndrome stripes last */ 4877/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
4879static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map) 4878static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
4880{ 4879{
4881 struct btrfs_bio_stripe s; 4880 struct btrfs_bio_stripe s;
4882 int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
4883 int i; 4881 int i;
4884 u64 l; 4882 u64 l;
4885 int again = 1; 4883 int again = 1;
4886 int m;
4887 4884
4888 while (again) { 4885 while (again) {
4889 again = 0; 4886 again = 0;
4890 for (i = 0; i < real_stripes - 1; i++) { 4887 for (i = 0; i < num_stripes - 1; i++) {
4891 if (parity_smaller(raid_map[i], raid_map[i+1])) { 4888 if (parity_smaller(bbio->raid_map[i],
4889 bbio->raid_map[i+1])) {
4892 s = bbio->stripes[i]; 4890 s = bbio->stripes[i];
4893 l = raid_map[i]; 4891 l = bbio->raid_map[i];
4894 bbio->stripes[i] = bbio->stripes[i+1]; 4892 bbio->stripes[i] = bbio->stripes[i+1];
4895 raid_map[i] = raid_map[i+1]; 4893 bbio->raid_map[i] = bbio->raid_map[i+1];
4896 bbio->stripes[i+1] = s; 4894 bbio->stripes[i+1] = s;
4897 raid_map[i+1] = l; 4895 bbio->raid_map[i+1] = l;
4898
4899 if (bbio->tgtdev_map) {
4900 m = bbio->tgtdev_map[i];
4901 bbio->tgtdev_map[i] =
4902 bbio->tgtdev_map[i + 1];
4903 bbio->tgtdev_map[i + 1] = m;
4904 }
4905 4896
4906 again = 1; 4897 again = 1;
4907 } 4898 }
@@ -4909,10 +4900,41 @@ static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map)
4909 } 4900 }
4910} 4901}
4911 4902
4903static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
4904{
4905 struct btrfs_bio *bbio = kzalloc(
4906 sizeof(struct btrfs_bio) +
4907 sizeof(struct btrfs_bio_stripe) * (total_stripes) +
4908 sizeof(int) * (real_stripes) +
4909 sizeof(u64) * (real_stripes),
4910 GFP_NOFS);
4911 if (!bbio)
4912 return NULL;
4913
4914 atomic_set(&bbio->error, 0);
4915 atomic_set(&bbio->refs, 1);
4916
4917 return bbio;
4918}
4919
4920void btrfs_get_bbio(struct btrfs_bio *bbio)
4921{
4922 WARN_ON(!atomic_read(&bbio->refs));
4923 atomic_inc(&bbio->refs);
4924}
4925
4926void btrfs_put_bbio(struct btrfs_bio *bbio)
4927{
4928 if (!bbio)
4929 return;
4930 if (atomic_dec_and_test(&bbio->refs))
4931 kfree(bbio);
4932}
4933
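
With alloc_btrfs_bio() the stripe array, the target-device map and the raid
map live in one kzalloc'd blob behind a refcount, so ownership can be
shared (for example by scrub's recover structs) and the last
btrfs_put_bbio() frees everything at once; btrfs_put_bbio(NULL) is a no-op,
which is what lets the earlier error paths drop their paired kfree() calls.
A hedged usage sketch (the fill step is illustrative, not taken from this
diff):

	struct btrfs_bio *bbio;

	bbio = alloc_btrfs_bio(total_stripes, real_stripes);	/* refs == 1 */
	if (!bbio)
		return -ENOMEM;
	/* ... fill bbio->stripes[] as __btrfs_map_block() does ... */
	btrfs_get_bbio(bbio);	/* hand a second reference to another owner */
	btrfs_put_bbio(bbio);	/* drop ours; refs back to 1 */
	btrfs_put_bbio(bbio);	/* the other owner's drop frees the blob */
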
4912static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, 4934static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4913 u64 logical, u64 *length, 4935 u64 logical, u64 *length,
4914 struct btrfs_bio **bbio_ret, 4936 struct btrfs_bio **bbio_ret,
4915 int mirror_num, u64 **raid_map_ret) 4937 int mirror_num, int need_raid_map)
4916{ 4938{
4917 struct extent_map *em; 4939 struct extent_map *em;
4918 struct map_lookup *map; 4940 struct map_lookup *map;
@@ -4925,7 +4947,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4925 u64 stripe_nr_orig; 4947 u64 stripe_nr_orig;
4926 u64 stripe_nr_end; 4948 u64 stripe_nr_end;
4927 u64 stripe_len; 4949 u64 stripe_len;
4928 u64 *raid_map = NULL;
4929 int stripe_index; 4950 int stripe_index;
4930 int i; 4951 int i;
4931 int ret = 0; 4952 int ret = 0;
@@ -4976,7 +4997,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4976 stripe_offset = offset - stripe_offset; 4997 stripe_offset = offset - stripe_offset;
4977 4998
4978 /* if we're here for raid56, we need to know the stripe aligned start */ 4999 /* if we're here for raid56, we need to know the stripe aligned start */
4979 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { 5000 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
4980 unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); 5001 unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
4981 raid56_full_stripe_start = offset; 5002 raid56_full_stripe_start = offset;
4982 5003
@@ -4989,8 +5010,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4989 5010
4990 if (rw & REQ_DISCARD) { 5011 if (rw & REQ_DISCARD) {
4991 /* we don't discard raid56 yet */ 5012 /* we don't discard raid56 yet */
4992 if (map->type & 5013 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
4993 (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
4994 ret = -EOPNOTSUPP; 5014 ret = -EOPNOTSUPP;
4995 goto out; 5015 goto out;
4996 } 5016 }
@@ -5000,7 +5020,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5000 /* For writes to RAID[56], allow a full stripeset across all disks. 5020 /* For writes to RAID[56], allow a full stripeset across all disks.
5001 For other RAID types and for RAID[56] reads, just allow a single 5021 For other RAID types and for RAID[56] reads, just allow a single
5002 stripe (on a single disk). */ 5022 stripe (on a single disk). */
5003 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) && 5023 if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
5004 (rw & REQ_WRITE)) { 5024 (rw & REQ_WRITE)) {
5005 max_len = stripe_len * nr_data_stripes(map) - 5025 max_len = stripe_len * nr_data_stripes(map) -
5006 (offset - raid56_full_stripe_start); 5026 (offset - raid56_full_stripe_start);
@@ -5047,7 +5067,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5047 u64 physical_of_found = 0; 5067 u64 physical_of_found = 0;
5048 5068
5049 ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, 5069 ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS,
5050 logical, &tmp_length, &tmp_bbio, 0, NULL); 5070 logical, &tmp_length, &tmp_bbio, 0, 0);
5051 if (ret) { 5071 if (ret) {
5052 WARN_ON(tmp_bbio != NULL); 5072 WARN_ON(tmp_bbio != NULL);
5053 goto out; 5073 goto out;
@@ -5061,7 +5081,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5061 * is not left of the left cursor 5081 * is not left of the left cursor
5062 */ 5082 */
5063 ret = -EIO; 5083 ret = -EIO;
5064 kfree(tmp_bbio); 5084 btrfs_put_bbio(tmp_bbio);
5065 goto out; 5085 goto out;
5066 } 5086 }
5067 5087
@@ -5096,11 +5116,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5096 } else { 5116 } else {
5097 WARN_ON(1); 5117 WARN_ON(1);
5098 ret = -EIO; 5118 ret = -EIO;
5099 kfree(tmp_bbio); 5119 btrfs_put_bbio(tmp_bbio);
5100 goto out; 5120 goto out;
5101 } 5121 }
5102 5122
5103 kfree(tmp_bbio); 5123 btrfs_put_bbio(tmp_bbio);
5104 } else if (mirror_num > map->num_stripes) { 5124 } else if (mirror_num > map->num_stripes) {
5105 mirror_num = 0; 5125 mirror_num = 0;
5106 } 5126 }
@@ -5166,15 +5186,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5166 mirror_num = stripe_index - old_stripe_index + 1; 5186 mirror_num = stripe_index - old_stripe_index + 1;
5167 } 5187 }
5168 5188
5169 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 5189 } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
5170 BTRFS_BLOCK_GROUP_RAID6)) { 5190 if (need_raid_map &&
5171 u64 tmp;
5172
5173 if (raid_map_ret &&
5174 ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) || 5191 ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) ||
5175 mirror_num > 1)) { 5192 mirror_num > 1)) {
5176 int i, rot;
5177
5178 /* push stripe_nr back to the start of the full stripe */ 5193 /* push stripe_nr back to the start of the full stripe */
5179 stripe_nr = raid56_full_stripe_start; 5194 stripe_nr = raid56_full_stripe_start;
5180 do_div(stripe_nr, stripe_len * nr_data_stripes(map)); 5195 do_div(stripe_nr, stripe_len * nr_data_stripes(map));
@@ -5183,32 +5198,12 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5183 num_stripes = map->num_stripes; 5198 num_stripes = map->num_stripes;
5184 max_errors = nr_parity_stripes(map); 5199 max_errors = nr_parity_stripes(map);
5185 5200
5186 raid_map = kmalloc_array(num_stripes, sizeof(u64),
5187 GFP_NOFS);
5188 if (!raid_map) {
5189 ret = -ENOMEM;
5190 goto out;
5191 }
5192
5193 /* Work out the disk rotation on this stripe-set */
5194 tmp = stripe_nr;
5195 rot = do_div(tmp, num_stripes);
5196
5197 /* Fill in the logical address of each stripe */
5198 tmp = stripe_nr * nr_data_stripes(map);
5199 for (i = 0; i < nr_data_stripes(map); i++)
5200 raid_map[(i+rot) % num_stripes] =
5201 em->start + (tmp + i) * map->stripe_len;
5202
5203 raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
5204 if (map->type & BTRFS_BLOCK_GROUP_RAID6)
5205 raid_map[(i+rot+1) % num_stripes] =
5206 RAID6_Q_STRIPE;
5207
5208 *length = map->stripe_len; 5201 *length = map->stripe_len;
5209 stripe_index = 0; 5202 stripe_index = 0;
5210 stripe_offset = 0; 5203 stripe_offset = 0;
5211 } else { 5204 } else {
5205 u64 tmp;
5206
5212 /* 5207 /*
5213 * Mirror #0 or #1 means the original data block. 5208 * Mirror #0 or #1 means the original data block.
5214 * Mirror #2 is RAID5 parity block. 5209 * Mirror #2 is RAID5 parity block.
@@ -5246,17 +5241,42 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5246 tgtdev_indexes = num_stripes; 5241 tgtdev_indexes = num_stripes;
5247 } 5242 }
5248 5243
5249 bbio = kzalloc(btrfs_bio_size(num_alloc_stripes, tgtdev_indexes), 5244 bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes);
5250 GFP_NOFS);
5251 if (!bbio) { 5245 if (!bbio) {
5252 kfree(raid_map);
5253 ret = -ENOMEM; 5246 ret = -ENOMEM;
5254 goto out; 5247 goto out;
5255 } 5248 }
5256 atomic_set(&bbio->error, 0);
5257 if (dev_replace_is_ongoing) 5249 if (dev_replace_is_ongoing)
5258 bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes); 5250 bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes);
5259 5251
5252 /* build raid_map */
5253 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK &&
5254 need_raid_map && ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) ||
5255 mirror_num > 1)) {
5256 u64 tmp;
5257 int i, rot;
5258
5259 bbio->raid_map = (u64 *)((void *)bbio->stripes +
5260 sizeof(struct btrfs_bio_stripe) *
5261 num_alloc_stripes +
5262 sizeof(int) * tgtdev_indexes);
5263
5264 /* Work out the disk rotation on this stripe-set */
5265 tmp = stripe_nr;
5266 rot = do_div(tmp, num_stripes);
5267
5268 /* Fill in the logical address of each stripe */
5269 tmp = stripe_nr * nr_data_stripes(map);
5270 for (i = 0; i < nr_data_stripes(map); i++)
5271 bbio->raid_map[(i+rot) % num_stripes] =
5272 em->start + (tmp + i) * map->stripe_len;
5273
5274 bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
5275 if (map->type & BTRFS_BLOCK_GROUP_RAID6)
5276 bbio->raid_map[(i+rot+1) % num_stripes] =
5277 RAID6_Q_STRIPE;
5278 }
5279
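The raid_map built above records, for every on-disk stripe slot, the logical address it carries, rotating the parity position from one stripe-set to the next. The placement arithmetic in isolation, as a hedged sketch — nr_data, num_stripes, and stripe_len stand in for the map_lookup fields, and the sentinels mirror RAID5_P_STRIPE/RAID6_Q_STRIPE:

#include <stdint.h>

#define P_STRIPE ((uint64_t)-2)	/* parity sentinel, as RAID5_P_STRIPE */
#define Q_STRIPE ((uint64_t)-1)	/* second parity, as RAID6_Q_STRIPE */

static void build_raid_map(uint64_t *raid_map, uint64_t em_start,
			   uint64_t stripe_nr, uint64_t stripe_len,
			   int nr_data, int num_stripes, int is_raid6)
{
	/* disk rotation of the parity slot within this stripe-set */
	int rot = (int)(stripe_nr % num_stripes);
	uint64_t tmp = stripe_nr * (uint64_t)nr_data;
	int i;

	/* logical address served by each data slot, rotated by rot */
	for (i = 0; i < nr_data; i++)
		raid_map[(i + rot) % num_stripes] =
			em_start + (tmp + i) * stripe_len;

	raid_map[(i + rot) % num_stripes] = P_STRIPE;
	if (is_raid6)
		raid_map[(i + rot + 1) % num_stripes] = Q_STRIPE;
}

sort_parity_stripes() then bubbles the entries (together with the matching stripes and tgtdev_map slots) into ascending order, so raid_map[0] is the lowest logical address — the start of the full stripe — with P and Q, whose sentinel values compare highest, at the end.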
5260 if (rw & REQ_DISCARD) { 5280 if (rw & REQ_DISCARD) {
5261 int factor = 0; 5281 int factor = 0;
5262 int sub_stripes = 0; 5282 int sub_stripes = 0;
@@ -5340,6 +5360,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5340 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) 5360 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
5341 max_errors = btrfs_chunk_max_errors(map); 5361 max_errors = btrfs_chunk_max_errors(map);
5342 5362
5363 if (bbio->raid_map)
5364 sort_parity_stripes(bbio, num_stripes);
5365
5343 tgtdev_indexes = 0; 5366 tgtdev_indexes = 0;
5344 if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) && 5367 if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) &&
5345 dev_replace->tgtdev != NULL) { 5368 dev_replace->tgtdev != NULL) {
@@ -5427,6 +5450,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5427 } 5450 }
5428 5451
5429 *bbio_ret = bbio; 5452 *bbio_ret = bbio;
5453 bbio->map_type = map->type;
5430 bbio->num_stripes = num_stripes; 5454 bbio->num_stripes = num_stripes;
5431 bbio->max_errors = max_errors; 5455 bbio->max_errors = max_errors;
5432 bbio->mirror_num = mirror_num; 5456 bbio->mirror_num = mirror_num;
@@ -5443,10 +5467,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5443 bbio->stripes[0].physical = physical_to_patch_in_first_stripe; 5467 bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
5444 bbio->mirror_num = map->num_stripes + 1; 5468 bbio->mirror_num = map->num_stripes + 1;
5445 } 5469 }
5446 if (raid_map) {
5447 sort_parity_stripes(bbio, raid_map);
5448 *raid_map_ret = raid_map;
5449 }
5450out: 5470out:
5451 if (dev_replace_is_ongoing) 5471 if (dev_replace_is_ongoing)
5452 btrfs_dev_replace_unlock(dev_replace); 5472 btrfs_dev_replace_unlock(dev_replace);
@@ -5459,17 +5479,17 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5459 struct btrfs_bio **bbio_ret, int mirror_num) 5479 struct btrfs_bio **bbio_ret, int mirror_num)
5460{ 5480{
5461 return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, 5481 return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
5462 mirror_num, NULL); 5482 mirror_num, 0);
5463} 5483}
5464 5484
5465/* For Scrub/replace */ 5485/* For Scrub/replace */
5466int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw, 5486int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw,
5467 u64 logical, u64 *length, 5487 u64 logical, u64 *length,
5468 struct btrfs_bio **bbio_ret, int mirror_num, 5488 struct btrfs_bio **bbio_ret, int mirror_num,
5469 u64 **raid_map_ret) 5489 int need_raid_map)
5470{ 5490{
5471 return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, 5491 return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
5472 mirror_num, raid_map_ret); 5492 mirror_num, need_raid_map);
5473} 5493}
5474 5494
5475int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 5495int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
@@ -5511,8 +5531,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
5511 do_div(length, map->num_stripes / map->sub_stripes); 5531 do_div(length, map->num_stripes / map->sub_stripes);
5512 else if (map->type & BTRFS_BLOCK_GROUP_RAID0) 5532 else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
5513 do_div(length, map->num_stripes); 5533 do_div(length, map->num_stripes);
5514 else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 5534 else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
5515 BTRFS_BLOCK_GROUP_RAID6)) {
5516 do_div(length, nr_data_stripes(map)); 5535 do_div(length, nr_data_stripes(map));
5517 rmap_len = map->stripe_len * nr_data_stripes(map); 5536 rmap_len = map->stripe_len * nr_data_stripes(map);
5518 } 5537 }
@@ -5565,7 +5584,7 @@ static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio, int e
5565 bio_endio_nodec(bio, err); 5584 bio_endio_nodec(bio, err);
5566 else 5585 else
5567 bio_endio(bio, err); 5586 bio_endio(bio, err);
5568 kfree(bbio); 5587 btrfs_put_bbio(bbio);
5569} 5588}
5570 5589
5571static void btrfs_end_bio(struct bio *bio, int err) 5590static void btrfs_end_bio(struct bio *bio, int err)
@@ -5808,7 +5827,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
5808 u64 logical = (u64)bio->bi_iter.bi_sector << 9; 5827 u64 logical = (u64)bio->bi_iter.bi_sector << 9;
5809 u64 length = 0; 5828 u64 length = 0;
5810 u64 map_length; 5829 u64 map_length;
5811 u64 *raid_map = NULL;
5812 int ret; 5830 int ret;
5813 int dev_nr = 0; 5831 int dev_nr = 0;
5814 int total_devs = 1; 5832 int total_devs = 1;
@@ -5819,7 +5837,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
5819 5837
5820 btrfs_bio_counter_inc_blocked(root->fs_info); 5838 btrfs_bio_counter_inc_blocked(root->fs_info);
5821 ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, 5839 ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
5822 mirror_num, &raid_map); 5840 mirror_num, 1);
5823 if (ret) { 5841 if (ret) {
5824 btrfs_bio_counter_dec(root->fs_info); 5842 btrfs_bio_counter_dec(root->fs_info);
5825 return ret; 5843 return ret;
@@ -5832,15 +5850,13 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
5832 bbio->fs_info = root->fs_info; 5850 bbio->fs_info = root->fs_info;
5833 atomic_set(&bbio->stripes_pending, bbio->num_stripes); 5851 atomic_set(&bbio->stripes_pending, bbio->num_stripes);
5834 5852
5835 if (raid_map) { 5853 if (bbio->raid_map) {
5836 /* In this case, map_length has been set to the length of 5854 /* In this case, map_length has been set to the length of
5837 a single stripe, not the whole write */ 5855 a single stripe, not the whole write */
5838 if (rw & WRITE) { 5856 if (rw & WRITE) {
5839 ret = raid56_parity_write(root, bio, bbio, 5857 ret = raid56_parity_write(root, bio, bbio, map_length);
5840 raid_map, map_length);
5841 } else { 5858 } else {
5842 ret = raid56_parity_recover(root, bio, bbio, 5859 ret = raid56_parity_recover(root, bio, bbio, map_length,
5843 raid_map, map_length,
5844 mirror_num, 1); 5860 mirror_num, 1);
5845 } 5861 }
5846 5862
@@ -6238,17 +6254,22 @@ int btrfs_read_sys_array(struct btrfs_root *root)
6238 struct extent_buffer *sb; 6254 struct extent_buffer *sb;
6239 struct btrfs_disk_key *disk_key; 6255 struct btrfs_disk_key *disk_key;
6240 struct btrfs_chunk *chunk; 6256 struct btrfs_chunk *chunk;
6241 u8 *ptr; 6257 u8 *array_ptr;
6242 unsigned long sb_ptr; 6258 unsigned long sb_array_offset;
6243 int ret = 0; 6259 int ret = 0;
6244 u32 num_stripes; 6260 u32 num_stripes;
6245 u32 array_size; 6261 u32 array_size;
6246 u32 len = 0; 6262 u32 len = 0;
6247 u32 cur; 6263 u32 cur_offset;
6248 struct btrfs_key key; 6264 struct btrfs_key key;
6249 6265
6250 sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET, 6266 ASSERT(BTRFS_SUPER_INFO_SIZE <= root->nodesize);
6251 BTRFS_SUPER_INFO_SIZE); 6267 /*
6268 * This will create extent buffer of nodesize, superblock size is
6269 * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
6270 * overallocate but we can keep it as-is, only the first page is used.
6271 */
6272 sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET);
6252 if (!sb) 6273 if (!sb)
6253 return -ENOMEM; 6274 return -ENOMEM;
6254 btrfs_set_buffer_uptodate(sb); 6275 btrfs_set_buffer_uptodate(sb);
@@ -6271,35 +6292,56 @@ int btrfs_read_sys_array(struct btrfs_root *root)
6271 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); 6292 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
6272 array_size = btrfs_super_sys_array_size(super_copy); 6293 array_size = btrfs_super_sys_array_size(super_copy);
6273 6294
6274 ptr = super_copy->sys_chunk_array; 6295 array_ptr = super_copy->sys_chunk_array;
6275 sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array); 6296 sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
6276 cur = 0; 6297 cur_offset = 0;
6298
6299 while (cur_offset < array_size) {
6300 disk_key = (struct btrfs_disk_key *)array_ptr;
6301 len = sizeof(*disk_key);
6302 if (cur_offset + len > array_size)
6303 goto out_short_read;
6277 6304
6278 while (cur < array_size) {
6279 disk_key = (struct btrfs_disk_key *)ptr;
6280 btrfs_disk_key_to_cpu(&key, disk_key); 6305 btrfs_disk_key_to_cpu(&key, disk_key);
6281 6306
6282 len = sizeof(*disk_key); ptr += len; 6307 array_ptr += len;
6283 sb_ptr += len; 6308 sb_array_offset += len;
6284 cur += len; 6309 cur_offset += len;
6285 6310
6286 if (key.type == BTRFS_CHUNK_ITEM_KEY) { 6311 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
6287 chunk = (struct btrfs_chunk *)sb_ptr; 6312 chunk = (struct btrfs_chunk *)sb_array_offset;
6313 /*
6314 * At least one btrfs_chunk with one stripe must be
6315 * present, exact stripe count check comes afterwards
6316 */
6317 len = btrfs_chunk_item_size(1);
6318 if (cur_offset + len > array_size)
6319 goto out_short_read;
6320
6321 num_stripes = btrfs_chunk_num_stripes(sb, chunk);
6322 len = btrfs_chunk_item_size(num_stripes);
6323 if (cur_offset + len > array_size)
6324 goto out_short_read;
6325
6288 ret = read_one_chunk(root, &key, sb, chunk); 6326 ret = read_one_chunk(root, &key, sb, chunk);
6289 if (ret) 6327 if (ret)
6290 break; 6328 break;
6291 num_stripes = btrfs_chunk_num_stripes(sb, chunk);
6292 len = btrfs_chunk_item_size(num_stripes);
6293 } else { 6329 } else {
6294 ret = -EIO; 6330 ret = -EIO;
6295 break; 6331 break;
6296 } 6332 }
6297 ptr += len; 6333 array_ptr += len;
6298 sb_ptr += len; 6334 sb_array_offset += len;
6299 cur += len; 6335 cur_offset += len;
6300 } 6336 }
6301 free_extent_buffer(sb); 6337 free_extent_buffer(sb);
6302 return ret; 6338 return ret;
6339
6340out_short_read:
6341 printk(KERN_ERR "BTRFS: sys_array too short to read %u bytes at offset %u\n",
6342 len, cur_offset);
6343 free_extent_buffer(sb);
6344 return -EIO;
6303} 6345}
6304 6346
6305int btrfs_read_chunk_tree(struct btrfs_root *root) 6347int btrfs_read_chunk_tree(struct btrfs_root *root)
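The rewritten sys_chunk_array walk above validates the cursor three times before trusting anything it reads: a whole disk key must fit, then a minimal one-stripe chunk item, and only then the item at its declared stripe count. The shape of that pattern as a self-contained sketch — the struct layouts are simplified stand-ins for the btrfs on-disk format, and alignment/endianness handling is glossed over:

#include <stdint.h>
#include <stdio.h>

struct disk_key { uint64_t objectid; uint8_t type; uint64_t offset; };
struct chunk_stripe { uint64_t devid, offset; };
struct chunk_item { uint16_t num_stripes; struct chunk_stripe stripes[]; };

#define CHUNK_ITEM_KEY 228

static size_t chunk_item_size(int num_stripes)
{
	return sizeof(struct chunk_item) +
	       sizeof(struct chunk_stripe) * num_stripes;
}

/* returns 0 on success, -1 on a short or corrupt array */
static int walk_sys_array(const uint8_t *array, uint32_t array_size)
{
	uint32_t cur = 0, len;

	while (cur < array_size) {
		const struct disk_key *key;
		const struct chunk_item *chunk;

		len = sizeof(*key);
		if (cur + len > array_size)
			goto short_read;	/* the key would overrun */
		key = (const struct disk_key *)(array + cur);
		cur += len;

		if (key->type != CHUNK_ITEM_KEY)
			return -1;		/* kernel: -EIO */

		/* at least a one-stripe chunk must fit ... */
		len = chunk_item_size(1);
		if (cur + len > array_size)
			goto short_read;
		chunk = (const struct chunk_item *)(array + cur);

		/* ... then the item with its declared stripe count */
		len = chunk_item_size(chunk->num_stripes);
		if (cur + len > array_size)
			goto short_read;
		cur += len;			/* safe to consume the item */
	}
	return 0;

short_read:
	fprintf(stderr, "sys_array too short to read %u bytes at offset %u\n",
		len, cur);
	return -1;
}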
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index d6fe73c0f4a2..83069dec6898 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -295,8 +295,10 @@ typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
295#define BTRFS_BIO_ORIG_BIO_SUBMITTED (1 << 0) 295#define BTRFS_BIO_ORIG_BIO_SUBMITTED (1 << 0)
296 296
297struct btrfs_bio { 297struct btrfs_bio {
298 atomic_t refs;
298 atomic_t stripes_pending; 299 atomic_t stripes_pending;
299 struct btrfs_fs_info *fs_info; 300 struct btrfs_fs_info *fs_info;
301 u64 map_type; /* get from map_lookup->type */
300 bio_end_io_t *end_io; 302 bio_end_io_t *end_io;
301 struct bio *orig_bio; 303 struct bio *orig_bio;
302 unsigned long flags; 304 unsigned long flags;
@@ -307,6 +309,12 @@ struct btrfs_bio {
307 int mirror_num; 309 int mirror_num;
308 int num_tgtdevs; 310 int num_tgtdevs;
309 int *tgtdev_map; 311 int *tgtdev_map;
312 /*
 313 * Logical block numbers for the start of each stripe.
 314 * The last one or two are p/q. These are sorted,
315 * so raid_map[0] is the start of our full stripe
316 */
317 u64 *raid_map;
310 struct btrfs_bio_stripe stripes[]; 318 struct btrfs_bio_stripe stripes[];
311}; 319};
312 320
@@ -388,19 +396,15 @@ struct btrfs_balance_control {
388 396
389int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, 397int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
390 u64 end, u64 *length); 398 u64 end, u64 *length);
391 399void btrfs_get_bbio(struct btrfs_bio *bbio);
392#define btrfs_bio_size(total_stripes, real_stripes) \ 400void btrfs_put_bbio(struct btrfs_bio *bbio);
393 (sizeof(struct btrfs_bio) + \
394 (sizeof(struct btrfs_bio_stripe) * (total_stripes)) + \
395 (sizeof(int) * (real_stripes)))
396
397int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, 401int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
398 u64 logical, u64 *length, 402 u64 logical, u64 *length,
399 struct btrfs_bio **bbio_ret, int mirror_num); 403 struct btrfs_bio **bbio_ret, int mirror_num);
400int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw, 404int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw,
401 u64 logical, u64 *length, 405 u64 logical, u64 *length,
402 struct btrfs_bio **bbio_ret, int mirror_num, 406 struct btrfs_bio **bbio_ret, int mirror_num,
403 u64 **raid_map_ret); 407 int need_raid_map);
404int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 408int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
405 u64 chunk_start, u64 physical, u64 devid, 409 u64 chunk_start, u64 physical, u64 devid,
406 u64 **logical, int *naddrs, int *stripe_len); 410 u64 **logical, int *naddrs, int *stripe_len);
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
index 5bd853ba44ff..64fa248343f6 100644
--- a/fs/ceph/acl.c
+++ b/fs/ceph/acl.c
@@ -40,20 +40,6 @@ static inline void ceph_set_cached_acl(struct inode *inode,
40 spin_unlock(&ci->i_ceph_lock); 40 spin_unlock(&ci->i_ceph_lock);
41} 41}
42 42
43static inline struct posix_acl *ceph_get_cached_acl(struct inode *inode,
44 int type)
45{
46 struct ceph_inode_info *ci = ceph_inode(inode);
47 struct posix_acl *acl = ACL_NOT_CACHED;
48
49 spin_lock(&ci->i_ceph_lock);
50 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0))
51 acl = get_cached_acl(inode, type);
52 spin_unlock(&ci->i_ceph_lock);
53
54 return acl;
55}
56
57struct posix_acl *ceph_get_acl(struct inode *inode, int type) 43struct posix_acl *ceph_get_acl(struct inode *inode, int type)
58{ 44{
59 int size; 45 int size;
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index f5013d92a7e6..fd5599d32362 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -196,17 +196,22 @@ static int readpage_nounlock(struct file *filp, struct page *page)
196 u64 len = PAGE_CACHE_SIZE; 196 u64 len = PAGE_CACHE_SIZE;
197 197
198 if (off >= i_size_read(inode)) { 198 if (off >= i_size_read(inode)) {
199 zero_user_segment(page, err, PAGE_CACHE_SIZE); 199 zero_user_segment(page, 0, PAGE_CACHE_SIZE);
200 SetPageUptodate(page); 200 SetPageUptodate(page);
201 return 0; 201 return 0;
202 } 202 }
203 203
204 /* 204 if (ci->i_inline_version != CEPH_INLINE_NONE) {
205 * Uptodate inline data should have been added into page cache 205 /*
206 * while getting Fcr caps. 206 * Uptodate inline data should have been added
207 */ 207 * into page cache while getting Fcr caps.
208 if (ci->i_inline_version != CEPH_INLINE_NONE) 208 */
209 return -EINVAL; 209 if (off == 0)
210 return -EINVAL;
211 zero_user_segment(page, 0, PAGE_CACHE_SIZE);
212 SetPageUptodate(page);
213 return 0;
214 }
210 215
211 err = ceph_readpage_from_fscache(inode, page); 216 err = ceph_readpage_from_fscache(inode, page);
212 if (err == 0) 217 if (err == 0)
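The new branch above makes readpage work for inline files larger than one page: page 0 must already hold the inline data (it is populated while getting Fcr caps, so reaching readpage for it means -EINVAL), while every later page lies wholly beyond the inline region and can simply be zeroed and marked up to date. Reduced to a userspace model with assumed constants:

#include <stdint.h>
#include <string.h>

#define PAGE_CACHE_SIZE 4096
#define EINVAL 22

/* model of the inline-file readpage branch; page is one page's buffer */
static int readpage_inline(uint8_t *page, uint64_t off)
{
	if (off == 0)				/* should already be cached */
		return -EINVAL;
	memset(page, 0, PAGE_CACHE_SIZE);	/* zero_user_segment() */
	return 0;				/* SetPageUptodate() */
}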
@@ -1416,7 +1421,7 @@ void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
1416 } 1421 }
1417 } 1422 }
1418 1423
1419 dout("fill_inline_data %p %llx.%llx len %lu locked_page %p\n", 1424 dout("fill_inline_data %p %llx.%llx len %zu locked_page %p\n",
1420 inode, ceph_vinop(inode), len, locked_page); 1425 inode, ceph_vinop(inode), len, locked_page);
1421 1426
1422 if (len > 0) { 1427 if (len > 0) {
@@ -1569,7 +1574,6 @@ out:
1569static struct vm_operations_struct ceph_vmops = { 1574static struct vm_operations_struct ceph_vmops = {
1570 .fault = ceph_filemap_fault, 1575 .fault = ceph_filemap_fault,
1571 .page_mkwrite = ceph_page_mkwrite, 1576 .page_mkwrite = ceph_page_mkwrite,
1572 .remap_pages = generic_file_remap_pages,
1573}; 1577};
1574 1578
1575int ceph_mmap(struct file *file, struct vm_area_struct *vma) 1579int ceph_mmap(struct file *file, struct vm_area_struct *vma)
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index b93c631c6c87..8172775428a0 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -577,7 +577,6 @@ void ceph_add_cap(struct inode *inode,
577 struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc, 577 struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
578 realmino); 578 realmino);
579 if (realm) { 579 if (realm) {
580 ceph_get_snap_realm(mdsc, realm);
581 spin_lock(&realm->inodes_with_caps_lock); 580 spin_lock(&realm->inodes_with_caps_lock);
582 ci->i_snap_realm = realm; 581 ci->i_snap_realm = realm;
583 list_add(&ci->i_snap_realm_item, 582 list_add(&ci->i_snap_realm_item,
@@ -1451,8 +1450,8 @@ static int __mark_caps_flushing(struct inode *inode,
1451 spin_lock(&mdsc->cap_dirty_lock); 1450 spin_lock(&mdsc->cap_dirty_lock);
1452 list_del_init(&ci->i_dirty_item); 1451 list_del_init(&ci->i_dirty_item);
1453 1452
1454 ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
1455 if (list_empty(&ci->i_flushing_item)) { 1453 if (list_empty(&ci->i_flushing_item)) {
1454 ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
1456 list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing); 1455 list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
1457 mdsc->num_cap_flushing++; 1456 mdsc->num_cap_flushing++;
1458 dout(" inode %p now flushing seq %lld\n", inode, 1457 dout(" inode %p now flushing seq %lld\n", inode,
@@ -2073,17 +2072,16 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got)
2073 * requested from the MDS. 2072 * requested from the MDS.
2074 */ 2073 */
2075static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, 2074static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
2076 loff_t endoff, int *got, struct page **pinned_page, 2075 loff_t endoff, int *got, int *check_max, int *err)
2077 int *check_max, int *err)
2078{ 2076{
2079 struct inode *inode = &ci->vfs_inode; 2077 struct inode *inode = &ci->vfs_inode;
2080 int ret = 0; 2078 int ret = 0;
2081 int have, implemented, _got = 0; 2079 int have, implemented;
2082 int file_wanted; 2080 int file_wanted;
2083 2081
2084 dout("get_cap_refs %p need %s want %s\n", inode, 2082 dout("get_cap_refs %p need %s want %s\n", inode,
2085 ceph_cap_string(need), ceph_cap_string(want)); 2083 ceph_cap_string(need), ceph_cap_string(want));
2086again: 2084
2087 spin_lock(&ci->i_ceph_lock); 2085 spin_lock(&ci->i_ceph_lock);
2088 2086
2089 /* make sure file is actually open */ 2087 /* make sure file is actually open */
@@ -2138,50 +2136,34 @@ again:
2138 inode, ceph_cap_string(have), ceph_cap_string(not), 2136 inode, ceph_cap_string(have), ceph_cap_string(not),
2139 ceph_cap_string(revoking)); 2137 ceph_cap_string(revoking));
2140 if ((revoking & not) == 0) { 2138 if ((revoking & not) == 0) {
2141 _got = need | (have & want); 2139 *got = need | (have & want);
2142 __take_cap_refs(ci, _got); 2140 __take_cap_refs(ci, *got);
2143 ret = 1; 2141 ret = 1;
2144 } 2142 }
2145 } else { 2143 } else {
2144 int session_readonly = false;
2145 if ((need & CEPH_CAP_FILE_WR) && ci->i_auth_cap) {
2146 struct ceph_mds_session *s = ci->i_auth_cap->session;
2147 spin_lock(&s->s_cap_lock);
2148 session_readonly = s->s_readonly;
2149 spin_unlock(&s->s_cap_lock);
2150 }
2151 if (session_readonly) {
2152 dout("get_cap_refs %p needed %s but mds%d readonly\n",
2153 inode, ceph_cap_string(need), ci->i_auth_cap->mds);
2154 *err = -EROFS;
2155 ret = 1;
2156 goto out_unlock;
2157 }
2158
2146 dout("get_cap_refs %p have %s needed %s\n", inode, 2159 dout("get_cap_refs %p have %s needed %s\n", inode,
2147 ceph_cap_string(have), ceph_cap_string(need)); 2160 ceph_cap_string(have), ceph_cap_string(need));
2148 } 2161 }
2149out_unlock: 2162out_unlock:
2150 spin_unlock(&ci->i_ceph_lock); 2163 spin_unlock(&ci->i_ceph_lock);
2151 2164
2152 if (ci->i_inline_version != CEPH_INLINE_NONE &&
2153 (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
2154 i_size_read(inode) > 0) {
2155 int ret1;
2156 struct page *page = find_get_page(inode->i_mapping, 0);
2157 if (page) {
2158 if (PageUptodate(page)) {
2159 *pinned_page = page;
2160 goto out;
2161 }
2162 page_cache_release(page);
2163 }
2164 /*
2165 * drop cap refs first because getattr while holding
2166 * caps refs can cause deadlock.
2167 */
2168 ceph_put_cap_refs(ci, _got);
2169 _got = 0;
2170
2171 /* getattr request will bring inline data into page cache */
2172 ret1 = __ceph_do_getattr(inode, NULL,
2173 CEPH_STAT_CAP_INLINE_DATA, true);
2174 if (ret1 >= 0) {
2175 ret = 0;
2176 goto again;
2177 }
2178 *err = ret1;
2179 ret = 1;
2180 }
2181out:
2182 dout("get_cap_refs %p ret %d got %s\n", inode, 2165 dout("get_cap_refs %p ret %d got %s\n", inode,
2183 ret, ceph_cap_string(_got)); 2166 ret, ceph_cap_string(*got));
2184 *got = _got;
2185 return ret; 2167 return ret;
2186} 2168}
2187 2169
@@ -2221,22 +2203,52 @@ static void check_max_size(struct inode *inode, loff_t endoff)
2221int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, 2203int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
2222 loff_t endoff, int *got, struct page **pinned_page) 2204 loff_t endoff, int *got, struct page **pinned_page)
2223{ 2205{
2224 int check_max, ret, err; 2206 int _got, check_max, ret, err = 0;
2225 2207
2226retry: 2208retry:
2227 if (endoff > 0) 2209 if (endoff > 0)
2228 check_max_size(&ci->vfs_inode, endoff); 2210 check_max_size(&ci->vfs_inode, endoff);
2211 _got = 0;
2229 check_max = 0; 2212 check_max = 0;
2230 err = 0;
2231 ret = wait_event_interruptible(ci->i_cap_wq, 2213 ret = wait_event_interruptible(ci->i_cap_wq,
2232 try_get_cap_refs(ci, need, want, endoff, 2214 try_get_cap_refs(ci, need, want, endoff,
2233 got, pinned_page, 2215 &_got, &check_max, &err));
2234 &check_max, &err));
2235 if (err) 2216 if (err)
2236 ret = err; 2217 ret = err;
2218 if (ret < 0)
2219 return ret;
2220
2237 if (check_max) 2221 if (check_max)
2238 goto retry; 2222 goto retry;
2239 return ret; 2223
2224 if (ci->i_inline_version != CEPH_INLINE_NONE &&
2225 (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
2226 i_size_read(&ci->vfs_inode) > 0) {
2227 struct page *page = find_get_page(ci->vfs_inode.i_mapping, 0);
2228 if (page) {
2229 if (PageUptodate(page)) {
2230 *pinned_page = page;
2231 goto out;
2232 }
2233 page_cache_release(page);
2234 }
2235 /*
2236 * drop cap refs first because getattr while holding
2237 * caps refs can cause deadlock.
2238 */
2239 ceph_put_cap_refs(ci, _got);
2240 _got = 0;
2241
2242 /* getattr request will bring inline data into page cache */
2243 ret = __ceph_do_getattr(&ci->vfs_inode, NULL,
2244 CEPH_STAT_CAP_INLINE_DATA, true);
2245 if (ret < 0)
2246 return ret;
2247 goto retry;
2248 }
2249out:
2250 *got = _got;
2251 return 0;
2240} 2252}
2241 2253
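Relocating the inline-data check from try_get_cap_refs() to ceph_get_caps() moves the getattr out of the wait_event_interruptible() condition: the caps taken inside the wait are dropped again before __ceph_do_getattr() runs, since issuing a getattr while holding cap references can deadlock. The control flow, reduced to a runnable skeleton whose helper names only mimic the kernel's (all bodies are stubs):

#include <stdio.h>

/* stubbed stand-ins for the kernel helpers this path depends on */
static int have_inline = 1;			/* i_inline_version state */
static int try_get_refs(int *got) { *got = 1; return 1; }
static void put_refs(int got) { (void)got; }
static int inline_page_uptodate(void) { return !have_inline; }
static int do_getattr(void) { have_inline = 0; return 0; }	/* fills cache */

static int get_caps(int *got)
{
	int _got;
retry:
	_got = 0;
	if (!try_get_refs(&_got))	/* kernel: wait_event_interruptible() */
		return -1;

	if (have_inline && !inline_page_uptodate()) {
		/* drop cap refs first: getattr while holding them deadlocks */
		put_refs(_got);
		if (do_getattr() < 0)
			return -1;
		goto retry;		/* take the caps again from scratch */
	}
	*got = _got;
	return 0;
}

int main(void)
{
	int got;
	return get_caps(&got);
}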
2242/* 2254/*
@@ -2432,13 +2444,13 @@ static void invalidate_aliases(struct inode *inode)
2432 */ 2444 */
2433static void handle_cap_grant(struct ceph_mds_client *mdsc, 2445static void handle_cap_grant(struct ceph_mds_client *mdsc,
2434 struct inode *inode, struct ceph_mds_caps *grant, 2446 struct inode *inode, struct ceph_mds_caps *grant,
2435 void *snaptrace, int snaptrace_len,
2436 u64 inline_version, 2447 u64 inline_version,
2437 void *inline_data, int inline_len, 2448 void *inline_data, int inline_len,
2438 struct ceph_buffer *xattr_buf, 2449 struct ceph_buffer *xattr_buf,
2439 struct ceph_mds_session *session, 2450 struct ceph_mds_session *session,
2440 struct ceph_cap *cap, int issued) 2451 struct ceph_cap *cap, int issued)
2441 __releases(ci->i_ceph_lock) 2452 __releases(ci->i_ceph_lock)
2453 __releases(mdsc->snap_rwsem)
2442{ 2454{
2443 struct ceph_inode_info *ci = ceph_inode(inode); 2455 struct ceph_inode_info *ci = ceph_inode(inode);
2444 int mds = session->s_mds; 2456 int mds = session->s_mds;
@@ -2639,10 +2651,6 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
2639 spin_unlock(&ci->i_ceph_lock); 2651 spin_unlock(&ci->i_ceph_lock);
2640 2652
2641 if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) { 2653 if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) {
2642 down_write(&mdsc->snap_rwsem);
2643 ceph_update_snap_trace(mdsc, snaptrace,
2644 snaptrace + snaptrace_len, false);
2645 downgrade_write(&mdsc->snap_rwsem);
2646 kick_flushing_inode_caps(mdsc, session, inode); 2654 kick_flushing_inode_caps(mdsc, session, inode);
2647 up_read(&mdsc->snap_rwsem); 2655 up_read(&mdsc->snap_rwsem);
2648 if (newcaps & ~issued) 2656 if (newcaps & ~issued)
@@ -3052,6 +3060,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
3052 struct ceph_cap *cap; 3060 struct ceph_cap *cap;
3053 struct ceph_mds_caps *h; 3061 struct ceph_mds_caps *h;
3054 struct ceph_mds_cap_peer *peer = NULL; 3062 struct ceph_mds_cap_peer *peer = NULL;
3063 struct ceph_snap_realm *realm;
3055 int mds = session->s_mds; 3064 int mds = session->s_mds;
3056 int op, issued; 3065 int op, issued;
3057 u32 seq, mseq; 3066 u32 seq, mseq;
@@ -3153,11 +3162,23 @@ void ceph_handle_caps(struct ceph_mds_session *session,
3153 goto done_unlocked; 3162 goto done_unlocked;
3154 3163
3155 case CEPH_CAP_OP_IMPORT: 3164 case CEPH_CAP_OP_IMPORT:
3165 realm = NULL;
3166 if (snaptrace_len) {
3167 down_write(&mdsc->snap_rwsem);
3168 ceph_update_snap_trace(mdsc, snaptrace,
3169 snaptrace + snaptrace_len,
3170 false, &realm);
3171 downgrade_write(&mdsc->snap_rwsem);
3172 } else {
3173 down_read(&mdsc->snap_rwsem);
3174 }
3156 handle_cap_import(mdsc, inode, h, peer, session, 3175 handle_cap_import(mdsc, inode, h, peer, session,
3157 &cap, &issued); 3176 &cap, &issued);
3158 handle_cap_grant(mdsc, inode, h, snaptrace, snaptrace_len, 3177 handle_cap_grant(mdsc, inode, h,
3159 inline_version, inline_data, inline_len, 3178 inline_version, inline_data, inline_len,
3160 msg->middle, session, cap, issued); 3179 msg->middle, session, cap, issued);
3180 if (realm)
3181 ceph_put_snap_realm(mdsc, realm);
3161 goto done_unlocked; 3182 goto done_unlocked;
3162 } 3183 }
3163 3184
@@ -3177,7 +3198,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
3177 case CEPH_CAP_OP_GRANT: 3198 case CEPH_CAP_OP_GRANT:
3178 __ceph_caps_issued(ci, &issued); 3199 __ceph_caps_issued(ci, &issued);
3179 issued |= __ceph_caps_dirty(ci); 3200 issued |= __ceph_caps_dirty(ci);
3180 handle_cap_grant(mdsc, inode, h, NULL, 0, 3201 handle_cap_grant(mdsc, inode, h,
3181 inline_version, inline_data, inline_len, 3202 inline_version, inline_data, inline_len,
3182 msg->middle, session, cap, issued); 3203 msg->middle, session, cap, issued);
3183 goto done_unlocked; 3204 goto done_unlocked;
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index c241603764fd..0411dbb15815 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -26,8 +26,6 @@
26 * point by name. 26 * point by name.
27 */ 27 */
28 28
29const struct inode_operations ceph_dir_iops;
30const struct file_operations ceph_dir_fops;
31const struct dentry_operations ceph_dentry_ops; 29const struct dentry_operations ceph_dentry_ops;
32 30
33/* 31/*
@@ -672,13 +670,17 @@ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
672 /* 670 /*
673 * We created the item, then did a lookup, and found 671 * We created the item, then did a lookup, and found
674 * it was already linked to another inode we already 672 * it was already linked to another inode we already
 675 * had in our cache (and thus got spliced). Link our 673 * had in our cache (and thus got spliced). To avoid
 676 * dentry to that inode, but don't hash it, just in 674 * confusing the VFS (especially when the inode is a
 677 * case the VFS wants to dereference it. 675 * directory), we don't link our dentry to that
 676 * inode; we return an error instead.
 677 *
 678 * This event should be rare; it happens only when we
 679 * talk to an old MDS. A recent MDS does not send a
 680 * traceless reply for a request that creates a new inode.
678 */ 681 */
679 BUG_ON(!result->d_inode); 682 d_drop(result);
680 d_instantiate(dentry, result->d_inode); 683 return -ESTALE;
681 return 0;
682 } 684 }
683 return PTR_ERR(result); 685 return PTR_ERR(result);
684} 686}
@@ -1335,6 +1337,13 @@ const struct file_operations ceph_dir_fops = {
1335 .fsync = ceph_dir_fsync, 1337 .fsync = ceph_dir_fsync,
1336}; 1338};
1337 1339
1340const struct file_operations ceph_snapdir_fops = {
1341 .iterate = ceph_readdir,
1342 .llseek = ceph_dir_llseek,
1343 .open = ceph_open,
1344 .release = ceph_release,
1345};
1346
1338const struct inode_operations ceph_dir_iops = { 1347const struct inode_operations ceph_dir_iops = {
1339 .lookup = ceph_lookup, 1348 .lookup = ceph_lookup,
1340 .permission = ceph_permission, 1349 .permission = ceph_permission,
@@ -1357,6 +1366,14 @@ const struct inode_operations ceph_dir_iops = {
1357 .atomic_open = ceph_atomic_open, 1366 .atomic_open = ceph_atomic_open,
1358}; 1367};
1359 1368
1369const struct inode_operations ceph_snapdir_iops = {
1370 .lookup = ceph_lookup,
1371 .permission = ceph_permission,
1372 .getattr = ceph_getattr,
1373 .mkdir = ceph_mkdir,
1374 .rmdir = ceph_unlink,
1375};
1376
1360const struct dentry_operations ceph_dentry_ops = { 1377const struct dentry_operations ceph_dentry_ops = {
1361 .d_revalidate = ceph_d_revalidate, 1378 .d_revalidate = ceph_d_revalidate,
1362 .d_release = ceph_d_release, 1379 .d_release = ceph_d_release,
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index ce74b394b49d..a3d774b35149 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -275,10 +275,10 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
275 err = ceph_mdsc_do_request(mdsc, 275 err = ceph_mdsc_do_request(mdsc,
276 (flags & (O_CREAT|O_TRUNC)) ? dir : NULL, 276 (flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
277 req); 277 req);
278 err = ceph_handle_snapdir(req, dentry, err);
278 if (err) 279 if (err)
279 goto out_req; 280 goto out_req;
280 281
281 err = ceph_handle_snapdir(req, dentry, err);
282 if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry) 282 if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
283 err = ceph_handle_notrace_create(dir, dentry); 283 err = ceph_handle_notrace_create(dir, dentry);
284 284
@@ -392,13 +392,14 @@ more:
392 if (ret >= 0) { 392 if (ret >= 0) {
393 int didpages; 393 int didpages;
394 if (was_short && (pos + ret < inode->i_size)) { 394 if (was_short && (pos + ret < inode->i_size)) {
395 u64 tmp = min(this_len - ret, 395 int zlen = min(this_len - ret,
396 inode->i_size - pos - ret); 396 inode->i_size - pos - ret);
397 int zoff = (o_direct ? buf_align : io_align) +
398 read + ret;
397 dout(" zero gap %llu to %llu\n", 399 dout(" zero gap %llu to %llu\n",
398 pos + ret, pos + ret + tmp); 400 pos + ret, pos + ret + zlen);
399 ceph_zero_page_vector_range(page_align + read + ret, 401 ceph_zero_page_vector_range(zoff, zlen, pages);
400 tmp, pages); 402 ret += zlen;
401 ret += tmp;
402 } 403 }
403 404
404 didpages = (page_align + ret) >> PAGE_CACHE_SHIFT; 405 didpages = (page_align + ret) >> PAGE_CACHE_SHIFT;
@@ -878,28 +879,34 @@ again:
878 879
879 i_size = i_size_read(inode); 880 i_size = i_size_read(inode);
880 if (retry_op == READ_INLINE) { 881 if (retry_op == READ_INLINE) {
881 /* does not support inline data > PAGE_SIZE */ 882 BUG_ON(ret > 0 || read > 0);
882 if (i_size > PAGE_CACHE_SIZE) { 883 if (iocb->ki_pos < i_size &&
883 ret = -EIO; 884 iocb->ki_pos < PAGE_CACHE_SIZE) {
884 } else if (iocb->ki_pos < i_size) {
885 loff_t end = min_t(loff_t, i_size, 885 loff_t end = min_t(loff_t, i_size,
886 iocb->ki_pos + len); 886 iocb->ki_pos + len);
887 end = min_t(loff_t, end, PAGE_CACHE_SIZE);
887 if (statret < end) 888 if (statret < end)
888 zero_user_segment(page, statret, end); 889 zero_user_segment(page, statret, end);
889 ret = copy_page_to_iter(page, 890 ret = copy_page_to_iter(page,
890 iocb->ki_pos & ~PAGE_MASK, 891 iocb->ki_pos & ~PAGE_MASK,
891 end - iocb->ki_pos, to); 892 end - iocb->ki_pos, to);
892 iocb->ki_pos += ret; 893 iocb->ki_pos += ret;
893 } else { 894 read += ret;
894 ret = 0; 895 }
896 if (iocb->ki_pos < i_size && read < len) {
897 size_t zlen = min_t(size_t, len - read,
898 i_size - iocb->ki_pos);
899 ret = iov_iter_zero(zlen, to);
900 iocb->ki_pos += ret;
901 read += ret;
895 } 902 }
896 __free_pages(page, 0); 903 __free_pages(page, 0);
897 return ret; 904 return read;
898 } 905 }
899 906
900 /* hit EOF or hole? */ 907 /* hit EOF or hole? */
901 if (retry_op == CHECK_EOF && iocb->ki_pos < i_size && 908 if (retry_op == CHECK_EOF && iocb->ki_pos < i_size &&
902 ret < len) { 909 ret < len) {
903 dout("sync_read hit hole, ppos %lld < size %lld" 910 dout("sync_read hit hole, ppos %lld < size %lld"
904 ", reading more\n", iocb->ki_pos, 911 ", reading more\n", iocb->ki_pos,
905 inode->i_size); 912 inode->i_size);
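The READ_INLINE path above now supports inline files larger than a page: bytes inside the first page are copied from the cached inline page, the stretch between the page boundary and i_size is satisfied by iov_iter_zero(), and the function returns the accumulated read count rather than the last ret. The accounting on its own, as a runnable model (PAGE_SIZE is an assumed constant):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096

/*
 * Model of the READ_INLINE accounting: bytes inside the first page come
 * from the cached inline page; anything between the page boundary and
 * i_size reads as zeroes; nothing is returned past i_size.
 */
static size_t inline_read(uint64_t pos, size_t len, uint64_t i_size)
{
	size_t read = 0;

	if (pos < i_size && pos < PAGE_SIZE) {
		uint64_t end = pos + len;
		if (end > i_size)
			end = i_size;
		if (end > PAGE_SIZE)
			end = PAGE_SIZE;
		read += end - pos;		/* copy_page_to_iter() */
		pos = end;
	}
	if (pos < i_size && read < len) {
		size_t zlen = len - read;
		if (zlen > i_size - pos)
			zlen = i_size - pos;
		read += zlen;			/* iov_iter_zero() */
	}
	return read;
}

int main(void)
{
	/* 10 KiB inline file: an 8 KiB read at offset 0 gets 4 KiB of data
	 * from the cached page plus 4 KiB of zeroes */
	printf("%zu\n", inline_read(0, 8192, 10240));
	return 0;
}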
@@ -945,7 +952,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
945 mutex_lock(&inode->i_mutex); 952 mutex_lock(&inode->i_mutex);
946 953
947 /* We can write back this queue in page reclaim */ 954 /* We can write back this queue in page reclaim */
948 current->backing_dev_info = file->f_mapping->backing_dev_info; 955 current->backing_dev_info = inode_to_bdi(inode);
949 956
950 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); 957 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
951 if (err) 958 if (err)
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index f61a74115beb..119c43c80638 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -82,8 +82,8 @@ struct inode *ceph_get_snapdir(struct inode *parent)
82 inode->i_mode = parent->i_mode; 82 inode->i_mode = parent->i_mode;
83 inode->i_uid = parent->i_uid; 83 inode->i_uid = parent->i_uid;
84 inode->i_gid = parent->i_gid; 84 inode->i_gid = parent->i_gid;
85 inode->i_op = &ceph_dir_iops; 85 inode->i_op = &ceph_snapdir_iops;
86 inode->i_fop = &ceph_dir_fops; 86 inode->i_fop = &ceph_snapdir_fops;
87 ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */ 87 ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
88 ci->i_rbytes = 0; 88 ci->i_rbytes = 0;
89 return inode; 89 return inode;
@@ -783,8 +783,6 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
783 } 783 }
784 784
785 inode->i_mapping->a_ops = &ceph_aops; 785 inode->i_mapping->a_ops = &ceph_aops;
786 inode->i_mapping->backing_dev_info =
787 &ceph_sb_to_client(inode->i_sb)->backing_dev_info;
788 786
789 switch (inode->i_mode & S_IFMT) { 787 switch (inode->i_mode & S_IFMT) {
790 case S_IFIFO: 788 case S_IFIFO:
@@ -840,30 +838,31 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
840 ceph_vinop(inode), inode->i_mode); 838 ceph_vinop(inode), inode->i_mode);
841 } 839 }
842 840
843 /* set dir completion flag? */
844 if (S_ISDIR(inode->i_mode) &&
845 ci->i_files == 0 && ci->i_subdirs == 0 &&
846 ceph_snap(inode) == CEPH_NOSNAP &&
847 (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) &&
848 (issued & CEPH_CAP_FILE_EXCL) == 0 &&
849 !__ceph_dir_is_complete(ci)) {
850 dout(" marking %p complete (empty)\n", inode);
851 __ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count),
852 ci->i_ordered_count);
853 }
854
855 /* were we issued a capability? */ 841 /* were we issued a capability? */
856 if (info->cap.caps) { 842 if (info->cap.caps) {
857 if (ceph_snap(inode) == CEPH_NOSNAP) { 843 if (ceph_snap(inode) == CEPH_NOSNAP) {
844 unsigned caps = le32_to_cpu(info->cap.caps);
858 ceph_add_cap(inode, session, 845 ceph_add_cap(inode, session,
859 le64_to_cpu(info->cap.cap_id), 846 le64_to_cpu(info->cap.cap_id),
860 cap_fmode, 847 cap_fmode, caps,
861 le32_to_cpu(info->cap.caps),
862 le32_to_cpu(info->cap.wanted), 848 le32_to_cpu(info->cap.wanted),
863 le32_to_cpu(info->cap.seq), 849 le32_to_cpu(info->cap.seq),
864 le32_to_cpu(info->cap.mseq), 850 le32_to_cpu(info->cap.mseq),
865 le64_to_cpu(info->cap.realm), 851 le64_to_cpu(info->cap.realm),
866 info->cap.flags, &new_cap); 852 info->cap.flags, &new_cap);
853
854 /* set dir completion flag? */
855 if (S_ISDIR(inode->i_mode) &&
856 ci->i_files == 0 && ci->i_subdirs == 0 &&
857 (caps & CEPH_CAP_FILE_SHARED) &&
858 (issued & CEPH_CAP_FILE_EXCL) == 0 &&
859 !__ceph_dir_is_complete(ci)) {
860 dout(" marking %p complete (empty)\n", inode);
861 __ceph_dir_set_complete(ci,
862 atomic_read(&ci->i_release_count),
863 ci->i_ordered_count);
864 }
865
867 wake = true; 866 wake = true;
868 } else { 867 } else {
869 dout(" %p got snap_caps %s\n", inode, 868 dout(" %p got snap_caps %s\n", inode,
@@ -1448,12 +1447,14 @@ retry_lookup:
1448 } 1447 }
1449 1448
1450 if (!dn->d_inode) { 1449 if (!dn->d_inode) {
1451 dn = splice_dentry(dn, in, NULL); 1450 struct dentry *realdn = splice_dentry(dn, in, NULL);
1452 if (IS_ERR(dn)) { 1451 if (IS_ERR(realdn)) {
1453 err = PTR_ERR(dn); 1452 err = PTR_ERR(realdn);
1453 d_drop(dn);
1454 dn = NULL; 1454 dn = NULL;
1455 goto next_item; 1455 goto next_item;
1456 } 1456 }
1457 dn = realdn;
1457 } 1458 }
1458 1459
1459 di = dn->d_fsdata; 1460 di = dn->d_fsdata;
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index c35c5c614e38..4347039ecc18 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -239,23 +239,26 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
239 return err; 239 return err;
240} 240}
241 241
242/** 242/*
243 * Must be called with lock_flocks() already held. Fills in the passed 243 * Fills in the passed counter variables, so you can prepare pagelist metadata
244 * counter variables, so you can prepare pagelist metadata before calling 244 * before calling ceph_encode_locks.
245 * ceph_encode_locks.
246 */ 245 */
247void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count) 246void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
248{ 247{
249 struct file_lock *lock; 248 struct file_lock *lock;
249 struct file_lock_context *ctx;
250 250
251 *fcntl_count = 0; 251 *fcntl_count = 0;
252 *flock_count = 0; 252 *flock_count = 0;
253 253
254 for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { 254 ctx = inode->i_flctx;
255 if (lock->fl_flags & FL_POSIX) 255 if (ctx) {
256 spin_lock(&ctx->flc_lock);
257 list_for_each_entry(lock, &ctx->flc_posix, fl_list)
256 ++(*fcntl_count); 258 ++(*fcntl_count);
257 else if (lock->fl_flags & FL_FLOCK) 259 list_for_each_entry(lock, &ctx->flc_flock, fl_list)
258 ++(*flock_count); 260 ++(*flock_count);
261 spin_unlock(&ctx->flc_lock);
259 } 262 }
260 dout("counted %d flock locks and %d fcntl locks", 263 dout("counted %d flock locks and %d fcntl locks",
261 *flock_count, *fcntl_count); 264 *flock_count, *fcntl_count);
@@ -271,6 +274,7 @@ int ceph_encode_locks_to_buffer(struct inode *inode,
271 int num_fcntl_locks, int num_flock_locks) 274 int num_fcntl_locks, int num_flock_locks)
272{ 275{
273 struct file_lock *lock; 276 struct file_lock *lock;
277 struct file_lock_context *ctx = inode->i_flctx;
274 int err = 0; 278 int err = 0;
275 int seen_fcntl = 0; 279 int seen_fcntl = 0;
276 int seen_flock = 0; 280 int seen_flock = 0;
@@ -279,33 +283,34 @@ int ceph_encode_locks_to_buffer(struct inode *inode,
279 dout("encoding %d flock and %d fcntl locks", num_flock_locks, 283 dout("encoding %d flock and %d fcntl locks", num_flock_locks,
280 num_fcntl_locks); 284 num_fcntl_locks);
281 285
282 for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { 286 if (!ctx)
283 if (lock->fl_flags & FL_POSIX) { 287 return 0;
284 ++seen_fcntl; 288
285 if (seen_fcntl > num_fcntl_locks) { 289 spin_lock(&ctx->flc_lock);
 286 err = -ENOSPC; 290 list_for_each_entry(lock, &ctx->flc_posix, fl_list) {
287 goto fail; 291 ++seen_fcntl;
288 } 292 if (seen_fcntl > num_fcntl_locks) {
289 err = lock_to_ceph_filelock(lock, &flocks[l]); 293 err = -ENOSPC;
290 if (err) 294 goto fail;
291 goto fail;
292 ++l;
293 } 295 }
296 err = lock_to_ceph_filelock(lock, &flocks[l]);
297 if (err)
298 goto fail;
299 ++l;
294 } 300 }
295 for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { 301 list_for_each_entry(lock, &ctx->flc_flock, fl_list) {
296 if (lock->fl_flags & FL_FLOCK) { 302 ++seen_flock;
297 ++seen_flock; 303 if (seen_flock > num_flock_locks) {
298 if (seen_flock > num_flock_locks) { 304 err = -ENOSPC;
299 err = -ENOSPC; 305 goto fail;
300 goto fail;
301 }
302 err = lock_to_ceph_filelock(lock, &flocks[l]);
303 if (err)
304 goto fail;
305 ++l;
306 } 306 }
307 err = lock_to_ceph_filelock(lock, &flocks[l]);
308 if (err)
309 goto fail;
310 ++l;
307 } 311 }
308fail: 312fail:
313 spin_unlock(&ctx->flc_lock);
309 return err; 314 return err;
310} 315}
311 316
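With lock state moved off inode->i_flock into a per-inode file_lock_context, both helpers walk two dedicated lists — flc_posix for fcntl locks, flc_flock for flock locks — under ctx->flc_lock instead of filtering one mixed list by fl_flags. A userspace model of the counting half, using plain singly linked lists rather than the kernel's list_head:

#include <stddef.h>

struct file_lock { struct file_lock *next; };

struct file_lock_context {
	/* one dedicated list per lock type, replacing the mixed i_flock list */
	struct file_lock *flc_posix;	/* fcntl()-style POSIX locks */
	struct file_lock *flc_flock;	/* flock()-style locks */
};

static void count_locks(const struct file_lock_context *ctx,
			int *fcntl_count, int *flock_count)
{
	const struct file_lock *l;

	*fcntl_count = 0;
	*flock_count = 0;
	if (!ctx)		/* no context means no locks were ever taken */
		return;
	/* the kernel takes ctx->flc_lock around both walks */
	for (l = ctx->flc_posix; l; l = l->next)
		++(*fcntl_count);
	for (l = ctx->flc_flock; l; l = l->next)
		++(*flock_count);
}

The encode side has to walk the same two lists in the same order, so the counts prepared here line up with the buffer slots written there.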
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index d2171f4a6980..71c073f38e54 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -480,6 +480,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
480 mdsc->max_sessions = newmax; 480 mdsc->max_sessions = newmax;
481 } 481 }
482 mdsc->sessions[mds] = s; 482 mdsc->sessions[mds] = s;
483 atomic_inc(&mdsc->num_sessions);
483 atomic_inc(&s->s_ref); /* one ref to sessions[], one to caller */ 484 atomic_inc(&s->s_ref); /* one ref to sessions[], one to caller */
484 485
485 ceph_con_open(&s->s_con, CEPH_ENTITY_TYPE_MDS, mds, 486 ceph_con_open(&s->s_con, CEPH_ENTITY_TYPE_MDS, mds,
@@ -503,6 +504,7 @@ static void __unregister_session(struct ceph_mds_client *mdsc,
503 mdsc->sessions[s->s_mds] = NULL; 504 mdsc->sessions[s->s_mds] = NULL;
504 ceph_con_close(&s->s_con); 505 ceph_con_close(&s->s_con);
505 ceph_put_mds_session(s); 506 ceph_put_mds_session(s);
507 atomic_dec(&mdsc->num_sessions);
506} 508}
507 509
508/* 510/*
@@ -842,8 +844,9 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
842 struct ceph_options *opt = mdsc->fsc->client->options; 844 struct ceph_options *opt = mdsc->fsc->client->options;
843 void *p; 845 void *p;
844 846
845 const char* metadata[3][2] = { 847 const char* metadata[][2] = {
846 {"hostname", utsname()->nodename}, 848 {"hostname", utsname()->nodename},
849 {"kernel_version", utsname()->release},
847 {"entity_id", opt->name ? opt->name : ""}, 850 {"entity_id", opt->name ? opt->name : ""},
848 {NULL, NULL} 851 {NULL, NULL}
849 }; 852 };
@@ -1464,19 +1467,33 @@ out_unlocked:
1464 return err; 1467 return err;
1465} 1468}
1466 1469
1470static int check_cap_flush(struct inode *inode, u64 want_flush_seq)
1471{
1472 struct ceph_inode_info *ci = ceph_inode(inode);
1473 int ret;
1474 spin_lock(&ci->i_ceph_lock);
1475 if (ci->i_flushing_caps)
1476 ret = ci->i_cap_flush_seq >= want_flush_seq;
1477 else
1478 ret = 1;
1479 spin_unlock(&ci->i_ceph_lock);
1480 return ret;
1481}
1482
1467/* 1483/*
1468 * flush all dirty inode data to disk. 1484 * flush all dirty inode data to disk.
1469 * 1485 *
1470 * returns true if we've flushed through want_flush_seq 1486 * returns true if we've flushed through want_flush_seq
1471 */ 1487 */
1472static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq) 1488static void wait_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
1473{ 1489{
1474 int mds, ret = 1; 1490 int mds;
1475 1491
1476 dout("check_cap_flush want %lld\n", want_flush_seq); 1492 dout("check_cap_flush want %lld\n", want_flush_seq);
1477 mutex_lock(&mdsc->mutex); 1493 mutex_lock(&mdsc->mutex);
1478 for (mds = 0; ret && mds < mdsc->max_sessions; mds++) { 1494 for (mds = 0; mds < mdsc->max_sessions; mds++) {
1479 struct ceph_mds_session *session = mdsc->sessions[mds]; 1495 struct ceph_mds_session *session = mdsc->sessions[mds];
1496 struct inode *inode = NULL;
1480 1497
1481 if (!session) 1498 if (!session)
1482 continue; 1499 continue;
@@ -1489,29 +1506,29 @@ static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
1489 list_entry(session->s_cap_flushing.next, 1506 list_entry(session->s_cap_flushing.next,
1490 struct ceph_inode_info, 1507 struct ceph_inode_info,
1491 i_flushing_item); 1508 i_flushing_item);
1492 struct inode *inode = &ci->vfs_inode;
1493 1509
1494 spin_lock(&ci->i_ceph_lock); 1510 if (!check_cap_flush(&ci->vfs_inode, want_flush_seq)) {
1495 if (ci->i_cap_flush_seq <= want_flush_seq) {
1496 dout("check_cap_flush still flushing %p " 1511 dout("check_cap_flush still flushing %p "
1497 "seq %lld <= %lld to mds%d\n", inode, 1512 "seq %lld <= %lld to mds%d\n",
1498 ci->i_cap_flush_seq, want_flush_seq, 1513 &ci->vfs_inode, ci->i_cap_flush_seq,
1499 session->s_mds); 1514 want_flush_seq, session->s_mds);
1500 ret = 0; 1515 inode = igrab(&ci->vfs_inode);
1501 } 1516 }
1502 spin_unlock(&ci->i_ceph_lock);
1503 } 1517 }
1504 mutex_unlock(&session->s_mutex); 1518 mutex_unlock(&session->s_mutex);
1505 ceph_put_mds_session(session); 1519 ceph_put_mds_session(session);
1506 1520
1507 if (!ret) 1521 if (inode) {
1508 return ret; 1522 wait_event(mdsc->cap_flushing_wq,
1523 check_cap_flush(inode, want_flush_seq));
1524 iput(inode);
1525 }
1526
1509 mutex_lock(&mdsc->mutex); 1527 mutex_lock(&mdsc->mutex);
1510 } 1528 }
1511 1529
1512 mutex_unlock(&mdsc->mutex); 1530 mutex_unlock(&mdsc->mutex);
1513 dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq); 1531 dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq);
1514 return ret;
1515} 1532}
1516 1533
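The reworked loop no longer bails out of the scan when an inode is still flushing: it pins the inode with igrab() while the session mutex is held, drops both mutexes, sleeps on cap_flushing_wq until the per-inode predicate holds, then iput()s and resumes. A reduced pthread model of that wait-outside-the-locks shape (all names here are illustrative, not the kernel's):

#include <pthread.h>

static pthread_mutex_t mdsc_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t ci_lock    = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  flush_wq   = PTHREAD_COND_INITIALIZER;
static long cap_flush_seq;		/* advanced by the flusher side */

static void wait_caps_flush(long want)
{
	pthread_mutex_lock(&mdsc_mutex);
	/* ... scan sessions; pin (igrab) the first still-flushing inode ... */
	pthread_mutex_unlock(&mdsc_mutex);	/* never sleep holding this */

	pthread_mutex_lock(&ci_lock);
	while (cap_flush_seq < want)		/* wait_event() equivalent */
		pthread_cond_wait(&flush_wq, &ci_lock);
	pthread_mutex_unlock(&ci_lock);
	/* ... iput() the inode, retake mdsc_mutex, resume the scan ... */
}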
1517/* 1534/*
@@ -1923,7 +1940,11 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
1923 head->num_releases = cpu_to_le16(releases); 1940 head->num_releases = cpu_to_le16(releases);
1924 1941
1925 /* time stamp */ 1942 /* time stamp */
1926 ceph_encode_copy(&p, &req->r_stamp, sizeof(req->r_stamp)); 1943 {
1944 struct ceph_timespec ts;
1945 ceph_encode_timespec(&ts, &req->r_stamp);
1946 ceph_encode_copy(&p, &ts, sizeof(ts));
1947 }
1927 1948
1928 BUG_ON(p > end); 1949 BUG_ON(p > end);
1929 msg->front.iov_len = p - msg->front.iov_base; 1950 msg->front.iov_len = p - msg->front.iov_base;
@@ -2012,7 +2033,11 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
2012 2033
2013 /* time stamp */ 2034 /* time stamp */
2014 p = msg->front.iov_base + req->r_request_release_offset; 2035 p = msg->front.iov_base + req->r_request_release_offset;
2015 ceph_encode_copy(&p, &req->r_stamp, sizeof(req->r_stamp)); 2036 {
2037 struct ceph_timespec ts;
2038 ceph_encode_timespec(&ts, &req->r_stamp);
2039 ceph_encode_copy(&p, &ts, sizeof(ts));
2040 }
2016 2041
2017 msg->front.iov_len = p - msg->front.iov_base; 2042 msg->front.iov_len = p - msg->front.iov_base;
2018 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); 2043 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
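Both hunks stop memcpy()ing a raw struct timespec into the message: its fields are 64-bit on 64-bit kernels, while the wire expects the fixed-size little-endian struct ceph_timespec, so the value is converted first. A self-contained illustration — the two-field packed layout mirrors the real wire format, but the le32 helper is open-coded here:

#include <stdint.h>
#include <string.h>
#include <time.h>

/* wire format: two little-endian 32-bit fields, as struct ceph_timespec */
struct ceph_timespec {
	uint32_t tv_sec;
	uint32_t tv_nsec;
} __attribute__((packed));

static uint32_t cpu_to_le32(uint32_t v)
{
	/* store the little-endian byte order regardless of host order */
	const uint8_t b[4] = { v, v >> 8, v >> 16, v >> 24 };
	uint32_t out;
	memcpy(&out, b, 4);
	return out;
}

static void encode_timespec(uint8_t **p, const struct timespec *ts)
{
	struct ceph_timespec cts = {
		.tv_sec  = cpu_to_le32((uint32_t)ts->tv_sec),
		.tv_nsec = cpu_to_le32((uint32_t)ts->tv_nsec),
	};
	/* copying the converted struct, never the native timespec,
	 * keeps the message layout identical on all architectures */
	memcpy(*p, &cts, sizeof(cts));
	*p += sizeof(cts);
}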
@@ -2159,6 +2184,8 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds)
2159 p = rb_next(p); 2184 p = rb_next(p);
2160 if (req->r_got_unsafe) 2185 if (req->r_got_unsafe)
2161 continue; 2186 continue;
2187 if (req->r_attempts > 0)
2188 continue; /* only new requests */
2162 if (req->r_session && 2189 if (req->r_session &&
2163 req->r_session->s_mds == mds) { 2190 req->r_session->s_mds == mds) {
2164 dout(" kicking tid %llu\n", req->r_tid); 2191 dout(" kicking tid %llu\n", req->r_tid);
@@ -2286,6 +2313,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
2286 struct ceph_mds_request *req; 2313 struct ceph_mds_request *req;
2287 struct ceph_mds_reply_head *head = msg->front.iov_base; 2314 struct ceph_mds_reply_head *head = msg->front.iov_base;
2288 struct ceph_mds_reply_info_parsed *rinfo; /* parsed reply info */ 2315 struct ceph_mds_reply_info_parsed *rinfo; /* parsed reply info */
2316 struct ceph_snap_realm *realm;
2289 u64 tid; 2317 u64 tid;
2290 int err, result; 2318 int err, result;
2291 int mds = session->s_mds; 2319 int mds = session->s_mds;
@@ -2401,11 +2429,13 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
2401 } 2429 }
2402 2430
2403 /* snap trace */ 2431 /* snap trace */
2432 realm = NULL;
2404 if (rinfo->snapblob_len) { 2433 if (rinfo->snapblob_len) {
2405 down_write(&mdsc->snap_rwsem); 2434 down_write(&mdsc->snap_rwsem);
2406 ceph_update_snap_trace(mdsc, rinfo->snapblob, 2435 ceph_update_snap_trace(mdsc, rinfo->snapblob,
2407 rinfo->snapblob + rinfo->snapblob_len, 2436 rinfo->snapblob + rinfo->snapblob_len,
2408 le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP); 2437 le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP,
2438 &realm);
2409 downgrade_write(&mdsc->snap_rwsem); 2439 downgrade_write(&mdsc->snap_rwsem);
2410 } else { 2440 } else {
2411 down_read(&mdsc->snap_rwsem); 2441 down_read(&mdsc->snap_rwsem);
@@ -2423,6 +2453,8 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
2423 mutex_unlock(&req->r_fill_mutex); 2453 mutex_unlock(&req->r_fill_mutex);
2424 2454
2425 up_read(&mdsc->snap_rwsem); 2455 up_read(&mdsc->snap_rwsem);
2456 if (realm)
2457 ceph_put_snap_realm(mdsc, realm);
2426out_err: 2458out_err:
2427 mutex_lock(&mdsc->mutex); 2459 mutex_lock(&mdsc->mutex);
2428 if (!req->r_aborted) { 2460 if (!req->r_aborted) {
@@ -2487,6 +2519,7 @@ static void handle_forward(struct ceph_mds_client *mdsc,
2487 dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds); 2519 dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds);
2488 BUG_ON(req->r_err); 2520 BUG_ON(req->r_err);
2489 BUG_ON(req->r_got_result); 2521 BUG_ON(req->r_got_result);
2522 req->r_attempts = 0;
2490 req->r_num_fwd = fwd_seq; 2523 req->r_num_fwd = fwd_seq;
2491 req->r_resend_mds = next_mds; 2524 req->r_resend_mds = next_mds;
2492 put_request_session(req); 2525 put_request_session(req);
@@ -2580,6 +2613,14 @@ static void handle_session(struct ceph_mds_session *session,
2580 send_flushmsg_ack(mdsc, session, seq); 2613 send_flushmsg_ack(mdsc, session, seq);
2581 break; 2614 break;
2582 2615
2616 case CEPH_SESSION_FORCE_RO:
2617 dout("force_session_readonly %p\n", session);
2618 spin_lock(&session->s_cap_lock);
2619 session->s_readonly = true;
2620 spin_unlock(&session->s_cap_lock);
2621 wake_up_session_caps(session, 0);
2622 break;
2623
2583 default: 2624 default:
2584 pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds); 2625 pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds);
2585 WARN_ON(1); 2626 WARN_ON(1);
@@ -2610,6 +2651,7 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
2610 struct ceph_mds_session *session) 2651 struct ceph_mds_session *session)
2611{ 2652{
2612 struct ceph_mds_request *req, *nreq; 2653 struct ceph_mds_request *req, *nreq;
2654 struct rb_node *p;
2613 int err; 2655 int err;
2614 2656
2615 dout("replay_unsafe_requests mds%d\n", session->s_mds); 2657 dout("replay_unsafe_requests mds%d\n", session->s_mds);
@@ -2622,6 +2664,28 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
2622 ceph_con_send(&session->s_con, req->r_request); 2664 ceph_con_send(&session->s_con, req->r_request);
2623 } 2665 }
2624 } 2666 }
2667
2668 /*
 2669 * Also re-send old requests when the MDS enters the reconnect stage, so
 2670 * that the MDS can process completed requests in the clientreplay stage.
2671 */
2672 p = rb_first(&mdsc->request_tree);
2673 while (p) {
2674 req = rb_entry(p, struct ceph_mds_request, r_node);
2675 p = rb_next(p);
2676 if (req->r_got_unsafe)
2677 continue;
2678 if (req->r_attempts == 0)
2679 continue; /* only old requests */
2680 if (req->r_session &&
2681 req->r_session->s_mds == session->s_mds) {
2682 err = __prepare_send_request(mdsc, req, session->s_mds);
2683 if (!err) {
2684 ceph_msg_get(req->r_request);
2685 ceph_con_send(&session->s_con, req->r_request);
2686 }
2687 }
2688 }
2625 mutex_unlock(&mdsc->mutex); 2689 mutex_unlock(&mdsc->mutex);
2626} 2690}
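
The new tail of replay_unsafe_requests() walks the whole request tree and re-sends anything that had already been attempted. A minimal sketch of the traversal idiom, with hypothetical names (struct item, resend_old_items); note the next pointer is fetched before the current node is acted on:

#include <linux/rbtree.h>

struct item {
	struct rb_node node;
	int attempts;
};

static void resend_old_items(struct rb_root *root,
			     void (*resend)(struct item *))
{
	struct rb_node *p = rb_first(root);

	while (p) {
		struct item *it = rb_entry(p, struct item, node);

		p = rb_next(p);			/* advance before acting on 'it' */
		if (it->attempts == 0)
			continue;		/* skip requests never sent yet */
		resend(it);
	}
}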
2627 2691
@@ -2700,20 +2764,16 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
2700 struct ceph_filelock *flocks; 2764 struct ceph_filelock *flocks;
2701 2765
2702encode_again: 2766encode_again:
2703 spin_lock(&inode->i_lock);
2704 ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks); 2767 ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
2705 spin_unlock(&inode->i_lock);
2706 flocks = kmalloc((num_fcntl_locks+num_flock_locks) * 2768 flocks = kmalloc((num_fcntl_locks+num_flock_locks) *
2707 sizeof(struct ceph_filelock), GFP_NOFS); 2769 sizeof(struct ceph_filelock), GFP_NOFS);
2708 if (!flocks) { 2770 if (!flocks) {
2709 err = -ENOMEM; 2771 err = -ENOMEM;
2710 goto out_free; 2772 goto out_free;
2711 } 2773 }
2712 spin_lock(&inode->i_lock);
2713 err = ceph_encode_locks_to_buffer(inode, flocks, 2774 err = ceph_encode_locks_to_buffer(inode, flocks,
2714 num_fcntl_locks, 2775 num_fcntl_locks,
2715 num_flock_locks); 2776 num_flock_locks);
2716 spin_unlock(&inode->i_lock);
2717 if (err) { 2777 if (err) {
2718 kfree(flocks); 2778 kfree(flocks);
2719 if (err == -ENOSPC) 2779 if (err == -ENOSPC)
@@ -2791,6 +2851,8 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
2791 spin_unlock(&session->s_gen_ttl_lock); 2851 spin_unlock(&session->s_gen_ttl_lock);
2792 2852
2793 spin_lock(&session->s_cap_lock); 2853 spin_lock(&session->s_cap_lock);
2854 /* don't know if session is readonly */
2855 session->s_readonly = 0;
2794 /* 2856 /*
2795 * notify __ceph_remove_cap() that we are composing cap reconnect. 2857 * notify __ceph_remove_cap() that we are composing cap reconnect.
 2796	 * If a cap gets released before being added to the cap reconnect, 2858	 * If a cap gets released before being added to the cap reconnect,
@@ -2937,9 +2999,6 @@ static void check_new_map(struct ceph_mds_client *mdsc,
2937 mutex_unlock(&s->s_mutex); 2999 mutex_unlock(&s->s_mutex);
2938 s->s_state = CEPH_MDS_SESSION_RESTARTING; 3000 s->s_state = CEPH_MDS_SESSION_RESTARTING;
2939 } 3001 }
2940
2941 /* kick any requests waiting on the recovering mds */
2942 kick_requests(mdsc, i);
2943 } else if (oldstate == newstate) { 3002 } else if (oldstate == newstate) {
2944 continue; /* nothing new with this mds */ 3003 continue; /* nothing new with this mds */
2945 } 3004 }
@@ -3299,6 +3358,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
3299 init_waitqueue_head(&mdsc->session_close_wq); 3358 init_waitqueue_head(&mdsc->session_close_wq);
3300 INIT_LIST_HEAD(&mdsc->waiting_for_map); 3359 INIT_LIST_HEAD(&mdsc->waiting_for_map);
3301 mdsc->sessions = NULL; 3360 mdsc->sessions = NULL;
3361 atomic_set(&mdsc->num_sessions, 0);
3302 mdsc->max_sessions = 0; 3362 mdsc->max_sessions = 0;
3303 mdsc->stopping = 0; 3363 mdsc->stopping = 0;
3304 init_rwsem(&mdsc->snap_rwsem); 3364 init_rwsem(&mdsc->snap_rwsem);
@@ -3432,14 +3492,17 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
3432 dout("sync\n"); 3492 dout("sync\n");
3433 mutex_lock(&mdsc->mutex); 3493 mutex_lock(&mdsc->mutex);
3434 want_tid = mdsc->last_tid; 3494 want_tid = mdsc->last_tid;
3435 want_flush = mdsc->cap_flush_seq;
3436 mutex_unlock(&mdsc->mutex); 3495 mutex_unlock(&mdsc->mutex);
3437 dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
3438 3496
3439 ceph_flush_dirty_caps(mdsc); 3497 ceph_flush_dirty_caps(mdsc);
3498 spin_lock(&mdsc->cap_dirty_lock);
3499 want_flush = mdsc->cap_flush_seq;
3500 spin_unlock(&mdsc->cap_dirty_lock);
3501
3502 dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
3440 3503
3441 wait_unsafe_requests(mdsc, want_tid); 3504 wait_unsafe_requests(mdsc, want_tid);
3442 wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush)); 3505 wait_caps_flush(mdsc, want_flush);
3443} 3506}
3444 3507
3445/* 3508/*
@@ -3447,17 +3510,9 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
3447 */ 3510 */
3448static bool done_closing_sessions(struct ceph_mds_client *mdsc) 3511static bool done_closing_sessions(struct ceph_mds_client *mdsc)
3449{ 3512{
3450 int i, n = 0;
3451
3452 if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN) 3513 if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
3453 return true; 3514 return true;
3454 3515 return atomic_read(&mdsc->num_sessions) == 0;
3455 mutex_lock(&mdsc->mutex);
3456 for (i = 0; i < mdsc->max_sessions; i++)
3457 if (mdsc->sessions[i])
3458 n++;
3459 mutex_unlock(&mdsc->mutex);
3460 return n == 0;
3461} 3516}
3462 3517
3463/* 3518/*
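
done_closing_sessions() previously took mdsc->mutex and scanned the sessions[] array; the new num_sessions atomic makes the check lock-free. A sketch of the general pattern, assuming the counter is bumped wherever sessions are registered and unregistered (those hunks are not shown in this section):

#include <linux/atomic.h>
#include <linux/types.h>

static atomic_t num_sessions = ATOMIC_INIT(0);

static void session_registered(void)
{
	atomic_inc(&num_sessions);	/* on session creation */
}

static void session_unregistered(void)
{
	atomic_dec(&num_sessions);	/* on session teardown */
}

static bool all_sessions_closed(void)
{
	/* no mutex, no array scan: one atomic load */
	return atomic_read(&num_sessions) == 0;
}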
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index e2817d00f7d9..1875b5d985c6 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -137,6 +137,7 @@ struct ceph_mds_session {
137 int s_nr_caps, s_trim_caps; 137 int s_nr_caps, s_trim_caps;
138 int s_num_cap_releases; 138 int s_num_cap_releases;
139 int s_cap_reconnect; 139 int s_cap_reconnect;
140 int s_readonly;
140 struct list_head s_cap_releases; /* waiting cap_release messages */ 141 struct list_head s_cap_releases; /* waiting cap_release messages */
141 struct list_head s_cap_releases_done; /* ready to send */ 142 struct list_head s_cap_releases_done; /* ready to send */
142 struct ceph_cap *s_cap_iterator; 143 struct ceph_cap *s_cap_iterator;
@@ -272,6 +273,7 @@ struct ceph_mds_client {
272 struct list_head waiting_for_map; 273 struct list_head waiting_for_map;
273 274
274 struct ceph_mds_session **sessions; /* NULL for mds if no session */ 275 struct ceph_mds_session **sessions; /* NULL for mds if no session */
276 atomic_t num_sessions;
275 int max_sessions; /* len of s_mds_sessions */ 277 int max_sessions; /* len of s_mds_sessions */
276 int stopping; /* true if shutting down */ 278 int stopping; /* true if shutting down */
277 279
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index ce35fbd4ba5d..a97e39f09ba6 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -70,13 +70,11 @@ void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
70 * safe. we do need to protect against concurrent empty list 70 * safe. we do need to protect against concurrent empty list
71 * additions, however. 71 * additions, however.
72 */ 72 */
73 if (atomic_read(&realm->nref) == 0) { 73 if (atomic_inc_return(&realm->nref) == 1) {
74 spin_lock(&mdsc->snap_empty_lock); 74 spin_lock(&mdsc->snap_empty_lock);
75 list_del_init(&realm->empty_item); 75 list_del_init(&realm->empty_item);
76 spin_unlock(&mdsc->snap_empty_lock); 76 spin_unlock(&mdsc->snap_empty_lock);
77 } 77 }
78
79 atomic_inc(&realm->nref);
80} 78}
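
The old ceph_get_snap_realm() read nref and incremented it in two separate steps, so two racing callers could both observe zero before either incremented. atomic_inc_return() folds the increment and the test into one atomic operation, so exactly one caller sees the 0 -> 1 transition. A self-contained sketch of the fixed pattern, with hypothetical obj/empty-list names:

#include <linux/atomic.h>
#include <linux/list.h>
#include <linux/spinlock.h>

struct obj {
	atomic_t nref;
	struct list_head empty_item;	/* on the "unreferenced" list when nref == 0 */
};

static DEFINE_SPINLOCK(empty_lock);

/*
 * Racy (old) shape:
 *	if (atomic_read(&o->nref) == 0) { ...remove from empty list... }
 *	atomic_inc(&o->nref);
 * Two callers can both read 0 before either increments. The fixed form
 * below lets exactly one caller observe the 0 -> 1 transition.
 */
static void obj_get(struct obj *o)
{
	if (atomic_inc_return(&o->nref) == 1) {
		spin_lock(&empty_lock);
		list_del_init(&o->empty_item);
		spin_unlock(&empty_lock);
	}
}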
81 79
82static void __insert_snap_realm(struct rb_root *root, 80static void __insert_snap_realm(struct rb_root *root,
@@ -116,7 +114,7 @@ static struct ceph_snap_realm *ceph_create_snap_realm(
116 if (!realm) 114 if (!realm)
117 return ERR_PTR(-ENOMEM); 115 return ERR_PTR(-ENOMEM);
118 116
119 atomic_set(&realm->nref, 0); /* tree does not take a ref */ 117 atomic_set(&realm->nref, 1); /* for caller */
120 realm->ino = ino; 118 realm->ino = ino;
121 INIT_LIST_HEAD(&realm->children); 119 INIT_LIST_HEAD(&realm->children);
122 INIT_LIST_HEAD(&realm->child_item); 120 INIT_LIST_HEAD(&realm->child_item);
@@ -134,8 +132,8 @@ static struct ceph_snap_realm *ceph_create_snap_realm(
134 * 132 *
135 * caller must hold snap_rwsem for write. 133 * caller must hold snap_rwsem for write.
136 */ 134 */
137struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, 135static struct ceph_snap_realm *__lookup_snap_realm(struct ceph_mds_client *mdsc,
138 u64 ino) 136 u64 ino)
139{ 137{
140 struct rb_node *n = mdsc->snap_realms.rb_node; 138 struct rb_node *n = mdsc->snap_realms.rb_node;
141 struct ceph_snap_realm *r; 139 struct ceph_snap_realm *r;
@@ -154,6 +152,16 @@ struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
154 return NULL; 152 return NULL;
155} 153}
156 154
155struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
156 u64 ino)
157{
158 struct ceph_snap_realm *r;
159 r = __lookup_snap_realm(mdsc, ino);
160 if (r)
161 ceph_get_snap_realm(mdsc, r);
162 return r;
163}
164
157static void __put_snap_realm(struct ceph_mds_client *mdsc, 165static void __put_snap_realm(struct ceph_mds_client *mdsc,
158 struct ceph_snap_realm *realm); 166 struct ceph_snap_realm *realm);
159 167
@@ -273,7 +281,6 @@ static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc,
273 } 281 }
274 realm->parent_ino = parentino; 282 realm->parent_ino = parentino;
275 realm->parent = parent; 283 realm->parent = parent;
276 ceph_get_snap_realm(mdsc, parent);
277 list_add(&realm->child_item, &parent->children); 284 list_add(&realm->child_item, &parent->children);
278 return 1; 285 return 1;
279} 286}
@@ -631,12 +638,14 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
631 * Caller must hold snap_rwsem for write. 638 * Caller must hold snap_rwsem for write.
632 */ 639 */
633int ceph_update_snap_trace(struct ceph_mds_client *mdsc, 640int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
634 void *p, void *e, bool deletion) 641 void *p, void *e, bool deletion,
642 struct ceph_snap_realm **realm_ret)
635{ 643{
636 struct ceph_mds_snap_realm *ri; /* encoded */ 644 struct ceph_mds_snap_realm *ri; /* encoded */
637 __le64 *snaps; /* encoded */ 645 __le64 *snaps; /* encoded */
638 __le64 *prior_parent_snaps; /* encoded */ 646 __le64 *prior_parent_snaps; /* encoded */
639 struct ceph_snap_realm *realm; 647 struct ceph_snap_realm *realm = NULL;
648 struct ceph_snap_realm *first_realm = NULL;
640 int invalidate = 0; 649 int invalidate = 0;
641 int err = -ENOMEM; 650 int err = -ENOMEM;
642 LIST_HEAD(dirty_realms); 651 LIST_HEAD(dirty_realms);
@@ -704,13 +713,18 @@ more:
704 dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino, 713 dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino,
705 realm, invalidate, p, e); 714 realm, invalidate, p, e);
706 715
707 if (p < e)
708 goto more;
709
710 /* invalidate when we reach the _end_ (root) of the trace */ 716 /* invalidate when we reach the _end_ (root) of the trace */
711 if (invalidate) 717 if (invalidate && p >= e)
712 rebuild_snap_realms(realm); 718 rebuild_snap_realms(realm);
713 719
720 if (!first_realm)
721 first_realm = realm;
722 else
723 ceph_put_snap_realm(mdsc, realm);
724
725 if (p < e)
726 goto more;
727
714 /* 728 /*
715 * queue cap snaps _after_ we've built the new snap contexts, 729 * queue cap snaps _after_ we've built the new snap contexts,
716 * so that i_head_snapc can be set appropriately. 730 * so that i_head_snapc can be set appropriately.
@@ -721,12 +735,21 @@ more:
721 queue_realm_cap_snaps(realm); 735 queue_realm_cap_snaps(realm);
722 } 736 }
723 737
738 if (realm_ret)
739 *realm_ret = first_realm;
740 else
741 ceph_put_snap_realm(mdsc, first_realm);
742
724 __cleanup_empty_realms(mdsc); 743 __cleanup_empty_realms(mdsc);
725 return 0; 744 return 0;
726 745
727bad: 746bad:
728 err = -EINVAL; 747 err = -EINVAL;
729fail: 748fail:
749 if (realm && !IS_ERR(realm))
750 ceph_put_snap_realm(mdsc, realm);
751 if (first_realm)
752 ceph_put_snap_realm(mdsc, first_realm);
730 pr_err("update_snap_trace error %d\n", err); 753 pr_err("update_snap_trace error %d\n", err);
731 return err; 754 return err;
732} 755}
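
ceph_update_snap_trace() now optionally hands the first realm back to the caller with a reference held; callers that pass NULL get the old drop-it-internally behaviour. A generic sketch of this reference hand-off, all names illustrative:

struct obj;
void obj_get(struct obj *o);		/* hypothetical refcount helpers */
void obj_put(struct obj *o);
struct obj *first_obj(void);		/* returns an object with a reference held */

static int do_work(struct obj **out)
{
	struct obj *first = first_obj();	/* we hold one reference */

	/* ... process the trace, taking/dropping other references ... */

	if (out)
		*out = first;		/* hand our reference to the caller */
	else
		obj_put(first);		/* caller doesn't want it: drop it */
	return 0;
}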
@@ -844,7 +867,6 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
844 if (IS_ERR(realm)) 867 if (IS_ERR(realm))
845 goto out; 868 goto out;
846 } 869 }
847 ceph_get_snap_realm(mdsc, realm);
848 870
849 dout("splitting snap_realm %llx %p\n", realm->ino, realm); 871 dout("splitting snap_realm %llx %p\n", realm->ino, realm);
850 for (i = 0; i < num_split_inos; i++) { 872 for (i = 0; i < num_split_inos; i++) {
@@ -905,7 +927,7 @@ skip_inode:
905 /* we may have taken some of the old realm's children. */ 927 /* we may have taken some of the old realm's children. */
906 for (i = 0; i < num_split_realms; i++) { 928 for (i = 0; i < num_split_realms; i++) {
907 struct ceph_snap_realm *child = 929 struct ceph_snap_realm *child =
908 ceph_lookup_snap_realm(mdsc, 930 __lookup_snap_realm(mdsc,
909 le64_to_cpu(split_realms[i])); 931 le64_to_cpu(split_realms[i]));
910 if (!child) 932 if (!child)
911 continue; 933 continue;
@@ -918,7 +940,7 @@ skip_inode:
918 * snap, we can avoid queueing cap_snaps. 940 * snap, we can avoid queueing cap_snaps.
919 */ 941 */
920 ceph_update_snap_trace(mdsc, p, e, 942 ceph_update_snap_trace(mdsc, p, e,
921 op == CEPH_SNAP_OP_DESTROY); 943 op == CEPH_SNAP_OP_DESTROY, NULL);
922 944
923 if (op == CEPH_SNAP_OP_SPLIT) 945 if (op == CEPH_SNAP_OP_SPLIT)
924 /* we took a reference when we created the realm, above */ 946 /* we took a reference when we created the realm, above */
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 50f06cddc94b..a63997b8bcff 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -40,17 +40,6 @@ static void ceph_put_super(struct super_block *s)
40 40
41 dout("put_super\n"); 41 dout("put_super\n");
42 ceph_mdsc_close_sessions(fsc->mdsc); 42 ceph_mdsc_close_sessions(fsc->mdsc);
43
44 /*
45 * ensure we release the bdi before put_anon_super releases
46 * the device name.
47 */
48 if (s->s_bdi == &fsc->backing_dev_info) {
49 bdi_unregister(&fsc->backing_dev_info);
50 s->s_bdi = NULL;
51 }
52
53 return;
54} 43}
55 44
56static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) 45static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -425,6 +414,10 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
425 seq_puts(m, ",noshare"); 414 seq_puts(m, ",noshare");
426 if (opt->flags & CEPH_OPT_NOCRC) 415 if (opt->flags & CEPH_OPT_NOCRC)
427 seq_puts(m, ",nocrc"); 416 seq_puts(m, ",nocrc");
417 if (opt->flags & CEPH_OPT_NOMSGAUTH)
418 seq_puts(m, ",nocephx_require_signatures");
419 if ((opt->flags & CEPH_OPT_TCP_NODELAY) == 0)
420 seq_puts(m, ",notcp_nodelay");
428 421
429 if (opt->name) 422 if (opt->name)
430 seq_printf(m, ",name=%s", opt->name); 423 seq_printf(m, ",name=%s", opt->name);
@@ -910,7 +903,7 @@ static int ceph_register_bdi(struct super_block *sb,
910 >> PAGE_SHIFT; 903 >> PAGE_SHIFT;
911 else 904 else
912 fsc->backing_dev_info.ra_pages = 905 fsc->backing_dev_info.ra_pages =
913 default_backing_dev_info.ra_pages; 906 VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE;
914 907
915 err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld", 908 err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld",
916 atomic_long_inc_return(&bdi_seq)); 909 atomic_long_inc_return(&bdi_seq));
@@ -1002,11 +995,16 @@ out_final:
1002static void ceph_kill_sb(struct super_block *s) 995static void ceph_kill_sb(struct super_block *s)
1003{ 996{
1004 struct ceph_fs_client *fsc = ceph_sb_to_client(s); 997 struct ceph_fs_client *fsc = ceph_sb_to_client(s);
998 dev_t dev = s->s_dev;
999
1005 dout("kill_sb %p\n", s); 1000 dout("kill_sb %p\n", s);
1001
1006 ceph_mdsc_pre_umount(fsc->mdsc); 1002 ceph_mdsc_pre_umount(fsc->mdsc);
1007 kill_anon_super(s); /* will call put_super after sb is r/o */ 1003 generic_shutdown_super(s);
1008 ceph_mdsc_destroy(fsc); 1004 ceph_mdsc_destroy(fsc);
1005
1009 destroy_fs_client(fsc); 1006 destroy_fs_client(fsc);
1007 free_anon_bdev(dev);
1010} 1008}
1011 1009
1012static struct file_system_type ceph_fs_type = { 1010static struct file_system_type ceph_fs_type = {
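
kill_anon_super() is essentially generic_shutdown_super() followed by free_anon_bdev(s->s_dev), so open-coding the two halves lets ceph destroy the mds client and fs client while the anonymous dev_t is still reserved. A sketch of the ordering, using a hypothetical example_kill_sb():

#include <linux/fs.h>

static void example_kill_sb(struct super_block *s)
{
	dev_t dev = s->s_dev;	/* the sb is gone after shutdown: save it */

	generic_shutdown_super(s);
	/* ... tear down fs-private state that must outlive the sb ... */
	free_anon_bdev(dev);	/* release the anonymous dev_t last */
}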
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index e1aa32d0759d..04c8124ed30e 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -693,7 +693,8 @@ extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
693extern void ceph_put_snap_realm(struct ceph_mds_client *mdsc, 693extern void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
694 struct ceph_snap_realm *realm); 694 struct ceph_snap_realm *realm);
695extern int ceph_update_snap_trace(struct ceph_mds_client *m, 695extern int ceph_update_snap_trace(struct ceph_mds_client *m,
696 void *p, void *e, bool deletion); 696 void *p, void *e, bool deletion,
697 struct ceph_snap_realm **realm_ret);
697extern void ceph_handle_snap(struct ceph_mds_client *mdsc, 698extern void ceph_handle_snap(struct ceph_mds_client *mdsc,
698 struct ceph_mds_session *session, 699 struct ceph_mds_session *session,
699 struct ceph_msg *msg); 700 struct ceph_msg *msg);
@@ -892,7 +893,9 @@ extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
892int ceph_uninline_data(struct file *filp, struct page *locked_page); 893int ceph_uninline_data(struct file *filp, struct page *locked_page);
893/* dir.c */ 894/* dir.c */
894extern const struct file_operations ceph_dir_fops; 895extern const struct file_operations ceph_dir_fops;
896extern const struct file_operations ceph_snapdir_fops;
895extern const struct inode_operations ceph_dir_iops; 897extern const struct inode_operations ceph_dir_iops;
898extern const struct inode_operations ceph_snapdir_iops;
896extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops, 899extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops,
897 ceph_snapdir_dentry_ops; 900 ceph_snapdir_dentry_ops;
898 901
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 67b2007f10fe..ea06a3d0364c 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -24,27 +24,6 @@
24 24
25#include "internal.h" 25#include "internal.h"
26 26
27/*
28 * capabilities for /dev/mem, /dev/kmem and similar directly mappable character
29 * devices
30 * - permits shared-mmap for read, write and/or exec
31 * - does not permit private mmap in NOMMU mode (can't do COW)
32 * - no readahead or I/O queue unplugging required
33 */
34struct backing_dev_info directly_mappable_cdev_bdi = {
35 .name = "char",
36 .capabilities = (
37#ifdef CONFIG_MMU
38 /* permit private copies of the data to be taken */
39 BDI_CAP_MAP_COPY |
40#endif
41 /* permit direct mmap, for read, write or exec */
42 BDI_CAP_MAP_DIRECT |
43 BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP |
44 /* no writeback happens */
45 BDI_CAP_NO_ACCT_AND_WRITEBACK),
46};
47
48static struct kobj_map *cdev_map; 27static struct kobj_map *cdev_map;
49 28
50static DEFINE_MUTEX(chrdevs_lock); 29static DEFINE_MUTEX(chrdevs_lock);
@@ -575,8 +554,6 @@ static struct kobject *base_probe(dev_t dev, int *part, void *data)
575void __init chrdev_init(void) 554void __init chrdev_init(void)
576{ 555{
577 cdev_map = kobj_map_init(base_probe, &chrdevs_lock); 556 cdev_map = kobj_map_init(base_probe, &chrdevs_lock);
578 if (bdi_init(&directly_mappable_cdev_bdi))
579 panic("Failed to init directly mappable cdev bdi");
580} 557}
581 558
582 559
@@ -590,4 +567,3 @@ EXPORT_SYMBOL(cdev_del);
590EXPORT_SYMBOL(cdev_add); 567EXPORT_SYMBOL(cdev_add);
591EXPORT_SYMBOL(__register_chrdev); 568EXPORT_SYMBOL(__register_chrdev);
592EXPORT_SYMBOL(__unregister_chrdev); 569EXPORT_SYMBOL(__unregister_chrdev);
593EXPORT_SYMBOL(directly_mappable_cdev_bdi);
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 9c56ef776407..7febcf2475c5 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -606,9 +606,11 @@ cifs_security_flags_handle_must_flags(unsigned int *flags)
606 *flags = CIFSSEC_MUST_NTLMV2; 606 *flags = CIFSSEC_MUST_NTLMV2;
607 else if ((*flags & CIFSSEC_MUST_NTLM) == CIFSSEC_MUST_NTLM) 607 else if ((*flags & CIFSSEC_MUST_NTLM) == CIFSSEC_MUST_NTLM)
608 *flags = CIFSSEC_MUST_NTLM; 608 *flags = CIFSSEC_MUST_NTLM;
609 else if ((*flags & CIFSSEC_MUST_LANMAN) == CIFSSEC_MUST_LANMAN) 609 else if (CIFSSEC_MUST_LANMAN &&
610 (*flags & CIFSSEC_MUST_LANMAN) == CIFSSEC_MUST_LANMAN)
610 *flags = CIFSSEC_MUST_LANMAN; 611 *flags = CIFSSEC_MUST_LANMAN;
611 else if ((*flags & CIFSSEC_MUST_PLNTXT) == CIFSSEC_MUST_PLNTXT) 612 else if (CIFSSEC_MUST_PLNTXT &&
613 (*flags & CIFSSEC_MUST_PLNTXT) == CIFSSEC_MUST_PLNTXT)
612 *flags = CIFSSEC_MUST_PLNTXT; 614 *flags = CIFSSEC_MUST_PLNTXT;
613 615
614 *flags |= signflags; 616 *flags |= signflags;
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 6e139111fdb2..22b289a3b1c4 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -661,16 +661,16 @@ set_credits(struct TCP_Server_Info *server, const int val)
661 server->ops->set_credits(server, val); 661 server->ops->set_credits(server, val);
662} 662}
663 663
664static inline __u64 664static inline __le64
665get_next_mid64(struct TCP_Server_Info *server) 665get_next_mid64(struct TCP_Server_Info *server)
666{ 666{
667 return server->ops->get_next_mid(server); 667 return cpu_to_le64(server->ops->get_next_mid(server));
668} 668}
669 669
670static inline __le16 670static inline __le16
671get_next_mid(struct TCP_Server_Info *server) 671get_next_mid(struct TCP_Server_Info *server)
672{ 672{
673 __u16 mid = get_next_mid64(server); 673 __u16 mid = server->ops->get_next_mid(server);
674 /* 674 /*
675 * The value in the SMB header should be little endian for easy 675 * The value in the SMB header should be little endian for easy
676 * on-the-wire decoding. 676 * on-the-wire decoding.
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 2a772da16b83..d3aa999ab785 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -3446,7 +3446,7 @@ cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info)
3446 int referral_walks_count = 0; 3446 int referral_walks_count = 0;
3447#endif 3447#endif
3448 3448
3449 rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY); 3449 rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs");
3450 if (rc) 3450 if (rc)
3451 return rc; 3451 return rc;
3452 3452
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 96b7e9b7706d..a94b3e673182 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -366,6 +366,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
366 struct cifsLockInfo *li, *tmp; 366 struct cifsLockInfo *li, *tmp;
367 struct cifs_fid fid; 367 struct cifs_fid fid;
368 struct cifs_pending_open open; 368 struct cifs_pending_open open;
369 bool oplock_break_cancelled;
369 370
370 spin_lock(&cifs_file_list_lock); 371 spin_lock(&cifs_file_list_lock);
371 if (--cifs_file->count > 0) { 372 if (--cifs_file->count > 0) {
@@ -397,7 +398,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
397 } 398 }
398 spin_unlock(&cifs_file_list_lock); 399 spin_unlock(&cifs_file_list_lock);
399 400
400 cancel_work_sync(&cifs_file->oplock_break); 401 oplock_break_cancelled = cancel_work_sync(&cifs_file->oplock_break);
401 402
402 if (!tcon->need_reconnect && !cifs_file->invalidHandle) { 403 if (!tcon->need_reconnect && !cifs_file->invalidHandle) {
403 struct TCP_Server_Info *server = tcon->ses->server; 404 struct TCP_Server_Info *server = tcon->ses->server;
@@ -409,6 +410,9 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
409 _free_xid(xid); 410 _free_xid(xid);
410 } 411 }
411 412
413 if (oplock_break_cancelled)
414 cifs_done_oplock_break(cifsi);
415
412 cifs_del_pending_open(&open); 416 cifs_del_pending_open(&open);
413 417
414 /* 418 /*
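
cancel_work_sync() returns true when the work item was still pending, i.e. its handler never ran; in that case the handler's completion step (here cifs_done_oplock_break()) becomes the canceller's responsibility. A minimal sketch with a hypothetical finish callback:

#include <linux/workqueue.h>

static void finish_oplock_break(void *ctx);	/* hypothetical completion step */

static void teardown(struct work_struct *work, void *ctx)
{
	/*
	 * true return: the handler was cancelled before it ran, so the
	 * completion it would have performed is now on us.
	 */
	if (cancel_work_sync(work))
		finish_oplock_break(ctx);
}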
@@ -1109,11 +1113,6 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
1109 return rc; 1113 return rc;
1110} 1114}
1111 1115
1112/* copied from fs/locks.c with a name change */
1113#define cifs_for_each_lock(inode, lockp) \
1114 for (lockp = &inode->i_flock; *lockp != NULL; \
1115 lockp = &(*lockp)->fl_next)
1116
1117struct lock_to_push { 1116struct lock_to_push {
1118 struct list_head llist; 1117 struct list_head llist;
1119 __u64 offset; 1118 __u64 offset;
@@ -1128,8 +1127,9 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
1128{ 1127{
1129 struct inode *inode = cfile->dentry->d_inode; 1128 struct inode *inode = cfile->dentry->d_inode;
1130 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); 1129 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
1131 struct file_lock *flock, **before; 1130 struct file_lock *flock;
1132 unsigned int count = 0, i = 0; 1131 struct file_lock_context *flctx = inode->i_flctx;
1132 unsigned int count = 0, i;
1133 int rc = 0, xid, type; 1133 int rc = 0, xid, type;
1134 struct list_head locks_to_send, *el; 1134 struct list_head locks_to_send, *el;
1135 struct lock_to_push *lck, *tmp; 1135 struct lock_to_push *lck, *tmp;
@@ -1137,12 +1137,14 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
1137 1137
1138 xid = get_xid(); 1138 xid = get_xid();
1139 1139
1140 spin_lock(&inode->i_lock); 1140 if (!flctx)
1141 cifs_for_each_lock(inode, before) { 1141 goto out;
1142 if ((*before)->fl_flags & FL_POSIX) 1142
1143 count++; 1143 spin_lock(&flctx->flc_lock);
1144 list_for_each(el, &flctx->flc_posix) {
1145 count++;
1144 } 1146 }
1145 spin_unlock(&inode->i_lock); 1147 spin_unlock(&flctx->flc_lock);
1146 1148
1147 INIT_LIST_HEAD(&locks_to_send); 1149 INIT_LIST_HEAD(&locks_to_send);
1148 1150
@@ -1151,7 +1153,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
1151 * added to the list while we are holding cinode->lock_sem that 1153 * added to the list while we are holding cinode->lock_sem that
1152 * protects locking operations of this inode. 1154 * protects locking operations of this inode.
1153 */ 1155 */
1154 for (; i < count; i++) { 1156 for (i = 0; i < count; i++) {
1155 lck = kmalloc(sizeof(struct lock_to_push), GFP_KERNEL); 1157 lck = kmalloc(sizeof(struct lock_to_push), GFP_KERNEL);
1156 if (!lck) { 1158 if (!lck) {
1157 rc = -ENOMEM; 1159 rc = -ENOMEM;
@@ -1161,11 +1163,8 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
1161 } 1163 }
1162 1164
1163 el = locks_to_send.next; 1165 el = locks_to_send.next;
1164 spin_lock(&inode->i_lock); 1166 spin_lock(&flctx->flc_lock);
1165 cifs_for_each_lock(inode, before) { 1167 list_for_each_entry(flock, &flctx->flc_posix, fl_list) {
1166 flock = *before;
1167 if ((flock->fl_flags & FL_POSIX) == 0)
1168 continue;
1169 if (el == &locks_to_send) { 1168 if (el == &locks_to_send) {
1170 /* 1169 /*
1171 * The list ended. We don't have enough allocated 1170 * The list ended. We don't have enough allocated
@@ -1185,9 +1184,8 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
1185 lck->length = length; 1184 lck->length = length;
1186 lck->type = type; 1185 lck->type = type;
1187 lck->offset = flock->fl_start; 1186 lck->offset = flock->fl_start;
1188 el = el->next;
1189 } 1187 }
1190 spin_unlock(&inode->i_lock); 1188 spin_unlock(&flctx->flc_lock);
1191 1189
1192 list_for_each_entry_safe(lck, tmp, &locks_to_send, llist) { 1190 list_for_each_entry_safe(lck, tmp, &locks_to_send, llist) {
1193 int stored_rc; 1191 int stored_rc;
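
These hunks move cifs off the old inode->i_flock list (walked under i_lock) onto the per-inode file_lock_context: POSIX locks live on flctx->flc_posix, linked by fl_list and protected by flc_lock, so the FL_POSIX filter disappears. A sketch of the new iteration, assuming the field names of this kernel series:

#include <linux/fs.h>

static unsigned int count_posix_locks(struct inode *inode)
{
	struct file_lock_context *flctx = inode->i_flctx;
	struct file_lock *fl;
	unsigned int count = 0;

	if (!flctx)		/* no lock context: no locks to count */
		return 0;

	spin_lock(&flctx->flc_lock);
	list_for_each_entry(fl, &flctx->flc_posix, fl_list)
		count++;	/* flc_posix holds only POSIX locks */
	spin_unlock(&flctx->flc_lock);
	return count;
}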
@@ -3244,7 +3242,6 @@ static struct vm_operations_struct cifs_file_vm_ops = {
3244 .fault = filemap_fault, 3242 .fault = filemap_fault,
3245 .map_pages = filemap_map_pages, 3243 .map_pages = filemap_map_pages,
3246 .page_mkwrite = cifs_page_mkwrite, 3244 .page_mkwrite = cifs_page_mkwrite,
3247 .remap_pages = generic_file_remap_pages,
3248}; 3245};
3249 3246
3250int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma) 3247int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma)
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 0c3ce464cae4..2d4f37235ed0 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -937,8 +937,6 @@ retry_iget5_locked:
937 inode->i_flags |= S_NOATIME | S_NOCMTIME; 937 inode->i_flags |= S_NOATIME | S_NOCMTIME;
938 if (inode->i_state & I_NEW) { 938 if (inode->i_state & I_NEW) {
939 inode->i_ino = hash; 939 inode->i_ino = hash;
940 if (S_ISREG(inode->i_mode))
941 inode->i_data.backing_dev_info = sb->s_bdi;
942#ifdef CONFIG_CIFS_FSCACHE 940#ifdef CONFIG_CIFS_FSCACHE
943 /* initialize per-inode cache cookie pointer */ 941 /* initialize per-inode cache cookie pointer */
944 CIFS_I(inode)->fscache = NULL; 942 CIFS_I(inode)->fscache = NULL;
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 45cb59bcc791..8b7898b7670f 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -86,21 +86,16 @@ static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file,
86 } 86 }
87 87
88 src_inode = file_inode(src_file.file); 88 src_inode = file_inode(src_file.file);
89 rc = -EINVAL;
90 if (S_ISDIR(src_inode->i_mode))
91 goto out_fput;
89 92
90 /* 93 /*
91 * Note: cifs case is easier than btrfs since server responsible for 94 * Note: cifs case is easier than btrfs since server responsible for
92 * checks for proper open modes and file type and if it wants 95 * checks for proper open modes and file type and if it wants
93 * server could even support copy of range where source = target 96 * server could even support copy of range where source = target
94 */ 97 */
95 98 lock_two_nondirectories(target_inode, src_inode);
96 /* so we do not deadlock racing two ioctls on same files */
97 if (target_inode < src_inode) {
98 mutex_lock_nested(&target_inode->i_mutex, I_MUTEX_PARENT);
99 mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_CHILD);
100 } else {
101 mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_PARENT);
102 mutex_lock_nested(&target_inode->i_mutex, I_MUTEX_CHILD);
103 }
104 99
105 /* determine range to clone */ 100 /* determine range to clone */
106 rc = -EINVAL; 101 rc = -EINVAL;
@@ -124,13 +119,7 @@ static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file,
124out_unlock: 119out_unlock:
125 /* although unlocking in the reverse order from locking is not 120 /* although unlocking in the reverse order from locking is not
 126	 strictly necessary here, it is a little cleaner to be consistent */ 121	 strictly necessary here, it is a little cleaner to be consistent */
127 if (target_inode < src_inode) { 122 unlock_two_nondirectories(src_inode, target_inode);
128 mutex_unlock(&src_inode->i_mutex);
129 mutex_unlock(&target_inode->i_mutex);
130 } else {
131 mutex_unlock(&target_inode->i_mutex);
132 mutex_unlock(&src_inode->i_mutex);
133 }
134out_fput: 123out_fput:
135 fdput(src_file); 124 fdput(src_file);
136out_drop_write: 125out_drop_write:
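
lock_two_nondirectories() replaces the hand-rolled "compare pointers, then mutex_lock_nested() in parent/child order" dance with the VFS helper that already implements a stable lock order; hence the new S_ISDIR guard, since the helper is only valid for non-directories. A hedged sketch:

#include <linux/fs.h>

static int clone_range(struct inode *src, struct inode *dst)
{
	if (S_ISDIR(src->i_mode) || S_ISDIR(dst->i_mode))
		return -EINVAL;		/* helper is for non-directories only */

	lock_two_nondirectories(src, dst);	/* stable order, deadlock-free */
	/* ... perform the clone while both inodes are locked ... */
	unlock_two_nondirectories(src, dst);
	return 0;
}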
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index b333ff60781d..abae6dd2c6b9 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -926,6 +926,7 @@ cifs_NTtimeToUnix(__le64 ntutc)
926 926
927 /* Subtract the NTFS time offset, then convert to 1s intervals. */ 927 /* Subtract the NTFS time offset, then convert to 1s intervals. */
928 s64 t = le64_to_cpu(ntutc) - NTFS_TIME_OFFSET; 928 s64 t = le64_to_cpu(ntutc) - NTFS_TIME_OFFSET;
929 u64 abs_t;
929 930
930 /* 931 /*
931 * Unfortunately can not use normal 64 bit division on 32 bit arch, but 932 * Unfortunately can not use normal 64 bit division on 32 bit arch, but
@@ -933,13 +934,14 @@ cifs_NTtimeToUnix(__le64 ntutc)
933 * to special case them 934 * to special case them
934 */ 935 */
935 if (t < 0) { 936 if (t < 0) {
936 t = -t; 937 abs_t = -t;
937 ts.tv_nsec = (long)(do_div(t, 10000000) * 100); 938 ts.tv_nsec = (long)(do_div(abs_t, 10000000) * 100);
938 ts.tv_nsec = -ts.tv_nsec; 939 ts.tv_nsec = -ts.tv_nsec;
939 ts.tv_sec = -t; 940 ts.tv_sec = -abs_t;
940 } else { 941 } else {
941 ts.tv_nsec = (long)do_div(t, 10000000) * 100; 942 abs_t = t;
942 ts.tv_sec = t; 943 ts.tv_nsec = (long)do_div(abs_t, 10000000) * 100;
944 ts.tv_sec = abs_t;
943 } 945 }
944 946
945 return ts; 947 return ts;
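
do_div() takes an unsigned 64-bit dividend (it is a macro that rewrites its first argument), so the old code's "t = -t; do_div(t, ...)" on an s64 relied on an implicit reinterpretation; the new abs_t makes the signed/unsigned split explicit. A sketch of the corrected split, mirroring the logic above:

#include <linux/math64.h>
#include <linux/types.h>

static void split_ns(s64 t, s64 *sec, long *nsec)
{
	u64 abs_t;

	if (t < 0) {
		abs_t = -t;		/* divide the magnitude, unsigned */
		*nsec = -(long)(do_div(abs_t, 10000000) * 100);
		*sec = -(s64)abs_t;
	} else {
		abs_t = t;
		*nsec = (long)do_div(abs_t, 10000000) * 100;
		*sec = abs_t;
	}
}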
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 8eaf20a80649..c295338e0a98 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -69,7 +69,8 @@ static inline void dump_cifs_file_struct(struct file *file, char *label)
69 * Attempt to preload the dcache with the results from the FIND_FIRST/NEXT 69 * Attempt to preload the dcache with the results from the FIND_FIRST/NEXT
70 * 70 *
71 * Find the dentry that matches "name". If there isn't one, create one. If it's 71 * Find the dentry that matches "name". If there isn't one, create one. If it's
 72	 * a negative dentry or the uniqueid changed, then drop it and recreate it. 72	 * a negative dentry or the uniqueid or filetype (mode) changed,
73 * then drop it and recreate it.
73 */ 74 */
74static void 75static void
75cifs_prime_dcache(struct dentry *parent, struct qstr *name, 76cifs_prime_dcache(struct dentry *parent, struct qstr *name,
@@ -97,8 +98,11 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name,
97 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) 98 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM))
98 fattr->cf_uniqueid = CIFS_I(inode)->uniqueid; 99 fattr->cf_uniqueid = CIFS_I(inode)->uniqueid;
99 100
100 /* update inode in place if i_ino didn't change */ 101 /* update inode in place
 101	 if (CIFS_I(inode)->uniqueid == fattr->cf_uniqueid) { 102	 * if neither i_ino nor i_mode changed */
103 if (CIFS_I(inode)->uniqueid == fattr->cf_uniqueid &&
104 (inode->i_mode & S_IFMT) ==
105 (fattr->cf_mode & S_IFMT)) {
102 cifs_fattr_to_inode(inode, fattr); 106 cifs_fattr_to_inode(inode, fattr);
103 goto out; 107 goto out;
104 } 108 }
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index f1cefc9763ed..689f035915cf 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -32,12 +32,14 @@
32static int 32static int
33check_smb2_hdr(struct smb2_hdr *hdr, __u64 mid) 33check_smb2_hdr(struct smb2_hdr *hdr, __u64 mid)
34{ 34{
35 __u64 wire_mid = le64_to_cpu(hdr->MessageId);
36
35 /* 37 /*
36 * Make sure that this really is an SMB, that it is a response, 38 * Make sure that this really is an SMB, that it is a response,
37 * and that the message ids match. 39 * and that the message ids match.
38 */ 40 */
39 if ((*(__le32 *)hdr->ProtocolId == SMB2_PROTO_NUMBER) && 41 if ((*(__le32 *)hdr->ProtocolId == SMB2_PROTO_NUMBER) &&
40 (mid == hdr->MessageId)) { 42 (mid == wire_mid)) {
41 if (hdr->Flags & SMB2_FLAGS_SERVER_TO_REDIR) 43 if (hdr->Flags & SMB2_FLAGS_SERVER_TO_REDIR)
42 return 0; 44 return 0;
43 else { 45 else {
@@ -51,11 +53,11 @@ check_smb2_hdr(struct smb2_hdr *hdr, __u64 mid)
51 if (*(__le32 *)hdr->ProtocolId != SMB2_PROTO_NUMBER) 53 if (*(__le32 *)hdr->ProtocolId != SMB2_PROTO_NUMBER)
52 cifs_dbg(VFS, "Bad protocol string signature header %x\n", 54 cifs_dbg(VFS, "Bad protocol string signature header %x\n",
53 *(unsigned int *) hdr->ProtocolId); 55 *(unsigned int *) hdr->ProtocolId);
54 if (mid != hdr->MessageId) 56 if (mid != wire_mid)
55 cifs_dbg(VFS, "Mids do not match: %llu and %llu\n", 57 cifs_dbg(VFS, "Mids do not match: %llu and %llu\n",
56 mid, hdr->MessageId); 58 mid, wire_mid);
57 } 59 }
58 cifs_dbg(VFS, "Bad SMB detected. The Mid=%llu\n", hdr->MessageId); 60 cifs_dbg(VFS, "Bad SMB detected. The Mid=%llu\n", wire_mid);
59 return 1; 61 return 1;
60} 62}
61 63
@@ -95,7 +97,7 @@ smb2_check_message(char *buf, unsigned int length)
95{ 97{
96 struct smb2_hdr *hdr = (struct smb2_hdr *)buf; 98 struct smb2_hdr *hdr = (struct smb2_hdr *)buf;
97 struct smb2_pdu *pdu = (struct smb2_pdu *)hdr; 99 struct smb2_pdu *pdu = (struct smb2_pdu *)hdr;
98 __u64 mid = hdr->MessageId; 100 __u64 mid = le64_to_cpu(hdr->MessageId);
99 __u32 len = get_rfc1002_length(buf); 101 __u32 len = get_rfc1002_length(buf);
100 __u32 clc_len; /* calculated length */ 102 __u32 clc_len; /* calculated length */
101 int command; 103 int command;
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 93fd0586f9ec..96b5d40a2ece 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -176,10 +176,11 @@ smb2_find_mid(struct TCP_Server_Info *server, char *buf)
176{ 176{
177 struct mid_q_entry *mid; 177 struct mid_q_entry *mid;
178 struct smb2_hdr *hdr = (struct smb2_hdr *)buf; 178 struct smb2_hdr *hdr = (struct smb2_hdr *)buf;
179 __u64 wire_mid = le64_to_cpu(hdr->MessageId);
179 180
180 spin_lock(&GlobalMid_Lock); 181 spin_lock(&GlobalMid_Lock);
181 list_for_each_entry(mid, &server->pending_mid_q, qhead) { 182 list_for_each_entry(mid, &server->pending_mid_q, qhead) {
182 if ((mid->mid == hdr->MessageId) && 183 if ((mid->mid == wire_mid) &&
183 (mid->mid_state == MID_REQUEST_SUBMITTED) && 184 (mid->mid_state == MID_REQUEST_SUBMITTED) &&
184 (mid->command == hdr->Command)) { 185 (mid->command == hdr->Command)) {
185 spin_unlock(&GlobalMid_Lock); 186 spin_unlock(&GlobalMid_Lock);
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index ce858477002a..70867d54fb8b 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -110,7 +110,7 @@ struct smb2_hdr {
110 __le16 CreditRequest; /* CreditResponse */ 110 __le16 CreditRequest; /* CreditResponse */
111 __le32 Flags; 111 __le32 Flags;
112 __le32 NextCommand; 112 __le32 NextCommand;
113 __u64 MessageId; /* opaque - so can stay little endian */ 113 __le64 MessageId;
114 __le32 ProcessId; 114 __le32 ProcessId;
115 __u32 TreeId; /* opaque - so do not make little endian */ 115 __u32 TreeId; /* opaque - so do not make little endian */
116 __u64 SessionId; /* opaque - so do not make little endian */ 116 __u64 SessionId; /* opaque - so do not make little endian */
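
Declaring MessageId as an opaque __u64 only works as long as nothing compares or orders it in host byte order; smb2_find_mid() and check_smb2_hdr() do exactly that, which breaks on big-endian hosts. Marking the field __le64 and converting with le64_to_cpu()/cpu_to_le64() at the boundaries keeps the wire format fixed and the in-memory mid native. A minimal sketch:

#include <linux/types.h>
#include <asm/byteorder.h>

struct wire_hdr {
	__le64 message_id;	/* fixed little-endian on the wire */
};

static bool mid_matches(const struct wire_hdr *hdr, u64 mid)
{
	/* convert once at the boundary, compare in host order */
	return mid == le64_to_cpu(hdr->message_id);
}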
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 5111e7272db6..d4c5b6f109a7 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -490,7 +490,7 @@ smb2_mid_entry_alloc(const struct smb2_hdr *smb_buffer,
490 return temp; 490 return temp;
491 else { 491 else {
492 memset(temp, 0, sizeof(struct mid_q_entry)); 492 memset(temp, 0, sizeof(struct mid_q_entry));
493 temp->mid = smb_buffer->MessageId; /* always LE */ 493 temp->mid = le64_to_cpu(smb_buffer->MessageId);
494 temp->pid = current->pid; 494 temp->pid = current->pid;
495 temp->command = smb_buffer->Command; /* Always LE */ 495 temp->command = smb_buffer->Command; /* Always LE */
496 temp->when_alloc = jiffies; 496 temp->when_alloc = jiffies;
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index 6c1566366a66..a4232ec4f2ba 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -221,7 +221,7 @@ E_md4hash(const unsigned char *passwd, unsigned char *p16,
221 } 221 }
222 222
223 rc = mdfour(p16, (unsigned char *) wpwd, len * sizeof(__le16)); 223 rc = mdfour(p16, (unsigned char *) wpwd, len * sizeof(__le16));
224 memset(wpwd, 0, 129 * sizeof(__le16)); 224 memzero_explicit(wpwd, sizeof(wpwd));
225 225
226 return rc; 226 return rc;
227} 227}
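
A plain memset() of a buffer that is dead afterwards may be optimized away as a dead store; memzero_explicit() is guaranteed to perform the write, which matters for password material like wpwd. Using sizeof(wpwd) instead of a hand-written 129 * sizeof(__le16) also keeps the size in sync with the declaration. A sketch:

#include <linux/string.h>
#include <linux/types.h>

static void use_secret(void)
{
	__le16 wpwd[129];

	/* ... fill and use the password buffer ... */

	memzero_explicit(wpwd, sizeof(wpwd));	/* store cannot be elided */
}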
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 86c893884eb9..281ee011bb6a 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -28,29 +28,6 @@
28 28
29#include "coda_int.h" 29#include "coda_int.h"
30 30
31/* dir inode-ops */
32static int coda_create(struct inode *dir, struct dentry *new, umode_t mode, bool excl);
33static struct dentry *coda_lookup(struct inode *dir, struct dentry *target, unsigned int flags);
34static int coda_link(struct dentry *old_dentry, struct inode *dir_inode,
35 struct dentry *entry);
36static int coda_unlink(struct inode *dir_inode, struct dentry *entry);
37static int coda_symlink(struct inode *dir_inode, struct dentry *entry,
38 const char *symname);
39static int coda_mkdir(struct inode *dir_inode, struct dentry *entry, umode_t mode);
40static int coda_rmdir(struct inode *dir_inode, struct dentry *entry);
41static int coda_rename(struct inode *old_inode, struct dentry *old_dentry,
42 struct inode *new_inode, struct dentry *new_dentry);
43
44/* dir file-ops */
45static int coda_readdir(struct file *file, struct dir_context *ctx);
46
47/* dentry ops */
48static int coda_dentry_revalidate(struct dentry *de, unsigned int flags);
49static int coda_dentry_delete(const struct dentry *);
50
51/* support routines */
52static int coda_venus_readdir(struct file *, struct dir_context *);
53
54/* same as fs/bad_inode.c */ 31/* same as fs/bad_inode.c */
55static int coda_return_EIO(void) 32static int coda_return_EIO(void)
56{ 33{
@@ -58,38 +35,6 @@ static int coda_return_EIO(void)
58} 35}
59#define CODA_EIO_ERROR ((void *) (coda_return_EIO)) 36#define CODA_EIO_ERROR ((void *) (coda_return_EIO))
60 37
61const struct dentry_operations coda_dentry_operations =
62{
63 .d_revalidate = coda_dentry_revalidate,
64 .d_delete = coda_dentry_delete,
65};
66
67const struct inode_operations coda_dir_inode_operations =
68{
69 .create = coda_create,
70 .lookup = coda_lookup,
71 .link = coda_link,
72 .unlink = coda_unlink,
73 .symlink = coda_symlink,
74 .mkdir = coda_mkdir,
75 .rmdir = coda_rmdir,
76 .mknod = CODA_EIO_ERROR,
77 .rename = coda_rename,
78 .permission = coda_permission,
79 .getattr = coda_getattr,
80 .setattr = coda_setattr,
81};
82
83const struct file_operations coda_dir_operations = {
84 .llseek = generic_file_llseek,
85 .read = generic_read_dir,
86 .iterate = coda_readdir,
87 .open = coda_open,
88 .release = coda_release,
89 .fsync = coda_fsync,
90};
91
92
93/* inode operations for directories */ 38/* inode operations for directories */
94/* access routines: lookup, readlink, permission */ 39/* access routines: lookup, readlink, permission */
95static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, unsigned int flags) 40static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, unsigned int flags)
@@ -374,33 +319,6 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry,
374 return error; 319 return error;
375} 320}
376 321
377
378/* file operations for directories */
379static int coda_readdir(struct file *coda_file, struct dir_context *ctx)
380{
381 struct coda_file_info *cfi;
382 struct file *host_file;
383 int ret;
384
385 cfi = CODA_FTOC(coda_file);
386 BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
387 host_file = cfi->cfi_container;
388
389 if (host_file->f_op->iterate) {
390 struct inode *host_inode = file_inode(host_file);
391 mutex_lock(&host_inode->i_mutex);
392 ret = -ENOENT;
393 if (!IS_DEADDIR(host_inode)) {
394 ret = host_file->f_op->iterate(host_file, ctx);
395 file_accessed(host_file);
396 }
397 mutex_unlock(&host_inode->i_mutex);
398 return ret;
399 }
400 /* Venus: we must read Venus dirents from a file */
401 return coda_venus_readdir(coda_file, ctx);
402}
403
404static inline unsigned int CDT2DT(unsigned char cdt) 322static inline unsigned int CDT2DT(unsigned char cdt)
405{ 323{
406 unsigned int dt; 324 unsigned int dt;
@@ -495,6 +413,33 @@ out:
495 return 0; 413 return 0;
496} 414}
497 415
416/* file operations for directories */
417static int coda_readdir(struct file *coda_file, struct dir_context *ctx)
418{
419 struct coda_file_info *cfi;
420 struct file *host_file;
421 int ret;
422
423 cfi = CODA_FTOC(coda_file);
424 BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
425 host_file = cfi->cfi_container;
426
427 if (host_file->f_op->iterate) {
428 struct inode *host_inode = file_inode(host_file);
429
430 mutex_lock(&host_inode->i_mutex);
431 ret = -ENOENT;
432 if (!IS_DEADDIR(host_inode)) {
433 ret = host_file->f_op->iterate(host_file, ctx);
434 file_accessed(host_file);
435 }
436 mutex_unlock(&host_inode->i_mutex);
437 return ret;
438 }
439 /* Venus: we must read Venus dirents from a file */
440 return coda_venus_readdir(coda_file, ctx);
441}
442
498/* called when a cache lookup succeeds */ 443/* called when a cache lookup succeeds */
499static int coda_dentry_revalidate(struct dentry *de, unsigned int flags) 444static int coda_dentry_revalidate(struct dentry *de, unsigned int flags)
500{ 445{
@@ -603,3 +548,32 @@ int coda_revalidate_inode(struct inode *inode)
603 } 548 }
604 return 0; 549 return 0;
605} 550}
551
552const struct dentry_operations coda_dentry_operations = {
553 .d_revalidate = coda_dentry_revalidate,
554 .d_delete = coda_dentry_delete,
555};
556
557const struct inode_operations coda_dir_inode_operations = {
558 .create = coda_create,
559 .lookup = coda_lookup,
560 .link = coda_link,
561 .unlink = coda_unlink,
562 .symlink = coda_symlink,
563 .mkdir = coda_mkdir,
564 .rmdir = coda_rmdir,
565 .mknod = CODA_EIO_ERROR,
566 .rename = coda_rename,
567 .permission = coda_permission,
568 .getattr = coda_getattr,
569 .setattr = coda_setattr,
570};
571
572const struct file_operations coda_dir_operations = {
573 .llseek = generic_file_llseek,
574 .read = generic_read_dir,
575 .iterate = coda_readdir,
576 .open = coda_open,
577 .release = coda_release,
578 .fsync = coda_fsync,
579};
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index b945410bfcd5..82ec68b59208 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -183,7 +183,7 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
183 goto unlock_out; 183 goto unlock_out;
184 } 184 }
185 185
186 error = bdi_setup_and_register(&vc->bdi, "coda", BDI_CAP_MAP_COPY); 186 error = bdi_setup_and_register(&vc->bdi, "coda");
187 if (error) 187 if (error)
188 goto unlock_out; 188 goto unlock_out;
189 189
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index bd4a3c167091..a315677e44d3 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -70,8 +70,6 @@ extern int configfs_is_root(struct config_item *item);
70 70
71extern struct inode * configfs_new_inode(umode_t mode, struct configfs_dirent *, struct super_block *); 71extern struct inode * configfs_new_inode(umode_t mode, struct configfs_dirent *, struct super_block *);
72extern int configfs_create(struct dentry *, umode_t mode, int (*init)(struct inode *)); 72extern int configfs_create(struct dentry *, umode_t mode, int (*init)(struct inode *));
73extern int configfs_inode_init(void);
74extern void configfs_inode_exit(void);
75 73
76extern int configfs_create_file(struct config_item *, const struct configfs_attribute *); 74extern int configfs_create_file(struct config_item *, const struct configfs_attribute *);
77extern int configfs_make_dirent(struct configfs_dirent *, 75extern int configfs_make_dirent(struct configfs_dirent *,
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 5946ad98053f..65af86147154 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -50,12 +50,6 @@ static const struct address_space_operations configfs_aops = {
50 .write_end = simple_write_end, 50 .write_end = simple_write_end,
51}; 51};
52 52
53static struct backing_dev_info configfs_backing_dev_info = {
54 .name = "configfs",
55 .ra_pages = 0, /* No readahead */
56 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
57};
58
59static const struct inode_operations configfs_inode_operations ={ 53static const struct inode_operations configfs_inode_operations ={
60 .setattr = configfs_setattr, 54 .setattr = configfs_setattr,
61}; 55};
@@ -137,7 +131,6 @@ struct inode *configfs_new_inode(umode_t mode, struct configfs_dirent *sd,
137 if (inode) { 131 if (inode) {
138 inode->i_ino = get_next_ino(); 132 inode->i_ino = get_next_ino();
139 inode->i_mapping->a_ops = &configfs_aops; 133 inode->i_mapping->a_ops = &configfs_aops;
140 inode->i_mapping->backing_dev_info = &configfs_backing_dev_info;
141 inode->i_op = &configfs_inode_operations; 134 inode->i_op = &configfs_inode_operations;
142 135
143 if (sd->s_iattr) { 136 if (sd->s_iattr) {
@@ -283,13 +276,3 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name)
283 } 276 }
284 mutex_unlock(&dir->d_inode->i_mutex); 277 mutex_unlock(&dir->d_inode->i_mutex);
285} 278}
286
287int __init configfs_inode_init(void)
288{
289 return bdi_init(&configfs_backing_dev_info);
290}
291
292void configfs_inode_exit(void)
293{
294 bdi_destroy(&configfs_backing_dev_info);
295}
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index f6c285833390..da94e41bdbf6 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -145,19 +145,13 @@ static int __init configfs_init(void)
145 if (!config_kobj) 145 if (!config_kobj)
146 goto out2; 146 goto out2;
147 147
148 err = configfs_inode_init();
149 if (err)
150 goto out3;
151
152 err = register_filesystem(&configfs_fs_type); 148 err = register_filesystem(&configfs_fs_type);
153 if (err) 149 if (err)
154 goto out4; 150 goto out3;
155 151
156 return 0; 152 return 0;
157out4:
158 pr_err("Unable to register filesystem!\n");
159 configfs_inode_exit();
160out3: 153out3:
154 pr_err("Unable to register filesystem!\n");
161 kobject_put(config_kobj); 155 kobject_put(config_kobj);
162out2: 156out2:
163 kmem_cache_destroy(configfs_dir_cachep); 157 kmem_cache_destroy(configfs_dir_cachep);
@@ -172,7 +166,6 @@ static void __exit configfs_exit(void)
172 kobject_put(config_kobj); 166 kobject_put(config_kobj);
173 kmem_cache_destroy(configfs_dir_cachep); 167 kmem_cache_destroy(configfs_dir_cachep);
174 configfs_dir_cachep = NULL; 168 configfs_dir_cachep = NULL;
175 configfs_inode_exit();
176} 169}
177 170
178MODULE_AUTHOR("Oracle"); 171MODULE_AUTHOR("Oracle");
diff --git a/fs/dax.c b/fs/dax.c
new file mode 100644
index 000000000000..ed1619ec6537
--- /dev/null
+++ b/fs/dax.c
@@ -0,0 +1,534 @@
1/*
2 * fs/dax.c - Direct Access filesystem code
3 * Copyright (c) 2013-2014 Intel Corporation
4 * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
5 * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
6 *
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms and conditions of the GNU General Public License,
9 * version 2, as published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14 * more details.
15 */
16
17#include <linux/atomic.h>
18#include <linux/blkdev.h>
19#include <linux/buffer_head.h>
20#include <linux/fs.h>
21#include <linux/genhd.h>
22#include <linux/highmem.h>
23#include <linux/memcontrol.h>
24#include <linux/mm.h>
25#include <linux/mutex.h>
26#include <linux/sched.h>
27#include <linux/uio.h>
28#include <linux/vmstat.h>
29
30int dax_clear_blocks(struct inode *inode, sector_t block, long size)
31{
32 struct block_device *bdev = inode->i_sb->s_bdev;
33 sector_t sector = block << (inode->i_blkbits - 9);
34
35 might_sleep();
36 do {
37 void *addr;
38 unsigned long pfn;
39 long count;
40
41 count = bdev_direct_access(bdev, sector, &addr, &pfn, size);
42 if (count < 0)
43 return count;
44 BUG_ON(size < count);
45 while (count > 0) {
46 unsigned pgsz = PAGE_SIZE - offset_in_page(addr);
47 if (pgsz > count)
48 pgsz = count;
49 if (pgsz < PAGE_SIZE)
50 memset(addr, 0, pgsz);
51 else
52 clear_page(addr);
53 addr += pgsz;
54 size -= pgsz;
55 count -= pgsz;
56 BUG_ON(pgsz & 511);
57 sector += pgsz / 512;
58 cond_resched();
59 }
60 } while (size);
61
62 return 0;
63}
64EXPORT_SYMBOL_GPL(dax_clear_blocks);
65
66static long dax_get_addr(struct buffer_head *bh, void **addr, unsigned blkbits)
67{
68 unsigned long pfn;
69 sector_t sector = bh->b_blocknr << (blkbits - 9);
70 return bdev_direct_access(bh->b_bdev, sector, addr, &pfn, bh->b_size);
71}
72
73static void dax_new_buf(void *addr, unsigned size, unsigned first, loff_t pos,
74 loff_t end)
75{
76 loff_t final = end - pos + first; /* The final byte of the buffer */
77
78 if (first > 0)
79 memset(addr, 0, first);
80 if (final < size)
81 memset(addr + final, 0, size - final);
82}
83
84static bool buffer_written(struct buffer_head *bh)
85{
86 return buffer_mapped(bh) && !buffer_unwritten(bh);
87}
88
89/*
90 * When ext4 encounters a hole, it returns without modifying the buffer_head
91 * which means that we can't trust b_size. To cope with this, we set b_state
92 * to 0 before calling get_block and, if any bit is set, we know we can trust
93 * b_size. Unfortunate, really, since ext4 knows precisely how long a hole is
94 * and would save us time calling get_block repeatedly.
95 */
96static bool buffer_size_valid(struct buffer_head *bh)
97{
98 return bh->b_state != 0;
99}
100
101static ssize_t dax_io(int rw, struct inode *inode, struct iov_iter *iter,
102 loff_t start, loff_t end, get_block_t get_block,
103 struct buffer_head *bh)
104{
105 ssize_t retval = 0;
106 loff_t pos = start;
107 loff_t max = start;
108 loff_t bh_max = start;
109 void *addr;
110 bool hole = false;
111
112 if (rw != WRITE)
113 end = min(end, i_size_read(inode));
114
115 while (pos < end) {
116 unsigned len;
117 if (pos == max) {
118 unsigned blkbits = inode->i_blkbits;
119 sector_t block = pos >> blkbits;
120 unsigned first = pos - (block << blkbits);
121 long size;
122
123 if (pos == bh_max) {
124 bh->b_size = PAGE_ALIGN(end - pos);
125 bh->b_state = 0;
126 retval = get_block(inode, block, bh,
127 rw == WRITE);
128 if (retval)
129 break;
130 if (!buffer_size_valid(bh))
131 bh->b_size = 1 << blkbits;
132 bh_max = pos - first + bh->b_size;
133 } else {
134 unsigned done = bh->b_size -
135 (bh_max - (pos - first));
136 bh->b_blocknr += done >> blkbits;
137 bh->b_size -= done;
138 }
139
140 hole = (rw != WRITE) && !buffer_written(bh);
141 if (hole) {
142 addr = NULL;
143 size = bh->b_size - first;
144 } else {
145 retval = dax_get_addr(bh, &addr, blkbits);
146 if (retval < 0)
147 break;
148 if (buffer_unwritten(bh) || buffer_new(bh))
149 dax_new_buf(addr, retval, first, pos,
150 end);
151 addr += first;
152 size = retval - first;
153 }
154 max = min(pos + size, end);
155 }
156
157 if (rw == WRITE)
158 len = copy_from_iter(addr, max - pos, iter);
159 else if (!hole)
160 len = copy_to_iter(addr, max - pos, iter);
161 else
162 len = iov_iter_zero(max - pos, iter);
163
164 if (!len)
165 break;
166
167 pos += len;
168 addr += len;
169 }
170
171 return (pos == start) ? retval : pos - start;
172}
173
174/**
175 * dax_do_io - Perform I/O to a DAX file
176 * @rw: READ to read or WRITE to write
177 * @iocb: The control block for this I/O
178 * @inode: The file which the I/O is directed at
179 * @iter: The addresses to do I/O from or to
180 * @pos: The file offset where the I/O starts
181 * @get_block: The filesystem method used to translate file offsets to blocks
182 * @end_io: A filesystem callback for I/O completion
183 * @flags: See below
184 *
185 * This function uses the same locking scheme as do_blockdev_direct_IO:
186 * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
187 * caller for writes. For reads, we take and release the i_mutex ourselves.
188 * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
189 * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
190 * is in progress.
191 */
192ssize_t dax_do_io(int rw, struct kiocb *iocb, struct inode *inode,
193 struct iov_iter *iter, loff_t pos,
194 get_block_t get_block, dio_iodone_t end_io, int flags)
195{
196 struct buffer_head bh;
197 ssize_t retval = -EINVAL;
198 loff_t end = pos + iov_iter_count(iter);
199
200 memset(&bh, 0, sizeof(bh));
201
202 if ((flags & DIO_LOCKING) && (rw == READ)) {
203 struct address_space *mapping = inode->i_mapping;
204 mutex_lock(&inode->i_mutex);
205 retval = filemap_write_and_wait_range(mapping, pos, end - 1);
206 if (retval) {
207 mutex_unlock(&inode->i_mutex);
208 goto out;
209 }
210 }
211
212 /* Protects against truncate */
213 atomic_inc(&inode->i_dio_count);
214
215 retval = dax_io(rw, inode, iter, pos, end, get_block, &bh);
216
217 if ((flags & DIO_LOCKING) && (rw == READ))
218 mutex_unlock(&inode->i_mutex);
219
220 if ((retval > 0) && end_io)
221 end_io(iocb, pos, retval, bh.b_private);
222
223 inode_dio_done(inode);
224 out:
225 return retval;
226}
227EXPORT_SYMBOL_GPL(dax_do_io);
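
A hedged sketch of how a filesystem could wire its ->direct_IO method to dax_do_io() on this kernel's address_space_operations signature; my_direct_IO and my_get_block are hypothetical stand-ins for the fs's real callbacks, and no end_io completion is used:

#include <linux/fs.h>
#include <linux/uio.h>

static int my_get_block(struct inode *inode, sector_t block,
			struct buffer_head *bh, int create);	/* fs-provided */

static ssize_t my_direct_IO(int rw, struct kiocb *iocb,
			    struct iov_iter *iter, loff_t offset)
{
	struct inode *inode = file_inode(iocb->ki_filp);

	/* no end_io callback; dax_do_io handles i_mutex per DIO_LOCKING */
	return dax_do_io(rw, iocb, inode, iter, offset,
			 my_get_block, NULL, DIO_LOCKING);
}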
228
229/*
230 * The user has performed a load from a hole in the file. Allocating
231 * a new page in the file would cause excessive storage usage for
232 * workloads with sparse files. We allocate a page cache page instead.
233 * We'll kick it out of the page cache if it's ever written to,
234 * otherwise it will simply fall out of the page cache under memory
235 * pressure without ever having been dirtied.
236 */
237static int dax_load_hole(struct address_space *mapping, struct page *page,
238 struct vm_fault *vmf)
239{
240 unsigned long size;
241 struct inode *inode = mapping->host;
242 if (!page)
243 page = find_or_create_page(mapping, vmf->pgoff,
244 GFP_KERNEL | __GFP_ZERO);
245 if (!page)
246 return VM_FAULT_OOM;
247 /* Recheck i_size under page lock to avoid truncate race */
248 size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
249 if (vmf->pgoff >= size) {
250 unlock_page(page);
251 page_cache_release(page);
252 return VM_FAULT_SIGBUS;
253 }
254
255 vmf->page = page;
256 return VM_FAULT_LOCKED;
257}
258
259static int copy_user_bh(struct page *to, struct buffer_head *bh,
260 unsigned blkbits, unsigned long vaddr)
261{
262 void *vfrom, *vto;
263 if (dax_get_addr(bh, &vfrom, blkbits) < 0)
264 return -EIO;
265 vto = kmap_atomic(to);
266 copy_user_page(vto, vfrom, vaddr, to);
267 kunmap_atomic(vto);
268 return 0;
269}
270
271static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
272 struct vm_area_struct *vma, struct vm_fault *vmf)
273{
274 struct address_space *mapping = inode->i_mapping;
275 sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
276 unsigned long vaddr = (unsigned long)vmf->virtual_address;
277 void *addr;
278 unsigned long pfn;
279 pgoff_t size;
280 int error;
281
282 i_mmap_lock_read(mapping);
283
284 /*
285 * Check truncate didn't happen while we were allocating a block.
 286	 * If it did, this block may or may not still be allocated to the
287 * file. We can't tell the filesystem to free it because we can't
288 * take i_mutex here. In the worst case, the file still has blocks
289 * allocated past the end of the file.
290 */
291 size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
292 if (unlikely(vmf->pgoff >= size)) {
293 error = -EIO;
294 goto out;
295 }
296
297 error = bdev_direct_access(bh->b_bdev, sector, &addr, &pfn, bh->b_size);
298 if (error < 0)
299 goto out;
300 if (error < PAGE_SIZE) {
301 error = -EIO;
302 goto out;
303 }
304
305 if (buffer_unwritten(bh) || buffer_new(bh))
306 clear_page(addr);
307
308 error = vm_insert_mixed(vma, vaddr, pfn);
309
310 out:
311 i_mmap_unlock_read(mapping);
312
313 if (bh->b_end_io)
314 bh->b_end_io(bh, 1);
315
316 return error;
317}
318
319static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
320 get_block_t get_block)
321{
322 struct file *file = vma->vm_file;
323 struct address_space *mapping = file->f_mapping;
324 struct inode *inode = mapping->host;
325 struct page *page;
326 struct buffer_head bh;
327 unsigned long vaddr = (unsigned long)vmf->virtual_address;
328 unsigned blkbits = inode->i_blkbits;
329 sector_t block;
330 pgoff_t size;
331 int error;
332 int major = 0;
333
334 size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
335 if (vmf->pgoff >= size)
336 return VM_FAULT_SIGBUS;
337
338 memset(&bh, 0, sizeof(bh));
339 block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
340 bh.b_size = PAGE_SIZE;
341
342 repeat:
343 page = find_get_page(mapping, vmf->pgoff);
344 if (page) {
345 if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
346 page_cache_release(page);
347 return VM_FAULT_RETRY;
348 }
349 if (unlikely(page->mapping != mapping)) {
350 unlock_page(page);
351 page_cache_release(page);
352 goto repeat;
353 }
354 size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
355 if (unlikely(vmf->pgoff >= size)) {
356 /*
357 * We have a struct page covering a hole in the file
358 * from a read fault and we've raced with a truncate
359 */
360 error = -EIO;
361 goto unlock_page;
362 }
363 }
364
365 error = get_block(inode, block, &bh, 0);
366 if (!error && (bh.b_size < PAGE_SIZE))
367 error = -EIO; /* fs corruption? */
368 if (error)
369 goto unlock_page;
370
371 if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
372 if (vmf->flags & FAULT_FLAG_WRITE) {
373 error = get_block(inode, block, &bh, 1);
374 count_vm_event(PGMAJFAULT);
375 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
376 major = VM_FAULT_MAJOR;
377 if (!error && (bh.b_size < PAGE_SIZE))
378 error = -EIO;
379 if (error)
380 goto unlock_page;
381 } else {
382 return dax_load_hole(mapping, page, vmf);
383 }
384 }
385
386 if (vmf->cow_page) {
387 struct page *new_page = vmf->cow_page;
388 if (buffer_written(&bh))
389 error = copy_user_bh(new_page, &bh, blkbits, vaddr);
390 else
391 clear_user_highpage(new_page, vaddr);
392 if (error)
393 goto unlock_page;
394 vmf->page = page;
395 if (!page) {
396 i_mmap_lock_read(mapping);
397 /* Check we didn't race with truncate */
398 size = (i_size_read(inode) + PAGE_SIZE - 1) >>
399 PAGE_SHIFT;
400 if (vmf->pgoff >= size) {
401 i_mmap_unlock_read(mapping);
402 error = -EIO;
403 goto out;
404 }
405 }
406 return VM_FAULT_LOCKED;
407 }
408
409 /* Check we didn't race with a read fault installing a new page */
410 if (!page && major)
411 page = find_lock_page(mapping, vmf->pgoff);
412
413 if (page) {
414 unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
415 PAGE_CACHE_SIZE, 0);
416 delete_from_page_cache(page);
417 unlock_page(page);
418 page_cache_release(page);
419 }
420
421 error = dax_insert_mapping(inode, &bh, vma, vmf);
422
423 out:
424 if (error == -ENOMEM)
425 return VM_FAULT_OOM | major;
426 /* -EBUSY is fine, somebody else faulted on the same PTE */
427 if ((error < 0) && (error != -EBUSY))
428 return VM_FAULT_SIGBUS | major;
429 return VM_FAULT_NOPAGE | major;
430
431 unlock_page:
432 if (page) {
433 unlock_page(page);
434 page_cache_release(page);
435 }
436 goto out;
437}
438
439/**
440 * dax_fault - handle a page fault on a DAX file
441 * @vma: The virtual memory area where the fault occurred
442 * @vmf: The description of the fault
443 * @get_block: The filesystem method used to translate file offsets to blocks
444 *
445 * When a page fault occurs, filesystems may call this helper in their
446 * fault handler for DAX files.
447 */
448int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
449 get_block_t get_block)
450{
451 int result;
452 struct super_block *sb = file_inode(vma->vm_file)->i_sb;
453
454 if (vmf->flags & FAULT_FLAG_WRITE) {
455 sb_start_pagefault(sb);
456 file_update_time(vma->vm_file);
457 }
458 result = do_dax_fault(vma, vmf, get_block);
459 if (vmf->flags & FAULT_FLAG_WRITE)
460 sb_end_pagefault(sb);
461
462 return result;
463}
464EXPORT_SYMBOL_GPL(dax_fault);
465
466/**
467 * dax_zero_page_range - zero a range within a page of a DAX file
468 * @inode: The file being truncated
469 * @from: The file offset that is being truncated to
470 * @length: The number of bytes to zero
471 * @get_block: The filesystem method used to translate file offsets to blocks
472 *
473 * This function can be called by a filesystem when it is zeroing part of a
474 * page in a DAX file. This is intended for hole-punch operations. If
475 * you are truncating a file, the helper function dax_truncate_page() may be
476 * more convenient.
477 *
478 * We work in terms of PAGE_CACHE_SIZE here for commonality with
479 * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
480 * took care of disposing of the unnecessary blocks. Even if the filesystem
481 * block size is smaller than PAGE_SIZE, we have to zero the rest of the page
482 * since the file might be mmapped.
483 */
484int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
485 get_block_t get_block)
486{
487 struct buffer_head bh;
488 pgoff_t index = from >> PAGE_CACHE_SHIFT;
489 unsigned offset = from & (PAGE_CACHE_SIZE-1);
490 int err;
491
492 /* Block boundary? Nothing to do */
493 if (!length)
494 return 0;
495 BUG_ON((offset + length) > PAGE_CACHE_SIZE);
496
497 memset(&bh, 0, sizeof(bh));
498 bh.b_size = PAGE_CACHE_SIZE;
499 err = get_block(inode, index, &bh, 0);
500 if (err < 0)
501 return err;
502 if (buffer_written(&bh)) {
503 void *addr;
504 err = dax_get_addr(&bh, &addr, inode->i_blkbits);
505 if (err < 0)
506 return err;
507 memset(addr + offset, 0, length);
508 }
509
510 return 0;
511}
512EXPORT_SYMBOL_GPL(dax_zero_page_range);
513
514/**
515 * dax_truncate_page - handle a partial page being truncated in a DAX file
516 * @inode: The file being truncated
517 * @from: The file offset that is being truncated to
518 * @get_block: The filesystem method used to translate file offsets to blocks
519 *
520 * Similar to block_truncate_page(), this function can be called by a
521 * filesystem when it is truncating a DAX file to handle the partial page.
522 *
523 * We work in terms of PAGE_CACHE_SIZE here for commonality with
524 * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
525 * took care of disposing of the unnecessary blocks. Even if the filesystem
526 * block size is smaller than PAGE_SIZE, we have to zero the rest of the page
527 * since the file might be mmapped.
528 */
529int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
530{
531 unsigned length = PAGE_CACHE_ALIGN(from) - from;
532 return dax_zero_page_range(inode, from, length, get_block);
533}
534EXPORT_SYMBOL_GPL(dax_truncate_page);
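
For orientation, the ext2 conversion later in this series (fs/ext2/inode.c below) wires these helpers up as follows; this is a condensed sketch using ext2's names, not an excerpt of any single hunk:

	/* direct I/O: route DAX inodes through dax_do_io() (cf. ext2_direct_IO) */
	if (IS_DAX(inode))
		ret = dax_do_io(rw, iocb, inode, iter, offset, ext2_get_block,
				NULL, DIO_LOCKING);
	else
		ret = blockdev_direct_IO(rw, iocb, inode, iter, offset,
					 ext2_get_block);

	/* truncate: zero the partial tail page in place (cf. ext2_setsize) */
	if (IS_DAX(inode))
		error = dax_truncate_page(inode, newsize, ext2_get_block);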
diff --git a/fs/dcache.c b/fs/dcache.c
index e368d4f412f9..dc400fd29f4d 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -38,6 +38,8 @@
 #include <linux/prefetch.h>
 #include <linux/ratelimit.h>
 #include <linux/list_lru.h>
+#include <linux/kasan.h>
+
 #include "internal.h"
 #include "mount.h"
 
@@ -400,19 +402,20 @@ static void d_shrink_add(struct dentry *dentry, struct list_head *list)
  * LRU lists entirely, while shrink_move moves it to the indicated
  * private list.
  */
-static void d_lru_isolate(struct dentry *dentry)
+static void d_lru_isolate(struct list_lru_one *lru, struct dentry *dentry)
 {
 	D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
 	dentry->d_flags &= ~DCACHE_LRU_LIST;
 	this_cpu_dec(nr_dentry_unused);
-	list_del_init(&dentry->d_lru);
+	list_lru_isolate(lru, &dentry->d_lru);
 }
 
-static void d_lru_shrink_move(struct dentry *dentry, struct list_head *list)
+static void d_lru_shrink_move(struct list_lru_one *lru, struct dentry *dentry,
+			      struct list_head *list)
 {
 	D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
 	dentry->d_flags |= DCACHE_SHRINK_LIST;
-	list_move_tail(&dentry->d_lru, list);
+	list_lru_isolate_move(lru, &dentry->d_lru, list);
 }
 
 /*
@@ -508,7 +511,7 @@ static void __dentry_kill(struct dentry *dentry)
 	 * dentry_iput drops the locks, at which point nobody (except
 	 * transient RCU lookups) can reach this dentry.
 	 */
-	BUG_ON((int)dentry->d_lockref.count > 0);
+	BUG_ON(dentry->d_lockref.count > 0);
 	this_cpu_dec(nr_dentry);
 	if (dentry->d_op && dentry->d_op->d_release)
 		dentry->d_op->d_release(dentry);
@@ -561,7 +564,7 @@ static inline struct dentry *lock_parent(struct dentry *dentry)
 	struct dentry *parent = dentry->d_parent;
 	if (IS_ROOT(dentry))
 		return NULL;
-	if (unlikely((int)dentry->d_lockref.count < 0))
+	if (unlikely(dentry->d_lockref.count < 0))
 		return NULL;
 	if (likely(spin_trylock(&parent->d_lock)))
 		return parent;
@@ -590,6 +593,110 @@ again:
 	return parent;
 }
 
+/*
+ * Try to do a lockless dput(), and return whether that was successful.
+ *
+ * If unsuccessful, we return false, having already taken the dentry lock.
+ *
+ * The caller needs to hold the RCU read lock, so that the dentry is
+ * guaranteed to stay around even if the refcount goes down to zero!
+ */
+static inline bool fast_dput(struct dentry *dentry)
+{
+	int ret;
+	unsigned int d_flags;
+
+	/*
+	 * If we have a d_op->d_delete() operation, we should not
+	 * let the dentry count go to zero, so use "put_or_lock".
+	 */
+	if (unlikely(dentry->d_flags & DCACHE_OP_DELETE))
+		return lockref_put_or_lock(&dentry->d_lockref);
+
+	/*
+	 * .. otherwise, we can try to just decrement the
+	 * lockref optimistically.
+	 */
+	ret = lockref_put_return(&dentry->d_lockref);
+
+	/*
+	 * If the lockref_put_return() failed due to the lock being held
+	 * by somebody else, the fast path has failed. We will need to
+	 * get the lock, and then check the count again.
+	 */
+	if (unlikely(ret < 0)) {
+		spin_lock(&dentry->d_lock);
+		if (dentry->d_lockref.count > 1) {
+			dentry->d_lockref.count--;
+			spin_unlock(&dentry->d_lock);
+			return 1;
+		}
+		return 0;
+	}
+
+	/*
+	 * If we weren't the last ref, we're done.
+	 */
+	if (ret)
+		return 1;
+
+	/*
+	 * Careful, careful. The reference count went down
+	 * to zero, but we don't hold the dentry lock, so
+	 * somebody else could get it again, and do another
+	 * dput(), and we need to not race with that.
+	 *
+	 * However, there is a very special and common case
+	 * where we don't care, because there is nothing to
+	 * do: the dentry is still hashed, it does not have
+	 * a 'delete' op, and it's referenced and already on
+	 * the LRU list.
+	 *
+	 * NOTE! Since we aren't locked, these values are
+	 * not "stable". However, it is sufficient that at
+	 * some point after we dropped the reference the
+	 * dentry was hashed and the flags had the proper
+	 * value. Other dentry users may have re-gotten
+	 * a reference to the dentry and change that, but
+	 * our work is done - we can leave the dentry
+	 * around with a zero refcount.
+	 */
+	smp_rmb();
+	d_flags = ACCESS_ONCE(dentry->d_flags);
+	d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST;
+
+	/* Nothing to do? Dropping the reference was all we needed? */
+	if (d_flags == (DCACHE_REFERENCED | DCACHE_LRU_LIST) && !d_unhashed(dentry))
+		return 1;
+
+	/*
+	 * Not the fast normal case? Get the lock. We've already decremented
+	 * the refcount, but we'll need to re-check the situation after
+	 * getting the lock.
+	 */
+	spin_lock(&dentry->d_lock);
+
+	/*
+	 * Did somebody else grab a reference to it in the meantime, and
+	 * we're no longer the last user after all? Alternatively, somebody
+	 * else could have killed it and marked it dead. Either way, we
+	 * don't need to do anything else.
+	 */
+	if (dentry->d_lockref.count) {
+		spin_unlock(&dentry->d_lock);
+		return 1;
+	}
+
+	/*
+	 * Re-get the reference we optimistically dropped. We hold the
+	 * lock, and we just tested that it was zero, so we can just
+	 * set it to 1.
+	 */
+	dentry->d_lockref.count = 1;
+	return 0;
+}
+
+
 /*
  * This is dput
  *
@@ -622,8 +729,14 @@ void dput(struct dentry *dentry)
 		return;
 
 repeat:
-	if (lockref_put_or_lock(&dentry->d_lockref))
+	rcu_read_lock();
+	if (likely(fast_dput(dentry))) {
+		rcu_read_unlock();
 		return;
+	}
+
+	/* Slow case: now with the dentry lock held */
+	rcu_read_unlock();
 
 	/* Unreachable? Get rid of it */
 	if (unlikely(d_unhashed(dentry)))
@@ -810,7 +923,7 @@ static void shrink_dentry_list(struct list_head *list)
 		 * We found an inuse dentry which was not removed from
 		 * the LRU because of laziness during lookup. Do not free it.
 		 */
-		if ((int)dentry->d_lockref.count > 0) {
+		if (dentry->d_lockref.count > 0) {
 			spin_unlock(&dentry->d_lock);
 			if (parent)
 				spin_unlock(&parent->d_lock);
@@ -869,8 +982,8 @@ static void shrink_dentry_list(struct list_head *list)
 	}
 }
 
-static enum lru_status
-dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
+static enum lru_status dentry_lru_isolate(struct list_head *item,
+		struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
 {
 	struct list_head *freeable = arg;
 	struct dentry	*dentry = container_of(item, struct dentry, d_lru);
@@ -890,7 +1003,7 @@ dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
 	 * another pass through the LRU.
 	 */
 	if (dentry->d_lockref.count) {
-		d_lru_isolate(dentry);
+		d_lru_isolate(lru, dentry);
 		spin_unlock(&dentry->d_lock);
 		return LRU_REMOVED;
 	}
@@ -921,7 +1034,7 @@ dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
 		return LRU_ROTATE;
 	}
 
-	d_lru_shrink_move(dentry, freeable);
+	d_lru_shrink_move(lru, dentry, freeable);
 	spin_unlock(&dentry->d_lock);
 
 	return LRU_REMOVED;
@@ -930,30 +1043,28 @@ dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
 /**
  * prune_dcache_sb - shrink the dcache
  * @sb: superblock
- * @nr_to_scan : number of entries to try to free
- * @nid: which node to scan for freeable entities
+ * @sc: shrink control, passed to list_lru_shrink_walk()
  *
- * Attempt to shrink the superblock dcache LRU by @nr_to_scan entries. This is
- * done when we need more memory an called from the superblock shrinker
- * function.
+ * Attempt to shrink the superblock dcache LRU by @sc->nr_to_scan entries. This
+ * is done when we need more memory and called from the superblock shrinker
+ * function.
  *
  * This function may fail to free any resources if all the dentries are in
  * use.
  */
-long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan,
-		     int nid)
+long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc)
 {
 	LIST_HEAD(dispose);
 	long freed;
 
-	freed = list_lru_walk_node(&sb->s_dentry_lru, nid, dentry_lru_isolate,
-				   &dispose, &nr_to_scan);
+	freed = list_lru_shrink_walk(&sb->s_dentry_lru, sc,
+				     dentry_lru_isolate, &dispose);
 	shrink_dentry_list(&dispose);
 	return freed;
 }
 
 static enum lru_status dentry_lru_isolate_shrink(struct list_head *item,
-		spinlock_t *lru_lock, void *arg)
+		struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
 {
 	struct list_head *freeable = arg;
 	struct dentry	*dentry = container_of(item, struct dentry, d_lru);
@@ -966,7 +1077,7 @@ static enum lru_status dentry_lru_isolate_shrink(struct list_head *item,
 	if (!spin_trylock(&dentry->d_lock))
 		return LRU_SKIP;
 
-	d_lru_shrink_move(dentry, freeable);
+	d_lru_shrink_move(lru, dentry, freeable);
 	spin_unlock(&dentry->d_lock);
 
 	return LRU_REMOVED;
@@ -1430,6 +1541,9 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
 		}
 		atomic_set(&p->u.count, 1);
 		dname = p->name;
+		if (IS_ENABLED(CONFIG_DCACHE_WORD_ACCESS))
+			kasan_unpoison_shadow(dname,
+				round_up(name->len + 1, sizeof(unsigned long)));
 	} else {
 		dname = dentry->d_iname;
 	}
@@ -2187,37 +2301,6 @@ struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name)
 }
 EXPORT_SYMBOL(d_hash_and_lookup);
 
-/**
- * d_validate - verify dentry provided from insecure source (deprecated)
- * @dentry: The dentry alleged to be valid child of @dparent
- * @dparent: The parent dentry (known to be valid)
- *
- * An insecure source has sent us a dentry, here we verify it and dget() it.
- * This is used by ncpfs in its readdir implementation.
- * Zero is returned in the dentry is invalid.
- *
- * This function is slow for big directories, and deprecated, do not use it.
- */
-int d_validate(struct dentry *dentry, struct dentry *dparent)
-{
-	struct dentry *child;
-
-	spin_lock(&dparent->d_lock);
-	list_for_each_entry(child, &dparent->d_subdirs, d_child) {
-		if (dentry == child) {
-			spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
-			__dget_dlock(dentry);
-			spin_unlock(&dentry->d_lock);
-			spin_unlock(&dparent->d_lock);
-			return 1;
-		}
-	}
-	spin_unlock(&dparent->d_lock);
-
-	return 0;
-}
-EXPORT_SYMBOL(d_validate);
-
 /*
  * When a file is deleted, we have two options:
  * - turn this dentry into a negative dentry
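
The prune_dcache_sb() signature change above means the superblock shrinker now hands its shrink_control straight through to the LRU walk. A minimal sketch of the caller side (the real caller is super_cache_scan() in fs/super.c, which is not part of this hunk):

static unsigned long super_cache_scan_sketch(struct shrinker *shrink,
					     struct shrink_control *sc)
{
	struct super_block *sb = container_of(shrink, struct super_block,
					      s_shrink);

	/* sc->nr_to_scan plus the node/memcg information all flow
	 * through to list_lru_shrink_walk() inside prune_dcache_sb() */
	return prune_dcache_sb(sb, sc);
}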
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 05f2960ed7c3..45b18a5e225c 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -34,93 +34,16 @@ static struct vfsmount *debugfs_mount;
 static int debugfs_mount_count;
 static bool debugfs_registered;
 
-static struct inode *debugfs_get_inode(struct super_block *sb, umode_t mode, dev_t dev,
-				       void *data, const struct file_operations *fops)
-
+static struct inode *debugfs_get_inode(struct super_block *sb)
 {
 	struct inode *inode = new_inode(sb);
-
 	if (inode) {
 		inode->i_ino = get_next_ino();
-		inode->i_mode = mode;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-		switch (mode & S_IFMT) {
-		default:
-			init_special_inode(inode, mode, dev);
-			break;
-		case S_IFREG:
-			inode->i_fop = fops ? fops : &debugfs_file_operations;
-			inode->i_private = data;
-			break;
-		case S_IFLNK:
-			inode->i_op = &debugfs_link_operations;
-			inode->i_private = data;
-			break;
-		case S_IFDIR:
-			inode->i_op = &simple_dir_inode_operations;
-			inode->i_fop = &simple_dir_operations;
-
-			/* directory inodes start off with i_nlink == 2
-			 * (for "." entry) */
-			inc_nlink(inode);
-			break;
-		}
 	}
 	return inode;
 }
 
-/* SMP-safe */
-static int debugfs_mknod(struct inode *dir, struct dentry *dentry,
-			 umode_t mode, dev_t dev, void *data,
-			 const struct file_operations *fops)
-{
-	struct inode *inode;
-	int error = -EPERM;
-
-	if (dentry->d_inode)
-		return -EEXIST;
-
-	inode = debugfs_get_inode(dir->i_sb, mode, dev, data, fops);
-	if (inode) {
-		d_instantiate(dentry, inode);
-		dget(dentry);
-		error = 0;
-	}
-	return error;
-}
-
-static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
-{
-	int res;
-
-	mode = (mode & (S_IRWXUGO | S_ISVTX)) | S_IFDIR;
-	res = debugfs_mknod(dir, dentry, mode, 0, NULL, NULL);
-	if (!res) {
-		inc_nlink(dir);
-		fsnotify_mkdir(dir, dentry);
-	}
-	return res;
-}
-
-static int debugfs_link(struct inode *dir, struct dentry *dentry, umode_t mode,
-			void *data)
-{
-	mode = (mode & S_IALLUGO) | S_IFLNK;
-	return debugfs_mknod(dir, dentry, mode, 0, data, NULL);
-}
-
-static int debugfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-			  void *data, const struct file_operations *fops)
-{
-	int res;
-
-	mode = (mode & S_IALLUGO) | S_IFREG;
-	res = debugfs_mknod(dir, dentry, mode, 0, data, fops);
-	if (!res)
-		fsnotify_create(dir, dentry);
-	return res;
-}
-
 static inline int debugfs_positive(struct dentry *dentry)
 {
 	return dentry->d_inode && !d_unhashed(dentry);
@@ -252,6 +175,18 @@ static const struct super_operations debugfs_super_operations = {
 	.show_options	= debugfs_show_options,
 };
 
+static struct vfsmount *debugfs_automount(struct path *path)
+{
+	struct vfsmount *(*f)(void *);
+	f = (struct vfsmount *(*)(void *))path->dentry->d_fsdata;
+	return f(path->dentry->d_inode->i_private);
+}
+
+static const struct dentry_operations debugfs_dops = {
+	.d_delete = always_delete_dentry,
+	.d_automount = debugfs_automount,
+};
+
 static int debug_fill_super(struct super_block *sb, void *data, int silent)
 {
 	static struct tree_descr debug_files[] = {{""}};
@@ -276,6 +211,7 @@ static int debug_fill_super(struct super_block *sb, void *data, int silent)
 		goto fail;
 
 	sb->s_op = &debugfs_super_operations;
+	sb->s_d_op = &debugfs_dops;
 
 	debugfs_apply_options(sb);
 
@@ -302,11 +238,9 @@ static struct file_system_type debug_fs_type = {
 };
 MODULE_ALIAS_FS("debugfs");
 
-static struct dentry *__create_file(const char *name, umode_t mode,
-				    struct dentry *parent, void *data,
-				    const struct file_operations *fops)
+static struct dentry *start_creating(const char *name, struct dentry *parent)
 {
-	struct dentry *dentry = NULL;
+	struct dentry *dentry;
 	int error;
 
 	pr_debug("debugfs: creating file '%s'\n",name);
@@ -314,7 +248,7 @@ static struct dentry *__create_file(const char *name, umode_t mode,
 	error = simple_pin_fs(&debug_fs_type, &debugfs_mount,
 			      &debugfs_mount_count);
 	if (error)
-		goto exit;
+		return ERR_PTR(error);
 
 	/* If the parent is not specified, we create it in the root.
 	 * We need the root dentry to do this, which is in the super
@@ -326,31 +260,26 @@ static struct dentry *__create_file(const char *name, umode_t mode,
 
 	mutex_lock(&parent->d_inode->i_mutex);
 	dentry = lookup_one_len(name, parent, strlen(name));
-	if (!IS_ERR(dentry)) {
-		switch (mode & S_IFMT) {
-		case S_IFDIR:
-			error = debugfs_mkdir(parent->d_inode, dentry, mode);
-
-			break;
-		case S_IFLNK:
-			error = debugfs_link(parent->d_inode, dentry, mode,
-					     data);
-			break;
-		default:
-			error = debugfs_create(parent->d_inode, dentry, mode,
-					       data, fops);
-			break;
-		}
+	if (!IS_ERR(dentry) && dentry->d_inode) {
 		dput(dentry);
-	} else
-		error = PTR_ERR(dentry);
-	mutex_unlock(&parent->d_inode->i_mutex);
-
-	if (error) {
-		dentry = NULL;
-		simple_release_fs(&debugfs_mount, &debugfs_mount_count);
+		dentry = ERR_PTR(-EEXIST);
 	}
-exit:
+	if (IS_ERR(dentry))
+		mutex_unlock(&parent->d_inode->i_mutex);
+	return dentry;
+}
+
+static struct dentry *failed_creating(struct dentry *dentry)
+{
+	mutex_unlock(&dentry->d_parent->d_inode->i_mutex);
+	dput(dentry);
+	simple_release_fs(&debugfs_mount, &debugfs_mount_count);
+	return NULL;
+}
+
+static struct dentry *end_creating(struct dentry *dentry)
+{
+	mutex_unlock(&dentry->d_parent->d_inode->i_mutex);
 	return dentry;
 }
 
@@ -384,19 +313,71 @@ struct dentry *debugfs_create_file(const char *name, umode_t mode,
 				   struct dentry *parent, void *data,
 				   const struct file_operations *fops)
 {
-	switch (mode & S_IFMT) {
-	case S_IFREG:
-	case 0:
-		break;
-	default:
-		BUG();
-	}
+	struct dentry *dentry;
+	struct inode *inode;
+
+	if (!(mode & S_IFMT))
+		mode |= S_IFREG;
+	BUG_ON(!S_ISREG(mode));
+	dentry = start_creating(name, parent);
+
+	if (IS_ERR(dentry))
+		return NULL;
 
-	return __create_file(name, mode, parent, data, fops);
+	inode = debugfs_get_inode(dentry->d_sb);
+	if (unlikely(!inode))
+		return failed_creating(dentry);
+
+	inode->i_mode = mode;
+	inode->i_fop = fops ? fops : &debugfs_file_operations;
+	inode->i_private = data;
+	d_instantiate(dentry, inode);
+	fsnotify_create(dentry->d_parent->d_inode, dentry);
+	return end_creating(dentry);
 }
 EXPORT_SYMBOL_GPL(debugfs_create_file);
 
 /**
+ * debugfs_create_file_size - create a file in the debugfs filesystem
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have.
+ * @parent: a pointer to the parent dentry for this file.  This should be a
+ *	    directory dentry if set.  If this parameter is NULL, then the
+ *	    file will be created in the root of the debugfs filesystem.
+ * @data: a pointer to something that the caller will want to get to later
+ *	  on.  The inode.i_private pointer will point to this value on
+ *	  the open() call.
+ * @fops: a pointer to a struct file_operations that should be used for
+ *	  this file.
+ * @file_size: initial file size
+ *
+ * This is the basic "create a file" function for debugfs.  It allows for a
+ * wide range of flexibility in creating a file, or a directory (if you want
+ * to create a directory, the debugfs_create_dir() function is
+ * recommended to be used instead.)
+ *
+ * This function will return a pointer to a dentry if it succeeds.  This
+ * pointer must be passed to the debugfs_remove() function when the file is
+ * to be removed (no automatic cleanup happens if your module is unloaded,
+ * you are responsible here.)  If an error occurs, %NULL will be returned.
+ *
+ * If debugfs is not enabled in the kernel, the value -%ENODEV will be
+ * returned.
+ */
+struct dentry *debugfs_create_file_size(const char *name, umode_t mode,
+					struct dentry *parent, void *data,
+					const struct file_operations *fops,
+					loff_t file_size)
+{
+	struct dentry *de = debugfs_create_file(name, mode, parent, data, fops);
+
+	if (de)
+		de->d_inode->i_size = file_size;
+	return de;
+}
+EXPORT_SYMBOL_GPL(debugfs_create_file_size);
+
+/**
  * debugfs_create_dir - create a directory in the debugfs filesystem
  * @name: a pointer to a string containing the name of the directory to
  *	  create.
@@ -416,12 +397,65 @@ EXPORT_SYMBOL_GPL(debugfs_create_file);
  */
 struct dentry *debugfs_create_dir(const char *name, struct dentry *parent)
 {
-	return __create_file(name, S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO,
-			     parent, NULL, NULL);
+	struct dentry *dentry = start_creating(name, parent);
+	struct inode *inode;
+
+	if (IS_ERR(dentry))
+		return NULL;
+
+	inode = debugfs_get_inode(dentry->d_sb);
+	if (unlikely(!inode))
+		return failed_creating(dentry);
+
+	inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
+	inode->i_op = &simple_dir_inode_operations;
+	inode->i_fop = &simple_dir_operations;
+
+	/* directory inodes start off with i_nlink == 2 (for "." entry) */
+	inc_nlink(inode);
+	d_instantiate(dentry, inode);
+	inc_nlink(dentry->d_parent->d_inode);
+	fsnotify_mkdir(dentry->d_parent->d_inode, dentry);
+	return end_creating(dentry);
 }
 EXPORT_SYMBOL_GPL(debugfs_create_dir);
 
 /**
+ * debugfs_create_automount - create automount point in the debugfs filesystem
+ * @name: a pointer to a string containing the name of the file to create.
+ * @parent: a pointer to the parent dentry for this file.  This should be a
+ *	    directory dentry if set.  If this parameter is NULL, then the
+ *	    file will be created in the root of the debugfs filesystem.
+ * @f: function to be called when pathname resolution steps on that one.
+ * @data: opaque argument to pass to f().
+ *
+ * @f should return what ->d_automount() would.
+ */
+struct dentry *debugfs_create_automount(const char *name,
+					struct dentry *parent,
+					struct vfsmount *(*f)(void *),
+					void *data)
+{
+	struct dentry *dentry = start_creating(name, parent);
+	struct inode *inode;
+
+	if (IS_ERR(dentry))
+		return NULL;
+
+	inode = debugfs_get_inode(dentry->d_sb);
+	if (unlikely(!inode))
+		return failed_creating(dentry);
+
+	inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
+	inode->i_flags |= S_AUTOMOUNT;
+	inode->i_private = data;
+	dentry->d_fsdata = (void *)f;
+	d_instantiate(dentry, inode);
+	return end_creating(dentry);
+}
+EXPORT_SYMBOL(debugfs_create_automount);
+
+/**
  * debugfs_create_symlink- create a symbolic link in the debugfs filesystem
  * @name: a pointer to a string containing the name of the symbolic link to
  *	  create.
@@ -447,17 +481,28 @@ EXPORT_SYMBOL_GPL(debugfs_create_dir);
 struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent,
 				      const char *target)
 {
-	struct dentry *result;
-	char *link;
-
-	link = kstrdup(target, GFP_KERNEL);
+	struct dentry *dentry;
+	struct inode *inode;
+	char *link = kstrdup(target, GFP_KERNEL);
 	if (!link)
 		return NULL;
 
-	result = __create_file(name, S_IFLNK | S_IRWXUGO, parent, link, NULL);
-	if (!result)
+	dentry = start_creating(name, parent);
+	if (IS_ERR(dentry)) {
 		kfree(link);
-	return result;
+		return NULL;
+	}
+
+	inode = debugfs_get_inode(dentry->d_sb);
+	if (unlikely(!inode)) {
+		kfree(link);
+		return failed_creating(dentry);
+	}
+	inode->i_mode = S_IFLNK | S_IRWXUGO;
+	inode->i_op = &debugfs_link_operations;
+	inode->i_private = link;
+	d_instantiate(dentry, inode);
+	return end_creating(dentry);
 }
 EXPORT_SYMBOL_GPL(debugfs_create_symlink);
 
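A usage sketch for the two new debugfs entry points; every mydrv_* name below is invented for illustration and is not part of the patch:

static struct dentry *mydrv_dir;

static int __init mydrv_debugfs_init(void)
{
	mydrv_dir = debugfs_create_dir("mydrv", NULL);
	if (!mydrv_dir)
		return -ENODEV;

	/* fixed-size file: i_size is set up front, so stat() reports the
	 * real size instead of 0 */
	debugfs_create_file_size("regs", 0444, mydrv_dir, &mydrv_state,
				 &mydrv_regs_fops, 4096);

	/* automount point: mydrv_automount() must return a vfsmount, as
	 * ->d_automount() would */
	debugfs_create_automount("trace", mydrv_dir, mydrv_automount,
				 &mydrv_state);
	return 0;
}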
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index e7cfbaf8d0e2..1e6e227134d7 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -56,13 +56,8 @@ static int send_data(struct sk_buff *skb)
 {
 	struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
 	void *data = genlmsg_data(genlhdr);
-	int rv;
 
-	rv = genlmsg_end(skb, data);
-	if (rv < 0) {
-		nlmsg_free(skb);
-		return rv;
-	}
+	genlmsg_end(skb, data);
 
 	return genlmsg_unicast(&init_net, skb, listener_nlportid);
 }
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 2bc2c87f35e7..5718cb9f7273 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -37,20 +37,6 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
 	iput(toput_inode);
 }
 
-static void drop_slab(void)
-{
-	int nr_objects;
-
-	do {
-		int nid;
-
-		nr_objects = 0;
-		for_each_online_node(nid)
-			nr_objects += shrink_node_slabs(GFP_KERNEL, nid,
-							1000, 1000);
-	} while (nr_objects > 10);
-}
-
 int drop_caches_sysctl_handler(struct ctl_table *table, int write,
 	void __user *buffer, size_t *length, loff_t *ppos)
 {
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 1686dc2da9fd..34b36a504059 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -67,7 +67,6 @@ static int ecryptfs_inode_set(struct inode *inode, void *opaque)
 	inode->i_ino = lower_inode->i_ino;
 	inode->i_version++;
 	inode->i_mapping->a_ops = &ecryptfs_aops;
-	inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi;
 
 	if (S_ISLNK(inode->i_mode))
 		inode->i_op = &ecryptfs_symlink_iops;
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index d9eb84bda559..1895d60f4122 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -520,7 +520,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
 		goto out;
 	}
 
-	rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs", BDI_CAP_MAP_COPY);
+	rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs");
 	if (rc)
 		goto out1;
 
diff --git a/fs/efivarfs/Kconfig b/fs/efivarfs/Kconfig
index 367bbb10c543..c2499ef174a2 100644
--- a/fs/efivarfs/Kconfig
+++ b/fs/efivarfs/Kconfig
@@ -1,6 +1,7 @@
 config EFIVAR_FS
 	tristate "EFI Variable filesystem"
 	depends on EFI
+	default m
 	help
 	  efivarfs is a replacement filesystem for the old EFI
 	  variable support via sysfs, as it doesn't suffer from the
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index 6dad1176ec52..ddbce42548c9 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -140,7 +140,7 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor,
 
 	name[len] = '-';
 
-	efi_guid_unparse(&entry->var.VendorGuid, name + len + 1);
+	efi_guid_to_str(&entry->var.VendorGuid, name + len + 1);
 
 	name[len + EFI_VARIABLE_GUID_LEN+1] = '\0';
 
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 4b0a226024fa..8d0c0df01854 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -118,18 +118,18 @@ static unsigned int eventfd_poll(struct file *file, poll_table *wait)
 {
 	struct eventfd_ctx *ctx = file->private_data;
 	unsigned int events = 0;
-	unsigned long flags;
+	u64 count;
 
 	poll_wait(file, &ctx->wqh, wait);
+	smp_rmb();
+	count = ctx->count;
 
-	spin_lock_irqsave(&ctx->wqh.lock, flags);
-	if (ctx->count > 0)
+	if (count > 0)
 		events |= POLLIN;
-	if (ctx->count == ULLONG_MAX)
+	if (count == ULLONG_MAX)
 		events |= POLLERR;
-	if (ULLONG_MAX - 1 > ctx->count)
+	if (ULLONG_MAX - 1 > count)
 		events |= POLLOUT;
-	spin_unlock_irqrestore(&ctx->wqh.lock, flags);
 
 	return events;
 }
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index d77f94491352..1e009cad8d5c 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1639,9 +1639,9 @@ fetch_events:
 
 			spin_lock_irqsave(&ep->lock, flags);
 		}
-		__remove_wait_queue(&ep->wq, &wait);
 
-		set_current_state(TASK_RUNNING);
+		__remove_wait_queue(&ep->wq, &wait);
+		__set_current_state(TASK_RUNNING);
 	}
 check_events:
 	/* Is it worth to try to dig for events ? */
diff --git a/fs/exec.c b/fs/exec.c
index ad8798e26be9..c7f9b733406d 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -794,8 +794,14 @@ exit:
 
 struct file *open_exec(const char *name)
 {
-	struct filename tmp = { .name = name };
-	return do_open_execat(AT_FDCWD, &tmp, 0);
+	struct filename *filename = getname_kernel(name);
+	struct file *f = ERR_CAST(filename);
+
+	if (!IS_ERR(filename)) {
+		f = do_open_execat(AT_FDCWD, filename, 0);
+		putname(filename);
+	}
+	return f;
 }
 EXPORT_SYMBOL(open_exec);
 
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index f1d3d4eb8c4f..a198e94813fe 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -985,7 +985,6 @@ const struct address_space_operations exofs_aops = {
 	.direct_IO	= exofs_direct_IO,
 
 	/* With these NULL has special meaning or default is not exported */
-	.get_xip_mem	= NULL,
 	.migratepage	= NULL,
 	.launder_page	= NULL,
 	.is_partially_uptodate = NULL,
@@ -1214,7 +1213,6 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
 		memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data));
 	}
 
-	inode->i_mapping->backing_dev_info = sb->s_bdi;
 	if (S_ISREG(inode->i_mode)) {
 		inode->i_op = &exofs_file_inode_operations;
 		inode->i_fop = &exofs_file_operations;
@@ -1314,7 +1312,6 @@ struct inode *exofs_new_inode(struct inode *dir, umode_t mode)
 
 	set_obj_2bcreated(oi);
 
-	inode->i_mapping->backing_dev_info = sb->s_bdi;
 	inode_init_owner(inode, dir, mode);
 	inode->i_ino = sbi->s_nextid++;
 	inode->i_blkbits = EXOFS_BLKSHIFT;
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 95965503afcb..fcc2e565f540 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -836,7 +836,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 		goto free_sbi;
 	}
 
-	ret = bdi_setup_and_register(&sbi->bdi, "exofs", BDI_CAP_MAP_COPY);
+	ret = bdi_setup_and_register(&sbi->bdi, "exofs");
 	if (ret) {
 		EXOFS_DBGMSG("Failed to bdi_setup_and_register\n");
 		dput(sb->s_root);
diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig
index 14a6780fd034..c634874e12d9 100644
--- a/fs/ext2/Kconfig
+++ b/fs/ext2/Kconfig
@@ -42,14 +42,3 @@ config EXT2_FS_SECURITY
 
 	  If you are not using a security module that requires using
 	  extended attributes for file security labels, say N.
-
-config EXT2_FS_XIP
-	bool "Ext2 execute in place support"
-	depends on EXT2_FS && MMU
-	help
-	  Execute in place can be used on memory-backed block devices. If you
-	  enable this option, you can select to mount block devices which are
-	  capable of this feature without using the page cache.
-
-	  If you do not use a block device that is capable of using this,
-	  or if unsure, say N.
diff --git a/fs/ext2/Makefile b/fs/ext2/Makefile
index f42af45cfd88..445b0e996a12 100644
--- a/fs/ext2/Makefile
+++ b/fs/ext2/Makefile
@@ -10,4 +10,3 @@ ext2-y := balloc.o dir.o file.o ialloc.o inode.o \
 ext2-$(CONFIG_EXT2_FS_XATTR)	 += xattr.o xattr_user.o xattr_trusted.o
 ext2-$(CONFIG_EXT2_FS_POSIX_ACL) += acl.o
 ext2-$(CONFIG_EXT2_FS_SECURITY)	 += xattr_security.o
-ext2-$(CONFIG_EXT2_FS_XIP)	 += xip.o
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index e4279ead4a05..678f9ab08c48 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -380,10 +380,15 @@ struct ext2_inode {
 #define EXT2_MOUNT_NO_UID32		0x000200  /* Disable 32-bit UIDs */
 #define EXT2_MOUNT_XATTR_USER		0x004000  /* Extended user attributes */
 #define EXT2_MOUNT_POSIX_ACL		0x008000  /* POSIX Access Control Lists */
-#define EXT2_MOUNT_XIP			0x010000  /* Execute in place */
+#define EXT2_MOUNT_XIP			0x010000  /* Obsolete, use DAX */
 #define EXT2_MOUNT_USRQUOTA		0x020000  /* user quota */
 #define EXT2_MOUNT_GRPQUOTA		0x040000  /* group quota */
 #define EXT2_MOUNT_RESERVATION		0x080000  /* Preallocation */
+#ifdef CONFIG_FS_DAX
+#define EXT2_MOUNT_DAX			0x100000  /* Direct Access */
+#else
+#define EXT2_MOUNT_DAX			0
+#endif
 
 
 #define clear_opt(o, opt)		o &= ~EXT2_MOUNT_##opt
@@ -788,11 +793,10 @@ extern int ext2_fsync(struct file *file, loff_t start, loff_t end,
 		      int datasync);
 extern const struct inode_operations ext2_file_inode_operations;
 extern const struct file_operations ext2_file_operations;
-extern const struct file_operations ext2_xip_file_operations;
+extern const struct file_operations ext2_dax_file_operations;
 
 /* inode.c */
 extern const struct address_space_operations ext2_aops;
-extern const struct address_space_operations ext2_aops_xip;
 extern const struct address_space_operations ext2_nobh_aops;
 
 /* namei.c */
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 7c87b22a7228..e31701713516 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -25,6 +25,36 @@
 #include "xattr.h"
 #include "acl.h"
 
+#ifdef CONFIG_FS_DAX
+static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	return dax_fault(vma, vmf, ext2_get_block);
+}
+
+static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	return dax_mkwrite(vma, vmf, ext2_get_block);
+}
+
+static const struct vm_operations_struct ext2_dax_vm_ops = {
+	.fault		= ext2_dax_fault,
+	.page_mkwrite	= ext2_dax_mkwrite,
+};
+
+static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	if (!IS_DAX(file_inode(file)))
+		return generic_file_mmap(file, vma);
+
+	file_accessed(file);
+	vma->vm_ops = &ext2_dax_vm_ops;
+	vma->vm_flags |= VM_MIXEDMAP;
+	return 0;
+}
+#else
+#define ext2_file_mmap	generic_file_mmap
+#endif
+
 /*
  * Called when filp is released. This happens when all file descriptors
  * for a single struct file are closed. Note that different open() calls
@@ -70,7 +100,7 @@ const struct file_operations ext2_file_operations = {
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext2_compat_ioctl,
 #endif
-	.mmap		= generic_file_mmap,
+	.mmap		= ext2_file_mmap,
 	.open		= dquot_file_open,
 	.release	= ext2_release_file,
 	.fsync		= ext2_fsync,
@@ -78,16 +108,18 @@ const struct file_operations ext2_file_operations = {
 	.splice_write	= iter_file_splice_write,
 };
 
-#ifdef CONFIG_EXT2_FS_XIP
-const struct file_operations ext2_xip_file_operations = {
+#ifdef CONFIG_FS_DAX
+const struct file_operations ext2_dax_file_operations = {
 	.llseek		= generic_file_llseek,
-	.read		= xip_file_read,
-	.write		= xip_file_write,
+	.read		= new_sync_read,
+	.write		= new_sync_write,
+	.read_iter	= generic_file_read_iter,
+	.write_iter	= generic_file_write_iter,
 	.unlocked_ioctl = ext2_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext2_compat_ioctl,
 #endif
-	.mmap		= xip_file_mmap,
+	.mmap		= ext2_file_mmap,
 	.open		= dquot_file_open,
 	.release	= ext2_release_file,
 	.fsync		= ext2_fsync,
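With ext2_file_mmap() installed, a file on a dax-mounted ext2 gets ordinary mmap() semantics but no page cache behind the mapping. A hypothetical userspace view (invented paths; not part of the patch):

	int fd = open("/mnt/pmem/file", O_RDWR);	/* fs mounted with -o dax */
	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

	p[0] = 1;	/*
			 * first touch faults: .fault == ext2_dax_fault()
			 * -> dax_fault() -> vm_insert_mixed(), which maps
			 * the device pfn directly; no struct page backs it
			 */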
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 7d66fb0e4cca..6c14bb8322fa 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -170,7 +170,7 @@ static void ext2_preread_inode(struct inode *inode)
 	struct ext2_group_desc * gdp;
 	struct backing_dev_info *bdi;
 
-	bdi = inode->i_mapping->backing_dev_info;
+	bdi = inode_to_bdi(inode);
 	if (bdi_read_congested(bdi))
 		return;
 	if (bdi_write_congested(bdi))
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 36d35c36311d..6434bc000125 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -34,7 +34,6 @@
 #include <linux/aio.h>
 #include "ext2.h"
 #include "acl.h"
-#include "xip.h"
 #include "xattr.h"
 
 static int __ext2_write_inode(struct inode *inode, int do_sync);
@@ -731,12 +730,14 @@ static int ext2_get_blocks(struct inode *inode,
 			goto cleanup;
 	}
 
-	if (ext2_use_xip(inode->i_sb)) {
+	if (IS_DAX(inode)) {
 		/*
-		 * we need to clear the block
+		 * block must be initialised before we put it in the tree
+		 * so that it's not found by another thread before it's
+		 * initialised
 		 */
-		err = ext2_clear_xip_target (inode,
-			le32_to_cpu(chain[depth-1].key));
+		err = dax_clear_blocks(inode, le32_to_cpu(chain[depth-1].key),
+				1 << inode->i_blkbits);
 		if (err) {
 			mutex_unlock(&ei->truncate_mutex);
 			goto cleanup;
@@ -859,7 +860,12 @@ ext2_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
 	size_t count = iov_iter_count(iter);
 	ssize_t ret;
 
-	ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, ext2_get_block);
+	if (IS_DAX(inode))
+		ret = dax_do_io(rw, iocb, inode, iter, offset, ext2_get_block,
+				NULL, DIO_LOCKING);
+	else
+		ret = blockdev_direct_IO(rw, iocb, inode, iter, offset,
+					 ext2_get_block);
 	if (ret < 0 && (rw & WRITE))
 		ext2_write_failed(mapping, offset + count);
 	return ret;
@@ -885,11 +891,6 @@ const struct address_space_operations ext2_aops = {
 	.error_remove_page	= generic_error_remove_page,
 };
 
-const struct address_space_operations ext2_aops_xip = {
-	.bmap			= ext2_bmap,
-	.get_xip_mem		= ext2_get_xip_mem,
-};
-
 const struct address_space_operations ext2_nobh_aops = {
 	.readpage		= ext2_readpage,
 	.readpages		= ext2_readpages,
@@ -1201,8 +1202,8 @@ static int ext2_setsize(struct inode *inode, loff_t newsize)
 
 	inode_dio_wait(inode);
 
-	if (mapping_is_xip(inode->i_mapping))
-		error = xip_truncate_page(inode->i_mapping, newsize);
+	if (IS_DAX(inode))
+		error = dax_truncate_page(inode, newsize, ext2_get_block);
 	else if (test_opt(inode->i_sb, NOBH))
 		error = nobh_truncate_page(inode->i_mapping,
 					   newsize, ext2_get_block);
@@ -1273,7 +1274,8 @@ void ext2_set_inode_flags(struct inode *inode)
 {
 	unsigned int flags = EXT2_I(inode)->i_flags;
 
-	inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
+	inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME |
+				S_DIRSYNC | S_DAX);
 	if (flags & EXT2_SYNC_FL)
 		inode->i_flags |= S_SYNC;
 	if (flags & EXT2_APPEND_FL)
@@ -1284,6 +1286,8 @@ void ext2_set_inode_flags(struct inode *inode)
 		inode->i_flags |= S_NOATIME;
 	if (flags & EXT2_DIRSYNC_FL)
 		inode->i_flags |= S_DIRSYNC;
+	if (test_opt(inode->i_sb, DAX))
+		inode->i_flags |= S_DAX;
 }
 
 /* Propagate flags from i_flags to EXT2_I(inode)->i_flags */
@@ -1384,9 +1388,9 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
 
 	if (S_ISREG(inode->i_mode)) {
 		inode->i_op = &ext2_file_inode_operations;
-		if (ext2_use_xip(inode->i_sb)) {
-			inode->i_mapping->a_ops = &ext2_aops_xip;
-			inode->i_fop = &ext2_xip_file_operations;
+		if (test_opt(inode->i_sb, DAX)) {
+			inode->i_mapping->a_ops = &ext2_aops;
+			inode->i_fop = &ext2_dax_file_operations;
 		} else if (test_opt(inode->i_sb, NOBH)) {
 			inode->i_mapping->a_ops = &ext2_nobh_aops;
 			inode->i_fop = &ext2_file_operations;
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index c268d0af1db9..148f6e3789ea 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -35,7 +35,6 @@
 #include "ext2.h"
 #include "xattr.h"
 #include "acl.h"
-#include "xip.h"
 
 static inline int ext2_add_nondir(struct dentry *dentry, struct inode *inode)
 {
@@ -105,9 +104,9 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, umode_t mode
 		return PTR_ERR(inode);
 
 	inode->i_op = &ext2_file_inode_operations;
-	if (ext2_use_xip(inode->i_sb)) {
-		inode->i_mapping->a_ops = &ext2_aops_xip;
-		inode->i_fop = &ext2_xip_file_operations;
+	if (test_opt(inode->i_sb, DAX)) {
+		inode->i_mapping->a_ops = &ext2_aops;
+		inode->i_fop = &ext2_dax_file_operations;
 	} else if (test_opt(inode->i_sb, NOBH)) {
 		inode->i_mapping->a_ops = &ext2_nobh_aops;
 		inode->i_fop = &ext2_file_operations;
@@ -126,9 +125,9 @@ static int ext2_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
 		return PTR_ERR(inode);
 
 	inode->i_op = &ext2_file_inode_operations;
-	if (ext2_use_xip(inode->i_sb)) {
-		inode->i_mapping->a_ops = &ext2_aops_xip;
-		inode->i_fop = &ext2_xip_file_operations;
+	if (test_opt(inode->i_sb, DAX)) {
+		inode->i_mapping->a_ops = &ext2_aops;
+		inode->i_fop = &ext2_dax_file_operations;
 	} else if (test_opt(inode->i_sb, NOBH)) {
 		inode->i_mapping->a_ops = &ext2_nobh_aops;
 		inode->i_fop = &ext2_file_operations;
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index ae55fddc26a9..d0e746e96511 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -35,7 +35,6 @@
35#include "ext2.h" 35#include "ext2.h"
36#include "xattr.h" 36#include "xattr.h"
37#include "acl.h" 37#include "acl.h"
38#include "xip.h"
39 38
40static void ext2_sync_super(struct super_block *sb, 39static void ext2_sync_super(struct super_block *sb,
41 struct ext2_super_block *es, int wait); 40 struct ext2_super_block *es, int wait);
@@ -292,9 +291,11 @@ static int ext2_show_options(struct seq_file *seq, struct dentry *root)
292 seq_puts(seq, ",grpquota"); 291 seq_puts(seq, ",grpquota");
293#endif 292#endif
294 293
295#if defined(CONFIG_EXT2_FS_XIP) 294#ifdef CONFIG_FS_DAX
296 if (sbi->s_mount_opt & EXT2_MOUNT_XIP) 295 if (sbi->s_mount_opt & EXT2_MOUNT_XIP)
297 seq_puts(seq, ",xip"); 296 seq_puts(seq, ",xip");
297 if (sbi->s_mount_opt & EXT2_MOUNT_DAX)
298 seq_puts(seq, ",dax");
298#endif 299#endif
299 300
300 if (!test_opt(sb, RESERVATION)) 301 if (!test_opt(sb, RESERVATION))
@@ -403,7 +404,7 @@ enum {
403 Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, 404 Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic,
404 Opt_err_ro, Opt_nouid32, Opt_nocheck, Opt_debug, 405 Opt_err_ro, Opt_nouid32, Opt_nocheck, Opt_debug,
405 Opt_oldalloc, Opt_orlov, Opt_nobh, Opt_user_xattr, Opt_nouser_xattr, 406 Opt_oldalloc, Opt_orlov, Opt_nobh, Opt_user_xattr, Opt_nouser_xattr,
406 Opt_acl, Opt_noacl, Opt_xip, Opt_ignore, Opt_err, Opt_quota, 407 Opt_acl, Opt_noacl, Opt_xip, Opt_dax, Opt_ignore, Opt_err, Opt_quota,
407 Opt_usrquota, Opt_grpquota, Opt_reservation, Opt_noreservation 408 Opt_usrquota, Opt_grpquota, Opt_reservation, Opt_noreservation
408}; 409};
409 410
@@ -432,6 +433,7 @@ static const match_table_t tokens = {
432 {Opt_acl, "acl"}, 433 {Opt_acl, "acl"},
433 {Opt_noacl, "noacl"}, 434 {Opt_noacl, "noacl"},
434 {Opt_xip, "xip"}, 435 {Opt_xip, "xip"},
436 {Opt_dax, "dax"},
435 {Opt_grpquota, "grpquota"}, 437 {Opt_grpquota, "grpquota"},
436 {Opt_ignore, "noquota"}, 438 {Opt_ignore, "noquota"},
437 {Opt_quota, "quota"}, 439 {Opt_quota, "quota"},
@@ -559,10 +561,14 @@ static int parse_options(char *options, struct super_block *sb)
559 break; 561 break;
560#endif 562#endif
561 case Opt_xip: 563 case Opt_xip:
562#ifdef CONFIG_EXT2_FS_XIP 564 ext2_msg(sb, KERN_INFO, "use dax instead of xip");
563 set_opt (sbi->s_mount_opt, XIP); 565 set_opt(sbi->s_mount_opt, XIP);
566 /* Fall through */
567 case Opt_dax:
568#ifdef CONFIG_FS_DAX
569 set_opt(sbi->s_mount_opt, DAX);
564#else 570#else
565 ext2_msg(sb, KERN_INFO, "xip option not supported"); 571 ext2_msg(sb, KERN_INFO, "dax option not supported");
566#endif 572#endif
567 break; 573 break;
568 574
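
The hunk above keeps the old "xip" token alive as a deprecated alias: it prints a hint, records the legacy bit for show_options, and then falls through into the "dax" case so both tokens enable the new behaviour. A minimal userspace sketch of that pattern (token and flag names here are illustrative, not the kernel's):

#include <stdio.h>

#define MOUNT_XIP 0x1UL
#define MOUNT_DAX 0x2UL

enum { OPT_XIP, OPT_DAX };

/* Deprecated-alias pattern: the old token warns, sets its legacy
 * bit (kept only so show_options can still print ",xip"), then
 * falls through so both tokens set the new flag. */
static unsigned long parse_token(int token, unsigned long opts)
{
	switch (token) {
	case OPT_XIP:
		fprintf(stderr, "use dax instead of xip\n");
		opts |= MOUNT_XIP;
		/* fall through */
	case OPT_DAX:
		opts |= MOUNT_DAX;
		break;
	}
	return opts;
}

int main(void)
{
	printf("xip -> %#lx\n", parse_token(OPT_XIP, 0));	/* 0x3 */
	printf("dax -> %#lx\n", parse_token(OPT_DAX, 0));	/* 0x2 */
	return 0;
}
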
@@ -877,9 +883,6 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
877 ((EXT2_SB(sb)->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? 883 ((EXT2_SB(sb)->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ?
878 MS_POSIXACL : 0); 884 MS_POSIXACL : 0);
879 885
880 ext2_xip_verify_sb(sb); /* see if bdev supports xip, unset
881 EXT2_MOUNT_XIP if not */
882
883 if (le32_to_cpu(es->s_rev_level) == EXT2_GOOD_OLD_REV && 886 if (le32_to_cpu(es->s_rev_level) == EXT2_GOOD_OLD_REV &&
884 (EXT2_HAS_COMPAT_FEATURE(sb, ~0U) || 887 (EXT2_HAS_COMPAT_FEATURE(sb, ~0U) ||
885 EXT2_HAS_RO_COMPAT_FEATURE(sb, ~0U) || 888 EXT2_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
@@ -909,11 +912,17 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
909 912
910 blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); 913 blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
911 914
912 if (ext2_use_xip(sb) && blocksize != PAGE_SIZE) { 915 if (sbi->s_mount_opt & EXT2_MOUNT_DAX) {
913 if (!silent) 916 if (blocksize != PAGE_SIZE) {
914 ext2_msg(sb, KERN_ERR, 917 ext2_msg(sb, KERN_ERR,
915 "error: unsupported blocksize for xip"); 918 "error: unsupported blocksize for dax");
916 goto failed_mount; 919 goto failed_mount;
920 }
921 if (!sb->s_bdev->bd_disk->fops->direct_access) {
922 ext2_msg(sb, KERN_ERR,
923 "error: device does not support dax");
924 goto failed_mount;
925 }
917 } 926 }
918 927
919 /* If the blocksize doesn't match, re-read the thing.. */ 928 /* If the blocksize doesn't match, re-read the thing.. */
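
Rather than silently downgrading the mount as ext2_xip_verify_sb() used to, the new fill_super code fails the mount outright when the preconditions for DAX are not met. A compilable model of the two checks, with stand-in types (the real test inspects the superblock block size and sb->s_bdev->bd_disk->fops->direct_access):

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL

/* Stand-in for bdev->bd_disk->fops; only the presence of the
 * ->direct_access method matters for the check being modelled. */
struct fake_disk_ops {
	long (*direct_access)(void);
};

static long pmem_direct_access(void) { return 0; }

static bool dax_mount_allowed(unsigned long blocksize,
			      const struct fake_disk_ops *ops)
{
	if (blocksize != PAGE_SIZE)
		return false;	/* "unsupported blocksize for dax" */
	if (!ops->direct_access)
		return false;	/* "device does not support dax" */
	return true;
}

int main(void)
{
	struct fake_disk_ops pmem = { .direct_access = pmem_direct_access };
	struct fake_disk_ops hdd  = { .direct_access = NULL };

	printf("%d %d %d\n",
	       dax_mount_allowed(4096, &pmem),	/* 1 */
	       dax_mount_allowed(1024, &pmem),	/* 0 */
	       dax_mount_allowed(4096, &hdd));	/* 0 */
	return 0;
}
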
@@ -1259,7 +1268,6 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1259{ 1268{
1260 struct ext2_sb_info * sbi = EXT2_SB(sb); 1269 struct ext2_sb_info * sbi = EXT2_SB(sb);
1261 struct ext2_super_block * es; 1270 struct ext2_super_block * es;
1262 unsigned long old_mount_opt = sbi->s_mount_opt;
1263 struct ext2_mount_options old_opts; 1271 struct ext2_mount_options old_opts;
1264 unsigned long old_sb_flags; 1272 unsigned long old_sb_flags;
1265 int err; 1273 int err;
@@ -1284,22 +1292,11 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1284 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | 1292 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
1285 ((sbi->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); 1293 ((sbi->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
1286 1294
1287 ext2_xip_verify_sb(sb); /* see if bdev supports xip, unset
1288 EXT2_MOUNT_XIP if not */
1289
1290 if ((ext2_use_xip(sb)) && (sb->s_blocksize != PAGE_SIZE)) {
1291 ext2_msg(sb, KERN_WARNING,
1292 "warning: unsupported blocksize for xip");
1293 err = -EINVAL;
1294 goto restore_opts;
1295 }
1296
1297 es = sbi->s_es; 1295 es = sbi->s_es;
1298 if ((sbi->s_mount_opt ^ old_mount_opt) & EXT2_MOUNT_XIP) { 1296 if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT2_MOUNT_DAX) {
1299 ext2_msg(sb, KERN_WARNING, "warning: refusing change of " 1297 ext2_msg(sb, KERN_WARNING, "warning: refusing change of "
1300 "xip flag with busy inodes while remounting"); 1298 "dax flag with busy inodes while remounting");
1301 sbi->s_mount_opt &= ~EXT2_MOUNT_XIP; 1299 sbi->s_mount_opt ^= EXT2_MOUNT_DAX;
1302 sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP;
1303 } 1300 }
1304 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { 1301 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
1305 spin_unlock(&sbi->s_lock); 1302 spin_unlock(&sbi->s_lock);
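
The remount path only reaches the XOR when the DAX bit in the new options differs from the saved old ones, so a single sbi->s_mount_opt ^= EXT2_MOUNT_DAX restores the previous state in either direction, replacing the old two-statement clear-then-restore. A small self-check of that property:

#include <assert.h>

#define MOUNT_DAX 0x200UL

/* Called only when (new ^ old) & MOUNT_DAX is set, i.e. the user
 * tried to toggle dax on remount; XOR-ing the bit flips it back
 * whichever direction the change went. */
static unsigned long revert_dax(unsigned long new_opts)
{
	return new_opts ^ MOUNT_DAX;
}

int main(void)
{
	unsigned long old1 = 0, new1 = MOUNT_DAX;	/* tried to enable */
	unsigned long old2 = MOUNT_DAX, new2 = 0;	/* tried to disable */

	if ((new1 ^ old1) & MOUNT_DAX)
		new1 = revert_dax(new1);
	if ((new2 ^ old2) & MOUNT_DAX)
		new2 = revert_dax(new2);

	assert(new1 == old1 && new2 == old2);
	return 0;
}
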
diff --git a/fs/ext2/xip.c b/fs/ext2/xip.c
deleted file mode 100644
index e98171a11cfe..000000000000
--- a/fs/ext2/xip.c
+++ /dev/null
@@ -1,91 +0,0 @@
1/*
2 * linux/fs/ext2/xip.c
3 *
4 * Copyright (C) 2005 IBM Corporation
5 * Author: Carsten Otte (cotte@de.ibm.com)
6 */
7
8#include <linux/mm.h>
9#include <linux/fs.h>
10#include <linux/genhd.h>
11#include <linux/buffer_head.h>
12#include <linux/blkdev.h>
13#include "ext2.h"
14#include "xip.h"
15
16static inline int
17__inode_direct_access(struct inode *inode, sector_t block,
18 void **kaddr, unsigned long *pfn)
19{
20 struct block_device *bdev = inode->i_sb->s_bdev;
21 const struct block_device_operations *ops = bdev->bd_disk->fops;
22 sector_t sector;
23
24 sector = block * (PAGE_SIZE / 512); /* ext2 block to bdev sector */
25
26 BUG_ON(!ops->direct_access);
27 return ops->direct_access(bdev, sector, kaddr, pfn);
28}
29
30static inline int
31__ext2_get_block(struct inode *inode, pgoff_t pgoff, int create,
32 sector_t *result)
33{
34 struct buffer_head tmp;
35 int rc;
36
37 memset(&tmp, 0, sizeof(struct buffer_head));
38 tmp.b_size = 1 << inode->i_blkbits;
39 rc = ext2_get_block(inode, pgoff, &tmp, create);
40 *result = tmp.b_blocknr;
41
42 /* did we get a sparse block (hole in the file)? */
43 if (!tmp.b_blocknr && !rc) {
44 BUG_ON(create);
45 rc = -ENODATA;
46 }
47
48 return rc;
49}
50
51int
52ext2_clear_xip_target(struct inode *inode, sector_t block)
53{
54 void *kaddr;
55 unsigned long pfn;
56 int rc;
57
58 rc = __inode_direct_access(inode, block, &kaddr, &pfn);
59 if (!rc)
60 clear_page(kaddr);
61 return rc;
62}
63
64void ext2_xip_verify_sb(struct super_block *sb)
65{
66 struct ext2_sb_info *sbi = EXT2_SB(sb);
67
68 if ((sbi->s_mount_opt & EXT2_MOUNT_XIP) &&
69 !sb->s_bdev->bd_disk->fops->direct_access) {
70 sbi->s_mount_opt &= (~EXT2_MOUNT_XIP);
71 ext2_msg(sb, KERN_WARNING,
72 "warning: ignoring xip option - "
73 "not supported by bdev");
74 }
75}
76
77int ext2_get_xip_mem(struct address_space *mapping, pgoff_t pgoff, int create,
78 void **kmem, unsigned long *pfn)
79{
80 int rc;
81 sector_t block;
82
83 /* first, retrieve the sector number */
84 rc = __ext2_get_block(mapping->host, pgoff, create, &block);
85 if (rc)
86 return rc;
87
88 /* retrieve address of the target data */
89 rc = __inode_direct_access(mapping->host, block, kmem, pfn);
90 return rc;
91}
diff --git a/fs/ext2/xip.h b/fs/ext2/xip.h
deleted file mode 100644
index 18b34d2f31b3..000000000000
--- a/fs/ext2/xip.h
+++ /dev/null
@@ -1,26 +0,0 @@
1/*
2 * linux/fs/ext2/xip.h
3 *
4 * Copyright (C) 2005 IBM Corporation
5 * Author: Carsten Otte (cotte@de.ibm.com)
6 */
7
8#ifdef CONFIG_EXT2_FS_XIP
9extern void ext2_xip_verify_sb (struct super_block *);
10extern int ext2_clear_xip_target (struct inode *, sector_t);
11
12static inline int ext2_use_xip (struct super_block *sb)
13{
14 struct ext2_sb_info *sbi = EXT2_SB(sb);
15 return (sbi->s_mount_opt & EXT2_MOUNT_XIP);
16}
17int ext2_get_xip_mem(struct address_space *, pgoff_t, int,
18 void **, unsigned long *);
19#define mapping_is_xip(map) unlikely(map->a_ops->get_xip_mem)
20#else
21#define mapping_is_xip(map) 0
22#define ext2_xip_verify_sb(sb) do { } while (0)
23#define ext2_use_xip(sb) 0
24#define ext2_clear_xip_target(inode, chain) 0
25#define ext2_get_xip_mem NULL
26#endif
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 9b4e7d750d4f..d4dbf3c259b3 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -466,6 +466,8 @@ static void ext3_put_super (struct super_block * sb)
466 } 466 }
467 sb->s_fs_info = NULL; 467 sb->s_fs_info = NULL;
468 kfree(sbi->s_blockgroup_lock); 468 kfree(sbi->s_blockgroup_lock);
469 mutex_destroy(&sbi->s_orphan_lock);
470 mutex_destroy(&sbi->s_resize_lock);
469 kfree(sbi); 471 kfree(sbi);
470} 472}
471 473
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index a75fba67bb1f..982d934fd9ac 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -965,6 +965,11 @@ struct ext4_inode_info {
965#define EXT4_MOUNT_ERRORS_MASK 0x00070 965#define EXT4_MOUNT_ERRORS_MASK 0x00070
966#define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ 966#define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */
967#define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ 967#define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/
968#ifdef CONFIG_FS_DAX
969#define EXT4_MOUNT_DAX 0x00200 /* Direct Access */
970#else
971#define EXT4_MOUNT_DAX 0
972#endif
968#define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ 973#define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */
969#define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */ 974#define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */
970#define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */ 975#define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */
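
Defining EXT4_MOUNT_DAX to 0 when CONFIG_FS_DAX is off lets every test_opt(sb, DAX) site compile unconditionally while the optimizer discards the dead branch, since (opts & 0) is a constant false. A sketch of the trick outside the kernel (test_opt modelled as a plain macro):

#include <stdio.h>

/* Uncomment to see the dax branch come back to life. */
/* #define CONFIG_FS_DAX 1 */

#ifdef CONFIG_FS_DAX
#define MOUNT_DAX 0x200
#else
#define MOUNT_DAX 0	/* (opts & 0) is always false; branch folds away */
#endif

#define test_opt(opts, flag) ((opts) & (flag))

int main(void)
{
	unsigned long opts = 0x200;

	if (test_opt(opts, MOUNT_DAX))
		puts("dax path");
	else
		puts("page-cache path");
	return 0;
}
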
@@ -2578,6 +2583,7 @@ extern const struct file_operations ext4_dir_operations;
2578/* file.c */ 2583/* file.c */
2579extern const struct inode_operations ext4_file_inode_operations; 2584extern const struct inode_operations ext4_file_inode_operations;
2580extern const struct file_operations ext4_file_operations; 2585extern const struct file_operations ext4_file_operations;
2586extern const struct file_operations ext4_dax_file_operations;
2581extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); 2587extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);
2582 2588
2583/* inline.c */ 2589/* inline.c */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index e5d3eadf47b1..bed43081720f 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -5166,8 +5166,8 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
5166 5166
5167 /* fallback to generic here if not in extents fmt */ 5167 /* fallback to generic here if not in extents fmt */
5168 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 5168 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
5169 return __generic_block_fiemap(inode, fieinfo, start, len, 5169 return generic_block_fiemap(inode, fieinfo, start, len,
5170 ext4_get_block); 5170 ext4_get_block);
5171 5171
5172 if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS)) 5172 if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
5173 return -EBADR; 5173 return -EBADR;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 513c12cf444c..33a09da16c9c 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -95,7 +95,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
95 struct inode *inode = file_inode(iocb->ki_filp); 95 struct inode *inode = file_inode(iocb->ki_filp);
96 struct mutex *aio_mutex = NULL; 96 struct mutex *aio_mutex = NULL;
97 struct blk_plug plug; 97 struct blk_plug plug;
98 int o_direct = file->f_flags & O_DIRECT; 98 int o_direct = io_is_direct(file);
99 int overwrite = 0; 99 int overwrite = 0;
100 size_t length = iov_iter_count(from); 100 size_t length = iov_iter_count(from);
101 ssize_t ret; 101 ssize_t ret;
@@ -191,17 +191,41 @@ errout:
191 return ret; 191 return ret;
192} 192}
193 193
194#ifdef CONFIG_FS_DAX
195static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
196{
197 return dax_fault(vma, vmf, ext4_get_block);
198 /* Is this the right get_block? */
199}
200
201static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
202{
203 return dax_mkwrite(vma, vmf, ext4_get_block);
204}
205
206static const struct vm_operations_struct ext4_dax_vm_ops = {
207 .fault = ext4_dax_fault,
208 .page_mkwrite = ext4_dax_mkwrite,
209};
210#else
211#define ext4_dax_vm_ops ext4_file_vm_ops
212#endif
213
194static const struct vm_operations_struct ext4_file_vm_ops = { 214static const struct vm_operations_struct ext4_file_vm_ops = {
195 .fault = filemap_fault, 215 .fault = filemap_fault,
196 .map_pages = filemap_map_pages, 216 .map_pages = filemap_map_pages,
197 .page_mkwrite = ext4_page_mkwrite, 217 .page_mkwrite = ext4_page_mkwrite,
198 .remap_pages = generic_file_remap_pages,
199}; 218};
200 219
201static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) 220static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
202{ 221{
203 file_accessed(file); 222 file_accessed(file);
204 vma->vm_ops = &ext4_file_vm_ops; 223 if (IS_DAX(file_inode(file))) {
224 vma->vm_ops = &ext4_dax_vm_ops;
225 vma->vm_flags |= VM_MIXEDMAP;
226 } else {
227 vma->vm_ops = &ext4_file_vm_ops;
228 }
205 return 0; 229 return 0;
206} 230}
207 231
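
ext4_file_mmap() now chooses the vma operations at mmap time; a DAX mapping additionally needs VM_MIXEDMAP because it inserts raw pfns that have no struct page behind them. A condensed model of the dispatch (types and the flag value are stand-ins):

#include <stdio.h>

#define VM_MIXEDMAP 0x1UL

struct vm_ops { const char *name; };

static const struct vm_ops dax_vm_ops  = { "dax" };
static const struct vm_ops file_vm_ops = { "page cache" };

struct fake_vma {
	const struct vm_ops *ops;
	unsigned long flags;
};

/* Model of the mmap-time dispatch: the inode's DAX-ness decides
 * which fault handlers the vma gets, and DAX vmas also carry
 * VM_MIXEDMAP because they map pfns without struct pages. */
static void fake_mmap(struct fake_vma *vma, int inode_is_dax)
{
	if (inode_is_dax) {
		vma->ops = &dax_vm_ops;
		vma->flags |= VM_MIXEDMAP;
	} else {
		vma->ops = &file_vm_ops;
	}
}

int main(void)
{
	struct fake_vma a = { 0 }, b = { 0 };

	fake_mmap(&a, 1);
	fake_mmap(&b, 0);
	printf("%s/%lx %s/%lx\n", a.ops->name, a.flags, b.ops->name, b.flags);
	return 0;
}
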
@@ -273,19 +297,24 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
273 * we determine whether this extent is data or a hole according to 297 * we determine whether this extent is data or a hole according to
274 * whether the page cache has data or not. 298 * whether the page cache has data or not.
275 */ 299 */
276static int ext4_find_unwritten_pgoff(struct inode *inode, int whence, 300static int ext4_find_unwritten_pgoff(struct inode *inode,
277 loff_t endoff, loff_t *offset) 301 int whence,
302 struct ext4_map_blocks *map,
303 loff_t *offset)
278{ 304{
279 struct pagevec pvec; 305 struct pagevec pvec;
306 unsigned int blkbits;
280 pgoff_t index; 307 pgoff_t index;
281 pgoff_t end; 308 pgoff_t end;
309 loff_t endoff;
282 loff_t startoff; 310 loff_t startoff;
283 loff_t lastoff; 311 loff_t lastoff;
284 int found = 0; 312 int found = 0;
285 313
314 blkbits = inode->i_sb->s_blocksize_bits;
286 startoff = *offset; 315 startoff = *offset;
287 lastoff = startoff; 316 lastoff = startoff;
288 317 endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits;
289 318
290 index = startoff >> PAGE_CACHE_SHIFT; 319 index = startoff >> PAGE_CACHE_SHIFT;
291 end = endoff >> PAGE_CACHE_SHIFT; 320 end = endoff >> PAGE_CACHE_SHIFT;
@@ -403,144 +432,147 @@ out:
403static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) 432static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
404{ 433{
405 struct inode *inode = file->f_mapping->host; 434 struct inode *inode = file->f_mapping->host;
406 struct fiemap_extent_info fie; 435 struct ext4_map_blocks map;
407 struct fiemap_extent ext[2]; 436 struct extent_status es;
408 loff_t next; 437 ext4_lblk_t start, last, end;
409 int i, ret = 0; 438 loff_t dataoff, isize;
439 int blkbits;
440 int ret = 0;
410 441
411 mutex_lock(&inode->i_mutex); 442 mutex_lock(&inode->i_mutex);
412 if (offset >= inode->i_size) { 443
444 isize = i_size_read(inode);
445 if (offset >= isize) {
413 mutex_unlock(&inode->i_mutex); 446 mutex_unlock(&inode->i_mutex);
414 return -ENXIO; 447 return -ENXIO;
415 } 448 }
416 fie.fi_flags = 0; 449
417 fie.fi_extents_max = 2; 450 blkbits = inode->i_sb->s_blocksize_bits;
418 fie.fi_extents_start = (struct fiemap_extent __user *) &ext; 451 start = offset >> blkbits;
419 while (1) { 452 last = start;
420 mm_segment_t old_fs = get_fs(); 453 end = isize >> blkbits;
421 454 dataoff = offset;
422 fie.fi_extents_mapped = 0; 455
423 memset(ext, 0, sizeof(*ext) * fie.fi_extents_max); 456 do {
424 457 map.m_lblk = last;
425 set_fs(get_ds()); 458 map.m_len = end - last + 1;
426 ret = ext4_fiemap(inode, &fie, offset, maxsize - offset); 459 ret = ext4_map_blocks(NULL, inode, &map, 0);
427 set_fs(old_fs); 460 if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
428 if (ret) 461 if (last != start)
462 dataoff = (loff_t)last << blkbits;
429 break; 463 break;
464 }
430 465
431 /* No extents found, EOF */ 466 /*
432 if (!fie.fi_extents_mapped) { 467 * If there is a delayed extent at this offset,
433 ret = -ENXIO; 468 * it will be treated as data.
469 */
470 ext4_es_find_delayed_extent_range(inode, last, last, &es);
471 if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
472 if (last != start)
473 dataoff = (loff_t)last << blkbits;
434 break; 474 break;
435 } 475 }
436 for (i = 0; i < fie.fi_extents_mapped; i++) {
437 next = (loff_t)(ext[i].fe_length + ext[i].fe_logical);
438 476
439 if (offset < (loff_t)ext[i].fe_logical) 477 /*
440 offset = (loff_t)ext[i].fe_logical; 478 * If there is an unwritten extent at this offset,
441 /* 479 * it will be treated as data or a hole according to
442 * If extent is not unwritten, then it contains valid 480 * whether the page cache has data.
443 * data, mapped or delayed. 481 */
444 */ 482 if (map.m_flags & EXT4_MAP_UNWRITTEN) {
445 if (!(ext[i].fe_flags & FIEMAP_EXTENT_UNWRITTEN)) 483 int unwritten;
446 goto out; 484 unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA,
485 &map, &dataoff);
486 if (unwritten)
487 break;
488 }
447 489
448 /* 490 last++;
449 * If there is a unwritten extent at this offset, 491 dataoff = (loff_t)last << blkbits;
450 * it will be as a data or a hole according to page 492 } while (last <= end);
451 * cache that has data or not.
452 */
453 if (ext4_find_unwritten_pgoff(inode, SEEK_DATA,
454 next, &offset))
455 goto out;
456 493
457 if (ext[i].fe_flags & FIEMAP_EXTENT_LAST) {
458 ret = -ENXIO;
459 goto out;
460 }
461 offset = next;
462 }
463 }
464 if (offset > inode->i_size)
465 offset = inode->i_size;
466out:
467 mutex_unlock(&inode->i_mutex); 494 mutex_unlock(&inode->i_mutex);
468 if (ret)
469 return ret;
470 495
471 return vfs_setpos(file, offset, maxsize); 496 if (dataoff > isize)
497 return -ENXIO;
498
499 return vfs_setpos(file, dataoff, maxsize);
472} 500}
473 501
474/* 502/*
475 * ext4_seek_hole() retrieves the offset for SEEK_HOLE 503 * ext4_seek_hole() retrieves the offset for SEEK_HOLE.
476 */ 504 */
477static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) 505static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
478{ 506{
479 struct inode *inode = file->f_mapping->host; 507 struct inode *inode = file->f_mapping->host;
480 struct fiemap_extent_info fie; 508 struct ext4_map_blocks map;
481 struct fiemap_extent ext[2]; 509 struct extent_status es;
482 loff_t next; 510 ext4_lblk_t start, last, end;
483 int i, ret = 0; 511 loff_t holeoff, isize;
512 int blkbits;
513 int ret = 0;
484 514
485 mutex_lock(&inode->i_mutex); 515 mutex_lock(&inode->i_mutex);
486 if (offset >= inode->i_size) { 516
517 isize = i_size_read(inode);
518 if (offset >= isize) {
487 mutex_unlock(&inode->i_mutex); 519 mutex_unlock(&inode->i_mutex);
488 return -ENXIO; 520 return -ENXIO;
489 } 521 }
490 522
491 fie.fi_flags = 0; 523 blkbits = inode->i_sb->s_blocksize_bits;
492 fie.fi_extents_max = 2; 524 start = offset >> blkbits;
493 fie.fi_extents_start = (struct fiemap_extent __user *)&ext; 525 last = start;
494 while (1) { 526 end = isize >> blkbits;
495 mm_segment_t old_fs = get_fs(); 527 holeoff = offset;
496 528
497 fie.fi_extents_mapped = 0; 529 do {
498 memset(ext, 0, sizeof(*ext)); 530 map.m_lblk = last;
499 531 map.m_len = end - last + 1;
500 set_fs(get_ds()); 532 ret = ext4_map_blocks(NULL, inode, &map, 0);
501 ret = ext4_fiemap(inode, &fie, offset, maxsize - offset); 533 if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
502 set_fs(old_fs); 534 last += ret;
503 if (ret) 535 holeoff = (loff_t)last << blkbits;
504 break; 536 continue;
537 }
505 538
506 /* No extents found */ 539 /*
507 if (!fie.fi_extents_mapped) 540 * If there is a delayed extent at this offset,
508 break; 541 * we will skip this extent.
542 */
543 ext4_es_find_delayed_extent_range(inode, last, last, &es);
544 if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
545 last = es.es_lblk + es.es_len;
546 holeoff = (loff_t)last << blkbits;
547 continue;
548 }
509 549
510 for (i = 0; i < fie.fi_extents_mapped; i++) { 550 /*
511 next = (loff_t)(ext[i].fe_logical + ext[i].fe_length); 551 * If there is an unwritten extent at this offset,
512 /* 552 * it will be treated as data or a hole according to
513 * If extent is not unwritten, then it contains valid 553 * whether the page cache has data.
514 * data, mapped or delayed. 554 */
515 */ 555 if (map.m_flags & EXT4_MAP_UNWRITTEN) {
516 if (!(ext[i].fe_flags & FIEMAP_EXTENT_UNWRITTEN)) { 556 int unwritten;
517 if (offset < (loff_t)ext[i].fe_logical) 557 unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
518 goto out; 558 &map, &holeoff);
519 offset = next; 559 if (!unwritten) {
560 last += ret;
561 holeoff = (loff_t)last << blkbits;
520 continue; 562 continue;
521 } 563 }
522 /*
523 * If there is a unwritten extent at this offset,
524 * it will be as a data or a hole according to page
525 * cache that has data or not.
526 */
527 if (ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
528 next, &offset))
529 goto out;
530
531 offset = next;
532 if (ext[i].fe_flags & FIEMAP_EXTENT_LAST)
533 goto out;
534 } 564 }
535 } 565
536 if (offset > inode->i_size) 566 /* find a hole */
537 offset = inode->i_size; 567 break;
538out: 568 } while (last <= end);
569
539 mutex_unlock(&inode->i_mutex); 570 mutex_unlock(&inode->i_mutex);
540 if (ret)
541 return ret;
542 571
543 return vfs_setpos(file, offset, maxsize); 572 if (holeoff > isize)
573 holeoff = isize;
574
575 return vfs_setpos(file, holeoff, maxsize);
544} 576}
545 577
546/* 578/*
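
The rewritten SEEK_DATA path walks logical block ranges with ext4_map_blocks() instead of calling ext4_fiemap() under set_fs(): written and delayed extents terminate the scan as data, while unwritten extents defer to the page cache. A toy model of the loop over a 16-block file (classification simplified; the unwritten/page-cache case is omitted):

#include <stdio.h>

enum ext_kind { HOLE, WRITTEN, DELAYED, UNWRITTEN };

/* Per-block classification of a toy file; unlisted entries
 * default to HOLE (enum value 0). */
static const enum ext_kind blocks[16] = { HOLE, HOLE, DELAYED, WRITTEN };

/* Shape of the SEEK_DATA loop: advance block by block until
 * something that counts as data is found. */
static long seek_data_blk(long start, long nblocks)
{
	long blk;

	for (blk = start; blk < nblocks; blk++)
		if (blocks[blk] == WRITTEN || blocks[blk] == DELAYED)
			return blk;
	return -1;	/* maps to -ENXIO: no data at or past start */
}

int main(void)
{
	printf("%ld\n", seek_data_blk(0, 16));	/* 2: first delayed block */
	printf("%ld\n", seek_data_blk(4, 16));	/* -1: only holes remain */
	return 0;
}
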
@@ -592,6 +624,26 @@ const struct file_operations ext4_file_operations = {
592 .fallocate = ext4_fallocate, 624 .fallocate = ext4_fallocate,
593}; 625};
594 626
627#ifdef CONFIG_FS_DAX
628const struct file_operations ext4_dax_file_operations = {
629 .llseek = ext4_llseek,
630 .read = new_sync_read,
631 .write = new_sync_write,
632 .read_iter = generic_file_read_iter,
633 .write_iter = ext4_file_write_iter,
634 .unlocked_ioctl = ext4_ioctl,
635#ifdef CONFIG_COMPAT
636 .compat_ioctl = ext4_compat_ioctl,
637#endif
638 .mmap = ext4_file_mmap,
639 .open = ext4_file_open,
640 .release = ext4_release_file,
641 .fsync = ext4_sync_file,
642 /* Splice not yet supported with DAX */
643 .fallocate = ext4_fallocate,
644};
645#endif
646
595const struct inode_operations ext4_file_inode_operations = { 647const struct inode_operations ext4_file_inode_operations = {
596 .setattr = ext4_setattr, 648 .setattr = ext4_setattr,
597 .getattr = ext4_getattr, 649 .getattr = ext4_getattr,
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 36b369697a13..6b9878a24182 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -689,14 +689,22 @@ retry:
689 inode_dio_done(inode); 689 inode_dio_done(inode);
690 goto locked; 690 goto locked;
691 } 691 }
692 ret = __blockdev_direct_IO(rw, iocb, inode, 692 if (IS_DAX(inode))
693 inode->i_sb->s_bdev, iter, offset, 693 ret = dax_do_io(rw, iocb, inode, iter, offset,
694 ext4_get_block, NULL, NULL, 0); 694 ext4_get_block, NULL, 0);
695 else
696 ret = __blockdev_direct_IO(rw, iocb, inode,
697 inode->i_sb->s_bdev, iter, offset,
698 ext4_get_block, NULL, NULL, 0);
695 inode_dio_done(inode); 699 inode_dio_done(inode);
696 } else { 700 } else {
697locked: 701locked:
698 ret = blockdev_direct_IO(rw, iocb, inode, iter, 702 if (IS_DAX(inode))
699 offset, ext4_get_block); 703 ret = dax_do_io(rw, iocb, inode, iter, offset,
704 ext4_get_block, NULL, DIO_LOCKING);
705 else
706 ret = blockdev_direct_IO(rw, iocb, inode, iter,
707 offset, ext4_get_block);
700 708
701 if (unlikely((rw & WRITE) && ret < 0)) { 709 if (unlikely((rw & WRITE) && ret < 0)) {
702 loff_t isize = i_size_read(inode); 710 loff_t isize = i_size_read(inode);
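
Both branches of ext4_ind_direct_IO() now share the same shape: probe IS_DAX(inode) and route the request either to dax_do_io(), which copies through the persistent-memory mapping without building bios, or to the ordinary blockdev direct-IO engine. The dispatch in isolation, with stub backends standing in for the kernel functions:

#include <stdio.h>

struct fake_inode { int s_dax; };
#define IS_DAX(i) ((i)->s_dax)

static long dax_do_io_stub(const char *what)
{ printf("dax_do_io: %s\n", what); return 0; }

static long blockdev_dio_stub(const char *what)
{ printf("__blockdev_direct_IO: %s\n", what); return 0; }

/* The patch repeats this two-way split in both the locked and
 * unlocked branches: same get_block callback, different engine. */
static long direct_io(struct fake_inode *inode, const char *what)
{
	if (IS_DAX(inode))
		return dax_do_io_stub(what);
	return blockdev_dio_stub(what);
}

int main(void)
{
	struct fake_inode pmem = { 1 }, disk = { 0 };

	direct_io(&pmem, "read");
	direct_io(&disk, "read");
	return 0;
}
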
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5653fa42930b..85404f15e53a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -657,6 +657,18 @@ has_zeroout:
657 return retval; 657 return retval;
658} 658}
659 659
660static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
661{
662 struct inode *inode = bh->b_assoc_map->host;
663 /* XXX: breaks on 32-bit > 16GB. Is that even supported? */
664 loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
665 int err;
666 if (!uptodate)
667 return;
668 WARN_ON(!buffer_unwritten(bh));
669 err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
670}
671
660/* Maximum number of blocks we map for direct IO at once. */ 672/* Maximum number of blocks we map for direct IO at once. */
661#define DIO_MAX_BLOCKS 4096 673#define DIO_MAX_BLOCKS 4096
662 674
@@ -694,6 +706,11 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
694 706
695 map_bh(bh, inode->i_sb, map.m_pblk); 707 map_bh(bh, inode->i_sb, map.m_pblk);
696 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; 708 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
709 if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) {
710 bh->b_assoc_map = inode->i_mapping;
711 bh->b_private = (void *)(unsigned long)iblock;
712 bh->b_end_io = ext4_end_io_unwritten;
713 }
697 if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN) 714 if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
698 set_buffer_defer_completion(bh); 715 set_buffer_defer_completion(bh);
699 bh->b_size = inode->i_sb->s_blocksize * map.m_len; 716 bh->b_size = inode->i_sb->s_blocksize * map.m_len;
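
To let ext4_end_io_unwritten() recover the byte offset, _ext4_get_block() smuggles the logical block number through bh->b_private as a cast integer; the XXX above notes that this truncates where pointers are narrower than the block range. The round-trip idiom by itself:

#include <stdio.h>

/* Round-tripping an integer through a void * works only while the
 * value fits in a pointer -- hence the patch's XXX about 32-bit
 * kernels with offsets past what 32 bits of blocks can express. */
int main(void)
{
	unsigned long iblock = 123456;
	unsigned int blkbits = 12;	/* 4K blocks */
	void *cookie;
	long long offset;

	cookie = (void *)iblock;				/* store */
	offset = (long long)(unsigned long)cookie << blkbits;	/* load */

	printf("offset = %lld\n", offset);	/* 505675776 */
	return 0;
}
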
@@ -3010,13 +3027,14 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3010 get_block_func = ext4_get_block_write; 3027 get_block_func = ext4_get_block_write;
3011 dio_flags = DIO_LOCKING; 3028 dio_flags = DIO_LOCKING;
3012 } 3029 }
3013 ret = __blockdev_direct_IO(rw, iocb, inode, 3030 if (IS_DAX(inode))
3014 inode->i_sb->s_bdev, iter, 3031 ret = dax_do_io(rw, iocb, inode, iter, offset, get_block_func,
3015 offset, 3032 ext4_end_io_dio, dio_flags);
3016 get_block_func, 3033 else
3017 ext4_end_io_dio, 3034 ret = __blockdev_direct_IO(rw, iocb, inode,
3018 NULL, 3035 inode->i_sb->s_bdev, iter, offset,
3019 dio_flags); 3036 get_block_func,
3037 ext4_end_io_dio, NULL, dio_flags);
3020 3038
3021 /* 3039 /*
3022 * Put our reference to io_end. This can free the io_end structure e.g. 3040 * Put our reference to io_end. This can free the io_end structure e.g.
@@ -3180,19 +3198,12 @@ void ext4_set_aops(struct inode *inode)
3180 inode->i_mapping->a_ops = &ext4_aops; 3198 inode->i_mapping->a_ops = &ext4_aops;
3181} 3199}
3182 3200
3183/* 3201static int __ext4_block_zero_page_range(handle_t *handle,
3184 * ext4_block_zero_page_range() zeros out a mapping of length 'length'
3185 * starting from file offset 'from'. The range to be zero'd must
3186 * be contained within one block. If the specified range exceeds
3187 * the end of the block, it will be shortened to the end of the block
3188 * that corresponds to 'from'.
3189 */
3190static int ext4_block_zero_page_range(handle_t *handle,
3191 struct address_space *mapping, loff_t from, loff_t length) 3202 struct address_space *mapping, loff_t from, loff_t length)
3192{ 3203{
3193 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; 3204 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
3194 unsigned offset = from & (PAGE_CACHE_SIZE-1); 3205 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3195 unsigned blocksize, max, pos; 3206 unsigned blocksize, pos;
3196 ext4_lblk_t iblock; 3207 ext4_lblk_t iblock;
3197 struct inode *inode = mapping->host; 3208 struct inode *inode = mapping->host;
3198 struct buffer_head *bh; 3209 struct buffer_head *bh;
@@ -3205,14 +3216,6 @@ static int ext4_block_zero_page_range(handle_t *handle,
3205 return -ENOMEM; 3216 return -ENOMEM;
3206 3217
3207 blocksize = inode->i_sb->s_blocksize; 3218 blocksize = inode->i_sb->s_blocksize;
3208 max = blocksize - (offset & (blocksize - 1));
3209
3210 /*
3211 * correct length if it does not fall between
3212 * 'from' and the end of the block
3213 */
3214 if (length > max || length < 0)
3215 length = max;
3216 3219
3217 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); 3220 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
3218 3221
@@ -3278,6 +3281,33 @@ unlock:
3278} 3281}
3279 3282
3280/* 3283/*
3284 * ext4_block_zero_page_range() zeros out a mapping of length 'length'
3285 * starting from file offset 'from'. The range to be zero'd must
3286 * be contained within one block. If the specified range exceeds
3287 * the end of the block, it will be shortened to the end of the block
3288 * that corresponds to 'from'.
3289 */
3290static int ext4_block_zero_page_range(handle_t *handle,
3291 struct address_space *mapping, loff_t from, loff_t length)
3292{
3293 struct inode *inode = mapping->host;
3294 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3295 unsigned blocksize = inode->i_sb->s_blocksize;
3296 unsigned max = blocksize - (offset & (blocksize - 1));
3297
3298 /*
3299 * correct length if it does not fall between
3300 * 'from' and the end of the block
3301 */
3302 if (length > max || length < 0)
3303 length = max;
3304
3305 if (IS_DAX(inode))
3306 return dax_zero_page_range(inode, from, length, ext4_get_block);
3307 return __ext4_block_zero_page_range(handle, mapping, from, length);
3308}
3309
3310/*
3281 * ext4_block_truncate_page() zeroes out a mapping from file offset `from' 3311 * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
3282 * up to the end of the block which corresponds to `from'. 3312 * up to the end of the block which corresponds to `from'.
3283 * This is required during truncate. We need to physically zero the tail end 3313 * This is required during truncate. We need to physically zero the tail end
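
The clamp that used to live inside the zeroing helper moves into the new ext4_block_zero_page_range() wrapper so the DAX and buffered paths share it. What it computes, in isolation (assumes a power-of-two block size, as in the kernel):

#include <stdio.h>

/* 'max' is the room left between 'from' and the end of its block;
 * out-of-range lengths (too long, or negative) are clamped to it. */
static long long clamp_length(long long from, long long length,
			      unsigned blocksize)
{
	unsigned offset = from & (blocksize - 1);	/* blocksize is 2^n */
	unsigned max = blocksize - offset;

	if (length > max || length < 0)
		length = max;
	return length;
}

int main(void)
{
	printf("%lld\n", clamp_length(4000, 9999, 4096));	/* 96 */
	printf("%lld\n", clamp_length(4000, 50, 4096));		/* 50 */
	printf("%lld\n", clamp_length(4000, -1, 4096));		/* 96 */
	return 0;
}
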
@@ -3798,8 +3828,10 @@ void ext4_set_inode_flags(struct inode *inode)
3798 new_fl |= S_NOATIME; 3828 new_fl |= S_NOATIME;
3799 if (flags & EXT4_DIRSYNC_FL) 3829 if (flags & EXT4_DIRSYNC_FL)
3800 new_fl |= S_DIRSYNC; 3830 new_fl |= S_DIRSYNC;
3831 if (test_opt(inode->i_sb, DAX))
3832 new_fl |= S_DAX;
3801 inode_set_flags(inode, new_fl, 3833 inode_set_flags(inode, new_fl,
3802 S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); 3834 S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX);
3803} 3835}
3804 3836
3805/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ 3837/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
@@ -4052,7 +4084,10 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4052 4084
4053 if (S_ISREG(inode->i_mode)) { 4085 if (S_ISREG(inode->i_mode)) {
4054 inode->i_op = &ext4_file_inode_operations; 4086 inode->i_op = &ext4_file_inode_operations;
4055 inode->i_fop = &ext4_file_operations; 4087 if (test_opt(inode->i_sb, DAX))
4088 inode->i_fop = &ext4_dax_file_operations;
4089 else
4090 inode->i_fop = &ext4_file_operations;
4056 ext4_set_aops(inode); 4091 ext4_set_aops(inode);
4057 } else if (S_ISDIR(inode->i_mode)) { 4092 } else if (S_ISDIR(inode->i_mode)) {
4058 inode->i_op = &ext4_dir_inode_operations; 4093 inode->i_op = &ext4_dir_inode_operations;
@@ -4139,6 +4174,65 @@ static int ext4_inode_blocks_set(handle_t *handle,
4139 return 0; 4174 return 0;
4140} 4175}
4141 4176
4177struct other_inode {
4178 unsigned long orig_ino;
4179 struct ext4_inode *raw_inode;
4180};
4181
4182static int other_inode_match(struct inode * inode, unsigned long ino,
4183 void *data)
4184{
4185 struct other_inode *oi = (struct other_inode *) data;
4186
4187 if ((inode->i_ino != ino) ||
4188 (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW |
4189 I_DIRTY_SYNC | I_DIRTY_DATASYNC)) ||
4190 ((inode->i_state & I_DIRTY_TIME) == 0))
4191 return 0;
4192 spin_lock(&inode->i_lock);
4193 if (((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW |
4194 I_DIRTY_SYNC | I_DIRTY_DATASYNC)) == 0) &&
4195 (inode->i_state & I_DIRTY_TIME)) {
4196 struct ext4_inode_info *ei = EXT4_I(inode);
4197
4198 inode->i_state &= ~(I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED);
4199 spin_unlock(&inode->i_lock);
4200
4201 spin_lock(&ei->i_raw_lock);
4202 EXT4_INODE_SET_XTIME(i_ctime, inode, oi->raw_inode);
4203 EXT4_INODE_SET_XTIME(i_mtime, inode, oi->raw_inode);
4204 EXT4_INODE_SET_XTIME(i_atime, inode, oi->raw_inode);
4205 ext4_inode_csum_set(inode, oi->raw_inode, ei);
4206 spin_unlock(&ei->i_raw_lock);
4207 trace_ext4_other_inode_update_time(inode, oi->orig_ino);
4208 return -1;
4209 }
4210 spin_unlock(&inode->i_lock);
4211 return -1;
4212}
4213
4214/*
4215 * Opportunistically update the other time fields for other inodes in
4216 * the same inode table block.
4217 */
4218static void ext4_update_other_inodes_time(struct super_block *sb,
4219 unsigned long orig_ino, char *buf)
4220{
4221 struct other_inode oi;
4222 unsigned long ino;
4223 int i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
4224 int inode_size = EXT4_INODE_SIZE(sb);
4225
4226 oi.orig_ino = orig_ino;
4227 ino = orig_ino & ~(inodes_per_block - 1);
4228 for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) {
4229 if (ino == orig_ino)
4230 continue;
4231 oi.raw_inode = (struct ext4_inode *) buf;
4232 (void) find_inode_nowait(sb, ino, other_inode_match, &oi);
4233 }
4234}
4235
4142/* 4236/*
4143 * Post the struct inode info into an on-disk inode location in the 4237 * Post the struct inode info into an on-disk inode location in the
4144 * buffer-cache. This gobbles the caller's reference to the 4238 * buffer-cache. This gobbles the caller's reference to the
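
ext4_update_other_inodes_time() locates the first inode of the containing table block by masking off the low bits, which relies on inodes_per_block being a power of two, and then visits every sibling slot except the one just written. The indexing on its own:

#include <stdio.h>

int main(void)
{
	unsigned long orig_ino = 1234;
	int inodes_per_block = 16;	/* must be a power of two */
	unsigned long ino = orig_ino & ~(unsigned long)(inodes_per_block - 1);
	int i;

	/* Visit every inode sharing orig_ino's table block, skipping
	 * orig_ino itself -- the shape of the loop in the patch. */
	for (i = 0; i < inodes_per_block; i++, ino++) {
		if (ino == orig_ino)
			continue;
		printf("%lu ", ino);	/* 1232 1233 1235 ... 1247 */
	}
	printf("\n");
	return 0;
}
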
@@ -4248,10 +4342,11 @@ static int ext4_do_update_inode(handle_t *handle,
4248 cpu_to_le16(ei->i_extra_isize); 4342 cpu_to_le16(ei->i_extra_isize);
4249 } 4343 }
4250 } 4344 }
4251
4252 ext4_inode_csum_set(inode, raw_inode, ei); 4345 ext4_inode_csum_set(inode, raw_inode, ei);
4253
4254 spin_unlock(&ei->i_raw_lock); 4346 spin_unlock(&ei->i_raw_lock);
4347 if (inode->i_sb->s_flags & MS_LAZYTIME)
4348 ext4_update_other_inodes_time(inode->i_sb, inode->i_ino,
4349 bh->b_data);
4255 4350
4256 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 4351 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
4257 rc = ext4_handle_dirty_metadata(handle, NULL, bh); 4352 rc = ext4_handle_dirty_metadata(handle, NULL, bh);
@@ -4534,7 +4629,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
4534 * Truncate pagecache after we've waited for commit 4629 * Truncate pagecache after we've waited for commit
4535 * in data=journal mode to make pages freeable. 4630 * in data=journal mode to make pages freeable.
4536 */ 4631 */
4537 truncate_pagecache(inode, inode->i_size); 4632 truncate_pagecache(inode, inode->i_size);
4538 } 4633 }
4539 /* 4634 /*
4540 * We want to call ext4_truncate() even if attr->ia_size == 4635 * We want to call ext4_truncate() even if attr->ia_size ==
@@ -4840,11 +4935,17 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
4840 * If the inode is marked synchronous, we don't honour that here - doing 4935 * If the inode is marked synchronous, we don't honour that here - doing
4841 * so would cause a commit on atime updates, which we don't bother doing. 4936 * so would cause a commit on atime updates, which we don't bother doing.
4842 * We handle synchronous inodes at the highest possible level. 4937 * We handle synchronous inodes at the highest possible level.
4938 *
4939 * If only the I_DIRTY_TIME flag is set, we can skip everything. If
4940 * I_DIRTY_TIME and I_DIRTY_SYNC are set, the only inode fields we need
4941 * to copy into the on-disk inode structure are the timestamp fields.
4843 */ 4942 */
4844void ext4_dirty_inode(struct inode *inode, int flags) 4943void ext4_dirty_inode(struct inode *inode, int flags)
4845{ 4944{
4846 handle_t *handle; 4945 handle_t *handle;
4847 4946
4947 if (flags == I_DIRTY_TIME)
4948 return;
4848 handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); 4949 handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
4849 if (IS_ERR(handle)) 4950 if (IS_ERR(handle))
4850 goto out; 4951 goto out;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 2291923dae4e..28fe71a2904c 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2235,7 +2235,10 @@ retry:
2235 err = PTR_ERR(inode); 2235 err = PTR_ERR(inode);
2236 if (!IS_ERR(inode)) { 2236 if (!IS_ERR(inode)) {
2237 inode->i_op = &ext4_file_inode_operations; 2237 inode->i_op = &ext4_file_inode_operations;
2238 inode->i_fop = &ext4_file_operations; 2238 if (test_opt(inode->i_sb, DAX))
2239 inode->i_fop = &ext4_dax_file_operations;
2240 else
2241 inode->i_fop = &ext4_file_operations;
2239 ext4_set_aops(inode); 2242 ext4_set_aops(inode);
2240 err = ext4_add_nondir(handle, dentry, inode); 2243 err = ext4_add_nondir(handle, dentry, inode);
2241 if (!err && IS_DIRSYNC(dir)) 2244 if (!err && IS_DIRSYNC(dir))
@@ -2299,7 +2302,10 @@ retry:
2299 err = PTR_ERR(inode); 2302 err = PTR_ERR(inode);
2300 if (!IS_ERR(inode)) { 2303 if (!IS_ERR(inode)) {
2301 inode->i_op = &ext4_file_inode_operations; 2304 inode->i_op = &ext4_file_inode_operations;
2302 inode->i_fop = &ext4_file_operations; 2305 if (test_opt(inode->i_sb, DAX))
2306 inode->i_fop = &ext4_dax_file_operations;
2307 else
2308 inode->i_fop = &ext4_file_operations;
2303 ext4_set_aops(inode); 2309 ext4_set_aops(inode);
2304 d_tmpfile(dentry, inode); 2310 d_tmpfile(dentry, inode);
2305 err = ext4_orphan_add(handle, inode); 2311 err = ext4_orphan_add(handle, inode);
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index bf76f405a5f9..8a8ec6293b19 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -24,6 +24,18 @@ int ext4_resize_begin(struct super_block *sb)
24 return -EPERM; 24 return -EPERM;
25 25
26 /* 26 /*
27 * If we are not using the primary superblock/GDT copy don't resize,
28 * because the user tools have no way of handling this. Probably a
29 * bad time to do it anyway.
30 */
31 if (EXT4_SB(sb)->s_sbh->b_blocknr !=
32 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) {
33 ext4_warning(sb, "won't resize using backup superblock at %llu",
34 (unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr);
35 return -EPERM;
36 }
37
38 /*
27 * We are not allowed to do online-resizing on a filesystem mounted 39 * We are not allowed to do online-resizing on a filesystem mounted
28 * with error, because it can destroy the filesystem easily. 40 * with error, because it can destroy the filesystem easily.
29 */ 41 */
@@ -758,18 +770,6 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
758 "EXT4-fs: ext4_add_new_gdb: adding group block %lu\n", 770 "EXT4-fs: ext4_add_new_gdb: adding group block %lu\n",
759 gdb_num); 771 gdb_num);
760 772
761 /*
762 * If we are not using the primary superblock/GDT copy don't resize,
763 * because the user tools have no way of handling this. Probably a
764 * bad time to do it anyway.
765 */
766 if (EXT4_SB(sb)->s_sbh->b_blocknr !=
767 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) {
768 ext4_warning(sb, "won't resize using backup superblock at %llu",
769 (unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr);
770 return -EPERM;
771 }
772
773 gdb_bh = sb_bread(sb, gdblock); 773 gdb_bh = sb_bread(sb, gdblock);
774 if (!gdb_bh) 774 if (!gdb_bh)
775 return -EIO; 775 return -EIO;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 43c92b1685cb..1adac6868e6f 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -334,7 +334,7 @@ static void save_error_info(struct super_block *sb, const char *func,
334static int block_device_ejected(struct super_block *sb) 334static int block_device_ejected(struct super_block *sb)
335{ 335{
336 struct inode *bd_inode = sb->s_bdev->bd_inode; 336 struct inode *bd_inode = sb->s_bdev->bd_inode;
337 struct backing_dev_info *bdi = bd_inode->i_mapping->backing_dev_info; 337 struct backing_dev_info *bdi = inode_to_bdi(bd_inode);
338 338
339 return bdi->dev == NULL; 339 return bdi->dev == NULL;
340} 340}
@@ -1046,10 +1046,7 @@ static int ext4_mark_dquot_dirty(struct dquot *dquot);
1046static int ext4_write_info(struct super_block *sb, int type); 1046static int ext4_write_info(struct super_block *sb, int type);
1047static int ext4_quota_on(struct super_block *sb, int type, int format_id, 1047static int ext4_quota_on(struct super_block *sb, int type, int format_id,
1048 struct path *path); 1048 struct path *path);
1049static int ext4_quota_on_sysfile(struct super_block *sb, int type,
1050 int format_id);
1051static int ext4_quota_off(struct super_block *sb, int type); 1049static int ext4_quota_off(struct super_block *sb, int type);
1052static int ext4_quota_off_sysfile(struct super_block *sb, int type);
1053static int ext4_quota_on_mount(struct super_block *sb, int type); 1050static int ext4_quota_on_mount(struct super_block *sb, int type);
1054static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, 1051static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
1055 size_t len, loff_t off); 1052 size_t len, loff_t off);
@@ -1084,16 +1081,6 @@ static const struct quotactl_ops ext4_qctl_operations = {
1084 .get_dqblk = dquot_get_dqblk, 1081 .get_dqblk = dquot_get_dqblk,
1085 .set_dqblk = dquot_set_dqblk 1082 .set_dqblk = dquot_set_dqblk
1086}; 1083};
1087
1088static const struct quotactl_ops ext4_qctl_sysfile_operations = {
1089 .quota_on_meta = ext4_quota_on_sysfile,
1090 .quota_off = ext4_quota_off_sysfile,
1091 .quota_sync = dquot_quota_sync,
1092 .get_info = dquot_get_dqinfo,
1093 .set_info = dquot_set_dqinfo,
1094 .get_dqblk = dquot_get_dqblk,
1095 .set_dqblk = dquot_set_dqblk
1096};
1097#endif 1084#endif
1098 1085
1099static const struct super_operations ext4_sops = { 1086static const struct super_operations ext4_sops = {
@@ -1137,8 +1124,9 @@ enum {
1137 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, 1124 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
1138 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, 1125 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
1139 Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, 1126 Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
1140 Opt_usrquota, Opt_grpquota, Opt_i_version, 1127 Opt_usrquota, Opt_grpquota, Opt_i_version, Opt_dax,
1141 Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit, 1128 Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit,
1129 Opt_lazytime, Opt_nolazytime,
1142 Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, 1130 Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
1143 Opt_inode_readahead_blks, Opt_journal_ioprio, 1131 Opt_inode_readahead_blks, Opt_journal_ioprio,
1144 Opt_dioread_nolock, Opt_dioread_lock, 1132 Opt_dioread_nolock, Opt_dioread_lock,
@@ -1200,8 +1188,11 @@ static const match_table_t tokens = {
1200 {Opt_barrier, "barrier"}, 1188 {Opt_barrier, "barrier"},
1201 {Opt_nobarrier, "nobarrier"}, 1189 {Opt_nobarrier, "nobarrier"},
1202 {Opt_i_version, "i_version"}, 1190 {Opt_i_version, "i_version"},
1191 {Opt_dax, "dax"},
1203 {Opt_stripe, "stripe=%u"}, 1192 {Opt_stripe, "stripe=%u"},
1204 {Opt_delalloc, "delalloc"}, 1193 {Opt_delalloc, "delalloc"},
1194 {Opt_lazytime, "lazytime"},
1195 {Opt_nolazytime, "nolazytime"},
1205 {Opt_nodelalloc, "nodelalloc"}, 1196 {Opt_nodelalloc, "nodelalloc"},
1206 {Opt_removed, "mblk_io_submit"}, 1197 {Opt_removed, "mblk_io_submit"},
1207 {Opt_removed, "nomblk_io_submit"}, 1198 {Opt_removed, "nomblk_io_submit"},
@@ -1384,6 +1375,7 @@ static const struct mount_opts {
1384 {Opt_min_batch_time, 0, MOPT_GTE0}, 1375 {Opt_min_batch_time, 0, MOPT_GTE0},
1385 {Opt_inode_readahead_blks, 0, MOPT_GTE0}, 1376 {Opt_inode_readahead_blks, 0, MOPT_GTE0},
1386 {Opt_init_itable, 0, MOPT_GTE0}, 1377 {Opt_init_itable, 0, MOPT_GTE0},
1378 {Opt_dax, EXT4_MOUNT_DAX, MOPT_SET},
1387 {Opt_stripe, 0, MOPT_GTE0}, 1379 {Opt_stripe, 0, MOPT_GTE0},
1388 {Opt_resuid, 0, MOPT_GTE0}, 1380 {Opt_resuid, 0, MOPT_GTE0},
1389 {Opt_resgid, 0, MOPT_GTE0}, 1381 {Opt_resgid, 0, MOPT_GTE0},
@@ -1459,6 +1451,12 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
1459 case Opt_i_version: 1451 case Opt_i_version:
1460 sb->s_flags |= MS_I_VERSION; 1452 sb->s_flags |= MS_I_VERSION;
1461 return 1; 1453 return 1;
1454 case Opt_lazytime:
1455 sb->s_flags |= MS_LAZYTIME;
1456 return 1;
1457 case Opt_nolazytime:
1458 sb->s_flags &= ~MS_LAZYTIME;
1459 return 1;
1462 } 1460 }
1463 1461
1464 for (m = ext4_mount_opts; m->token != Opt_err; m++) 1462 for (m = ext4_mount_opts; m->token != Opt_err; m++)
@@ -1620,6 +1618,11 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
1620 } 1618 }
1621 sbi->s_jquota_fmt = m->mount_opt; 1619 sbi->s_jquota_fmt = m->mount_opt;
1622#endif 1620#endif
1621#ifndef CONFIG_FS_DAX
1622 } else if (token == Opt_dax) {
1623 ext4_msg(sb, KERN_INFO, "dax option not supported");
1624 return -1;
1625#endif
1623 } else { 1626 } else {
1624 if (!args->from) 1627 if (!args->from)
1625 arg = 1; 1628 arg = 1;
@@ -3482,7 +3485,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3482 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 3485 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
3483 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && 3486 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
3484 EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) 3487 EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
3485 ext4_warning(sb, KERN_INFO "metadata_csum and uninit_bg are " 3488 ext4_warning(sb, "metadata_csum and uninit_bg are "
3486 "redundant flags; please run fsck."); 3489 "redundant flags; please run fsck.");
3487 3490
3488 /* Check for a known checksum algorithm */ 3491 /* Check for a known checksum algorithm */
@@ -3602,6 +3605,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3602 "both data=journal and dioread_nolock"); 3605 "both data=journal and dioread_nolock");
3603 goto failed_mount; 3606 goto failed_mount;
3604 } 3607 }
3608 if (test_opt(sb, DAX)) {
3609 ext4_msg(sb, KERN_ERR, "can't mount with "
3610 "both data=journal and dax");
3611 goto failed_mount;
3612 }
3605 if (test_opt(sb, DELALLOC)) 3613 if (test_opt(sb, DELALLOC))
3606 clear_opt(sb, DELALLOC); 3614 clear_opt(sb, DELALLOC);
3607 } 3615 }
@@ -3665,6 +3673,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3665 goto failed_mount; 3673 goto failed_mount;
3666 } 3674 }
3667 3675
3676 if (sbi->s_mount_opt & EXT4_MOUNT_DAX) {
3677 if (blocksize != PAGE_SIZE) {
3678 ext4_msg(sb, KERN_ERR,
3679 "error: unsupported blocksize for dax");
3680 goto failed_mount;
3681 }
3682 if (!sb->s_bdev->bd_disk->fops->direct_access) {
3683 ext4_msg(sb, KERN_ERR,
3684 "error: device does not support dax");
3685 goto failed_mount;
3686 }
3687 }
3688
3668 if (sb->s_blocksize != blocksize) { 3689 if (sb->s_blocksize != blocksize) {
3669 /* Validate the filesystem blocksize */ 3690 /* Validate the filesystem blocksize */
3670 if (!sb_set_blocksize(sb, blocksize)) { 3691 if (!sb_set_blocksize(sb, blocksize)) {
@@ -3935,7 +3956,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3935#ifdef CONFIG_QUOTA 3956#ifdef CONFIG_QUOTA
3936 sb->dq_op = &ext4_quota_operations; 3957 sb->dq_op = &ext4_quota_operations;
3937 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) 3958 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA))
3938 sb->s_qcop = &ext4_qctl_sysfile_operations; 3959 sb->s_qcop = &dquot_quotactl_sysfile_ops;
3939 else 3960 else
3940 sb->s_qcop = &ext4_qctl_operations; 3961 sb->s_qcop = &ext4_qctl_operations;
3941 sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; 3962 sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
@@ -4882,6 +4903,18 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4882 err = -EINVAL; 4903 err = -EINVAL;
4883 goto restore_opts; 4904 goto restore_opts;
4884 } 4905 }
4906 if (test_opt(sb, DAX)) {
4907 ext4_msg(sb, KERN_ERR, "can't mount with "
4908 "both data=journal and dax");
4909 err = -EINVAL;
4910 goto restore_opts;
4911 }
4912 }
4913
4914 if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX) {
4915 ext4_msg(sb, KERN_WARNING, "warning: refusing change of "
4916 "dax flag with busy inodes while remounting");
4917 sbi->s_mount_opt ^= EXT4_MOUNT_DAX;
4885 } 4918 }
4886 4919
4887 if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) 4920 if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
@@ -5020,6 +5053,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
5020 } 5053 }
5021#endif 5054#endif
5022 5055
5056 *flags = (*flags & ~MS_LAZYTIME) | (sb->s_flags & MS_LAZYTIME);
5023 ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data); 5057 ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data);
5024 kfree(orig_data); 5058 kfree(orig_data);
5025 return 0; 5059 return 0;
@@ -5288,21 +5322,6 @@ static int ext4_enable_quotas(struct super_block *sb)
5288 return 0; 5322 return 0;
5289} 5323}
5290 5324
5291/*
5292 * quota_on function that is used when QUOTA feature is set.
5293 */
5294static int ext4_quota_on_sysfile(struct super_block *sb, int type,
5295 int format_id)
5296{
5297 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA))
5298 return -EINVAL;
5299
5300 /*
5301 * USAGE was enabled at mount time. Only need to enable LIMITS now.
5302 */
5303 return ext4_quota_enable(sb, type, format_id, DQUOT_LIMITS_ENABLED);
5304}
5305
5306static int ext4_quota_off(struct super_block *sb, int type) 5325static int ext4_quota_off(struct super_block *sb, int type)
5307{ 5326{
5308 struct inode *inode = sb_dqopt(sb)->files[type]; 5327 struct inode *inode = sb_dqopt(sb)->files[type];
@@ -5329,18 +5348,6 @@ out:
5329 return dquot_quota_off(sb, type); 5348 return dquot_quota_off(sb, type);
5330} 5349}
5331 5350
5332/*
5333 * quota_off function that is used when QUOTA feature is set.
5334 */
5335static int ext4_quota_off_sysfile(struct super_block *sb, int type)
5336{
5337 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA))
5338 return -EINVAL;
5339
5340 /* Disable only the limits. */
5341 return dquot_disable(sb, type, DQUOT_LIMITS_ENABLED);
5342}
5343
5344/* Read data from quotafile - avoid pagecache and such because we cannot afford 5351/* Read data from quotafile - avoid pagecache and such because we cannot afford
5345 * acquiring the locks... As quota files are never truncated and quota code 5352 * acquiring the locks... As quota files are never truncated and quota code
5346 * itself serializes the operations (and no one else should touch the files) 5353 * itself serializes the operations (and no one else should touch the files)
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index 736a348509f7..94e2d2ffabe1 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -71,3 +71,13 @@ config F2FS_CHECK_FS
71 Enables BUG_ONs which check the filesystem consistency in runtime. 71 Enables BUG_ONs which check the filesystem consistency in runtime.
72 72
73 If you want to improve the performance, say N. 73 If you want to improve the performance, say N.
74
75config F2FS_IO_TRACE
76 bool "F2FS IO tracer"
77 depends on F2FS_FS
78 depends on FUNCTION_TRACER
79 help
80 F2FS IO trace is based on the function tracer, which gathers process
81 information and block IO patterns at the filesystem level.
82
83 If unsure, say N.
diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile
index 2e35da12d292..d92397731db8 100644
--- a/fs/f2fs/Makefile
+++ b/fs/f2fs/Makefile
@@ -5,3 +5,4 @@ f2fs-y += checkpoint.o gc.o data.o node.o segment.o recovery.o
5f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o 5f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o
6f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o 6f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o
7f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o 7f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o
8f2fs-$(CONFIG_F2FS_IO_TRACE) += trace.o
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index 1ccb26bc2a0b..742202779bd5 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -62,7 +62,7 @@ static struct posix_acl *f2fs_acl_from_disk(const char *value, size_t size)
62 if (count == 0) 62 if (count == 0)
63 return NULL; 63 return NULL;
64 64
65 acl = posix_acl_alloc(count, GFP_KERNEL); 65 acl = posix_acl_alloc(count, GFP_NOFS);
66 if (!acl) 66 if (!acl)
67 return ERR_PTR(-ENOMEM); 67 return ERR_PTR(-ENOMEM);
68 68
@@ -116,7 +116,7 @@ static void *f2fs_acl_to_disk(const struct posix_acl *acl, size_t *size)
116 int i; 116 int i;
117 117
118 f2fs_acl = kmalloc(sizeof(struct f2fs_acl_header) + acl->a_count * 118 f2fs_acl = kmalloc(sizeof(struct f2fs_acl_header) + acl->a_count *
119 sizeof(struct f2fs_acl_entry), GFP_KERNEL); 119 sizeof(struct f2fs_acl_entry), GFP_NOFS);
120 if (!f2fs_acl) 120 if (!f2fs_acl)
121 return ERR_PTR(-ENOMEM); 121 return ERR_PTR(-ENOMEM);
122 122
@@ -396,7 +396,7 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage,
396 posix_acl_release(default_acl); 396 posix_acl_release(default_acl);
397 } 397 }
398 if (acl) { 398 if (acl) {
399 if (error) 399 if (!error)
400 error = __f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl, 400 error = __f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl,
401 ipage); 401 ipage);
402 posix_acl_release(acl); 402 posix_acl_release(acl);
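
The acl.c hunk above is a straight logic fix: the ACL_TYPE_ACCESS ACL was only being written when the preceding default-ACL update had already failed. The corrected chaining pattern, reduced to its skeleton:

#include <stdio.h>

static int step(const char *name, int fail)
{
	printf("%s\n", name);
	return fail ? -1 : 0;
}

int main(void)
{
	int error = 0;

	/* Chain the two ACL writes: the second runs only if the
	 * first succeeded -- 'if (!error)', not 'if (error)'. */
	error = step("set default acl", 0);
	if (!error)
		error = step("set access acl", 0);
	return error ? 1 : 0;
}
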
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index e6c271fefaca..7f794b72b3b7 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -20,10 +20,11 @@
 #include "f2fs.h"
 #include "node.h"
 #include "segment.h"
+#include "trace.h"
 #include <trace/events/f2fs.h>
 
 static struct kmem_cache *ino_entry_slab;
-static struct kmem_cache *inode_entry_slab;
+struct kmem_cache *inode_entry_slab;
 
 /*
  * We guarantee no failure on the returned page.
@@ -50,6 +51,11 @@ struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
 {
 	struct address_space *mapping = META_MAPPING(sbi);
 	struct page *page;
+	struct f2fs_io_info fio = {
+		.type = META,
+		.rw = READ_SYNC | REQ_META | REQ_PRIO,
+		.blk_addr = index,
+	};
 repeat:
 	page = grab_cache_page(mapping, index);
 	if (!page) {
@@ -59,8 +65,7 @@ repeat:
 	if (PageUptodate(page))
 		goto out;
 
-	if (f2fs_submit_page_bio(sbi, page, index,
-				READ_SYNC | REQ_META | REQ_PRIO))
+	if (f2fs_submit_page_bio(sbi, page, &fio))
 		goto repeat;
 
 	lock_page(page);
@@ -112,14 +117,12 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type
 	block_t prev_blk_addr = 0;
 	struct page *page;
 	block_t blkno = start;
-
 	struct f2fs_io_info fio = {
 		.type = META,
 		.rw = READ_SYNC | REQ_META | REQ_PRIO
 	};
 
 	for (; nrpages-- > 0; blkno++) {
-		block_t blk_addr;
 
 		if (!is_valid_blkaddr(sbi, blkno, type))
 			goto out;
@@ -130,27 +133,27 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type
 					NAT_BLOCK_OFFSET(NM_I(sbi)->max_nid)))
 				blkno = 0;
 			/* get nat block addr */
-			blk_addr = current_nat_addr(sbi,
+			fio.blk_addr = current_nat_addr(sbi,
 					blkno * NAT_ENTRY_PER_BLOCK);
 			break;
 		case META_SIT:
 			/* get sit block addr */
-			blk_addr = current_sit_addr(sbi,
+			fio.blk_addr = current_sit_addr(sbi,
 					blkno * SIT_ENTRY_PER_BLOCK);
-			if (blkno != start && prev_blk_addr + 1 != blk_addr)
+			if (blkno != start && prev_blk_addr + 1 != fio.blk_addr)
 				goto out;
-			prev_blk_addr = blk_addr;
+			prev_blk_addr = fio.blk_addr;
 			break;
 		case META_SSA:
 		case META_CP:
 		case META_POR:
-			blk_addr = blkno;
+			fio.blk_addr = blkno;
 			break;
 		default:
 			BUG();
 		}
 
-		page = grab_cache_page(META_MAPPING(sbi), blk_addr);
+		page = grab_cache_page(META_MAPPING(sbi), fio.blk_addr);
 		if (!page)
 			continue;
 		if (PageUptodate(page)) {
@@ -158,7 +161,7 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type
 			continue;
 		}
 
-		f2fs_submit_page_mbio(sbi, page, blk_addr, &fio);
+		f2fs_submit_page_mbio(sbi, page, &fio);
 		f2fs_put_page(page, 0);
 	}
 out:
@@ -187,7 +190,7 @@ static int f2fs_write_meta_page(struct page *page,
 
 	trace_f2fs_writepage(page, META);
 
-	if (unlikely(sbi->por_doing))
+	if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
 		goto redirty_out;
 	if (wbc->for_reclaim && page->index < GET_SUM_BLOCK(sbi, 0))
 		goto redirty_out;
@@ -299,6 +302,8 @@ static int f2fs_set_meta_page_dirty(struct page *page)
 	if (!PageDirty(page)) {
 		__set_page_dirty_nobuffers(page);
 		inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META);
+		SetPagePrivate(page);
+		f2fs_trace_pid(page);
 		return 1;
 	}
 	return 0;
@@ -308,6 +313,8 @@ const struct address_space_operations f2fs_meta_aops = {
 	.writepage	= f2fs_write_meta_page,
 	.writepages	= f2fs_write_meta_pages,
 	.set_page_dirty	= f2fs_set_meta_page_dirty,
+	.invalidatepage	= f2fs_invalidate_page,
+	.releasepage	= f2fs_release_page,
 };
 
 static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
@@ -462,7 +469,7 @@ void recover_orphan_inodes(struct f2fs_sb_info *sbi)
 	if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG))
 		return;
 
-	sbi->por_doing = true;
+	set_sbi_flag(sbi, SBI_POR_DOING);
 
 	start_blk = __start_cp_addr(sbi) + 1 +
 		le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload);
@@ -483,7 +490,7 @@ void recover_orphan_inodes(struct f2fs_sb_info *sbi)
 	}
 	/* clear Orphan Flag */
 	clear_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG);
-	sbi->por_doing = false;
+	clear_sbi_flag(sbi, SBI_POR_DOING);
 	return;
 }
 
@@ -567,7 +574,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
 	if (crc_offset >= blk_size)
 		goto invalid_cp1;
 
-	crc = le32_to_cpu(*((__u32 *)((unsigned char *)cp_block + crc_offset)));
+	crc = le32_to_cpu(*((__le32 *)((unsigned char *)cp_block + crc_offset)));
 	if (!f2fs_crc_valid(crc, cp_block, crc_offset))
 		goto invalid_cp1;
 
@@ -582,7 +589,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
 	if (crc_offset >= blk_size)
 		goto invalid_cp2;
 
-	crc = le32_to_cpu(*((__u32 *)((unsigned char *)cp_block + crc_offset)));
+	crc = le32_to_cpu(*((__le32 *)((unsigned char *)cp_block + crc_offset)));
 	if (!f2fs_crc_valid(crc, cp_block, crc_offset))
 		goto invalid_cp2;
 
@@ -669,7 +676,7 @@ fail_no_cp:
 	return -EINVAL;
 }
 
-static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new)
+static int __add_dirty_inode(struct inode *inode, struct inode_entry *new)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 
@@ -686,7 +693,7 @@ static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new)
 void update_dirty_page(struct inode *inode, struct page *page)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
-	struct dir_inode_entry *new;
+	struct inode_entry *new;
 	int ret = 0;
 
 	if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode))
@@ -710,12 +717,13 @@ void update_dirty_page(struct inode *inode, struct page *page)
 		kmem_cache_free(inode_entry_slab, new);
 out:
 	SetPagePrivate(page);
+	f2fs_trace_pid(page);
 }
 
 void add_dirty_dir_inode(struct inode *inode)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
-	struct dir_inode_entry *new =
+	struct inode_entry *new =
 			f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
 	int ret = 0;
 
@@ -733,7 +741,7 @@ void add_dirty_dir_inode(struct inode *inode)
 void remove_dirty_dir_inode(struct inode *inode)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
-	struct dir_inode_entry *entry;
+	struct inode_entry *entry;
 
 	if (!S_ISDIR(inode->i_mode))
 		return;
@@ -763,7 +771,7 @@ void remove_dirty_dir_inode(struct inode *inode)
 void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi)
 {
 	struct list_head *head;
-	struct dir_inode_entry *entry;
+	struct inode_entry *entry;
 	struct inode *inode;
retry:
 	if (unlikely(f2fs_cp_error(sbi)))
@@ -776,7 +784,7 @@ retry:
 		spin_unlock(&sbi->dir_inode_lock);
 		return;
 	}
-	entry = list_entry(head->next, struct dir_inode_entry, list);
+	entry = list_entry(head->next, struct inode_entry, list);
 	inode = igrab(entry->inode);
 	spin_unlock(&sbi->dir_inode_lock);
 	if (inode) {
@@ -922,7 +930,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 	ckpt->next_free_nid = cpu_to_le32(last_nid);
 
 	/* 2 cp + n data seg summary + orphan inode blocks */
-	data_sum_blocks = npages_for_summary_flush(sbi);
+	data_sum_blocks = npages_for_summary_flush(sbi, false);
 	if (data_sum_blocks < NR_CURSEG_DATA_TYPE)
 		set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
 	else
@@ -932,24 +940,31 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 	ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks +
 			orphan_blocks);
 
-	if (cpc->reason == CP_UMOUNT) {
-		set_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
+	if (__remain_node_summaries(cpc->reason))
 		ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS+
 				cp_payload_blks + data_sum_blocks +
 				orphan_blocks + NR_CURSEG_NODE_TYPE);
-	} else {
-		clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
+	else
 		ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS +
 				cp_payload_blks + data_sum_blocks +
 				orphan_blocks);
-	}
+
+	if (cpc->reason == CP_UMOUNT)
+		set_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
+	else
+		clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
+
+	if (cpc->reason == CP_FASTBOOT)
+		set_ckpt_flags(ckpt, CP_FASTBOOT_FLAG);
+	else
+		clear_ckpt_flags(ckpt, CP_FASTBOOT_FLAG);
 
 	if (orphan_num)
 		set_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
 	else
 		clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
 
-	if (sbi->need_fsck)
+	if (is_sbi_flag_set(sbi, SBI_NEED_FSCK))
 		set_ckpt_flags(ckpt, CP_FSCK_FLAG);
 
 	/* update SIT/NAT bitmap */
@@ -966,15 +981,14 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 	/* write out checkpoint buffer at block 0 */
 	cp_page = grab_meta_page(sbi, start_blk++);
 	kaddr = page_address(cp_page);
-	memcpy(kaddr, ckpt, (1 << sbi->log_blocksize));
+	memcpy(kaddr, ckpt, F2FS_BLKSIZE);
 	set_page_dirty(cp_page);
 	f2fs_put_page(cp_page, 1);
 
 	for (i = 1; i < 1 + cp_payload_blks; i++) {
 		cp_page = grab_meta_page(sbi, start_blk++);
 		kaddr = page_address(cp_page);
-		memcpy(kaddr, (char *)ckpt + i * F2FS_BLKSIZE,
-				(1 << sbi->log_blocksize));
+		memcpy(kaddr, (char *)ckpt + i * F2FS_BLKSIZE, F2FS_BLKSIZE);
 		set_page_dirty(cp_page);
 		f2fs_put_page(cp_page, 1);
 	}
@@ -986,7 +1000,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 
 	write_data_summaries(sbi, start_blk);
 	start_blk += data_sum_blocks;
-	if (cpc->reason == CP_UMOUNT) {
+	if (__remain_node_summaries(cpc->reason)) {
 		write_node_summaries(sbi, start_blk);
 		start_blk += NR_CURSEG_NODE_TYPE;
 	}
@@ -994,7 +1008,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 	/* writeout checkpoint block */
 	cp_page = grab_meta_page(sbi, start_blk);
 	kaddr = page_address(cp_page);
-	memcpy(kaddr, ckpt, (1 << sbi->log_blocksize));
+	memcpy(kaddr, ckpt, F2FS_BLKSIZE);
 	set_page_dirty(cp_page);
 	f2fs_put_page(cp_page, 1);
 
@@ -1023,7 +1037,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 		return;
 
 	clear_prefree_segments(sbi);
-	F2FS_RESET_SB_DIRT(sbi);
+	clear_sbi_flag(sbi, SBI_IS_DIRTY);
 }
 
 /*
@@ -1038,10 +1052,13 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 
 	mutex_lock(&sbi->cp_mutex);
 
-	if (!sbi->s_dirty && cpc->reason != CP_DISCARD)
+	if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) &&
+			cpc->reason != CP_DISCARD && cpc->reason != CP_UMOUNT)
 		goto out;
 	if (unlikely(f2fs_cp_error(sbi)))
 		goto out;
+	if (f2fs_readonly(sbi->sb))
+		goto out;
 	if (block_operations(sbi))
 		goto out;
 
@@ -1102,8 +1119,8 @@ int __init create_checkpoint_caches(void)
 			sizeof(struct ino_entry));
 	if (!ino_entry_slab)
 		return -ENOMEM;
-	inode_entry_slab = f2fs_kmem_cache_create("f2fs_dirty_dir_entry",
-			sizeof(struct dir_inode_entry));
+	inode_entry_slab = f2fs_kmem_cache_create("f2fs_inode_entry",
+			sizeof(struct inode_entry));
 	if (!inode_entry_slab) {
 		kmem_cache_destroy(ino_entry_slab);
 		return -ENOMEM;
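
Note on the checkpoint.c changes above: the loose per-sbi booleans (s_dirty,
por_doing, need_fsck) collapse into a single s_flag bitfield manipulated
through helpers, whose definitions appear in the f2fs.h hunks later in this
diff. A hedged usage sketch mirroring the conversions above:

	/* before: sbi->por_doing = true; ... sbi->por_doing = false; */
	set_sbi_flag(sbi, SBI_POR_DOING);	/* enter power-off recovery */
	/* ... recovery work ... */
	clear_sbi_flag(sbi, SBI_POR_DOING);	/* leave recovery */

	/* before: if (sbi->s_dirty) */
	if (is_sbi_flag_set(sbi, SBI_IS_DIRTY))
		write_checkpoint(sbi, &cpc);	/* checkpoint is needed */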
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 7ec697b37f19..985ed023a750 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -22,6 +22,7 @@
 #include "f2fs.h"
 #include "node.h"
 #include "segment.h"
+#include "trace.h"
 #include <trace/events/f2fs.h>
 
 static void f2fs_read_end_io(struct bio *bio, int err)
@@ -95,11 +96,9 @@ static void __submit_merged_bio(struct f2fs_bio_info *io)
 		return;
 
 	if (is_read_io(fio->rw))
-		trace_f2fs_submit_read_bio(io->sbi->sb, fio->rw,
-						fio->type, io->bio);
+		trace_f2fs_submit_read_bio(io->sbi->sb, fio, io->bio);
 	else
-		trace_f2fs_submit_write_bio(io->sbi->sb, fio->rw,
-						fio->type, io->bio);
+		trace_f2fs_submit_write_bio(io->sbi->sb, fio, io->bio);
 
 	submit_bio(fio->rw, io->bio);
 	io->bio = NULL;
@@ -132,14 +131,15 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
  * Return unlocked page.
  */
 int f2fs_submit_page_bio(struct f2fs_sb_info *sbi, struct page *page,
-					block_t blk_addr, int rw)
+					struct f2fs_io_info *fio)
 {
 	struct bio *bio;
 
-	trace_f2fs_submit_page_bio(page, blk_addr, rw);
+	trace_f2fs_submit_page_bio(page, fio);
+	f2fs_trace_ios(page, fio, 0);
 
 	/* Allocate a new bio */
-	bio = __bio_alloc(sbi, blk_addr, 1, is_read_io(rw));
+	bio = __bio_alloc(sbi, fio->blk_addr, 1, is_read_io(fio->rw));
 
 	if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
 		bio_put(bio);
@@ -147,12 +147,12 @@ int f2fs_submit_page_bio(struct f2fs_sb_info *sbi, struct page *page,
 		return -EFAULT;
 	}
 
-	submit_bio(rw, bio);
+	submit_bio(fio->rw, bio);
 	return 0;
 }
 
 void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page,
-			block_t blk_addr, struct f2fs_io_info *fio)
+			struct f2fs_io_info *fio)
 {
 	enum page_type btype = PAGE_TYPE_OF_BIO(fio->type);
 	struct f2fs_bio_info *io;
@@ -160,21 +160,21 @@ void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page,
 
 	io = is_read ? &sbi->read_io : &sbi->write_io[btype];
 
-	verify_block_addr(sbi, blk_addr);
+	verify_block_addr(sbi, fio->blk_addr);
 
 	down_write(&io->io_rwsem);
 
 	if (!is_read)
 		inc_page_count(sbi, F2FS_WRITEBACK);
 
-	if (io->bio && (io->last_block_in_bio != blk_addr - 1 ||
+	if (io->bio && (io->last_block_in_bio != fio->blk_addr - 1 ||
 						io->fio.rw != fio->rw))
 		__submit_merged_bio(io);
alloc_new:
 	if (io->bio == NULL) {
 		int bio_blocks = MAX_BIO_BLOCKS(sbi);
 
-		io->bio = __bio_alloc(sbi, blk_addr, bio_blocks, is_read);
+		io->bio = __bio_alloc(sbi, fio->blk_addr, bio_blocks, is_read);
 		io->fio = *fio;
 	}
 
@@ -184,10 +184,11 @@ alloc_new:
 		goto alloc_new;
 	}
 
-	io->last_block_in_bio = blk_addr;
+	io->last_block_in_bio = fio->blk_addr;
+	f2fs_trace_ios(page, fio, 0);
 
 	up_write(&io->io_rwsem);
-	trace_f2fs_submit_page_mbio(page, fio->rw, fio->type, blk_addr);
+	trace_f2fs_submit_page_mbio(page, fio);
 }
 
 /*
@@ -196,7 +197,7 @@ alloc_new:
  *  ->node_page
  *    update block addresses in the node page
  */
-static void __set_data_blkaddr(struct dnode_of_data *dn, block_t new_addr)
+static void __set_data_blkaddr(struct dnode_of_data *dn)
 {
 	struct f2fs_node *rn;
 	__le32 *addr_array;
@@ -209,7 +210,7 @@ static void __set_data_blkaddr(struct dnode_of_data *dn, block_t new_addr)
 
 	/* Get physical address of data block */
 	addr_array = blkaddr_in_node(rn);
-	addr_array[ofs_in_node] = cpu_to_le32(new_addr);
+	addr_array[ofs_in_node] = cpu_to_le32(dn->data_blkaddr);
 	set_page_dirty(node_page);
 }
 
@@ -224,8 +225,8 @@ int reserve_new_block(struct dnode_of_data *dn)
 
 	trace_f2fs_reserve_new_block(dn->inode, dn->nid, dn->ofs_in_node);
 
-	__set_data_blkaddr(dn, NEW_ADDR);
 	dn->data_blkaddr = NEW_ADDR;
+	__set_data_blkaddr(dn);
 	mark_inode_dirty(dn->inode);
 	sync_inode_page(dn);
 	return 0;
@@ -273,7 +274,7 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
 		unsigned int blkbits = inode->i_sb->s_blocksize_bits;
 		size_t count;
 
-		clear_buffer_new(bh_result);
+		set_buffer_new(bh_result);
 		map_bh(bh_result, inode->i_sb,
 				start_blkaddr + pgofs - start_fofs);
 		count = end_fofs - pgofs + 1;
@@ -290,23 +291,24 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
 	return 0;
 }
 
-void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn)
+void update_extent_cache(struct dnode_of_data *dn)
 {
 	struct f2fs_inode_info *fi = F2FS_I(dn->inode);
 	pgoff_t fofs, start_fofs, end_fofs;
 	block_t start_blkaddr, end_blkaddr;
 	int need_update = true;
 
-	f2fs_bug_on(F2FS_I_SB(dn->inode), blk_addr == NEW_ADDR);
-	fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
-							dn->ofs_in_node;
+	f2fs_bug_on(F2FS_I_SB(dn->inode), dn->data_blkaddr == NEW_ADDR);
 
 	/* Update the page address in the parent node */
-	__set_data_blkaddr(dn, blk_addr);
+	__set_data_blkaddr(dn);
 
 	if (is_inode_flag_set(fi, FI_NO_EXTENT))
 		return;
 
+	fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
+							dn->ofs_in_node;
+
 	write_lock(&fi->ext.ext_lock);
 
 	start_fofs = fi->ext.fofs;
@@ -320,16 +322,16 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn)
 
 	/* Initial extent */
 	if (fi->ext.len == 0) {
-		if (blk_addr != NULL_ADDR) {
+		if (dn->data_blkaddr != NULL_ADDR) {
 			fi->ext.fofs = fofs;
-			fi->ext.blk_addr = blk_addr;
+			fi->ext.blk_addr = dn->data_blkaddr;
 			fi->ext.len = 1;
 		}
 		goto end_update;
 	}
 
 	/* Front merge */
-	if (fofs == start_fofs - 1 && blk_addr == start_blkaddr - 1) {
+	if (fofs == start_fofs - 1 && dn->data_blkaddr == start_blkaddr - 1) {
 		fi->ext.fofs--;
 		fi->ext.blk_addr--;
 		fi->ext.len++;
@@ -337,7 +339,7 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn)
 	}
 
 	/* Back merge */
-	if (fofs == end_fofs + 1 && blk_addr == end_blkaddr + 1) {
+	if (fofs == end_fofs + 1 && dn->data_blkaddr == end_blkaddr + 1) {
 		fi->ext.len++;
 		goto end_update;
 	}
@@ -376,6 +378,10 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
 	struct dnode_of_data dn;
 	struct page *page;
 	int err;
+	struct f2fs_io_info fio = {
+		.type = DATA,
+		.rw = sync ? READ_SYNC : READA,
+	};
 
 	page = find_get_page(mapping, index);
 	if (page && PageUptodate(page))
@@ -404,8 +410,8 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
 		return page;
 	}
 
-	err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, dn.data_blkaddr,
-					sync ? READ_SYNC : READA);
+	fio.blk_addr = dn.data_blkaddr;
+	err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, &fio);
 	if (err)
 		return ERR_PTR(err);
 
@@ -430,7 +436,10 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index)
 	struct dnode_of_data dn;
 	struct page *page;
 	int err;
-
+	struct f2fs_io_info fio = {
+		.type = DATA,
+		.rw = READ_SYNC,
+	};
repeat:
 	page = grab_cache_page(mapping, index);
 	if (!page)
@@ -464,8 +473,8 @@ repeat:
 		return page;
 	}
 
-	err = f2fs_submit_page_bio(F2FS_I_SB(inode), page,
-					dn.data_blkaddr, READ_SYNC);
+	fio.blk_addr = dn.data_blkaddr;
+	err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, &fio);
 	if (err)
 		return ERR_PTR(err);
 
@@ -515,8 +524,12 @@ repeat:
 		zero_user_segment(page, 0, PAGE_CACHE_SIZE);
 		SetPageUptodate(page);
 	} else {
-		err = f2fs_submit_page_bio(F2FS_I_SB(inode), page,
-						dn.data_blkaddr, READ_SYNC);
+		struct f2fs_io_info fio = {
+			.type = DATA,
+			.rw = READ_SYNC,
+			.blk_addr = dn.data_blkaddr,
+		};
+		err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, &fio);
 		if (err)
 			goto put_err;
 
@@ -550,30 +563,25 @@ static int __allocate_data_block(struct dnode_of_data *dn)
 	struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
 	struct f2fs_inode_info *fi = F2FS_I(dn->inode);
 	struct f2fs_summary sum;
-	block_t new_blkaddr;
 	struct node_info ni;
+	int seg = CURSEG_WARM_DATA;
 	pgoff_t fofs;
-	int type;
 
 	if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
 		return -EPERM;
 	if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1)))
 		return -ENOSPC;
 
-	__set_data_blkaddr(dn, NEW_ADDR);
-	dn->data_blkaddr = NEW_ADDR;
-
 	get_node_info(sbi, dn->nid, &ni);
 	set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
 
-	type = CURSEG_WARM_DATA;
+	if (dn->ofs_in_node == 0 && dn->inode_page == dn->node_page)
+		seg = CURSEG_DIRECT_IO;
 
-	allocate_data_block(sbi, NULL, NULL_ADDR, &new_blkaddr, &sum, type);
+	allocate_data_block(sbi, NULL, NULL_ADDR, &dn->data_blkaddr, &sum, seg);
 
-	/* direct IO doesn't use extent cache to maximize the performance */
-	set_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT);
-	update_extent_cache(new_blkaddr, dn);
-	clear_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT);
+	__set_data_blkaddr(dn);
 
 	/* update i_size */
 	fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
@@ -581,10 +589,59 @@ static int __allocate_data_block(struct dnode_of_data *dn)
 	if (i_size_read(dn->inode) < ((fofs + 1) << PAGE_CACHE_SHIFT))
 		i_size_write(dn->inode, ((fofs + 1) << PAGE_CACHE_SHIFT));
 
-	dn->data_blkaddr = new_blkaddr;
 	return 0;
 }
 
+static void __allocate_data_blocks(struct inode *inode, loff_t offset,
+							size_t count)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct dnode_of_data dn;
+	u64 start = F2FS_BYTES_TO_BLK(offset);
+	u64 len = F2FS_BYTES_TO_BLK(count);
+	bool allocated;
+	u64 end_offset;
+
+	while (len) {
+		f2fs_balance_fs(sbi);
+		f2fs_lock_op(sbi);
+
+		/* When reading holes, we need its node page */
+		set_new_dnode(&dn, inode, NULL, NULL, 0);
+		if (get_dnode_of_data(&dn, start, ALLOC_NODE))
+			goto out;
+
+		allocated = false;
+		end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
+
+		while (dn.ofs_in_node < end_offset && len) {
+			if (dn.data_blkaddr == NULL_ADDR) {
+				if (__allocate_data_block(&dn))
+					goto sync_out;
+				allocated = true;
+			}
+			len--;
+			start++;
+			dn.ofs_in_node++;
+		}
+
+		if (allocated)
+			sync_inode_page(&dn);
+
+		f2fs_put_dnode(&dn);
+		f2fs_unlock_op(sbi);
+	}
+	return;
+
+sync_out:
+	if (allocated)
+		sync_inode_page(&dn);
+	f2fs_put_dnode(&dn);
+out:
+	f2fs_unlock_op(sbi);
+	return;
+}
+
 /*
  * get_data_block() now supported readahead/bmap/rw direct_IO with mapped bh.
  * If original data blocks are allocated, then give them to blockdev.
@@ -610,10 +667,8 @@ static int __get_data_block(struct inode *inode, sector_t iblock,
 	if (check_extent_cache(inode, pgofs, bh_result))
 		goto out;
 
-	if (create) {
-		f2fs_balance_fs(F2FS_I_SB(inode));
+	if (create)
 		f2fs_lock_op(F2FS_I_SB(inode));
-	}
 
 	/* When reading holes, we need its node page */
 	set_new_dnode(&dn, inode, NULL, NULL, 0);
@@ -627,12 +682,14 @@ static int __get_data_block(struct inode *inode, sector_t iblock,
 		goto put_out;
 
 	if (dn.data_blkaddr != NULL_ADDR) {
+		set_buffer_new(bh_result);
 		map_bh(bh_result, inode->i_sb, dn.data_blkaddr);
 	} else if (create) {
 		err = __allocate_data_block(&dn);
 		if (err)
 			goto put_out;
 		allocated = true;
+		set_buffer_new(bh_result);
 		map_bh(bh_result, inode->i_sb, dn.data_blkaddr);
 	} else {
 		goto put_out;
@@ -745,7 +802,6 @@ static int f2fs_read_data_pages(struct file *file,
 int do_write_data_page(struct page *page, struct f2fs_io_info *fio)
 {
 	struct inode *inode = page->mapping->host;
-	block_t old_blkaddr, new_blkaddr;
 	struct dnode_of_data dn;
 	int err = 0;
 
@@ -754,10 +810,10 @@ int do_write_data_page(struct page *page, struct f2fs_io_info *fio)
 	if (err)
 		return err;
 
-	old_blkaddr = dn.data_blkaddr;
+	fio->blk_addr = dn.data_blkaddr;
 
 	/* This page is already truncated */
-	if (old_blkaddr == NULL_ADDR)
+	if (fio->blk_addr == NULL_ADDR)
 		goto out_writepage;
 
 	set_page_writeback(page);
@@ -766,14 +822,14 @@ int do_write_data_page(struct page *page, struct f2fs_io_info *fio)
 	 * If current allocation needs SSR,
 	 * it had better in-place writes for updated data.
	 */
-	if (unlikely(old_blkaddr != NEW_ADDR &&
+	if (unlikely(fio->blk_addr != NEW_ADDR &&
 			!is_cold_data(page) &&
 			need_inplace_update(inode))) {
-		rewrite_data_page(page, old_blkaddr, fio);
+		rewrite_data_page(page, fio);
 		set_inode_flag(F2FS_I(inode), FI_UPDATE_WRITE);
 	} else {
-		write_data_page(page, &dn, &new_blkaddr, fio);
-		update_extent_cache(new_blkaddr, &dn);
+		write_data_page(page, &dn, fio);
+		update_extent_cache(&dn);
 		set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE);
 	}
out_writepage:
@@ -812,7 +868,12 @@ static int f2fs_write_data_page(struct page *page,
 
 	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
write:
-	if (unlikely(sbi->por_doing))
+	if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
+		goto redirty_out;
+	if (f2fs_is_drop_cache(inode))
+		goto out;
+	if (f2fs_is_volatile_file(inode) && !wbc->for_reclaim &&
+			available_free_memory(sbi, BASE_CHECK))
 		goto redirty_out;
 
 	/* Dentry blocks are controlled by checkpoint */
@@ -826,7 +887,6 @@ write:
 	/* we should bypass data pages to proceed the kworkder jobs */
 	if (unlikely(f2fs_cp_error(sbi))) {
 		SetPageError(page);
-		unlock_page(page);
 		goto out;
 	}
 
@@ -1002,8 +1062,12 @@ put_next:
 	if (dn.data_blkaddr == NEW_ADDR) {
 		zero_user_segment(page, 0, PAGE_CACHE_SIZE);
 	} else {
-		err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr,
-					READ_SYNC);
+		struct f2fs_io_info fio = {
+			.type = DATA,
+			.rw = READ_SYNC,
+			.blk_addr = dn.data_blkaddr,
+		};
+		err = f2fs_submit_page_bio(sbi, page, &fio);
 		if (err)
 			goto fail;
 
@@ -1092,6 +1156,9 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
 
 	trace_f2fs_direct_IO_enter(inode, offset, count, rw);
 
+	if (rw & WRITE)
+		__allocate_data_blocks(inode, offset, count);
+
 	err = blockdev_direct_IO(rw, iocb, inode, iter, offset, get_data_block);
 	if (err < 0 && (rw & WRITE))
 		f2fs_write_failed(mapping, offset + count);
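
Note on the direct-IO hunk above: for writes, every target block is now
reserved in one locked pass (__allocate_data_blocks()) before the request is
handed to blockdev_direct_IO(), so the per-block get_data_block() callback
normally only maps already-allocated addresses; this is also why
f2fs_balance_fs() moved out of __get_data_block(). The resulting call order,
as the hunk shows:

	if (rw & WRITE)
		__allocate_data_blocks(inode, offset, count); /* reserve up front */
	err = blockdev_direct_IO(rw, iocb, inode, iter, offset,
					get_data_block);
	if (err < 0 && (rw & WRITE))
		f2fs_write_failed(mapping, offset + count); /* trim on failure */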
@@ -1101,24 +1168,33 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
 	return err;
 }
 
-static void f2fs_invalidate_data_page(struct page *page, unsigned int offset,
-				      unsigned int length)
+void f2fs_invalidate_page(struct page *page, unsigned int offset,
+							unsigned int length)
 {
 	struct inode *inode = page->mapping->host;
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 
-	if (offset % PAGE_CACHE_SIZE || length != PAGE_CACHE_SIZE)
+	if (inode->i_ino >= F2FS_ROOT_INO(sbi) &&
+		(offset % PAGE_CACHE_SIZE || length != PAGE_CACHE_SIZE))
 		return;
 
-	if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode))
-		invalidate_inmem_page(inode, page);
-
-	if (PageDirty(page))
-		inode_dec_dirty_pages(inode);
+	if (PageDirty(page)) {
+		if (inode->i_ino == F2FS_META_INO(sbi))
+			dec_page_count(sbi, F2FS_DIRTY_META);
+		else if (inode->i_ino == F2FS_NODE_INO(sbi))
+			dec_page_count(sbi, F2FS_DIRTY_NODES);
+		else
+			inode_dec_dirty_pages(inode);
+	}
 	ClearPagePrivate(page);
 }
 
-static int f2fs_release_data_page(struct page *page, gfp_t wait)
+int f2fs_release_page(struct page *page, gfp_t wait)
 {
+	/* If this is dirty page, keep PagePrivate */
+	if (PageDirty(page))
+		return 0;
+
 	ClearPagePrivate(page);
 	return 1;
 }
@@ -1132,7 +1208,7 @@ static int f2fs_set_data_page_dirty(struct page *page)
 
 	SetPageUptodate(page);
 
-	if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode)) {
+	if (f2fs_is_atomic_file(inode)) {
 		register_inmem_page(inode, page);
 		return 1;
 	}
@@ -1168,8 +1244,8 @@ const struct address_space_operations f2fs_dblock_aops = {
 	.write_begin	= f2fs_write_begin,
 	.write_end	= f2fs_write_end,
 	.set_page_dirty	= f2fs_set_data_page_dirty,
-	.invalidatepage	= f2fs_invalidate_data_page,
-	.releasepage	= f2fs_release_data_page,
+	.invalidatepage	= f2fs_invalidate_page,
+	.releasepage	= f2fs_release_page,
 	.direct_IO	= f2fs_direct_IO,
 	.bmap		= f2fs_bmap,
 };
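
Note on the shared page hooks above: the meta, node and data mappings now use
the same f2fs_invalidate_page()/f2fs_release_page() pair, and every dirty f2fs
page carries PagePrivate (set in the set_page_dirty handlers). A sketch of
the invariant these hunks appear to maintain (my reading, stated as comments):

	/* Invariant (sketch):
	 *   set_page_dirty -> inc_page_count(...) + SetPagePrivate(page)
	 *   invalidatepage -> dec_page_count(...) + ClearPagePrivate(page)
	 *   releasepage    -> refuse dirty pages, else ClearPagePrivate
	 * so the Private bit tracks "this page is counted as dirty". */
	int f2fs_release_page(struct page *page, gfp_t wait)
	{
		if (PageDirty(page))
			return 0;	/* VM must not reclaim it yet */
		ClearPagePrivate(page);
		return 1;
	}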
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 91e8f699ab30..e671373cc8ab 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -40,6 +40,7 @@ static void update_general_status(struct f2fs_sb_info *sbi)
 	si->ndirty_dirs = sbi->n_dirty_dirs;
 	si->ndirty_meta = get_pages(sbi, F2FS_DIRTY_META);
 	si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES);
+	si->wb_pages = get_pages(sbi, F2FS_WRITEBACK);
 	si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg;
 	si->rsvd_segs = reserved_segments(sbi);
 	si->overp_segs = overprovision_segments(sbi);
@@ -57,7 +58,9 @@ static void update_general_status(struct f2fs_sb_info *sbi)
 	si->node_pages = NODE_MAPPING(sbi)->nrpages;
 	si->meta_pages = META_MAPPING(sbi)->nrpages;
 	si->nats = NM_I(sbi)->nat_cnt;
-	si->sits = SIT_I(sbi)->dirty_sentries;
+	si->dirty_nats = NM_I(sbi)->dirty_nat_cnt;
+	si->sits = MAIN_SEGS(sbi);
+	si->dirty_sits = SIT_I(sbi)->dirty_sentries;
 	si->fnids = NM_I(sbi)->fcnt;
 	si->bg_gc = sbi->bg_gc;
 	si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg)
@@ -79,6 +82,8 @@ static void update_general_status(struct f2fs_sb_info *sbi)
 		si->segment_count[i] = sbi->segment_count[i];
 		si->block_count[i] = sbi->block_count[i];
 	}
+
+	si->inplace_count = atomic_read(&sbi->inplace_count);
 }
 
 /*
@@ -137,6 +142,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
 	si->base_mem += MAIN_SEGS(sbi) * sizeof(struct seg_entry);
 	si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi));
 	si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi);
+	si->base_mem += SIT_VBLOCK_MAP_SIZE;
 	if (sbi->segs_per_sec > 1)
 		si->base_mem += MAIN_SECS(sbi) * sizeof(struct sec_entry);
 	si->base_mem += __bitmap_size(sbi, SIT_BITMAP);
@@ -159,20 +165,32 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
 	si->base_mem += sizeof(struct f2fs_nm_info);
 	si->base_mem += __bitmap_size(sbi, NAT_BITMAP);
 
+get_cache:
+	si->cache_mem = 0;
+
 	/* build gc */
-	si->base_mem += sizeof(struct f2fs_gc_kthread);
+	if (sbi->gc_thread)
+		si->cache_mem += sizeof(struct f2fs_gc_kthread);
+
+	/* build merge flush thread */
+	if (SM_I(sbi)->cmd_control_info)
+		si->cache_mem += sizeof(struct flush_cmd_control);
 
-get_cache:
 	/* free nids */
-	si->cache_mem = NM_I(sbi)->fcnt;
-	si->cache_mem += NM_I(sbi)->nat_cnt;
-	npages = NODE_MAPPING(sbi)->nrpages;
-	si->cache_mem += npages << PAGE_CACHE_SHIFT;
-	npages = META_MAPPING(sbi)->nrpages;
-	si->cache_mem += npages << PAGE_CACHE_SHIFT;
-	si->cache_mem += sbi->n_dirty_dirs * sizeof(struct dir_inode_entry);
+	si->cache_mem += NM_I(sbi)->fcnt * sizeof(struct free_nid);
+	si->cache_mem += NM_I(sbi)->nat_cnt * sizeof(struct nat_entry);
+	si->cache_mem += NM_I(sbi)->dirty_nat_cnt *
+					sizeof(struct nat_entry_set);
+	si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages);
+	si->cache_mem += sbi->n_dirty_dirs * sizeof(struct inode_entry);
 	for (i = 0; i <= UPDATE_INO; i++)
 		si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry);
+
+	si->page_mem = 0;
+	npages = NODE_MAPPING(sbi)->nrpages;
+	si->page_mem += npages << PAGE_CACHE_SHIFT;
+	npages = META_MAPPING(sbi)->nrpages;
+	si->page_mem += npages << PAGE_CACHE_SHIFT;
 }
 
 static int stat_show(struct seq_file *s, void *v)
@@ -250,16 +268,16 @@ static int stat_show(struct seq_file *s, void *v)
 	seq_printf(s, "\nExtent Hit Ratio: %d / %d\n",
 			si->hit_ext, si->total_ext);
 	seq_puts(s, "\nBalancing F2FS Async:\n");
-	seq_printf(s, "  - inmem: %4d\n",
-		   si->inmem_pages);
+	seq_printf(s, "  - inmem: %4d, wb: %4d\n",
+		   si->inmem_pages, si->wb_pages);
 	seq_printf(s, "  - nodes: %4d in %4d\n",
 		   si->ndirty_node, si->node_pages);
 	seq_printf(s, "  - dents: %4d in dirs:%4d\n",
 		   si->ndirty_dent, si->ndirty_dirs);
 	seq_printf(s, "  - meta: %4d in %4d\n",
 		   si->ndirty_meta, si->meta_pages);
-	seq_printf(s, "  - NATs: %9d\n  - SITs: %9d\n",
-		   si->nats, si->sits);
+	seq_printf(s, "  - NATs: %9d/%9d\n  - SITs: %9d/%9d\n",
+		   si->dirty_nats, si->nats, si->dirty_sits, si->sits);
 	seq_printf(s, "  - free_nids: %9d\n",
 		   si->fnids);
 	seq_puts(s, "\nDistribution of User Blocks:");
@@ -277,6 +295,7 @@ static int stat_show(struct seq_file *s, void *v)
 		for (j = 0; j < si->util_free; j++)
 			seq_putc(s, '-');
 		seq_puts(s, "]\n\n");
+		seq_printf(s, "IPU: %u blocks\n", si->inplace_count);
 		seq_printf(s, "SSR: %u blocks in %u segments\n",
 			   si->block_count[SSR], si->segment_count[SSR]);
 		seq_printf(s, "LFS: %u blocks in %u segments\n",
@@ -289,9 +308,14 @@ static int stat_show(struct seq_file *s, void *v)
 
 		/* memory footprint */
 		update_mem_info(si->sbi);
-		seq_printf(s, "\nMemory: %u KB = static: %u + cached: %u\n",
-				(si->base_mem + si->cache_mem) >> 10,
-				si->base_mem >> 10, si->cache_mem >> 10);
+		seq_printf(s, "\nMemory: %u KB\n",
+			(si->base_mem + si->cache_mem + si->page_mem) >> 10);
+		seq_printf(s, "  - static: %u KB\n",
+				si->base_mem >> 10);
+		seq_printf(s, "  - cached: %u KB\n",
+				si->cache_mem >> 10);
+		seq_printf(s, "  - paged : %u KB\n",
+				si->page_mem >> 10);
 	}
 	mutex_unlock(&f2fs_stat_mutex);
 	return 0;
@@ -331,6 +355,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
 
 	atomic_set(&sbi->inline_inode, 0);
 	atomic_set(&sbi->inline_dir, 0);
+	atomic_set(&sbi->inplace_count, 0);
 
 	mutex_lock(&f2fs_stat_mutex);
 	list_add_tail(&si->stat_list, &f2fs_stat_list);
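
Note on the debug.c changes above: the memory footprint is now reported in
three buckets (static structures, slab-cached objects, cached node/meta
pages). Going by the new format strings, the tail of the debugfs status
report should look roughly like the following; the figures are invented
purely for illustration:

	IPU: 128 blocks
	SSR: 4096 blocks in 16 segments
	LFS: 81920 blocks in 160 segments

	Memory: 1824 KB
	  - static: 1208 KB
	  - cached: 104 KB
	  - paged : 512 KB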
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index b1a7d5737cd0..b74097a7f6d9 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -286,8 +286,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
 	f2fs_wait_on_page_writeback(page, type);
 	de->ino = cpu_to_le32(inode->i_ino);
 	set_de_type(de, inode);
-	if (!f2fs_has_inline_dentry(dir))
-		kunmap(page);
+	f2fs_dentry_kunmap(dir, page);
 	set_page_dirty(page);
 	dir->i_mtime = dir->i_ctime = CURRENT_TIME;
 	mark_inode_dirty(dir);
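
Note on the dir.c hunk above: f2fs_dentry_kunmap() folds the open-coded
"kunmap unless the directory stores an inline dentry block" test into one
helper so all callers stay consistent. Its presumed shape, reconstructed from
the two removed lines (assumption: the actual definition lives in f2fs.h,
outside this diff):

	static inline void f2fs_dentry_kunmap(struct inode *dir,
						struct page *page)
	{
		if (!f2fs_has_inline_dentry(dir))
			kunmap(page);
	}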
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index ec58bb2373fc..7fa3313ab0e2 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -28,7 +28,7 @@
 	do {								\
 		if (unlikely(condition)) {				\
 			WARN_ON(1);					\
-			sbi->need_fsck = true;				\
+			set_sbi_flag(sbi, SBI_NEED_FSCK);		\
 		}							\
 	} while (0)
 #define f2fs_down_write(x, y)	down_write(x)
@@ -100,10 +100,15 @@ enum {
 
 enum {
 	CP_UMOUNT,
+	CP_FASTBOOT,
 	CP_SYNC,
 	CP_DISCARD,
 };
 
+#define DEF_BATCHED_TRIM_SECTIONS	32
+#define BATCHED_TRIM_SEGMENTS(sbi)	\
+		(SM_I(sbi)->trim_sections * (sbi)->segs_per_sec)
+
 struct cp_control {
 	int reason;
 	__u64 trim_start;
@@ -136,8 +141,14 @@ struct ino_entry {
 	nid_t ino;		/* inode number */
 };
 
-/* for the list of directory inodes */
-struct dir_inode_entry {
+/*
+ * for the list of directory inodes or gc inodes.
+ * NOTE: there are two slab users for this structure, if we add/modify/delete
+ * fields in structure for one of slab users, it may affect fields or size of
+ * other one, in this condition, it's better to split both of slab and related
+ * data structure.
+ */
+struct inode_entry {
 	struct list_head list;	/* list head */
 	struct inode *inode;	/* vfs inode pointer */
 };
@@ -196,11 +207,14 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size,
 */
 #define F2FS_IOC_GETFLAGS		FS_IOC_GETFLAGS
 #define F2FS_IOC_SETFLAGS		FS_IOC_SETFLAGS
+#define F2FS_IOC_GETVERSION		FS_IOC_GETVERSION
 
 #define F2FS_IOCTL_MAGIC		0xf5
 #define F2FS_IOC_START_ATOMIC_WRITE	_IO(F2FS_IOCTL_MAGIC, 1)
 #define F2FS_IOC_COMMIT_ATOMIC_WRITE	_IO(F2FS_IOCTL_MAGIC, 2)
 #define F2FS_IOC_START_VOLATILE_WRITE	_IO(F2FS_IOCTL_MAGIC, 3)
+#define F2FS_IOC_RELEASE_VOLATILE_WRITE	_IO(F2FS_IOCTL_MAGIC, 4)
+#define F2FS_IOC_ABORT_VOLATILE_WRITE	_IO(F2FS_IOCTL_MAGIC, 5)
 
 #if defined(__KERNEL__) && defined(CONFIG_COMPAT)
 /*
@@ -295,7 +309,7 @@ struct f2fs_inode_info {
 	nid_t i_xattr_nid;		/* node id that contains xattrs */
 	unsigned long long xattr_ver;	/* cp version of xattr modification */
 	struct extent_info ext;		/* in-memory extent cache entry */
-	struct dir_inode_entry *dirty_dir;	/* the pointer of dirty dir */
+	struct inode_entry *dirty_dir;	/* the pointer of dirty dir */
 
 	struct radix_tree_root inmem_root;	/* radix tree for inmem pages */
 	struct list_head inmem_pages;	/* inmemory pages managed by f2fs */
@@ -398,7 +412,8 @@ enum {
 	CURSEG_HOT_NODE,	/* direct node blocks of directory files */
 	CURSEG_WARM_NODE,	/* direct node blocks of normal files */
 	CURSEG_COLD_NODE,	/* indirect node blocks */
-	NO_CHECK_TYPE
+	NO_CHECK_TYPE,
+	CURSEG_DIRECT_IO,	/* to use for the direct IO path */
 };
 
 struct flush_cmd {
@@ -437,6 +452,9 @@ struct f2fs_sm_info {
 	int nr_discards;			/* # of discards in the list */
 	int max_discards;			/* max. discards to be issued */
 
+	/* for batched trimming */
+	unsigned int trim_sections;		/* # of sections to trim */
+
 	struct list_head sit_entry_set;	/* sit entry set list */
 
 	unsigned int ipu_policy;	/* in-place-update policy */
@@ -489,6 +507,7 @@ enum page_type {
 struct f2fs_io_info {
 	enum page_type type;	/* contains DATA/NODE/META/META_FLUSH */
 	int rw;			/* contains R/RS/W/WS with REQ_META/REQ_PRIO */
+	block_t blk_addr;	/* block address to be written */
 };
 
 #define is_read_io(rw)	(((rw) & 1) == READ)
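
Note on struct f2fs_io_info above: the new blk_addr member is what lets the
submit paths earlier in this diff take a single descriptor instead of loose
(blk_addr, rw) parameter pairs. Typical on-stack construction, as in the
get_meta_page() hunk from checkpoint.c:

	struct f2fs_io_info fio = {
		.type = META,				/* target mapping */
		.rw = READ_SYNC | REQ_META | REQ_PRIO,	/* request flags */
		.blk_addr = index,			/* block to read */
	};
	err = f2fs_submit_page_bio(sbi, page, &fio);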
@@ -508,13 +527,20 @@ struct inode_management {
 	unsigned long ino_num;			/* number of entries */
 };
 
+/* For s_flag in struct f2fs_sb_info */
+enum {
+	SBI_IS_DIRTY,			/* dirty flag for checkpoint */
+	SBI_IS_CLOSE,			/* specify unmounting */
+	SBI_NEED_FSCK,			/* need fsck.f2fs to fix */
+	SBI_POR_DOING,			/* recovery is doing or not */
+};
+
 struct f2fs_sb_info {
 	struct super_block *sb;			/* pointer to VFS super block */
 	struct proc_dir_entry *s_proc;		/* proc entry */
 	struct buffer_head *raw_super_buf;	/* buffer head of raw sb */
 	struct f2fs_super_block *raw_super;	/* raw super block pointer */
-	int s_dirty;				/* dirty flag for checkpoint */
-	bool need_fsck;				/* need fsck.f2fs to fix */
+	int s_flag;				/* flags for sbi */
 
 	/* for node-related operations */
 	struct f2fs_nm_info *nm_info;		/* node manager */
@@ -534,7 +560,6 @@ struct f2fs_sb_info {
 	struct rw_semaphore cp_rwsem;		/* blocking FS operations */
 	struct rw_semaphore node_write;		/* locking node writes */
 	struct mutex writepages;		/* mutex for writepages() */
-	bool por_doing;				/* recovery is doing or not */
 	wait_queue_head_t cp_wait;
 
 	struct inode_management im[MAX_INO_ENTRY];	/* manage inode cache */
@@ -589,6 +614,7 @@ struct f2fs_sb_info {
 	struct f2fs_stat_info *stat_info;	/* FS status information */
 	unsigned int segment_count[2];		/* # of allocated segments */
 	unsigned int block_count[2];		/* # of allocated blocks */
+	atomic_t inplace_count;		/* # of inplace update */
 	int total_hit_ext, read_hit_ext;	/* extent cache hit ratio */
 	atomic_t inline_inode;			/* # of inline_data inodes */
 	atomic_t inline_dir;			/* # of inline_dentry inodes */
@@ -686,14 +712,19 @@ static inline struct address_space *NODE_MAPPING(struct f2fs_sb_info *sbi)
 	return sbi->node_inode->i_mapping;
 }
 
-static inline void F2FS_SET_SB_DIRT(struct f2fs_sb_info *sbi)
+static inline bool is_sbi_flag_set(struct f2fs_sb_info *sbi, unsigned int type)
 {
-	sbi->s_dirty = 1;
+	return sbi->s_flag & (0x01 << type);
 }
 
-static inline void F2FS_RESET_SB_DIRT(struct f2fs_sb_info *sbi)
+static inline void set_sbi_flag(struct f2fs_sb_info *sbi, unsigned int type)
 {
-	sbi->s_dirty = 0;
+	sbi->s_flag |= (0x01 << type);
+}
+
+static inline void clear_sbi_flag(struct f2fs_sb_info *sbi, unsigned int type)
+{
+	sbi->s_flag &= ~(0x01 << type);
 }
 
 static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp)
@@ -741,6 +772,28 @@ static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi)
 	up_write(&sbi->cp_rwsem);
 }
 
+static inline int __get_cp_reason(struct f2fs_sb_info *sbi)
+{
+	int reason = CP_SYNC;
+
+	if (test_opt(sbi, FASTBOOT))
+		reason = CP_FASTBOOT;
+	if (is_sbi_flag_set(sbi, SBI_IS_CLOSE))
+		reason = CP_UMOUNT;
+	return reason;
+}
+
+static inline bool __remain_node_summaries(int reason)
+{
+	return (reason == CP_UMOUNT || reason == CP_FASTBOOT);
+}
+
+static inline bool __exist_node_summaries(struct f2fs_sb_info *sbi)
+{
+	return (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG) ||
+			is_set_ckpt_flags(F2FS_CKPT(sbi), CP_FASTBOOT_FLAG));
+}
+
 /*
 * Check whether the given nid is within node id range.
 */
@@ -805,7 +858,7 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi,
 static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type)
 {
 	atomic_inc(&sbi->nr_pages[count_type]);
-	F2FS_SET_SB_DIRT(sbi);
+	set_sbi_flag(sbi, SBI_IS_DIRTY);
 }
 
 static inline void inode_inc_dirty_pages(struct inode *inode)
@@ -1113,6 +1166,7 @@ enum {
 	FI_NEED_IPU,		/* used for ipu per file */
 	FI_ATOMIC_FILE,		/* indicate atomic file */
 	FI_VOLATILE_FILE,	/* indicate volatile file */
+	FI_DROP_CACHE,		/* drop dirty page cache */
 	FI_DATA_EXIST,		/* indicate data exists */
 };
 
@@ -1220,6 +1274,11 @@ static inline bool f2fs_is_volatile_file(struct inode *inode)
 	return is_inode_flag_set(F2FS_I(inode), FI_VOLATILE_FILE);
 }
 
+static inline bool f2fs_is_drop_cache(struct inode *inode)
+{
+	return is_inode_flag_set(F2FS_I(inode), FI_DROP_CACHE);
+}
+
 static inline void *inline_data_addr(struct page *page)
 {
 	struct f2fs_inode *ri = F2FS_INODE(page);
@@ -1389,7 +1448,6 @@ void destroy_node_manager_caches(void);
 * segment.c
 */
 void register_inmem_page(struct inode *, struct page *);
-void invalidate_inmem_page(struct inode *, struct page *);
 void commit_inmem_pages(struct inode *, bool);
 void f2fs_balance_fs(struct f2fs_sb_info *);
 void f2fs_balance_fs_bg(struct f2fs_sb_info *);
@@ -1401,16 +1459,16 @@ void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t);
 void clear_prefree_segments(struct f2fs_sb_info *);
 void release_discard_addrs(struct f2fs_sb_info *);
 void discard_next_dnode(struct f2fs_sb_info *, block_t);
-int npages_for_summary_flush(struct f2fs_sb_info *);
+int npages_for_summary_flush(struct f2fs_sb_info *, bool);
 void allocate_new_segments(struct f2fs_sb_info *);
 int f2fs_trim_fs(struct f2fs_sb_info *, struct fstrim_range *);
 struct page *get_sum_page(struct f2fs_sb_info *, unsigned int);
 void write_meta_page(struct f2fs_sb_info *, struct page *);
 void write_node_page(struct f2fs_sb_info *, struct page *,
-		struct f2fs_io_info *, unsigned int, block_t, block_t *);
-void write_data_page(struct page *, struct dnode_of_data *, block_t *,
+		unsigned int, struct f2fs_io_info *);
+void write_data_page(struct page *, struct dnode_of_data *,
 		struct f2fs_io_info *);
-void rewrite_data_page(struct page *, block_t, struct f2fs_io_info *);
+void rewrite_data_page(struct page *, struct f2fs_io_info *);
 void recover_data_page(struct f2fs_sb_info *, struct page *,
 			struct f2fs_summary *, block_t, block_t);
 void allocate_data_block(struct f2fs_sb_info *, struct page *,
@@ -1457,17 +1515,20 @@ void destroy_checkpoint_caches(void);
 * data.c
 */
 void f2fs_submit_merged_bio(struct f2fs_sb_info *, enum page_type, int);
-int f2fs_submit_page_bio(struct f2fs_sb_info *, struct page *, block_t, int);
-void f2fs_submit_page_mbio(struct f2fs_sb_info *, struct page *, block_t,
+int f2fs_submit_page_bio(struct f2fs_sb_info *, struct page *,
+			struct f2fs_io_info *);
+void f2fs_submit_page_mbio(struct f2fs_sb_info *, struct page *,
 			struct f2fs_io_info *);
 int reserve_new_block(struct dnode_of_data *);
 int f2fs_reserve_block(struct dnode_of_data *, pgoff_t);
-void update_extent_cache(block_t, struct dnode_of_data *);
+void update_extent_cache(struct dnode_of_data *);
 struct page *find_data_page(struct inode *, pgoff_t, bool);
 struct page *get_lock_data_page(struct inode *, pgoff_t);
 struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool);
 int do_write_data_page(struct page *, struct f2fs_io_info *);
 int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64);
1530void f2fs_invalidate_page(struct page *, unsigned int, unsigned int);
1531int f2fs_release_page(struct page *, gfp_t);
1471 1532
1472/* 1533/*
1473 * gc.c 1534 * gc.c
@@ -1477,8 +1538,6 @@ void stop_gc_thread(struct f2fs_sb_info *);
1477block_t start_bidx_of_node(unsigned int, struct f2fs_inode_info *); 1538block_t start_bidx_of_node(unsigned int, struct f2fs_inode_info *);
1478int f2fs_gc(struct f2fs_sb_info *); 1539int f2fs_gc(struct f2fs_sb_info *);
1479void build_gc_manager(struct f2fs_sb_info *); 1540void build_gc_manager(struct f2fs_sb_info *);
1480int __init create_gc_caches(void);
1481void destroy_gc_caches(void);
1482 1541
1483/* 1542/*
1484 * recovery.c 1543 * recovery.c
@@ -1497,9 +1556,9 @@ struct f2fs_stat_info {
1497 int main_area_segs, main_area_sections, main_area_zones; 1556 int main_area_segs, main_area_sections, main_area_zones;
1498 int hit_ext, total_ext; 1557 int hit_ext, total_ext;
1499 int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta; 1558 int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta;
1500 int nats, sits, fnids; 1559 int nats, dirty_nats, sits, dirty_sits, fnids;
1501 int total_count, utilization; 1560 int total_count, utilization;
1502 int bg_gc, inline_inode, inline_dir, inmem_pages; 1561 int bg_gc, inline_inode, inline_dir, inmem_pages, wb_pages;
1503 unsigned int valid_count, valid_node_count, valid_inode_count; 1562 unsigned int valid_count, valid_node_count, valid_inode_count;
1504 unsigned int bimodal, avg_vblocks; 1563 unsigned int bimodal, avg_vblocks;
1505 int util_free, util_valid, util_invalid; 1564 int util_free, util_valid, util_invalid;
@@ -1514,7 +1573,8 @@ struct f2fs_stat_info {
1514 1573
1515 unsigned int segment_count[2]; 1574 unsigned int segment_count[2];
1516 unsigned int block_count[2]; 1575 unsigned int block_count[2];
1517 unsigned base_mem, cache_mem; 1576 unsigned int inplace_count;
1577 unsigned base_mem, cache_mem, page_mem;
1518}; 1578};
1519 1579
1520static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) 1580static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
@@ -1553,7 +1613,8 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
1553 ((sbi)->segment_count[(curseg)->alloc_type]++) 1613 ((sbi)->segment_count[(curseg)->alloc_type]++)
1554#define stat_inc_block_count(sbi, curseg) \ 1614#define stat_inc_block_count(sbi, curseg) \
1555 ((sbi)->block_count[(curseg)->alloc_type]++) 1615 ((sbi)->block_count[(curseg)->alloc_type]++)
1556 1616#define stat_inc_inplace_blocks(sbi) \
1617 (atomic_inc(&(sbi)->inplace_count))
1557#define stat_inc_seg_count(sbi, type) \ 1618#define stat_inc_seg_count(sbi, type) \
1558 do { \ 1619 do { \
1559 struct f2fs_stat_info *si = F2FS_STAT(sbi); \ 1620 struct f2fs_stat_info *si = F2FS_STAT(sbi); \
@@ -1599,6 +1660,7 @@ void f2fs_destroy_root_stats(void);
1599#define stat_dec_inline_dir(inode) 1660#define stat_dec_inline_dir(inode)
1600#define stat_inc_seg_type(sbi, curseg) 1661#define stat_inc_seg_type(sbi, curseg)
1601#define stat_inc_block_count(sbi, curseg) 1662#define stat_inc_block_count(sbi, curseg)
1663#define stat_inc_inplace_blocks(sbi)
1602#define stat_inc_seg_count(si, type) 1664#define stat_inc_seg_count(si, type)
1603#define stat_inc_tot_blk_count(si, blks) 1665#define stat_inc_tot_blk_count(si, blks)
1604#define stat_inc_data_blk_count(si, blks) 1666#define stat_inc_data_blk_count(si, blks)
@@ -1619,6 +1681,7 @@ extern const struct address_space_operations f2fs_meta_aops;
1619extern const struct inode_operations f2fs_dir_inode_operations; 1681extern const struct inode_operations f2fs_dir_inode_operations;
1620extern const struct inode_operations f2fs_symlink_inode_operations; 1682extern const struct inode_operations f2fs_symlink_inode_operations;
1621extern const struct inode_operations f2fs_special_inode_operations; 1683extern const struct inode_operations f2fs_special_inode_operations;
1684extern struct kmem_cache *inode_entry_slab;
1622 1685
1623/* 1686/*
1624 * inline.c 1687 * inline.c
@@ -1629,7 +1692,6 @@ int f2fs_read_inline_data(struct inode *, struct page *);
1629int f2fs_convert_inline_page(struct dnode_of_data *, struct page *); 1692int f2fs_convert_inline_page(struct dnode_of_data *, struct page *);
1630int f2fs_convert_inline_inode(struct inode *); 1693int f2fs_convert_inline_inode(struct inode *);
1631int f2fs_write_inline_data(struct inode *, struct page *); 1694int f2fs_write_inline_data(struct inode *, struct page *);
1632void truncate_inline_data(struct page *, u64);
1633bool recover_inline_data(struct inode *, struct page *); 1695bool recover_inline_data(struct inode *, struct page *);
1634struct f2fs_dir_entry *find_in_inline_dir(struct inode *, struct qstr *, 1696struct f2fs_dir_entry *find_in_inline_dir(struct inode *, struct qstr *,
1635 struct page **); 1697 struct page **);
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 3c27e0ecb3bc..98dac27bc3f7 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -26,6 +26,7 @@
26#include "segment.h" 26#include "segment.h"
27#include "xattr.h" 27#include "xattr.h"
28#include "acl.h" 28#include "acl.h"
29#include "trace.h"
29#include <trace/events/f2fs.h> 30#include <trace/events/f2fs.h>
30 31
31static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, 32static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
@@ -92,7 +93,6 @@ static const struct vm_operations_struct f2fs_file_vm_ops = {
92 .fault = filemap_fault, 93 .fault = filemap_fault,
93 .map_pages = filemap_map_pages, 94 .map_pages = filemap_map_pages,
94 .page_mkwrite = f2fs_vm_page_mkwrite, 95 .page_mkwrite = f2fs_vm_page_mkwrite,
95 .remap_pages = generic_file_remap_pages,
96}; 96};
97 97
98static int get_parent_ino(struct inode *inode, nid_t *pino) 98static int get_parent_ino(struct inode *inode, nid_t *pino)
@@ -246,6 +246,10 @@ go_write:
246sync_nodes: 246sync_nodes:
247 sync_node_pages(sbi, ino, &wbc); 247 sync_node_pages(sbi, ino, &wbc);
248 248
249 /* if cp_error is set, we should avoid an infinite loop */
250 if (unlikely(f2fs_cp_error(sbi)))
251 goto out;
252
249 if (need_inode_block_update(sbi, ino)) { 253 if (need_inode_block_update(sbi, ino)) {
250 mark_inode_dirty_sync(inode); 254 mark_inode_dirty_sync(inode);
251 f2fs_write_inode(inode, NULL); 255 f2fs_write_inode(inode, NULL);
@@ -265,6 +269,7 @@ flush_out:
265 ret = f2fs_issue_flush(sbi); 269 ret = f2fs_issue_flush(sbi);
266out: 270out:
267 trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); 271 trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
272 f2fs_trace_ios(NULL, NULL, 1);
268 return ret; 273 return ret;
269} 274}
270 275
@@ -351,7 +356,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
351 /* find data/hole in dnode block */ 356 /* find data/hole in dnode block */
352 for (; dn.ofs_in_node < end_offset; 357 for (; dn.ofs_in_node < end_offset;
353 dn.ofs_in_node++, pgofs++, 358 dn.ofs_in_node++, pgofs++,
354 data_ofs = pgofs << PAGE_CACHE_SHIFT) { 359 data_ofs = (loff_t)pgofs << PAGE_CACHE_SHIFT) {
355 block_t blkaddr; 360 block_t blkaddr;
356 blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); 361 blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
357 362
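The added (loff_t) cast fixes a widening bug: on 32-bit builds pgoff_t is 32 bits wide, so the shift wraps before the 64-bit assignment to data_ofs and large file offsets come out wrong. A compilable demonstration of the failure mode (assuming the usual PAGE_SHIFT of 12):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12

int main(void)
{
    uint32_t pgofs = 0x00200000;    /* page index of an 8 GiB offset */

    /* the 32-bit shift wraps to zero before the widening assignment... */
    int64_t bad  = pgofs << PAGE_SHIFT;
    /* ...whereas widening first preserves the full byte offset */
    int64_t good = (int64_t)pgofs << PAGE_SHIFT;

    printf("bad=%lld good=%lld\n", (long long)bad, (long long)good);
    /* prints: bad=0 good=8589934592 */
    return 0;
}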
@@ -427,7 +432,8 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
427 if (blkaddr == NULL_ADDR) 432 if (blkaddr == NULL_ADDR)
428 continue; 433 continue;
429 434
430 update_extent_cache(NULL_ADDR, dn); 435 dn->data_blkaddr = NULL_ADDR;
436 update_extent_cache(dn);
431 invalidate_blocks(sbi, blkaddr); 437 invalidate_blocks(sbi, blkaddr);
432 nr_free++; 438 nr_free++;
433 } 439 }
@@ -484,8 +490,7 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock)
484 490
485 trace_f2fs_truncate_blocks_enter(inode, from); 491 trace_f2fs_truncate_blocks_enter(inode, from);
486 492
487 free_from = (pgoff_t) 493 free_from = (pgoff_t)F2FS_BYTES_TO_BLK(from + blocksize - 1);
488 ((from + blocksize - 1) >> (sbi->log_blocksize));
489 494
490 if (lock) 495 if (lock)
491 f2fs_lock_op(sbi); 496 f2fs_lock_op(sbi);
@@ -836,6 +841,19 @@ static long f2fs_fallocate(struct file *file, int mode,
836 return ret; 841 return ret;
837} 842}
838 843
844static int f2fs_release_file(struct inode *inode, struct file *filp)
845{
846 /* any remaining atomic pages should be discarded */
847 if (f2fs_is_atomic_file(inode))
848 commit_inmem_pages(inode, true);
849 if (f2fs_is_volatile_file(inode)) {
850 set_inode_flag(F2FS_I(inode), FI_DROP_CACHE);
851 filemap_fdatawrite(inode->i_mapping);
852 clear_inode_flag(F2FS_I(inode), FI_DROP_CACHE);
853 }
854 return 0;
855}
856
839#define F2FS_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL)) 857#define F2FS_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL))
840#define F2FS_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL) 858#define F2FS_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL)
841 859
@@ -906,29 +924,30 @@ out:
906 return ret; 924 return ret;
907} 925}
908 926
927static int f2fs_ioc_getversion(struct file *filp, unsigned long arg)
928{
929 struct inode *inode = file_inode(filp);
930
931 return put_user(inode->i_generation, (int __user *)arg);
932}
933
909static int f2fs_ioc_start_atomic_write(struct file *filp) 934static int f2fs_ioc_start_atomic_write(struct file *filp)
910{ 935{
911 struct inode *inode = file_inode(filp); 936 struct inode *inode = file_inode(filp);
912 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
913 937
914 if (!inode_owner_or_capable(inode)) 938 if (!inode_owner_or_capable(inode))
915 return -EACCES; 939 return -EACCES;
916 940
917 f2fs_balance_fs(sbi); 941 f2fs_balance_fs(F2FS_I_SB(inode));
942
943 if (f2fs_is_atomic_file(inode))
944 return 0;
918 945
919 set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); 946 set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
920 947
921 return f2fs_convert_inline_inode(inode); 948 return f2fs_convert_inline_inode(inode);
922} 949}
923 950
924static int f2fs_release_file(struct inode *inode, struct file *filp)
925{
926 /* any remaining atomic pages should be discarded */
927 if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode))
928 commit_inmem_pages(inode, true);
929 return 0;
930}
931
932static int f2fs_ioc_commit_atomic_write(struct file *filp) 951static int f2fs_ioc_commit_atomic_write(struct file *filp)
933{ 952{
934 struct inode *inode = file_inode(filp); 953 struct inode *inode = file_inode(filp);
@@ -949,6 +968,7 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp)
949 968
950 ret = f2fs_sync_file(filp, 0, LONG_MAX, 0); 969 ret = f2fs_sync_file(filp, 0, LONG_MAX, 0);
951 mnt_drop_write_file(filp); 970 mnt_drop_write_file(filp);
971 clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
952 return ret; 972 return ret;
953} 973}
954 974
@@ -959,11 +979,56 @@ static int f2fs_ioc_start_volatile_write(struct file *filp)
959 if (!inode_owner_or_capable(inode)) 979 if (!inode_owner_or_capable(inode))
960 return -EACCES; 980 return -EACCES;
961 981
982 if (f2fs_is_volatile_file(inode))
983 return 0;
984
962 set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); 985 set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
963 986
964 return f2fs_convert_inline_inode(inode); 987 return f2fs_convert_inline_inode(inode);
965} 988}
966 989
990static int f2fs_ioc_release_volatile_write(struct file *filp)
991{
992 struct inode *inode = file_inode(filp);
993
994 if (!inode_owner_or_capable(inode))
995 return -EACCES;
996
997 if (!f2fs_is_volatile_file(inode))
998 return 0;
999
1000 punch_hole(inode, 0, F2FS_BLKSIZE);
1001 return 0;
1002}
1003
1004static int f2fs_ioc_abort_volatile_write(struct file *filp)
1005{
1006 struct inode *inode = file_inode(filp);
1007 int ret;
1008
1009 if (!inode_owner_or_capable(inode))
1010 return -EACCES;
1011
1012 ret = mnt_want_write_file(filp);
1013 if (ret)
1014 return ret;
1015
1016 f2fs_balance_fs(F2FS_I_SB(inode));
1017
1018 if (f2fs_is_atomic_file(inode)) {
1019 commit_inmem_pages(inode, false);
1020 clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
1021 }
1022
1023 if (f2fs_is_volatile_file(inode)) {
1024 clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
1025 filemap_fdatawrite(inode->i_mapping);
1026 set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
1027 }
1028 mnt_drop_write_file(filp);
1029 return ret;
1030}
1031
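From userspace, the atomic-write session added here is driven entirely through ioctls on a file descriptor. A hedged usage sketch: the F2FS_IOC_* values below mirror the f2fs header of this era, but real code should include the kernel header rather than redefining them:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

/* assumed to match fs/f2fs/f2fs.h of this period; verify before use */
#define F2FS_IOCTL_MAGIC             0xf5
#define F2FS_IOC_START_ATOMIC_WRITE  _IO(F2FS_IOCTL_MAGIC, 1)
#define F2FS_IOC_COMMIT_ATOMIC_WRITE _IO(F2FS_IOCTL_MAGIC, 2)

int main(int argc, char **argv)
{
    if (argc != 2) {
        fprintf(stderr, "usage: %s <file-on-f2fs>\n", argv[0]);
        return 1;
    }

    int fd = open(argv[1], O_RDWR);
    if (fd < 0) { perror("open"); return 1; }

    /* begin an atomic write session; dirty pages stay in memory */
    if (ioctl(fd, F2FS_IOC_START_ATOMIC_WRITE) < 0)
        perror("start atomic write");
    else if (write(fd, "data", 4) != 4)
        perror("write");
    /* commit flushes the in-memory pages and fsyncs the file */
    else if (ioctl(fd, F2FS_IOC_COMMIT_ATOMIC_WRITE) < 0)
        perror("commit atomic write");

    close(fd);
    return 0;
}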
967static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg) 1032static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg)
968{ 1033{
969 struct inode *inode = file_inode(filp); 1034 struct inode *inode = file_inode(filp);
@@ -1001,12 +1066,18 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
1001 return f2fs_ioc_getflags(filp, arg); 1066 return f2fs_ioc_getflags(filp, arg);
1002 case F2FS_IOC_SETFLAGS: 1067 case F2FS_IOC_SETFLAGS:
1003 return f2fs_ioc_setflags(filp, arg); 1068 return f2fs_ioc_setflags(filp, arg);
1069 case F2FS_IOC_GETVERSION:
1070 return f2fs_ioc_getversion(filp, arg);
1004 case F2FS_IOC_START_ATOMIC_WRITE: 1071 case F2FS_IOC_START_ATOMIC_WRITE:
1005 return f2fs_ioc_start_atomic_write(filp); 1072 return f2fs_ioc_start_atomic_write(filp);
1006 case F2FS_IOC_COMMIT_ATOMIC_WRITE: 1073 case F2FS_IOC_COMMIT_ATOMIC_WRITE:
1007 return f2fs_ioc_commit_atomic_write(filp); 1074 return f2fs_ioc_commit_atomic_write(filp);
1008 case F2FS_IOC_START_VOLATILE_WRITE: 1075 case F2FS_IOC_START_VOLATILE_WRITE:
1009 return f2fs_ioc_start_volatile_write(filp); 1076 return f2fs_ioc_start_volatile_write(filp);
1077 case F2FS_IOC_RELEASE_VOLATILE_WRITE:
1078 return f2fs_ioc_release_volatile_write(filp);
1079 case F2FS_IOC_ABORT_VOLATILE_WRITE:
1080 return f2fs_ioc_abort_volatile_write(filp);
1010 case FITRIM: 1081 case FITRIM:
1011 return f2fs_ioc_fitrim(filp, arg); 1082 return f2fs_ioc_fitrim(filp, arg);
1012 default: 1083 default:
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index eec0933a4819..76adbc3641f1 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -24,8 +24,6 @@
24#include "gc.h" 24#include "gc.h"
25#include <trace/events/f2fs.h> 25#include <trace/events/f2fs.h>
26 26
27static struct kmem_cache *winode_slab;
28
29static int gc_thread_func(void *data) 27static int gc_thread_func(void *data)
30{ 28{
31 struct f2fs_sb_info *sbi = data; 29 struct f2fs_sb_info *sbi = data;
@@ -46,7 +44,7 @@ static int gc_thread_func(void *data)
46 break; 44 break;
47 45
48 if (sbi->sb->s_writers.frozen >= SB_FREEZE_WRITE) { 46 if (sbi->sb->s_writers.frozen >= SB_FREEZE_WRITE) {
49 wait_ms = increase_sleep_time(gc_th, wait_ms); 47 increase_sleep_time(gc_th, &wait_ms);
50 continue; 48 continue;
51 } 49 }
52 50
@@ -67,15 +65,15 @@ static int gc_thread_func(void *data)
67 continue; 65 continue;
68 66
69 if (!is_idle(sbi)) { 67 if (!is_idle(sbi)) {
70 wait_ms = increase_sleep_time(gc_th, wait_ms); 68 increase_sleep_time(gc_th, &wait_ms);
71 mutex_unlock(&sbi->gc_mutex); 69 mutex_unlock(&sbi->gc_mutex);
72 continue; 70 continue;
73 } 71 }
74 72
75 if (has_enough_invalid_blocks(sbi)) 73 if (has_enough_invalid_blocks(sbi))
76 wait_ms = decrease_sleep_time(gc_th, wait_ms); 74 decrease_sleep_time(gc_th, &wait_ms);
77 else 75 else
78 wait_ms = increase_sleep_time(gc_th, wait_ms); 76 increase_sleep_time(gc_th, &wait_ms);
79 77
80 stat_inc_bggc_count(sbi); 78 stat_inc_bggc_count(sbi);
81 79
@@ -356,13 +354,10 @@ static void add_gc_inode(struct gc_inode_list *gc_list, struct inode *inode)
356 iput(inode); 354 iput(inode);
357 return; 355 return;
358 } 356 }
359 new_ie = f2fs_kmem_cache_alloc(winode_slab, GFP_NOFS); 357 new_ie = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
360 new_ie->inode = inode; 358 new_ie->inode = inode;
361retry: 359
362 if (radix_tree_insert(&gc_list->iroot, inode->i_ino, new_ie)) { 360 f2fs_radix_tree_insert(&gc_list->iroot, inode->i_ino, new_ie);
363 cond_resched();
364 goto retry;
365 }
366 list_add_tail(&new_ie->list, &gc_list->ilist); 361 list_add_tail(&new_ie->list, &gc_list->ilist);
367} 362}
368 363
@@ -373,7 +368,7 @@ static void put_gc_inode(struct gc_inode_list *gc_list)
373 radix_tree_delete(&gc_list->iroot, ie->inode->i_ino); 368 radix_tree_delete(&gc_list->iroot, ie->inode->i_ino);
374 iput(ie->inode); 369 iput(ie->inode);
375 list_del(&ie->list); 370 list_del(&ie->list);
376 kmem_cache_free(winode_slab, ie); 371 kmem_cache_free(inode_entry_slab, ie);
377 } 372 }
378} 373}
379 374
@@ -703,8 +698,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi)
703 .iroot = RADIX_TREE_INIT(GFP_NOFS), 698 .iroot = RADIX_TREE_INIT(GFP_NOFS),
704 }; 699 };
705 700
706 cpc.reason = test_opt(sbi, FASTBOOT) ? CP_UMOUNT : CP_SYNC; 701 cpc.reason = __get_cp_reason(sbi);
707
708gc_more: 702gc_more:
709 if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) 703 if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE)))
710 goto stop; 704 goto stop;
@@ -750,17 +744,3 @@ void build_gc_manager(struct f2fs_sb_info *sbi)
750{ 744{
751 DIRTY_I(sbi)->v_ops = &default_v_ops; 745 DIRTY_I(sbi)->v_ops = &default_v_ops;
752} 746}
753
754int __init create_gc_caches(void)
755{
756 winode_slab = f2fs_kmem_cache_create("f2fs_gc_inodes",
757 sizeof(struct inode_entry));
758 if (!winode_slab)
759 return -ENOMEM;
760 return 0;
761}
762
763void destroy_gc_caches(void)
764{
765 kmem_cache_destroy(winode_slab);
766}
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
index 6ff7ad38463e..b4a65be9f7d3 100644
--- a/fs/f2fs/gc.h
+++ b/fs/f2fs/gc.h
@@ -35,11 +35,6 @@ struct f2fs_gc_kthread {
35 unsigned int gc_idle; 35 unsigned int gc_idle;
36}; 36};
37 37
38struct inode_entry {
39 struct list_head list;
40 struct inode *inode;
41};
42
43struct gc_inode_list { 38struct gc_inode_list {
44 struct list_head ilist; 39 struct list_head ilist;
45 struct radix_tree_root iroot; 40 struct radix_tree_root iroot;
@@ -69,26 +64,26 @@ static inline block_t limit_free_user_blocks(struct f2fs_sb_info *sbi)
69 return (long)(reclaimable_user_blocks * LIMIT_FREE_BLOCK) / 100; 64 return (long)(reclaimable_user_blocks * LIMIT_FREE_BLOCK) / 100;
70} 65}
71 66
72static inline long increase_sleep_time(struct f2fs_gc_kthread *gc_th, long wait) 67static inline void increase_sleep_time(struct f2fs_gc_kthread *gc_th,
68 long *wait)
73{ 69{
74 if (wait == gc_th->no_gc_sleep_time) 70 if (*wait == gc_th->no_gc_sleep_time)
75 return wait; 71 return;
76 72
77 wait += gc_th->min_sleep_time; 73 *wait += gc_th->min_sleep_time;
78 if (wait > gc_th->max_sleep_time) 74 if (*wait > gc_th->max_sleep_time)
79 wait = gc_th->max_sleep_time; 75 *wait = gc_th->max_sleep_time;
80 return wait;
81} 76}
82 77
83static inline long decrease_sleep_time(struct f2fs_gc_kthread *gc_th, long wait) 78static inline void decrease_sleep_time(struct f2fs_gc_kthread *gc_th,
79 long *wait)
84{ 80{
85 if (wait == gc_th->no_gc_sleep_time) 81 if (*wait == gc_th->no_gc_sleep_time)
86 wait = gc_th->max_sleep_time; 82 *wait = gc_th->max_sleep_time;
87 83
88 wait -= gc_th->min_sleep_time; 84 *wait -= gc_th->min_sleep_time;
89 if (wait <= gc_th->min_sleep_time) 85 if (*wait <= gc_th->min_sleep_time)
90 wait = gc_th->min_sleep_time; 86 *wait = gc_th->min_sleep_time;
91 return wait;
92} 87}
93 88
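The sleep-time helpers now adjust the caller's wait value through a pointer instead of returning it, which lets the no-GC case bail out without touching the value. An illustrative userspace model of the clamping logic (the threshold constants are made up, not the gc_th defaults):

#include <stdio.h>

#define MIN_SLEEP   30000L
#define MAX_SLEEP   60000L
#define NO_GC_SLEEP 300000L

static void increase_sleep_time(long *wait)
{
    if (*wait == NO_GC_SLEEP)
        return;                  /* already parked at the no-GC interval */
    *wait += MIN_SLEEP;
    if (*wait > MAX_SLEEP)
        *wait = MAX_SLEEP;
}

static void decrease_sleep_time(long *wait)
{
    if (*wait == NO_GC_SLEEP)
        *wait = MAX_SLEEP;       /* leave the parked state first */
    *wait -= MIN_SLEEP;
    if (*wait <= MIN_SLEEP)
        *wait = MIN_SLEEP;
}

int main(void)
{
    long wait = MIN_SLEEP;

    increase_sleep_time(&wait);
    printf("after increase: %ld\n", wait);   /* clamped to MAX_SLEEP */
    decrease_sleep_time(&wait);
    printf("after decrease: %ld\n", wait);   /* back to MIN_SLEEP */
    return 0;
}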
94static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi) 89static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi)
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index f2d3c581e776..1484c00133cd 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -50,6 +50,12 @@ void read_inline_data(struct page *page, struct page *ipage)
50 SetPageUptodate(page); 50 SetPageUptodate(page);
51} 51}
52 52
53static void truncate_inline_data(struct page *ipage)
54{
55 f2fs_wait_on_page_writeback(ipage, NODE);
56 memset(inline_data_addr(ipage), 0, MAX_INLINE_DATA);
57}
58
53int f2fs_read_inline_data(struct inode *inode, struct page *page) 59int f2fs_read_inline_data(struct inode *inode, struct page *page)
54{ 60{
55 struct page *ipage; 61 struct page *ipage;
@@ -79,7 +85,6 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page)
79int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) 85int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
80{ 86{
81 void *src_addr, *dst_addr; 87 void *src_addr, *dst_addr;
82 block_t new_blk_addr;
83 struct f2fs_io_info fio = { 88 struct f2fs_io_info fio = {
84 .type = DATA, 89 .type = DATA,
85 .rw = WRITE_SYNC | REQ_PRIO, 90 .rw = WRITE_SYNC | REQ_PRIO,
@@ -115,9 +120,9 @@ no_update:
115 120
116 /* write data page to try to make data consistent */ 121 /* write data page to try to make data consistent */
117 set_page_writeback(page); 122 set_page_writeback(page);
118 123 fio.blk_addr = dn->data_blkaddr;
119 write_data_page(page, dn, &new_blk_addr, &fio); 124 write_data_page(page, dn, &fio);
120 update_extent_cache(new_blk_addr, dn); 125 update_extent_cache(dn);
121 f2fs_wait_on_page_writeback(page, DATA); 126 f2fs_wait_on_page_writeback(page, DATA);
122 if (dirty) 127 if (dirty)
123 inode_dec_dirty_pages(dn->inode); 128 inode_dec_dirty_pages(dn->inode);
@@ -126,7 +131,7 @@ no_update:
126 set_inode_flag(F2FS_I(dn->inode), FI_APPEND_WRITE); 131 set_inode_flag(F2FS_I(dn->inode), FI_APPEND_WRITE);
127 132
128 /* clear inline data and flag after data writeback */ 133 /* clear inline data and flag after data writeback */
129 truncate_inline_data(dn->inode_page, 0); 134 truncate_inline_data(dn->inode_page);
130clear_out: 135clear_out:
131 stat_dec_inline_inode(dn->inode); 136 stat_dec_inline_inode(dn->inode);
132 f2fs_clear_inline_inode(dn->inode); 137 f2fs_clear_inline_inode(dn->inode);
@@ -199,19 +204,6 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page)
199 return 0; 204 return 0;
200} 205}
201 206
202void truncate_inline_data(struct page *ipage, u64 from)
203{
204 void *addr;
205
206 if (from >= MAX_INLINE_DATA)
207 return;
208
209 f2fs_wait_on_page_writeback(ipage, NODE);
210
211 addr = inline_data_addr(ipage);
212 memset(addr + from, 0, MAX_INLINE_DATA - from);
213}
214
215bool recover_inline_data(struct inode *inode, struct page *npage) 207bool recover_inline_data(struct inode *inode, struct page *npage)
216{ 208{
217 struct f2fs_sb_info *sbi = F2FS_I_SB(inode); 209 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
@@ -253,7 +245,7 @@ process_inline:
253 if (f2fs_has_inline_data(inode)) { 245 if (f2fs_has_inline_data(inode)) {
254 ipage = get_node_page(sbi, inode->i_ino); 246 ipage = get_node_page(sbi, inode->i_ino);
255 f2fs_bug_on(sbi, IS_ERR(ipage)); 247 f2fs_bug_on(sbi, IS_ERR(ipage));
256 truncate_inline_data(ipage, 0); 248 truncate_inline_data(ipage);
257 f2fs_clear_inline_inode(inode); 249 f2fs_clear_inline_inode(inode);
258 update_inode(inode, ipage); 250 update_inode(inode, ipage);
259 f2fs_put_page(ipage, 1); 251 f2fs_put_page(ipage, 1);
@@ -371,7 +363,7 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage,
371 set_page_dirty(page); 363 set_page_dirty(page);
372 364
373 /* clear inline dir and flag after data writeback */ 365 /* clear inline dir and flag after data writeback */
374 truncate_inline_data(ipage, 0); 366 truncate_inline_data(ipage);
375 367
376 stat_dec_inline_dir(dir); 368 stat_dec_inline_dir(dir);
377 clear_inode_flag(F2FS_I(dir), FI_INLINE_DENTRY); 369 clear_inode_flag(F2FS_I(dir), FI_INLINE_DENTRY);
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 196cc7843aaf..2d002e3738a7 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -67,29 +67,23 @@ static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
67 } 67 }
68} 68}
69 69
70static int __recover_inline_status(struct inode *inode, struct page *ipage) 70static void __recover_inline_status(struct inode *inode, struct page *ipage)
71{ 71{
72 void *inline_data = inline_data_addr(ipage); 72 void *inline_data = inline_data_addr(ipage);
73 struct f2fs_inode *ri; 73 __le32 *start = inline_data;
74 void *zbuf; 74 __le32 *end = start + MAX_INLINE_DATA / sizeof(__le32);
75 75
76 zbuf = kzalloc(MAX_INLINE_DATA, GFP_NOFS); 76 while (start < end) {
77 if (!zbuf) 77 if (*start++) {
78 return -ENOMEM; 78 f2fs_wait_on_page_writeback(ipage, NODE);
79 79
80 if (!memcmp(zbuf, inline_data, MAX_INLINE_DATA)) { 80 set_inode_flag(F2FS_I(inode), FI_DATA_EXIST);
81 kfree(zbuf); 81 set_raw_inline(F2FS_I(inode), F2FS_INODE(ipage));
82 return 0; 82 set_page_dirty(ipage);
83 return;
84 }
83 } 85 }
84 kfree(zbuf); 86 return;
85
86 f2fs_wait_on_page_writeback(ipage, NODE);
87 set_inode_flag(F2FS_I(inode), FI_DATA_EXIST);
88
89 ri = F2FS_INODE(ipage);
90 set_raw_inline(F2FS_I(inode), ri);
91 set_page_dirty(ipage);
92 return 0;
93} 87}
94 88
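__recover_inline_status() previously kzalloc'ed a zero buffer just to memcmp it against the inline area; the rewrite scans the area as __le32 words, so the helper can no longer fail with -ENOMEM and drops its int return type. A standalone model of the allocation-free zero scan (the buffer size mirrors MAX_INLINE_DATA of this era, which is an assumption here):

#include <stdio.h>
#include <stdint.h>

#define INLINE_BYTES 3488    /* stand-in for MAX_INLINE_DATA */

/* returns 1 if the buffer contains any nonzero word — the same test
 * the patched helper performs without a temporary allocation */
static int has_data(const void *buf)
{
    const uint32_t *start = buf;
    const uint32_t *end = start + INLINE_BYTES / sizeof(uint32_t);

    while (start < end)
        if (*start++)
            return 1;
    return 0;
}

int main(void)
{
    static uint32_t inline_area[INLINE_BYTES / sizeof(uint32_t)];

    printf("%d\n", has_data(inline_area));   /* 0: all zero */
    inline_area[10] = 0xdeadbeef;
    printf("%d\n", has_data(inline_area));   /* 1: data exists */
    return 0;
}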
95static int do_read_inode(struct inode *inode) 89static int do_read_inode(struct inode *inode)
@@ -98,7 +92,6 @@ static int do_read_inode(struct inode *inode)
98 struct f2fs_inode_info *fi = F2FS_I(inode); 92 struct f2fs_inode_info *fi = F2FS_I(inode);
99 struct page *node_page; 93 struct page *node_page;
100 struct f2fs_inode *ri; 94 struct f2fs_inode *ri;
101 int err = 0;
102 95
103 /* Check if ino is within scope */ 96 /* Check if ino is within scope */
104 if (check_nid_range(sbi, inode->i_ino)) { 97 if (check_nid_range(sbi, inode->i_ino)) {
@@ -142,7 +135,7 @@ static int do_read_inode(struct inode *inode)
142 135
143 /* check data exist */ 136 /* check data exist */
144 if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode)) 137 if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode))
145 err = __recover_inline_status(inode, node_page); 138 __recover_inline_status(inode, node_page);
146 139
147 /* get rdev by using inline_info */ 140 /* get rdev by using inline_info */
148 __get_inode_rdev(inode, ri); 141 __get_inode_rdev(inode, ri);
@@ -152,7 +145,7 @@ static int do_read_inode(struct inode *inode)
152 stat_inc_inline_inode(inode); 145 stat_inc_inline_inode(inode);
153 stat_inc_inline_dir(inode); 146 stat_inc_inline_dir(inode);
154 147
155 return err; 148 return 0;
156} 149}
157 150
158struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) 151struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
@@ -304,7 +297,7 @@ void f2fs_evict_inode(struct inode *inode)
304 nid_t xnid = F2FS_I(inode)->i_xattr_nid; 297 nid_t xnid = F2FS_I(inode)->i_xattr_nid;
305 298
306 /* any remaining atomic pages should be discarded */ 299 /* any remaining atomic pages should be discarded */
307 if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode)) 300 if (f2fs_is_atomic_file(inode))
308 commit_inmem_pages(inode, true); 301 commit_inmem_pages(inode, true);
309 302
310 trace_f2fs_evict_inode(inode); 303 trace_f2fs_evict_inode(inode);
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 547a2deeb1ac..e79639a9787a 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -299,7 +299,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
299 inode->i_op = &f2fs_dir_inode_operations; 299 inode->i_op = &f2fs_dir_inode_operations;
300 inode->i_fop = &f2fs_dir_operations; 300 inode->i_fop = &f2fs_dir_operations;
301 inode->i_mapping->a_ops = &f2fs_dblock_aops; 301 inode->i_mapping->a_ops = &f2fs_dblock_aops;
302 mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO); 302 mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO);
303 303
304 set_inode_flag(F2FS_I(inode), FI_INC_LINK); 304 set_inode_flag(F2FS_I(inode), FI_INC_LINK);
305 f2fs_lock_op(sbi); 305 f2fs_lock_op(sbi);
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index f83326ca32ef..97bd9d3db882 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -19,6 +19,7 @@
19#include "f2fs.h" 19#include "f2fs.h"
20#include "node.h" 20#include "node.h"
21#include "segment.h" 21#include "segment.h"
22#include "trace.h"
22#include <trace/events/f2fs.h> 23#include <trace/events/f2fs.h>
23 24
24#define on_build_free_nids(nmi) mutex_is_locked(&nm_i->build_lock) 25#define on_build_free_nids(nmi) mutex_is_locked(&nm_i->build_lock)
@@ -57,12 +58,13 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
57 } else if (type == INO_ENTRIES) { 58 } else if (type == INO_ENTRIES) {
58 int i; 59 int i;
59 60
60 if (sbi->sb->s_bdi->dirty_exceeded)
61 return false;
62 for (i = 0; i <= UPDATE_INO; i++) 61 for (i = 0; i <= UPDATE_INO; i++)
63 mem_size += (sbi->im[i].ino_num * 62 mem_size += (sbi->im[i].ino_num *
64 sizeof(struct ino_entry)) >> PAGE_CACHE_SHIFT; 63 sizeof(struct ino_entry)) >> PAGE_CACHE_SHIFT;
65 res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); 64 res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
65 } else {
66 if (sbi->sb->s_bdi->dirty_exceeded)
67 return false;
66 } 68 }
67 return res; 69 return res;
68} 70}
@@ -268,7 +270,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
268 e = __lookup_nat_cache(nm_i, ni->nid); 270 e = __lookup_nat_cache(nm_i, ni->nid);
269 if (!e) { 271 if (!e) {
270 e = grab_nat_entry(nm_i, ni->nid); 272 e = grab_nat_entry(nm_i, ni->nid);
271 e->ni = *ni; 273 copy_node_info(&e->ni, ni);
272 f2fs_bug_on(sbi, ni->blk_addr == NEW_ADDR); 274 f2fs_bug_on(sbi, ni->blk_addr == NEW_ADDR);
273 } else if (new_blkaddr == NEW_ADDR) { 275 } else if (new_blkaddr == NEW_ADDR) {
274 /* 276 /*
@@ -276,7 +278,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
276 * previous nat entry can remain in the nat cache. 278 * previous nat entry can remain in the nat cache.
277 * So, reinitialize it with new information. 279 * So, reinitialize it with new information.
278 */ 280 */
279 e->ni = *ni; 281 copy_node_info(&e->ni, ni);
280 f2fs_bug_on(sbi, ni->blk_addr != NULL_ADDR); 282 f2fs_bug_on(sbi, ni->blk_addr != NULL_ADDR);
281 } 283 }
282 284
@@ -346,7 +348,6 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
346 struct nat_entry *e; 348 struct nat_entry *e;
347 int i; 349 int i;
348 350
349 memset(&ne, 0, sizeof(struct f2fs_nat_entry));
350 ni->nid = nid; 351 ni->nid = nid;
351 352
352 /* Check nat cache */ 353 /* Check nat cache */
@@ -361,6 +362,8 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
361 if (e) 362 if (e)
362 return; 363 return;
363 364
365 memset(&ne, 0, sizeof(struct f2fs_nat_entry));
366
364 /* Check current segment summary */ 367 /* Check current segment summary */
365 mutex_lock(&curseg->curseg_mutex); 368 mutex_lock(&curseg->curseg_mutex);
366 i = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 0); 369 i = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 0);
@@ -471,7 +474,7 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
471{ 474{
472 struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); 475 struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
473 struct page *npage[4]; 476 struct page *npage[4];
474 struct page *parent; 477 struct page *parent = NULL;
475 int offset[4]; 478 int offset[4];
476 unsigned int noffset[4]; 479 unsigned int noffset[4];
477 nid_t nids[4]; 480 nid_t nids[4];
@@ -488,6 +491,14 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
488 if (IS_ERR(npage[0])) 491 if (IS_ERR(npage[0]))
489 return PTR_ERR(npage[0]); 492 return PTR_ERR(npage[0]);
490 } 493 }
494
495 /* if inline_data is set, we should not report any block indices */
496 if (f2fs_has_inline_data(dn->inode) && index) {
497 err = -EINVAL;
498 f2fs_put_page(npage[0], 1);
499 goto release_out;
500 }
501
491 parent = npage[0]; 502 parent = npage[0];
492 if (level != 0) 503 if (level != 0)
493 nids[1] = get_nid(parent, offset[0], true); 504 nids[1] = get_nid(parent, offset[0], true);
@@ -585,7 +596,7 @@ static void truncate_node(struct dnode_of_data *dn)
585 } 596 }
586invalidate: 597invalidate:
587 clear_node_page_dirty(dn->node_page); 598 clear_node_page_dirty(dn->node_page);
588 F2FS_SET_SB_DIRT(sbi); 599 set_sbi_flag(sbi, SBI_IS_DIRTY);
589 600
590 f2fs_put_page(dn->node_page, 1); 601 f2fs_put_page(dn->node_page, 1);
591 602
@@ -976,6 +987,10 @@ static int read_node_page(struct page *page, int rw)
976{ 987{
977 struct f2fs_sb_info *sbi = F2FS_P_SB(page); 988 struct f2fs_sb_info *sbi = F2FS_P_SB(page);
978 struct node_info ni; 989 struct node_info ni;
990 struct f2fs_io_info fio = {
991 .type = NODE,
992 .rw = rw,
993 };
979 994
980 get_node_info(sbi, page->index, &ni); 995 get_node_info(sbi, page->index, &ni);
981 996
@@ -987,7 +1002,8 @@ static int read_node_page(struct page *page, int rw)
987 if (PageUptodate(page)) 1002 if (PageUptodate(page))
988 return LOCKED_PAGE; 1003 return LOCKED_PAGE;
989 1004
990 return f2fs_submit_page_bio(sbi, page, ni.blk_addr, rw); 1005 fio.blk_addr = ni.blk_addr;
1006 return f2fs_submit_page_bio(sbi, page, &fio);
991} 1007}
992 1008
993/* 1009/*
@@ -1028,11 +1044,11 @@ repeat:
1028 err = read_node_page(page, READ_SYNC); 1044 err = read_node_page(page, READ_SYNC);
1029 if (err < 0) 1045 if (err < 0)
1030 return ERR_PTR(err); 1046 return ERR_PTR(err);
1031 else if (err == LOCKED_PAGE) 1047 else if (err != LOCKED_PAGE)
1032 goto got_it; 1048 lock_page(page);
1033 1049
1034 lock_page(page);
1035 if (unlikely(!PageUptodate(page) || nid != nid_of_node(page))) { 1050 if (unlikely(!PageUptodate(page) || nid != nid_of_node(page))) {
1051 ClearPageUptodate(page);
1036 f2fs_put_page(page, 1); 1052 f2fs_put_page(page, 1);
1037 return ERR_PTR(-EIO); 1053 return ERR_PTR(-EIO);
1038 } 1054 }
@@ -1040,7 +1056,6 @@ repeat:
1040 f2fs_put_page(page, 1); 1056 f2fs_put_page(page, 1);
1041 goto repeat; 1057 goto repeat;
1042 } 1058 }
1043got_it:
1044 return page; 1059 return page;
1045} 1060}
1046 1061
@@ -1268,7 +1283,6 @@ static int f2fs_write_node_page(struct page *page,
1268{ 1283{
1269 struct f2fs_sb_info *sbi = F2FS_P_SB(page); 1284 struct f2fs_sb_info *sbi = F2FS_P_SB(page);
1270 nid_t nid; 1285 nid_t nid;
1271 block_t new_addr;
1272 struct node_info ni; 1286 struct node_info ni;
1273 struct f2fs_io_info fio = { 1287 struct f2fs_io_info fio = {
1274 .type = NODE, 1288 .type = NODE,
@@ -1277,7 +1291,7 @@ static int f2fs_write_node_page(struct page *page,
1277 1291
1278 trace_f2fs_writepage(page, NODE); 1292 trace_f2fs_writepage(page, NODE);
1279 1293
1280 if (unlikely(sbi->por_doing)) 1294 if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
1281 goto redirty_out; 1295 goto redirty_out;
1282 if (unlikely(f2fs_cp_error(sbi))) 1296 if (unlikely(f2fs_cp_error(sbi)))
1283 goto redirty_out; 1297 goto redirty_out;
@@ -1303,9 +1317,11 @@ static int f2fs_write_node_page(struct page *page,
1303 } else { 1317 } else {
1304 down_read(&sbi->node_write); 1318 down_read(&sbi->node_write);
1305 } 1319 }
1320
1306 set_page_writeback(page); 1321 set_page_writeback(page);
1307 write_node_page(sbi, page, &fio, nid, ni.blk_addr, &new_addr); 1322 fio.blk_addr = ni.blk_addr;
1308 set_node_addr(sbi, &ni, new_addr, is_fsync_dnode(page)); 1323 write_node_page(sbi, page, nid, &fio);
1324 set_node_addr(sbi, &ni, fio.blk_addr, is_fsync_dnode(page));
1309 dec_page_count(sbi, F2FS_DIRTY_NODES); 1325 dec_page_count(sbi, F2FS_DIRTY_NODES);
1310 up_read(&sbi->node_write); 1326 up_read(&sbi->node_write);
1311 unlock_page(page); 1327 unlock_page(page);
@@ -1355,26 +1371,12 @@ static int f2fs_set_node_page_dirty(struct page *page)
1355 __set_page_dirty_nobuffers(page); 1371 __set_page_dirty_nobuffers(page);
1356 inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES); 1372 inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES);
1357 SetPagePrivate(page); 1373 SetPagePrivate(page);
1374 f2fs_trace_pid(page);
1358 return 1; 1375 return 1;
1359 } 1376 }
1360 return 0; 1377 return 0;
1361} 1378}
1362 1379
1363static void f2fs_invalidate_node_page(struct page *page, unsigned int offset,
1364 unsigned int length)
1365{
1366 struct inode *inode = page->mapping->host;
1367 if (PageDirty(page))
1368 dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_NODES);
1369 ClearPagePrivate(page);
1370}
1371
1372static int f2fs_release_node_page(struct page *page, gfp_t wait)
1373{
1374 ClearPagePrivate(page);
1375 return 1;
1376}
1377
1378/* 1380/*
1379 * Structure of the f2fs node operations 1381 * Structure of the f2fs node operations
1380 */ 1382 */
@@ -1382,8 +1384,8 @@ const struct address_space_operations f2fs_node_aops = {
1382 .writepage = f2fs_write_node_page, 1384 .writepage = f2fs_write_node_page,
1383 .writepages = f2fs_write_node_pages, 1385 .writepages = f2fs_write_node_pages,
1384 .set_page_dirty = f2fs_set_node_page_dirty, 1386 .set_page_dirty = f2fs_set_node_page_dirty,
1385 .invalidatepage = f2fs_invalidate_node_page, 1387 .invalidatepage = f2fs_invalidate_page,
1386 .releasepage = f2fs_release_node_page, 1388 .releasepage = f2fs_release_page,
1387}; 1389};
1388 1390
1389static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i, 1391static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i,
@@ -1726,80 +1728,41 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
1726 return 0; 1728 return 0;
1727} 1729}
1728 1730
1729/*
1730 * ra_sum_pages() merges contiguous pages into one bio and submits it.
1731 * these pre-read pages are allocated in bd_inode's mapping tree.
1732 */
1733static int ra_sum_pages(struct f2fs_sb_info *sbi, struct page **pages,
1734 int start, int nrpages)
1735{
1736 struct inode *inode = sbi->sb->s_bdev->bd_inode;
1737 struct address_space *mapping = inode->i_mapping;
1738 int i, page_idx = start;
1739 struct f2fs_io_info fio = {
1740 .type = META,
1741 .rw = READ_SYNC | REQ_META | REQ_PRIO
1742 };
1743
1744 for (i = 0; page_idx < start + nrpages; page_idx++, i++) {
1745 /* alloc page in bd_inode for reading node summary info */
1746 pages[i] = grab_cache_page(mapping, page_idx);
1747 if (!pages[i])
1748 break;
1749 f2fs_submit_page_mbio(sbi, pages[i], page_idx, &fio);
1750 }
1751
1752 f2fs_submit_merged_bio(sbi, META, READ);
1753 return i;
1754}
1755
1756int restore_node_summary(struct f2fs_sb_info *sbi, 1731int restore_node_summary(struct f2fs_sb_info *sbi,
1757 unsigned int segno, struct f2fs_summary_block *sum) 1732 unsigned int segno, struct f2fs_summary_block *sum)
1758{ 1733{
1759 struct f2fs_node *rn; 1734 struct f2fs_node *rn;
1760 struct f2fs_summary *sum_entry; 1735 struct f2fs_summary *sum_entry;
1761 struct inode *inode = sbi->sb->s_bdev->bd_inode;
1762 block_t addr; 1736 block_t addr;
1763 int bio_blocks = MAX_BIO_BLOCKS(sbi); 1737 int bio_blocks = MAX_BIO_BLOCKS(sbi);
1764 struct page *pages[bio_blocks]; 1738 int i, idx, last_offset, nrpages;
1765 int i, idx, last_offset, nrpages, err = 0;
1766 1739
1767 /* scan the node segment */ 1740 /* scan the node segment */
1768 last_offset = sbi->blocks_per_seg; 1741 last_offset = sbi->blocks_per_seg;
1769 addr = START_BLOCK(sbi, segno); 1742 addr = START_BLOCK(sbi, segno);
1770 sum_entry = &sum->entries[0]; 1743 sum_entry = &sum->entries[0];
1771 1744
1772 for (i = 0; !err && i < last_offset; i += nrpages, addr += nrpages) { 1745 for (i = 0; i < last_offset; i += nrpages, addr += nrpages) {
1773 nrpages = min(last_offset - i, bio_blocks); 1746 nrpages = min(last_offset - i, bio_blocks);
1774 1747
1775 /* readahead node pages */ 1748 /* readahead node pages */
1776 nrpages = ra_sum_pages(sbi, pages, addr, nrpages); 1749 ra_meta_pages(sbi, addr, nrpages, META_POR);
1777 if (!nrpages)
1778 return -ENOMEM;
1779 1750
1780 for (idx = 0; idx < nrpages; idx++) { 1751 for (idx = addr; idx < addr + nrpages; idx++) {
1781 if (err) 1752 struct page *page = get_meta_page(sbi, idx);
1782 goto skip;
1783 1753
1784 lock_page(pages[idx]); 1754 rn = F2FS_NODE(page);
1785 if (unlikely(!PageUptodate(pages[idx]))) { 1755 sum_entry->nid = rn->footer.nid;
1786 err = -EIO; 1756 sum_entry->version = 0;
1787 } else { 1757 sum_entry->ofs_in_node = 0;
1788 rn = F2FS_NODE(pages[idx]); 1758 sum_entry++;
1789 sum_entry->nid = rn->footer.nid; 1759 f2fs_put_page(page, 1);
1790 sum_entry->version = 0;
1791 sum_entry->ofs_in_node = 0;
1792 sum_entry++;
1793 }
1794 unlock_page(pages[idx]);
1795skip:
1796 page_cache_release(pages[idx]);
1797 } 1760 }
1798 1761
1799 invalidate_mapping_pages(inode->i_mapping, addr, 1762 invalidate_mapping_pages(META_MAPPING(sbi), addr,
1800 addr + nrpages); 1763 addr + nrpages);
1801 } 1764 }
1802 return err; 1765 return 0;
1803} 1766}
1804 1767
1805static void remove_nats_in_journal(struct f2fs_sb_info *sbi) 1768static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
@@ -1923,7 +1886,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
1923 struct f2fs_nm_info *nm_i = NM_I(sbi); 1886 struct f2fs_nm_info *nm_i = NM_I(sbi);
1924 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); 1887 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
1925 struct f2fs_summary_block *sum = curseg->sum_blk; 1888 struct f2fs_summary_block *sum = curseg->sum_blk;
1926 struct nat_entry_set *setvec[NATVEC_SIZE]; 1889 struct nat_entry_set *setvec[SETVEC_SIZE];
1927 struct nat_entry_set *set, *tmp; 1890 struct nat_entry_set *set, *tmp;
1928 unsigned int found; 1891 unsigned int found;
1929 nid_t set_idx = 0; 1892 nid_t set_idx = 0;
@@ -1940,7 +1903,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
1940 remove_nats_in_journal(sbi); 1903 remove_nats_in_journal(sbi);
1941 1904
1942 while ((found = __gang_lookup_nat_set(nm_i, 1905 while ((found = __gang_lookup_nat_set(nm_i,
1943 set_idx, NATVEC_SIZE, setvec))) { 1906 set_idx, SETVEC_SIZE, setvec))) {
1944 unsigned idx; 1907 unsigned idx;
1945 set_idx = setvec[found - 1]->set + 1; 1908 set_idx = setvec[found - 1]->set + 1;
1946 for (idx = 0; idx < found; idx++) 1909 for (idx = 0; idx < found; idx++)
@@ -2020,6 +1983,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
2020 struct f2fs_nm_info *nm_i = NM_I(sbi); 1983 struct f2fs_nm_info *nm_i = NM_I(sbi);
2021 struct free_nid *i, *next_i; 1984 struct free_nid *i, *next_i;
2022 struct nat_entry *natvec[NATVEC_SIZE]; 1985 struct nat_entry *natvec[NATVEC_SIZE];
1986 struct nat_entry_set *setvec[SETVEC_SIZE];
2023 nid_t nid = 0; 1987 nid_t nid = 0;
2024 unsigned int found; 1988 unsigned int found;
2025 1989
@@ -2044,11 +2008,27 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
2044 while ((found = __gang_lookup_nat_cache(nm_i, 2008 while ((found = __gang_lookup_nat_cache(nm_i,
2045 nid, NATVEC_SIZE, natvec))) { 2009 nid, NATVEC_SIZE, natvec))) {
2046 unsigned idx; 2010 unsigned idx;
2011
2047 nid = nat_get_nid(natvec[found - 1]) + 1; 2012 nid = nat_get_nid(natvec[found - 1]) + 1;
2048 for (idx = 0; idx < found; idx++) 2013 for (idx = 0; idx < found; idx++)
2049 __del_from_nat_cache(nm_i, natvec[idx]); 2014 __del_from_nat_cache(nm_i, natvec[idx]);
2050 } 2015 }
2051 f2fs_bug_on(sbi, nm_i->nat_cnt); 2016 f2fs_bug_on(sbi, nm_i->nat_cnt);
2017
2018 /* destroy nat set cache */
2019 nid = 0;
2020 while ((found = __gang_lookup_nat_set(nm_i,
2021 nid, SETVEC_SIZE, setvec))) {
2022 unsigned idx;
2023
2024 nid = setvec[found - 1]->set + 1;
2025 for (idx = 0; idx < found; idx++) {
2026 /* entry_cnt may be nonzero when a cp_error has occurred */
2027 f2fs_bug_on(sbi, !list_empty(&setvec[idx]->entry_list));
2028 radix_tree_delete(&nm_i->nat_set_root, setvec[idx]->set);
2029 kmem_cache_free(nat_entry_set_slab, setvec[idx]);
2030 }
2031 }
2052 up_write(&nm_i->nat_tree_lock); 2032 up_write(&nm_i->nat_tree_lock);
2053 2033
2054 kfree(nm_i->nat_bitmap); 2034 kfree(nm_i->nat_bitmap);
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index d10b6448a671..f405bbf2435a 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -25,10 +25,19 @@
25 25
26/* vector size for gang look-up from nat cache that consists of radix tree */ 26/* vector size for gang look-up from nat cache that consists of radix tree */
27#define NATVEC_SIZE 64 27#define NATVEC_SIZE 64
28#define SETVEC_SIZE 32
28 29
29/* return value for read_node_page */ 30/* return value for read_node_page */
30#define LOCKED_PAGE 1 31#define LOCKED_PAGE 1
31 32
33/* For flag in struct node_info */
34enum {
35 IS_CHECKPOINTED, /* is it checkpointed before? */
36 HAS_FSYNCED_INODE, /* is the inode fsynced before? */
37 HAS_LAST_FSYNC, /* has the latest node fsync mark? */
38 IS_DIRTY, /* this nat entry is dirty? */
39};
40
32/* 41/*
33 * For node information 42 * For node information
34 */ 43 */
@@ -37,18 +46,11 @@ struct node_info {
37 nid_t ino; /* inode number of the node's owner */ 46 nid_t ino; /* inode number of the node's owner */
38 block_t blk_addr; /* block address of the node */ 47 block_t blk_addr; /* block address of the node */
39 unsigned char version; /* version of the node */ 48 unsigned char version; /* version of the node */
40}; 49 unsigned char flag; /* for node information bits */
41
42enum {
43 IS_CHECKPOINTED, /* is it checkpointed before? */
44 HAS_FSYNCED_INODE, /* is the inode fsynced before? */
45 HAS_LAST_FSYNC, /* has the latest node fsync mark? */
46 IS_DIRTY, /* this nat entry is dirty? */
47}; 50};
48 51
49struct nat_entry { 52struct nat_entry {
50 struct list_head list; /* for clean or dirty nat list */ 53 struct list_head list; /* for clean or dirty nat list */
51 unsigned char flag; /* for node information bits */
52 struct node_info ni; /* in-memory node information */ 54 struct node_info ni; /* in-memory node information */
53}; 55};
54 56
@@ -63,20 +65,30 @@ struct nat_entry {
63 65
64#define inc_node_version(version) (++version) 66#define inc_node_version(version) (++version)
65 67
68static inline void copy_node_info(struct node_info *dst,
69 struct node_info *src)
70{
71 dst->nid = src->nid;
72 dst->ino = src->ino;
73 dst->blk_addr = src->blk_addr;
74 dst->version = src->version;
75 /* should not copy flag here */
76}
77
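Because the flag byte now lives inside struct node_info, a plain struct assignment (the old e->ni = *ni) would wipe the cache-private flag bits, which is why copy_node_info() copies field by field and deliberately skips flag. A toy demonstration of the difference:

#include <stdio.h>

struct node_info {
    unsigned int nid;
    unsigned int blk_addr;
    unsigned char flag;    /* cache-private bits that must survive updates */
};

/* field-wise copy mirroring copy_node_info(): everything but flag */
static void copy_node_info(struct node_info *dst, const struct node_info *src)
{
    dst->nid = src->nid;
    dst->blk_addr = src->blk_addr;
    /* deliberately not copying dst->flag */
}

int main(void)
{
    struct node_info cached = { .nid = 1, .blk_addr = 100, .flag = 0x04 };
    struct node_info fresh  = { .nid = 1, .blk_addr = 200, .flag = 0 };

    copy_node_info(&cached, &fresh);   /* flag survives the update */
    printf("blk=%u flag=0x%02x\n", cached.blk_addr, cached.flag);

    cached = fresh;                    /* struct assignment: flag is lost */
    printf("blk=%u flag=0x%02x\n", cached.blk_addr, cached.flag);
    return 0;
}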
66static inline void set_nat_flag(struct nat_entry *ne, 78static inline void set_nat_flag(struct nat_entry *ne,
67 unsigned int type, bool set) 79 unsigned int type, bool set)
68{ 80{
69 unsigned char mask = 0x01 << type; 81 unsigned char mask = 0x01 << type;
70 if (set) 82 if (set)
71 ne->flag |= mask; 83 ne->ni.flag |= mask;
72 else 84 else
73 ne->flag &= ~mask; 85 ne->ni.flag &= ~mask;
74} 86}
75 87
76static inline bool get_nat_flag(struct nat_entry *ne, unsigned int type) 88static inline bool get_nat_flag(struct nat_entry *ne, unsigned int type)
77{ 89{
78 unsigned char mask = 0x01 << type; 90 unsigned char mask = 0x01 << type;
79 return ne->flag & mask; 91 return ne->ni.flag & mask;
80} 92}
81 93
82static inline void nat_reset_flag(struct nat_entry *ne) 94static inline void nat_reset_flag(struct nat_entry *ne)
@@ -108,6 +120,7 @@ enum mem_type {
108 NAT_ENTRIES, /* indicates the cached nat entry */ 120 NAT_ENTRIES, /* indicates the cached nat entry */
109 DIRTY_DENTS, /* indicates dirty dentry pages */ 121 DIRTY_DENTS, /* indicates dirty dentry pages */
110 INO_ENTRIES, /* indicates inode entries */ 122 INO_ENTRIES, /* indicates inode entries */
123 BASE_CHECK, /* check kernel status */
111}; 124};
112 125
113struct nat_entry_set { 126struct nat_entry_set {
@@ -200,11 +213,19 @@ static inline void fill_node_footer(struct page *page, nid_t nid,
200 nid_t ino, unsigned int ofs, bool reset) 213 nid_t ino, unsigned int ofs, bool reset)
201{ 214{
202 struct f2fs_node *rn = F2FS_NODE(page); 215 struct f2fs_node *rn = F2FS_NODE(page);
216 unsigned int old_flag = 0;
217
203 if (reset) 218 if (reset)
204 memset(rn, 0, sizeof(*rn)); 219 memset(rn, 0, sizeof(*rn));
220 else
221 old_flag = le32_to_cpu(rn->footer.flag);
222
205 rn->footer.nid = cpu_to_le32(nid); 223 rn->footer.nid = cpu_to_le32(nid);
206 rn->footer.ino = cpu_to_le32(ino); 224 rn->footer.ino = cpu_to_le32(ino);
207 rn->footer.flag = cpu_to_le32(ofs << OFFSET_BIT_SHIFT); 225
226 /* should remain old flag bits such as COLD_BIT_SHIFT */
227 rn->footer.flag = cpu_to_le32((ofs << OFFSET_BIT_SHIFT) |
228 (old_flag & OFFSET_BIT_MASK));
208} 229}
209 230
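fill_node_footer() now preserves the low flag bits (such as the cold bit) when rewriting the offset field, instead of clobbering the whole word. A sketch of the mask-and-merge; the shift and mask values are illustrative, not the kernel's:

#include <stdio.h>
#include <stdint.h>

/* low bits carry per-node state, high bits carry the node offset */
#define OFFSET_BIT_SHIFT 4
#define OFFSET_BIT_MASK  ((1u << OFFSET_BIT_SHIFT) - 1)

/* replace the offset field but keep the existing low flag bits */
static uint32_t fill_footer_flag(uint32_t old_flag, uint32_t ofs)
{
    return (ofs << OFFSET_BIT_SHIFT) | (old_flag & OFFSET_BIT_MASK);
}

int main(void)
{
    uint32_t flag = fill_footer_flag(0, 7);   /* offset 7, no flag bits */
    printf("0x%08x\n", flag);                 /* 0x00000070 */

    flag |= 0x2;                              /* pretend the cold bit is set */
    flag = fill_footer_flag(flag, 9);         /* re-fill with offset 9 */
    printf("0x%08x\n", flag);                 /* 0x00000092: cold bit kept */
    return 0;
}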
210static inline void copy_node_footer(struct page *dst, struct page *src) 231static inline void copy_node_footer(struct page *dst, struct page *src)
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 9160a37e1c7a..41afb9534bbd 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -346,6 +346,10 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
346 if (IS_INODE(page)) { 346 if (IS_INODE(page)) {
347 recover_inline_xattr(inode, page); 347 recover_inline_xattr(inode, page);
348 } else if (f2fs_has_xattr_block(ofs_of_node(page))) { 348 } else if (f2fs_has_xattr_block(ofs_of_node(page))) {
349 /*
350 * Deprecated; xattr blocks should be found in the cold log.
351 * But we should keep this for backward compatibility.
352 */
349 recover_xattr_data(inode, page, blkaddr); 353 recover_xattr_data(inode, page, blkaddr);
350 goto out; 354 goto out;
351 } 355 }
@@ -396,7 +400,8 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
396 400
397 /* write dummy data page */ 401 /* write dummy data page */
398 recover_data_page(sbi, NULL, &sum, src, dest); 402 recover_data_page(sbi, NULL, &sum, src, dest);
399 update_extent_cache(dest, &dn); 403 dn.data_blkaddr = dest;
404 update_extent_cache(&dn);
400 recovered++; 405 recovered++;
401 } 406 }
402 dn.ofs_in_node++; 407 dn.ofs_in_node++;
@@ -503,7 +508,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
503 INIT_LIST_HEAD(&inode_list); 508 INIT_LIST_HEAD(&inode_list);
504 509
505 /* step #1: find fsynced inode numbers */ 510 /* step #1: find fsynced inode numbers */
506 sbi->por_doing = true; 511 set_sbi_flag(sbi, SBI_POR_DOING);
507 512
508 /* prevent checkpoint */ 513 /* prevent checkpoint */
509 mutex_lock(&sbi->cp_mutex); 514 mutex_lock(&sbi->cp_mutex);
@@ -536,7 +541,7 @@ out:
536 truncate_inode_pages_final(META_MAPPING(sbi)); 541 truncate_inode_pages_final(META_MAPPING(sbi));
537 } 542 }
538 543
539 sbi->por_doing = false; 544 clear_sbi_flag(sbi, SBI_POR_DOING);
540 if (err) { 545 if (err) {
541 discard_next_dnode(sbi, blkaddr); 546 discard_next_dnode(sbi, blkaddr);
542 547
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 42607a679923..daee4ab913da 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -20,6 +20,7 @@
20#include "f2fs.h" 20#include "f2fs.h"
21#include "segment.h" 21#include "segment.h"
22#include "node.h" 22#include "node.h"
23#include "trace.h"
23#include <trace/events/f2fs.h> 24#include <trace/events/f2fs.h>
24 25
25#define __reverse_ffz(x) __reverse_ffs(~(x)) 26#define __reverse_ffz(x) __reverse_ffs(~(x))
@@ -181,6 +182,7 @@ void register_inmem_page(struct inode *inode, struct page *page)
181 int err; 182 int err;
182 183
183 SetPagePrivate(page); 184 SetPagePrivate(page);
185 f2fs_trace_pid(page);
184 186
185 new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS); 187 new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS);
186 188
@@ -205,23 +207,6 @@ retry:
205 mutex_unlock(&fi->inmem_lock); 207 mutex_unlock(&fi->inmem_lock);
206} 208}
207 209
208void invalidate_inmem_page(struct inode *inode, struct page *page)
209{
210 struct f2fs_inode_info *fi = F2FS_I(inode);
211 struct inmem_pages *cur;
212
213 mutex_lock(&fi->inmem_lock);
214 cur = radix_tree_lookup(&fi->inmem_root, page->index);
215 if (cur) {
216 radix_tree_delete(&fi->inmem_root, cur->page->index);
217 f2fs_put_page(cur->page, 0);
218 list_del(&cur->list);
219 kmem_cache_free(inmem_entry_slab, cur);
220 dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES);
221 }
222 mutex_unlock(&fi->inmem_lock);
223}
224
225void commit_inmem_pages(struct inode *inode, bool abort) 210void commit_inmem_pages(struct inode *inode, bool abort)
226{ 211{
227 struct f2fs_sb_info *sbi = F2FS_I_SB(inode); 212 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
@@ -230,7 +215,7 @@ void commit_inmem_pages(struct inode *inode, bool abort)
230 bool submit_bio = false; 215 bool submit_bio = false;
231 struct f2fs_io_info fio = { 216 struct f2fs_io_info fio = {
232 .type = DATA, 217 .type = DATA,
233 .rw = WRITE_SYNC, 218 .rw = WRITE_SYNC | REQ_PRIO,
234 }; 219 };
235 220
236 /* 221 /*
@@ -240,33 +225,38 @@ void commit_inmem_pages(struct inode *inode, bool abort)
240 * Otherwise, f2fs_gc in f2fs_balance_fs can wait forever until this 225 * Otherwise, f2fs_gc in f2fs_balance_fs can wait forever until this
241 * inode is freed by iget_locked in f2fs_iget. 226 * inode is freed by iget_locked in f2fs_iget.
242 */ 227 */
243 if (!abort) 228 if (!abort) {
244 f2fs_balance_fs(sbi); 229 f2fs_balance_fs(sbi);
245 230 f2fs_lock_op(sbi);
246 f2fs_lock_op(sbi); 231 }
247 232
248 mutex_lock(&fi->inmem_lock); 233 mutex_lock(&fi->inmem_lock);
249 list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) { 234 list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) {
250 lock_page(cur->page); 235 if (!abort) {
251 if (!abort && cur->page->mapping == inode->i_mapping) { 236 lock_page(cur->page);
252 f2fs_wait_on_page_writeback(cur->page, DATA); 237 if (cur->page->mapping == inode->i_mapping) {
253 if (clear_page_dirty_for_io(cur->page)) 238 f2fs_wait_on_page_writeback(cur->page, DATA);
254 inode_dec_dirty_pages(inode); 239 if (clear_page_dirty_for_io(cur->page))
255 do_write_data_page(cur->page, &fio); 240 inode_dec_dirty_pages(inode);
256 submit_bio = true; 241 do_write_data_page(cur->page, &fio);
242 submit_bio = true;
243 }
244 f2fs_put_page(cur->page, 1);
245 } else {
246 put_page(cur->page);
257 } 247 }
258 radix_tree_delete(&fi->inmem_root, cur->page->index); 248 radix_tree_delete(&fi->inmem_root, cur->page->index);
259 f2fs_put_page(cur->page, 1);
260 list_del(&cur->list); 249 list_del(&cur->list);
261 kmem_cache_free(inmem_entry_slab, cur); 250 kmem_cache_free(inmem_entry_slab, cur);
262 dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES); 251 dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES);
263 } 252 }
264 if (submit_bio)
265 f2fs_submit_merged_bio(sbi, DATA, WRITE);
266 mutex_unlock(&fi->inmem_lock); 253 mutex_unlock(&fi->inmem_lock);
267 254
268 filemap_fdatawait_range(inode->i_mapping, 0, LLONG_MAX); 255 if (!abort) {
269 f2fs_unlock_op(sbi); 256 f2fs_unlock_op(sbi);
257 if (submit_bio)
258 f2fs_submit_merged_bio(sbi, DATA, WRITE);
259 }
270} 260}
271 261
272/* 262/*
@@ -290,7 +280,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
290 /* check the # of cached NAT entries and prefree segments */ 280 /* check the # of cached NAT entries and prefree segments */
291 if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK) || 281 if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK) ||
292 excess_prefree_segs(sbi) || 282 excess_prefree_segs(sbi) ||
293 available_free_memory(sbi, INO_ENTRIES)) 283 !available_free_memory(sbi, INO_ENTRIES))
294 f2fs_sync_fs(sbi->sb, true); 284 f2fs_sync_fs(sbi->sb, true);
295} 285}
296 286
@@ -515,12 +505,13 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc)
515 struct seg_entry *se = get_seg_entry(sbi, cpc->trim_start); 505 struct seg_entry *se = get_seg_entry(sbi, cpc->trim_start);
516 unsigned long *cur_map = (unsigned long *)se->cur_valid_map; 506 unsigned long *cur_map = (unsigned long *)se->cur_valid_map;
517 unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; 507 unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map;
518 unsigned long dmap[entries]; 508 unsigned long *dmap = SIT_I(sbi)->tmp_map;
519 unsigned int start = 0, end = -1; 509 unsigned int start = 0, end = -1;
520 bool force = (cpc->reason == CP_DISCARD); 510 bool force = (cpc->reason == CP_DISCARD);
521 int i; 511 int i;
522 512
523 if (!force && !test_opt(sbi, DISCARD)) 513 if (!force && (!test_opt(sbi, DISCARD) ||
514 SM_I(sbi)->nr_discards >= SM_I(sbi)->max_discards))
524 return; 515 return;
525 516
526 if (force && !se->valid_blocks) { 517 if (force && !se->valid_blocks) {
@@ -548,7 +539,8 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc)
548 539
549 /* SIT_VBLOCK_MAP_SIZE should be multiple of sizeof(unsigned long) */ 540 /* SIT_VBLOCK_MAP_SIZE should be multiple of sizeof(unsigned long) */
550 for (i = 0; i < entries; i++) 541 for (i = 0; i < entries; i++)
551 dmap[i] = ~(cur_map[i] | ckpt_map[i]); 542 dmap[i] = force ? ~ckpt_map[i] :
543 (cur_map[i] ^ ckpt_map[i]) & ckpt_map[i];
552 544
553 while (force || SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) { 545 while (force || SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) {
554 start = __find_rev_next_bit(dmap, max_blocks, end + 1); 546 start = __find_rev_next_bit(dmap, max_blocks, end + 1);
@@ -735,7 +727,7 @@ static void __add_sum_entry(struct f2fs_sb_info *sbi, int type,
735/* 727/*
736 * Calculate the number of current summary pages for writing 728 * Calculate the number of current summary pages for writing
737 */ 729 */
738int npages_for_summary_flush(struct f2fs_sb_info *sbi) 730int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra)
739{ 731{
740 int valid_sum_count = 0; 732 int valid_sum_count = 0;
741 int i, sum_in_page; 733 int i, sum_in_page;
@@ -743,8 +735,13 @@ int npages_for_summary_flush(struct f2fs_sb_info *sbi)
743 for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { 735 for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
744 if (sbi->ckpt->alloc_type[i] == SSR) 736 if (sbi->ckpt->alloc_type[i] == SSR)
745 valid_sum_count += sbi->blocks_per_seg; 737 valid_sum_count += sbi->blocks_per_seg;
746 else 738 else {
747 valid_sum_count += curseg_blkoff(sbi, i); 739 if (for_ra)
740 valid_sum_count += le16_to_cpu(
741 F2FS_CKPT(sbi)->cur_data_blkoff[i]);
742 else
743 valid_sum_count += curseg_blkoff(sbi, i);
744 }
748 } 745 }
749 746
750 sum_in_page = (PAGE_CACHE_SIZE - 2 * SUM_JOURNAL_SIZE - 747 sum_in_page = (PAGE_CACHE_SIZE - 2 * SUM_JOURNAL_SIZE -
@@ -803,7 +800,7 @@ static void get_new_segment(struct f2fs_sb_info *sbi,
803 int go_left = 0; 800 int go_left = 0;
804 int i; 801 int i;
805 802
806 write_lock(&free_i->segmap_lock); 803 spin_lock(&free_i->segmap_lock);
807 804
808 if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) { 805 if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) {
809 segno = find_next_zero_bit(free_i->free_segmap, 806 segno = find_next_zero_bit(free_i->free_segmap,
@@ -876,7 +873,7 @@ got_it:
876 f2fs_bug_on(sbi, test_bit(segno, free_i->free_segmap)); 873 f2fs_bug_on(sbi, test_bit(segno, free_i->free_segmap));
877 __set_inuse(sbi, segno); 874 __set_inuse(sbi, segno);
878 *newseg = segno; 875 *newseg = segno;
879 write_unlock(&free_i->segmap_lock); 876 spin_unlock(&free_i->segmap_lock);
880} 877}
881 878
882static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified) 879static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified)
@@ -927,7 +924,7 @@ static void __next_free_blkoff(struct f2fs_sb_info *sbi,
927{ 924{
928 struct seg_entry *se = get_seg_entry(sbi, seg->segno); 925 struct seg_entry *se = get_seg_entry(sbi, seg->segno);
929 int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); 926 int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long);
930 unsigned long target_map[entries]; 927 unsigned long *target_map = SIT_I(sbi)->tmp_map;
931 unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; 928 unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map;
932 unsigned long *cur_map = (unsigned long *)se->cur_valid_map; 929 unsigned long *cur_map = (unsigned long *)se->cur_valid_map;
933 int i, pos; 930 int i, pos;
@@ -1027,18 +1024,22 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi,
1027 stat_inc_seg_type(sbi, curseg); 1024 stat_inc_seg_type(sbi, curseg);
1028} 1025}
1029 1026
1027static void __allocate_new_segments(struct f2fs_sb_info *sbi, int type)
1028{
1029 struct curseg_info *curseg = CURSEG_I(sbi, type);
1030 unsigned int old_segno;
1031
1032 old_segno = curseg->segno;
1033 SIT_I(sbi)->s_ops->allocate_segment(sbi, type, true);
1034 locate_dirty_segment(sbi, old_segno);
1035}
1036
1030void allocate_new_segments(struct f2fs_sb_info *sbi) 1037void allocate_new_segments(struct f2fs_sb_info *sbi)
1031{ 1038{
1032 struct curseg_info *curseg;
1033 unsigned int old_curseg;
1034 int i; 1039 int i;
1035 1040
1036 for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { 1041 for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++)
1037 curseg = CURSEG_I(sbi, i); 1042 __allocate_new_segments(sbi, i);
1038 old_curseg = curseg->segno;
1039 SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true);
1040 locate_dirty_segment(sbi, old_curseg);
1041 }
1042} 1043}
1043 1044
1044static const struct segment_allocation default_salloc_ops = { 1045static const struct segment_allocation default_salloc_ops = {
@@ -1047,8 +1048,8 @@ static const struct segment_allocation default_salloc_ops = {
1047 1048
1048int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) 1049int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
1049{ 1050{
1050 __u64 start = range->start >> sbi->log_blocksize; 1051 __u64 start = F2FS_BYTES_TO_BLK(range->start);
1051 __u64 end = start + (range->len >> sbi->log_blocksize) - 1; 1052 __u64 end = start + F2FS_BYTES_TO_BLK(range->len) - 1;
1052 unsigned int start_segno, end_segno; 1053 unsigned int start_segno, end_segno;
1053 struct cp_control cpc; 1054 struct cp_control cpc;
1054 1055
@@ -1065,16 +1066,21 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
1065 end_segno = (end >= MAX_BLKADDR(sbi)) ? MAIN_SEGS(sbi) - 1 : 1066 end_segno = (end >= MAX_BLKADDR(sbi)) ? MAIN_SEGS(sbi) - 1 :
1066 GET_SEGNO(sbi, end); 1067 GET_SEGNO(sbi, end);
1067 cpc.reason = CP_DISCARD; 1068 cpc.reason = CP_DISCARD;
1068 cpc.trim_start = start_segno; 1069 cpc.trim_minlen = F2FS_BYTES_TO_BLK(range->minlen);
1069 cpc.trim_end = end_segno;
1070 cpc.trim_minlen = range->minlen >> sbi->log_blocksize;
1071 1070
1072 /* do checkpoint to issue discard commands safely */ 1071 /* do checkpoint to issue discard commands safely */
1073 mutex_lock(&sbi->gc_mutex); 1072 for (; start_segno <= end_segno; start_segno = cpc.trim_end + 1) {
1074 write_checkpoint(sbi, &cpc); 1073 cpc.trim_start = start_segno;
1075 mutex_unlock(&sbi->gc_mutex); 1074 cpc.trim_end = min_t(unsigned int, rounddown(start_segno +
1075 BATCHED_TRIM_SEGMENTS(sbi),
1076 sbi->segs_per_sec) - 1, end_segno);
1077
1078 mutex_lock(&sbi->gc_mutex);
1079 write_checkpoint(sbi, &cpc);
1080 mutex_unlock(&sbi->gc_mutex);
1081 }
1076out: 1082out:
1077 range->len = cpc.trimmed << sbi->log_blocksize; 1083 range->len = F2FS_BLK_TO_BYTES(cpc.trimmed);
1078 return 0; 1084 return 0;
1079} 1085}
1080 1086
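
Note on the f2fs_trim_fs() rework above: instead of one checkpoint covering the
whole requested range (which could hold gc_mutex for a very long time on a large
device), the range is now split into batches of BATCHED_TRIM_SEGMENTS, aligned
down to a section boundary, with one checkpoint per batch. A minimal userspace
sketch of the range split; the geometry constants are illustrative, not taken
from the patch:

/* build: cc -std=c11 -Wall trim_batch.c */
#include <stdio.h>

#define SEGS_PER_SEC          4U  /* assumed geometry */
#define BATCHED_TRIM_SEGMENTS 8U  /* trim_sections * segs_per_sec in f2fs */

/* kernel's rounddown(): largest multiple of y that is <= x */
static unsigned int rounddown(unsigned int x, unsigned int y)
{
        return x - (x % y);
}

int main(void)
{
        unsigned int start = 3, end = 29;       /* segment range to trim */
        unsigned int trim_start, trim_end;

        /* assumes one batch covers at least a whole section */
        for (trim_start = start; trim_start <= end;
             trim_start = trim_end + 1) {
                trim_end = rounddown(trim_start + BATCHED_TRIM_SEGMENTS,
                                     SEGS_PER_SEC) - 1;
                if (trim_end > end)
                        trim_end = end;
                /* write_checkpoint() would run once per iteration here */
                printf("checkpoint trims segments %u..%u\n",
                       trim_start, trim_end);
        }
        return 0;
}
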
@@ -1151,11 +1157,18 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
1151{ 1157{
1152 struct sit_info *sit_i = SIT_I(sbi); 1158 struct sit_info *sit_i = SIT_I(sbi);
1153 struct curseg_info *curseg; 1159 struct curseg_info *curseg;
1160 bool direct_io = (type == CURSEG_DIRECT_IO);
1161
1162 type = direct_io ? CURSEG_WARM_DATA : type;
1154 1163
1155 curseg = CURSEG_I(sbi, type); 1164 curseg = CURSEG_I(sbi, type);
1156 1165
1157 mutex_lock(&curseg->curseg_mutex); 1166 mutex_lock(&curseg->curseg_mutex);
1158 1167
1168 /* direct_io'ed data is aligned to the segment for better performance */
1169 if (direct_io && curseg->next_blkoff)
1170 __allocate_new_segments(sbi, type);
1171
1159 *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); 1172 *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
1160 1173
1161 /* 1174 /*
@@ -1187,39 +1200,39 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
1187} 1200}
1188 1201
1189static void do_write_page(struct f2fs_sb_info *sbi, struct page *page, 1202static void do_write_page(struct f2fs_sb_info *sbi, struct page *page,
1190 block_t old_blkaddr, block_t *new_blkaddr, 1203 struct f2fs_summary *sum,
1191 struct f2fs_summary *sum, struct f2fs_io_info *fio) 1204 struct f2fs_io_info *fio)
1192{ 1205{
1193 int type = __get_segment_type(page, fio->type); 1206 int type = __get_segment_type(page, fio->type);
1194 1207
1195 allocate_data_block(sbi, page, old_blkaddr, new_blkaddr, sum, type); 1208 allocate_data_block(sbi, page, fio->blk_addr, &fio->blk_addr, sum, type);
1196 1209
1197 /* writeout dirty page into bdev */ 1210 /* writeout dirty page into bdev */
1198 f2fs_submit_page_mbio(sbi, page, *new_blkaddr, fio); 1211 f2fs_submit_page_mbio(sbi, page, fio);
1199} 1212}
1200 1213
1201void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) 1214void write_meta_page(struct f2fs_sb_info *sbi, struct page *page)
1202{ 1215{
1203 struct f2fs_io_info fio = { 1216 struct f2fs_io_info fio = {
1204 .type = META, 1217 .type = META,
1205 .rw = WRITE_SYNC | REQ_META | REQ_PRIO 1218 .rw = WRITE_SYNC | REQ_META | REQ_PRIO,
1219 .blk_addr = page->index,
1206 }; 1220 };
1207 1221
1208 set_page_writeback(page); 1222 set_page_writeback(page);
1209 f2fs_submit_page_mbio(sbi, page, page->index, &fio); 1223 f2fs_submit_page_mbio(sbi, page, &fio);
1210} 1224}
1211 1225
1212void write_node_page(struct f2fs_sb_info *sbi, struct page *page, 1226void write_node_page(struct f2fs_sb_info *sbi, struct page *page,
1213 struct f2fs_io_info *fio, 1227 unsigned int nid, struct f2fs_io_info *fio)
1214 unsigned int nid, block_t old_blkaddr, block_t *new_blkaddr)
1215{ 1228{
1216 struct f2fs_summary sum; 1229 struct f2fs_summary sum;
1217 set_summary(&sum, nid, 0, 0); 1230 set_summary(&sum, nid, 0, 0);
1218 do_write_page(sbi, page, old_blkaddr, new_blkaddr, &sum, fio); 1231 do_write_page(sbi, page, &sum, fio);
1219} 1232}
1220 1233
1221void write_data_page(struct page *page, struct dnode_of_data *dn, 1234void write_data_page(struct page *page, struct dnode_of_data *dn,
1222 block_t *new_blkaddr, struct f2fs_io_info *fio) 1235 struct f2fs_io_info *fio)
1223{ 1236{
1224 struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); 1237 struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
1225 struct f2fs_summary sum; 1238 struct f2fs_summary sum;
@@ -1228,14 +1241,14 @@ void write_data_page(struct page *page, struct dnode_of_data *dn,
1228 f2fs_bug_on(sbi, dn->data_blkaddr == NULL_ADDR); 1241 f2fs_bug_on(sbi, dn->data_blkaddr == NULL_ADDR);
1229 get_node_info(sbi, dn->nid, &ni); 1242 get_node_info(sbi, dn->nid, &ni);
1230 set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); 1243 set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
1231 1244 do_write_page(sbi, page, &sum, fio);
1232 do_write_page(sbi, page, dn->data_blkaddr, new_blkaddr, &sum, fio); 1245 dn->data_blkaddr = fio->blk_addr;
1233} 1246}
1234 1247
1235void rewrite_data_page(struct page *page, block_t old_blkaddr, 1248void rewrite_data_page(struct page *page, struct f2fs_io_info *fio)
1236 struct f2fs_io_info *fio)
1237{ 1249{
1238 f2fs_submit_page_mbio(F2FS_P_SB(page), page, old_blkaddr, fio); 1250 stat_inc_inplace_blocks(F2FS_P_SB(page));
1251 f2fs_submit_page_mbio(F2FS_P_SB(page), page, fio);
1239} 1252}
1240 1253
1241void recover_data_page(struct f2fs_sb_info *sbi, 1254void recover_data_page(struct f2fs_sb_info *sbi,
@@ -1393,7 +1406,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
1393 segno = le32_to_cpu(ckpt->cur_data_segno[type]); 1406 segno = le32_to_cpu(ckpt->cur_data_segno[type]);
1394 blk_off = le16_to_cpu(ckpt->cur_data_blkoff[type - 1407 blk_off = le16_to_cpu(ckpt->cur_data_blkoff[type -
1395 CURSEG_HOT_DATA]); 1408 CURSEG_HOT_DATA]);
1396 if (is_set_ckpt_flags(ckpt, CP_UMOUNT_FLAG)) 1409 if (__exist_node_summaries(sbi))
1397 blk_addr = sum_blk_addr(sbi, NR_CURSEG_TYPE, type); 1410 blk_addr = sum_blk_addr(sbi, NR_CURSEG_TYPE, type);
1398 else 1411 else
1399 blk_addr = sum_blk_addr(sbi, NR_CURSEG_DATA_TYPE, type); 1412 blk_addr = sum_blk_addr(sbi, NR_CURSEG_DATA_TYPE, type);
@@ -1402,7 +1415,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
1402 CURSEG_HOT_NODE]); 1415 CURSEG_HOT_NODE]);
1403 blk_off = le16_to_cpu(ckpt->cur_node_blkoff[type - 1416 blk_off = le16_to_cpu(ckpt->cur_node_blkoff[type -
1404 CURSEG_HOT_NODE]); 1417 CURSEG_HOT_NODE]);
1405 if (is_set_ckpt_flags(ckpt, CP_UMOUNT_FLAG)) 1418 if (__exist_node_summaries(sbi))
1406 blk_addr = sum_blk_addr(sbi, NR_CURSEG_NODE_TYPE, 1419 blk_addr = sum_blk_addr(sbi, NR_CURSEG_NODE_TYPE,
1407 type - CURSEG_HOT_NODE); 1420 type - CURSEG_HOT_NODE);
1408 else 1421 else
@@ -1413,7 +1426,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
1413 sum = (struct f2fs_summary_block *)page_address(new); 1426 sum = (struct f2fs_summary_block *)page_address(new);
1414 1427
1415 if (IS_NODESEG(type)) { 1428 if (IS_NODESEG(type)) {
1416 if (is_set_ckpt_flags(ckpt, CP_UMOUNT_FLAG)) { 1429 if (__exist_node_summaries(sbi)) {
1417 struct f2fs_summary *ns = &sum->entries[0]; 1430 struct f2fs_summary *ns = &sum->entries[0];
1418 int i; 1431 int i;
1419 for (i = 0; i < sbi->blocks_per_seg; i++, ns++) { 1432 for (i = 0; i < sbi->blocks_per_seg; i++, ns++) {
@@ -1450,12 +1463,22 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi)
1450 int err; 1463 int err;
1451 1464
1452 if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) { 1465 if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) {
1466 int npages = npages_for_summary_flush(sbi, true);
1467
1468 if (npages >= 2)
1469 ra_meta_pages(sbi, start_sum_block(sbi), npages,
1470 META_CP);
1471
1453 /* restore for compacted data summary */ 1472 /* restore for compacted data summary */
1454 if (read_compacted_summaries(sbi)) 1473 if (read_compacted_summaries(sbi))
1455 return -EINVAL; 1474 return -EINVAL;
1456 type = CURSEG_HOT_NODE; 1475 type = CURSEG_HOT_NODE;
1457 } 1476 }
1458 1477
1478 if (__exist_node_summaries(sbi))
1479 ra_meta_pages(sbi, sum_blk_addr(sbi, NR_CURSEG_TYPE, type),
1480 NR_CURSEG_TYPE - type, META_CP);
1481
1459 for (; type <= CURSEG_COLD_NODE; type++) { 1482 for (; type <= CURSEG_COLD_NODE; type++) {
1460 err = read_normal_summaries(sbi, type); 1483 err = read_normal_summaries(sbi, type);
1461 if (err) 1484 if (err)
@@ -1549,8 +1572,7 @@ void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk)
1549 1572
1550void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk) 1573void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk)
1551{ 1574{
1552 if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG)) 1575 write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE);
1553 write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE);
1554} 1576}
1555 1577
1556int lookup_journal_in_cursum(struct f2fs_summary_block *sum, int type, 1578int lookup_journal_in_cursum(struct f2fs_summary_block *sum, int type,
@@ -1754,7 +1776,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
1754 se = get_seg_entry(sbi, segno); 1776 se = get_seg_entry(sbi, segno);
1755 1777
1756 /* add discard candidates */ 1778 /* add discard candidates */
1757 if (SM_I(sbi)->nr_discards < SM_I(sbi)->max_discards) { 1779 if (cpc->reason != CP_DISCARD) {
1758 cpc->trim_start = segno; 1780 cpc->trim_start = segno;
1759 add_discard_addrs(sbi, cpc); 1781 add_discard_addrs(sbi, cpc);
1760 } 1782 }
@@ -1833,6 +1855,10 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
1833 return -ENOMEM; 1855 return -ENOMEM;
1834 } 1856 }
1835 1857
1858 sit_i->tmp_map = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
1859 if (!sit_i->tmp_map)
1860 return -ENOMEM;
1861
1836 if (sbi->segs_per_sec > 1) { 1862 if (sbi->segs_per_sec > 1) {
1837 sit_i->sec_entries = vzalloc(MAIN_SECS(sbi) * 1863 sit_i->sec_entries = vzalloc(MAIN_SECS(sbi) *
1838 sizeof(struct sec_entry)); 1864 sizeof(struct sec_entry));
@@ -1897,7 +1923,7 @@ static int build_free_segmap(struct f2fs_sb_info *sbi)
1897 free_i->start_segno = GET_SEGNO_FROM_SEG0(sbi, MAIN_BLKADDR(sbi)); 1923 free_i->start_segno = GET_SEGNO_FROM_SEG0(sbi, MAIN_BLKADDR(sbi));
1898 free_i->free_segments = 0; 1924 free_i->free_segments = 0;
1899 free_i->free_sections = 0; 1925 free_i->free_sections = 0;
1900 rwlock_init(&free_i->segmap_lock); 1926 spin_lock_init(&free_i->segmap_lock);
1901 return 0; 1927 return 0;
1902} 1928}
1903 1929
@@ -2110,6 +2136,8 @@ int build_segment_manager(struct f2fs_sb_info *sbi)
2110 sm_info->nr_discards = 0; 2136 sm_info->nr_discards = 0;
2111 sm_info->max_discards = 0; 2137 sm_info->max_discards = 0;
2112 2138
2139 sm_info->trim_sections = DEF_BATCHED_TRIM_SECTIONS;
2140
2113 INIT_LIST_HEAD(&sm_info->sit_entry_set); 2141 INIT_LIST_HEAD(&sm_info->sit_entry_set);
2114 2142
2115 if (test_opt(sbi, FLUSH_MERGE) && !f2fs_readonly(sbi->sb)) { 2143 if (test_opt(sbi, FLUSH_MERGE) && !f2fs_readonly(sbi->sb)) {
@@ -2212,6 +2240,8 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi)
2212 kfree(sit_i->sentries[start].ckpt_valid_map); 2240 kfree(sit_i->sentries[start].ckpt_valid_map);
2213 } 2241 }
2214 } 2242 }
2243 kfree(sit_i->tmp_map);
2244
2215 vfree(sit_i->sentries); 2245 vfree(sit_i->sentries);
2216 vfree(sit_i->sec_entries); 2246 vfree(sit_i->sec_entries);
2217 kfree(sit_i->dirty_sentries_bitmap); 2247 kfree(sit_i->dirty_sentries_bitmap);
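
Note on the two hunks above that drop `unsigned long dmap[entries]` and
`unsigned long target_map[entries]`: kernel stacks are small and a
variable-length array makes the frame size depend on runtime data, so both are
replaced by a single tmp_map buffer allocated once in build_sit_info() and
freed in destroy_sit_info(). A sketch of the pattern, assuming (as in f2fs,
where these paths are serialized) that only one user touches the scratch
buffer at a time; the 64-byte size mirrors SIT_VBLOCK_MAP_SIZE:

#include <stdio.h>
#include <stdlib.h>

#define SIT_VBLOCK_MAP_SIZE 64
#define ENTRIES (SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long))

struct sit_info {
        unsigned long *tmp_map;         /* scratch bitmap, allocated once */
};

static int sit_info_init(struct sit_info *sit)
{
        sit->tmp_map = calloc(1, SIT_VBLOCK_MAP_SIZE);
        return sit->tmp_map ? 0 : -1;
}

static void build_discard_map(struct sit_info *sit,
                              const unsigned long *cur_map,
                              const unsigned long *ckpt_map)
{
        /* was: unsigned long dmap[ENTRIES]; -- a VLA in the original */
        unsigned long *dmap = sit->tmp_map;
        size_t i;

        for (i = 0; i < ENTRIES; i++)
                dmap[i] = ~(cur_map[i] | ckpt_map[i]);
}

int main(void)
{
        struct sit_info sit;
        unsigned long cur[ENTRIES] = {0}, ckpt[ENTRIES] = {0};

        if (sit_info_init(&sit))
                return 1;
        build_discard_map(&sit, cur, ckpt);
        printf("dmap[0] = %#lx\n", sit.tmp_map[0]);
        free(sit.tmp_map);
        return 0;
}
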
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 7f327c0ba4e3..7fd35111cf62 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -189,6 +189,7 @@ struct sit_info {
189 char *sit_bitmap; /* SIT bitmap pointer */ 189 char *sit_bitmap; /* SIT bitmap pointer */
190 unsigned int bitmap_size; /* SIT bitmap size */ 190 unsigned int bitmap_size; /* SIT bitmap size */
191 191
 192 unsigned long *tmp_map; /* bitmap for temporary use */
192 unsigned long *dirty_sentries_bitmap; /* bitmap for dirty sentries */ 193 unsigned long *dirty_sentries_bitmap; /* bitmap for dirty sentries */
193 unsigned int dirty_sentries; /* # of dirty sentries */ 194 unsigned int dirty_sentries; /* # of dirty sentries */
194 unsigned int sents_per_block; /* # of SIT entries per block */ 195 unsigned int sents_per_block; /* # of SIT entries per block */
@@ -207,7 +208,7 @@ struct free_segmap_info {
207 unsigned int start_segno; /* start segment number logically */ 208 unsigned int start_segno; /* start segment number logically */
208 unsigned int free_segments; /* # of free segments */ 209 unsigned int free_segments; /* # of free segments */
209 unsigned int free_sections; /* # of free sections */ 210 unsigned int free_sections; /* # of free sections */
210 rwlock_t segmap_lock; /* free segmap lock */ 211 spinlock_t segmap_lock; /* free segmap lock */
211 unsigned long *free_segmap; /* free segment bitmap */ 212 unsigned long *free_segmap; /* free segment bitmap */
212 unsigned long *free_secmap; /* free section bitmap */ 213 unsigned long *free_secmap; /* free section bitmap */
213}; 214};
@@ -318,9 +319,9 @@ static inline unsigned int find_next_inuse(struct free_segmap_info *free_i,
318 unsigned int max, unsigned int segno) 319 unsigned int max, unsigned int segno)
319{ 320{
320 unsigned int ret; 321 unsigned int ret;
321 read_lock(&free_i->segmap_lock); 322 spin_lock(&free_i->segmap_lock);
322 ret = find_next_bit(free_i->free_segmap, max, segno); 323 ret = find_next_bit(free_i->free_segmap, max, segno);
323 read_unlock(&free_i->segmap_lock); 324 spin_unlock(&free_i->segmap_lock);
324 return ret; 325 return ret;
325} 326}
326 327
@@ -331,7 +332,7 @@ static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno)
331 unsigned int start_segno = secno * sbi->segs_per_sec; 332 unsigned int start_segno = secno * sbi->segs_per_sec;
332 unsigned int next; 333 unsigned int next;
333 334
334 write_lock(&free_i->segmap_lock); 335 spin_lock(&free_i->segmap_lock);
335 clear_bit(segno, free_i->free_segmap); 336 clear_bit(segno, free_i->free_segmap);
336 free_i->free_segments++; 337 free_i->free_segments++;
337 338
@@ -340,7 +341,7 @@ static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno)
340 clear_bit(secno, free_i->free_secmap); 341 clear_bit(secno, free_i->free_secmap);
341 free_i->free_sections++; 342 free_i->free_sections++;
342 } 343 }
343 write_unlock(&free_i->segmap_lock); 344 spin_unlock(&free_i->segmap_lock);
344} 345}
345 346
346static inline void __set_inuse(struct f2fs_sb_info *sbi, 347static inline void __set_inuse(struct f2fs_sb_info *sbi,
@@ -362,7 +363,7 @@ static inline void __set_test_and_free(struct f2fs_sb_info *sbi,
362 unsigned int start_segno = secno * sbi->segs_per_sec; 363 unsigned int start_segno = secno * sbi->segs_per_sec;
363 unsigned int next; 364 unsigned int next;
364 365
365 write_lock(&free_i->segmap_lock); 366 spin_lock(&free_i->segmap_lock);
366 if (test_and_clear_bit(segno, free_i->free_segmap)) { 367 if (test_and_clear_bit(segno, free_i->free_segmap)) {
367 free_i->free_segments++; 368 free_i->free_segments++;
368 369
@@ -373,7 +374,7 @@ static inline void __set_test_and_free(struct f2fs_sb_info *sbi,
373 free_i->free_sections++; 374 free_i->free_sections++;
374 } 375 }
375 } 376 }
376 write_unlock(&free_i->segmap_lock); 377 spin_unlock(&free_i->segmap_lock);
377} 378}
378 379
379static inline void __set_test_and_inuse(struct f2fs_sb_info *sbi, 380static inline void __set_test_and_inuse(struct f2fs_sb_info *sbi,
@@ -381,13 +382,13 @@ static inline void __set_test_and_inuse(struct f2fs_sb_info *sbi,
381{ 382{
382 struct free_segmap_info *free_i = FREE_I(sbi); 383 struct free_segmap_info *free_i = FREE_I(sbi);
383 unsigned int secno = segno / sbi->segs_per_sec; 384 unsigned int secno = segno / sbi->segs_per_sec;
384 write_lock(&free_i->segmap_lock); 385 spin_lock(&free_i->segmap_lock);
385 if (!test_and_set_bit(segno, free_i->free_segmap)) { 386 if (!test_and_set_bit(segno, free_i->free_segmap)) {
386 free_i->free_segments--; 387 free_i->free_segments--;
387 if (!test_and_set_bit(secno, free_i->free_secmap)) 388 if (!test_and_set_bit(secno, free_i->free_secmap))
388 free_i->free_sections--; 389 free_i->free_sections--;
389 } 390 }
390 write_unlock(&free_i->segmap_lock); 391 spin_unlock(&free_i->segmap_lock);
391} 392}
392 393
393static inline void get_sit_bitmap(struct f2fs_sb_info *sbi, 394static inline void get_sit_bitmap(struct f2fs_sb_info *sbi,
@@ -460,7 +461,7 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed)
460 int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); 461 int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES);
461 int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); 462 int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS);
462 463
463 if (unlikely(sbi->por_doing)) 464 if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
464 return false; 465 return false;
465 466
466 return (free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs + 467 return (free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs +
@@ -599,13 +600,13 @@ static inline void check_block_count(struct f2fs_sb_info *sbi,
599static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno) 600static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno)
600{ 601{
601 if (segno > TOTAL_SEGS(sbi) - 1) 602 if (segno > TOTAL_SEGS(sbi) - 1)
602 sbi->need_fsck = true; 603 set_sbi_flag(sbi, SBI_NEED_FSCK);
603} 604}
604 605
605static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr) 606static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr)
606{ 607{
607 if (blk_addr < SEG0_BLKADDR(sbi) || blk_addr >= MAX_BLKADDR(sbi)) 608 if (blk_addr < SEG0_BLKADDR(sbi) || blk_addr >= MAX_BLKADDR(sbi))
608 sbi->need_fsck = true; 609 set_sbi_flag(sbi, SBI_NEED_FSCK);
609} 610}
610 611
611/* 612/*
@@ -616,11 +617,11 @@ static inline void check_block_count(struct f2fs_sb_info *sbi,
616{ 617{
617 /* check segment usage */ 618 /* check segment usage */
618 if (GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg) 619 if (GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg)
619 sbi->need_fsck = true; 620 set_sbi_flag(sbi, SBI_NEED_FSCK);
620 621
621 /* check boundary of a given segment number */ 622 /* check boundary of a given segment number */
622 if (segno > TOTAL_SEGS(sbi) - 1) 623 if (segno > TOTAL_SEGS(sbi) - 1)
623 sbi->need_fsck = true; 624 set_sbi_flag(sbi, SBI_NEED_FSCK);
624} 625}
625#endif 626#endif
626 627
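
Note on the segmap_lock conversion running through this header: every user
except find_next_inuse() took the lock for writing, and each critical section
is only a few bitmap operations, so the reader/writer machinery of an rwlock_t
bought nothing over a plain spinlock_t. A userspace analogue of the resulting
locking, with a pthread spinlock standing in for the kernel primitive:

/* build: cc -std=gnu11 -Wall -pthread segmap.c */
#include <pthread.h>
#include <stdio.h>

#define NSEGS 128

static pthread_spinlock_t segmap_lock;
static unsigned char free_segmap[NSEGS];        /* 1 = segment in use */
static unsigned int free_segments = NSEGS;

static void set_inuse(unsigned int segno)
{
        pthread_spin_lock(&segmap_lock);
        if (!free_segmap[segno]) {              /* test_and_set_bit() */
                free_segmap[segno] = 1;
                free_segments--;
        }
        pthread_spin_unlock(&segmap_lock);
}

int main(void)
{
        pthread_spin_init(&segmap_lock, PTHREAD_PROCESS_PRIVATE);
        set_inuse(7);
        printf("free segments: %u\n", free_segments);
        pthread_spin_destroy(&segmap_lock);
        return 0;
}
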
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index f71421d70475..f2fe666a6ea9 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -30,6 +30,7 @@
30#include "segment.h" 30#include "segment.h"
31#include "xattr.h" 31#include "xattr.h"
32#include "gc.h" 32#include "gc.h"
33#include "trace.h"
33 34
34#define CREATE_TRACE_POINTS 35#define CREATE_TRACE_POINTS
35#include <trace/events/f2fs.h> 36#include <trace/events/f2fs.h>
@@ -41,6 +42,7 @@ static struct kset *f2fs_kset;
41enum { 42enum {
42 Opt_gc_background, 43 Opt_gc_background,
43 Opt_disable_roll_forward, 44 Opt_disable_roll_forward,
45 Opt_norecovery,
44 Opt_discard, 46 Opt_discard,
45 Opt_noheap, 47 Opt_noheap,
46 Opt_user_xattr, 48 Opt_user_xattr,
@@ -61,6 +63,7 @@ enum {
61static match_table_t f2fs_tokens = { 63static match_table_t f2fs_tokens = {
62 {Opt_gc_background, "background_gc=%s"}, 64 {Opt_gc_background, "background_gc=%s"},
63 {Opt_disable_roll_forward, "disable_roll_forward"}, 65 {Opt_disable_roll_forward, "disable_roll_forward"},
66 {Opt_norecovery, "norecovery"},
64 {Opt_discard, "discard"}, 67 {Opt_discard, "discard"},
65 {Opt_noheap, "no_heap"}, 68 {Opt_noheap, "no_heap"},
66 {Opt_user_xattr, "user_xattr"}, 69 {Opt_user_xattr, "user_xattr"},
@@ -192,6 +195,7 @@ F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time);
192F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle); 195F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle);
193F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); 196F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments);
194F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards); 197F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards);
198F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections);
195F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); 199F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy);
196F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); 200F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util);
197F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); 201F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks);
@@ -207,6 +211,7 @@ static struct attribute *f2fs_attrs[] = {
207 ATTR_LIST(gc_idle), 211 ATTR_LIST(gc_idle),
208 ATTR_LIST(reclaim_segments), 212 ATTR_LIST(reclaim_segments),
209 ATTR_LIST(max_small_discards), 213 ATTR_LIST(max_small_discards),
214 ATTR_LIST(batched_trim_sections),
210 ATTR_LIST(ipu_policy), 215 ATTR_LIST(ipu_policy),
211 ATTR_LIST(min_ipu_util), 216 ATTR_LIST(min_ipu_util),
212 ATTR_LIST(min_fsync_blocks), 217 ATTR_LIST(min_fsync_blocks),
@@ -286,6 +291,12 @@ static int parse_options(struct super_block *sb, char *options)
286 case Opt_disable_roll_forward: 291 case Opt_disable_roll_forward:
287 set_opt(sbi, DISABLE_ROLL_FORWARD); 292 set_opt(sbi, DISABLE_ROLL_FORWARD);
288 break; 293 break;
294 case Opt_norecovery:
 295 /* this option is only allowed on a read-only mount */
296 set_opt(sbi, DISABLE_ROLL_FORWARD);
297 if (!f2fs_readonly(sb))
298 return -EINVAL;
299 break;
289 case Opt_discard: 300 case Opt_discard:
290 set_opt(sbi, DISCARD); 301 set_opt(sbi, DISCARD);
291 break; 302 break;
@@ -446,8 +457,13 @@ static void f2fs_put_super(struct super_block *sb)
446 f2fs_destroy_stats(sbi); 457 f2fs_destroy_stats(sbi);
447 stop_gc_thread(sbi); 458 stop_gc_thread(sbi);
448 459
449 /* We don't need to do checkpoint when it's clean */ 460 /*
450 if (sbi->s_dirty) { 461 * We don't need to do checkpoint when superblock is clean.
 462 * But if the previous checkpoint was not done by umount, we need to do
 463 * a clean checkpoint again.
464 */
465 if (is_sbi_flag_set(sbi, SBI_IS_DIRTY) ||
466 !is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG)) {
451 struct cp_control cpc = { 467 struct cp_control cpc = {
452 .reason = CP_UMOUNT, 468 .reason = CP_UMOUNT,
453 }; 469 };
@@ -486,13 +502,15 @@ int f2fs_sync_fs(struct super_block *sb, int sync)
486 if (sync) { 502 if (sync) {
487 struct cp_control cpc; 503 struct cp_control cpc;
488 504
489 cpc.reason = test_opt(sbi, FASTBOOT) ? CP_UMOUNT : CP_SYNC; 505 cpc.reason = __get_cp_reason(sbi);
506
490 mutex_lock(&sbi->gc_mutex); 507 mutex_lock(&sbi->gc_mutex);
491 write_checkpoint(sbi, &cpc); 508 write_checkpoint(sbi, &cpc);
492 mutex_unlock(&sbi->gc_mutex); 509 mutex_unlock(&sbi->gc_mutex);
493 } else { 510 } else {
494 f2fs_balance_fs(sbi); 511 f2fs_balance_fs(sbi);
495 } 512 }
513 f2fs_trace_ios(NULL, NULL, 1);
496 514
497 return 0; 515 return 0;
498} 516}
@@ -887,7 +905,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
887 atomic_set(&sbi->nr_pages[i], 0); 905 atomic_set(&sbi->nr_pages[i], 0);
888 906
889 sbi->dir_level = DEF_DIR_LEVEL; 907 sbi->dir_level = DEF_DIR_LEVEL;
890 sbi->need_fsck = false; 908 clear_sbi_flag(sbi, SBI_NEED_FSCK);
891} 909}
892 910
893/* 911/*
@@ -942,6 +960,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
942 struct inode *root; 960 struct inode *root;
943 long err = -EINVAL; 961 long err = -EINVAL;
944 bool retry = true; 962 bool retry = true;
963 char *options = NULL;
945 int i; 964 int i;
946 965
947try_onemore: 966try_onemore:
@@ -973,9 +992,15 @@ try_onemore:
973 set_opt(sbi, POSIX_ACL); 992 set_opt(sbi, POSIX_ACL);
974#endif 993#endif
975 /* parse mount options */ 994 /* parse mount options */
976 err = parse_options(sb, (char *)data); 995 options = kstrdup((const char *)data, GFP_KERNEL);
977 if (err) 996 if (data && !options) {
997 err = -ENOMEM;
978 goto free_sb_buf; 998 goto free_sb_buf;
999 }
1000
1001 err = parse_options(sb, options);
1002 if (err)
1003 goto free_options;
979 1004
980 sb->s_maxbytes = max_file_size(le32_to_cpu(raw_super->log_blocksize)); 1005 sb->s_maxbytes = max_file_size(le32_to_cpu(raw_super->log_blocksize));
981 sb->s_max_links = F2FS_LINK_MAX; 1006 sb->s_max_links = F2FS_LINK_MAX;
@@ -998,7 +1023,7 @@ try_onemore:
998 mutex_init(&sbi->writepages); 1023 mutex_init(&sbi->writepages);
999 mutex_init(&sbi->cp_mutex); 1024 mutex_init(&sbi->cp_mutex);
1000 init_rwsem(&sbi->node_write); 1025 init_rwsem(&sbi->node_write);
1001 sbi->por_doing = false; 1026 clear_sbi_flag(sbi, SBI_POR_DOING);
1002 spin_lock_init(&sbi->stat_lock); 1027 spin_lock_init(&sbi->stat_lock);
1003 1028
1004 init_rwsem(&sbi->read_io.io_rwsem); 1029 init_rwsem(&sbi->read_io.io_rwsem);
@@ -1019,7 +1044,7 @@ try_onemore:
1019 if (IS_ERR(sbi->meta_inode)) { 1044 if (IS_ERR(sbi->meta_inode)) {
1020 f2fs_msg(sb, KERN_ERR, "Failed to read F2FS meta data inode"); 1045 f2fs_msg(sb, KERN_ERR, "Failed to read F2FS meta data inode");
1021 err = PTR_ERR(sbi->meta_inode); 1046 err = PTR_ERR(sbi->meta_inode);
1022 goto free_sb_buf; 1047 goto free_options;
1023 } 1048 }
1024 1049
1025 err = get_valid_checkpoint(sbi); 1050 err = get_valid_checkpoint(sbi);
@@ -1122,10 +1147,19 @@ try_onemore:
1122 goto free_proc; 1147 goto free_proc;
1123 1148
1124 if (!retry) 1149 if (!retry)
1125 sbi->need_fsck = true; 1150 set_sbi_flag(sbi, SBI_NEED_FSCK);
1126 1151
1127 /* recover fsynced data */ 1152 /* recover fsynced data */
1128 if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { 1153 if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) {
1154 /*
 1155 * the mount should fail when the device is read-only and the
 1156 * previous checkpoint was not done by a clean system shutdown.
1157 */
1158 if (bdev_read_only(sb->s_bdev) &&
1159 !is_set_ckpt_flags(sbi->ckpt, CP_UMOUNT_FLAG)) {
1160 err = -EROFS;
1161 goto free_kobj;
1162 }
1129 err = recover_fsync_data(sbi); 1163 err = recover_fsync_data(sbi);
1130 if (err) { 1164 if (err) {
1131 f2fs_msg(sb, KERN_ERR, 1165 f2fs_msg(sb, KERN_ERR,
@@ -1144,6 +1178,7 @@ try_onemore:
1144 if (err) 1178 if (err)
1145 goto free_kobj; 1179 goto free_kobj;
1146 } 1180 }
1181 kfree(options);
1147 return 0; 1182 return 0;
1148 1183
1149free_kobj: 1184free_kobj:
@@ -1168,6 +1203,8 @@ free_cp:
1168free_meta_inode: 1203free_meta_inode:
1169 make_bad_inode(sbi->meta_inode); 1204 make_bad_inode(sbi->meta_inode);
1170 iput(sbi->meta_inode); 1205 iput(sbi->meta_inode);
1206free_options:
1207 kfree(options);
1171free_sb_buf: 1208free_sb_buf:
1172 brelse(raw_super_buf); 1209 brelse(raw_super_buf);
1173free_sbi: 1210free_sbi:
@@ -1188,11 +1225,18 @@ static struct dentry *f2fs_mount(struct file_system_type *fs_type, int flags,
1188 return mount_bdev(fs_type, flags, dev_name, data, f2fs_fill_super); 1225 return mount_bdev(fs_type, flags, dev_name, data, f2fs_fill_super);
1189} 1226}
1190 1227
1228static void kill_f2fs_super(struct super_block *sb)
1229{
1230 if (sb->s_root)
1231 set_sbi_flag(F2FS_SB(sb), SBI_IS_CLOSE);
1232 kill_block_super(sb);
1233}
1234
1191static struct file_system_type f2fs_fs_type = { 1235static struct file_system_type f2fs_fs_type = {
1192 .owner = THIS_MODULE, 1236 .owner = THIS_MODULE,
1193 .name = "f2fs", 1237 .name = "f2fs",
1194 .mount = f2fs_mount, 1238 .mount = f2fs_mount,
1195 .kill_sb = kill_block_super, 1239 .kill_sb = kill_f2fs_super,
1196 .fs_flags = FS_REQUIRES_DEV, 1240 .fs_flags = FS_REQUIRES_DEV,
1197}; 1241};
1198MODULE_ALIAS_FS("f2fs"); 1242MODULE_ALIAS_FS("f2fs");
@@ -1220,6 +1264,8 @@ static int __init init_f2fs_fs(void)
1220{ 1264{
1221 int err; 1265 int err;
1222 1266
1267 f2fs_build_trace_ios();
1268
1223 err = init_inodecache(); 1269 err = init_inodecache();
1224 if (err) 1270 if (err)
1225 goto fail; 1271 goto fail;
@@ -1229,12 +1275,9 @@ static int __init init_f2fs_fs(void)
1229 err = create_segment_manager_caches(); 1275 err = create_segment_manager_caches();
1230 if (err) 1276 if (err)
1231 goto free_node_manager_caches; 1277 goto free_node_manager_caches;
1232 err = create_gc_caches();
1233 if (err)
1234 goto free_segment_manager_caches;
1235 err = create_checkpoint_caches(); 1278 err = create_checkpoint_caches();
1236 if (err) 1279 if (err)
1237 goto free_gc_caches; 1280 goto free_segment_manager_caches;
1238 f2fs_kset = kset_create_and_add("f2fs", NULL, fs_kobj); 1281 f2fs_kset = kset_create_and_add("f2fs", NULL, fs_kobj);
1239 if (!f2fs_kset) { 1282 if (!f2fs_kset) {
1240 err = -ENOMEM; 1283 err = -ENOMEM;
@@ -1251,8 +1294,6 @@ free_kset:
1251 kset_unregister(f2fs_kset); 1294 kset_unregister(f2fs_kset);
1252free_checkpoint_caches: 1295free_checkpoint_caches:
1253 destroy_checkpoint_caches(); 1296 destroy_checkpoint_caches();
1254free_gc_caches:
1255 destroy_gc_caches();
1256free_segment_manager_caches: 1297free_segment_manager_caches:
1257 destroy_segment_manager_caches(); 1298 destroy_segment_manager_caches();
1258free_node_manager_caches: 1299free_node_manager_caches:
@@ -1269,11 +1310,11 @@ static void __exit exit_f2fs_fs(void)
1269 f2fs_destroy_root_stats(); 1310 f2fs_destroy_root_stats();
1270 unregister_filesystem(&f2fs_fs_type); 1311 unregister_filesystem(&f2fs_fs_type);
1271 destroy_checkpoint_caches(); 1312 destroy_checkpoint_caches();
1272 destroy_gc_caches();
1273 destroy_segment_manager_caches(); 1313 destroy_segment_manager_caches();
1274 destroy_node_manager_caches(); 1314 destroy_node_manager_caches();
1275 destroy_inodecache(); 1315 destroy_inodecache();
1276 kset_unregister(f2fs_kset); 1316 kset_unregister(f2fs_kset);
1317 f2fs_destroy_trace_ios();
1277} 1318}
1278 1319
1279module_init(init_f2fs_fs) 1320module_init(init_f2fs_fs)
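
Note on the kstrdup() dance above: parse_options() tokenizes its argument in
place (match_token()/strsep() mutate the buffer), and f2fs_fill_super() may
retry the whole fill via the try_onemore: label, so the VFS-supplied data
string has to survive intact; the private copy is freed on success and on
every error path through the new free_options: label. The same idea in a
userspace sketch:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int parse_options(char *options)
{
        char *p;

        /* stands in for the match_token() loop; destroys `options` */
        while ((p = strsep(&options, ",")) != NULL) {
                if (!*p)
                        continue;
                printf("option: %s\n", p);
        }
        return 0;
}

int main(void)
{
        const char *data = "background_gc=on,discard,inline_data";
        char *options = strdup(data);   /* kstrdup(data, GFP_KERNEL) */
        int err;

        if (!options)
                return 1;
        err = parse_options(options);
        free(options);
        /* `data` is still intact if the mount has to be retried */
        return err;
}
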
diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c
new file mode 100644
index 000000000000..875aa8179bc1
--- /dev/null
+++ b/fs/f2fs/trace.c
@@ -0,0 +1,159 @@
1/*
2 * f2fs IO tracer
3 *
4 * Copyright (c) 2014 Motorola Mobility
5 * Copyright (c) 2014 Jaegeuk Kim <jaegeuk@kernel.org>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11#include <linux/fs.h>
12#include <linux/f2fs_fs.h>
13#include <linux/sched.h>
14#include <linux/radix-tree.h>
15
16#include "f2fs.h"
17#include "trace.h"
18
19static RADIX_TREE(pids, GFP_ATOMIC);
20static spinlock_t pids_lock;
21static struct last_io_info last_io;
22
23static inline void __print_last_io(void)
24{
25 if (!last_io.len)
26 return;
27
28 trace_printk("%3x:%3x %4x %-16s %2x %5x %12x %4x\n",
29 last_io.major, last_io.minor,
30 last_io.pid, "----------------",
31 last_io.type,
32 last_io.fio.rw, last_io.fio.blk_addr,
33 last_io.len);
34 memset(&last_io, 0, sizeof(last_io));
35}
36
37static int __file_type(struct inode *inode, pid_t pid)
38{
39 if (f2fs_is_atomic_file(inode))
40 return __ATOMIC_FILE;
41 else if (f2fs_is_volatile_file(inode))
42 return __VOLATILE_FILE;
43 else if (S_ISDIR(inode->i_mode))
44 return __DIR_FILE;
45 else if (inode->i_ino == F2FS_NODE_INO(F2FS_I_SB(inode)))
46 return __NODE_FILE;
47 else if (inode->i_ino == F2FS_META_INO(F2FS_I_SB(inode)))
48 return __META_FILE;
49 else if (pid)
50 return __NORMAL_FILE;
51 else
52 return __MISC_FILE;
53}
54
55void f2fs_trace_pid(struct page *page)
56{
57 struct inode *inode = page->mapping->host;
58 pid_t pid = task_pid_nr(current);
59 void *p;
60
61 page->private = pid;
62
63 if (radix_tree_preload(GFP_NOFS))
64 return;
65
66 spin_lock(&pids_lock);
67 p = radix_tree_lookup(&pids, pid);
68 if (p == current)
69 goto out;
70 if (p)
71 radix_tree_delete(&pids, pid);
72
73 f2fs_radix_tree_insert(&pids, pid, current);
74
75 trace_printk("%3x:%3x %4x %-16s\n",
76 MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev),
77 pid, current->comm);
78out:
79 spin_unlock(&pids_lock);
80 radix_tree_preload_end();
81}
82
83void f2fs_trace_ios(struct page *page, struct f2fs_io_info *fio, int flush)
84{
85 struct inode *inode;
86 pid_t pid;
87 int major, minor;
88
89 if (flush) {
90 __print_last_io();
91 return;
92 }
93
94 inode = page->mapping->host;
95 pid = page_private(page);
96
97 major = MAJOR(inode->i_sb->s_dev);
98 minor = MINOR(inode->i_sb->s_dev);
99
100 if (last_io.major == major && last_io.minor == minor &&
101 last_io.pid == pid &&
102 last_io.type == __file_type(inode, pid) &&
103 last_io.fio.rw == fio->rw &&
104 last_io.fio.blk_addr + last_io.len == fio->blk_addr) {
105 last_io.len++;
106 return;
107 }
108
109 __print_last_io();
110
111 last_io.major = major;
112 last_io.minor = minor;
113 last_io.pid = pid;
114 last_io.type = __file_type(inode, pid);
115 last_io.fio = *fio;
116 last_io.len = 1;
117 return;
118}
119
120void f2fs_build_trace_ios(void)
121{
122 spin_lock_init(&pids_lock);
123}
124
125#define PIDVEC_SIZE 128
126static unsigned int gang_lookup_pids(pid_t *results, unsigned long first_index,
127 unsigned int max_items)
128{
129 struct radix_tree_iter iter;
130 void **slot;
131 unsigned int ret = 0;
132
133 if (unlikely(!max_items))
134 return 0;
135
136 radix_tree_for_each_slot(slot, &pids, &iter, first_index) {
137 results[ret] = iter.index;
138 if (++ret == PIDVEC_SIZE)
139 break;
140 }
141 return ret;
142}
143
144void f2fs_destroy_trace_ios(void)
145{
146 pid_t pid[PIDVEC_SIZE];
147 pid_t next_pid = 0;
148 unsigned int found;
149
150 spin_lock(&pids_lock);
151 while ((found = gang_lookup_pids(pid, next_pid, PIDVEC_SIZE))) {
152 unsigned idx;
153
154 next_pid = pid[found - 1] + 1;
155 for (idx = 0; idx < found; idx++)
156 radix_tree_delete(&pids, pid[idx]);
157 }
158 spin_unlock(&pids_lock);
159}
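
Note on f2fs_trace_pid() above: the pids radix tree maps pid -> task_struct so
the "pid -> comm" header is printed only when a pid is first seen or has been
recycled by a different task, and f2fs_destroy_trace_ios() later empties the
tree with gang lookups. A much-simplified sketch of the dedup idea; a
direct-mapped table stands in for the radix tree, and a slot collision simply
causes a reprint:

#include <stdio.h>

#define PID_SLOTS 1024

struct task {
        int pid;
        char comm[16];
};

static const struct task *pids[PID_SLOTS];

static void trace_pid(const struct task *t)
{
        const struct task **slot = &pids[t->pid % PID_SLOTS];

        if (*slot == t)
                return;                 /* this task was already announced */
        *slot = t;                      /* remember it, then print once */
        printf("%4x %-16s\n", t->pid, t->comm);
}

int main(void)
{
        struct task a = { 42, "fsstress" };

        trace_pid(&a);
        trace_pid(&a);                  /* deduplicated: prints only once */
        return 0;
}
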
diff --git a/fs/f2fs/trace.h b/fs/f2fs/trace.h
new file mode 100644
index 000000000000..1041dbeb52ae
--- /dev/null
+++ b/fs/f2fs/trace.h
@@ -0,0 +1,46 @@
1/*
2 * f2fs IO tracer
3 *
4 * Copyright (c) 2014 Motorola Mobility
5 * Copyright (c) 2014 Jaegeuk Kim <jaegeuk@kernel.org>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11#ifndef __F2FS_TRACE_H__
12#define __F2FS_TRACE_H__
13
14#ifdef CONFIG_F2FS_IO_TRACE
15#include <trace/events/f2fs.h>
16
17enum file_type {
18 __NORMAL_FILE,
19 __DIR_FILE,
20 __NODE_FILE,
21 __META_FILE,
22 __ATOMIC_FILE,
23 __VOLATILE_FILE,
24 __MISC_FILE,
25};
26
27struct last_io_info {
28 int major, minor;
29 pid_t pid;
30 enum file_type type;
31 struct f2fs_io_info fio;
32 block_t len;
33};
34
35extern void f2fs_trace_pid(struct page *);
36extern void f2fs_trace_ios(struct page *, struct f2fs_io_info *, int);
37extern void f2fs_build_trace_ios(void);
38extern void f2fs_destroy_trace_ios(void);
39#else
40#define f2fs_trace_pid(p)
41#define f2fs_trace_ios(p, i, n)
42#define f2fs_build_trace_ios()
43#define f2fs_destroy_trace_ios()
44
45#endif
46#endif /* __F2FS_TRACE_H__ */
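
Note on the #else branch above: with CONFIG_F2FS_IO_TRACE off, the four hooks
become empty macros and every call site compiles away without per-caller
#ifdefs. One caveat about the general pattern (not something this header
happens to trip over): writing the stub as do { } while (0) keeps both
configurations demanding the same trailing semicolon, so a call that compiles
with tracing off cannot break the build with tracing on. A tiny illustration:

#include <stdio.h>

/* #define TRACE_ENABLED */

#ifdef TRACE_ENABLED
#define trace_event(msg) printf("trace: %s\n", msg)
#else
#define trace_event(msg) do { } while (0)       /* still needs a ';' */
#endif

int main(void)
{
        if (1)
                trace_event("hello");   /* a single statement either way */
        else
                printf("never\n");
        return 0;
}
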
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 7b41a2dcdd76..497c7c5263c7 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -580,7 +580,7 @@ static void fat_set_state(struct super_block *sb,
580{ 580{
581 struct buffer_head *bh; 581 struct buffer_head *bh;
582 struct fat_boot_sector *b; 582 struct fat_boot_sector *b;
583 struct msdos_sb_info *sbi = sb->s_fs_info; 583 struct msdos_sb_info *sbi = MSDOS_SB(sb);
584 584
585 /* do not change any thing if mounted read only */ 585 /* do not change any thing if mounted read only */
586 if ((sb->s_flags & MS_RDONLY) && !force) 586 if ((sb->s_flags & MS_RDONLY) && !force)
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 99d440a4a6ba..ee85cd4e136a 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -740,14 +740,15 @@ static int __init fcntl_init(void)
740 * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY 740 * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
741 * is defined as O_NONBLOCK on some platforms and not on others. 741 * is defined as O_NONBLOCK on some platforms and not on others.
742 */ 742 */
743 BUILD_BUG_ON(20 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( 743 BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
744 O_RDONLY | O_WRONLY | O_RDWR | 744 O_RDONLY | O_WRONLY | O_RDWR |
745 O_CREAT | O_EXCL | O_NOCTTY | 745 O_CREAT | O_EXCL | O_NOCTTY |
746 O_TRUNC | O_APPEND | /* O_NONBLOCK | */ 746 O_TRUNC | O_APPEND | /* O_NONBLOCK | */
747 __O_SYNC | O_DSYNC | FASYNC | 747 __O_SYNC | O_DSYNC | FASYNC |
748 O_DIRECT | O_LARGEFILE | O_DIRECTORY | 748 O_DIRECT | O_LARGEFILE | O_DIRECTORY |
749 O_NOFOLLOW | O_NOATIME | O_CLOEXEC | 749 O_NOFOLLOW | O_NOATIME | O_CLOEXEC |
750 __FMODE_EXEC | O_PATH | __O_TMPFILE 750 __FMODE_EXEC | O_PATH | __O_TMPFILE |
751 __FMODE_NONOTIFY
751 )); 752 ));
752 753
753 fasync_cache = kmem_cache_create("fasync_cache", 754 fasync_cache = kmem_cache_create("fasync_cache",
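
Note on the fcntl.c hunk: the expected count goes from 20 to 21 because
__FMODE_NONOTIFY joins the set. The BUILD_BUG_ON checks at compile time that
the popcount (HWEIGHT32) of all the O_* flags OR'ed together equals the number
of flags minus one (O_RDONLY is 0 and contributes no bit), i.e. that no two
flags share a bit. A userspace analogue with C11 static_assert; GCC and Clang
fold __builtin_popcount() of a constant at compile time:

#include <assert.h>

#define FLAG_A 0x1
#define FLAG_B 0x2
#define FLAG_C 0x4

/* 3 distinct flags (plus a zero-valued one) => popcount must be 3 */
static_assert(__builtin_popcount(FLAG_A | FLAG_B | FLAG_C) == 3,
              "O_*-style flags must not share bits");

int main(void)
{
        return 0;
}
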
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 2d609a5fbfea..073657f755d4 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -66,15 +66,21 @@ int writeback_in_progress(struct backing_dev_info *bdi)
66} 66}
67EXPORT_SYMBOL(writeback_in_progress); 67EXPORT_SYMBOL(writeback_in_progress);
68 68
69static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) 69struct backing_dev_info *inode_to_bdi(struct inode *inode)
70{ 70{
71 struct super_block *sb = inode->i_sb; 71 struct super_block *sb;
72 72
73 if (sb_is_blkdev_sb(sb)) 73 if (!inode)
74 return inode->i_mapping->backing_dev_info; 74 return &noop_backing_dev_info;
75 75
76 sb = inode->i_sb;
77#ifdef CONFIG_BLOCK
78 if (sb_is_blkdev_sb(sb))
79 return blk_get_backing_dev_info(I_BDEV(inode));
80#endif
76 return sb->s_bdi; 81 return sb->s_bdi;
77} 82}
83EXPORT_SYMBOL_GPL(inode_to_bdi);
78 84
79static inline struct inode *wb_inode(struct list_head *head) 85static inline struct inode *wb_inode(struct list_head *head)
80{ 86{
@@ -247,14 +253,19 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
247 return ret; 253 return ret;
248} 254}
249 255
256#define EXPIRE_DIRTY_ATIME 0x0001
257
250/* 258/*
251 * Move expired (dirtied before work->older_than_this) dirty inodes from 259 * Move expired (dirtied before work->older_than_this) dirty inodes from
252 * @delaying_queue to @dispatch_queue. 260 * @delaying_queue to @dispatch_queue.
253 */ 261 */
254static int move_expired_inodes(struct list_head *delaying_queue, 262static int move_expired_inodes(struct list_head *delaying_queue,
255 struct list_head *dispatch_queue, 263 struct list_head *dispatch_queue,
264 int flags,
256 struct wb_writeback_work *work) 265 struct wb_writeback_work *work)
257{ 266{
267 unsigned long *older_than_this = NULL;
268 unsigned long expire_time;
258 LIST_HEAD(tmp); 269 LIST_HEAD(tmp);
259 struct list_head *pos, *node; 270 struct list_head *pos, *node;
260 struct super_block *sb = NULL; 271 struct super_block *sb = NULL;
@@ -262,13 +273,21 @@ static int move_expired_inodes(struct list_head *delaying_queue,
262 int do_sb_sort = 0; 273 int do_sb_sort = 0;
263 int moved = 0; 274 int moved = 0;
264 275
276 if ((flags & EXPIRE_DIRTY_ATIME) == 0)
277 older_than_this = work->older_than_this;
278 else if ((work->reason == WB_REASON_SYNC) == 0) {
279 expire_time = jiffies - (HZ * 86400);
280 older_than_this = &expire_time;
281 }
265 while (!list_empty(delaying_queue)) { 282 while (!list_empty(delaying_queue)) {
266 inode = wb_inode(delaying_queue->prev); 283 inode = wb_inode(delaying_queue->prev);
267 if (work->older_than_this && 284 if (older_than_this &&
268 inode_dirtied_after(inode, *work->older_than_this)) 285 inode_dirtied_after(inode, *older_than_this))
269 break; 286 break;
270 list_move(&inode->i_wb_list, &tmp); 287 list_move(&inode->i_wb_list, &tmp);
271 moved++; 288 moved++;
289 if (flags & EXPIRE_DIRTY_ATIME)
290 set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state);
272 if (sb_is_blkdev_sb(inode->i_sb)) 291 if (sb_is_blkdev_sb(inode->i_sb))
273 continue; 292 continue;
274 if (sb && sb != inode->i_sb) 293 if (sb && sb != inode->i_sb)
@@ -309,9 +328,12 @@ out:
309static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work) 328static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
310{ 329{
311 int moved; 330 int moved;
331
312 assert_spin_locked(&wb->list_lock); 332 assert_spin_locked(&wb->list_lock);
313 list_splice_init(&wb->b_more_io, &wb->b_io); 333 list_splice_init(&wb->b_more_io, &wb->b_io);
314 moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, work); 334 moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work);
335 moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
336 EXPIRE_DIRTY_ATIME, work);
315 trace_writeback_queue_io(wb, work, moved); 337 trace_writeback_queue_io(wb, work, moved);
316} 338}
317 339
@@ -435,6 +457,8 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
435 * updates after data IO completion. 457 * updates after data IO completion.
436 */ 458 */
437 redirty_tail(inode, wb); 459 redirty_tail(inode, wb);
460 } else if (inode->i_state & I_DIRTY_TIME) {
461 list_move(&inode->i_wb_list, &wb->b_dirty_time);
438 } else { 462 } else {
439 /* The inode is clean. Remove from writeback lists. */ 463 /* The inode is clean. Remove from writeback lists. */
440 list_del_init(&inode->i_wb_list); 464 list_del_init(&inode->i_wb_list);
@@ -481,7 +505,13 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
481 spin_lock(&inode->i_lock); 505 spin_lock(&inode->i_lock);
482 506
483 dirty = inode->i_state & I_DIRTY; 507 dirty = inode->i_state & I_DIRTY;
484 inode->i_state &= ~I_DIRTY; 508 if (((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) &&
509 (inode->i_state & I_DIRTY_TIME)) ||
510 (inode->i_state & I_DIRTY_TIME_EXPIRED)) {
511 dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED;
512 trace_writeback_lazytime(inode);
513 }
514 inode->i_state &= ~dirty;
485 515
486 /* 516 /*
487 * Paired with smp_mb() in __mark_inode_dirty(). This allows 517 * Paired with smp_mb() in __mark_inode_dirty(). This allows
@@ -501,8 +531,10 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
501 531
502 spin_unlock(&inode->i_lock); 532 spin_unlock(&inode->i_lock);
503 533
534 if (dirty & I_DIRTY_TIME)
535 mark_inode_dirty_sync(inode);
504 /* Don't write the inode if only I_DIRTY_PAGES was set */ 536 /* Don't write the inode if only I_DIRTY_PAGES was set */
505 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { 537 if (dirty & ~I_DIRTY_PAGES) {
506 int err = write_inode(inode, wbc); 538 int err = write_inode(inode, wbc);
507 if (ret == 0) 539 if (ret == 0)
508 ret = err; 540 ret = err;
@@ -550,7 +582,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
550 * make sure inode is on some writeback list and leave it there unless 582 * make sure inode is on some writeback list and leave it there unless
551 * we have completely cleaned the inode. 583 * we have completely cleaned the inode.
552 */ 584 */
553 if (!(inode->i_state & I_DIRTY) && 585 if (!(inode->i_state & I_DIRTY_ALL) &&
554 (wbc->sync_mode != WB_SYNC_ALL || 586 (wbc->sync_mode != WB_SYNC_ALL ||
555 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK))) 587 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
556 goto out; 588 goto out;
@@ -565,7 +597,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
565 * If inode is clean, remove it from writeback lists. Otherwise don't 597 * If inode is clean, remove it from writeback lists. Otherwise don't
566 * touch it. See comment above for explanation. 598 * touch it. See comment above for explanation.
567 */ 599 */
568 if (!(inode->i_state & I_DIRTY)) 600 if (!(inode->i_state & I_DIRTY_ALL))
569 list_del_init(&inode->i_wb_list); 601 list_del_init(&inode->i_wb_list);
570 spin_unlock(&wb->list_lock); 602 spin_unlock(&wb->list_lock);
571 inode_sync_complete(inode); 603 inode_sync_complete(inode);
@@ -707,7 +739,7 @@ static long writeback_sb_inodes(struct super_block *sb,
707 wrote += write_chunk - wbc.nr_to_write; 739 wrote += write_chunk - wbc.nr_to_write;
708 spin_lock(&wb->list_lock); 740 spin_lock(&wb->list_lock);
709 spin_lock(&inode->i_lock); 741 spin_lock(&inode->i_lock);
710 if (!(inode->i_state & I_DIRTY)) 742 if (!(inode->i_state & I_DIRTY_ALL))
711 wrote++; 743 wrote++;
712 requeue_inode(inode, wb, &wbc); 744 requeue_inode(inode, wb, &wbc);
713 inode_sync_complete(inode); 745 inode_sync_complete(inode);
@@ -1145,16 +1177,20 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode)
1145 * page->mapping->host, so the page-dirtying time is recorded in the internal 1177 * page->mapping->host, so the page-dirtying time is recorded in the internal
1146 * blockdev inode. 1178 * blockdev inode.
1147 */ 1179 */
1180#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
1148void __mark_inode_dirty(struct inode *inode, int flags) 1181void __mark_inode_dirty(struct inode *inode, int flags)
1149{ 1182{
1150 struct super_block *sb = inode->i_sb; 1183 struct super_block *sb = inode->i_sb;
1151 struct backing_dev_info *bdi = NULL; 1184 struct backing_dev_info *bdi = NULL;
1185 int dirtytime;
1186
1187 trace_writeback_mark_inode_dirty(inode, flags);
1152 1188
1153 /* 1189 /*
1154 * Don't do this for I_DIRTY_PAGES - that doesn't actually 1190 * Don't do this for I_DIRTY_PAGES - that doesn't actually
1155 * dirty the inode itself 1191 * dirty the inode itself
1156 */ 1192 */
1157 if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { 1193 if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_TIME)) {
1158 trace_writeback_dirty_inode_start(inode, flags); 1194 trace_writeback_dirty_inode_start(inode, flags);
1159 1195
1160 if (sb->s_op->dirty_inode) 1196 if (sb->s_op->dirty_inode)
@@ -1162,6 +1198,9 @@ void __mark_inode_dirty(struct inode *inode, int flags)
1162 1198
1163 trace_writeback_dirty_inode(inode, flags); 1199 trace_writeback_dirty_inode(inode, flags);
1164 } 1200 }
1201 if (flags & I_DIRTY_INODE)
1202 flags &= ~I_DIRTY_TIME;
1203 dirtytime = flags & I_DIRTY_TIME;
1165 1204
1166 /* 1205 /*
1167 * Paired with smp_mb() in __writeback_single_inode() for the 1206 * Paired with smp_mb() in __writeback_single_inode() for the
@@ -1169,16 +1208,21 @@ void __mark_inode_dirty(struct inode *inode, int flags)
1169 */ 1208 */
1170 smp_mb(); 1209 smp_mb();
1171 1210
1172 if ((inode->i_state & flags) == flags) 1211 if (((inode->i_state & flags) == flags) ||
1212 (dirtytime && (inode->i_state & I_DIRTY_INODE)))
1173 return; 1213 return;
1174 1214
1175 if (unlikely(block_dump)) 1215 if (unlikely(block_dump))
1176 block_dump___mark_inode_dirty(inode); 1216 block_dump___mark_inode_dirty(inode);
1177 1217
1178 spin_lock(&inode->i_lock); 1218 spin_lock(&inode->i_lock);
1219 if (dirtytime && (inode->i_state & I_DIRTY_INODE))
1220 goto out_unlock_inode;
1179 if ((inode->i_state & flags) != flags) { 1221 if ((inode->i_state & flags) != flags) {
1180 const int was_dirty = inode->i_state & I_DIRTY; 1222 const int was_dirty = inode->i_state & I_DIRTY;
1181 1223
1224 if (flags & I_DIRTY_INODE)
1225 inode->i_state &= ~I_DIRTY_TIME;
1182 inode->i_state |= flags; 1226 inode->i_state |= flags;
1183 1227
1184 /* 1228 /*
@@ -1225,8 +1269,10 @@ void __mark_inode_dirty(struct inode *inode, int flags)
1225 } 1269 }
1226 1270
1227 inode->dirtied_when = jiffies; 1271 inode->dirtied_when = jiffies;
1228 list_move(&inode->i_wb_list, &bdi->wb.b_dirty); 1272 list_move(&inode->i_wb_list, dirtytime ?
1273 &bdi->wb.b_dirty_time : &bdi->wb.b_dirty);
1229 spin_unlock(&bdi->wb.list_lock); 1274 spin_unlock(&bdi->wb.list_lock);
1275 trace_writeback_dirty_inode_enqueue(inode);
1230 1276
1231 if (wakeup_bdi) 1277 if (wakeup_bdi)
1232 bdi_wakeup_thread_delayed(bdi); 1278 bdi_wakeup_thread_delayed(bdi);
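
Note on the lazytime plumbing above: a pure timestamp update dirties the inode
with only I_DIRTY_TIME and parks it on the new b_dirty_time list; expiry (or a
sync) promotes it via I_DIRTY_TIME_EXPIRED, and any real metadata dirtying
simply clears I_DIRTY_TIME, since the timestamps ride along with a full inode
write anyway. A minimal model of the flag rules in __mark_inode_dirty(); the
flag values are illustrative:

#include <stdio.h>

#define I_DIRTY_SYNC     0x01
#define I_DIRTY_DATASYNC 0x02
#define I_DIRTY_TIME     0x04
#define I_DIRTY_INODE    (I_DIRTY_SYNC | I_DIRTY_DATASYNC)

static unsigned int mark_inode_dirty(unsigned int i_state, unsigned int flags)
{
        if (flags & I_DIRTY_INODE)
                flags &= ~I_DIRTY_TIME;         /* real dirt wins */
        if ((flags & I_DIRTY_TIME) && (i_state & I_DIRTY_INODE))
                return i_state;                 /* already fully dirty */
        if (flags & I_DIRTY_INODE)
                i_state &= ~I_DIRTY_TIME;       /* timestamps ride along */
        return i_state | flags;
}

int main(void)
{
        unsigned int st = 0;

        st = mark_inode_dirty(st, I_DIRTY_TIME);
        printf("after atime update: %#x\n", st); /* 0x4: dirty time only */
        st = mark_inode_dirty(st, I_DIRTY_SYNC);
        printf("after chmod:        %#x\n", st); /* 0x1: I_DIRTY_TIME gone */
        return 0;
}
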
diff --git a/fs/fs_pin.c b/fs/fs_pin.c
index 9368236ca100..b06c98796afb 100644
--- a/fs/fs_pin.c
+++ b/fs/fs_pin.c
@@ -1,78 +1,102 @@
1#include <linux/fs.h> 1#include <linux/fs.h>
2#include <linux/sched.h>
2#include <linux/slab.h> 3#include <linux/slab.h>
3#include <linux/fs_pin.h>
4#include "internal.h" 4#include "internal.h"
5#include "mount.h" 5#include "mount.h"
6 6
7static void pin_free_rcu(struct rcu_head *head)
8{
9 kfree(container_of(head, struct fs_pin, rcu));
10}
11
12static DEFINE_SPINLOCK(pin_lock); 7static DEFINE_SPINLOCK(pin_lock);
13 8
14void pin_put(struct fs_pin *p)
15{
16 if (atomic_long_dec_and_test(&p->count))
17 call_rcu(&p->rcu, pin_free_rcu);
18}
19
20void pin_remove(struct fs_pin *pin) 9void pin_remove(struct fs_pin *pin)
21{ 10{
22 spin_lock(&pin_lock); 11 spin_lock(&pin_lock);
23 hlist_del(&pin->m_list); 12 hlist_del(&pin->m_list);
24 hlist_del(&pin->s_list); 13 hlist_del(&pin->s_list);
25 spin_unlock(&pin_lock); 14 spin_unlock(&pin_lock);
15 spin_lock_irq(&pin->wait.lock);
16 pin->done = 1;
17 wake_up_locked(&pin->wait);
18 spin_unlock_irq(&pin->wait.lock);
26} 19}
27 20
28void pin_insert(struct fs_pin *pin, struct vfsmount *m) 21void pin_insert_group(struct fs_pin *pin, struct vfsmount *m, struct hlist_head *p)
29{ 22{
30 spin_lock(&pin_lock); 23 spin_lock(&pin_lock);
31 hlist_add_head(&pin->s_list, &m->mnt_sb->s_pins); 24 if (p)
25 hlist_add_head(&pin->s_list, p);
32 hlist_add_head(&pin->m_list, &real_mount(m)->mnt_pins); 26 hlist_add_head(&pin->m_list, &real_mount(m)->mnt_pins);
33 spin_unlock(&pin_lock); 27 spin_unlock(&pin_lock);
34} 28}
35 29
30void pin_insert(struct fs_pin *pin, struct vfsmount *m)
31{
32 pin_insert_group(pin, m, &m->mnt_sb->s_pins);
33}
34
35void pin_kill(struct fs_pin *p)
36{
37 wait_queue_t wait;
38
39 if (!p) {
40 rcu_read_unlock();
41 return;
42 }
43 init_wait(&wait);
44 spin_lock_irq(&p->wait.lock);
45 if (likely(!p->done)) {
46 p->done = -1;
47 spin_unlock_irq(&p->wait.lock);
48 rcu_read_unlock();
49 p->kill(p);
50 return;
51 }
52 if (p->done > 0) {
53 spin_unlock_irq(&p->wait.lock);
54 rcu_read_unlock();
55 return;
56 }
57 __add_wait_queue(&p->wait, &wait);
58 while (1) {
59 set_current_state(TASK_UNINTERRUPTIBLE);
60 spin_unlock_irq(&p->wait.lock);
61 rcu_read_unlock();
62 schedule();
63 rcu_read_lock();
64 if (likely(list_empty(&wait.task_list)))
65 break;
66 /* OK, we know p couldn't have been freed yet */
67 spin_lock_irq(&p->wait.lock);
68 if (p->done > 0) {
69 spin_unlock_irq(&p->wait.lock);
70 break;
71 }
72 }
73 rcu_read_unlock();
74}
75
36void mnt_pin_kill(struct mount *m) 76void mnt_pin_kill(struct mount *m)
37{ 77{
38 while (1) { 78 while (1) {
39 struct hlist_node *p; 79 struct hlist_node *p;
40 struct fs_pin *pin;
41 rcu_read_lock(); 80 rcu_read_lock();
42 p = ACCESS_ONCE(m->mnt_pins.first); 81 p = ACCESS_ONCE(m->mnt_pins.first);
43 if (!p) { 82 if (!p) {
44 rcu_read_unlock(); 83 rcu_read_unlock();
45 break; 84 break;
46 } 85 }
47 pin = hlist_entry(p, struct fs_pin, m_list); 86 pin_kill(hlist_entry(p, struct fs_pin, m_list));
48 if (!atomic_long_inc_not_zero(&pin->count)) {
49 rcu_read_unlock();
50 cpu_relax();
51 continue;
52 }
53 rcu_read_unlock();
54 pin->kill(pin);
55 } 87 }
56} 88}
57 89
58void sb_pin_kill(struct super_block *sb) 90void group_pin_kill(struct hlist_head *p)
59{ 91{
60 while (1) { 92 while (1) {
61 struct hlist_node *p; 93 struct hlist_node *q;
62 struct fs_pin *pin;
63 rcu_read_lock(); 94 rcu_read_lock();
64 p = ACCESS_ONCE(sb->s_pins.first); 95 q = ACCESS_ONCE(p->first);
65 if (!p) { 96 if (!q) {
66 rcu_read_unlock(); 97 rcu_read_unlock();
67 break; 98 break;
68 } 99 }
69 pin = hlist_entry(p, struct fs_pin, s_list); 100 pin_kill(hlist_entry(q, struct fs_pin, s_list));
70 if (!atomic_long_inc_not_zero(&pin->count)) {
71 rcu_read_unlock();
72 cpu_relax();
73 continue;
74 }
75 rcu_read_unlock();
76 pin->kill(pin);
77 } 101 }
78} 102}
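
The rewrite above replaces the per-pin refcount and RCU free with a wait-queue handshake: pin_kill() marks the pin busy (done = -1) and invokes ->kill(), whose teardown ends in pin_remove() setting done = 1 and waking any concurrent killers; those waiters now sleep on pin->wait instead of spinning on atomic_long_inc_not_zero(). A sketch of a pin user under the new scheme (field and helper names follow this patch; how the containing object is ultimately freed is the user's arrangement and is elided here). Attachment is via pin_insert(), or the new pin_insert_group() when the s_list should hang off a caller-supplied head:

	struct my_pin {
		struct fs_pin pin;	/* embeds ->wait, ->done, ->kill */
		/* ... resources tied to the vfsmount ... */
	};

	static void my_pin_kill(struct fs_pin *p)
	{
		struct my_pin *mp = container_of(p, struct my_pin, pin);

		/* release mp's resources here ... */
		pin_remove(p);	/* unhook the pin, set ->done, wake waiters */
		/* ... then dispose of mp by whatever scheme the user chose */
	}
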
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index ba1107977f2e..ed19a7d622fa 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -131,6 +131,13 @@ static void fuse_req_init_context(struct fuse_req *req)
131 req->in.h.pid = current->pid; 131 req->in.h.pid = current->pid;
132} 132}
133 133
134void fuse_set_initialized(struct fuse_conn *fc)
135{
136 /* Make sure stores before this are seen on another CPU */
137 smp_wmb();
138 fc->initialized = 1;
139}
140
134static bool fuse_block_alloc(struct fuse_conn *fc, bool for_background) 141static bool fuse_block_alloc(struct fuse_conn *fc, bool for_background)
135{ 142{
136 return !fc->initialized || (for_background && fc->blocked); 143 return !fc->initialized || (for_background && fc->blocked);
@@ -155,6 +162,8 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages,
155 if (intr) 162 if (intr)
156 goto out; 163 goto out;
157 } 164 }
165 /* Matches smp_wmb() in fuse_set_initialized() */
166 smp_rmb();
158 167
159 err = -ENOTCONN; 168 err = -ENOTCONN;
160 if (!fc->connected) 169 if (!fc->connected)
@@ -253,6 +262,8 @@ struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc,
253 262
254 atomic_inc(&fc->num_waiting); 263 atomic_inc(&fc->num_waiting);
255 wait_event(fc->blocked_waitq, fc->initialized); 264 wait_event(fc->blocked_waitq, fc->initialized);
265 /* Matches smp_wmb() in fuse_set_initialized() */
266 smp_rmb();
256 req = fuse_request_alloc(0); 267 req = fuse_request_alloc(0);
257 if (!req) 268 if (!req)
258 req = get_reserved_req(fc, file); 269 req = get_reserved_req(fc, file);
@@ -511,6 +522,39 @@ void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
511} 522}
512EXPORT_SYMBOL_GPL(fuse_request_send); 523EXPORT_SYMBOL_GPL(fuse_request_send);
513 524
525static void fuse_adjust_compat(struct fuse_conn *fc, struct fuse_args *args)
526{
527 if (fc->minor < 4 && args->in.h.opcode == FUSE_STATFS)
528 args->out.args[0].size = FUSE_COMPAT_STATFS_SIZE;
529
530 if (fc->minor < 9) {
531 switch (args->in.h.opcode) {
532 case FUSE_LOOKUP:
533 case FUSE_CREATE:
534 case FUSE_MKNOD:
535 case FUSE_MKDIR:
536 case FUSE_SYMLINK:
537 case FUSE_LINK:
538 args->out.args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE;
539 break;
540 case FUSE_GETATTR:
541 case FUSE_SETATTR:
542 args->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE;
543 break;
544 }
545 }
546 if (fc->minor < 12) {
547 switch (args->in.h.opcode) {
548 case FUSE_CREATE:
549 args->in.args[0].size = sizeof(struct fuse_open_in);
550 break;
551 case FUSE_MKNOD:
552 args->in.args[0].size = FUSE_COMPAT_MKNOD_IN_SIZE;
553 break;
554 }
555 }
556}
557
514ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args) 558ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args)
515{ 559{
516 struct fuse_req *req; 560 struct fuse_req *req;
@@ -520,6 +564,9 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args)
520 if (IS_ERR(req)) 564 if (IS_ERR(req))
521 return PTR_ERR(req); 565 return PTR_ERR(req);
522 566
567 /* Needs to be done after fuse_get_req() so that fc->minor is valid */
568 fuse_adjust_compat(fc, args);
569
523 req->in.h.opcode = args->in.h.opcode; 570 req->in.h.opcode = args->in.h.opcode;
524 req->in.h.nodeid = args->in.h.nodeid; 571 req->in.h.nodeid = args->in.h.nodeid;
525 req->in.numargs = args->in.numargs; 572 req->in.numargs = args->in.numargs;
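
fuse_adjust_compat() centralizes the protocol-minor downgrades that every caller of fuse_simple_request() previously open-coded; the fs/fuse/dir.c and fs/fuse/inode.c hunks below delete those copies. The pattern in miniature (sizes and cut-offs mirror the hunk above): clamp a reply buffer to what a server speaking an old protocol minor actually sends, so the kernel never trusts uninitialized tail bytes:

	static size_t entry_out_size(unsigned minor)
	{
		return minor < 9 ? FUSE_COMPAT_ENTRY_OUT_SIZE
				 : sizeof(struct fuse_entry_out);
	}
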
@@ -2127,7 +2174,7 @@ void fuse_abort_conn(struct fuse_conn *fc)
2127 if (fc->connected) { 2174 if (fc->connected) {
2128 fc->connected = 0; 2175 fc->connected = 0;
2129 fc->blocked = 0; 2176 fc->blocked = 0;
2130 fc->initialized = 1; 2177 fuse_set_initialized(fc);
2131 end_io_requests(fc); 2178 end_io_requests(fc);
2132 end_queued_requests(fc); 2179 end_queued_requests(fc);
2133 end_polls(fc); 2180 end_polls(fc);
@@ -2146,7 +2193,7 @@ int fuse_dev_release(struct inode *inode, struct file *file)
2146 spin_lock(&fc->lock); 2193 spin_lock(&fc->lock);
2147 fc->connected = 0; 2194 fc->connected = 0;
2148 fc->blocked = 0; 2195 fc->blocked = 0;
2149 fc->initialized = 1; 2196 fuse_set_initialized(fc);
2150 end_queued_requests(fc); 2197 end_queued_requests(fc);
2151 end_polls(fc); 2198 end_polls(fc);
2152 wake_up_all(&fc->blocked_waitq); 2199 wake_up_all(&fc->blocked_waitq);
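
fuse_set_initialized() pairs its smp_wmb() with the smp_rmb() added after each fc->initialized check, so a request allocator that observes initialized == 1 is also guaranteed to observe every connection field stored before the flag. The publish/consume shape in isolation (illustration only, not fuse code):

	static int payload;
	static int ready;

	static void publish(int v)
	{
		payload = v;
		smp_wmb();	/* order the data store before the flag store */
		ready = 1;
	}

	static int consume(void)
	{
		if (!ready)
			return -EAGAIN;
		smp_rmb();	/* pairs with smp_wmb() in publish() */
		return payload;	/* sees v, never stale data */
	}
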
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 252b8a5de8b5..08e7b1a9d5d0 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -156,10 +156,7 @@ static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_args *args,
156 args->in.args[0].size = name->len + 1; 156 args->in.args[0].size = name->len + 1;
157 args->in.args[0].value = name->name; 157 args->in.args[0].value = name->name;
158 args->out.numargs = 1; 158 args->out.numargs = 1;
159 if (fc->minor < 9) 159 args->out.args[0].size = sizeof(struct fuse_entry_out);
160 args->out.args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE;
161 else
162 args->out.args[0].size = sizeof(struct fuse_entry_out);
163 args->out.args[0].value = outarg; 160 args->out.args[0].value = outarg;
164} 161}
165 162
@@ -422,16 +419,12 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
422 args.in.h.opcode = FUSE_CREATE; 419 args.in.h.opcode = FUSE_CREATE;
423 args.in.h.nodeid = get_node_id(dir); 420 args.in.h.nodeid = get_node_id(dir);
424 args.in.numargs = 2; 421 args.in.numargs = 2;
425 args.in.args[0].size = fc->minor < 12 ? sizeof(struct fuse_open_in) : 422 args.in.args[0].size = sizeof(inarg);
426 sizeof(inarg);
427 args.in.args[0].value = &inarg; 423 args.in.args[0].value = &inarg;
428 args.in.args[1].size = entry->d_name.len + 1; 424 args.in.args[1].size = entry->d_name.len + 1;
429 args.in.args[1].value = entry->d_name.name; 425 args.in.args[1].value = entry->d_name.name;
430 args.out.numargs = 2; 426 args.out.numargs = 2;
431 if (fc->minor < 9) 427 args.out.args[0].size = sizeof(outentry);
432 args.out.args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE;
433 else
434 args.out.args[0].size = sizeof(outentry);
435 args.out.args[0].value = &outentry; 428 args.out.args[0].value = &outentry;
436 args.out.args[1].size = sizeof(outopen); 429 args.out.args[1].size = sizeof(outopen);
437 args.out.args[1].value = &outopen; 430 args.out.args[1].value = &outopen;
@@ -539,10 +532,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args,
539 memset(&outarg, 0, sizeof(outarg)); 532 memset(&outarg, 0, sizeof(outarg));
540 args->in.h.nodeid = get_node_id(dir); 533 args->in.h.nodeid = get_node_id(dir);
541 args->out.numargs = 1; 534 args->out.numargs = 1;
542 if (fc->minor < 9) 535 args->out.args[0].size = sizeof(outarg);
543 args->out.args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE;
544 else
545 args->out.args[0].size = sizeof(outarg);
546 args->out.args[0].value = &outarg; 536 args->out.args[0].value = &outarg;
547 err = fuse_simple_request(fc, args); 537 err = fuse_simple_request(fc, args);
548 if (err) 538 if (err)
@@ -592,8 +582,7 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode,
592 inarg.umask = current_umask(); 582 inarg.umask = current_umask();
593 args.in.h.opcode = FUSE_MKNOD; 583 args.in.h.opcode = FUSE_MKNOD;
594 args.in.numargs = 2; 584 args.in.numargs = 2;
595 args.in.args[0].size = fc->minor < 12 ? FUSE_COMPAT_MKNOD_IN_SIZE : 585 args.in.args[0].size = sizeof(inarg);
596 sizeof(inarg);
597 args.in.args[0].value = &inarg; 586 args.in.args[0].value = &inarg;
598 args.in.args[1].size = entry->d_name.len + 1; 587 args.in.args[1].size = entry->d_name.len + 1;
599 args.in.args[1].value = entry->d_name.name; 588 args.in.args[1].value = entry->d_name.name;
@@ -899,10 +888,7 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
899 args.in.args[0].size = sizeof(inarg); 888 args.in.args[0].size = sizeof(inarg);
900 args.in.args[0].value = &inarg; 889 args.in.args[0].value = &inarg;
901 args.out.numargs = 1; 890 args.out.numargs = 1;
902 if (fc->minor < 9) 891 args.out.args[0].size = sizeof(outarg);
903 args.out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE;
904 else
905 args.out.args[0].size = sizeof(outarg);
906 args.out.args[0].value = &outarg; 892 args.out.args[0].value = &outarg;
907 err = fuse_simple_request(fc, &args); 893 err = fuse_simple_request(fc, &args);
908 if (!err) { 894 if (!err) {
@@ -1574,10 +1560,7 @@ static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_args *args,
1574 args->in.args[0].size = sizeof(*inarg_p); 1560 args->in.args[0].size = sizeof(*inarg_p);
1575 args->in.args[0].value = inarg_p; 1561 args->in.args[0].value = inarg_p;
1576 args->out.numargs = 1; 1562 args->out.numargs = 1;
1577 if (fc->minor < 9) 1563 args->out.args[0].size = sizeof(*outarg_p);
1578 args->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE;
1579 else
1580 args->out.args[0].size = sizeof(*outarg_p);
1581 args->out.args[0].value = outarg_p; 1564 args->out.args[0].value = outarg_p;
1582} 1565}
1583 1566
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 760b2c552197..c01ec3bdcfd8 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1159,7 +1159,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1159 mutex_lock(&inode->i_mutex); 1159 mutex_lock(&inode->i_mutex);
1160 1160
1161 /* We can write back this queue in page reclaim */ 1161 /* We can write back this queue in page reclaim */
1162 current->backing_dev_info = mapping->backing_dev_info; 1162 current->backing_dev_info = inode_to_bdi(inode);
1163 1163
1164 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); 1164 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
1165 if (err) 1165 if (err)
@@ -1464,7 +1464,7 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
1464{ 1464{
1465 struct inode *inode = req->inode; 1465 struct inode *inode = req->inode;
1466 struct fuse_inode *fi = get_fuse_inode(inode); 1466 struct fuse_inode *fi = get_fuse_inode(inode);
1467 struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info; 1467 struct backing_dev_info *bdi = inode_to_bdi(inode);
1468 int i; 1468 int i;
1469 1469
1470 list_del(&req->writepages_entry); 1470 list_del(&req->writepages_entry);
@@ -1658,7 +1658,7 @@ static int fuse_writepage_locked(struct page *page)
1658 req->end = fuse_writepage_end; 1658 req->end = fuse_writepage_end;
1659 req->inode = inode; 1659 req->inode = inode;
1660 1660
1661 inc_bdi_stat(mapping->backing_dev_info, BDI_WRITEBACK); 1661 inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK);
1662 inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); 1662 inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
1663 1663
1664 spin_lock(&fc->lock); 1664 spin_lock(&fc->lock);
@@ -1768,7 +1768,7 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req,
1768 1768
1769 if (old_req->num_pages == 1 && (old_req->state == FUSE_REQ_INIT || 1769 if (old_req->num_pages == 1 && (old_req->state == FUSE_REQ_INIT ||
1770 old_req->state == FUSE_REQ_PENDING)) { 1770 old_req->state == FUSE_REQ_PENDING)) {
1771 struct backing_dev_info *bdi = page->mapping->backing_dev_info; 1771 struct backing_dev_info *bdi = inode_to_bdi(page->mapping->host);
1772 1772
1773 copy_highpage(old_req->pages[0], page); 1773 copy_highpage(old_req->pages[0], page);
1774 spin_unlock(&fc->lock); 1774 spin_unlock(&fc->lock);
@@ -1872,7 +1872,7 @@ static int fuse_writepages_fill(struct page *page,
1872 req->page_descs[req->num_pages].offset = 0; 1872 req->page_descs[req->num_pages].offset = 0;
1873 req->page_descs[req->num_pages].length = PAGE_SIZE; 1873 req->page_descs[req->num_pages].length = PAGE_SIZE;
1874 1874
1875 inc_bdi_stat(page->mapping->backing_dev_info, BDI_WRITEBACK); 1875 inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK);
1876 inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); 1876 inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
1877 1877
1878 err = 0; 1878 err = 0;
@@ -2062,7 +2062,6 @@ static const struct vm_operations_struct fuse_file_vm_ops = {
2062 .fault = filemap_fault, 2062 .fault = filemap_fault,
2063 .map_pages = filemap_map_pages, 2063 .map_pages = filemap_map_pages,
2064 .page_mkwrite = fuse_page_mkwrite, 2064 .page_mkwrite = fuse_page_mkwrite,
2065 .remap_pages = generic_file_remap_pages,
2066}; 2065};
2067 2066
2068static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) 2067static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
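
These fs/fuse/file.c hunks belong to the series that removes mapping->backing_dev_info: the bdi is now always derived from the inode. Roughly what the inode_to_bdi() helper introduced by that series resolves to (a sketch, modulo CONFIG_BLOCK ifdefs):

	static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
	{
		struct super_block *sb;

		if (!inode)
			return &noop_backing_dev_info;
		sb = inode->i_sb;
		if (sb_is_blkdev_sb(sb))	/* blockdev inodes keep the queue's bdi */
			return blk_get_backing_dev_info(I_BDEV(inode));
		return sb->s_bdi;		/* everyone else: the superblock's bdi */
	}
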
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index e0fc6725d1d0..1cdfb07c1376 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -906,4 +906,6 @@ int fuse_write_inode(struct inode *inode, struct writeback_control *wbc);
906int fuse_do_setattr(struct inode *inode, struct iattr *attr, 906int fuse_do_setattr(struct inode *inode, struct iattr *attr,
907 struct file *file); 907 struct file *file);
908 908
909void fuse_set_initialized(struct fuse_conn *fc);
910
909#endif /* _FS_FUSE_I_H */ 911#endif /* _FS_FUSE_I_H */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 6749109f255d..e8799c11424b 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -308,7 +308,6 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
308 if (!fc->writeback_cache || !S_ISREG(attr->mode)) 308 if (!fc->writeback_cache || !S_ISREG(attr->mode))
309 inode->i_flags |= S_NOCMTIME; 309 inode->i_flags |= S_NOCMTIME;
310 inode->i_generation = generation; 310 inode->i_generation = generation;
311 inode->i_data.backing_dev_info = &fc->bdi;
312 fuse_init_inode(inode, attr); 311 fuse_init_inode(inode, attr);
313 unlock_new_inode(inode); 312 unlock_new_inode(inode);
314 } else if ((inode->i_mode ^ attr->mode) & S_IFMT) { 313 } else if ((inode->i_mode ^ attr->mode) & S_IFMT) {
@@ -424,8 +423,7 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf)
424 args.in.h.opcode = FUSE_STATFS; 423 args.in.h.opcode = FUSE_STATFS;
425 args.in.h.nodeid = get_node_id(dentry->d_inode); 424 args.in.h.nodeid = get_node_id(dentry->d_inode);
426 args.out.numargs = 1; 425 args.out.numargs = 1;
427 args.out.args[0].size = 426 args.out.args[0].size = sizeof(outarg);
428 fc->minor < 4 ? FUSE_COMPAT_STATFS_SIZE : sizeof(outarg);
429 args.out.args[0].value = &outarg; 427 args.out.args[0].value = &outarg;
430 err = fuse_simple_request(fc, &args); 428 err = fuse_simple_request(fc, &args);
431 if (!err) 429 if (!err)
@@ -898,7 +896,7 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
898 fc->max_write = max_t(unsigned, 4096, fc->max_write); 896 fc->max_write = max_t(unsigned, 4096, fc->max_write);
899 fc->conn_init = 1; 897 fc->conn_init = 1;
900 } 898 }
901 fc->initialized = 1; 899 fuse_set_initialized(fc);
902 wake_up_all(&fc->blocked_waitq); 900 wake_up_all(&fc->blocked_waitq);
903} 901}
904 902
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 3088e2a38e30..7b3143064af1 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -73,7 +73,7 @@ int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
73 73
74 BUG_ON(name == NULL); 74 BUG_ON(name == NULL);
75 75
76 if (acl->a_count > GFS2_ACL_MAX_ENTRIES(GFS2_SB(inode))) 76 if (acl && acl->a_count > GFS2_ACL_MAX_ENTRIES(GFS2_SB(inode)))
77 return -E2BIG; 77 return -E2BIG;
78 78
79 if (type == ACL_TYPE_ACCESS) { 79 if (type == ACL_TYPE_ACCESS) {
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 805b37fed638..4ad4f94edebe 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -289,7 +289,7 @@ continue_unlock:
289 if (!clear_page_dirty_for_io(page)) 289 if (!clear_page_dirty_for_io(page))
290 goto continue_unlock; 290 goto continue_unlock;
291 291
292 trace_wbc_writepage(wbc, mapping->backing_dev_info); 292 trace_wbc_writepage(wbc, inode_to_bdi(inode));
293 293
294 ret = __gfs2_jdata_writepage(page, wbc); 294 ret = __gfs2_jdata_writepage(page, wbc);
295 if (unlikely(ret)) { 295 if (unlikely(ret)) {
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index c5a34f09e228..6371192961e2 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1896,7 +1896,8 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
1896 1896
1897 ht = kzalloc(size, GFP_NOFS | __GFP_NOWARN); 1897 ht = kzalloc(size, GFP_NOFS | __GFP_NOWARN);
1898 if (ht == NULL) 1898 if (ht == NULL)
1899 ht = vzalloc(size); 1899 ht = __vmalloc(size, GFP_NOFS | __GFP_NOWARN | __GFP_ZERO,
1900 PAGE_KERNEL);
1900 if (!ht) 1901 if (!ht)
1901 return -ENOMEM; 1902 return -ENOMEM;
1902 1903
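
vzalloc() hard-codes GFP_KERNEL, which can recurse into filesystem reclaim at an unsafe point; the fix spells out __vmalloc() so the GFP_NOFS of the kmalloc attempt carries over to the fallback. The pattern as a helper (sketch; the result would be freed with kvfree()):

	static void *nofs_zalloc_big(size_t size)
	{
		void *p = kzalloc(size, GFP_NOFS | __GFP_NOWARN);

		if (!p)	/* too big or too fragmented: vmalloc space, still NOFS */
			p = __vmalloc(size, GFP_NOFS | __GFP_NOWARN | __GFP_ZERO,
				      PAGE_KERNEL);
		return p;
	}
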
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 6e600abf694a..3e32bb8e2d7e 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -498,7 +498,6 @@ static const struct vm_operations_struct gfs2_vm_ops = {
498 .fault = filemap_fault, 498 .fault = filemap_fault,
499 .map_pages = filemap_map_pages, 499 .map_pages = filemap_map_pages,
500 .page_mkwrite = gfs2_page_mkwrite, 500 .page_mkwrite = gfs2_page_mkwrite,
501 .remap_pages = generic_file_remap_pages,
502}; 501};
503 502
504/** 503/**
@@ -655,7 +654,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
655{ 654{
656 struct address_space *mapping = file->f_mapping; 655 struct address_space *mapping = file->f_mapping;
657 struct inode *inode = mapping->host; 656 struct inode *inode = mapping->host;
658 int sync_state = inode->i_state & I_DIRTY; 657 int sync_state = inode->i_state & I_DIRTY_ALL;
659 struct gfs2_inode *ip = GFS2_I(inode); 658 struct gfs2_inode *ip = GFS2_I(inode);
660 int ret = 0, ret1 = 0; 659 int ret = 0, ret1 = 0;
661 660
@@ -668,7 +667,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
668 if (!gfs2_is_jdata(ip)) 667 if (!gfs2_is_jdata(ip))
669 sync_state &= ~I_DIRTY_PAGES; 668 sync_state &= ~I_DIRTY_PAGES;
670 if (datasync) 669 if (datasync)
671 sync_state &= ~I_DIRTY_SYNC; 670 sync_state &= ~(I_DIRTY_SYNC | I_DIRTY_TIME);
672 671
673 if (sync_state) { 672 if (sync_state) {
674 ret = sync_inode_metadata(inode, 1); 673 ret = sync_inode_metadata(inode, 1);
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index a23524aa3eac..f42dffba056a 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -173,19 +173,14 @@ void gfs2_glock_add_to_lru(struct gfs2_glock *gl)
173 spin_unlock(&lru_lock); 173 spin_unlock(&lru_lock);
174} 174}
175 175
176static void __gfs2_glock_remove_from_lru(struct gfs2_glock *gl) 176static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
177{ 177{
178 spin_lock(&lru_lock);
178 if (!list_empty(&gl->gl_lru)) { 179 if (!list_empty(&gl->gl_lru)) {
179 list_del_init(&gl->gl_lru); 180 list_del_init(&gl->gl_lru);
180 atomic_dec(&lru_count); 181 atomic_dec(&lru_count);
181 clear_bit(GLF_LRU, &gl->gl_flags); 182 clear_bit(GLF_LRU, &gl->gl_flags);
182 } 183 }
183}
184
185static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
186{
187 spin_lock(&lru_lock);
188 __gfs2_glock_remove_from_lru(gl);
189 spin_unlock(&lru_lock); 184 spin_unlock(&lru_lock);
190} 185}
191 186
@@ -205,9 +200,7 @@ void gfs2_glock_put(struct gfs2_glock *gl)
205 200
206 lockref_mark_dead(&gl->gl_lockref); 201 lockref_mark_dead(&gl->gl_lockref);
207 202
208 spin_lock(&lru_lock); 203 gfs2_glock_remove_from_lru(gl);
209 __gfs2_glock_remove_from_lru(gl);
210 spin_unlock(&lru_lock);
211 spin_unlock(&gl->gl_lockref.lock); 204 spin_unlock(&gl->gl_lockref.lock);
212 spin_lock_bucket(gl->gl_hash); 205 spin_lock_bucket(gl->gl_hash);
213 hlist_bl_del_rcu(&gl->gl_list); 206 hlist_bl_del_rcu(&gl->gl_list);
@@ -775,7 +768,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
775 mapping->flags = 0; 768 mapping->flags = 0;
776 mapping_set_gfp_mask(mapping, GFP_NOFS); 769 mapping_set_gfp_mask(mapping, GFP_NOFS);
777 mapping->private_data = NULL; 770 mapping->private_data = NULL;
778 mapping->backing_dev_info = s->s_bdi;
779 mapping->writeback_index = 0; 771 mapping->writeback_index = 0;
780 } 772 }
781 773
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 9054002ebe70..73c72253faac 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -543,10 +543,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
543 } 543 }
544 544
545 error = gfs2_dir_add(&dip->i_inode, name, ip, da); 545 error = gfs2_dir_add(&dip->i_inode, name, ip, da);
546 if (error)
547 goto fail_end_trans;
548 546
549fail_end_trans:
550 gfs2_trans_end(sdp); 547 gfs2_trans_end(sdp);
551fail_ipreserv: 548fail_ipreserv:
552 gfs2_inplace_release(dip); 549 gfs2_inplace_release(dip);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 8633ad328ee2..efc8e254787c 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -112,7 +112,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
112 mapping->flags = 0; 112 mapping->flags = 0;
113 mapping_set_gfp_mask(mapping, GFP_NOFS); 113 mapping_set_gfp_mask(mapping, GFP_NOFS);
114 mapping->private_data = NULL; 114 mapping->private_data = NULL;
115 mapping->backing_dev_info = sb->s_bdi;
116 mapping->writeback_index = 0; 115 mapping->writeback_index = 0;
117 116
118 spin_lock_init(&sdp->sd_log_lock); 117 spin_lock_init(&sdp->sd_log_lock);
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index c8b148bbdc8b..3aa17d4d1cfc 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -145,7 +145,8 @@ static void gfs2_qd_dispose(struct list_head *list)
145} 145}
146 146
147 147
148static enum lru_status gfs2_qd_isolate(struct list_head *item, spinlock_t *lock, void *arg) 148static enum lru_status gfs2_qd_isolate(struct list_head *item,
149 struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
149{ 150{
150 struct list_head *dispose = arg; 151 struct list_head *dispose = arg;
151 struct gfs2_quota_data *qd = list_entry(item, struct gfs2_quota_data, qd_lru); 152 struct gfs2_quota_data *qd = list_entry(item, struct gfs2_quota_data, qd_lru);
@@ -155,7 +156,7 @@ static enum lru_status gfs2_qd_isolate(struct list_head *item, spinlock_t *lock,
155 156
156 if (qd->qd_lockref.count == 0) { 157 if (qd->qd_lockref.count == 0) {
157 lockref_mark_dead(&qd->qd_lockref); 158 lockref_mark_dead(&qd->qd_lockref);
158 list_move(&qd->qd_lru, dispose); 159 list_lru_isolate_move(lru, &qd->qd_lru, dispose);
159 } 160 }
160 161
161 spin_unlock(&qd->qd_lockref.lock); 162 spin_unlock(&qd->qd_lockref.lock);
@@ -171,8 +172,8 @@ static unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink,
171 if (!(sc->gfp_mask & __GFP_FS)) 172 if (!(sc->gfp_mask & __GFP_FS))
172 return SHRINK_STOP; 173 return SHRINK_STOP;
173 174
174 freed = list_lru_walk_node(&gfs2_qd_lru, sc->nid, gfs2_qd_isolate, 175 freed = list_lru_shrink_walk(&gfs2_qd_lru, sc,
175 &dispose, &sc->nr_to_scan); 176 gfs2_qd_isolate, &dispose);
176 177
177 gfs2_qd_dispose(&dispose); 178 gfs2_qd_dispose(&dispose);
178 179
@@ -182,7 +183,7 @@ static unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink,
182static unsigned long gfs2_qd_shrink_count(struct shrinker *shrink, 183static unsigned long gfs2_qd_shrink_count(struct shrinker *shrink,
183 struct shrink_control *sc) 184 struct shrink_control *sc)
184{ 185{
185 return vfs_pressure_ratio(list_lru_count_node(&gfs2_qd_lru, sc->nid)); 186 return vfs_pressure_ratio(list_lru_shrink_count(&gfs2_qd_lru, sc));
186} 187}
187 188
188struct shrinker gfs2_qd_shrinker = { 189struct shrinker gfs2_qd_shrinker = {
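
This follows the list_lru API rework: walk callbacks now receive the per-node (and per-memcg) struct list_lru_one and must detach items through list_lru_isolate()/list_lru_isolate_move() so the internal counters stay balanced, while shrinkers hand their shrink_control straight to list_lru_shrink_count()/list_lru_shrink_walk(). A minimal isolate callback under the new signature (sketch):

	static enum lru_status my_isolate(struct list_head *item,
			struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
	{
		struct list_head *dispose = arg;

		/* trylock/refcount checks on the owning object go here */
		list_lru_isolate_move(lru, item, dispose);	/* not list_move() */
		return LRU_REMOVED;
	}
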
@@ -667,7 +668,7 @@ static void do_qc(struct gfs2_quota_data *qd, s64 change)
667 668
668static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, 669static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
669 s64 change, struct gfs2_quota_data *qd, 670 s64 change, struct gfs2_quota_data *qd,
670 struct fs_disk_quota *fdq) 671 struct qc_dqblk *fdq)
671{ 672{
672 struct inode *inode = &ip->i_inode; 673 struct inode *inode = &ip->i_inode;
673 struct gfs2_sbd *sdp = GFS2_SB(inode); 674 struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -697,16 +698,16 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
697 be64_add_cpu(&q.qu_value, change); 698 be64_add_cpu(&q.qu_value, change);
698 qd->qd_qb.qb_value = q.qu_value; 699 qd->qd_qb.qb_value = q.qu_value;
699 if (fdq) { 700 if (fdq) {
700 if (fdq->d_fieldmask & FS_DQ_BSOFT) { 701 if (fdq->d_fieldmask & QC_SPC_SOFT) {
701 q.qu_warn = cpu_to_be64(fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift); 702 q.qu_warn = cpu_to_be64(fdq->d_spc_softlimit >> sdp->sd_sb.sb_bsize_shift);
702 qd->qd_qb.qb_warn = q.qu_warn; 703 qd->qd_qb.qb_warn = q.qu_warn;
703 } 704 }
704 if (fdq->d_fieldmask & FS_DQ_BHARD) { 705 if (fdq->d_fieldmask & QC_SPC_HARD) {
705 q.qu_limit = cpu_to_be64(fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift); 706 q.qu_limit = cpu_to_be64(fdq->d_spc_hardlimit >> sdp->sd_sb.sb_bsize_shift);
706 qd->qd_qb.qb_limit = q.qu_limit; 707 qd->qd_qb.qb_limit = q.qu_limit;
707 } 708 }
708 if (fdq->d_fieldmask & FS_DQ_BCOUNT) { 709 if (fdq->d_fieldmask & QC_SPACE) {
709 q.qu_value = cpu_to_be64(fdq->d_bcount >> sdp->sd_fsb2bb_shift); 710 q.qu_value = cpu_to_be64(fdq->d_space >> sdp->sd_sb.sb_bsize_shift);
710 qd->qd_qb.qb_value = q.qu_value; 711 qd->qd_qb.qb_value = q.qu_value;
711 } 712 }
712 } 713 }
@@ -1497,7 +1498,7 @@ static int gfs2_quota_get_xstate(struct super_block *sb,
1497} 1498}
1498 1499
1499static int gfs2_get_dqblk(struct super_block *sb, struct kqid qid, 1500static int gfs2_get_dqblk(struct super_block *sb, struct kqid qid,
1500 struct fs_disk_quota *fdq) 1501 struct qc_dqblk *fdq)
1501{ 1502{
1502 struct gfs2_sbd *sdp = sb->s_fs_info; 1503 struct gfs2_sbd *sdp = sb->s_fs_info;
1503 struct gfs2_quota_lvb *qlvb; 1504 struct gfs2_quota_lvb *qlvb;
@@ -1505,7 +1506,7 @@ static int gfs2_get_dqblk(struct super_block *sb, struct kqid qid,
1505 struct gfs2_holder q_gh; 1506 struct gfs2_holder q_gh;
1506 int error; 1507 int error;
1507 1508
1508 memset(fdq, 0, sizeof(struct fs_disk_quota)); 1509 memset(fdq, 0, sizeof(*fdq));
1509 1510
1510 if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) 1511 if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
1511 return -ESRCH; /* Crazy XFS error code */ 1512 return -ESRCH; /* Crazy XFS error code */
@@ -1522,12 +1523,9 @@ static int gfs2_get_dqblk(struct super_block *sb, struct kqid qid,
1522 goto out; 1523 goto out;
1523 1524
1524 qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr; 1525 qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr;
1525 fdq->d_version = FS_DQUOT_VERSION; 1526 fdq->d_spc_hardlimit = be64_to_cpu(qlvb->qb_limit) << sdp->sd_sb.sb_bsize_shift;
1526 fdq->d_flags = (qid.type == USRQUOTA) ? FS_USER_QUOTA : FS_GROUP_QUOTA; 1527 fdq->d_spc_softlimit = be64_to_cpu(qlvb->qb_warn) << sdp->sd_sb.sb_bsize_shift;
1527 fdq->d_id = from_kqid_munged(current_user_ns(), qid); 1528 fdq->d_space = be64_to_cpu(qlvb->qb_value) << sdp->sd_sb.sb_bsize_shift;
1528 fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit) << sdp->sd_fsb2bb_shift;
1529 fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn) << sdp->sd_fsb2bb_shift;
1530 fdq->d_bcount = be64_to_cpu(qlvb->qb_value) << sdp->sd_fsb2bb_shift;
1531 1529
1532 gfs2_glock_dq_uninit(&q_gh); 1530 gfs2_glock_dq_uninit(&q_gh);
1533out: 1531out:
@@ -1536,10 +1534,10 @@ out:
1536} 1534}
1537 1535
1538/* GFS2 only supports a subset of the XFS fields */ 1536/* GFS2 only supports a subset of the XFS fields */
1539#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD|FS_DQ_BCOUNT) 1537#define GFS2_FIELDMASK (QC_SPC_SOFT|QC_SPC_HARD|QC_SPACE)
1540 1538
1541static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid, 1539static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid,
1542 struct fs_disk_quota *fdq) 1540 struct qc_dqblk *fdq)
1543{ 1541{
1544 struct gfs2_sbd *sdp = sb->s_fs_info; 1542 struct gfs2_sbd *sdp = sb->s_fs_info;
1545 struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); 1543 struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
@@ -1583,17 +1581,17 @@ static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid,
1583 goto out_i; 1581 goto out_i;
1584 1582
1585 /* If nothing has changed, this is a no-op */ 1583 /* If nothing has changed, this is a no-op */
1586 if ((fdq->d_fieldmask & FS_DQ_BSOFT) && 1584 if ((fdq->d_fieldmask & QC_SPC_SOFT) &&
1587 ((fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_warn))) 1585 ((fdq->d_spc_softlimit >> sdp->sd_sb.sb_bsize_shift) == be64_to_cpu(qd->qd_qb.qb_warn)))
1588 fdq->d_fieldmask ^= FS_DQ_BSOFT; 1586 fdq->d_fieldmask ^= QC_SPC_SOFT;
1589 1587
1590 if ((fdq->d_fieldmask & FS_DQ_BHARD) && 1588 if ((fdq->d_fieldmask & QC_SPC_HARD) &&
1591 ((fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_limit))) 1589 ((fdq->d_spc_hardlimit >> sdp->sd_sb.sb_bsize_shift) == be64_to_cpu(qd->qd_qb.qb_limit)))
1592 fdq->d_fieldmask ^= FS_DQ_BHARD; 1590 fdq->d_fieldmask ^= QC_SPC_HARD;
1593 1591
1594 if ((fdq->d_fieldmask & FS_DQ_BCOUNT) && 1592 if ((fdq->d_fieldmask & QC_SPACE) &&
1595 ((fdq->d_bcount >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_value))) 1593 ((fdq->d_space >> sdp->sd_sb.sb_bsize_shift) == be64_to_cpu(qd->qd_qb.qb_value)))
1596 fdq->d_fieldmask ^= FS_DQ_BCOUNT; 1594 fdq->d_fieldmask ^= QC_SPACE;
1597 1595
1598 if (fdq->d_fieldmask == 0) 1596 if (fdq->d_fieldmask == 0)
1599 goto out_i; 1597 goto out_i;
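
The quotactl conversion swaps struct fs_disk_quota (XFS-defined, limits in 512-byte basic blocks) for the VFS-neutral struct qc_dqblk, whose QC_SPC_*/QC_SPACE fields carry plain bytes; hence the shifts change from sd_fsb2bb_shift to the filesystem block-size shift. The unit change in miniature (helper names are illustrative):

	/* bytes <-> fs blocks is now a single shift by the block-size log2 */
	static u64 bytes_to_fs_blocks(u64 bytes, unsigned int bsize_shift)
	{
		return bytes >> bsize_shift;	/* e.g. 1 MiB >> 12 = 256 4 KiB blocks */
	}

	static u64 fs_blocks_to_bytes(u64 blocks, unsigned int bsize_shift)
	{
		return blocks << bsize_shift;
	}
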
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 573bd3b758fa..1b645773c98e 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -439,7 +439,7 @@ static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
439 439
440 ls->ls_recover_jid_done = jid; 440 ls->ls_recover_jid_done = jid;
441 ls->ls_recover_jid_status = message; 441 ls->ls_recover_jid_status = message;
442 sprintf(env_jid, "JID=%d", jid); 442 sprintf(env_jid, "JID=%u", jid);
443 sprintf(env_status, "RECOVERY=%s", 443 sprintf(env_status, "RECOVERY=%s",
444 message == LM_RD_SUCCESS ? "Done" : "Failed"); 444 message == LM_RD_SUCCESS ? "Done" : "Failed");
445 kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp); 445 kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 5b327f837de7..1666382b198d 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -743,7 +743,7 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc)
743 struct gfs2_inode *ip = GFS2_I(inode); 743 struct gfs2_inode *ip = GFS2_I(inode);
744 struct gfs2_sbd *sdp = GFS2_SB(inode); 744 struct gfs2_sbd *sdp = GFS2_SB(inode);
745 struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl); 745 struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl);
746 struct backing_dev_info *bdi = metamapping->backing_dev_info; 746 struct backing_dev_info *bdi = inode_to_bdi(metamapping->host);
747 int ret = 0; 747 int ret = 0;
748 748
749 if (wbc->sync_mode == WB_SYNC_ALL) 749 if (wbc->sync_mode == WB_SYNC_ALL)
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 3ab566ba5696..ae8e8811f0e8 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -96,7 +96,7 @@ static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf)
96 struct super_block *sb = sdp->sd_vfs; 96 struct super_block *sb = sdp->sd_vfs;
97 int frozen = (sb->s_writers.frozen == SB_UNFROZEN) ? 0 : 1; 97 int frozen = (sb->s_writers.frozen == SB_UNFROZEN) ? 0 : 1;
98 98
99 return snprintf(buf, PAGE_SIZE, "%u\n", frozen); 99 return snprintf(buf, PAGE_SIZE, "%d\n", frozen);
100} 100}
101 101
102static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len) 102static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 5eba47f593f8..c274aca8e8dc 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -62,12 +62,6 @@ static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode)
62 return container_of(inode, struct hugetlbfs_inode_info, vfs_inode); 62 return container_of(inode, struct hugetlbfs_inode_info, vfs_inode);
63} 63}
64 64
65static struct backing_dev_info hugetlbfs_backing_dev_info = {
66 .name = "hugetlbfs",
67 .ra_pages = 0, /* No readahead */
68 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
69};
70
71int sysctl_hugetlb_shm_group; 65int sysctl_hugetlb_shm_group;
72 66
73enum { 67enum {
@@ -498,7 +492,6 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
498 lockdep_set_class(&inode->i_mapping->i_mmap_rwsem, 492 lockdep_set_class(&inode->i_mapping->i_mmap_rwsem,
499 &hugetlbfs_i_mmap_rwsem_key); 493 &hugetlbfs_i_mmap_rwsem_key);
500 inode->i_mapping->a_ops = &hugetlbfs_aops; 494 inode->i_mapping->a_ops = &hugetlbfs_aops;
501 inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
502 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 495 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
503 inode->i_mapping->private_data = resv_map; 496 inode->i_mapping->private_data = resv_map;
504 info = HUGETLBFS_I(inode); 497 info = HUGETLBFS_I(inode);
@@ -1032,10 +1025,6 @@ static int __init init_hugetlbfs_fs(void)
1032 return -ENOTSUPP; 1025 return -ENOTSUPP;
1033 } 1026 }
1034 1027
1035 error = bdi_init(&hugetlbfs_backing_dev_info);
1036 if (error)
1037 return error;
1038
1039 error = -ENOMEM; 1028 error = -ENOMEM;
1040 hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache", 1029 hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache",
1041 sizeof(struct hugetlbfs_inode_info), 1030 sizeof(struct hugetlbfs_inode_info),
@@ -1071,7 +1060,6 @@ static int __init init_hugetlbfs_fs(void)
1071 out: 1060 out:
1072 kmem_cache_destroy(hugetlbfs_inode_cachep); 1061 kmem_cache_destroy(hugetlbfs_inode_cachep);
1073 out2: 1062 out2:
1074 bdi_destroy(&hugetlbfs_backing_dev_info);
1075 return error; 1063 return error;
1076} 1064}
1077 1065
@@ -1091,7 +1079,6 @@ static void __exit exit_hugetlbfs_fs(void)
1091 for_each_hstate(h) 1079 for_each_hstate(h)
1092 kern_unmount(hugetlbfs_vfsmount[i++]); 1080 kern_unmount(hugetlbfs_vfsmount[i++]);
1093 unregister_filesystem(&hugetlbfs_fs_type); 1081 unregister_filesystem(&hugetlbfs_fs_type);
1094 bdi_destroy(&hugetlbfs_backing_dev_info);
1095} 1082}
1096 1083
1097module_init(init_hugetlbfs_fs) 1084module_init(init_hugetlbfs_fs)
diff --git a/fs/inode.c b/fs/inode.c
index aa149e7262ac..f00b16f45507 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -18,6 +18,7 @@
18#include <linux/buffer_head.h> /* for inode_has_buffers */ 18#include <linux/buffer_head.h> /* for inode_has_buffers */
19#include <linux/ratelimit.h> 19#include <linux/ratelimit.h>
20#include <linux/list_lru.h> 20#include <linux/list_lru.h>
21#include <trace/events/writeback.h>
21#include "internal.h" 22#include "internal.h"
22 23
23/* 24/*
@@ -30,7 +31,7 @@
30 * inode_sb_list_lock protects: 31 * inode_sb_list_lock protects:
31 * sb->s_inodes, inode->i_sb_list 32 * sb->s_inodes, inode->i_sb_list
32 * bdi->wb.list_lock protects: 33 * bdi->wb.list_lock protects:
33 * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list 34 * bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_wb_list
34 * inode_hash_lock protects: 35 * inode_hash_lock protects:
35 * inode_hashtable, inode->i_hash 36 * inode_hashtable, inode->i_hash
36 * 37 *
@@ -170,20 +171,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
170 atomic_set(&mapping->i_mmap_writable, 0); 171 atomic_set(&mapping->i_mmap_writable, 0);
171 mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); 172 mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
172 mapping->private_data = NULL; 173 mapping->private_data = NULL;
173 mapping->backing_dev_info = &default_backing_dev_info;
174 mapping->writeback_index = 0; 174 mapping->writeback_index = 0;
175
176 /*
177 * If the block_device provides a backing_dev_info for client
178 * inodes then use that. Otherwise the inode shares the bdev's
179 * backing_dev_info.
180 */
181 if (sb->s_bdev) {
182 struct backing_dev_info *bdi;
183
184 bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
185 mapping->backing_dev_info = bdi;
186 }
187 inode->i_private = NULL; 175 inode->i_private = NULL;
188 inode->i_mapping = mapping; 176 inode->i_mapping = mapping;
189 INIT_HLIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */ 177 INIT_HLIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */
@@ -194,7 +182,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
194#ifdef CONFIG_FSNOTIFY 182#ifdef CONFIG_FSNOTIFY
195 inode->i_fsnotify_mask = 0; 183 inode->i_fsnotify_mask = 0;
196#endif 184#endif
197 185 inode->i_flctx = NULL;
198 this_cpu_inc(nr_inodes); 186 this_cpu_inc(nr_inodes);
199 187
200 return 0; 188 return 0;
@@ -237,6 +225,7 @@ void __destroy_inode(struct inode *inode)
237 BUG_ON(inode_has_buffers(inode)); 225 BUG_ON(inode_has_buffers(inode));
238 security_inode_free(inode); 226 security_inode_free(inode);
239 fsnotify_inode_delete(inode); 227 fsnotify_inode_delete(inode);
228 locks_free_lock_context(inode->i_flctx);
240 if (!inode->i_nlink) { 229 if (!inode->i_nlink) {
241 WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0); 230 WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0);
242 atomic_long_dec(&inode->i_sb->s_remove_count); 231 atomic_long_dec(&inode->i_sb->s_remove_count);
@@ -355,7 +344,6 @@ void address_space_init_once(struct address_space *mapping)
355 INIT_LIST_HEAD(&mapping->private_list); 344 INIT_LIST_HEAD(&mapping->private_list);
356 spin_lock_init(&mapping->private_lock); 345 spin_lock_init(&mapping->private_lock);
357 mapping->i_mmap = RB_ROOT; 346 mapping->i_mmap = RB_ROOT;
358 INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
359} 347}
360EXPORT_SYMBOL(address_space_init_once); 348EXPORT_SYMBOL(address_space_init_once);
361 349
@@ -416,7 +404,8 @@ static void inode_lru_list_add(struct inode *inode)
416 */ 404 */
417void inode_add_lru(struct inode *inode) 405void inode_add_lru(struct inode *inode)
418{ 406{
419 if (!(inode->i_state & (I_DIRTY | I_SYNC | I_FREEING | I_WILL_FREE)) && 407 if (!(inode->i_state & (I_DIRTY_ALL | I_SYNC |
408 I_FREEING | I_WILL_FREE)) &&
420 !atomic_read(&inode->i_count) && inode->i_sb->s_flags & MS_ACTIVE) 409 !atomic_read(&inode->i_count) && inode->i_sb->s_flags & MS_ACTIVE)
421 inode_lru_list_add(inode); 410 inode_lru_list_add(inode);
422} 411}
@@ -647,7 +636,7 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
647 spin_unlock(&inode->i_lock); 636 spin_unlock(&inode->i_lock);
648 continue; 637 continue;
649 } 638 }
650 if (inode->i_state & I_DIRTY && !kill_dirty) { 639 if (inode->i_state & I_DIRTY_ALL && !kill_dirty) {
651 spin_unlock(&inode->i_lock); 640 spin_unlock(&inode->i_lock);
652 busy = 1; 641 busy = 1;
653 continue; 642 continue;
@@ -685,8 +674,8 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
685 * LRU does not have strict ordering. Hence we don't want to reclaim inodes 674 * LRU does not have strict ordering. Hence we don't want to reclaim inodes
686 * with this flag set because they are the inodes that are out of order. 675 * with this flag set because they are the inodes that are out of order.
687 */ 676 */
688static enum lru_status 677static enum lru_status inode_lru_isolate(struct list_head *item,
689inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) 678 struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
690{ 679{
691 struct list_head *freeable = arg; 680 struct list_head *freeable = arg;
692 struct inode *inode = container_of(item, struct inode, i_lru); 681 struct inode *inode = container_of(item, struct inode, i_lru);
@@ -704,7 +693,7 @@ inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
704 */ 693 */
705 if (atomic_read(&inode->i_count) || 694 if (atomic_read(&inode->i_count) ||
706 (inode->i_state & ~I_REFERENCED)) { 695 (inode->i_state & ~I_REFERENCED)) {
707 list_del_init(&inode->i_lru); 696 list_lru_isolate(lru, &inode->i_lru);
708 spin_unlock(&inode->i_lock); 697 spin_unlock(&inode->i_lock);
709 this_cpu_dec(nr_unused); 698 this_cpu_dec(nr_unused);
710 return LRU_REMOVED; 699 return LRU_REMOVED;
@@ -738,7 +727,7 @@ inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
738 727
739 WARN_ON(inode->i_state & I_NEW); 728 WARN_ON(inode->i_state & I_NEW);
740 inode->i_state |= I_FREEING; 729 inode->i_state |= I_FREEING;
741 list_move(&inode->i_lru, freeable); 730 list_lru_isolate_move(lru, &inode->i_lru, freeable);
742 spin_unlock(&inode->i_lock); 731 spin_unlock(&inode->i_lock);
743 732
744 this_cpu_dec(nr_unused); 733 this_cpu_dec(nr_unused);
@@ -751,14 +740,13 @@ inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
751 * to trim from the LRU. Inodes to be freed are moved to a temporary list and 740 * to trim from the LRU. Inodes to be freed are moved to a temporary list and
752 * then are freed outside inode_lock by dispose_list(). 741 * then are freed outside inode_lock by dispose_list().
753 */ 742 */
754long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan, 743long prune_icache_sb(struct super_block *sb, struct shrink_control *sc)
755 int nid)
756{ 744{
757 LIST_HEAD(freeable); 745 LIST_HEAD(freeable);
758 long freed; 746 long freed;
759 747
760 freed = list_lru_walk_node(&sb->s_inode_lru, nid, inode_lru_isolate, 748 freed = list_lru_shrink_walk(&sb->s_inode_lru, sc,
761 &freeable, &nr_to_scan); 749 inode_lru_isolate, &freeable);
762 dispose_list(&freeable); 750 dispose_list(&freeable);
763 return freed; 751 return freed;
764} 752}
@@ -1282,6 +1270,56 @@ struct inode *ilookup(struct super_block *sb, unsigned long ino)
1282} 1270}
1283EXPORT_SYMBOL(ilookup); 1271EXPORT_SYMBOL(ilookup);
1284 1272
1273/**
1274 * find_inode_nowait - find an inode in the inode cache
1275 * @sb: super block of file system to search
1276 * @hashval: hash value (usually inode number) to search for
1277 * @match: callback used for comparisons between inodes
1278 * @data: opaque data pointer to pass to @match
1279 *
1280 * Search for the inode specified by @hashval and @data in the inode
1281 * cache, where the helper function @match will return 0 if the inode
1282 * does not match, 1 if the inode does match, and -1 if the search
1283 * should be stopped. The @match function must be responsible for
1284 * taking the i_lock spin_lock and checking i_state for an inode being
1285 * freed or being initialized, and incrementing the reference count
1286 * before returning 1. It also must not sleep, since it is called with
1287 * the inode_hash_lock spinlock held.
1288 *
 1289 * This is an even more generalized version of ilookup5() when the
 1290 * function must never block --- find_inode() can block in
 1291 * __wait_on_freeing_inode() --- or when the caller cannot increment
 1292 * the reference count because the resulting iput() might cause an
 1293 * inode eviction. The tradeoff is that the @match function must be
1294 * very carefully implemented.
1295 */
1296struct inode *find_inode_nowait(struct super_block *sb,
1297 unsigned long hashval,
1298 int (*match)(struct inode *, unsigned long,
1299 void *),
1300 void *data)
1301{
1302 struct hlist_head *head = inode_hashtable + hash(sb, hashval);
1303 struct inode *inode, *ret_inode = NULL;
1304 int mval;
1305
1306 spin_lock(&inode_hash_lock);
1307 hlist_for_each_entry(inode, head, i_hash) {
1308 if (inode->i_sb != sb)
1309 continue;
1310 mval = match(inode, hashval, data);
1311 if (mval == 0)
1312 continue;
1313 if (mval == 1)
1314 ret_inode = inode;
1315 goto out;
1316 }
1317out:
1318 spin_unlock(&inode_hash_lock);
1319 return ret_inode;
1320}
1321EXPORT_SYMBOL(find_inode_nowait);
1322
1285int insert_inode_locked(struct inode *inode) 1323int insert_inode_locked(struct inode *inode)
1286{ 1324{
1287 struct super_block *sb = inode->i_sb; 1325 struct super_block *sb = inode->i_sb;
@@ -1432,11 +1470,20 @@ static void iput_final(struct inode *inode)
1432 */ 1470 */
1433void iput(struct inode *inode) 1471void iput(struct inode *inode)
1434{ 1472{
1435 if (inode) { 1473 if (!inode)
1436 BUG_ON(inode->i_state & I_CLEAR); 1474 return;
1437 1475 BUG_ON(inode->i_state & I_CLEAR);
1438 if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) 1476retry:
1439 iput_final(inode); 1477 if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) {
1478 if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) {
1479 atomic_inc(&inode->i_count);
1480 inode->i_state &= ~I_DIRTY_TIME;
1481 spin_unlock(&inode->i_lock);
1482 trace_writeback_lazytime_iput(inode);
1483 mark_inode_dirty_sync(inode);
1484 goto retry;
1485 }
1486 iput_final(inode);
1440 } 1487 }
1441} 1488}
1442EXPORT_SYMBOL(iput); 1489EXPORT_SYMBOL(iput);
@@ -1495,14 +1542,9 @@ static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
1495 return 0; 1542 return 0;
1496} 1543}
1497 1544
1498/* 1545int generic_update_time(struct inode *inode, struct timespec *time, int flags)
1499 * This does the actual work of updating an inodes time or version. Must have
1500 * had called mnt_want_write() before calling this.
1501 */
1502static int update_time(struct inode *inode, struct timespec *time, int flags)
1503{ 1546{
1504 if (inode->i_op->update_time) 1547 int iflags = I_DIRTY_TIME;
1505 return inode->i_op->update_time(inode, time, flags);
1506 1548
1507 if (flags & S_ATIME) 1549 if (flags & S_ATIME)
1508 inode->i_atime = *time; 1550 inode->i_atime = *time;
@@ -1512,9 +1554,27 @@ static int update_time(struct inode *inode, struct timespec *time, int flags)
1512 inode->i_ctime = *time; 1554 inode->i_ctime = *time;
1513 if (flags & S_MTIME) 1555 if (flags & S_MTIME)
1514 inode->i_mtime = *time; 1556 inode->i_mtime = *time;
1515 mark_inode_dirty_sync(inode); 1557
1558 if (!(inode->i_sb->s_flags & MS_LAZYTIME) || (flags & S_VERSION))
1559 iflags |= I_DIRTY_SYNC;
1560 __mark_inode_dirty(inode, iflags);
1516 return 0; 1561 return 0;
1517} 1562}
1563EXPORT_SYMBOL(generic_update_time);
1564
1565/*
 1566 * This does the actual work of updating an inode's time or version. Must
 1567 * have called mnt_want_write() before calling this.
1568 */
1569static int update_time(struct inode *inode, struct timespec *time, int flags)
1570{
1571 int (*update_time)(struct inode *, struct timespec *, int);
1572
1573 update_time = inode->i_op->update_time ? inode->i_op->update_time :
1574 generic_update_time;
1575
1576 return update_time(inode, time, flags);
1577}
1518 1578
1519/** 1579/**
1520 * touch_atime - update the access time 1580 * touch_atime - update the access time
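
With generic_update_time() exported, update_time() becomes a pure dispatcher: a filesystem that only needs extra bookkeeping can now wrap the generic helper from its own ->update_time instead of duplicating the timestamp stores. A hypothetical example:

	static int myfs_update_time(struct inode *inode, struct timespec *time,
				    int flags)
	{
		/* filesystem-specific work (journaling, notification, ...) here */
		return generic_update_time(inode, time, flags);
	}
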
diff --git a/fs/internal.h b/fs/internal.h
index e9a61fe67575..30459dab409d 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -14,6 +14,7 @@ struct file_system_type;
14struct linux_binprm; 14struct linux_binprm;
15struct path; 15struct path;
16struct mount; 16struct mount;
17struct shrink_control;
17 18
18/* 19/*
19 * block_dev.c 20 * block_dev.c
@@ -111,8 +112,7 @@ extern int open_check_o_direct(struct file *f);
111 * inode.c 112 * inode.c
112 */ 113 */
113extern spinlock_t inode_sb_list_lock; 114extern spinlock_t inode_sb_list_lock;
114extern long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan, 115extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc);
115 int nid);
116extern void inode_add_lru(struct inode *inode); 116extern void inode_add_lru(struct inode *inode);
117 117
118/* 118/*
@@ -129,8 +129,7 @@ extern int invalidate_inodes(struct super_block *, bool);
129 */ 129 */
130extern struct dentry *__d_alloc(struct super_block *, const struct qstr *); 130extern struct dentry *__d_alloc(struct super_block *, const struct qstr *);
131extern int d_set_mounted(struct dentry *dentry); 131extern int d_set_mounted(struct dentry *dentry);
132extern long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan, 132extern long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc);
133 int nid);
134 133
135/* 134/*
136 * read_write.c 135 * read_write.c
@@ -145,7 +144,7 @@ extern const struct file_operations pipefifo_fops;
145/* 144/*
146 * fs_pin.c 145 * fs_pin.c
147 */ 146 */
148extern void sb_pin_kill(struct super_block *sb); 147extern void group_pin_kill(struct hlist_head *p);
149extern void mnt_pin_kill(struct mount *m); 148extern void mnt_pin_kill(struct mount *m);
150 149
151/* 150/*
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 214c3c11fbc2..5d01d2638ca5 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -379,6 +379,11 @@ int __generic_block_fiemap(struct inode *inode,
379 past_eof = true; 379 past_eof = true;
380 } 380 }
381 cond_resched(); 381 cond_resched();
382 if (fatal_signal_pending(current)) {
383 ret = -EINTR;
384 break;
385 }
386
382 } while (1); 387 } while (1);
383 388
384 /* If ret is 1 then we just hit the end of the extent array */ 389 /* If ret is 1 then we just hit the end of the extent array */
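
The fiemap loop gains the standard long-loop courtesy pair: cond_resched() to yield the CPU, plus a fatal_signal_pending() check so a killed task does not grind through the whole extent array. The shape in isolation (more_work/do_one_extent are placeholders):

	while (more_work) {
		do_one_extent();
		cond_resched();			/* give the scheduler a chance */
		if (fatal_signal_pending(current)) {
			ret = -EINTR;		/* unwind; the task is dying anyway */
			break;
		}
	}
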
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index bb63254ed848..735d7522a3a9 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -362,6 +362,9 @@ repeat:
362 rs.cont_size = isonum_733(rr->u.CE.size); 362 rs.cont_size = isonum_733(rr->u.CE.size);
363 break; 363 break;
364 case SIG('E', 'R'): 364 case SIG('E', 'R'):
365 /* Invalid length of ER tag id? */
366 if (rr->u.ER.len_id + offsetof(struct rock_ridge, u.ER.data) > rr->len)
367 goto out;
365 ISOFS_SB(inode->i_sb)->s_rock = 1; 368 ISOFS_SB(inode->i_sb)->s_rock = 1;
366 printk(KERN_DEBUG "ISO 9660 Extensions: "); 369 printk(KERN_DEBUG "ISO 9660 Extensions: ");
367 { 370 {
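
The ER record carries its own id-length byte; the new check rejects a record whose claimed id would extend past the record's overall length, the standard guard before parsing attacker-controlled variable-length fields. Generic shape (names are illustrative, not isofs code):

	/* reject a record whose variable part overruns its own length field */
	if (rec->id_len + offsetof(struct record, data) > rec->len)
		goto bad_record;
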
diff --git a/fs/isofs/util.c b/fs/isofs/util.c
index 01e1ee7a998b..005a15cfd30a 100644
--- a/fs/isofs/util.c
+++ b/fs/isofs/util.c
@@ -2,6 +2,7 @@
2 * linux/fs/isofs/util.c 2 * linux/fs/isofs/util.c
3 */ 3 */
4 4
5#include <linux/time.h>
5#include "isofs.h" 6#include "isofs.h"
6 7
7/* 8/*
@@ -17,9 +18,9 @@
17int iso_date(char * p, int flag) 18int iso_date(char * p, int flag)
18{ 19{
19 int year, month, day, hour, minute, second, tz; 20 int year, month, day, hour, minute, second, tz;
20 int crtime, days, i; 21 int crtime;
21 22
22 year = p[0] - 70; 23 year = p[0];
23 month = p[1]; 24 month = p[1];
24 day = p[2]; 25 day = p[2];
25 hour = p[3]; 26 hour = p[3];
@@ -31,18 +32,7 @@ int iso_date(char * p, int flag)
31 if (year < 0) { 32 if (year < 0) {
32 crtime = 0; 33 crtime = 0;
33 } else { 34 } else {
34 int monlen[12] = {31,28,31,30,31,30,31,31,30,31,30,31}; 35 crtime = mktime64(year+1900, month, day, hour, minute, second);
35
36 days = year * 365;
37 if (year > 2)
38 days += (year+1) / 4;
39 for (i = 1; i < month; i++)
40 days += monlen[i-1];
41 if (((year+2) % 4) == 0 && month > 2)
42 days++;
43 days += day - 1;
44 crtime = ((((days * 24) + hour) * 60 + minute) * 60)
45 + second;
46 36
47 /* sign extend */ 37 /* sign extend */
48 if (tz & 0x80) 38 if (tz & 0x80)
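
The hand-rolled day arithmetic (with its own leap-year handling) is replaced by mktime64(), which turns a broken-down civil date into seconds since the Unix epoch; the year is now passed through raw and rebased with +1900 at the call site, since ISO 9660 stores years as offsets from 1900. Usage in miniature:

	static time64_t example(void)
	{
		/* 2015-02-11 12:00:00 UTC; the month argument is 1-based */
		return mktime64(2015, 2, 11, 12, 0, 0);
	}
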
diff --git a/fs/jffs2/compr_rubin.c b/fs/jffs2/compr_rubin.c
index 92e0644bf867..556de100ebd5 100644
--- a/fs/jffs2/compr_rubin.c
+++ b/fs/jffs2/compr_rubin.c
@@ -84,11 +84,6 @@ static inline int pullbit(struct pushpull *pp)
84 return bit; 84 return bit;
85} 85}
86 86
87static inline int pulledbits(struct pushpull *pp)
88{
89 return pp->ofs;
90}
91
92 87
93static void init_rubin(struct rubin_state *rs, int div, int *bits) 88static void init_rubin(struct rubin_state *rs, int div, int *bits)
94{ 89{
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 7654e87b0428..9ad5ba4b299b 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -510,6 +510,10 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo
510 sumlen = c->sector_size - je32_to_cpu(sm->offset); 510 sumlen = c->sector_size - je32_to_cpu(sm->offset);
511 sumptr = buf + buf_size - sumlen; 511 sumptr = buf + buf_size - sumlen;
512 512
 513 /* sm->offset may be wrong but MAGIC may be right */
514 if (sumlen > c->sector_size)
515 goto full_scan;
516
513 /* Now, make sure the summary itself is available */ 517 /* Now, make sure the summary itself is available */
514 if (sumlen > buf_size) { 518 if (sumlen > buf_size) {
515 /* Need to kmalloc for this. */ 519 /* Need to kmalloc for this. */
@@ -544,6 +548,7 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo
544 } 548 }
545 } 549 }
546 550
551full_scan:
547 buf_ofs = jeb->offset; 552 buf_ofs = jeb->offset;
548 553
549 if (!buf_size) { 554 if (!buf_size) {
diff --git a/fs/jfs/endian24.h b/fs/jfs/endian24.h
deleted file mode 100644
index fa92f7f1d0d0..000000000000
--- a/fs/jfs/endian24.h
+++ /dev/null
@@ -1,49 +0,0 @@
1/*
2 * Copyright (C) International Business Machines Corp., 2001
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18#ifndef _H_ENDIAN24
19#define _H_ENDIAN24
20
21/*
22 * endian24.h:
23 *
24 * Endian conversion for 24-byte data
25 *
26 */
27#define __swab24(x) \
28({ \
29 __u32 __x = (x); \
30 ((__u32)( \
31 ((__x & (__u32)0x000000ffUL) << 16) | \
32 (__x & (__u32)0x0000ff00UL) | \
33 ((__x & (__u32)0x00ff0000UL) >> 16) )); \
34})
35
36#if (defined(__KERNEL__) && defined(__LITTLE_ENDIAN)) || (defined(__BYTE_ORDER) && (__BYTE_ORDER == __LITTLE_ENDIAN))
37 #define __cpu_to_le24(x) ((__u32)(x))
38 #define __le24_to_cpu(x) ((__u32)(x))
39#else
40 #define __cpu_to_le24(x) __swab24(x)
41 #define __le24_to_cpu(x) __swab24(x)
42#endif
43
44#ifdef __KERNEL__
45 #define cpu_to_le24 __cpu_to_le24
46 #define le24_to_cpu __le24_to_cpu
47#endif
48
49#endif /* !_H_ENDIAN24 */
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 33aa0cc1f8b8..10815f8dfd8b 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -39,7 +39,7 @@ int jfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
39 return rc; 39 return rc;
40 40
41 mutex_lock(&inode->i_mutex); 41 mutex_lock(&inode->i_mutex);
42 if (!(inode->i_state & I_DIRTY) || 42 if (!(inode->i_state & I_DIRTY_ALL) ||
43 (datasync && !(inode->i_state & I_DIRTY_DATASYNC))) { 43 (datasync && !(inode->i_state & I_DIRTY_DATASYNC))) {
44 /* Make sure committed changes hit the disk */ 44 /* Make sure committed changes hit the disk */
45 jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1); 45 jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1);
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index 984c2bbf4f61..d88576e23fe4 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -1040,8 +1040,8 @@ static int dtSplitUp(tid_t tid,
1040 pxdlist.maxnpxd = 1; 1040 pxdlist.maxnpxd = 1;
1041 pxdlist.npxd = 0; 1041 pxdlist.npxd = 0;
1042 pxd = &pxdlist.pxd[0]; 1042 pxd = &pxdlist.pxd[0];
1043 PXDaddress(pxd, nxaddr) 1043 PXDaddress(pxd, nxaddr);
1044 PXDlength(pxd, xlen + n); 1044 PXDlength(pxd, xlen + n);
1045 split->pxdlist = &pxdlist; 1045 split->pxdlist = &pxdlist;
1046 if ((rc = dtExtendPage(tid, ip, split, btstack))) { 1046 if ((rc = dtExtendPage(tid, ip, split, btstack))) {
1047 nxaddr = addressPXD(pxd); 1047 nxaddr = addressPXD(pxd);
diff --git a/fs/jfs/jfs_types.h b/fs/jfs/jfs_types.h
index 43ea3713c083..8f602dcb51fa 100644
--- a/fs/jfs/jfs_types.h
+++ b/fs/jfs/jfs_types.h
@@ -30,8 +30,6 @@
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/nls.h> 31#include <linux/nls.h>
32 32
33#include "endian24.h"
34
35/* 33/*
36 * transaction and lock id's 34 * transaction and lock id's
37 * 35 *
@@ -59,26 +57,42 @@ struct timestruc_t {
59 57
60/* 58/*
61 * physical xd (pxd) 59 * physical xd (pxd)
60 *
61 * The leftmost 24 bits of len_addr are the extent length.
 62 * The rightmost 8 bits of len_addr are the most significant bits of
63 * the extent address
62 */ 64 */
63typedef struct { 65typedef struct {
64 unsigned len:24; 66 __le32 len_addr;
65 unsigned addr1:8;
66 __le32 addr2; 67 __le32 addr2;
67} pxd_t; 68} pxd_t;
68 69
69/* xd_t field construction */ 70/* xd_t field construction */
70 71
71#define PXDlength(pxd, length32) ((pxd)->len = __cpu_to_le24(length32)) 72static inline void PXDlength(pxd_t *pxd, __u32 len)
72#define PXDaddress(pxd, address64)\ 73{
73{\ 74 pxd->len_addr = (pxd->len_addr & cpu_to_le32(~0xffffff)) |
74 (pxd)->addr1 = ((s64)address64) >> 32;\ 75 cpu_to_le32(len & 0xffffff);
75 (pxd)->addr2 = __cpu_to_le32((address64) & 0xffffffff);\ 76}
77
78static inline void PXDaddress(pxd_t *pxd, __u64 addr)
79{
80 pxd->len_addr = (pxd->len_addr & cpu_to_le32(0xffffff)) |
81 cpu_to_le32((addr >> 32)<<24);
82 pxd->addr2 = cpu_to_le32(addr & 0xffffffff);
76} 83}
77 84
78/* xd_t field extraction */ 85/* xd_t field extraction */
79#define lengthPXD(pxd) __le24_to_cpu((pxd)->len) 86static inline __u32 lengthPXD(pxd_t *pxd)
80#define addressPXD(pxd)\ 87{
81 ( ((s64)((pxd)->addr1)) << 32 | __le32_to_cpu((pxd)->addr2)) 88 return le32_to_cpu((pxd)->len_addr) & 0xffffff;
89}
90
91static inline __u64 addressPXD(pxd_t *pxd)
92{
93 __u64 n = le32_to_cpu(pxd->len_addr) & ~0xffffff;
94 return (n << 8) + le32_to_cpu(pxd->addr2);
95}
82 96
83#define MAXTREEHEIGHT 8 97#define MAXTREEHEIGHT 8
84/* pxd list */ 98/* pxd list */
@@ -93,12 +107,10 @@ struct pxdlist {
93 * data extent descriptor (dxd) 107 * data extent descriptor (dxd)
94 */ 108 */
95typedef struct { 109typedef struct {
96 unsigned flag:8; /* 1: flags */ 110 __u8 flag; /* 1: flags */
97 unsigned rsrvd:24; 111 __u8 rsrvd[3];
98 __le32 size; /* 4: size in byte */ 112 __le32 size; /* 4: size in byte */
99 unsigned len:24; /* 3: length in unit of fsblksize */ 113 pxd_t loc; /* 8: address and length in unit of fsblksize */
100 unsigned addr1:8; /* 1: address in unit of fsblksize */
101 __le32 addr2; /* 4: address in unit of fsblksize */
102} dxd_t; /* - 16 - */ 114} dxd_t; /* - 16 - */
103 115
104/* dxd_t flags */ 116/* dxd_t flags */
@@ -109,12 +121,11 @@ typedef struct {
109#define DXD_CORRUPT 0x08 /* Inconsistency detected */ 121#define DXD_CORRUPT 0x08 /* Inconsistency detected */
110 122
111/* dxd_t field construction 123/* dxd_t field construction
112 * Conveniently, the PXD macros work for DXD
113 */ 124 */
114#define DXDlength PXDlength 125#define DXDlength(dxd, len) PXDlength(&(dxd)->loc, len)
115#define DXDaddress PXDaddress 126#define DXDaddress(dxd, addr) PXDaddress(&(dxd)->loc, addr)
116#define lengthDXD lengthPXD 127#define lengthDXD(dxd) lengthPXD(&(dxd)->loc)
117#define addressDXD addressPXD 128#define addressDXD(dxd) addressPXD(&(dxd)->loc)
118#define DXDsize(dxd, size32) ((dxd)->size = cpu_to_le32(size32)) 129#define DXDsize(dxd, size32) ((dxd)->size = cpu_to_le32(size32))
119#define sizeDXD(dxd) le32_to_cpu((dxd)->size) 130#define sizeDXD(dxd) le32_to_cpu((dxd)->size)
120 131
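
The jfs_types.h conversion above encodes two fields in one word: the low 24 bits of len_addr carry the extent length and the high 8 bits carry bits 32-39 of the extent address. A small userspace model (little-endian host assumed, so the le32 conversions are omitted; values are illustrative) shows the new helpers round-tripping:

	#include <assert.h>
	#include <stdint.h>

	struct pxd { uint32_t len_addr; uint32_t addr2; };

	static void pxd_set(struct pxd *p, uint64_t addr, uint32_t len)
	{
		p->len_addr = ((uint32_t)(addr >> 32) << 24) | (len & 0xffffff);
		p->addr2 = (uint32_t)addr;
	}

	static uint64_t pxd_addr(const struct pxd *p)
	{
		uint64_t n = p->len_addr & ~0xffffffu; /* address bits 32-39, at bits 24-31 */
		return (n << 8) + p->addr2;
	}

	static uint32_t pxd_len(const struct pxd *p)
	{
		return p->len_addr & 0xffffff;
	}

	int main(void)
	{
		struct pxd p;

		pxd_set(&p, 0x12345678abULL, 0x0eface);
		assert(pxd_addr(&p) == 0x12345678abULL);
		assert(pxd_len(&p) == 0x0eface);
		return 0;
	}
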
diff --git a/fs/jfs/jfs_xtree.h b/fs/jfs/jfs_xtree.h
index 08c0c749b986..1e0987986d5f 100644
--- a/fs/jfs/jfs_xtree.h
+++ b/fs/jfs/jfs_xtree.h
@@ -29,13 +29,11 @@
29 * extent allocation descriptor (xad) 29 * extent allocation descriptor (xad)
30 */ 30 */
31typedef struct xad { 31typedef struct xad {
32 unsigned flag:8; /* 1: flag */ 32 __u8 flag; /* 1: flag */
33 unsigned rsvrd:16; /* 2: reserved */ 33 __u8 rsvrd[2]; /* 2: reserved */
34 unsigned off1:8; /* 1: offset in unit of fsblksize */ 34 __u8 off1; /* 1: offset in unit of fsblksize */
35 __le32 off2; /* 4: offset in unit of fsblksize */ 35 __le32 off2; /* 4: offset in unit of fsblksize */
36 unsigned len:24; /* 3: length in unit of fsblksize */ 36 pxd_t loc; /* 8: length and address in unit of fsblksize */
37 unsigned addr1:8; /* 1: address in unit of fsblksize */
38 __le32 addr2; /* 4: address in unit of fsblksize */
39} xad_t; /* (16) */ 37} xad_t; /* (16) */
40 38
41#define MAXXLEN ((1 << 24) - 1) 39#define MAXXLEN ((1 << 24) - 1)
@@ -49,19 +47,14 @@ typedef struct xad {
49 (xad)->off1 = ((u64)offset64) >> 32;\ 47 (xad)->off1 = ((u64)offset64) >> 32;\
50 (xad)->off2 = __cpu_to_le32((offset64) & 0xffffffff);\ 48 (xad)->off2 = __cpu_to_le32((offset64) & 0xffffffff);\
51} 49}
52#define XADaddress(xad, address64)\ 50#define XADaddress(xad, address64) PXDaddress(&(xad)->loc, address64)
53{\ 51#define XADlength(xad, length32) PXDlength(&(xad)->loc, length32)
54 (xad)->addr1 = ((u64)address64) >> 32;\
55 (xad)->addr2 = __cpu_to_le32((address64) & 0xffffffff);\
56}
57#define XADlength(xad, length32) (xad)->len = __cpu_to_le24(length32)
58 52
59/* xad_t field extraction */ 53/* xad_t field extraction */
60#define offsetXAD(xad)\ 54#define offsetXAD(xad)\
61 ( ((s64)((xad)->off1)) << 32 | __le32_to_cpu((xad)->off2)) 55 ( ((s64)((xad)->off1)) << 32 | __le32_to_cpu((xad)->off2))
62#define addressXAD(xad)\ 56#define addressXAD(xad) addressPXD(&(xad)->loc)
63 ( ((s64)((xad)->addr1)) << 32 | __le32_to_cpu((xad)->addr2)) 57#define lengthXAD(xad) lengthPXD(&(xad)->loc)
64#define lengthXAD(xad) __le24_to_cpu((xad)->len)
65 58
66/* xad list */ 59/* xad list */
67struct xadlist { 60struct xadlist {
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 16c3a9556634..5d30c56ae075 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -619,8 +619,7 @@ out_mount_failed:
619 iput(sbi->direct_inode); 619 iput(sbi->direct_inode);
620 sbi->direct_inode = NULL; 620 sbi->direct_inode = NULL;
621out_unload: 621out_unload:
622 if (sbi->nls_tab) 622 unload_nls(sbi->nls_tab);
623 unload_nls(sbi->nls_tab);
624out_kfree: 623out_kfree:
625 kfree(sbi); 624 kfree(sbi);
626 return ret; 625 return ret;
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 37989f02a226..6acc9648f986 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -201,10 +201,14 @@ static unsigned int kernfs_name_hash(const char *name, const void *ns)
201static int kernfs_name_compare(unsigned int hash, const char *name, 201static int kernfs_name_compare(unsigned int hash, const char *name,
202 const void *ns, const struct kernfs_node *kn) 202 const void *ns, const struct kernfs_node *kn)
203{ 203{
204 if (hash != kn->hash) 204 if (hash < kn->hash)
205 return hash - kn->hash; 205 return -1;
206 if (ns != kn->ns) 206 if (hash > kn->hash)
207 return ns - kn->ns; 207 return 1;
208 if (ns < kn->ns)
209 return -1;
210 if (ns > kn->ns)
211 return 1;
208 return strcmp(name, kn->name); 212 return strcmp(name, kn->name);
209} 213}
210 214
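
The old subtraction-based compare was subtly wrong: hash and kn->hash are unsigned ints, so hash - kn->hash wraps modulo 2^32 and the truncated signed result can carry the wrong sign when the two hashes are more than 2^31 apart (the ns pointer subtraction had the same problem). A minimal demonstration with illustrative values:

	#include <stdio.h>

	int main(void)
	{
		unsigned int a = 0x00000001, b = 0x90000000;

		/* Subtraction-based compare: wraps to a positive int,
		 * wrongly reporting a > b. */
		int bad = (int)(a - b);

		/* Explicit three-way compare: always correct. */
		int good = (a < b) ? -1 : (a > b) ? 1 : 0;

		printf("bad=%d good=%d\n", bad, good); /* bad > 0, good == -1 */
		return 0;
	}
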
@@ -407,8 +411,9 @@ void kernfs_put(struct kernfs_node *kn)
407 411
408 if (kernfs_type(kn) == KERNFS_LINK) 412 if (kernfs_type(kn) == KERNFS_LINK)
409 kernfs_put(kn->symlink.target_kn); 413 kernfs_put(kn->symlink.target_kn);
410 if (!(kn->flags & KERNFS_STATIC_NAME)) 414
411 kfree(kn->name); 415 kfree_const(kn->name);
416
412 if (kn->iattr) { 417 if (kn->iattr) {
413 if (kn->iattr->ia_secdata) 418 if (kn->iattr->ia_secdata)
414 security_release_secctx(kn->iattr->ia_secdata, 419 security_release_secctx(kn->iattr->ia_secdata,
@@ -502,15 +507,12 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
502 const char *name, umode_t mode, 507 const char *name, umode_t mode,
503 unsigned flags) 508 unsigned flags)
504{ 509{
505 char *dup_name = NULL;
506 struct kernfs_node *kn; 510 struct kernfs_node *kn;
507 int ret; 511 int ret;
508 512
509 if (!(flags & KERNFS_STATIC_NAME)) { 513 name = kstrdup_const(name, GFP_KERNEL);
510 name = dup_name = kstrdup(name, GFP_KERNEL); 514 if (!name)
511 if (!name) 515 return NULL;
512 return NULL;
513 }
514 516
515 kn = kmem_cache_zalloc(kernfs_node_cache, GFP_KERNEL); 517 kn = kmem_cache_zalloc(kernfs_node_cache, GFP_KERNEL);
516 if (!kn) 518 if (!kn)
@@ -534,7 +536,7 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
534 err_out2: 536 err_out2:
535 kmem_cache_free(kernfs_node_cache, kn); 537 kmem_cache_free(kernfs_node_cache, kn);
536 err_out1: 538 err_out1:
537 kfree(dup_name); 539 kfree_const(name);
538 return NULL; 540 return NULL;
539} 541}
540 542
@@ -1260,7 +1262,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
1260 /* rename kernfs_node */ 1262 /* rename kernfs_node */
1261 if (strcmp(kn->name, new_name) != 0) { 1263 if (strcmp(kn->name, new_name) != 0) {
1262 error = -ENOMEM; 1264 error = -ENOMEM;
1263 new_name = kstrdup(new_name, GFP_KERNEL); 1265 new_name = kstrdup_const(new_name, GFP_KERNEL);
1264 if (!new_name) 1266 if (!new_name)
1265 goto out; 1267 goto out;
1266 } else { 1268 } else {
@@ -1281,9 +1283,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
1281 1283
1282 kn->ns = new_ns; 1284 kn->ns = new_ns;
1283 if (new_name) { 1285 if (new_name) {
1284 if (!(kn->flags & KERNFS_STATIC_NAME)) 1286 old_name = kn->name;
1285 old_name = kn->name;
1286 kn->flags &= ~KERNFS_STATIC_NAME;
1287 kn->name = new_name; 1287 kn->name = new_name;
1288 } 1288 }
1289 1289
@@ -1293,7 +1293,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
1293 kernfs_link_sibling(kn); 1293 kernfs_link_sibling(kn);
1294 1294
1295 kernfs_put(old_parent); 1295 kernfs_put(old_parent);
1296 kfree(old_name); 1296 kfree_const(old_name);
1297 1297
1298 error = 0; 1298 error = 0;
1299 out: 1299 out:
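
The kstrdup_const()/kfree_const() pair makes the KERNFS_STATIC_NAME flag unnecessary: instead of callers declaring that a name is a string literal, the helper detects at run time whether the pointer lies in the kernel's read-only data and, if so, returns it unchanged, while kfree_const() frees only pointers that were actually duplicated. A userspace model of the idea (the kernel checks the .rodata section bounds; the single static_name marker here is a stand-in):

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	/* Stand-in for "is this pointer inside .rodata?". */
	static const char static_name[] = "cgroup.procs";

	static int is_static(const char *s)
	{
		return s == static_name;
	}

	/* Model of kstrdup_const(): share constant strings, copy the rest. */
	static const char *strdup_const_model(const char *s)
	{
		return is_static(s) ? s : strdup(s);
	}

	/* Model of kfree_const(): free only what was actually copied. */
	static void free_const_model(const char *s)
	{
		if (!is_static(s))
			free((void *)s);
	}

	int main(void)
	{
		const char *a = strdup_const_model(static_name);   /* shared */
		const char *b = strdup_const_model("ad-hoc name"); /* copied */

		printf("shared: %d\n", a == static_name); /* prints 1 */
		free_const_model(a);
		free_const_model(b);
		return 0;
	}
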
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index ddc9f9612f16..b684e8a132e6 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -901,7 +901,6 @@ const struct file_operations kernfs_file_fops = {
901 * @ops: kernfs operations for the file 901 * @ops: kernfs operations for the file
902 * @priv: private data for the file 902 * @priv: private data for the file
903 * @ns: optional namespace tag of the file 903 * @ns: optional namespace tag of the file
904 * @name_is_static: don't copy file name
905 * @key: lockdep key for the file's active_ref, %NULL to disable lockdep 904 * @key: lockdep key for the file's active_ref, %NULL to disable lockdep
906 * 905 *
907 * Returns the created node on success, ERR_PTR() value on error. 906 * Returns the created node on success, ERR_PTR() value on error.
@@ -911,7 +910,6 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
911 umode_t mode, loff_t size, 910 umode_t mode, loff_t size,
912 const struct kernfs_ops *ops, 911 const struct kernfs_ops *ops,
913 void *priv, const void *ns, 912 void *priv, const void *ns,
914 bool name_is_static,
915 struct lock_class_key *key) 913 struct lock_class_key *key)
916{ 914{
917 struct kernfs_node *kn; 915 struct kernfs_node *kn;
@@ -919,8 +917,6 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
919 int rc; 917 int rc;
920 918
921 flags = KERNFS_FILE; 919 flags = KERNFS_FILE;
922 if (name_is_static)
923 flags |= KERNFS_STATIC_NAME;
924 920
925 kn = kernfs_new_node(parent, name, (mode & S_IALLUGO) | S_IFREG, flags); 921 kn = kernfs_new_node(parent, name, (mode & S_IALLUGO) | S_IFREG, flags);
926 if (!kn) 922 if (!kn)
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 985217626e66..9000874a945b 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -24,12 +24,6 @@ static const struct address_space_operations kernfs_aops = {
24 .write_end = simple_write_end, 24 .write_end = simple_write_end,
25}; 25};
26 26
27static struct backing_dev_info kernfs_bdi = {
28 .name = "kernfs",
29 .ra_pages = 0, /* No readahead */
30 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
31};
32
33static const struct inode_operations kernfs_iops = { 27static const struct inode_operations kernfs_iops = {
34 .permission = kernfs_iop_permission, 28 .permission = kernfs_iop_permission,
35 .setattr = kernfs_iop_setattr, 29 .setattr = kernfs_iop_setattr,
@@ -40,12 +34,6 @@ static const struct inode_operations kernfs_iops = {
40 .listxattr = kernfs_iop_listxattr, 34 .listxattr = kernfs_iop_listxattr,
41}; 35};
42 36
43void __init kernfs_inode_init(void)
44{
45 if (bdi_init(&kernfs_bdi))
46 panic("failed to init kernfs_bdi");
47}
48
49static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn) 37static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn)
50{ 38{
51 static DEFINE_MUTEX(iattr_mutex); 39 static DEFINE_MUTEX(iattr_mutex);
@@ -298,7 +286,6 @@ static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode)
298 kernfs_get(kn); 286 kernfs_get(kn);
299 inode->i_private = kn; 287 inode->i_private = kn;
300 inode->i_mapping->a_ops = &kernfs_aops; 288 inode->i_mapping->a_ops = &kernfs_aops;
301 inode->i_mapping->backing_dev_info = &kernfs_bdi;
302 inode->i_op = &kernfs_iops; 289 inode->i_op = &kernfs_iops;
303 290
304 set_default_inode_attr(inode, kn->mode); 291 set_default_inode_attr(inode, kn->mode);
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index dc84a3ef9ca2..af9fa7499919 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -88,7 +88,6 @@ int kernfs_iop_removexattr(struct dentry *dentry, const char *name);
88ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf, 88ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf,
89 size_t size); 89 size_t size);
90ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size); 90ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size);
91void kernfs_inode_init(void);
92 91
93/* 92/*
94 * dir.c 93 * dir.c
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index f973ae9b05f1..8eaf417187f1 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -246,5 +246,4 @@ void __init kernfs_init(void)
246 kernfs_node_cache = kmem_cache_create("kernfs_node_cache", 246 kernfs_node_cache = kmem_cache_create("kernfs_node_cache",
247 sizeof(struct kernfs_node), 247 sizeof(struct kernfs_node),
248 0, SLAB_PANIC, NULL); 248 0, SLAB_PANIC, NULL);
249 kernfs_inode_init();
250} 249}
diff --git a/fs/libfs.c b/fs/libfs.c
index 005843ce5dbd..b2ffdb045be4 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -948,7 +948,7 @@ int __generic_file_fsync(struct file *file, loff_t start, loff_t end,
948 948
949 mutex_lock(&inode->i_mutex); 949 mutex_lock(&inode->i_mutex);
950 ret = sync_mapping_buffers(inode->i_mapping); 950 ret = sync_mapping_buffers(inode->i_mapping);
951 if (!(inode->i_state & I_DIRTY)) 951 if (!(inode->i_state & I_DIRTY_ALL))
952 goto out; 952 goto out;
953 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) 953 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
954 goto out; 954 goto out;
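
This hunk belongs to the lazytime series: assuming I_DIRTY_ALL is defined as I_DIRTY | I_DIRTY_TIME, the fsync paths now also treat an inode as dirty when its only pending change is a deferred timestamp update. A sketch of the assumed flag relationship (bit values illustrative):

	#define I_DIRTY_SYNC     (1 << 0)  /* inode itself needs writeback */
	#define I_DIRTY_DATASYNC (1 << 1)  /* data-related inode fields changed */
	#define I_DIRTY_PAGES    (1 << 2)  /* dirty pagecache pages */
	#define I_DIRTY_TIME     (1 << 11) /* timestamps changed, write deferred */

	#define I_DIRTY     (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
	#define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME)
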
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 1cc6ec51e6b1..47a32b6d9b90 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -65,7 +65,7 @@ static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm)
65 return (struct sockaddr *)&nsm->sm_addr; 65 return (struct sockaddr *)&nsm->sm_addr;
66} 66}
67 67
68static struct rpc_clnt *nsm_create(struct net *net) 68static struct rpc_clnt *nsm_create(struct net *net, const char *nodename)
69{ 69{
70 struct sockaddr_in sin = { 70 struct sockaddr_in sin = {
71 .sin_family = AF_INET, 71 .sin_family = AF_INET,
@@ -77,6 +77,7 @@ static struct rpc_clnt *nsm_create(struct net *net)
77 .address = (struct sockaddr *)&sin, 77 .address = (struct sockaddr *)&sin,
78 .addrsize = sizeof(sin), 78 .addrsize = sizeof(sin),
79 .servername = "rpc.statd", 79 .servername = "rpc.statd",
80 .nodename = nodename,
80 .program = &nsm_program, 81 .program = &nsm_program,
81 .version = NSM_VERSION, 82 .version = NSM_VERSION,
82 .authflavor = RPC_AUTH_NULL, 83 .authflavor = RPC_AUTH_NULL,
@@ -102,7 +103,7 @@ out:
102 return clnt; 103 return clnt;
103} 104}
104 105
105static struct rpc_clnt *nsm_client_get(struct net *net) 106static struct rpc_clnt *nsm_client_get(struct net *net, const char *nodename)
106{ 107{
107 struct rpc_clnt *clnt, *new; 108 struct rpc_clnt *clnt, *new;
108 struct lockd_net *ln = net_generic(net, lockd_net_id); 109 struct lockd_net *ln = net_generic(net, lockd_net_id);
@@ -111,7 +112,7 @@ static struct rpc_clnt *nsm_client_get(struct net *net)
111 if (clnt != NULL) 112 if (clnt != NULL)
112 goto out; 113 goto out;
113 114
114 clnt = new = nsm_create(net); 115 clnt = new = nsm_create(net, nodename);
115 if (IS_ERR(clnt)) 116 if (IS_ERR(clnt))
116 goto out; 117 goto out;
117 118
@@ -190,19 +191,23 @@ int nsm_monitor(const struct nlm_host *host)
190 struct nsm_res res; 191 struct nsm_res res;
191 int status; 192 int status;
192 struct rpc_clnt *clnt; 193 struct rpc_clnt *clnt;
194 const char *nodename = NULL;
193 195
194 dprintk("lockd: nsm_monitor(%s)\n", nsm->sm_name); 196 dprintk("lockd: nsm_monitor(%s)\n", nsm->sm_name);
195 197
196 if (nsm->sm_monitored) 198 if (nsm->sm_monitored)
197 return 0; 199 return 0;
198 200
201 if (host->h_rpcclnt)
202 nodename = host->h_rpcclnt->cl_nodename;
203
199 /* 204 /*
200 * Choose whether to record the caller_name or IP address of 205 * Choose whether to record the caller_name or IP address of
201 * this peer in the local rpc.statd's database. 206 * this peer in the local rpc.statd's database.
202 */ 207 */
203 nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf; 208 nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf;
204 209
205 clnt = nsm_client_get(host->net); 210 clnt = nsm_client_get(host->net, nodename);
206 if (IS_ERR(clnt)) { 211 if (IS_ERR(clnt)) {
207 status = PTR_ERR(clnt); 212 status = PTR_ERR(clnt);
208 dprintk("lockd: failed to create NSM upcall transport, " 213 dprintk("lockd: failed to create NSM upcall transport, "
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index e94c887da2d7..55505cbe11af 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -138,10 +138,6 @@ lockd(void *vrqstp)
138 138
139 dprintk("NFS locking service started (ver " LOCKD_VERSION ").\n"); 139 dprintk("NFS locking service started (ver " LOCKD_VERSION ").\n");
140 140
141 if (!nlm_timeout)
142 nlm_timeout = LOCKD_DFLT_TIMEO;
143 nlmsvc_timeout = nlm_timeout * HZ;
144
145 /* 141 /*
146 * The main request loop. We don't terminate until the last 142 * The main request loop. We don't terminate until the last
147 * NFS mount or NFS daemon has gone away. 143 * NFS mount or NFS daemon has gone away.
@@ -350,6 +346,10 @@ static struct svc_serv *lockd_create_svc(void)
350 printk(KERN_WARNING 346 printk(KERN_WARNING
351 "lockd_up: no pid, %d users??\n", nlmsvc_users); 347 "lockd_up: no pid, %d users??\n", nlmsvc_users);
352 348
349 if (!nlm_timeout)
350 nlm_timeout = LOCKD_DFLT_TIMEO;
351 nlmsvc_timeout = nlm_timeout * HZ;
352
353 serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, svc_rpcb_cleanup); 353 serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, svc_rpcb_cleanup);
354 if (!serv) { 354 if (!serv) {
355 printk(KERN_WARNING "lockd_up: create service failed\n"); 355 printk(KERN_WARNING "lockd_up: create service failed\n");
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 56598742dde4..5581e020644b 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -57,8 +57,8 @@ static DEFINE_SPINLOCK(nlm_blocked_lock);
57static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie) 57static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie)
58{ 58{
59 /* 59 /*
60 * We can get away with a static buffer because we're only 60 * We can get away with a static buffer because this is only called
61 * called with BKL held. 61 * from lockd, which is single-threaded.
62 */ 62 */
63 static char buf[2*NLM_MAXCOOKIELEN+1]; 63 static char buf[2*NLM_MAXCOOKIELEN+1];
64 unsigned int i, len = sizeof(buf); 64 unsigned int i, len = sizeof(buf);
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index d12ff4e2dbe7..665ef5a05183 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -164,12 +164,15 @@ nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file,
164{ 164{
165 struct inode *inode = nlmsvc_file_inode(file); 165 struct inode *inode = nlmsvc_file_inode(file);
166 struct file_lock *fl; 166 struct file_lock *fl;
167 struct file_lock_context *flctx = inode->i_flctx;
167 struct nlm_host *lockhost; 168 struct nlm_host *lockhost;
168 169
170 if (!flctx || list_empty_careful(&flctx->flc_posix))
171 return 0;
169again: 172again:
170 file->f_locks = 0; 173 file->f_locks = 0;
171 spin_lock(&inode->i_lock); 174 spin_lock(&flctx->flc_lock);
172 for (fl = inode->i_flock; fl; fl = fl->fl_next) { 175 list_for_each_entry(fl, &flctx->flc_posix, fl_list) {
173 if (fl->fl_lmops != &nlmsvc_lock_operations) 176 if (fl->fl_lmops != &nlmsvc_lock_operations)
174 continue; 177 continue;
175 178
@@ -180,7 +183,7 @@ again:
180 if (match(lockhost, host)) { 183 if (match(lockhost, host)) {
181 struct file_lock lock = *fl; 184 struct file_lock lock = *fl;
182 185
183 spin_unlock(&inode->i_lock); 186 spin_unlock(&flctx->flc_lock);
184 lock.fl_type = F_UNLCK; 187 lock.fl_type = F_UNLCK;
185 lock.fl_start = 0; 188 lock.fl_start = 0;
186 lock.fl_end = OFFSET_MAX; 189 lock.fl_end = OFFSET_MAX;
@@ -192,7 +195,7 @@ again:
192 goto again; 195 goto again;
193 } 196 }
194 } 197 }
195 spin_unlock(&inode->i_lock); 198 spin_unlock(&flctx->flc_lock);
196 199
197 return 0; 200 return 0;
198} 201}
@@ -223,18 +226,21 @@ nlm_file_inuse(struct nlm_file *file)
223{ 226{
224 struct inode *inode = nlmsvc_file_inode(file); 227 struct inode *inode = nlmsvc_file_inode(file);
225 struct file_lock *fl; 228 struct file_lock *fl;
229 struct file_lock_context *flctx = inode->i_flctx;
226 230
227 if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares) 231 if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares)
228 return 1; 232 return 1;
229 233
230 spin_lock(&inode->i_lock); 234 if (flctx && !list_empty_careful(&flctx->flc_posix)) {
231 for (fl = inode->i_flock; fl; fl = fl->fl_next) { 235 spin_lock(&flctx->flc_lock);
232 if (fl->fl_lmops == &nlmsvc_lock_operations) { 236 list_for_each_entry(fl, &flctx->flc_posix, fl_list) {
233 spin_unlock(&inode->i_lock); 237 if (fl->fl_lmops == &nlmsvc_lock_operations) {
234 return 1; 238 spin_unlock(&flctx->flc_lock);
239 return 1;
240 }
235 } 241 }
242 spin_unlock(&flctx->flc_lock);
236 } 243 }
237 spin_unlock(&inode->i_lock);
238 file->f_locks = 0; 244 file->f_locks = 0;
239 return 0; 245 return 0;
240} 246}
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 9340e7e10ef6..5b651daad518 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -95,14 +95,6 @@ nlm_decode_fh(__be32 *p, struct nfs_fh *f)
95 return p + XDR_QUADLEN(NFS2_FHSIZE); 95 return p + XDR_QUADLEN(NFS2_FHSIZE);
96} 96}
97 97
98static inline __be32 *
99nlm_encode_fh(__be32 *p, struct nfs_fh *f)
100{
101 *p++ = htonl(NFS2_FHSIZE);
102 memcpy(p, f->data, NFS2_FHSIZE);
103 return p + XDR_QUADLEN(NFS2_FHSIZE);
104}
105
106/* 98/*
107 * Encode and decode owner handle 99 * Encode and decode owner handle
108 */ 100 */
diff --git a/fs/locks.c b/fs/locks.c
index 735b8d3fa78c..365c82e1b3a9 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -137,7 +137,7 @@
137 137
138#define IS_POSIX(fl) (fl->fl_flags & FL_POSIX) 138#define IS_POSIX(fl) (fl->fl_flags & FL_POSIX)
139#define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK) 139#define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK)
140#define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG)) 140#define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT))
141#define IS_OFDLCK(fl) (fl->fl_flags & FL_OFDLCK) 141#define IS_OFDLCK(fl) (fl->fl_flags & FL_OFDLCK)
142 142
143static bool lease_breaking(struct file_lock *fl) 143static bool lease_breaking(struct file_lock *fl)
@@ -157,14 +157,11 @@ static int target_leasetype(struct file_lock *fl)
157int leases_enable = 1; 157int leases_enable = 1;
158int lease_break_time = 45; 158int lease_break_time = 45;
159 159
160#define for_each_lock(inode, lockp) \
161 for (lockp = &inode->i_flock; *lockp != NULL; lockp = &(*lockp)->fl_next)
162
163/* 160/*
164 * The global file_lock_list is only used for displaying /proc/locks, so we 161 * The global file_lock_list is only used for displaying /proc/locks, so we
165 * keep a list on each CPU, with each list protected by its own spinlock via 162 * keep a list on each CPU, with each list protected by its own spinlock via
166 * the file_lock_lglock. Note that alterations to the list also require that 163 * the file_lock_lglock. Note that alterations to the list also require that
167 * the relevant i_lock is held. 164 * the relevant flc_lock is held.
168 */ 165 */
169DEFINE_STATIC_LGLOCK(file_lock_lglock); 166DEFINE_STATIC_LGLOCK(file_lock_lglock);
170static DEFINE_PER_CPU(struct hlist_head, file_lock_list); 167static DEFINE_PER_CPU(struct hlist_head, file_lock_list);
@@ -192,21 +189,68 @@ static DEFINE_HASHTABLE(blocked_hash, BLOCKED_HASH_BITS);
192 * contrast to those that are acting as records of acquired locks). 189 * contrast to those that are acting as records of acquired locks).
193 * 190 *
194 * Note that when we acquire this lock in order to change the above fields, 191 * Note that when we acquire this lock in order to change the above fields,
195 * we often hold the i_lock as well. In certain cases, when reading the fields 192 * we often hold the flc_lock as well. In certain cases, when reading the fields
196 * protected by this lock, we can skip acquiring it iff we already hold the 193 * protected by this lock, we can skip acquiring it iff we already hold the
197 * i_lock. 194 * flc_lock.
198 * 195 *
199 * In particular, adding an entry to the fl_block list requires that you hold 196 * In particular, adding an entry to the fl_block list requires that you hold
200 * both the i_lock and the blocked_lock_lock (acquired in that order). Deleting 197 * both the flc_lock and the blocked_lock_lock (acquired in that order).
201 * an entry from the list however only requires the file_lock_lock. 198 * Deleting an entry from the list however only requires the file_lock_lock.
202 */ 199 */
203static DEFINE_SPINLOCK(blocked_lock_lock); 200static DEFINE_SPINLOCK(blocked_lock_lock);
204 201
202static struct kmem_cache *flctx_cache __read_mostly;
205static struct kmem_cache *filelock_cache __read_mostly; 203static struct kmem_cache *filelock_cache __read_mostly;
206 204
205static struct file_lock_context *
206locks_get_lock_context(struct inode *inode)
207{
208 struct file_lock_context *new;
209
210 if (likely(inode->i_flctx))
211 goto out;
212
213 new = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
214 if (!new)
215 goto out;
216
217 spin_lock_init(&new->flc_lock);
218 INIT_LIST_HEAD(&new->flc_flock);
219 INIT_LIST_HEAD(&new->flc_posix);
220 INIT_LIST_HEAD(&new->flc_lease);
221
222 /*
223 * Assign the pointer if it's not already assigned. If it is, then
224 * free the context we just allocated.
225 */
226 spin_lock(&inode->i_lock);
227 if (likely(!inode->i_flctx)) {
228 inode->i_flctx = new;
229 new = NULL;
230 }
231 spin_unlock(&inode->i_lock);
232
233 if (new)
234 kmem_cache_free(flctx_cache, new);
235out:
236 return inode->i_flctx;
237}
238
239void
240locks_free_lock_context(struct file_lock_context *ctx)
241{
242 if (ctx) {
243 WARN_ON_ONCE(!list_empty(&ctx->flc_flock));
244 WARN_ON_ONCE(!list_empty(&ctx->flc_posix));
245 WARN_ON_ONCE(!list_empty(&ctx->flc_lease));
246 kmem_cache_free(flctx_cache, ctx);
247 }
248}
249
207static void locks_init_lock_heads(struct file_lock *fl) 250static void locks_init_lock_heads(struct file_lock *fl)
208{ 251{
209 INIT_HLIST_NODE(&fl->fl_link); 252 INIT_HLIST_NODE(&fl->fl_link);
253 INIT_LIST_HEAD(&fl->fl_list);
210 INIT_LIST_HEAD(&fl->fl_block); 254 INIT_LIST_HEAD(&fl->fl_block);
211 init_waitqueue_head(&fl->fl_wait); 255 init_waitqueue_head(&fl->fl_wait);
212} 256}
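
locks_get_lock_context() above uses the standard optimistic-allocation pattern: allocate with no locks held (a GFP_KERNEL allocation may sleep, so it cannot happen under a spinlock), then install the pointer under inode->i_lock, and free the allocation if another thread got there first. A distilled userspace sketch of the pattern (names hypothetical; the kernel relies on the spinlock for ordering, so a portable version would need atomics on the fast path):

	#include <pthread.h>
	#include <stdlib.h>

	struct ctx { int state; /* ... */ };

	struct obj {
		pthread_mutex_t lock;
		struct ctx *ctx; /* lazily allocated, written once */
	};

	static struct ctx *get_ctx(struct obj *o)
	{
		struct ctx *new;

		if (o->ctx)                    /* fast path: already set */
			return o->ctx;

		new = calloc(1, sizeof(*new)); /* allocate with no locks held */
		if (!new)
			return NULL;

		pthread_mutex_lock(&o->lock);
		if (!o->ctx) {                 /* we won the race: install */
			o->ctx = new;
			new = NULL;
		}
		pthread_mutex_unlock(&o->lock);

		free(new);                     /* race loser frees its copy */
		return o->ctx;
	}
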
@@ -243,6 +287,7 @@ EXPORT_SYMBOL_GPL(locks_release_private);
243void locks_free_lock(struct file_lock *fl) 287void locks_free_lock(struct file_lock *fl)
244{ 288{
245 BUG_ON(waitqueue_active(&fl->fl_wait)); 289 BUG_ON(waitqueue_active(&fl->fl_wait));
290 BUG_ON(!list_empty(&fl->fl_list));
246 BUG_ON(!list_empty(&fl->fl_block)); 291 BUG_ON(!list_empty(&fl->fl_block));
247 BUG_ON(!hlist_unhashed(&fl->fl_link)); 292 BUG_ON(!hlist_unhashed(&fl->fl_link));
248 293
@@ -257,8 +302,8 @@ locks_dispose_list(struct list_head *dispose)
257 struct file_lock *fl; 302 struct file_lock *fl;
258 303
259 while (!list_empty(dispose)) { 304 while (!list_empty(dispose)) {
260 fl = list_first_entry(dispose, struct file_lock, fl_block); 305 fl = list_first_entry(dispose, struct file_lock, fl_list);
261 list_del_init(&fl->fl_block); 306 list_del_init(&fl->fl_list);
262 locks_free_lock(fl); 307 locks_free_lock(fl);
263 } 308 }
264} 309}
@@ -513,7 +558,7 @@ static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2)
513 return fl1->fl_owner == fl2->fl_owner; 558 return fl1->fl_owner == fl2->fl_owner;
514} 559}
515 560
516/* Must be called with the i_lock held! */ 561/* Must be called with the flc_lock held! */
517static void locks_insert_global_locks(struct file_lock *fl) 562static void locks_insert_global_locks(struct file_lock *fl)
518{ 563{
519 lg_local_lock(&file_lock_lglock); 564 lg_local_lock(&file_lock_lglock);
@@ -522,12 +567,12 @@ static void locks_insert_global_locks(struct file_lock *fl)
522 lg_local_unlock(&file_lock_lglock); 567 lg_local_unlock(&file_lock_lglock);
523} 568}
524 569
525/* Must be called with the i_lock held! */ 570/* Must be called with the flc_lock held! */
526static void locks_delete_global_locks(struct file_lock *fl) 571static void locks_delete_global_locks(struct file_lock *fl)
527{ 572{
528 /* 573 /*
529 * Avoid taking lock if already unhashed. This is safe since this check 574 * Avoid taking lock if already unhashed. This is safe since this check
530 * is done while holding the i_lock, and new insertions into the list 575 * is done while holding the flc_lock, and new insertions into the list
531 * also require that it be held. 576 * also require that it be held.
532 */ 577 */
533 if (hlist_unhashed(&fl->fl_link)) 578 if (hlist_unhashed(&fl->fl_link))
@@ -579,10 +624,10 @@ static void locks_delete_block(struct file_lock *waiter)
579 * the order they blocked. The documentation doesn't require this but 624 * the order they blocked. The documentation doesn't require this but
580 * it seems like the reasonable thing to do. 625 * it seems like the reasonable thing to do.
581 * 626 *
582 * Must be called with both the i_lock and blocked_lock_lock held. The fl_block 627 * Must be called with both the flc_lock and blocked_lock_lock held. The
583 * list itself is protected by the blocked_lock_lock, but by ensuring that the 628 * fl_block list itself is protected by the blocked_lock_lock, but by ensuring
584 * i_lock is also held on insertions we can avoid taking the blocked_lock_lock 629 * that the flc_lock is also held on insertions we can avoid taking the
585 * in some cases when we see that the fl_block list is empty. 630 * blocked_lock_lock in some cases when we see that the fl_block list is empty.
586 */ 631 */
587static void __locks_insert_block(struct file_lock *blocker, 632static void __locks_insert_block(struct file_lock *blocker,
588 struct file_lock *waiter) 633 struct file_lock *waiter)
@@ -594,7 +639,7 @@ static void __locks_insert_block(struct file_lock *blocker,
594 locks_insert_global_blocked(waiter); 639 locks_insert_global_blocked(waiter);
595} 640}
596 641
597/* Must be called with i_lock held. */ 642/* Must be called with flc_lock held. */
598static void locks_insert_block(struct file_lock *blocker, 643static void locks_insert_block(struct file_lock *blocker,
599 struct file_lock *waiter) 644 struct file_lock *waiter)
600{ 645{
@@ -606,15 +651,15 @@ static void locks_insert_block(struct file_lock *blocker,
606/* 651/*
607 * Wake up processes blocked waiting for blocker. 652 * Wake up processes blocked waiting for blocker.
608 * 653 *
609 * Must be called with the inode->i_lock held! 654 * Must be called with the inode->flc_lock held!
610 */ 655 */
611static void locks_wake_up_blocks(struct file_lock *blocker) 656static void locks_wake_up_blocks(struct file_lock *blocker)
612{ 657{
613 /* 658 /*
614 * Avoid taking global lock if list is empty. This is safe since new 659 * Avoid taking global lock if list is empty. This is safe since new
615 * blocked requests are only added to the list under the i_lock, and 660 * blocked requests are only added to the list under the flc_lock, and
616 * the i_lock is always held here. Note that removal from the fl_block 661 * the flc_lock is always held here. Note that removal from the fl_block
617 * list does not require the i_lock, so we must recheck list_empty() 662 * list does not require the flc_lock, so we must recheck list_empty()
618 * after acquiring the blocked_lock_lock. 663 * after acquiring the blocked_lock_lock.
619 */ 664 */
620 if (list_empty(&blocker->fl_block)) 665 if (list_empty(&blocker->fl_block))
@@ -635,63 +680,32 @@ static void locks_wake_up_blocks(struct file_lock *blocker)
635 spin_unlock(&blocked_lock_lock); 680 spin_unlock(&blocked_lock_lock);
636} 681}
637 682
638/* Insert file lock fl into an inode's lock list at the position indicated 683static void
639 * by pos. At the same time add the lock to the global file lock list. 684locks_insert_lock_ctx(struct file_lock *fl, struct list_head *before)
640 *
641 * Must be called with the i_lock held!
642 */
643static void locks_insert_lock(struct file_lock **pos, struct file_lock *fl)
644{ 685{
645 fl->fl_nspid = get_pid(task_tgid(current)); 686 fl->fl_nspid = get_pid(task_tgid(current));
646 687 list_add_tail(&fl->fl_list, before);
647 /* insert into file's list */
648 fl->fl_next = *pos;
649 *pos = fl;
650
651 locks_insert_global_locks(fl); 688 locks_insert_global_locks(fl);
652} 689}
653 690
654/** 691static void
655 * locks_delete_lock - Delete a lock and then free it. 692locks_unlink_lock_ctx(struct file_lock *fl)
656 * @thisfl_p: pointer that points to the fl_next field of the previous
657 * inode->i_flock list entry
658 *
659 * Unlink a lock from all lists and free the namespace reference, but don't
660 * free it yet. Wake up processes that are blocked waiting for this lock and
661 * notify the FS that the lock has been cleared.
662 *
663 * Must be called with the i_lock held!
664 */
665static void locks_unlink_lock(struct file_lock **thisfl_p)
666{ 693{
667 struct file_lock *fl = *thisfl_p;
668
669 locks_delete_global_locks(fl); 694 locks_delete_global_locks(fl);
670 695 list_del_init(&fl->fl_list);
671 *thisfl_p = fl->fl_next;
672 fl->fl_next = NULL;
673
674 if (fl->fl_nspid) { 696 if (fl->fl_nspid) {
675 put_pid(fl->fl_nspid); 697 put_pid(fl->fl_nspid);
676 fl->fl_nspid = NULL; 698 fl->fl_nspid = NULL;
677 } 699 }
678
679 locks_wake_up_blocks(fl); 700 locks_wake_up_blocks(fl);
680} 701}
681 702
682/* 703static void
683 * Unlink a lock from all lists and free it. 704locks_delete_lock_ctx(struct file_lock *fl, struct list_head *dispose)
684 *
685 * Must be called with i_lock held!
686 */
687static void locks_delete_lock(struct file_lock **thisfl_p,
688 struct list_head *dispose)
689{ 705{
690 struct file_lock *fl = *thisfl_p; 706 locks_unlink_lock_ctx(fl);
691
692 locks_unlink_lock(thisfl_p);
693 if (dispose) 707 if (dispose)
694 list_add(&fl->fl_block, dispose); 708 list_add(&fl->fl_list, dispose);
695 else 709 else
696 locks_free_lock(fl); 710 locks_free_lock(fl);
697} 711}
@@ -746,22 +760,27 @@ void
746posix_test_lock(struct file *filp, struct file_lock *fl) 760posix_test_lock(struct file *filp, struct file_lock *fl)
747{ 761{
748 struct file_lock *cfl; 762 struct file_lock *cfl;
763 struct file_lock_context *ctx;
749 struct inode *inode = file_inode(filp); 764 struct inode *inode = file_inode(filp);
750 765
751 spin_lock(&inode->i_lock); 766 ctx = inode->i_flctx;
752 for (cfl = file_inode(filp)->i_flock; cfl; cfl = cfl->fl_next) { 767 if (!ctx || list_empty_careful(&ctx->flc_posix)) {
753 if (!IS_POSIX(cfl))
754 continue;
755 if (posix_locks_conflict(fl, cfl))
756 break;
757 }
758 if (cfl) {
759 locks_copy_conflock(fl, cfl);
760 if (cfl->fl_nspid)
761 fl->fl_pid = pid_vnr(cfl->fl_nspid);
762 } else
763 fl->fl_type = F_UNLCK; 768 fl->fl_type = F_UNLCK;
764 spin_unlock(&inode->i_lock); 769 return;
770 }
771
772 spin_lock(&ctx->flc_lock);
773 list_for_each_entry(cfl, &ctx->flc_posix, fl_list) {
774 if (posix_locks_conflict(fl, cfl)) {
775 locks_copy_conflock(fl, cfl);
776 if (cfl->fl_nspid)
777 fl->fl_pid = pid_vnr(cfl->fl_nspid);
778 goto out;
779 }
780 }
781 fl->fl_type = F_UNLCK;
782out:
783 spin_unlock(&ctx->flc_lock);
765 return; 784 return;
766} 785}
767EXPORT_SYMBOL(posix_test_lock); 786EXPORT_SYMBOL(posix_test_lock);
@@ -845,34 +864,34 @@ static int posix_locks_deadlock(struct file_lock *caller_fl,
845static int flock_lock_file(struct file *filp, struct file_lock *request) 864static int flock_lock_file(struct file *filp, struct file_lock *request)
846{ 865{
847 struct file_lock *new_fl = NULL; 866 struct file_lock *new_fl = NULL;
848 struct file_lock **before; 867 struct file_lock *fl;
849 struct inode * inode = file_inode(filp); 868 struct file_lock_context *ctx;
869 struct inode *inode = file_inode(filp);
850 int error = 0; 870 int error = 0;
851 int found = 0; 871 bool found = false;
852 LIST_HEAD(dispose); 872 LIST_HEAD(dispose);
853 873
874 ctx = locks_get_lock_context(inode);
875 if (!ctx)
876 return -ENOMEM;
877
854 if (!(request->fl_flags & FL_ACCESS) && (request->fl_type != F_UNLCK)) { 878 if (!(request->fl_flags & FL_ACCESS) && (request->fl_type != F_UNLCK)) {
855 new_fl = locks_alloc_lock(); 879 new_fl = locks_alloc_lock();
856 if (!new_fl) 880 if (!new_fl)
857 return -ENOMEM; 881 return -ENOMEM;
858 } 882 }
859 883
860 spin_lock(&inode->i_lock); 884 spin_lock(&ctx->flc_lock);
861 if (request->fl_flags & FL_ACCESS) 885 if (request->fl_flags & FL_ACCESS)
862 goto find_conflict; 886 goto find_conflict;
863 887
864 for_each_lock(inode, before) { 888 list_for_each_entry(fl, &ctx->flc_flock, fl_list) {
865 struct file_lock *fl = *before;
866 if (IS_POSIX(fl))
867 break;
868 if (IS_LEASE(fl))
869 continue;
870 if (filp != fl->fl_file) 889 if (filp != fl->fl_file)
871 continue; 890 continue;
872 if (request->fl_type == fl->fl_type) 891 if (request->fl_type == fl->fl_type)
873 goto out; 892 goto out;
874 found = 1; 893 found = true;
875 locks_delete_lock(before, &dispose); 894 locks_delete_lock_ctx(fl, &dispose);
876 break; 895 break;
877 } 896 }
878 897
@@ -882,23 +901,8 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
882 goto out; 901 goto out;
883 } 902 }
884 903
885 /*
886 * If a higher-priority process was blocked on the old file lock,
887 * give it the opportunity to lock the file.
888 */
889 if (found) {
890 spin_unlock(&inode->i_lock);
891 cond_resched();
892 spin_lock(&inode->i_lock);
893 }
894
895find_conflict: 904find_conflict:
896 for_each_lock(inode, before) { 905 list_for_each_entry(fl, &ctx->flc_flock, fl_list) {
897 struct file_lock *fl = *before;
898 if (IS_POSIX(fl))
899 break;
900 if (IS_LEASE(fl))
901 continue;
902 if (!flock_locks_conflict(request, fl)) 906 if (!flock_locks_conflict(request, fl))
903 continue; 907 continue;
904 error = -EAGAIN; 908 error = -EAGAIN;
@@ -911,12 +915,12 @@ find_conflict:
911 if (request->fl_flags & FL_ACCESS) 915 if (request->fl_flags & FL_ACCESS)
912 goto out; 916 goto out;
913 locks_copy_lock(new_fl, request); 917 locks_copy_lock(new_fl, request);
914 locks_insert_lock(before, new_fl); 918 locks_insert_lock_ctx(new_fl, &ctx->flc_flock);
915 new_fl = NULL; 919 new_fl = NULL;
916 error = 0; 920 error = 0;
917 921
918out: 922out:
919 spin_unlock(&inode->i_lock); 923 spin_unlock(&ctx->flc_lock);
920 if (new_fl) 924 if (new_fl)
921 locks_free_lock(new_fl); 925 locks_free_lock(new_fl);
922 locks_dispose_list(&dispose); 926 locks_dispose_list(&dispose);
@@ -925,16 +929,20 @@ out:
925 929
926static int __posix_lock_file(struct inode *inode, struct file_lock *request, struct file_lock *conflock) 930static int __posix_lock_file(struct inode *inode, struct file_lock *request, struct file_lock *conflock)
927{ 931{
928 struct file_lock *fl; 932 struct file_lock *fl, *tmp;
929 struct file_lock *new_fl = NULL; 933 struct file_lock *new_fl = NULL;
930 struct file_lock *new_fl2 = NULL; 934 struct file_lock *new_fl2 = NULL;
931 struct file_lock *left = NULL; 935 struct file_lock *left = NULL;
932 struct file_lock *right = NULL; 936 struct file_lock *right = NULL;
933 struct file_lock **before; 937 struct file_lock_context *ctx;
934 int error; 938 int error;
935 bool added = false; 939 bool added = false;
936 LIST_HEAD(dispose); 940 LIST_HEAD(dispose);
937 941
942 ctx = locks_get_lock_context(inode);
943 if (!ctx)
944 return -ENOMEM;
945
938 /* 946 /*
939 * We may need two file_lock structures for this operation, 947 * We may need two file_lock structures for this operation,
940 * so we get them in advance to avoid races. 948 * so we get them in advance to avoid races.
@@ -948,15 +956,14 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
948 new_fl2 = locks_alloc_lock(); 956 new_fl2 = locks_alloc_lock();
949 } 957 }
950 958
951 spin_lock(&inode->i_lock); 959 spin_lock(&ctx->flc_lock);
952 /* 960 /*
953 * New lock request. Walk all POSIX locks and look for conflicts. If 961 * New lock request. Walk all POSIX locks and look for conflicts. If
954 * there are any, either return error or put the request on the 962 * there are any, either return error or put the request on the
955 * blocker's list of waiters and the global blocked_hash. 963 * blocker's list of waiters and the global blocked_hash.
956 */ 964 */
957 if (request->fl_type != F_UNLCK) { 965 if (request->fl_type != F_UNLCK) {
958 for_each_lock(inode, before) { 966 list_for_each_entry(fl, &ctx->flc_posix, fl_list) {
959 fl = *before;
960 if (!IS_POSIX(fl)) 967 if (!IS_POSIX(fl))
961 continue; 968 continue;
962 if (!posix_locks_conflict(request, fl)) 969 if (!posix_locks_conflict(request, fl))
@@ -986,29 +993,25 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
986 if (request->fl_flags & FL_ACCESS) 993 if (request->fl_flags & FL_ACCESS)
987 goto out; 994 goto out;
988 995
989 /* 996 /* Find the first old lock with the same owner as the new lock */
990 * Find the first old lock with the same owner as the new lock. 997 list_for_each_entry(fl, &ctx->flc_posix, fl_list) {
991 */ 998 if (posix_same_owner(request, fl))
992 999 break;
993 before = &inode->i_flock;
994
995 /* First skip locks owned by other processes. */
996 while ((fl = *before) && (!IS_POSIX(fl) ||
997 !posix_same_owner(request, fl))) {
998 before = &fl->fl_next;
999 } 1000 }
1000 1001
1001 /* Process locks with this owner. */ 1002 /* Process locks with this owner. */
1002 while ((fl = *before) && posix_same_owner(request, fl)) { 1003 list_for_each_entry_safe_from(fl, tmp, &ctx->flc_posix, fl_list) {
1003 /* Detect adjacent or overlapping regions (if same lock type) 1004 if (!posix_same_owner(request, fl))
1004 */ 1005 break;
1006
1007 /* Detect adjacent or overlapping regions (if same lock type) */
1005 if (request->fl_type == fl->fl_type) { 1008 if (request->fl_type == fl->fl_type) {
1006 /* In all comparisons of start vs end, use 1009 /* In all comparisons of start vs end, use
1007 * "start - 1" rather than "end + 1". If end 1010 * "start - 1" rather than "end + 1". If end
1008 * is OFFSET_MAX, end + 1 will become negative. 1011 * is OFFSET_MAX, end + 1 will become negative.
1009 */ 1012 */
1010 if (fl->fl_end < request->fl_start - 1) 1013 if (fl->fl_end < request->fl_start - 1)
1011 goto next_lock; 1014 continue;
1012 /* If the next lock in the list has entirely bigger 1015 /* If the next lock in the list has entirely bigger
1013 * addresses than the new one, insert the lock here. 1016 * addresses than the new one, insert the lock here.
1014 */ 1017 */
@@ -1029,18 +1032,17 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
1029 else 1032 else
1030 request->fl_end = fl->fl_end; 1033 request->fl_end = fl->fl_end;
1031 if (added) { 1034 if (added) {
1032 locks_delete_lock(before, &dispose); 1035 locks_delete_lock_ctx(fl, &dispose);
1033 continue; 1036 continue;
1034 } 1037 }
1035 request = fl; 1038 request = fl;
1036 added = true; 1039 added = true;
1037 } 1040 } else {
1038 else {
1039 /* Processing for different lock types is a bit 1041 /* Processing for different lock types is a bit
1040 * more complex. 1042 * more complex.
1041 */ 1043 */
1042 if (fl->fl_end < request->fl_start) 1044 if (fl->fl_end < request->fl_start)
1043 goto next_lock; 1045 continue;
1044 if (fl->fl_start > request->fl_end) 1046 if (fl->fl_start > request->fl_end)
1045 break; 1047 break;
1046 if (request->fl_type == F_UNLCK) 1048 if (request->fl_type == F_UNLCK)
@@ -1059,7 +1061,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
1059 * one (This may happen several times). 1061 * one (This may happen several times).
1060 */ 1062 */
1061 if (added) { 1063 if (added) {
1062 locks_delete_lock(before, &dispose); 1064 locks_delete_lock_ctx(fl, &dispose);
1063 continue; 1065 continue;
1064 } 1066 }
1065 /* 1067 /*
@@ -1075,15 +1077,11 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
1075 locks_copy_lock(new_fl, request); 1077 locks_copy_lock(new_fl, request);
1076 request = new_fl; 1078 request = new_fl;
1077 new_fl = NULL; 1079 new_fl = NULL;
1078 locks_delete_lock(before, &dispose); 1080 locks_insert_lock_ctx(request, &fl->fl_list);
1079 locks_insert_lock(before, request); 1081 locks_delete_lock_ctx(fl, &dispose);
1080 added = true; 1082 added = true;
1081 } 1083 }
1082 } 1084 }
1083 /* Go on to next lock.
1084 */
1085 next_lock:
1086 before = &fl->fl_next;
1087 } 1085 }
1088 1086
1089 /* 1087 /*
@@ -1108,7 +1106,8 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
1108 goto out; 1106 goto out;
1109 } 1107 }
1110 locks_copy_lock(new_fl, request); 1108 locks_copy_lock(new_fl, request);
1111 locks_insert_lock(before, new_fl); 1109 locks_insert_lock_ctx(new_fl, &fl->fl_list);
1110 fl = new_fl;
1112 new_fl = NULL; 1111 new_fl = NULL;
1113 } 1112 }
1114 if (right) { 1113 if (right) {
@@ -1119,7 +1118,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
1119 left = new_fl2; 1118 left = new_fl2;
1120 new_fl2 = NULL; 1119 new_fl2 = NULL;
1121 locks_copy_lock(left, right); 1120 locks_copy_lock(left, right);
1122 locks_insert_lock(before, left); 1121 locks_insert_lock_ctx(left, &fl->fl_list);
1123 } 1122 }
1124 right->fl_start = request->fl_end + 1; 1123 right->fl_start = request->fl_end + 1;
1125 locks_wake_up_blocks(right); 1124 locks_wake_up_blocks(right);
@@ -1129,7 +1128,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
1129 locks_wake_up_blocks(left); 1128 locks_wake_up_blocks(left);
1130 } 1129 }
1131 out: 1130 out:
1132 spin_unlock(&inode->i_lock); 1131 spin_unlock(&ctx->flc_lock);
1133 /* 1132 /*
1134 * Free any unused locks. 1133 * Free any unused locks.
1135 */ 1134 */
@@ -1199,22 +1198,29 @@ EXPORT_SYMBOL(posix_lock_file_wait);
1199 */ 1198 */
1200int locks_mandatory_locked(struct file *file) 1199int locks_mandatory_locked(struct file *file)
1201{ 1200{
1201 int ret;
1202 struct inode *inode = file_inode(file); 1202 struct inode *inode = file_inode(file);
1203 struct file_lock_context *ctx;
1203 struct file_lock *fl; 1204 struct file_lock *fl;
1204 1205
1206 ctx = inode->i_flctx;
1207 if (!ctx || list_empty_careful(&ctx->flc_posix))
1208 return 0;
1209
1205 /* 1210 /*
1206 * Search the lock list for this inode for any POSIX locks. 1211 * Search the lock list for this inode for any POSIX locks.
1207 */ 1212 */
1208 spin_lock(&inode->i_lock); 1213 spin_lock(&ctx->flc_lock);
1209 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { 1214 ret = 0;
1210 if (!IS_POSIX(fl)) 1215 list_for_each_entry(fl, &ctx->flc_posix, fl_list) {
1211 continue;
1212 if (fl->fl_owner != current->files && 1216 if (fl->fl_owner != current->files &&
1213 fl->fl_owner != file) 1217 fl->fl_owner != file) {
1218 ret = -EAGAIN;
1214 break; 1219 break;
1220 }
1215 } 1221 }
1216 spin_unlock(&inode->i_lock); 1222 spin_unlock(&ctx->flc_lock);
1217 return fl ? -EAGAIN : 0; 1223 return ret;
1218} 1224}
1219 1225
1220/** 1226/**
@@ -1294,9 +1300,8 @@ static void lease_clear_pending(struct file_lock *fl, int arg)
1294} 1300}
1295 1301
1296/* We already had a lease on this file; just change its type */ 1302/* We already had a lease on this file; just change its type */
1297int lease_modify(struct file_lock **before, int arg, struct list_head *dispose) 1303int lease_modify(struct file_lock *fl, int arg, struct list_head *dispose)
1298{ 1304{
1299 struct file_lock *fl = *before;
1300 int error = assign_type(fl, arg); 1305 int error = assign_type(fl, arg);
1301 1306
1302 if (error) 1307 if (error)
@@ -1313,7 +1318,7 @@ int lease_modify(struct file_lock **before, int arg, struct list_head *dispose)
1313 printk(KERN_ERR "locks_delete_lock: fasync == %p\n", fl->fl_fasync); 1318 printk(KERN_ERR "locks_delete_lock: fasync == %p\n", fl->fl_fasync);
1314 fl->fl_fasync = NULL; 1319 fl->fl_fasync = NULL;
1315 } 1320 }
1316 locks_delete_lock(before, dispose); 1321 locks_delete_lock_ctx(fl, dispose);
1317 } 1322 }
1318 return 0; 1323 return 0;
1319} 1324}
@@ -1329,25 +1334,24 @@ static bool past_time(unsigned long then)
1329 1334
1330static void time_out_leases(struct inode *inode, struct list_head *dispose) 1335static void time_out_leases(struct inode *inode, struct list_head *dispose)
1331{ 1336{
1332 struct file_lock **before; 1337 struct file_lock_context *ctx = inode->i_flctx;
1333 struct file_lock *fl; 1338 struct file_lock *fl, *tmp;
1334 1339
1335 lockdep_assert_held(&inode->i_lock); 1340 lockdep_assert_held(&ctx->flc_lock);
1336 1341
1337 before = &inode->i_flock; 1342 list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list) {
1338 while ((fl = *before) && IS_LEASE(fl) && lease_breaking(fl)) {
1339 trace_time_out_leases(inode, fl); 1343 trace_time_out_leases(inode, fl);
1340 if (past_time(fl->fl_downgrade_time)) 1344 if (past_time(fl->fl_downgrade_time))
1341 lease_modify(before, F_RDLCK, dispose); 1345 lease_modify(fl, F_RDLCK, dispose);
1342 if (past_time(fl->fl_break_time)) 1346 if (past_time(fl->fl_break_time))
1343 lease_modify(before, F_UNLCK, dispose); 1347 lease_modify(fl, F_UNLCK, dispose);
1344 if (fl == *before) /* lease_modify may have freed fl */
1345 before = &fl->fl_next;
1346 } 1348 }
1347} 1349}
1348 1350
1349static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker) 1351static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker)
1350{ 1352{
1353 if ((breaker->fl_flags & FL_LAYOUT) != (lease->fl_flags & FL_LAYOUT))
1354 return false;
1351 if ((breaker->fl_flags & FL_DELEG) && (lease->fl_flags & FL_LEASE)) 1355 if ((breaker->fl_flags & FL_DELEG) && (lease->fl_flags & FL_LEASE))
1352 return false; 1356 return false;
1353 return locks_conflict(breaker, lease); 1357 return locks_conflict(breaker, lease);
@@ -1356,11 +1360,12 @@ static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker)
1356static bool 1360static bool
1357any_leases_conflict(struct inode *inode, struct file_lock *breaker) 1361any_leases_conflict(struct inode *inode, struct file_lock *breaker)
1358{ 1362{
1363 struct file_lock_context *ctx = inode->i_flctx;
1359 struct file_lock *fl; 1364 struct file_lock *fl;
1360 1365
1361 lockdep_assert_held(&inode->i_lock); 1366 lockdep_assert_held(&ctx->flc_lock);
1362 1367
1363 for (fl = inode->i_flock ; fl && IS_LEASE(fl); fl = fl->fl_next) { 1368 list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
1364 if (leases_conflict(fl, breaker)) 1369 if (leases_conflict(fl, breaker))
1365 return true; 1370 return true;
1366 } 1371 }
@@ -1384,7 +1389,8 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
1384{ 1389{
1385 int error = 0; 1390 int error = 0;
1386 struct file_lock *new_fl; 1391 struct file_lock *new_fl;
1387 struct file_lock *fl, **before; 1392 struct file_lock_context *ctx = inode->i_flctx;
1393 struct file_lock *fl;
1388 unsigned long break_time; 1394 unsigned long break_time;
1389 int want_write = (mode & O_ACCMODE) != O_RDONLY; 1395 int want_write = (mode & O_ACCMODE) != O_RDONLY;
1390 LIST_HEAD(dispose); 1396 LIST_HEAD(dispose);
@@ -1394,7 +1400,13 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
1394 return PTR_ERR(new_fl); 1400 return PTR_ERR(new_fl);
1395 new_fl->fl_flags = type; 1401 new_fl->fl_flags = type;
1396 1402
1397 spin_lock(&inode->i_lock); 1403 /* typically we will check that ctx is non-NULL before calling */
1404 if (!ctx) {
1405 WARN_ON_ONCE(1);
1406 return error;
1407 }
1408
1409 spin_lock(&ctx->flc_lock);
1398 1410
1399 time_out_leases(inode, &dispose); 1411 time_out_leases(inode, &dispose);
1400 1412
@@ -1408,9 +1420,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
1408 break_time++; /* so that 0 means no break time */ 1420 break_time++; /* so that 0 means no break time */
1409 } 1421 }
1410 1422
1411 for (before = &inode->i_flock; 1423 list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
1412 ((fl = *before) != NULL) && IS_LEASE(fl);
1413 before = &fl->fl_next) {
1414 if (!leases_conflict(fl, new_fl)) 1424 if (!leases_conflict(fl, new_fl))
1415 continue; 1425 continue;
1416 if (want_write) { 1426 if (want_write) {
@@ -1419,17 +1429,16 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
1419 fl->fl_flags |= FL_UNLOCK_PENDING; 1429 fl->fl_flags |= FL_UNLOCK_PENDING;
1420 fl->fl_break_time = break_time; 1430 fl->fl_break_time = break_time;
1421 } else { 1431 } else {
1422 if (lease_breaking(inode->i_flock)) 1432 if (lease_breaking(fl))
1423 continue; 1433 continue;
1424 fl->fl_flags |= FL_DOWNGRADE_PENDING; 1434 fl->fl_flags |= FL_DOWNGRADE_PENDING;
1425 fl->fl_downgrade_time = break_time; 1435 fl->fl_downgrade_time = break_time;
1426 } 1436 }
1427 if (fl->fl_lmops->lm_break(fl)) 1437 if (fl->fl_lmops->lm_break(fl))
1428 locks_delete_lock(before, &dispose); 1438 locks_delete_lock_ctx(fl, &dispose);
1429 } 1439 }
1430 1440
1431 fl = inode->i_flock; 1441 if (list_empty(&ctx->flc_lease))
1432 if (!fl || !IS_LEASE(fl))
1433 goto out; 1442 goto out;
1434 1443
1435 if (mode & O_NONBLOCK) { 1444 if (mode & O_NONBLOCK) {
@@ -1439,18 +1448,19 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
1439 } 1448 }
1440 1449
1441restart: 1450restart:
1442 break_time = inode->i_flock->fl_break_time; 1451 fl = list_first_entry(&ctx->flc_lease, struct file_lock, fl_list);
1452 break_time = fl->fl_break_time;
1443 if (break_time != 0) 1453 if (break_time != 0)
1444 break_time -= jiffies; 1454 break_time -= jiffies;
1445 if (break_time == 0) 1455 if (break_time == 0)
1446 break_time++; 1456 break_time++;
1447 locks_insert_block(inode->i_flock, new_fl); 1457 locks_insert_block(fl, new_fl);
1448 trace_break_lease_block(inode, new_fl); 1458 trace_break_lease_block(inode, new_fl);
1449 spin_unlock(&inode->i_lock); 1459 spin_unlock(&ctx->flc_lock);
1450 locks_dispose_list(&dispose); 1460 locks_dispose_list(&dispose);
1451 error = wait_event_interruptible_timeout(new_fl->fl_wait, 1461 error = wait_event_interruptible_timeout(new_fl->fl_wait,
1452 !new_fl->fl_next, break_time); 1462 !new_fl->fl_next, break_time);
1453 spin_lock(&inode->i_lock); 1463 spin_lock(&ctx->flc_lock);
1454 trace_break_lease_unblock(inode, new_fl); 1464 trace_break_lease_unblock(inode, new_fl);
1455 locks_delete_block(new_fl); 1465 locks_delete_block(new_fl);
1456 if (error >= 0) { 1466 if (error >= 0) {
@@ -1462,12 +1472,10 @@ restart:
1462 time_out_leases(inode, &dispose); 1472 time_out_leases(inode, &dispose);
1463 if (any_leases_conflict(inode, new_fl)) 1473 if (any_leases_conflict(inode, new_fl))
1464 goto restart; 1474 goto restart;
1465
1466 error = 0; 1475 error = 0;
1467 } 1476 }
1468
1469out: 1477out:
1470 spin_unlock(&inode->i_lock); 1478 spin_unlock(&ctx->flc_lock);
1471 locks_dispose_list(&dispose); 1479 locks_dispose_list(&dispose);
1472 locks_free_lock(new_fl); 1480 locks_free_lock(new_fl);
1473 return error; 1481 return error;
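
For readers tracking the conversion above: the old code kept every lock and lease on the single inode->i_flock list, chained through fl_next and protected by inode->i_lock. The new code hangs a struct file_lock_context off the inode, with one list per lock type and its own spinlock. A minimal sketch of the shape __break_lease() now relies on (illustrative only; the real definition lives in include/linux/fs.h and may carry additional fields):

        struct file_lock_context {
                spinlock_t              flc_lock;       /* guards all three lists */
                struct list_head        flc_flock;      /* flock(2) locks */
                struct list_head        flc_posix;      /* POSIX byte-range locks */
                struct list_head        flc_lease;      /* leases and delegations */
        };

        /* the lease-break scan becomes a plain list walk under flc_lock: */
        spin_lock(&ctx->flc_lock);
        list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
                if (leases_conflict(fl, new_fl))
                        fl->fl_flags |= FL_UNLOCK_PENDING;      /* or downgrade */
        }
        spin_unlock(&ctx->flc_lock);
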
@@ -1487,14 +1495,18 @@ EXPORT_SYMBOL(__break_lease);
1487void lease_get_mtime(struct inode *inode, struct timespec *time) 1495void lease_get_mtime(struct inode *inode, struct timespec *time)
1488{ 1496{
1489 bool has_lease = false; 1497 bool has_lease = false;
1490 struct file_lock *flock; 1498 struct file_lock_context *ctx = inode->i_flctx;
1499 struct file_lock *fl;
1491 1500
1492 if (inode->i_flock) { 1501 if (ctx && !list_empty_careful(&ctx->flc_lease)) {
1493 spin_lock(&inode->i_lock); 1502 spin_lock(&ctx->flc_lock);
1494 flock = inode->i_flock; 1503 if (!list_empty(&ctx->flc_lease)) {
1495 if (flock && IS_LEASE(flock) && (flock->fl_type == F_WRLCK)) 1504 fl = list_first_entry(&ctx->flc_lease,
1496 has_lease = true; 1505 struct file_lock, fl_list);
1497 spin_unlock(&inode->i_lock); 1506 if (fl->fl_type == F_WRLCK)
1507 has_lease = true;
1508 }
1509 spin_unlock(&ctx->flc_lock);
1498 } 1510 }
1499 1511
1500 if (has_lease) 1512 if (has_lease)
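
The lease_get_mtime() rewrite shows the double-check idiom used throughout this series: list_empty_careful() is a lockless peek that lets the common no-leases case skip the spinlock entirely, and emptiness is re-tested under flc_lock before the first entry is dereferenced. Condensed, with ctx read from inode->i_flctx:

        if (ctx && !list_empty_careful(&ctx->flc_lease)) {
                spin_lock(&ctx->flc_lock);
                if (!list_empty(&ctx->flc_lease)) {
                        /* the first entry is now stable and safe to inspect */
                }
                spin_unlock(&ctx->flc_lock);
        }
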
@@ -1532,20 +1544,22 @@ int fcntl_getlease(struct file *filp)
1532{ 1544{
1533 struct file_lock *fl; 1545 struct file_lock *fl;
1534 struct inode *inode = file_inode(filp); 1546 struct inode *inode = file_inode(filp);
1547 struct file_lock_context *ctx = inode->i_flctx;
1535 int type = F_UNLCK; 1548 int type = F_UNLCK;
1536 LIST_HEAD(dispose); 1549 LIST_HEAD(dispose);
1537 1550
1538 spin_lock(&inode->i_lock); 1551 if (ctx && !list_empty_careful(&ctx->flc_lease)) {
1539 time_out_leases(file_inode(filp), &dispose); 1552 spin_lock(&ctx->flc_lock);
1540 for (fl = file_inode(filp)->i_flock; fl && IS_LEASE(fl); 1553 time_out_leases(file_inode(filp), &dispose);
1541 fl = fl->fl_next) { 1554 list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
1542 if (fl->fl_file == filp) { 1555 if (fl->fl_file != filp)
1556 continue;
1543 type = target_leasetype(fl); 1557 type = target_leasetype(fl);
1544 break; 1558 break;
1545 } 1559 }
1560 spin_unlock(&ctx->flc_lock);
1561 locks_dispose_list(&dispose);
1546 } 1562 }
1547 spin_unlock(&inode->i_lock);
1548 locks_dispose_list(&dispose);
1549 return type; 1563 return type;
1550} 1564}
1551 1565
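
fcntl_getlease() also illustrates the dispose-list pattern used in these hunks: time_out_leases() unlinks expired leases onto a caller-provided list while flc_lock is held, and locks_dispose_list() frees them only after the lock is dropped, presumably because the locks' release callbacks must not run under the spinlock. Roughly:

        LIST_HEAD(dispose);

        spin_lock(&ctx->flc_lock);
        time_out_leases(inode, &dispose);       /* unlink expired leases */
        /* ... inspect the surviving leases ... */
        spin_unlock(&ctx->flc_lock);
        locks_dispose_list(&dispose);           /* free outside the lock */
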
@@ -1560,11 +1574,14 @@ int fcntl_getlease(struct file *filp)
1560 * conflict with the lease we're trying to set. 1574 * conflict with the lease we're trying to set.
1561 */ 1575 */
1562static int 1576static int
1563check_conflicting_open(const struct dentry *dentry, const long arg) 1577check_conflicting_open(const struct dentry *dentry, const long arg, int flags)
1564{ 1578{
1565 int ret = 0; 1579 int ret = 0;
1566 struct inode *inode = dentry->d_inode; 1580 struct inode *inode = dentry->d_inode;
1567 1581
1582 if (flags & FL_LAYOUT)
1583 return 0;
1584
1568 if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) 1585 if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
1569 return -EAGAIN; 1586 return -EAGAIN;
1570 1587
@@ -1578,9 +1595,10 @@ check_conflicting_open(const struct dentry *dentry, const long arg)
1578static int 1595static int
1579generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **priv) 1596generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **priv)
1580{ 1597{
1581 struct file_lock *fl, **before, **my_before = NULL, *lease; 1598 struct file_lock *fl, *my_fl = NULL, *lease;
1582 struct dentry *dentry = filp->f_path.dentry; 1599 struct dentry *dentry = filp->f_path.dentry;
1583 struct inode *inode = dentry->d_inode; 1600 struct inode *inode = dentry->d_inode;
1601 struct file_lock_context *ctx;
1584 bool is_deleg = (*flp)->fl_flags & FL_DELEG; 1602 bool is_deleg = (*flp)->fl_flags & FL_DELEG;
1585 int error; 1603 int error;
1586 LIST_HEAD(dispose); 1604 LIST_HEAD(dispose);
@@ -1588,6 +1606,10 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **priv)
1588 lease = *flp; 1606 lease = *flp;
1589 trace_generic_add_lease(inode, lease); 1607 trace_generic_add_lease(inode, lease);
1590 1608
1609 ctx = locks_get_lock_context(inode);
1610 if (!ctx)
1611 return -ENOMEM;
1612
1591 /* 1613 /*
1592 * In the delegation case we need mutual exclusion with 1614 * In the delegation case we need mutual exclusion with
1593 * a number of operations that take the i_mutex. We trylock 1615 * a number of operations that take the i_mutex. We trylock
@@ -1606,9 +1628,9 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **priv)
1606 return -EINVAL; 1628 return -EINVAL;
1607 } 1629 }
1608 1630
1609 spin_lock(&inode->i_lock); 1631 spin_lock(&ctx->flc_lock);
1610 time_out_leases(inode, &dispose); 1632 time_out_leases(inode, &dispose);
1611 error = check_conflicting_open(dentry, arg); 1633 error = check_conflicting_open(dentry, arg, lease->fl_flags);
1612 if (error) 1634 if (error)
1613 goto out; 1635 goto out;
1614 1636
@@ -1621,13 +1643,13 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **priv)
1621 * except for this filp. 1643 * except for this filp.
1622 */ 1644 */
1623 error = -EAGAIN; 1645 error = -EAGAIN;
1624 for (before = &inode->i_flock; 1646 list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
1625 ((fl = *before) != NULL) && IS_LEASE(fl); 1647 if (fl->fl_file == filp &&
1626 before = &fl->fl_next) { 1648 fl->fl_owner == lease->fl_owner) {
1627 if (fl->fl_file == filp) { 1649 my_fl = fl;
1628 my_before = before;
1629 continue; 1650 continue;
1630 } 1651 }
1652
1631 /* 1653 /*
1632 * No exclusive leases if someone else has a lease on 1654 * No exclusive leases if someone else has a lease on
1633 * this file: 1655 * this file:
@@ -1642,9 +1664,8 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **priv)
1642 goto out; 1664 goto out;
1643 } 1665 }
1644 1666
1645 if (my_before != NULL) { 1667 if (my_fl != NULL) {
1646 lease = *my_before; 1668 error = lease->fl_lmops->lm_change(my_fl, arg, &dispose);
1647 error = lease->fl_lmops->lm_change(my_before, arg, &dispose);
1648 if (error) 1669 if (error)
1649 goto out; 1670 goto out;
1650 goto out_setup; 1671 goto out_setup;
@@ -1654,7 +1675,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **priv)
1654 if (!leases_enable) 1675 if (!leases_enable)
1655 goto out; 1676 goto out;
1656 1677
1657 locks_insert_lock(before, lease); 1678 locks_insert_lock_ctx(lease, &ctx->flc_lease);
1658 /* 1679 /*
1659 * The check in break_lease() is lockless. It's possible for another 1680 * The check in break_lease() is lockless. It's possible for another
1660 * open to race in after we did the earlier check for a conflicting 1681 * open to race in after we did the earlier check for a conflicting
@@ -1665,46 +1686,51 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **priv)
1665 * precedes these checks. 1686 * precedes these checks.
1666 */ 1687 */
1667 smp_mb(); 1688 smp_mb();
1668 error = check_conflicting_open(dentry, arg); 1689 error = check_conflicting_open(dentry, arg, lease->fl_flags);
1669 if (error) 1690 if (error) {
1670 goto out_unlink; 1691 locks_unlink_lock_ctx(lease);
1692 goto out;
1693 }
1671 1694
1672out_setup: 1695out_setup:
1673 if (lease->fl_lmops->lm_setup) 1696 if (lease->fl_lmops->lm_setup)
1674 lease->fl_lmops->lm_setup(lease, priv); 1697 lease->fl_lmops->lm_setup(lease, priv);
1675out: 1698out:
1676 spin_unlock(&inode->i_lock); 1699 spin_unlock(&ctx->flc_lock);
1677 locks_dispose_list(&dispose); 1700 locks_dispose_list(&dispose);
1678 if (is_deleg) 1701 if (is_deleg)
1679 mutex_unlock(&inode->i_mutex); 1702 mutex_unlock(&inode->i_mutex);
1680 if (!error && !my_before) 1703 if (!error && !my_fl)
1681 *flp = NULL; 1704 *flp = NULL;
1682 return error; 1705 return error;
1683out_unlink:
1684 locks_unlink_lock(before);
1685 goto out;
1686} 1706}
1687 1707
1688static int generic_delete_lease(struct file *filp) 1708static int generic_delete_lease(struct file *filp, void *owner)
1689{ 1709{
1690 int error = -EAGAIN; 1710 int error = -EAGAIN;
1691 struct file_lock *fl, **before; 1711 struct file_lock *fl, *victim = NULL;
1692 struct dentry *dentry = filp->f_path.dentry; 1712 struct dentry *dentry = filp->f_path.dentry;
1693 struct inode *inode = dentry->d_inode; 1713 struct inode *inode = dentry->d_inode;
1714 struct file_lock_context *ctx = inode->i_flctx;
1694 LIST_HEAD(dispose); 1715 LIST_HEAD(dispose);
1695 1716
1696 spin_lock(&inode->i_lock); 1717 if (!ctx) {
1697 time_out_leases(inode, &dispose); 1718 trace_generic_delete_lease(inode, NULL);
1698 for (before = &inode->i_flock; 1719 return error;
1699 ((fl = *before) != NULL) && IS_LEASE(fl); 1720 }
1700 before = &fl->fl_next) { 1721
1701 if (fl->fl_file == filp) 1722 spin_lock(&ctx->flc_lock);
1723 list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
1724 if (fl->fl_file == filp &&
1725 fl->fl_owner == owner) {
1726 victim = fl;
1702 break; 1727 break;
1728 }
1703 } 1729 }
1704 trace_generic_delete_lease(inode, fl); 1730 trace_generic_delete_lease(inode, fl);
1705 if (fl) 1731 if (victim)
1706 error = fl->fl_lmops->lm_change(before, F_UNLCK, &dispose); 1732 error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose);
1707 spin_unlock(&inode->i_lock); 1733 spin_unlock(&ctx->flc_lock);
1708 locks_dispose_list(&dispose); 1734 locks_dispose_list(&dispose);
1709 return error; 1735 return error;
1710} 1736}
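
generic_delete_lease() now takes an owner cookie and matches on fl_file and fl_owner together, so two lease-like objects on the same struct file can be torn down independently (the FL_LAYOUT exemption added to check_conflicting_open() points at the intended user: pNFS layouts riding on the lease machinery). The fcntl() path below supplies the file itself as the cookie:

        /* from fcntl_setlease(): unlock identifies the lease by filp */
        if (arg == F_UNLCK)
                return vfs_setlease(filp, F_UNLCK, NULL, (void **)&filp);
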
@@ -1737,13 +1763,14 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp,
1737 1763
1738 switch (arg) { 1764 switch (arg) {
1739 case F_UNLCK: 1765 case F_UNLCK:
1740 return generic_delete_lease(filp); 1766 return generic_delete_lease(filp, *priv);
1741 case F_RDLCK: 1767 case F_RDLCK:
1742 case F_WRLCK: 1768 case F_WRLCK:
1743 if (!(*flp)->fl_lmops->lm_break) { 1769 if (!(*flp)->fl_lmops->lm_break) {
1744 WARN_ON_ONCE(1); 1770 WARN_ON_ONCE(1);
1745 return -ENOLCK; 1771 return -ENOLCK;
1746 } 1772 }
1773
1747 return generic_add_lease(filp, arg, flp, priv); 1774 return generic_add_lease(filp, arg, flp, priv);
1748 default: 1775 default:
1749 return -EINVAL; 1776 return -EINVAL;
@@ -1816,7 +1843,7 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
1816int fcntl_setlease(unsigned int fd, struct file *filp, long arg) 1843int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
1817{ 1844{
1818 if (arg == F_UNLCK) 1845 if (arg == F_UNLCK)
1819 return vfs_setlease(filp, F_UNLCK, NULL, NULL); 1846 return vfs_setlease(filp, F_UNLCK, NULL, (void **)&filp);
1820 return do_fcntl_add_lease(fd, filp, arg); 1847 return do_fcntl_add_lease(fd, filp, arg);
1821} 1848}
1822 1849
@@ -2171,7 +2198,7 @@ again:
2171 */ 2198 */
2172 /* 2199 /*
2173 * we need that spin_lock here - it prevents reordering between 2200 * we need that spin_lock here - it prevents reordering between
2174 * update of inode->i_flock and check for it done in close(). 2201 * update of i_flctx->flc_posix and check for it done in close().
2175 * rcu_read_lock() wouldn't do. 2202 * rcu_read_lock() wouldn't do.
2176 */ 2203 */
2177 spin_lock(&current->files->file_lock); 2204 spin_lock(&current->files->file_lock);
@@ -2331,13 +2358,14 @@ out:
2331void locks_remove_posix(struct file *filp, fl_owner_t owner) 2358void locks_remove_posix(struct file *filp, fl_owner_t owner)
2332{ 2359{
2333 struct file_lock lock; 2360 struct file_lock lock;
2361 struct file_lock_context *ctx = file_inode(filp)->i_flctx;
2334 2362
2335 /* 2363 /*
2336 * If there are no locks held on this file, we don't need to call 2364 * If there are no locks held on this file, we don't need to call
2337 * posix_lock_file(). Another process could be setting a lock on this 2365 * posix_lock_file(). Another process could be setting a lock on this
2338 * file at the same time, but we wouldn't remove that lock anyway. 2366 * file at the same time, but we wouldn't remove that lock anyway.
2339 */ 2367 */
2340 if (!file_inode(filp)->i_flock) 2368 if (!ctx || list_empty(&ctx->flc_posix))
2341 return; 2369 return;
2342 2370
2343 lock.fl_type = F_UNLCK; 2371 lock.fl_type = F_UNLCK;
@@ -2358,67 +2386,68 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner)
2358 2386
2359EXPORT_SYMBOL(locks_remove_posix); 2387EXPORT_SYMBOL(locks_remove_posix);
2360 2388
2389/* The i_flctx must be valid when calling into here */
2390static void
2391locks_remove_flock(struct file *filp)
2392{
2393 struct file_lock fl = {
2394 .fl_owner = filp,
2395 .fl_pid = current->tgid,
2396 .fl_file = filp,
2397 .fl_flags = FL_FLOCK,
2398 .fl_type = F_UNLCK,
2399 .fl_end = OFFSET_MAX,
2400 };
2401 struct file_lock_context *flctx = file_inode(filp)->i_flctx;
2402
2403 if (list_empty(&flctx->flc_flock))
2404 return;
2405
2406 if (filp->f_op->flock)
2407 filp->f_op->flock(filp, F_SETLKW, &fl);
2408 else
2409 flock_lock_file(filp, &fl);
2410
2411 if (fl.fl_ops && fl.fl_ops->fl_release_private)
2412 fl.fl_ops->fl_release_private(&fl);
2413}
2414
2415/* The i_flctx must be valid when calling into here */
2416static void
2417locks_remove_lease(struct file *filp)
2418{
2419 struct inode *inode = file_inode(filp);
2420 struct file_lock_context *ctx = inode->i_flctx;
2421 struct file_lock *fl, *tmp;
2422 LIST_HEAD(dispose);
2423
2424 if (list_empty(&ctx->flc_lease))
2425 return;
2426
2427 spin_lock(&ctx->flc_lock);
2428 list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list)
2429 if (filp == fl->fl_file)
2430 lease_modify(fl, F_UNLCK, &dispose);
2431 spin_unlock(&ctx->flc_lock);
2432 locks_dispose_list(&dispose);
2433}
2434
2361/* 2435/*
2362 * This function is called on the last close of an open file. 2436 * This function is called on the last close of an open file.
2363 */ 2437 */
2364void locks_remove_file(struct file *filp) 2438void locks_remove_file(struct file *filp)
2365{ 2439{
2366 struct inode * inode = file_inode(filp); 2440 if (!file_inode(filp)->i_flctx)
2367 struct file_lock *fl;
2368 struct file_lock **before;
2369 LIST_HEAD(dispose);
2370
2371 if (!inode->i_flock)
2372 return; 2441 return;
2373 2442
2443 /* remove any OFD locks */
2374 locks_remove_posix(filp, filp); 2444 locks_remove_posix(filp, filp);
2375 2445
2376 if (filp->f_op->flock) { 2446 /* remove flock locks */
2377 struct file_lock fl = { 2447 locks_remove_flock(filp);
2378 .fl_owner = filp,
2379 .fl_pid = current->tgid,
2380 .fl_file = filp,
2381 .fl_flags = FL_FLOCK,
2382 .fl_type = F_UNLCK,
2383 .fl_end = OFFSET_MAX,
2384 };
2385 filp->f_op->flock(filp, F_SETLKW, &fl);
2386 if (fl.fl_ops && fl.fl_ops->fl_release_private)
2387 fl.fl_ops->fl_release_private(&fl);
2388 }
2389
2390 spin_lock(&inode->i_lock);
2391 before = &inode->i_flock;
2392 2448
2393 while ((fl = *before) != NULL) { 2449 /* remove any leases */
2394 if (fl->fl_file == filp) { 2450 locks_remove_lease(filp);
2395 if (IS_LEASE(fl)) {
2396 lease_modify(before, F_UNLCK, &dispose);
2397 continue;
2398 }
2399
2400 /*
2401 * There's a leftover lock on the list of a type that
2402 * we didn't expect to see. Most likely a classic
2403 * POSIX lock that ended up not getting released
2404 * properly, or that raced onto the list somehow. Log
2405 * some info about it and then just remove it from
2406 * the list.
2407 */
2408 WARN(!IS_FLOCK(fl),
2409 "leftover lock: dev=%u:%u ino=%lu type=%hhd flags=0x%x start=%lld end=%lld\n",
2410 MAJOR(inode->i_sb->s_dev),
2411 MINOR(inode->i_sb->s_dev), inode->i_ino,
2412 fl->fl_type, fl->fl_flags,
2413 fl->fl_start, fl->fl_end);
2414
2415 locks_delete_lock(before, &dispose);
2416 continue;
2417 }
2418 before = &fl->fl_next;
2419 }
2420 spin_unlock(&inode->i_lock);
2421 locks_dispose_list(&dispose);
2422} 2451}
2423 2452
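
With flock locks, POSIX locks and leases now living on separate typed lists, locks_remove_file() no longer needs the old catch-all walk and its "leftover lock" WARN: each helper only ever sees its own lock type. Note that locks_remove_lease() uses the _safe list iterator, since lease_modify() may unlink the entry being visited:

        list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list)
                if (filp == fl->fl_file)
                        lease_modify(fl, F_UNLCK, &dispose);    /* may delete fl */
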
2424/** 2453/**
@@ -2621,6 +2650,9 @@ static int __init filelock_init(void)
2621{ 2650{
2622 int i; 2651 int i;
2623 2652
2653 flctx_cache = kmem_cache_create("file_lock_ctx",
2654 sizeof(struct file_lock_context), 0, SLAB_PANIC, NULL);
2655
2624 filelock_cache = kmem_cache_create("file_lock_cache", 2656 filelock_cache = kmem_cache_create("file_lock_cache",
2625 sizeof(struct file_lock), 0, SLAB_PANIC, NULL); 2657 sizeof(struct file_lock), 0, SLAB_PANIC, NULL);
2626 2658
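
filelock_init() now carves out a second slab for the contexts; SLAB_PANIC means boot fails loudly if either cache cannot be created, which is why no error check follows. The context itself is allocated lazily on first use. A sketch of what locks_get_lock_context(), which sits outside these hunks, plausibly does (the cmpxchg publication step is an assumption here):

        ctx = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
        if (ctx) {
                spin_lock_init(&ctx->flc_lock);
                INIT_LIST_HEAD(&ctx->flc_flock);
                INIT_LIST_HEAD(&ctx->flc_posix);
                INIT_LIST_HEAD(&ctx->flc_lease);
                /* publish to inode->i_flctx with cmpxchg(); a racing
                 * allocator frees its copy and adopts the winner's */
        }
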
diff --git a/fs/mount.h b/fs/mount.h
index 0ad6f760ce52..6a61c2b3e385 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -2,6 +2,7 @@
2#include <linux/seq_file.h> 2#include <linux/seq_file.h>
3#include <linux/poll.h> 3#include <linux/poll.h>
4#include <linux/ns_common.h> 4#include <linux/ns_common.h>
5#include <linux/fs_pin.h>
5 6
6struct mnt_namespace { 7struct mnt_namespace {
7 atomic_t count; 8 atomic_t count;
@@ -62,7 +63,8 @@ struct mount {
62 int mnt_group_id; /* peer group identifier */ 63 int mnt_group_id; /* peer group identifier */
63 int mnt_expiry_mark; /* true if marked for expiry */ 64 int mnt_expiry_mark; /* true if marked for expiry */
64 struct hlist_head mnt_pins; 65 struct hlist_head mnt_pins;
65 struct path mnt_ex_mountpoint; 66 struct fs_pin mnt_umount;
67 struct dentry *mnt_ex_mountpoint;
66}; 68};
67 69
68#define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */ 70#define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */
diff --git a/fs/namei.c b/fs/namei.c
index bc35b02883bb..96ca11dea4a2 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -118,15 +118,6 @@
118 * POSIX.1 2.4: an empty pathname is invalid (ENOENT). 118 * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
119 * PATH_MAX includes the nul terminator --RR. 119 * PATH_MAX includes the nul terminator --RR.
120 */ 120 */
121void final_putname(struct filename *name)
122{
123 if (name->separate) {
124 __putname(name->name);
125 kfree(name);
126 } else {
127 __putname(name);
128 }
129}
130 121
131#define EMBEDDED_NAME_MAX (PATH_MAX - sizeof(struct filename)) 122#define EMBEDDED_NAME_MAX (PATH_MAX - sizeof(struct filename))
132 123
@@ -145,6 +136,7 @@ getname_flags(const char __user *filename, int flags, int *empty)
145 result = __getname(); 136 result = __getname();
146 if (unlikely(!result)) 137 if (unlikely(!result))
147 return ERR_PTR(-ENOMEM); 138 return ERR_PTR(-ENOMEM);
139 result->refcnt = 1;
148 140
149 /* 141 /*
150 * First, try to embed the struct filename inside the names_cache 142 * First, try to embed the struct filename inside the names_cache
@@ -179,6 +171,7 @@ recopy:
179 } 171 }
180 result->name = kname; 172 result->name = kname;
181 result->separate = true; 173 result->separate = true;
174 result->refcnt = 1;
182 max = PATH_MAX; 175 max = PATH_MAX;
183 goto recopy; 176 goto recopy;
184 } 177 }
@@ -202,7 +195,7 @@ recopy:
202 return result; 195 return result;
203 196
204error: 197error:
205 final_putname(result); 198 putname(result);
206 return err; 199 return err;
207} 200}
208 201
@@ -212,43 +205,56 @@ getname(const char __user * filename)
212 return getname_flags(filename, 0, NULL); 205 return getname_flags(filename, 0, NULL);
213} 206}
214 207
215/*
216 * The "getname_kernel()" interface doesn't do pathnames longer
217 * than EMBEDDED_NAME_MAX. Deal with it - you're a kernel user.
218 */
219struct filename * 208struct filename *
220getname_kernel(const char * filename) 209getname_kernel(const char * filename)
221{ 210{
222 struct filename *result; 211 struct filename *result;
223 char *kname; 212 int len = strlen(filename) + 1;
224 int len;
225
226 len = strlen(filename);
227 if (len >= EMBEDDED_NAME_MAX)
228 return ERR_PTR(-ENAMETOOLONG);
229 213
230 result = __getname(); 214 result = __getname();
231 if (unlikely(!result)) 215 if (unlikely(!result))
232 return ERR_PTR(-ENOMEM); 216 return ERR_PTR(-ENOMEM);
233 217
234 kname = (char *)result + sizeof(*result); 218 if (len <= EMBEDDED_NAME_MAX) {
235 result->name = kname; 219 result->name = (char *)(result) + sizeof(*result);
220 result->separate = false;
221 } else if (len <= PATH_MAX) {
222 struct filename *tmp;
223
224 tmp = kmalloc(sizeof(*tmp), GFP_KERNEL);
225 if (unlikely(!tmp)) {
226 __putname(result);
227 return ERR_PTR(-ENOMEM);
228 }
229 tmp->name = (char *)result;
230 tmp->separate = true;
231 result = tmp;
232 } else {
233 __putname(result);
234 return ERR_PTR(-ENAMETOOLONG);
235 }
236 memcpy((char *)result->name, filename, len);
236 result->uptr = NULL; 237 result->uptr = NULL;
237 result->aname = NULL; 238 result->aname = NULL;
238 result->separate = false; 239 result->refcnt = 1;
240 audit_getname(result);
239 241
240 strlcpy(kname, filename, EMBEDDED_NAME_MAX);
241 return result; 242 return result;
242} 243}
243 244
244#ifdef CONFIG_AUDITSYSCALL
245void putname(struct filename *name) 245void putname(struct filename *name)
246{ 246{
247 if (unlikely(!audit_dummy_context())) 247 BUG_ON(name->refcnt <= 0);
248 return audit_putname(name); 248
249 final_putname(name); 249 if (--name->refcnt > 0)
250 return;
251
252 if (name->separate) {
253 __putname(name->name);
254 kfree(name);
255 } else
256 __putname(name);
250} 257}
251#endif
252 258
253static int check_acl(struct inode *inode, int mask) 259static int check_acl(struct inode *inode, int mask)
254{ 260{
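
The struct filename changes above replace the old getname()/final_putname() split with plain reference counting: every getname variant returns the name with refcnt == 1, audit_getname() takes an extra reference instead of stealing the object, and putname() frees only on the final drop. Conceptually (refcnt is a bare int at this point and only core VFS/audit code should touch it; the increment below just stands in for what audit_getname() does):

        struct filename *name = getname_kernel("/etc/fstab");   /* refcnt == 1 */
        if (!IS_ERR(name)) {
                name->refcnt++;         /* as audit_getname() would */
                putname(name);          /* 2 -> 1, object survives */
                putname(name);          /* 1 -> 0, memory released */
        }

getname_kernel() also stops rejecting names longer than EMBEDDED_NAME_MAX: up to PATH_MAX it allocates a separate struct filename and reuses the __getname() buffer purely for the string.
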
@@ -2036,31 +2042,47 @@ static int filename_lookup(int dfd, struct filename *name,
2036static int do_path_lookup(int dfd, const char *name, 2042static int do_path_lookup(int dfd, const char *name,
2037 unsigned int flags, struct nameidata *nd) 2043 unsigned int flags, struct nameidata *nd)
2038{ 2044{
2039 struct filename filename = { .name = name }; 2045 struct filename *filename = getname_kernel(name);
2046 int retval = PTR_ERR(filename);
2040 2047
2041 return filename_lookup(dfd, &filename, flags, nd); 2048 if (!IS_ERR(filename)) {
2049 retval = filename_lookup(dfd, filename, flags, nd);
2050 putname(filename);
2051 }
2052 return retval;
2042} 2053}
2043 2054
2044/* does lookup, returns the object with parent locked */ 2055/* does lookup, returns the object with parent locked */
2045struct dentry *kern_path_locked(const char *name, struct path *path) 2056struct dentry *kern_path_locked(const char *name, struct path *path)
2046{ 2057{
2058 struct filename *filename = getname_kernel(name);
2047 struct nameidata nd; 2059 struct nameidata nd;
2048 struct dentry *d; 2060 struct dentry *d;
2049 int err = do_path_lookup(AT_FDCWD, name, LOOKUP_PARENT, &nd); 2061 int err;
2050 if (err) 2062
2051 return ERR_PTR(err); 2063 if (IS_ERR(filename))
2064 return ERR_CAST(filename);
2065
2066 err = filename_lookup(AT_FDCWD, filename, LOOKUP_PARENT, &nd);
2067 if (err) {
2068 d = ERR_PTR(err);
2069 goto out;
2070 }
2052 if (nd.last_type != LAST_NORM) { 2071 if (nd.last_type != LAST_NORM) {
2053 path_put(&nd.path); 2072 path_put(&nd.path);
2054 return ERR_PTR(-EINVAL); 2073 d = ERR_PTR(-EINVAL);
2074 goto out;
2055 } 2075 }
2056 mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); 2076 mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
2057 d = __lookup_hash(&nd.last, nd.path.dentry, 0); 2077 d = __lookup_hash(&nd.last, nd.path.dentry, 0);
2058 if (IS_ERR(d)) { 2078 if (IS_ERR(d)) {
2059 mutex_unlock(&nd.path.dentry->d_inode->i_mutex); 2079 mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
2060 path_put(&nd.path); 2080 path_put(&nd.path);
2061 return d; 2081 goto out;
2062 } 2082 }
2063 *path = nd.path; 2083 *path = nd.path;
2084out:
2085 putname(filename);
2064 return d; 2086 return d;
2065} 2087}
2066 2088
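
do_path_lookup() and kern_path_locked() stop faking an on-stack struct filename and go through getname_kernel() like every other caller, presumably because a refcounted name that audit may hold onto cannot live on the stack. The resulting pattern is uniform:

        struct filename *filename = getname_kernel(name);

        if (IS_ERR(filename))
                return ERR_CAST(filename);
        /* ... filename_lookup(dfd, filename, flags, &nd) ... */
        putname(filename);
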
@@ -2351,13 +2373,17 @@ static int
2351filename_mountpoint(int dfd, struct filename *s, struct path *path, 2373filename_mountpoint(int dfd, struct filename *s, struct path *path,
2352 unsigned int flags) 2374 unsigned int flags)
2353{ 2375{
2354 int error = path_mountpoint(dfd, s->name, path, flags | LOOKUP_RCU); 2376 int error;
2377 if (IS_ERR(s))
2378 return PTR_ERR(s);
2379 error = path_mountpoint(dfd, s->name, path, flags | LOOKUP_RCU);
2355 if (unlikely(error == -ECHILD)) 2380 if (unlikely(error == -ECHILD))
2356 error = path_mountpoint(dfd, s->name, path, flags); 2381 error = path_mountpoint(dfd, s->name, path, flags);
2357 if (unlikely(error == -ESTALE)) 2382 if (unlikely(error == -ESTALE))
2358 error = path_mountpoint(dfd, s->name, path, flags | LOOKUP_REVAL); 2383 error = path_mountpoint(dfd, s->name, path, flags | LOOKUP_REVAL);
2359 if (likely(!error)) 2384 if (likely(!error))
2360 audit_inode(s, path->dentry, 0); 2385 audit_inode(s, path->dentry, 0);
2386 putname(s);
2361 return error; 2387 return error;
2362} 2388}
2363 2389
@@ -2379,21 +2405,14 @@ int
2379user_path_mountpoint_at(int dfd, const char __user *name, unsigned int flags, 2405user_path_mountpoint_at(int dfd, const char __user *name, unsigned int flags,
2380 struct path *path) 2406 struct path *path)
2381{ 2407{
2382 struct filename *s = getname(name); 2408 return filename_mountpoint(dfd, getname(name), path, flags);
2383 int error;
2384 if (IS_ERR(s))
2385 return PTR_ERR(s);
2386 error = filename_mountpoint(dfd, s, path, flags);
2387 putname(s);
2388 return error;
2389} 2409}
2390 2410
2391int 2411int
2392kern_path_mountpoint(int dfd, const char *name, struct path *path, 2412kern_path_mountpoint(int dfd, const char *name, struct path *path,
2393 unsigned int flags) 2413 unsigned int flags)
2394{ 2414{
2395 struct filename s = {.name = name}; 2415 return filename_mountpoint(dfd, getname_kernel(name), path, flags);
2396 return filename_mountpoint(dfd, &s, path, flags);
2397} 2416}
2398EXPORT_SYMBOL(kern_path_mountpoint); 2417EXPORT_SYMBOL(kern_path_mountpoint);
2399 2418
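
filename_mountpoint() adopts a consume-on-call convention: it accepts even an ERR_PTR and always releases the name before returning, so both callers collapse to one-liners with no local cleanup:

        return filename_mountpoint(dfd, getname(name), path, flags);
        return filename_mountpoint(dfd, getname_kernel(name), path, flags);
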
@@ -3273,7 +3292,7 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
3273{ 3292{
3274 struct nameidata nd; 3293 struct nameidata nd;
3275 struct file *file; 3294 struct file *file;
3276 struct filename filename = { .name = name }; 3295 struct filename *filename;
3277 int flags = op->lookup_flags | LOOKUP_ROOT; 3296 int flags = op->lookup_flags | LOOKUP_ROOT;
3278 3297
3279 nd.root.mnt = mnt; 3298 nd.root.mnt = mnt;
@@ -3282,15 +3301,20 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
3282 if (d_is_symlink(dentry) && op->intent & LOOKUP_OPEN) 3301 if (d_is_symlink(dentry) && op->intent & LOOKUP_OPEN)
3283 return ERR_PTR(-ELOOP); 3302 return ERR_PTR(-ELOOP);
3284 3303
3285 file = path_openat(-1, &filename, &nd, op, flags | LOOKUP_RCU); 3304 filename = getname_kernel(name);
3305 if (unlikely(IS_ERR(filename)))
3306 return ERR_CAST(filename);
3307
3308 file = path_openat(-1, filename, &nd, op, flags | LOOKUP_RCU);
3286 if (unlikely(file == ERR_PTR(-ECHILD))) 3309 if (unlikely(file == ERR_PTR(-ECHILD)))
3287 file = path_openat(-1, &filename, &nd, op, flags); 3310 file = path_openat(-1, filename, &nd, op, flags);
3288 if (unlikely(file == ERR_PTR(-ESTALE))) 3311 if (unlikely(file == ERR_PTR(-ESTALE)))
3289 file = path_openat(-1, &filename, &nd, op, flags | LOOKUP_REVAL); 3312 file = path_openat(-1, filename, &nd, op, flags | LOOKUP_REVAL);
3313 putname(filename);
3290 return file; 3314 return file;
3291} 3315}
3292 3316
3293struct dentry *kern_path_create(int dfd, const char *pathname, 3317static struct dentry *filename_create(int dfd, struct filename *name,
3294 struct path *path, unsigned int lookup_flags) 3318 struct path *path, unsigned int lookup_flags)
3295{ 3319{
3296 struct dentry *dentry = ERR_PTR(-EEXIST); 3320 struct dentry *dentry = ERR_PTR(-EEXIST);
@@ -3305,7 +3329,7 @@ struct dentry *kern_path_create(int dfd, const char *pathname,
3305 */ 3329 */
3306 lookup_flags &= LOOKUP_REVAL; 3330 lookup_flags &= LOOKUP_REVAL;
3307 3331
3308 error = do_path_lookup(dfd, pathname, LOOKUP_PARENT|lookup_flags, &nd); 3332 error = filename_lookup(dfd, name, LOOKUP_PARENT|lookup_flags, &nd);
3309 if (error) 3333 if (error)
3310 return ERR_PTR(error); 3334 return ERR_PTR(error);
3311 3335
@@ -3359,6 +3383,19 @@ out:
3359 path_put(&nd.path); 3383 path_put(&nd.path);
3360 return dentry; 3384 return dentry;
3361} 3385}
3386
3387struct dentry *kern_path_create(int dfd, const char *pathname,
3388 struct path *path, unsigned int lookup_flags)
3389{
3390 struct filename *filename = getname_kernel(pathname);
3391 struct dentry *res;
3392
3393 if (IS_ERR(filename))
3394 return ERR_CAST(filename);
3395 res = filename_create(dfd, filename, path, lookup_flags);
3396 putname(filename);
3397 return res;
3398}
3362EXPORT_SYMBOL(kern_path_create); 3399EXPORT_SYMBOL(kern_path_create);
3363 3400
3364void done_path_create(struct path *path, struct dentry *dentry) 3401void done_path_create(struct path *path, struct dentry *dentry)
@@ -3377,7 +3414,7 @@ struct dentry *user_path_create(int dfd, const char __user *pathname,
3377 struct dentry *res; 3414 struct dentry *res;
3378 if (IS_ERR(tmp)) 3415 if (IS_ERR(tmp))
3379 return ERR_CAST(tmp); 3416 return ERR_CAST(tmp);
3380 res = kern_path_create(dfd, tmp->name, path, lookup_flags); 3417 res = filename_create(dfd, tmp, path, lookup_flags);
3381 putname(tmp); 3418 putname(tmp);
3382 return res; 3419 return res;
3383} 3420}
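
kern_path_create() becomes a thin wrapper around the new filename_create(), so user_path_create() can hand over its already-built struct filename directly; routing it through kern_path_create(dfd, tmp->name, ...) would now trigger a second, redundant getname_kernel() allocation:

        res = filename_create(dfd, tmp, path, lookup_flags);
        putname(tmp);
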
diff --git a/fs/namespace.c b/fs/namespace.c
index cd1e9681a0cf..72a286e0d33e 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -190,6 +190,14 @@ unsigned int mnt_get_count(struct mount *mnt)
190#endif 190#endif
191} 191}
192 192
193static void drop_mountpoint(struct fs_pin *p)
194{
195 struct mount *m = container_of(p, struct mount, mnt_umount);
196 dput(m->mnt_ex_mountpoint);
197 pin_remove(p);
198 mntput(&m->mnt);
199}
200
193static struct mount *alloc_vfsmnt(const char *name) 201static struct mount *alloc_vfsmnt(const char *name)
194{ 202{
195 struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); 203 struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
@@ -201,7 +209,7 @@ static struct mount *alloc_vfsmnt(const char *name)
201 goto out_free_cache; 209 goto out_free_cache;
202 210
203 if (name) { 211 if (name) {
204 mnt->mnt_devname = kstrdup(name, GFP_KERNEL); 212 mnt->mnt_devname = kstrdup_const(name, GFP_KERNEL);
205 if (!mnt->mnt_devname) 213 if (!mnt->mnt_devname)
206 goto out_free_id; 214 goto out_free_id;
207 } 215 }
@@ -229,12 +237,13 @@ static struct mount *alloc_vfsmnt(const char *name)
229#ifdef CONFIG_FSNOTIFY 237#ifdef CONFIG_FSNOTIFY
230 INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks); 238 INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
231#endif 239#endif
240 init_fs_pin(&mnt->mnt_umount, drop_mountpoint);
232 } 241 }
233 return mnt; 242 return mnt;
234 243
235#ifdef CONFIG_SMP 244#ifdef CONFIG_SMP
236out_free_devname: 245out_free_devname:
237 kfree(mnt->mnt_devname); 246 kfree_const(mnt->mnt_devname);
238#endif 247#endif
239out_free_id: 248out_free_id:
240 mnt_free_id(mnt); 249 mnt_free_id(mnt);
@@ -568,7 +577,7 @@ int sb_prepare_remount_readonly(struct super_block *sb)
568 577
569static void free_vfsmnt(struct mount *mnt) 578static void free_vfsmnt(struct mount *mnt)
570{ 579{
571 kfree(mnt->mnt_devname); 580 kfree_const(mnt->mnt_devname);
572#ifdef CONFIG_SMP 581#ifdef CONFIG_SMP
573 free_percpu(mnt->mnt_pcp); 582 free_percpu(mnt->mnt_pcp);
574#endif 583#endif
@@ -1289,7 +1298,6 @@ static HLIST_HEAD(unmounted); /* protected by namespace_sem */
1289 1298
1290static void namespace_unlock(void) 1299static void namespace_unlock(void)
1291{ 1300{
1292 struct mount *mnt;
1293 struct hlist_head head = unmounted; 1301 struct hlist_head head = unmounted;
1294 1302
1295 if (likely(hlist_empty(&head))) { 1303 if (likely(hlist_empty(&head))) {
@@ -1299,23 +1307,11 @@ static void namespace_unlock(void)
1299 1307
1300 head.first->pprev = &head.first; 1308 head.first->pprev = &head.first;
1301 INIT_HLIST_HEAD(&unmounted); 1309 INIT_HLIST_HEAD(&unmounted);
1302
1303 /* undo decrements we'd done in umount_tree() */
1304 hlist_for_each_entry(mnt, &head, mnt_hash)
1305 if (mnt->mnt_ex_mountpoint.mnt)
1306 mntget(mnt->mnt_ex_mountpoint.mnt);
1307
1308 up_write(&namespace_sem); 1310 up_write(&namespace_sem);
1309 1311
1310 synchronize_rcu(); 1312 synchronize_rcu();
1311 1313
1312 while (!hlist_empty(&head)) { 1314 group_pin_kill(&head);
1313 mnt = hlist_entry(head.first, struct mount, mnt_hash);
1314 hlist_del_init(&mnt->mnt_hash);
1315 if (mnt->mnt_ex_mountpoint.mnt)
1316 path_put(&mnt->mnt_ex_mountpoint);
1317 mntput(&mnt->mnt);
1318 }
1319} 1315}
1320 1316
1321static inline void namespace_lock(void) 1317static inline void namespace_lock(void)
@@ -1334,7 +1330,6 @@ void umount_tree(struct mount *mnt, int how)
1334{ 1330{
1335 HLIST_HEAD(tmp_list); 1331 HLIST_HEAD(tmp_list);
1336 struct mount *p; 1332 struct mount *p;
1337 struct mount *last = NULL;
1338 1333
1339 for (p = mnt; p; p = next_mnt(p, mnt)) { 1334 for (p = mnt; p; p = next_mnt(p, mnt)) {
1340 hlist_del_init_rcu(&p->mnt_hash); 1335 hlist_del_init_rcu(&p->mnt_hash);
@@ -1347,33 +1342,28 @@ void umount_tree(struct mount *mnt, int how)
1347 if (how) 1342 if (how)
1348 propagate_umount(&tmp_list); 1343 propagate_umount(&tmp_list);
1349 1344
1350 hlist_for_each_entry(p, &tmp_list, mnt_hash) { 1345 while (!hlist_empty(&tmp_list)) {
1346 p = hlist_entry(tmp_list.first, struct mount, mnt_hash);
1347 hlist_del_init_rcu(&p->mnt_hash);
1351 list_del_init(&p->mnt_expire); 1348 list_del_init(&p->mnt_expire);
1352 list_del_init(&p->mnt_list); 1349 list_del_init(&p->mnt_list);
1353 __touch_mnt_namespace(p->mnt_ns); 1350 __touch_mnt_namespace(p->mnt_ns);
1354 p->mnt_ns = NULL; 1351 p->mnt_ns = NULL;
1355 if (how < 2) 1352 if (how < 2)
1356 p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; 1353 p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
1354
1355 pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt, &unmounted);
1357 if (mnt_has_parent(p)) { 1356 if (mnt_has_parent(p)) {
1358 hlist_del_init(&p->mnt_mp_list); 1357 hlist_del_init(&p->mnt_mp_list);
1359 put_mountpoint(p->mnt_mp); 1358 put_mountpoint(p->mnt_mp);
1360 mnt_add_count(p->mnt_parent, -1); 1359 mnt_add_count(p->mnt_parent, -1);
1361 /* move the reference to mountpoint into ->mnt_ex_mountpoint */ 1360 /* old mountpoint will be dropped when we can do that */
1362 p->mnt_ex_mountpoint.dentry = p->mnt_mountpoint; 1361 p->mnt_ex_mountpoint = p->mnt_mountpoint;
1363 p->mnt_ex_mountpoint.mnt = &p->mnt_parent->mnt;
1364 p->mnt_mountpoint = p->mnt.mnt_root; 1362 p->mnt_mountpoint = p->mnt.mnt_root;
1365 p->mnt_parent = p; 1363 p->mnt_parent = p;
1366 p->mnt_mp = NULL; 1364 p->mnt_mp = NULL;
1367 } 1365 }
1368 change_mnt_propagation(p, MS_PRIVATE); 1366 change_mnt_propagation(p, MS_PRIVATE);
1369 last = p;
1370 }
1371 if (last) {
1372 last->mnt_hash.next = unmounted.first;
1373 if (unmounted.first)
1374 unmounted.first->pprev = &last->mnt_hash.next;
1375 unmounted.first = tmp_list.first;
1376 unmounted.first->pprev = &unmounted.first;
1377 } 1367 }
1378} 1368}
1379 1369
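
The namespace.c side replaces the hand-rolled splicing of unmounted mounts onto the global unmounted hlist with the generic fs_pin machinery: each mount carries an fs_pin initialized with a kill callback, umount_tree() parks it on the group, and namespace_unlock() kills the whole group after the RCU grace period:

        init_fs_pin(&mnt->mnt_umount, drop_mountpoint); /* at allocation */
        ...
        pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt, &unmounted);
        ...
        group_pin_kill(&head);  /* runs drop_mountpoint() for every pin */

drop_mountpoint() performs the dput()/mntput() the old loop did by hand, and mnt_ex_mountpoint shrinks from a struct path to a bare dentry pointer because the mount half of the old path is what the pin now holds.
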
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 008960101520..e7ca827d7694 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -77,6 +77,7 @@ static int ncp_hash_dentry(const struct dentry *, struct qstr *);
77static int ncp_compare_dentry(const struct dentry *, const struct dentry *, 77static int ncp_compare_dentry(const struct dentry *, const struct dentry *,
78 unsigned int, const char *, const struct qstr *); 78 unsigned int, const char *, const struct qstr *);
79static int ncp_delete_dentry(const struct dentry *); 79static int ncp_delete_dentry(const struct dentry *);
80static void ncp_d_prune(struct dentry *dentry);
80 81
81const struct dentry_operations ncp_dentry_operations = 82const struct dentry_operations ncp_dentry_operations =
82{ 83{
@@ -84,6 +85,7 @@ const struct dentry_operations ncp_dentry_operations =
84 .d_hash = ncp_hash_dentry, 85 .d_hash = ncp_hash_dentry,
85 .d_compare = ncp_compare_dentry, 86 .d_compare = ncp_compare_dentry,
86 .d_delete = ncp_delete_dentry, 87 .d_delete = ncp_delete_dentry,
88 .d_prune = ncp_d_prune,
87}; 89};
88 90
89#define ncp_namespace(i) (NCP_SERVER(i)->name_space[NCP_FINFO(i)->volNumber]) 91#define ncp_namespace(i) (NCP_SERVER(i)->name_space[NCP_FINFO(i)->volNumber])
@@ -384,42 +386,6 @@ finished:
384 return val; 386 return val;
385} 387}
386 388
387static struct dentry *
388ncp_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos)
389{
390 struct dentry *dent = dentry;
391
392 if (d_validate(dent, parent)) {
393 if (dent->d_name.len <= NCP_MAXPATHLEN &&
394 (unsigned long)dent->d_fsdata == fpos) {
395 if (!dent->d_inode) {
396 dput(dent);
397 dent = NULL;
398 }
399 return dent;
400 }
401 dput(dent);
402 }
403
404 /* If a pointer is invalid, we search the dentry. */
405 spin_lock(&parent->d_lock);
406 list_for_each_entry(dent, &parent->d_subdirs, d_child) {
407 if ((unsigned long)dent->d_fsdata == fpos) {
408 if (dent->d_inode)
409 dget(dent);
410 else
411 dent = NULL;
412 spin_unlock(&parent->d_lock);
413 goto out;
414 }
415 }
416 spin_unlock(&parent->d_lock);
417 return NULL;
418
419out:
420 return dent;
421}
422
423static time_t ncp_obtain_mtime(struct dentry *dentry) 389static time_t ncp_obtain_mtime(struct dentry *dentry)
424{ 390{
425 struct inode *inode = dentry->d_inode; 391 struct inode *inode = dentry->d_inode;
@@ -435,6 +401,20 @@ static time_t ncp_obtain_mtime(struct dentry *dentry)
435 return ncp_date_dos2unix(i.modifyTime, i.modifyDate); 401 return ncp_date_dos2unix(i.modifyTime, i.modifyDate);
436} 402}
437 403
404static inline void
405ncp_invalidate_dircache_entries(struct dentry *parent)
406{
407 struct ncp_server *server = NCP_SERVER(parent->d_inode);
408 struct dentry *dentry;
409
410 spin_lock(&parent->d_lock);
411 list_for_each_entry(dentry, &parent->d_subdirs, d_child) {
412 dentry->d_fsdata = NULL;
413 ncp_age_dentry(server, dentry);
414 }
415 spin_unlock(&parent->d_lock);
416}
417
438static int ncp_readdir(struct file *file, struct dir_context *ctx) 418static int ncp_readdir(struct file *file, struct dir_context *ctx)
439{ 419{
440 struct dentry *dentry = file->f_path.dentry; 420 struct dentry *dentry = file->f_path.dentry;
@@ -500,10 +480,21 @@ static int ncp_readdir(struct file *file, struct dir_context *ctx)
500 struct dentry *dent; 480 struct dentry *dent;
501 bool over; 481 bool over;
502 482
503 dent = ncp_dget_fpos(ctl.cache->dentry[ctl.idx], 483 spin_lock(&dentry->d_lock);
504 dentry, ctx->pos); 484 if (!(NCP_FINFO(inode)->flags & NCPI_DIR_CACHE)) {
505 if (!dent) 485 spin_unlock(&dentry->d_lock);
486 goto invalid_cache;
487 }
488 dent = ctl.cache->dentry[ctl.idx];
489 if (unlikely(!lockref_get_not_dead(&dent->d_lockref))) {
490 spin_unlock(&dentry->d_lock);
491 goto invalid_cache;
492 }
493 spin_unlock(&dentry->d_lock);
494 if (!dent->d_inode) {
495 dput(dent);
506 goto invalid_cache; 496 goto invalid_cache;
497 }
507 over = !dir_emit(ctx, dent->d_name.name, 498 over = !dir_emit(ctx, dent->d_name.name,
508 dent->d_name.len, 499 dent->d_name.len,
509 dent->d_inode->i_ino, DT_UNKNOWN); 500 dent->d_inode->i_ino, DT_UNKNOWN);
@@ -548,6 +539,9 @@ init_cache:
548 ctl.filled = 0; 539 ctl.filled = 0;
549 ctl.valid = 1; 540 ctl.valid = 1;
550read_really: 541read_really:
542 spin_lock(&dentry->d_lock);
543 NCP_FINFO(inode)->flags |= NCPI_DIR_CACHE;
544 spin_unlock(&dentry->d_lock);
551 if (ncp_is_server_root(inode)) { 545 if (ncp_is_server_root(inode)) {
552 ncp_read_volume_list(file, ctx, &ctl); 546 ncp_read_volume_list(file, ctx, &ctl);
553 } else { 547 } else {
@@ -573,6 +567,13 @@ out:
573 return result; 567 return result;
574} 568}
575 569
570static void ncp_d_prune(struct dentry *dentry)
571{
572 if (!dentry->d_fsdata) /* not referenced from page cache */
573 return;
574 NCP_FINFO(dentry->d_parent->d_inode)->flags &= ~NCPI_DIR_CACHE;
575}
576
576static int 577static int
577ncp_fill_cache(struct file *file, struct dir_context *ctx, 578ncp_fill_cache(struct file *file, struct dir_context *ctx,
578 struct ncp_cache_control *ctrl, struct ncp_entry_info *entry, 579 struct ncp_cache_control *ctrl, struct ncp_entry_info *entry,
@@ -630,6 +631,10 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx,
630 d_instantiate(newdent, inode); 631 d_instantiate(newdent, inode);
631 if (!hashed) 632 if (!hashed)
632 d_rehash(newdent); 633 d_rehash(newdent);
634 } else {
635 spin_lock(&dentry->d_lock);
636 NCP_FINFO(inode)->flags &= ~NCPI_DIR_CACHE;
637 spin_unlock(&dentry->d_lock);
633 } 638 }
634 } else { 639 } else {
635 struct inode *inode = newdent->d_inode; 640 struct inode *inode = newdent->d_inode;
@@ -639,12 +644,6 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx,
639 mutex_unlock(&inode->i_mutex); 644 mutex_unlock(&inode->i_mutex);
640 } 645 }
641 646
642 if (newdent->d_inode) {
643 ino = newdent->d_inode->i_ino;
644 newdent->d_fsdata = (void *) ctl.fpos;
645 ncp_new_dentry(newdent);
646 }
647
648 if (ctl.idx >= NCP_DIRCACHE_SIZE) { 647 if (ctl.idx >= NCP_DIRCACHE_SIZE) {
649 if (ctl.page) { 648 if (ctl.page) {
650 kunmap(ctl.page); 649 kunmap(ctl.page);
@@ -660,8 +659,13 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx,
660 ctl.cache = kmap(ctl.page); 659 ctl.cache = kmap(ctl.page);
661 } 660 }
662 if (ctl.cache) { 661 if (ctl.cache) {
663 ctl.cache->dentry[ctl.idx] = newdent; 662 if (newdent->d_inode) {
664 valid = 1; 663 newdent->d_fsdata = newdent;
664 ctl.cache->dentry[ctl.idx] = newdent;
665 ino = newdent->d_inode->i_ino;
666 ncp_new_dentry(newdent);
667 }
668 valid = 1;
665 } 669 }
666 dput(newdent); 670 dput(newdent);
667end_advance: 671end_advance:
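
The ncpfs readdir cache previously leaned on d_validate() to decide whether a possibly stale dentry pointer could still be trusted, an approach that was hard to make safe. The new scheme makes invalidation explicit: a per-directory NCPI_DIR_CACHE flag is set when the cache is refilled, pruning any cached child clears it through the new d_prune hook, and readers take their reference with lockref_get_not_dead() so a dying dentry is detected rather than resurrected:

        static void ncp_d_prune(struct dentry *dentry)
        {
                if (!dentry->d_fsdata)  /* not referenced from the cache */
                        return;
                NCP_FINFO(dentry->d_parent->d_inode)->flags &= ~NCPI_DIR_CACHE;
        }
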
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index e31e589369a4..01a9e16e9782 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -267,7 +267,6 @@ ncp_iget(struct super_block *sb, struct ncp_entry_info *info)
267 if (inode) { 267 if (inode) {
268 atomic_set(&NCP_FINFO(inode)->opened, info->opened); 268 atomic_set(&NCP_FINFO(inode)->opened, info->opened);
269 269
270 inode->i_mapping->backing_dev_info = sb->s_bdi;
271 inode->i_ino = info->ino; 270 inode->i_ino = info->ino;
272 ncp_set_attr(inode, info); 271 ncp_set_attr(inode, info);
273 if (S_ISREG(inode->i_mode)) { 272 if (S_ISREG(inode->i_mode)) {
@@ -560,7 +559,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
560 server = NCP_SBP(sb); 559 server = NCP_SBP(sb);
561 memset(server, 0, sizeof(*server)); 560 memset(server, 0, sizeof(*server));
562 561
563 error = bdi_setup_and_register(&server->bdi, "ncpfs", BDI_CAP_MAP_COPY); 562 error = bdi_setup_and_register(&server->bdi, "ncpfs");
564 if (error) 563 if (error)
565 goto out_fput; 564 goto out_fput;
566 565
diff --git a/fs/ncpfs/ncp_fs_i.h b/fs/ncpfs/ncp_fs_i.h
index 4b0bec477846..c4794504f843 100644
--- a/fs/ncpfs/ncp_fs_i.h
+++ b/fs/ncpfs/ncp_fs_i.h
@@ -22,6 +22,7 @@ struct ncp_inode_info {
22 int access; 22 int access;
23 int flags; 23 int flags;
24#define NCPI_KLUDGE_SYMLINK 0x0001 24#define NCPI_KLUDGE_SYMLINK 0x0001
25#define NCPI_DIR_CACHE 0x0002
25 __u8 file_handle[6]; 26 __u8 file_handle[6];
26 struct inode vfs_inode; 27 struct inode vfs_inode;
27}; 28};
diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h
index b785f74bfe3c..250e443a07f3 100644
--- a/fs/ncpfs/ncplib_kernel.h
+++ b/fs/ncpfs/ncplib_kernel.h
@@ -184,36 +184,6 @@ ncp_new_dentry(struct dentry* dentry)
184 dentry->d_time = jiffies; 184 dentry->d_time = jiffies;
185} 185}
186 186
187static inline void
188ncp_renew_dentries(struct dentry *parent)
189{
190 struct ncp_server *server = NCP_SERVER(parent->d_inode);
191 struct dentry *dentry;
192
193 spin_lock(&parent->d_lock);
194 list_for_each_entry(dentry, &parent->d_subdirs, d_child) {
195 if (dentry->d_fsdata == NULL)
196 ncp_age_dentry(server, dentry);
197 else
198 ncp_new_dentry(dentry);
199 }
200 spin_unlock(&parent->d_lock);
201}
202
203static inline void
204ncp_invalidate_dircache_entries(struct dentry *parent)
205{
206 struct ncp_server *server = NCP_SERVER(parent->d_inode);
207 struct dentry *dentry;
208
209 spin_lock(&parent->d_lock);
210 list_for_each_entry(dentry, &parent->d_subdirs, d_child) {
211 dentry->d_fsdata = NULL;
212 ncp_age_dentry(server, dentry);
213 }
214 spin_unlock(&parent->d_lock);
215}
216
217struct ncp_cache_head { 187struct ncp_cache_head {
218 time_t mtime; 188 time_t mtime;
219 unsigned long time; /* cache age */ 189 unsigned long time; /* cache age */
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 3dece03f2fc8..c7abc10279af 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -128,6 +128,11 @@ config PNFS_OBJLAYOUT
128 depends on NFS_V4_1 && SCSI_OSD_ULD 128 depends on NFS_V4_1 && SCSI_OSD_ULD
129 default NFS_V4 129 default NFS_V4
130 130
131config PNFS_FLEXFILE_LAYOUT
132 tristate
133 depends on NFS_V4_1 && NFS_V3
134 default m
135
131config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN 136config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN
132 string "NFSv4.1 Implementation ID Domain" 137 string "NFSv4.1 Implementation ID Domain"
133 depends on NFS_V4_1 138 depends on NFS_V4_1
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 04cb830fa09f..1e987acf20c9 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -27,9 +27,10 @@ nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o
27 dns_resolve.o nfs4trace.o 27 dns_resolve.o nfs4trace.o
28nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o 28nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o
29nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o 29nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o
30nfsv4-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o 30nfsv4-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o pnfs_nfs.o
31nfsv4-$(CONFIG_NFS_V4_2) += nfs42proc.o 31nfsv4-$(CONFIG_NFS_V4_2) += nfs42proc.o
32 32
33obj-$(CONFIG_PNFS_FILE_LAYOUT) += filelayout/ 33obj-$(CONFIG_PNFS_FILE_LAYOUT) += filelayout/
34obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/ 34obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/
35obj-$(CONFIG_PNFS_BLOCK) += blocklayout/ 35obj-$(CONFIG_PNFS_BLOCK) += blocklayout/
36obj-$(CONFIG_PNFS_FLEXFILE_LAYOUT) += flexfilelayout/
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 77fec6a55f57..1cac3c175d18 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -860,12 +860,14 @@ static const struct nfs_pageio_ops bl_pg_read_ops = {
860 .pg_init = bl_pg_init_read, 860 .pg_init = bl_pg_init_read,
861 .pg_test = bl_pg_test_read, 861 .pg_test = bl_pg_test_read,
862 .pg_doio = pnfs_generic_pg_readpages, 862 .pg_doio = pnfs_generic_pg_readpages,
863 .pg_cleanup = pnfs_generic_pg_cleanup,
863}; 864};
864 865
865static const struct nfs_pageio_ops bl_pg_write_ops = { 866static const struct nfs_pageio_ops bl_pg_write_ops = {
866 .pg_init = bl_pg_init_write, 867 .pg_init = bl_pg_init_write,
867 .pg_test = bl_pg_test_write, 868 .pg_test = bl_pg_test_write,
868 .pg_doio = pnfs_generic_pg_writepages, 869 .pg_doio = pnfs_generic_pg_writepages,
870 .pg_cleanup = pnfs_generic_pg_cleanup,
869}; 871};
870 872
871static struct pnfs_layoutdriver_type blocklayout_type = { 873static struct pnfs_layoutdriver_type blocklayout_type = {
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index b8fb3a4ef649..351be9205bf8 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -128,22 +128,24 @@ nfs41_callback_svc(void *vrqstp)
128 if (try_to_freeze()) 128 if (try_to_freeze())
129 continue; 129 continue;
130 130
131 prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE); 131 prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_UNINTERRUPTIBLE);
132 spin_lock_bh(&serv->sv_cb_lock); 132 spin_lock_bh(&serv->sv_cb_lock);
133 if (!list_empty(&serv->sv_cb_list)) { 133 if (!list_empty(&serv->sv_cb_list)) {
134 req = list_first_entry(&serv->sv_cb_list, 134 req = list_first_entry(&serv->sv_cb_list,
135 struct rpc_rqst, rq_bc_list); 135 struct rpc_rqst, rq_bc_list);
136 list_del(&req->rq_bc_list); 136 list_del(&req->rq_bc_list);
137 spin_unlock_bh(&serv->sv_cb_lock); 137 spin_unlock_bh(&serv->sv_cb_lock);
138 finish_wait(&serv->sv_cb_waitq, &wq);
138 dprintk("Invoking bc_svc_process()\n"); 139 dprintk("Invoking bc_svc_process()\n");
139 error = bc_svc_process(serv, req, rqstp); 140 error = bc_svc_process(serv, req, rqstp);
140 dprintk("bc_svc_process() returned w/ error code= %d\n", 141 dprintk("bc_svc_process() returned w/ error code= %d\n",
141 error); 142 error);
142 } else { 143 } else {
143 spin_unlock_bh(&serv->sv_cb_lock); 144 spin_unlock_bh(&serv->sv_cb_lock);
144 schedule(); 145 /* schedule_timeout to game the hung task watchdog */
146 schedule_timeout(60 * HZ);
147 finish_wait(&serv->sv_cb_waitq, &wq);
145 } 148 }
146 finish_wait(&serv->sv_cb_waitq, &wq);
147 } 149 }
148 return 0; 150 return 0;
149} 151}
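
The nfs41_callback_svc() change is a workaround for the hung-task watchdog: the thread now waits in TASK_UNINTERRUPTIBLE (it has no signals to service), but khungtaskd flags any uninterruptible task that has not scheduled for two minutes by default, so the open-ended schedule() becomes a bounded sleep that wakes at least once a minute:

        /* idle path: bounded sleep instead of schedule() */
        schedule_timeout(60 * HZ);
        finish_wait(&serv->sv_cb_waitq, &wq);

The in-tree comment ("game the hung task watchdog") says as much; the only requirement on the value is that it stay below the watchdog threshold.
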
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index e36a9d78ea49..197806fb87ff 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -427,6 +427,8 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
427 if (clp == NULL) 427 if (clp == NULL)
428 goto out; 428 goto out;
429 429
430 if (!(clp->cl_session->flags & SESSION4_BACK_CHAN))
431 goto out;
430 tbl = &clp->cl_session->bc_slot_table; 432 tbl = &clp->cl_session->bc_slot_table;
431 433
432 spin_lock(&tbl->slot_tbl_lock); 434 spin_lock(&tbl->slot_tbl_lock);
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index f4ccfe6521ec..19ca95cdfd9b 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -313,7 +313,7 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp,
313 goto out; 313 goto out;
314 } 314 }
315 315
316 args->devs = kmalloc(n * sizeof(*args->devs), GFP_KERNEL); 316 args->devs = kmalloc_array(n, sizeof(*args->devs), GFP_KERNEL);
317 if (!args->devs) { 317 if (!args->devs) {
318 status = htonl(NFS4ERR_DELAY); 318 status = htonl(NFS4ERR_DELAY);
319 goto out; 319 goto out;
@@ -415,7 +415,7 @@ static __be32 decode_rc_list(struct xdr_stream *xdr,
415 rc_list->rcl_nrefcalls * 2 * sizeof(uint32_t)); 415 rc_list->rcl_nrefcalls * 2 * sizeof(uint32_t));
416 if (unlikely(p == NULL)) 416 if (unlikely(p == NULL))
417 goto out; 417 goto out;
418 rc_list->rcl_refcalls = kmalloc(rc_list->rcl_nrefcalls * 418 rc_list->rcl_refcalls = kmalloc_array(rc_list->rcl_nrefcalls,
419 sizeof(*rc_list->rcl_refcalls), 419 sizeof(*rc_list->rcl_refcalls),
420 GFP_KERNEL); 420 GFP_KERNEL);
421 if (unlikely(rc_list->rcl_refcalls == NULL)) 421 if (unlikely(rc_list->rcl_refcalls == NULL))
@@ -464,8 +464,10 @@ static __be32 decode_cb_sequence_args(struct svc_rqst *rqstp,
464 464
465 for (i = 0; i < args->csa_nrclists; i++) { 465 for (i = 0; i < args->csa_nrclists; i++) {
466 status = decode_rc_list(xdr, &args->csa_rclists[i]); 466 status = decode_rc_list(xdr, &args->csa_rclists[i]);
467 if (status) 467 if (status) {
468 args->csa_nrclists = i;
468 goto out_free; 469 goto out_free;
470 }
469 } 471 }
470 } 472 }
471 status = 0; 473 status = 0;
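
Two small hardening fixes in the callback XDR code: the open-coded kmalloc(n * size) allocations become kmalloc_array(), which returns NULL instead of a short buffer when the multiplication would overflow, and decode_cb_sequence_args() now records how many rc_lists were actually decoded before bailing out, so the error path frees only initialized entries:

        args->devs = kmalloc_array(n, sizeof(*args->devs), GFP_KERNEL);
        ...
        if (status) {
                args->csa_nrclists = i; /* free only the i decoded lists */
                goto out_free;
        }
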
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 7f3f60641344..a1f0685b42ff 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -85,25 +85,30 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_
85{ 85{
86 struct inode *inode = state->inode; 86 struct inode *inode = state->inode;
87 struct file_lock *fl; 87 struct file_lock *fl;
88 struct file_lock_context *flctx = inode->i_flctx;
89 struct list_head *list;
88 int status = 0; 90 int status = 0;
89 91
90 if (inode->i_flock == NULL) 92 if (flctx == NULL)
91 goto out; 93 goto out;
92 94
93 /* Protect inode->i_flock using the i_lock */ 95 list = &flctx->flc_posix;
94 spin_lock(&inode->i_lock); 96 spin_lock(&flctx->flc_lock);
95 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { 97restart:
96 if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK))) 98 list_for_each_entry(fl, list, fl_list) {
97 continue;
98 if (nfs_file_open_context(fl->fl_file) != ctx) 99 if (nfs_file_open_context(fl->fl_file) != ctx)
99 continue; 100 continue;
100 spin_unlock(&inode->i_lock); 101 spin_unlock(&flctx->flc_lock);
101 status = nfs4_lock_delegation_recall(fl, state, stateid); 102 status = nfs4_lock_delegation_recall(fl, state, stateid);
102 if (status < 0) 103 if (status < 0)
103 goto out; 104 goto out;
104 spin_lock(&inode->i_lock); 105 spin_lock(&flctx->flc_lock);
105 } 106 }
106 spin_unlock(&inode->i_lock); 107 if (list == &flctx->flc_posix) {
108 list = &flctx->flc_flock;
109 goto restart;
110 }
111 spin_unlock(&flctx->flc_lock);
107out: 112out:
108 return status; 113 return status;
109} 114}
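
nfs_delegation_claim_locks() picks up the same file_lock_context conversion as fs/locks.c: instead of filtering FL_POSIX and FL_FLOCK entries out of one mixed list, it walks flc_posix and then restarts the same loop body over flc_flock. flc_lock is dropped around each nfs4_lock_delegation_recall() call, exactly as i_lock was before, so the walk tolerates concurrent list changes; the restart label keeps one copy of the loop for both passes:

        list = &flctx->flc_posix;
        spin_lock(&flctx->flc_lock);
restart:
        list_for_each_entry(fl, list, fl_list) {
                /* drop flc_lock, recall the delegation, retake flc_lock */
        }
        if (list == &flctx->flc_posix) {
                list = &flctx->flc_flock;
                goto restart;
        }
        spin_unlock(&flctx->flc_lock);
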
@@ -175,7 +180,6 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred,
175 delegation->cred = get_rpccred(cred); 180 delegation->cred = get_rpccred(cred);
176 clear_bit(NFS_DELEGATION_NEED_RECLAIM, 181 clear_bit(NFS_DELEGATION_NEED_RECLAIM,
177 &delegation->flags); 182 &delegation->flags);
178 NFS_I(inode)->delegation_state = delegation->type;
179 spin_unlock(&delegation->lock); 183 spin_unlock(&delegation->lock);
180 put_rpccred(oldcred); 184 put_rpccred(oldcred);
181 rcu_read_unlock(); 185 rcu_read_unlock();
@@ -270,7 +274,6 @@ nfs_detach_delegation_locked(struct nfs_inode *nfsi,
270 set_bit(NFS_DELEGATION_RETURNING, &delegation->flags); 274 set_bit(NFS_DELEGATION_RETURNING, &delegation->flags);
271 list_del_rcu(&delegation->super_list); 275 list_del_rcu(&delegation->super_list);
272 delegation->inode = NULL; 276 delegation->inode = NULL;
273 nfsi->delegation_state = 0;
274 rcu_assign_pointer(nfsi->delegation, NULL); 277 rcu_assign_pointer(nfsi->delegation, NULL);
275 spin_unlock(&delegation->lock); 278 spin_unlock(&delegation->lock);
276 return delegation; 279 return delegation;
@@ -301,6 +304,17 @@ nfs_inode_detach_delegation(struct inode *inode)
301 return nfs_detach_delegation(nfsi, delegation, server); 304 return nfs_detach_delegation(nfsi, delegation, server);
302} 305}
303 306
307static void
308nfs_update_inplace_delegation(struct nfs_delegation *delegation,
309 const struct nfs_delegation *update)
310{
311 if (nfs4_stateid_is_newer(&update->stateid, &delegation->stateid)) {
312 delegation->stateid.seqid = update->stateid.seqid;
313 smp_wmb();
314 delegation->type = update->type;
315 }
316}
317
304/** 318/**
305 * nfs_inode_set_delegation - set up a delegation on an inode 319 * nfs_inode_set_delegation - set up a delegation on an inode
306 * @inode: inode to which delegation applies 320 * @inode: inode to which delegation applies
@@ -334,9 +348,11 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
334 old_delegation = rcu_dereference_protected(nfsi->delegation, 348 old_delegation = rcu_dereference_protected(nfsi->delegation,
335 lockdep_is_held(&clp->cl_lock)); 349 lockdep_is_held(&clp->cl_lock));
336 if (old_delegation != NULL) { 350 if (old_delegation != NULL) {
337 if (nfs4_stateid_match(&delegation->stateid, 351 /* Is this an update of the existing delegation? */
338 &old_delegation->stateid) && 352 if (nfs4_stateid_match_other(&old_delegation->stateid,
339 delegation->type == old_delegation->type) { 353 &delegation->stateid)) {
354 nfs_update_inplace_delegation(old_delegation,
355 delegation);
340 goto out; 356 goto out;
341 } 357 }
342 /* 358 /*
@@ -360,7 +376,6 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
360 goto out; 376 goto out;
361 } 377 }
362 list_add_rcu(&delegation->super_list, &server->delegations); 378 list_add_rcu(&delegation->super_list, &server->delegations);
363 nfsi->delegation_state = delegation->type;
364 rcu_assign_pointer(nfsi->delegation, delegation); 379 rcu_assign_pointer(nfsi->delegation, delegation);
365 delegation = NULL; 380 delegation = NULL;
366 381
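
nfs_inode_set_delegation() now recognizes a re-issued delegation for the same stateid (matching on the stateid's 'other' field rather than the whole thing) and updates it in place. The smp_wmb() orders the seqid store before the type store, presumably so that a lockless reader observing the new type is guaranteed to see at least that seqid as well:

        if (nfs4_stateid_is_newer(&update->stateid, &delegation->stateid)) {
                delegation->stateid.seqid = update->stateid.seqid;
                smp_wmb();      /* seqid visible before type */
                delegation->type = update->type;
        }

The delegation_state mirror field in nfs_inode is dropped at the same time, which removes one more datum the update path would otherwise have to keep coherent.
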
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 10bf07280f4a..e907c8cf732e 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -66,6 +66,10 @@ static struct kmem_cache *nfs_direct_cachep;
66/* 66/*
67 * This represents a set of asynchronous requests that we're waiting on 67 * This represents a set of asynchronous requests that we're waiting on
68 */ 68 */
69struct nfs_direct_mirror {
70 ssize_t count;
71};
72
69struct nfs_direct_req { 73struct nfs_direct_req {
70 struct kref kref; /* release manager */ 74 struct kref kref; /* release manager */
71 75
@@ -78,8 +82,13 @@ struct nfs_direct_req {
78 /* completion state */ 82 /* completion state */
79 atomic_t io_count; /* i/os we're waiting for */ 83 atomic_t io_count; /* i/os we're waiting for */
80 spinlock_t lock; /* protect completion state */ 84 spinlock_t lock; /* protect completion state */
85
86 struct nfs_direct_mirror mirrors[NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX];
87 int mirror_count;
88
81 ssize_t count, /* bytes actually processed */ 89 ssize_t count, /* bytes actually processed */
82 bytes_left, /* bytes left to be sent */ 90 bytes_left, /* bytes left to be sent */
91 io_start, /* start of IO */
83 error; /* any reported error */ 92 error; /* any reported error */
84 struct completion completion; /* wait for i/o completion */ 93 struct completion completion; /* wait for i/o completion */
85 94
@@ -108,26 +117,56 @@ static inline int put_dreq(struct nfs_direct_req *dreq)
108 return atomic_dec_and_test(&dreq->io_count); 117 return atomic_dec_and_test(&dreq->io_count);
109} 118}
110 119
120void nfs_direct_set_resched_writes(struct nfs_direct_req *dreq)
121{
122 dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
123}
124EXPORT_SYMBOL_GPL(nfs_direct_set_resched_writes);
125
126static void
127nfs_direct_good_bytes(struct nfs_direct_req *dreq, struct nfs_pgio_header *hdr)
128{
129 int i;
130 ssize_t count;
131
132 WARN_ON_ONCE(hdr->pgio_mirror_idx >= dreq->mirror_count);
133
134 count = dreq->mirrors[hdr->pgio_mirror_idx].count;
135 if (count + dreq->io_start < hdr->io_start + hdr->good_bytes) {
136 count = hdr->io_start + hdr->good_bytes - dreq->io_start;
137 dreq->mirrors[hdr->pgio_mirror_idx].count = count;
138 }
139
140 /* update the dreq->count by finding the minimum agreed count from all
141 * mirrors */
142 count = dreq->mirrors[0].count;
143
144 for (i = 1; i < dreq->mirror_count; i++)
145 count = min(count, dreq->mirrors[i].count);
146
147 dreq->count = count;
148}
149
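
nfs_direct_good_bytes() above first extends the completing mirror's
byte count, then folds dreq->count down to the minimum agreed count, as
the comment says: a mirrored direct write is only complete up to the
offset that every mirror has reached. A small standalone illustration
of that reduction (plain C, not the kernel code):

	#include <stdio.h>

	/* The agreed count is the minimum across all mirrors. */
	static long min_agreed_count(const long *mirror_count, int nmirrors)
	{
		long count = mirror_count[0];
		int i;

		for (i = 1; i < nmirrors; i++)
			if (mirror_count[i] < count)
				count = mirror_count[i];
		return count;
	}

	int main(void)
	{
		long mirrors[] = { 8192, 4096, 8192 };	/* one mirror lags */

		/* Prints 4096: data is only stable up to the slowest mirror. */
		printf("%ld\n", min_agreed_count(mirrors, 3));
		return 0;
	}
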
111/* 150/*
112 * nfs_direct_select_verf - select the right verifier 151 * nfs_direct_select_verf - select the right verifier
113 * @dreq - direct request possibly spanning multiple servers 152 * @dreq - direct request possibly spanning multiple servers
114 * @ds_clp - nfs_client of data server or NULL if MDS / non-pnfs 153 * @ds_clp - nfs_client of data server or NULL if MDS / non-pnfs
115 * @ds_idx - index of data server in data server list, only valid if ds_clp set 154 * @commit_idx - commit bucket index for the DS
116 * 155 *
117 * returns the correct verifier to use given the role of the server 156 * returns the correct verifier to use given the role of the server
118 */ 157 */
119static struct nfs_writeverf * 158static struct nfs_writeverf *
120nfs_direct_select_verf(struct nfs_direct_req *dreq, 159nfs_direct_select_verf(struct nfs_direct_req *dreq,
121 struct nfs_client *ds_clp, 160 struct nfs_client *ds_clp,
122 int ds_idx) 161 int commit_idx)
123{ 162{
124 struct nfs_writeverf *verfp = &dreq->verf; 163 struct nfs_writeverf *verfp = &dreq->verf;
125 164
126#ifdef CONFIG_NFS_V4_1 165#ifdef CONFIG_NFS_V4_1
127 if (ds_clp) { 166 if (ds_clp) {
128 /* pNFS is in use, use the DS verf */ 167 /* pNFS is in use, use the DS verf */
129 if (ds_idx >= 0 && ds_idx < dreq->ds_cinfo.nbuckets) 168 if (commit_idx >= 0 && commit_idx < dreq->ds_cinfo.nbuckets)
130 verfp = &dreq->ds_cinfo.buckets[ds_idx].direct_verf; 169 verfp = &dreq->ds_cinfo.buckets[commit_idx].direct_verf;
131 else 170 else
132 WARN_ON_ONCE(1); 171 WARN_ON_ONCE(1);
133 } 172 }
@@ -148,8 +187,7 @@ static void nfs_direct_set_hdr_verf(struct nfs_direct_req *dreq,
148{ 187{
149 struct nfs_writeverf *verfp; 188 struct nfs_writeverf *verfp;
150 189
151 verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, 190 verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, hdr->ds_commit_idx);
152 hdr->ds_idx);
153 WARN_ON_ONCE(verfp->committed >= 0); 191 WARN_ON_ONCE(verfp->committed >= 0);
154 memcpy(verfp, &hdr->verf, sizeof(struct nfs_writeverf)); 192 memcpy(verfp, &hdr->verf, sizeof(struct nfs_writeverf));
155 WARN_ON_ONCE(verfp->committed < 0); 193 WARN_ON_ONCE(verfp->committed < 0);
@@ -169,8 +207,7 @@ static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq,
169{ 207{
170 struct nfs_writeverf *verfp; 208 struct nfs_writeverf *verfp;
171 209
172 verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, 210 verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, hdr->ds_commit_idx);
173 hdr->ds_idx);
174 if (verfp->committed < 0) { 211 if (verfp->committed < 0) {
175 nfs_direct_set_hdr_verf(dreq, hdr); 212 nfs_direct_set_hdr_verf(dreq, hdr);
176 return 0; 213 return 0;
@@ -193,7 +230,11 @@ static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq,
193 230
194 verfp = nfs_direct_select_verf(dreq, data->ds_clp, 231 verfp = nfs_direct_select_verf(dreq, data->ds_clp,
195 data->ds_commit_index); 232 data->ds_commit_index);
196 WARN_ON_ONCE(verfp->committed < 0); 233
234 /* verifier not set so always fail */
235 if (verfp->committed < 0)
236 return 1;
237
197 return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf)); 238 return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf));
198} 239}
199 240
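
The hunk above replaces the WARN_ON_ONCE() with an explicit check: if
no verifier was ever recorded for this commit bucket, the comparison
now fails closed (returns 1). A nonzero result from
nfs_direct_cmp_commit_data_verf() is treated as a verifier mismatch, so
the affected writes get resent rather than being falsely reported
stable. A sketch of the comparison semantics, with simplified stand-ins
for the kernel types:

	#include <string.h>

	struct verf_sketch {
		int		committed;	/* < 0 means "never set" */
		unsigned char	data[8];	/* opaque server verifier */
	};

	/* Nonzero means "mismatch: resend the writes". */
	static int cmp_commit_verf(const struct verf_sketch *stored,
				   const struct verf_sketch *from_commit)
	{
		if (stored->committed < 0)
			return 1;	/* never set: fail closed */
		return memcmp(stored->data, from_commit->data,
			      sizeof(stored->data));
	}
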
@@ -212,6 +253,12 @@ static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq,
212 */ 253 */
213ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t pos) 254ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
214{ 255{
256 struct inode *inode = iocb->ki_filp->f_mapping->host;
257
 258 /* we only support swap files calling nfs_direct_IO */
259 if (!IS_SWAPFILE(inode))
260 return 0;
261
215#ifndef CONFIG_NFS_SWAP 262#ifndef CONFIG_NFS_SWAP
216 dprintk("NFS: nfs_direct_IO (%pD) off/no(%Ld/%lu) EINVAL\n", 263 dprintk("NFS: nfs_direct_IO (%pD) off/no(%Ld/%lu) EINVAL\n",
217 iocb->ki_filp, (long long) pos, iter->nr_segs); 264 iocb->ki_filp, (long long) pos, iter->nr_segs);
@@ -236,13 +283,25 @@ static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
236void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo, 283void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
237 struct nfs_direct_req *dreq) 284 struct nfs_direct_req *dreq)
238{ 285{
239 cinfo->lock = &dreq->lock; 286 cinfo->lock = &dreq->inode->i_lock;
240 cinfo->mds = &dreq->mds_cinfo; 287 cinfo->mds = &dreq->mds_cinfo;
241 cinfo->ds = &dreq->ds_cinfo; 288 cinfo->ds = &dreq->ds_cinfo;
242 cinfo->dreq = dreq; 289 cinfo->dreq = dreq;
243 cinfo->completion_ops = &nfs_direct_commit_completion_ops; 290 cinfo->completion_ops = &nfs_direct_commit_completion_ops;
244} 291}
245 292
293static inline void nfs_direct_setup_mirroring(struct nfs_direct_req *dreq,
294 struct nfs_pageio_descriptor *pgio,
295 struct nfs_page *req)
296{
297 int mirror_count = 1;
298
299 if (pgio->pg_ops->pg_get_mirror_count)
300 mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req);
301
302 dreq->mirror_count = mirror_count;
303}
304
246static inline struct nfs_direct_req *nfs_direct_req_alloc(void) 305static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
247{ 306{
248 struct nfs_direct_req *dreq; 307 struct nfs_direct_req *dreq;
@@ -257,6 +316,7 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
257 INIT_LIST_HEAD(&dreq->mds_cinfo.list); 316 INIT_LIST_HEAD(&dreq->mds_cinfo.list);
258 dreq->verf.committed = NFS_INVALID_STABLE_HOW; /* not set yet */ 317 dreq->verf.committed = NFS_INVALID_STABLE_HOW; /* not set yet */
259 INIT_WORK(&dreq->work, nfs_direct_write_schedule_work); 318 INIT_WORK(&dreq->work, nfs_direct_write_schedule_work);
319 dreq->mirror_count = 1;
260 spin_lock_init(&dreq->lock); 320 spin_lock_init(&dreq->lock);
261 321
262 return dreq; 322 return dreq;
@@ -363,7 +423,8 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
363 if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && (hdr->good_bytes == 0)) 423 if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && (hdr->good_bytes == 0))
364 dreq->error = hdr->error; 424 dreq->error = hdr->error;
365 else 425 else
366 dreq->count += hdr->good_bytes; 426 nfs_direct_good_bytes(dreq, hdr);
427
367 spin_unlock(&dreq->lock); 428 spin_unlock(&dreq->lock);
368 429
369 while (!list_empty(&hdr->pages)) { 430 while (!list_empty(&hdr->pages)) {
@@ -541,6 +602,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
541 602
542 dreq->inode = inode; 603 dreq->inode = inode;
543 dreq->bytes_left = count; 604 dreq->bytes_left = count;
605 dreq->io_start = pos;
544 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); 606 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
545 l_ctx = nfs_get_lock_context(dreq->ctx); 607 l_ctx = nfs_get_lock_context(dreq->ctx);
546 if (IS_ERR(l_ctx)) { 608 if (IS_ERR(l_ctx)) {
@@ -573,6 +635,20 @@ out:
573 return result; 635 return result;
574} 636}
575 637
638static void
639nfs_direct_write_scan_commit_list(struct inode *inode,
640 struct list_head *list,
641 struct nfs_commit_info *cinfo)
642{
643 spin_lock(cinfo->lock);
644#ifdef CONFIG_NFS_V4_1
645 if (cinfo->ds != NULL && cinfo->ds->nwritten != 0)
646 NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo);
647#endif
648 nfs_scan_commit_list(&cinfo->mds->list, list, cinfo, 0);
649 spin_unlock(cinfo->lock);
650}
651
576static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) 652static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
577{ 653{
578 struct nfs_pageio_descriptor desc; 654 struct nfs_pageio_descriptor desc;
@@ -580,20 +656,23 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
580 LIST_HEAD(reqs); 656 LIST_HEAD(reqs);
581 struct nfs_commit_info cinfo; 657 struct nfs_commit_info cinfo;
582 LIST_HEAD(failed); 658 LIST_HEAD(failed);
659 int i;
583 660
584 nfs_init_cinfo_from_dreq(&cinfo, dreq); 661 nfs_init_cinfo_from_dreq(&cinfo, dreq);
585 pnfs_recover_commit_reqs(dreq->inode, &reqs, &cinfo); 662 nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);
586 spin_lock(cinfo.lock);
587 nfs_scan_commit_list(&cinfo.mds->list, &reqs, &cinfo, 0);
588 spin_unlock(cinfo.lock);
589 663
590 dreq->count = 0; 664 dreq->count = 0;
665 for (i = 0; i < dreq->mirror_count; i++)
666 dreq->mirrors[i].count = 0;
591 get_dreq(dreq); 667 get_dreq(dreq);
592 668
593 nfs_pageio_init_write(&desc, dreq->inode, FLUSH_STABLE, false, 669 nfs_pageio_init_write(&desc, dreq->inode, FLUSH_STABLE, false,
594 &nfs_direct_write_completion_ops); 670 &nfs_direct_write_completion_ops);
595 desc.pg_dreq = dreq; 671 desc.pg_dreq = dreq;
596 672
673 req = nfs_list_entry(reqs.next);
674 nfs_direct_setup_mirroring(dreq, &desc, req);
675
597 list_for_each_entry_safe(req, tmp, &reqs, wb_list) { 676 list_for_each_entry_safe(req, tmp, &reqs, wb_list) {
598 if (!nfs_pageio_add_request(&desc, req)) { 677 if (!nfs_pageio_add_request(&desc, req)) {
599 nfs_list_remove_request(req); 678 nfs_list_remove_request(req);
@@ -640,7 +719,7 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data)
640 nfs_list_remove_request(req); 719 nfs_list_remove_request(req);
641 if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) { 720 if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) {
642 /* Note the rewrite will go through mds */ 721 /* Note the rewrite will go through mds */
643 nfs_mark_request_commit(req, NULL, &cinfo); 722 nfs_mark_request_commit(req, NULL, &cinfo, 0);
644 } else 723 } else
645 nfs_release_request(req); 724 nfs_release_request(req);
646 nfs_unlock_and_release_request(req); 725 nfs_unlock_and_release_request(req);
@@ -715,7 +794,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
715 dreq->error = hdr->error; 794 dreq->error = hdr->error;
716 } 795 }
717 if (dreq->error == 0) { 796 if (dreq->error == 0) {
718 dreq->count += hdr->good_bytes; 797 nfs_direct_good_bytes(dreq, hdr);
719 if (nfs_write_need_commit(hdr)) { 798 if (nfs_write_need_commit(hdr)) {
720 if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) 799 if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES)
721 request_commit = true; 800 request_commit = true;
@@ -739,7 +818,8 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
739 nfs_list_remove_request(req); 818 nfs_list_remove_request(req);
740 if (request_commit) { 819 if (request_commit) {
741 kref_get(&req->wb_kref); 820 kref_get(&req->wb_kref);
742 nfs_mark_request_commit(req, hdr->lseg, &cinfo); 821 nfs_mark_request_commit(req, hdr->lseg, &cinfo,
822 hdr->ds_commit_idx);
743 } 823 }
744 nfs_unlock_and_release_request(req); 824 nfs_unlock_and_release_request(req);
745 } 825 }
@@ -820,6 +900,9 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
820 result = PTR_ERR(req); 900 result = PTR_ERR(req);
821 break; 901 break;
822 } 902 }
903
904 nfs_direct_setup_mirroring(dreq, &desc, req);
905
823 nfs_lock_request(req); 906 nfs_lock_request(req);
824 req->wb_index = pos >> PAGE_SHIFT; 907 req->wb_index = pos >> PAGE_SHIFT;
825 req->wb_offset = pos & ~PAGE_MASK; 908 req->wb_offset = pos & ~PAGE_MASK;
@@ -928,6 +1011,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter,
928 1011
929 dreq->inode = inode; 1012 dreq->inode = inode;
930 dreq->bytes_left = count; 1013 dreq->bytes_left = count;
1014 dreq->io_start = pos;
931 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); 1015 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
932 l_ctx = nfs_get_lock_context(dreq->ctx); 1016 l_ctx = nfs_get_lock_context(dreq->ctx);
933 if (IS_ERR(l_ctx)) { 1017 if (IS_ERR(l_ctx)) {
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 2ab6f00dba5b..94712fc781fa 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -646,7 +646,6 @@ static const struct vm_operations_struct nfs_file_vm_ops = {
646 .fault = filemap_fault, 646 .fault = filemap_fault,
647 .map_pages = filemap_map_pages, 647 .map_pages = filemap_map_pages,
648 .page_mkwrite = nfs_vm_page_mkwrite, 648 .page_mkwrite = nfs_vm_page_mkwrite,
649 .remap_pages = generic_file_remap_pages,
650}; 649};
651 650
652static int nfs_need_sync_write(struct file *filp, struct inode *inode) 651static int nfs_need_sync_write(struct file *filp, struct inode *inode)
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 7afb52f6a25a..91e88a7ecef0 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -118,13 +118,6 @@ static void filelayout_reset_read(struct nfs_pgio_header *hdr)
118 } 118 }
119} 119}
120 120
121static void filelayout_fenceme(struct inode *inode, struct pnfs_layout_hdr *lo)
122{
123 if (!test_and_clear_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
124 return;
125 pnfs_return_layout(inode);
126}
127
128static int filelayout_async_handle_error(struct rpc_task *task, 121static int filelayout_async_handle_error(struct rpc_task *task,
129 struct nfs4_state *state, 122 struct nfs4_state *state,
130 struct nfs_client *clp, 123 struct nfs_client *clp,
@@ -207,7 +200,7 @@ static int filelayout_async_handle_error(struct rpc_task *task,
207 dprintk("%s DS connection error %d\n", __func__, 200 dprintk("%s DS connection error %d\n", __func__,
208 task->tk_status); 201 task->tk_status);
209 nfs4_mark_deviceid_unavailable(devid); 202 nfs4_mark_deviceid_unavailable(devid);
210 set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags); 203 pnfs_error_mark_layout_for_return(inode, lseg);
211 rpc_wake_up(&tbl->slot_tbl_waitq); 204 rpc_wake_up(&tbl->slot_tbl_waitq);
212 /* fall through */ 205 /* fall through */
213 default: 206 default:
@@ -339,16 +332,6 @@ static void filelayout_read_count_stats(struct rpc_task *task, void *data)
339 rpc_count_iostats(task, NFS_SERVER(hdr->inode)->client->cl_metrics); 332 rpc_count_iostats(task, NFS_SERVER(hdr->inode)->client->cl_metrics);
340} 333}
341 334
342static void filelayout_read_release(void *data)
343{
344 struct nfs_pgio_header *hdr = data;
345 struct pnfs_layout_hdr *lo = hdr->lseg->pls_layout;
346
347 filelayout_fenceme(lo->plh_inode, lo);
348 nfs_put_client(hdr->ds_clp);
349 hdr->mds_ops->rpc_release(data);
350}
351
352static int filelayout_write_done_cb(struct rpc_task *task, 335static int filelayout_write_done_cb(struct rpc_task *task,
353 struct nfs_pgio_header *hdr) 336 struct nfs_pgio_header *hdr)
354{ 337{
@@ -371,17 +354,6 @@ static int filelayout_write_done_cb(struct rpc_task *task,
371 return 0; 354 return 0;
372} 355}
373 356
374/* Fake up some data that will cause nfs_commit_release to retry the writes. */
375static void prepare_to_resend_writes(struct nfs_commit_data *data)
376{
377 struct nfs_page *first = nfs_list_entry(data->pages.next);
378
379 data->task.tk_status = 0;
380 memcpy(&data->verf.verifier, &first->wb_verf,
381 sizeof(data->verf.verifier));
382 data->verf.verifier.data[0]++; /* ensure verifier mismatch */
383}
384
385static int filelayout_commit_done_cb(struct rpc_task *task, 357static int filelayout_commit_done_cb(struct rpc_task *task,
386 struct nfs_commit_data *data) 358 struct nfs_commit_data *data)
387{ 359{
@@ -393,7 +365,7 @@ static int filelayout_commit_done_cb(struct rpc_task *task,
393 365
394 switch (err) { 366 switch (err) {
395 case -NFS4ERR_RESET_TO_MDS: 367 case -NFS4ERR_RESET_TO_MDS:
396 prepare_to_resend_writes(data); 368 pnfs_generic_prepare_to_resend_writes(data);
397 return -EAGAIN; 369 return -EAGAIN;
398 case -EAGAIN: 370 case -EAGAIN:
399 rpc_restart_call_prepare(task); 371 rpc_restart_call_prepare(task);
@@ -451,16 +423,6 @@ static void filelayout_write_count_stats(struct rpc_task *task, void *data)
451 rpc_count_iostats(task, NFS_SERVER(hdr->inode)->client->cl_metrics); 423 rpc_count_iostats(task, NFS_SERVER(hdr->inode)->client->cl_metrics);
452} 424}
453 425
454static void filelayout_write_release(void *data)
455{
456 struct nfs_pgio_header *hdr = data;
457 struct pnfs_layout_hdr *lo = hdr->lseg->pls_layout;
458
459 filelayout_fenceme(lo->plh_inode, lo);
460 nfs_put_client(hdr->ds_clp);
461 hdr->mds_ops->rpc_release(data);
462}
463
464static void filelayout_commit_prepare(struct rpc_task *task, void *data) 426static void filelayout_commit_prepare(struct rpc_task *task, void *data)
465{ 427{
466 struct nfs_commit_data *wdata = data; 428 struct nfs_commit_data *wdata = data;
@@ -471,14 +433,6 @@ static void filelayout_commit_prepare(struct rpc_task *task, void *data)
471 task); 433 task);
472} 434}
473 435
474static void filelayout_write_commit_done(struct rpc_task *task, void *data)
475{
476 struct nfs_commit_data *wdata = data;
477
478 /* Note this may cause RPC to be resent */
479 wdata->mds_ops->rpc_call_done(task, data);
480}
481
482static void filelayout_commit_count_stats(struct rpc_task *task, void *data) 436static void filelayout_commit_count_stats(struct rpc_task *task, void *data)
483{ 437{
484 struct nfs_commit_data *cdata = data; 438 struct nfs_commit_data *cdata = data;
@@ -486,35 +440,25 @@ static void filelayout_commit_count_stats(struct rpc_task *task, void *data)
486 rpc_count_iostats(task, NFS_SERVER(cdata->inode)->client->cl_metrics); 440 rpc_count_iostats(task, NFS_SERVER(cdata->inode)->client->cl_metrics);
487} 441}
488 442
489static void filelayout_commit_release(void *calldata)
490{
491 struct nfs_commit_data *data = calldata;
492
493 data->completion_ops->completion(data);
494 pnfs_put_lseg(data->lseg);
495 nfs_put_client(data->ds_clp);
496 nfs_commitdata_release(data);
497}
498
499static const struct rpc_call_ops filelayout_read_call_ops = { 443static const struct rpc_call_ops filelayout_read_call_ops = {
500 .rpc_call_prepare = filelayout_read_prepare, 444 .rpc_call_prepare = filelayout_read_prepare,
501 .rpc_call_done = filelayout_read_call_done, 445 .rpc_call_done = filelayout_read_call_done,
502 .rpc_count_stats = filelayout_read_count_stats, 446 .rpc_count_stats = filelayout_read_count_stats,
503 .rpc_release = filelayout_read_release, 447 .rpc_release = pnfs_generic_rw_release,
504}; 448};
505 449
506static const struct rpc_call_ops filelayout_write_call_ops = { 450static const struct rpc_call_ops filelayout_write_call_ops = {
507 .rpc_call_prepare = filelayout_write_prepare, 451 .rpc_call_prepare = filelayout_write_prepare,
508 .rpc_call_done = filelayout_write_call_done, 452 .rpc_call_done = filelayout_write_call_done,
509 .rpc_count_stats = filelayout_write_count_stats, 453 .rpc_count_stats = filelayout_write_count_stats,
510 .rpc_release = filelayout_write_release, 454 .rpc_release = pnfs_generic_rw_release,
511}; 455};
512 456
513static const struct rpc_call_ops filelayout_commit_call_ops = { 457static const struct rpc_call_ops filelayout_commit_call_ops = {
514 .rpc_call_prepare = filelayout_commit_prepare, 458 .rpc_call_prepare = filelayout_commit_prepare,
515 .rpc_call_done = filelayout_write_commit_done, 459 .rpc_call_done = pnfs_generic_write_commit_done,
516 .rpc_count_stats = filelayout_commit_count_stats, 460 .rpc_count_stats = filelayout_commit_count_stats,
517 .rpc_release = filelayout_commit_release, 461 .rpc_release = pnfs_generic_commit_release,
518}; 462};
519 463
520static enum pnfs_try_status 464static enum pnfs_try_status
@@ -548,7 +492,7 @@ filelayout_read_pagelist(struct nfs_pgio_header *hdr)
548 /* No multipath support. Use first DS */ 492 /* No multipath support. Use first DS */
549 atomic_inc(&ds->ds_clp->cl_count); 493 atomic_inc(&ds->ds_clp->cl_count);
550 hdr->ds_clp = ds->ds_clp; 494 hdr->ds_clp = ds->ds_clp;
551 hdr->ds_idx = idx; 495 hdr->ds_commit_idx = idx;
552 fh = nfs4_fl_select_ds_fh(lseg, j); 496 fh = nfs4_fl_select_ds_fh(lseg, j);
553 if (fh) 497 if (fh)
554 hdr->args.fh = fh; 498 hdr->args.fh = fh;
@@ -557,8 +501,9 @@ filelayout_read_pagelist(struct nfs_pgio_header *hdr)
557 hdr->mds_offset = offset; 501 hdr->mds_offset = offset;
558 502
559 /* Perform an asynchronous read to ds */ 503 /* Perform an asynchronous read to ds */
560 nfs_initiate_pgio(ds_clnt, hdr, 504 nfs_initiate_pgio(ds_clnt, hdr, hdr->cred,
561 &filelayout_read_call_ops, 0, RPC_TASK_SOFTCONN); 505 NFS_PROTO(hdr->inode), &filelayout_read_call_ops,
506 0, RPC_TASK_SOFTCONN);
562 return PNFS_ATTEMPTED; 507 return PNFS_ATTEMPTED;
563} 508}
564 509
@@ -591,16 +536,16 @@ filelayout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
591 hdr->pgio_done_cb = filelayout_write_done_cb; 536 hdr->pgio_done_cb = filelayout_write_done_cb;
592 atomic_inc(&ds->ds_clp->cl_count); 537 atomic_inc(&ds->ds_clp->cl_count);
593 hdr->ds_clp = ds->ds_clp; 538 hdr->ds_clp = ds->ds_clp;
594 hdr->ds_idx = idx; 539 hdr->ds_commit_idx = idx;
595 fh = nfs4_fl_select_ds_fh(lseg, j); 540 fh = nfs4_fl_select_ds_fh(lseg, j);
596 if (fh) 541 if (fh)
597 hdr->args.fh = fh; 542 hdr->args.fh = fh;
598 hdr->args.offset = filelayout_get_dserver_offset(lseg, offset); 543 hdr->args.offset = filelayout_get_dserver_offset(lseg, offset);
599 544
600 /* Perform an asynchronous write */ 545 /* Perform an asynchronous write */
601 nfs_initiate_pgio(ds_clnt, hdr, 546 nfs_initiate_pgio(ds_clnt, hdr, hdr->cred,
602 &filelayout_write_call_ops, sync, 547 NFS_PROTO(hdr->inode), &filelayout_write_call_ops,
603 RPC_TASK_SOFTCONN); 548 sync, RPC_TASK_SOFTCONN);
604 return PNFS_ATTEMPTED; 549 return PNFS_ATTEMPTED;
605} 550}
606 551
@@ -988,12 +933,14 @@ static const struct nfs_pageio_ops filelayout_pg_read_ops = {
988 .pg_init = filelayout_pg_init_read, 933 .pg_init = filelayout_pg_init_read,
989 .pg_test = filelayout_pg_test, 934 .pg_test = filelayout_pg_test,
990 .pg_doio = pnfs_generic_pg_readpages, 935 .pg_doio = pnfs_generic_pg_readpages,
936 .pg_cleanup = pnfs_generic_pg_cleanup,
991}; 937};
992 938
993static const struct nfs_pageio_ops filelayout_pg_write_ops = { 939static const struct nfs_pageio_ops filelayout_pg_write_ops = {
994 .pg_init = filelayout_pg_init_write, 940 .pg_init = filelayout_pg_init_write,
995 .pg_test = filelayout_pg_test, 941 .pg_test = filelayout_pg_test,
996 .pg_doio = pnfs_generic_pg_writepages, 942 .pg_doio = pnfs_generic_pg_writepages,
943 .pg_cleanup = pnfs_generic_pg_cleanup,
997}; 944};
998 945
999static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j) 946static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j)
@@ -1004,87 +951,28 @@ static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j)
1004 return j; 951 return j;
1005} 952}
1006 953
1007/* The generic layer is about to remove the req from the commit list.
1008 * If this will make the bucket empty, it will need to put the lseg reference.
1009 * Note this must be called holding the inode (/cinfo) lock
1010 */
1011static void
1012filelayout_clear_request_commit(struct nfs_page *req,
1013 struct nfs_commit_info *cinfo)
1014{
1015 struct pnfs_layout_segment *freeme = NULL;
1016
1017 if (!test_and_clear_bit(PG_COMMIT_TO_DS, &req->wb_flags))
1018 goto out;
1019 cinfo->ds->nwritten--;
1020 if (list_is_singular(&req->wb_list)) {
1021 struct pnfs_commit_bucket *bucket;
1022
1023 bucket = list_first_entry(&req->wb_list,
1024 struct pnfs_commit_bucket,
1025 written);
1026 freeme = bucket->wlseg;
1027 bucket->wlseg = NULL;
1028 }
1029out:
1030 nfs_request_remove_commit_list(req, cinfo);
1031 pnfs_put_lseg_locked(freeme);
1032}
1033
1034static void 954static void
1035filelayout_mark_request_commit(struct nfs_page *req, 955filelayout_mark_request_commit(struct nfs_page *req,
1036 struct pnfs_layout_segment *lseg, 956 struct pnfs_layout_segment *lseg,
1037 struct nfs_commit_info *cinfo) 957 struct nfs_commit_info *cinfo,
958 u32 ds_commit_idx)
1038 959
1039{ 960{
1040 struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); 961 struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
1041 u32 i, j; 962 u32 i, j;
1042 struct list_head *list;
1043 struct pnfs_commit_bucket *buckets;
1044 963
1045 if (fl->commit_through_mds) { 964 if (fl->commit_through_mds) {
1046 list = &cinfo->mds->list; 965 nfs_request_add_commit_list(req, &cinfo->mds->list, cinfo);
1047 spin_lock(cinfo->lock); 966 } else {
1048 goto mds_commit; 967 /* Note that we are calling nfs4_fl_calc_j_index on each page
1049 } 968 * that ends up being committed to a data server. An attractive
1050 969 * alternative is to add a field to nfs_write_data and nfs_page
1051 /* Note that we are calling nfs4_fl_calc_j_index on each page 970 * to store the value calculated in filelayout_write_pagelist
1052 * that ends up being committed to a data server. An attractive 971 * and just use that here.
1053 * alternative is to add a field to nfs_write_data and nfs_page
1054 * to store the value calculated in filelayout_write_pagelist
1055 * and just use that here.
1056 */
1057 j = nfs4_fl_calc_j_index(lseg, req_offset(req));
1058 i = select_bucket_index(fl, j);
1059 spin_lock(cinfo->lock);
1060 buckets = cinfo->ds->buckets;
1061 list = &buckets[i].written;
1062 if (list_empty(list)) {
1063 /* Non-empty buckets hold a reference on the lseg. That ref
1064 * is normally transferred to the COMMIT call and released
1065 * there. It could also be released if the last req is pulled
1066 * off due to a rewrite, in which case it will be done in
1067 * filelayout_clear_request_commit
1068 */ 972 */
1069 buckets[i].wlseg = pnfs_get_lseg(lseg); 973 j = nfs4_fl_calc_j_index(lseg, req_offset(req));
1070 } 974 i = select_bucket_index(fl, j);
1071 set_bit(PG_COMMIT_TO_DS, &req->wb_flags); 975 pnfs_layout_mark_request_commit(req, lseg, cinfo, i);
1072 cinfo->ds->nwritten++;
1073
1074mds_commit:
1075 /* nfs_request_add_commit_list(). We need to add req to list without
1076 * dropping cinfo lock.
1077 */
1078 set_bit(PG_CLEAN, &(req)->wb_flags);
1079 nfs_list_add_request(req, list);
1080 cinfo->mds->ncommit++;
1081 spin_unlock(cinfo->lock);
1082 if (!cinfo->dreq) {
1083 inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
1084 inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info,
1085 BDI_RECLAIMABLE);
1086 __mark_inode_dirty(req->wb_context->dentry->d_inode,
1087 I_DIRTY_DATASYNC);
1088 } 976 }
1089} 977}
1090 978
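
With the open-coded bucket bookkeeping gone, the driver-specific part
of filelayout_mark_request_commit() reduces to picking the commit
bucket: nfs4_fl_calc_j_index() maps the request's file offset to a
stripe index, select_bucket_index() folds that into a bucket, and
pnfs_layout_mark_request_commit() does the shared work. As a rough
model only (the real calculation also handles the layout's pattern
offset and the STRIPE_SPARSE vs STRIPE_DENSE cases):

	static unsigned int bucket_for_offset(unsigned long long offset,
					      unsigned long long stripe_unit,
					      unsigned int stripe_count)
	{
		/* which stripe does this offset fall in? */
		unsigned long long j = offset / stripe_unit;

		return (unsigned int)(j % stripe_count);	/* commit bucket */
	}
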
@@ -1138,101 +1026,15 @@ static int filelayout_initiate_commit(struct nfs_commit_data *data, int how)
1138 fh = select_ds_fh_from_commit(lseg, data->ds_commit_index); 1026 fh = select_ds_fh_from_commit(lseg, data->ds_commit_index);
1139 if (fh) 1027 if (fh)
1140 data->args.fh = fh; 1028 data->args.fh = fh;
1141 return nfs_initiate_commit(ds_clnt, data, 1029 return nfs_initiate_commit(ds_clnt, data, NFS_PROTO(data->inode),
1142 &filelayout_commit_call_ops, how, 1030 &filelayout_commit_call_ops, how,
1143 RPC_TASK_SOFTCONN); 1031 RPC_TASK_SOFTCONN);
1144out_err: 1032out_err:
1145 prepare_to_resend_writes(data); 1033 pnfs_generic_prepare_to_resend_writes(data);
1146 filelayout_commit_release(data); 1034 pnfs_generic_commit_release(data);
1147 return -EAGAIN; 1035 return -EAGAIN;
1148} 1036}
1149 1037
1150static int
1151transfer_commit_list(struct list_head *src, struct list_head *dst,
1152 struct nfs_commit_info *cinfo, int max)
1153{
1154 struct nfs_page *req, *tmp;
1155 int ret = 0;
1156
1157 list_for_each_entry_safe(req, tmp, src, wb_list) {
1158 if (!nfs_lock_request(req))
1159 continue;
1160 kref_get(&req->wb_kref);
1161 if (cond_resched_lock(cinfo->lock))
1162 list_safe_reset_next(req, tmp, wb_list);
1163 nfs_request_remove_commit_list(req, cinfo);
1164 clear_bit(PG_COMMIT_TO_DS, &req->wb_flags);
1165 nfs_list_add_request(req, dst);
1166 ret++;
1167 if ((ret == max) && !cinfo->dreq)
1168 break;
1169 }
1170 return ret;
1171}
1172
1173/* Note called with cinfo->lock held. */
1174static int
1175filelayout_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
1176 struct nfs_commit_info *cinfo,
1177 int max)
1178{
1179 struct list_head *src = &bucket->written;
1180 struct list_head *dst = &bucket->committing;
1181 int ret;
1182
1183 ret = transfer_commit_list(src, dst, cinfo, max);
1184 if (ret) {
1185 cinfo->ds->nwritten -= ret;
1186 cinfo->ds->ncommitting += ret;
1187 bucket->clseg = bucket->wlseg;
1188 if (list_empty(src))
1189 bucket->wlseg = NULL;
1190 else
1191 pnfs_get_lseg(bucket->clseg);
1192 }
1193 return ret;
1194}
1195
1196/* Move reqs from written to committing lists, returning count of number moved.
1197 * Note called with cinfo->lock held.
1198 */
1199static int filelayout_scan_commit_lists(struct nfs_commit_info *cinfo,
1200 int max)
1201{
1202 int i, rv = 0, cnt;
1203
1204 for (i = 0; i < cinfo->ds->nbuckets && max != 0; i++) {
1205 cnt = filelayout_scan_ds_commit_list(&cinfo->ds->buckets[i],
1206 cinfo, max);
1207 max -= cnt;
1208 rv += cnt;
1209 }
1210 return rv;
1211}
1212
1213/* Pull everything off the committing lists and dump into @dst */
1214static void filelayout_recover_commit_reqs(struct list_head *dst,
1215 struct nfs_commit_info *cinfo)
1216{
1217 struct pnfs_commit_bucket *b;
1218 struct pnfs_layout_segment *freeme;
1219 int i;
1220
1221restart:
1222 spin_lock(cinfo->lock);
1223 for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
1224 if (transfer_commit_list(&b->written, dst, cinfo, 0)) {
1225 freeme = b->wlseg;
1226 b->wlseg = NULL;
1227 spin_unlock(cinfo->lock);
1228 pnfs_put_lseg(freeme);
1229 goto restart;
1230 }
1231 }
1232 cinfo->ds->nwritten = 0;
1233 spin_unlock(cinfo->lock);
1234}
1235
1236/* filelayout_search_commit_reqs - Search lists in @cinfo for the head request 1038/* filelayout_search_commit_reqs - Search lists in @cinfo for the head request
1237 * for @page 1039 * for @page
1238 * @cinfo - commit info for current inode 1040 * @cinfo - commit info for current inode
@@ -1263,108 +1065,14 @@ filelayout_search_commit_reqs(struct nfs_commit_info *cinfo, struct page *page)
1263 return NULL; 1065 return NULL;
1264} 1066}
1265 1067
1266static void filelayout_retry_commit(struct nfs_commit_info *cinfo, int idx)
1267{
1268 struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
1269 struct pnfs_commit_bucket *bucket;
1270 struct pnfs_layout_segment *freeme;
1271 int i;
1272
1273 for (i = idx; i < fl_cinfo->nbuckets; i++) {
1274 bucket = &fl_cinfo->buckets[i];
1275 if (list_empty(&bucket->committing))
1276 continue;
1277 nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo);
1278 spin_lock(cinfo->lock);
1279 freeme = bucket->clseg;
1280 bucket->clseg = NULL;
1281 spin_unlock(cinfo->lock);
1282 pnfs_put_lseg(freeme);
1283 }
1284}
1285
1286static unsigned int
1287alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list)
1288{
1289 struct pnfs_ds_commit_info *fl_cinfo;
1290 struct pnfs_commit_bucket *bucket;
1291 struct nfs_commit_data *data;
1292 int i;
1293 unsigned int nreq = 0;
1294
1295 fl_cinfo = cinfo->ds;
1296 bucket = fl_cinfo->buckets;
1297 for (i = 0; i < fl_cinfo->nbuckets; i++, bucket++) {
1298 if (list_empty(&bucket->committing))
1299 continue;
1300 data = nfs_commitdata_alloc();
1301 if (!data)
1302 break;
1303 data->ds_commit_index = i;
1304 spin_lock(cinfo->lock);
1305 data->lseg = bucket->clseg;
1306 bucket->clseg = NULL;
1307 spin_unlock(cinfo->lock);
1308 list_add(&data->pages, list);
1309 nreq++;
1310 }
1311
1312 /* Clean up on error */
1313 filelayout_retry_commit(cinfo, i);
1314 /* Caller will clean up entries put on list */
1315 return nreq;
1316}
1317
1318/* This follows nfs_commit_list pretty closely */
1319static int 1068static int
1320filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages, 1069filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
1321 int how, struct nfs_commit_info *cinfo) 1070 int how, struct nfs_commit_info *cinfo)
1322{ 1071{
1323 struct nfs_commit_data *data, *tmp; 1072 return pnfs_generic_commit_pagelist(inode, mds_pages, how, cinfo,
1324 LIST_HEAD(list); 1073 filelayout_initiate_commit);
1325 unsigned int nreq = 0;
1326
1327 if (!list_empty(mds_pages)) {
1328 data = nfs_commitdata_alloc();
1329 if (data != NULL) {
1330 data->lseg = NULL;
1331 list_add(&data->pages, &list);
1332 nreq++;
1333 } else {
1334 nfs_retry_commit(mds_pages, NULL, cinfo);
1335 filelayout_retry_commit(cinfo, 0);
1336 cinfo->completion_ops->error_cleanup(NFS_I(inode));
1337 return -ENOMEM;
1338 }
1339 }
1340
1341 nreq += alloc_ds_commits(cinfo, &list);
1342
1343 if (nreq == 0) {
1344 cinfo->completion_ops->error_cleanup(NFS_I(inode));
1345 goto out;
1346 }
1347
1348 atomic_add(nreq, &cinfo->mds->rpcs_out);
1349
1350 list_for_each_entry_safe(data, tmp, &list, pages) {
1351 list_del_init(&data->pages);
1352 if (!data->lseg) {
1353 nfs_init_commit(data, mds_pages, NULL, cinfo);
1354 nfs_initiate_commit(NFS_CLIENT(inode), data,
1355 data->mds_ops, how, 0);
1356 } else {
1357 struct pnfs_commit_bucket *buckets;
1358
1359 buckets = cinfo->ds->buckets;
1360 nfs_init_commit(data, &buckets[data->ds_commit_index].committing, data->lseg, cinfo);
1361 filelayout_initiate_commit(data, how);
1362 }
1363 }
1364out:
1365 cinfo->ds->ncommitting = 0;
1366 return PNFS_ATTEMPTED;
1367} 1074}
1075
1368static struct nfs4_deviceid_node * 1076static struct nfs4_deviceid_node *
1369filelayout_alloc_deviceid_node(struct nfs_server *server, 1077filelayout_alloc_deviceid_node(struct nfs_server *server,
1370 struct pnfs_device *pdev, gfp_t gfp_flags) 1078 struct pnfs_device *pdev, gfp_t gfp_flags)
@@ -1421,9 +1129,9 @@ static struct pnfs_layoutdriver_type filelayout_type = {
1421 .pg_write_ops = &filelayout_pg_write_ops, 1129 .pg_write_ops = &filelayout_pg_write_ops,
1422 .get_ds_info = &filelayout_get_ds_info, 1130 .get_ds_info = &filelayout_get_ds_info,
1423 .mark_request_commit = filelayout_mark_request_commit, 1131 .mark_request_commit = filelayout_mark_request_commit,
1424 .clear_request_commit = filelayout_clear_request_commit, 1132 .clear_request_commit = pnfs_generic_clear_request_commit,
1425 .scan_commit_lists = filelayout_scan_commit_lists, 1133 .scan_commit_lists = pnfs_generic_scan_commit_lists,
1426 .recover_commit_reqs = filelayout_recover_commit_reqs, 1134 .recover_commit_reqs = pnfs_generic_recover_commit_reqs,
1427 .search_commit_reqs = filelayout_search_commit_reqs, 1135 .search_commit_reqs = filelayout_search_commit_reqs,
1428 .commit_pagelist = filelayout_commit_pagelist, 1136 .commit_pagelist = filelayout_commit_pagelist,
1429 .read_pagelist = filelayout_read_pagelist, 1137 .read_pagelist = filelayout_read_pagelist,
diff --git a/fs/nfs/filelayout/filelayout.h b/fs/nfs/filelayout/filelayout.h
index 7c9f800c49d7..2896cb833a11 100644
--- a/fs/nfs/filelayout/filelayout.h
+++ b/fs/nfs/filelayout/filelayout.h
@@ -33,13 +33,6 @@
33#include "../pnfs.h" 33#include "../pnfs.h"
34 34
35/* 35/*
 36 * Default data server connection timeout and retrans values.
 37 * Set by module parameters dataserver_timeo and dataserver_retrans.
38 */
39#define NFS4_DEF_DS_TIMEO 600 /* in tenths of a second */
40#define NFS4_DEF_DS_RETRANS 5
41
42/*
43 * Field testing shows we need to support up to 4096 stripe indices. 36 * Field testing shows we need to support up to 4096 stripe indices.
44 * We store each index as a u8 (u32 on the wire) to keep the memory footprint 37 * We store each index as a u8 (u32 on the wire) to keep the memory footprint
45 * reasonable. This in turn means we support a maximum of 256 38 * reasonable. This in turn means we support a maximum of 256
@@ -48,32 +41,11 @@
48#define NFS4_PNFS_MAX_STRIPE_CNT 4096 41#define NFS4_PNFS_MAX_STRIPE_CNT 4096
49#define NFS4_PNFS_MAX_MULTI_CNT 256 /* 256 fit into a u8 stripe_index */ 42#define NFS4_PNFS_MAX_MULTI_CNT 256 /* 256 fit into a u8 stripe_index */
50 43
51/* error codes for internal use */
52#define NFS4ERR_RESET_TO_MDS 12001
53
54enum stripetype4 { 44enum stripetype4 {
55 STRIPE_SPARSE = 1, 45 STRIPE_SPARSE = 1,
56 STRIPE_DENSE = 2 46 STRIPE_DENSE = 2
57}; 47};
58 48
59/* Individual ip address */
60struct nfs4_pnfs_ds_addr {
61 struct sockaddr_storage da_addr;
62 size_t da_addrlen;
63 struct list_head da_node; /* nfs4_pnfs_dev_hlist dev_dslist */
64 char *da_remotestr; /* human readable addr+port */
65};
66
67struct nfs4_pnfs_ds {
68 struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */
69 char *ds_remotestr; /* comma sep list of addrs */
70 struct list_head ds_addrs;
71 struct nfs_client *ds_clp;
72 atomic_t ds_count;
73 unsigned long ds_state;
74#define NFS4DS_CONNECTING 0 /* ds is establishing connection */
75};
76
77struct nfs4_file_layout_dsaddr { 49struct nfs4_file_layout_dsaddr {
78 struct nfs4_deviceid_node id_node; 50 struct nfs4_deviceid_node id_node;
79 u32 stripe_count; 51 u32 stripe_count;
@@ -119,17 +91,6 @@ FILELAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg)
119 return &FILELAYOUT_LSEG(lseg)->dsaddr->id_node; 91 return &FILELAYOUT_LSEG(lseg)->dsaddr->id_node;
120} 92}
121 93
122static inline void
123filelayout_mark_devid_invalid(struct nfs4_deviceid_node *node)
124{
125 u32 *p = (u32 *)&node->deviceid;
126
127 printk(KERN_WARNING "NFS: Deviceid [%x%x%x%x] marked out of use.\n",
128 p[0], p[1], p[2], p[3]);
129
130 set_bit(NFS_DEVICEID_INVALID, &node->flags);
131}
132
133static inline bool 94static inline bool
134filelayout_test_devid_invalid(struct nfs4_deviceid_node *node) 95filelayout_test_devid_invalid(struct nfs4_deviceid_node *node)
135{ 96{
@@ -142,7 +103,6 @@ filelayout_test_devid_unavailable(struct nfs4_deviceid_node *node);
142extern struct nfs_fh * 103extern struct nfs_fh *
143nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j); 104nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j);
144 105
145extern void print_ds(struct nfs4_pnfs_ds *ds);
146u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset); 106u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset);
147u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j); 107u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j);
148struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, 108struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c
index bfecac781f19..4f372e224603 100644
--- a/fs/nfs/filelayout/filelayoutdev.c
+++ b/fs/nfs/filelayout/filelayoutdev.c
@@ -31,7 +31,6 @@
31#include <linux/nfs_fs.h> 31#include <linux/nfs_fs.h>
32#include <linux/vmalloc.h> 32#include <linux/vmalloc.h>
33#include <linux/module.h> 33#include <linux/module.h>
34#include <linux/sunrpc/addr.h>
35 34
36#include "../internal.h" 35#include "../internal.h"
37#include "../nfs4session.h" 36#include "../nfs4session.h"
@@ -42,183 +41,6 @@
42static unsigned int dataserver_timeo = NFS4_DEF_DS_TIMEO; 41static unsigned int dataserver_timeo = NFS4_DEF_DS_TIMEO;
43static unsigned int dataserver_retrans = NFS4_DEF_DS_RETRANS; 42static unsigned int dataserver_retrans = NFS4_DEF_DS_RETRANS;
44 43
45/*
46 * Data server cache
47 *
48 * Data servers can be mapped to different device ids.
49 * nfs4_pnfs_ds reference counting
50 * - set to 1 on allocation
51 * - incremented when a device id maps a data server already in the cache.
52 * - decremented when deviceid is removed from the cache.
53 */
54static DEFINE_SPINLOCK(nfs4_ds_cache_lock);
55static LIST_HEAD(nfs4_data_server_cache);
56
57/* Debug routines */
58void
59print_ds(struct nfs4_pnfs_ds *ds)
60{
61 if (ds == NULL) {
62 printk("%s NULL device\n", __func__);
63 return;
64 }
65 printk(" ds %s\n"
66 " ref count %d\n"
67 " client %p\n"
68 " cl_exchange_flags %x\n",
69 ds->ds_remotestr,
70 atomic_read(&ds->ds_count), ds->ds_clp,
71 ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0);
72}
73
74static bool
75same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2)
76{
77 struct sockaddr_in *a, *b;
78 struct sockaddr_in6 *a6, *b6;
79
80 if (addr1->sa_family != addr2->sa_family)
81 return false;
82
83 switch (addr1->sa_family) {
84 case AF_INET:
85 a = (struct sockaddr_in *)addr1;
86 b = (struct sockaddr_in *)addr2;
87
88 if (a->sin_addr.s_addr == b->sin_addr.s_addr &&
89 a->sin_port == b->sin_port)
90 return true;
91 break;
92
93 case AF_INET6:
94 a6 = (struct sockaddr_in6 *)addr1;
95 b6 = (struct sockaddr_in6 *)addr2;
96
97 /* LINKLOCAL addresses must have matching scope_id */
98 if (ipv6_addr_src_scope(&a6->sin6_addr) ==
99 IPV6_ADDR_SCOPE_LINKLOCAL &&
100 a6->sin6_scope_id != b6->sin6_scope_id)
101 return false;
102
103 if (ipv6_addr_equal(&a6->sin6_addr, &b6->sin6_addr) &&
104 a6->sin6_port == b6->sin6_port)
105 return true;
106 break;
107
108 default:
109 dprintk("%s: unhandled address family: %u\n",
110 __func__, addr1->sa_family);
111 return false;
112 }
113
114 return false;
115}
116
117static bool
118_same_data_server_addrs_locked(const struct list_head *dsaddrs1,
119 const struct list_head *dsaddrs2)
120{
121 struct nfs4_pnfs_ds_addr *da1, *da2;
122
123 /* step through both lists, comparing as we go */
124 for (da1 = list_first_entry(dsaddrs1, typeof(*da1), da_node),
125 da2 = list_first_entry(dsaddrs2, typeof(*da2), da_node);
126 da1 != NULL && da2 != NULL;
127 da1 = list_entry(da1->da_node.next, typeof(*da1), da_node),
128 da2 = list_entry(da2->da_node.next, typeof(*da2), da_node)) {
129 if (!same_sockaddr((struct sockaddr *)&da1->da_addr,
130 (struct sockaddr *)&da2->da_addr))
131 return false;
132 }
133 if (da1 == NULL && da2 == NULL)
134 return true;
135
136 return false;
137}
138
139/*
140 * Lookup DS by addresses. nfs4_ds_cache_lock is held
141 */
142static struct nfs4_pnfs_ds *
143_data_server_lookup_locked(const struct list_head *dsaddrs)
144{
145 struct nfs4_pnfs_ds *ds;
146
147 list_for_each_entry(ds, &nfs4_data_server_cache, ds_node)
148 if (_same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs))
149 return ds;
150 return NULL;
151}
152
153/*
154 * Create an rpc connection to the nfs4_pnfs_ds data server
155 * Currently only supports IPv4 and IPv6 addresses
156 */
157static int
158nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)
159{
160 struct nfs_client *clp = ERR_PTR(-EIO);
161 struct nfs4_pnfs_ds_addr *da;
162 int status = 0;
163
164 dprintk("--> %s DS %s au_flavor %d\n", __func__, ds->ds_remotestr,
165 mds_srv->nfs_client->cl_rpcclient->cl_auth->au_flavor);
166
167 list_for_each_entry(da, &ds->ds_addrs, da_node) {
168 dprintk("%s: DS %s: trying address %s\n",
169 __func__, ds->ds_remotestr, da->da_remotestr);
170
171 clp = nfs4_set_ds_client(mds_srv->nfs_client,
172 (struct sockaddr *)&da->da_addr,
173 da->da_addrlen, IPPROTO_TCP,
174 dataserver_timeo, dataserver_retrans);
175 if (!IS_ERR(clp))
176 break;
177 }
178
179 if (IS_ERR(clp)) {
180 status = PTR_ERR(clp);
181 goto out;
182 }
183
184 status = nfs4_init_ds_session(clp, mds_srv->nfs_client->cl_lease_time);
185 if (status)
186 goto out_put;
187
188 smp_wmb();
189 ds->ds_clp = clp;
190 dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr);
191out:
192 return status;
193out_put:
194 nfs_put_client(clp);
195 goto out;
196}
197
198static void
199destroy_ds(struct nfs4_pnfs_ds *ds)
200{
201 struct nfs4_pnfs_ds_addr *da;
202
203 dprintk("--> %s\n", __func__);
204 ifdebug(FACILITY)
205 print_ds(ds);
206
207 nfs_put_client(ds->ds_clp);
208
209 while (!list_empty(&ds->ds_addrs)) {
210 da = list_first_entry(&ds->ds_addrs,
211 struct nfs4_pnfs_ds_addr,
212 da_node);
213 list_del_init(&da->da_node);
214 kfree(da->da_remotestr);
215 kfree(da);
216 }
217
218 kfree(ds->ds_remotestr);
219 kfree(ds);
220}
221
222void 44void
223nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) 45nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
224{ 46{
@@ -229,259 +51,13 @@ nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
229 51
230 for (i = 0; i < dsaddr->ds_num; i++) { 52 for (i = 0; i < dsaddr->ds_num; i++) {
231 ds = dsaddr->ds_list[i]; 53 ds = dsaddr->ds_list[i];
232 if (ds != NULL) { 54 if (ds != NULL)
233 if (atomic_dec_and_lock(&ds->ds_count, 55 nfs4_pnfs_ds_put(ds);
234 &nfs4_ds_cache_lock)) {
235 list_del_init(&ds->ds_node);
236 spin_unlock(&nfs4_ds_cache_lock);
237 destroy_ds(ds);
238 }
239 }
240 } 56 }
241 kfree(dsaddr->stripe_indices); 57 kfree(dsaddr->stripe_indices);
242 kfree(dsaddr); 58 kfree(dsaddr);
243} 59}
244 60
245/*
246 * Create a string with a human readable address and port to avoid
 247 * complicated setup around many dprintks.
248 */
249static char *
250nfs4_pnfs_remotestr(struct list_head *dsaddrs, gfp_t gfp_flags)
251{
252 struct nfs4_pnfs_ds_addr *da;
253 char *remotestr;
254 size_t len;
255 char *p;
256
257 len = 3; /* '{', '}' and eol */
258 list_for_each_entry(da, dsaddrs, da_node) {
259 len += strlen(da->da_remotestr) + 1; /* string plus comma */
260 }
261
262 remotestr = kzalloc(len, gfp_flags);
263 if (!remotestr)
264 return NULL;
265
266 p = remotestr;
267 *(p++) = '{';
268 len--;
269 list_for_each_entry(da, dsaddrs, da_node) {
270 size_t ll = strlen(da->da_remotestr);
271
272 if (ll > len)
273 goto out_err;
274
275 memcpy(p, da->da_remotestr, ll);
276 p += ll;
277 len -= ll;
278
279 if (len < 1)
280 goto out_err;
281 (*p++) = ',';
282 len--;
283 }
284 if (len < 2)
285 goto out_err;
286 *(p++) = '}';
287 *p = '\0';
288 return remotestr;
289out_err:
290 kfree(remotestr);
291 return NULL;
292}
293
294static struct nfs4_pnfs_ds *
295nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags)
296{
297 struct nfs4_pnfs_ds *tmp_ds, *ds = NULL;
298 char *remotestr;
299
300 if (list_empty(dsaddrs)) {
301 dprintk("%s: no addresses defined\n", __func__);
302 goto out;
303 }
304
305 ds = kzalloc(sizeof(*ds), gfp_flags);
306 if (!ds)
307 goto out;
308
 309 /* this is only used for debugging, so it's ok if it's NULL */
310 remotestr = nfs4_pnfs_remotestr(dsaddrs, gfp_flags);
311
312 spin_lock(&nfs4_ds_cache_lock);
313 tmp_ds = _data_server_lookup_locked(dsaddrs);
314 if (tmp_ds == NULL) {
315 INIT_LIST_HEAD(&ds->ds_addrs);
316 list_splice_init(dsaddrs, &ds->ds_addrs);
317 ds->ds_remotestr = remotestr;
318 atomic_set(&ds->ds_count, 1);
319 INIT_LIST_HEAD(&ds->ds_node);
320 ds->ds_clp = NULL;
321 list_add(&ds->ds_node, &nfs4_data_server_cache);
322 dprintk("%s add new data server %s\n", __func__,
323 ds->ds_remotestr);
324 } else {
325 kfree(remotestr);
326 kfree(ds);
327 atomic_inc(&tmp_ds->ds_count);
328 dprintk("%s data server %s found, inc'ed ds_count to %d\n",
329 __func__, tmp_ds->ds_remotestr,
330 atomic_read(&tmp_ds->ds_count));
331 ds = tmp_ds;
332 }
333 spin_unlock(&nfs4_ds_cache_lock);
334out:
335 return ds;
336}
337
338/*
339 * Currently only supports ipv4, ipv6 and one multi-path address.
340 */
341static struct nfs4_pnfs_ds_addr *
342decode_ds_addr(struct net *net, struct xdr_stream *streamp, gfp_t gfp_flags)
343{
344 struct nfs4_pnfs_ds_addr *da = NULL;
345 char *buf, *portstr;
346 __be16 port;
347 int nlen, rlen;
348 int tmp[2];
349 __be32 *p;
350 char *netid, *match_netid;
351 size_t len, match_netid_len;
352 char *startsep = "";
353 char *endsep = "";
354
355
356 /* r_netid */
357 p = xdr_inline_decode(streamp, 4);
358 if (unlikely(!p))
359 goto out_err;
360 nlen = be32_to_cpup(p++);
361
362 p = xdr_inline_decode(streamp, nlen);
363 if (unlikely(!p))
364 goto out_err;
365
366 netid = kmalloc(nlen+1, gfp_flags);
367 if (unlikely(!netid))
368 goto out_err;
369
370 netid[nlen] = '\0';
371 memcpy(netid, p, nlen);
372
373 /* r_addr: ip/ip6addr with port in dec octets - see RFC 5665 */
374 p = xdr_inline_decode(streamp, 4);
375 if (unlikely(!p))
376 goto out_free_netid;
377 rlen = be32_to_cpup(p);
378
379 p = xdr_inline_decode(streamp, rlen);
380 if (unlikely(!p))
381 goto out_free_netid;
382
383 /* port is ".ABC.DEF", 8 chars max */
384 if (rlen > INET6_ADDRSTRLEN + IPV6_SCOPE_ID_LEN + 8) {
385 dprintk("%s: Invalid address, length %d\n", __func__,
386 rlen);
387 goto out_free_netid;
388 }
389 buf = kmalloc(rlen + 1, gfp_flags);
390 if (!buf) {
391 dprintk("%s: Not enough memory\n", __func__);
392 goto out_free_netid;
393 }
394 buf[rlen] = '\0';
395 memcpy(buf, p, rlen);
396
397 /* replace port '.' with '-' */
398 portstr = strrchr(buf, '.');
399 if (!portstr) {
400 dprintk("%s: Failed finding expected dot in port\n",
401 __func__);
402 goto out_free_buf;
403 }
404 *portstr = '-';
405
406 /* find '.' between address and port */
407 portstr = strrchr(buf, '.');
408 if (!portstr) {
409 dprintk("%s: Failed finding expected dot between address and "
410 "port\n", __func__);
411 goto out_free_buf;
412 }
413 *portstr = '\0';
414
415 da = kzalloc(sizeof(*da), gfp_flags);
416 if (unlikely(!da))
417 goto out_free_buf;
418
419 INIT_LIST_HEAD(&da->da_node);
420
421 if (!rpc_pton(net, buf, portstr-buf, (struct sockaddr *)&da->da_addr,
422 sizeof(da->da_addr))) {
423 dprintk("%s: error parsing address %s\n", __func__, buf);
424 goto out_free_da;
425 }
426
427 portstr++;
428 sscanf(portstr, "%d-%d", &tmp[0], &tmp[1]);
429 port = htons((tmp[0] << 8) | (tmp[1]));
430
431 switch (da->da_addr.ss_family) {
432 case AF_INET:
433 ((struct sockaddr_in *)&da->da_addr)->sin_port = port;
434 da->da_addrlen = sizeof(struct sockaddr_in);
435 match_netid = "tcp";
436 match_netid_len = 3;
437 break;
438
439 case AF_INET6:
440 ((struct sockaddr_in6 *)&da->da_addr)->sin6_port = port;
441 da->da_addrlen = sizeof(struct sockaddr_in6);
442 match_netid = "tcp6";
443 match_netid_len = 4;
444 startsep = "[";
445 endsep = "]";
446 break;
447
448 default:
449 dprintk("%s: unsupported address family: %u\n",
450 __func__, da->da_addr.ss_family);
451 goto out_free_da;
452 }
453
454 if (nlen != match_netid_len || strncmp(netid, match_netid, nlen)) {
455 dprintk("%s: ERROR: r_netid \"%s\" != \"%s\"\n",
456 __func__, netid, match_netid);
457 goto out_free_da;
458 }
459
460 /* save human readable address */
461 len = strlen(startsep) + strlen(buf) + strlen(endsep) + 7;
462 da->da_remotestr = kzalloc(len, gfp_flags);
463
464 /* NULL is ok, only used for dprintk */
465 if (da->da_remotestr)
466 snprintf(da->da_remotestr, len, "%s%s%s:%u", startsep,
467 buf, endsep, ntohs(port));
468
469 dprintk("%s: Parsed DS addr %s\n", __func__, da->da_remotestr);
470 kfree(buf);
471 kfree(netid);
472 return da;
473
474out_free_da:
475 kfree(da);
476out_free_buf:
477 dprintk("%s: Error parsing DS addr: %s\n", __func__, buf);
478 kfree(buf);
479out_free_netid:
480 kfree(netid);
481out_err:
482 return NULL;
483}
484
485/* Decode opaque device data and return the result */ 61/* Decode opaque device data and return the result */
486struct nfs4_file_layout_dsaddr * 62struct nfs4_file_layout_dsaddr *
487nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, 63nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
@@ -584,8 +160,8 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
584 160
585 mp_count = be32_to_cpup(p); /* multipath count */ 161 mp_count = be32_to_cpup(p); /* multipath count */
586 for (j = 0; j < mp_count; j++) { 162 for (j = 0; j < mp_count; j++) {
587 da = decode_ds_addr(server->nfs_client->cl_net, 163 da = nfs4_decode_mp_ds_addr(server->nfs_client->cl_net,
588 &stream, gfp_flags); 164 &stream, gfp_flags);
589 if (da) 165 if (da)
590 list_add_tail(&da->da_node, &dsaddrs); 166 list_add_tail(&da->da_node, &dsaddrs);
591 } 167 }
@@ -681,22 +257,7 @@ nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j)
681 return flseg->fh_array[i]; 257 return flseg->fh_array[i];
682} 258}
683 259
684static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds) 260/* Upon return, either ds is connected, or ds is NULL */
685{
686 might_sleep();
687 wait_on_bit_action(&ds->ds_state, NFS4DS_CONNECTING,
688 nfs_wait_bit_killable, TASK_KILLABLE);
689}
690
691static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds)
692{
693 smp_mb__before_atomic();
694 clear_bit(NFS4DS_CONNECTING, &ds->ds_state);
695 smp_mb__after_atomic();
696 wake_up_bit(&ds->ds_state, NFS4DS_CONNECTING);
697}
698
699
700struct nfs4_pnfs_ds * 261struct nfs4_pnfs_ds *
701nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx) 262nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
702{ 263{
@@ -704,29 +265,23 @@ nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
704 struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx]; 265 struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx];
705 struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg); 266 struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg);
706 struct nfs4_pnfs_ds *ret = ds; 267 struct nfs4_pnfs_ds *ret = ds;
268 struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode);
707 269
708 if (ds == NULL) { 270 if (ds == NULL) {
709 printk(KERN_ERR "NFS: %s: No data server for offset index %d\n", 271 printk(KERN_ERR "NFS: %s: No data server for offset index %d\n",
710 __func__, ds_idx); 272 __func__, ds_idx);
711 filelayout_mark_devid_invalid(devid); 273 pnfs_generic_mark_devid_invalid(devid);
712 goto out; 274 goto out;
713 } 275 }
714 smp_rmb(); 276 smp_rmb();
715 if (ds->ds_clp) 277 if (ds->ds_clp)
716 goto out_test_devid; 278 goto out_test_devid;
717 279
718 if (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) == 0) { 280 nfs4_pnfs_ds_connect(s, ds, devid, dataserver_timeo,
719 struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode); 281 dataserver_retrans, 4,
720 int err; 282 s->nfs_client->cl_minorversion,
721 283 s->nfs_client->cl_rpcclient->cl_auth->au_flavor);
722 err = nfs4_ds_connect(s, ds); 284
723 if (err)
724 nfs4_mark_deviceid_unavailable(devid);
725 nfs4_clear_ds_conn_bit(ds);
726 } else {
727 /* Either ds is connected, or ds is NULL */
728 nfs4_wait_ds_connect(ds);
729 }
730out_test_devid: 285out_test_devid:
731 if (filelayout_test_devid_unavailable(devid)) 286 if (filelayout_test_devid_unavailable(devid))
732 ret = NULL; 287 ret = NULL;
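
The NFS4DS_CONNECTING handshake deleted above (test_and_set_bit() to
elect a connecting thread, nfs4_wait_ds_connect() for everyone else)
now lives behind the generic nfs4_pnfs_ds_connect(), which also takes
the timeouts, NFS version and auth flavor. A sketch of the underlying
"first caller connects, latecomers wait" idiom, with illustrative
names around the kernel's bitop and wait-bit primitives:

	#define DS_SKETCH_CONNECTING	0

	static void ds_connect_once(unsigned long *state,
				    void (*do_connect)(void))
	{
		if (test_and_set_bit(DS_SKETCH_CONNECTING, state) == 0) {
			do_connect();		/* we won the race */
			smp_mb__before_atomic();
			clear_bit(DS_SKETCH_CONNECTING, state);
			smp_mb__after_atomic();
			wake_up_bit(state, DS_SKETCH_CONNECTING);
		} else {
			/* Someone else is connecting: sleep until done. */
			wait_on_bit(state, DS_SKETCH_CONNECTING,
				    TASK_KILLABLE);
		}
	}
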
diff --git a/fs/nfs/flexfilelayout/Makefile b/fs/nfs/flexfilelayout/Makefile
new file mode 100644
index 000000000000..1d2c9f6bbcd4
--- /dev/null
+++ b/fs/nfs/flexfilelayout/Makefile
@@ -0,0 +1,5 @@
1#
2# Makefile for the pNFS Flexfile Layout Driver kernel module
3#
4obj-$(CONFIG_PNFS_FLEXFILE_LAYOUT) += nfs_layout_flexfiles.o
5nfs_layout_flexfiles-y := flexfilelayout.o flexfilelayoutdev.o
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
new file mode 100644
index 000000000000..315cc68945b9
--- /dev/null
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -0,0 +1,1533 @@
1/*
2 * Module for pnfs flexfile layout driver.
3 *
4 * Copyright (c) 2014, Primary Data, Inc. All rights reserved.
5 *
6 * Tao Peng <bergwolf@primarydata.com>
7 */
8
9#include <linux/nfs_fs.h>
10#include <linux/nfs_page.h>
11#include <linux/module.h>
12
13#include <linux/sunrpc/metrics.h>
14#include <linux/nfs_idmap.h>
15
16#include "flexfilelayout.h"
17#include "../nfs4session.h"
18#include "../internal.h"
19#include "../delegation.h"
20#include "../nfs4trace.h"
21#include "../iostat.h"
22#include "../nfs.h"
23
24#define NFSDBG_FACILITY NFSDBG_PNFS_LD
25
26#define FF_LAYOUT_POLL_RETRY_MAX (15*HZ)
27
28static struct pnfs_layout_hdr *
29ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
30{
31 struct nfs4_flexfile_layout *ffl;
32
33 ffl = kzalloc(sizeof(*ffl), gfp_flags);
34 if (ffl) {
35 INIT_LIST_HEAD(&ffl->error_list);
36 return &ffl->generic_hdr;
37 } else
38 return NULL;
39}
40
41static void
42ff_layout_free_layout_hdr(struct pnfs_layout_hdr *lo)
43{
44 struct nfs4_ff_layout_ds_err *err, *n;
45
46 list_for_each_entry_safe(err, n, &FF_LAYOUT_FROM_HDR(lo)->error_list,
47 list) {
48 list_del(&err->list);
49 kfree(err);
50 }
51 kfree(FF_LAYOUT_FROM_HDR(lo));
52}
53
54static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
55{
56 __be32 *p;
57
58 p = xdr_inline_decode(xdr, NFS4_STATEID_SIZE);
59 if (unlikely(p == NULL))
60 return -ENOBUFS;
61 memcpy(stateid, p, NFS4_STATEID_SIZE);
62 dprintk("%s: stateid id= [%x%x%x%x]\n", __func__,
63 p[0], p[1], p[2], p[3]);
64 return 0;
65}
66
67static int decode_deviceid(struct xdr_stream *xdr, struct nfs4_deviceid *devid)
68{
69 __be32 *p;
70
71 p = xdr_inline_decode(xdr, NFS4_DEVICEID4_SIZE);
72 if (unlikely(!p))
73 return -ENOBUFS;
74 memcpy(devid, p, NFS4_DEVICEID4_SIZE);
75 nfs4_print_deviceid(devid);
76 return 0;
77}
78
79static int decode_nfs_fh(struct xdr_stream *xdr, struct nfs_fh *fh)
80{
81 __be32 *p;
82
83 p = xdr_inline_decode(xdr, 4);
84 if (unlikely(!p))
85 return -ENOBUFS;
86 fh->size = be32_to_cpup(p++);
87 if (fh->size > sizeof(struct nfs_fh)) {
88 printk(KERN_ERR "NFS flexfiles: Too big fh received %d\n",
89 fh->size);
90 return -EOVERFLOW;
91 }
92 /* fh.data */
93 p = xdr_inline_decode(xdr, fh->size);
94 if (unlikely(!p))
95 return -ENOBUFS;
96 memcpy(&fh->data, p, fh->size);
97 dprintk("%s: fh len %d\n", __func__, fh->size);
98
99 return 0;
100}
101
102/*
103 * Currently only stringified uids and gids are accepted.
104 * I.e., kerberos is not supported to the DSes, so no principals.
105 *
106 * That means that one common function will suffice, but when
107 * principals are added, this should be split to accommodate
108 * calls to both nfs_map_name_to_uid() and nfs_map_group_to_gid().
109 */
110static int
111decode_name(struct xdr_stream *xdr, u32 *id)
112{
113 __be32 *p;
114 int len;
115
116 /* opaque_length(4)*/
117 p = xdr_inline_decode(xdr, 4);
118 if (unlikely(!p))
119 return -ENOBUFS;
120 len = be32_to_cpup(p++);
121 if (len < 0)
122 return -EINVAL;
123
124 dprintk("%s: len %u\n", __func__, len);
125
126 /* opaque body */
127 p = xdr_inline_decode(xdr, len);
128 if (unlikely(!p))
129 return -ENOBUFS;
130
131 if (!nfs_map_string_to_numeric((char *)p, len, id))
132 return -EINVAL;
133
134 return 0;
135}
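decode_name() walks a plain XDR opaque: a 4-byte big-endian length followed by that many bytes, which here are ASCII digits. A minimal user-space sketch of the same wire format, assuming a raw buffer (parse_name() is hypothetical and purely illustrative):

	#include <stdint.h>
	#include <stdlib.h>
	#include <string.h>
	#include <arpa/inet.h>

	static int parse_name(const uint8_t *buf, size_t buflen, uint32_t *id)
	{
		uint32_t len;
		char tmp[16];

		if (buflen < 4)
			return -1;
		memcpy(&len, buf, 4);
		len = ntohl(len);			/* opaque_length(4) */
		if (len == 0 || len >= sizeof(tmp) || buflen - 4 < len)
			return -1;
		memcpy(tmp, buf + 4, len);		/* opaque body, e.g. "1000" */
		tmp[len] = '\0';
		*id = (uint32_t)strtoul(tmp, NULL, 10);	/* stringified uid/gid */
		return 0;
	}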
136
137static void ff_layout_free_mirror_array(struct nfs4_ff_layout_segment *fls)
138{
139 int i;
140
141 if (fls->mirror_array) {
142 for (i = 0; i < fls->mirror_array_cnt; i++) {
143 /* normally mirror_ds is freed in
144 * .free_deviceid_node but we still do it here
145			 * for the .alloc_lseg error path */
146 if (fls->mirror_array[i]) {
147 kfree(fls->mirror_array[i]->fh_versions);
148 nfs4_ff_layout_put_deviceid(fls->mirror_array[i]->mirror_ds);
149 kfree(fls->mirror_array[i]);
150 }
151 }
152 kfree(fls->mirror_array);
153 fls->mirror_array = NULL;
154 }
155}
156
157static int ff_layout_check_layout(struct nfs4_layoutget_res *lgr)
158{
159 int ret = 0;
160
161 dprintk("--> %s\n", __func__);
162
163 /* FIXME: remove this check when layout segment support is added */
164 if (lgr->range.offset != 0 ||
165 lgr->range.length != NFS4_MAX_UINT64) {
166 dprintk("%s Only whole file layouts supported. Use MDS i/o\n",
167 __func__);
168 ret = -EINVAL;
169 }
170
171 dprintk("--> %s returns %d\n", __func__, ret);
172 return ret;
173}
174
175static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls)
176{
177 if (fls) {
178 ff_layout_free_mirror_array(fls);
179 kfree(fls);
180 }
181}
182
183static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls)
184{
185 struct nfs4_ff_layout_mirror *tmp;
186 int i, j;
187
188 for (i = 0; i < fls->mirror_array_cnt - 1; i++) {
189 for (j = i + 1; j < fls->mirror_array_cnt; j++)
190 if (fls->mirror_array[i]->efficiency <
191 fls->mirror_array[j]->efficiency) {
192 tmp = fls->mirror_array[i];
193 fls->mirror_array[i] = fls->mirror_array[j];
194 fls->mirror_array[j] = tmp;
195 }
196 }
197}
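ff_layout_sort_mirrors() is an open-coded O(n^2) exchange sort, descending by efficiency, which is fine for the small mirror counts expected here. A sketch of the same ordering using the kernel's generic sort() helper from <linux/sort.h>, shown only as an alternative (ff_layout_sort_mirrors_alt() is hypothetical):

	#include <linux/sort.h>

	static int mirror_cmp(const void *a, const void *b)
	{
		const struct nfs4_ff_layout_mirror *ma =
			*(const struct nfs4_ff_layout_mirror **)a;
		const struct nfs4_ff_layout_mirror *mb =
			*(const struct nfs4_ff_layout_mirror **)b;

		/* higher efficiency sorts first */
		if (ma->efficiency == mb->efficiency)
			return 0;
		return mb->efficiency > ma->efficiency ? 1 : -1;
	}

	static void ff_layout_sort_mirrors_alt(struct nfs4_ff_layout_segment *fls)
	{
		sort(fls->mirror_array, fls->mirror_array_cnt,
		     sizeof(fls->mirror_array[0]), mirror_cmp, NULL);
	}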
198
199static struct pnfs_layout_segment *
200ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
201 struct nfs4_layoutget_res *lgr,
202 gfp_t gfp_flags)
203{
204 struct pnfs_layout_segment *ret;
205 struct nfs4_ff_layout_segment *fls = NULL;
206 struct xdr_stream stream;
207 struct xdr_buf buf;
208 struct page *scratch;
209 u64 stripe_unit;
210 u32 mirror_array_cnt;
211 __be32 *p;
212 int i, rc;
213
214 dprintk("--> %s\n", __func__);
215 scratch = alloc_page(gfp_flags);
216 if (!scratch)
217 return ERR_PTR(-ENOMEM);
218
219 xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages,
220 lgr->layoutp->len);
221 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
222
223 /* stripe unit and mirror_array_cnt */
224 rc = -EIO;
225 p = xdr_inline_decode(&stream, 8 + 4);
226 if (!p)
227 goto out_err_free;
228
229 p = xdr_decode_hyper(p, &stripe_unit);
230 mirror_array_cnt = be32_to_cpup(p++);
231 dprintk("%s: stripe_unit=%llu mirror_array_cnt=%u\n", __func__,
232 stripe_unit, mirror_array_cnt);
233
234 if (mirror_array_cnt > NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT ||
235 mirror_array_cnt == 0)
236 goto out_err_free;
237
238 rc = -ENOMEM;
239 fls = kzalloc(sizeof(*fls), gfp_flags);
240 if (!fls)
241 goto out_err_free;
242
243 fls->mirror_array_cnt = mirror_array_cnt;
244 fls->stripe_unit = stripe_unit;
245 fls->mirror_array = kcalloc(fls->mirror_array_cnt,
246 sizeof(fls->mirror_array[0]), gfp_flags);
247 if (fls->mirror_array == NULL)
248 goto out_err_free;
249
250 for (i = 0; i < fls->mirror_array_cnt; i++) {
251 struct nfs4_deviceid devid;
252 struct nfs4_deviceid_node *idnode;
253 u32 ds_count;
254 u32 fh_count;
255 int j;
256
257 rc = -EIO;
258 p = xdr_inline_decode(&stream, 4);
259 if (!p)
260 goto out_err_free;
261 ds_count = be32_to_cpup(p);
262
263 /* FIXME: allow for striping? */
264 if (ds_count != 1)
265 goto out_err_free;
266
267 fls->mirror_array[i] =
268 kzalloc(sizeof(struct nfs4_ff_layout_mirror),
269 gfp_flags);
270 if (fls->mirror_array[i] == NULL) {
271 rc = -ENOMEM;
272 goto out_err_free;
273 }
274
275 spin_lock_init(&fls->mirror_array[i]->lock);
276 fls->mirror_array[i]->ds_count = ds_count;
277
278 /* deviceid */
279 rc = decode_deviceid(&stream, &devid);
280 if (rc)
281 goto out_err_free;
282
283 idnode = nfs4_find_get_deviceid(NFS_SERVER(lh->plh_inode),
284 &devid, lh->plh_lc_cred,
285 gfp_flags);
286		/*
287		 * Upon success, mirror_ds was allocated by a previous
288		 * getdeviceinfo, or newly by .alloc_deviceid_node;
289		 * nfs4_find_get_deviceid() failure is in fact a getdeviceinfo failure.
290		 */
291 if (idnode)
292 fls->mirror_array[i]->mirror_ds =
293 FF_LAYOUT_MIRROR_DS(idnode);
294 else
295 goto out_err_free;
296
297 /* efficiency */
298 rc = -EIO;
299 p = xdr_inline_decode(&stream, 4);
300 if (!p)
301 goto out_err_free;
302 fls->mirror_array[i]->efficiency = be32_to_cpup(p);
303
304 /* stateid */
305 rc = decode_stateid(&stream, &fls->mirror_array[i]->stateid);
306 if (rc)
307 goto out_err_free;
308
309 /* fh */
310 p = xdr_inline_decode(&stream, 4);
311 if (!p)
312 goto out_err_free;
313 fh_count = be32_to_cpup(p);
314
315 fls->mirror_array[i]->fh_versions =
316 kzalloc(fh_count * sizeof(struct nfs_fh),
317 gfp_flags);
318 if (fls->mirror_array[i]->fh_versions == NULL) {
319 rc = -ENOMEM;
320 goto out_err_free;
321 }
322
323 for (j = 0; j < fh_count; j++) {
324 rc = decode_nfs_fh(&stream,
325 &fls->mirror_array[i]->fh_versions[j]);
326 if (rc)
327 goto out_err_free;
328 }
329
330 fls->mirror_array[i]->fh_versions_cnt = fh_count;
331
332 /* user */
333 rc = decode_name(&stream, &fls->mirror_array[i]->uid);
334 if (rc)
335 goto out_err_free;
336
337 /* group */
338 rc = decode_name(&stream, &fls->mirror_array[i]->gid);
339 if (rc)
340 goto out_err_free;
341
342 dprintk("%s: uid %d gid %d\n", __func__,
343 fls->mirror_array[i]->uid,
344 fls->mirror_array[i]->gid);
345 }
346
347 ff_layout_sort_mirrors(fls);
348 rc = ff_layout_check_layout(lgr);
349 if (rc)
350 goto out_err_free;
351
352 ret = &fls->generic_hdr;
353 dprintk("<-- %s (success)\n", __func__);
354out_free_page:
355 __free_page(scratch);
356 return ret;
357out_err_free:
358 _ff_layout_free_lseg(fls);
359 ret = ERR_PTR(rc);
360 dprintk("<-- %s (%d)\n", __func__, rc);
361 goto out_free_page;
362}
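For reference, the opaque layout body that ff_layout_alloc_lseg() just decoded has the following shape (field names are informal, reconstructed from the reads above):

	stripe_unit          hyper (8 bytes)
	mirror_array_cnt     uint32
	mirror_array_cnt times:
	    ds_count         uint32 (must currently be 1)
	    deviceid         16 bytes (NFS4_DEVICEID4_SIZE)
	    efficiency       uint32
	    stateid          16 bytes (NFS4_STATEID_SIZE)
	    fh_count         uint32, then fh_count opaque filehandles
	    user             XDR opaque (stringified uid)
	    group            XDR opaque (stringified gid)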
363
364static bool ff_layout_has_rw_segments(struct pnfs_layout_hdr *layout)
365{
366 struct pnfs_layout_segment *lseg;
367
368 list_for_each_entry(lseg, &layout->plh_segs, pls_list)
369 if (lseg->pls_range.iomode == IOMODE_RW)
370 return true;
371
372 return false;
373}
374
375static void
376ff_layout_free_lseg(struct pnfs_layout_segment *lseg)
377{
378 struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
379 int i;
380
381 dprintk("--> %s\n", __func__);
382
383 for (i = 0; i < fls->mirror_array_cnt; i++) {
384 if (fls->mirror_array[i]) {
385 nfs4_ff_layout_put_deviceid(fls->mirror_array[i]->mirror_ds);
386 fls->mirror_array[i]->mirror_ds = NULL;
387 if (fls->mirror_array[i]->cred) {
388 put_rpccred(fls->mirror_array[i]->cred);
389 fls->mirror_array[i]->cred = NULL;
390 }
391 }
392 }
393
394 if (lseg->pls_range.iomode == IOMODE_RW) {
395 struct nfs4_flexfile_layout *ffl;
396 struct inode *inode;
397
398 ffl = FF_LAYOUT_FROM_HDR(lseg->pls_layout);
399 inode = ffl->generic_hdr.plh_inode;
400 spin_lock(&inode->i_lock);
401 if (!ff_layout_has_rw_segments(lseg->pls_layout)) {
402 ffl->commit_info.nbuckets = 0;
403 kfree(ffl->commit_info.buckets);
404 ffl->commit_info.buckets = NULL;
405 }
406 spin_unlock(&inode->i_lock);
407 }
408 _ff_layout_free_lseg(fls);
409}
410
411/* Return 1 until we have support for multiple lsegs */
412static int
413ff_layout_get_lseg_count(struct nfs4_ff_layout_segment *fls)
414{
415 return 1;
416}
417
418static int
419ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg,
420 struct nfs_commit_info *cinfo,
421 gfp_t gfp_flags)
422{
423 struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
424 struct pnfs_commit_bucket *buckets;
425 int size;
426
427 if (cinfo->ds->nbuckets != 0) {
428		/* This assumes there is only one RW lseg per file.
429		 * To support multiple lsegs per file, we need to
430		 * change struct pnfs_commit_bucket to allow
431		 * dynamically increasing nbuckets.
432		 */
433 return 0;
434 }
435
436 size = ff_layout_get_lseg_count(fls) * FF_LAYOUT_MIRROR_COUNT(lseg);
437
438 buckets = kcalloc(size, sizeof(struct pnfs_commit_bucket),
439 gfp_flags);
440 if (!buckets)
441 return -ENOMEM;
442 else {
443 int i;
444
445 spin_lock(cinfo->lock);
446 if (cinfo->ds->nbuckets != 0)
447 kfree(buckets);
448 else {
449 cinfo->ds->buckets = buckets;
450 cinfo->ds->nbuckets = size;
451 for (i = 0; i < size; i++) {
452 INIT_LIST_HEAD(&buckets[i].written);
453 INIT_LIST_HEAD(&buckets[i].committing);
454 /* mark direct verifier as unset */
455 buckets[i].direct_verf.committed =
456 NFS_INVALID_STABLE_HOW;
457 }
458 }
459 spin_unlock(cinfo->lock);
460 return 0;
461 }
462}
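With the single-lseg restriction above, the bucket count reduces to the mirror count; for a 3-way mirrored file, size = 1 * 3 = 3 commit buckets. The recheck of cinfo->ds->nbuckets under cinfo->lock handles the race where two writers allocate concurrently: the loser simply frees its array and reuses the installed one.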
463
464static struct nfs4_pnfs_ds *
465ff_layout_choose_best_ds_for_read(struct nfs_pageio_descriptor *pgio,
466 int *best_idx)
467{
468 struct nfs4_ff_layout_segment *fls;
469 struct nfs4_pnfs_ds *ds;
470 int idx;
471
472 fls = FF_LAYOUT_LSEG(pgio->pg_lseg);
473 /* mirrors are sorted by efficiency */
474 for (idx = 0; idx < fls->mirror_array_cnt; idx++) {
475 ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, idx, false);
476 if (ds) {
477 *best_idx = idx;
478 return ds;
479 }
480 }
481
482 return NULL;
483}
484
485static void
486ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
487 struct nfs_page *req)
488{
489 struct nfs_pgio_mirror *pgm;
490 struct nfs4_ff_layout_mirror *mirror;
491 struct nfs4_pnfs_ds *ds;
492 int ds_idx;
493
494 /* Use full layout for now */
495 if (!pgio->pg_lseg)
496 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
497 req->wb_context,
498 0,
499 NFS4_MAX_UINT64,
500 IOMODE_READ,
501 GFP_KERNEL);
502 /* If no lseg, fall back to read through mds */
503 if (pgio->pg_lseg == NULL)
504 goto out_mds;
505
506 ds = ff_layout_choose_best_ds_for_read(pgio, &ds_idx);
507 if (!ds)
508 goto out_mds;
509 mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx);
510
511 pgio->pg_mirror_idx = ds_idx;
512
513 /* read always uses only one mirror - idx 0 for pgio layer */
514 pgm = &pgio->pg_mirrors[0];
515 pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].rsize;
516
517 return;
518out_mds:
519 pnfs_put_lseg(pgio->pg_lseg);
520 pgio->pg_lseg = NULL;
521 nfs_pageio_reset_read_mds(pgio);
522}
523
524static void
525ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
526 struct nfs_page *req)
527{
528 struct nfs4_ff_layout_mirror *mirror;
529 struct nfs_pgio_mirror *pgm;
530 struct nfs_commit_info cinfo;
531 struct nfs4_pnfs_ds *ds;
532 int i;
533 int status;
534
535 if (!pgio->pg_lseg)
536 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
537 req->wb_context,
538 0,
539 NFS4_MAX_UINT64,
540 IOMODE_RW,
541 GFP_NOFS);
542 /* If no lseg, fall back to write through mds */
543 if (pgio->pg_lseg == NULL)
544 goto out_mds;
545
546 nfs_init_cinfo(&cinfo, pgio->pg_inode, pgio->pg_dreq);
547 status = ff_layout_alloc_commit_info(pgio->pg_lseg, &cinfo, GFP_NOFS);
548 if (status < 0)
549 goto out_mds;
550
551 /* Use a direct mapping of ds_idx to pgio mirror_idx */
552 if (WARN_ON_ONCE(pgio->pg_mirror_count !=
553 FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg)))
554 goto out_mds;
555
556 for (i = 0; i < pgio->pg_mirror_count; i++) {
557 ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, i, true);
558 if (!ds)
559 goto out_mds;
560 pgm = &pgio->pg_mirrors[i];
561 mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i);
562 pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].wsize;
563 }
564
565 return;
566
567out_mds:
568 pnfs_put_lseg(pgio->pg_lseg);
569 pgio->pg_lseg = NULL;
570 nfs_pageio_reset_write_mds(pgio);
571}
572
573static unsigned int
574ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio,
575 struct nfs_page *req)
576{
577 if (!pgio->pg_lseg)
578 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
579 req->wb_context,
580 0,
581 NFS4_MAX_UINT64,
582 IOMODE_RW,
583 GFP_NOFS);
584 if (pgio->pg_lseg)
585 return FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg);
586
587 /* no lseg means that pnfs is not in use, so no mirroring here */
588 pnfs_put_lseg(pgio->pg_lseg);
589 pgio->pg_lseg = NULL;
590 nfs_pageio_reset_write_mds(pgio);
591 return 1;
592}
593
594static const struct nfs_pageio_ops ff_layout_pg_read_ops = {
595 .pg_init = ff_layout_pg_init_read,
596 .pg_test = pnfs_generic_pg_test,
597 .pg_doio = pnfs_generic_pg_readpages,
598 .pg_cleanup = pnfs_generic_pg_cleanup,
599};
600
601static const struct nfs_pageio_ops ff_layout_pg_write_ops = {
602 .pg_init = ff_layout_pg_init_write,
603 .pg_test = pnfs_generic_pg_test,
604 .pg_doio = pnfs_generic_pg_writepages,
605 .pg_get_mirror_count = ff_layout_pg_get_mirror_count_write,
606 .pg_cleanup = pnfs_generic_pg_cleanup,
607};
608
609static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs)
610{
611 struct rpc_task *task = &hdr->task;
612
613 pnfs_layoutcommit_inode(hdr->inode, false);
614
615 if (retry_pnfs) {
616 dprintk("%s Reset task %5u for i/o through pNFS "
617 "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
618 hdr->task.tk_pid,
619 hdr->inode->i_sb->s_id,
620 (unsigned long long)NFS_FILEID(hdr->inode),
621 hdr->args.count,
622 (unsigned long long)hdr->args.offset);
623
624 if (!hdr->dreq) {
625 struct nfs_open_context *ctx;
626
627 ctx = nfs_list_entry(hdr->pages.next)->wb_context;
628 set_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
629 hdr->completion_ops->error_cleanup(&hdr->pages);
630 } else {
631 nfs_direct_set_resched_writes(hdr->dreq);
632 /* fake unstable write to let common nfs resend pages */
633 hdr->verf.committed = NFS_UNSTABLE;
634 hdr->good_bytes = 0;
635 }
636 return;
637 }
638
639 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
640 dprintk("%s Reset task %5u for i/o through MDS "
641 "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
642 hdr->task.tk_pid,
643 hdr->inode->i_sb->s_id,
644 (unsigned long long)NFS_FILEID(hdr->inode),
645 hdr->args.count,
646 (unsigned long long)hdr->args.offset);
647
648 task->tk_status = pnfs_write_done_resend_to_mds(hdr);
649 }
650}
651
652static void ff_layout_reset_read(struct nfs_pgio_header *hdr)
653{
654 struct rpc_task *task = &hdr->task;
655
656 pnfs_layoutcommit_inode(hdr->inode, false);
657
658 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
659 dprintk("%s Reset task %5u for i/o through MDS "
660 "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
661 hdr->task.tk_pid,
662 hdr->inode->i_sb->s_id,
663 (unsigned long long)NFS_FILEID(hdr->inode),
664 hdr->args.count,
665 (unsigned long long)hdr->args.offset);
666
667 task->tk_status = pnfs_read_done_resend_to_mds(hdr);
668 }
669}
670
671static int ff_layout_async_handle_error_v4(struct rpc_task *task,
672 struct nfs4_state *state,
673 struct nfs_client *clp,
674 struct pnfs_layout_segment *lseg,
675 int idx)
676{
677 struct pnfs_layout_hdr *lo = lseg->pls_layout;
678 struct inode *inode = lo->plh_inode;
679 struct nfs_server *mds_server = NFS_SERVER(inode);
680
681 struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
682 struct nfs_client *mds_client = mds_server->nfs_client;
683 struct nfs4_slot_table *tbl = &clp->cl_session->fc_slot_table;
684
685 if (task->tk_status >= 0)
686 return 0;
687
688 switch (task->tk_status) {
689 /* MDS state errors */
690 case -NFS4ERR_DELEG_REVOKED:
691 case -NFS4ERR_ADMIN_REVOKED:
692 case -NFS4ERR_BAD_STATEID:
693 if (state == NULL)
694 break;
695 nfs_remove_bad_delegation(state->inode);
696 case -NFS4ERR_OPENMODE:
697 if (state == NULL)
698 break;
699 if (nfs4_schedule_stateid_recovery(mds_server, state) < 0)
700 goto out_bad_stateid;
701 goto wait_on_recovery;
702 case -NFS4ERR_EXPIRED:
703 if (state != NULL) {
704 if (nfs4_schedule_stateid_recovery(mds_server, state) < 0)
705 goto out_bad_stateid;
706 }
707 nfs4_schedule_lease_recovery(mds_client);
708 goto wait_on_recovery;
709 /* DS session errors */
710 case -NFS4ERR_BADSESSION:
711 case -NFS4ERR_BADSLOT:
712 case -NFS4ERR_BAD_HIGH_SLOT:
713 case -NFS4ERR_DEADSESSION:
714 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
715 case -NFS4ERR_SEQ_FALSE_RETRY:
716 case -NFS4ERR_SEQ_MISORDERED:
717 dprintk("%s ERROR %d, Reset session. Exchangeid "
718 "flags 0x%x\n", __func__, task->tk_status,
719 clp->cl_exchange_flags);
720 nfs4_schedule_session_recovery(clp->cl_session, task->tk_status);
721 break;
722 case -NFS4ERR_DELAY:
723 case -NFS4ERR_GRACE:
724 rpc_delay(task, FF_LAYOUT_POLL_RETRY_MAX);
725 break;
726 case -NFS4ERR_RETRY_UNCACHED_REP:
727 break;
728 /* Invalidate Layout errors */
729 case -NFS4ERR_PNFS_NO_LAYOUT:
730 case -ESTALE: /* mapped NFS4ERR_STALE */
731 case -EBADHANDLE: /* mapped NFS4ERR_BADHANDLE */
732 case -EISDIR: /* mapped NFS4ERR_ISDIR */
733 case -NFS4ERR_FHEXPIRED:
734 case -NFS4ERR_WRONG_TYPE:
735 dprintk("%s Invalid layout error %d\n", __func__,
736 task->tk_status);
737 /*
738 * Destroy layout so new i/o will get a new layout.
739 * Layout will not be destroyed until all current lseg
740 * references are put. Mark layout as invalid to resend failed
741 * i/o and all i/o waiting on the slot table to the MDS until
742 * layout is destroyed and a new valid layout is obtained.
743 */
744 pnfs_destroy_layout(NFS_I(inode));
745 rpc_wake_up(&tbl->slot_tbl_waitq);
746 goto reset;
747 /* RPC connection errors */
748 case -ECONNREFUSED:
749 case -EHOSTDOWN:
750 case -EHOSTUNREACH:
751 case -ENETUNREACH:
752 case -EIO:
753 case -ETIMEDOUT:
754 case -EPIPE:
755 dprintk("%s DS connection error %d\n", __func__,
756 task->tk_status);
757 nfs4_mark_deviceid_unavailable(devid);
758 rpc_wake_up(&tbl->slot_tbl_waitq);
759 /* fall through */
760 default:
761 if (ff_layout_has_available_ds(lseg))
762 return -NFS4ERR_RESET_TO_PNFS;
763reset:
764 dprintk("%s Retry through MDS. Error %d\n", __func__,
765 task->tk_status);
766 return -NFS4ERR_RESET_TO_MDS;
767 }
768out:
769 task->tk_status = 0;
770 return -EAGAIN;
771out_bad_stateid:
772 task->tk_status = -EIO;
773 return 0;
774wait_on_recovery:
775 rpc_sleep_on(&mds_client->cl_rpcwaitq, task, NULL);
776 if (test_bit(NFS4CLNT_MANAGER_RUNNING, &mds_client->cl_state) == 0)
777 rpc_wake_up_queued_task(&mds_client->cl_rpcwaitq, task);
778 goto out;
779}
780
781/* Retry all errors through either pNFS or MDS except for -EJUKEBOX */
782static int ff_layout_async_handle_error_v3(struct rpc_task *task,
783 struct pnfs_layout_segment *lseg,
784 int idx)
785{
786 struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
787
788 if (task->tk_status >= 0)
789 return 0;
790
791 if (task->tk_status != -EJUKEBOX) {
792 dprintk("%s DS connection error %d\n", __func__,
793 task->tk_status);
794 nfs4_mark_deviceid_unavailable(devid);
795 if (ff_layout_has_available_ds(lseg))
796 return -NFS4ERR_RESET_TO_PNFS;
797 else
798 return -NFS4ERR_RESET_TO_MDS;
799 }
800
801 if (task->tk_status == -EJUKEBOX)
802 nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY);
803 task->tk_status = 0;
804 rpc_restart_call(task);
805 rpc_delay(task, NFS_JUKEBOX_RETRY_TIME);
806 return -EAGAIN;
807}
808
809static int ff_layout_async_handle_error(struct rpc_task *task,
810 struct nfs4_state *state,
811 struct nfs_client *clp,
812 struct pnfs_layout_segment *lseg,
813 int idx)
814{
815 int vers = clp->cl_nfs_mod->rpc_vers->number;
816
817 switch (vers) {
818 case 3:
819 return ff_layout_async_handle_error_v3(task, lseg, idx);
820 case 4:
821 return ff_layout_async_handle_error_v4(task, state, clp,
822 lseg, idx);
823 default:
824 /* should never happen */
825 WARN_ON_ONCE(1);
826 return 0;
827 }
828}
829
830static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
831 int idx, u64 offset, u64 length,
832 u32 status, int opnum)
833{
834 struct nfs4_ff_layout_mirror *mirror;
835 int err;
836
837 mirror = FF_LAYOUT_COMP(lseg, idx);
838 err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
839 mirror, offset, length, status, opnum,
840 GFP_NOIO);
841 dprintk("%s: err %d op %d status %u\n", __func__, err, opnum, status);
842}
843
844/* NFS_PROTO call done callback routines */
845
846static int ff_layout_read_done_cb(struct rpc_task *task,
847 struct nfs_pgio_header *hdr)
848{
849 struct inode *inode;
850 int err;
851
852 trace_nfs4_pnfs_read(hdr, task->tk_status);
853 if (task->tk_status == -ETIMEDOUT && !hdr->res.op_status)
854 hdr->res.op_status = NFS4ERR_NXIO;
855 if (task->tk_status < 0 && hdr->res.op_status)
856 ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx,
857 hdr->args.offset, hdr->args.count,
858 hdr->res.op_status, OP_READ);
859 err = ff_layout_async_handle_error(task, hdr->args.context->state,
860 hdr->ds_clp, hdr->lseg,
861 hdr->pgio_mirror_idx);
862
863 switch (err) {
864 case -NFS4ERR_RESET_TO_PNFS:
865 set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
866 &hdr->lseg->pls_layout->plh_flags);
867 pnfs_read_resend_pnfs(hdr);
868 return task->tk_status;
869 case -NFS4ERR_RESET_TO_MDS:
870 inode = hdr->lseg->pls_layout->plh_inode;
871 pnfs_error_mark_layout_for_return(inode, hdr->lseg);
872 ff_layout_reset_read(hdr);
873 return task->tk_status;
874 case -EAGAIN:
875 rpc_restart_call_prepare(task);
876 return -EAGAIN;
877 }
878
879 return 0;
880}
881
882/*
883 * We reference the rpc_cred of the first WRITE that triggers the need for
884 * a LAYOUTCOMMIT, and use it to send the layoutcommit compound.
885 * RFC 5661 is not clear about which credential should be used.
886 *
887 * The flexfile client should treat a FILE_SYNC reply from the DS as DATA_SYNC,
888 * so to follow http://www.rfc-editor.org/errata_search.php?rfc=5661&eid=2751
889 * we always send a layoutcommit after DS writes.
890 */
891static void
892ff_layout_set_layoutcommit(struct nfs_pgio_header *hdr)
893{
894 pnfs_set_layoutcommit(hdr);
895 dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino,
896 (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb);
897}
898
899static bool
900ff_layout_reset_to_mds(struct pnfs_layout_segment *lseg, int idx)
901{
902 /* No mirroring for now */
903 struct nfs4_deviceid_node *node = FF_LAYOUT_DEVID_NODE(lseg, idx);
904
905 return ff_layout_test_devid_unavailable(node);
906}
907
908static int ff_layout_read_prepare_common(struct rpc_task *task,
909 struct nfs_pgio_header *hdr)
910{
911 if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
912 rpc_exit(task, -EIO);
913 return -EIO;
914 }
915 if (ff_layout_reset_to_mds(hdr->lseg, hdr->pgio_mirror_idx)) {
916 dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid);
917 if (ff_layout_has_available_ds(hdr->lseg))
918 pnfs_read_resend_pnfs(hdr);
919 else
920 ff_layout_reset_read(hdr);
921 rpc_exit(task, 0);
922 return -EAGAIN;
923 }
924 hdr->pgio_done_cb = ff_layout_read_done_cb;
925
926 return 0;
927}
928
929/*
930 * Call ops for the async read/write cases
931 * In the case of dense layouts, the offset needs to be reset to its
932 * original value.
933 */
934static void ff_layout_read_prepare_v3(struct rpc_task *task, void *data)
935{
936 struct nfs_pgio_header *hdr = data;
937
938 if (ff_layout_read_prepare_common(task, hdr))
939 return;
940
941 rpc_call_start(task);
942}
943
944static int ff_layout_setup_sequence(struct nfs_client *ds_clp,
945 struct nfs4_sequence_args *args,
946 struct nfs4_sequence_res *res,
947 struct rpc_task *task)
948{
949 if (ds_clp->cl_session)
950 return nfs41_setup_sequence(ds_clp->cl_session,
951 args,
952 res,
953 task);
954 return nfs40_setup_sequence(ds_clp->cl_slot_tbl,
955 args,
956 res,
957 task);
958}
959
960static void ff_layout_read_prepare_v4(struct rpc_task *task, void *data)
961{
962 struct nfs_pgio_header *hdr = data;
963
964 if (ff_layout_read_prepare_common(task, hdr))
965 return;
966
967 if (ff_layout_setup_sequence(hdr->ds_clp,
968 &hdr->args.seq_args,
969 &hdr->res.seq_res,
970 task))
971 return;
972
973 if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
974 hdr->args.lock_context, FMODE_READ) == -EIO)
975 rpc_exit(task, -EIO); /* lost lock, terminate I/O */
976}
977
978static void ff_layout_read_call_done(struct rpc_task *task, void *data)
979{
980 struct nfs_pgio_header *hdr = data;
981
982 dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
983
984 if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
985 task->tk_status == 0) {
986 nfs4_sequence_done(task, &hdr->res.seq_res);
987 return;
988 }
989
990 /* Note this may cause RPC to be resent */
991 hdr->mds_ops->rpc_call_done(task, hdr);
992}
993
994static void ff_layout_read_count_stats(struct rpc_task *task, void *data)
995{
996 struct nfs_pgio_header *hdr = data;
997
998 rpc_count_iostats_metrics(task,
999 &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_READ]);
1000}
1001
1002static int ff_layout_write_done_cb(struct rpc_task *task,
1003 struct nfs_pgio_header *hdr)
1004{
1005 struct inode *inode;
1006 int err;
1007
1008 trace_nfs4_pnfs_write(hdr, task->tk_status);
1009 if (task->tk_status == -ETIMEDOUT && !hdr->res.op_status)
1010 hdr->res.op_status = NFS4ERR_NXIO;
1011 if (task->tk_status < 0 && hdr->res.op_status)
1012 ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx,
1013 hdr->args.offset, hdr->args.count,
1014 hdr->res.op_status, OP_WRITE);
1015 err = ff_layout_async_handle_error(task, hdr->args.context->state,
1016 hdr->ds_clp, hdr->lseg,
1017 hdr->pgio_mirror_idx);
1018
1019 switch (err) {
1020 case -NFS4ERR_RESET_TO_PNFS:
1021 case -NFS4ERR_RESET_TO_MDS:
1022 inode = hdr->lseg->pls_layout->plh_inode;
1023 pnfs_error_mark_layout_for_return(inode, hdr->lseg);
1024 if (err == -NFS4ERR_RESET_TO_PNFS) {
1025 pnfs_set_retry_layoutget(hdr->lseg->pls_layout);
1026 ff_layout_reset_write(hdr, true);
1027 } else {
1028 pnfs_clear_retry_layoutget(hdr->lseg->pls_layout);
1029 ff_layout_reset_write(hdr, false);
1030 }
1031 return task->tk_status;
1032 case -EAGAIN:
1033 rpc_restart_call_prepare(task);
1034 return -EAGAIN;
1035 }
1036
1037 if (hdr->res.verf->committed == NFS_FILE_SYNC ||
1038 hdr->res.verf->committed == NFS_DATA_SYNC)
1039 ff_layout_set_layoutcommit(hdr);
1040
1041 return 0;
1042}
1043
1044static int ff_layout_commit_done_cb(struct rpc_task *task,
1045 struct nfs_commit_data *data)
1046{
1047 struct inode *inode;
1048 int err;
1049
1050 trace_nfs4_pnfs_commit_ds(data, task->tk_status);
1051 if (task->tk_status == -ETIMEDOUT && !data->res.op_status)
1052 data->res.op_status = NFS4ERR_NXIO;
1053 if (task->tk_status < 0 && data->res.op_status)
1054 ff_layout_io_track_ds_error(data->lseg, data->ds_commit_index,
1055 data->args.offset, data->args.count,
1056 data->res.op_status, OP_COMMIT);
1057 err = ff_layout_async_handle_error(task, NULL, data->ds_clp,
1058 data->lseg, data->ds_commit_index);
1059
1060 switch (err) {
1061 case -NFS4ERR_RESET_TO_PNFS:
1062 case -NFS4ERR_RESET_TO_MDS:
1063 inode = data->lseg->pls_layout->plh_inode;
1064 pnfs_error_mark_layout_for_return(inode, data->lseg);
1065 if (err == -NFS4ERR_RESET_TO_PNFS)
1066 pnfs_set_retry_layoutget(data->lseg->pls_layout);
1067 else
1068 pnfs_clear_retry_layoutget(data->lseg->pls_layout);
1069 pnfs_generic_prepare_to_resend_writes(data);
1070 return -EAGAIN;
1071 case -EAGAIN:
1072 rpc_restart_call_prepare(task);
1073 return -EAGAIN;
1074 }
1075
1076 if (data->verf.committed == NFS_UNSTABLE)
1077 pnfs_commit_set_layoutcommit(data);
1078
1079 return 0;
1080}
1081
1082static int ff_layout_write_prepare_common(struct rpc_task *task,
1083 struct nfs_pgio_header *hdr)
1084{
1085 if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
1086 rpc_exit(task, -EIO);
1087 return -EIO;
1088 }
1089
1090 if (ff_layout_reset_to_mds(hdr->lseg, hdr->pgio_mirror_idx)) {
1091 bool retry_pnfs;
1092
1093 retry_pnfs = ff_layout_has_available_ds(hdr->lseg);
1094 dprintk("%s task %u reset io to %s\n", __func__,
1095 task->tk_pid, retry_pnfs ? "pNFS" : "MDS");
1096 ff_layout_reset_write(hdr, retry_pnfs);
1097 rpc_exit(task, 0);
1098 return -EAGAIN;
1099 }
1100
1101 return 0;
1102}
1103
1104static void ff_layout_write_prepare_v3(struct rpc_task *task, void *data)
1105{
1106 struct nfs_pgio_header *hdr = data;
1107
1108 if (ff_layout_write_prepare_common(task, hdr))
1109 return;
1110
1111 rpc_call_start(task);
1112}
1113
1114static void ff_layout_write_prepare_v4(struct rpc_task *task, void *data)
1115{
1116 struct nfs_pgio_header *hdr = data;
1117
1118 if (ff_layout_write_prepare_common(task, hdr))
1119 return;
1120
1121 if (ff_layout_setup_sequence(hdr->ds_clp,
1122 &hdr->args.seq_args,
1123 &hdr->res.seq_res,
1124 task))
1125 return;
1126
1127 if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
1128 hdr->args.lock_context, FMODE_WRITE) == -EIO)
1129 rpc_exit(task, -EIO); /* lost lock, terminate I/O */
1130}
1131
1132static void ff_layout_write_call_done(struct rpc_task *task, void *data)
1133{
1134 struct nfs_pgio_header *hdr = data;
1135
1136 if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
1137 task->tk_status == 0) {
1138 nfs4_sequence_done(task, &hdr->res.seq_res);
1139 return;
1140 }
1141
1142 /* Note this may cause RPC to be resent */
1143 hdr->mds_ops->rpc_call_done(task, hdr);
1144}
1145
1146static void ff_layout_write_count_stats(struct rpc_task *task, void *data)
1147{
1148 struct nfs_pgio_header *hdr = data;
1149
1150 rpc_count_iostats_metrics(task,
1151 &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_WRITE]);
1152}
1153
1154static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data)
1155{
1156 rpc_call_start(task);
1157}
1158
1159static void ff_layout_commit_prepare_v4(struct rpc_task *task, void *data)
1160{
1161 struct nfs_commit_data *wdata = data;
1162
1163 ff_layout_setup_sequence(wdata->ds_clp,
1164 &wdata->args.seq_args,
1165 &wdata->res.seq_res,
1166 task);
1167}
1168
1169static void ff_layout_commit_count_stats(struct rpc_task *task, void *data)
1170{
1171 struct nfs_commit_data *cdata = data;
1172
1173 rpc_count_iostats_metrics(task,
1174 &NFS_CLIENT(cdata->inode)->cl_metrics[NFSPROC4_CLNT_COMMIT]);
1175}
1176
1177static const struct rpc_call_ops ff_layout_read_call_ops_v3 = {
1178 .rpc_call_prepare = ff_layout_read_prepare_v3,
1179 .rpc_call_done = ff_layout_read_call_done,
1180 .rpc_count_stats = ff_layout_read_count_stats,
1181 .rpc_release = pnfs_generic_rw_release,
1182};
1183
1184static const struct rpc_call_ops ff_layout_read_call_ops_v4 = {
1185 .rpc_call_prepare = ff_layout_read_prepare_v4,
1186 .rpc_call_done = ff_layout_read_call_done,
1187 .rpc_count_stats = ff_layout_read_count_stats,
1188 .rpc_release = pnfs_generic_rw_release,
1189};
1190
1191static const struct rpc_call_ops ff_layout_write_call_ops_v3 = {
1192 .rpc_call_prepare = ff_layout_write_prepare_v3,
1193 .rpc_call_done = ff_layout_write_call_done,
1194 .rpc_count_stats = ff_layout_write_count_stats,
1195 .rpc_release = pnfs_generic_rw_release,
1196};
1197
1198static const struct rpc_call_ops ff_layout_write_call_ops_v4 = {
1199 .rpc_call_prepare = ff_layout_write_prepare_v4,
1200 .rpc_call_done = ff_layout_write_call_done,
1201 .rpc_count_stats = ff_layout_write_count_stats,
1202 .rpc_release = pnfs_generic_rw_release,
1203};
1204
1205static const struct rpc_call_ops ff_layout_commit_call_ops_v3 = {
1206 .rpc_call_prepare = ff_layout_commit_prepare_v3,
1207 .rpc_call_done = pnfs_generic_write_commit_done,
1208 .rpc_count_stats = ff_layout_commit_count_stats,
1209 .rpc_release = pnfs_generic_commit_release,
1210};
1211
1212static const struct rpc_call_ops ff_layout_commit_call_ops_v4 = {
1213 .rpc_call_prepare = ff_layout_commit_prepare_v4,
1214 .rpc_call_done = pnfs_generic_write_commit_done,
1215 .rpc_count_stats = ff_layout_commit_count_stats,
1216 .rpc_release = pnfs_generic_commit_release,
1217};
1218
1219static enum pnfs_try_status
1220ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
1221{
1222 struct pnfs_layout_segment *lseg = hdr->lseg;
1223 struct nfs4_pnfs_ds *ds;
1224 struct rpc_clnt *ds_clnt;
1225 struct rpc_cred *ds_cred;
1226 loff_t offset = hdr->args.offset;
1227 u32 idx = hdr->pgio_mirror_idx;
1228 int vers;
1229 struct nfs_fh *fh;
1230
1231 dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n",
1232 __func__, hdr->inode->i_ino,
1233 hdr->args.pgbase, (size_t)hdr->args.count, offset);
1234
1235 ds = nfs4_ff_layout_prepare_ds(lseg, idx, false);
1236 if (!ds)
1237 goto out_failed;
1238
1239 ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp,
1240 hdr->inode);
1241 if (IS_ERR(ds_clnt))
1242 goto out_failed;
1243
1244 ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred);
1245 if (IS_ERR(ds_cred))
1246 goto out_failed;
1247
1248 vers = nfs4_ff_layout_ds_version(lseg, idx);
1249
1250 dprintk("%s USE DS: %s cl_count %d vers %d\n", __func__,
1251 ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count), vers);
1252
1253 atomic_inc(&ds->ds_clp->cl_count);
1254 hdr->ds_clp = ds->ds_clp;
1255 fh = nfs4_ff_layout_select_ds_fh(lseg, idx);
1256 if (fh)
1257 hdr->args.fh = fh;
1258
1259 /*
1260 * Note that if we ever decide to split across DSes,
1261 * then we may need to handle dense-like offsets.
1262 */
1263 hdr->args.offset = offset;
1264 hdr->mds_offset = offset;
1265
1266 /* Perform an asynchronous read to ds */
1267 nfs_initiate_pgio(ds_clnt, hdr, ds_cred, ds->ds_clp->rpc_ops,
1268 vers == 3 ? &ff_layout_read_call_ops_v3 :
1269 &ff_layout_read_call_ops_v4,
1270 0, RPC_TASK_SOFTCONN);
1271
1272 return PNFS_ATTEMPTED;
1273
1274out_failed:
1275 if (ff_layout_has_available_ds(lseg))
1276 return PNFS_TRY_AGAIN;
1277 return PNFS_NOT_ATTEMPTED;
1278}
1279
1280/* Perform async writes. */
1281static enum pnfs_try_status
1282ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
1283{
1284 struct pnfs_layout_segment *lseg = hdr->lseg;
1285 struct nfs4_pnfs_ds *ds;
1286 struct rpc_clnt *ds_clnt;
1287 struct rpc_cred *ds_cred;
1288 loff_t offset = hdr->args.offset;
1289 int vers;
1290 struct nfs_fh *fh;
1291 int idx = hdr->pgio_mirror_idx;
1292
1293 ds = nfs4_ff_layout_prepare_ds(lseg, idx, true);
1294 if (!ds)
1295 return PNFS_NOT_ATTEMPTED;
1296
1297 ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp,
1298 hdr->inode);
1299 if (IS_ERR(ds_clnt))
1300 return PNFS_NOT_ATTEMPTED;
1301
1302 ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred);
1303 if (IS_ERR(ds_cred))
1304 return PNFS_NOT_ATTEMPTED;
1305
1306 vers = nfs4_ff_layout_ds_version(lseg, idx);
1307
1308 dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s cl_count %d vers %d\n",
1309 __func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count,
1310 offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count),
1311 vers);
1312
1313 hdr->pgio_done_cb = ff_layout_write_done_cb;
1314 atomic_inc(&ds->ds_clp->cl_count);
1315 hdr->ds_clp = ds->ds_clp;
1316 hdr->ds_commit_idx = idx;
1317 fh = nfs4_ff_layout_select_ds_fh(lseg, idx);
1318 if (fh)
1319 hdr->args.fh = fh;
1320
1321 /*
1322 * Note that if we ever decide to split across DSes,
1323 * then we may need to handle dense-like offsets.
1324 */
1325 hdr->args.offset = offset;
1326
1327 /* Perform an asynchronous write */
1328 nfs_initiate_pgio(ds_clnt, hdr, ds_cred, ds->ds_clp->rpc_ops,
1329 vers == 3 ? &ff_layout_write_call_ops_v3 :
1330 &ff_layout_write_call_ops_v4,
1331 sync, RPC_TASK_SOFTCONN);
1332 return PNFS_ATTEMPTED;
1333}
1334
1335static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i)
1336{
1337 return i;
1338}
1339
1340static struct nfs_fh *
1341select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i)
1342{
1343 struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg);
1344
1345 /* FIXME: Assume that there is only one NFS version available
1346 * for the DS.
1347 */
1348 return &flseg->mirror_array[i]->fh_versions[0];
1349}
1350
1351static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
1352{
1353 struct pnfs_layout_segment *lseg = data->lseg;
1354 struct nfs4_pnfs_ds *ds;
1355 struct rpc_clnt *ds_clnt;
1356 struct rpc_cred *ds_cred;
1357 u32 idx;
1358 int vers;
1359 struct nfs_fh *fh;
1360
1361 idx = calc_ds_index_from_commit(lseg, data->ds_commit_index);
1362 ds = nfs4_ff_layout_prepare_ds(lseg, idx, true);
1363 if (!ds)
1364 goto out_err;
1365
1366 ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp,
1367 data->inode);
1368 if (IS_ERR(ds_clnt))
1369 goto out_err;
1370
1371 ds_cred = ff_layout_get_ds_cred(lseg, idx, data->cred);
1372 if (IS_ERR(ds_cred))
1373 goto out_err;
1374
1375 vers = nfs4_ff_layout_ds_version(lseg, idx);
1376
1377 dprintk("%s ino %lu, how %d cl_count %d vers %d\n", __func__,
1378 data->inode->i_ino, how, atomic_read(&ds->ds_clp->cl_count),
1379 vers);
1380 data->commit_done_cb = ff_layout_commit_done_cb;
1381 data->cred = ds_cred;
1382 atomic_inc(&ds->ds_clp->cl_count);
1383 data->ds_clp = ds->ds_clp;
1384 fh = select_ds_fh_from_commit(lseg, data->ds_commit_index);
1385 if (fh)
1386 data->args.fh = fh;
1387 return nfs_initiate_commit(ds_clnt, data, ds->ds_clp->rpc_ops,
1388 vers == 3 ? &ff_layout_commit_call_ops_v3 :
1389 &ff_layout_commit_call_ops_v4,
1390 how, RPC_TASK_SOFTCONN);
1391out_err:
1392 pnfs_generic_prepare_to_resend_writes(data);
1393 pnfs_generic_commit_release(data);
1394 return -EAGAIN;
1395}
1396
1397static int
1398ff_layout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
1399 int how, struct nfs_commit_info *cinfo)
1400{
1401 return pnfs_generic_commit_pagelist(inode, mds_pages, how, cinfo,
1402 ff_layout_initiate_commit);
1403}
1404
1405static struct pnfs_ds_commit_info *
1406ff_layout_get_ds_info(struct inode *inode)
1407{
1408 struct pnfs_layout_hdr *layout = NFS_I(inode)->layout;
1409
1410 if (layout == NULL)
1411 return NULL;
1412
1413 return &FF_LAYOUT_FROM_HDR(layout)->commit_info;
1414}
1415
1416static void
1417ff_layout_free_deviceid_node(struct nfs4_deviceid_node *d)
1418{
1419 nfs4_ff_layout_free_deviceid(container_of(d, struct nfs4_ff_layout_ds,
1420 id_node));
1421}
1422
1423static int ff_layout_encode_ioerr(struct nfs4_flexfile_layout *flo,
1424 struct xdr_stream *xdr,
1425 const struct nfs4_layoutreturn_args *args)
1426{
1427 struct pnfs_layout_hdr *hdr = &flo->generic_hdr;
1428 __be32 *start;
1429 int count = 0, ret = 0;
1430
1431 start = xdr_reserve_space(xdr, 4);
1432 if (unlikely(!start))
1433 return -E2BIG;
1434
1435	/* This assumes we always return _ALL_ layouts */
1436 spin_lock(&hdr->plh_inode->i_lock);
1437 ret = ff_layout_encode_ds_ioerr(flo, xdr, &count, &args->range);
1438 spin_unlock(&hdr->plh_inode->i_lock);
1439
1440 *start = cpu_to_be32(count);
1441
1442 return ret;
1443}
1444
1445/* report nothing for now */
1446static void ff_layout_encode_iostats(struct nfs4_flexfile_layout *flo,
1447 struct xdr_stream *xdr,
1448 const struct nfs4_layoutreturn_args *args)
1449{
1450 __be32 *p;
1451
1452 p = xdr_reserve_space(xdr, 4);
1453 if (likely(p))
1454 *p = cpu_to_be32(0);
1455}
1456
1457static struct nfs4_deviceid_node *
1458ff_layout_alloc_deviceid_node(struct nfs_server *server,
1459 struct pnfs_device *pdev, gfp_t gfp_flags)
1460{
1461 struct nfs4_ff_layout_ds *dsaddr;
1462
1463 dsaddr = nfs4_ff_alloc_deviceid_node(server, pdev, gfp_flags);
1464 if (!dsaddr)
1465 return NULL;
1466 return &dsaddr->id_node;
1467}
1468
1469static void
1470ff_layout_encode_layoutreturn(struct pnfs_layout_hdr *lo,
1471 struct xdr_stream *xdr,
1472 const struct nfs4_layoutreturn_args *args)
1473{
1474 struct nfs4_flexfile_layout *flo = FF_LAYOUT_FROM_HDR(lo);
1475 __be32 *start;
1476
1477 dprintk("%s: Begin\n", __func__);
1478 start = xdr_reserve_space(xdr, 4);
1479 BUG_ON(!start);
1480
1481 if (ff_layout_encode_ioerr(flo, xdr, args))
1482 goto out;
1483
1484 ff_layout_encode_iostats(flo, xdr, args);
1485out:
1486 *start = cpu_to_be32((xdr->p - start - 1) * 4);
1487 dprintk("%s: Return\n", __func__);
1488}
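The *start assignment above is the standard XDR back-patch idiom: reserve the 4-byte length word, encode the variable-size body, then compute the body length from the stream pointer. Because xdr->p and start are __be32 pointers, (xdr->p - start - 1) counts the 32-bit words written after the length word, and the * 4 converts words to bytes; a body of five words, for example, yields cpu_to_be32(20).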
1489
1490static struct pnfs_layoutdriver_type flexfilelayout_type = {
1491 .id = LAYOUT_FLEX_FILES,
1492 .name = "LAYOUT_FLEX_FILES",
1493 .owner = THIS_MODULE,
1494 .alloc_layout_hdr = ff_layout_alloc_layout_hdr,
1495 .free_layout_hdr = ff_layout_free_layout_hdr,
1496 .alloc_lseg = ff_layout_alloc_lseg,
1497 .free_lseg = ff_layout_free_lseg,
1498 .pg_read_ops = &ff_layout_pg_read_ops,
1499 .pg_write_ops = &ff_layout_pg_write_ops,
1500 .get_ds_info = ff_layout_get_ds_info,
1501	.free_deviceid_node	= ff_layout_free_deviceid_node,
1502 .mark_request_commit = pnfs_layout_mark_request_commit,
1503 .clear_request_commit = pnfs_generic_clear_request_commit,
1504 .scan_commit_lists = pnfs_generic_scan_commit_lists,
1505 .recover_commit_reqs = pnfs_generic_recover_commit_reqs,
1506 .commit_pagelist = ff_layout_commit_pagelist,
1507 .read_pagelist = ff_layout_read_pagelist,
1508 .write_pagelist = ff_layout_write_pagelist,
1509 .alloc_deviceid_node = ff_layout_alloc_deviceid_node,
1510 .encode_layoutreturn = ff_layout_encode_layoutreturn,
1511};
1512
1513static int __init nfs4flexfilelayout_init(void)
1514{
1515 printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Registering...\n",
1516 __func__);
1517 return pnfs_register_layoutdriver(&flexfilelayout_type);
1518}
1519
1520static void __exit nfs4flexfilelayout_exit(void)
1521{
1522 printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Unregistering...\n",
1523 __func__);
1524 pnfs_unregister_layoutdriver(&flexfilelayout_type);
1525}
1526
1527MODULE_ALIAS("nfs-layouttype4-4");
1528
1529MODULE_LICENSE("GPL");
1530MODULE_DESCRIPTION("The NFSv4 flexfile layout driver");
1531
1532module_init(nfs4flexfilelayout_init);
1533module_exit(nfs4flexfilelayout_exit);
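The "nfs-layouttype4-4" alias corresponds to the LAYOUT_FLEX_FILES layout type number (4), which lets the generic pNFS core demand-load this driver when a server advertises the flexfile layout. Roughly, the core does the following (a sketch of the request_module() call in fs/nfs/pnfs.c; details may differ):

	/* LAYOUT_NFSV4_1_MODULE_PREFIX is "nfs-layouttype4" */
	request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id);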
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h
new file mode 100644
index 000000000000..070f20445b2d
--- /dev/null
+++ b/fs/nfs/flexfilelayout/flexfilelayout.h
@@ -0,0 +1,155 @@
1/*
2 * NFSv4 flexfile layout driver data structures.
3 *
4 * Copyright (c) 2014, Primary Data, Inc. All rights reserved.
5 *
6 * Tao Peng <bergwolf@primarydata.com>
7 */
8
9#ifndef FS_NFS_NFS4FLEXFILELAYOUT_H
10#define FS_NFS_NFS4FLEXFILELAYOUT_H
11
12#include "../pnfs.h"
13
14/* XXX: Let's filter out insanely large mirror counts for now to avoid OOM
15 * due to network errors etc. */
16#define NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT 4096
17
18struct nfs4_ff_ds_version {
19 u32 version;
20 u32 minor_version;
21 u32 rsize;
22 u32 wsize;
23 bool tightly_coupled;
24};
25
26/* chained in global deviceid hlist */
27struct nfs4_ff_layout_ds {
28 struct nfs4_deviceid_node id_node;
29 u32 ds_versions_cnt;
30 struct nfs4_ff_ds_version *ds_versions;
31 struct nfs4_pnfs_ds *ds;
32};
33
34struct nfs4_ff_layout_ds_err {
35 struct list_head list; /* linked in mirror error_list */
36 u64 offset;
37 u64 length;
38 int status;
39 enum nfs_opnum4 opnum;
40 nfs4_stateid stateid;
41 struct nfs4_deviceid deviceid;
42};
43
44struct nfs4_ff_layout_mirror {
45 u32 ds_count;
46 u32 efficiency;
47 struct nfs4_ff_layout_ds *mirror_ds;
48 u32 fh_versions_cnt;
49 struct nfs_fh *fh_versions;
50 nfs4_stateid stateid;
51 struct nfs4_string user_name;
52 struct nfs4_string group_name;
53 u32 uid;
54 u32 gid;
55 struct rpc_cred *cred;
56 spinlock_t lock;
57};
58
59struct nfs4_ff_layout_segment {
60 struct pnfs_layout_segment generic_hdr;
61 u64 stripe_unit;
62 u32 mirror_array_cnt;
63 struct nfs4_ff_layout_mirror **mirror_array;
64};
65
66struct nfs4_flexfile_layout {
67 struct pnfs_layout_hdr generic_hdr;
68 struct pnfs_ds_commit_info commit_info;
69 struct list_head error_list; /* nfs4_ff_layout_ds_err */
70};
71
72static inline struct nfs4_flexfile_layout *
73FF_LAYOUT_FROM_HDR(struct pnfs_layout_hdr *lo)
74{
75 return container_of(lo, struct nfs4_flexfile_layout, generic_hdr);
76}
77
78static inline struct nfs4_ff_layout_segment *
79FF_LAYOUT_LSEG(struct pnfs_layout_segment *lseg)
80{
81 return container_of(lseg,
82 struct nfs4_ff_layout_segment,
83 generic_hdr);
84}
85
86static inline struct nfs4_deviceid_node *
87FF_LAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg, u32 idx)
88{
89 if (idx >= FF_LAYOUT_LSEG(lseg)->mirror_array_cnt ||
90 FF_LAYOUT_LSEG(lseg)->mirror_array[idx] == NULL ||
91 FF_LAYOUT_LSEG(lseg)->mirror_array[idx]->mirror_ds == NULL)
92 return NULL;
93 return &FF_LAYOUT_LSEG(lseg)->mirror_array[idx]->mirror_ds->id_node;
94}
95
96static inline struct nfs4_ff_layout_ds *
97FF_LAYOUT_MIRROR_DS(struct nfs4_deviceid_node *node)
98{
99 return container_of(node, struct nfs4_ff_layout_ds, id_node);
100}
101
102static inline struct nfs4_ff_layout_mirror *
103FF_LAYOUT_COMP(struct pnfs_layout_segment *lseg, u32 idx)
104{
105 if (idx >= FF_LAYOUT_LSEG(lseg)->mirror_array_cnt)
106 return NULL;
107 return FF_LAYOUT_LSEG(lseg)->mirror_array[idx];
108}
109
110static inline u32
111FF_LAYOUT_MIRROR_COUNT(struct pnfs_layout_segment *lseg)
112{
113 return FF_LAYOUT_LSEG(lseg)->mirror_array_cnt;
114}
115
116static inline bool
117ff_layout_test_devid_unavailable(struct nfs4_deviceid_node *node)
118{
119 return nfs4_test_deviceid_unavailable(node);
120}
121
122static inline int
123nfs4_ff_layout_ds_version(struct pnfs_layout_segment *lseg, u32 ds_idx)
124{
125 return FF_LAYOUT_COMP(lseg, ds_idx)->mirror_ds->ds_versions[0].version;
126}
127
128struct nfs4_ff_layout_ds *
129nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
130 gfp_t gfp_flags);
131void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds *mirror_ds);
132void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds *mirror_ds);
133int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
134 struct nfs4_ff_layout_mirror *mirror, u64 offset,
135 u64 length, int status, enum nfs_opnum4 opnum,
136 gfp_t gfp_flags);
137int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo,
138 struct xdr_stream *xdr, int *count,
139 const struct pnfs_layout_range *range);
140struct nfs_fh *
141nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx);
142
143struct nfs4_pnfs_ds *
144nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
145 bool fail_return);
146
147struct rpc_clnt *
148nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg,
149 u32 ds_idx,
150 struct nfs_client *ds_clp,
151 struct inode *inode);
152struct rpc_cred *ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg,
153 u32 ds_idx, struct rpc_cred *mdscred);
154bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg);
155#endif /* FS_NFS_NFS4FLEXFILELAYOUT_H */
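A minimal sketch of how the container_of() accessors above chain from a generic layout segment down to a data server (example_lookup_ds() is hypothetical and omits locking and refcounting):

	static struct nfs4_pnfs_ds *
	example_lookup_ds(struct pnfs_layout_segment *lseg, u32 idx)
	{
		struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, idx);

		if (!mirror || !mirror->mirror_ds)
			return NULL;
		return mirror->mirror_ds->ds;	/* may still be unconnected */
	}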
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
new file mode 100644
index 000000000000..e2c01f204a95
--- /dev/null
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -0,0 +1,552 @@
1/*
2 * Device operations for the pnfs nfs4 file layout driver.
3 *
4 * Copyright (c) 2014, Primary Data, Inc. All rights reserved.
5 *
6 * Tao Peng <bergwolf@primarydata.com>
7 */
8
9#include <linux/nfs_fs.h>
10#include <linux/vmalloc.h>
11#include <linux/module.h>
12#include <linux/sunrpc/addr.h>
13
14#include "../internal.h"
15#include "../nfs4session.h"
16#include "flexfilelayout.h"
17
18#define NFSDBG_FACILITY NFSDBG_PNFS_LD
19
20static unsigned int dataserver_timeo = NFS4_DEF_DS_TIMEO;
21static unsigned int dataserver_retrans = NFS4_DEF_DS_RETRANS;
22
23void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds *mirror_ds)
24{
25 if (mirror_ds)
26 nfs4_put_deviceid_node(&mirror_ds->id_node);
27}
28
29void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds *mirror_ds)
30{
31 nfs4_print_deviceid(&mirror_ds->id_node.deviceid);
32 nfs4_pnfs_ds_put(mirror_ds->ds);
33 kfree(mirror_ds);
34}
35
36/* Decode opaque device data and construct new_ds using it */
37struct nfs4_ff_layout_ds *
38nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
39 gfp_t gfp_flags)
40{
41 struct xdr_stream stream;
42 struct xdr_buf buf;
43 struct page *scratch;
44 struct list_head dsaddrs;
45 struct nfs4_pnfs_ds_addr *da;
46 struct nfs4_ff_layout_ds *new_ds = NULL;
47 struct nfs4_ff_ds_version *ds_versions = NULL;
48 u32 mp_count;
49 u32 version_count;
50 __be32 *p;
51 int i, ret = -ENOMEM;
52
53 /* set up xdr stream */
54 scratch = alloc_page(gfp_flags);
55 if (!scratch)
56 goto out_err;
57
58 new_ds = kzalloc(sizeof(struct nfs4_ff_layout_ds), gfp_flags);
59 if (!new_ds)
60 goto out_scratch;
61
62 nfs4_init_deviceid_node(&new_ds->id_node,
63 server,
64 &pdev->dev_id);
65 INIT_LIST_HEAD(&dsaddrs);
66
67 xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen);
68 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
69
70 /* multipath count */
71 p = xdr_inline_decode(&stream, 4);
72 if (unlikely(!p))
73 goto out_err_drain_dsaddrs;
74 mp_count = be32_to_cpup(p);
75 dprintk("%s: multipath ds count %d\n", __func__, mp_count);
76
77 for (i = 0; i < mp_count; i++) {
78 /* multipath ds */
79 da = nfs4_decode_mp_ds_addr(server->nfs_client->cl_net,
80 &stream, gfp_flags);
81 if (da)
82 list_add_tail(&da->da_node, &dsaddrs);
83 }
84 if (list_empty(&dsaddrs)) {
85 dprintk("%s: no suitable DS addresses found\n",
86 __func__);
87 ret = -ENOMEDIUM;
88 goto out_err_drain_dsaddrs;
89 }
90
91 /* version count */
92 p = xdr_inline_decode(&stream, 4);
93 if (unlikely(!p))
94 goto out_err_drain_dsaddrs;
95 version_count = be32_to_cpup(p);
96 dprintk("%s: version count %d\n", __func__, version_count);
97
98 ds_versions = kzalloc(version_count * sizeof(struct nfs4_ff_ds_version),
99 gfp_flags);
100 if (!ds_versions)
101 goto out_scratch;
102
103 for (i = 0; i < version_count; i++) {
104 /* 20 = version(4) + minor_version(4) + rsize(4) + wsize(4) +
105 * tightly_coupled(4) */
106 p = xdr_inline_decode(&stream, 20);
107 if (unlikely(!p))
108 goto out_err_drain_dsaddrs;
109 ds_versions[i].version = be32_to_cpup(p++);
110 ds_versions[i].minor_version = be32_to_cpup(p++);
111 ds_versions[i].rsize = nfs_block_size(be32_to_cpup(p++), NULL);
112 ds_versions[i].wsize = nfs_block_size(be32_to_cpup(p++), NULL);
113 ds_versions[i].tightly_coupled = be32_to_cpup(p);
114
115 if (ds_versions[i].rsize > NFS_MAX_FILE_IO_SIZE)
116 ds_versions[i].rsize = NFS_MAX_FILE_IO_SIZE;
117 if (ds_versions[i].wsize > NFS_MAX_FILE_IO_SIZE)
118 ds_versions[i].wsize = NFS_MAX_FILE_IO_SIZE;
119
120 if (ds_versions[i].version != 3 || ds_versions[i].minor_version != 0) {
121 dprintk("%s: [%d] unsupported ds version %d-%d\n", __func__,
122 i, ds_versions[i].version,
123 ds_versions[i].minor_version);
124 ret = -EPROTONOSUPPORT;
125 goto out_err_drain_dsaddrs;
126 }
127
128 dprintk("%s: [%d] vers %u minor_ver %u rsize %u wsize %u coupled %d\n",
129 __func__, i, ds_versions[i].version,
130 ds_versions[i].minor_version,
131 ds_versions[i].rsize,
132 ds_versions[i].wsize,
133 ds_versions[i].tightly_coupled);
134 }
135
136 new_ds->ds_versions = ds_versions;
137 new_ds->ds_versions_cnt = version_count;
138
139 new_ds->ds = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags);
140 if (!new_ds->ds)
141 goto out_err_drain_dsaddrs;
142
143 /* If DS was already in cache, free ds addrs */
144 while (!list_empty(&dsaddrs)) {
145 da = list_first_entry(&dsaddrs,
146 struct nfs4_pnfs_ds_addr,
147 da_node);
148 list_del_init(&da->da_node);
149 kfree(da->da_remotestr);
150 kfree(da);
151 }
152
153 __free_page(scratch);
154 return new_ds;
155
156out_err_drain_dsaddrs:
157 while (!list_empty(&dsaddrs)) {
158 da = list_first_entry(&dsaddrs, struct nfs4_pnfs_ds_addr,
159 da_node);
160 list_del_init(&da->da_node);
161 kfree(da->da_remotestr);
162 kfree(da);
163 }
164
165 kfree(ds_versions);
166out_scratch:
167 __free_page(scratch);
168out_err:
169 kfree(new_ds);
170
171 dprintk("%s ERROR: returning %d\n", __func__, ret);
172 return NULL;
173}
174
175static u64
176end_offset(u64 start, u64 len)
177{
178 u64 end;
179
180 end = start + len;
181 return end >= start ? end : NFS4_MAX_UINT64;
182}
183
184static void extend_ds_error(struct nfs4_ff_layout_ds_err *err,
185 u64 offset, u64 length)
186{
187 u64 end;
188
189 end = max_t(u64, end_offset(err->offset, err->length),
190 end_offset(offset, length));
191 err->offset = min_t(u64, err->offset, offset);
192 err->length = end - err->offset;
193}
194
195static bool ds_error_can_merge(struct nfs4_ff_layout_ds_err *err, u64 offset,
196 u64 length, int status, enum nfs_opnum4 opnum,
197 nfs4_stateid *stateid,
198 struct nfs4_deviceid *deviceid)
199{
200 return err->status == status && err->opnum == opnum &&
201 nfs4_stateid_match(&err->stateid, stateid) &&
202 !memcmp(&err->deviceid, deviceid, sizeof(*deviceid)) &&
203 end_offset(err->offset, err->length) >= offset &&
204 err->offset <= end_offset(offset, length);
205}
206
207static bool merge_ds_error(struct nfs4_ff_layout_ds_err *old,
208 struct nfs4_ff_layout_ds_err *new)
209{
210 if (!ds_error_can_merge(old, new->offset, new->length, new->status,
211 new->opnum, &new->stateid, &new->deviceid))
212 return false;
213
214 extend_ds_error(old, new->offset, new->length);
215 return true;
216}
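A worked example of the merge rules: given an existing error { offset = 0, length = 100 } and a new, otherwise-matching error { offset = 50, length = 200 }, the ranges overlap, so extend_ds_error() yields

	end    = max(end_offset(0, 100), end_offset(50, 200)) = 250
	offset = min(0, 50)                                   = 0
	length = 250 - 0                                      = 250

Adjacent ranges merge too, since ds_error_can_merge() compares the end points with >= and <=.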
217
218static bool
219ff_layout_add_ds_error_locked(struct nfs4_flexfile_layout *flo,
		struct nfs4_ff_layout_ds_err *dserr)
{
	struct nfs4_ff_layout_ds_err *err;

	list_for_each_entry(err, &flo->error_list, list) {
		if (merge_ds_error(err, dserr)) {
			return true;
		}
	}

	list_add(&dserr->list, &flo->error_list);
	return false;
}

static bool
ff_layout_update_ds_error(struct nfs4_flexfile_layout *flo, u64 offset,
			  u64 length, int status, enum nfs_opnum4 opnum,
			  nfs4_stateid *stateid, struct nfs4_deviceid *deviceid)
{
	bool found = false;
	struct nfs4_ff_layout_ds_err *err;

	list_for_each_entry(err, &flo->error_list, list) {
		if (ds_error_can_merge(err, offset, length, status, opnum,
				       stateid, deviceid)) {
			found = true;
			extend_ds_error(err, offset, length);
			break;
		}
	}

	return found;
}

int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
			     struct nfs4_ff_layout_mirror *mirror, u64 offset,
			     u64 length, int status, enum nfs_opnum4 opnum,
			     gfp_t gfp_flags)
{
	struct nfs4_ff_layout_ds_err *dserr;
	bool needfree;

	if (status == 0)
		return 0;

	if (mirror->mirror_ds == NULL)
		return -EINVAL;

	spin_lock(&flo->generic_hdr.plh_inode->i_lock);
	if (ff_layout_update_ds_error(flo, offset, length, status, opnum,
				      &mirror->stateid,
				      &mirror->mirror_ds->id_node.deviceid)) {
		spin_unlock(&flo->generic_hdr.plh_inode->i_lock);
		return 0;
	}
	spin_unlock(&flo->generic_hdr.plh_inode->i_lock);
	dserr = kmalloc(sizeof(*dserr), gfp_flags);
	if (!dserr)
		return -ENOMEM;

	INIT_LIST_HEAD(&dserr->list);
	dserr->offset = offset;
	dserr->length = length;
	dserr->status = status;
	dserr->opnum = opnum;
	nfs4_stateid_copy(&dserr->stateid, &mirror->stateid);
	memcpy(&dserr->deviceid, &mirror->mirror_ds->id_node.deviceid,
	       NFS4_DEVICEID4_SIZE);

	spin_lock(&flo->generic_hdr.plh_inode->i_lock);
	needfree = ff_layout_add_ds_error_locked(flo, dserr);
	spin_unlock(&flo->generic_hdr.plh_inode->i_lock);
	if (needfree)
		kfree(dserr);

	return 0;
}

/* currently we only support AUTH_NONE and AUTH_SYS */
static rpc_authflavor_t
nfs4_ff_layout_choose_authflavor(struct nfs4_ff_layout_mirror *mirror)
{
	if (mirror->uid == (u32)-1)
		return RPC_AUTH_NULL;
	return RPC_AUTH_UNIX;
}

/* fetch cred for NFSv3 DS */
static int ff_layout_update_mirror_cred(struct nfs4_ff_layout_mirror *mirror,
					struct nfs4_pnfs_ds *ds)
{
	if (ds->ds_clp && !mirror->cred &&
	    mirror->mirror_ds->ds_versions[0].version == 3) {
		struct rpc_auth *auth = ds->ds_clp->cl_rpcclient->cl_auth;
		struct rpc_cred *cred;
		struct auth_cred acred = {
			.uid = make_kuid(&init_user_ns, mirror->uid),
			.gid = make_kgid(&init_user_ns, mirror->gid),
		};

		/* AUTH_NULL ignores acred */
		cred = auth->au_ops->lookup_cred(auth, &acred, 0);
		if (IS_ERR(cred)) {
			dprintk("%s: lookup_cred failed with %ld\n",
				__func__, PTR_ERR(cred));
			return PTR_ERR(cred);
		} else {
			mirror->cred = cred;
		}
	}
	return 0;
}

struct nfs_fh *
nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx)
{
	struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, mirror_idx);
	struct nfs_fh *fh = NULL;
	struct nfs4_deviceid_node *devid;

	if (mirror == NULL || mirror->mirror_ds == NULL ||
	    mirror->mirror_ds->ds == NULL) {
		printk(KERN_ERR "NFS: %s: No data server for mirror offset index %d\n",
			__func__, mirror_idx);
		if (mirror && mirror->mirror_ds) {
			devid = &mirror->mirror_ds->id_node;
			pnfs_generic_mark_devid_invalid(devid);
		}
		goto out;
	}

	/* FIXME: For now assume there is only 1 version available for the DS */
	fh = &mirror->fh_versions[0];
out:
	return fh;
}

/* Upon return, either ds is connected, or ds is NULL */
struct nfs4_pnfs_ds *
nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
			  bool fail_return)
{
	struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx);
	struct nfs4_pnfs_ds *ds = NULL;
	struct nfs4_deviceid_node *devid;
	struct inode *ino = lseg->pls_layout->plh_inode;
	struct nfs_server *s = NFS_SERVER(ino);
	unsigned int max_payload;
	rpc_authflavor_t flavor;

	if (mirror == NULL || mirror->mirror_ds == NULL ||
	    mirror->mirror_ds->ds == NULL) {
		printk(KERN_ERR "NFS: %s: No data server for offset index %d\n",
			__func__, ds_idx);
		if (mirror && mirror->mirror_ds) {
			devid = &mirror->mirror_ds->id_node;
			pnfs_generic_mark_devid_invalid(devid);
		}
		goto out;
	}

	devid = &mirror->mirror_ds->id_node;
	if (ff_layout_test_devid_unavailable(devid))
		goto out;

	ds = mirror->mirror_ds->ds;
	/* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */
	smp_rmb();
	if (ds->ds_clp)
		goto out;

	flavor = nfs4_ff_layout_choose_authflavor(mirror);

	/* FIXME: For now we assume the server sent only one version of NFS
	 * to use for the DS.
	 */
	nfs4_pnfs_ds_connect(s, ds, devid, dataserver_timeo,
			     dataserver_retrans,
			     mirror->mirror_ds->ds_versions[0].version,
			     mirror->mirror_ds->ds_versions[0].minor_version,
			     flavor);

	/* connect success, check rsize/wsize limit */
	if (ds->ds_clp) {
		max_payload =
			nfs_block_size(rpc_max_payload(ds->ds_clp->cl_rpcclient),
				       NULL);
		if (mirror->mirror_ds->ds_versions[0].rsize > max_payload)
			mirror->mirror_ds->ds_versions[0].rsize = max_payload;
		if (mirror->mirror_ds->ds_versions[0].wsize > max_payload)
			mirror->mirror_ds->ds_versions[0].wsize = max_payload;
	} else {
		ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
					 mirror, lseg->pls_range.offset,
					 lseg->pls_range.length, NFS4ERR_NXIO,
					 OP_ILLEGAL, GFP_NOIO);
		if (fail_return) {
			pnfs_error_mark_layout_for_return(ino, lseg);
			if (ff_layout_has_available_ds(lseg))
				pnfs_set_retry_layoutget(lseg->pls_layout);
			else
				pnfs_clear_retry_layoutget(lseg->pls_layout);

		} else {
			if (ff_layout_has_available_ds(lseg))
				set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
					&lseg->pls_layout->plh_flags);
			else {
				pnfs_error_mark_layout_for_return(ino, lseg);
				pnfs_clear_retry_layoutget(lseg->pls_layout);
			}
		}
	}

	if (ff_layout_update_mirror_cred(mirror, ds))
		ds = NULL;
out:
	return ds;
}

struct rpc_cred *
ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, u32 ds_idx,
		      struct rpc_cred *mdscred)
{
	struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx);
	struct rpc_cred *cred = ERR_PTR(-EINVAL);

	if (!nfs4_ff_layout_prepare_ds(lseg, ds_idx, true))
		goto out;

	if (mirror && mirror->cred)
		cred = mirror->cred;
	else
		cred = mdscred;
out:
	return cred;
}

458/**
459* Find or create a DS rpc client with th MDS server rpc client auth flavor
460* in the nfs_client cl_ds_clients list.
461*/
struct rpc_clnt *
nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg, u32 ds_idx,
				 struct nfs_client *ds_clp, struct inode *inode)
{
	struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx);

	switch (mirror->mirror_ds->ds_versions[0].version) {
	case 3:
		/* For NFSv3 DS, flavor is set when creating DS connections */
		return ds_clp->cl_rpcclient;
	case 4:
		return nfs4_find_or_create_ds_client(ds_clp, inode);
	default:
		BUG();
	}
}

static bool is_range_intersecting(u64 offset1, u64 length1,
				  u64 offset2, u64 length2)
{
	u64 end1 = end_offset(offset1, length1);
	u64 end2 = end_offset(offset2, length2);

	return (end1 == NFS4_MAX_UINT64 || end1 > offset2) &&
	       (end2 == NFS4_MAX_UINT64 || end2 > offset1);
}

/* called with inode i_lock held */
int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo,
			      struct xdr_stream *xdr, int *count,
			      const struct pnfs_layout_range *range)
{
	struct nfs4_ff_layout_ds_err *err, *n;
	__be32 *p;

	list_for_each_entry_safe(err, n, &flo->error_list, list) {
		if (!is_range_intersecting(err->offset, err->length,
					   range->offset, range->length))
			continue;
		/* offset(8) + length(8) + stateid(NFS4_STATEID_SIZE)
		 * + deviceid(NFS4_DEVICEID4_SIZE) + status(4) + opnum(4)
		 */
		p = xdr_reserve_space(xdr,
				24 + NFS4_STATEID_SIZE + NFS4_DEVICEID4_SIZE);
		if (unlikely(!p))
			return -ENOBUFS;
		p = xdr_encode_hyper(p, err->offset);
		p = xdr_encode_hyper(p, err->length);
		p = xdr_encode_opaque_fixed(p, &err->stateid,
					    NFS4_STATEID_SIZE);
		p = xdr_encode_opaque_fixed(p, &err->deviceid,
					    NFS4_DEVICEID4_SIZE);
		*p++ = cpu_to_be32(err->status);
		*p++ = cpu_to_be32(err->opnum);
		*count += 1;
		list_del(&err->list);
		dprintk("%s: offset %llu length %llu status %d op %d count %d\n",
			__func__, err->offset, err->length, err->status,
			err->opnum, *count);
		kfree(err);
	}

	return 0;
}

bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg)
{
	struct nfs4_ff_layout_mirror *mirror;
	struct nfs4_deviceid_node *devid;
	int idx;

	for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) {
		mirror = FF_LAYOUT_COMP(lseg, idx);
		if (mirror && mirror->mirror_ds) {
			devid = &mirror->mirror_ds->id_node;
			if (!ff_layout_test_devid_unavailable(devid))
				return true;
		}
	}

	return false;
}

module_param(dataserver_retrans, uint, 0644);
MODULE_PARM_DESC(dataserver_retrans, "The number of times the NFSv4.1 client "
		 "retries a request before it attempts further "
		 "recovery action.");
module_param(dataserver_timeo, uint, 0644);
MODULE_PARM_DESC(dataserver_timeo, "The time (in tenths of a second) the "
		 "NFSv4.1 client waits for a response from a "
		 "data server before it retries an NFS request.");
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 2f5db844c172..857e2a99acc8 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -152,7 +152,7 @@ void nfs_fattr_map_and_free_names(struct nfs_server *server, struct nfs_fattr *f
 	nfs_fattr_free_group_name(fattr);
 }
 
-static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res)
+int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res)
 {
 	unsigned long val;
 	char buf[16];
@@ -166,6 +166,7 @@ static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *re
 	*res = val;
 	return 1;
 }
+EXPORT_SYMBOL_GPL(nfs_map_string_to_numeric);
 
 static int nfs_map_numeric_to_string(__u32 id, char *buf, size_t buflen)
 {
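The hunk above un-statics and exports nfs_map_string_to_numeric() so other NFS modules can reuse it. A hedged sketch of the call pattern (the wrapper and its fallback value are illustrative):

/*
 * nfs_map_string_to_numeric() returns 1 and fills *id when the name is
 * a plain decimal string, 0 otherwise.
 */
static u32 example_parse_numeric_owner(const char *name, size_t namelen)
{
	__u32 id;

	if (nfs_map_string_to_numeric(name, namelen, &id))
		return id;
	return (u32)-1;	/* sentinel: not a numeric owner string */
}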
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 4bffe637ea32..83107be3dd01 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -352,8 +352,9 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
 
 	nfs_attr_check_mountpoint(sb, fattr);
 
-	if (((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0) &&
-	    !nfs_attr_use_mounted_on_fileid(fattr))
+	if (nfs_attr_use_mounted_on_fileid(fattr))
+		fattr->fileid = fattr->mounted_on_fileid;
+	else if ((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0)
 		goto out_no_inode;
 	if ((fattr->valid & NFS_ATTR_FATTR_TYPE) == 0)
 		goto out_no_inode;
@@ -387,7 +388,6 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
 	if (S_ISREG(inode->i_mode)) {
 		inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops;
 		inode->i_data.a_ops = &nfs_file_aops;
-		inode->i_data.backing_dev_info = &NFS_SB(sb)->backing_dev_info;
 	} else if (S_ISDIR(inode->i_mode)) {
 		inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops;
 		inode->i_fop = &nfs_dir_operations;
@@ -506,10 +506,15 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
 		attr->ia_valid &= ~ATTR_MODE;
 
 	if (attr->ia_valid & ATTR_SIZE) {
+		loff_t i_size;
+
 		BUG_ON(!S_ISREG(inode->i_mode));
 
-		if (attr->ia_size == i_size_read(inode))
+		i_size = i_size_read(inode);
+		if (attr->ia_size == i_size)
 			attr->ia_valid &= ~ATTR_SIZE;
+		else if (attr->ia_size < i_size && IS_SWAPFILE(inode))
+			return -ETXTBSY;
 	}
 
 	/* Optimization: if the end result is no change, don't RPC */
@@ -1770,7 +1775,6 @@ static inline void nfs4_init_once(struct nfs_inode *nfsi)
 #if IS_ENABLED(CONFIG_NFS_V4)
 	INIT_LIST_HEAD(&nfsi->open_states);
 	nfsi->delegation = NULL;
-	nfsi->delegation_state = 0;
 	init_rwsem(&nfsi->rwsem);
 	nfsi->layout = NULL;
 #endif
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index efaa31c70fbe..b802fb3a2d99 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -6,6 +6,7 @@
 #include <linux/mount.h>
 #include <linux/security.h>
 #include <linux/crc32.h>
+#include <linux/nfs_page.h>
 
 #define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS)
 
@@ -31,8 +32,6 @@ static inline int nfs_attr_use_mounted_on_fileid(struct nfs_fattr *fattr)
 	    (((fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT) == 0) &&
 	     ((fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) == 0)))
 		return 0;
-
-	fattr->fileid = fattr->mounted_on_fileid;
 	return 1;
 }
 
@@ -189,9 +188,15 @@ extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
 			const struct sockaddr *ds_addr,
 			int ds_addrlen, int ds_proto,
 			unsigned int ds_timeo,
-			unsigned int ds_retrans);
+			unsigned int ds_retrans,
+			u32 minor_version,
+			rpc_authflavor_t au_flavor);
 extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *,
 			struct inode *);
+extern struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp,
+			const struct sockaddr *ds_addr, int ds_addrlen,
+			int ds_proto, unsigned int ds_timeo,
+			unsigned int ds_retrans, rpc_authflavor_t au_flavor);
 #ifdef CONFIG_PROC_FS
 extern int __init nfs_fs_proc_init(void);
 extern void nfs_fs_proc_exit(void);
@@ -244,9 +249,12 @@ struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *);
 void nfs_pgio_header_free(struct nfs_pgio_header *);
 void nfs_pgio_data_destroy(struct nfs_pgio_header *);
 int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *);
-int nfs_initiate_pgio(struct rpc_clnt *, struct nfs_pgio_header *,
-		      const struct rpc_call_ops *, int, int);
+int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
+		      struct rpc_cred *cred, const struct nfs_rpc_ops *rpc_ops,
+		      const struct rpc_call_ops *call_ops, int how, int flags);
 void nfs_free_request(struct nfs_page *req);
+struct nfs_pgio_mirror *
+nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc);
 
 static inline void nfs_iocounter_init(struct nfs_io_counter *c)
 {
@@ -254,6 +262,12 @@ static inline void nfs_iocounter_init(struct nfs_io_counter *c)
 	atomic_set(&c->io_count, 0);
 }
 
+static inline bool nfs_pgio_has_mirroring(struct nfs_pageio_descriptor *desc)
+{
+	WARN_ON_ONCE(desc->pg_mirror_count < 1);
+	return desc->pg_mirror_count > 1;
+}
+
 /* nfs2xdr.c */
 extern struct rpc_procinfo nfs_procedures[];
 extern int nfs2_decode_dirent(struct xdr_stream *,
@@ -377,7 +391,7 @@ extern struct rpc_stat nfs_rpcstat;
 
 extern int __init register_nfs_fs(void);
 extern void __exit unregister_nfs_fs(void);
-extern void nfs_sb_active(struct super_block *sb);
+extern bool nfs_sb_active(struct super_block *sb);
 extern void nfs_sb_deactive(struct super_block *sb);
 
 /* namespace.c */
@@ -416,7 +430,6 @@ int nfs_show_options(struct seq_file *, struct dentry *);
 int nfs_show_devname(struct seq_file *, struct dentry *);
 int nfs_show_path(struct seq_file *, struct dentry *);
 int nfs_show_stats(struct seq_file *, struct dentry *);
-void nfs_put_super(struct super_block *);
 int nfs_remount(struct super_block *sb, int *flags, char *raw_data);
 
 /* write.c */
@@ -429,6 +442,7 @@ extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
 extern void nfs_commit_prepare(struct rpc_task *task, void *calldata);
 extern int nfs_initiate_commit(struct rpc_clnt *clnt,
 			       struct nfs_commit_data *data,
+			       const struct nfs_rpc_ops *nfs_ops,
 			       const struct rpc_call_ops *call_ops,
 			       int how, int flags);
 extern void nfs_init_commit(struct nfs_commit_data *data,
@@ -442,13 +456,15 @@ int nfs_scan_commit(struct inode *inode, struct list_head *dst,
 		    struct nfs_commit_info *cinfo);
 void nfs_mark_request_commit(struct nfs_page *req,
 			     struct pnfs_layout_segment *lseg,
-			     struct nfs_commit_info *cinfo);
+			     struct nfs_commit_info *cinfo,
+			     u32 ds_commit_idx);
 int nfs_write_need_commit(struct nfs_pgio_header *);
 int nfs_generic_commit_list(struct inode *inode, struct list_head *head,
 			    int how, struct nfs_commit_info *cinfo);
 void nfs_retry_commit(struct list_head *page_list,
 		      struct pnfs_layout_segment *lseg,
-		      struct nfs_commit_info *cinfo);
+		      struct nfs_commit_info *cinfo,
+		      u32 ds_commit_idx);
 void nfs_commitdata_release(struct nfs_commit_data *data);
 void nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst,
 				 struct nfs_commit_info *cinfo);
@@ -459,6 +475,7 @@ void nfs_init_cinfo(struct nfs_commit_info *cinfo,
 		    struct nfs_direct_req *dreq);
 int nfs_key_timeout_notify(struct file *filp, struct inode *inode);
 bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx);
+void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio);
 
 #ifdef CONFIG_MIGRATION
 extern int nfs_migrate_page(struct address_space *,
@@ -482,6 +499,7 @@ static inline void nfs_inode_dio_wait(struct inode *inode)
 	inode_dio_wait(inode);
 }
 extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
+extern void nfs_direct_set_resched_writes(struct nfs_direct_req *dreq);
 
 /* nfs4proc.c */
 extern void __nfs4_read_done_cb(struct nfs_pgio_header *);
@@ -495,6 +513,26 @@ extern int nfs41_walk_client_list(struct nfs_client *clp,
 				  struct nfs_client **result,
 				  struct rpc_cred *cred);
 
+static inline struct inode *nfs_igrab_and_active(struct inode *inode)
+{
+	inode = igrab(inode);
+	if (inode != NULL && !nfs_sb_active(inode->i_sb)) {
+		iput(inode);
+		inode = NULL;
+	}
+	return inode;
+}
+
+static inline void nfs_iput_and_deactive(struct inode *inode)
+{
+	if (inode != NULL) {
+		struct super_block *sb = inode->i_sb;
+
+		iput(inode);
+		nfs_sb_deactive(sb);
+	}
+}
+
 /*
  * Determine the device name as a string
  */
@@ -560,6 +598,19 @@ void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize)
 }
 
 /*
+ * Record the page as unstable and mark its inode as dirty.
+ */
+static inline
+void nfs_mark_page_unstable(struct page *page)
+{
+	struct inode *inode = page_file_mapping(page)->host;
+
+	inc_zone_page_state(page, NR_UNSTABLE_NFS);
+	inc_bdi_stat(inode_to_bdi(inode), BDI_RECLAIMABLE);
+	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+}
+
+/*
  * Determine the number of bytes of data the page contains
  */
 static inline
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 5f61b83f4a1c..b4e03ed8599d 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -481,7 +481,8 @@ out_overflow:
  *		void;
  *	};
  */
-static int decode_attrstat(struct xdr_stream *xdr, struct nfs_fattr *result)
+static int decode_attrstat(struct xdr_stream *xdr, struct nfs_fattr *result,
+			   __u32 *op_status)
 {
 	enum nfs_stat status;
 	int error;
@@ -489,6 +490,8 @@ static int decode_attrstat(struct xdr_stream *xdr, struct nfs_fattr *result)
 	error = decode_stat(xdr, &status);
 	if (unlikely(error))
 		goto out;
+	if (op_status)
+		*op_status = status;
 	if (status != NFS_OK)
 		goto out_default;
 	error = decode_fattr(xdr, result);
@@ -808,7 +811,7 @@ out_default:
 static int nfs2_xdr_dec_attrstat(struct rpc_rqst *req, struct xdr_stream *xdr,
				 struct nfs_fattr *result)
 {
-	return decode_attrstat(xdr, result);
+	return decode_attrstat(xdr, result, NULL);
 }
 
 static int nfs2_xdr_dec_diropres(struct rpc_rqst *req, struct xdr_stream *xdr,
@@ -865,6 +868,7 @@ static int nfs2_xdr_dec_readres(struct rpc_rqst *req, struct xdr_stream *xdr,
 	error = decode_stat(xdr, &status);
 	if (unlikely(error))
 		goto out;
+	result->op_status = status;
 	if (status != NFS_OK)
 		goto out_default;
 	error = decode_fattr(xdr, result->fattr);
@@ -882,7 +886,7 @@ static int nfs2_xdr_dec_writeres(struct rpc_rqst *req, struct xdr_stream *xdr,
 {
 	/* All NFSv2 writes are "file sync" writes */
 	result->verf->committed = NFS_FILE_SYNC;
-	return decode_attrstat(xdr, result->fattr);
+	return decode_attrstat(xdr, result->fattr, &result->op_status);
 }
 
 /**
diff --git a/fs/nfs/nfs3_fs.h b/fs/nfs/nfs3_fs.h
index 333ae4068506..e134d6548ab7 100644
--- a/fs/nfs/nfs3_fs.h
+++ b/fs/nfs/nfs3_fs.h
@@ -30,5 +30,7 @@ struct nfs_server *nfs3_create_server(struct nfs_mount_info *, struct nfs_subver
 struct nfs_server *nfs3_clone_server(struct nfs_server *, struct nfs_fh *,
				     struct nfs_fattr *, rpc_authflavor_t);
 
+/* nfs3super.c */
+extern struct nfs_subversion nfs_v3;
 
 #endif /* __LINUX_FS_NFS_NFS3_FS_H */
diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c
index 8c1b437c5403..9e9fa347a948 100644
--- a/fs/nfs/nfs3client.c
+++ b/fs/nfs/nfs3client.c
@@ -1,5 +1,6 @@
 #include <linux/nfs_fs.h>
 #include <linux/nfs_mount.h>
+#include <linux/sunrpc/addr.h>
 #include "internal.h"
 #include "nfs3_fs.h"
 
@@ -64,3 +65,43 @@ struct nfs_server *nfs3_clone_server(struct nfs_server *source,
		nfs_init_server_aclclient(server);
	return server;
 }
+
+/*
+ * Set up a pNFS Data Server client over NFSv3.
+ *
+ * Return any existing nfs_client that matches server address,port,version
+ * and minorversion.
+ *
+ * For a new nfs_client, use a soft mount (default), a low retrans and a
+ * low timeout interval so that if a connection is lost, we retry through
+ * the MDS.
+ */
+struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp,
+		const struct sockaddr *ds_addr, int ds_addrlen,
+		int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans,
+		rpc_authflavor_t au_flavor)
+{
+	struct nfs_client_initdata cl_init = {
+		.addr = ds_addr,
+		.addrlen = ds_addrlen,
+		.nfs_mod = &nfs_v3,
+		.proto = ds_proto,
+		.net = mds_clp->cl_net,
+	};
+	struct rpc_timeout ds_timeout;
+	struct nfs_client *clp;
+	char buf[INET6_ADDRSTRLEN + 1];
+
+	/* fake a hostname because lockd wants it */
+	if (rpc_ntop(ds_addr, buf, sizeof(buf)) <= 0)
+		return ERR_PTR(-EINVAL);
+	cl_init.hostname = buf;
+
+	/* Use the MDS nfs_client cl_ipaddr. */
+	nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans);
+	clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr,
+			     au_flavor);
+
+	return clp;
+}
+EXPORT_SYMBOL_GPL(nfs3_set_ds_client);
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 524f9f837408..78e557c3ab87 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -800,6 +800,9 @@ static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
 {
	struct inode *inode = hdr->inode;
 
+	if (hdr->pgio_done_cb != NULL)
+		return hdr->pgio_done_cb(task, hdr);
+
	if (nfs3_async_handle_jukebox(task, inode))
		return -EAGAIN;
 
@@ -825,6 +828,9 @@ static int nfs3_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
 {
	struct inode *inode = hdr->inode;
 
+	if (hdr->pgio_done_cb != NULL)
+		return hdr->pgio_done_cb(task, hdr);
+
	if (nfs3_async_handle_jukebox(task, inode))
		return -EAGAIN;
	if (task->tk_status >= 0)
@@ -845,6 +851,9 @@ static void nfs3_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commi
 
 static int nfs3_commit_done(struct rpc_task *task, struct nfs_commit_data *data)
 {
+	if (data->commit_done_cb != NULL)
+		return data->commit_done_cb(task, data);
+
	if (nfs3_async_handle_jukebox(task, data->inode))
		return -EAGAIN;
	nfs_refresh_inode(data->inode, data->res.fattr);
diff --git a/fs/nfs/nfs3super.c b/fs/nfs/nfs3super.c
index 6af29c2da352..5c4394e4656b 100644
--- a/fs/nfs/nfs3super.c
+++ b/fs/nfs/nfs3super.c
@@ -7,7 +7,7 @@
 #include "nfs3_fs.h"
 #include "nfs.h"
 
-static struct nfs_subversion nfs_v3 = {
+struct nfs_subversion nfs_v3 = {
	.owner = THIS_MODULE,
	.nfs_fs = &nfs_fs_type,
	.rpc_vers = &nfs_version3,
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 8f4cbe7f4aa8..2a932fdc57cb 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -1636,6 +1636,7 @@ static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr,
	error = decode_post_op_attr(xdr, result->fattr);
	if (unlikely(error))
		goto out;
+	result->op_status = status;
	if (status != NFS3_OK)
		goto out_status;
	error = decode_read3resok(xdr, result);
@@ -1708,6 +1709,7 @@ static int nfs3_xdr_dec_write3res(struct rpc_rqst *req, struct xdr_stream *xdr,
	error = decode_wcc_data(xdr, result->fattr);
	if (unlikely(error))
		goto out;
+	result->op_status = status;
	if (status != NFS3_OK)
		goto out_status;
	error = decode_write3resok(xdr, result);
@@ -2323,6 +2325,7 @@ static int nfs3_xdr_dec_commit3res(struct rpc_rqst *req,
	error = decode_wcc_data(xdr, result->fattr);
	if (unlikely(error))
		goto out;
+	result->op_status = status;
	if (status != NFS3_OK)
		goto out_status;
	error = decode_writeverf3(xdr, &result->verf->verifier);
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index a08178764cf9..fdef424b0cd3 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -44,6 +44,7 @@ enum nfs4_client_state {
 #define NFS4_RENEW_TIMEOUT		0x01
 #define NFS4_RENEW_DELEGATION_CB	0x02
 
+struct nfs_seqid_counter;
 struct nfs4_minor_version_ops {
	u32	minor_version;
	unsigned init_caps;
@@ -56,6 +57,8 @@ struct nfs4_minor_version_ops {
			struct nfs_fsinfo *);
	void	(*free_lock_state)(struct nfs_server *,
			struct nfs4_lock_state *);
+	struct nfs_seqid *
+		(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
	const struct rpc_call_ops *call_sync_ops;
	const struct nfs4_state_recovery_ops *reboot_recovery_ops;
	const struct nfs4_state_recovery_ops *nograce_recovery_ops;
@@ -443,6 +446,12 @@ extern void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid);
 extern void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid);
 extern void nfs_release_seqid(struct nfs_seqid *seqid);
 extern void nfs_free_seqid(struct nfs_seqid *seqid);
+extern int nfs40_setup_sequence(struct nfs4_slot_table *tbl,
+				struct nfs4_sequence_args *args,
+				struct nfs4_sequence_res *res,
+				struct rpc_task *task);
+extern int nfs4_sequence_done(struct rpc_task *task,
+			      struct nfs4_sequence_res *res);
 
 extern void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp);
 
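The new alloc_seqid op lets each minor version decide how (and whether) to allocate open/lock seqids; later hunks in this series switch callers from NULL checks to IS_ERR(). A hedged sketch of the calling convention (the wrapper itself is illustrative):

static struct nfs_seqid *example_alloc_seqid(struct nfs_server *server,
					     struct nfs_seqid_counter *counter)
{
	struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t) =
		server->nfs_client->cl_mvops->alloc_seqid;

	/* NFSv4.1 is expected to return an ERR_PTR rather than allocate */
	return alloc_seqid(counter, GFP_KERNEL);
}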
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 03311259b0c4..8646af9b11d2 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -228,6 +228,7 @@ static void nfs4_shutdown_client(struct nfs_client *clp)
	kfree(clp->cl_serverowner);
	kfree(clp->cl_serverscope);
	kfree(clp->cl_implid);
+	kfree(clp->cl_owner_id);
 }
 
 void nfs4_free_client(struct nfs_client *clp)
@@ -452,6 +453,14 @@ static void nfs4_swap_callback_idents(struct nfs_client *keep,
	spin_unlock(&nn->nfs_client_lock);
 }
 
+static bool nfs4_match_client_owner_id(const struct nfs_client *clp1,
+		const struct nfs_client *clp2)
+{
+	if (clp1->cl_owner_id == NULL || clp2->cl_owner_id == NULL)
+		return true;
+	return strcmp(clp1->cl_owner_id, clp2->cl_owner_id) == 0;
+}
+
 /**
  * nfs40_walk_client_list - Find server that recognizes a client ID
  *
@@ -483,9 +492,6 @@ int nfs40_walk_client_list(struct nfs_client *new,
		if (pos->rpc_ops != new->rpc_ops)
			continue;
 
-		if (pos->cl_proto != new->cl_proto)
-			continue;
-
		if (pos->cl_minorversion != new->cl_minorversion)
			continue;
 
@@ -510,6 +516,9 @@ int nfs40_walk_client_list(struct nfs_client *new,
		if (pos->cl_clientid != new->cl_clientid)
			continue;
 
+		if (!nfs4_match_client_owner_id(pos, new))
+			continue;
+
		atomic_inc(&pos->cl_count);
		spin_unlock(&nn->nfs_client_lock);
 
@@ -566,20 +575,14 @@ static bool nfs4_match_clientids(struct nfs_client *a, struct nfs_client *b)
 }
 
 /*
- * Returns true if the server owners match
+ * Returns true if the server major ids match
 */
 static bool
-nfs4_match_serverowners(struct nfs_client *a, struct nfs_client *b)
+nfs4_check_clientid_trunking(struct nfs_client *a, struct nfs_client *b)
 {
	struct nfs41_server_owner *o1 = a->cl_serverowner;
	struct nfs41_server_owner *o2 = b->cl_serverowner;
 
-	if (o1->minor_id != o2->minor_id) {
-		dprintk("NFS: --> %s server owner minor IDs do not match\n",
-			__func__);
-		return false;
-	}
-
	if (o1->major_id_sz != o2->major_id_sz)
		goto out_major_mismatch;
	if (memcmp(o1->major_id, o2->major_id, o1->major_id_sz) != 0)
@@ -621,9 +624,6 @@ int nfs41_walk_client_list(struct nfs_client *new,
		if (pos->rpc_ops != new->rpc_ops)
			continue;
 
-		if (pos->cl_proto != new->cl_proto)
-			continue;
-
		if (pos->cl_minorversion != new->cl_minorversion)
			continue;
 
@@ -639,7 +639,7 @@ int nfs41_walk_client_list(struct nfs_client *new,
		prev = pos;
 
		status = nfs_wait_client_init_complete(pos);
-		if (status == 0) {
+		if (pos->cl_cons_state == NFS_CS_SESSION_INITING) {
			nfs4_schedule_lease_recovery(pos);
			status = nfs4_wait_clnt_recover(pos);
		}
@@ -654,7 +654,19 @@ int nfs41_walk_client_list(struct nfs_client *new,
		if (!nfs4_match_clientids(pos, new))
			continue;
 
-		if (!nfs4_match_serverowners(pos, new))
+		/*
+		 * Note that session trunking is just a special subcase of
+		 * client id trunking. In either case, we want to fall back
+		 * to using the existing nfs_client.
+		 */
+		if (!nfs4_check_clientid_trunking(pos, new))
+			continue;
+
+		/* Unlike NFSv4.0, we know that NFSv4.1 always uses the
+		 * uniform string, however someone might switch the
+		 * uniquifier string on us.
+		 */
+		if (!nfs4_match_client_owner_id(pos, new))
			continue;
 
		atomic_inc(&pos->cl_count);
@@ -837,14 +849,15 @@ error:
 */
 struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
		const struct sockaddr *ds_addr, int ds_addrlen,
-		int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans)
+		int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans,
+		u32 minor_version, rpc_authflavor_t au_flavor)
 {
	struct nfs_client_initdata cl_init = {
		.addr = ds_addr,
		.addrlen = ds_addrlen,
		.nfs_mod = &nfs_v4,
		.proto = ds_proto,
-		.minorversion = mds_clp->cl_minorversion,
+		.minorversion = minor_version,
		.net = mds_clp->cl_net,
	};
	struct rpc_timeout ds_timeout;
@@ -862,7 +875,7 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
	 */
	nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans);
	clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr,
-			     mds_clp->cl_rpcclient->cl_auth->au_flavor);
+			     au_flavor);
 
	dprintk("<-- %s %p\n", __func__, clp);
	return clp;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index e7f8d5ff2581..88180ac5ea0e 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -495,12 +495,11 @@ static void nfs4_set_sequence_privileged(struct nfs4_sequence_args *args)
495 args->sa_privileged = 1; 495 args->sa_privileged = 1;
496} 496}
497 497
498static int nfs40_setup_sequence(const struct nfs_server *server, 498int nfs40_setup_sequence(struct nfs4_slot_table *tbl,
499 struct nfs4_sequence_args *args, 499 struct nfs4_sequence_args *args,
500 struct nfs4_sequence_res *res, 500 struct nfs4_sequence_res *res,
501 struct rpc_task *task) 501 struct rpc_task *task)
502{ 502{
503 struct nfs4_slot_table *tbl = server->nfs_client->cl_slot_tbl;
504 struct nfs4_slot *slot; 503 struct nfs4_slot *slot;
505 504
506 /* slot already allocated? */ 505 /* slot already allocated? */
@@ -535,6 +534,7 @@ out_sleep:
535 spin_unlock(&tbl->slot_tbl_lock); 534 spin_unlock(&tbl->slot_tbl_lock);
536 return -EAGAIN; 535 return -EAGAIN;
537} 536}
537EXPORT_SYMBOL_GPL(nfs40_setup_sequence);
538 538
539static int nfs40_sequence_done(struct rpc_task *task, 539static int nfs40_sequence_done(struct rpc_task *task,
540 struct nfs4_sequence_res *res) 540 struct nfs4_sequence_res *res)
@@ -694,8 +694,7 @@ out_retry:
694} 694}
695EXPORT_SYMBOL_GPL(nfs41_sequence_done); 695EXPORT_SYMBOL_GPL(nfs41_sequence_done);
696 696
697static int nfs4_sequence_done(struct rpc_task *task, 697int nfs4_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
698 struct nfs4_sequence_res *res)
699{ 698{
700 if (res->sr_slot == NULL) 699 if (res->sr_slot == NULL)
701 return 1; 700 return 1;
@@ -703,6 +702,7 @@ static int nfs4_sequence_done(struct rpc_task *task,
703 return nfs40_sequence_done(task, res); 702 return nfs40_sequence_done(task, res);
704 return nfs41_sequence_done(task, res); 703 return nfs41_sequence_done(task, res);
705} 704}
705EXPORT_SYMBOL_GPL(nfs4_sequence_done);
706 706
707int nfs41_setup_sequence(struct nfs4_session *session, 707int nfs41_setup_sequence(struct nfs4_session *session,
708 struct nfs4_sequence_args *args, 708 struct nfs4_sequence_args *args,
@@ -777,7 +777,8 @@ static int nfs4_setup_sequence(const struct nfs_server *server,
777 int ret = 0; 777 int ret = 0;
778 778
779 if (!session) 779 if (!session)
780 return nfs40_setup_sequence(server, args, res, task); 780 return nfs40_setup_sequence(server->nfs_client->cl_slot_tbl,
781 args, res, task);
781 782
782 dprintk("--> %s clp %p session %p sr_slot %u\n", 783 dprintk("--> %s clp %p session %p sr_slot %u\n",
783 __func__, session->clp, session, res->sr_slot ? 784 __func__, session->clp, session, res->sr_slot ?
@@ -818,14 +819,16 @@ static int nfs4_setup_sequence(const struct nfs_server *server,
818 struct nfs4_sequence_res *res, 819 struct nfs4_sequence_res *res,
819 struct rpc_task *task) 820 struct rpc_task *task)
820{ 821{
821 return nfs40_setup_sequence(server, args, res, task); 822 return nfs40_setup_sequence(server->nfs_client->cl_slot_tbl,
823 args, res, task);
822} 824}
823 825
824static int nfs4_sequence_done(struct rpc_task *task, 826int nfs4_sequence_done(struct rpc_task *task,
825 struct nfs4_sequence_res *res) 827 struct nfs4_sequence_res *res)
826{ 828{
827 return nfs40_sequence_done(task, res); 829 return nfs40_sequence_done(task, res);
828} 830}
831EXPORT_SYMBOL_GPL(nfs4_sequence_done);
829 832
830#endif /* !CONFIG_NFS_V4_1 */ 833#endif /* !CONFIG_NFS_V4_1 */
831 834
@@ -937,6 +940,31 @@ static bool nfs4_clear_cap_atomic_open_v1(struct nfs_server *server,
937 return true; 940 return true;
938} 941}
939 942
943static u32
944nfs4_map_atomic_open_share(struct nfs_server *server,
945 fmode_t fmode, int openflags)
946{
947 u32 res = 0;
948
949 switch (fmode & (FMODE_READ | FMODE_WRITE)) {
950 case FMODE_READ:
951 res = NFS4_SHARE_ACCESS_READ;
952 break;
953 case FMODE_WRITE:
954 res = NFS4_SHARE_ACCESS_WRITE;
955 break;
956 case FMODE_READ|FMODE_WRITE:
957 res = NFS4_SHARE_ACCESS_BOTH;
958 }
959 if (!(server->caps & NFS_CAP_ATOMIC_OPEN_V1))
960 goto out;
961 /* Want no delegation if we're using O_DIRECT */
962 if (openflags & O_DIRECT)
963 res |= NFS4_SHARE_WANT_NO_DELEG;
964out:
965 return res;
966}
967
940static enum open_claim_type4 968static enum open_claim_type4
941nfs4_map_atomic_open_claim(struct nfs_server *server, 969nfs4_map_atomic_open_claim(struct nfs_server *server,
942 enum open_claim_type4 claim) 970 enum open_claim_type4 claim)
@@ -977,6 +1005,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
977 struct dentry *parent = dget_parent(dentry); 1005 struct dentry *parent = dget_parent(dentry);
978 struct inode *dir = parent->d_inode; 1006 struct inode *dir = parent->d_inode;
979 struct nfs_server *server = NFS_SERVER(dir); 1007 struct nfs_server *server = NFS_SERVER(dir);
1008 struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
980 struct nfs4_opendata *p; 1009 struct nfs4_opendata *p;
981 1010
982 p = kzalloc(sizeof(*p), gfp_mask); 1011 p = kzalloc(sizeof(*p), gfp_mask);
@@ -987,8 +1016,9 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
987 if (IS_ERR(p->f_label)) 1016 if (IS_ERR(p->f_label))
988 goto err_free_p; 1017 goto err_free_p;
989 1018
990 p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid, gfp_mask); 1019 alloc_seqid = server->nfs_client->cl_mvops->alloc_seqid;
991 if (p->o_arg.seqid == NULL) 1020 p->o_arg.seqid = alloc_seqid(&sp->so_seqid, gfp_mask);
1021 if (IS_ERR(p->o_arg.seqid))
992 goto err_free_label; 1022 goto err_free_label;
993 nfs_sb_active(dentry->d_sb); 1023 nfs_sb_active(dentry->d_sb);
994 p->dentry = dget(dentry); 1024 p->dentry = dget(dentry);
@@ -997,6 +1027,8 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
997 atomic_inc(&sp->so_count); 1027 atomic_inc(&sp->so_count);
998 p->o_arg.open_flags = flags; 1028 p->o_arg.open_flags = flags;
999 p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE); 1029 p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE);
1030 p->o_arg.share_access = nfs4_map_atomic_open_share(server,
1031 fmode, flags);
1000 /* don't put an ACCESS op in OPEN compound if O_EXCL, because ACCESS 1032 /* don't put an ACCESS op in OPEN compound if O_EXCL, because ACCESS
1001 * will return permission denied for all bits until close */ 1033 * will return permission denied for all bits until close */
1002 if (!(flags & O_EXCL)) { 1034 if (!(flags & O_EXCL)) {
@@ -1117,8 +1149,6 @@ static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode)
1117 return 0; 1149 return 0;
1118 if ((delegation->type & fmode) != fmode) 1150 if ((delegation->type & fmode) != fmode)
1119 return 0; 1151 return 0;
1120 if (test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags))
1121 return 0;
1122 if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) 1152 if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags))
1123 return 0; 1153 return 0;
1124 nfs_mark_delegation_referenced(delegation); 1154 nfs_mark_delegation_referenced(delegation);
@@ -1169,6 +1199,16 @@ static bool nfs_need_update_open_stateid(struct nfs4_state *state,
1169 return false; 1199 return false;
1170} 1200}
1171 1201
1202static void nfs_resync_open_stateid_locked(struct nfs4_state *state)
1203{
1204 if (state->n_wronly)
1205 set_bit(NFS_O_WRONLY_STATE, &state->flags);
1206 if (state->n_rdonly)
1207 set_bit(NFS_O_RDONLY_STATE, &state->flags);
1208 if (state->n_rdwr)
1209 set_bit(NFS_O_RDWR_STATE, &state->flags);
1210}
1211
1172static void nfs_clear_open_stateid_locked(struct nfs4_state *state, 1212static void nfs_clear_open_stateid_locked(struct nfs4_state *state,
1173 nfs4_stateid *stateid, fmode_t fmode) 1213 nfs4_stateid *stateid, fmode_t fmode)
1174{ 1214{
@@ -1187,8 +1227,12 @@ static void nfs_clear_open_stateid_locked(struct nfs4_state *state,
1187 } 1227 }
1188 if (stateid == NULL) 1228 if (stateid == NULL)
1189 return; 1229 return;
1190 if (!nfs_need_update_open_stateid(state, stateid)) 1230 /* Handle races with OPEN */
1231 if (!nfs4_stateid_match_other(stateid, &state->open_stateid) ||
1232 !nfs4_stateid_is_newer(stateid, &state->open_stateid)) {
1233 nfs_resync_open_stateid_locked(state);
1191 return; 1234 return;
1235 }
1192 if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) 1236 if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
1193 nfs4_stateid_copy(&state->stateid, stateid); 1237 nfs4_stateid_copy(&state->stateid, stateid);
1194 nfs4_stateid_copy(&state->open_stateid, stateid); 1238 nfs4_stateid_copy(&state->open_stateid, stateid);
@@ -1283,6 +1327,23 @@ no_delegation:
1283 return ret; 1327 return ret;
1284} 1328}
1285 1329
1330static bool nfs4_update_lock_stateid(struct nfs4_lock_state *lsp,
1331 const nfs4_stateid *stateid)
1332{
1333 struct nfs4_state *state = lsp->ls_state;
1334 bool ret = false;
1335
1336 spin_lock(&state->state_lock);
1337 if (!nfs4_stateid_match_other(stateid, &lsp->ls_stateid))
1338 goto out_noupdate;
1339 if (!nfs4_stateid_is_newer(stateid, &lsp->ls_stateid))
1340 goto out_noupdate;
1341 nfs4_stateid_copy(&lsp->ls_stateid, stateid);
1342 ret = true;
1343out_noupdate:
1344 spin_unlock(&state->state_lock);
1345 return ret;
1346}
1286 1347
1287static void nfs4_return_incompatible_delegation(struct inode *inode, fmode_t fmode) 1348static void nfs4_return_incompatible_delegation(struct inode *inode, fmode_t fmode)
1288{ 1349{
@@ -1681,8 +1742,8 @@ static void nfs4_open_confirm_prepare(struct rpc_task *task, void *calldata)
1681{ 1742{
1682 struct nfs4_opendata *data = calldata; 1743 struct nfs4_opendata *data = calldata;
1683 1744
1684 nfs40_setup_sequence(data->o_arg.server, &data->c_arg.seq_args, 1745 nfs40_setup_sequence(data->o_arg.server->nfs_client->cl_slot_tbl,
1685 &data->c_res.seq_res, task); 1746 &data->c_arg.seq_args, &data->c_res.seq_res, task);
1686} 1747}
1687 1748
1688static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata) 1749static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata)
@@ -2589,6 +2650,11 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
2589 case -NFS4ERR_OLD_STATEID: 2650 case -NFS4ERR_OLD_STATEID:
2590 case -NFS4ERR_BAD_STATEID: 2651 case -NFS4ERR_BAD_STATEID:
2591 case -NFS4ERR_EXPIRED: 2652 case -NFS4ERR_EXPIRED:
2653 if (!nfs4_stateid_match(&calldata->arg.stateid,
2654 &state->stateid)) {
2655 rpc_restart_call_prepare(task);
2656 goto out_release;
2657 }
2592 if (calldata->arg.fmode == 0) 2658 if (calldata->arg.fmode == 0)
2593 break; 2659 break;
2594 default: 2660 default:
@@ -2621,6 +2687,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
2621 is_rdwr = test_bit(NFS_O_RDWR_STATE, &state->flags); 2687 is_rdwr = test_bit(NFS_O_RDWR_STATE, &state->flags);
2622 is_rdonly = test_bit(NFS_O_RDONLY_STATE, &state->flags); 2688 is_rdonly = test_bit(NFS_O_RDONLY_STATE, &state->flags);
2623 is_wronly = test_bit(NFS_O_WRONLY_STATE, &state->flags); 2689 is_wronly = test_bit(NFS_O_WRONLY_STATE, &state->flags);
2690 nfs4_stateid_copy(&calldata->arg.stateid, &state->stateid);
2624 /* Calculate the change in open mode */ 2691 /* Calculate the change in open mode */
2625 calldata->arg.fmode = 0; 2692 calldata->arg.fmode = 0;
2626 if (state->n_rdwr == 0) { 2693 if (state->n_rdwr == 0) {
@@ -2655,6 +2722,9 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
2655 goto out_wait; 2722 goto out_wait;
2656 } 2723 }
2657 } 2724 }
2725 calldata->arg.share_access =
2726 nfs4_map_atomic_open_share(NFS_SERVER(inode),
2727 calldata->arg.fmode, 0);
2658 2728
2659 nfs_fattr_init(calldata->res.fattr); 2729 nfs_fattr_init(calldata->res.fattr);
2660 calldata->timestamp = jiffies; 2730 calldata->timestamp = jiffies;
@@ -2677,45 +2747,10 @@ static const struct rpc_call_ops nfs4_close_ops = {
2677 .rpc_release = nfs4_free_closedata, 2747 .rpc_release = nfs4_free_closedata,
2678}; 2748};
2679 2749
2680static bool nfs4_state_has_opener(struct nfs4_state *state)
2681{
2682 /* first check existing openers */
2683 if (test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0 &&
2684 state->n_rdonly != 0)
2685 return true;
2686
2687 if (test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0 &&
2688 state->n_wronly != 0)
2689 return true;
2690
2691 if (test_bit(NFS_O_RDWR_STATE, &state->flags) != 0 &&
2692 state->n_rdwr != 0)
2693 return true;
2694
2695 return false;
2696}
2697
2698static bool nfs4_roc(struct inode *inode) 2750static bool nfs4_roc(struct inode *inode)
2699{ 2751{
2700 struct nfs_inode *nfsi = NFS_I(inode); 2752 if (!nfs_have_layout(inode))
2701 struct nfs_open_context *ctx;
2702 struct nfs4_state *state;
2703
2704 spin_lock(&inode->i_lock);
2705 list_for_each_entry(ctx, &nfsi->open_files, list) {
2706 state = ctx->state;
2707 if (state == NULL)
2708 continue;
2709 if (nfs4_state_has_opener(state)) {
2710 spin_unlock(&inode->i_lock);
2711 return false;
2712 }
2713 }
2714 spin_unlock(&inode->i_lock);
2715
2716 if (nfs4_check_delegation(inode, FMODE_READ))
2717 return false; 2753 return false;
2718
2719 return pnfs_roc(inode); 2754 return pnfs_roc(inode);
2720} 2755}
2721 2756
@@ -2733,6 +2768,7 @@ static bool nfs4_roc(struct inode *inode)
2733int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait) 2768int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait)
2734{ 2769{
2735 struct nfs_server *server = NFS_SERVER(state->inode); 2770 struct nfs_server *server = NFS_SERVER(state->inode);
2771 struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
2736 struct nfs4_closedata *calldata; 2772 struct nfs4_closedata *calldata;
2737 struct nfs4_state_owner *sp = state->owner; 2773 struct nfs4_state_owner *sp = state->owner;
2738 struct rpc_task *task; 2774 struct rpc_task *task;
@@ -2759,10 +2795,10 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait)
2759 calldata->inode = state->inode; 2795 calldata->inode = state->inode;
2760 calldata->state = state; 2796 calldata->state = state;
2761 calldata->arg.fh = NFS_FH(state->inode); 2797 calldata->arg.fh = NFS_FH(state->inode);
2762 calldata->arg.stateid = &state->open_stateid;
2763 /* Serialization for the sequence id */ 2798 /* Serialization for the sequence id */
2764 calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid, gfp_mask); 2799 alloc_seqid = server->nfs_client->cl_mvops->alloc_seqid;
2765 if (calldata->arg.seqid == NULL) 2800 calldata->arg.seqid = alloc_seqid(&state->owner->so_seqid, gfp_mask);
2801 if (IS_ERR(calldata->arg.seqid))
2766 goto out_free_calldata; 2802 goto out_free_calldata;
2767 calldata->arg.fmode = 0; 2803 calldata->arg.fmode = 0;
2768 calldata->arg.bitmask = server->cache_consistency_bitmask; 2804 calldata->arg.bitmask = server->cache_consistency_bitmask;
@@ -4917,11 +4953,14 @@ static void nfs4_init_boot_verifier(const struct nfs_client *clp,
4917} 4953}
4918 4954
4919static unsigned int 4955static unsigned int
4920nfs4_init_nonuniform_client_string(const struct nfs_client *clp, 4956nfs4_init_nonuniform_client_string(struct nfs_client *clp,
4921 char *buf, size_t len) 4957 char *buf, size_t len)
4922{ 4958{
4923 unsigned int result; 4959 unsigned int result;
4924 4960
4961 if (clp->cl_owner_id != NULL)
4962 return strlcpy(buf, clp->cl_owner_id, len);
4963
4925 rcu_read_lock(); 4964 rcu_read_lock();
4926 result = scnprintf(buf, len, "Linux NFSv4.0 %s/%s %s", 4965 result = scnprintf(buf, len, "Linux NFSv4.0 %s/%s %s",
4927 clp->cl_ipaddr, 4966 clp->cl_ipaddr,
@@ -4930,24 +4969,32 @@ nfs4_init_nonuniform_client_string(const struct nfs_client *clp,
4930 rpc_peeraddr2str(clp->cl_rpcclient, 4969 rpc_peeraddr2str(clp->cl_rpcclient,
4931 RPC_DISPLAY_PROTO)); 4970 RPC_DISPLAY_PROTO));
4932 rcu_read_unlock(); 4971 rcu_read_unlock();
4972 clp->cl_owner_id = kstrdup(buf, GFP_KERNEL);
4933 return result; 4973 return result;
4934} 4974}
4935 4975
4936static unsigned int 4976static unsigned int
4937nfs4_init_uniform_client_string(const struct nfs_client *clp, 4977nfs4_init_uniform_client_string(struct nfs_client *clp,
4938 char *buf, size_t len) 4978 char *buf, size_t len)
4939{ 4979{
4940 const char *nodename = clp->cl_rpcclient->cl_nodename; 4980 const char *nodename = clp->cl_rpcclient->cl_nodename;
4981 unsigned int result;
4982
4983 if (clp->cl_owner_id != NULL)
4984 return strlcpy(buf, clp->cl_owner_id, len);
4941 4985
4942 if (nfs4_client_id_uniquifier[0] != '\0') 4986 if (nfs4_client_id_uniquifier[0] != '\0')
4943 return scnprintf(buf, len, "Linux NFSv%u.%u %s/%s", 4987 result = scnprintf(buf, len, "Linux NFSv%u.%u %s/%s",
4944 clp->rpc_ops->version, 4988 clp->rpc_ops->version,
4945 clp->cl_minorversion, 4989 clp->cl_minorversion,
4946 nfs4_client_id_uniquifier, 4990 nfs4_client_id_uniquifier,
4947 nodename); 4991 nodename);
4948 return scnprintf(buf, len, "Linux NFSv%u.%u %s", 4992 else
4993 result = scnprintf(buf, len, "Linux NFSv%u.%u %s",
4949 clp->rpc_ops->version, clp->cl_minorversion, 4994 clp->rpc_ops->version, clp->cl_minorversion,
4950 nodename); 4995 nodename);
4996 clp->cl_owner_id = kstrdup(buf, GFP_KERNEL);
4997 return result;
4951} 4998}
4952 4999
4953/* 5000/*
@@ -5128,9 +5175,13 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
5128static void nfs4_delegreturn_release(void *calldata) 5175static void nfs4_delegreturn_release(void *calldata)
5129{ 5176{
5130 struct nfs4_delegreturndata *data = calldata; 5177 struct nfs4_delegreturndata *data = calldata;
5178 struct inode *inode = data->inode;
5131 5179
5132 if (data->roc) 5180 if (inode) {
5133 pnfs_roc_release(data->inode); 5181 if (data->roc)
5182 pnfs_roc_release(inode);
5183 nfs_iput_and_deactive(inode);
5184 }
5134 kfree(calldata); 5185 kfree(calldata);
5135} 5186}
5136 5187
@@ -5187,9 +5238,9 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
5187 nfs_fattr_init(data->res.fattr); 5238 nfs_fattr_init(data->res.fattr);
5188 data->timestamp = jiffies; 5239 data->timestamp = jiffies;
5189 data->rpc_status = 0; 5240 data->rpc_status = 0;
5190 data->inode = inode; 5241 data->inode = nfs_igrab_and_active(inode);
5191 data->roc = list_empty(&NFS_I(inode)->open_files) ? 5242 if (data->inode)
5192 pnfs_roc(inode) : false; 5243 data->roc = nfs4_roc(inode);
5193 5244
5194 task_setup_data.callback_data = data; 5245 task_setup_data.callback_data = data;
5195 msg.rpc_argp = &data->args; 5246 msg.rpc_argp = &data->args;
@@ -5344,7 +5395,6 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
5344 p->arg.fl = &p->fl; 5395 p->arg.fl = &p->fl;
5345 p->arg.seqid = seqid; 5396 p->arg.seqid = seqid;
5346 p->res.seqid = seqid; 5397 p->res.seqid = seqid;
5347 p->arg.stateid = &lsp->ls_stateid;
5348 p->lsp = lsp; 5398 p->lsp = lsp;
5349 atomic_inc(&lsp->ls_count); 5399 atomic_inc(&lsp->ls_count);
5350 /* Ensure we don't close file until we're done freeing locks! */ 5400 /* Ensure we don't close file until we're done freeing locks! */
@@ -5371,14 +5421,18 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
5371 return; 5421 return;
5372 switch (task->tk_status) { 5422 switch (task->tk_status) {
5373 case 0: 5423 case 0:
5374 nfs4_stateid_copy(&calldata->lsp->ls_stateid,
5375 &calldata->res.stateid);
5376 renew_lease(calldata->server, calldata->timestamp); 5424 renew_lease(calldata->server, calldata->timestamp);
5377 break; 5425 do_vfs_lock(calldata->fl.fl_file, &calldata->fl);
5426 if (nfs4_update_lock_stateid(calldata->lsp,
5427 &calldata->res.stateid))
5428 break;
5378 case -NFS4ERR_BAD_STATEID: 5429 case -NFS4ERR_BAD_STATEID:
5379 case -NFS4ERR_OLD_STATEID: 5430 case -NFS4ERR_OLD_STATEID:
5380 case -NFS4ERR_STALE_STATEID: 5431 case -NFS4ERR_STALE_STATEID:
5381 case -NFS4ERR_EXPIRED: 5432 case -NFS4ERR_EXPIRED:
5433 if (!nfs4_stateid_match(&calldata->arg.stateid,
5434 &calldata->lsp->ls_stateid))
5435 rpc_restart_call_prepare(task);
5382 break; 5436 break;
5383 default: 5437 default:
5384 if (nfs4_async_handle_error(task, calldata->server, 5438 if (nfs4_async_handle_error(task, calldata->server,
@@ -5394,6 +5448,7 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data)
 
 	if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0)
 		goto out_wait;
+	nfs4_stateid_copy(&calldata->arg.stateid, &calldata->lsp->ls_stateid);
 	if (test_bit(NFS_LOCK_INITIALIZED, &calldata->lsp->ls_flags) == 0) {
 		/* Note: exit _without_ running nfs4_locku_done */
 		goto out_no_action;
@@ -5464,6 +5519,7 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
 	struct nfs_seqid *seqid;
 	struct nfs4_lock_state *lsp;
 	struct rpc_task *task;
+	struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
 	int status = 0;
 	unsigned char fl_flags = request->fl_flags;
 
@@ -5487,9 +5543,10 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
 	lsp = request->fl_u.nfs4_fl.owner;
 	if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) == 0)
 		goto out;
-	seqid = nfs_alloc_seqid(&lsp->ls_seqid, GFP_KERNEL);
+	alloc_seqid = NFS_SERVER(inode)->nfs_client->cl_mvops->alloc_seqid;
+	seqid = alloc_seqid(&lsp->ls_seqid, GFP_KERNEL);
 	status = -ENOMEM;
-	if (seqid == NULL)
+	if (IS_ERR(seqid))
 		goto out;
 	task = nfs4_do_unlck(request, nfs_file_open_context(request->fl_file), lsp, seqid);
 	status = PTR_ERR(task);
@@ -5522,6 +5579,7 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
 	struct nfs4_lockdata *p;
 	struct inode *inode = lsp->ls_state->inode;
 	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
 
 	p = kzalloc(sizeof(*p), gfp_mask);
 	if (p == NULL)
@@ -5530,12 +5588,12 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
 	p->arg.fh = NFS_FH(inode);
 	p->arg.fl = &p->fl;
 	p->arg.open_seqid = nfs_alloc_seqid(&lsp->ls_state->owner->so_seqid, gfp_mask);
-	if (p->arg.open_seqid == NULL)
+	if (IS_ERR(p->arg.open_seqid))
 		goto out_free;
-	p->arg.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid, gfp_mask);
-	if (p->arg.lock_seqid == NULL)
+	alloc_seqid = server->nfs_client->cl_mvops->alloc_seqid;
+	p->arg.lock_seqid = alloc_seqid(&lsp->ls_seqid, gfp_mask);
+	if (IS_ERR(p->arg.lock_seqid))
 		goto out_free_seqid;
-	p->arg.lock_stateid = &lsp->ls_stateid;
 	p->arg.lock_owner.clientid = server->nfs_client->cl_clientid;
 	p->arg.lock_owner.id = lsp->ls_seqid.owner_id;
 	p->arg.lock_owner.s_dev = server->s_dev;
@@ -5562,15 +5620,19 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
 	if (nfs_wait_on_sequence(data->arg.lock_seqid, task) != 0)
 		goto out_wait;
 	/* Do we need to do an open_to_lock_owner? */
-	if (!(data->arg.lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED)) {
+	if (!test_bit(NFS_LOCK_INITIALIZED, &data->lsp->ls_flags)) {
 		if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0) {
 			goto out_release_lock_seqid;
 		}
-		data->arg.open_stateid = &state->open_stateid;
+		nfs4_stateid_copy(&data->arg.open_stateid,
+				&state->open_stateid);
 		data->arg.new_lock_owner = 1;
 		data->res.open_seqid = data->arg.open_seqid;
-	} else
+	} else {
 		data->arg.new_lock_owner = 0;
+		nfs4_stateid_copy(&data->arg.lock_stateid,
+				&data->lsp->ls_stateid);
+	}
 	if (!nfs4_valid_open_stateid(state)) {
 		data->rpc_status = -EBADF;
 		task->tk_action = NULL;
@@ -5594,6 +5656,7 @@ out_wait:
 static void nfs4_lock_done(struct rpc_task *task, void *calldata)
 {
 	struct nfs4_lockdata *data = calldata;
+	struct nfs4_lock_state *lsp = data->lsp;
 
 	dprintk("%s: begin!\n", __func__);
 
@@ -5601,18 +5664,36 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
 		return;
 
 	data->rpc_status = task->tk_status;
-	if (data->arg.new_lock_owner != 0) {
-		if (data->rpc_status == 0)
-			nfs_confirm_seqid(&data->lsp->ls_seqid, 0);
-		else
-			goto out;
-	}
-	if (data->rpc_status == 0) {
-		nfs4_stateid_copy(&data->lsp->ls_stateid, &data->res.stateid);
-		set_bit(NFS_LOCK_INITIALIZED, &data->lsp->ls_flags);
-		renew_lease(NFS_SERVER(data->ctx->dentry->d_inode), data->timestamp);
+	switch (task->tk_status) {
+	case 0:
+		renew_lease(NFS_SERVER(data->ctx->dentry->d_inode),
+				data->timestamp);
+		if (data->arg.new_lock) {
+			data->fl.fl_flags &= ~(FL_SLEEP | FL_ACCESS);
+			if (do_vfs_lock(data->fl.fl_file, &data->fl) < 0) {
+				rpc_restart_call_prepare(task);
+				break;
+			}
+		}
+		if (data->arg.new_lock_owner != 0) {
+			nfs_confirm_seqid(&lsp->ls_seqid, 0);
+			nfs4_stateid_copy(&lsp->ls_stateid, &data->res.stateid);
+			set_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags);
+		} else if (!nfs4_update_lock_stateid(lsp, &data->res.stateid))
+			rpc_restart_call_prepare(task);
+		break;
+	case -NFS4ERR_BAD_STATEID:
+	case -NFS4ERR_OLD_STATEID:
+	case -NFS4ERR_STALE_STATEID:
+	case -NFS4ERR_EXPIRED:
+		if (data->arg.new_lock_owner != 0) {
+			if (!nfs4_stateid_match(&data->arg.open_stateid,
+						&lsp->ls_state->open_stateid))
+				rpc_restart_call_prepare(task);
+		} else if (!nfs4_stateid_match(&data->arg.lock_stateid,
+						&lsp->ls_stateid))
+			rpc_restart_call_prepare(task);
 	}
-out:
 	dprintk("%s: done, ret = %d!\n", __func__, data->rpc_status);
 }
 
@@ -5693,7 +5774,8 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
 		if (recovery_type == NFS_LOCK_RECLAIM)
 			data->arg.reclaim = NFS_LOCK_RECLAIM;
 		nfs4_set_sequence_privileged(&data->arg.seq_args);
-	}
+	} else
+		data->arg.new_lock = 1;
 	task = rpc_run_task(&task_setup_data);
 	if (IS_ERR(task))
 		return PTR_ERR(task);
@@ -5817,10 +5899,8 @@ static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *reques
 
 static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
 {
-	struct nfs4_state_owner *sp = state->owner;
 	struct nfs_inode *nfsi = NFS_I(state->inode);
 	unsigned char fl_flags = request->fl_flags;
-	unsigned int seq;
 	int status = -ENOLCK;
 
 	if ((fl_flags & FL_POSIX) &&
@@ -5840,25 +5920,11 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
 		/* ...but avoid races with delegation recall... */
 		request->fl_flags = fl_flags & ~FL_SLEEP;
 		status = do_vfs_lock(request->fl_file, request);
-		goto out_unlock;
-	}
-	seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
-	up_read(&nfsi->rwsem);
-	status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW);
-	if (status != 0)
+		up_read(&nfsi->rwsem);
 		goto out;
-	down_read(&nfsi->rwsem);
-	if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) {
-		status = -NFS4ERR_DELAY;
-		goto out_unlock;
 	}
-	/* Note: we always want to sleep here! */
-	request->fl_flags = fl_flags | FL_SLEEP;
-	if (do_vfs_lock(request->fl_file, request) < 0)
-		printk(KERN_WARNING "NFS: %s: VFS is out of sync with lock "
-			"manager!\n", __func__);
-out_unlock:
 	up_read(&nfsi->rwsem);
+	status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW);
 out:
 	request->fl_flags = fl_flags;
 	return status;
@@ -5965,8 +6031,8 @@ static void nfs4_release_lockowner_prepare(struct rpc_task *task, void *calldata
 {
 	struct nfs_release_lockowner_data *data = calldata;
 	struct nfs_server *server = data->server;
-	nfs40_setup_sequence(server, &data->args.seq_args,
-			&data->res.seq_res, task);
+	nfs40_setup_sequence(server->nfs_client->cl_slot_tbl,
+			&data->args.seq_args, &data->res.seq_res, task);
 	data->args.lock_owner.clientid = server->nfs_client->cl_clientid;
 	data->timestamp = jiffies;
 }
@@ -6582,47 +6648,47 @@ nfs41_same_server_scope(struct nfs41_server_scope *a,
 int nfs4_proc_bind_conn_to_session(struct nfs_client *clp, struct rpc_cred *cred)
 {
 	int status;
+	struct nfs41_bind_conn_to_session_args args = {
+		.client = clp,
+		.dir = NFS4_CDFC4_FORE_OR_BOTH,
+	};
 	struct nfs41_bind_conn_to_session_res res;
 	struct rpc_message msg = {
 		.rpc_proc =
 			&nfs4_procedures[NFSPROC4_CLNT_BIND_CONN_TO_SESSION],
-		.rpc_argp = clp,
+		.rpc_argp = &args,
 		.rpc_resp = &res,
 		.rpc_cred = cred,
 	};
 
 	dprintk("--> %s\n", __func__);
 
-	res.session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS);
-	if (unlikely(res.session == NULL)) {
-		status = -ENOMEM;
-		goto out;
-	}
+	nfs4_copy_sessionid(&args.sessionid, &clp->cl_session->sess_id);
+	if (!(clp->cl_session->flags & SESSION4_BACK_CHAN))
+		args.dir = NFS4_CDFC4_FORE;
 
 	status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
 	trace_nfs4_bind_conn_to_session(clp, status);
 	if (status == 0) {
-		if (memcmp(res.session->sess_id.data,
+		if (memcmp(res.sessionid.data,
 		    clp->cl_session->sess_id.data, NFS4_MAX_SESSIONID_LEN)) {
 			dprintk("NFS: %s: Session ID mismatch\n", __func__);
 			status = -EIO;
-			goto out_session;
+			goto out;
 		}
-		if (res.dir != NFS4_CDFS4_BOTH) {
+		if ((res.dir & args.dir) != res.dir || res.dir == 0) {
 			dprintk("NFS: %s: Unexpected direction from server\n",
 				__func__);
 			status = -EIO;
-			goto out_session;
+			goto out;
 		}
-		if (res.use_conn_in_rdma_mode) {
+		if (res.use_conn_in_rdma_mode != args.use_conn_in_rdma_mode) {
 			dprintk("NFS: %s: Server returned RDMA mode = true\n",
 				__func__);
 			status = -EIO;
-			goto out_session;
+			goto out;
 		}
 	}
-out_session:
-	kfree(res.session);
 out:
 	dprintk("<-- %s status= %d\n", __func__, status);
 	return status;
@@ -7100,10 +7166,11 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
 			args->bc_attrs.max_reqs);
 }
 
-static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args, struct nfs4_session *session)
+static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args,
+		struct nfs41_create_session_res *res)
 {
 	struct nfs4_channel_attrs *sent = &args->fc_attrs;
-	struct nfs4_channel_attrs *rcvd = &session->fc_attrs;
+	struct nfs4_channel_attrs *rcvd = &res->fc_attrs;
 
 	if (rcvd->max_resp_sz > sent->max_resp_sz)
 		return -EINVAL;
@@ -7122,11 +7189,14 @@ static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args
 	return 0;
 }
 
-static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args, struct nfs4_session *session)
+static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args,
+		struct nfs41_create_session_res *res)
 {
 	struct nfs4_channel_attrs *sent = &args->bc_attrs;
-	struct nfs4_channel_attrs *rcvd = &session->bc_attrs;
+	struct nfs4_channel_attrs *rcvd = &res->bc_attrs;
 
+	if (!(res->flags & SESSION4_BACK_CHAN))
+		goto out;
 	if (rcvd->max_rqst_sz > sent->max_rqst_sz)
 		return -EINVAL;
 	if (rcvd->max_resp_sz < sent->max_resp_sz)
@@ -7138,18 +7208,30 @@ static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args
 		return -EINVAL;
 	if (rcvd->max_reqs != sent->max_reqs)
 		return -EINVAL;
+out:
 	return 0;
 }
 
 static int nfs4_verify_channel_attrs(struct nfs41_create_session_args *args,
-				     struct nfs4_session *session)
+				     struct nfs41_create_session_res *res)
 {
 	int ret;
 
-	ret = nfs4_verify_fore_channel_attrs(args, session);
+	ret = nfs4_verify_fore_channel_attrs(args, res);
 	if (ret)
 		return ret;
-	return nfs4_verify_back_channel_attrs(args, session);
+	return nfs4_verify_back_channel_attrs(args, res);
+}
+
+static void nfs4_update_session(struct nfs4_session *session,
+		struct nfs41_create_session_res *res)
+{
+	nfs4_copy_sessionid(&session->sess_id, &res->sessionid);
+	session->flags = res->flags;
+	memcpy(&session->fc_attrs, &res->fc_attrs, sizeof(session->fc_attrs));
+	if (res->flags & SESSION4_BACK_CHAN)
+		memcpy(&session->bc_attrs, &res->bc_attrs,
+				sizeof(session->bc_attrs));
 }
 
 static int _nfs4_proc_create_session(struct nfs_client *clp,
@@ -7158,11 +7240,12 @@ static int _nfs4_proc_create_session(struct nfs_client *clp,
 	struct nfs4_session *session = clp->cl_session;
 	struct nfs41_create_session_args args = {
 		.client = clp,
+		.clientid = clp->cl_clientid,
+		.seqid = clp->cl_seqid,
 		.cb_program = NFS4_CALLBACK,
 	};
-	struct nfs41_create_session_res res = {
-		.client = clp,
-	};
+	struct nfs41_create_session_res res;
+
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE_SESSION],
 		.rpc_argp = &args,
@@ -7179,11 +7262,15 @@ static int _nfs4_proc_create_session(struct nfs_client *clp,
 
 	if (!status) {
 		/* Verify the session's negotiated channel_attrs values */
-		status = nfs4_verify_channel_attrs(&args, session);
+		status = nfs4_verify_channel_attrs(&args, &res);
 		/* Increment the clientid slot sequence id */
-		clp->cl_seqid++;
+		if (clp->cl_seqid == res.seqid)
+			clp->cl_seqid++;
+		if (status)
+			goto out;
+		nfs4_update_session(session, &res);
 	}
-
+out:
 	return status;
 }
 
@@ -7528,6 +7615,7 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
 		return;
 	if (pnfs_choose_layoutget_stateid(&lgp->args.stateid,
 					NFS_I(lgp->args.inode)->layout,
+					&lgp->args.range,
 					lgp->args.ctx->state)) {
 		rpc_exit(task, NFS4_OK);
 	}
@@ -7783,9 +7871,13 @@ static void nfs4_layoutreturn_release(void *calldata)
 	spin_lock(&lo->plh_inode->i_lock);
 	if (lrp->res.lrs_present)
 		pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
+	pnfs_clear_layoutreturn_waitbit(lo);
+	clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lo->plh_flags);
+	rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq);
 	lo->plh_block_lgets--;
 	spin_unlock(&lo->plh_inode->i_lock);
 	pnfs_put_layout_hdr(lrp->args.layout);
+	nfs_iput_and_deactive(lrp->inode);
 	kfree(calldata);
 	dprintk("<-- %s\n", __func__);
 }
@@ -7796,7 +7888,7 @@ static const struct rpc_call_ops nfs4_layoutreturn_call_ops = {
 	.rpc_release = nfs4_layoutreturn_release,
 };
 
-int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)
+int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync)
 {
 	struct rpc_task *task;
 	struct rpc_message msg = {
@@ -7811,14 +7903,23 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)
 		.callback_ops = &nfs4_layoutreturn_call_ops,
 		.callback_data = lrp,
 	};
-	int status;
+	int status = 0;
 
 	dprintk("--> %s\n", __func__);
+	if (!sync) {
+		lrp->inode = nfs_igrab_and_active(lrp->args.inode);
+		if (!lrp->inode) {
+			nfs4_layoutreturn_release(lrp);
+			return -EAGAIN;
+		}
+		task_setup_data.flags |= RPC_TASK_ASYNC;
+	}
 	nfs4_init_sequence(&lrp->args.seq_args, &lrp->res.seq_res, 1);
 	task = rpc_run_task(&task_setup_data);
 	if (IS_ERR(task))
 		return PTR_ERR(task);
-	status = task->tk_status;
+	if (sync)
+		status = task->tk_status;
 	trace_nfs4_layoutreturn(lrp->args.inode, status);
 	dprintk("<-- %s status=%d\n", __func__, status);
 	rpc_put_task(task);
@@ -7912,6 +8013,7 @@ static void nfs4_layoutcommit_release(void *calldata)
 		nfs_post_op_update_inode_force_wcc(data->args.inode,
 						   data->res.fattr);
 	put_rpccred(data->cred);
+	nfs_iput_and_deactive(data->inode);
 	kfree(data);
 }
7917 8019
@@ -7936,7 +8038,6 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync)
7936 .rpc_message = &msg, 8038 .rpc_message = &msg,
7937 .callback_ops = &nfs4_layoutcommit_ops, 8039 .callback_ops = &nfs4_layoutcommit_ops,
7938 .callback_data = data, 8040 .callback_data = data,
7939 .flags = RPC_TASK_ASYNC,
7940 }; 8041 };
7941 struct rpc_task *task; 8042 struct rpc_task *task;
7942 int status = 0; 8043 int status = 0;
@@ -7947,18 +8048,21 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync)
7947 data->args.lastbytewritten, 8048 data->args.lastbytewritten,
7948 data->args.inode->i_ino); 8049 data->args.inode->i_ino);
7949 8050
8051 if (!sync) {
8052 data->inode = nfs_igrab_and_active(data->args.inode);
8053 if (data->inode == NULL) {
8054 nfs4_layoutcommit_release(data);
8055 return -EAGAIN;
8056 }
8057 task_setup_data.flags = RPC_TASK_ASYNC;
8058 }
7950 nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); 8059 nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
7951 task = rpc_run_task(&task_setup_data); 8060 task = rpc_run_task(&task_setup_data);
7952 if (IS_ERR(task)) 8061 if (IS_ERR(task))
7953 return PTR_ERR(task); 8062 return PTR_ERR(task);
7954 if (sync == false) 8063 if (sync)
7955 goto out; 8064 status = task->tk_status;
7956 status = nfs4_wait_for_completion_rpc_task(task);
7957 if (status != 0)
7958 goto out;
7959 status = task->tk_status;
7960 trace_nfs4_layoutcommit(data->args.inode, status); 8065 trace_nfs4_layoutcommit(data->args.inode, status);
7961out:
7962 dprintk("%s: status %d\n", __func__, status); 8066 dprintk("%s: status %d\n", __func__, status);
7963 rpc_put_task(task); 8067 rpc_put_task(task);
7964 return status; 8068 return status;
@@ -8386,6 +8490,7 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
 	.match_stateid = nfs4_match_stateid,
 	.find_root_sec = nfs4_find_root_sec,
 	.free_lock_state = nfs4_release_lockowner,
+	.alloc_seqid = nfs_alloc_seqid,
 	.call_sync_ops = &nfs40_call_sync_ops,
 	.reboot_recovery_ops = &nfs40_reboot_recovery_ops,
 	.nograce_recovery_ops = &nfs40_nograce_recovery_ops,
@@ -8394,6 +8499,12 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
 };
 
 #if defined(CONFIG_NFS_V4_1)
+static struct nfs_seqid *
+nfs_alloc_no_seqid(struct nfs_seqid_counter *arg1, gfp_t arg2)
+{
+	return NULL;
+}
+
 static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
 	.minor_version = 1,
 	.init_caps = NFS_CAP_READDIRPLUS
@@ -8407,6 +8518,7 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
 	.match_stateid = nfs41_match_stateid,
 	.find_root_sec = nfs41_find_root_sec,
 	.free_lock_state = nfs41_free_lock_state,
+	.alloc_seqid = nfs_alloc_no_seqid,
 	.call_sync_ops = &nfs41_call_sync_ops,
 	.reboot_recovery_ops = &nfs41_reboot_recovery_ops,
 	.nograce_recovery_ops = &nfs41_nograce_recovery_ops,
@@ -8433,6 +8545,7 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
 	.find_root_sec = nfs41_find_root_sec,
 	.free_lock_state = nfs41_free_lock_state,
 	.call_sync_ops = &nfs41_call_sync_ops,
+	.alloc_seqid = nfs_alloc_no_seqid,
 	.reboot_recovery_ops = &nfs41_reboot_recovery_ops,
 	.nograce_recovery_ops = &nfs41_nograce_recovery_ops,
 	.state_renewal_ops = &nfs41_state_renewal_ops,
diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c
index e799dc3c3b1d..e23366effcfb 100644
--- a/fs/nfs/nfs4session.c
+++ b/fs/nfs/nfs4session.c
@@ -450,7 +450,7 @@ int nfs4_setup_session_slot_tables(struct nfs4_session *ses)
 	tbl = &ses->fc_slot_table;
 	tbl->session = ses;
 	status = nfs4_realloc_slot_table(tbl, ses->fc_attrs.max_reqs, 1);
-	if (status) /* -ENOMEM */
+	if (status || !(ses->flags & SESSION4_BACK_CHAN)) /* -ENOMEM */
 		return status;
 	/* Back channel */
 	tbl = &ses->bc_slot_table;
diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h
index b34ada9bc6a2..fc46c7455898 100644
--- a/fs/nfs/nfs4session.h
+++ b/fs/nfs/nfs4session.h
@@ -118,6 +118,12 @@ static inline int nfs4_has_persistent_session(const struct nfs_client *clp)
 	return 0;
 }
 
+static inline void nfs4_copy_sessionid(struct nfs4_sessionid *dst,
+		const struct nfs4_sessionid *src)
+{
+	memcpy(dst->data, src->data, NFS4_MAX_SESSIONID_LEN);
+}
+
 #ifdef CONFIG_CRC32
 /*
  * nfs_session_id_hash - calculate the crc32 hash for the session id
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 5194933ed419..5ad908e9ce9c 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1003,11 +1003,11 @@ struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_m
 	struct nfs_seqid *new;
 
 	new = kmalloc(sizeof(*new), gfp_mask);
-	if (new != NULL) {
-		new->sequence = counter;
-		INIT_LIST_HEAD(&new->list);
-		new->task = NULL;
-	}
+	if (new == NULL)
+		return ERR_PTR(-ENOMEM);
+	new->sequence = counter;
+	INIT_LIST_HEAD(&new->list);
+	new->task = NULL;
 	return new;
 }
 
@@ -1015,7 +1015,7 @@ void nfs_release_seqid(struct nfs_seqid *seqid)
 {
 	struct nfs_seqid_counter *sequence;
 
-	if (list_empty(&seqid->list))
+	if (seqid == NULL || list_empty(&seqid->list))
 		return;
 	sequence = seqid->sequence;
 	spin_lock(&sequence->lock);
@@ -1071,13 +1071,15 @@ static void nfs_increment_seqid(int status, struct nfs_seqid *seqid)
 
 void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid)
 {
-	struct nfs4_state_owner *sp = container_of(seqid->sequence,
-					struct nfs4_state_owner, so_seqid);
-	struct nfs_server *server = sp->so_server;
+	struct nfs4_state_owner *sp;
+
+	if (seqid == NULL)
+		return;
 
+	sp = container_of(seqid->sequence, struct nfs4_state_owner, so_seqid);
 	if (status == -NFS4ERR_BAD_SEQID)
 		nfs4_drop_state_owner(sp);
-	if (!nfs4_has_session(server->nfs_client))
+	if (!nfs4_has_session(sp->so_server->nfs_client))
 		nfs_increment_seqid(status, seqid);
 }
 
@@ -1088,14 +1090,18 @@ void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid)
  */
 void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid)
 {
-	nfs_increment_seqid(status, seqid);
+	if (seqid != NULL)
+		nfs_increment_seqid(status, seqid);
 }
 
 int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task)
 {
-	struct nfs_seqid_counter *sequence = seqid->sequence;
+	struct nfs_seqid_counter *sequence;
 	int status = 0;
 
+	if (seqid == NULL)
+		goto out;
+	sequence = seqid->sequence;
 	spin_lock(&sequence->lock);
 	seqid->task = task;
 	if (list_empty(&seqid->list))
@@ -1106,6 +1112,7 @@ int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task)
 		status = -EAGAIN;
 unlock:
 	spin_unlock(&sequence->lock);
+out:
 	return status;
 }
 
@@ -1366,49 +1373,55 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
 	struct nfs_inode *nfsi = NFS_I(inode);
 	struct file_lock *fl;
 	int status = 0;
+	struct file_lock_context *flctx = inode->i_flctx;
+	struct list_head *list;
 
-	if (inode->i_flock == NULL)
+	if (flctx == NULL)
 		return 0;
 
+	list = &flctx->flc_posix;
+
 	/* Guard against delegation returns and new lock/unlock calls */
 	down_write(&nfsi->rwsem);
-	/* Protect inode->i_flock using the BKL */
-	spin_lock(&inode->i_lock);
-	for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
-		if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
-			continue;
+	spin_lock(&flctx->flc_lock);
+restart:
+	list_for_each_entry(fl, list, fl_list) {
 		if (nfs_file_open_context(fl->fl_file)->state != state)
 			continue;
-		spin_unlock(&inode->i_lock);
+		spin_unlock(&flctx->flc_lock);
 		status = ops->recover_lock(state, fl);
 		switch (status) {
 		case 0:
 			break;
 		case -ESTALE:
 		case -NFS4ERR_ADMIN_REVOKED:
 		case -NFS4ERR_STALE_STATEID:
 		case -NFS4ERR_BAD_STATEID:
 		case -NFS4ERR_EXPIRED:
 		case -NFS4ERR_NO_GRACE:
 		case -NFS4ERR_STALE_CLIENTID:
 		case -NFS4ERR_BADSESSION:
 		case -NFS4ERR_BADSLOT:
 		case -NFS4ERR_BAD_HIGH_SLOT:
 		case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
 			goto out;
 		default:
-			printk(KERN_ERR "NFS: %s: unhandled error %d\n",
+			pr_err("NFS: %s: unhandled error %d\n",
 					__func__, status);
 		case -ENOMEM:
 		case -NFS4ERR_DENIED:
 		case -NFS4ERR_RECLAIM_BAD:
 		case -NFS4ERR_RECLAIM_CONFLICT:
 			/* kill_proc(fl->fl_pid, SIGLOST, 1); */
 			status = 0;
 		}
-		spin_lock(&inode->i_lock);
+		spin_lock(&flctx->flc_lock);
 	}
-	spin_unlock(&inode->i_lock);
+	if (list == &flctx->flc_posix) {
+		list = &flctx->flc_flock;
+		goto restart;
+	}
+	spin_unlock(&flctx->flc_lock);
 out:
 	up_write(&nfsi->rwsem);
 	return status;
diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c
index 6f340f02f2ba..75090feeafad 100644
--- a/fs/nfs/nfs4super.c
+++ b/fs/nfs/nfs4super.c
@@ -53,7 +53,6 @@ static const struct super_operations nfs4_sops = {
 	.destroy_inode	= nfs_destroy_inode,
 	.write_inode	= nfs4_write_inode,
 	.drop_inode	= nfs_drop_inode,
-	.put_super	= nfs_put_super,
 	.statfs		= nfs_statfs,
 	.evict_inode	= nfs4_evict_inode,
 	.umount_begin	= nfs_umount_begin,
@@ -346,6 +345,9 @@ out:
 
 static void __exit exit_nfs_v4(void)
 {
+	/* Not called in the _init(), conditionally loaded */
+	nfs4_pnfs_v3_ds_connect_unload();
+
 	unregister_nfs_version(&nfs_v4);
 	nfs4_unregister_sysctl();
 	nfs_idmap_quit();
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index cb4376b78ed9..5c399ec41079 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -946,7 +946,10 @@ static void encode_uint64(struct xdr_stream *xdr, u64 n)
 static void encode_nfs4_seqid(struct xdr_stream *xdr,
 		const struct nfs_seqid *seqid)
 {
-	encode_uint32(xdr, seqid->sequence->counter);
+	if (seqid != NULL)
+		encode_uint32(xdr, seqid->sequence->counter);
+	else
+		encode_uint32(xdr, 0);
 }
 
 static void encode_compound_hdr(struct xdr_stream *xdr,
@@ -1125,7 +1128,7 @@ static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg
 {
 	encode_op_hdr(xdr, OP_CLOSE, decode_close_maxsz, hdr);
 	encode_nfs4_seqid(xdr, arg->seqid);
-	encode_nfs4_stateid(xdr, arg->stateid);
+	encode_nfs4_stateid(xdr, &arg->stateid);
 }
 
 static void encode_commit(struct xdr_stream *xdr, const struct nfs_commitargs *args, struct compound_hdr *hdr)
@@ -1301,12 +1304,12 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args
 	*p = cpu_to_be32(args->new_lock_owner);
 	if (args->new_lock_owner){
 		encode_nfs4_seqid(xdr, args->open_seqid);
-		encode_nfs4_stateid(xdr, args->open_stateid);
+		encode_nfs4_stateid(xdr, &args->open_stateid);
 		encode_nfs4_seqid(xdr, args->lock_seqid);
 		encode_lockowner(xdr, &args->lock_owner);
 	}
 	else {
-		encode_nfs4_stateid(xdr, args->lock_stateid);
+		encode_nfs4_stateid(xdr, &args->lock_stateid);
 		encode_nfs4_seqid(xdr, args->lock_seqid);
 	}
 }
@@ -1330,7 +1333,7 @@ static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *ar
 	encode_op_hdr(xdr, OP_LOCKU, decode_locku_maxsz, hdr);
 	encode_uint32(xdr, nfs4_lock_type(args->fl, 0));
 	encode_nfs4_seqid(xdr, args->seqid);
-	encode_nfs4_stateid(xdr, args->stateid);
+	encode_nfs4_stateid(xdr, &args->stateid);
 	p = reserve_space(xdr, 16);
 	p = xdr_encode_hyper(p, args->fl->fl_start);
 	xdr_encode_hyper(p, nfs4_lock_length(args->fl));
@@ -1348,24 +1351,12 @@ static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struc
 	encode_string(xdr, name->len, name->name);
 }
 
-static void encode_share_access(struct xdr_stream *xdr, fmode_t fmode)
+static void encode_share_access(struct xdr_stream *xdr, u32 share_access)
 {
 	__be32 *p;
 
 	p = reserve_space(xdr, 8);
-	switch (fmode & (FMODE_READ|FMODE_WRITE)) {
-	case FMODE_READ:
-		*p++ = cpu_to_be32(NFS4_SHARE_ACCESS_READ);
-		break;
-	case FMODE_WRITE:
-		*p++ = cpu_to_be32(NFS4_SHARE_ACCESS_WRITE);
-		break;
-	case FMODE_READ|FMODE_WRITE:
-		*p++ = cpu_to_be32(NFS4_SHARE_ACCESS_BOTH);
-		break;
-	default:
-		*p++ = cpu_to_be32(0);
-	}
+	*p++ = cpu_to_be32(share_access);
 	*p = cpu_to_be32(0); /* for linux, share_deny = 0 always */
 }
 
@@ -1377,7 +1368,7 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
 	 * owner 4 = 32
 	 */
 	encode_nfs4_seqid(xdr, arg->seqid);
-	encode_share_access(xdr, arg->fmode);
+	encode_share_access(xdr, arg->share_access);
 	p = reserve_space(xdr, 36);
 	p = xdr_encode_hyper(p, arg->clientid);
 	*p++ = cpu_to_be32(24);
@@ -1530,9 +1521,9 @@ static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_co
 static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr)
 {
 	encode_op_hdr(xdr, OP_OPEN_DOWNGRADE, decode_open_downgrade_maxsz, hdr);
-	encode_nfs4_stateid(xdr, arg->stateid);
+	encode_nfs4_stateid(xdr, &arg->stateid);
 	encode_nfs4_seqid(xdr, arg->seqid);
-	encode_share_access(xdr, arg->fmode);
+	encode_share_access(xdr, arg->share_access);
 }
 
 static void
@@ -1724,17 +1715,17 @@ static void encode_secinfo(struct xdr_stream *xdr, const struct qstr *name, stru
 #if defined(CONFIG_NFS_V4_1)
 /* NFSv4.1 operations */
 static void encode_bind_conn_to_session(struct xdr_stream *xdr,
-				struct nfs4_session *session,
+				struct nfs41_bind_conn_to_session_args *args,
 				struct compound_hdr *hdr)
 {
 	__be32 *p;
 
 	encode_op_hdr(xdr, OP_BIND_CONN_TO_SESSION,
 		decode_bind_conn_to_session_maxsz, hdr);
-	encode_opaque_fixed(xdr, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
+	encode_opaque_fixed(xdr, args->sessionid.data, NFS4_MAX_SESSIONID_LEN);
 	p = xdr_reserve_space(xdr, 8);
-	*p++ = cpu_to_be32(NFS4_CDFC4_BACK_OR_BOTH);
-	*p = 0;	/* use_conn_in_rdma_mode = False */
+	*p++ = cpu_to_be32(args->dir);
+	*p = (args->use_conn_in_rdma_mode) ? cpu_to_be32(1) : cpu_to_be32(0);
 }
 
 static void encode_op_map(struct xdr_stream *xdr, struct nfs4_op_map *op_map)
@@ -1801,9 +1792,8 @@ static void encode_create_session(struct xdr_stream *xdr,
 		struct compound_hdr *hdr)
 {
 	__be32 *p;
-	char machine_name[NFS4_MAX_MACHINE_NAME_LEN];
-	uint32_t len;
 	struct nfs_client *clp = args->client;
+	struct rpc_clnt *clnt = clp->cl_rpcclient;
 	struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
 	u32 max_resp_sz_cached;
 
@@ -1814,13 +1804,10 @@ static void encode_create_session(struct xdr_stream *xdr,
 	max_resp_sz_cached = (NFS4_dec_open_sz + RPC_REPHDRSIZE +
 				RPC_MAX_AUTH_SIZE + 2) * XDR_UNIT;
 
-	len = scnprintf(machine_name, sizeof(machine_name), "%s",
-			clp->cl_ipaddr);
-
 	encode_op_hdr(xdr, OP_CREATE_SESSION, decode_create_session_maxsz, hdr);
-	p = reserve_space(xdr, 16 + 2*28 + 20 + len + 12);
-	p = xdr_encode_hyper(p, clp->cl_clientid);
-	*p++ = cpu_to_be32(clp->cl_seqid); /*Sequence id */
+	p = reserve_space(xdr, 16 + 2*28 + 20 + clnt->cl_nodelen + 12);
+	p = xdr_encode_hyper(p, args->clientid);
+	*p++ = cpu_to_be32(args->seqid); /*Sequence id */
 	*p++ = cpu_to_be32(args->flags); /*flags */
 
 	/* Fore Channel */
@@ -1847,7 +1834,7 @@ static void encode_create_session(struct xdr_stream *xdr,
 
 	/* authsys_parms rfc1831 */
 	*p++ = cpu_to_be32(nn->boot_time.tv_nsec); /* stamp */
-	p = xdr_encode_opaque(p, machine_name, len);
+	p = xdr_encode_array(p, clnt->cl_nodename, clnt->cl_nodelen);
 	*p++ = cpu_to_be32(0); /* UID */
 	*p++ = cpu_to_be32(0); /* GID */
 	*p = cpu_to_be32(0); /* No more gids */
@@ -2012,11 +1999,11 @@ encode_layoutreturn(struct xdr_stream *xdr,
 	p = reserve_space(xdr, 16);
 	*p++ = cpu_to_be32(0); /* reclaim. always 0 for now */
 	*p++ = cpu_to_be32(args->layout_type);
-	*p++ = cpu_to_be32(IOMODE_ANY);
+	*p++ = cpu_to_be32(args->range.iomode);
 	*p = cpu_to_be32(RETURN_FILE);
 	p = reserve_space(xdr, 16);
-	p = xdr_encode_hyper(p, 0);
-	p = xdr_encode_hyper(p, NFS4_MAX_UINT64);
+	p = xdr_encode_hyper(p, args->range.offset);
+	p = xdr_encode_hyper(p, args->range.length);
 	spin_lock(&args->inode->i_lock);
 	encode_nfs4_stateid(xdr, &args->stateid);
 	spin_unlock(&args->inode->i_lock);
@@ -2747,14 +2734,14 @@ static void nfs4_xdr_enc_fsid_present(struct rpc_rqst *req,
  */
 static void nfs4_xdr_enc_bind_conn_to_session(struct rpc_rqst *req,
 				struct xdr_stream *xdr,
-				struct nfs_client *clp)
+				struct nfs41_bind_conn_to_session_args *args)
 {
 	struct compound_hdr hdr = {
-		.minorversion = clp->cl_mvops->minor_version,
+		.minorversion = args->client->cl_mvops->minor_version,
 	};
 
 	encode_compound_hdr(xdr, req, &hdr);
-	encode_bind_conn_to_session(xdr, clp->cl_session, &hdr);
+	encode_bind_conn_to_session(xdr, args, &hdr);
 	encode_nops(&hdr);
 }
 
@@ -4936,20 +4923,13 @@ out_overflow:
 	return -EIO;
 }
 
-static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
+static int decode_rw_delegation(struct xdr_stream *xdr,
+		uint32_t delegation_type,
+		struct nfs_openres *res)
 {
 	__be32 *p;
-	uint32_t delegation_type;
 	int status;
 
-	p = xdr_inline_decode(xdr, 4);
-	if (unlikely(!p))
-		goto out_overflow;
-	delegation_type = be32_to_cpup(p);
-	if (delegation_type == NFS4_OPEN_DELEGATE_NONE) {
-		res->delegation_type = 0;
-		return 0;
-	}
 	status = decode_stateid(xdr, &res->delegation);
 	if (unlikely(status))
 		return status;
@@ -4973,6 +4953,52 @@ out_overflow:
 	return -EIO;
 }
 
+static int decode_no_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
+{
+	__be32 *p;
+	uint32_t why_no_delegation;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	why_no_delegation = be32_to_cpup(p);
+	switch (why_no_delegation) {
+	case WND4_CONTENTION:
+	case WND4_RESOURCE:
+		xdr_inline_decode(xdr, 4);
+		/* Ignore for now */
+	}
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
+{
+	__be32 *p;
+	uint32_t delegation_type;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	delegation_type = be32_to_cpup(p);
+	res->delegation_type = 0;
+	switch (delegation_type) {
+	case NFS4_OPEN_DELEGATE_NONE:
+		return 0;
+	case NFS4_OPEN_DELEGATE_READ:
+	case NFS4_OPEN_DELEGATE_WRITE:
+		return decode_rw_delegation(xdr, delegation_type, res);
+	case NFS4_OPEN_DELEGATE_NONE_EXT:
+		return decode_no_delegation(xdr, res);
+	}
+	return -EIO;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
 static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
 {
 	__be32 *p;
@@ -5587,7 +5613,7 @@ static int decode_bind_conn_to_session(struct xdr_stream *xdr,
 
 	status = decode_op_hdr(xdr, OP_BIND_CONN_TO_SESSION);
 	if (!status)
-		status = decode_sessionid(xdr, &res->session->sess_id);
+		status = decode_sessionid(xdr, &res->sessionid);
 	if (unlikely(status))
 		return status;
 
@@ -5615,12 +5641,10 @@ static int decode_create_session(struct xdr_stream *xdr,
 {
 	__be32 *p;
 	int status;
-	struct nfs_client *clp = res->client;
-	struct nfs4_session *session = clp->cl_session;
 
 	status = decode_op_hdr(xdr, OP_CREATE_SESSION);
 	if (!status)
-		status = decode_sessionid(xdr, &session->sess_id);
+		status = decode_sessionid(xdr, &res->sessionid);
 	if (unlikely(status))
 		return status;
 
@@ -5628,13 +5652,13 @@ static int decode_create_session(struct xdr_stream *xdr,
 	p = xdr_inline_decode(xdr, 8);
 	if (unlikely(!p))
 		goto out_overflow;
-	clp->cl_seqid = be32_to_cpup(p++);
-	session->flags = be32_to_cpup(p);
+	res->seqid = be32_to_cpup(p++);
+	res->flags = be32_to_cpup(p);
 
 	/* Channel attributes */
-	status = decode_chan_attrs(xdr, &session->fc_attrs);
+	status = decode_chan_attrs(xdr, &res->fc_attrs);
 	if (!status)
-		status = decode_chan_attrs(xdr, &session->bc_attrs);
+		status = decode_chan_attrs(xdr, &res->bc_attrs);
 	return status;
 out_overflow:
 	print_overflow_msg(__func__, xdr);
@@ -6567,6 +6591,7 @@ static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 	int status;
 
 	status = decode_compound_hdr(xdr, &hdr);
+	res->op_status = hdr.status;
 	if (status)
 		goto out;
 	status = decode_sequence(xdr, &res->seq_res, rqstp);
@@ -6592,6 +6617,7 @@ static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 	int status;
 
 	status = decode_compound_hdr(xdr, &hdr);
+	res->op_status = hdr.status;
 	if (status)
 		goto out;
 	status = decode_sequence(xdr, &res->seq_res, rqstp);
@@ -6621,6 +6647,7 @@ static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 	int status;
 
 	status = decode_compound_hdr(xdr, &hdr);
+	res->op_status = hdr.status;
 	if (status)
 		goto out;
 	status = decode_sequence(xdr, &res->seq_res, rqstp);
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index cd3c910d2d12..9bc9f04fb7f6 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -261,11 +261,11 @@ static int __init root_nfs_data(char *cmdline)
 	 */
 	len = snprintf(nfs_export_path, sizeof(nfs_export_path),
 				tmp, utsname()->nodename);
-	if (len > (int)sizeof(nfs_export_path))
+	if (len >= (int)sizeof(nfs_export_path))
 		goto out_devnametoolong;
 	len = snprintf(nfs_root_device, sizeof(nfs_root_device),
 				"%pI4:%s", &servaddr, nfs_export_path);
-	if (len > (int)sizeof(nfs_root_device))
+	if (len >= (int)sizeof(nfs_root_device))
 		goto out_devnametoolong;
 
 	retval = 0;
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 9e5bc42180e4..24e1d7403c0b 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -537,11 +537,12 @@ int objio_write_pagelist(struct nfs_pgio_header *hdr, int how)
 static size_t objio_pg_test(struct nfs_pageio_descriptor *pgio,
 			struct nfs_page *prev, struct nfs_page *req)
 {
+	struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(pgio);
 	unsigned int size;
 
 	size = pnfs_generic_pg_test(pgio, prev, req);
 
-	if (!size || pgio->pg_count + req->wb_bytes >
+	if (!size || mirror->pg_count + req->wb_bytes >
 	    (unsigned long)pgio->pg_layout_private)
 		return 0;
 
@@ -607,12 +608,14 @@ static const struct nfs_pageio_ops objio_pg_read_ops = {
 	.pg_init = objio_init_read,
 	.pg_test = objio_pg_test,
 	.pg_doio = pnfs_generic_pg_readpages,
+	.pg_cleanup = pnfs_generic_pg_cleanup,
 };
 
 static const struct nfs_pageio_ops objio_pg_write_ops = {
 	.pg_init = objio_init_write,
 	.pg_test = objio_pg_test,
 	.pg_doio = pnfs_generic_pg_writepages,
+	.pg_cleanup = pnfs_generic_pg_cleanup,
 };
 
 static struct pnfs_layoutdriver_type objlayout_type = {
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 2b5e769beb16..d57190a0d533 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -42,21 +42,35 @@ static bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount)
 	return p->pagevec != NULL;
 }
 
+struct nfs_pgio_mirror *
+nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc)
+{
+	return nfs_pgio_has_mirroring(desc) ?
+		&desc->pg_mirrors[desc->pg_mirror_idx] :
+		&desc->pg_mirrors[0];
+}
+EXPORT_SYMBOL_GPL(nfs_pgio_current_mirror);
+
 void nfs_pgheader_init(struct nfs_pageio_descriptor *desc,
 		       struct nfs_pgio_header *hdr,
 		       void (*release)(struct nfs_pgio_header *hdr))
 {
-	hdr->req = nfs_list_entry(desc->pg_list.next);
+	struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
+
+
+	hdr->req = nfs_list_entry(mirror->pg_list.next);
 	hdr->inode = desc->pg_inode;
 	hdr->cred = hdr->req->wb_context->cred;
 	hdr->io_start = req_offset(hdr->req);
-	hdr->good_bytes = desc->pg_count;
+	hdr->good_bytes = mirror->pg_count;
 	hdr->dreq = desc->pg_dreq;
 	hdr->layout_private = desc->pg_layout_private;
 	hdr->release = release;
 	hdr->completion_ops = desc->pg_completion_ops;
 	if (hdr->completion_ops->init_hdr)
 		hdr->completion_ops->init_hdr(hdr);
+
+	hdr->pgio_mirror_idx = desc->pg_mirror_idx;
 }
 EXPORT_SYMBOL_GPL(nfs_pgheader_init);
 
@@ -480,7 +494,10 @@ nfs_wait_on_request(struct nfs_page *req)
 size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
 			   struct nfs_page *prev, struct nfs_page *req)
 {
-	if (desc->pg_count > desc->pg_bsize) {
+	struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
+
+
+	if (mirror->pg_count > mirror->pg_bsize) {
 		/* should never happen */
 		WARN_ON_ONCE(1);
 		return 0;
@@ -490,11 +507,11 @@ size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
490 * Limit the request size so that we can still allocate a page array 507 * Limit the request size so that we can still allocate a page array
491 * for it without upsetting the slab allocator. 508 * for it without upsetting the slab allocator.
492 */ 509 */
493 if (((desc->pg_count + req->wb_bytes) >> PAGE_SHIFT) * 510 if (((mirror->pg_count + req->wb_bytes) >> PAGE_SHIFT) *
 494 sizeof(struct page *) > PAGE_SIZE) 511 sizeof(struct page *) > PAGE_SIZE)
495 return 0; 512 return 0;
496 513
497 return min(desc->pg_bsize - desc->pg_count, (size_t)req->wb_bytes); 514 return min(mirror->pg_bsize - mirror->pg_count, (size_t)req->wb_bytes);
498} 515}
499EXPORT_SYMBOL_GPL(nfs_generic_pg_test); 516EXPORT_SYMBOL_GPL(nfs_generic_pg_test);
500 517
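
For scale, a worked example of the cap enforced above, assuming 4 KiB pages and 8-byte pointers (typical on 64-bit, but an assumption):

	/* page-pointer array must fit in one slab page:              */
	/*   PAGE_SIZE / sizeof(struct page *) = 4096 / 8 = 512 pages */
	/* so one coalesced request tops out at 512 * 4 KiB = 2 MiB   */

which keeps the request's page-pointer array within a single page of memory.
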
@@ -597,13 +614,14 @@ static void nfs_pgio_prepare(struct rpc_task *task, void *calldata)
597} 614}
598 615
599int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr, 616int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
617 struct rpc_cred *cred, const struct nfs_rpc_ops *rpc_ops,
600 const struct rpc_call_ops *call_ops, int how, int flags) 618 const struct rpc_call_ops *call_ops, int how, int flags)
601{ 619{
602 struct rpc_task *task; 620 struct rpc_task *task;
603 struct rpc_message msg = { 621 struct rpc_message msg = {
604 .rpc_argp = &hdr->args, 622 .rpc_argp = &hdr->args,
605 .rpc_resp = &hdr->res, 623 .rpc_resp = &hdr->res,
606 .rpc_cred = hdr->cred, 624 .rpc_cred = cred,
607 }; 625 };
608 struct rpc_task_setup task_setup_data = { 626 struct rpc_task_setup task_setup_data = {
609 .rpc_client = clnt, 627 .rpc_client = clnt,
@@ -616,7 +634,7 @@ int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
616 }; 634 };
617 int ret = 0; 635 int ret = 0;
618 636
619 hdr->rw_ops->rw_initiate(hdr, &msg, &task_setup_data, how); 637 hdr->rw_ops->rw_initiate(hdr, &msg, rpc_ops, &task_setup_data, how);
620 638
621 dprintk("NFS: %5u initiated pgio call " 639 dprintk("NFS: %5u initiated pgio call "
622 "(req %s/%llu, %u bytes @ offset %llu)\n", 640 "(req %s/%llu, %u bytes @ offset %llu)\n",
@@ -650,10 +668,18 @@ EXPORT_SYMBOL_GPL(nfs_initiate_pgio);
650static int nfs_pgio_error(struct nfs_pageio_descriptor *desc, 668static int nfs_pgio_error(struct nfs_pageio_descriptor *desc,
651 struct nfs_pgio_header *hdr) 669 struct nfs_pgio_header *hdr)
652{ 670{
671 struct nfs_pgio_mirror *mirror;
672 u32 midx;
673
653 set_bit(NFS_IOHDR_REDO, &hdr->flags); 674 set_bit(NFS_IOHDR_REDO, &hdr->flags);
654 nfs_pgio_data_destroy(hdr); 675 nfs_pgio_data_destroy(hdr);
655 hdr->completion_ops->completion(hdr); 676 hdr->completion_ops->completion(hdr);
656 desc->pg_completion_ops->error_cleanup(&desc->pg_list); 677 /* TODO: Make sure it's right to clean up all mirrors here
678 * and not just hdr->pgio_mirror_idx */
679 for (midx = 0; midx < desc->pg_mirror_count; midx++) {
680 mirror = &desc->pg_mirrors[midx];
681 desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
682 }
657 return -ENOMEM; 683 return -ENOMEM;
658} 684}
659 685
@@ -670,6 +696,17 @@ static void nfs_pgio_release(void *calldata)
670 hdr->completion_ops->completion(hdr); 696 hdr->completion_ops->completion(hdr);
671} 697}
672 698
699static void nfs_pageio_mirror_init(struct nfs_pgio_mirror *mirror,
700 unsigned int bsize)
701{
702 INIT_LIST_HEAD(&mirror->pg_list);
703 mirror->pg_bytes_written = 0;
704 mirror->pg_count = 0;
705 mirror->pg_bsize = bsize;
706 mirror->pg_base = 0;
707 mirror->pg_recoalesce = 0;
708}
709
673/** 710/**
674 * nfs_pageio_init - initialise a page io descriptor 711 * nfs_pageio_init - initialise a page io descriptor
675 * @desc: pointer to descriptor 712 * @desc: pointer to descriptor
@@ -686,13 +723,10 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
686 size_t bsize, 723 size_t bsize,
687 int io_flags) 724 int io_flags)
688{ 725{
689 INIT_LIST_HEAD(&desc->pg_list); 726 struct nfs_pgio_mirror *new;
690 desc->pg_bytes_written = 0; 727 int i;
691 desc->pg_count = 0; 728
692 desc->pg_bsize = bsize;
693 desc->pg_base = 0;
694 desc->pg_moreio = 0; 729 desc->pg_moreio = 0;
695 desc->pg_recoalesce = 0;
696 desc->pg_inode = inode; 730 desc->pg_inode = inode;
697 desc->pg_ops = pg_ops; 731 desc->pg_ops = pg_ops;
698 desc->pg_completion_ops = compl_ops; 732 desc->pg_completion_ops = compl_ops;
@@ -702,6 +736,26 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
702 desc->pg_lseg = NULL; 736 desc->pg_lseg = NULL;
703 desc->pg_dreq = NULL; 737 desc->pg_dreq = NULL;
704 desc->pg_layout_private = NULL; 738 desc->pg_layout_private = NULL;
739 desc->pg_bsize = bsize;
740
741 desc->pg_mirror_count = 1;
742 desc->pg_mirror_idx = 0;
743
744 if (pg_ops->pg_get_mirror_count) {
 745 /* until we have a request, we don't have an lseg and hence no
 746 * idea how many mirrors there will be */
747 new = kcalloc(NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX,
748 sizeof(struct nfs_pgio_mirror), GFP_KERNEL);
749 desc->pg_mirrors_dynamic = new;
750 desc->pg_mirrors = new;
751
752 for (i = 0; i < NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX; i++)
753 nfs_pageio_mirror_init(&desc->pg_mirrors[i], bsize);
754 } else {
755 desc->pg_mirrors_dynamic = NULL;
756 desc->pg_mirrors = desc->pg_mirrors_static;
757 nfs_pageio_mirror_init(&desc->pg_mirrors[0], bsize);
758 }
705} 759}
706EXPORT_SYMBOL_GPL(nfs_pageio_init); 760EXPORT_SYMBOL_GPL(nfs_pageio_init);
707 761
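
The branch above chooses between a mirror array embedded in the descriptor and a kcalloc'd one. A sketch of the fields this code assumes (illustrative; the real structure is declared in include/linux/nfs_page.h):

	struct nfs_pageio_descriptor {
		/* ... pre-existing fields ... */
		struct nfs_pgio_mirror	pg_mirrors_static[1];	/* common, unmirrored case */
		struct nfs_pgio_mirror	*pg_mirrors;		/* -> static or dynamic array */
		struct nfs_pgio_mirror	*pg_mirrors_dynamic;	/* kcalloc'd if pg_get_mirror_count set */
		u32			pg_mirror_count;
		u32			pg_mirror_idx;		/* mirror currently coalescing */
	};

Only a layout driver that implements pg_get_mirror_count ever sees more than one mirror; everyone else keeps the old single-queue behaviour at the cost of one extra pointer hop.
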
@@ -737,14 +791,16 @@ static void nfs_pgio_result(struct rpc_task *task, void *calldata)
737int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, 791int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
738 struct nfs_pgio_header *hdr) 792 struct nfs_pgio_header *hdr)
739{ 793{
794 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
795
740 struct nfs_page *req; 796 struct nfs_page *req;
741 struct page **pages, 797 struct page **pages,
742 *last_page; 798 *last_page;
743 struct list_head *head = &desc->pg_list; 799 struct list_head *head = &mirror->pg_list;
744 struct nfs_commit_info cinfo; 800 struct nfs_commit_info cinfo;
745 unsigned int pagecount, pageused; 801 unsigned int pagecount, pageused;
746 802
747 pagecount = nfs_page_array_len(desc->pg_base, desc->pg_count); 803 pagecount = nfs_page_array_len(mirror->pg_base, mirror->pg_count);
748 if (!nfs_pgarray_set(&hdr->page_array, pagecount)) 804 if (!nfs_pgarray_set(&hdr->page_array, pagecount))
749 return nfs_pgio_error(desc, hdr); 805 return nfs_pgio_error(desc, hdr);
750 806
@@ -772,7 +828,7 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
772 desc->pg_ioflags &= ~FLUSH_COND_STABLE; 828 desc->pg_ioflags &= ~FLUSH_COND_STABLE;
773 829
774 /* Set up the argument struct */ 830 /* Set up the argument struct */
775 nfs_pgio_rpcsetup(hdr, desc->pg_count, 0, desc->pg_ioflags, &cinfo); 831 nfs_pgio_rpcsetup(hdr, mirror->pg_count, 0, desc->pg_ioflags, &cinfo);
776 desc->pg_rpc_callops = &nfs_pgio_common_ops; 832 desc->pg_rpc_callops = &nfs_pgio_common_ops;
777 return 0; 833 return 0;
778} 834}
@@ -780,23 +836,74 @@ EXPORT_SYMBOL_GPL(nfs_generic_pgio);
780 836
781static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc) 837static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc)
782{ 838{
839 struct nfs_pgio_mirror *mirror;
783 struct nfs_pgio_header *hdr; 840 struct nfs_pgio_header *hdr;
784 int ret; 841 int ret;
785 842
843 mirror = nfs_pgio_current_mirror(desc);
844
786 hdr = nfs_pgio_header_alloc(desc->pg_rw_ops); 845 hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
787 if (!hdr) { 846 if (!hdr) {
788 desc->pg_completion_ops->error_cleanup(&desc->pg_list); 847 /* TODO: make sure this is right with mirroring - or
848 * should it back out all mirrors? */
849 desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
789 return -ENOMEM; 850 return -ENOMEM;
790 } 851 }
791 nfs_pgheader_init(desc, hdr, nfs_pgio_header_free); 852 nfs_pgheader_init(desc, hdr, nfs_pgio_header_free);
792 ret = nfs_generic_pgio(desc, hdr); 853 ret = nfs_generic_pgio(desc, hdr);
793 if (ret == 0) 854 if (ret == 0)
794 ret = nfs_initiate_pgio(NFS_CLIENT(hdr->inode), 855 ret = nfs_initiate_pgio(NFS_CLIENT(hdr->inode),
795 hdr, desc->pg_rpc_callops, 856 hdr,
857 hdr->cred,
858 NFS_PROTO(hdr->inode),
859 desc->pg_rpc_callops,
796 desc->pg_ioflags, 0); 860 desc->pg_ioflags, 0);
797 return ret; 861 return ret;
798} 862}
799 863
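
Threading cred and rpc_ops through nfs_initiate_pgio() (instead of deriving both from hdr) is what lets a pNFS layout driver aim this RPC at a data server rather than the MDS. A hypothetical data-server caller, with illustrative ds_* names that are not part of this patch:

	/* sketch: substitute the data server's client, cred and ops */
	ret = nfs_initiate_pgio(ds_clnt, hdr, ds_cred, ds_rpc_ops,
				call_ops, how, RPC_TASK_SOFTCONN);

The generic path above passes hdr->cred and NFS_PROTO(hdr->inode), so MDS I/O behaves exactly as before.
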
864/*
865 * nfs_pageio_setup_mirroring - determine if mirroring is to be used
866 * by calling the pg_get_mirror_count op
867 */
868static int nfs_pageio_setup_mirroring(struct nfs_pageio_descriptor *pgio,
869 struct nfs_page *req)
870{
871 int mirror_count = 1;
872
873 if (!pgio->pg_ops->pg_get_mirror_count)
874 return 0;
875
876 mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req);
877
878 if (!mirror_count || mirror_count > NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX)
879 return -EINVAL;
880
881 if (WARN_ON_ONCE(!pgio->pg_mirrors_dynamic))
882 return -EINVAL;
883
884 pgio->pg_mirror_count = mirror_count;
885
886 return 0;
887}
888
889/*
890 * nfs_pageio_stop_mirroring - stop using mirroring (set mirror count to 1)
891 */
892void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio)
893{
894 pgio->pg_mirror_count = 1;
895 pgio->pg_mirror_idx = 0;
896}
897
898static void nfs_pageio_cleanup_mirroring(struct nfs_pageio_descriptor *pgio)
899{
900 pgio->pg_mirror_count = 1;
901 pgio->pg_mirror_idx = 0;
902 pgio->pg_mirrors = pgio->pg_mirrors_static;
903 kfree(pgio->pg_mirrors_dynamic);
904 pgio->pg_mirrors_dynamic = NULL;
905}
906
800static bool nfs_match_open_context(const struct nfs_open_context *ctx1, 907static bool nfs_match_open_context(const struct nfs_open_context *ctx1,
801 const struct nfs_open_context *ctx2) 908 const struct nfs_open_context *ctx2)
802{ 909{
@@ -826,11 +933,15 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev,
826 struct nfs_pageio_descriptor *pgio) 933 struct nfs_pageio_descriptor *pgio)
827{ 934{
828 size_t size; 935 size_t size;
936 struct file_lock_context *flctx;
829 937
830 if (prev) { 938 if (prev) {
831 if (!nfs_match_open_context(req->wb_context, prev->wb_context)) 939 if (!nfs_match_open_context(req->wb_context, prev->wb_context))
832 return false; 940 return false;
833 if (req->wb_context->dentry->d_inode->i_flock != NULL && 941 flctx = req->wb_context->dentry->d_inode->i_flctx;
942 if (flctx != NULL &&
943 !(list_empty_careful(&flctx->flc_posix) &&
944 list_empty_careful(&flctx->flc_flock)) &&
834 !nfs_match_lock_context(req->wb_lock_context, 945 !nfs_match_lock_context(req->wb_lock_context,
835 prev->wb_lock_context)) 946 prev->wb_lock_context))
836 return false; 947 return false;
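
The old i_flock pointer test becomes a check against the new per-inode file_lock_context, which splits lock types onto separate lists. A sketch of the structure assumed here (the real definition lives in include/linux/fs.h):

	struct file_lock_context {
		spinlock_t		flc_lock;
		struct list_head	flc_flock;	/* BSD-style flock locks */
		struct list_head	flc_posix;	/* POSIX byte-range locks */
		struct list_head	flc_lease;	/* leases */
	};

list_empty_careful() is used because this path peeks at the lists without taking flc_lock; only when some lock actually exists does the lock-context comparison run.
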
@@ -863,19 +974,22 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev,
863static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc, 974static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
864 struct nfs_page *req) 975 struct nfs_page *req)
865{ 976{
977 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
978
866 struct nfs_page *prev = NULL; 979 struct nfs_page *prev = NULL;
867 if (desc->pg_count != 0) { 980
868 prev = nfs_list_entry(desc->pg_list.prev); 981 if (mirror->pg_count != 0) {
982 prev = nfs_list_entry(mirror->pg_list.prev);
869 } else { 983 } else {
870 if (desc->pg_ops->pg_init) 984 if (desc->pg_ops->pg_init)
871 desc->pg_ops->pg_init(desc, req); 985 desc->pg_ops->pg_init(desc, req);
872 desc->pg_base = req->wb_pgbase; 986 mirror->pg_base = req->wb_pgbase;
873 } 987 }
874 if (!nfs_can_coalesce_requests(prev, req, desc)) 988 if (!nfs_can_coalesce_requests(prev, req, desc))
875 return 0; 989 return 0;
876 nfs_list_remove_request(req); 990 nfs_list_remove_request(req);
877 nfs_list_add_request(req, &desc->pg_list); 991 nfs_list_add_request(req, &mirror->pg_list);
878 desc->pg_count += req->wb_bytes; 992 mirror->pg_count += req->wb_bytes;
879 return 1; 993 return 1;
880} 994}
881 995
@@ -884,16 +998,19 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
884 */ 998 */
885static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc) 999static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
886{ 1000{
887 if (!list_empty(&desc->pg_list)) { 1001 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
1002
1003
1004 if (!list_empty(&mirror->pg_list)) {
888 int error = desc->pg_ops->pg_doio(desc); 1005 int error = desc->pg_ops->pg_doio(desc);
889 if (error < 0) 1006 if (error < 0)
890 desc->pg_error = error; 1007 desc->pg_error = error;
891 else 1008 else
892 desc->pg_bytes_written += desc->pg_count; 1009 mirror->pg_bytes_written += mirror->pg_count;
893 } 1010 }
894 if (list_empty(&desc->pg_list)) { 1011 if (list_empty(&mirror->pg_list)) {
895 desc->pg_count = 0; 1012 mirror->pg_count = 0;
896 desc->pg_base = 0; 1013 mirror->pg_base = 0;
897 } 1014 }
898} 1015}
899 1016
@@ -911,6 +1028,8 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
911static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, 1028static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
912 struct nfs_page *req) 1029 struct nfs_page *req)
913{ 1030{
1031 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
1032
914 struct nfs_page *subreq; 1033 struct nfs_page *subreq;
915 unsigned int bytes_left = 0; 1034 unsigned int bytes_left = 0;
916 unsigned int offset, pgbase; 1035 unsigned int offset, pgbase;
@@ -934,7 +1053,7 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
934 nfs_pageio_doio(desc); 1053 nfs_pageio_doio(desc);
935 if (desc->pg_error < 0) 1054 if (desc->pg_error < 0)
936 return 0; 1055 return 0;
937 if (desc->pg_recoalesce) 1056 if (mirror->pg_recoalesce)
938 return 0; 1057 return 0;
939 /* retry add_request for this subreq */ 1058 /* retry add_request for this subreq */
940 nfs_page_group_lock(req, false); 1059 nfs_page_group_lock(req, false);
@@ -972,14 +1091,16 @@ err_ptr:
972 1091
973static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc) 1092static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc)
974{ 1093{
1094 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
975 LIST_HEAD(head); 1095 LIST_HEAD(head);
976 1096
977 do { 1097 do {
978 list_splice_init(&desc->pg_list, &head); 1098 list_splice_init(&mirror->pg_list, &head);
979 desc->pg_bytes_written -= desc->pg_count; 1099 mirror->pg_bytes_written -= mirror->pg_count;
980 desc->pg_count = 0; 1100 mirror->pg_count = 0;
981 desc->pg_base = 0; 1101 mirror->pg_base = 0;
982 desc->pg_recoalesce = 0; 1102 mirror->pg_recoalesce = 0;
1103
983 desc->pg_moreio = 0; 1104 desc->pg_moreio = 0;
984 1105
985 while (!list_empty(&head)) { 1106 while (!list_empty(&head)) {
@@ -993,11 +1114,11 @@ static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc)
993 return 0; 1114 return 0;
994 break; 1115 break;
995 } 1116 }
996 } while (desc->pg_recoalesce); 1117 } while (mirror->pg_recoalesce);
997 return 1; 1118 return 1;
998} 1119}
999 1120
1000int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, 1121static int nfs_pageio_add_request_mirror(struct nfs_pageio_descriptor *desc,
1001 struct nfs_page *req) 1122 struct nfs_page *req)
1002{ 1123{
1003 int ret; 1124 int ret;
@@ -1010,9 +1131,80 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
1010 break; 1131 break;
1011 ret = nfs_do_recoalesce(desc); 1132 ret = nfs_do_recoalesce(desc);
1012 } while (ret); 1133 } while (ret);
1134
1013 return ret; 1135 return ret;
1014} 1136}
1015 1137
1138int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
1139 struct nfs_page *req)
1140{
1141 u32 midx;
1142 unsigned int pgbase, offset, bytes;
1143 struct nfs_page *dupreq, *lastreq;
1144
1145 pgbase = req->wb_pgbase;
1146 offset = req->wb_offset;
1147 bytes = req->wb_bytes;
1148
1149 nfs_pageio_setup_mirroring(desc, req);
1150
1151 for (midx = 0; midx < desc->pg_mirror_count; midx++) {
1152 if (midx) {
1153 nfs_page_group_lock(req, false);
1154
1155 /* find the last request */
1156 for (lastreq = req->wb_head;
1157 lastreq->wb_this_page != req->wb_head;
1158 lastreq = lastreq->wb_this_page)
1159 ;
1160
1161 dupreq = nfs_create_request(req->wb_context,
1162 req->wb_page, lastreq, pgbase, bytes);
1163
1164 if (IS_ERR(dupreq)) {
1165 nfs_page_group_unlock(req);
1166 return 0;
1167 }
1168
1169 nfs_lock_request(dupreq);
1170 nfs_page_group_unlock(req);
1171 dupreq->wb_offset = offset;
1172 dupreq->wb_index = req->wb_index;
1173 } else
1174 dupreq = req;
1175
1176 if (nfs_pgio_has_mirroring(desc))
1177 desc->pg_mirror_idx = midx;
1178 if (!nfs_pageio_add_request_mirror(desc, dupreq))
1179 return 0;
1180 }
1181
1182 return 1;
1183}
1184
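
Net effect: with pg_mirror_count == 2, every request is queued once per mirror; mirror 0 consumes the original and each further mirror receives a duplicate chained onto the request's page group. A condensed trace of one request (sketch):

	nfs_pageio_add_request(desc, req)
	    nfs_pageio_setup_mirroring(desc, req)	/* pg_mirror_count = 2 */
	    midx 0: dupreq = req			/* -> mirror 0 pg_list */
	    midx 1: dupreq = nfs_create_request(...)	/* -> mirror 1 pg_list */

The explicit copy-back of wb_offset and wb_index is needed because nfs_create_request() derives them from the page, which may not match the original request.
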
1185/*
 1186 * nfs_pageio_complete_mirror - Complete I/O on one mirror of an
 1187 * nfs_pageio_descriptor
 1188 * @desc: pointer to io descriptor; @mirror_idx: index of the mirror to complete
1189 */
1190static void nfs_pageio_complete_mirror(struct nfs_pageio_descriptor *desc,
1191 u32 mirror_idx)
1192{
1193 struct nfs_pgio_mirror *mirror = &desc->pg_mirrors[mirror_idx];
1194 u32 restore_idx = desc->pg_mirror_idx;
1195
1196 if (nfs_pgio_has_mirroring(desc))
1197 desc->pg_mirror_idx = mirror_idx;
1198 for (;;) {
1199 nfs_pageio_doio(desc);
1200 if (!mirror->pg_recoalesce)
1201 break;
1202 if (!nfs_do_recoalesce(desc))
1203 break;
1204 }
1205 desc->pg_mirror_idx = restore_idx;
1206}
1207
1016/* 1208/*
1017 * nfs_pageio_resend - Transfer requests to new descriptor and resend 1209 * nfs_pageio_resend - Transfer requests to new descriptor and resend
1018 * @hdr - the pgio header to move request from 1210 * @hdr - the pgio header to move request from
@@ -1046,18 +1238,19 @@ int nfs_pageio_resend(struct nfs_pageio_descriptor *desc,
1046EXPORT_SYMBOL_GPL(nfs_pageio_resend); 1238EXPORT_SYMBOL_GPL(nfs_pageio_resend);
1047 1239
1048/** 1240/**
1049 * nfs_pageio_complete - Complete I/O on an nfs_pageio_descriptor 1241 * nfs_pageio_complete - Complete I/O then cleanup an nfs_pageio_descriptor
1050 * @desc: pointer to io descriptor 1242 * @desc: pointer to io descriptor
1051 */ 1243 */
1052void nfs_pageio_complete(struct nfs_pageio_descriptor *desc) 1244void nfs_pageio_complete(struct nfs_pageio_descriptor *desc)
1053{ 1245{
1054 for (;;) { 1246 u32 midx;
1055 nfs_pageio_doio(desc); 1247
1056 if (!desc->pg_recoalesce) 1248 for (midx = 0; midx < desc->pg_mirror_count; midx++)
1057 break; 1249 nfs_pageio_complete_mirror(desc, midx);
1058 if (!nfs_do_recoalesce(desc)) 1250
1059 break; 1251 if (desc->pg_ops->pg_cleanup)
1060 } 1252 desc->pg_ops->pg_cleanup(desc);
1253 nfs_pageio_cleanup_mirroring(desc);
1061} 1254}
1062 1255
1063/** 1256/**
@@ -1073,10 +1266,17 @@ void nfs_pageio_complete(struct nfs_pageio_descriptor *desc)
1073 */ 1266 */
1074void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index) 1267void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index)
1075{ 1268{
1076 if (!list_empty(&desc->pg_list)) { 1269 struct nfs_pgio_mirror *mirror;
1077 struct nfs_page *prev = nfs_list_entry(desc->pg_list.prev); 1270 struct nfs_page *prev;
1078 if (index != prev->wb_index + 1) 1271 u32 midx;
1079 nfs_pageio_complete(desc); 1272
1273 for (midx = 0; midx < desc->pg_mirror_count; midx++) {
1274 mirror = &desc->pg_mirrors[midx];
1275 if (!list_empty(&mirror->pg_list)) {
1276 prev = nfs_list_entry(mirror->pg_list.prev);
1277 if (index != prev->wb_index + 1)
1278 nfs_pageio_complete_mirror(desc, midx);
1279 }
1080 } 1280 }
1081} 1281}
1082 1282
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 0a5dda4d85c2..4f802b02fbb9 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -34,6 +34,7 @@
34#include "pnfs.h" 34#include "pnfs.h"
35#include "iostat.h" 35#include "iostat.h"
36#include "nfs4trace.h" 36#include "nfs4trace.h"
37#include "delegation.h"
37 38
38#define NFSDBG_FACILITY NFSDBG_PNFS 39#define NFSDBG_FACILITY NFSDBG_PNFS
39#define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ) 40#define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ)
@@ -50,6 +51,10 @@ static DEFINE_SPINLOCK(pnfs_spinlock);
50 */ 51 */
51static LIST_HEAD(pnfs_modules_tbl); 52static LIST_HEAD(pnfs_modules_tbl);
52 53
54static int
55pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid,
56 enum pnfs_iomode iomode, bool sync);
57
53/* Return the registered pnfs layout driver module matching given id */ 58/* Return the registered pnfs layout driver module matching given id */
54static struct pnfs_layoutdriver_type * 59static struct pnfs_layoutdriver_type *
55find_pnfs_driver_locked(u32 id) 60find_pnfs_driver_locked(u32 id)
@@ -238,6 +243,8 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
238 struct inode *inode = lo->plh_inode; 243 struct inode *inode = lo->plh_inode;
239 244
240 if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) { 245 if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
246 if (!list_empty(&lo->plh_segs))
247 WARN_ONCE(1, "NFS: BUG unfreed layout segments.\n");
241 pnfs_detach_layout_hdr(lo); 248 pnfs_detach_layout_hdr(lo);
242 spin_unlock(&inode->i_lock); 249 spin_unlock(&inode->i_lock);
243 pnfs_free_layout_hdr(lo); 250 pnfs_free_layout_hdr(lo);
@@ -337,6 +344,48 @@ pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo,
337 rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq); 344 rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq);
338} 345}
339 346
347/* Return true if layoutreturn is needed */
348static bool
349pnfs_layout_need_return(struct pnfs_layout_hdr *lo,
350 struct pnfs_layout_segment *lseg)
351{
352 struct pnfs_layout_segment *s;
353
354 if (!test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
355 return false;
356
357 list_for_each_entry(s, &lo->plh_segs, pls_list)
358 if (s != lseg && test_bit(NFS_LSEG_LAYOUTRETURN, &s->pls_flags))
359 return false;
360
361 return true;
362}
363
364static void pnfs_layoutreturn_before_put_lseg(struct pnfs_layout_segment *lseg,
365 struct pnfs_layout_hdr *lo, struct inode *inode)
366{
367 lo = lseg->pls_layout;
368 inode = lo->plh_inode;
369
370 spin_lock(&inode->i_lock);
371 if (pnfs_layout_need_return(lo, lseg)) {
372 nfs4_stateid stateid;
373 enum pnfs_iomode iomode;
374
375 stateid = lo->plh_stateid;
376 iomode = lo->plh_return_iomode;
377 /* decreased in pnfs_send_layoutreturn() */
378 lo->plh_block_lgets++;
379 lo->plh_return_iomode = 0;
380 spin_unlock(&inode->i_lock);
381 pnfs_get_layout_hdr(lo);
382
383 /* Send an async layoutreturn so we dont deadlock */
384 pnfs_send_layoutreturn(lo, stateid, iomode, false);
385 } else
386 spin_unlock(&inode->i_lock);
387}
388
340void 389void
341pnfs_put_lseg(struct pnfs_layout_segment *lseg) 390pnfs_put_lseg(struct pnfs_layout_segment *lseg)
342{ 391{
@@ -349,8 +398,17 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg)
349 dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, 398 dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
350 atomic_read(&lseg->pls_refcount), 399 atomic_read(&lseg->pls_refcount),
351 test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); 400 test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
401
402 /* Handle the case where refcount != 1 */
403 if (atomic_add_unless(&lseg->pls_refcount, -1, 1))
404 return;
405
352 lo = lseg->pls_layout; 406 lo = lseg->pls_layout;
353 inode = lo->plh_inode; 407 inode = lo->plh_inode;
408 /* Do we need a layoutreturn? */
409 if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
410 pnfs_layoutreturn_before_put_lseg(lseg, lo, inode);
411
354 if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) { 412 if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) {
355 pnfs_get_layout_hdr(lo); 413 pnfs_get_layout_hdr(lo);
356 pnfs_layout_remove_lseg(lo, lseg); 414 pnfs_layout_remove_lseg(lo, lseg);
@@ -543,6 +601,7 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
543 pnfs_get_layout_hdr(lo); 601 pnfs_get_layout_hdr(lo);
544 pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED); 602 pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED);
545 pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED); 603 pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED);
604 pnfs_clear_retry_layoutget(lo);
546 spin_unlock(&nfsi->vfs_inode.i_lock); 605 spin_unlock(&nfsi->vfs_inode.i_lock);
547 pnfs_free_lseg_list(&tmp_list); 606 pnfs_free_lseg_list(&tmp_list);
548 pnfs_put_layout_hdr(lo); 607 pnfs_put_layout_hdr(lo);
@@ -740,25 +799,37 @@ pnfs_layout_stateid_blocked(const struct pnfs_layout_hdr *lo,
740 return !pnfs_seqid_is_newer(seqid, lo->plh_barrier); 799 return !pnfs_seqid_is_newer(seqid, lo->plh_barrier);
741} 800}
742 801
802static bool
803pnfs_layout_returning(const struct pnfs_layout_hdr *lo,
804 struct pnfs_layout_range *range)
805{
806 return test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags) &&
807 (lo->plh_return_iomode == IOMODE_ANY ||
808 lo->plh_return_iomode == range->iomode);
809}
810
743/* lget is set to 1 if called from inside send_layoutget call chain */ 811/* lget is set to 1 if called from inside send_layoutget call chain */
744static bool 812static bool
745pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo, int lget) 813pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo,
814 struct pnfs_layout_range *range, int lget)
746{ 815{
747 return lo->plh_block_lgets || 816 return lo->plh_block_lgets ||
748 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || 817 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
749 (list_empty(&lo->plh_segs) && 818 (list_empty(&lo->plh_segs) &&
750 (atomic_read(&lo->plh_outstanding) > lget)); 819 (atomic_read(&lo->plh_outstanding) > lget)) ||
820 pnfs_layout_returning(lo, range);
751} 821}
752 822
753int 823int
754pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, 824pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
825 struct pnfs_layout_range *range,
755 struct nfs4_state *open_state) 826 struct nfs4_state *open_state)
756{ 827{
757 int status = 0; 828 int status = 0;
758 829
759 dprintk("--> %s\n", __func__); 830 dprintk("--> %s\n", __func__);
760 spin_lock(&lo->plh_inode->i_lock); 831 spin_lock(&lo->plh_inode->i_lock);
761 if (pnfs_layoutgets_blocked(lo, 1)) { 832 if (pnfs_layoutgets_blocked(lo, range, 1)) {
762 status = -EAGAIN; 833 status = -EAGAIN;
763 } else if (!nfs4_valid_open_stateid(open_state)) { 834 } else if (!nfs4_valid_open_stateid(open_state)) {
764 status = -EBADF; 835 status = -EBADF;
@@ -825,7 +896,9 @@ send_layoutget(struct pnfs_layout_hdr *lo,
825 pnfs_layout_io_set_failed(lo, range->iomode); 896 pnfs_layout_io_set_failed(lo, range->iomode);
826 } 897 }
827 return NULL; 898 return NULL;
828 } 899 } else
900 pnfs_layout_clear_fail_bit(lo,
901 pnfs_iomode_to_fail_bit(range->iomode));
829 902
830 return lseg; 903 return lseg;
831} 904}
@@ -845,6 +918,49 @@ static void pnfs_clear_layoutcommit(struct inode *inode,
845 } 918 }
846} 919}
847 920
921void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo)
922{
923 clear_bit_unlock(NFS_LAYOUT_RETURN, &lo->plh_flags);
924 smp_mb__after_atomic();
925 wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN);
926}
927
928static int
929pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid,
930 enum pnfs_iomode iomode, bool sync)
931{
932 struct inode *ino = lo->plh_inode;
933 struct nfs4_layoutreturn *lrp;
934 int status = 0;
935
936 lrp = kzalloc(sizeof(*lrp), GFP_NOFS);
937 if (unlikely(lrp == NULL)) {
938 status = -ENOMEM;
939 spin_lock(&ino->i_lock);
940 lo->plh_block_lgets--;
941 pnfs_clear_layoutreturn_waitbit(lo);
942 rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq);
943 spin_unlock(&ino->i_lock);
944 pnfs_put_layout_hdr(lo);
945 goto out;
946 }
947
948 lrp->args.stateid = stateid;
949 lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id;
950 lrp->args.inode = ino;
951 lrp->args.range.iomode = iomode;
952 lrp->args.range.offset = 0;
953 lrp->args.range.length = NFS4_MAX_UINT64;
954 lrp->args.layout = lo;
955 lrp->clp = NFS_SERVER(ino)->nfs_client;
956 lrp->cred = lo->plh_lc_cred;
957
958 status = nfs4_proc_layoutreturn(lrp, sync);
959out:
960 dprintk("<-- %s status: %d\n", __func__, status);
961 return status;
962}
963
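
One helper, two calling conventions: the synchronous form blocks on the RPC, while sync = false fires it asynchronously so the caller cannot deadlock against the very lseg it is releasing:

	pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true);	/* _pnfs_return_layout */
	pnfs_send_layoutreturn(lo, stateid, iomode, false);	/* put_lseg path, no waiting */

On allocation failure the helper unwinds plh_block_lgets, clears the NFS_LAYOUT_RETURN wait bit and drops the header reference itself, so callers need no extra error path.
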
848/* 964/*
849 * Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr 965 * Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr
850 * when the layout segment list is empty. 966 * when the layout segment list is empty.
@@ -859,7 +975,6 @@ _pnfs_return_layout(struct inode *ino)
859 struct pnfs_layout_hdr *lo = NULL; 975 struct pnfs_layout_hdr *lo = NULL;
860 struct nfs_inode *nfsi = NFS_I(ino); 976 struct nfs_inode *nfsi = NFS_I(ino);
861 LIST_HEAD(tmp_list); 977 LIST_HEAD(tmp_list);
862 struct nfs4_layoutreturn *lrp;
863 nfs4_stateid stateid; 978 nfs4_stateid stateid;
864 int status = 0, empty; 979 int status = 0, empty;
865 980
@@ -901,24 +1016,7 @@ _pnfs_return_layout(struct inode *ino)
901 spin_unlock(&ino->i_lock); 1016 spin_unlock(&ino->i_lock);
902 pnfs_free_lseg_list(&tmp_list); 1017 pnfs_free_lseg_list(&tmp_list);
903 1018
904 lrp = kzalloc(sizeof(*lrp), GFP_KERNEL); 1019 status = pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true);
905 if (unlikely(lrp == NULL)) {
906 status = -ENOMEM;
907 spin_lock(&ino->i_lock);
908 lo->plh_block_lgets--;
909 spin_unlock(&ino->i_lock);
910 pnfs_put_layout_hdr(lo);
911 goto out;
912 }
913
914 lrp->args.stateid = stateid;
915 lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id;
916 lrp->args.inode = ino;
917 lrp->args.layout = lo;
918 lrp->clp = NFS_SERVER(ino)->nfs_client;
919 lrp->cred = lo->plh_lc_cred;
920
921 status = nfs4_proc_layoutreturn(lrp);
922out: 1020out:
923 dprintk("<-- %s status: %d\n", __func__, status); 1021 dprintk("<-- %s status: %d\n", __func__, status);
924 return status; 1022 return status;
@@ -954,31 +1052,60 @@ pnfs_commit_and_return_layout(struct inode *inode)
954 1052
955bool pnfs_roc(struct inode *ino) 1053bool pnfs_roc(struct inode *ino)
956{ 1054{
1055 struct nfs_inode *nfsi = NFS_I(ino);
1056 struct nfs_open_context *ctx;
1057 struct nfs4_state *state;
957 struct pnfs_layout_hdr *lo; 1058 struct pnfs_layout_hdr *lo;
958 struct pnfs_layout_segment *lseg, *tmp; 1059 struct pnfs_layout_segment *lseg, *tmp;
1060 nfs4_stateid stateid;
959 LIST_HEAD(tmp_list); 1061 LIST_HEAD(tmp_list);
960 bool found = false; 1062 bool found = false, layoutreturn = false;
961 1063
962 spin_lock(&ino->i_lock); 1064 spin_lock(&ino->i_lock);
963 lo = NFS_I(ino)->layout; 1065 lo = nfsi->layout;
964 if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, &lo->plh_flags) || 1066 if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, &lo->plh_flags) ||
965 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) 1067 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
966 goto out_nolayout; 1068 goto out_noroc;
1069
1070 /* Don't return layout if we hold a delegation */
1071 if (nfs4_check_delegation(ino, FMODE_READ))
1072 goto out_noroc;
1073
1074 list_for_each_entry(ctx, &nfsi->open_files, list) {
1075 state = ctx->state;
1076 /* Don't return layout if there is open file state */
1077 if (state != NULL && state->state != 0)
1078 goto out_noroc;
1079 }
1080
1081 pnfs_clear_retry_layoutget(lo);
967 list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list) 1082 list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list)
968 if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { 1083 if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
969 mark_lseg_invalid(lseg, &tmp_list); 1084 mark_lseg_invalid(lseg, &tmp_list);
970 found = true; 1085 found = true;
971 } 1086 }
972 if (!found) 1087 if (!found)
973 goto out_nolayout; 1088 goto out_noroc;
974 lo->plh_block_lgets++; 1089 lo->plh_block_lgets++;
975 pnfs_get_layout_hdr(lo); /* matched in pnfs_roc_release */ 1090 pnfs_get_layout_hdr(lo); /* matched in pnfs_roc_release */
976 spin_unlock(&ino->i_lock); 1091 spin_unlock(&ino->i_lock);
977 pnfs_free_lseg_list(&tmp_list); 1092 pnfs_free_lseg_list(&tmp_list);
978 return true; 1093 return true;
979 1094
980out_nolayout: 1095out_noroc:
1096 if (lo) {
1097 stateid = lo->plh_stateid;
1098 layoutreturn =
1099 test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
1100 &lo->plh_flags);
1101 if (layoutreturn) {
1102 lo->plh_block_lgets++;
1103 pnfs_get_layout_hdr(lo);
1104 }
1105 }
981 spin_unlock(&ino->i_lock); 1106 spin_unlock(&ino->i_lock);
1107 if (layoutreturn)
1108 pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true);
982 return false; 1109 return false;
983} 1110}
984 1111
@@ -1013,8 +1140,9 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task)
1013 struct nfs_inode *nfsi = NFS_I(ino); 1140 struct nfs_inode *nfsi = NFS_I(ino);
1014 struct pnfs_layout_hdr *lo; 1141 struct pnfs_layout_hdr *lo;
1015 struct pnfs_layout_segment *lseg; 1142 struct pnfs_layout_segment *lseg;
1143 nfs4_stateid stateid;
1016 u32 current_seqid; 1144 u32 current_seqid;
1017 bool found = false; 1145 bool found = false, layoutreturn = false;
1018 1146
1019 spin_lock(&ino->i_lock); 1147 spin_lock(&ino->i_lock);
1020 list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list) 1148 list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list)
@@ -1031,7 +1159,21 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task)
1031 */ 1159 */
1032 *barrier = current_seqid + atomic_read(&lo->plh_outstanding); 1160 *barrier = current_seqid + atomic_read(&lo->plh_outstanding);
1033out: 1161out:
1162 if (!found) {
1163 stateid = lo->plh_stateid;
1164 layoutreturn =
1165 test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
1166 &lo->plh_flags);
1167 if (layoutreturn) {
1168 lo->plh_block_lgets++;
1169 pnfs_get_layout_hdr(lo);
1170 }
1171 }
1034 spin_unlock(&ino->i_lock); 1172 spin_unlock(&ino->i_lock);
1173 if (layoutreturn) {
1174 rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);
1175 pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, false);
1176 }
1035 return found; 1177 return found;
1036} 1178}
1037 1179
@@ -1178,6 +1320,7 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo,
1178 1320
1179 list_for_each_entry(lseg, &lo->plh_segs, pls_list) { 1321 list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
1180 if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) && 1322 if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
1323 !test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags) &&
1181 pnfs_lseg_range_match(&lseg->pls_range, range)) { 1324 pnfs_lseg_range_match(&lseg->pls_range, range)) {
1182 ret = pnfs_get_lseg(lseg); 1325 ret = pnfs_get_lseg(lseg);
1183 break; 1326 break;
@@ -1266,6 +1409,35 @@ static bool pnfs_within_mdsthreshold(struct nfs_open_context *ctx,
1266 return ret; 1409 return ret;
1267} 1410}
1268 1411
 1412/* stop waiting if someone clears the NFS_LAYOUT_RETRY_LAYOUTGET bit. */
1413static int pnfs_layoutget_retry_bit_wait(struct wait_bit_key *key)
1414{
1415 if (!test_bit(NFS_LAYOUT_RETRY_LAYOUTGET, key->flags))
1416 return 1;
1417 return nfs_wait_bit_killable(key);
1418}
1419
1420static bool pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo)
1421{
1422 /*
 1423 * send layoutcommit first, as an outstanding layoutcommit can hold
 1424 * up layoutreturn by pinning an lseg reference
1425 */
1426 pnfs_layoutcommit_inode(lo->plh_inode, false);
1427 return !wait_on_bit_action(&lo->plh_flags, NFS_LAYOUT_RETURN,
1428 pnfs_layoutget_retry_bit_wait,
1429 TASK_UNINTERRUPTIBLE);
1430}
1431
1432static void pnfs_clear_first_layoutget(struct pnfs_layout_hdr *lo)
1433{
1434 unsigned long *bitlock = &lo->plh_flags;
1435
1436 clear_bit_unlock(NFS_LAYOUT_FIRST_LAYOUTGET, bitlock);
1437 smp_mb__after_atomic();
1438 wake_up_bit(bitlock, NFS_LAYOUT_FIRST_LAYOUTGET);
1439}
1440
1269/* 1441/*
 1270 * Layout segment is retrieved from the server if not cached. 1442 * Layout segment is retrieved from the server if not cached.
1271 * The appropriate layout segment is referenced and returned to the caller. 1443 * The appropriate layout segment is referenced and returned to the caller.
@@ -1296,6 +1468,8 @@ pnfs_update_layout(struct inode *ino,
1296 if (pnfs_within_mdsthreshold(ctx, ino, iomode)) 1468 if (pnfs_within_mdsthreshold(ctx, ino, iomode))
1297 goto out; 1469 goto out;
1298 1470
1471lookup_again:
1472 first = false;
1299 spin_lock(&ino->i_lock); 1473 spin_lock(&ino->i_lock);
1300 lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags); 1474 lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
1301 if (lo == NULL) { 1475 if (lo == NULL) {
@@ -1310,27 +1484,62 @@ pnfs_update_layout(struct inode *ino,
1310 } 1484 }
1311 1485
1312 /* if LAYOUTGET already failed once we don't try again */ 1486 /* if LAYOUTGET already failed once we don't try again */
1313 if (pnfs_layout_io_test_failed(lo, iomode)) 1487 if (pnfs_layout_io_test_failed(lo, iomode) &&
1488 !pnfs_should_retry_layoutget(lo))
1314 goto out_unlock; 1489 goto out_unlock;
1315 1490
1316 /* Check to see if the layout for the given range already exists */ 1491 first = list_empty(&lo->plh_segs);
1317 lseg = pnfs_find_lseg(lo, &arg); 1492 if (first) {
1318 if (lseg) 1493 /* The first layoutget for the file. Need to serialize per
1319 goto out_unlock; 1494 * RFC 5661 Errata 3208.
1495 */
1496 if (test_and_set_bit(NFS_LAYOUT_FIRST_LAYOUTGET,
1497 &lo->plh_flags)) {
1498 spin_unlock(&ino->i_lock);
1499 wait_on_bit(&lo->plh_flags, NFS_LAYOUT_FIRST_LAYOUTGET,
1500 TASK_UNINTERRUPTIBLE);
1501 pnfs_put_layout_hdr(lo);
1502 goto lookup_again;
1503 }
1504 } else {
1505 /* Check to see if the layout for the given range
1506 * already exists
1507 */
1508 lseg = pnfs_find_lseg(lo, &arg);
1509 if (lseg)
1510 goto out_unlock;
1511 }
1512
1513 /*
1514 * Because we free lsegs before sending LAYOUTRETURN, we need to wait
1515 * for LAYOUTRETURN even if first is true.
1516 */
1517 if (!lseg && pnfs_should_retry_layoutget(lo) &&
1518 test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
1519 spin_unlock(&ino->i_lock);
1520 dprintk("%s wait for layoutreturn\n", __func__);
1521 if (pnfs_prepare_to_retry_layoutget(lo)) {
1522 if (first)
1523 pnfs_clear_first_layoutget(lo);
1524 pnfs_put_layout_hdr(lo);
1525 dprintk("%s retrying\n", __func__);
1526 goto lookup_again;
1527 }
1528 goto out_put_layout_hdr;
1529 }
1320 1530
1321 if (pnfs_layoutgets_blocked(lo, 0)) 1531 if (pnfs_layoutgets_blocked(lo, &arg, 0))
1322 goto out_unlock; 1532 goto out_unlock;
1323 atomic_inc(&lo->plh_outstanding); 1533 atomic_inc(&lo->plh_outstanding);
1324
1325 first = list_empty(&lo->plh_layouts) ? true : false;
1326 spin_unlock(&ino->i_lock); 1534 spin_unlock(&ino->i_lock);
1327 1535
1328 if (first) { 1536 if (list_empty(&lo->plh_layouts)) {
1329 /* The lo must be on the clp list if there is any 1537 /* The lo must be on the clp list if there is any
1330 * chance of a CB_LAYOUTRECALL(FILE) coming in. 1538 * chance of a CB_LAYOUTRECALL(FILE) coming in.
1331 */ 1539 */
1332 spin_lock(&clp->cl_lock); 1540 spin_lock(&clp->cl_lock);
1333 list_add_tail(&lo->plh_layouts, &server->layouts); 1541 if (list_empty(&lo->plh_layouts))
1542 list_add_tail(&lo->plh_layouts, &server->layouts);
1334 spin_unlock(&clp->cl_lock); 1543 spin_unlock(&clp->cl_lock);
1335 } 1544 }
1336 1545
@@ -1343,8 +1552,11 @@ pnfs_update_layout(struct inode *ino,
1343 arg.length = PAGE_CACHE_ALIGN(arg.length); 1552 arg.length = PAGE_CACHE_ALIGN(arg.length);
1344 1553
1345 lseg = send_layoutget(lo, ctx, &arg, gfp_flags); 1554 lseg = send_layoutget(lo, ctx, &arg, gfp_flags);
1555 pnfs_clear_retry_layoutget(lo);
1346 atomic_dec(&lo->plh_outstanding); 1556 atomic_dec(&lo->plh_outstanding);
1347out_put_layout_hdr: 1557out_put_layout_hdr:
1558 if (first)
1559 pnfs_clear_first_layoutget(lo);
1348 pnfs_put_layout_hdr(lo); 1560 pnfs_put_layout_hdr(lo);
1349out: 1561out:
1350 dprintk("%s: inode %s/%llu pNFS layout segment %s for " 1562 dprintk("%s: inode %s/%llu pNFS layout segment %s for "
@@ -1393,7 +1605,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
1393 goto out_forget_reply; 1605 goto out_forget_reply;
1394 } 1606 }
1395 1607
1396 if (pnfs_layoutgets_blocked(lo, 1)) { 1608 if (pnfs_layoutgets_blocked(lo, &lgp->args.range, 1)) {
1397 dprintk("%s forget reply due to state\n", __func__); 1609 dprintk("%s forget reply due to state\n", __func__);
1398 goto out_forget_reply; 1610 goto out_forget_reply;
1399 } 1611 }
@@ -1440,24 +1652,79 @@ out_forget_reply:
1440 goto out; 1652 goto out;
1441} 1653}
1442 1654
1655static void
1656pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
1657 struct list_head *tmp_list,
1658 struct pnfs_layout_range *return_range)
1659{
1660 struct pnfs_layout_segment *lseg, *next;
1661
1662 dprintk("%s:Begin lo %p\n", __func__, lo);
1663
1664 if (list_empty(&lo->plh_segs))
1665 return;
1666
1667 list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
1668 if (should_free_lseg(&lseg->pls_range, return_range)) {
1669 dprintk("%s: marking lseg %p iomode %d "
1670 "offset %llu length %llu\n", __func__,
1671 lseg, lseg->pls_range.iomode,
1672 lseg->pls_range.offset,
1673 lseg->pls_range.length);
1674 set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
1675 mark_lseg_invalid(lseg, tmp_list);
1676 }
1677}
1678
1679void pnfs_error_mark_layout_for_return(struct inode *inode,
1680 struct pnfs_layout_segment *lseg)
1681{
1682 struct pnfs_layout_hdr *lo = NFS_I(inode)->layout;
1683 int iomode = pnfs_iomode_to_fail_bit(lseg->pls_range.iomode);
1684 struct pnfs_layout_range range = {
1685 .iomode = lseg->pls_range.iomode,
1686 .offset = 0,
1687 .length = NFS4_MAX_UINT64,
1688 };
1689 LIST_HEAD(free_me);
1690
1691 spin_lock(&inode->i_lock);
1692 /* set failure bit so that pnfs path will be retried later */
1693 pnfs_layout_set_fail_bit(lo, iomode);
1694 set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags);
1695 if (lo->plh_return_iomode == 0)
1696 lo->plh_return_iomode = range.iomode;
1697 else if (lo->plh_return_iomode != range.iomode)
1698 lo->plh_return_iomode = IOMODE_ANY;
1699 /*
1700 * mark all matching lsegs so that we are sure to have no live
1701 * segments at hand when sending layoutreturn. See pnfs_put_lseg()
1702 * for how it works.
1703 */
1704 pnfs_mark_matching_lsegs_return(lo, &free_me, &range);
1705 spin_unlock(&inode->i_lock);
1706 pnfs_free_lseg_list(&free_me);
1707}
1708EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return);
1709
1443void 1710void
1444pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) 1711pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
1445{ 1712{
1446 u64 rd_size = req->wb_bytes; 1713 u64 rd_size = req->wb_bytes;
1447 1714
1448 WARN_ON_ONCE(pgio->pg_lseg != NULL); 1715 if (pgio->pg_lseg == NULL) {
1449 1716 if (pgio->pg_dreq == NULL)
1450 if (pgio->pg_dreq == NULL) 1717 rd_size = i_size_read(pgio->pg_inode) - req_offset(req);
1451 rd_size = i_size_read(pgio->pg_inode) - req_offset(req); 1718 else
1452 else 1719 rd_size = nfs_dreq_bytes_left(pgio->pg_dreq);
1453 rd_size = nfs_dreq_bytes_left(pgio->pg_dreq); 1720
1454 1721 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
1455 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, 1722 req->wb_context,
1456 req->wb_context, 1723 req_offset(req),
1457 req_offset(req), 1724 rd_size,
1458 rd_size, 1725 IOMODE_READ,
1459 IOMODE_READ, 1726 GFP_KERNEL);
1460 GFP_KERNEL); 1727 }
1461 /* If no lseg, fall back to read through mds */ 1728 /* If no lseg, fall back to read through mds */
1462 if (pgio->pg_lseg == NULL) 1729 if (pgio->pg_lseg == NULL)
1463 nfs_pageio_reset_read_mds(pgio); 1730 nfs_pageio_reset_read_mds(pgio);
@@ -1469,27 +1736,36 @@ void
1469pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, 1736pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
1470 struct nfs_page *req, u64 wb_size) 1737 struct nfs_page *req, u64 wb_size)
1471{ 1738{
1472 WARN_ON_ONCE(pgio->pg_lseg != NULL); 1739 if (pgio->pg_lseg == NULL)
1473 1740 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
1474 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, 1741 req->wb_context,
1475 req->wb_context, 1742 req_offset(req),
1476 req_offset(req), 1743 wb_size,
1477 wb_size, 1744 IOMODE_RW,
1478 IOMODE_RW, 1745 GFP_NOFS);
1479 GFP_NOFS);
1480 /* If no lseg, fall back to write through mds */ 1746 /* If no lseg, fall back to write through mds */
1481 if (pgio->pg_lseg == NULL) 1747 if (pgio->pg_lseg == NULL)
1482 nfs_pageio_reset_write_mds(pgio); 1748 nfs_pageio_reset_write_mds(pgio);
1483} 1749}
1484EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write); 1750EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write);
1485 1751
1752void
1753pnfs_generic_pg_cleanup(struct nfs_pageio_descriptor *desc)
1754{
1755 if (desc->pg_lseg) {
1756 pnfs_put_lseg(desc->pg_lseg);
1757 desc->pg_lseg = NULL;
1758 }
1759}
1760EXPORT_SYMBOL_GPL(pnfs_generic_pg_cleanup);
1761
1486/* 1762/*
1487 * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number 1763 * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
1488 * of bytes (maximum @req->wb_bytes) that can be coalesced. 1764 * of bytes (maximum @req->wb_bytes) that can be coalesced.
1489 */ 1765 */
1490size_t 1766size_t
1491pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, 1767pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio,
1492 struct nfs_page *req) 1768 struct nfs_page *prev, struct nfs_page *req)
1493{ 1769{
1494 unsigned int size; 1770 unsigned int size;
1495 u64 seg_end, req_start, seg_left; 1771 u64 seg_end, req_start, seg_left;
@@ -1513,10 +1789,16 @@ pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
1513 seg_end = end_offset(pgio->pg_lseg->pls_range.offset, 1789 seg_end = end_offset(pgio->pg_lseg->pls_range.offset,
1514 pgio->pg_lseg->pls_range.length); 1790 pgio->pg_lseg->pls_range.length);
1515 req_start = req_offset(req); 1791 req_start = req_offset(req);
1516 WARN_ON_ONCE(req_start > seg_end); 1792 WARN_ON_ONCE(req_start >= seg_end);
1517 /* start of request is past the last byte of this segment */ 1793 /* start of request is past the last byte of this segment */
1518 if (req_start >= seg_end) 1794 if (req_start >= seg_end) {
1795 /* reference the new lseg */
1796 if (pgio->pg_ops->pg_cleanup)
1797 pgio->pg_ops->pg_cleanup(pgio);
1798 if (pgio->pg_ops->pg_init)
1799 pgio->pg_ops->pg_init(pgio, req);
1519 return 0; 1800 return 0;
1801 }
1520 1802
1521 /* adjust 'size' iff there are fewer bytes left in the 1803 /* adjust 'size' iff there are fewer bytes left in the
1522 * segment than what nfs_generic_pg_test returned */ 1804 * segment than what nfs_generic_pg_test returned */
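
A worked boundary case for the re-init added above, with illustrative numbers: if the cached lseg covers [0, 1 MiB), then

	seg_end   = end_offset(0, 1 MiB) = 1 MiB
	req_start = 1 MiB >= seg_end  ->  pg_cleanup + pg_init, return 0

so instead of merely returning 0, pg_cleanup drops the stale lseg reference and pg_init runs the layout lookup for the new offset; the 0 return still makes the caller flush what it has coalesced, after which the request is retried under the fresh segment.
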
@@ -1571,10 +1853,12 @@ static void
1571pnfs_write_through_mds(struct nfs_pageio_descriptor *desc, 1853pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,
1572 struct nfs_pgio_header *hdr) 1854 struct nfs_pgio_header *hdr)
1573{ 1855{
1856 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
1857
1574 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { 1858 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
1575 list_splice_tail_init(&hdr->pages, &desc->pg_list); 1859 list_splice_tail_init(&hdr->pages, &mirror->pg_list);
1576 nfs_pageio_reset_write_mds(desc); 1860 nfs_pageio_reset_write_mds(desc);
1577 desc->pg_recoalesce = 1; 1861 mirror->pg_recoalesce = 1;
1578 } 1862 }
1579 nfs_pgio_data_destroy(hdr); 1863 nfs_pgio_data_destroy(hdr);
1580} 1864}
@@ -1608,11 +1892,9 @@ pnfs_do_write(struct nfs_pageio_descriptor *desc,
1608 struct pnfs_layout_segment *lseg = desc->pg_lseg; 1892 struct pnfs_layout_segment *lseg = desc->pg_lseg;
1609 enum pnfs_try_status trypnfs; 1893 enum pnfs_try_status trypnfs;
1610 1894
1611 desc->pg_lseg = NULL;
1612 trypnfs = pnfs_try_to_write_data(hdr, call_ops, lseg, how); 1895 trypnfs = pnfs_try_to_write_data(hdr, call_ops, lseg, how);
1613 if (trypnfs == PNFS_NOT_ATTEMPTED) 1896 if (trypnfs == PNFS_NOT_ATTEMPTED)
1614 pnfs_write_through_mds(desc, hdr); 1897 pnfs_write_through_mds(desc, hdr);
1615 pnfs_put_lseg(lseg);
1616} 1898}
1617 1899
1618static void pnfs_writehdr_free(struct nfs_pgio_header *hdr) 1900static void pnfs_writehdr_free(struct nfs_pgio_header *hdr)
@@ -1625,24 +1907,23 @@ EXPORT_SYMBOL_GPL(pnfs_writehdr_free);
1625int 1907int
1626pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) 1908pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
1627{ 1909{
1910 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
1911
1628 struct nfs_pgio_header *hdr; 1912 struct nfs_pgio_header *hdr;
1629 int ret; 1913 int ret;
1630 1914
1631 hdr = nfs_pgio_header_alloc(desc->pg_rw_ops); 1915 hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
1632 if (!hdr) { 1916 if (!hdr) {
1633 desc->pg_completion_ops->error_cleanup(&desc->pg_list); 1917 desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
1634 pnfs_put_lseg(desc->pg_lseg);
1635 desc->pg_lseg = NULL;
1636 return -ENOMEM; 1918 return -ENOMEM;
1637 } 1919 }
1638 nfs_pgheader_init(desc, hdr, pnfs_writehdr_free); 1920 nfs_pgheader_init(desc, hdr, pnfs_writehdr_free);
1921
1639 hdr->lseg = pnfs_get_lseg(desc->pg_lseg); 1922 hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
1640 ret = nfs_generic_pgio(desc, hdr); 1923 ret = nfs_generic_pgio(desc, hdr);
1641 if (ret != 0) { 1924 if (!ret)
1642 pnfs_put_lseg(desc->pg_lseg);
1643 desc->pg_lseg = NULL;
1644 } else
1645 pnfs_do_write(desc, hdr, desc->pg_ioflags); 1925 pnfs_do_write(desc, hdr, desc->pg_ioflags);
1926
1646 return ret; 1927 return ret;
1647} 1928}
1648EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages); 1929EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);
@@ -1687,10 +1968,12 @@ static void
1687pnfs_read_through_mds(struct nfs_pageio_descriptor *desc, 1968pnfs_read_through_mds(struct nfs_pageio_descriptor *desc,
1688 struct nfs_pgio_header *hdr) 1969 struct nfs_pgio_header *hdr)
1689{ 1970{
1971 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
1972
1690 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { 1973 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
1691 list_splice_tail_init(&hdr->pages, &desc->pg_list); 1974 list_splice_tail_init(&hdr->pages, &mirror->pg_list);
1692 nfs_pageio_reset_read_mds(desc); 1975 nfs_pageio_reset_read_mds(desc);
1693 desc->pg_recoalesce = 1; 1976 mirror->pg_recoalesce = 1;
1694 } 1977 }
1695 nfs_pgio_data_destroy(hdr); 1978 nfs_pgio_data_destroy(hdr);
1696} 1979}
@@ -1719,18 +2002,29 @@ pnfs_try_to_read_data(struct nfs_pgio_header *hdr,
1719 return trypnfs; 2002 return trypnfs;
1720} 2003}
1721 2004
2005/* Resend all requests through pnfs. */
2006int pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr)
2007{
2008 struct nfs_pageio_descriptor pgio;
2009
2010 nfs_pageio_init_read(&pgio, hdr->inode, false, hdr->completion_ops);
2011 return nfs_pageio_resend(&pgio, hdr);
2012}
2013EXPORT_SYMBOL_GPL(pnfs_read_resend_pnfs);
2014
1722static void 2015static void
1723pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr) 2016pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr)
1724{ 2017{
1725 const struct rpc_call_ops *call_ops = desc->pg_rpc_callops; 2018 const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
1726 struct pnfs_layout_segment *lseg = desc->pg_lseg; 2019 struct pnfs_layout_segment *lseg = desc->pg_lseg;
1727 enum pnfs_try_status trypnfs; 2020 enum pnfs_try_status trypnfs;
2021 int err = 0;
1728 2022
1729 desc->pg_lseg = NULL;
1730 trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg); 2023 trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg);
1731 if (trypnfs == PNFS_NOT_ATTEMPTED) 2024 if (trypnfs == PNFS_TRY_AGAIN)
2025 err = pnfs_read_resend_pnfs(hdr);
2026 if (trypnfs == PNFS_NOT_ATTEMPTED || err)
1732 pnfs_read_through_mds(desc, hdr); 2027 pnfs_read_through_mds(desc, hdr);
1733 pnfs_put_lseg(lseg);
1734} 2028}
1735 2029
1736static void pnfs_readhdr_free(struct nfs_pgio_header *hdr) 2030static void pnfs_readhdr_free(struct nfs_pgio_header *hdr)
@@ -1743,24 +2037,20 @@ EXPORT_SYMBOL_GPL(pnfs_readhdr_free);
1743int 2037int
1744pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) 2038pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
1745{ 2039{
2040 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
2041
1746 struct nfs_pgio_header *hdr; 2042 struct nfs_pgio_header *hdr;
1747 int ret; 2043 int ret;
1748 2044
1749 hdr = nfs_pgio_header_alloc(desc->pg_rw_ops); 2045 hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
1750 if (!hdr) { 2046 if (!hdr) {
1751 desc->pg_completion_ops->error_cleanup(&desc->pg_list); 2047 desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
1752 ret = -ENOMEM; 2048 return -ENOMEM;
1753 pnfs_put_lseg(desc->pg_lseg);
1754 desc->pg_lseg = NULL;
1755 return ret;
1756 } 2049 }
1757 nfs_pgheader_init(desc, hdr, pnfs_readhdr_free); 2050 nfs_pgheader_init(desc, hdr, pnfs_readhdr_free);
1758 hdr->lseg = pnfs_get_lseg(desc->pg_lseg); 2051 hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
1759 ret = nfs_generic_pgio(desc, hdr); 2052 ret = nfs_generic_pgio(desc, hdr);
1760 if (ret != 0) { 2053 if (!ret)
1761 pnfs_put_lseg(desc->pg_lseg);
1762 desc->pg_lseg = NULL;
1763 } else
1764 pnfs_do_read(desc, hdr); 2054 pnfs_do_read(desc, hdr);
1765 return ret; 2055 return ret;
1766} 2056}
@@ -1966,6 +2256,7 @@ clear_layoutcommitting:
1966 pnfs_clear_layoutcommitting(inode); 2256 pnfs_clear_layoutcommitting(inode);
1967 goto out; 2257 goto out;
1968} 2258}
2259EXPORT_SYMBOL_GPL(pnfs_layoutcommit_inode);
1969 2260
1970struct nfs4_threshold *pnfs_mdsthreshold_alloc(void) 2261struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
1971{ 2262{
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 9ae5b765b073..635f0865671c 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -38,6 +38,25 @@ enum {
38 NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */ 38 NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */
39 NFS_LSEG_ROC, /* roc bit received from server */ 39 NFS_LSEG_ROC, /* roc bit received from server */
40 NFS_LSEG_LAYOUTCOMMIT, /* layoutcommit bit set for layoutcommit */ 40 NFS_LSEG_LAYOUTCOMMIT, /* layoutcommit bit set for layoutcommit */
41 NFS_LSEG_LAYOUTRETURN, /* layoutreturn bit set for layoutreturn */
42};
43
44/* Individual ip address */
45struct nfs4_pnfs_ds_addr {
46 struct sockaddr_storage da_addr;
47 size_t da_addrlen;
48 struct list_head da_node; /* nfs4_pnfs_dev_hlist dev_dslist */
49 char *da_remotestr; /* human readable addr+port */
50};
51
52struct nfs4_pnfs_ds {
53 struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */
54 char *ds_remotestr; /* comma sep list of addrs */
55 struct list_head ds_addrs;
56 struct nfs_client *ds_clp;
57 atomic_t ds_count;
58 unsigned long ds_state;
59#define NFS4DS_CONNECTING 0 /* ds is establishing connection */
41}; 60};
42 61
43struct pnfs_layout_segment { 62struct pnfs_layout_segment {
@@ -53,19 +72,34 @@ struct pnfs_layout_segment {
53enum pnfs_try_status { 72enum pnfs_try_status {
54 PNFS_ATTEMPTED = 0, 73 PNFS_ATTEMPTED = 0,
55 PNFS_NOT_ATTEMPTED = 1, 74 PNFS_NOT_ATTEMPTED = 1,
75 PNFS_TRY_AGAIN = 2,
56}; 76};
57 77
58#ifdef CONFIG_NFS_V4_1 78#ifdef CONFIG_NFS_V4_1
59 79
60#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4" 80#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4"
61 81
82/*
 83 * Default data server connection timeout and retrans values.
84 * Set by module parameters dataserver_timeo and dataserver_retrans.
85 */
86#define NFS4_DEF_DS_TIMEO 600 /* in tenths of a second */
87#define NFS4_DEF_DS_RETRANS 5
88
89/* error codes for internal use */
90#define NFS4ERR_RESET_TO_MDS 12001
91#define NFS4ERR_RESET_TO_PNFS 12002
92
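
Both values sit deliberately outside the RFC 5661 error space: they are client-internal and never go on the wire. The intent is for a layout driver's error handler to map real RPC/I/O failures onto one of them to pick a resend strategy; a hypothetical mapping (not part of this patch):

	/* sketch: driver-internal policy, names illustrative */
	if (ds_is_retryable(err))
		return NFS4ERR_RESET_TO_PNFS;	/* retry via pNFS (another DS/mirror) */
	return NFS4ERR_RESET_TO_MDS;		/* give up on pNFS, use the MDS */
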
62enum { 93enum {
63 NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */ 94 NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */
64 NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */ 95 NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */
65 NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */ 96 NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */
66 NFS_LAYOUT_ROC, /* some lseg had roc bit set */ 97 NFS_LAYOUT_ROC, /* some lseg had roc bit set */
67 NFS_LAYOUT_RETURN, /* Return this layout ASAP */ 98 NFS_LAYOUT_RETURN, /* Return this layout ASAP */
99 NFS_LAYOUT_RETURN_BEFORE_CLOSE, /* Return this layout before close */
68 NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */ 100 NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */
101 NFS_LAYOUT_FIRST_LAYOUTGET, /* Serialize first layoutget */
102 NFS_LAYOUT_RETRY_LAYOUTGET, /* Retry layoutget */
69}; 103};
70 104
71enum layoutdriver_policy_flags { 105enum layoutdriver_policy_flags {
@@ -106,7 +140,8 @@ struct pnfs_layoutdriver_type {
106 struct pnfs_ds_commit_info *(*get_ds_info) (struct inode *inode); 140 struct pnfs_ds_commit_info *(*get_ds_info) (struct inode *inode);
107 void (*mark_request_commit) (struct nfs_page *req, 141 void (*mark_request_commit) (struct nfs_page *req,
108 struct pnfs_layout_segment *lseg, 142 struct pnfs_layout_segment *lseg,
109 struct nfs_commit_info *cinfo); 143 struct nfs_commit_info *cinfo,
144 u32 ds_commit_idx);
110 void (*clear_request_commit) (struct nfs_page *req, 145 void (*clear_request_commit) (struct nfs_page *req,
111 struct nfs_commit_info *cinfo); 146 struct nfs_commit_info *cinfo);
112 int (*scan_commit_lists) (struct nfs_commit_info *cinfo, 147 int (*scan_commit_lists) (struct nfs_commit_info *cinfo,
@@ -154,6 +189,7 @@ struct pnfs_layout_hdr {
154 u32 plh_barrier; /* ignore lower seqids */ 189 u32 plh_barrier; /* ignore lower seqids */
155 unsigned long plh_retry_timestamp; 190 unsigned long plh_retry_timestamp;
156 unsigned long plh_flags; 191 unsigned long plh_flags;
192 enum pnfs_iomode plh_return_iomode;
157 loff_t plh_lwb; /* last write byte for layoutcommit */ 193 loff_t plh_lwb; /* last write byte for layoutcommit */
158 struct rpc_cred *plh_lc_cred; /* layoutcommit cred */ 194 struct rpc_cred *plh_lc_cred; /* layoutcommit cred */
159 struct inode *plh_inode; 195 struct inode *plh_inode;
@@ -185,7 +221,7 @@ extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
185 struct pnfs_device *dev, 221 struct pnfs_device *dev,
186 struct rpc_cred *cred); 222 struct rpc_cred *cred);
187extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags); 223extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags);
188extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp); 224extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync);
189 225
190/* pnfs.c */ 226/* pnfs.c */
191void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo); 227void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo);
@@ -198,6 +234,7 @@ void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *
198int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc); 234int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc);
199void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, 235void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
200 struct nfs_page *req, u64 wb_size); 236 struct nfs_page *req, u64 wb_size);
237void pnfs_generic_pg_cleanup(struct nfs_pageio_descriptor *);
201int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc); 238int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc);
202size_t pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, 239size_t pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio,
203 struct nfs_page *prev, struct nfs_page *req); 240 struct nfs_page *prev, struct nfs_page *req);
@@ -217,6 +254,7 @@ void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
217 bool update_barrier); 254 bool update_barrier);
218int pnfs_choose_layoutget_stateid(nfs4_stateid *dst, 255int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
219 struct pnfs_layout_hdr *lo, 256 struct pnfs_layout_hdr *lo,
257 struct pnfs_layout_range *range,
220 struct nfs4_state *open_state); 258 struct nfs4_state *open_state);
221int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, 259int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
222 struct list_head *tmp_list, 260 struct list_head *tmp_list,
@@ -233,17 +271,21 @@ int _pnfs_return_layout(struct inode *);
233int pnfs_commit_and_return_layout(struct inode *); 271int pnfs_commit_and_return_layout(struct inode *);
234void pnfs_ld_write_done(struct nfs_pgio_header *); 272void pnfs_ld_write_done(struct nfs_pgio_header *);
235void pnfs_ld_read_done(struct nfs_pgio_header *); 273void pnfs_ld_read_done(struct nfs_pgio_header *);
274int pnfs_read_resend_pnfs(struct nfs_pgio_header *);
236struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, 275struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
237 struct nfs_open_context *ctx, 276 struct nfs_open_context *ctx,
238 loff_t pos, 277 loff_t pos,
239 u64 count, 278 u64 count,
240 enum pnfs_iomode iomode, 279 enum pnfs_iomode iomode,
241 gfp_t gfp_flags); 280 gfp_t gfp_flags);
281void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo);
242 282
243void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp); 283void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp);
244int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *); 284int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *);
245int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *); 285int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *);
246struct nfs4_threshold *pnfs_mdsthreshold_alloc(void); 286struct nfs4_threshold *pnfs_mdsthreshold_alloc(void);
287void pnfs_error_mark_layout_for_return(struct inode *inode,
288 struct pnfs_layout_segment *lseg);
247 289
248/* nfs4_deviceid_flags */ 290/* nfs4_deviceid_flags */
249enum { 291enum {
@@ -275,6 +317,43 @@ void nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node);
275bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node); 317bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node);
276void nfs4_deviceid_purge_client(const struct nfs_client *); 318void nfs4_deviceid_purge_client(const struct nfs_client *);
277 319
320/* pnfs_nfs.c */
321void pnfs_generic_clear_request_commit(struct nfs_page *req,
322 struct nfs_commit_info *cinfo);
323void pnfs_generic_commit_release(void *calldata);
324void pnfs_generic_prepare_to_resend_writes(struct nfs_commit_data *data);
325void pnfs_generic_rw_release(void *data);
326void pnfs_generic_recover_commit_reqs(struct list_head *dst,
327 struct nfs_commit_info *cinfo);
328int pnfs_generic_commit_pagelist(struct inode *inode,
329 struct list_head *mds_pages,
330 int how,
331 struct nfs_commit_info *cinfo,
332 int (*initiate_commit)(struct nfs_commit_data *data,
333 int how));
334int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo, int max);
335void pnfs_generic_write_commit_done(struct rpc_task *task, void *data);
336void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds);
337struct nfs4_pnfs_ds *nfs4_pnfs_ds_add(struct list_head *dsaddrs,
338 gfp_t gfp_flags);
339void nfs4_pnfs_v3_ds_connect_unload(void);
340void nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds,
341 struct nfs4_deviceid_node *devid, unsigned int timeo,
342 unsigned int retrans, u32 version, u32 minor_version,
343 rpc_authflavor_t au_flavor);
344struct nfs4_pnfs_ds_addr *nfs4_decode_mp_ds_addr(struct net *net,
345 struct xdr_stream *xdr,
346 gfp_t gfp_flags);
347void pnfs_layout_mark_request_commit(struct nfs_page *req,
348 struct pnfs_layout_segment *lseg,
349 struct nfs_commit_info *cinfo,
350 u32 ds_commit_idx);
351
352static inline bool nfs_have_layout(struct inode *inode)
353{
354 return NFS_I(inode)->layout != NULL;
355}
356
278static inline struct nfs4_deviceid_node * 357static inline struct nfs4_deviceid_node *
279nfs4_get_deviceid(struct nfs4_deviceid_node *d) 358nfs4_get_deviceid(struct nfs4_deviceid_node *d)
280{ 359{
@@ -282,6 +361,26 @@ nfs4_get_deviceid(struct nfs4_deviceid_node *d)
282 return d; 361 return d;
283} 362}
284 363
364static inline void pnfs_set_retry_layoutget(struct pnfs_layout_hdr *lo)
365{
366 if (!test_and_set_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags))
367 atomic_inc(&lo->plh_refcount);
368}
369
370static inline void pnfs_clear_retry_layoutget(struct pnfs_layout_hdr *lo)
371{
372 if (test_and_clear_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags)) {
373 atomic_dec(&lo->plh_refcount);
374 /* wake up waiters for LAYOUTRETURN as that is not needed */
375 wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN);
376 }
377}
378
379static inline bool pnfs_should_retry_layoutget(struct pnfs_layout_hdr *lo)
380{
381 return test_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags);
382}
383
285static inline struct pnfs_layout_segment * 384static inline struct pnfs_layout_segment *
286pnfs_get_lseg(struct pnfs_layout_segment *lseg) 385pnfs_get_lseg(struct pnfs_layout_segment *lseg)
287{ 386{
@@ -317,16 +416,22 @@ pnfs_get_ds_info(struct inode *inode)
317 return ld->get_ds_info(inode); 416 return ld->get_ds_info(inode);
318} 417}
319 418
419static inline void
420pnfs_generic_mark_devid_invalid(struct nfs4_deviceid_node *node)
421{
422 set_bit(NFS_DEVICEID_INVALID, &node->flags);
423}
424
320static inline bool 425static inline bool
321pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, 426pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
322 struct nfs_commit_info *cinfo) 427 struct nfs_commit_info *cinfo, u32 ds_commit_idx)
323{ 428{
324 struct inode *inode = req->wb_context->dentry->d_inode; 429 struct inode *inode = req->wb_context->dentry->d_inode;
325 struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; 430 struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
326 431
327 if (lseg == NULL || ld->mark_request_commit == NULL) 432 if (lseg == NULL || ld->mark_request_commit == NULL)
328 return false; 433 return false;
329 ld->mark_request_commit(req, lseg, cinfo); 434 ld->mark_request_commit(req, lseg, cinfo, ds_commit_idx);
330 return true; 435 return true;
331} 436}
332 437
@@ -352,15 +457,6 @@ pnfs_scan_commit_lists(struct inode *inode, struct nfs_commit_info *cinfo,
352 return NFS_SERVER(inode)->pnfs_curr_ld->scan_commit_lists(cinfo, max); 457 return NFS_SERVER(inode)->pnfs_curr_ld->scan_commit_lists(cinfo, max);
353} 458}
354 459
355static inline void
356pnfs_recover_commit_reqs(struct inode *inode, struct list_head *list,
357 struct nfs_commit_info *cinfo)
358{
359 if (cinfo->ds == NULL || cinfo->ds->nwritten == 0)
360 return;
361 NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo);
362}
363
364static inline struct nfs_page * 460static inline struct nfs_page *
365pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo, 461pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo,
366 struct page *page) 462 struct page *page)
@@ -427,6 +523,11 @@ static inline void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id)
427#endif /* NFS_DEBUG */ 523#endif /* NFS_DEBUG */
428#else /* CONFIG_NFS_V4_1 */ 524#else /* CONFIG_NFS_V4_1 */
429 525
526static inline bool nfs_have_layout(struct inode *inode)
527{
528 return false;
529}
530
430static inline void pnfs_destroy_all_layouts(struct nfs_client *clp) 531static inline void pnfs_destroy_all_layouts(struct nfs_client *clp)
431{ 532{
432} 533}
@@ -513,7 +614,7 @@ pnfs_get_ds_info(struct inode *inode)
513 614
514static inline bool 615static inline bool
515pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, 616pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
516 struct nfs_commit_info *cinfo) 617 struct nfs_commit_info *cinfo, u32 ds_commit_idx)
517{ 618{
518 return false; 619 return false;
519} 620}
@@ -531,12 +632,6 @@ pnfs_scan_commit_lists(struct inode *inode, struct nfs_commit_info *cinfo,
531 return 0; 632 return 0;
532} 633}
533 634
534static inline void
535pnfs_recover_commit_reqs(struct inode *inode, struct list_head *list,
536 struct nfs_commit_info *cinfo)
537{
538}
539
540static inline struct nfs_page * 635static inline struct nfs_page *
541pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo, 636pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo,
542 struct page *page) 637 struct page *page)
@@ -568,6 +663,10 @@ static inline struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
568 return NULL; 663 return NULL;
569} 664}
570 665
666static inline void nfs4_pnfs_v3_ds_connect_unload(void)
667{
668}
669
571#endif /* CONFIG_NFS_V4_1 */ 670#endif /* CONFIG_NFS_V4_1 */
572 671
573#endif /* FS_NFS_PNFS_H */ 672#endif /* FS_NFS_PNFS_H */
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
new file mode 100644
index 000000000000..54e36b38fb5f
--- /dev/null
+++ b/fs/nfs/pnfs_nfs.c
@@ -0,0 +1,870 @@
1/*
 2 * Common NFS I/O operations for the pnfs file-based
3 * layout drivers.
4 *
5 * Copyright (c) 2014, Primary Data, Inc. All rights reserved.
6 *
7 * Tom Haynes <loghyr@primarydata.com>
8 */
9
10#include <linux/nfs_fs.h>
11#include <linux/nfs_page.h>
12#include <linux/sunrpc/addr.h>
13#include <linux/module.h>
14
15#include "nfs4session.h"
16#include "internal.h"
17#include "pnfs.h"
18
19#define NFSDBG_FACILITY NFSDBG_PNFS
20
21void pnfs_generic_rw_release(void *data)
22{
23 struct nfs_pgio_header *hdr = data;
24
25 nfs_put_client(hdr->ds_clp);
26 hdr->mds_ops->rpc_release(data);
27}
28EXPORT_SYMBOL_GPL(pnfs_generic_rw_release);
29
30/* Fake up some data that will cause nfs_commit_release to retry the writes. */
31void pnfs_generic_prepare_to_resend_writes(struct nfs_commit_data *data)
32{
33 struct nfs_page *first = nfs_list_entry(data->pages.next);
34
35 data->task.tk_status = 0;
36 memcpy(&data->verf.verifier, &first->wb_verf,
37 sizeof(data->verf.verifier));
38 data->verf.verifier.data[0]++; /* ensure verifier mismatch */
39}
40EXPORT_SYMBOL_GPL(pnfs_generic_prepare_to_resend_writes);
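The verifier bump above works because the generic commit completion path compares each request's cached write verifier against the verifier returned by COMMIT and re-dirties the page on a mismatch. A condensed sketch of the check this function deliberately defeats (hedged and illustrative, not verbatim kernel code):

	/* Sketch of the comparison in the commit completion path: */
	if (memcmp(&req->wb_verf, &data->verf.verifier,
		   sizeof(req->wb_verf)) == 0)
		nfs_inode_remove_request(req);	/* commit was stable; done */
	else
		nfs_mark_request_dirty(req);	/* mismatch: resend the write */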
41
42void pnfs_generic_write_commit_done(struct rpc_task *task, void *data)
43{
44 struct nfs_commit_data *wdata = data;
45
46 /* Note this may cause RPC to be resent */
47 wdata->mds_ops->rpc_call_done(task, data);
48}
49EXPORT_SYMBOL_GPL(pnfs_generic_write_commit_done);
50
51void pnfs_generic_commit_release(void *calldata)
52{
53 struct nfs_commit_data *data = calldata;
54
55 data->completion_ops->completion(data);
56 pnfs_put_lseg(data->lseg);
57 nfs_put_client(data->ds_clp);
58 nfs_commitdata_release(data);
59}
60EXPORT_SYMBOL_GPL(pnfs_generic_commit_release);
61
62/* The generic layer is about to remove the req from the commit list.
63 * If this will make the bucket empty, it will need to put the lseg reference.
64 * Note this must be called holding the inode (/cinfo) lock
65 */
66void
67pnfs_generic_clear_request_commit(struct nfs_page *req,
68 struct nfs_commit_info *cinfo)
69{
70 struct pnfs_layout_segment *freeme = NULL;
71
72 if (!test_and_clear_bit(PG_COMMIT_TO_DS, &req->wb_flags))
73 goto out;
74 cinfo->ds->nwritten--;
75 if (list_is_singular(&req->wb_list)) {
76 struct pnfs_commit_bucket *bucket;
77
78 bucket = list_first_entry(&req->wb_list,
79 struct pnfs_commit_bucket,
80 written);
81 freeme = bucket->wlseg;
82 bucket->wlseg = NULL;
83 }
84out:
85 nfs_request_remove_commit_list(req, cinfo);
86 pnfs_put_lseg_locked(freeme);
87}
88EXPORT_SYMBOL_GPL(pnfs_generic_clear_request_commit);
89
90static int
91pnfs_generic_transfer_commit_list(struct list_head *src, struct list_head *dst,
92 struct nfs_commit_info *cinfo, int max)
93{
94 struct nfs_page *req, *tmp;
95 int ret = 0;
96
97 list_for_each_entry_safe(req, tmp, src, wb_list) {
98 if (!nfs_lock_request(req))
99 continue;
100 kref_get(&req->wb_kref);
101 if (cond_resched_lock(cinfo->lock))
102 list_safe_reset_next(req, tmp, wb_list);
103 nfs_request_remove_commit_list(req, cinfo);
104 clear_bit(PG_COMMIT_TO_DS, &req->wb_flags);
105 nfs_list_add_request(req, dst);
106 ret++;
107 if ((ret == max) && !cinfo->dreq)
108 break;
109 }
110 return ret;
111}
112
113static int
114pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
115 struct nfs_commit_info *cinfo,
116 int max)
117{
118 struct list_head *src = &bucket->written;
119 struct list_head *dst = &bucket->committing;
120 int ret;
121
122 lockdep_assert_held(cinfo->lock);
123 ret = pnfs_generic_transfer_commit_list(src, dst, cinfo, max);
124 if (ret) {
125 cinfo->ds->nwritten -= ret;
126 cinfo->ds->ncommitting += ret;
127 bucket->clseg = bucket->wlseg;
128 if (list_empty(src))
129 bucket->wlseg = NULL;
130 else
131 pnfs_get_lseg(bucket->clseg);
132 }
133 return ret;
134}
135
136/* Move reqs from written to committing lists, returning count
137 * of number moved.
138 */
139int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo,
140 int max)
141{
142 int i, rv = 0, cnt;
143
144 lockdep_assert_held(cinfo->lock);
145 for (i = 0; i < cinfo->ds->nbuckets && max != 0; i++) {
146 cnt = pnfs_generic_scan_ds_commit_list(&cinfo->ds->buckets[i],
147 cinfo, max);
148 max -= cnt;
149 rv += cnt;
150 }
151 return rv;
152}
153EXPORT_SYMBOL_GPL(pnfs_generic_scan_commit_lists);
154
155/* Pull everything off the committing lists and dump into @dst. */
156void pnfs_generic_recover_commit_reqs(struct list_head *dst,
157 struct nfs_commit_info *cinfo)
158{
159 struct pnfs_commit_bucket *b;
160 struct pnfs_layout_segment *freeme;
161 int i;
162
163 lockdep_assert_held(cinfo->lock);
164restart:
165 for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
166 if (pnfs_generic_transfer_commit_list(&b->written, dst,
167 cinfo, 0)) {
168 freeme = b->wlseg;
169 b->wlseg = NULL;
170 spin_unlock(cinfo->lock);
171 pnfs_put_lseg(freeme);
172 spin_lock(cinfo->lock);
173 goto restart;
174 }
175 }
176 cinfo->ds->nwritten = 0;
177}
178EXPORT_SYMBOL_GPL(pnfs_generic_recover_commit_reqs);
179
180static void pnfs_generic_retry_commit(struct nfs_commit_info *cinfo, int idx)
181{
182 struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
183 struct pnfs_commit_bucket *bucket;
184 struct pnfs_layout_segment *freeme;
185 int i;
186
187 for (i = idx; i < fl_cinfo->nbuckets; i++) {
188 bucket = &fl_cinfo->buckets[i];
189 if (list_empty(&bucket->committing))
190 continue;
191 nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo, i);
192 spin_lock(cinfo->lock);
193 freeme = bucket->clseg;
194 bucket->clseg = NULL;
195 spin_unlock(cinfo->lock);
196 pnfs_put_lseg(freeme);
197 }
198}
199
200static unsigned int
201pnfs_generic_alloc_ds_commits(struct nfs_commit_info *cinfo,
202 struct list_head *list)
203{
204 struct pnfs_ds_commit_info *fl_cinfo;
205 struct pnfs_commit_bucket *bucket;
206 struct nfs_commit_data *data;
207 int i;
208 unsigned int nreq = 0;
209
210 fl_cinfo = cinfo->ds;
211 bucket = fl_cinfo->buckets;
212 for (i = 0; i < fl_cinfo->nbuckets; i++, bucket++) {
213 if (list_empty(&bucket->committing))
214 continue;
215 data = nfs_commitdata_alloc();
216 if (!data)
217 break;
218 data->ds_commit_index = i;
219 spin_lock(cinfo->lock);
220 data->lseg = bucket->clseg;
221 bucket->clseg = NULL;
222 spin_unlock(cinfo->lock);
223 list_add(&data->pages, list);
224 nreq++;
225 }
226
227 /* Clean up on error */
228 pnfs_generic_retry_commit(cinfo, i);
229 return nreq;
230}
231
232/* This follows nfs_commit_list pretty closely */
233int
234pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
235 int how, struct nfs_commit_info *cinfo,
236 int (*initiate_commit)(struct nfs_commit_data *data,
237 int how))
238{
239 struct nfs_commit_data *data, *tmp;
240 LIST_HEAD(list);
241 unsigned int nreq = 0;
242
243 if (!list_empty(mds_pages)) {
244 data = nfs_commitdata_alloc();
245 if (data != NULL) {
246 data->lseg = NULL;
247 list_add(&data->pages, &list);
248 nreq++;
249 } else {
250 nfs_retry_commit(mds_pages, NULL, cinfo, 0);
251 pnfs_generic_retry_commit(cinfo, 0);
252 cinfo->completion_ops->error_cleanup(NFS_I(inode));
253 return -ENOMEM;
254 }
255 }
256
257 nreq += pnfs_generic_alloc_ds_commits(cinfo, &list);
258
259 if (nreq == 0) {
260 cinfo->completion_ops->error_cleanup(NFS_I(inode));
261 goto out;
262 }
263
264 atomic_add(nreq, &cinfo->mds->rpcs_out);
265
266 list_for_each_entry_safe(data, tmp, &list, pages) {
267 list_del_init(&data->pages);
268 if (!data->lseg) {
269 nfs_init_commit(data, mds_pages, NULL, cinfo);
270 nfs_initiate_commit(NFS_CLIENT(inode), data,
271 NFS_PROTO(data->inode),
272 data->mds_ops, how, 0);
273 } else {
274 struct pnfs_commit_bucket *buckets;
275
276 buckets = cinfo->ds->buckets;
277 nfs_init_commit(data,
278 &buckets[data->ds_commit_index].committing,
279 data->lseg,
280 cinfo);
281 initiate_commit(data, how);
282 }
283 }
284out:
285 cinfo->ds->ncommitting = 0;
286 return PNFS_ATTEMPTED;
287}
288EXPORT_SYMBOL_GPL(pnfs_generic_commit_pagelist);
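A file-based layout driver is expected to supply only the per-data-server COMMIT initiator and let this helper do the bucket bookkeeping. A minimal sketch of such a caller (the my_layout_* names are hypothetical, not part of this patch):

	static int my_layout_initiate_commit(struct nfs_commit_data *data, int how)
	{
		/* Select the DS from data->ds_commit_index and fire the
		 * COMMIT RPC against it; sketched as immediate success. */
		return 0;
	}

	static int my_layout_commit_pagelist(struct inode *inode,
					     struct list_head *mds_pages,
					     int how, struct nfs_commit_info *cinfo)
	{
		return pnfs_generic_commit_pagelist(inode, mds_pages, how, cinfo,
						    my_layout_initiate_commit);
	}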
289
290/*
291 * Data server cache
292 *
293 * Data servers can be mapped to different device ids.
294 * nfs4_pnfs_ds reference counting
295 * - set to 1 on allocation
296 * - incremented when a device id maps a data server already in the cache.
297 * - decremented when deviceid is removed from the cache.
298 */
299static DEFINE_SPINLOCK(nfs4_ds_cache_lock);
300static LIST_HEAD(nfs4_data_server_cache);
301
302/* Debug routines */
303static void
304print_ds(struct nfs4_pnfs_ds *ds)
305{
306 if (ds == NULL) {
307 printk(KERN_WARNING "%s NULL device\n", __func__);
308 return;
309 }
310 printk(KERN_WARNING " ds %s\n"
311 " ref count %d\n"
312 " client %p\n"
313 " cl_exchange_flags %x\n",
314 ds->ds_remotestr,
315 atomic_read(&ds->ds_count), ds->ds_clp,
316 ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0);
317}
318
319static bool
320same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2)
321{
322 struct sockaddr_in *a, *b;
323 struct sockaddr_in6 *a6, *b6;
324
325 if (addr1->sa_family != addr2->sa_family)
326 return false;
327
328 switch (addr1->sa_family) {
329 case AF_INET:
330 a = (struct sockaddr_in *)addr1;
331 b = (struct sockaddr_in *)addr2;
332
333 if (a->sin_addr.s_addr == b->sin_addr.s_addr &&
334 a->sin_port == b->sin_port)
335 return true;
336 break;
337
338 case AF_INET6:
339 a6 = (struct sockaddr_in6 *)addr1;
340 b6 = (struct sockaddr_in6 *)addr2;
341
342 /* LINKLOCAL addresses must have matching scope_id */
343 if (ipv6_addr_src_scope(&a6->sin6_addr) ==
344 IPV6_ADDR_SCOPE_LINKLOCAL &&
345 a6->sin6_scope_id != b6->sin6_scope_id)
346 return false;
347
348 if (ipv6_addr_equal(&a6->sin6_addr, &b6->sin6_addr) &&
349 a6->sin6_port == b6->sin6_port)
350 return true;
351 break;
352
353 default:
354 dprintk("%s: unhandled address family: %u\n",
355 __func__, addr1->sa_family);
356 return false;
357 }
358
359 return false;
360}
361
362static bool
363_same_data_server_addrs_locked(const struct list_head *dsaddrs1,
364 const struct list_head *dsaddrs2)
365{
366 struct nfs4_pnfs_ds_addr *da1, *da2;
367
368 /* step through both lists, comparing as we go */
369 for (da1 = list_first_entry(dsaddrs1, typeof(*da1), da_node),
370 da2 = list_first_entry(dsaddrs2, typeof(*da2), da_node);
371 da1 != NULL && da2 != NULL;
372 da1 = list_entry(da1->da_node.next, typeof(*da1), da_node),
373 da2 = list_entry(da2->da_node.next, typeof(*da2), da_node)) {
374 if (!same_sockaddr((struct sockaddr *)&da1->da_addr,
375 (struct sockaddr *)&da2->da_addr))
376 return false;
377 }
378 if (da1 == NULL && da2 == NULL)
379 return true;
380
381 return false;
382}
383
384/*
385 * Lookup DS by addresses. nfs4_ds_cache_lock is held
386 */
387static struct nfs4_pnfs_ds *
388_data_server_lookup_locked(const struct list_head *dsaddrs)
389{
390 struct nfs4_pnfs_ds *ds;
391
392 list_for_each_entry(ds, &nfs4_data_server_cache, ds_node)
393 if (_same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs))
394 return ds;
395 return NULL;
396}
397
398static void destroy_ds(struct nfs4_pnfs_ds *ds)
399{
400 struct nfs4_pnfs_ds_addr *da;
401
402 dprintk("--> %s\n", __func__);
403 ifdebug(FACILITY)
404 print_ds(ds);
405
406 nfs_put_client(ds->ds_clp);
407
408 while (!list_empty(&ds->ds_addrs)) {
409 da = list_first_entry(&ds->ds_addrs,
410 struct nfs4_pnfs_ds_addr,
411 da_node);
412 list_del_init(&da->da_node);
413 kfree(da->da_remotestr);
414 kfree(da);
415 }
416
417 kfree(ds->ds_remotestr);
418 kfree(ds);
419}
420
421void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds)
422{
423 if (atomic_dec_and_lock(&ds->ds_count,
424 &nfs4_ds_cache_lock)) {
425 list_del_init(&ds->ds_node);
426 spin_unlock(&nfs4_ds_cache_lock);
427 destroy_ds(ds);
428 }
429}
430EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_put);
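Putting the reference-counting rules above together, a layout driver's device-id code pairs one nfs4_pnfs_ds_add() per device id with one nfs4_pnfs_ds_put() at teardown. A hedged sketch of that lifecycle:

	LIST_HEAD(dsaddrs);
	struct nfs4_pnfs_ds *ds;

	/* decode each multipath address with nfs4_decode_mp_ds_addr()
	 * and chain it onto dsaddrs via da_node, then: */
	ds = nfs4_pnfs_ds_add(&dsaddrs, GFP_KERNEL); /* new entry, or cached + ref */
	if (!ds)
		return -ENOMEM;
	/* ... hand ds to the device id; when the device id is destroyed: */
	nfs4_pnfs_ds_put(ds);	/* final put unhashes and frees the entry */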
431
432/*
433 * Create a string with a human readable address and port to avoid
 434 * complicated setup around many dprintks.
435 */
436static char *
437nfs4_pnfs_remotestr(struct list_head *dsaddrs, gfp_t gfp_flags)
438{
439 struct nfs4_pnfs_ds_addr *da;
440 char *remotestr;
441 size_t len;
442 char *p;
443
 444	len = 3;        /* '{', '}' and terminating NUL */
445 list_for_each_entry(da, dsaddrs, da_node) {
446 len += strlen(da->da_remotestr) + 1; /* string plus comma */
447 }
448
449 remotestr = kzalloc(len, gfp_flags);
450 if (!remotestr)
451 return NULL;
452
453 p = remotestr;
454 *(p++) = '{';
455 len--;
456 list_for_each_entry(da, dsaddrs, da_node) {
457 size_t ll = strlen(da->da_remotestr);
458
459 if (ll > len)
460 goto out_err;
461
462 memcpy(p, da->da_remotestr, ll);
463 p += ll;
464 len -= ll;
465
466 if (len < 1)
467 goto out_err;
468 (*p++) = ',';
469 len--;
470 }
471 if (len < 2)
472 goto out_err;
473 *(p++) = '}';
474 *p = '\0';
475 return remotestr;
476out_err:
477 kfree(remotestr);
478 return NULL;
479}
480
481/*
482 * Given a list of multipath struct nfs4_pnfs_ds_addr, add it to ds cache if
483 * uncached and return cached struct nfs4_pnfs_ds.
484 */
485struct nfs4_pnfs_ds *
486nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags)
487{
488 struct nfs4_pnfs_ds *tmp_ds, *ds = NULL;
489 char *remotestr;
490
491 if (list_empty(dsaddrs)) {
492 dprintk("%s: no addresses defined\n", __func__);
493 goto out;
494 }
495
496 ds = kzalloc(sizeof(*ds), gfp_flags);
497 if (!ds)
498 goto out;
499
 500	/* this is only used for debugging, so it's OK if it's NULL */
501 remotestr = nfs4_pnfs_remotestr(dsaddrs, gfp_flags);
502
503 spin_lock(&nfs4_ds_cache_lock);
504 tmp_ds = _data_server_lookup_locked(dsaddrs);
505 if (tmp_ds == NULL) {
506 INIT_LIST_HEAD(&ds->ds_addrs);
507 list_splice_init(dsaddrs, &ds->ds_addrs);
508 ds->ds_remotestr = remotestr;
509 atomic_set(&ds->ds_count, 1);
510 INIT_LIST_HEAD(&ds->ds_node);
511 ds->ds_clp = NULL;
512 list_add(&ds->ds_node, &nfs4_data_server_cache);
513 dprintk("%s add new data server %s\n", __func__,
514 ds->ds_remotestr);
515 } else {
516 kfree(remotestr);
517 kfree(ds);
518 atomic_inc(&tmp_ds->ds_count);
519 dprintk("%s data server %s found, inc'ed ds_count to %d\n",
520 __func__, tmp_ds->ds_remotestr,
521 atomic_read(&tmp_ds->ds_count));
522 ds = tmp_ds;
523 }
524 spin_unlock(&nfs4_ds_cache_lock);
525out:
526 return ds;
527}
528EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_add);
529
530static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds)
531{
532 might_sleep();
533 wait_on_bit(&ds->ds_state, NFS4DS_CONNECTING,
534 TASK_KILLABLE);
535}
536
537static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds)
538{
539 smp_mb__before_atomic();
540 clear_bit(NFS4DS_CONNECTING, &ds->ds_state);
541 smp_mb__after_atomic();
542 wake_up_bit(&ds->ds_state, NFS4DS_CONNECTING);
543}
544
545static struct nfs_client *(*get_v3_ds_connect)(
546 struct nfs_client *mds_clp,
547 const struct sockaddr *ds_addr,
548 int ds_addrlen,
549 int ds_proto,
550 unsigned int ds_timeo,
551 unsigned int ds_retrans,
552 rpc_authflavor_t au_flavor);
553
554static bool load_v3_ds_connect(void)
555{
556 if (!get_v3_ds_connect) {
557 get_v3_ds_connect = symbol_request(nfs3_set_ds_client);
558 WARN_ON_ONCE(!get_v3_ds_connect);
559 }
560
561 return(get_v3_ds_connect != NULL);
562}
563
564void __exit nfs4_pnfs_v3_ds_connect_unload(void)
565{
566 if (get_v3_ds_connect) {
567 symbol_put(nfs3_set_ds_client);
568 get_v3_ds_connect = NULL;
569 }
570}
571EXPORT_SYMBOL_GPL(nfs4_pnfs_v3_ds_connect_unload);
572
573static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv,
574 struct nfs4_pnfs_ds *ds,
575 unsigned int timeo,
576 unsigned int retrans,
577 rpc_authflavor_t au_flavor)
578{
579 struct nfs_client *clp = ERR_PTR(-EIO);
580 struct nfs4_pnfs_ds_addr *da;
581 int status = 0;
582
583 dprintk("--> %s DS %s au_flavor %d\n", __func__,
584 ds->ds_remotestr, au_flavor);
585
586 if (!load_v3_ds_connect())
587 goto out;
588
589 list_for_each_entry(da, &ds->ds_addrs, da_node) {
590 dprintk("%s: DS %s: trying address %s\n",
591 __func__, ds->ds_remotestr, da->da_remotestr);
592
593 clp = get_v3_ds_connect(mds_srv->nfs_client,
594 (struct sockaddr *)&da->da_addr,
595 da->da_addrlen, IPPROTO_TCP,
596 timeo, retrans, au_flavor);
597 if (!IS_ERR(clp))
598 break;
599 }
600
601 if (IS_ERR(clp)) {
602 status = PTR_ERR(clp);
603 goto out;
604 }
605
606 smp_wmb();
607 ds->ds_clp = clp;
608 dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr);
609out:
610 return status;
611}
612
613static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
614 struct nfs4_pnfs_ds *ds,
615 unsigned int timeo,
616 unsigned int retrans,
617 u32 minor_version,
618 rpc_authflavor_t au_flavor)
619{
620 struct nfs_client *clp = ERR_PTR(-EIO);
621 struct nfs4_pnfs_ds_addr *da;
622 int status = 0;
623
624 dprintk("--> %s DS %s au_flavor %d\n", __func__, ds->ds_remotestr,
625 au_flavor);
626
627 list_for_each_entry(da, &ds->ds_addrs, da_node) {
628 dprintk("%s: DS %s: trying address %s\n",
629 __func__, ds->ds_remotestr, da->da_remotestr);
630
631 clp = nfs4_set_ds_client(mds_srv->nfs_client,
632 (struct sockaddr *)&da->da_addr,
633 da->da_addrlen, IPPROTO_TCP,
634 timeo, retrans, minor_version,
635 au_flavor);
636 if (!IS_ERR(clp))
637 break;
638 }
639
640 if (IS_ERR(clp)) {
641 status = PTR_ERR(clp);
642 goto out;
643 }
644
645 status = nfs4_init_ds_session(clp, mds_srv->nfs_client->cl_lease_time);
646 if (status)
647 goto out_put;
648
649 smp_wmb();
650 ds->ds_clp = clp;
651 dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr);
652out:
653 return status;
654out_put:
655 nfs_put_client(clp);
656 goto out;
657}
658
659/*
660 * Create an rpc connection to the nfs4_pnfs_ds data server.
661 * Currently only supports IPv4 and IPv6 addresses.
662 * If connection fails, make devid unavailable.
663 */
664void nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds,
665 struct nfs4_deviceid_node *devid, unsigned int timeo,
666 unsigned int retrans, u32 version,
667 u32 minor_version, rpc_authflavor_t au_flavor)
668{
669 if (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) == 0) {
670 int err = 0;
671
672 if (version == 3) {
673 err = _nfs4_pnfs_v3_ds_connect(mds_srv, ds, timeo,
674 retrans, au_flavor);
675 } else if (version == 4) {
676 err = _nfs4_pnfs_v4_ds_connect(mds_srv, ds, timeo,
677 retrans, minor_version,
678 au_flavor);
679 } else {
680 dprintk("%s: unsupported DS version %d\n", __func__,
681 version);
682 err = -EPROTONOSUPPORT;
683 }
684
685 if (err)
686 nfs4_mark_deviceid_unavailable(devid);
687 nfs4_clear_ds_conn_bit(ds);
688 } else {
689 nfs4_wait_ds_connect(ds);
690 }
691}
692EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_connect);
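Since the NFS4DS_CONNECTING bit serializes racing callers (losers simply block in nfs4_wait_ds_connect()), a driver may call this unconditionally and test the outcome afterwards. A sketch of the expected calling pattern, using the default timeout parameters defined earlier in pnfs.h (illustrative, not lifted from a particular driver):

	nfs4_pnfs_ds_connect(mds_srv, ds, devid, NFS4_DEF_DS_TIMEO,
			     NFS4_DEF_DS_RETRANS, 3 /* NFSv3 DS */, 0,
			     au_flavor);
	smp_rmb();	/* pairs with the smp_wmb() before ds->ds_clp is set */
	if (!ds->ds_clp || nfs4_test_deviceid_unavailable(devid))
		goto fall_back_to_mds;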
693
694/*
 695 * Currently only supports IPv4, IPv6 and one multi-path address.
696 */
697struct nfs4_pnfs_ds_addr *
698nfs4_decode_mp_ds_addr(struct net *net, struct xdr_stream *xdr, gfp_t gfp_flags)
699{
700 struct nfs4_pnfs_ds_addr *da = NULL;
701 char *buf, *portstr;
702 __be16 port;
703 int nlen, rlen;
704 int tmp[2];
705 __be32 *p;
706 char *netid, *match_netid;
707 size_t len, match_netid_len;
708 char *startsep = "";
709 char *endsep = "";
710
711
712 /* r_netid */
713 p = xdr_inline_decode(xdr, 4);
714 if (unlikely(!p))
715 goto out_err;
716 nlen = be32_to_cpup(p++);
717
718 p = xdr_inline_decode(xdr, nlen);
719 if (unlikely(!p))
720 goto out_err;
721
722 netid = kmalloc(nlen+1, gfp_flags);
723 if (unlikely(!netid))
724 goto out_err;
725
726 netid[nlen] = '\0';
727 memcpy(netid, p, nlen);
728
729 /* r_addr: ip/ip6addr with port in dec octets - see RFC 5665 */
730 p = xdr_inline_decode(xdr, 4);
731 if (unlikely(!p))
732 goto out_free_netid;
733 rlen = be32_to_cpup(p);
734
735 p = xdr_inline_decode(xdr, rlen);
736 if (unlikely(!p))
737 goto out_free_netid;
738
739 /* port is ".ABC.DEF", 8 chars max */
740 if (rlen > INET6_ADDRSTRLEN + IPV6_SCOPE_ID_LEN + 8) {
741 dprintk("%s: Invalid address, length %d\n", __func__,
742 rlen);
743 goto out_free_netid;
744 }
745 buf = kmalloc(rlen + 1, gfp_flags);
746 if (!buf) {
747 dprintk("%s: Not enough memory\n", __func__);
748 goto out_free_netid;
749 }
750 buf[rlen] = '\0';
751 memcpy(buf, p, rlen);
752
753 /* replace port '.' with '-' */
754 portstr = strrchr(buf, '.');
755 if (!portstr) {
756 dprintk("%s: Failed finding expected dot in port\n",
757 __func__);
758 goto out_free_buf;
759 }
760 *portstr = '-';
761
762 /* find '.' between address and port */
763 portstr = strrchr(buf, '.');
764 if (!portstr) {
765 dprintk("%s: Failed finding expected dot between address and "
766 "port\n", __func__);
767 goto out_free_buf;
768 }
769 *portstr = '\0';
770
771 da = kzalloc(sizeof(*da), gfp_flags);
772 if (unlikely(!da))
773 goto out_free_buf;
774
775 INIT_LIST_HEAD(&da->da_node);
776
777 if (!rpc_pton(net, buf, portstr-buf, (struct sockaddr *)&da->da_addr,
778 sizeof(da->da_addr))) {
779 dprintk("%s: error parsing address %s\n", __func__, buf);
780 goto out_free_da;
781 }
782
783 portstr++;
784 sscanf(portstr, "%d-%d", &tmp[0], &tmp[1]);
785 port = htons((tmp[0] << 8) | (tmp[1]));
786
787 switch (da->da_addr.ss_family) {
788 case AF_INET:
789 ((struct sockaddr_in *)&da->da_addr)->sin_port = port;
790 da->da_addrlen = sizeof(struct sockaddr_in);
791 match_netid = "tcp";
792 match_netid_len = 3;
793 break;
794
795 case AF_INET6:
796 ((struct sockaddr_in6 *)&da->da_addr)->sin6_port = port;
797 da->da_addrlen = sizeof(struct sockaddr_in6);
798 match_netid = "tcp6";
799 match_netid_len = 4;
800 startsep = "[";
801 endsep = "]";
802 break;
803
804 default:
805 dprintk("%s: unsupported address family: %u\n",
806 __func__, da->da_addr.ss_family);
807 goto out_free_da;
808 }
809
810 if (nlen != match_netid_len || strncmp(netid, match_netid, nlen)) {
811 dprintk("%s: ERROR: r_netid \"%s\" != \"%s\"\n",
812 __func__, netid, match_netid);
813 goto out_free_da;
814 }
815
816 /* save human readable address */
817 len = strlen(startsep) + strlen(buf) + strlen(endsep) + 7;
818 da->da_remotestr = kzalloc(len, gfp_flags);
819
820 /* NULL is ok, only used for dprintk */
821 if (da->da_remotestr)
822 snprintf(da->da_remotestr, len, "%s%s%s:%u", startsep,
823 buf, endsep, ntohs(port));
824
825 dprintk("%s: Parsed DS addr %s\n", __func__, da->da_remotestr);
826 kfree(buf);
827 kfree(netid);
828 return da;
829
830out_free_da:
831 kfree(da);
832out_free_buf:
833 dprintk("%s: Error parsing DS addr: %s\n", __func__, buf);
834 kfree(buf);
835out_free_netid:
836 kfree(netid);
837out_err:
838 return NULL;
839}
840EXPORT_SYMBOL_GPL(nfs4_decode_mp_ds_addr);
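Worked example of the r_addr handling above: an r_addr of "10.1.2.3.8.1" has its last dot rewritten to '-' and its second-to-last dot replaced by a NUL, so rpc_pton() parses "10.1.2.3" while sscanf() reads "8-1" and computes the port as (8 << 8) | 1 = 2049, the standard NFS port.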
841
842void
843pnfs_layout_mark_request_commit(struct nfs_page *req,
844 struct pnfs_layout_segment *lseg,
845 struct nfs_commit_info *cinfo,
846 u32 ds_commit_idx)
847{
848 struct list_head *list;
849 struct pnfs_commit_bucket *buckets;
850
851 spin_lock(cinfo->lock);
852 buckets = cinfo->ds->buckets;
853 list = &buckets[ds_commit_idx].written;
854 if (list_empty(list)) {
855 /* Non-empty buckets hold a reference on the lseg. That ref
856 * is normally transferred to the COMMIT call and released
857 * there. It could also be released if the last req is pulled
858 * off due to a rewrite, in which case it will be done in
 859	 * pnfs_generic_clear_request_commit
860 */
861 WARN_ON_ONCE(buckets[ds_commit_idx].wlseg != NULL);
862 buckets[ds_commit_idx].wlseg = pnfs_get_lseg(lseg);
863 }
864 set_bit(PG_COMMIT_TO_DS, &req->wb_flags);
865 cinfo->ds->nwritten++;
866 spin_unlock(cinfo->lock);
867
868 nfs_request_add_commit_list(req, list, cinfo);
869}
870EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit);
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index c91a4799c562..568ecf0a880f 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -70,8 +70,15 @@ EXPORT_SYMBOL_GPL(nfs_pageio_init_read);
70 70
71void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio) 71void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio)
72{ 72{
73 struct nfs_pgio_mirror *mirror;
74
73 pgio->pg_ops = &nfs_pgio_rw_ops; 75 pgio->pg_ops = &nfs_pgio_rw_ops;
74 pgio->pg_bsize = NFS_SERVER(pgio->pg_inode)->rsize; 76
77 /* read path should never have more than one mirror */
78 WARN_ON_ONCE(pgio->pg_mirror_count != 1);
79
80 mirror = &pgio->pg_mirrors[0];
81 mirror->pg_bsize = NFS_SERVER(pgio->pg_inode)->rsize;
75} 82}
76EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds); 83EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds);
77 84
@@ -81,6 +88,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
81 struct nfs_page *new; 88 struct nfs_page *new;
82 unsigned int len; 89 unsigned int len;
83 struct nfs_pageio_descriptor pgio; 90 struct nfs_pageio_descriptor pgio;
91 struct nfs_pgio_mirror *pgm;
84 92
85 len = nfs_page_length(page); 93 len = nfs_page_length(page);
86 if (len == 0) 94 if (len == 0)
@@ -97,7 +105,13 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
97 &nfs_async_read_completion_ops); 105 &nfs_async_read_completion_ops);
98 nfs_pageio_add_request(&pgio, new); 106 nfs_pageio_add_request(&pgio, new);
99 nfs_pageio_complete(&pgio); 107 nfs_pageio_complete(&pgio);
100 NFS_I(inode)->read_io += pgio.pg_bytes_written; 108
109 /* It doesn't make sense to do mirrored reads! */
110 WARN_ON_ONCE(pgio.pg_mirror_count != 1);
111
112 pgm = &pgio.pg_mirrors[0];
113 NFS_I(inode)->read_io += pgm->pg_bytes_written;
114
101 return 0; 115 return 0;
102} 116}
103 117
@@ -168,13 +182,14 @@ out:
168 182
169static void nfs_initiate_read(struct nfs_pgio_header *hdr, 183static void nfs_initiate_read(struct nfs_pgio_header *hdr,
170 struct rpc_message *msg, 184 struct rpc_message *msg,
185 const struct nfs_rpc_ops *rpc_ops,
171 struct rpc_task_setup *task_setup_data, int how) 186 struct rpc_task_setup *task_setup_data, int how)
172{ 187{
173 struct inode *inode = hdr->inode; 188 struct inode *inode = hdr->inode;
174 int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0; 189 int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0;
175 190
176 task_setup_data->flags |= swap_flags; 191 task_setup_data->flags |= swap_flags;
177 NFS_PROTO(inode)->read_setup(hdr, msg); 192 rpc_ops->read_setup(hdr, msg);
178} 193}
179 194
180static void 195static void
@@ -351,6 +366,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
351 struct list_head *pages, unsigned nr_pages) 366 struct list_head *pages, unsigned nr_pages)
352{ 367{
353 struct nfs_pageio_descriptor pgio; 368 struct nfs_pageio_descriptor pgio;
369 struct nfs_pgio_mirror *pgm;
354 struct nfs_readdesc desc = { 370 struct nfs_readdesc desc = {
355 .pgio = &pgio, 371 .pgio = &pgio,
356 }; 372 };
@@ -386,10 +402,15 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
386 &nfs_async_read_completion_ops); 402 &nfs_async_read_completion_ops);
387 403
388 ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc); 404 ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc);
389
390 nfs_pageio_complete(&pgio); 405 nfs_pageio_complete(&pgio);
391 NFS_I(inode)->read_io += pgio.pg_bytes_written; 406
392 npages = (pgio.pg_bytes_written + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 407 /* It doesn't make sense to do mirrored reads! */
408 WARN_ON_ONCE(pgio.pg_mirror_count != 1);
409
410 pgm = &pgio.pg_mirrors[0];
411 NFS_I(inode)->read_io += pgm->pg_bytes_written;
412 npages = (pgm->pg_bytes_written + PAGE_CACHE_SIZE - 1) >>
413 PAGE_CACHE_SHIFT;
393 nfs_add_stats(inode, NFSIOS_READPAGES, npages); 414 nfs_add_stats(inode, NFSIOS_READPAGES, npages);
394read_complete: 415read_complete:
395 put_nfs_open_context(desc.ctx); 416 put_nfs_open_context(desc.ctx);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 31a11b0e885d..322b2de02988 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -311,7 +311,6 @@ const struct super_operations nfs_sops = {
311 .destroy_inode = nfs_destroy_inode, 311 .destroy_inode = nfs_destroy_inode,
312 .write_inode = nfs_write_inode, 312 .write_inode = nfs_write_inode,
313 .drop_inode = nfs_drop_inode, 313 .drop_inode = nfs_drop_inode,
314 .put_super = nfs_put_super,
315 .statfs = nfs_statfs, 314 .statfs = nfs_statfs,
316 .evict_inode = nfs_evict_inode, 315 .evict_inode = nfs_evict_inode,
317 .umount_begin = nfs_umount_begin, 316 .umount_begin = nfs_umount_begin,
@@ -405,12 +404,15 @@ void __exit unregister_nfs_fs(void)
405 unregister_filesystem(&nfs_fs_type); 404 unregister_filesystem(&nfs_fs_type);
406} 405}
407 406
408void nfs_sb_active(struct super_block *sb) 407bool nfs_sb_active(struct super_block *sb)
409{ 408{
410 struct nfs_server *server = NFS_SB(sb); 409 struct nfs_server *server = NFS_SB(sb);
411 410
412 if (atomic_inc_return(&server->active) == 1) 411 if (!atomic_inc_not_zero(&sb->s_active))
413 atomic_inc(&sb->s_active); 412 return false;
413 if (atomic_inc_return(&server->active) != 1)
414 atomic_dec(&sb->s_active);
415 return true;
414} 416}
415EXPORT_SYMBOL_GPL(nfs_sb_active); 417EXPORT_SYMBOL_GPL(nfs_sb_active);
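The reworked nfs_sb_active() can now fail: atomic_inc_not_zero() refuses to pin a superblock whose s_active count has already dropped to zero during umount. Callers therefore must check the return value before touching the superblock; a sketch of the expected pattern, assuming the existing nfs_sb_deactive() counterpart:

	if (!nfs_sb_active(sb))
		return;			/* sb is already being torn down */
	/* ... the superblock is now safely pinned ... */
	nfs_sb_deactive(sb);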
416 418
@@ -2569,7 +2571,7 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server,
2569 error = nfs_bdi_register(server); 2571 error = nfs_bdi_register(server);
2570 if (error) { 2572 if (error) {
2571 mntroot = ERR_PTR(error); 2573 mntroot = ERR_PTR(error);
2572 goto error_splat_bdi; 2574 goto error_splat_super;
2573 } 2575 }
2574 server->super = s; 2576 server->super = s;
2575 } 2577 }
@@ -2601,9 +2603,6 @@ error_splat_root:
2601 dput(mntroot); 2603 dput(mntroot);
2602 mntroot = ERR_PTR(error); 2604 mntroot = ERR_PTR(error);
2603error_splat_super: 2605error_splat_super:
2604 if (server && !s->s_root)
2605 bdi_unregister(&server->backing_dev_info);
2606error_splat_bdi:
2607 deactivate_locked_super(s); 2606 deactivate_locked_super(s);
2608 goto out; 2607 goto out;
2609} 2608}
@@ -2651,27 +2650,19 @@ out:
2651EXPORT_SYMBOL_GPL(nfs_fs_mount); 2650EXPORT_SYMBOL_GPL(nfs_fs_mount);
2652 2651
2653/* 2652/*
2654 * Ensure that we unregister the bdi before kill_anon_super
2655 * releases the device name
2656 */
2657void nfs_put_super(struct super_block *s)
2658{
2659 struct nfs_server *server = NFS_SB(s);
2660
2661 bdi_unregister(&server->backing_dev_info);
2662}
2663EXPORT_SYMBOL_GPL(nfs_put_super);
2664
2665/*
2666 * Destroy an NFS2/3 superblock 2653 * Destroy an NFS2/3 superblock
2667 */ 2654 */
2668void nfs_kill_super(struct super_block *s) 2655void nfs_kill_super(struct super_block *s)
2669{ 2656{
2670 struct nfs_server *server = NFS_SB(s); 2657 struct nfs_server *server = NFS_SB(s);
2658 dev_t dev = s->s_dev;
2659
2660 generic_shutdown_super(s);
2671 2661
2672 kill_anon_super(s);
2673 nfs_fscache_release_super_cookie(s); 2662 nfs_fscache_release_super_cookie(s);
2663
2674 nfs_free_server(server); 2664 nfs_free_server(server);
2665 free_anon_bdev(dev);
2675} 2666}
2676EXPORT_SYMBOL_GPL(nfs_kill_super); 2667EXPORT_SYMBOL_GPL(nfs_kill_super);
2677 2668
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index af3af685a9e3..595d81e354d1 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -473,13 +473,18 @@ try_again:
473 do { 473 do {
474 /* 474 /*
475 * Subrequests are always contiguous, non overlapping 475 * Subrequests are always contiguous, non overlapping
476 * and in order. If not, it's a programming error. 476 * and in order - but may be repeated (mirrored writes).
477 */ 477 */
478 WARN_ON_ONCE(subreq->wb_offset != 478 if (subreq->wb_offset == (head->wb_offset + total_bytes)) {
479 (head->wb_offset + total_bytes)); 479 /* keep track of how many bytes this group covers */
480 480 total_bytes += subreq->wb_bytes;
481 /* keep track of how many bytes this group covers */ 481 } else if (WARN_ON_ONCE(subreq->wb_offset < head->wb_offset ||
482 total_bytes += subreq->wb_bytes; 482 ((subreq->wb_offset + subreq->wb_bytes) >
483 (head->wb_offset + total_bytes)))) {
484 nfs_page_group_unlock(head);
485 spin_unlock(&inode->i_lock);
486 return ERR_PTR(-EIO);
487 }
483 488
484 if (!nfs_lock_request(subreq)) { 489 if (!nfs_lock_request(subreq)) {
485 /* releases page group bit lock and 490 /* releases page group bit lock and
@@ -784,13 +789,8 @@ nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst,
784 nfs_list_add_request(req, dst); 789 nfs_list_add_request(req, dst);
785 cinfo->mds->ncommit++; 790 cinfo->mds->ncommit++;
786 spin_unlock(cinfo->lock); 791 spin_unlock(cinfo->lock);
787 if (!cinfo->dreq) { 792 if (!cinfo->dreq)
788 inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); 793 nfs_mark_page_unstable(req->wb_page);
789 inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info,
790 BDI_RECLAIMABLE);
791 __mark_inode_dirty(req->wb_context->dentry->d_inode,
792 I_DIRTY_DATASYNC);
793 }
794} 794}
795EXPORT_SYMBOL_GPL(nfs_request_add_commit_list); 795EXPORT_SYMBOL_GPL(nfs_request_add_commit_list);
796 796
@@ -842,9 +842,9 @@ EXPORT_SYMBOL_GPL(nfs_init_cinfo);
842 */ 842 */
843void 843void
844nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, 844nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
845 struct nfs_commit_info *cinfo) 845 struct nfs_commit_info *cinfo, u32 ds_commit_idx)
846{ 846{
847 if (pnfs_mark_request_commit(req, lseg, cinfo)) 847 if (pnfs_mark_request_commit(req, lseg, cinfo, ds_commit_idx))
848 return; 848 return;
849 nfs_request_add_commit_list(req, &cinfo->mds->list, cinfo); 849 nfs_request_add_commit_list(req, &cinfo->mds->list, cinfo);
850} 850}
@@ -853,7 +853,7 @@ static void
853nfs_clear_page_commit(struct page *page) 853nfs_clear_page_commit(struct page *page)
854{ 854{
855 dec_zone_page_state(page, NR_UNSTABLE_NFS); 855 dec_zone_page_state(page, NR_UNSTABLE_NFS);
856 dec_bdi_stat(page_file_mapping(page)->backing_dev_info, BDI_RECLAIMABLE); 856 dec_bdi_stat(inode_to_bdi(page_file_mapping(page)->host), BDI_RECLAIMABLE);
857} 857}
858 858
859/* Called holding inode (/cinfo) lock */ 859/* Called holding inode (/cinfo) lock */
@@ -900,7 +900,8 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr)
900 } 900 }
901 if (nfs_write_need_commit(hdr)) { 901 if (nfs_write_need_commit(hdr)) {
902 memcpy(&req->wb_verf, &hdr->verf.verifier, sizeof(req->wb_verf)); 902 memcpy(&req->wb_verf, &hdr->verf.verifier, sizeof(req->wb_verf));
903 nfs_mark_request_commit(req, hdr->lseg, &cinfo); 903 nfs_mark_request_commit(req, hdr->lseg, &cinfo,
904 hdr->pgio_mirror_idx);
904 goto next; 905 goto next;
905 } 906 }
906remove_req: 907remove_req:
@@ -1091,6 +1092,7 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
1091{ 1092{
1092 struct nfs_open_context *ctx = nfs_file_open_context(file); 1093 struct nfs_open_context *ctx = nfs_file_open_context(file);
1093 struct nfs_lock_context *l_ctx; 1094 struct nfs_lock_context *l_ctx;
1095 struct file_lock_context *flctx = file_inode(file)->i_flctx;
1094 struct nfs_page *req; 1096 struct nfs_page *req;
1095 int do_flush, status; 1097 int do_flush, status;
1096 /* 1098 /*
@@ -1109,7 +1111,9 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
1109 do_flush = req->wb_page != page || req->wb_context != ctx; 1111 do_flush = req->wb_page != page || req->wb_context != ctx;
1110 /* for now, flush if more than 1 request in page_group */ 1112 /* for now, flush if more than 1 request in page_group */
1111 do_flush |= req->wb_this_page != req; 1113 do_flush |= req->wb_this_page != req;
1112 if (l_ctx && ctx->dentry->d_inode->i_flock != NULL) { 1114 if (l_ctx && flctx &&
1115 !(list_empty_careful(&flctx->flc_posix) &&
1116 list_empty_careful(&flctx->flc_flock))) {
1113 do_flush |= l_ctx->lockowner.l_owner != current->files 1117 do_flush |= l_ctx->lockowner.l_owner != current->files
1114 || l_ctx->lockowner.l_pid != current->tgid; 1118 || l_ctx->lockowner.l_pid != current->tgid;
1115 } 1119 }
@@ -1170,6 +1174,13 @@ out:
1170 return PageUptodate(page) != 0; 1174 return PageUptodate(page) != 0;
1171} 1175}
1172 1176
1177static bool
1178is_whole_file_wrlock(struct file_lock *fl)
1179{
1180 return fl->fl_start == 0 && fl->fl_end == OFFSET_MAX &&
1181 fl->fl_type == F_WRLCK;
1182}
1183
1173/* If we know the page is up to date, and we're not using byte range locks (or 1184/* If we know the page is up to date, and we're not using byte range locks (or
1174 * if we have the whole file locked for writing), it may be more efficient to 1185 * if we have the whole file locked for writing), it may be more efficient to
1175 * extend the write to cover the entire page in order to avoid fragmentation 1186 * extend the write to cover the entire page in order to avoid fragmentation
@@ -1180,17 +1191,36 @@ out:
1180 */ 1191 */
1181static int nfs_can_extend_write(struct file *file, struct page *page, struct inode *inode) 1192static int nfs_can_extend_write(struct file *file, struct page *page, struct inode *inode)
1182{ 1193{
1194 int ret;
1195 struct file_lock_context *flctx = inode->i_flctx;
1196 struct file_lock *fl;
1197
1183 if (file->f_flags & O_DSYNC) 1198 if (file->f_flags & O_DSYNC)
1184 return 0; 1199 return 0;
1185 if (!nfs_write_pageuptodate(page, inode)) 1200 if (!nfs_write_pageuptodate(page, inode))
1186 return 0; 1201 return 0;
1187 if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) 1202 if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE))
1188 return 1; 1203 return 1;
1189 if (inode->i_flock == NULL || (inode->i_flock->fl_start == 0 && 1204 if (!flctx || (list_empty_careful(&flctx->flc_flock) &&
1190 inode->i_flock->fl_end == OFFSET_MAX && 1205 list_empty_careful(&flctx->flc_posix)))
1191 inode->i_flock->fl_type != F_RDLCK)) 1206 return 0;
1192 return 1; 1207
1193 return 0; 1208 /* Check to see if there are whole file write locks */
1209 ret = 0;
1210 spin_lock(&flctx->flc_lock);
1211 if (!list_empty(&flctx->flc_posix)) {
1212 fl = list_first_entry(&flctx->flc_posix, struct file_lock,
1213 fl_list);
1214 if (is_whole_file_wrlock(fl))
1215 ret = 1;
1216 } else if (!list_empty(&flctx->flc_flock)) {
1217 fl = list_first_entry(&flctx->flc_flock, struct file_lock,
1218 fl_list);
1219 if (fl->fl_type == F_WRLCK)
1220 ret = 1;
1221 }
1222 spin_unlock(&flctx->flc_lock);
1223 return ret;
1194} 1224}
1195 1225
1196/* 1226/*
@@ -1240,15 +1270,15 @@ static int flush_task_priority(int how)
1240 1270
1241static void nfs_initiate_write(struct nfs_pgio_header *hdr, 1271static void nfs_initiate_write(struct nfs_pgio_header *hdr,
1242 struct rpc_message *msg, 1272 struct rpc_message *msg,
1273 const struct nfs_rpc_ops *rpc_ops,
1243 struct rpc_task_setup *task_setup_data, int how) 1274 struct rpc_task_setup *task_setup_data, int how)
1244{ 1275{
1245 struct inode *inode = hdr->inode;
1246 int priority = flush_task_priority(how); 1276 int priority = flush_task_priority(how);
1247 1277
1248 task_setup_data->priority = priority; 1278 task_setup_data->priority = priority;
1249 NFS_PROTO(inode)->write_setup(hdr, msg); 1279 rpc_ops->write_setup(hdr, msg);
1250 1280
1251 nfs4_state_protect_write(NFS_SERVER(inode)->nfs_client, 1281 nfs4_state_protect_write(NFS_SERVER(hdr->inode)->nfs_client,
1252 &task_setup_data->rpc_client, msg, hdr); 1282 &task_setup_data->rpc_client, msg, hdr);
1253} 1283}
1254 1284
@@ -1298,8 +1328,14 @@ EXPORT_SYMBOL_GPL(nfs_pageio_init_write);
1298 1328
1299void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio) 1329void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio)
1300{ 1330{
1331 struct nfs_pgio_mirror *mirror;
1332
1301 pgio->pg_ops = &nfs_pgio_rw_ops; 1333 pgio->pg_ops = &nfs_pgio_rw_ops;
1302 pgio->pg_bsize = NFS_SERVER(pgio->pg_inode)->wsize; 1334
1335 nfs_pageio_stop_mirroring(pgio);
1336
1337 mirror = &pgio->pg_mirrors[0];
1338 mirror->pg_bsize = NFS_SERVER(pgio->pg_inode)->wsize;
1303} 1339}
1304EXPORT_SYMBOL_GPL(nfs_pageio_reset_write_mds); 1340EXPORT_SYMBOL_GPL(nfs_pageio_reset_write_mds);
1305 1341
@@ -1465,6 +1501,7 @@ void nfs_commitdata_release(struct nfs_commit_data *data)
1465EXPORT_SYMBOL_GPL(nfs_commitdata_release); 1501EXPORT_SYMBOL_GPL(nfs_commitdata_release);
1466 1502
1467int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data, 1503int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data,
1504 const struct nfs_rpc_ops *nfs_ops,
1468 const struct rpc_call_ops *call_ops, 1505 const struct rpc_call_ops *call_ops,
1469 int how, int flags) 1506 int how, int flags)
1470{ 1507{
@@ -1486,7 +1523,7 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data,
1486 .priority = priority, 1523 .priority = priority,
1487 }; 1524 };
1488 /* Set up the initial task struct. */ 1525 /* Set up the initial task struct. */
1489 NFS_PROTO(data->inode)->commit_setup(data, &msg); 1526 nfs_ops->commit_setup(data, &msg);
1490 1527
1491 dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); 1528 dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
1492 1529
@@ -1554,19 +1591,17 @@ EXPORT_SYMBOL_GPL(nfs_init_commit);
1554 1591
1555void nfs_retry_commit(struct list_head *page_list, 1592void nfs_retry_commit(struct list_head *page_list,
1556 struct pnfs_layout_segment *lseg, 1593 struct pnfs_layout_segment *lseg,
1557 struct nfs_commit_info *cinfo) 1594 struct nfs_commit_info *cinfo,
1595 u32 ds_commit_idx)
1558{ 1596{
1559 struct nfs_page *req; 1597 struct nfs_page *req;
1560 1598
1561 while (!list_empty(page_list)) { 1599 while (!list_empty(page_list)) {
1562 req = nfs_list_entry(page_list->next); 1600 req = nfs_list_entry(page_list->next);
1563 nfs_list_remove_request(req); 1601 nfs_list_remove_request(req);
1564 nfs_mark_request_commit(req, lseg, cinfo); 1602 nfs_mark_request_commit(req, lseg, cinfo, ds_commit_idx);
1565 if (!cinfo->dreq) { 1603 if (!cinfo->dreq)
1566 dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); 1604 nfs_clear_page_commit(req->wb_page);
1567 dec_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info,
1568 BDI_RECLAIMABLE);
1569 }
1570 nfs_unlock_and_release_request(req); 1605 nfs_unlock_and_release_request(req);
1571 } 1606 }
1572} 1607}
@@ -1589,10 +1624,10 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how,
1589 /* Set up the argument struct */ 1624 /* Set up the argument struct */
1590 nfs_init_commit(data, head, NULL, cinfo); 1625 nfs_init_commit(data, head, NULL, cinfo);
1591 atomic_inc(&cinfo->mds->rpcs_out); 1626 atomic_inc(&cinfo->mds->rpcs_out);
1592 return nfs_initiate_commit(NFS_CLIENT(inode), data, data->mds_ops, 1627 return nfs_initiate_commit(NFS_CLIENT(inode), data, NFS_PROTO(inode),
1593 how, 0); 1628 data->mds_ops, how, 0);
1594 out_bad: 1629 out_bad:
1595 nfs_retry_commit(head, NULL, cinfo); 1630 nfs_retry_commit(head, NULL, cinfo, 0);
1596 cinfo->completion_ops->error_cleanup(NFS_I(inode)); 1631 cinfo->completion_ops->error_cleanup(NFS_I(inode));
1597 return -ENOMEM; 1632 return -ENOMEM;
1598} 1633}
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 73395156bdb4..683bf718aead 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -82,6 +82,16 @@ config NFSD_V4
82 82
83 If unsure, say N. 83 If unsure, say N.
84 84
85config NFSD_PNFS
86 bool "NFSv4.1 server support for Parallel NFS (pNFS)"
87 depends on NFSD_V4
88 help
89 This option enables support for the parallel NFS features of the
90 minor version 1 of the NFSv4 protocol (RFC5661) in the kernel's NFS
91 server.
92
93 If unsure, say N.
94
85config NFSD_V4_SECURITY_LABEL 95config NFSD_V4_SECURITY_LABEL
86 bool "Provide Security Label support for NFSv4 server" 96 bool "Provide Security Label support for NFSv4 server"
87 depends on NFSD_V4 && SECURITY 97 depends on NFSD_V4 && SECURITY
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
index af32ef06b4fe..9a6028e120c6 100644
--- a/fs/nfsd/Makefile
+++ b/fs/nfsd/Makefile
@@ -2,9 +2,14 @@
2# Makefile for the Linux nfs server 2# Makefile for the Linux nfs server
3# 3#
4 4
5ccflags-y += -I$(src) # needed for trace events
6
5obj-$(CONFIG_NFSD) += nfsd.o 7obj-$(CONFIG_NFSD) += nfsd.o
6 8
7nfsd-y := nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \ 9# this one should be compiled first, as the tracing macros can easily blow up
10nfsd-y += trace.o
11
12nfsd-y += nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \
8 export.o auth.o lockd.o nfscache.o nfsxdr.o stats.o 13 export.o auth.o lockd.o nfscache.o nfsxdr.o stats.o
9nfsd-$(CONFIG_NFSD_FAULT_INJECTION) += fault_inject.o 14nfsd-$(CONFIG_NFSD_FAULT_INJECTION) += fault_inject.o
10nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o 15nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o
@@ -12,3 +17,4 @@ nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o
12nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o 17nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o
13nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \ 18nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \
14 nfs4acl.o nfs4callback.o nfs4recover.o 19 nfs4acl.o nfs4callback.o nfs4recover.o
20nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o blocklayout.o blocklayoutxdr.o
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
new file mode 100644
index 000000000000..cdbc78c72542
--- /dev/null
+++ b/fs/nfsd/blocklayout.c
@@ -0,0 +1,189 @@
1/*
2 * Copyright (c) 2014 Christoph Hellwig.
3 */
4#include <linux/exportfs.h>
5#include <linux/genhd.h>
6#include <linux/slab.h>
7
8#include <linux/nfsd/debug.h>
9
10#include "blocklayoutxdr.h"
11#include "pnfs.h"
12
13#define NFSDDBG_FACILITY NFSDDBG_PNFS
14
15
16static int
17nfsd4_block_get_device_info_simple(struct super_block *sb,
18 struct nfsd4_getdeviceinfo *gdp)
19{
20 struct pnfs_block_deviceaddr *dev;
21 struct pnfs_block_volume *b;
22
23 dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) +
24 sizeof(struct pnfs_block_volume), GFP_KERNEL);
25 if (!dev)
26 return -ENOMEM;
27 gdp->gd_device = dev;
28
29 dev->nr_volumes = 1;
30 b = &dev->volumes[0];
31
32 b->type = PNFS_BLOCK_VOLUME_SIMPLE;
33 b->simple.sig_len = PNFS_BLOCK_UUID_LEN;
34 return sb->s_export_op->get_uuid(sb, b->simple.sig, &b->simple.sig_len,
35 &b->simple.offset);
36}
37
38static __be32
39nfsd4_block_proc_getdeviceinfo(struct super_block *sb,
40 struct nfsd4_getdeviceinfo *gdp)
41{
42 if (sb->s_bdev != sb->s_bdev->bd_contains)
43 return nfserr_inval;
44 return nfserrno(nfsd4_block_get_device_info_simple(sb, gdp));
45}
46
47static __be32
48nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp,
49 struct nfsd4_layoutget *args)
50{
51 struct nfsd4_layout_seg *seg = &args->lg_seg;
52 struct super_block *sb = inode->i_sb;
53 u32 block_size = (1 << inode->i_blkbits);
54 struct pnfs_block_extent *bex;
55 struct iomap iomap;
56 u32 device_generation = 0;
57 int error;
58
59 /*
60 * We do not attempt to support I/O smaller than the fs block size,
61 * or not aligned to it.
62 */
63 if (args->lg_minlength < block_size) {
64 dprintk("pnfsd: I/O too small\n");
65 goto out_layoutunavailable;
66 }
67 if (seg->offset & (block_size - 1)) {
68 dprintk("pnfsd: I/O misaligned\n");
69 goto out_layoutunavailable;
70 }
71
72 /*
73 * Some clients barf on non-zero block numbers for NONE or INVALID
74 * layouts, so make sure to zero the whole structure.
75 */
76 error = -ENOMEM;
77 bex = kzalloc(sizeof(*bex), GFP_KERNEL);
78 if (!bex)
79 goto out_error;
80 args->lg_content = bex;
81
82 error = sb->s_export_op->map_blocks(inode, seg->offset, seg->length,
83 &iomap, seg->iomode != IOMODE_READ,
84 &device_generation);
85 if (error) {
86 if (error == -ENXIO)
87 goto out_layoutunavailable;
88 goto out_error;
89 }
90
91 if (iomap.length < args->lg_minlength) {
92 dprintk("pnfsd: extent smaller than minlength\n");
93 goto out_layoutunavailable;
94 }
95
96 switch (iomap.type) {
97 case IOMAP_MAPPED:
98 if (seg->iomode == IOMODE_READ)
99 bex->es = PNFS_BLOCK_READ_DATA;
100 else
101 bex->es = PNFS_BLOCK_READWRITE_DATA;
102 bex->soff = (iomap.blkno << 9);
103 break;
104 case IOMAP_UNWRITTEN:
105 if (seg->iomode & IOMODE_RW) {
106 /*
 107 * Crack monkey special case from RFC 5663 section 2.3.1.
108 */
109 if (args->lg_minlength == 0) {
110 dprintk("pnfsd: no soup for you!\n");
111 goto out_layoutunavailable;
112 }
113
114 bex->es = PNFS_BLOCK_INVALID_DATA;
115 bex->soff = (iomap.blkno << 9);
116 break;
117 }
118 /*FALLTHRU*/
119 case IOMAP_HOLE:
120 if (seg->iomode == IOMODE_READ) {
121 bex->es = PNFS_BLOCK_NONE_DATA;
122 break;
123 }
124 /*FALLTHRU*/
125 case IOMAP_DELALLOC:
126 default:
127 WARN(1, "pnfsd: filesystem returned %d extent\n", iomap.type);
128 goto out_layoutunavailable;
129 }
130
131 error = nfsd4_set_deviceid(&bex->vol_id, fhp, device_generation);
132 if (error)
133 goto out_error;
134 bex->foff = iomap.offset;
135 bex->len = iomap.length;
136
137 seg->offset = iomap.offset;
138 seg->length = iomap.length;
139
140 dprintk("GET: %lld:%lld %d\n", bex->foff, bex->len, bex->es);
141 return 0;
142
143out_error:
144 seg->length = 0;
145 return nfserrno(error);
146out_layoutunavailable:
147 seg->length = 0;
148 return nfserr_layoutunavailable;
149}
150
151static __be32
152nfsd4_block_proc_layoutcommit(struct inode *inode,
153 struct nfsd4_layoutcommit *lcp)
154{
155 loff_t new_size = lcp->lc_last_wr + 1;
156 struct iattr iattr = { .ia_valid = 0 };
157 struct iomap *iomaps;
158 int nr_iomaps;
159 int error;
160
161 nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout,
162 lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits);
163 if (nr_iomaps < 0)
164 return nfserrno(nr_iomaps);
165
166 if (lcp->lc_mtime.tv_nsec == UTIME_NOW ||
167 timespec_compare(&lcp->lc_mtime, &inode->i_mtime) < 0)
168 lcp->lc_mtime = current_fs_time(inode->i_sb);
169 iattr.ia_valid |= ATTR_ATIME | ATTR_CTIME | ATTR_MTIME;
170 iattr.ia_atime = iattr.ia_ctime = iattr.ia_mtime = lcp->lc_mtime;
171
172 if (new_size > i_size_read(inode)) {
173 iattr.ia_valid |= ATTR_SIZE;
174 iattr.ia_size = new_size;
175 }
176
177 error = inode->i_sb->s_export_op->commit_blocks(inode, iomaps,
178 nr_iomaps, &iattr);
179 kfree(iomaps);
180 return nfserrno(error);
181}
182
183const struct nfsd4_layout_ops bl_layout_ops = {
184 .proc_getdeviceinfo = nfsd4_block_proc_getdeviceinfo,
185 .encode_getdeviceinfo = nfsd4_block_encode_getdeviceinfo,
186 .proc_layoutget = nfsd4_block_proc_layoutget,
187 .encode_layoutget = nfsd4_block_encode_layoutget,
188 .proc_layoutcommit = nfsd4_block_proc_layoutcommit,
189};
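
The three export_operations methods called above (get_uuid, map_blocks and
commit_blocks) are exactly what nfsd4_setup_layout_type() checks for when
deciding whether an export can offer the block layout type. A minimal sketch
of what an exporting filesystem would need to provide; the foofs_* names are
hypothetical and the prototypes are inferred from the call sites in this file:

    /* sketch only: assumes the pNFS export hooks in <linux/exportfs.h> */
    static int foofs_get_uuid(struct super_block *sb, u8 *buf, u32 *len,
    		u64 *offset)
    {
    	/* copy a stable volume signature into buf, report its length
    	 * in *len and the byte offset where clients can find it */
    	return 0;
    }

    static int foofs_map_blocks(struct inode *inode, loff_t offset,
    		u64 length, struct iomap *iomap, bool write,
    		u32 *device_generation)
    {
    	/* fill *iomap with a mapped (or, for writes, allocated but
    	 * unwritten) extent covering offset, bumping
    	 * *device_generation whenever the device mapping changes */
    	return 0;
    }

    static int foofs_commit_blocks(struct inode *inode, struct iomap *iomaps,
    		int nr_iomaps, struct iattr *iattr)
    {
    	/* mark the client-written unwritten extents as written and
    	 * apply the size/timestamp updates from *iattr */
    	return 0;
    }

    static const struct export_operations foofs_export_ops = {
    	/* ... the usual fh_to_dentry etc. ... */
    	.get_uuid	= foofs_get_uuid,
    	.map_blocks	= foofs_map_blocks,
    	.commit_blocks	= foofs_commit_blocks,
    };
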
diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
new file mode 100644
index 000000000000..9da89fddab33
--- /dev/null
+++ b/fs/nfsd/blocklayoutxdr.c
@@ -0,0 +1,157 @@
1/*
2 * Copyright (c) 2014 Christoph Hellwig.
3 */
4#include <linux/sunrpc/svc.h>
5#include <linux/exportfs.h>
6#include <linux/nfs4.h>
7
8#include "nfsd.h"
9#include "blocklayoutxdr.h"
10
11#define NFSDDBG_FACILITY NFSDDBG_PNFS
12
13
14__be32
15nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
16 struct nfsd4_layoutget *lgp)
17{
18 struct pnfs_block_extent *b = lgp->lg_content;
19 int len = sizeof(__be32) + 5 * sizeof(__be64) + sizeof(__be32);
20 __be32 *p;
21
22 p = xdr_reserve_space(xdr, sizeof(__be32) + len);
23 if (!p)
24 return nfserr_toosmall;
25
26 *p++ = cpu_to_be32(len);
27 *p++ = cpu_to_be32(1); /* we always return a single extent */
28
29 p = xdr_encode_opaque_fixed(p, &b->vol_id,
30 sizeof(struct nfsd4_deviceid));
31 p = xdr_encode_hyper(p, b->foff);
32 p = xdr_encode_hyper(p, b->len);
33 p = xdr_encode_hyper(p, b->soff);
34 *p++ = cpu_to_be32(b->es);
35 return 0;
36}
37
38static int
39nfsd4_block_encode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
40{
41 __be32 *p;
42 int len;
43
44 switch (b->type) {
45 case PNFS_BLOCK_VOLUME_SIMPLE:
46 len = 4 + 4 + 8 + 4 + b->simple.sig_len;
47 p = xdr_reserve_space(xdr, len);
48 if (!p)
49 return -ETOOSMALL;
50
51 *p++ = cpu_to_be32(b->type);
52 *p++ = cpu_to_be32(1); /* single signature */
53 p = xdr_encode_hyper(p, b->simple.offset);
54 p = xdr_encode_opaque(p, b->simple.sig, b->simple.sig_len);
55 break;
56 default:
57 return -ENOTSUPP;
58 }
59
60 return len;
61}
62
63__be32
64nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr,
65 struct nfsd4_getdeviceinfo *gdp)
66{
67 struct pnfs_block_deviceaddr *dev = gdp->gd_device;
68 int len = sizeof(__be32), ret, i;
69 __be32 *p;
70
71 p = xdr_reserve_space(xdr, len + sizeof(__be32));
72 if (!p)
73 return nfserr_resource;
74
75 for (i = 0; i < dev->nr_volumes; i++) {
76 ret = nfsd4_block_encode_volume(xdr, &dev->volumes[i]);
77 if (ret < 0)
78 return nfserrno(ret);
79 len += ret;
80 }
81
82 /*
83 * Fill in the overall length and number of volumes at the beginning
 84 * of the device address.
85 */
86 *p++ = cpu_to_be32(len);
87 *p++ = cpu_to_be32(dev->nr_volumes);
88 return 0;
89}
90
91int
92nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
93 u32 block_size)
94{
95 struct iomap *iomaps;
96 u32 nr_iomaps, expected, i;
97
98 if (len < sizeof(u32)) {
99 dprintk("%s: extent array too small: %u\n", __func__, len);
100 return -EINVAL;
101 }
102
103 nr_iomaps = be32_to_cpup(p++);
104 expected = sizeof(__be32) + nr_iomaps * NFS4_BLOCK_EXTENT_SIZE;
105 if (len != expected) {
106 dprintk("%s: extent array size mismatch: %u/%u\n",
107 __func__, len, expected);
108 return -EINVAL;
109 }
110
111 iomaps = kcalloc(nr_iomaps, sizeof(*iomaps), GFP_KERNEL);
112 if (!iomaps) {
113 dprintk("%s: failed to allocate extent array\n", __func__);
114 return -ENOMEM;
115 }
116
117 for (i = 0; i < nr_iomaps; i++) {
118 struct pnfs_block_extent bex;
119
120 memcpy(&bex.vol_id, p, sizeof(struct nfsd4_deviceid));
121 p += XDR_QUADLEN(sizeof(struct nfsd4_deviceid));
122
123 p = xdr_decode_hyper(p, &bex.foff);
124 if (bex.foff & (block_size - 1)) {
125 dprintk("%s: unaligned offset %lld\n",
126 __func__, bex.foff);
127 goto fail;
128 }
129 p = xdr_decode_hyper(p, &bex.len);
130 if (bex.len & (block_size - 1)) {
131 dprintk("%s: unaligned length %lld\n",
 132 __func__, bex.len);
133 goto fail;
134 }
135 p = xdr_decode_hyper(p, &bex.soff);
136 if (bex.soff & (block_size - 1)) {
137 dprintk("%s: unaligned disk offset %lld\n",
138 __func__, bex.soff);
139 goto fail;
140 }
141 bex.es = be32_to_cpup(p++);
142 if (bex.es != PNFS_BLOCK_READWRITE_DATA) {
143 dprintk("%s: incorrect extent state %d\n",
144 __func__, bex.es);
145 goto fail;
146 }
147
148 iomaps[i].offset = bex.foff;
149 iomaps[i].length = bex.len;
150 }
151
152 *iomapp = iomaps;
153 return nr_iomaps;
154fail:
155 kfree(iomaps);
156 return -EINVAL;
157}
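
The NFS4_BLOCK_EXTENT_SIZE constant checked against above follows directly
from the wire format the decode loop consumes (the deviceid layout, a u64
fsid index plus u32 generation and pad, is visible in nfs4layouts.c below):

    deviceid (u64 fsid_idx + u32 generation + u32 pad)   16 bytes
    foff, len, soff (three XDR hypers)                   24 bytes
    es (extent state word)                                4 bytes
                                                  total   44 bytes

so a well-formed LAYOUTCOMMIT update body is exactly the 4-byte extent count
plus nr_iomaps * 44 bytes, which is the "expected" value computed before the
loop.
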
diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h
new file mode 100644
index 000000000000..fdc79037c0e7
--- /dev/null
+++ b/fs/nfsd/blocklayoutxdr.h
@@ -0,0 +1,62 @@
1#ifndef _NFSD_BLOCKLAYOUTXDR_H
2#define _NFSD_BLOCKLAYOUTXDR_H 1
3
4#include <linux/blkdev.h>
5#include "xdr4.h"
6
7struct iomap;
8struct xdr_stream;
9
10enum pnfs_block_extent_state {
11 PNFS_BLOCK_READWRITE_DATA = 0,
12 PNFS_BLOCK_READ_DATA = 1,
13 PNFS_BLOCK_INVALID_DATA = 2,
14 PNFS_BLOCK_NONE_DATA = 3,
15};
16
17struct pnfs_block_extent {
18 struct nfsd4_deviceid vol_id;
19 u64 foff;
20 u64 len;
21 u64 soff;
22 enum pnfs_block_extent_state es;
23};
24#define NFS4_BLOCK_EXTENT_SIZE 44
25
26enum pnfs_block_volume_type {
27 PNFS_BLOCK_VOLUME_SIMPLE = 0,
28 PNFS_BLOCK_VOLUME_SLICE = 1,
29 PNFS_BLOCK_VOLUME_CONCAT = 2,
30 PNFS_BLOCK_VOLUME_STRIPE = 3,
31};
32
33/*
 34 * Arbitrary upper cap for the uuid length to avoid unbounded
 35 * allocation; the protocol itself imposes no limit.
36 */
37#define PNFS_BLOCK_UUID_LEN 128
38
39struct pnfs_block_volume {
40 enum pnfs_block_volume_type type;
41 union {
42 struct {
43 u64 offset;
44 u32 sig_len;
45 u8 sig[PNFS_BLOCK_UUID_LEN];
46 } simple;
47 };
48};
49
50struct pnfs_block_deviceaddr {
51 u32 nr_volumes;
52 struct pnfs_block_volume volumes[];
53};
54
55__be32 nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr,
56 struct nfsd4_getdeviceinfo *gdp);
57__be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
58 struct nfsd4_layoutget *lgp);
59int nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
60 u32 block_size);
61
62#endif /* _NFSD_BLOCKLAYOUTXDR_H */
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 30a739d896ff..c3e3b6e55ae2 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -20,6 +20,7 @@
20#include "nfsd.h" 20#include "nfsd.h"
21#include "nfsfh.h" 21#include "nfsfh.h"
22#include "netns.h" 22#include "netns.h"
23#include "pnfs.h"
23 24
24#define NFSDDBG_FACILITY NFSDDBG_EXPORT 25#define NFSDDBG_FACILITY NFSDDBG_EXPORT
25 26
@@ -545,6 +546,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
545 546
546 exp.ex_client = dom; 547 exp.ex_client = dom;
547 exp.cd = cd; 548 exp.cd = cd;
549 exp.ex_devid_map = NULL;
548 550
549 /* expiry */ 551 /* expiry */
550 err = -EINVAL; 552 err = -EINVAL;
@@ -621,6 +623,8 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
621 if (!gid_valid(exp.ex_anon_gid)) 623 if (!gid_valid(exp.ex_anon_gid))
622 goto out4; 624 goto out4;
623 err = 0; 625 err = 0;
626
627 nfsd4_setup_layout_type(&exp);
624 } 628 }
625 629
626 expp = svc_export_lookup(&exp); 630 expp = svc_export_lookup(&exp);
@@ -703,6 +707,7 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem)
703 new->ex_fslocs.locations = NULL; 707 new->ex_fslocs.locations = NULL;
704 new->ex_fslocs.locations_count = 0; 708 new->ex_fslocs.locations_count = 0;
705 new->ex_fslocs.migrated = 0; 709 new->ex_fslocs.migrated = 0;
710 new->ex_layout_type = 0;
706 new->ex_uuid = NULL; 711 new->ex_uuid = NULL;
707 new->cd = item->cd; 712 new->cd = item->cd;
708} 713}
@@ -717,6 +722,8 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem)
717 new->ex_anon_uid = item->ex_anon_uid; 722 new->ex_anon_uid = item->ex_anon_uid;
718 new->ex_anon_gid = item->ex_anon_gid; 723 new->ex_anon_gid = item->ex_anon_gid;
719 new->ex_fsid = item->ex_fsid; 724 new->ex_fsid = item->ex_fsid;
725 new->ex_devid_map = item->ex_devid_map;
726 item->ex_devid_map = NULL;
720 new->ex_uuid = item->ex_uuid; 727 new->ex_uuid = item->ex_uuid;
721 item->ex_uuid = NULL; 728 item->ex_uuid = NULL;
722 new->ex_fslocs.locations = item->ex_fslocs.locations; 729 new->ex_fslocs.locations = item->ex_fslocs.locations;
@@ -725,6 +732,7 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem)
725 item->ex_fslocs.locations_count = 0; 732 item->ex_fslocs.locations_count = 0;
726 new->ex_fslocs.migrated = item->ex_fslocs.migrated; 733 new->ex_fslocs.migrated = item->ex_fslocs.migrated;
727 item->ex_fslocs.migrated = 0; 734 item->ex_fslocs.migrated = 0;
735 new->ex_layout_type = item->ex_layout_type;
728 new->ex_nflavors = item->ex_nflavors; 736 new->ex_nflavors = item->ex_nflavors;
729 for (i = 0; i < MAX_SECINFO_LIST; i++) { 737 for (i = 0; i < MAX_SECINFO_LIST; i++) {
730 new->ex_flavors[i] = item->ex_flavors[i]; 738 new->ex_flavors[i] = item->ex_flavors[i];
diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h
index 04dc8c167b0c..1f52bfcc436f 100644
--- a/fs/nfsd/export.h
+++ b/fs/nfsd/export.h
@@ -56,6 +56,8 @@ struct svc_export {
56 struct nfsd4_fs_locations ex_fslocs; 56 struct nfsd4_fs_locations ex_fslocs;
57 uint32_t ex_nflavors; 57 uint32_t ex_nflavors;
58 struct exp_flavor_info ex_flavors[MAX_SECINFO_LIST]; 58 struct exp_flavor_info ex_flavors[MAX_SECINFO_LIST];
59 enum pnfs_layouttype ex_layout_type;
60 struct nfsd4_deviceid_map *ex_devid_map;
59 struct cache_detail *cd; 61 struct cache_detail *cd;
60}; 62};
61 63
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 7cbdf1b2e4ab..58277859a467 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -546,6 +546,102 @@ out:
546 return status; 546 return status;
547} 547}
548 548
549#ifdef CONFIG_NFSD_PNFS
550/*
551 * CB_LAYOUTRECALL4args
552 *
553 * struct layoutrecall_file4 {
554 * nfs_fh4 lor_fh;
555 * offset4 lor_offset;
556 * length4 lor_length;
557 * stateid4 lor_stateid;
558 * };
559 *
560 * union layoutrecall4 switch(layoutrecall_type4 lor_recalltype) {
561 * case LAYOUTRECALL4_FILE:
562 * layoutrecall_file4 lor_layout;
563 * case LAYOUTRECALL4_FSID:
564 * fsid4 lor_fsid;
565 * case LAYOUTRECALL4_ALL:
566 * void;
567 * };
568 *
569 * struct CB_LAYOUTRECALL4args {
570 * layouttype4 clora_type;
571 * layoutiomode4 clora_iomode;
572 * bool clora_changed;
573 * layoutrecall4 clora_recall;
574 * };
575 */
576static void encode_cb_layout4args(struct xdr_stream *xdr,
577 const struct nfs4_layout_stateid *ls,
578 struct nfs4_cb_compound_hdr *hdr)
579{
580 __be32 *p;
581
582 BUG_ON(hdr->minorversion == 0);
583
584 p = xdr_reserve_space(xdr, 5 * 4);
585 *p++ = cpu_to_be32(OP_CB_LAYOUTRECALL);
586 *p++ = cpu_to_be32(ls->ls_layout_type);
587 *p++ = cpu_to_be32(IOMODE_ANY);
588 *p++ = cpu_to_be32(1);
589 *p = cpu_to_be32(RETURN_FILE);
590
591 encode_nfs_fh4(xdr, &ls->ls_stid.sc_file->fi_fhandle);
592
593 p = xdr_reserve_space(xdr, 2 * 8);
594 p = xdr_encode_hyper(p, 0);
595 xdr_encode_hyper(p, NFS4_MAX_UINT64);
596
597 encode_stateid4(xdr, &ls->ls_recall_sid);
598
599 hdr->nops++;
600}
601
602static void nfs4_xdr_enc_cb_layout(struct rpc_rqst *req,
603 struct xdr_stream *xdr,
604 const struct nfsd4_callback *cb)
605{
606 const struct nfs4_layout_stateid *ls =
607 container_of(cb, struct nfs4_layout_stateid, ls_recall);
608 struct nfs4_cb_compound_hdr hdr = {
609 .ident = 0,
610 .minorversion = cb->cb_minorversion,
611 };
612
613 encode_cb_compound4args(xdr, &hdr);
614 encode_cb_sequence4args(xdr, cb, &hdr);
615 encode_cb_layout4args(xdr, ls, &hdr);
616 encode_cb_nops(&hdr);
617}
618
619static int nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp,
620 struct xdr_stream *xdr,
621 struct nfsd4_callback *cb)
622{
623 struct nfs4_cb_compound_hdr hdr;
624 enum nfsstat4 nfserr;
625 int status;
626
627 status = decode_cb_compound4res(xdr, &hdr);
628 if (unlikely(status))
629 goto out;
630 if (cb) {
631 status = decode_cb_sequence4res(xdr, cb);
632 if (unlikely(status))
633 goto out;
634 }
635 status = decode_cb_op_status(xdr, OP_CB_LAYOUTRECALL, &nfserr);
636 if (unlikely(status))
637 goto out;
638 if (unlikely(nfserr != NFS4_OK))
639 status = nfs_cb_stat_to_errno(nfserr);
640out:
641 return status;
642}
643#endif /* CONFIG_NFSD_PNFS */
644
549/* 645/*
550 * RPC procedure tables 646 * RPC procedure tables
551 */ 647 */
@@ -563,6 +659,9 @@ out:
563static struct rpc_procinfo nfs4_cb_procedures[] = { 659static struct rpc_procinfo nfs4_cb_procedures[] = {
564 PROC(CB_NULL, NULL, cb_null, cb_null), 660 PROC(CB_NULL, NULL, cb_null, cb_null),
565 PROC(CB_RECALL, COMPOUND, cb_recall, cb_recall), 661 PROC(CB_RECALL, COMPOUND, cb_recall, cb_recall),
662#ifdef CONFIG_NFSD_PNFS
663 PROC(CB_LAYOUT, COMPOUND, cb_layout, cb_layout),
664#endif
566}; 665};
567 666
568static struct rpc_version nfs_cb_version4 = { 667static struct rpc_version nfs_cb_version4 = {
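
Putting the reservations in encode_cb_layout4args() together, the
CB_LAYOUTRECALL operation body it emits looks like this (sizes in bytes;
a stateid4 is a 4-byte seqid plus a 12-byte opaque):

    OP_CB_LAYOUTRECALL    4    opcode
    clora_type            4    ls->ls_layout_type
    clora_iomode          4    always IOMODE_ANY
    clora_changed         4    always 1
    lor_recalltype        4    always RETURN_FILE
    lor_fh                4 + padded filehandle length
    lor_offset            8    always 0
    lor_length            8    always NFS4_MAX_UINT64
    lor_stateid          16    ls->ls_recall_sid

In other words, the server always recalls the entire layout range of a
single file rather than individual segments.
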
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
new file mode 100644
index 000000000000..3c1bfa155571
--- /dev/null
+++ b/fs/nfsd/nfs4layouts.c
@@ -0,0 +1,721 @@
1/*
2 * Copyright (c) 2014 Christoph Hellwig.
3 */
4#include <linux/kmod.h>
5#include <linux/file.h>
6#include <linux/jhash.h>
7#include <linux/sched.h>
8#include <linux/sunrpc/addr.h>
9
10#include "pnfs.h"
11#include "netns.h"
12#include "trace.h"
13
14#define NFSDDBG_FACILITY NFSDDBG_PNFS
15
16struct nfs4_layout {
17 struct list_head lo_perstate;
18 struct nfs4_layout_stateid *lo_state;
19 struct nfsd4_layout_seg lo_seg;
20};
21
22static struct kmem_cache *nfs4_layout_cache;
23static struct kmem_cache *nfs4_layout_stateid_cache;
24
25static struct nfsd4_callback_ops nfsd4_cb_layout_ops;
26static const struct lock_manager_operations nfsd4_layouts_lm_ops;
27
28const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] = {
29 [LAYOUT_BLOCK_VOLUME] = &bl_layout_ops,
30};
31
32/* pNFS device ID to export fsid mapping */
33#define DEVID_HASH_BITS 8
34#define DEVID_HASH_SIZE (1 << DEVID_HASH_BITS)
35#define DEVID_HASH_MASK (DEVID_HASH_SIZE - 1)
36static u64 nfsd_devid_seq = 1;
37static struct list_head nfsd_devid_hash[DEVID_HASH_SIZE];
38static DEFINE_SPINLOCK(nfsd_devid_lock);
39
40static inline u32 devid_hashfn(u64 idx)
41{
42 return jhash_2words(idx, idx >> 32, 0) & DEVID_HASH_MASK;
43}
44
45static void
46nfsd4_alloc_devid_map(const struct svc_fh *fhp)
47{
48 const struct knfsd_fh *fh = &fhp->fh_handle;
49 size_t fsid_len = key_len(fh->fh_fsid_type);
50 struct nfsd4_deviceid_map *map, *old;
51 int i;
52
53 map = kzalloc(sizeof(*map) + fsid_len, GFP_KERNEL);
54 if (!map)
55 return;
56
57 map->fsid_type = fh->fh_fsid_type;
58 memcpy(&map->fsid, fh->fh_fsid, fsid_len);
59
60 spin_lock(&nfsd_devid_lock);
61 if (fhp->fh_export->ex_devid_map)
62 goto out_unlock;
63
64 for (i = 0; i < DEVID_HASH_SIZE; i++) {
65 list_for_each_entry(old, &nfsd_devid_hash[i], hash) {
66 if (old->fsid_type != fh->fh_fsid_type)
67 continue;
68 if (memcmp(old->fsid, fh->fh_fsid,
69 key_len(old->fsid_type)))
70 continue;
71
72 fhp->fh_export->ex_devid_map = old;
73 goto out_unlock;
74 }
75 }
76
77 map->idx = nfsd_devid_seq++;
78 list_add_tail_rcu(&map->hash, &nfsd_devid_hash[devid_hashfn(map->idx)]);
79 fhp->fh_export->ex_devid_map = map;
80 map = NULL;
81
82out_unlock:
83 spin_unlock(&nfsd_devid_lock);
84 kfree(map);
85}
86
87struct nfsd4_deviceid_map *
88nfsd4_find_devid_map(int idx)
89{
90 struct nfsd4_deviceid_map *map, *ret = NULL;
91
92 rcu_read_lock();
93 list_for_each_entry_rcu(map, &nfsd_devid_hash[devid_hashfn(idx)], hash)
94 if (map->idx == idx)
95 ret = map;
96 rcu_read_unlock();
97
98 return ret;
99}
100
101int
102nfsd4_set_deviceid(struct nfsd4_deviceid *id, const struct svc_fh *fhp,
103 u32 device_generation)
104{
105 if (!fhp->fh_export->ex_devid_map) {
106 nfsd4_alloc_devid_map(fhp);
107 if (!fhp->fh_export->ex_devid_map)
108 return -ENOMEM;
109 }
110
111 id->fsid_idx = fhp->fh_export->ex_devid_map->idx;
112 id->generation = device_generation;
113 id->pad = 0;
114 return 0;
115}
116
117void nfsd4_setup_layout_type(struct svc_export *exp)
118{
119 struct super_block *sb = exp->ex_path.mnt->mnt_sb;
120
121 if (exp->ex_flags & NFSEXP_NOPNFS)
122 return;
123
124 if (sb->s_export_op->get_uuid &&
125 sb->s_export_op->map_blocks &&
126 sb->s_export_op->commit_blocks)
127 exp->ex_layout_type = LAYOUT_BLOCK_VOLUME;
128}
129
130static void
131nfsd4_free_layout_stateid(struct nfs4_stid *stid)
132{
133 struct nfs4_layout_stateid *ls = layoutstateid(stid);
134 struct nfs4_client *clp = ls->ls_stid.sc_client;
135 struct nfs4_file *fp = ls->ls_stid.sc_file;
136
137 trace_layoutstate_free(&ls->ls_stid.sc_stateid);
138
139 spin_lock(&clp->cl_lock);
140 list_del_init(&ls->ls_perclnt);
141 spin_unlock(&clp->cl_lock);
142
143 spin_lock(&fp->fi_lock);
144 list_del_init(&ls->ls_perfile);
145 spin_unlock(&fp->fi_lock);
146
147 vfs_setlease(ls->ls_file, F_UNLCK, NULL, (void **)&ls);
148 fput(ls->ls_file);
149
150 if (ls->ls_recalled)
151 atomic_dec(&ls->ls_stid.sc_file->fi_lo_recalls);
152
153 kmem_cache_free(nfs4_layout_stateid_cache, ls);
154}
155
156static int
157nfsd4_layout_setlease(struct nfs4_layout_stateid *ls)
158{
159 struct file_lock *fl;
160 int status;
161
162 fl = locks_alloc_lock();
163 if (!fl)
164 return -ENOMEM;
165 locks_init_lock(fl);
166 fl->fl_lmops = &nfsd4_layouts_lm_ops;
167 fl->fl_flags = FL_LAYOUT;
168 fl->fl_type = F_RDLCK;
169 fl->fl_end = OFFSET_MAX;
170 fl->fl_owner = ls;
171 fl->fl_pid = current->tgid;
172 fl->fl_file = ls->ls_file;
173
174 status = vfs_setlease(fl->fl_file, fl->fl_type, &fl, NULL);
175 if (status) {
176 locks_free_lock(fl);
177 return status;
178 }
179 BUG_ON(fl != NULL);
180 return 0;
181}
182
183static struct nfs4_layout_stateid *
184nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate,
185 struct nfs4_stid *parent, u32 layout_type)
186{
187 struct nfs4_client *clp = cstate->clp;
188 struct nfs4_file *fp = parent->sc_file;
189 struct nfs4_layout_stateid *ls;
190 struct nfs4_stid *stp;
191
192 stp = nfs4_alloc_stid(cstate->clp, nfs4_layout_stateid_cache);
193 if (!stp)
194 return NULL;
195 stp->sc_free = nfsd4_free_layout_stateid;
196 get_nfs4_file(fp);
197 stp->sc_file = fp;
198
199 ls = layoutstateid(stp);
200 INIT_LIST_HEAD(&ls->ls_perclnt);
201 INIT_LIST_HEAD(&ls->ls_perfile);
202 spin_lock_init(&ls->ls_lock);
203 INIT_LIST_HEAD(&ls->ls_layouts);
204 ls->ls_layout_type = layout_type;
205 nfsd4_init_cb(&ls->ls_recall, clp, &nfsd4_cb_layout_ops,
206 NFSPROC4_CLNT_CB_LAYOUT);
207
208 if (parent->sc_type == NFS4_DELEG_STID)
209 ls->ls_file = get_file(fp->fi_deleg_file);
210 else
211 ls->ls_file = find_any_file(fp);
212 BUG_ON(!ls->ls_file);
213
214 if (nfsd4_layout_setlease(ls)) {
215 put_nfs4_file(fp);
216 kmem_cache_free(nfs4_layout_stateid_cache, ls);
217 return NULL;
218 }
219
220 spin_lock(&clp->cl_lock);
221 stp->sc_type = NFS4_LAYOUT_STID;
222 list_add(&ls->ls_perclnt, &clp->cl_lo_states);
223 spin_unlock(&clp->cl_lock);
224
225 spin_lock(&fp->fi_lock);
226 list_add(&ls->ls_perfile, &fp->fi_lo_states);
227 spin_unlock(&fp->fi_lock);
228
229 trace_layoutstate_alloc(&ls->ls_stid.sc_stateid);
230 return ls;
231}
232
233__be32
234nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
235 struct nfsd4_compound_state *cstate, stateid_t *stateid,
236 bool create, u32 layout_type, struct nfs4_layout_stateid **lsp)
237{
238 struct nfs4_layout_stateid *ls;
239 struct nfs4_stid *stid;
240 unsigned char typemask = NFS4_LAYOUT_STID;
241 __be32 status;
242
243 if (create)
244 typemask |= (NFS4_OPEN_STID | NFS4_LOCK_STID | NFS4_DELEG_STID);
245
246 status = nfsd4_lookup_stateid(cstate, stateid, typemask, &stid,
247 net_generic(SVC_NET(rqstp), nfsd_net_id));
248 if (status)
249 goto out;
250
251 if (!fh_match(&cstate->current_fh.fh_handle,
252 &stid->sc_file->fi_fhandle)) {
253 status = nfserr_bad_stateid;
254 goto out_put_stid;
255 }
256
257 if (stid->sc_type != NFS4_LAYOUT_STID) {
258 ls = nfsd4_alloc_layout_stateid(cstate, stid, layout_type);
259 nfs4_put_stid(stid);
260
261 status = nfserr_jukebox;
262 if (!ls)
263 goto out;
264 } else {
265 ls = container_of(stid, struct nfs4_layout_stateid, ls_stid);
266
267 status = nfserr_bad_stateid;
268 if (stateid->si_generation > stid->sc_stateid.si_generation)
269 goto out_put_stid;
270 if (layout_type != ls->ls_layout_type)
271 goto out_put_stid;
272 }
273
274 *lsp = ls;
275 return 0;
276
277out_put_stid:
278 nfs4_put_stid(stid);
279out:
280 return status;
281}
282
283static void
284nfsd4_recall_file_layout(struct nfs4_layout_stateid *ls)
285{
286 spin_lock(&ls->ls_lock);
287 if (ls->ls_recalled)
288 goto out_unlock;
289
290 ls->ls_recalled = true;
291 atomic_inc(&ls->ls_stid.sc_file->fi_lo_recalls);
292 if (list_empty(&ls->ls_layouts))
293 goto out_unlock;
294
295 trace_layout_recall(&ls->ls_stid.sc_stateid);
296
297 atomic_inc(&ls->ls_stid.sc_count);
298 update_stateid(&ls->ls_stid.sc_stateid);
299 memcpy(&ls->ls_recall_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t));
300 nfsd4_run_cb(&ls->ls_recall);
301
302out_unlock:
303 spin_unlock(&ls->ls_lock);
304}
305
306static inline u64
307layout_end(struct nfsd4_layout_seg *seg)
308{
309 u64 end = seg->offset + seg->length;
310 return end >= seg->offset ? end : NFS4_MAX_UINT64;
311}
312
313static void
314layout_update_len(struct nfsd4_layout_seg *lo, u64 end)
315{
316 if (end == NFS4_MAX_UINT64)
317 lo->length = NFS4_MAX_UINT64;
318 else
319 lo->length = end - lo->offset;
320}
321
322static bool
323layouts_overlapping(struct nfs4_layout *lo, struct nfsd4_layout_seg *s)
324{
325 if (s->iomode != IOMODE_ANY && s->iomode != lo->lo_seg.iomode)
326 return false;
327 if (layout_end(&lo->lo_seg) <= s->offset)
328 return false;
329 if (layout_end(s) <= lo->lo_seg.offset)
330 return false;
331 return true;
332}
333
334static bool
335layouts_try_merge(struct nfsd4_layout_seg *lo, struct nfsd4_layout_seg *new)
336{
337 if (lo->iomode != new->iomode)
338 return false;
339 if (layout_end(new) < lo->offset)
340 return false;
341 if (layout_end(lo) < new->offset)
342 return false;
343
344 lo->offset = min(lo->offset, new->offset);
345 layout_update_len(lo, max(layout_end(lo), layout_end(new)));
346 return true;
347}
348
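	/*
	 * Worked example of the helpers above (illustrative numbers, not
	 * part of the patch), assuming equal iomodes:
	 *
	 *   lo  = { offset = 0,       length = 1048576 }  layout_end(lo) = 1048576
	 *   new = { offset = 1048576, length = ~0ULL }     offset + length wraps,
	 *                                                  so layout_end(new)
	 *                                                  clamps to NFS4_MAX_UINT64
	 *
	 * The segments are adjacent (layout_end(lo) == new->offset, which
	 * does not fail the "< new->offset" test), so layouts_try_merge()
	 * extends lo to { offset = 0, length = NFS4_MAX_UINT64 }, and
	 * layout_update_len() keeps length == NFS4_MAX_UINT64 rather than
	 * computing end - offset, preserving the "to end of file" encoding.
	 */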
349static __be32
350nfsd4_recall_conflict(struct nfs4_layout_stateid *ls)
351{
352 struct nfs4_file *fp = ls->ls_stid.sc_file;
353 struct nfs4_layout_stateid *l, *n;
354 __be32 nfserr = nfs_ok;
355
356 assert_spin_locked(&fp->fi_lock);
357
358 list_for_each_entry_safe(l, n, &fp->fi_lo_states, ls_perfile) {
359 if (l != ls) {
360 nfsd4_recall_file_layout(l);
361 nfserr = nfserr_recallconflict;
362 }
363 }
364
365 return nfserr;
366}
367
368__be32
369nfsd4_insert_layout(struct nfsd4_layoutget *lgp, struct nfs4_layout_stateid *ls)
370{
371 struct nfsd4_layout_seg *seg = &lgp->lg_seg;
372 struct nfs4_file *fp = ls->ls_stid.sc_file;
373 struct nfs4_layout *lp, *new = NULL;
374 __be32 nfserr;
375
376 spin_lock(&fp->fi_lock);
377 nfserr = nfsd4_recall_conflict(ls);
378 if (nfserr)
379 goto out;
380 spin_lock(&ls->ls_lock);
381 list_for_each_entry(lp, &ls->ls_layouts, lo_perstate) {
382 if (layouts_try_merge(&lp->lo_seg, seg))
383 goto done;
384 }
385 spin_unlock(&ls->ls_lock);
386 spin_unlock(&fp->fi_lock);
387
388 new = kmem_cache_alloc(nfs4_layout_cache, GFP_KERNEL);
389 if (!new)
390 return nfserr_jukebox;
391 memcpy(&new->lo_seg, seg, sizeof(lp->lo_seg));
392 new->lo_state = ls;
393
394 spin_lock(&fp->fi_lock);
395 nfserr = nfsd4_recall_conflict(ls);
396 if (nfserr)
397 goto out;
398 spin_lock(&ls->ls_lock);
399 list_for_each_entry(lp, &ls->ls_layouts, lo_perstate) {
400 if (layouts_try_merge(&lp->lo_seg, seg))
401 goto done;
402 }
403
404 atomic_inc(&ls->ls_stid.sc_count);
405 list_add_tail(&new->lo_perstate, &ls->ls_layouts);
406 new = NULL;
407done:
408 update_stateid(&ls->ls_stid.sc_stateid);
409 memcpy(&lgp->lg_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t));
410 spin_unlock(&ls->ls_lock);
411out:
412 spin_unlock(&fp->fi_lock);
413 if (new)
414 kmem_cache_free(nfs4_layout_cache, new);
415 return nfserr;
416}
417
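	/*
	 * Note the unlock/allocate/relock shape of nfsd4_insert_layout()
	 * above: the first pass under fi_lock and ls_lock tries to merge
	 * into an existing segment; if that fails, both locks are dropped
	 * so the GFP_KERNEL allocation may sleep, and after re-acquiring
	 * the locks the recall-conflict check and the merge scan are
	 * repeated, since another thread may have changed the lists in
	 * the meantime.  Only if the re-check still finds nothing to
	 * merge with is the new entry linked in; otherwise it is freed
	 * once the locks are dropped.
	 */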
418static void
419nfsd4_free_layouts(struct list_head *reaplist)
420{
421 while (!list_empty(reaplist)) {
422 struct nfs4_layout *lp = list_first_entry(reaplist,
423 struct nfs4_layout, lo_perstate);
424
425 list_del(&lp->lo_perstate);
426 nfs4_put_stid(&lp->lo_state->ls_stid);
427 kmem_cache_free(nfs4_layout_cache, lp);
428 }
429}
430
431static void
432nfsd4_return_file_layout(struct nfs4_layout *lp, struct nfsd4_layout_seg *seg,
433 struct list_head *reaplist)
434{
435 struct nfsd4_layout_seg *lo = &lp->lo_seg;
436 u64 end = layout_end(lo);
437
438 if (seg->offset <= lo->offset) {
439 if (layout_end(seg) >= end) {
440 list_move_tail(&lp->lo_perstate, reaplist);
441 return;
442 }
443 end = seg->offset;
444 } else {
445 /* retain the whole layout segment on a split. */
446 if (layout_end(seg) < end) {
447 dprintk("%s: split not supported\n", __func__);
448 return;
449 }
450
451 lo->offset = layout_end(seg);
452 }
453
454 layout_update_len(lo, end);
455}
456
457__be32
458nfsd4_return_file_layouts(struct svc_rqst *rqstp,
459 struct nfsd4_compound_state *cstate,
460 struct nfsd4_layoutreturn *lrp)
461{
462 struct nfs4_layout_stateid *ls;
463 struct nfs4_layout *lp, *n;
464 LIST_HEAD(reaplist);
465 __be32 nfserr;
466 int found = 0;
467
468 nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lrp->lr_sid,
469 false, lrp->lr_layout_type,
470 &ls);
471 if (nfserr) {
472 trace_layout_return_lookup_fail(&lrp->lr_sid);
473 return nfserr;
474 }
475
476 spin_lock(&ls->ls_lock);
477 list_for_each_entry_safe(lp, n, &ls->ls_layouts, lo_perstate) {
478 if (layouts_overlapping(lp, &lrp->lr_seg)) {
479 nfsd4_return_file_layout(lp, &lrp->lr_seg, &reaplist);
480 found++;
481 }
482 }
483 if (!list_empty(&ls->ls_layouts)) {
484 if (found) {
485 update_stateid(&ls->ls_stid.sc_stateid);
486 memcpy(&lrp->lr_sid, &ls->ls_stid.sc_stateid,
487 sizeof(stateid_t));
488 }
489 lrp->lrs_present = 1;
490 } else {
491 trace_layoutstate_unhash(&ls->ls_stid.sc_stateid);
492 nfs4_unhash_stid(&ls->ls_stid);
493 lrp->lrs_present = 0;
494 }
495 spin_unlock(&ls->ls_lock);
496
497 nfs4_put_stid(&ls->ls_stid);
498 nfsd4_free_layouts(&reaplist);
499 return nfs_ok;
500}
501
502__be32
503nfsd4_return_client_layouts(struct svc_rqst *rqstp,
504 struct nfsd4_compound_state *cstate,
505 struct nfsd4_layoutreturn *lrp)
506{
507 struct nfs4_layout_stateid *ls, *n;
508 struct nfs4_client *clp = cstate->clp;
509 struct nfs4_layout *lp, *t;
510 LIST_HEAD(reaplist);
511
512 lrp->lrs_present = 0;
513
514 spin_lock(&clp->cl_lock);
515 list_for_each_entry_safe(ls, n, &clp->cl_lo_states, ls_perclnt) {
516 if (lrp->lr_return_type == RETURN_FSID &&
517 !fh_fsid_match(&ls->ls_stid.sc_file->fi_fhandle,
518 &cstate->current_fh.fh_handle))
519 continue;
520
521 spin_lock(&ls->ls_lock);
522 list_for_each_entry_safe(lp, t, &ls->ls_layouts, lo_perstate) {
523 if (lrp->lr_seg.iomode == IOMODE_ANY ||
524 lrp->lr_seg.iomode == lp->lo_seg.iomode)
525 list_move_tail(&lp->lo_perstate, &reaplist);
526 }
527 spin_unlock(&ls->ls_lock);
528 }
529 spin_unlock(&clp->cl_lock);
530
531 nfsd4_free_layouts(&reaplist);
532 return 0;
533}
534
535static void
536nfsd4_return_all_layouts(struct nfs4_layout_stateid *ls,
537 struct list_head *reaplist)
538{
539 spin_lock(&ls->ls_lock);
540 list_splice_init(&ls->ls_layouts, reaplist);
541 spin_unlock(&ls->ls_lock);
542}
543
544void
545nfsd4_return_all_client_layouts(struct nfs4_client *clp)
546{
547 struct nfs4_layout_stateid *ls, *n;
548 LIST_HEAD(reaplist);
549
550 spin_lock(&clp->cl_lock);
551 list_for_each_entry_safe(ls, n, &clp->cl_lo_states, ls_perclnt)
552 nfsd4_return_all_layouts(ls, &reaplist);
553 spin_unlock(&clp->cl_lock);
554
555 nfsd4_free_layouts(&reaplist);
556}
557
558void
559nfsd4_return_all_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp)
560{
561 struct nfs4_layout_stateid *ls, *n;
562 LIST_HEAD(reaplist);
563
564 spin_lock(&fp->fi_lock);
565 list_for_each_entry_safe(ls, n, &fp->fi_lo_states, ls_perfile) {
566 if (ls->ls_stid.sc_client == clp)
567 nfsd4_return_all_layouts(ls, &reaplist);
568 }
569 spin_unlock(&fp->fi_lock);
570
571 nfsd4_free_layouts(&reaplist);
572}
573
574static void
575nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls)
576{
577 struct nfs4_client *clp = ls->ls_stid.sc_client;
578 char addr_str[INET6_ADDRSTRLEN];
579 static char *envp[] = {
580 "HOME=/",
581 "TERM=linux",
582 "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
583 NULL
584 };
585 char *argv[8];
586 int error;
587
588 rpc_ntop((struct sockaddr *)&clp->cl_addr, addr_str, sizeof(addr_str));
589
 590 trace_layout_recall_fail(&ls->ls_stid.sc_stateid);
591
592 printk(KERN_WARNING
593 "nfsd: client %s failed to respond to layout recall. "
594 " Fencing..\n", addr_str);
595
596 argv[0] = "/sbin/nfsd-recall-failed";
597 argv[1] = addr_str;
598 argv[2] = ls->ls_file->f_path.mnt->mnt_sb->s_id;
599 argv[3] = NULL;
600
601 error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
602 if (error) {
603 printk(KERN_ERR "nfsd: fence failed for client %s: %d!\n",
604 addr_str, error);
605 }
606}
607
608static int
609nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
610{
611 struct nfs4_layout_stateid *ls =
612 container_of(cb, struct nfs4_layout_stateid, ls_recall);
613 LIST_HEAD(reaplist);
614
615 switch (task->tk_status) {
616 case 0:
617 return 1;
618 case -NFS4ERR_NOMATCHING_LAYOUT:
619 trace_layout_recall_done(&ls->ls_stid.sc_stateid);
620 task->tk_status = 0;
621 return 1;
622 case -NFS4ERR_DELAY:
623 /* Poll the client until it's done with the layout */
 624 /* FIXME: cap the number of retries.
 625 * The pNFS standard states that we need only expire
 626 * the client after at least "lease time", e.g. lease-time * 2,
 627 * when failing to communicate a recall.
 628 */
 629 rpc_delay(task, HZ/100); /* 10 milliseconds */
630 return 0;
631 default:
632 /*
 633 * Unknown error or non-responding client; we'll need to fence.
634 */
635 nfsd4_cb_layout_fail(ls);
636 return -1;
637 }
638}
639
640static void
641nfsd4_cb_layout_release(struct nfsd4_callback *cb)
642{
643 struct nfs4_layout_stateid *ls =
644 container_of(cb, struct nfs4_layout_stateid, ls_recall);
645 LIST_HEAD(reaplist);
646
647 trace_layout_recall_release(&ls->ls_stid.sc_stateid);
648
649 nfsd4_return_all_layouts(ls, &reaplist);
650 nfsd4_free_layouts(&reaplist);
651 nfs4_put_stid(&ls->ls_stid);
652}
653
654static struct nfsd4_callback_ops nfsd4_cb_layout_ops = {
655 .done = nfsd4_cb_layout_done,
656 .release = nfsd4_cb_layout_release,
657};
658
659static bool
660nfsd4_layout_lm_break(struct file_lock *fl)
661{
662 /*
 663 * We don't want the locks code to time out the lease for us;
 664 * we'll remove it ourselves if a layout isn't returned
665 * in time:
666 */
667 fl->fl_break_time = 0;
668 nfsd4_recall_file_layout(fl->fl_owner);
669 return false;
670}
671
672static int
673nfsd4_layout_lm_change(struct file_lock *onlist, int arg,
674 struct list_head *dispose)
675{
676 BUG_ON(!(arg & F_UNLCK));
677 return lease_modify(onlist, arg, dispose);
678}
679
680static const struct lock_manager_operations nfsd4_layouts_lm_ops = {
681 .lm_break = nfsd4_layout_lm_break,
682 .lm_change = nfsd4_layout_lm_change,
683};
684
685int
686nfsd4_init_pnfs(void)
687{
688 int i;
689
690 for (i = 0; i < DEVID_HASH_SIZE; i++)
691 INIT_LIST_HEAD(&nfsd_devid_hash[i]);
692
693 nfs4_layout_cache = kmem_cache_create("nfs4_layout",
694 sizeof(struct nfs4_layout), 0, 0, NULL);
695 if (!nfs4_layout_cache)
696 return -ENOMEM;
697
698 nfs4_layout_stateid_cache = kmem_cache_create("nfs4_layout_stateid",
699 sizeof(struct nfs4_layout_stateid), 0, 0, NULL);
700 if (!nfs4_layout_stateid_cache) {
701 kmem_cache_destroy(nfs4_layout_cache);
702 return -ENOMEM;
703 }
704 return 0;
705}
706
707void
708nfsd4_exit_pnfs(void)
709{
710 int i;
711
712 kmem_cache_destroy(nfs4_layout_cache);
713 kmem_cache_destroy(nfs4_layout_stateid_cache);
714
715 for (i = 0; i < DEVID_HASH_SIZE; i++) {
716 struct nfsd4_deviceid_map *map, *n;
717
718 list_for_each_entry_safe(map, n, &nfsd_devid_hash[i], hash)
719 kfree(map);
720 }
721}
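
Taken together, the pieces of this file give the following recall lifecycle
(a simplified sketch; error paths omitted):

    LAYOUTGET
      -> nfsd4_preprocess_layout_stateid()
           -> nfsd4_alloc_layout_stateid()
                -> nfsd4_layout_setlease()      takes an FL_LAYOUT lease
    conflicting local access breaks the lease
      -> nfsd4_layout_lm_break()
           -> nfsd4_recall_file_layout()        sends CB_LAYOUTRECALL
    client answers
      -> nfsd4_cb_layout_done():
           0 or NFS4ERR_NOMATCHING_LAYOUT       done
           NFS4ERR_DELAY                        retry after 10 ms
           anything else                        nfsd4_cb_layout_fail(),
                                                which fences the client via
                                                /sbin/nfsd-recall-failed
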
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index ac71d13c69ef..d30bea8d0277 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -43,6 +43,8 @@
43#include "current_stateid.h" 43#include "current_stateid.h"
44#include "netns.h" 44#include "netns.h"
45#include "acl.h" 45#include "acl.h"
46#include "pnfs.h"
47#include "trace.h"
46 48
47#ifdef CONFIG_NFSD_V4_SECURITY_LABEL 49#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
48#include <linux/security.h> 50#include <linux/security.h>
@@ -1178,6 +1180,259 @@ nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1178 return status == nfserr_same ? nfs_ok : status; 1180 return status == nfserr_same ? nfs_ok : status;
1179} 1181}
1180 1182
1183#ifdef CONFIG_NFSD_PNFS
1184static const struct nfsd4_layout_ops *
1185nfsd4_layout_verify(struct svc_export *exp, unsigned int layout_type)
1186{
1187 if (!exp->ex_layout_type) {
1188 dprintk("%s: export does not support pNFS\n", __func__);
1189 return NULL;
1190 }
1191
1192 if (exp->ex_layout_type != layout_type) {
1193 dprintk("%s: layout type %d not supported\n",
1194 __func__, layout_type);
1195 return NULL;
1196 }
1197
1198 return nfsd4_layout_ops[layout_type];
1199}
1200
1201static __be32
1202nfsd4_getdeviceinfo(struct svc_rqst *rqstp,
1203 struct nfsd4_compound_state *cstate,
1204 struct nfsd4_getdeviceinfo *gdp)
1205{
1206 const struct nfsd4_layout_ops *ops;
1207 struct nfsd4_deviceid_map *map;
1208 struct svc_export *exp;
1209 __be32 nfserr;
1210
1211 dprintk("%s: layout_type %u dev_id [0x%llx:0x%x] maxcnt %u\n",
1212 __func__,
1213 gdp->gd_layout_type,
1214 gdp->gd_devid.fsid_idx, gdp->gd_devid.generation,
1215 gdp->gd_maxcount);
1216
1217 map = nfsd4_find_devid_map(gdp->gd_devid.fsid_idx);
1218 if (!map) {
1219 dprintk("%s: couldn't find device ID to export mapping!\n",
1220 __func__);
1221 return nfserr_noent;
1222 }
1223
1224 exp = rqst_exp_find(rqstp, map->fsid_type, map->fsid);
1225 if (IS_ERR(exp)) {
1226 dprintk("%s: could not find device id\n", __func__);
1227 return nfserr_noent;
1228 }
1229
1230 nfserr = nfserr_layoutunavailable;
1231 ops = nfsd4_layout_verify(exp, gdp->gd_layout_type);
1232 if (!ops)
1233 goto out;
1234
1235 nfserr = nfs_ok;
1236 if (gdp->gd_maxcount != 0)
1237 nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb, gdp);
1238
1239 gdp->gd_notify_types &= ops->notify_types;
1240 exp_put(exp);
1241out:
1242 return nfserr;
1243}
1244
1245static __be32
1246nfsd4_layoutget(struct svc_rqst *rqstp,
1247 struct nfsd4_compound_state *cstate,
1248 struct nfsd4_layoutget *lgp)
1249{
1250 struct svc_fh *current_fh = &cstate->current_fh;
1251 const struct nfsd4_layout_ops *ops;
1252 struct nfs4_layout_stateid *ls;
1253 __be32 nfserr;
1254 int accmode;
1255
1256 switch (lgp->lg_seg.iomode) {
1257 case IOMODE_READ:
1258 accmode = NFSD_MAY_READ;
1259 break;
1260 case IOMODE_RW:
1261 accmode = NFSD_MAY_READ | NFSD_MAY_WRITE;
1262 break;
1263 default:
1264 dprintk("%s: invalid iomode %d\n",
1265 __func__, lgp->lg_seg.iomode);
1266 nfserr = nfserr_badiomode;
1267 goto out;
1268 }
1269
1270 nfserr = fh_verify(rqstp, current_fh, 0, accmode);
1271 if (nfserr)
1272 goto out;
1273
1274 nfserr = nfserr_layoutunavailable;
1275 ops = nfsd4_layout_verify(current_fh->fh_export, lgp->lg_layout_type);
1276 if (!ops)
1277 goto out;
1278
1279 /*
1280 * Verify minlength and range as per RFC5661:
1281 * o If loga_length is less than loga_minlength,
1282 * the metadata server MUST return NFS4ERR_INVAL.
1283 * o If the sum of loga_offset and loga_minlength exceeds
1284 * NFS4_UINT64_MAX, and loga_minlength is not
1285 * NFS4_UINT64_MAX, the error NFS4ERR_INVAL MUST result.
1286 * o If the sum of loga_offset and loga_length exceeds
1287 * NFS4_UINT64_MAX, and loga_length is not NFS4_UINT64_MAX,
1288 * the error NFS4ERR_INVAL MUST result.
1289 */
1290 nfserr = nfserr_inval;
1291 if (lgp->lg_seg.length < lgp->lg_minlength ||
1292 (lgp->lg_minlength != NFS4_MAX_UINT64 &&
1293 lgp->lg_minlength > NFS4_MAX_UINT64 - lgp->lg_seg.offset) ||
1294 (lgp->lg_seg.length != NFS4_MAX_UINT64 &&
1295 lgp->lg_seg.length > NFS4_MAX_UINT64 - lgp->lg_seg.offset))
1296 goto out;
1297 if (lgp->lg_seg.length == 0)
1298 goto out;
1299
1300 nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lgp->lg_sid,
1301 true, lgp->lg_layout_type, &ls);
1302 if (nfserr) {
1303 trace_layout_get_lookup_fail(&lgp->lg_sid);
1304 goto out;
1305 }
1306
1307 nfserr = nfserr_recallconflict;
1308 if (atomic_read(&ls->ls_stid.sc_file->fi_lo_recalls))
1309 goto out_put_stid;
1310
1311 nfserr = ops->proc_layoutget(current_fh->fh_dentry->d_inode,
1312 current_fh, lgp);
1313 if (nfserr)
1314 goto out_put_stid;
1315
1316 nfserr = nfsd4_insert_layout(lgp, ls);
1317
1318out_put_stid:
1319 nfs4_put_stid(&ls->ls_stid);
1320out:
1321 return nfserr;
1322}
1323
1324static __be32
1325nfsd4_layoutcommit(struct svc_rqst *rqstp,
1326 struct nfsd4_compound_state *cstate,
1327 struct nfsd4_layoutcommit *lcp)
1328{
1329 const struct nfsd4_layout_seg *seg = &lcp->lc_seg;
1330 struct svc_fh *current_fh = &cstate->current_fh;
1331 const struct nfsd4_layout_ops *ops;
1332 loff_t new_size = lcp->lc_last_wr + 1;
1333 struct inode *inode;
1334 struct nfs4_layout_stateid *ls;
1335 __be32 nfserr;
1336
1337 nfserr = fh_verify(rqstp, current_fh, 0, NFSD_MAY_WRITE);
1338 if (nfserr)
1339 goto out;
1340
1341 nfserr = nfserr_layoutunavailable;
1342 ops = nfsd4_layout_verify(current_fh->fh_export, lcp->lc_layout_type);
1343 if (!ops)
1344 goto out;
1345 inode = current_fh->fh_dentry->d_inode;
1346
1347 nfserr = nfserr_inval;
1348 if (new_size <= seg->offset) {
1349 dprintk("pnfsd: last write before layout segment\n");
1350 goto out;
1351 }
1352 if (new_size > seg->offset + seg->length) {
1353 dprintk("pnfsd: last write beyond layout segment\n");
1354 goto out;
1355 }
1356 if (!lcp->lc_newoffset && new_size > i_size_read(inode)) {
1357 dprintk("pnfsd: layoutcommit beyond EOF\n");
1358 goto out;
1359 }
1360
1361 nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lcp->lc_sid,
1362 false, lcp->lc_layout_type,
1363 &ls);
1364 if (nfserr) {
1365 trace_layout_commit_lookup_fail(&lcp->lc_sid);
1366 /* fixup error code as per RFC5661 */
1367 if (nfserr == nfserr_bad_stateid)
1368 nfserr = nfserr_badlayout;
1369 goto out;
1370 }
1371
1372 nfserr = ops->proc_layoutcommit(inode, lcp);
1373 if (nfserr)
1374 goto out_put_stid;
1375
1376 if (new_size > i_size_read(inode)) {
1377 lcp->lc_size_chg = 1;
1378 lcp->lc_newsize = new_size;
1379 } else {
1380 lcp->lc_size_chg = 0;
1381 }
1382
1383out_put_stid:
1384 nfs4_put_stid(&ls->ls_stid);
1385out:
1386 return nfserr;
1387}
1388
1389static __be32
1390nfsd4_layoutreturn(struct svc_rqst *rqstp,
1391 struct nfsd4_compound_state *cstate,
1392 struct nfsd4_layoutreturn *lrp)
1393{
1394 struct svc_fh *current_fh = &cstate->current_fh;
1395 __be32 nfserr;
1396
1397 nfserr = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP);
1398 if (nfserr)
1399 goto out;
1400
1401 nfserr = nfserr_layoutunavailable;
1402 if (!nfsd4_layout_verify(current_fh->fh_export, lrp->lr_layout_type))
1403 goto out;
1404
1405 switch (lrp->lr_seg.iomode) {
1406 case IOMODE_READ:
1407 case IOMODE_RW:
1408 case IOMODE_ANY:
1409 break;
1410 default:
1411 dprintk("%s: invalid iomode %d\n", __func__,
1412 lrp->lr_seg.iomode);
1413 nfserr = nfserr_inval;
1414 goto out;
1415 }
1416
1417 switch (lrp->lr_return_type) {
1418 case RETURN_FILE:
1419 nfserr = nfsd4_return_file_layouts(rqstp, cstate, lrp);
1420 break;
1421 case RETURN_FSID:
1422 case RETURN_ALL:
1423 nfserr = nfsd4_return_client_layouts(rqstp, cstate, lrp);
1424 break;
1425 default:
1426 dprintk("%s: invalid return_type %d\n", __func__,
1427 lrp->lr_return_type);
1428 nfserr = nfserr_inval;
1429 break;
1430 }
1431out:
1432 return nfserr;
1433}
1434#endif /* CONFIG_NFSD_PNFS */
1435
1181/* 1436/*
1182 * NULL call. 1437 * NULL call.
1183 */ 1438 */
@@ -1679,6 +1934,36 @@ static inline u32 nfsd4_create_session_rsize(struct svc_rqst *rqstp, struct nfsd
1679 op_encode_channel_attrs_maxsz) * sizeof(__be32); 1934 op_encode_channel_attrs_maxsz) * sizeof(__be32);
1680} 1935}
1681 1936
1937#ifdef CONFIG_NFSD_PNFS
1938/*
1939 * At this stage we don't really know what layout driver will handle the request,
1940 * so we need to define an arbitrary upper bound here.
1941 */
1942#define MAX_LAYOUT_SIZE 128
1943static inline u32 nfsd4_layoutget_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
1944{
1945 return (op_encode_hdr_size +
1946 1 /* logr_return_on_close */ +
1947 op_encode_stateid_maxsz +
1948 1 /* nr of layouts */ +
1949 MAX_LAYOUT_SIZE) * sizeof(__be32);
1950}
1951
1952static inline u32 nfsd4_layoutcommit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
1953{
1954 return (op_encode_hdr_size +
1955 1 /* locr_newsize */ +
1956 2 /* ns_size */) * sizeof(__be32);
1957}
1958
1959static inline u32 nfsd4_layoutreturn_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
1960{
1961 return (op_encode_hdr_size +
1962 1 /* lrs_stateid */ +
1963 op_encode_stateid_maxsz) * sizeof(__be32);
1964}
1965#endif /* CONFIG_NFSD_PNFS */
1966
1682static struct nfsd4_operation nfsd4_ops[] = { 1967static struct nfsd4_operation nfsd4_ops[] = {
1683 [OP_ACCESS] = { 1968 [OP_ACCESS] = {
1684 .op_func = (nfsd4op_func)nfsd4_access, 1969 .op_func = (nfsd4op_func)nfsd4_access,
@@ -1966,6 +2251,31 @@ static struct nfsd4_operation nfsd4_ops[] = {
1966 .op_get_currentstateid = (stateid_getter)nfsd4_get_freestateid, 2251 .op_get_currentstateid = (stateid_getter)nfsd4_get_freestateid,
1967 .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, 2252 .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
1968 }, 2253 },
2254#ifdef CONFIG_NFSD_PNFS
2255 [OP_GETDEVICEINFO] = {
2256 .op_func = (nfsd4op_func)nfsd4_getdeviceinfo,
2257 .op_flags = ALLOWED_WITHOUT_FH,
2258 .op_name = "OP_GETDEVICEINFO",
2259 },
2260 [OP_LAYOUTGET] = {
2261 .op_func = (nfsd4op_func)nfsd4_layoutget,
2262 .op_flags = OP_MODIFIES_SOMETHING,
2263 .op_name = "OP_LAYOUTGET",
2264 .op_rsize_bop = (nfsd4op_rsize)nfsd4_layoutget_rsize,
2265 },
2266 [OP_LAYOUTCOMMIT] = {
2267 .op_func = (nfsd4op_func)nfsd4_layoutcommit,
2268 .op_flags = OP_MODIFIES_SOMETHING,
2269 .op_name = "OP_LAYOUTCOMMIT",
2270 .op_rsize_bop = (nfsd4op_rsize)nfsd4_layoutcommit_rsize,
2271 },
2272 [OP_LAYOUTRETURN] = {
2273 .op_func = (nfsd4op_func)nfsd4_layoutreturn,
2274 .op_flags = OP_MODIFIES_SOMETHING,
2275 .op_name = "OP_LAYOUTRETURN",
2276 .op_rsize_bop = (nfsd4op_rsize)nfsd4_layoutreturn_rsize,
2277 },
2278#endif /* CONFIG_NFSD_PNFS */
1969 2279
1970 /* NFSv4.2 operations */ 2280 /* NFSv4.2 operations */
1971 [OP_ALLOCATE] = { 2281 [OP_ALLOCATE] = {
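
For a sense of scale, and assuming op_encode_hdr_size is the usual two XDR
words (opcode plus status) and op_encode_stateid_maxsz four words for a
16-byte stateid, as defined earlier in this file, the reply-size estimates
above work out to:

    LAYOUTGET     (2 + 1 + 4 + 1 + 128) * 4 = 544 bytes
    LAYOUTCOMMIT  (2 + 1 + 2) * 4           =  20 bytes
    LAYOUTRETURN  (2 + 1 + 4) * 4           =  28 bytes

with the deliberately generous 128-word MAX_LAYOUT_SIZE dominating the
LAYOUTGET estimate.
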
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 3550a9c87616..f6b2a09f793f 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -48,6 +48,7 @@
48#include "current_stateid.h" 48#include "current_stateid.h"
49 49
50#include "netns.h" 50#include "netns.h"
51#include "pnfs.h"
51 52
52#define NFSDDBG_FACILITY NFSDDBG_PROC 53#define NFSDDBG_FACILITY NFSDDBG_PROC
53 54
@@ -150,16 +151,6 @@ renew_client_locked(struct nfs4_client *clp)
150 clp->cl_time = get_seconds(); 151 clp->cl_time = get_seconds();
151} 152}
152 153
153static inline void
154renew_client(struct nfs4_client *clp)
155{
156 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
157
158 spin_lock(&nn->client_lock);
159 renew_client_locked(clp);
160 spin_unlock(&nn->client_lock);
161}
162
163static void put_client_renew_locked(struct nfs4_client *clp) 154static void put_client_renew_locked(struct nfs4_client *clp)
164{ 155{
165 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); 156 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
@@ -282,7 +273,7 @@ static void nfsd4_free_file_rcu(struct rcu_head *rcu)
282 kmem_cache_free(file_slab, fp); 273 kmem_cache_free(file_slab, fp);
283} 274}
284 275
285static inline void 276void
286put_nfs4_file(struct nfs4_file *fi) 277put_nfs4_file(struct nfs4_file *fi)
287{ 278{
288 might_lock(&state_lock); 279 might_lock(&state_lock);
@@ -295,12 +286,6 @@ put_nfs4_file(struct nfs4_file *fi)
295 } 286 }
296} 287}
297 288
298static inline void
299get_nfs4_file(struct nfs4_file *fi)
300{
301 atomic_inc(&fi->fi_ref);
302}
303
304static struct file * 289static struct file *
305__nfs4_get_fd(struct nfs4_file *f, int oflag) 290__nfs4_get_fd(struct nfs4_file *f, int oflag)
306{ 291{
@@ -358,7 +343,7 @@ find_readable_file(struct nfs4_file *f)
358 return ret; 343 return ret;
359} 344}
360 345
361static struct file * 346struct file *
362find_any_file(struct nfs4_file *f) 347find_any_file(struct nfs4_file *f)
363{ 348{
364 struct file *ret; 349 struct file *ret;
@@ -408,14 +393,6 @@ static unsigned int file_hashval(struct knfsd_fh *fh)
408 return nfsd_fh_hashval(fh) & (FILE_HASH_SIZE - 1); 393 return nfsd_fh_hashval(fh) & (FILE_HASH_SIZE - 1);
409} 394}
410 395
411static bool nfsd_fh_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
412{
413 return fh1->fh_size == fh2->fh_size &&
414 !memcmp(fh1->fh_base.fh_pad,
415 fh2->fh_base.fh_pad,
416 fh1->fh_size);
417}
418
419static struct hlist_head file_hashtbl[FILE_HASH_SIZE]; 396static struct hlist_head file_hashtbl[FILE_HASH_SIZE];
420 397
421static void 398static void
@@ -494,7 +471,7 @@ static void nfs4_file_put_access(struct nfs4_file *fp, u32 access)
494 __nfs4_file_put_access(fp, O_RDONLY); 471 __nfs4_file_put_access(fp, O_RDONLY);
495} 472}
496 473
497static struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, 474struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl,
498 struct kmem_cache *slab) 475 struct kmem_cache *slab)
499{ 476{
500 struct nfs4_stid *stid; 477 struct nfs4_stid *stid;
@@ -688,17 +665,17 @@ static void nfs4_put_deleg_lease(struct nfs4_file *fp)
688 struct file *filp = NULL; 665 struct file *filp = NULL;
689 666
690 spin_lock(&fp->fi_lock); 667 spin_lock(&fp->fi_lock);
691 if (fp->fi_deleg_file && atomic_dec_and_test(&fp->fi_delegees)) 668 if (fp->fi_deleg_file && --fp->fi_delegees == 0)
692 swap(filp, fp->fi_deleg_file); 669 swap(filp, fp->fi_deleg_file);
693 spin_unlock(&fp->fi_lock); 670 spin_unlock(&fp->fi_lock);
694 671
695 if (filp) { 672 if (filp) {
696 vfs_setlease(filp, F_UNLCK, NULL, NULL); 673 vfs_setlease(filp, F_UNLCK, NULL, (void **)&fp);
697 fput(filp); 674 fput(filp);
698 } 675 }
699} 676}
700 677
701static void unhash_stid(struct nfs4_stid *s) 678void nfs4_unhash_stid(struct nfs4_stid *s)
702{ 679{
703 s->sc_type = 0; 680 s->sc_type = 0;
704} 681}
@@ -1006,7 +983,7 @@ static void unhash_lock_stateid(struct nfs4_ol_stateid *stp)
1006 983
1007 list_del_init(&stp->st_locks); 984 list_del_init(&stp->st_locks);
1008 unhash_ol_stateid(stp); 985 unhash_ol_stateid(stp);
1009 unhash_stid(&stp->st_stid); 986 nfs4_unhash_stid(&stp->st_stid);
1010} 987}
1011 988
1012static void release_lock_stateid(struct nfs4_ol_stateid *stp) 989static void release_lock_stateid(struct nfs4_ol_stateid *stp)
@@ -1518,7 +1495,12 @@ unhash_session(struct nfsd4_session *ses)
1518static int 1495static int
1519STALE_CLIENTID(clientid_t *clid, struct nfsd_net *nn) 1496STALE_CLIENTID(clientid_t *clid, struct nfsd_net *nn)
1520{ 1497{
1521 if (clid->cl_boot == nn->boot_time) 1498 /*
1499 * We're assuming the clid was not given out from a boot
1500 * precisely 2^32 (about 136 years) before this one. That seems
1501 * a safe assumption:
1502 */
1503 if (clid->cl_boot == (u32)nn->boot_time)
1522 return 0; 1504 return 0;
1523 dprintk("NFSD stale clientid (%08x/%08x) boot_time %08lx\n", 1505 dprintk("NFSD stale clientid (%08x/%08x) boot_time %08lx\n",
1524 clid->cl_boot, clid->cl_id, nn->boot_time); 1506 clid->cl_boot, clid->cl_id, nn->boot_time);
@@ -1558,6 +1540,9 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
1558 INIT_LIST_HEAD(&clp->cl_lru); 1540 INIT_LIST_HEAD(&clp->cl_lru);
1559 INIT_LIST_HEAD(&clp->cl_callbacks); 1541 INIT_LIST_HEAD(&clp->cl_callbacks);
1560 INIT_LIST_HEAD(&clp->cl_revoked); 1542 INIT_LIST_HEAD(&clp->cl_revoked);
1543#ifdef CONFIG_NFSD_PNFS
1544 INIT_LIST_HEAD(&clp->cl_lo_states);
1545#endif
1561 spin_lock_init(&clp->cl_lock); 1546 spin_lock_init(&clp->cl_lock);
1562 rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table"); 1547 rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
1563 return clp; 1548 return clp;
@@ -1662,6 +1647,7 @@ __destroy_client(struct nfs4_client *clp)
1662 nfs4_get_stateowner(&oo->oo_owner); 1647 nfs4_get_stateowner(&oo->oo_owner);
1663 release_openowner(oo); 1648 release_openowner(oo);
1664 } 1649 }
1650 nfsd4_return_all_client_layouts(clp);
1665 nfsd4_shutdown_callback(clp); 1651 nfsd4_shutdown_callback(clp);
1666 if (clp->cl_cb_conn.cb_xprt) 1652 if (clp->cl_cb_conn.cb_xprt)
1667 svc_xprt_put(clp->cl_cb_conn.cb_xprt); 1653 svc_xprt_put(clp->cl_cb_conn.cb_xprt);
@@ -2145,8 +2131,11 @@ nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
2145static void 2131static void
2146nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid) 2132nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid)
2147{ 2133{
2148 /* pNFS is not supported */ 2134#ifdef CONFIG_NFSD_PNFS
2135 new->cl_exchange_flags |= EXCHGID4_FLAG_USE_PNFS_MDS;
2136#else
2149 new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS; 2137 new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS;
2138#endif
2150 2139
2151 /* Referrals are supported, Migration is not. */ 2140 /* Referrals are supported, Migration is not. */
2152 new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER; 2141 new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER;
@@ -3074,6 +3063,10 @@ static void nfsd4_init_file(struct knfsd_fh *fh, unsigned int hashval,
3074 fp->fi_share_deny = 0; 3063 fp->fi_share_deny = 0;
3075 memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); 3064 memset(fp->fi_fds, 0, sizeof(fp->fi_fds));
3076 memset(fp->fi_access, 0, sizeof(fp->fi_access)); 3065 memset(fp->fi_access, 0, sizeof(fp->fi_access));
3066#ifdef CONFIG_NFSD_PNFS
3067 INIT_LIST_HEAD(&fp->fi_lo_states);
3068 atomic_set(&fp->fi_lo_recalls, 0);
3069#endif
3077 hlist_add_head_rcu(&fp->fi_hash, &file_hashtbl[hashval]); 3070 hlist_add_head_rcu(&fp->fi_hash, &file_hashtbl[hashval]);
3078} 3071}
3079 3072
@@ -3300,7 +3293,7 @@ find_file_locked(struct knfsd_fh *fh, unsigned int hashval)
3300 struct nfs4_file *fp; 3293 struct nfs4_file *fp;
3301 3294
3302 hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash) { 3295 hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash) {
3303 if (nfsd_fh_match(&fp->fi_fhandle, fh)) { 3296 if (fh_match(&fp->fi_fhandle, fh)) {
3304 if (atomic_inc_not_zero(&fp->fi_ref)) 3297 if (atomic_inc_not_zero(&fp->fi_ref))
3305 return fp; 3298 return fp;
3306 } 3299 }
@@ -3308,7 +3301,7 @@ find_file_locked(struct knfsd_fh *fh, unsigned int hashval)
3308 return NULL; 3301 return NULL;
3309} 3302}
3310 3303
3311static struct nfs4_file * 3304struct nfs4_file *
3312find_file(struct knfsd_fh *fh) 3305find_file(struct knfsd_fh *fh)
3313{ 3306{
3314 struct nfs4_file *fp; 3307 struct nfs4_file *fp;
@@ -3477,7 +3470,8 @@ nfsd_break_deleg_cb(struct file_lock *fl)
3477} 3470}
3478 3471
3479static int 3472static int
3480nfsd_change_deleg_cb(struct file_lock **onlist, int arg, struct list_head *dispose) 3473nfsd_change_deleg_cb(struct file_lock *onlist, int arg,
3474 struct list_head *dispose)
3481{ 3475{
3482 if (arg & F_UNLCK) 3476 if (arg & F_UNLCK)
3483 return lease_modify(onlist, arg, dispose); 3477 return lease_modify(onlist, arg, dispose);
@@ -3855,12 +3849,12 @@ static int nfs4_setlease(struct nfs4_delegation *dp)
3855 /* Race breaker */ 3849 /* Race breaker */
3856 if (fp->fi_deleg_file) { 3850 if (fp->fi_deleg_file) {
3857 status = 0; 3851 status = 0;
3858 atomic_inc(&fp->fi_delegees); 3852 ++fp->fi_delegees;
3859 hash_delegation_locked(dp, fp); 3853 hash_delegation_locked(dp, fp);
3860 goto out_unlock; 3854 goto out_unlock;
3861 } 3855 }
3862 fp->fi_deleg_file = filp; 3856 fp->fi_deleg_file = filp;
3863 atomic_set(&fp->fi_delegees, 1); 3857 fp->fi_delegees = 1;
3864 hash_delegation_locked(dp, fp); 3858 hash_delegation_locked(dp, fp);
3865 spin_unlock(&fp->fi_lock); 3859 spin_unlock(&fp->fi_lock);
3866 spin_unlock(&state_lock); 3860 spin_unlock(&state_lock);
@@ -3897,11 +3891,11 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
3897 status = nfs4_setlease(dp); 3891 status = nfs4_setlease(dp);
3898 goto out; 3892 goto out;
3899 } 3893 }
3900 atomic_inc(&fp->fi_delegees);
3901 if (fp->fi_had_conflict) { 3894 if (fp->fi_had_conflict) {
3902 status = -EAGAIN; 3895 status = -EAGAIN;
3903 goto out_unlock; 3896 goto out_unlock;
3904 } 3897 }
3898 ++fp->fi_delegees;
3905 hash_delegation_locked(dp, fp); 3899 hash_delegation_locked(dp, fp);
3906 status = 0; 3900 status = 0;
3907out_unlock: 3901out_unlock:
@@ -4294,7 +4288,7 @@ laundromat_main(struct work_struct *laundry)
4294 4288
4295static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp) 4289static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp)
4296{ 4290{
4297 if (!nfsd_fh_match(&fhp->fh_handle, &stp->st_stid.sc_file->fi_fhandle)) 4291 if (!fh_match(&fhp->fh_handle, &stp->st_stid.sc_file->fi_fhandle))
4298 return nfserr_bad_stateid; 4292 return nfserr_bad_stateid;
4299 return nfs_ok; 4293 return nfs_ok;
4300} 4294}
@@ -4445,7 +4439,7 @@ out_unlock:
4445 return status; 4439 return status;
4446} 4440}
4447 4441
4448static __be32 4442__be32
4449nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, 4443nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
4450 stateid_t *stateid, unsigned char typemask, 4444 stateid_t *stateid, unsigned char typemask,
4451 struct nfs4_stid **s, struct nfsd_net *nn) 4445 struct nfs4_stid **s, struct nfsd_net *nn)
@@ -4859,6 +4853,9 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4859 update_stateid(&stp->st_stid.sc_stateid); 4853 update_stateid(&stp->st_stid.sc_stateid);
4860 memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); 4854 memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
4861 4855
4856 nfsd4_return_all_file_layouts(stp->st_stateowner->so_client,
4857 stp->st_stid.sc_file);
4858
4862 nfsd4_close_open_stateid(stp); 4859 nfsd4_close_open_stateid(stp);
4863 4860
4864 /* put reference from nfs4_preprocess_seqid_op */ 4861 /* put reference from nfs4_preprocess_seqid_op */
@@ -5556,10 +5553,11 @@ out_nfserr:
5556static bool 5553static bool
5557check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) 5554check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner)
5558{ 5555{
5559 struct file_lock **flpp; 5556 struct file_lock *fl;
5560 int status = false; 5557 int status = false;
5561 struct file *filp = find_any_file(fp); 5558 struct file *filp = find_any_file(fp);
5562 struct inode *inode; 5559 struct inode *inode;
5560 struct file_lock_context *flctx;
5563 5561
5564 if (!filp) { 5562 if (!filp) {
5565 /* Any valid lock stateid should have some sort of access */ 5563 /* Any valid lock stateid should have some sort of access */
@@ -5568,15 +5566,18 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner)
5568 } 5566 }
5569 5567
5570 inode = file_inode(filp); 5568 inode = file_inode(filp);
5569 flctx = inode->i_flctx;
5571 5570
5572 spin_lock(&inode->i_lock); 5571 if (flctx && !list_empty_careful(&flctx->flc_posix)) {
5573 for (flpp = &inode->i_flock; *flpp != NULL; flpp = &(*flpp)->fl_next) { 5572 spin_lock(&flctx->flc_lock);
5574 if ((*flpp)->fl_owner == (fl_owner_t)lowner) { 5573 list_for_each_entry(fl, &flctx->flc_posix, fl_list) {
5575 status = true; 5574 if (fl->fl_owner == (fl_owner_t)lowner) {
5576 break; 5575 status = true;
5576 break;
5577 }
5577 } 5578 }
5579 spin_unlock(&flctx->flc_lock);
5578 } 5580 }
5579 spin_unlock(&inode->i_lock);
5580 fput(filp); 5581 fput(filp);
5581 return status; 5582 return status;
5582} 5583}
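
The check_for_locks() rewrite above follows the v3.20 file-locking rework: per-inode locks moved off the old inode->i_flock chain (guarded by i_lock) onto a dedicated struct file_lock_context with separate flc_posix/flc_flock/flc_lease lists under their own flc_lock spinlock. A minimal sketch of the new traversal pattern, using the field names of that era (fl_list was renamed in later kernels); this is an illustration, not part of the patch:

/*
 * Sketch: does any POSIX lock on this inode belong to the given owner?
 * Assumes the 3.20-era layout (inode->i_flctx, file_lock.fl_list).
 */
#include <linux/fs.h>
#include <linux/spinlock.h>

static bool owner_has_posix_lock(struct inode *inode, fl_owner_t owner)
{
	struct file_lock_context *flctx = inode->i_flctx;
	struct file_lock *fl;
	bool found = false;

	/* list_empty_careful() lets callers skip taking flc_lock at all */
	if (!flctx || list_empty_careful(&flctx->flc_posix))
		return false;

	spin_lock(&flctx->flc_lock);
	list_for_each_entry(fl, &flctx->flc_posix, fl_list) {
		if (fl->fl_owner == owner) {
			found = true;
			break;
		}
	}
	spin_unlock(&flctx->flc_lock);
	return found;
}
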
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 15f7b73e0c0f..df5e66caf100 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -47,6 +47,7 @@
47#include "state.h" 47#include "state.h"
48#include "cache.h" 48#include "cache.h"
49#include "netns.h" 49#include "netns.h"
50#include "pnfs.h"
50 51
51#ifdef CONFIG_NFSD_V4_SECURITY_LABEL 52#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
52#include <linux/security.h> 53#include <linux/security.h>
@@ -234,6 +235,26 @@ static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes)
234 return ret; 235 return ret;
235} 236}
236 237
238/*
239 * We require the high 32 bits of 'seconds' to be 0, and
240 * we ignore all 32 bits of 'nseconds'.
241 */
242static __be32
243nfsd4_decode_time(struct nfsd4_compoundargs *argp, struct timespec *tv)
244{
245 DECODE_HEAD;
246 u64 sec;
247
248 READ_BUF(12);
249 p = xdr_decode_hyper(p, &sec);
250 tv->tv_sec = sec;
251 tv->tv_nsec = be32_to_cpup(p++);
252 if (tv->tv_nsec >= (u32)1000000000)
253 return nfserr_inval;
254
255 DECODE_TAIL;
256}
257
237static __be32 258static __be32
238nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval) 259nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
239{ 260{
@@ -267,7 +288,6 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
267{ 288{
268 int expected_len, len = 0; 289 int expected_len, len = 0;
269 u32 dummy32; 290 u32 dummy32;
270 u64 sec;
271 char *buf; 291 char *buf;
272 292
273 DECODE_HEAD; 293 DECODE_HEAD;
@@ -358,15 +378,10 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
358 dummy32 = be32_to_cpup(p++); 378 dummy32 = be32_to_cpup(p++);
359 switch (dummy32) { 379 switch (dummy32) {
360 case NFS4_SET_TO_CLIENT_TIME: 380 case NFS4_SET_TO_CLIENT_TIME:
361 /* We require the high 32 bits of 'seconds' to be 0, and we ignore
362 all 32 bits of 'nseconds'. */
363 READ_BUF(12);
364 len += 12; 381 len += 12;
365 p = xdr_decode_hyper(p, &sec); 382 status = nfsd4_decode_time(argp, &iattr->ia_atime);
366 iattr->ia_atime.tv_sec = (time_t)sec; 383 if (status)
367 iattr->ia_atime.tv_nsec = be32_to_cpup(p++); 384 return status;
368 if (iattr->ia_atime.tv_nsec >= (u32)1000000000)
369 return nfserr_inval;
370 iattr->ia_valid |= (ATTR_ATIME | ATTR_ATIME_SET); 385 iattr->ia_valid |= (ATTR_ATIME | ATTR_ATIME_SET);
371 break; 386 break;
372 case NFS4_SET_TO_SERVER_TIME: 387 case NFS4_SET_TO_SERVER_TIME:
@@ -382,15 +397,10 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
382 dummy32 = be32_to_cpup(p++); 397 dummy32 = be32_to_cpup(p++);
383 switch (dummy32) { 398 switch (dummy32) {
384 case NFS4_SET_TO_CLIENT_TIME: 399 case NFS4_SET_TO_CLIENT_TIME:
385 /* We require the high 32 bits of 'seconds' to be 0, and we ignore
386 all 32 bits of 'nseconds'. */
387 READ_BUF(12);
388 len += 12; 400 len += 12;
389 p = xdr_decode_hyper(p, &sec); 401 status = nfsd4_decode_time(argp, &iattr->ia_mtime);
390 iattr->ia_mtime.tv_sec = sec; 402 if (status)
391 iattr->ia_mtime.tv_nsec = be32_to_cpup(p++); 403 return status;
392 if (iattr->ia_mtime.tv_nsec >= (u32)1000000000)
393 return nfserr_inval;
394 iattr->ia_valid |= (ATTR_MTIME | ATTR_MTIME_SET); 404 iattr->ia_valid |= (ATTR_MTIME | ATTR_MTIME_SET);
395 break; 405 break;
396 case NFS4_SET_TO_SERVER_TIME: 406 case NFS4_SET_TO_SERVER_TIME:
@@ -1513,6 +1523,127 @@ static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, str
1513 DECODE_TAIL; 1523 DECODE_TAIL;
1514} 1524}
1515 1525
1526#ifdef CONFIG_NFSD_PNFS
1527static __be32
1528nfsd4_decode_getdeviceinfo(struct nfsd4_compoundargs *argp,
1529 struct nfsd4_getdeviceinfo *gdev)
1530{
1531 DECODE_HEAD;
1532 u32 num, i;
1533
1534 READ_BUF(sizeof(struct nfsd4_deviceid) + 3 * 4);
1535 COPYMEM(&gdev->gd_devid, sizeof(struct nfsd4_deviceid));
1536 gdev->gd_layout_type = be32_to_cpup(p++);
1537 gdev->gd_maxcount = be32_to_cpup(p++);
1538 num = be32_to_cpup(p++);
1539 if (num) {
1540 READ_BUF(4 * num);
1541 gdev->gd_notify_types = be32_to_cpup(p++);
1542 for (i = 1; i < num; i++) {
1543 if (be32_to_cpup(p++)) {
1544 status = nfserr_inval;
1545 goto out;
1546 }
1547 }
1548 }
1549 DECODE_TAIL;
1550}
1551
1552static __be32
1553nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp,
1554 struct nfsd4_layoutget *lgp)
1555{
1556 DECODE_HEAD;
1557
1558 READ_BUF(36);
1559 lgp->lg_signal = be32_to_cpup(p++);
1560 lgp->lg_layout_type = be32_to_cpup(p++);
1561 lgp->lg_seg.iomode = be32_to_cpup(p++);
1562 p = xdr_decode_hyper(p, &lgp->lg_seg.offset);
1563 p = xdr_decode_hyper(p, &lgp->lg_seg.length);
1564 p = xdr_decode_hyper(p, &lgp->lg_minlength);
1565 nfsd4_decode_stateid(argp, &lgp->lg_sid);
1566 READ_BUF(4);
1567 lgp->lg_maxcount = be32_to_cpup(p++);
1568
1569 DECODE_TAIL;
1570}
1571
1572static __be32
1573nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp,
1574 struct nfsd4_layoutcommit *lcp)
1575{
1576 DECODE_HEAD;
1577 u32 timechange;
1578
1579 READ_BUF(20);
1580 p = xdr_decode_hyper(p, &lcp->lc_seg.offset);
1581 p = xdr_decode_hyper(p, &lcp->lc_seg.length);
1582 lcp->lc_reclaim = be32_to_cpup(p++);
1583 nfsd4_decode_stateid(argp, &lcp->lc_sid);
1584 READ_BUF(4);
1585 lcp->lc_newoffset = be32_to_cpup(p++);
1586 if (lcp->lc_newoffset) {
1587 READ_BUF(8);
1588 p = xdr_decode_hyper(p, &lcp->lc_last_wr);
1589 } else
1590 lcp->lc_last_wr = 0;
1591 READ_BUF(4);
1592 timechange = be32_to_cpup(p++);
1593 if (timechange) {
1594 status = nfsd4_decode_time(argp, &lcp->lc_mtime);
1595 if (status)
1596 return status;
1597 } else {
1598 lcp->lc_mtime.tv_nsec = UTIME_NOW;
1599 }
1600 READ_BUF(8);
1601 lcp->lc_layout_type = be32_to_cpup(p++);
1602
1603 /*
1604 * Save the layout update in XDR format and let the layout driver deal
1605 * with it later.
1606 */
1607 lcp->lc_up_len = be32_to_cpup(p++);
1608 if (lcp->lc_up_len > 0) {
1609 READ_BUF(lcp->lc_up_len);
1610 READMEM(lcp->lc_up_layout, lcp->lc_up_len);
1611 }
1612
1613 DECODE_TAIL;
1614}
1615
1616static __be32
1617nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp,
1618 struct nfsd4_layoutreturn *lrp)
1619{
1620 DECODE_HEAD;
1621
1622 READ_BUF(16);
1623 lrp->lr_reclaim = be32_to_cpup(p++);
1624 lrp->lr_layout_type = be32_to_cpup(p++);
1625 lrp->lr_seg.iomode = be32_to_cpup(p++);
1626 lrp->lr_return_type = be32_to_cpup(p++);
1627 if (lrp->lr_return_type == RETURN_FILE) {
1628 READ_BUF(16);
1629 p = xdr_decode_hyper(p, &lrp->lr_seg.offset);
1630 p = xdr_decode_hyper(p, &lrp->lr_seg.length);
1631 nfsd4_decode_stateid(argp, &lrp->lr_sid);
1632 READ_BUF(4);
1633 lrp->lrf_body_len = be32_to_cpup(p++);
1634 if (lrp->lrf_body_len > 0) {
1635 READ_BUF(lrp->lrf_body_len);
1636 READMEM(lrp->lrf_body, lrp->lrf_body_len);
1637 }
1638 } else {
1639 lrp->lr_seg.offset = 0;
1640 lrp->lr_seg.length = NFS4_MAX_UINT64;
1641 }
1642
1643 DECODE_TAIL;
1644}
1645#endif /* CONFIG_NFSD_PNFS */
1646
1516static __be32 1647static __be32
1517nfsd4_decode_fallocate(struct nfsd4_compoundargs *argp, 1648nfsd4_decode_fallocate(struct nfsd4_compoundargs *argp,
1518 struct nfsd4_fallocate *fallocate) 1649 struct nfsd4_fallocate *fallocate)
@@ -1607,11 +1738,19 @@ static nfsd4_dec nfsd4_dec_ops[] = {
1607 [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session, 1738 [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session,
1608 [OP_FREE_STATEID] = (nfsd4_dec)nfsd4_decode_free_stateid, 1739 [OP_FREE_STATEID] = (nfsd4_dec)nfsd4_decode_free_stateid,
1609 [OP_GET_DIR_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, 1740 [OP_GET_DIR_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp,
1741#ifdef CONFIG_NFSD_PNFS
1742 [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_getdeviceinfo,
1743 [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp,
1744 [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_layoutcommit,
1745 [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_layoutget,
1746 [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_layoutreturn,
1747#else
1610 [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_notsupp, 1748 [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_notsupp,
1611 [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp, 1749 [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp,
1612 [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp, 1750 [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp,
1613 [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp, 1751 [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp,
1614 [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp, 1752 [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp,
1753#endif
1615 [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_secinfo_no_name, 1754 [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_secinfo_no_name,
1616 [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence, 1755 [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence,
1617 [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp, 1756 [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp,
@@ -2539,6 +2678,30 @@ out_acl:
2539 get_parent_attributes(exp, &stat); 2678 get_parent_attributes(exp, &stat);
2540 p = xdr_encode_hyper(p, stat.ino); 2679 p = xdr_encode_hyper(p, stat.ino);
2541 } 2680 }
2681#ifdef CONFIG_NFSD_PNFS
2682 if ((bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) ||
2683 (bmval2 & FATTR4_WORD2_LAYOUT_TYPES)) {
2684 if (exp->ex_layout_type) {
2685 p = xdr_reserve_space(xdr, 8);
2686 if (!p)
2687 goto out_resource;
2688 *p++ = cpu_to_be32(1);
2689 *p++ = cpu_to_be32(exp->ex_layout_type);
2690 } else {
2691 p = xdr_reserve_space(xdr, 4);
2692 if (!p)
2693 goto out_resource;
2694 *p++ = cpu_to_be32(0);
2695 }
2696 }
2697
2698 if (bmval2 & FATTR4_WORD2_LAYOUT_BLKSIZE) {
2699 p = xdr_reserve_space(xdr, 4);
2700 if (!p)
2701 goto out_resource;
2702 *p++ = cpu_to_be32(stat.blksize);
2703 }
2704#endif /* CONFIG_NFSD_PNFS */
2542 if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) { 2705 if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) {
2543 status = nfsd4_encode_security_label(xdr, rqstp, context, 2706 status = nfsd4_encode_security_label(xdr, rqstp, context,
2544 contextlen); 2707 contextlen);
@@ -2768,16 +2931,17 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
2768 if (entry_bytes > cd->rd_maxcount) 2931 if (entry_bytes > cd->rd_maxcount)
2769 goto fail; 2932 goto fail;
2770 cd->rd_maxcount -= entry_bytes; 2933 cd->rd_maxcount -= entry_bytes;
2771 if (!cd->rd_dircount)
2772 goto fail;
2773 /* 2934 /*
2774 * RFC 3530 14.2.24 describes rd_dircount as only a "hint", so 2935 * RFC 3530 14.2.24 describes rd_dircount as only a "hint", so
2775 * let's always let through the first entry, at least: 2936 * let's always let through the first entry, at least:
2776 */ 2937 */
2777 name_and_cookie = 4 * XDR_QUADLEN(namlen) + 8; 2938 if (!cd->rd_dircount)
2939 goto fail;
2940 name_and_cookie = 4 + 4 * XDR_QUADLEN(namlen) + 8;
2778 if (name_and_cookie > cd->rd_dircount && cd->cookie_offset) 2941 if (name_and_cookie > cd->rd_dircount && cd->cookie_offset)
2779 goto fail; 2942 goto fail;
2780 cd->rd_dircount -= min(cd->rd_dircount, name_and_cookie); 2943 cd->rd_dircount -= min(cd->rd_dircount, name_and_cookie);
2944
2781 cd->cookie_offset = cookie_offset; 2945 cd->cookie_offset = cookie_offset;
2782skip_entry: 2946skip_entry:
2783 cd->common.err = nfs_ok; 2947 cd->common.err = nfs_ok;
@@ -3814,6 +3978,156 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
3814 return nfserr; 3978 return nfserr;
3815} 3979}
3816 3980
3981#ifdef CONFIG_NFSD_PNFS
3982static __be32
3983nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
3984 struct nfsd4_getdeviceinfo *gdev)
3985{
3986 struct xdr_stream *xdr = &resp->xdr;
3987 const struct nfsd4_layout_ops *ops =
3988 nfsd4_layout_ops[gdev->gd_layout_type];
3989 u32 starting_len = xdr->buf->len, needed_len;
3990 __be32 *p;
3991
3992 dprintk("%s: err %d\n", __func__, nfserr);
3993 if (nfserr)
3994 goto out;
3995
3996 nfserr = nfserr_resource;
3997 p = xdr_reserve_space(xdr, 4);
3998 if (!p)
3999 goto out;
4000
4001 *p++ = cpu_to_be32(gdev->gd_layout_type);
4002
4003 /* If maxcount is 0 then just update notifications */
4004 if (gdev->gd_maxcount != 0) {
4005 nfserr = ops->encode_getdeviceinfo(xdr, gdev);
4006 if (nfserr) {
4007 /*
4008 * We don't bother to burden the layout drivers with
4009 * enforcing gd_maxcount, just tell the client to
4010 * come back with a bigger buffer if it's not enough.
4011 */
4012 if (xdr->buf->len + 4 > gdev->gd_maxcount)
4013 goto toosmall;
4014 goto out;
4015 }
4016 }
4017
4018 nfserr = nfserr_resource;
4019 if (gdev->gd_notify_types) {
4020 p = xdr_reserve_space(xdr, 4 + 4);
4021 if (!p)
4022 goto out;
4023 *p++ = cpu_to_be32(1); /* bitmap length */
4024 *p++ = cpu_to_be32(gdev->gd_notify_types);
4025 } else {
4026 p = xdr_reserve_space(xdr, 4);
4027 if (!p)
4028 goto out;
4029 *p++ = 0;
4030 }
4031
4032 nfserr = 0;
4033out:
4034 kfree(gdev->gd_device);
4035 dprintk("%s: done: %d\n", __func__, be32_to_cpu(nfserr));
4036 return nfserr;
4037
4038toosmall:
4039 dprintk("%s: maxcount too small\n", __func__);
4040 needed_len = xdr->buf->len + 4 /* notifications */;
4041 xdr_truncate_encode(xdr, starting_len);
4042 p = xdr_reserve_space(xdr, 4);
4043 if (!p) {
4044 nfserr = nfserr_resource;
4045 } else {
4046 *p++ = cpu_to_be32(needed_len);
4047 nfserr = nfserr_toosmall;
4048 }
4049 goto out;
4050}
4051
4052static __be32
4053nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr,
4054 struct nfsd4_layoutget *lgp)
4055{
4056 struct xdr_stream *xdr = &resp->xdr;
4057 const struct nfsd4_layout_ops *ops =
4058 nfsd4_layout_ops[lgp->lg_layout_type];
4059 __be32 *p;
4060
4061 dprintk("%s: err %d\n", __func__, nfserr);
4062 if (nfserr)
4063 goto out;
4064
4065 nfserr = nfserr_resource;
4066 p = xdr_reserve_space(xdr, 36 + sizeof(stateid_opaque_t));
4067 if (!p)
4068 goto out;
4069
4070 *p++ = cpu_to_be32(1); /* we always set return-on-close */
4071 *p++ = cpu_to_be32(lgp->lg_sid.si_generation);
4072 p = xdr_encode_opaque_fixed(p, &lgp->lg_sid.si_opaque,
4073 sizeof(stateid_opaque_t));
4074
4075 *p++ = cpu_to_be32(1); /* we always return a single layout */
4076 p = xdr_encode_hyper(p, lgp->lg_seg.offset);
4077 p = xdr_encode_hyper(p, lgp->lg_seg.length);
4078 *p++ = cpu_to_be32(lgp->lg_seg.iomode);
4079 *p++ = cpu_to_be32(lgp->lg_layout_type);
4080
4081 nfserr = ops->encode_layoutget(xdr, lgp);
4082out:
4083 kfree(lgp->lg_content);
4084 return nfserr;
4085}
4086
4087static __be32
4088nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr,
4089 struct nfsd4_layoutcommit *lcp)
4090{
4091 struct xdr_stream *xdr = &resp->xdr;
4092 __be32 *p;
4093
4094 if (nfserr)
4095 return nfserr;
4096
4097 p = xdr_reserve_space(xdr, 4);
4098 if (!p)
4099 return nfserr_resource;
4100 *p++ = cpu_to_be32(lcp->lc_size_chg);
4101 if (lcp->lc_size_chg) {
4102 p = xdr_reserve_space(xdr, 8);
4103 if (!p)
4104 return nfserr_resource;
4105 p = xdr_encode_hyper(p, lcp->lc_newsize);
4106 }
4107
4108 return nfs_ok;
4109}
4110
4111static __be32
4112nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr,
4113 struct nfsd4_layoutreturn *lrp)
4114{
4115 struct xdr_stream *xdr = &resp->xdr;
4116 __be32 *p;
4117
4118 if (nfserr)
4119 return nfserr;
4120
4121 p = xdr_reserve_space(xdr, 4);
4122 if (!p)
4123 return nfserr_resource;
4124 *p++ = cpu_to_be32(lrp->lrs_present);
4125 if (lrp->lrs_present)
4126 nfsd4_encode_stateid(xdr, &lrp->lr_sid);
4127 return nfs_ok;
4128}
4129#endif /* CONFIG_NFSD_PNFS */
4130
3817static __be32 4131static __be32
3818nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr, 4132nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr,
3819 struct nfsd4_seek *seek) 4133 struct nfsd4_seek *seek)
@@ -3890,11 +4204,19 @@ static nfsd4_enc nfsd4_enc_ops[] = {
3890 [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_noop, 4204 [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_noop,
3891 [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop, 4205 [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop,
3892 [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, 4206 [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop,
4207#ifdef CONFIG_NFSD_PNFS
4208 [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_getdeviceinfo,
4209 [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop,
4210 [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_layoutcommit,
4211 [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_layoutget,
4212 [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_layoutreturn,
4213#else
3893 [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop, 4214 [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop,
3894 [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop, 4215 [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop,
3895 [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop, 4216 [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop,
3896 [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop, 4217 [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop,
3897 [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop, 4218 [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop,
4219#endif
3898 [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_secinfo_no_name, 4220 [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_secinfo_no_name,
3899 [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence, 4221 [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence,
3900 [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop, 4222 [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop,
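
The new nfsd4_decode_time() helper above deduplicates the nfstime4 decode that was previously open-coded for both atime and mtime: on the wire the value is a big-endian 64-bit seconds count followed by a 32-bit nanoseconds field, and the server rejects nanoseconds >= 10^9. An illustrative stand-alone decoder of the same wire format, written as a userspace sketch without the nfsd DECODE_HEAD/READ_BUF machinery:

/*
 * Stand-alone nfstime4 decoder sketch: 8 bytes big-endian seconds,
 * then 4 bytes nanoseconds.  'p' must point at 12 valid bytes.
 */
#include <stdint.h>
#include <stdbool.h>
#include <arpa/inet.h>		/* ntohl() */

struct xdr_time {
	int64_t  sec;
	uint32_t nsec;
};

static bool decode_nfstime4(const uint32_t *p, struct xdr_time *tv)
{
	uint64_t sec = ((uint64_t)ntohl(p[0]) << 32) | ntohl(p[1]);

	tv->sec  = (int64_t)sec;
	tv->nsec = ntohl(p[2]);
	/* mirror the server's sanity check on nanoseconds */
	return tv->nsec < 1000000000u;
}
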
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 19ace74d35f6..aa47d75ddb26 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -21,6 +21,7 @@
21#include "cache.h" 21#include "cache.h"
22#include "state.h" 22#include "state.h"
23#include "netns.h" 23#include "netns.h"
24#include "pnfs.h"
24 25
25/* 26/*
26 * We have a single directory with several nodes in it. 27 * We have a single directory with several nodes in it.
@@ -1258,9 +1259,12 @@ static int __init init_nfsd(void)
1258 retval = nfsd4_init_slabs(); 1259 retval = nfsd4_init_slabs();
1259 if (retval) 1260 if (retval)
1260 goto out_unregister_pernet; 1261 goto out_unregister_pernet;
1261 retval = nfsd_fault_inject_init(); /* nfsd fault injection controls */ 1262 retval = nfsd4_init_pnfs();
1262 if (retval) 1263 if (retval)
1263 goto out_free_slabs; 1264 goto out_free_slabs;
1265 retval = nfsd_fault_inject_init(); /* nfsd fault injection controls */
1266 if (retval)
1267 goto out_exit_pnfs;
1264 nfsd_stat_init(); /* Statistics */ 1268 nfsd_stat_init(); /* Statistics */
1265 retval = nfsd_reply_cache_init(); 1269 retval = nfsd_reply_cache_init();
1266 if (retval) 1270 if (retval)
@@ -1282,6 +1286,8 @@ out_free_lockd:
1282out_free_stat: 1286out_free_stat:
1283 nfsd_stat_shutdown(); 1287 nfsd_stat_shutdown();
1284 nfsd_fault_inject_cleanup(); 1288 nfsd_fault_inject_cleanup();
1289out_exit_pnfs:
1290 nfsd4_exit_pnfs();
1285out_free_slabs: 1291out_free_slabs:
1286 nfsd4_free_slabs(); 1292 nfsd4_free_slabs();
1287out_unregister_pernet: 1293out_unregister_pernet:
@@ -1299,6 +1305,7 @@ static void __exit exit_nfsd(void)
1299 nfsd_stat_shutdown(); 1305 nfsd_stat_shutdown();
1300 nfsd_lockd_shutdown(); 1306 nfsd_lockd_shutdown();
1301 nfsd4_free_slabs(); 1307 nfsd4_free_slabs();
1308 nfsd4_exit_pnfs();
1302 nfsd_fault_inject_cleanup(); 1309 nfsd_fault_inject_cleanup();
1303 unregister_filesystem(&nfsd_fs_type); 1310 unregister_filesystem(&nfsd_fs_type);
1304 unregister_pernet_subsys(&nfsd_net_ops); 1311 unregister_pernet_subsys(&nfsd_net_ops);
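
The init_nfsd() hunk above threads nfsd4_init_pnfs() into the existing goto-unwind chain: each setup step gets a cleanup label positioned so failures tear down in exactly the reverse order of initialization, and exit_nfsd() mirrors that order. A generic sketch of the idiom; every step_* name here is hypothetical:

/*
 * Goto-unwind idiom: inserting step_b also inserts the matching
 * out_exit_b target, keeping teardown the exact reverse of setup.
 */
#include <linux/init.h>

int step_a_init(void); void step_a_exit(void);	/* hypothetical steps */
int step_b_init(void); void step_b_exit(void);
int step_c_init(void);

static int __init init_example(void)
{
	int ret;

	ret = step_a_init();
	if (ret)
		return ret;
	ret = step_b_init();		/* the newly inserted step */
	if (ret)
		goto out_exit_a;
	ret = step_c_init();
	if (ret)
		goto out_exit_b;	/* unwind the new step first */
	return 0;

out_exit_b:
	step_b_exit();
out_exit_a:
	step_a_exit();
	return ret;
}
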
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 33a46a8dfaf7..565c4da1a9eb 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -325,15 +325,27 @@ void nfsd_lockd_shutdown(void);
325 325
326#define NFSD4_SUPPORTED_ATTRS_WORD2 0 326#define NFSD4_SUPPORTED_ATTRS_WORD2 0
327 327
328/* 4.1 */
329#ifdef CONFIG_NFSD_PNFS
330#define PNFSD_SUPPORTED_ATTRS_WORD1 FATTR4_WORD1_FS_LAYOUT_TYPES
331#define PNFSD_SUPPORTED_ATTRS_WORD2 \
332(FATTR4_WORD2_LAYOUT_BLKSIZE | FATTR4_WORD2_LAYOUT_TYPES)
333#else
334#define PNFSD_SUPPORTED_ATTRS_WORD1 0
335#define PNFSD_SUPPORTED_ATTRS_WORD2 0
336#endif /* CONFIG_NFSD_PNFS */
337
328#define NFSD4_1_SUPPORTED_ATTRS_WORD0 \ 338#define NFSD4_1_SUPPORTED_ATTRS_WORD0 \
329 NFSD4_SUPPORTED_ATTRS_WORD0 339 NFSD4_SUPPORTED_ATTRS_WORD0
330 340
331#define NFSD4_1_SUPPORTED_ATTRS_WORD1 \ 341#define NFSD4_1_SUPPORTED_ATTRS_WORD1 \
332 NFSD4_SUPPORTED_ATTRS_WORD1 342 (NFSD4_SUPPORTED_ATTRS_WORD1 | PNFSD_SUPPORTED_ATTRS_WORD1)
333 343
334#define NFSD4_1_SUPPORTED_ATTRS_WORD2 \ 344#define NFSD4_1_SUPPORTED_ATTRS_WORD2 \
335 (NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT) 345 (NFSD4_SUPPORTED_ATTRS_WORD2 | PNFSD_SUPPORTED_ATTRS_WORD2 | \
346 FATTR4_WORD2_SUPPATTR_EXCLCREAT)
336 347
348/* 4.2 */
337#ifdef CONFIG_NFSD_V4_SECURITY_LABEL 349#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
338#define NFSD4_2_SECURITY_ATTRS FATTR4_WORD2_SECURITY_LABEL 350#define NFSD4_2_SECURITY_ATTRS FATTR4_WORD2_SECURITY_LABEL
339#else 351#else
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index 08236d70c667..f22920442172 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -187,6 +187,24 @@ fh_init(struct svc_fh *fhp, int maxsize)
187 return fhp; 187 return fhp;
188} 188}
189 189
190static inline bool fh_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
191{
192 if (fh1->fh_size != fh2->fh_size)
193 return false;
194 if (memcmp(fh1->fh_base.fh_pad, fh2->fh_base.fh_pad, fh1->fh_size) != 0)
195 return false;
196 return true;
197}
198
199static inline bool fh_fsid_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
200{
201 if (fh1->fh_fsid_type != fh2->fh_fsid_type)
202 return false;
203 if (memcmp(fh1->fh_fsid, fh2->fh_fsid, key_len(fh1->fh_fsid_type)) != 0)
204 return false;
205 return true;
206}
207
190#ifdef CONFIG_NFSD_V3 208#ifdef CONFIG_NFSD_V3
191/* 209/*
192 * The wcc data stored in current_fh should be cleared 210 * The wcc data stored in current_fh should be cleared
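
The new fh_match() and fh_fsid_match() inlines above centralize file-handle comparison that callers such as find_file_locked() and nfs4_check_fh() previously did through a file-local nfsd_fh_match(). A sketch of the shared lookup pattern as the nfs4state.c hunks now express it; this mirrors the diff rather than adding behavior, and the caller must hold rcu_read_lock():

/*
 * Sketch: hash-bucket lookup via fh_match(), as in find_file_locked().
 */
#include <linux/rculist.h>
#include "nfsfh.h"
#include "state.h"

static struct nfs4_file *lookup_example(struct hlist_head *bucket,
					struct knfsd_fh *fh)
{
	struct nfs4_file *fp;

	hlist_for_each_entry_rcu(fp, bucket, fi_hash) {
		if (fh_match(&fp->fi_fhandle, fh) &&
		    atomic_inc_not_zero(&fp->fi_ref))
			return fp;	/* returned with a reference held */
	}
	return NULL;
}
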
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 314f5c8f8f1a..9277cc91c21b 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -119,6 +119,7 @@ struct svc_program nfsd_program = {
119static bool nfsd_supported_minorversions[NFSD_SUPPORTED_MINOR_VERSION + 1] = { 119static bool nfsd_supported_minorversions[NFSD_SUPPORTED_MINOR_VERSION + 1] = {
120 [0] = 1, 120 [0] = 1,
121 [1] = 1, 121 [1] = 1,
122 [2] = 1,
122}; 123};
123 124
124int nfsd_vers(int vers, enum vers_op change) 125int nfsd_vers(int vers, enum vers_op change)
diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h
new file mode 100644
index 000000000000..d4c4453674c6
--- /dev/null
+++ b/fs/nfsd/pnfs.h
@@ -0,0 +1,86 @@
1#ifndef _FS_NFSD_PNFS_H
2#define _FS_NFSD_PNFS_H 1
3
4#ifdef CONFIG_NFSD_V4
5#include <linux/exportfs.h>
6#include <linux/nfsd/export.h>
7
8#include "state.h"
9#include "xdr4.h"
10
11struct xdr_stream;
12
13struct nfsd4_deviceid_map {
14 struct list_head hash;
15 u64 idx;
16 int fsid_type;
17 u32 fsid[];
18};
19
20struct nfsd4_layout_ops {
21 u32 notify_types;
22
23 __be32 (*proc_getdeviceinfo)(struct super_block *sb,
24 struct nfsd4_getdeviceinfo *gdevp);
25 __be32 (*encode_getdeviceinfo)(struct xdr_stream *xdr,
26 struct nfsd4_getdeviceinfo *gdevp);
27
28 __be32 (*proc_layoutget)(struct inode *, const struct svc_fh *fhp,
29 struct nfsd4_layoutget *lgp);
30 __be32 (*encode_layoutget)(struct xdr_stream *,
31 struct nfsd4_layoutget *lgp);
32
33 __be32 (*proc_layoutcommit)(struct inode *inode,
34 struct nfsd4_layoutcommit *lcp);
35};
36
37extern const struct nfsd4_layout_ops *nfsd4_layout_ops[];
38extern const struct nfsd4_layout_ops bl_layout_ops;
39
40__be32 nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
41 struct nfsd4_compound_state *cstate, stateid_t *stateid,
42 bool create, u32 layout_type, struct nfs4_layout_stateid **lsp);
43__be32 nfsd4_insert_layout(struct nfsd4_layoutget *lgp,
44 struct nfs4_layout_stateid *ls);
45__be32 nfsd4_return_file_layouts(struct svc_rqst *rqstp,
46 struct nfsd4_compound_state *cstate,
47 struct nfsd4_layoutreturn *lrp);
48__be32 nfsd4_return_client_layouts(struct svc_rqst *rqstp,
49 struct nfsd4_compound_state *cstate,
50 struct nfsd4_layoutreturn *lrp);
51int nfsd4_set_deviceid(struct nfsd4_deviceid *id, const struct svc_fh *fhp,
52 u32 device_generation);
53struct nfsd4_deviceid_map *nfsd4_find_devid_map(int idx);
54#endif /* CONFIG_NFSD_V4 */
55
56#ifdef CONFIG_NFSD_PNFS
57void nfsd4_setup_layout_type(struct svc_export *exp);
58void nfsd4_return_all_client_layouts(struct nfs4_client *);
59void nfsd4_return_all_file_layouts(struct nfs4_client *clp,
60 struct nfs4_file *fp);
61int nfsd4_init_pnfs(void);
62void nfsd4_exit_pnfs(void);
63#else
64struct nfs4_client;
65struct nfs4_file;
66
67static inline void nfsd4_setup_layout_type(struct svc_export *exp)
68{
69}
70
71static inline void nfsd4_return_all_client_layouts(struct nfs4_client *clp)
72{
73}
74static inline void nfsd4_return_all_file_layouts(struct nfs4_client *clp,
75 struct nfs4_file *fp)
76{
77}
78static inline void nfsd4_exit_pnfs(void)
79{
80}
81static inline int nfsd4_init_pnfs(void)
82{
83 return 0;
84}
85#endif /* CONFIG_NFSD_PNFS */
86#endif /* _FS_NFSD_PNFS_H */
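
nfsd4_layout_ops[] above is a table of per-layout-type vtables indexed by the NFSv4.1 layout type number; a driver supplies the proc_*/encode_* pairs and is selected by the type the client sends (lg_layout_type and friends). The registration itself lives in nfs4layouts.c, which is not part of this hunk, so the following is only a sketch of how the declared bl_layout_ops would plausibly be wired; LAYOUT_BLOCK_VOLUME and LAYOUT_TYPE_MAX come from include/linux/nfs4.h, and the config symbol is assumed:

/* Sketch only: the real table definition is in nfs4layouts.c. */
#include <linux/nfs4.h>		/* LAYOUT_BLOCK_VOLUME, LAYOUT_TYPE_MAX */
#include "pnfs.h"

const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] = {
#ifdef CONFIG_NFSD_BLOCKLAYOUT	/* assumed config symbol */
	[LAYOUT_BLOCK_VOLUME] = &bl_layout_ops,
#endif
};
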
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 9d3be371240a..4f3bfeb11766 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -92,6 +92,7 @@ struct nfs4_stid {
92/* For a deleg stateid kept around only to process free_stateid's: */ 92/* For a deleg stateid kept around only to process free_stateid's: */
93#define NFS4_REVOKED_DELEG_STID 16 93#define NFS4_REVOKED_DELEG_STID 16
94#define NFS4_CLOSED_DELEG_STID 32 94#define NFS4_CLOSED_DELEG_STID 32
95#define NFS4_LAYOUT_STID 64
95 unsigned char sc_type; 96 unsigned char sc_type;
96 stateid_t sc_stateid; 97 stateid_t sc_stateid;
97 struct nfs4_client *sc_client; 98 struct nfs4_client *sc_client;
@@ -297,6 +298,9 @@ struct nfs4_client {
297 struct list_head cl_delegations; 298 struct list_head cl_delegations;
298 struct list_head cl_revoked; /* unacknowledged, revoked 4.1 state */ 299 struct list_head cl_revoked; /* unacknowledged, revoked 4.1 state */
299 struct list_head cl_lru; /* tail queue */ 300 struct list_head cl_lru; /* tail queue */
301#ifdef CONFIG_NFSD_PNFS
302 struct list_head cl_lo_states; /* outstanding layout states */
303#endif
300 struct xdr_netobj cl_name; /* id generated by client */ 304 struct xdr_netobj cl_name; /* id generated by client */
301 nfs4_verifier cl_verifier; /* generated by client */ 305 nfs4_verifier cl_verifier; /* generated by client */
302 time_t cl_time; /* time of last lease renewal */ 306 time_t cl_time; /* time of last lease renewal */
@@ -493,9 +497,13 @@ struct nfs4_file {
493 atomic_t fi_access[2]; 497 atomic_t fi_access[2];
494 u32 fi_share_deny; 498 u32 fi_share_deny;
495 struct file *fi_deleg_file; 499 struct file *fi_deleg_file;
496 atomic_t fi_delegees; 500 int fi_delegees;
497 struct knfsd_fh fi_fhandle; 501 struct knfsd_fh fi_fhandle;
498 bool fi_had_conflict; 502 bool fi_had_conflict;
503#ifdef CONFIG_NFSD_PNFS
504 struct list_head fi_lo_states;
505 atomic_t fi_lo_recalls;
506#endif
499}; 507};
500 508
501/* 509/*
@@ -528,6 +536,24 @@ static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s)
528 return container_of(s, struct nfs4_ol_stateid, st_stid); 536 return container_of(s, struct nfs4_ol_stateid, st_stid);
529} 537}
530 538
539struct nfs4_layout_stateid {
540 struct nfs4_stid ls_stid;
541 struct list_head ls_perclnt;
542 struct list_head ls_perfile;
543 spinlock_t ls_lock;
544 struct list_head ls_layouts;
545 u32 ls_layout_type;
546 struct file *ls_file;
547 struct nfsd4_callback ls_recall;
548 stateid_t ls_recall_sid;
549 bool ls_recalled;
550};
551
552static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s)
553{
554 return container_of(s, struct nfs4_layout_stateid, ls_stid);
555}
556
531/* flags for preprocess_seqid_op() */ 557/* flags for preprocess_seqid_op() */
532#define RD_STATE 0x00000010 558#define RD_STATE 0x00000010
533#define WR_STATE 0x00000020 559#define WR_STATE 0x00000020
@@ -535,6 +561,7 @@ static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s)
535enum nfsd4_cb_op { 561enum nfsd4_cb_op {
536 NFSPROC4_CLNT_CB_NULL = 0, 562 NFSPROC4_CLNT_CB_NULL = 0,
537 NFSPROC4_CLNT_CB_RECALL, 563 NFSPROC4_CLNT_CB_RECALL,
564 NFSPROC4_CLNT_CB_LAYOUT,
538 NFSPROC4_CLNT_CB_SEQUENCE, 565 NFSPROC4_CLNT_CB_SEQUENCE,
539}; 566};
540 567
@@ -545,6 +572,12 @@ struct nfsd_net;
545extern __be32 nfs4_preprocess_stateid_op(struct net *net, 572extern __be32 nfs4_preprocess_stateid_op(struct net *net,
546 struct nfsd4_compound_state *cstate, 573 struct nfsd4_compound_state *cstate,
547 stateid_t *stateid, int flags, struct file **filp); 574 stateid_t *stateid, int flags, struct file **filp);
575__be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
576 stateid_t *stateid, unsigned char typemask,
577 struct nfs4_stid **s, struct nfsd_net *nn);
578struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl,
579 struct kmem_cache *slab);
580void nfs4_unhash_stid(struct nfs4_stid *s);
548void nfs4_put_stid(struct nfs4_stid *s); 581void nfs4_put_stid(struct nfs4_stid *s);
549void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *); 582void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *);
550extern void nfs4_release_reclaim(struct nfsd_net *); 583extern void nfs4_release_reclaim(struct nfsd_net *);
@@ -567,6 +600,14 @@ extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name,
567 struct nfsd_net *nn); 600 struct nfsd_net *nn);
568extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn); 601extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn);
569 602
603struct nfs4_file *find_file(struct knfsd_fh *fh);
604void put_nfs4_file(struct nfs4_file *fi);
605static inline void get_nfs4_file(struct nfs4_file *fi)
606{
607 atomic_inc(&fi->fi_ref);
608}
609struct file *find_any_file(struct nfs4_file *f);
610
570/* grace period management */ 611/* grace period management */
571void nfsd4_end_grace(struct nfsd_net *nn); 612void nfsd4_end_grace(struct nfsd_net *nn);
572 613
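
struct nfs4_layout_stateid above embeds the generic nfs4_stid, so layout state reuses the common stateid machinery: a stateid is resolved by typemask through the newly exported nfsd4_lookup_stateid() and then downcast with the layoutstateid() container_of() accessor. A trimmed sketch of that flow; the full logic, including on-demand creation, is in nfsd4_preprocess_layout_stateid(), which is not shown in this diff:

/*
 * Sketch: resolve a layout stateid via the generic stid machinery,
 * then downcast.  Error handling trimmed for brevity.
 */
#include "state.h"

static struct nfs4_layout_stateid *
lookup_layout_stateid(struct nfsd4_compound_state *cstate,
		      stateid_t *stateid, struct nfsd_net *nn)
{
	struct nfs4_stid *s;

	if (nfsd4_lookup_stateid(cstate, stateid, NFS4_LAYOUT_STID, &s, nn))
		return NULL;
	return layoutstateid(s);	/* container_of() downcast */
}
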
diff --git a/fs/nfsd/trace.c b/fs/nfsd/trace.c
new file mode 100644
index 000000000000..82f89070594c
--- /dev/null
+++ b/fs/nfsd/trace.c
@@ -0,0 +1,5 @@
1
2#include "state.h"
3
4#define CREATE_TRACE_POINTS
5#include "trace.h"
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
new file mode 100644
index 000000000000..c668520c344b
--- /dev/null
+++ b/fs/nfsd/trace.h
@@ -0,0 +1,54 @@
1/*
2 * Copyright (c) 2014 Christoph Hellwig.
3 */
4#undef TRACE_SYSTEM
5#define TRACE_SYSTEM nfsd
6
7#if !defined(_NFSD_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
8#define _NFSD_TRACE_H
9
10#include <linux/tracepoint.h>
11
12DECLARE_EVENT_CLASS(nfsd_stateid_class,
13 TP_PROTO(stateid_t *stp),
14 TP_ARGS(stp),
15 TP_STRUCT__entry(
16 __field(u32, cl_boot)
17 __field(u32, cl_id)
18 __field(u32, si_id)
19 __field(u32, si_generation)
20 ),
21 TP_fast_assign(
22 __entry->cl_boot = stp->si_opaque.so_clid.cl_boot;
23 __entry->cl_id = stp->si_opaque.so_clid.cl_id;
24 __entry->si_id = stp->si_opaque.so_id;
25 __entry->si_generation = stp->si_generation;
26 ),
27 TP_printk("client %08x:%08x stateid %08x:%08x",
28 __entry->cl_boot,
29 __entry->cl_id,
30 __entry->si_id,
31 __entry->si_generation)
32)
33
34#define DEFINE_STATEID_EVENT(name) \
35DEFINE_EVENT(nfsd_stateid_class, name, \
36 TP_PROTO(stateid_t *stp), \
37 TP_ARGS(stp))
38DEFINE_STATEID_EVENT(layoutstate_alloc);
39DEFINE_STATEID_EVENT(layoutstate_unhash);
40DEFINE_STATEID_EVENT(layoutstate_free);
41DEFINE_STATEID_EVENT(layout_get_lookup_fail);
42DEFINE_STATEID_EVENT(layout_commit_lookup_fail);
43DEFINE_STATEID_EVENT(layout_return_lookup_fail);
44DEFINE_STATEID_EVENT(layout_recall);
45DEFINE_STATEID_EVENT(layout_recall_done);
46DEFINE_STATEID_EVENT(layout_recall_fail);
47DEFINE_STATEID_EVENT(layout_recall_release);
48
49#endif /* _NFSD_TRACE_H */
50
51#undef TRACE_INCLUDE_PATH
52#define TRACE_INCLUDE_PATH .
53#define TRACE_INCLUDE_FILE trace
54#include <trace/define_trace.h>
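
Each DEFINE_STATEID_EVENT() above expands to a trace_<name>() call point taking a stateid_t * and recording the client id plus stateid under the nfsd trace system. The actual call sites live in nfs4layouts.c rather than in this diff, so the following is a hypothetical example of firing one:

/*
 * Hypothetical tracepoint call site; real ones are in nfs4layouts.c.
 */
#include "state.h"
#include "trace.h"

static void example_trace(struct nfs4_layout_stateid *ls)
{
	trace_layoutstate_alloc(&ls->ls_stid.sc_stateid);
}
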
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 90a5925bd6ab..0bda93e58e1b 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -428,6 +428,61 @@ struct nfsd4_reclaim_complete {
428 u32 rca_one_fs; 428 u32 rca_one_fs;
429}; 429};
430 430
431struct nfsd4_deviceid {
432 u64 fsid_idx;
433 u32 generation;
434 u32 pad;
435};
436
437struct nfsd4_layout_seg {
438 u32 iomode;
439 u64 offset;
440 u64 length;
441};
442
443struct nfsd4_getdeviceinfo {
444 struct nfsd4_deviceid gd_devid; /* request */
445 u32 gd_layout_type; /* request */
446 u32 gd_maxcount; /* request */
447 u32 gd_notify_types;/* request - response */
448 void *gd_device; /* response */
449};
450
451struct nfsd4_layoutget {
452 u64 lg_minlength; /* request */
453 u32 lg_signal; /* request */
454 u32 lg_layout_type; /* request */
455 u32 lg_maxcount; /* request */
456 stateid_t lg_sid; /* request/response */
457 struct nfsd4_layout_seg lg_seg; /* request/response */
458 void *lg_content; /* response */
459};
460
461struct nfsd4_layoutcommit {
462 stateid_t lc_sid; /* request */
463 struct nfsd4_layout_seg lc_seg; /* request */
464 u32 lc_reclaim; /* request */
465 u32 lc_newoffset; /* request */
466 u64 lc_last_wr; /* request */
467 struct timespec lc_mtime; /* request */
468 u32 lc_layout_type; /* request */
469 u32 lc_up_len; /* layout length */
470 void *lc_up_layout; /* decoded by callback */
471 u32 lc_size_chg; /* boolean for response */
472 u64 lc_newsize; /* response */
473};
474
475struct nfsd4_layoutreturn {
476 u32 lr_return_type; /* request */
477 u32 lr_layout_type; /* request */
478 struct nfsd4_layout_seg lr_seg; /* request */
479 u32 lr_reclaim; /* request */
480 u32 lrf_body_len; /* request */
481 void *lrf_body; /* request */
482 stateid_t lr_sid; /* request/response */
483 u32 lrs_present; /* response */
484};
485
431struct nfsd4_fallocate { 486struct nfsd4_fallocate {
432 /* request */ 487 /* request */
433 stateid_t falloc_stateid; 488 stateid_t falloc_stateid;
@@ -491,6 +546,10 @@ struct nfsd4_op {
491 struct nfsd4_reclaim_complete reclaim_complete; 546 struct nfsd4_reclaim_complete reclaim_complete;
492 struct nfsd4_test_stateid test_stateid; 547 struct nfsd4_test_stateid test_stateid;
493 struct nfsd4_free_stateid free_stateid; 548 struct nfsd4_free_stateid free_stateid;
549 struct nfsd4_getdeviceinfo getdeviceinfo;
550 struct nfsd4_layoutget layoutget;
551 struct nfsd4_layoutcommit layoutcommit;
552 struct nfsd4_layoutreturn layoutreturn;
494 553
495 /* NFSv4.2 */ 554 /* NFSv4.2 */
496 struct nfsd4_fallocate allocate; 555 struct nfsd4_fallocate allocate;
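
One detail worth noting in the structs above: nfsd4_deviceid packs fsid_idx, generation, and pad into exactly the 16 bytes of an NFSv4.1 deviceid4, which is why nfsd4_decode_getdeviceinfo() can COPYMEM() it straight off the XDR stream. A compile-time check along these lines (a sketch, not part of the patch) would document that assumption:

#include <linux/bug.h>
#include "xdr4.h"

static inline void nfsd4_deviceid_size_check(void)
{
	/* deviceid4 is a fixed 16-byte opaque on the wire */
	BUILD_BUG_ON(sizeof(struct nfsd4_deviceid) != 16);
}
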
diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h
index c5c55dfb91a9..c47f6fdb111a 100644
--- a/fs/nfsd/xdr4cb.h
+++ b/fs/nfsd/xdr4cb.h
@@ -21,3 +21,10 @@
21#define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \ 21#define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \
22 cb_sequence_dec_sz + \ 22 cb_sequence_dec_sz + \
23 op_dec_sz) 23 op_dec_sz)
24#define NFS4_enc_cb_layout_sz (cb_compound_enc_hdr_sz + \
25 cb_sequence_enc_sz + \
26 1 + 3 + \
27 enc_nfs4_fh_sz + 4)
28#define NFS4_dec_cb_layout_sz (cb_compound_dec_hdr_sz + \
29 cb_sequence_dec_sz + \
30 op_dec_sz)
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 3a03e0aea1fb..a8c728acb7a8 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -128,7 +128,6 @@ static const struct vm_operations_struct nilfs_file_vm_ops = {
128 .fault = filemap_fault, 128 .fault = filemap_fault,
129 .map_pages = filemap_map_pages, 129 .map_pages = filemap_map_pages,
130 .page_mkwrite = nilfs_page_mkwrite, 130 .page_mkwrite = nilfs_page_mkwrite,
131 .remap_pages = generic_file_remap_pages,
132}; 131};
133 132
134static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma) 133static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma)
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 57ceaf33d177..748ca238915a 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -172,7 +172,6 @@ int nilfs_init_gcinode(struct inode *inode)
172 inode->i_mode = S_IFREG; 172 inode->i_mode = S_IFREG;
173 mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); 173 mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
174 inode->i_mapping->a_ops = &empty_aops; 174 inode->i_mapping->a_ops = &empty_aops;
175 inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi;
176 175
177 ii->i_flags = 0; 176 ii->i_flags = 0;
178 nilfs_bmap_init_gc(ii->i_bmap); 177 nilfs_bmap_init_gc(ii->i_bmap);
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index c4dcd1db57ee..892cf5ffdb8e 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -429,7 +429,6 @@ int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz)
429 429
430 inode->i_mode = S_IFREG; 430 inode->i_mode = S_IFREG;
431 mapping_set_gfp_mask(inode->i_mapping, gfp_mask); 431 mapping_set_gfp_mask(inode->i_mapping, gfp_mask);
432 inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi;
433 432
434 inode->i_op = &def_mdt_iops; 433 inode->i_op = &def_mdt_iops;
435 inode->i_fop = &def_mdt_fops; 434 inode->i_fop = &def_mdt_fops;
@@ -457,13 +456,12 @@ int nilfs_mdt_setup_shadow_map(struct inode *inode,
457 struct nilfs_shadow_map *shadow) 456 struct nilfs_shadow_map *shadow)
458{ 457{
459 struct nilfs_mdt_info *mi = NILFS_MDT(inode); 458 struct nilfs_mdt_info *mi = NILFS_MDT(inode);
460 struct backing_dev_info *bdi = inode->i_sb->s_bdi;
461 459
462 INIT_LIST_HEAD(&shadow->frozen_buffers); 460 INIT_LIST_HEAD(&shadow->frozen_buffers);
463 address_space_init_once(&shadow->frozen_data); 461 address_space_init_once(&shadow->frozen_data);
464 nilfs_mapping_init(&shadow->frozen_data, inode, bdi); 462 nilfs_mapping_init(&shadow->frozen_data, inode);
465 address_space_init_once(&shadow->frozen_btnodes); 463 address_space_init_once(&shadow->frozen_btnodes);
466 nilfs_mapping_init(&shadow->frozen_btnodes, inode, bdi); 464 nilfs_mapping_init(&shadow->frozen_btnodes, inode);
467 mi->mi_shadow = shadow; 465 mi->mi_shadow = shadow;
468 return 0; 466 return 0;
469} 467}
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 91093cd74f0d..385704027575 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -141,7 +141,6 @@ enum {
141 * @ti_save: Backup of journal_info field of task_struct 141 * @ti_save: Backup of journal_info field of task_struct
142 * @ti_flags: Flags 142 * @ti_flags: Flags
143 * @ti_count: Nest level 143 * @ti_count: Nest level
144 * @ti_garbage: List of inode to be put when releasing semaphore
145 */ 144 */
146struct nilfs_transaction_info { 145struct nilfs_transaction_info {
147 u32 ti_magic; 146 u32 ti_magic;
@@ -150,7 +149,6 @@ struct nilfs_transaction_info {
150 one of other filesystems has a bug. */ 149 one of other filesystems has a bug. */
151 unsigned short ti_flags; 150 unsigned short ti_flags;
152 unsigned short ti_count; 151 unsigned short ti_count;
153 struct list_head ti_garbage;
154}; 152};
155 153
156/* ti_magic */ 154/* ti_magic */
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index da276640f776..700ecbcca55d 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -461,14 +461,12 @@ unsigned nilfs_page_count_clean_buffers(struct page *page,
461 return nc; 461 return nc;
462} 462}
463 463
464void nilfs_mapping_init(struct address_space *mapping, struct inode *inode, 464void nilfs_mapping_init(struct address_space *mapping, struct inode *inode)
465 struct backing_dev_info *bdi)
466{ 465{
467 mapping->host = inode; 466 mapping->host = inode;
468 mapping->flags = 0; 467 mapping->flags = 0;
469 mapping_set_gfp_mask(mapping, GFP_NOFS); 468 mapping_set_gfp_mask(mapping, GFP_NOFS);
470 mapping->private_data = NULL; 469 mapping->private_data = NULL;
471 mapping->backing_dev_info = bdi;
472 mapping->a_ops = &empty_aops; 470 mapping->a_ops = &empty_aops;
473} 471}
474 472
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index ef30c5c2426f..a43b8287d012 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -57,8 +57,7 @@ int nilfs_copy_dirty_pages(struct address_space *, struct address_space *);
57void nilfs_copy_back_pages(struct address_space *, struct address_space *); 57void nilfs_copy_back_pages(struct address_space *, struct address_space *);
58void nilfs_clear_dirty_page(struct page *, bool); 58void nilfs_clear_dirty_page(struct page *, bool);
59void nilfs_clear_dirty_pages(struct address_space *, bool); 59void nilfs_clear_dirty_pages(struct address_space *, bool);
60void nilfs_mapping_init(struct address_space *mapping, struct inode *inode, 60void nilfs_mapping_init(struct address_space *mapping, struct inode *inode);
61 struct backing_dev_info *bdi);
62unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned); 61unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned);
63unsigned long nilfs_find_uncommitted_extent(struct inode *inode, 62unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
64 sector_t start_blk, 63 sector_t start_blk,
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 7ef18fc656c2..469086b9f99b 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -305,7 +305,6 @@ static void nilfs_transaction_lock(struct super_block *sb,
305 ti->ti_count = 0; 305 ti->ti_count = 0;
306 ti->ti_save = cur_ti; 306 ti->ti_save = cur_ti;
307 ti->ti_magic = NILFS_TI_MAGIC; 307 ti->ti_magic = NILFS_TI_MAGIC;
308 INIT_LIST_HEAD(&ti->ti_garbage);
309 current->journal_info = ti; 308 current->journal_info = ti;
310 309
311 for (;;) { 310 for (;;) {
@@ -332,8 +331,6 @@ static void nilfs_transaction_unlock(struct super_block *sb)
332 331
333 up_write(&nilfs->ns_segctor_sem); 332 up_write(&nilfs->ns_segctor_sem);
334 current->journal_info = ti->ti_save; 333 current->journal_info = ti->ti_save;
335 if (!list_empty(&ti->ti_garbage))
336 nilfs_dispose_list(nilfs, &ti->ti_garbage, 0);
337} 334}
338 335
339static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci, 336static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci,
@@ -746,6 +743,15 @@ static void nilfs_dispose_list(struct the_nilfs *nilfs,
746 } 743 }
747} 744}
748 745
746static void nilfs_iput_work_func(struct work_struct *work)
747{
748 struct nilfs_sc_info *sci = container_of(work, struct nilfs_sc_info,
749 sc_iput_work);
750 struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
751
752 nilfs_dispose_list(nilfs, &sci->sc_iput_queue, 0);
753}
754
749static int nilfs_test_metadata_dirty(struct the_nilfs *nilfs, 755static int nilfs_test_metadata_dirty(struct the_nilfs *nilfs,
750 struct nilfs_root *root) 756 struct nilfs_root *root)
751{ 757{
@@ -1900,8 +1906,8 @@ static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci,
1900static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci, 1906static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci,
1901 struct the_nilfs *nilfs) 1907 struct the_nilfs *nilfs)
1902{ 1908{
1903 struct nilfs_transaction_info *ti = current->journal_info;
1904 struct nilfs_inode_info *ii, *n; 1909 struct nilfs_inode_info *ii, *n;
1910 int defer_iput = false;
1905 1911
1906 spin_lock(&nilfs->ns_inode_lock); 1912 spin_lock(&nilfs->ns_inode_lock);
1907 list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) { 1913 list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) {
@@ -1912,9 +1918,24 @@ static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci,
1912 clear_bit(NILFS_I_BUSY, &ii->i_state); 1918 clear_bit(NILFS_I_BUSY, &ii->i_state);
1913 brelse(ii->i_bh); 1919 brelse(ii->i_bh);
1914 ii->i_bh = NULL; 1920 ii->i_bh = NULL;
1915 list_move_tail(&ii->i_dirty, &ti->ti_garbage); 1921 list_del_init(&ii->i_dirty);
1922 if (!ii->vfs_inode.i_nlink) {
1923 /*
1924 * Defer calling iput() to avoid a deadlock
1925 * over I_SYNC flag for inodes with i_nlink == 0
1926 */
1927 list_add_tail(&ii->i_dirty, &sci->sc_iput_queue);
1928 defer_iput = true;
1929 } else {
1930 spin_unlock(&nilfs->ns_inode_lock);
1931 iput(&ii->vfs_inode);
1932 spin_lock(&nilfs->ns_inode_lock);
1933 }
1916 } 1934 }
1917 spin_unlock(&nilfs->ns_inode_lock); 1935 spin_unlock(&nilfs->ns_inode_lock);
1936
1937 if (defer_iput)
1938 schedule_work(&sci->sc_iput_work);
1918} 1939}
1919 1940
1920/* 1941/*
@@ -2583,6 +2604,8 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct super_block *sb,
2583 INIT_LIST_HEAD(&sci->sc_segbufs); 2604 INIT_LIST_HEAD(&sci->sc_segbufs);
2584 INIT_LIST_HEAD(&sci->sc_write_logs); 2605 INIT_LIST_HEAD(&sci->sc_write_logs);
2585 INIT_LIST_HEAD(&sci->sc_gc_inodes); 2606 INIT_LIST_HEAD(&sci->sc_gc_inodes);
2607 INIT_LIST_HEAD(&sci->sc_iput_queue);
2608 INIT_WORK(&sci->sc_iput_work, nilfs_iput_work_func);
2586 init_timer(&sci->sc_timer); 2609 init_timer(&sci->sc_timer);
2587 2610
2588 sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT; 2611 sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT;
@@ -2609,6 +2632,8 @@ static void nilfs_segctor_write_out(struct nilfs_sc_info *sci)
2609 ret = nilfs_segctor_construct(sci, SC_LSEG_SR); 2632 ret = nilfs_segctor_construct(sci, SC_LSEG_SR);
2610 nilfs_transaction_unlock(sci->sc_super); 2633 nilfs_transaction_unlock(sci->sc_super);
2611 2634
2635 flush_work(&sci->sc_iput_work);
2636
2612 } while (ret && retrycount-- > 0); 2637 } while (ret && retrycount-- > 0);
2613} 2638}
2614 2639
@@ -2633,6 +2658,9 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
2633 || sci->sc_seq_request != sci->sc_seq_done); 2658 || sci->sc_seq_request != sci->sc_seq_done);
2634 spin_unlock(&sci->sc_state_lock); 2659 spin_unlock(&sci->sc_state_lock);
2635 2660
2661 if (flush_work(&sci->sc_iput_work))
2662 flag = true;
2663
2636 if (flag || !nilfs_segctor_confirm(sci)) 2664 if (flag || !nilfs_segctor_confirm(sci))
2637 nilfs_segctor_write_out(sci); 2665 nilfs_segctor_write_out(sci);
2638 2666
@@ -2642,6 +2670,12 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
2642 nilfs_dispose_list(nilfs, &sci->sc_dirty_files, 1); 2670 nilfs_dispose_list(nilfs, &sci->sc_dirty_files, 1);
2643 } 2671 }
2644 2672
2673 if (!list_empty(&sci->sc_iput_queue)) {
2674 nilfs_warning(sci->sc_super, __func__,
2675 "iput queue is not empty\n");
2676 nilfs_dispose_list(nilfs, &sci->sc_iput_queue, 1);
2677 }
2678
2645 WARN_ON(!list_empty(&sci->sc_segbufs)); 2679 WARN_ON(!list_empty(&sci->sc_segbufs));
2646 WARN_ON(!list_empty(&sci->sc_write_logs)); 2680 WARN_ON(!list_empty(&sci->sc_write_logs));
2647 2681
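
The segment.c hunks above replace the per-transaction ti_garbage list with a deferred-iput queue: calling iput() on an i_nlink == 0 inode from the segment constructor can re-enter writeback and deadlock on the I_SYNC flag, so such inodes are parked on sc_iput_queue and released later from sc_iput_work. A generic sketch of this defer-to-workqueue pattern; the names below are illustrative, not the nilfs2 ones:

/*
 * Defer-to-workqueue pattern: park entries that cannot be released in
 * the current context, drain them later from process context.
 */
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>

struct deferred_release {
	spinlock_t		lock;
	struct list_head	queue;
	struct work_struct	work;
};

static void release_work_fn(struct work_struct *work)
{
	struct deferred_release *dr =
		container_of(work, struct deferred_release, work);

	/* drain dr->queue under dr->lock, releasing each entry */
}

static void defer_release(struct deferred_release *dr, struct list_head *item)
{
	spin_lock(&dr->lock);
	list_add_tail(item, &dr->queue);
	spin_unlock(&dr->lock);
	schedule_work(&dr->work);	/* release_work_fn() runs later */
}
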
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 38a1d0013314..a48d6de1e02c 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -26,6 +26,7 @@
26#include <linux/types.h> 26#include <linux/types.h>
27#include <linux/fs.h> 27#include <linux/fs.h>
28#include <linux/buffer_head.h> 28#include <linux/buffer_head.h>
29#include <linux/workqueue.h>
29#include <linux/nilfs2_fs.h> 30#include <linux/nilfs2_fs.h>
30#include "nilfs.h" 31#include "nilfs.h"
31 32
@@ -92,6 +93,8 @@ struct nilfs_segsum_pointer {
92 * @sc_nblk_inc: Block count of current generation 93 * @sc_nblk_inc: Block count of current generation
93 * @sc_dirty_files: List of files to be written 94 * @sc_dirty_files: List of files to be written
94 * @sc_gc_inodes: List of GC inodes having blocks to be written 95 * @sc_gc_inodes: List of GC inodes having blocks to be written
96 * @sc_iput_queue: list of inodes for which iput should be done
97 * @sc_iput_work: work struct to defer iput call
95 * @sc_freesegs: array of segment numbers to be freed 98 * @sc_freesegs: array of segment numbers to be freed
96 * @sc_nfreesegs: number of segments on @sc_freesegs 99 * @sc_nfreesegs: number of segments on @sc_freesegs
97 * @sc_dsync_inode: inode whose data pages are written for a sync operation 100 * @sc_dsync_inode: inode whose data pages are written for a sync operation
@@ -135,6 +138,8 @@ struct nilfs_sc_info {
135 138
136 struct list_head sc_dirty_files; 139 struct list_head sc_dirty_files;
137 struct list_head sc_gc_inodes; 140 struct list_head sc_gc_inodes;
141 struct list_head sc_iput_queue;
142 struct work_struct sc_iput_work;
138 143
139 __u64 *sc_freesegs; 144 __u64 *sc_freesegs;
140 size_t sc_nfreesegs; 145 size_t sc_nfreesegs;
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 2e5b3ec85b8f..5bc2a1cf73c3 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -166,7 +166,7 @@ struct inode *nilfs_alloc_inode(struct super_block *sb)
166 ii->i_state = 0; 166 ii->i_state = 0;
167 ii->i_cno = 0; 167 ii->i_cno = 0;
168 ii->vfs_inode.i_version = 1; 168 ii->vfs_inode.i_version = 1;
169 nilfs_mapping_init(&ii->i_btnode_cache, &ii->vfs_inode, sb->s_bdi); 169 nilfs_mapping_init(&ii->i_btnode_cache, &ii->vfs_inode);
170 return &ii->vfs_inode; 170 return &ii->vfs_inode;
171} 171}
172 172
@@ -1057,7 +1057,6 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
1057{ 1057{
1058 struct the_nilfs *nilfs; 1058 struct the_nilfs *nilfs;
1059 struct nilfs_root *fsroot; 1059 struct nilfs_root *fsroot;
1060 struct backing_dev_info *bdi;
1061 __u64 cno; 1060 __u64 cno;
1062 int err; 1061 int err;
1063 1062
@@ -1077,8 +1076,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
1077 sb->s_time_gran = 1; 1076 sb->s_time_gran = 1;
1078 sb->s_max_links = NILFS_LINK_MAX; 1077 sb->s_max_links = NILFS_LINK_MAX;
1079 1078
1080 bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; 1079 sb->s_bdi = &bdev_get_queue(sb->s_bdev)->backing_dev_info;
1081 sb->s_bdi = bdi ? : &default_backing_dev_info;
1082 1080
1083 err = load_nilfs(nilfs, sb); 1081 err = load_nilfs(nilfs, sb);
1084 if (err) 1082 if (err)
diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig
index 22c629eedd82..2a24249b30af 100644
--- a/fs/notify/Kconfig
+++ b/fs/notify/Kconfig
@@ -1,5 +1,6 @@
1config FSNOTIFY 1config FSNOTIFY
2 def_bool n 2 def_bool n
3 select SRCU
3 4
4source "fs/notify/dnotify/Kconfig" 5source "fs/notify/dnotify/Kconfig"
5source "fs/notify/inotify/Kconfig" 6source "fs/notify/inotify/Kconfig"
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 30d3addfad75..51ceb8107284 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -140,7 +140,7 @@ static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark,
140 } 140 }
141 141
142 if (S_ISDIR(path->dentry->d_inode->i_mode) && 142 if (S_ISDIR(path->dentry->d_inode->i_mode) &&
143 (marks_ignored_mask & FS_ISDIR)) 143 !(marks_mask & FS_ISDIR & ~marks_ignored_mask))
144 return false; 144 return false;
145 145
146 if (event_mask & marks_mask & ~marks_ignored_mask) 146 if (event_mask & marks_mask & ~marks_ignored_mask)
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index c991616acca9..cf275500a665 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -259,16 +259,15 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
259 struct fsnotify_event *kevent; 259 struct fsnotify_event *kevent;
260 char __user *start; 260 char __user *start;
261 int ret; 261 int ret;
262 DEFINE_WAIT(wait); 262 DEFINE_WAIT_FUNC(wait, woken_wake_function);
263 263
264 start = buf; 264 start = buf;
265 group = file->private_data; 265 group = file->private_data;
266 266
267 pr_debug("%s: group=%p\n", __func__, group); 267 pr_debug("%s: group=%p\n", __func__, group);
268 268
269 add_wait_queue(&group->notification_waitq, &wait);
269 while (1) { 270 while (1) {
270 prepare_to_wait(&group->notification_waitq, &wait, TASK_INTERRUPTIBLE);
271
272 mutex_lock(&group->notification_mutex); 271 mutex_lock(&group->notification_mutex);
273 kevent = get_one_event(group, count); 272 kevent = get_one_event(group, count);
274 mutex_unlock(&group->notification_mutex); 273 mutex_unlock(&group->notification_mutex);
@@ -289,7 +288,8 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
289 288
290 if (start != buf) 289 if (start != buf)
291 break; 290 break;
292 schedule(); 291
292 wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
293 continue; 293 continue;
294 } 294 }
295 295
@@ -318,8 +318,8 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
318 buf += ret; 318 buf += ret;
319 count -= ret; 319 count -= ret;
320 } 320 }
321 remove_wait_queue(&group->notification_waitq, &wait);
321 322
322 finish_wait(&group->notification_waitq, &wait);
323 if (start != buf && ret != -EFAULT) 323 if (start != buf && ret != -EFAULT)
324 ret = buf - start; 324 ret = buf - start;
325 return ret; 325 return ret;
@@ -487,20 +487,27 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
					    unsigned int flags,
					    int *destroy)
 {
-	__u32 oldmask;
+	__u32 oldmask = 0;
 
 	spin_lock(&fsn_mark->lock);
 	if (!(flags & FAN_MARK_IGNORED_MASK)) {
+		__u32 tmask = fsn_mark->mask & ~mask;
+
+		if (flags & FAN_MARK_ONDIR)
+			tmask &= ~FAN_ONDIR;
+
 		oldmask = fsn_mark->mask;
-		fsnotify_set_mark_mask_locked(fsn_mark, (oldmask & ~mask));
+		fsnotify_set_mark_mask_locked(fsn_mark, tmask);
 	} else {
-		oldmask = fsn_mark->ignored_mask;
-		fsnotify_set_mark_ignored_mask_locked(fsn_mark, (oldmask & ~mask));
+		__u32 tmask = fsn_mark->ignored_mask & ~mask;
+		if (flags & FAN_MARK_ONDIR)
+			tmask &= ~FAN_ONDIR;
+
+		fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask);
 	}
+	*destroy = !(fsn_mark->mask | fsn_mark->ignored_mask);
 	spin_unlock(&fsn_mark->lock);
 
-	*destroy = !(oldmask & ~mask);
-
 	return mask & oldmask;
 }
 
@@ -569,20 +576,22 @@ static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
 
 	spin_lock(&fsn_mark->lock);
 	if (!(flags & FAN_MARK_IGNORED_MASK)) {
+		__u32 tmask = fsn_mark->mask | mask;
+
+		if (flags & FAN_MARK_ONDIR)
+			tmask |= FAN_ONDIR;
+
 		oldmask = fsn_mark->mask;
-		fsnotify_set_mark_mask_locked(fsn_mark, (oldmask | mask));
+		fsnotify_set_mark_mask_locked(fsn_mark, tmask);
 	} else {
 		__u32 tmask = fsn_mark->ignored_mask | mask;
+		if (flags & FAN_MARK_ONDIR)
+			tmask |= FAN_ONDIR;
+
 		fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask);
 		if (flags & FAN_MARK_IGNORED_SURV_MODIFY)
 			fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
 	}
-
-	if (!(flags & FAN_MARK_ONDIR)) {
-		__u32 tmask = fsn_mark->ignored_mask | FAN_ONDIR;
-		fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask);
-	}
-
 	spin_unlock(&fsn_mark->lock);
 
 	return mask & ~oldmask;
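With this change FAN_ONDIR is carried directly in the mark's own mask (or ignored mask) and set only when the caller passes FAN_MARK_ONDIR, instead of the old behaviour of implicitly ignoring directory events whenever the flag was absent. A minimal userspace sketch of how a listener opts into directory events:

#include <fcntl.h>
#include <stdio.h>
#include <sys/fanotify.h>

int main(void)
{
	int fd = fanotify_init(FAN_CLASS_NOTIF, O_RDONLY);
	if (fd < 0) {
		perror("fanotify_init");
		return 1;
	}
	/* Without FAN_MARK_ONDIR, FAN_OPEN fires for files only;
	 * adding it requests the event for directory opens as well. */
	if (fanotify_mark(fd, FAN_MARK_ADD | FAN_MARK_ONDIR,
			  FAN_OPEN, AT_FDCWD, "/tmp") < 0)
		perror("fanotify_mark");
	return 0;
}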
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 643faa44f22b..1da9b2d184dc 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -19,6 +19,7 @@
  * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
+#include <linux/backing-dev.h>
 #include <linux/buffer_head.h>
 #include <linux/gfp.h>
 #include <linux/pagemap.h>
@@ -2091,7 +2092,7 @@ static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb,
 	count = iov_length(iov, nr_segs);
 	pos = *ppos;
 	/* We can write back this queue in page reclaim. */
-	current->backing_dev_info = mapping->backing_dev_info;
+	current->backing_dev_info = inode_to_bdi(inode);
 	written = 0;
 	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
 	if (err)
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 7e8282dcea2a..c58a1bcfda0f 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -245,16 +245,14 @@ int ocfs2_set_acl(handle_t *handle,
 		ret = posix_acl_equiv_mode(acl, &mode);
 		if (ret < 0)
 			return ret;
-		else {
-			if (ret == 0)
-				acl = NULL;
 
-			ret = ocfs2_acl_set_mode(inode, di_bh,
-						 handle, mode);
-			if (ret)
-				return ret;
+		if (ret == 0)
+			acl = NULL;
 
-		}
+		ret = ocfs2_acl_set_mode(inode, di_bh,
+					 handle, mode);
+		if (ret)
+			return ret;
 	}
 	break;
 case ACL_TYPE_DEFAULT:
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index fcae9ef1a328..044158bd22be 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -6873,7 +6873,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
 		mlog_errno(ret);
-		goto out_unlock;
+		goto out;
 	}
 
 	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
@@ -6931,7 +6931,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
 	if (ret) {
 		mlog_errno(ret);
 		need_free = 1;
-		goto out_commit;
+		goto out_unlock;
 	}
 
 	page_end = PAGE_CACHE_SIZE;
@@ -6964,12 +6964,16 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
 		if (ret) {
 			mlog_errno(ret);
 			need_free = 1;
-			goto out_commit;
+			goto out_unlock;
 		}
 
 		inode->i_blocks = ocfs2_inode_sector_count(inode);
 	}
 
+out_unlock:
+	if (pages)
+		ocfs2_unlock_and_free_pages(pages, num_pages);
+
 out_commit:
 	if (ret < 0 && did_quota)
 		dquot_free_space_nodirty(inode,
@@ -6989,15 +6993,11 @@ out_commit:
 
 	ocfs2_commit_trans(osb, handle);
 
-out_unlock:
+out:
 	if (data_ac)
 		ocfs2_free_alloc_context(data_ac);
-
-out:
-	if (pages) {
-		ocfs2_unlock_and_free_pages(pages, num_pages);
+	if (pages)
 		kfree(pages);
-	}
 
 	return ret;
 }
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 46d93e941f3d..44db1808cdb5 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -28,6 +28,7 @@
 #include <linux/pipe_fs_i.h>
 #include <linux/mpage.h>
 #include <linux/quotaops.h>
+#include <linux/blkdev.h>
 
 #include <cluster/masklog.h>
 
@@ -47,6 +48,9 @@
 #include "ocfs2_trace.h"
 
 #include "buffer_head_io.h"
+#include "dir.h"
+#include "namei.h"
+#include "sysfile.h"
 
 static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
				   struct buffer_head *bh_result, int create)
@@ -506,18 +510,21 @@ bail:
  *
  * called like this: dio->get_blocks(dio->inode, fs_startblk,
  *					fs_count, map_bh, dio->rw == WRITE);
- *
- * Note that we never bother to allocate blocks here, and thus ignore the
- * create argument.
  */
 static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
				     struct buffer_head *bh_result, int create)
 {
 	int ret;
+	u32 cpos = 0;
+	int alloc_locked = 0;
 	u64 p_blkno, inode_blocks, contig_blocks;
 	unsigned int ext_flags;
 	unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
 	unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
+	unsigned long len = bh_result->b_size;
+	unsigned int clusters_to_alloc = 0;
+
+	cpos = ocfs2_blocks_to_clusters(inode->i_sb, iblock);
 
 	/* This function won't even be called if the request isn't all
 	 * nicely aligned and of the right size, so there's no need
@@ -539,6 +546,40 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
 	/* We should already CoW the refcounted extent in case of create. */
 	BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED));
 
+	/* allocate blocks if no p_blkno is found, and create == 1 */
+	if (!p_blkno && create) {
+		ret = ocfs2_inode_lock(inode, NULL, 1);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto bail;
+		}
+
+		alloc_locked = 1;
+
+		/* fill hole, allocate blocks can't be larger than the size
+		 * of the hole */
+		clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len);
+		if (clusters_to_alloc > contig_blocks)
+			clusters_to_alloc = contig_blocks;
+
+		/* allocate extent and insert them into the extent tree */
+		ret = ocfs2_extend_allocation(inode, cpos,
+				clusters_to_alloc, 0);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto bail;
+		}
+
+		ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
+				&contig_blocks, &ext_flags);
+		if (ret < 0) {
+			mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
+					(unsigned long long)iblock);
+			ret = -EIO;
+			goto bail;
+		}
+	}
+
 	/*
 	 * get_more_blocks() expects us to describe a hole by clearing
 	 * the mapped bit on bh_result().
@@ -556,6 +597,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
 		contig_blocks = max_blocks;
 	bh_result->b_size = contig_blocks << blocksize_bits;
 bail:
+	if (alloc_locked)
+		ocfs2_inode_unlock(inode, 1);
 	return ret;
 }
 
@@ -597,6 +640,184 @@ static int ocfs2_releasepage(struct page *page, gfp_t wait)
 	return try_to_free_buffers(page);
 }
 
+static int ocfs2_is_overwrite(struct ocfs2_super *osb,
+		struct inode *inode, loff_t offset)
+{
+	int ret = 0;
+	u32 v_cpos = 0;
+	u32 p_cpos = 0;
+	unsigned int num_clusters = 0;
+	unsigned int ext_flags = 0;
+
+	v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset);
+	ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos,
+			&num_clusters, &ext_flags);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN))
+		return 1;
+
+	return 0;
+}
+
+static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
+		struct iov_iter *iter,
+		loff_t offset)
+{
+	ssize_t ret = 0;
+	ssize_t written = 0;
+	bool orphaned = false;
+	int is_overwrite = 0;
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file_inode(file)->i_mapping->host;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct buffer_head *di_bh = NULL;
+	size_t count = iter->count;
+	journal_t *journal = osb->journal->j_journal;
+	u32 zero_len;
+	int cluster_align;
+	loff_t final_size = offset + count;
+	int append_write = offset >= i_size_read(inode) ? 1 : 0;
+	unsigned int num_clusters = 0;
+	unsigned int ext_flags = 0;
+
+	{
+		u64 o = offset;
+
+		zero_len = do_div(o, 1 << osb->s_clustersize_bits);
+		cluster_align = !zero_len;
+	}
+
+	/*
+	 * when final_size > inode->i_size, inode->i_size will be
+	 * updated after direct write, so add the inode to orphan
+	 * dir first.
+	 */
+	if (final_size > i_size_read(inode)) {
+		ret = ocfs2_add_inode_to_orphan(osb, inode);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+		orphaned = true;
+	}
+
+	if (append_write) {
+		ret = ocfs2_inode_lock(inode, &di_bh, 1);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto clean_orphan;
+		}
+
+		if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+			ret = ocfs2_zero_extend(inode, di_bh, offset);
+		else
+			ret = ocfs2_extend_no_holes(inode, di_bh, offset,
+					offset);
+		if (ret < 0) {
+			mlog_errno(ret);
+			ocfs2_inode_unlock(inode, 1);
+			brelse(di_bh);
+			goto clean_orphan;
+		}
+
+		is_overwrite = ocfs2_is_overwrite(osb, inode, offset);
+		if (is_overwrite < 0) {
+			mlog_errno(is_overwrite);
+			ocfs2_inode_unlock(inode, 1);
+			brelse(di_bh);
+			goto clean_orphan;
+		}
+
+		ocfs2_inode_unlock(inode, 1);
+		brelse(di_bh);
+		di_bh = NULL;
+	}
+
+	written = __blockdev_direct_IO(WRITE, iocb, inode, inode->i_sb->s_bdev,
+			iter, offset,
+			ocfs2_direct_IO_get_blocks,
+			ocfs2_dio_end_io, NULL, 0);
+	if (unlikely(written < 0)) {
+		loff_t i_size = i_size_read(inode);
+
+		if (offset + count > i_size) {
+			ret = ocfs2_inode_lock(inode, &di_bh, 1);
+			if (ret < 0) {
+				mlog_errno(ret);
+				goto clean_orphan;
+			}
+
+			if (i_size == i_size_read(inode)) {
+				ret = ocfs2_truncate_file(inode, di_bh,
+						i_size);
+				if (ret < 0) {
+					if (ret != -ENOSPC)
+						mlog_errno(ret);
+
+					ocfs2_inode_unlock(inode, 1);
+					brelse(di_bh);
+					goto clean_orphan;
+				}
+			}
+
+			ocfs2_inode_unlock(inode, 1);
+			brelse(di_bh);
+
+			ret = jbd2_journal_force_commit(journal);
+			if (ret < 0)
+				mlog_errno(ret);
+		}
+	} else if (written < 0 && append_write && !is_overwrite &&
+			!cluster_align) {
+		u32 p_cpos = 0;
+		u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset);
+
+		ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos,
+				&num_clusters, &ext_flags);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto clean_orphan;
+		}
+
+		BUG_ON(!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN));
+
+		ret = blkdev_issue_zeroout(osb->sb->s_bdev,
+				p_cpos << (osb->s_clustersize_bits - 9),
+				zero_len >> 9, GFP_KERNEL, false);
+		if (ret < 0)
+			mlog_errno(ret);
+	}
+
+clean_orphan:
+	if (orphaned) {
+		int tmp_ret;
+		int update_isize = written > 0 ? 1 : 0;
+		loff_t end = update_isize ? offset + written : 0;
+
+		tmp_ret = ocfs2_del_inode_from_orphan(osb, inode,
+				update_isize, end);
+		if (tmp_ret < 0) {
+			ret = tmp_ret;
+			goto out;
+		}
+
+		tmp_ret = jbd2_journal_force_commit(journal);
+		if (tmp_ret < 0) {
+			ret = tmp_ret;
+			mlog_errno(tmp_ret);
+		}
+	}
+
+out:
+	if (ret >= 0)
+		ret = written;
+	return ret;
+}
+
 static ssize_t ocfs2_direct_IO(int rw,
			       struct kiocb *iocb,
			       struct iov_iter *iter,
@@ -604,6 +825,9 @@ static ssize_t ocfs2_direct_IO(int rw,
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file_inode(file)->i_mapping->host;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	int full_coherency = !(osb->s_mount_opt &
+		OCFS2_MOUNT_COHERENCY_BUFFERED);
 
 	/*
 	 * Fallback to buffered I/O if we see an inode without
@@ -612,14 +836,20 @@ static ssize_t ocfs2_direct_IO(int rw,
 	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
 		return 0;
 
-	/* Fallback to buffered I/O if we are appending. */
-	if (i_size_read(inode) <= offset)
+	/* Fallback to buffered I/O if we are appending and
+	 * concurrent O_DIRECT writes are allowed.
+	 */
+	if (i_size_read(inode) <= offset && !full_coherency)
 		return 0;
 
-	return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
+	if (rw == READ)
+		return __blockdev_direct_IO(rw, iocb, inode,
+					    inode->i_sb->s_bdev,
				    iter, offset,
				    ocfs2_direct_IO_get_blocks,
				    ocfs2_dio_end_io, NULL, 0);
+	else
+		return ocfs2_direct_IO_write(iocb, iter, offset);
 }
 
 static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
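blkdev_issue_zeroout() in the append path above works in 512-byte sectors, while ocfs2 tracks physical clusters, hence the shift by (s_clustersize_bits - 9). A small illustrative helper (not ocfs2 code) showing the conversion:

/* Sketch of the unit conversion used above (values are illustrative).
 * A sector is 2^9 = 512 bytes, so a cluster number becomes a sector
 * number by shifting left (clustersize_bits - 9) places. */
static inline u64 cluster_to_sector(u32 p_cpos, unsigned int s_clustersize_bits)
{
	return (u64)p_cpos << (s_clustersize_bits - 9);
}
/* e.g. with 4 KB clusters, s_clustersize_bits = 12, so cluster N starts
 * at sector N << 3; a zero_len of 2048 bytes covers 2048 >> 9 = 4 sectors. */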
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 2e355e0f8335..56c403a563bc 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -1016,7 +1016,8 @@ void o2net_fill_node_map(unsigned long *map, unsigned bytes)
 
 	memset(map, 0, bytes);
 	for (node = 0; node < O2NM_MAX_NODES; ++node) {
-		o2net_tx_can_proceed(o2net_nn_from_num(node), &sc, &ret);
+		if (!o2net_tx_can_proceed(o2net_nn_from_num(node), &sc, &ret))
+			continue;
 		if (!ret) {
 			set_bit(node, map);
 			sc_put(sc);
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index dc024367110a..b95e7df5b76a 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -107,12 +107,12 @@ struct o2net_node {
 	struct list_head		nn_status_list;
 
 	/* connects are attempted from when heartbeat comes up until either hb
-	 * goes down, the node is unconfigured, no connect attempts succeed
-	 * before O2NET_CONN_IDLE_DELAY, or a connect succeeds. connect_work
-	 * is queued from set_nn_state both from hb up and from itself if a
-	 * connect attempt fails and so can be self-arming. shutdown is
-	 * careful to first mark the nn such that no connects will be attempted
-	 * before canceling delayed connect work and flushing the queue. */
+	 * goes down, the node is unconfigured, or a connect succeeds.
+	 * connect_work is queued from set_nn_state both from hb up and from
+	 * itself if a connect attempt fails and so can be self-arming.
+	 * shutdown is careful to first mark the nn such that no connects will
+	 * be attempted before canceling delayed connect work and flushing the
+	 * queue. */
 	struct delayed_work		nn_connect_work;
 	unsigned long			nn_last_connect_attempt;
 
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 319e786175af..b08050bd3f2e 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -3456,10 +3456,8 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
 	int blocksize = dir->i_sb->s_blocksize;
 
 	status = ocfs2_read_dir_block(dir, 0, &bh, 0);
-	if (status) {
-		mlog_errno(status);
+	if (status)
 		goto bail;
-	}
 
 	rec_len = OCFS2_DIR_REC_LEN(namelen);
 	offset = 0;
@@ -3480,10 +3478,9 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
 			status = ocfs2_read_dir_block(dir,
					     offset >> sb->s_blocksize_bits,
					     &bh, 0);
-			if (status) {
-				mlog_errno(status);
+			if (status)
 				goto bail;
-			}
+
 			/* move to next block */
 			de = (struct ocfs2_dir_entry *) bh->b_data;
 		}
@@ -3513,7 +3510,6 @@ next:
 		de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len));
 	}
 
-	status = 0;
 bail:
 	brelse(bh);
 	if (status)
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index b46278f9ae44..fd6bbbbd7d78 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -385,8 +385,12 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
 		head = &res->granted;
 
 	list_for_each_entry(lock, head, list) {
-		if (lock->ml.cookie == cookie)
+		/* if lock is found but unlock is pending ignore the bast */
+		if (lock->ml.cookie == cookie) {
+			if (lock->unlock_pending)
+				break;
 			goto do_ast;
+		}
 	}
 
 	mlog(0, "Got %sast for unknown lock! cookie=%u:%llu, name=%.*s, "
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 149eb556b8c6..825136070d2c 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -406,7 +406,7 @@ static int debug_purgelist_print(struct dlm_ctxt *dlm, char *buf, int len)
 	}
 	spin_unlock(&dlm->spinlock);
 
-	out += snprintf(buf + out, len - out, "Total on list: %ld\n", total);
+	out += snprintf(buf + out, len - out, "Total on list: %lu\n", total);
 
 	return out;
 }
@@ -464,7 +464,7 @@ static int debug_mle_print(struct dlm_ctxt *dlm, char *buf, int len)
 	spin_unlock(&dlm->master_lock);
 
 	out += snprintf(buf + out, len - out,
-			"Total: %ld, Longest: %ld\n", total, longest);
+			"Total: %lu, Longest: %lu\n", total, longest);
 	return out;
 }
 
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 50a59d2337b2..7df88a6dd626 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -674,20 +674,6 @@ static void dlm_leave_domain(struct dlm_ctxt *dlm)
 	spin_unlock(&dlm->spinlock);
 }
 
-int dlm_joined(struct dlm_ctxt *dlm)
-{
-	int ret = 0;
-
-	spin_lock(&dlm_domain_lock);
-
-	if (dlm->dlm_state == DLM_CTXT_JOINED)
-		ret = 1;
-
-	spin_unlock(&dlm_domain_lock);
-
-	return ret;
-}
-
 int dlm_shutting_down(struct dlm_ctxt *dlm)
 {
 	int ret = 0;
diff --git a/fs/ocfs2/dlm/dlmdomain.h b/fs/ocfs2/dlm/dlmdomain.h
index 2f7f60bfeb3b..fd6122a38dbd 100644
--- a/fs/ocfs2/dlm/dlmdomain.h
+++ b/fs/ocfs2/dlm/dlmdomain.h
@@ -28,7 +28,6 @@
 extern spinlock_t dlm_domain_lock;
 extern struct list_head dlm_domains;
 
-int dlm_joined(struct dlm_ctxt *dlm);
 int dlm_shutting_down(struct dlm_ctxt *dlm);
 void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm,
					int node_num);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 79b5af5e6a7b..ce12e0b1a31f 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1070,6 +1070,9 @@ static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
					dead_node, dlm->name);
				list_del_init(&lock->list);
				dlm_lock_put(lock);
+				/* Can't schedule DLM_UNLOCK_FREE_LOCK
+				 * - do manually */
+				dlm_lock_put(lock);
				break;
			}
		}
@@ -2023,11 +2026,8 @@ leave:
 		dlm_lockres_drop_inflight_ref(dlm, res);
 	spin_unlock(&res->spinlock);
 
-	if (ret < 0) {
+	if (ret < 0)
 		mlog_errno(ret);
-		if (newlock)
-			dlm_lock_put(newlock);
-	}
 
 	return ret;
 }
@@ -2349,6 +2349,10 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
						dead_node, dlm->name);
					list_del_init(&lock->list);
					dlm_lock_put(lock);
+					/* Can't schedule
+					 * DLM_UNLOCK_FREE_LOCK
+					 * - do manually */
+					dlm_lock_put(lock);
					break;
				}
			}
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 57c40e34f56f..061ba6a91bf2 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -390,12 +390,6 @@ clear_fields:
 	ip->ip_conn = NULL;
 }
 
-static struct backing_dev_info dlmfs_backing_dev_info = {
-	.name = "ocfs2-dlmfs",
-	.ra_pages = 0,	/* No readahead */
-	.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
-};
-
 static struct inode *dlmfs_get_root_inode(struct super_block *sb)
 {
 	struct inode *inode = new_inode(sb);
@@ -404,7 +398,6 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb)
 	if (inode) {
 		inode->i_ino = get_next_ino();
 		inode_init_owner(inode, NULL, mode);
-		inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		inc_nlink(inode);
 
@@ -428,7 +421,6 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
 
 	inode->i_ino = get_next_ino();
 	inode_init_owner(inode, parent, mode);
-	inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 
 	ip = DLMFS_I(inode);
@@ -643,10 +635,6 @@ static int __init init_dlmfs_fs(void)
 	int status;
 	int cleanup_inode = 0, cleanup_worker = 0;
 
-	status = bdi_init(&dlmfs_backing_dev_info);
-	if (status)
-		return status;
-
 	dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache",
				sizeof(struct dlmfs_inode_private),
				0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
@@ -673,7 +661,6 @@ bail:
			kmem_cache_destroy(dlmfs_inode_cache);
		if (cleanup_worker)
			destroy_workqueue(user_dlm_worker);
-		bdi_destroy(&dlmfs_backing_dev_info);
	} else
		printk("OCFS2 User DLM kernel interface loaded\n");
	return status;
@@ -693,7 +680,6 @@ static void __exit exit_dlmfs_fs(void)
 	rcu_barrier();
 	kmem_cache_destroy(dlmfs_inode_cache);
 
-	bdi_destroy(&dlmfs_backing_dev_info);
 }
 
 MODULE_AUTHOR("Oracle");
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 1c423af04c69..11849a44dc5a 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -3750,6 +3750,9 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
			break;
		spin_unlock(&dentry_attach_lock);
 
+		if (S_ISDIR(dl->dl_inode->i_mode))
+			shrink_dcache_parent(dentry);
+
		mlog(0, "d_delete(%pd);\n", dentry);
 
		/*
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 3950693dd0f6..46e0d4e857c7 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -295,7 +295,7 @@ out:
 	return ret;
 }
 
-static int ocfs2_set_inode_size(handle_t *handle,
+int ocfs2_set_inode_size(handle_t *handle,
				struct inode *inode,
				struct buffer_head *fe_bh,
				u64 new_i_size)
@@ -441,7 +441,7 @@ out:
 	return status;
 }
 
-static int ocfs2_truncate_file(struct inode *inode,
+int ocfs2_truncate_file(struct inode *inode,
			       struct buffer_head *di_bh,
			       u64 new_i_size)
 {
@@ -569,7 +569,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
 	handle_t *handle = NULL;
 	struct ocfs2_alloc_context *data_ac = NULL;
 	struct ocfs2_alloc_context *meta_ac = NULL;
-	enum ocfs2_alloc_restarted why;
+	enum ocfs2_alloc_restarted why = RESTART_NONE;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct ocfs2_extent_tree et;
 	int did_quota = 0;
@@ -709,6 +709,13 @@ leave:
 	return status;
 }
 
+int ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
+		u32 clusters_to_add, int mark_unwritten)
+{
+	return __ocfs2_extend_allocation(inode, logical_start,
+			clusters_to_add, mark_unwritten);
+}
+
 /*
  * While a write will already be ordering the data, a truncate will not.
  * Thus, we need to explicitly order the zeroed pages.
@@ -2109,6 +2116,9 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
 	struct dentry *dentry = file->f_path.dentry;
 	struct inode *inode = dentry->d_inode;
 	loff_t saved_pos = 0, end;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	int full_coherency = !(osb->s_mount_opt &
+		OCFS2_MOUNT_COHERENCY_BUFFERED);
 
 	/*
 	 * We start with a read level meta lock and only jump to an ex
@@ -2197,7 +2207,16 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
			 * one node could wind up truncating another
			 * nodes writes.
			 */
-			if (end > i_size_read(inode)) {
+			if (end > i_size_read(inode) && !full_coherency) {
+				*direct_io = 0;
+				break;
+			}
+
+			/*
+			 * Fallback to old way if the feature bit is not set.
+			 */
+			if (end > i_size_read(inode) &&
+					!ocfs2_supports_append_dio(osb)) {
				*direct_io = 0;
				break;
			}
@@ -2210,7 +2229,13 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
			 */
			ret = ocfs2_check_range_for_holes(inode, saved_pos, count);
			if (ret == 1) {
-				*direct_io = 0;
+				/*
+				 * Fallback to old way if the feature bit is not set.
+				 * Otherwise try dio first and then complete the rest
+				 * request through buffer io.
+				 */
+				if (!ocfs2_supports_append_dio(osb))
+					*direct_io = 0;
				ret = 0;
			} else if (ret < 0)
				mlog_errno(ret);
@@ -2243,6 +2268,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
 	u32 old_clusters;
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file_inode(file);
+	struct address_space *mapping = file->f_mapping;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	int full_coherency = !(osb->s_mount_opt &
			       OCFS2_MOUNT_COHERENCY_BUFFERED);
@@ -2357,13 +2383,53 @@ relock:
 
 	iov_iter_truncate(from, count);
 	if (direct_io) {
+		loff_t endbyte;
+		ssize_t written_buffered;
 		written = generic_file_direct_write(iocb, from, *ppos);
-		if (written < 0) {
+		if (written < 0 || written == count) {
			ret = written;
			goto out_dio;
		}
+
+		/*
+		 * for completing the rest of the request.
+		 */
+		*ppos += written;
+		count -= written;
+		written_buffered = generic_perform_write(file, from, *ppos);
+		/*
+		 * If generic_file_buffered_write() returned a synchronous error
+		 * then we want to return the number of bytes which were
+		 * direct-written, or the error code if that was zero. Note
+		 * that this differs from normal direct-io semantics, which
+		 * will return -EFOO even if some bytes were written.
+		 */
+		if (written_buffered < 0) {
+			ret = written_buffered;
+			goto out_dio;
+		}
+
+		iocb->ki_pos = *ppos + written_buffered;
+		/* We need to ensure that the page cache pages are written to
+		 * disk and invalidated to preserve the expected O_DIRECT
+		 * semantics.
+		 */
+		endbyte = *ppos + written_buffered - 1;
+		ret = filemap_write_and_wait_range(file->f_mapping, *ppos,
+				endbyte);
+		if (ret == 0) {
+			written += written_buffered;
+			invalidate_mapping_pages(mapping,
+					*ppos >> PAGE_CACHE_SHIFT,
+					endbyte >> PAGE_CACHE_SHIFT);
+		} else {
+			/*
+			 * We don't know how much we wrote, so just return
+			 * the number of bytes which were direct-written
+			 */
+		}
 	} else {
-		current->backing_dev_info = file->f_mapping->backing_dev_info;
+		current->backing_dev_info = inode_to_bdi(inode);
 		written = generic_perform_write(file, from, *ppos);
 		if (likely(written >= 0))
 			iocb->ki_pos = *ppos + written;
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 97bf761c9e7c..e8c62f22215c 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -51,13 +51,22 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb,
			 struct ocfs2_alloc_context *data_ac,
			 struct ocfs2_alloc_context *meta_ac,
			 enum ocfs2_alloc_restarted *reason_ret);
+int ocfs2_set_inode_size(handle_t *handle,
+		struct inode *inode,
+		struct buffer_head *fe_bh,
+		u64 new_i_size);
 int ocfs2_simple_size_update(struct inode *inode,
			     struct buffer_head *di_bh,
			     u64 new_i_size);
+int ocfs2_truncate_file(struct inode *inode,
+			struct buffer_head *di_bh,
+			u64 new_i_size);
 int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
			  u64 new_i_size, u64 zero_to);
 int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
		      loff_t zero_to);
+int ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
+		u32 clusters_to_add, int mark_unwritten);
 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
 int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
		  struct kstat *stat);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index c8b25de9efbb..3025c0da6b8a 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -648,7 +648,7 @@ static int ocfs2_remove_inode(struct inode *inode,
 
 	if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) {
 		status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode,
-					  orphan_dir_bh);
+					  orphan_dir_bh, false);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail_commit;
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index ca3431ee7f24..5e86b247c821 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -81,6 +81,8 @@ struct ocfs2_inode_info
 	tid_t i_sync_tid;
 	tid_t i_datasync_tid;
 
+	wait_queue_head_t append_dio_wq;
+
 	struct dquot *i_dquot[MAXQUOTAS];
 };
 
86 88
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 4f502382180f..ff531928269e 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -50,6 +50,8 @@
 #include "sysfile.h"
 #include "uptodate.h"
 #include "quota.h"
+#include "file.h"
+#include "namei.h"
 
 #include "buffer_head_io.h"
 #include "ocfs2_trace.h"
@@ -69,13 +71,15 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
 static int ocfs2_trylock_journal(struct ocfs2_super *osb,
				 int slot_num);
 static int ocfs2_recover_orphans(struct ocfs2_super *osb,
-				 int slot);
+				 int slot,
+				 enum ocfs2_orphan_reco_type orphan_reco_type);
 static int ocfs2_commit_thread(void *arg);
 static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
					    int slot_num,
					    struct ocfs2_dinode *la_dinode,
					    struct ocfs2_dinode *tl_dinode,
-					    struct ocfs2_quota_recovery *qrec);
+					    struct ocfs2_quota_recovery *qrec,
+					    enum ocfs2_orphan_reco_type orphan_reco_type);
 
 static inline int ocfs2_wait_on_mount(struct ocfs2_super *osb)
 {
@@ -149,7 +153,8 @@ int ocfs2_compute_replay_slots(struct ocfs2_super *osb)
 	return 0;
 }
 
-void ocfs2_queue_replay_slots(struct ocfs2_super *osb)
+void ocfs2_queue_replay_slots(struct ocfs2_super *osb,
+		enum ocfs2_orphan_reco_type orphan_reco_type)
 {
 	struct ocfs2_replay_map *replay_map = osb->replay_map;
 	int i;
@@ -163,7 +168,8 @@ void ocfs2_queue_replay_slots(struct ocfs2_super *osb)
 	for (i = 0; i < replay_map->rm_slots; i++)
 		if (replay_map->rm_replay_slots[i])
 			ocfs2_queue_recovery_completion(osb->journal, i, NULL,
-							NULL, NULL);
+							NULL, NULL,
+							orphan_reco_type);
 	replay_map->rm_state = REPLAY_DONE;
 }
 
@@ -1174,6 +1180,7 @@ struct ocfs2_la_recovery_item {
 	struct ocfs2_dinode *lri_la_dinode;
 	struct ocfs2_dinode *lri_tl_dinode;
 	struct ocfs2_quota_recovery *lri_qrec;
+	enum ocfs2_orphan_reco_type lri_orphan_reco_type;
 };
 
 /* Does the second half of the recovery process. By this point, the
@@ -1195,6 +1202,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
 	struct ocfs2_dinode *la_dinode, *tl_dinode;
 	struct ocfs2_la_recovery_item *item, *n;
 	struct ocfs2_quota_recovery *qrec;
+	enum ocfs2_orphan_reco_type orphan_reco_type;
 	LIST_HEAD(tmp_la_list);
 
 	trace_ocfs2_complete_recovery(
@@ -1212,6 +1220,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
 		la_dinode = item->lri_la_dinode;
 		tl_dinode = item->lri_tl_dinode;
 		qrec = item->lri_qrec;
+		orphan_reco_type = item->lri_orphan_reco_type;
 
 		trace_ocfs2_complete_recovery_slot(item->lri_slot,
			la_dinode ? le64_to_cpu(la_dinode->i_blkno) : 0,
@@ -1236,7 +1245,8 @@ void ocfs2_complete_recovery(struct work_struct *work)
			kfree(tl_dinode);
		}
 
-		ret = ocfs2_recover_orphans(osb, item->lri_slot);
+		ret = ocfs2_recover_orphans(osb, item->lri_slot,
+				orphan_reco_type);
		if (ret < 0)
			mlog_errno(ret);
 
@@ -1261,7 +1271,8 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
					    int slot_num,
					    struct ocfs2_dinode *la_dinode,
					    struct ocfs2_dinode *tl_dinode,
-					    struct ocfs2_quota_recovery *qrec)
+					    struct ocfs2_quota_recovery *qrec,
+					    enum ocfs2_orphan_reco_type orphan_reco_type)
 {
 	struct ocfs2_la_recovery_item *item;
 
@@ -1285,6 +1296,7 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
 	item->lri_slot = slot_num;
 	item->lri_tl_dinode = tl_dinode;
 	item->lri_qrec = qrec;
+	item->lri_orphan_reco_type = orphan_reco_type;
 
 	spin_lock(&journal->j_lock);
 	list_add_tail(&item->lri_list, &journal->j_la_cleanups);
@@ -1304,7 +1316,8 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
 	/* No need to queue up our truncate_log as regular cleanup will catch
	 * that */
 	ocfs2_queue_recovery_completion(journal, osb->slot_num,
-					osb->local_alloc_copy, NULL, NULL);
+					osb->local_alloc_copy, NULL, NULL,
+					ORPHAN_NEED_TRUNCATE);
 	ocfs2_schedule_truncate_log_flush(osb, 0);
 
 	osb->local_alloc_copy = NULL;
@@ -1312,7 +1325,7 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
 
 	/* queue to recover orphan slots for all offline slots */
 	ocfs2_replay_map_set_state(osb, REPLAY_NEEDED);
-	ocfs2_queue_replay_slots(osb);
+	ocfs2_queue_replay_slots(osb, ORPHAN_NEED_TRUNCATE);
 	ocfs2_free_replay_slots(osb);
 }
 
@@ -1323,7 +1336,8 @@ void ocfs2_complete_quota_recovery(struct ocfs2_super *osb)
						osb->slot_num,
						NULL,
						NULL,
-						osb->quota_rec);
+						osb->quota_rec,
+						ORPHAN_NEED_TRUNCATE);
 		osb->quota_rec = NULL;
 	}
 }
@@ -1360,7 +1374,7 @@ restart:
 
 	/* queue recovery for our own slot */
 	ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
-					NULL, NULL);
+					NULL, NULL, ORPHAN_NO_NEED_TRUNCATE);
 
 	spin_lock(&osb->osb_lock);
 	while (rm->rm_used) {
@@ -1419,13 +1433,14 @@ skip_recovery:
			continue;
		}
		ocfs2_queue_recovery_completion(osb->journal, rm_quota[i],
-						NULL, NULL, qrec);
+						NULL, NULL, qrec,
+						ORPHAN_NEED_TRUNCATE);
	}
 
 	ocfs2_super_unlock(osb, 1);
 
 	/* queue recovery for offline slots */
-	ocfs2_queue_replay_slots(osb);
+	ocfs2_queue_replay_slots(osb, ORPHAN_NEED_TRUNCATE);
 
 bail:
 	mutex_lock(&osb->recovery_lock);
@@ -1447,7 +1462,6 @@ bail:
	 * requires that we call do_exit(). And it isn't exported, but
	 * complete_and_exit() seems to be a minimal wrapper around it. */
 	complete_and_exit(NULL, status);
-	return status;
 }
 
 void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
@@ -1712,7 +1726,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
 
 	/* This will kfree the memory pointed to by la_copy and tl_copy */
 	ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy,
-					tl_copy, NULL);
+					tl_copy, NULL, ORPHAN_NEED_TRUNCATE);
 
 	status = 0;
 done:
@@ -1902,7 +1916,7 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
 
 	for (i = 0; i < osb->max_slots; i++)
 		ocfs2_queue_recovery_completion(osb->journal, i, NULL, NULL,
-						NULL);
+						NULL, ORPHAN_NO_NEED_TRUNCATE);
 	/*
	 * We queued a recovery on orphan slots, increment the sequence
	 * number and update LVB so other node will skip the scan for a while
@@ -2001,6 +2015,13 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name,
 	if (IS_ERR(iter))
 		return 0;
 
+	/* Skip inodes which are already added to recover list, since dio may
+	 * happen concurrently with unlink/rename */
+	if (OCFS2_I(iter)->ip_next_orphan) {
+		iput(iter);
+		return 0;
+	}
+
 	trace_ocfs2_orphan_filldir((unsigned long long)OCFS2_I(iter)->ip_blkno);
 	/* No locking is required for the next_orphan queue as there
	 * is only ever a single process doing orphan recovery. */
@@ -2109,7 +2130,8 @@ static void ocfs2_clear_recovering_orphan_dir(struct ocfs2_super *osb,
  * advertising our state to ocfs2_delete_inode().
  */
 static int ocfs2_recover_orphans(struct ocfs2_super *osb,
-				 int slot)
+				 int slot,
+				 enum ocfs2_orphan_reco_type orphan_reco_type)
 {
 	int ret = 0;
 	struct inode *inode = NULL;
@@ -2133,13 +2155,60 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
				(unsigned long long)oi->ip_blkno);
 
 		iter = oi->ip_next_orphan;
+		oi->ip_next_orphan = NULL;
+
+		/*
+		 * We need to take and drop the inode lock to
+		 * force read inode from disk.
+		 */
+		ret = ocfs2_inode_lock(inode, NULL, 0);
+		if (ret) {
+			mlog_errno(ret);
+			goto next;
+		}
+		ocfs2_inode_unlock(inode, 0);
+
+		if (inode->i_nlink == 0) {
+			spin_lock(&oi->ip_lock);
+			/* Set the proper information to get us going into
+			 * ocfs2_delete_inode. */
+			oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
+			spin_unlock(&oi->ip_lock);
+		} else if (orphan_reco_type == ORPHAN_NEED_TRUNCATE) {
+			struct buffer_head *di_bh = NULL;
+
+			ret = ocfs2_rw_lock(inode, 1);
+			if (ret) {
+				mlog_errno(ret);
+				goto next;
+			}
+
+			ret = ocfs2_inode_lock(inode, &di_bh, 1);
+			if (ret < 0) {
+				ocfs2_rw_unlock(inode, 1);
+				mlog_errno(ret);
+				goto next;
+			}
+
+			ret = ocfs2_truncate_file(inode, di_bh,
+					i_size_read(inode));
+			ocfs2_inode_unlock(inode, 1);
+			ocfs2_rw_unlock(inode, 1);
+			brelse(di_bh);
+			if (ret < 0) {
+				if (ret != -ENOSPC)
+					mlog_errno(ret);
+				goto next;
+			}
+
+			ret = ocfs2_del_inode_from_orphan(osb, inode, 0, 0);
+			if (ret)
+				mlog_errno(ret);
 
-		spin_lock(&oi->ip_lock);
-		/* Set the proper information to get us going into
-		 * ocfs2_delete_inode. */
-		oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
-		spin_unlock(&oi->ip_lock);
+			wake_up(&OCFS2_I(inode)->append_dio_wq);
+		} /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */
 
+next:
 		iput(inode);
 
 		inode = iter;
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 7f8cde94abfe..f4cd3c3e9fb7 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -472,6 +472,11 @@ static inline int ocfs2_unlink_credits(struct super_block *sb)
  * orphan dir index leaf */
 #define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 4)
 
+/* dinode + orphan dir dinode + extent tree leaf block + orphan dir entry +
+ * orphan dir index root + orphan dir index leaf */
+#define OCFS2_INODE_ADD_TO_ORPHAN_CREDITS  (2 * OCFS2_INODE_UPDATE_CREDITS + 4)
+#define OCFS2_INODE_DEL_FROM_ORPHAN_CREDITS  OCFS2_INODE_ADD_TO_ORPHAN_CREDITS
+
 /* dinode update, old dir dinode update, new dir dinode update, old
  * dir dir entry, new dir dir entry, dir entry update for renaming
  * directory + target unlink + 3 x dir index leaves */
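The new credit macros follow the block list in their comment: two inode updates plus four other blocks. A worked expansion, assuming OCFS2_INODE_UPDATE_CREDITS is 1 (an assumption for illustration, not taken from this patch):

/* OCFS2_INODE_UPDATE_CREDITS assumed to be 1 here, for illustration only */
/* dinode + orphan dir dinode               -> 2 * 1 = 2 credits */
/* extent tree leaf + orphan dir entry +
 * orphan dir index root + orphan dir index leaf -> 4 credits */
/* OCFS2_INODE_ADD_TO_ORPHAN_CREDITS = 2 * 1 + 4 = 6 journal credits */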
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 10d66c75cecb..9581d190f6e1 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -173,7 +173,6 @@ out:
 static const struct vm_operations_struct ocfs2_file_vm_ops = {
 	.fault		= ocfs2_fault,
 	.page_mkwrite	= ocfs2_page_mkwrite,
-	.remap_pages	= generic_file_remap_pages,
 };
 
 int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index b931e04e3388..b5c3a5ea3ee6 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -79,7 +79,8 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
				    struct inode **ret_orphan_dir,
				    u64 blkno,
				    char *name,
-				    struct ocfs2_dir_lookup_result *lookup);
+				    struct ocfs2_dir_lookup_result *lookup,
+				    bool dio);
 
 static int ocfs2_orphan_add(struct ocfs2_super *osb,
			    handle_t *handle,
@@ -87,15 +88,26 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
			    struct buffer_head *fe_bh,
			    char *name,
			    struct ocfs2_dir_lookup_result *lookup,
-			    struct inode *orphan_dir_inode);
+			    struct inode *orphan_dir_inode,
+			    bool dio);
 
 static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
				     handle_t *handle,
				     struct inode *inode,
				     const char *symname);
 
+static int ocfs2_double_lock(struct ocfs2_super *osb,
+			     struct buffer_head **bh1,
+			     struct inode *inode1,
+			     struct buffer_head **bh2,
+			     struct inode *inode2,
+			     int rename);
+
+static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2);
 /* An orphan dir name is an 8 byte value, printed as a hex string */
 #define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64)))
+#define OCFS2_DIO_ORPHAN_PREFIX "dio-"
+#define OCFS2_DIO_ORPHAN_PREFIX_LEN 4
 
 static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
				   unsigned int flags)
@@ -678,8 +690,10 @@ static int ocfs2_link(struct dentry *old_dentry,
 {
 	handle_t *handle;
 	struct inode *inode = old_dentry->d_inode;
+	struct inode *old_dir = old_dentry->d_parent->d_inode;
 	int err;
 	struct buffer_head *fe_bh = NULL;
+	struct buffer_head *old_dir_bh = NULL;
 	struct buffer_head *parent_fe_bh = NULL;
 	struct ocfs2_dinode *fe = NULL;
 	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
@@ -696,19 +710,33 @@ static int ocfs2_link(struct dentry *old_dentry,
 
 	dquot_initialize(dir);
 
-	err = ocfs2_inode_lock_nested(dir, &parent_fe_bh, 1, OI_LS_PARENT);
+	err = ocfs2_double_lock(osb, &old_dir_bh, old_dir,
+			&parent_fe_bh, dir, 0);
 	if (err < 0) {
 		if (err != -ENOENT)
 			mlog_errno(err);
 		return err;
 	}
 
+	/* make sure both dirs have bhs
+	 * get an extra ref on old_dir_bh if old==new */
+	if (!parent_fe_bh) {
+		if (old_dir_bh) {
+			parent_fe_bh = old_dir_bh;
+			get_bh(parent_fe_bh);
+		} else {
+			mlog(ML_ERROR, "%s: no old_dir_bh!\n", osb->uuid_str);
+			err = -EIO;
+			goto out;
+		}
+	}
+
 	if (!dir->i_nlink) {
 		err = -ENOENT;
 		goto out;
 	}
 
-	err = ocfs2_lookup_ino_from_name(dir, old_dentry->d_name.name,
+	err = ocfs2_lookup_ino_from_name(old_dir, old_dentry->d_name.name,
			old_dentry->d_name.len, &old_de_ino);
 	if (err) {
 		err = -ENOENT;
@@ -801,10 +829,11 @@ out_unlock_inode:
 	ocfs2_inode_unlock(inode, 1);
 
 out:
-	ocfs2_inode_unlock(dir, 1);
+	ocfs2_double_unlock(old_dir, dir);
 
 	brelse(fe_bh);
 	brelse(parent_fe_bh);
+	brelse(old_dir_bh);
 
 	ocfs2_free_dir_lookup_result(&lookup);
 
@@ -927,7 +956,8 @@ static int ocfs2_unlink(struct inode *dir,
 	if (ocfs2_inode_is_unlinkable(inode)) {
 		status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
						  OCFS2_I(inode)->ip_blkno,
-						  orphan_name, &orphan_insert);
+						  orphan_name, &orphan_insert,
+						  false);
 		if (status < 0) {
 			mlog_errno(status);
 			goto leave;
@@ -979,7 +1009,7 @@ static int ocfs2_unlink(struct inode *dir,
 
 	if (is_unlinkable) {
 		status = ocfs2_orphan_add(osb, handle, inode, fe_bh,
-					  orphan_name, &orphan_insert, orphan_dir);
+					  orphan_name, &orphan_insert, orphan_dir, false);
 		if (status < 0)
 			mlog_errno(status);
 	}
@@ -1072,14 +1102,15 @@ static int ocfs2_check_if_ancestor(struct ocfs2_super *osb,
 }
 
 /*
- * The only place this should be used is rename!
+ * The only place this should be used is rename and link!
  * if they have the same id, then the 1st one is the only one locked.
  */
 static int ocfs2_double_lock(struct ocfs2_super *osb,
			     struct buffer_head **bh1,
			     struct inode *inode1,
			     struct buffer_head **bh2,
-			     struct inode *inode2)
+			     struct inode *inode2,
+			     int rename)
 {
 	int status;
 	int inode1_is_ancestor, inode2_is_ancestor;
@@ -1127,7 +1158,7 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
		}
		/* lock id2 */
		status = ocfs2_inode_lock_nested(inode2, bh2, 1,
-						 OI_LS_RENAME1);
+				rename == 1 ? OI_LS_RENAME1 : OI_LS_PARENT);
		if (status < 0) {
			if (status != -ENOENT)
				mlog_errno(status);
@@ -1136,7 +1167,8 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
 	}
 
 	/* lock id1 */
-	status = ocfs2_inode_lock_nested(inode1, bh1, 1, OI_LS_RENAME2);
+	status = ocfs2_inode_lock_nested(inode1, bh1, 1,
+			rename == 1 ? OI_LS_RENAME2 : OI_LS_PARENT);
 	if (status < 0) {
 		/*
		 * An error return must mean that no cluster locks
@@ -1252,7 +1284,7 @@ static int ocfs2_rename(struct inode *old_dir,
1252 1284
1253 /* if old and new are the same, this'll just do one lock. */ 1285 /* if old and new are the same, this'll just do one lock. */
1254 status = ocfs2_double_lock(osb, &old_dir_bh, old_dir, 1286 status = ocfs2_double_lock(osb, &old_dir_bh, old_dir,
1255 &new_dir_bh, new_dir); 1287 &new_dir_bh, new_dir, 1);
1256 if (status < 0) { 1288 if (status < 0) {
1257 mlog_errno(status); 1289 mlog_errno(status);
1258 goto bail; 1290 goto bail;
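
A sketch of the double-lock discipline ocfs2_double_lock implements: take the two locks in a canonical order so concurrent rename/link callers cannot deadlock, and take only one lock when both inodes are the same. pthread mutexes stand in for cluster locks here; the real ordering in ocfs2 is by inode block number.

/* Build: cc -o dlock dlock.c -lpthread */
#include <pthread.h>
#include <stdio.h>

struct node { unsigned long id; pthread_mutex_t lock; };

static void double_lock(struct node *a, struct node *b)
{
    if (a == b) {               /* same object: single lock */
        pthread_mutex_lock(&a->lock);
        return;
    }
    if (a->id > b->id) {        /* canonical order kills ABBA */
        struct node *t = a; a = b; b = t;
    }
    pthread_mutex_lock(&a->lock);
    pthread_mutex_lock(&b->lock);
}

static void double_unlock(struct node *a, struct node *b)
{
    pthread_mutex_unlock(&a->lock);
    if (a != b)
        pthread_mutex_unlock(&b->lock);
}

int main(void)
{
    struct node x = { 1, PTHREAD_MUTEX_INITIALIZER };
    struct node y = { 2, PTHREAD_MUTEX_INITIALIZER };

    double_lock(&y, &x);        /* any argument order is safe */
    double_unlock(&y, &x);
    puts("ok");
    return 0;
}
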
@@ -1413,7 +1445,8 @@ static int ocfs2_rename(struct inode *old_dir,
1413 if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) { 1445 if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) {
1414 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, 1446 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
1415 OCFS2_I(new_inode)->ip_blkno, 1447 OCFS2_I(new_inode)->ip_blkno,
1416 orphan_name, &orphan_insert); 1448 orphan_name, &orphan_insert,
1449 false);
1417 if (status < 0) { 1450 if (status < 0) {
1418 mlog_errno(status); 1451 mlog_errno(status);
1419 goto bail; 1452 goto bail;
@@ -1480,7 +1513,7 @@ static int ocfs2_rename(struct inode *old_dir,
1480 if (should_add_orphan) { 1513 if (should_add_orphan) {
1481 status = ocfs2_orphan_add(osb, handle, new_inode, 1514 status = ocfs2_orphan_add(osb, handle, new_inode,
1482 newfe_bh, orphan_name, 1515 newfe_bh, orphan_name,
1483 &orphan_insert, orphan_dir); 1516 &orphan_insert, orphan_dir, false);
1484 if (status < 0) { 1517 if (status < 0) {
1485 mlog_errno(status); 1518 mlog_errno(status);
1486 goto bail; 1519 goto bail;
@@ -2061,12 +2094,28 @@ static int __ocfs2_prepare_orphan_dir(struct inode *orphan_dir_inode,
2061 struct buffer_head *orphan_dir_bh, 2094 struct buffer_head *orphan_dir_bh,
2062 u64 blkno, 2095 u64 blkno,
2063 char *name, 2096 char *name,
2064 struct ocfs2_dir_lookup_result *lookup) 2097 struct ocfs2_dir_lookup_result *lookup,
2098 bool dio)
2065{ 2099{
2066 int ret; 2100 int ret;
2067 struct ocfs2_super *osb = OCFS2_SB(orphan_dir_inode->i_sb); 2101 struct ocfs2_super *osb = OCFS2_SB(orphan_dir_inode->i_sb);
2102 int namelen = dio ?
2103 (OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN) :
2104 OCFS2_ORPHAN_NAMELEN;
2105
2106 if (dio) {
2107 ret = snprintf(name, OCFS2_DIO_ORPHAN_PREFIX_LEN + 1, "%s",
2108 OCFS2_DIO_ORPHAN_PREFIX);
2109 if (ret != OCFS2_DIO_ORPHAN_PREFIX_LEN) {
2110 ret = -EINVAL;
2111 mlog_errno(ret);
2112 return ret;
2113 }
2068 2114
2069 ret = ocfs2_blkno_stringify(blkno, name); 2115 ret = ocfs2_blkno_stringify(blkno,
2116 name + OCFS2_DIO_ORPHAN_PREFIX_LEN);
2117 } else
2118 ret = ocfs2_blkno_stringify(blkno, name);
2070 if (ret < 0) { 2119 if (ret < 0) {
2071 mlog_errno(ret); 2120 mlog_errno(ret);
2072 return ret; 2121 return ret;
@@ -2074,7 +2123,7 @@ static int __ocfs2_prepare_orphan_dir(struct inode *orphan_dir_inode,
2074 2123
2075 ret = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode, 2124 ret = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode,
2076 orphan_dir_bh, name, 2125 orphan_dir_bh, name,
2077 OCFS2_ORPHAN_NAMELEN, lookup); 2126 namelen, lookup);
2078 if (ret < 0) { 2127 if (ret < 0) {
2079 mlog_errno(ret); 2128 mlog_errno(ret);
2080 return ret; 2129 return ret;
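
A stand-alone sketch of the two-step orphan-name construction above: write the "dio-" prefix, then the zero-padded block number behind it. It assumes the block number renders as 16 hex digits, which is what OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64))) implies; the helper name is hypothetical.

/* Build: cc -o dioname dioname.c */
#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define DIO_PREFIX      "dio-"
#define DIO_PREFIX_LEN  4
#define ORPHAN_NAMELEN  ((int)(2 * sizeof(uint64_t)))

static int build_dio_orphan_name(char *name, size_t size, uint64_t blkno)
{
    /* prefix first, exactly as __ocfs2_prepare_orphan_dir() does */
    int ret = snprintf(name, DIO_PREFIX_LEN + 1, "%s", DIO_PREFIX);
    if (ret != DIO_PREFIX_LEN)
        return -1;
    /* then the zero-padded block number behind the prefix */
    ret = snprintf(name + DIO_PREFIX_LEN, size - DIO_PREFIX_LEN,
                   "%016llx", (unsigned long long)blkno);
    return ret == ORPHAN_NAMELEN ? 0 : -1;
}

int main(void)
{
    char name[DIO_PREFIX_LEN + ORPHAN_NAMELEN + 1];

    if (!build_dio_orphan_name(name, sizeof(name), 0x1234abcdULL))
        printf("%s (len %zu)\n", name, strlen(name));
        /* prints: dio-000000001234abcd (len 20) */
    return 0;
}
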
@@ -2101,7 +2150,8 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
2101 struct inode **ret_orphan_dir, 2150 struct inode **ret_orphan_dir,
2102 u64 blkno, 2151 u64 blkno,
2103 char *name, 2152 char *name,
2104 struct ocfs2_dir_lookup_result *lookup) 2153 struct ocfs2_dir_lookup_result *lookup,
2154 bool dio)
2105{ 2155{
2106 struct inode *orphan_dir_inode = NULL; 2156 struct inode *orphan_dir_inode = NULL;
2107 struct buffer_head *orphan_dir_bh = NULL; 2157 struct buffer_head *orphan_dir_bh = NULL;
@@ -2115,7 +2165,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
2115 } 2165 }
2116 2166
2117 ret = __ocfs2_prepare_orphan_dir(orphan_dir_inode, orphan_dir_bh, 2167 ret = __ocfs2_prepare_orphan_dir(orphan_dir_inode, orphan_dir_bh,
2118 blkno, name, lookup); 2168 blkno, name, lookup, dio);
2119 if (ret < 0) { 2169 if (ret < 0) {
2120 mlog_errno(ret); 2170 mlog_errno(ret);
2121 goto out; 2171 goto out;
@@ -2143,12 +2193,16 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
2143 struct buffer_head *fe_bh, 2193 struct buffer_head *fe_bh,
2144 char *name, 2194 char *name,
2145 struct ocfs2_dir_lookup_result *lookup, 2195 struct ocfs2_dir_lookup_result *lookup,
2146 struct inode *orphan_dir_inode) 2196 struct inode *orphan_dir_inode,
2197 bool dio)
2147{ 2198{
2148 struct buffer_head *orphan_dir_bh = NULL; 2199 struct buffer_head *orphan_dir_bh = NULL;
2149 int status = 0; 2200 int status = 0;
2150 struct ocfs2_dinode *orphan_fe; 2201 struct ocfs2_dinode *orphan_fe;
2151 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; 2202 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
2203 int namelen = dio ?
2204 (OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN) :
2205 OCFS2_ORPHAN_NAMELEN;
2152 2206
2153 trace_ocfs2_orphan_add_begin( 2207 trace_ocfs2_orphan_add_begin(
2154 (unsigned long long)OCFS2_I(inode)->ip_blkno); 2208 (unsigned long long)OCFS2_I(inode)->ip_blkno);
@@ -2192,7 +2246,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
2192 ocfs2_journal_dirty(handle, orphan_dir_bh); 2246 ocfs2_journal_dirty(handle, orphan_dir_bh);
2193 2247
2194 status = __ocfs2_add_entry(handle, orphan_dir_inode, name, 2248 status = __ocfs2_add_entry(handle, orphan_dir_inode, name,
2195 OCFS2_ORPHAN_NAMELEN, inode, 2249 namelen, inode,
2196 OCFS2_I(inode)->ip_blkno, 2250 OCFS2_I(inode)->ip_blkno,
2197 orphan_dir_bh, lookup); 2251 orphan_dir_bh, lookup);
2198 if (status < 0) { 2252 if (status < 0) {
@@ -2200,13 +2254,21 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
2200 goto rollback; 2254 goto rollback;
2201 } 2255 }
2202 2256
2203 fe->i_flags |= cpu_to_le32(OCFS2_ORPHANED_FL); 2257 if (dio) {
2204 OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR; 2258 /* Update flag OCFS2_DIO_ORPHANED_FL and record the orphan
2259 * slot.
2260 */
2261 fe->i_flags |= cpu_to_le32(OCFS2_DIO_ORPHANED_FL);
2262 fe->i_dio_orphaned_slot = cpu_to_le16(osb->slot_num);
2263 } else {
2264 fe->i_flags |= cpu_to_le32(OCFS2_ORPHANED_FL);
2265 OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR;
2205 2266
2206 /* Record which orphan dir our inode now resides 2267 /* Record which orphan dir our inode now resides
2207 * in. delete_inode will use this to determine which orphan 2268 * in. delete_inode will use this to determine which orphan
2208 * dir to lock. */ 2269 * dir to lock. */
2209 fe->i_orphaned_slot = cpu_to_le16(osb->slot_num); 2270 fe->i_orphaned_slot = cpu_to_le16(osb->slot_num);
2271 }
2210 2272
2211 ocfs2_journal_dirty(handle, fe_bh); 2273 ocfs2_journal_dirty(handle, fe_bh);
2212 2274
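
The flag update above ORs a CPU-order constant, converted to little-endian, into the on-disk i_flags, and records the slot the same way. A hedged user-space equivalent, with <endian.h> standing in for the kernel's cpu_to_le32()/le16_to_cpu() family:

/* Build: cc -o leflags leflags.c */
#include <endian.h>
#include <stdint.h>
#include <stdio.h>

#define DIO_ORPHANED_FL 0x00002000u

struct disk_inode {
    uint32_t i_flags;              /* little-endian on disk */
    uint16_t i_dio_orphaned_slot;  /* little-endian on disk */
};

int main(void)
{
    struct disk_inode di = { 0 };
    uint16_t slot = 3;

    /* convert before storing, so the byte order is fixed on disk */
    di.i_flags |= htole32(DIO_ORPHANED_FL);
    di.i_dio_orphaned_slot = htole16(slot);

    printf("flag set: %s, slot: %u\n",
           (le32toh(di.i_flags) & DIO_ORPHANED_FL) ? "yes" : "no",
           le16toh(di.i_dio_orphaned_slot));
    return 0;
}
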
@@ -2231,14 +2293,28 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
2231 handle_t *handle, 2293 handle_t *handle,
2232 struct inode *orphan_dir_inode, 2294 struct inode *orphan_dir_inode,
2233 struct inode *inode, 2295 struct inode *inode,
2234 struct buffer_head *orphan_dir_bh) 2296 struct buffer_head *orphan_dir_bh,
2297 bool dio)
2235{ 2298{
2236 char name[OCFS2_ORPHAN_NAMELEN + 1]; 2299 const int namelen = OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN;
2300 char name[namelen + 1];
2237 struct ocfs2_dinode *orphan_fe; 2301 struct ocfs2_dinode *orphan_fe;
2238 int status = 0; 2302 int status = 0;
2239 struct ocfs2_dir_lookup_result lookup = { NULL, }; 2303 struct ocfs2_dir_lookup_result lookup = { NULL, };
2240 2304
2241 status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name); 2305 if (dio) {
2306 status = snprintf(name, OCFS2_DIO_ORPHAN_PREFIX_LEN + 1, "%s",
2307 OCFS2_DIO_ORPHAN_PREFIX);
2308 if (status != OCFS2_DIO_ORPHAN_PREFIX_LEN) {
2309 status = -EINVAL;
2310 mlog_errno(status);
2311 return status;
2312 }
2313
2314 status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno,
2315 name + OCFS2_DIO_ORPHAN_PREFIX_LEN);
2316 } else
2317 status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name);
2242 if (status < 0) { 2318 if (status < 0) {
2243 mlog_errno(status); 2319 mlog_errno(status);
2244 goto leave; 2320 goto leave;
@@ -2246,10 +2322,10 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
2246 2322
2247 trace_ocfs2_orphan_del( 2323 trace_ocfs2_orphan_del(
2248 (unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno, 2324 (unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno,
2249 name, OCFS2_ORPHAN_NAMELEN); 2325 name, namelen);
2250 2326
2251 /* find it's spot in the orphan directory */ 2327 /* find it's spot in the orphan directory */
2251 /* find its spot in the orphan directory */ 2327
2252 status = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN, orphan_dir_inode, 2328 status = ocfs2_find_entry(name, namelen, orphan_dir_inode,
2253 &lookup); 2329 &lookup);
2254 if (status) { 2330 if (status) {
2255 mlog_errno(status); 2331 mlog_errno(status);
@@ -2349,7 +2425,8 @@ static int ocfs2_prep_new_orphaned_file(struct inode *dir,
2349 } 2425 }
2350 2426
2351 ret = __ocfs2_prepare_orphan_dir(orphan_dir, orphan_dir_bh, 2427 ret = __ocfs2_prepare_orphan_dir(orphan_dir, orphan_dir_bh,
2352 di_blkno, orphan_name, orphan_insert); 2428 di_blkno, orphan_name, orphan_insert,
2429 false);
2353 if (ret < 0) { 2430 if (ret < 0) {
2354 mlog_errno(ret); 2431 mlog_errno(ret);
2355 goto out; 2432 goto out;
@@ -2455,7 +2532,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
2455 2532
2456 di = (struct ocfs2_dinode *)new_di_bh->b_data; 2533 di = (struct ocfs2_dinode *)new_di_bh->b_data;
2457 status = ocfs2_orphan_add(osb, handle, inode, new_di_bh, orphan_name, 2534 status = ocfs2_orphan_add(osb, handle, inode, new_di_bh, orphan_name,
2458 &orphan_insert, orphan_dir); 2535 &orphan_insert, orphan_dir, false);
2459 if (status < 0) { 2536 if (status < 0) {
2460 mlog_errno(status); 2537 mlog_errno(status);
2461 goto leave; 2538 goto leave;
@@ -2500,6 +2577,186 @@ leave:
2500 return status; 2577 return status;
2501} 2578}
2502 2579
2580static int ocfs2_dio_orphan_recovered(struct inode *inode)
2581{
2582 int ret;
2583 struct buffer_head *di_bh = NULL;
2584 struct ocfs2_dinode *di = NULL;
2585
2586 ret = ocfs2_inode_lock(inode, &di_bh, 1);
2587 if (ret < 0) {
2588 mlog_errno(ret);
2589 return 0;
2590 }
2591
2592 di = (struct ocfs2_dinode *) di_bh->b_data;
2593 ret = !(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL));
2594 ocfs2_inode_unlock(inode, 1);
2595 brelse(di_bh);
2596
2597 return ret;
2598}
2599
2600#define OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL 10000
2601int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
2602 struct inode *inode)
2603{
2604 char orphan_name[OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN + 1];
2605 struct inode *orphan_dir_inode = NULL;
2606 struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
2607 struct buffer_head *di_bh = NULL;
2608 int status = 0;
2609 handle_t *handle = NULL;
2610 struct ocfs2_dinode *di = NULL;
2611
2612restart:
2613 status = ocfs2_inode_lock(inode, &di_bh, 1);
2614 if (status < 0) {
2615 mlog_errno(status);
2616 goto bail;
2617 }
2618
2619 di = (struct ocfs2_dinode *) di_bh->b_data;
2620 /*
2621 * Another append dio crashed?
2622 * If so, wait for recovery first.
2623 */
2624 if (unlikely(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) {
2625 ocfs2_inode_unlock(inode, 1);
2626 brelse(di_bh);
2627 wait_event_interruptible_timeout(OCFS2_I(inode)->append_dio_wq,
2628 ocfs2_dio_orphan_recovered(inode),
2629 msecs_to_jiffies(OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL));
2630 goto restart;
2631 }
2632
2633 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir_inode,
2634 OCFS2_I(inode)->ip_blkno,
2635 orphan_name,
2636 &orphan_insert,
2637 true);
2638 if (status < 0) {
2639 mlog_errno(status);
2640 goto bail_unlock_inode;
2641 }
2642
2643 handle = ocfs2_start_trans(osb,
2644 OCFS2_INODE_ADD_TO_ORPHAN_CREDITS);
2645 if (IS_ERR(handle)) {
2646 status = PTR_ERR(handle);
2647 goto bail_unlock_orphan;
2648 }
2649
2650 status = ocfs2_orphan_add(osb, handle, inode, di_bh, orphan_name,
2651 &orphan_insert, orphan_dir_inode, true);
2652 if (status)
2653 mlog_errno(status);
2654
2655 ocfs2_commit_trans(osb, handle);
2656
2657bail_unlock_orphan:
2658 ocfs2_inode_unlock(orphan_dir_inode, 1);
2659 mutex_unlock(&orphan_dir_inode->i_mutex);
2660 iput(orphan_dir_inode);
2661
2662 ocfs2_free_dir_lookup_result(&orphan_insert);
2663
2664bail_unlock_inode:
2665 ocfs2_inode_unlock(inode, 1);
2666 brelse(di_bh);
2667
2668bail:
2669 return status;
2670}
2671
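
ocfs2_add_inode_to_orphan keeps restarting until a prior crash's OCFS2_DIO_ORPHANED_FL has been cleared by recovery: check under the inode lock, drop the lock, wait with a bounded 10 s timeout, re-check. A sketch of that loop, with a pthread condition variable standing in for the kernel waitqueue:

/* Build: cc -o diowait diowait.c -lpthread */
#include <pthread.h>
#include <stdio.h>
#include <time.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t recovered = PTHREAD_COND_INITIALIZER;
static int dio_orphaned = 1;    /* pretend a crash left it set */

static void *recovery_thread(void *arg)
{
    (void)arg;
    pthread_mutex_lock(&lock);
    dio_orphaned = 0;           /* orphan recovery clears the flag */
    pthread_cond_broadcast(&recovered);
    pthread_mutex_unlock(&lock);
    return NULL;
}

int main(void)
{
    pthread_t t;
    pthread_create(&t, NULL, recovery_thread, NULL);

    pthread_mutex_lock(&lock);
    while (dio_orphaned) {      /* the "restart:" label above */
        struct timespec ts;
        clock_gettime(CLOCK_REALTIME, &ts);
        ts.tv_sec += 10;        /* bounded wait, then re-check */
        pthread_cond_timedwait(&recovered, &lock, &ts);
    }
    /* flag clear: safe to add the inode to the orphan dir */
    pthread_mutex_unlock(&lock);
    pthread_join(t, NULL);
    puts("recovered, proceeding");
    return 0;
}
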
2672int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
2673 struct inode *inode, int update_isize,
2674 loff_t end)
2675{
2676 struct inode *orphan_dir_inode = NULL;
2677 struct buffer_head *orphan_dir_bh = NULL;
2678 struct buffer_head *di_bh = NULL;
2679 struct ocfs2_dinode *di = NULL;
2680 handle_t *handle = NULL;
2681 int status = 0;
2682
2683 status = ocfs2_inode_lock(inode, &di_bh, 1);
2684 if (status < 0) {
2685 mlog_errno(status);
2686 goto bail;
2687 }
2688 di = (struct ocfs2_dinode *) di_bh->b_data;
2689
2690 orphan_dir_inode = ocfs2_get_system_file_inode(osb,
2691 ORPHAN_DIR_SYSTEM_INODE,
2692 le16_to_cpu(di->i_dio_orphaned_slot));
2693 if (!orphan_dir_inode) {
2694 status = -ENOENT;
2695 mlog_errno(status);
2696 goto bail_unlock_inode;
2697 }
2698
2699 mutex_lock(&orphan_dir_inode->i_mutex);
2700 status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
2701 if (status < 0) {
2702 mutex_unlock(&orphan_dir_inode->i_mutex);
2703 iput(orphan_dir_inode);
2704 mlog_errno(status);
2705 goto bail_unlock_inode;
2706 }
2707
2708 handle = ocfs2_start_trans(osb,
2709 OCFS2_INODE_DEL_FROM_ORPHAN_CREDITS);
2710 if (IS_ERR(handle)) {
2711 status = PTR_ERR(handle);
2712 goto bail_unlock_orphan;
2713 }
2714
2715 BUG_ON(!(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL)));
2716
2717 status = ocfs2_orphan_del(osb, handle, orphan_dir_inode,
2718 inode, orphan_dir_bh, true);
2719 if (status < 0) {
2720 mlog_errno(status);
2721 goto bail_commit;
2722 }
2723
2724 status = ocfs2_journal_access_di(handle,
2725 INODE_CACHE(inode),
2726 di_bh,
2727 OCFS2_JOURNAL_ACCESS_WRITE);
2728 if (status < 0) {
2729 mlog_errno(status);
2730 goto bail_commit;
2731 }
2732
2733 di->i_flags &= ~cpu_to_le32(OCFS2_DIO_ORPHANED_FL);
2734 di->i_dio_orphaned_slot = 0;
2735
2736 if (update_isize) {
2737 status = ocfs2_set_inode_size(handle, inode, di_bh, end);
2738 if (status)
2739 mlog_errno(status);
2740 } else
2741 ocfs2_journal_dirty(handle, di_bh);
2742
2743bail_commit:
2744 ocfs2_commit_trans(osb, handle);
2745
2746bail_unlock_orphan:
2747 ocfs2_inode_unlock(orphan_dir_inode, 1);
2748 mutex_unlock(&orphan_dir_inode->i_mutex);
2749 brelse(orphan_dir_bh);
2750 iput(orphan_dir_inode);
2751
2752bail_unlock_inode:
2753 ocfs2_inode_unlock(inode, 1);
2754 brelse(di_bh);
2755
2756bail:
2757 return status;
2758}
2759
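
ocfs2_del_inode_from_orphan above uses the kernel's layered goto-unwinding: each bail_* label releases exactly the resources acquired before the failing step, in reverse order. A toy, runnable version of the same shape (the acquire/release helpers are invented for illustration):

/* Build: cc -o bail bail.c */
#include <stdio.h>

static int acquire(const char *what, int fail)
{
    if (fail) { printf("failed to acquire %s\n", what); return -1; }
    printf("acquired %s\n", what);
    return 0;
}

static void release(const char *what) { printf("released %s\n", what); }

int do_work(int fail_at)
{
    int status = 0;

    if ((status = acquire("inode lock", fail_at == 1)))
        goto bail;
    if ((status = acquire("orphan dir lock", fail_at == 2)))
        goto bail_unlock_inode;
    if ((status = acquire("transaction", fail_at == 3)))
        goto bail_unlock_orphan;

    /* ... actual work ... */
    release("transaction");

bail_unlock_orphan:
    release("orphan dir lock");
bail_unlock_inode:
    release("inode lock");
bail:
    return status;
}

int main(void) { return do_work(3) ? 1 : 0; }
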
2503int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, 2760int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
2504 struct inode *inode, 2761 struct inode *inode,
2505 struct dentry *dentry) 2762 struct dentry *dentry)
@@ -2588,7 +2845,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
2588 } 2845 }
2589 2846
2590 status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode, 2847 status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode,
2591 orphan_dir_bh); 2848 orphan_dir_bh, false);
2592 if (status < 0) { 2849 if (status < 0) {
2593 mlog_errno(status); 2850 mlog_errno(status);
2594 goto out_commit; 2851 goto out_commit;
diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h
index e5d059d4f115..5ddecce172fa 100644
--- a/fs/ocfs2/namei.h
+++ b/fs/ocfs2/namei.h
@@ -34,10 +34,16 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
34 handle_t *handle, 34 handle_t *handle,
35 struct inode *orphan_dir_inode, 35 struct inode *orphan_dir_inode,
36 struct inode *inode, 36 struct inode *inode,
37 struct buffer_head *orphan_dir_bh); 37 struct buffer_head *orphan_dir_bh,
38 bool dio);
38int ocfs2_create_inode_in_orphan(struct inode *dir, 39int ocfs2_create_inode_in_orphan(struct inode *dir,
39 int mode, 40 int mode,
40 struct inode **new_inode); 41 struct inode **new_inode);
42int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
43 struct inode *inode);
44int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
45 struct inode *inode, int update_isize,
46 loff_t end);
41int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, 47int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
42 struct inode *new_inode, 48 struct inode *new_inode,
43 struct dentry *new_dentry); 49 struct dentry *new_dentry);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 7d6b7d090452..8490c64d34fe 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -209,6 +209,11 @@ struct ocfs2_lock_res {
209#endif 209#endif
210}; 210};
211 211
212enum ocfs2_orphan_reco_type {
213 ORPHAN_NO_NEED_TRUNCATE = 0,
214 ORPHAN_NEED_TRUNCATE,
215};
216
212enum ocfs2_orphan_scan_state { 217enum ocfs2_orphan_scan_state {
213 ORPHAN_SCAN_ACTIVE, 218 ORPHAN_SCAN_ACTIVE,
214 ORPHAN_SCAN_INACTIVE 219 ORPHAN_SCAN_INACTIVE
@@ -279,6 +284,8 @@ enum ocfs2_mount_options
279 writes */ 284 writes */
280 OCFS2_MOUNT_HB_NONE = 1 << 13, /* No heartbeat */ 285 OCFS2_MOUNT_HB_NONE = 1 << 13, /* No heartbeat */
281 OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */ 286 OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */
287
288 OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT = 1 << 15, /* Journal Async Commit */
282}; 289};
283 290
284#define OCFS2_OSB_SOFT_RO 0x0001 291#define OCFS2_OSB_SOFT_RO 0x0001
@@ -493,6 +500,14 @@ static inline int ocfs2_writes_unwritten_extents(struct ocfs2_super *osb)
493 return 0; 500 return 0;
494} 501}
495 502
503static inline int ocfs2_supports_append_dio(struct ocfs2_super *osb)
504{
505 if (osb->s_feature_ro_compat & OCFS2_FEATURE_RO_COMPAT_APPEND_DIO)
506 return 1;
507 return 0;
508}
509
510
496static inline int ocfs2_supports_inline_data(struct ocfs2_super *osb) 511static inline int ocfs2_supports_inline_data(struct ocfs2_super *osb)
497{ 512{
498 if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_INLINE_DATA) 513 if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_INLINE_DATA)
@@ -724,6 +739,16 @@ static inline unsigned int ocfs2_clusters_for_bytes(struct super_block *sb,
724 return clusters; 739 return clusters;
725} 740}
726 741
742static inline unsigned int ocfs2_bytes_to_clusters(struct super_block *sb,
743 u64 bytes)
744{
745 int cl_bits = OCFS2_SB(sb)->s_clustersize_bits;
746 unsigned int clusters;
747
748 clusters = (unsigned int)(bytes >> cl_bits);
749 return clusters;
750}
751
727static inline u64 ocfs2_blocks_for_bytes(struct super_block *sb, 752static inline u64 ocfs2_blocks_for_bytes(struct super_block *sb,
728 u64 bytes) 753 u64 bytes)
729{ 754{
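
The new ocfs2_bytes_to_clusters() truncates with a plain right shift by s_clustersize_bits, unlike the round-up done by ocfs2_clusters_for_bytes() just above it. A small worked example, assuming a 64 KB cluster (clustersize_bits = 16):

/* Build: cc -o clusters clusters.c */
#include <stdio.h>
#include <stdint.h>

#define CLUSTERSIZE_BITS 16

static unsigned int bytes_to_clusters(uint64_t bytes)
{
    return (unsigned int)(bytes >> CLUSTERSIZE_BITS);   /* truncate */
}

static unsigned int clusters_for_bytes(uint64_t bytes)
{
    bytes += (1ULL << CLUSTERSIZE_BITS) - 1;            /* round up */
    return (unsigned int)(bytes >> CLUSTERSIZE_BITS);
}

int main(void)
{
    uint64_t bytes = 100000;    /* ~1.5 clusters */

    printf("truncated: %u, rounded up: %u\n",
           bytes_to_clusters(bytes), clusters_for_bytes(bytes));
    /* prints "truncated: 1, rounded up: 2" */
    return 0;
}
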
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 938387a10d5d..20e37a3ed26f 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -105,7 +105,8 @@
105 | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO) 105 | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO)
106#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ 106#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
107 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ 107 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
108 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) 108 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA \
109 | OCFS2_FEATURE_RO_COMPAT_APPEND_DIO)
109 110
110/* 111/*
111 * Heartbeat-only devices are missing journals and other files. The 112 * Heartbeat-only devices are missing journals and other files. The
@@ -199,6 +200,11 @@
199#define OCFS2_FEATURE_RO_COMPAT_USRQUOTA 0x0002 200#define OCFS2_FEATURE_RO_COMPAT_USRQUOTA 0x0002
200#define OCFS2_FEATURE_RO_COMPAT_GRPQUOTA 0x0004 201#define OCFS2_FEATURE_RO_COMPAT_GRPQUOTA 0x0004
201 202
203/*
204 * Append Direct IO support
205 */
206#define OCFS2_FEATURE_RO_COMPAT_APPEND_DIO 0x0008
207
202/* The byte offset of the first backup block will be 1G. 208/* The byte offset of the first backup block will be 1G.
203 * The following will be 4G, 16G, 64G, 256G and 1T. 209 * The following will be 4G, 16G, 64G, 256G and 1T.
204 */ 210 */
@@ -229,6 +235,8 @@
229#define OCFS2_CHAIN_FL (0x00000400) /* Chain allocator */ 235#define OCFS2_CHAIN_FL (0x00000400) /* Chain allocator */
230#define OCFS2_DEALLOC_FL (0x00000800) /* Truncate log */ 236#define OCFS2_DEALLOC_FL (0x00000800) /* Truncate log */
231#define OCFS2_QUOTA_FL (0x00001000) /* Quota file */ 237#define OCFS2_QUOTA_FL (0x00001000) /* Quota file */
238#define OCFS2_DIO_ORPHANED_FL (0X00002000) /* On the orphan list especially
239 * for dio */
232 240
233/* 241/*
234 * Flags on ocfs2_dinode.i_dyn_features 242 * Flags on ocfs2_dinode.i_dyn_features
@@ -729,7 +737,9 @@ struct ocfs2_dinode {
729 inode belongs to. Only valid 737 inode belongs to. Only valid
730 if allocated from a 738 if allocated from a
731 discontiguous block group */ 739 discontiguous block group */
732/*A0*/ __le64 i_reserved2[3]; 740/*A0*/ __le16 i_dio_orphaned_slot; /* only used for append dio write */
741 __le16 i_reserved1[3];
742 __le64 i_reserved2[2];
733/*B8*/ union { 743/*B8*/ union {
734 __le64 i_pad1; /* Generic way to refer to this 744 __le64 i_pad1; /* Generic way to refer to this
735 64bit union */ 745 64bit union */
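
The dinode change carves i_dio_orphaned_slot out of i_reserved2, so the /*A0*/ region stays 24 bytes and the /*B8*/ union keeps its offset. A quick stand-alone layout check, with plain integer types in place of __le16/__le64:

/* Build: cc -std=c11 -o layout layout.c */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct old_tail {           /* was: __le64 i_reserved2[3] at 0xA0 */
    uint64_t i_reserved2[3];
};

struct new_tail {           /* now: slot + shrunk reserves at 0xA0 */
    uint16_t i_dio_orphaned_slot;
    uint16_t i_reserved1[3];
    uint64_t i_reserved2[2];
};

int main(void)
{
    /* both layouts must cover exactly 0xA0..0xB7 (24 bytes) */
    static_assert(sizeof(struct old_tail) == 24, "old tail size");
    static_assert(sizeof(struct new_tail) == 24, "new tail size");
    printf("old %zu bytes, new %zu bytes\n",
           sizeof(struct old_tail), sizeof(struct new_tail));
    return 0;
}
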
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index 1eae330193a6..b6d51333ad02 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -48,6 +48,7 @@ struct ocfs2_quota_recovery {
48/* In-memory structure with quota header information */ 48/* In-memory structure with quota header information */
49struct ocfs2_mem_dqinfo { 49struct ocfs2_mem_dqinfo {
50 unsigned int dqi_type; /* Quota type this structure describes */ 50 unsigned int dqi_type; /* Quota type this structure describes */
51 unsigned int dqi_flags; /* Flags OLQF_* */
51 unsigned int dqi_chunks; /* Number of chunks in local quota file */ 52 unsigned int dqi_chunks; /* Number of chunks in local quota file */
52 unsigned int dqi_blocks; /* Number of blocks allocated for local quota file */ 53 unsigned int dqi_blocks; /* Number of blocks allocated for local quota file */
53 unsigned int dqi_syncms; /* How often should we sync with other nodes */ 54 unsigned int dqi_syncms; /* How often should we sync with other nodes */
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 10b653930ee2..3d0b63d34225 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -73,12 +73,6 @@ static loff_t ol_dqblk_off(struct super_block *sb, int c, int off)
73 ol_dqblk_block_off(sb, c, off); 73 ol_dqblk_block_off(sb, c, off);
74} 74}
75 75
76/* Compute block number from given offset */
77static inline unsigned int ol_dqblk_file_block(struct super_block *sb, loff_t off)
78{
79 return off >> sb->s_blocksize_bits;
80}
81
82static inline unsigned int ol_dqblk_block_offset(struct super_block *sb, loff_t off) 76static inline unsigned int ol_dqblk_block_offset(struct super_block *sb, loff_t off)
83{ 77{
84 return off & ((1 << sb->s_blocksize_bits) - 1); 78 return off & ((1 << sb->s_blocksize_bits) - 1);
@@ -292,7 +286,7 @@ static void olq_update_info(struct buffer_head *bh, void *private)
292 ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data + 286 ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
293 OCFS2_LOCAL_INFO_OFF); 287 OCFS2_LOCAL_INFO_OFF);
294 spin_lock(&dq_data_lock); 288 spin_lock(&dq_data_lock);
295 ldinfo->dqi_flags = cpu_to_le32(info->dqi_flags & DQF_MASK); 289 ldinfo->dqi_flags = cpu_to_le32(oinfo->dqi_flags);
296 ldinfo->dqi_chunks = cpu_to_le32(oinfo->dqi_chunks); 290 ldinfo->dqi_chunks = cpu_to_le32(oinfo->dqi_chunks);
297 ldinfo->dqi_blocks = cpu_to_le32(oinfo->dqi_blocks); 291 ldinfo->dqi_blocks = cpu_to_le32(oinfo->dqi_blocks);
298 spin_unlock(&dq_data_lock); 292 spin_unlock(&dq_data_lock);
@@ -701,8 +695,8 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
701 /* We don't need the lock and we have to acquire quota file locks 695 /* We don't need the lock and we have to acquire quota file locks
702 * which will later depend on this lock */ 696 * which will later depend on this lock */
703 mutex_unlock(&sb_dqopt(sb)->dqio_mutex); 697 mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
704 info->dqi_maxblimit = 0x7fffffffffffffffLL; 698 info->dqi_max_spc_limit = 0x7fffffffffffffffLL;
705 info->dqi_maxilimit = 0x7fffffffffffffffLL; 699 info->dqi_max_ino_limit = 0x7fffffffffffffffLL;
706 oinfo = kmalloc(sizeof(struct ocfs2_mem_dqinfo), GFP_NOFS); 700 oinfo = kmalloc(sizeof(struct ocfs2_mem_dqinfo), GFP_NOFS);
707 if (!oinfo) { 701 if (!oinfo) {
708 mlog(ML_ERROR, "failed to allocate memory for ocfs2 quota" 702 mlog(ML_ERROR, "failed to allocate memory for ocfs2 quota"
@@ -737,13 +731,13 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
737 } 731 }
738 ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data + 732 ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
739 OCFS2_LOCAL_INFO_OFF); 733 OCFS2_LOCAL_INFO_OFF);
740 info->dqi_flags = le32_to_cpu(ldinfo->dqi_flags); 734 oinfo->dqi_flags = le32_to_cpu(ldinfo->dqi_flags);
741 oinfo->dqi_chunks = le32_to_cpu(ldinfo->dqi_chunks); 735 oinfo->dqi_chunks = le32_to_cpu(ldinfo->dqi_chunks);
742 oinfo->dqi_blocks = le32_to_cpu(ldinfo->dqi_blocks); 736 oinfo->dqi_blocks = le32_to_cpu(ldinfo->dqi_blocks);
743 oinfo->dqi_libh = bh; 737 oinfo->dqi_libh = bh;
744 738
745 /* We crashed when using local quota file? */ 739 /* We crashed when using local quota file? */
746 if (!(info->dqi_flags & OLQF_CLEAN)) { 740 if (!(oinfo->dqi_flags & OLQF_CLEAN)) {
747 rec = OCFS2_SB(sb)->quota_rec; 741 rec = OCFS2_SB(sb)->quota_rec;
748 if (!rec) { 742 if (!rec) {
749 rec = ocfs2_alloc_quota_recovery(); 743 rec = ocfs2_alloc_quota_recovery();
@@ -772,7 +766,7 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
772 } 766 }
773 767
774 /* Now mark quota file as used */ 768 /* Now mark quota file as used */
775 info->dqi_flags &= ~OLQF_CLEAN; 769 oinfo->dqi_flags &= ~OLQF_CLEAN;
776 status = ocfs2_modify_bh(lqinode, bh, olq_update_info, info); 770 status = ocfs2_modify_bh(lqinode, bh, olq_update_info, info);
777 if (status < 0) { 771 if (status < 0) {
778 mlog_errno(status); 772 mlog_errno(status);
@@ -857,7 +851,7 @@ static int ocfs2_local_free_info(struct super_block *sb, int type)
857 goto out; 851 goto out;
858 852
859 /* Mark local file as clean */ 853 /* Mark local file as clean */
860 info->dqi_flags |= OLQF_CLEAN; 854 oinfo->dqi_flags |= OLQF_CLEAN;
861 status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], 855 status = ocfs2_modify_bh(sb_dqopt(sb)->files[type],
862 oinfo->dqi_libh, 856 oinfo->dqi_libh,
863 olq_update_info, 857 olq_update_info,
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index d81f6e2a97f5..ee541f92dab4 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2428,8 +2428,6 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
2428 get_bh(prev_bh); 2428 get_bh(prev_bh);
2429 } 2429 }
2430 2430
2431 rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
2432
2433 trace_ocfs2_calc_refcount_meta_credits_iterate( 2431 trace_ocfs2_calc_refcount_meta_credits_iterate(
2434 recs_add, (unsigned long long)cpos, clusters, 2432 recs_add, (unsigned long long)cpos, clusters,
2435 (unsigned long long)le64_to_cpu(rec.r_cpos), 2433 (unsigned long long)le64_to_cpu(rec.r_cpos),
diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c
index 41ffd36c689c..6a348b0294ab 100644
--- a/fs/ocfs2/reservations.c
+++ b/fs/ocfs2/reservations.c
@@ -39,7 +39,7 @@
39#define OCFS2_CHECK_RESERVATIONS 39#define OCFS2_CHECK_RESERVATIONS
40#endif 40#endif
41 41
42DEFINE_SPINLOCK(resv_lock); 42static DEFINE_SPINLOCK(resv_lock);
43 43
44#define OCFS2_MIN_RESV_WINDOW_BITS 8 44#define OCFS2_MIN_RESV_WINDOW_BITS 8
45#define OCFS2_MAX_RESV_WINDOW_BITS 1024 45#define OCFS2_MAX_RESV_WINDOW_BITS 1024
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 83723179e1ec..26675185b886 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -191,6 +191,7 @@ enum {
191 Opt_coherency_full, 191 Opt_coherency_full,
192 Opt_resv_level, 192 Opt_resv_level,
193 Opt_dir_resv_level, 193 Opt_dir_resv_level,
194 Opt_journal_async_commit,
194 Opt_err, 195 Opt_err,
195}; 196};
196 197
@@ -222,6 +223,7 @@ static const match_table_t tokens = {
222 {Opt_coherency_full, "coherency=full"}, 223 {Opt_coherency_full, "coherency=full"},
223 {Opt_resv_level, "resv_level=%u"}, 224 {Opt_resv_level, "resv_level=%u"},
224 {Opt_dir_resv_level, "dir_resv_level=%u"}, 225 {Opt_dir_resv_level, "dir_resv_level=%u"},
226 {Opt_journal_async_commit, "journal_async_commit"},
225 {Opt_err, NULL} 227 {Opt_err, NULL}
226}; 228};
227 229
@@ -1000,36 +1002,6 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
1000 } 1002 }
1001} 1003}
1002 1004
1003/* Handle quota on quotactl */
1004static int ocfs2_quota_on(struct super_block *sb, int type, int format_id)
1005{
1006 unsigned int feature[OCFS2_MAXQUOTAS] = {
1007 OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
1008 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
1009
1010 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
1011 return -EINVAL;
1012
1013 return dquot_enable(sb_dqopt(sb)->files[type], type,
1014 format_id, DQUOT_LIMITS_ENABLED);
1015}
1016
1017/* Handle quota off quotactl */
1018static int ocfs2_quota_off(struct super_block *sb, int type)
1019{
1020 return dquot_disable(sb, type, DQUOT_LIMITS_ENABLED);
1021}
1022
1023static const struct quotactl_ops ocfs2_quotactl_ops = {
1024 .quota_on_meta = ocfs2_quota_on,
1025 .quota_off = ocfs2_quota_off,
1026 .quota_sync = dquot_quota_sync,
1027 .get_info = dquot_get_dqinfo,
1028 .set_info = dquot_set_dqinfo,
1029 .get_dqblk = dquot_get_dqblk,
1030 .set_dqblk = dquot_set_dqblk,
1031};
1032
1033static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) 1005static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
1034{ 1006{
1035 struct dentry *root; 1007 struct dentry *root;
@@ -1500,6 +1472,9 @@ static int ocfs2_parse_options(struct super_block *sb,
1500 option < OCFS2_MAX_RESV_LEVEL) 1472 option < OCFS2_MAX_RESV_LEVEL)
1501 mopt->dir_resv_level = option; 1473 mopt->dir_resv_level = option;
1502 break; 1474 break;
1475 case Opt_journal_async_commit:
1476 mopt->mount_opt |= OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT;
1477 break;
1503 default: 1478 default:
1504 mlog(ML_ERROR, 1479 mlog(ML_ERROR,
1505 "Unrecognized mount option \"%s\" " 1480 "Unrecognized mount option \"%s\" "
@@ -1606,6 +1581,9 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root)
1606 if (osb->osb_dir_resv_level != osb->osb_resv_level) 1581 if (osb->osb_dir_resv_level != osb->osb_resv_level)
1607 seq_printf(s, ",dir_resv_level=%d", osb->osb_resv_level); 1582 seq_printf(s, ",dir_resv_level=%d", osb->osb_resv_level);
1608 1583
1584 if (opts & OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT)
1585 seq_printf(s, ",journal_async_commit");
1586
1609 return 0; 1587 return 0;
1610} 1588}
1611 1589
@@ -1768,6 +1746,8 @@ static void ocfs2_inode_init_once(void *data)
1768 ocfs2_lock_res_init_once(&oi->ip_inode_lockres); 1746 ocfs2_lock_res_init_once(&oi->ip_inode_lockres);
1769 ocfs2_lock_res_init_once(&oi->ip_open_lockres); 1747 ocfs2_lock_res_init_once(&oi->ip_open_lockres);
1770 1748
1749 init_waitqueue_head(&oi->append_dio_wq);
1750
1771 ocfs2_metadata_cache_init(INODE_CACHE(&oi->vfs_inode), 1751 ocfs2_metadata_cache_init(INODE_CACHE(&oi->vfs_inode),
1772 &ocfs2_inode_caching_ops); 1752 &ocfs2_inode_caching_ops);
1773 1753
@@ -2079,7 +2059,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
2079 sb->s_op = &ocfs2_sops; 2059 sb->s_op = &ocfs2_sops;
2080 sb->s_d_op = &ocfs2_dentry_ops; 2060 sb->s_d_op = &ocfs2_dentry_ops;
2081 sb->s_export_op = &ocfs2_export_ops; 2061 sb->s_export_op = &ocfs2_export_ops;
2082 sb->s_qcop = &ocfs2_quotactl_ops; 2062 sb->s_qcop = &dquot_quotactl_sysfile_ops;
2083 sb->dq_op = &ocfs2_quota_operations; 2063 sb->dq_op = &ocfs2_quota_operations;
2084 sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; 2064 sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
2085 sb->s_xattr = ocfs2_xattr_handlers; 2065 sb->s_xattr = ocfs2_xattr_handlers;
@@ -2475,6 +2455,15 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
2475 goto finally; 2455 goto finally;
2476 } 2456 }
2477 2457
2458 if (osb->s_mount_opt & OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT)
2459 jbd2_journal_set_features(osb->journal->j_journal,
2460 JBD2_FEATURE_COMPAT_CHECKSUM, 0,
2461 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
2462 else
2463 jbd2_journal_clear_features(osb->journal->j_journal,
2464 JBD2_FEATURE_COMPAT_CHECKSUM, 0,
2465 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
2466
2478 if (dirty) { 2467 if (dirty) {
2479 /* recover my local alloc if we didn't unmount cleanly. */ 2468 /* recover my local alloc if we didn't unmount cleanly. */
2480 status = ocfs2_begin_local_alloc_recovery(osb, 2469 status = ocfs2_begin_local_alloc_recovery(osb,
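
The journal_async_commit plumbing is a standard match-table option: recognize the token, set a bit in mount_opt, then branch on that bit at mount time to set or clear the JBD2 features. A sketch of the flow, printing where the kernel calls jbd2_journal_set_features()/jbd2_journal_clear_features():

/* Build: cc -o mopt mopt.c */
#include <stdio.h>
#include <string.h>

#define MOUNT_JOURNAL_ASYNC_COMMIT (1 << 15)

static const struct { const char *pattern; unsigned int flag; } tokens[] = {
    { "journal_async_commit", MOUNT_JOURNAL_ASYNC_COMMIT },
};

static int parse_option(const char *opt, unsigned int *mount_opt)
{
    for (size_t i = 0; i < sizeof(tokens) / sizeof(tokens[0]); i++) {
        if (!strcmp(opt, tokens[i].pattern)) {
            *mount_opt |= tokens[i].flag;
            return 0;
        }
    }
    return -1;  /* unrecognized option */
}

int main(void)
{
    unsigned int mount_opt = 0;

    parse_option("journal_async_commit", &mount_opt);
    if (mount_opt & MOUNT_JOURNAL_ASYNC_COMMIT)
        puts("would set JBD2 async-commit + checksum features");
    else
        puts("would clear JBD2 async-commit feature");
    return 0;
}
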
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 662f8dee149f..85b190dc132f 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -5334,16 +5334,6 @@ out:
5334 return ret; 5334 return ret;
5335} 5335}
5336 5336
5337static inline char *ocfs2_xattr_bucket_get_val(struct inode *inode,
5338 struct ocfs2_xattr_bucket *bucket,
5339 int offs)
5340{
5341 int block_off = offs >> inode->i_sb->s_blocksize_bits;
5342
5343 offs = offs % inode->i_sb->s_blocksize;
5344 return bucket_block(bucket, block_off) + offs;
5345}
5346
5347/* 5337/*
5348 * Truncate the specified xe_off entry in xattr bucket. 5338 * Truncate the specified xe_off entry in xattr bucket.
5349 * bucket is indicated by header_bh and len is the new length. 5339 * bucket is indicated by header_bh and len is the new length.
diff --git a/fs/open.c b/fs/open.c
index 813be037b412..33f9cbf2610b 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -667,11 +667,8 @@ int open_check_o_direct(struct file *f)
667{ 667{
668 /* NB: we're sure to have correct a_ops only after f_op->open */ 668 /* NB: we're sure to have correct a_ops only after f_op->open */
669 if (f->f_flags & O_DIRECT) { 669 if (f->f_flags & O_DIRECT) {
670 if (!f->f_mapping->a_ops || 670 if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO)
671 ((!f->f_mapping->a_ops->direct_IO) &&
672 (!f->f_mapping->a_ops->get_xip_mem))) {
673 return -EINVAL; 671 return -EINVAL;
674 }
675 } 672 }
676 return 0; 673 return 0;
677} 674}
@@ -971,8 +968,14 @@ struct file *file_open_name(struct filename *name, int flags, umode_t mode)
971 */ 968 */
972struct file *filp_open(const char *filename, int flags, umode_t mode) 969struct file *filp_open(const char *filename, int flags, umode_t mode)
973{ 970{
974 struct filename name = {.name = filename}; 971 struct filename *name = getname_kernel(filename);
975 return file_open_name(&name, flags, mode); 972 struct file *file = ERR_CAST(name);
973
974 if (!IS_ERR(name)) {
975 file = file_open_name(name, flags, mode);
976 putname(name);
977 }
978 return file;
976} 979}
977EXPORT_SYMBOL(filp_open); 980EXPORT_SYMBOL(filp_open);
978 981
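
The reworked filp_open() leans on the error-pointer idiom: getname_kernel() returns either a valid pointer or an errno encoded in the pointer value, and ERR_CAST forwards the failure unchanged. A user-space sketch of that encoding; the getname_kernel() here is a toy stand-in, not the kernel's:

/* Build: cc -o errptr errptr.c */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_ERRNO 4095

static void *err_ptr(long err) { return (void *)err; }
static long ptr_err(const void *p) { return (long)p; }
static int is_err(const void *p)
{
    /* the top MAX_ERRNO addresses are reserved for error codes */
    return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
}

static char *getname_kernel(const char *s)  /* toy stand-in */
{
    char *name = strdup(s);
    return name ? name : err_ptr(-ENOMEM);
}

int main(void)
{
    char *name = getname_kernel("/tmp/file");
    if (is_err(name)) {
        fprintf(stderr, "error %ld\n", -ptr_err(name));
        return 1;
    }
    printf("got name: %s\n", name);
    free(name);
    return 0;
}
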
diff --git a/fs/proc/array.c b/fs/proc/array.c
index bd117d065b82..1295a00ca316 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -81,6 +81,7 @@
81#include <linux/pid_namespace.h> 81#include <linux/pid_namespace.h>
82#include <linux/ptrace.h> 82#include <linux/ptrace.h>
83#include <linux/tracehook.h> 83#include <linux/tracehook.h>
84#include <linux/string_helpers.h>
84#include <linux/user_namespace.h> 85#include <linux/user_namespace.h>
85 86
86#include <asm/pgtable.h> 87#include <asm/pgtable.h>
@@ -89,39 +90,18 @@
89 90
90static inline void task_name(struct seq_file *m, struct task_struct *p) 91static inline void task_name(struct seq_file *m, struct task_struct *p)
91{ 92{
92 int i; 93 char *buf;
93 char *buf, *end;
94 char *name;
95 char tcomm[sizeof(p->comm)]; 94 char tcomm[sizeof(p->comm)];
96 95
97 get_task_comm(tcomm, p); 96 get_task_comm(tcomm, p);
98 97
99 seq_puts(m, "Name:\t"); 98 seq_puts(m, "Name:\t");
100 end = m->buf + m->size;
101 buf = m->buf + m->count; 99 buf = m->buf + m->count;
102 name = tcomm; 100
103 i = sizeof(tcomm); 101 /* Ignore error for now */
104 while (i && (buf < end)) { 102 string_escape_str(tcomm, &buf, m->size - m->count,
105 unsigned char c = *name; 103 ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
106 name++; 104
107 i--;
108 *buf = c;
109 if (!c)
110 break;
111 if (c == '\\') {
112 buf++;
113 if (buf < end)
114 *buf++ = c;
115 continue;
116 }
117 if (c == '\n') {
118 *buf++ = '\\';
119 if (buf < end)
120 *buf++ = 'n';
121 continue;
122 }
123 buf++;
124 }
125 m->count = buf - m->buf; 105 m->count = buf - m->buf;
126 seq_putc(m, '\n'); 106 seq_putc(m, '\n');
127} 107}
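
string_escape_str() replaces the old open-coded loop above. A minimal stand-in showing the same transformation, '\n' to "\n" and '\\' doubled, into a bounded buffer:

/* Build: cc -o escape escape.c */
#include <stdio.h>
#include <string.h>

static size_t escape_name(const char *src, char *dst, size_t size)
{
    size_t n = 0;

    for (; *src; src++) {
        if (*src == '\n' || *src == '\\') {
            if (n + 2 > size)
                break;              /* stop before overflowing */
            dst[n++] = '\\';
            dst[n++] = (*src == '\n') ? 'n' : '\\';
        } else {
            if (n + 1 > size)
                break;
            dst[n++] = *src;
        }
    }
    if (n < size)
        dst[n] = '\0';
    return n;
}

int main(void)
{
    char buf[64];

    escape_name("bad\nname\\here", buf, sizeof(buf));
    printf("Name:\t%s\n", buf);     /* Name:   bad\nname\\here */
    return 0;
}
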
@@ -336,12 +316,10 @@ static inline void task_context_switch_counts(struct seq_file *m,
336 316
337static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) 317static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
338{ 318{
339 seq_puts(m, "Cpus_allowed:\t"); 319 seq_printf(m, "Cpus_allowed:\t%*pb\n",
340 seq_cpumask(m, &task->cpus_allowed); 320 cpumask_pr_args(&task->cpus_allowed));
341 seq_putc(m, '\n'); 321 seq_printf(m, "Cpus_allowed_list:\t%*pbl\n",
342 seq_puts(m, "Cpus_allowed_list:\t"); 322 cpumask_pr_args(&task->cpus_allowed));
343 seq_cpumask_list(m, &task->cpus_allowed);
344 seq_putc(m, '\n');
345} 323}
346 324
347int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, 325int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
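
The %*pb and %*pbl specifiers print a bitmap as hex words and as a range list respectively. A sketch of the range-list form, with a plain integer standing in for struct cpumask:

/* Build: cc -o cpus cpus.c */
#include <stdio.h>

static void print_ranges(unsigned long long mask)
{
    const char *sep = "";

    for (int bit = 0; bit < 64; bit++) {
        if (!(mask & (1ULL << bit)))
            continue;
        int start = bit;
        /* extend the run while consecutive bits are set */
        while (bit + 1 < 64 && (mask & (1ULL << (bit + 1))))
            bit++;
        if (bit > start)
            printf("%s%d-%d", sep, start, bit);
        else
            printf("%s%d", sep, start);
        sep = ",";
    }
    putchar('\n');
}

int main(void)
{
    printf("Cpus_allowed:\t%llx\n", 0x10fULL);  /* like %*pb  */
    printf("Cpus_allowed_list:\t");
    print_ranges(0x10f);                        /* like %*pbl: 0-3,8 */
    return 0;
}
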
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 7fea13229f33..3309f59d421b 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -122,7 +122,7 @@ static int proc_getattr(struct vfsmount *mnt, struct dentry *dentry,
122 struct kstat *stat) 122 struct kstat *stat)
123{ 123{
124 struct inode *inode = dentry->d_inode; 124 struct inode *inode = dentry->d_inode;
125 struct proc_dir_entry *de = PROC_I(inode)->pde; 125 struct proc_dir_entry *de = PDE(inode);
126 if (de && de->nlink) 126 if (de && de->nlink)
127 set_nlink(inode, de->nlink); 127 set_nlink(inode, de->nlink);
128 128
@@ -350,29 +350,12 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
350 if (ret) 350 if (ret)
351 return ret; 351 return ret;
352 352
353 if (S_ISDIR(dp->mode)) {
354 dp->proc_fops = &proc_dir_operations;
355 dp->proc_iops = &proc_dir_inode_operations;
356 dir->nlink++;
357 } else if (S_ISLNK(dp->mode)) {
358 dp->proc_iops = &proc_link_inode_operations;
359 } else if (S_ISREG(dp->mode)) {
360 BUG_ON(dp->proc_fops == NULL);
361 dp->proc_iops = &proc_file_inode_operations;
362 } else {
363 WARN_ON(1);
364 proc_free_inum(dp->low_ino);
365 return -EINVAL;
366 }
367
368 spin_lock(&proc_subdir_lock); 353 spin_lock(&proc_subdir_lock);
369 dp->parent = dir; 354 dp->parent = dir;
370 if (pde_subdir_insert(dir, dp) == false) { 355 if (pde_subdir_insert(dir, dp) == false) {
371 WARN(1, "proc_dir_entry '%s/%s' already registered\n", 356 WARN(1, "proc_dir_entry '%s/%s' already registered\n",
372 dir->name, dp->name); 357 dir->name, dp->name);
373 spin_unlock(&proc_subdir_lock); 358 spin_unlock(&proc_subdir_lock);
374 if (S_ISDIR(dp->mode))
375 dir->nlink--;
376 proc_free_inum(dp->low_ino); 359 proc_free_inum(dp->low_ino);
377 return -EEXIST; 360 return -EEXIST;
378 } 361 }
@@ -431,6 +414,7 @@ struct proc_dir_entry *proc_symlink(const char *name,
431 ent->data = kmalloc((ent->size=strlen(dest))+1, GFP_KERNEL); 414 ent->data = kmalloc((ent->size=strlen(dest))+1, GFP_KERNEL);
432 if (ent->data) { 415 if (ent->data) {
433 strcpy((char*)ent->data,dest); 416 strcpy((char*)ent->data,dest);
417 ent->proc_iops = &proc_link_inode_operations;
434 if (proc_register(parent, ent) < 0) { 418 if (proc_register(parent, ent) < 0) {
435 kfree(ent->data); 419 kfree(ent->data);
436 kfree(ent); 420 kfree(ent);
@@ -456,8 +440,12 @@ struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode,
456 ent = __proc_create(&parent, name, S_IFDIR | mode, 2); 440 ent = __proc_create(&parent, name, S_IFDIR | mode, 2);
457 if (ent) { 441 if (ent) {
458 ent->data = data; 442 ent->data = data;
443 ent->proc_fops = &proc_dir_operations;
444 ent->proc_iops = &proc_dir_inode_operations;
445 parent->nlink++;
459 if (proc_register(parent, ent) < 0) { 446 if (proc_register(parent, ent) < 0) {
460 kfree(ent); 447 kfree(ent);
448 parent->nlink--;
461 ent = NULL; 449 ent = NULL;
462 } 450 }
463 } 451 }
@@ -493,6 +481,8 @@ struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
493 return NULL; 481 return NULL;
494 } 482 }
495 483
484 BUG_ON(proc_fops == NULL);
485
496 if ((mode & S_IALLUGO) == 0) 486 if ((mode & S_IALLUGO) == 0)
497 mode |= S_IRUGO; 487 mode |= S_IRUGO;
498 pde = __proc_create(&parent, name, mode, 1); 488 pde = __proc_create(&parent, name, mode, 1);
@@ -500,6 +490,7 @@ struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
500 goto out; 490 goto out;
501 pde->proc_fops = proc_fops; 491 pde->proc_fops = proc_fops;
502 pde->data = data; 492 pde->data = data;
493 pde->proc_iops = &proc_file_inode_operations;
503 if (proc_register(parent, pde) < 0) 494 if (proc_register(parent, pde) < 0)
504 goto out_free; 495 goto out_free;
505 return pde; 496 return pde;
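
The proc_register() rework moves the proc_fops/proc_iops assignment to the creation sites, so an entry is fully initialized before it is published in the subdir tree and a concurrent lookup can never see it half-built. The shape of that rule, with a toy registry in place of the pde subdir tree:

/* Build: cc -o publish publish.c */
#include <stdio.h>

struct entry {
    const char *name;
    const void *fops;   /* must be valid before publishing */
    struct entry *next;
};

static struct entry *tree;  /* shared structure (toy registry) */

static int reg(struct entry *e)
{
    if (!e->fops)       /* mirrors BUG_ON(proc_fops == NULL) */
        return -1;
    e->next = tree;     /* publish: visible from here on */
    tree = e;
    return 0;
}

int main(void)
{
    static const int dummy_fops;
    struct entry e = { .name = "stat" };

    e.fops = &dummy_fops;   /* initialize first... */
    if (reg(&e) == 0)       /* ...then publish */
        printf("registered %s\n", e.name);
    return 0;
}
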
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 8420a2f80811..13a50a32652d 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -40,7 +40,7 @@ static void proc_evict_inode(struct inode *inode)
40 put_pid(PROC_I(inode)->pid); 40 put_pid(PROC_I(inode)->pid);
41 41
42 /* Let go of any associated proc directory entry */ 42 /* Let go of any associated proc directory entry */
43 de = PROC_I(inode)->pde; 43 de = PDE(inode);
44 if (de) 44 if (de)
45 pde_put(de); 45 pde_put(de);
46 head = PROC_I(inode)->sysctl; 46 head = PROC_I(inode)->sysctl;
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 1e3187da1fed..7eee2d8b97d9 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -5,6 +5,7 @@
5#include <linux/ksm.h> 5#include <linux/ksm.h>
6#include <linux/mm.h> 6#include <linux/mm.h>
7#include <linux/mmzone.h> 7#include <linux/mmzone.h>
8#include <linux/huge_mm.h>
8#include <linux/proc_fs.h> 9#include <linux/proc_fs.h>
9#include <linux/seq_file.h> 10#include <linux/seq_file.h>
10#include <linux/hugetlb.h> 11#include <linux/hugetlb.h>
@@ -121,9 +122,18 @@ u64 stable_page_flags(struct page *page)
121 * just checks PG_head/PG_tail, so we need to check PageLRU/PageAnon 122 * just checks PG_head/PG_tail, so we need to check PageLRU/PageAnon
122 * to make sure a given page is a thp, not a non-huge compound page. 123 * to make sure a given page is a thp, not a non-huge compound page.
123 */ 124 */
124 else if (PageTransCompound(page) && (PageLRU(compound_head(page)) || 125 else if (PageTransCompound(page)) {
125 PageAnon(compound_head(page)))) 126 struct page *head = compound_head(page);
126 u |= 1 << KPF_THP; 127
128 if (PageLRU(head) || PageAnon(head))
129 u |= 1 << KPF_THP;
130 else if (is_huge_zero_page(head)) {
131 u |= 1 << KPF_ZERO_PAGE;
132 u |= 1 << KPF_THP;
133 }
134 } else if (is_zero_pfn(page_to_pfn(page)))
135 u |= 1 << KPF_ZERO_PAGE;
136
127 137
128 /* 138 /*
129 * Caveats on high order pages: page->_count will only be set 139 * Caveats on high order pages: page->_count will only be set
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 246eae84b13b..956b75d61809 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -21,7 +21,7 @@
21 21
22void task_mem(struct seq_file *m, struct mm_struct *mm) 22void task_mem(struct seq_file *m, struct mm_struct *mm)
23{ 23{
24 unsigned long data, text, lib, swap; 24 unsigned long data, text, lib, swap, ptes, pmds;
25 unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; 25 unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
26 26
27 /* 27 /*
@@ -42,6 +42,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
42 text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; 42 text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
43 lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; 43 lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
44 swap = get_mm_counter(mm, MM_SWAPENTS); 44 swap = get_mm_counter(mm, MM_SWAPENTS);
45 ptes = PTRS_PER_PTE * sizeof(pte_t) * atomic_long_read(&mm->nr_ptes);
46 pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm);
45 seq_printf(m, 47 seq_printf(m,
46 "VmPeak:\t%8lu kB\n" 48 "VmPeak:\t%8lu kB\n"
47 "VmSize:\t%8lu kB\n" 49 "VmSize:\t%8lu kB\n"
@@ -54,6 +56,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
54 "VmExe:\t%8lu kB\n" 56 "VmExe:\t%8lu kB\n"
55 "VmLib:\t%8lu kB\n" 57 "VmLib:\t%8lu kB\n"
56 "VmPTE:\t%8lu kB\n" 58 "VmPTE:\t%8lu kB\n"
59 "VmPMD:\t%8lu kB\n"
57 "VmSwap:\t%8lu kB\n", 60 "VmSwap:\t%8lu kB\n",
58 hiwater_vm << (PAGE_SHIFT-10), 61 hiwater_vm << (PAGE_SHIFT-10),
59 total_vm << (PAGE_SHIFT-10), 62 total_vm << (PAGE_SHIFT-10),
@@ -63,8 +66,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
63 total_rss << (PAGE_SHIFT-10), 66 total_rss << (PAGE_SHIFT-10),
64 data << (PAGE_SHIFT-10), 67 data << (PAGE_SHIFT-10),
65 mm->stack_vm << (PAGE_SHIFT-10), text, lib, 68 mm->stack_vm << (PAGE_SHIFT-10), text, lib,
66 (PTRS_PER_PTE * sizeof(pte_t) * 69 ptes >> 10,
67 atomic_long_read(&mm->nr_ptes)) >> 10, 70 pmds >> 10,
68 swap << (PAGE_SHIFT-10)); 71 swap << (PAGE_SHIFT-10));
69} 72}
70 73
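
The new VmPTE/VmPMD lines are straight arithmetic: entries per table, times entry size, times table count, shifted into kB. Worked numbers below assume x86-64, where both levels hold 512 entries of 8 bytes:

/* Build: cc -o vmpte vmpte.c */
#include <stdio.h>

#define PTRS_PER_TABLE 512
#define ENTRY_SIZE     8    /* sizeof(pte_t) == sizeof(pmd_t) here */

int main(void)
{
    unsigned long nr_ptes = 40;     /* sample counter values */
    unsigned long nr_pmds = 3;

    unsigned long ptes = PTRS_PER_TABLE * ENTRY_SIZE * nr_ptes;
    unsigned long pmds = PTRS_PER_TABLE * ENTRY_SIZE * nr_pmds;

    printf("VmPTE:\t%8lu kB\n", ptes >> 10);    /* 160 kB */
    printf("VmPMD:\t%8lu kB\n", pmds >> 10);    /*  12 kB */
    return 0;
}
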
@@ -433,7 +436,6 @@ const struct file_operations proc_tid_maps_operations = {
433 436
434#ifdef CONFIG_PROC_PAGE_MONITOR 437#ifdef CONFIG_PROC_PAGE_MONITOR
435struct mem_size_stats { 438struct mem_size_stats {
436 struct vm_area_struct *vma;
437 unsigned long resident; 439 unsigned long resident;
438 unsigned long shared_clean; 440 unsigned long shared_clean;
439 unsigned long shared_dirty; 441 unsigned long shared_dirty;
@@ -443,7 +445,6 @@ struct mem_size_stats {
443 unsigned long anonymous; 445 unsigned long anonymous;
444 unsigned long anonymous_thp; 446 unsigned long anonymous_thp;
445 unsigned long swap; 447 unsigned long swap;
446 unsigned long nonlinear;
447 u64 pss; 448 u64 pss;
448}; 449};
449 450
@@ -483,8 +484,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
483 struct mm_walk *walk) 484 struct mm_walk *walk)
484{ 485{
485 struct mem_size_stats *mss = walk->private; 486 struct mem_size_stats *mss = walk->private;
486 struct vm_area_struct *vma = mss->vma; 487 struct vm_area_struct *vma = walk->vma;
487 pgoff_t pgoff = linear_page_index(vma, addr);
488 struct page *page = NULL; 488 struct page *page = NULL;
489 489
490 if (pte_present(*pte)) { 490 if (pte_present(*pte)) {
@@ -496,17 +496,10 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
496 mss->swap += PAGE_SIZE; 496 mss->swap += PAGE_SIZE;
497 else if (is_migration_entry(swpent)) 497 else if (is_migration_entry(swpent))
498 page = migration_entry_to_page(swpent); 498 page = migration_entry_to_page(swpent);
499 } else if (pte_file(*pte)) {
500 if (pte_to_pgoff(*pte) != pgoff)
501 mss->nonlinear += PAGE_SIZE;
502 } 499 }
503 500
504 if (!page) 501 if (!page)
505 return; 502 return;
506
507 if (page->index != pgoff)
508 mss->nonlinear += PAGE_SIZE;
509
510 smaps_account(mss, page, PAGE_SIZE, pte_young(*pte), pte_dirty(*pte)); 503 smaps_account(mss, page, PAGE_SIZE, pte_young(*pte), pte_dirty(*pte));
511} 504}
512 505
@@ -515,7 +508,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
515 struct mm_walk *walk) 508 struct mm_walk *walk)
516{ 509{
517 struct mem_size_stats *mss = walk->private; 510 struct mem_size_stats *mss = walk->private;
518 struct vm_area_struct *vma = mss->vma; 511 struct vm_area_struct *vma = walk->vma;
519 struct page *page; 512 struct page *page;
520 513
521 /* FOLL_DUMP will return -EFAULT on huge zero page */ 514 /* FOLL_DUMP will return -EFAULT on huge zero page */
@@ -536,8 +529,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
536static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, 529static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
537 struct mm_walk *walk) 530 struct mm_walk *walk)
538{ 531{
539 struct mem_size_stats *mss = walk->private; 532 struct vm_area_struct *vma = walk->vma;
540 struct vm_area_struct *vma = mss->vma;
541 pte_t *pte; 533 pte_t *pte;
542 spinlock_t *ptl; 534 spinlock_t *ptl;
543 535
@@ -596,7 +588,6 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
596 [ilog2(VM_ACCOUNT)] = "ac", 588 [ilog2(VM_ACCOUNT)] = "ac",
597 [ilog2(VM_NORESERVE)] = "nr", 589 [ilog2(VM_NORESERVE)] = "nr",
598 [ilog2(VM_HUGETLB)] = "ht", 590 [ilog2(VM_HUGETLB)] = "ht",
599 [ilog2(VM_NONLINEAR)] = "nl",
600 [ilog2(VM_ARCH_1)] = "ar", 591 [ilog2(VM_ARCH_1)] = "ar",
601 [ilog2(VM_DONTDUMP)] = "dd", 592 [ilog2(VM_DONTDUMP)] = "dd",
602#ifdef CONFIG_MEM_SOFT_DIRTY 593#ifdef CONFIG_MEM_SOFT_DIRTY
@@ -630,10 +621,8 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
630 }; 621 };
631 622
632 memset(&mss, 0, sizeof mss); 623 memset(&mss, 0, sizeof mss);
633 mss.vma = vma;
634 /* mmap_sem is held in m_start */ 624 /* mmap_sem is held in m_start */
635 if (vma->vm_mm && !is_vm_hugetlb_page(vma)) 625 walk_page_vma(vma, &smaps_walk);
636 walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk);
637 626
638 show_map_vma(m, vma, is_pid); 627 show_map_vma(m, vma, is_pid);
639 628
@@ -668,10 +657,6 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
668 (vma->vm_flags & VM_LOCKED) ? 657 (vma->vm_flags & VM_LOCKED) ?
669 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0); 658 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
670 659
671 if (vma->vm_flags & VM_NONLINEAR)
672 seq_printf(m, "Nonlinear: %8lu kB\n",
673 mss.nonlinear >> 10);
674
675 show_smap_vma_flags(m, vma); 660 show_smap_vma_flags(m, vma);
676 m_cache_vma(m, vma); 661 m_cache_vma(m, vma);
677 return 0; 662 return 0;
@@ -747,18 +732,18 @@ enum clear_refs_types {
747 CLEAR_REFS_ANON, 732 CLEAR_REFS_ANON,
748 CLEAR_REFS_MAPPED, 733 CLEAR_REFS_MAPPED,
749 CLEAR_REFS_SOFT_DIRTY, 734 CLEAR_REFS_SOFT_DIRTY,
735 CLEAR_REFS_MM_HIWATER_RSS,
750 CLEAR_REFS_LAST, 736 CLEAR_REFS_LAST,
751}; 737};
752 738
753struct clear_refs_private { 739struct clear_refs_private {
754 struct vm_area_struct *vma;
755 enum clear_refs_types type; 740 enum clear_refs_types type;
756}; 741};
757 742
743#ifdef CONFIG_MEM_SOFT_DIRTY
758static inline void clear_soft_dirty(struct vm_area_struct *vma, 744static inline void clear_soft_dirty(struct vm_area_struct *vma,
759 unsigned long addr, pte_t *pte) 745 unsigned long addr, pte_t *pte)
760{ 746{
761#ifdef CONFIG_MEM_SOFT_DIRTY
762 /* 747 /*
763 * The soft-dirty tracker uses #PF-s to catch writes 748 * The soft-dirty tracker uses #PF-s to catch writes
764 * to pages, so write-protect the pte as well. See the 749 * to pages, so write-protect the pte as well. See the
@@ -772,24 +757,63 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
772 ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); 757 ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY);
773 } else if (is_swap_pte(ptent)) { 758 } else if (is_swap_pte(ptent)) {
774 ptent = pte_swp_clear_soft_dirty(ptent); 759 ptent = pte_swp_clear_soft_dirty(ptent);
775 } else if (pte_file(ptent)) {
776 ptent = pte_file_clear_soft_dirty(ptent);
777 } 760 }
778 761
779 set_pte_at(vma->vm_mm, addr, pte, ptent); 762 set_pte_at(vma->vm_mm, addr, pte, ptent);
780#endif
781} 763}
782 764
765static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
766 unsigned long addr, pmd_t *pmdp)
767{
768 pmd_t pmd = *pmdp;
769
770 pmd = pmd_wrprotect(pmd);
771 pmd = pmd_clear_flags(pmd, _PAGE_SOFT_DIRTY);
772
773 if (vma->vm_flags & VM_SOFTDIRTY)
774 vma->vm_flags &= ~VM_SOFTDIRTY;
775
776 set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
777}
778
779#else
780
781static inline void clear_soft_dirty(struct vm_area_struct *vma,
782 unsigned long addr, pte_t *pte)
783{
784}
785
786static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
787 unsigned long addr, pmd_t *pmdp)
788{
789}
790#endif
791
783static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, 792static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
784 unsigned long end, struct mm_walk *walk) 793 unsigned long end, struct mm_walk *walk)
785{ 794{
786 struct clear_refs_private *cp = walk->private; 795 struct clear_refs_private *cp = walk->private;
787 struct vm_area_struct *vma = cp->vma; 796 struct vm_area_struct *vma = walk->vma;
788 pte_t *pte, ptent; 797 pte_t *pte, ptent;
789 spinlock_t *ptl; 798 spinlock_t *ptl;
790 struct page *page; 799 struct page *page;
791 800
792 split_huge_page_pmd(vma, addr, pmd); 801 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
802 if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
803 clear_soft_dirty_pmd(vma, addr, pmd);
804 goto out;
805 }
806
807 page = pmd_page(*pmd);
808
809 /* Clear accessed and referenced bits. */
810 pmdp_test_and_clear_young(vma, addr, pmd);
811 ClearPageReferenced(page);
812out:
813 spin_unlock(ptl);
814 return 0;
815 }
816
793 if (pmd_trans_unstable(pmd)) 817 if (pmd_trans_unstable(pmd))
794 return 0; 818 return 0;
795 819
@@ -818,6 +842,28 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
818 return 0; 842 return 0;
819} 843}
820 844
845static int clear_refs_test_walk(unsigned long start, unsigned long end,
846 struct mm_walk *walk)
847{
848 struct clear_refs_private *cp = walk->private;
849 struct vm_area_struct *vma = walk->vma;
850
851 if (vma->vm_flags & VM_PFNMAP)
852 return 1;
853
854 /*
855 * Writing 1 to /proc/pid/clear_refs affects all pages.
856 * Writing 2 to /proc/pid/clear_refs only affects anonymous pages.
857 * Writing 3 to /proc/pid/clear_refs only affects file mapped pages.
858 * Writing 4 to /proc/pid/clear_refs affects all pages.
859 */
860 if (cp->type == CLEAR_REFS_ANON && vma->vm_file)
861 return 1;
862 if (cp->type == CLEAR_REFS_MAPPED && !vma->vm_file)
863 return 1;
864 return 0;
865}
866
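
clear_refs_test_walk() follows the walker's test_walk protocol: return non-zero to skip the current VMA, zero to descend into it. A toy walker with the same contract, using a region list in place of real VMAs:

/* Build: cc -o walker walker.c */
#include <stdio.h>

struct region { const char *name; int file_backed; };

static int test_walk(const struct region *r, int anon_only)
{
    if (anon_only && r->file_backed)
        return 1;   /* skip, like CLEAR_REFS_ANON on file VMAs */
    return 0;       /* walk this region */
}

static void walk(const struct region *r, int n, int anon_only)
{
    for (int i = 0; i < n; i++) {
        if (test_walk(&r[i], anon_only))
            continue;
        printf("visiting %s\n", r[i].name);
    }
}

int main(void)
{
    struct region regions[] = {
        { "heap",  0 },
        { "libc",  1 },
        { "stack", 0 },
    };

    walk(regions, 3, 1);    /* visits heap and stack only */
    return 0;
}
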
821static ssize_t clear_refs_write(struct file *file, const char __user *buf, 867static ssize_t clear_refs_write(struct file *file, const char __user *buf,
822 size_t count, loff_t *ppos) 868 size_t count, loff_t *ppos)
823{ 869{
@@ -858,9 +904,22 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
858 }; 904 };
859 struct mm_walk clear_refs_walk = { 905 struct mm_walk clear_refs_walk = {
860 .pmd_entry = clear_refs_pte_range, 906 .pmd_entry = clear_refs_pte_range,
907 .test_walk = clear_refs_test_walk,
861 .mm = mm, 908 .mm = mm,
862 .private = &cp, 909 .private = &cp,
863 }; 910 };
911
912 if (type == CLEAR_REFS_MM_HIWATER_RSS) {
913 /*
914 * Writing 5 to /proc/pid/clear_refs resets the peak
915 * resident set size to this mm's current rss value.
916 */
917 down_write(&mm->mmap_sem);
918 reset_mm_hiwater_rss(mm);
919 up_write(&mm->mmap_sem);
920 goto out_mm;
921 }
922
864 down_read(&mm->mmap_sem); 923 down_read(&mm->mmap_sem);
865 if (type == CLEAR_REFS_SOFT_DIRTY) { 924 if (type == CLEAR_REFS_SOFT_DIRTY) {
866 for (vma = mm->mmap; vma; vma = vma->vm_next) { 925 for (vma = mm->mmap; vma; vma = vma->vm_next) {
@@ -877,32 +936,12 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
877 } 936 }
878 mmu_notifier_invalidate_range_start(mm, 0, -1); 937 mmu_notifier_invalidate_range_start(mm, 0, -1);
879 } 938 }
880 for (vma = mm->mmap; vma; vma = vma->vm_next) { 939 walk_page_range(0, ~0UL, &clear_refs_walk);
881 cp.vma = vma;
882 if (is_vm_hugetlb_page(vma))
883 continue;
884 /*
885 * Writing 1 to /proc/pid/clear_refs affects all pages.
886 *
887 * Writing 2 to /proc/pid/clear_refs only affects
888 * Anonymous pages.
889 *
890 * Writing 3 to /proc/pid/clear_refs only affects file
891 * mapped pages.
892 *
893 * Writing 4 to /proc/pid/clear_refs affects all pages.
894 */
895 if (type == CLEAR_REFS_ANON && vma->vm_file)
896 continue;
897 if (type == CLEAR_REFS_MAPPED && !vma->vm_file)
898 continue;
899 walk_page_range(vma->vm_start, vma->vm_end,
900 &clear_refs_walk);
901 }
902 if (type == CLEAR_REFS_SOFT_DIRTY) 940 if (type == CLEAR_REFS_SOFT_DIRTY)
903 mmu_notifier_invalidate_range_end(mm, 0, -1); 941 mmu_notifier_invalidate_range_end(mm, 0, -1);
904 flush_tlb_mm(mm); 942 flush_tlb_mm(mm);
905 up_read(&mm->mmap_sem); 943 up_read(&mm->mmap_sem);
944out_mm:
906 mmput(mm); 945 mmput(mm);
907 } 946 }
908 put_task_struct(task); 947 put_task_struct(task);
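The CLEAR_REFS_MM_HIWATER_RSS branch takes mmap_sem for writing so the watermark reset cannot race with concurrent mappings. A user-space sketch of the new value, assuming a kernel with the extension shown above:

/* Reset the peak-RSS watermark (value 5); the effect is visible as
 * a lowered VmHWM line in /proc/self/status. */
#include <stdio.h>

static void reset_hiwater_rss(void)
{
	FILE *f = fopen("/proc/self/clear_refs", "w");

	if (!f)
		return;
	fputs("5", f);	/* resets mm->hiwater_rss to the current RSS */
	fclose(f);
}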
@@ -1066,15 +1105,13 @@ static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemap
1066static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, 1105static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
1067 struct mm_walk *walk) 1106 struct mm_walk *walk)
1068{ 1107{
1069 struct vm_area_struct *vma; 1108 struct vm_area_struct *vma = walk->vma;
1070 struct pagemapread *pm = walk->private; 1109 struct pagemapread *pm = walk->private;
1071 spinlock_t *ptl; 1110 spinlock_t *ptl;
1072 pte_t *pte; 1111 pte_t *pte, *orig_pte;
1073 int err = 0; 1112 int err = 0;
1074 1113
1075 /* find the first VMA at or above 'addr' */ 1114 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
1076 vma = find_vma(walk->mm, addr);
1077 if (vma && pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
1078 int pmd_flags2; 1115 int pmd_flags2;
1079 1116
1080 if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd)) 1117 if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd))
@@ -1100,51 +1137,20 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
1100 if (pmd_trans_unstable(pmd)) 1137 if (pmd_trans_unstable(pmd))
1101 return 0; 1138 return 0;
1102 1139
1103 while (1) { 1140 /*
1104 /* End of address space hole, which we mark as non-present. */ 1141 * We can assume that @vma always points to a valid one and @end never
1105 unsigned long hole_end; 1142 * goes beyond vma->vm_end.
1106 1143 */
1107 if (vma) 1144 orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
1108 hole_end = min(end, vma->vm_start); 1145 for (; addr < end; pte++, addr += PAGE_SIZE) {
1109 else 1146 pagemap_entry_t pme;
1110 hole_end = end;
1111
1112 for (; addr < hole_end; addr += PAGE_SIZE) {
1113 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
1114
1115 err = add_to_pagemap(addr, &pme, pm);
1116 if (err)
1117 return err;
1118 }
1119
1120 if (!vma || vma->vm_start >= end)
1121 break;
1122 /*
1123 * We can't possibly be in a hugetlb VMA. In general,
1124 * for a mm_walk with a pmd_entry and a hugetlb_entry,
1125 * the pmd_entry can only be called on addresses in a
1126 * hugetlb if the walk starts in a non-hugetlb VMA and
1127 * spans a hugepage VMA. Since pagemap_read walks are
1128 * PMD-sized and PMD-aligned, this will never be true.
1129 */
1130 BUG_ON(is_vm_hugetlb_page(vma));
1131
1132 /* Addresses in the VMA. */
1133 for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
1134 pagemap_entry_t pme;
1135 pte = pte_offset_map(pmd, addr);
1136 pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);
1137 pte_unmap(pte);
1138 err = add_to_pagemap(addr, &pme, pm);
1139 if (err)
1140 return err;
1141 }
1142 1147
1143 if (addr == end) 1148 pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);
1149 err = add_to_pagemap(addr, &pme, pm);
1150 if (err)
1144 break; 1151 break;
1145
1146 vma = find_vma(walk->mm, addr);
1147 } 1152 }
1153 pte_unmap_unlock(orig_pte, ptl);
1148 1154
1149 cond_resched(); 1155 cond_resched();
1150 1156
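With the per-VMA walk, pagemap_pte_range no longer has to synthesize "hole" entries itself, but the file format users see is unchanged: one little-endian 64-bit record per page. A user-space sketch of reading it:

/* Fetch the pagemap entry for one virtual address.  Each page has a
 * 64-bit record at offset (addr / PAGE_SIZE) * 8; bit 63 is
 * "present" and, on soft-dirty kernels, bit 55 is "soft dirty". */
#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>

static int pagemap_entry(void *addr, uint64_t *ent)
{
	int fd = open("/proc/self/pagemap", O_RDONLY);
	off_t off = ((uintptr_t)addr / sysconf(_SC_PAGESIZE)) * 8;
	ssize_t n;

	if (fd < 0)
		return -1;
	n = pread(fd, ent, sizeof(*ent), off);
	close(fd);
	return n == (ssize_t)sizeof(*ent) ? 0 : -1;
}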
@@ -1170,15 +1176,12 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
1170 struct mm_walk *walk) 1176 struct mm_walk *walk)
1171{ 1177{
1172 struct pagemapread *pm = walk->private; 1178 struct pagemapread *pm = walk->private;
1173 struct vm_area_struct *vma; 1179 struct vm_area_struct *vma = walk->vma;
1174 int err = 0; 1180 int err = 0;
1175 int flags2; 1181 int flags2;
1176 pagemap_entry_t pme; 1182 pagemap_entry_t pme;
1177 1183
1178 vma = find_vma(walk->mm, addr); 1184 if (vma->vm_flags & VM_SOFTDIRTY)
1179 WARN_ON_ONCE(!vma);
1180
1181 if (vma && (vma->vm_flags & VM_SOFTDIRTY))
1182 flags2 = __PM_SOFT_DIRTY; 1185 flags2 = __PM_SOFT_DIRTY;
1183 else 1186 else
1184 flags2 = 0; 1187 flags2 = 0;
@@ -1338,7 +1341,6 @@ const struct file_operations proc_pagemap_operations = {
1338#ifdef CONFIG_NUMA 1341#ifdef CONFIG_NUMA
1339 1342
1340struct numa_maps { 1343struct numa_maps {
1341 struct vm_area_struct *vma;
1342 unsigned long pages; 1344 unsigned long pages;
1343 unsigned long anon; 1345 unsigned long anon;
1344 unsigned long active; 1346 unsigned long active;
@@ -1407,18 +1409,17 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
1407static int gather_pte_stats(pmd_t *pmd, unsigned long addr, 1409static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
1408 unsigned long end, struct mm_walk *walk) 1410 unsigned long end, struct mm_walk *walk)
1409{ 1411{
1410 struct numa_maps *md; 1412 struct numa_maps *md = walk->private;
1413 struct vm_area_struct *vma = walk->vma;
1411 spinlock_t *ptl; 1414 spinlock_t *ptl;
1412 pte_t *orig_pte; 1415 pte_t *orig_pte;
1413 pte_t *pte; 1416 pte_t *pte;
1414 1417
1415 md = walk->private; 1418 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
1416
1417 if (pmd_trans_huge_lock(pmd, md->vma, &ptl) == 1) {
1418 pte_t huge_pte = *(pte_t *)pmd; 1419 pte_t huge_pte = *(pte_t *)pmd;
1419 struct page *page; 1420 struct page *page;
1420 1421
1421 page = can_gather_numa_stats(huge_pte, md->vma, addr); 1422 page = can_gather_numa_stats(huge_pte, vma, addr);
1422 if (page) 1423 if (page)
1423 gather_stats(page, md, pte_dirty(huge_pte), 1424 gather_stats(page, md, pte_dirty(huge_pte),
1424 HPAGE_PMD_SIZE/PAGE_SIZE); 1425 HPAGE_PMD_SIZE/PAGE_SIZE);
@@ -1430,7 +1431,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
1430 return 0; 1431 return 0;
1431 orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); 1432 orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
1432 do { 1433 do {
1433 struct page *page = can_gather_numa_stats(*pte, md->vma, addr); 1434 struct page *page = can_gather_numa_stats(*pte, vma, addr);
1434 if (!page) 1435 if (!page)
1435 continue; 1436 continue;
1436 gather_stats(page, md, pte_dirty(*pte), 1); 1437 gather_stats(page, md, pte_dirty(*pte), 1);
@@ -1440,7 +1441,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
1440 return 0; 1441 return 0;
1441} 1442}
1442#ifdef CONFIG_HUGETLB_PAGE 1443#ifdef CONFIG_HUGETLB_PAGE
1443static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, 1444static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
1444 unsigned long addr, unsigned long end, struct mm_walk *walk) 1445 unsigned long addr, unsigned long end, struct mm_walk *walk)
1445{ 1446{
1446 struct numa_maps *md; 1447 struct numa_maps *md;
@@ -1459,7 +1460,7 @@ static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
1459} 1460}
1460 1461
1461#else 1462#else
1462static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, 1463static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
1463 unsigned long addr, unsigned long end, struct mm_walk *walk) 1464 unsigned long addr, unsigned long end, struct mm_walk *walk)
1464{ 1465{
1465 return 0; 1466 return 0;
@@ -1477,7 +1478,12 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1477 struct numa_maps *md = &numa_priv->md; 1478 struct numa_maps *md = &numa_priv->md;
1478 struct file *file = vma->vm_file; 1479 struct file *file = vma->vm_file;
1479 struct mm_struct *mm = vma->vm_mm; 1480 struct mm_struct *mm = vma->vm_mm;
1480 struct mm_walk walk = {}; 1481 struct mm_walk walk = {
1482 .hugetlb_entry = gather_hugetlb_stats,
1483 .pmd_entry = gather_pte_stats,
1484 .private = md,
1485 .mm = mm,
1486 };
1481 struct mempolicy *pol; 1487 struct mempolicy *pol;
1482 char buffer[64]; 1488 char buffer[64];
1483 int nid; 1489 int nid;
@@ -1488,13 +1494,6 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1488 /* Ensure we start with an empty set of numa_maps statistics. */ 1494 /* Ensure we start with an empty set of numa_maps statistics. */
1489 memset(md, 0, sizeof(*md)); 1495 memset(md, 0, sizeof(*md));
1490 1496
1491 md->vma = vma;
1492
1493 walk.hugetlb_entry = gather_hugetbl_stats;
1494 walk.pmd_entry = gather_pte_stats;
1495 walk.private = md;
1496 walk.mm = mm;
1497
1498 pol = __get_vma_policy(vma, vma->vm_start); 1497 pol = __get_vma_policy(vma, vma->vm_start);
1499 if (pol) { 1498 if (pol) {
1500 mpol_to_str(buffer, sizeof(buffer), pol); 1499 mpol_to_str(buffer, sizeof(buffer), pol);
@@ -1528,7 +1527,8 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1528 if (is_vm_hugetlb_page(vma)) 1527 if (is_vm_hugetlb_page(vma))
1529 seq_puts(m, " huge"); 1528 seq_puts(m, " huge");
1530 1529
1531 walk_page_range(vma->vm_start, vma->vm_end, &walk); 1530 /* mmap_sem is held by m_start */
1531 walk_page_vma(vma, &walk);
1532 1532
1533 if (!md->pages) 1533 if (!md->pages)
1534 goto out; 1534 goto out;
@@ -1557,6 +1557,8 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1557 for_each_node_state(nid, N_MEMORY) 1557 for_each_node_state(nid, N_MEMORY)
1558 if (md->node[nid]) 1558 if (md->node[nid])
1559 seq_printf(m, " N%d=%lu", nid, md->node[nid]); 1559 seq_printf(m, " N%d=%lu", nid, md->node[nid]);
1560
1561 seq_printf(m, " kernelpagesize_kB=%lu", vma_kernel_pagesize(vma) >> 10);
1560out: 1562out:
1561 seq_putc(m, '\n'); 1563 seq_putc(m, '\n');
1562 m_cache_vma(m, vma); 1564 m_cache_vma(m, vma);
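The common thread in the task_mmu.c hunks is that walkers now take the VMA from walk->vma instead of smuggling it through walk->private or re-deriving it with find_vma(). A kernel-style sketch of the pattern, assuming the mm_walk API of this series (the function names are illustrative, not part of the patch):

static int example_pmd_entry(pmd_t *pmd, unsigned long addr,
			     unsigned long end, struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;	/* set by the core walker */

	/* ... per-PMD work; no find_vma() and no private vma field ... */
	return 0;
}

static void example_walk(struct vm_area_struct *vma)
{
	struct mm_walk walk = {
		.pmd_entry = example_pmd_entry,
		.mm = vma->vm_mm,
	};

	/* caller holds mmap_sem, as the show_numa_map() comment notes */
	walk_page_vma(vma, &walk);
}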
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index a90d6d354199..4e61388ec03d 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -546,8 +546,8 @@ static int __init update_note_header_size_elf64(const Elf64_Ehdr *ehdr_ptr)
546 nhdr_ptr = notes_section; 546 nhdr_ptr = notes_section;
547 while (nhdr_ptr->n_namesz != 0) { 547 while (nhdr_ptr->n_namesz != 0) {
548 sz = sizeof(Elf64_Nhdr) + 548 sz = sizeof(Elf64_Nhdr) +
549 ((nhdr_ptr->n_namesz + 3) & ~3) + 549 (((u64)nhdr_ptr->n_namesz + 3) & ~3) +
550 ((nhdr_ptr->n_descsz + 3) & ~3); 550 (((u64)nhdr_ptr->n_descsz + 3) & ~3);
551 if ((real_sz + sz) > max_sz) { 551 if ((real_sz + sz) > max_sz) {
552 pr_warn("Warning: Exceeded p_memsz, dropping PT_NOTE entry n_namesz=0x%x, n_descsz=0x%x\n", 552 pr_warn("Warning: Exceeded p_memsz, dropping PT_NOTE entry n_namesz=0x%x, n_descsz=0x%x\n",
553 nhdr_ptr->n_namesz, nhdr_ptr->n_descsz); 553 nhdr_ptr->n_namesz, nhdr_ptr->n_descsz);
@@ -732,8 +732,8 @@ static int __init update_note_header_size_elf32(const Elf32_Ehdr *ehdr_ptr)
732 nhdr_ptr = notes_section; 732 nhdr_ptr = notes_section;
733 while (nhdr_ptr->n_namesz != 0) { 733 while (nhdr_ptr->n_namesz != 0) {
734 sz = sizeof(Elf32_Nhdr) + 734 sz = sizeof(Elf32_Nhdr) +
735 ((nhdr_ptr->n_namesz + 3) & ~3) + 735 (((u64)nhdr_ptr->n_namesz + 3) & ~3) +
736 ((nhdr_ptr->n_descsz + 3) & ~3); 736 (((u64)nhdr_ptr->n_descsz + 3) & ~3);
737 if ((real_sz + sz) > max_sz) { 737 if ((real_sz + sz) > max_sz) {
738 pr_warn("Warning: Exceeded p_memsz, dropping PT_NOTE entry n_namesz=0x%x, n_descsz=0x%x\n", 738 pr_warn("Warning: Exceeded p_memsz, dropping PT_NOTE entry n_namesz=0x%x, n_descsz=0x%x\n",
739 nhdr_ptr->n_namesz, nhdr_ptr->n_descsz); 739 nhdr_ptr->n_namesz, nhdr_ptr->n_descsz);
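The (u64) casts matter because n_namesz and n_descsz are 32-bit fields read from an untrusted crash image: without widening, a value near UINT32_MAX makes (n + 3) & ~3 wrap to a tiny number and slip past the max_sz check. A small user-space demonstration of the wraparound:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t n_namesz = 0xfffffffd;
	uint32_t narrow = (n_namesz + 3) & ~3;		    /* wraps to 0 */
	uint64_t wide = ((uint64_t)n_namesz + 3) & ~3;	    /* 0x100000000 */

	printf("%u vs %llu\n", narrow, (unsigned long long)wide);
	return 0;
}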
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 0f96f71ab32b..8db932da4009 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -44,6 +44,7 @@ static int show_sb_opts(struct seq_file *m, struct super_block *sb)
44 { MS_SYNCHRONOUS, ",sync" }, 44 { MS_SYNCHRONOUS, ",sync" },
45 { MS_DIRSYNC, ",dirsync" }, 45 { MS_DIRSYNC, ",dirsync" },
46 { MS_MANDLOCK, ",mand" }, 46 { MS_MANDLOCK, ",mand" },
47 { MS_LAZYTIME, ",lazytime" },
47 { 0, NULL } 48 { 0, NULL }
48 }; 49 };
49 const struct proc_fs_info *fs_infop; 50 const struct proc_fs_info *fs_infop;
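Adding MS_LAZYTIME to the show_sb_opts table means a lazytime mount is now reported in /proc/mounts. A user-space sketch; MS_LAZYTIME may need a manual define on older userspace headers:

#include <sys/mount.h>

#ifndef MS_LAZYTIME
#define MS_LAZYTIME (1 << 25)
#endif

int mount_lazytime(const char *dev, const char *dir, const char *fstype)
{
	/* inode timestamps are kept in memory and written back lazily */
	return mount(dev, dir, fstype, MS_LAZYTIME, NULL);
}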
diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig
index 983d9510becc..916b8e23d968 100644
--- a/fs/pstore/Kconfig
+++ b/fs/pstore/Kconfig
@@ -21,6 +21,16 @@ config PSTORE_CONSOLE
21 When the option is enabled, pstore will log all kernel 21 When the option is enabled, pstore will log all kernel
22 messages, even if no oops or panic happened. 22 messages, even if no oops or panic happened.
23 23
24config PSTORE_PMSG
25 bool "Log user space messages"
26 depends on PSTORE
27 help
28 When the option is enabled, pstore will export a character
29 interface /dev/pmsg0 to log user space messages. On reboot
30 data can be retrieved from /sys/fs/pstore/pmsg-ramoops-[ID].
31
32 If unsure, say N.
33
24config PSTORE_FTRACE 34config PSTORE_FTRACE
25 bool "Persistent function tracer" 35 bool "Persistent function tracer"
26 depends on PSTORE 36 depends on PSTORE
diff --git a/fs/pstore/Makefile b/fs/pstore/Makefile
index 4c9095c2781e..e647d8e81712 100644
--- a/fs/pstore/Makefile
+++ b/fs/pstore/Makefile
@@ -7,5 +7,7 @@ obj-y += pstore.o
7pstore-objs += inode.o platform.o 7pstore-objs += inode.o platform.o
8obj-$(CONFIG_PSTORE_FTRACE) += ftrace.o 8obj-$(CONFIG_PSTORE_FTRACE) += ftrace.o
9 9
10obj-$(CONFIG_PSTORE_PMSG) += pmsg.o
11
10ramoops-objs += ram.o ram_core.o 12ramoops-objs += ram.o ram_core.o
11obj-$(CONFIG_PSTORE_RAM) += ramoops.o 13obj-$(CONFIG_PSTORE_RAM) += ramoops.o
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 50416602774d..b32ce53d24ee 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -338,32 +338,38 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
338 338
339 switch (type) { 339 switch (type) {
340 case PSTORE_TYPE_DMESG: 340 case PSTORE_TYPE_DMESG:
341 sprintf(name, "dmesg-%s-%lld%s", psname, id, 341 scnprintf(name, sizeof(name), "dmesg-%s-%lld%s",
342 compressed ? ".enc.z" : ""); 342 psname, id, compressed ? ".enc.z" : "");
343 break; 343 break;
344 case PSTORE_TYPE_CONSOLE: 344 case PSTORE_TYPE_CONSOLE:
345 sprintf(name, "console-%s-%lld", psname, id); 345 scnprintf(name, sizeof(name), "console-%s-%lld", psname, id);
346 break; 346 break;
347 case PSTORE_TYPE_FTRACE: 347 case PSTORE_TYPE_FTRACE:
348 sprintf(name, "ftrace-%s-%lld", psname, id); 348 scnprintf(name, sizeof(name), "ftrace-%s-%lld", psname, id);
349 break; 349 break;
350 case PSTORE_TYPE_MCE: 350 case PSTORE_TYPE_MCE:
351 sprintf(name, "mce-%s-%lld", psname, id); 351 scnprintf(name, sizeof(name), "mce-%s-%lld", psname, id);
352 break; 352 break;
353 case PSTORE_TYPE_PPC_RTAS: 353 case PSTORE_TYPE_PPC_RTAS:
354 sprintf(name, "rtas-%s-%lld", psname, id); 354 scnprintf(name, sizeof(name), "rtas-%s-%lld", psname, id);
355 break; 355 break;
356 case PSTORE_TYPE_PPC_OF: 356 case PSTORE_TYPE_PPC_OF:
357 sprintf(name, "powerpc-ofw-%s-%lld", psname, id); 357 scnprintf(name, sizeof(name), "powerpc-ofw-%s-%lld",
358 psname, id);
358 break; 359 break;
359 case PSTORE_TYPE_PPC_COMMON: 360 case PSTORE_TYPE_PPC_COMMON:
360 sprintf(name, "powerpc-common-%s-%lld", psname, id); 361 scnprintf(name, sizeof(name), "powerpc-common-%s-%lld",
362 psname, id);
363 break;
364 case PSTORE_TYPE_PMSG:
365 scnprintf(name, sizeof(name), "pmsg-%s-%lld", psname, id);
361 break; 366 break;
362 case PSTORE_TYPE_UNKNOWN: 367 case PSTORE_TYPE_UNKNOWN:
363 sprintf(name, "unknown-%s-%lld", psname, id); 368 scnprintf(name, sizeof(name), "unknown-%s-%lld", psname, id);
364 break; 369 break;
365 default: 370 default:
366 sprintf(name, "type%d-%s-%lld", type, psname, id); 371 scnprintf(name, sizeof(name), "type%d-%s-%lld",
372 type, psname, id);
367 break; 373 break;
368 } 374 }
369 375
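The point of the sprintf-to-scnprintf conversion is bounds: sprintf can overrun the name buffer, and snprintf's return value can exceed the buffer size, while scnprintf caps the write and returns the bytes actually stored. A kernel-style sketch of the idiom; the parameters mirror pstore_mkfile() but the helper itself is illustrative:

#include <linux/kernel.h>	/* scnprintf() */
#include <linux/types.h>

static int format_record_name(char *name, size_t size, const char *psname,
			      long long id, bool compressed)
{
	/* scnprintf() never writes past @size and returns the number of
	 * characters actually stored (excluding the NUL), so the result
	 * is always safe to use as an append offset. */
	return scnprintf(name, size, "dmesg-%s-%lld%s",
			 psname, id, compressed ? ".enc.z" : "");
}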
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
index 3b3d305277c4..c36ba2cd0b5d 100644
--- a/fs/pstore/internal.h
+++ b/fs/pstore/internal.h
@@ -45,6 +45,12 @@ extern void pstore_register_ftrace(void);
45static inline void pstore_register_ftrace(void) {} 45static inline void pstore_register_ftrace(void) {}
46#endif 46#endif
47 47
48#ifdef CONFIG_PSTORE_PMSG
49extern void pstore_register_pmsg(void);
50#else
51static inline void pstore_register_pmsg(void) {}
52#endif
53
48extern struct pstore_info *psinfo; 54extern struct pstore_info *psinfo;
49 55
50extern void pstore_set_kmsg_bytes(int); 56extern void pstore_set_kmsg_bytes(int);
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 0a9b72cdfeca..c4c9a10c5760 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -301,7 +301,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
301 301
302 if (big_oops_buf) { 302 if (big_oops_buf) {
303 dst = big_oops_buf; 303 dst = big_oops_buf;
304 hsize = sprintf(dst, "%s#%d Part%d\n", why, 304 hsize = sprintf(dst, "%s#%d Part%u\n", why,
305 oopscount, part); 305 oopscount, part);
306 size = big_oops_buf_sz - hsize; 306 size = big_oops_buf_sz - hsize;
307 307
@@ -321,7 +321,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
321 } 321 }
322 } else { 322 } else {
323 dst = psinfo->buf; 323 dst = psinfo->buf;
324 hsize = sprintf(dst, "%s#%d Part%d\n", why, oopscount, 324 hsize = sprintf(dst, "%s#%d Part%u\n", why, oopscount,
325 part); 325 part);
326 size = psinfo->bufsize - hsize; 326 size = psinfo->bufsize - hsize;
327 dst += hsize; 327 dst += hsize;
@@ -447,6 +447,7 @@ int pstore_register(struct pstore_info *psi)
447 if ((psi->flags & PSTORE_FLAGS_FRAGILE) == 0) { 447 if ((psi->flags & PSTORE_FLAGS_FRAGILE) == 0) {
448 pstore_register_console(); 448 pstore_register_console();
449 pstore_register_ftrace(); 449 pstore_register_ftrace();
450 pstore_register_pmsg();
450 } 451 }
451 452
452 if (pstore_update_ms >= 0) { 453 if (pstore_update_ms >= 0) {
diff --git a/fs/pstore/pmsg.c b/fs/pstore/pmsg.c
new file mode 100644
index 000000000000..feb5dd2948b4
--- /dev/null
+++ b/fs/pstore/pmsg.c
@@ -0,0 +1,114 @@
1/*
2 * Copyright 2014 Google, Inc.
3 *
4 * This software is licensed under the terms of the GNU General Public
5 * License version 2, as published by the Free Software Foundation, and
6 * may be copied, distributed, and modified under those terms.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 */
13
14#include <linux/cdev.h>
15#include <linux/device.h>
16#include <linux/fs.h>
17#include <linux/uaccess.h>
18#include <linux/vmalloc.h>
19#include "internal.h"
20
21static DEFINE_MUTEX(pmsg_lock);
22#define PMSG_MAX_BOUNCE_BUFFER_SIZE (2*PAGE_SIZE)
23
24static ssize_t write_pmsg(struct file *file, const char __user *buf,
25 size_t count, loff_t *ppos)
26{
27 size_t i, buffer_size;
28 char *buffer;
29
30 if (!count)
31 return 0;
32
33 if (!access_ok(VERIFY_READ, buf, count))
34 return -EFAULT;
35
36 buffer_size = count;
37 if (buffer_size > PMSG_MAX_BOUNCE_BUFFER_SIZE)
38 buffer_size = PMSG_MAX_BOUNCE_BUFFER_SIZE;
39 buffer = vmalloc(buffer_size);
40
41 mutex_lock(&pmsg_lock);
42 for (i = 0; i < count; ) {
43 size_t c = min(count - i, buffer_size);
44 u64 id;
45 long ret;
46
47 ret = __copy_from_user(buffer, buf + i, c);
48 if (unlikely(ret != 0)) {
49 mutex_unlock(&pmsg_lock);
50 vfree(buffer);
51 return -EFAULT;
52 }
53 psinfo->write_buf(PSTORE_TYPE_PMSG, 0, &id, 0, buffer, 0, c,
54 psinfo);
55
56 i += c;
57 }
58
59 mutex_unlock(&pmsg_lock);
60 vfree(buffer);
61 return count;
62}
63
64static const struct file_operations pmsg_fops = {
65 .owner = THIS_MODULE,
66 .llseek = noop_llseek,
67 .write = write_pmsg,
68};
69
70static struct class *pmsg_class;
71static int pmsg_major;
72#define PMSG_NAME "pmsg"
73#undef pr_fmt
74#define pr_fmt(fmt) PMSG_NAME ": " fmt
75
76static char *pmsg_devnode(struct device *dev, umode_t *mode)
77{
78 if (mode)
79 *mode = 0220;
80 return NULL;
81}
82
83void pstore_register_pmsg(void)
84{
85 struct device *pmsg_device;
86
87 pmsg_major = register_chrdev(0, PMSG_NAME, &pmsg_fops);
88 if (pmsg_major < 0) {
89 pr_err("register_chrdev failed\n");
90 goto err;
91 }
92
93 pmsg_class = class_create(THIS_MODULE, PMSG_NAME);
94 if (IS_ERR(pmsg_class)) {
95 pr_err("device class file already in use\n");
96 goto err_class;
97 }
98 pmsg_class->devnode = pmsg_devnode;
99
100 pmsg_device = device_create(pmsg_class, NULL, MKDEV(pmsg_major, 0),
101 NULL, "%s%d", PMSG_NAME, 0);
102 if (IS_ERR(pmsg_device)) {
103 pr_err("failed to create device\n");
104 goto err_device;
105 }
106 return;
107
108err_device:
109 class_destroy(pmsg_class);
110err_class:
111 unregister_chrdev(pmsg_major, PMSG_NAME);
112err:
113 return;
114}
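The new file registers a write-only (mode 0220) character device, so logging from user space is a plain write; with the ramoops backend below, the data reappears after reboot as /sys/fs/pstore/pmsg-ramoops-[ID]. A user-space sketch:

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int pmsg_log(const char *msg)
{
	int fd = open("/dev/pmsg0", O_WRONLY);	/* device is write-only */
	ssize_t n;

	if (fd < 0)
		return -1;
	n = write(fd, msg, strlen(msg));
	close(fd);
	return n < 0 ? -1 : 0;
}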
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 8613e5b35c22..39d1373128e9 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -51,6 +51,10 @@ static ulong ramoops_ftrace_size = MIN_MEM_SIZE;
51module_param_named(ftrace_size, ramoops_ftrace_size, ulong, 0400); 51module_param_named(ftrace_size, ramoops_ftrace_size, ulong, 0400);
52MODULE_PARM_DESC(ftrace_size, "size of ftrace log"); 52MODULE_PARM_DESC(ftrace_size, "size of ftrace log");
53 53
54static ulong ramoops_pmsg_size = MIN_MEM_SIZE;
55module_param_named(pmsg_size, ramoops_pmsg_size, ulong, 0400);
56MODULE_PARM_DESC(pmsg_size, "size of user space message log");
57
54static ulong mem_address; 58static ulong mem_address;
55module_param(mem_address, ulong, 0400); 59module_param(mem_address, ulong, 0400);
56MODULE_PARM_DESC(mem_address, 60MODULE_PARM_DESC(mem_address,
@@ -82,12 +86,14 @@ struct ramoops_context {
82 struct persistent_ram_zone **przs; 86 struct persistent_ram_zone **przs;
83 struct persistent_ram_zone *cprz; 87 struct persistent_ram_zone *cprz;
84 struct persistent_ram_zone *fprz; 88 struct persistent_ram_zone *fprz;
89 struct persistent_ram_zone *mprz;
85 phys_addr_t phys_addr; 90 phys_addr_t phys_addr;
86 unsigned long size; 91 unsigned long size;
87 unsigned int memtype; 92 unsigned int memtype;
88 size_t record_size; 93 size_t record_size;
89 size_t console_size; 94 size_t console_size;
90 size_t ftrace_size; 95 size_t ftrace_size;
96 size_t pmsg_size;
91 int dump_oops; 97 int dump_oops;
92 struct persistent_ram_ecc_info ecc_info; 98 struct persistent_ram_ecc_info ecc_info;
93 unsigned int max_dump_cnt; 99 unsigned int max_dump_cnt;
@@ -96,6 +102,7 @@ struct ramoops_context {
96 unsigned int dump_read_cnt; 102 unsigned int dump_read_cnt;
97 unsigned int console_read_cnt; 103 unsigned int console_read_cnt;
98 unsigned int ftrace_read_cnt; 104 unsigned int ftrace_read_cnt;
105 unsigned int pmsg_read_cnt;
99 struct pstore_info pstore; 106 struct pstore_info pstore;
100}; 107};
101 108
@@ -109,6 +116,7 @@ static int ramoops_pstore_open(struct pstore_info *psi)
109 cxt->dump_read_cnt = 0; 116 cxt->dump_read_cnt = 0;
110 cxt->console_read_cnt = 0; 117 cxt->console_read_cnt = 0;
111 cxt->ftrace_read_cnt = 0; 118 cxt->ftrace_read_cnt = 0;
119 cxt->pmsg_read_cnt = 0;
112 return 0; 120 return 0;
113} 121}
114 122
@@ -164,6 +172,12 @@ static int ramoops_read_kmsg_hdr(char *buffer, struct timespec *time,
164 return header_length; 172 return header_length;
165} 173}
166 174
175static bool prz_ok(struct persistent_ram_zone *prz)
176{
177 return !!prz && !!(persistent_ram_old_size(prz) +
178 persistent_ram_ecc_string(prz, NULL, 0));
179}
180
167static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, 181static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
168 int *count, struct timespec *time, 182 int *count, struct timespec *time,
169 char **buf, bool *compressed, 183 char **buf, bool *compressed,
@@ -178,13 +192,16 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
178 prz = ramoops_get_next_prz(cxt->przs, &cxt->dump_read_cnt, 192 prz = ramoops_get_next_prz(cxt->przs, &cxt->dump_read_cnt,
179 cxt->max_dump_cnt, id, type, 193 cxt->max_dump_cnt, id, type,
180 PSTORE_TYPE_DMESG, 1); 194 PSTORE_TYPE_DMESG, 1);
181 if (!prz) 195 if (!prz_ok(prz))
182 prz = ramoops_get_next_prz(&cxt->cprz, &cxt->console_read_cnt, 196 prz = ramoops_get_next_prz(&cxt->cprz, &cxt->console_read_cnt,
183 1, id, type, PSTORE_TYPE_CONSOLE, 0); 197 1, id, type, PSTORE_TYPE_CONSOLE, 0);
184 if (!prz) 198 if (!prz_ok(prz))
185 prz = ramoops_get_next_prz(&cxt->fprz, &cxt->ftrace_read_cnt, 199 prz = ramoops_get_next_prz(&cxt->fprz, &cxt->ftrace_read_cnt,
186 1, id, type, PSTORE_TYPE_FTRACE, 0); 200 1, id, type, PSTORE_TYPE_FTRACE, 0);
187 if (!prz) 201 if (!prz_ok(prz))
202 prz = ramoops_get_next_prz(&cxt->mprz, &cxt->pmsg_read_cnt,
203 1, id, type, PSTORE_TYPE_PMSG, 0);
204 if (!prz_ok(prz))
188 return 0; 205 return 0;
189 206
190 if (!persistent_ram_old(prz)) 207 if (!persistent_ram_old(prz))
@@ -252,6 +269,11 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
252 return -ENOMEM; 269 return -ENOMEM;
253 persistent_ram_write(cxt->fprz, buf, size); 270 persistent_ram_write(cxt->fprz, buf, size);
254 return 0; 271 return 0;
272 } else if (type == PSTORE_TYPE_PMSG) {
273 if (!cxt->mprz)
274 return -ENOMEM;
275 persistent_ram_write(cxt->mprz, buf, size);
276 return 0;
255 } 277 }
256 278
257 if (type != PSTORE_TYPE_DMESG) 279 if (type != PSTORE_TYPE_DMESG)
@@ -309,6 +331,9 @@ static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, int count,
309 case PSTORE_TYPE_FTRACE: 331 case PSTORE_TYPE_FTRACE:
310 prz = cxt->fprz; 332 prz = cxt->fprz;
311 break; 333 break;
334 case PSTORE_TYPE_PMSG:
335 prz = cxt->mprz;
336 break;
312 default: 337 default:
313 return -EINVAL; 338 return -EINVAL;
314 } 339 }
@@ -435,7 +460,7 @@ static int ramoops_probe(struct platform_device *pdev)
435 goto fail_out; 460 goto fail_out;
436 461
437 if (!pdata->mem_size || (!pdata->record_size && !pdata->console_size && 462 if (!pdata->mem_size || (!pdata->record_size && !pdata->console_size &&
438 !pdata->ftrace_size)) { 463 !pdata->ftrace_size && !pdata->pmsg_size)) {
439 pr_err("The memory size and the record/console size must be " 464 pr_err("The memory size and the record/console size must be "
440 "non-zero\n"); 465 "non-zero\n");
441 goto fail_out; 466 goto fail_out;
@@ -447,6 +472,8 @@ static int ramoops_probe(struct platform_device *pdev)
447 pdata->console_size = rounddown_pow_of_two(pdata->console_size); 472 pdata->console_size = rounddown_pow_of_two(pdata->console_size);
448 if (pdata->ftrace_size && !is_power_of_2(pdata->ftrace_size)) 473 if (pdata->ftrace_size && !is_power_of_2(pdata->ftrace_size))
449 pdata->ftrace_size = rounddown_pow_of_two(pdata->ftrace_size); 474 pdata->ftrace_size = rounddown_pow_of_two(pdata->ftrace_size);
475 if (pdata->pmsg_size && !is_power_of_2(pdata->pmsg_size))
476 pdata->pmsg_size = rounddown_pow_of_two(pdata->pmsg_size);
450 477
451 cxt->size = pdata->mem_size; 478 cxt->size = pdata->mem_size;
452 cxt->phys_addr = pdata->mem_address; 479 cxt->phys_addr = pdata->mem_address;
@@ -454,12 +481,14 @@ static int ramoops_probe(struct platform_device *pdev)
454 cxt->record_size = pdata->record_size; 481 cxt->record_size = pdata->record_size;
455 cxt->console_size = pdata->console_size; 482 cxt->console_size = pdata->console_size;
456 cxt->ftrace_size = pdata->ftrace_size; 483 cxt->ftrace_size = pdata->ftrace_size;
484 cxt->pmsg_size = pdata->pmsg_size;
457 cxt->dump_oops = pdata->dump_oops; 485 cxt->dump_oops = pdata->dump_oops;
458 cxt->ecc_info = pdata->ecc_info; 486 cxt->ecc_info = pdata->ecc_info;
459 487
460 paddr = cxt->phys_addr; 488 paddr = cxt->phys_addr;
461 489
462 dump_mem_sz = cxt->size - cxt->console_size - cxt->ftrace_size; 490 dump_mem_sz = cxt->size - cxt->console_size - cxt->ftrace_size
491 - cxt->pmsg_size;
463 err = ramoops_init_przs(dev, cxt, &paddr, dump_mem_sz); 492 err = ramoops_init_przs(dev, cxt, &paddr, dump_mem_sz);
464 if (err) 493 if (err)
465 goto fail_out; 494 goto fail_out;
@@ -474,13 +503,9 @@ static int ramoops_probe(struct platform_device *pdev)
474 if (err) 503 if (err)
475 goto fail_init_fprz; 504 goto fail_init_fprz;
476 505
477 if (!cxt->przs && !cxt->cprz && !cxt->fprz) { 506 err = ramoops_init_prz(dev, cxt, &cxt->mprz, &paddr, cxt->pmsg_size, 0);
478 pr_err("memory size too small, minimum is %zu\n", 507 if (err)
479 cxt->console_size + cxt->record_size + 508 goto fail_init_mprz;
480 cxt->ftrace_size);
481 err = -EINVAL;
482 goto fail_cnt;
483 }
484 509
485 cxt->pstore.data = cxt; 510 cxt->pstore.data = cxt;
486 /* 511 /*
@@ -525,7 +550,8 @@ fail_buf:
525 kfree(cxt->pstore.buf); 550 kfree(cxt->pstore.buf);
526fail_clear: 551fail_clear:
527 cxt->pstore.bufsize = 0; 552 cxt->pstore.bufsize = 0;
528fail_cnt: 553 kfree(cxt->mprz);
554fail_init_mprz:
529 kfree(cxt->fprz); 555 kfree(cxt->fprz);
530fail_init_fprz: 556fail_init_fprz:
531 kfree(cxt->cprz); 557 kfree(cxt->cprz);
@@ -583,6 +609,7 @@ static void ramoops_register_dummy(void)
583 dummy_data->record_size = record_size; 609 dummy_data->record_size = record_size;
584 dummy_data->console_size = ramoops_console_size; 610 dummy_data->console_size = ramoops_console_size;
585 dummy_data->ftrace_size = ramoops_ftrace_size; 611 dummy_data->ftrace_size = ramoops_ftrace_size;
612 dummy_data->pmsg_size = ramoops_pmsg_size;
586 dummy_data->dump_oops = dump_oops; 613 dummy_data->dump_oops = dump_oops;
587 /* 614 /*
588 * For backwards compatibility ramoops.ecc=1 means 16 bytes ECC 615 * For backwards compatibility ramoops.ecc=1 means 16 bytes ECC
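Besides the ramoops.pmsg_size module parameter handled by the dummy device, board code can reserve pmsg space through platform data. A sketch under stated assumptions: the reserved address is hypothetical, and the sizes must be powers of two, as ramoops_probe() enforces above:

#include <linux/pstore_ram.h>
#include <linux/sizes.h>

static struct ramoops_platform_data ramoops_data = {
	.mem_size	= SZ_1M,
	.mem_address	= 0x8f000000,	/* hypothetical reserved RAM */
	.record_size	= SZ_128K,
	.console_size	= SZ_128K,
	.ftrace_size	= SZ_64K,
	.pmsg_size	= SZ_64K,	/* new: backs /dev/pmsg0 */
	.dump_oops	= 1,
};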
diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
index c51df1dd237e..4a09975aac90 100644
--- a/fs/quota/Kconfig
+++ b/fs/quota/Kconfig
@@ -5,6 +5,7 @@
5config QUOTA 5config QUOTA
6 bool "Quota support" 6 bool "Quota support"
7 select QUOTACTL 7 select QUOTACTL
8 select SRCU
8 help 9 help
9 If you say Y here, you will be able to set per user limits for disk 10 If you say Y here, you will be able to set per user limits for disk
10 usage (also called disk quotas). Currently, it works for the 11 usage (also called disk quotas). Currently, it works for the
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 8f0acef3d184..0ccd4ba3a246 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -1248,7 +1248,7 @@ static int ignore_hardlimit(struct dquot *dquot)
1248 1248
1249 return capable(CAP_SYS_RESOURCE) && 1249 return capable(CAP_SYS_RESOURCE) &&
1250 (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD || 1250 (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD ||
1251 !(info->dqi_flags & V1_DQF_RSQUASH)); 1251 !(info->dqi_flags & DQF_ROOT_SQUASH));
1252} 1252}
1253 1253
1254/* needs dq_data_lock */ 1254/* needs dq_data_lock */
@@ -2385,41 +2385,106 @@ out:
2385} 2385}
2386EXPORT_SYMBOL(dquot_quota_on_mount); 2386EXPORT_SYMBOL(dquot_quota_on_mount);
2387 2387
2388static inline qsize_t qbtos(qsize_t blocks) 2388static int dquot_quota_enable(struct super_block *sb, unsigned int flags)
2389{ 2389{
2390 return blocks << QIF_DQBLKSIZE_BITS; 2390 int ret;
2391 int type;
2392 struct quota_info *dqopt = sb_dqopt(sb);
2393
2394 if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE))
2395 return -ENOSYS;
2396 /* Accounting cannot be turned on while fs is mounted */
2397 flags &= ~(FS_QUOTA_UDQ_ACCT | FS_QUOTA_GDQ_ACCT | FS_QUOTA_PDQ_ACCT);
2398 if (!flags)
2399 return -EINVAL;
2400 for (type = 0; type < MAXQUOTAS; type++) {
2401 if (!(flags & qtype_enforce_flag(type)))
2402 continue;
2403 /* Can't enforce without accounting */
2404 if (!sb_has_quota_usage_enabled(sb, type))
2405 return -EINVAL;
2406 ret = dquot_enable(dqopt->files[type], type,
2407 dqopt->info[type].dqi_fmt_id,
2408 DQUOT_LIMITS_ENABLED);
2409 if (ret < 0)
2410 goto out_err;
2411 }
2412 return 0;
2413out_err:
2414 /* Backout enforcement enablement we already did */
2415 for (type--; type >= 0; type--) {
2416 if (flags & qtype_enforce_flag(type))
2417 dquot_disable(sb, type, DQUOT_LIMITS_ENABLED);
2418 }
2419 /* Error code translation for better compatibility with XFS */
2420 if (ret == -EBUSY)
2421 ret = -EEXIST;
2422 return ret;
2391} 2423}
2392 2424
2393static inline qsize_t stoqb(qsize_t space) 2425static int dquot_quota_disable(struct super_block *sb, unsigned int flags)
2394{ 2426{
2395 return (space + QIF_DQBLKSIZE - 1) >> QIF_DQBLKSIZE_BITS; 2427 int ret;
2428 int type;
2429 struct quota_info *dqopt = sb_dqopt(sb);
2430
2431 if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE))
2432 return -ENOSYS;
2433 /*
2434 * We don't support turning off accounting via quotactl. In principle
2435 * quota infrastructure can do this but filesystems don't expect
2436 * userspace to be able to do it.
2437 */
2438 if (flags &
2439 (FS_QUOTA_UDQ_ACCT | FS_QUOTA_GDQ_ACCT | FS_QUOTA_PDQ_ACCT))
2440 return -EOPNOTSUPP;
2441
2442 /* Filter out limits not enabled */
2443 for (type = 0; type < MAXQUOTAS; type++)
2444 if (!sb_has_quota_limits_enabled(sb, type))
2445 flags &= ~qtype_enforce_flag(type);
2446 /* Nothing left? */
2447 if (!flags)
2448 return -EEXIST;
2449 for (type = 0; type < MAXQUOTAS; type++) {
2450 if (flags & qtype_enforce_flag(type)) {
2451 ret = dquot_disable(sb, type, DQUOT_LIMITS_ENABLED);
2452 if (ret < 0)
2453 goto out_err;
2454 }
2455 }
2456 return 0;
2457out_err:
2458 /* Backout enforcement disabling we already did */
2459 for (type--; type >= 0; type--) {
2460 if (flags & qtype_enforce_flag(type))
2461 dquot_enable(dqopt->files[type], type,
2462 dqopt->info[type].dqi_fmt_id,
2463 DQUOT_LIMITS_ENABLED);
2464 }
2465 return ret;
2396} 2466}
2397 2467
2398/* Generic routine for getting common part of quota structure */ 2468/* Generic routine for getting common part of quota structure */
2399static void do_get_dqblk(struct dquot *dquot, struct fs_disk_quota *di) 2469static void do_get_dqblk(struct dquot *dquot, struct qc_dqblk *di)
2400{ 2470{
2401 struct mem_dqblk *dm = &dquot->dq_dqb; 2471 struct mem_dqblk *dm = &dquot->dq_dqb;
2402 2472
2403 memset(di, 0, sizeof(*di)); 2473 memset(di, 0, sizeof(*di));
2404 di->d_version = FS_DQUOT_VERSION;
2405 di->d_flags = dquot->dq_id.type == USRQUOTA ?
2406 FS_USER_QUOTA : FS_GROUP_QUOTA;
2407 di->d_id = from_kqid_munged(current_user_ns(), dquot->dq_id);
2408
2409 spin_lock(&dq_data_lock); 2474 spin_lock(&dq_data_lock);
2410 di->d_blk_hardlimit = stoqb(dm->dqb_bhardlimit); 2475 di->d_spc_hardlimit = dm->dqb_bhardlimit;
2411 di->d_blk_softlimit = stoqb(dm->dqb_bsoftlimit); 2476 di->d_spc_softlimit = dm->dqb_bsoftlimit;
2412 di->d_ino_hardlimit = dm->dqb_ihardlimit; 2477 di->d_ino_hardlimit = dm->dqb_ihardlimit;
2413 di->d_ino_softlimit = dm->dqb_isoftlimit; 2478 di->d_ino_softlimit = dm->dqb_isoftlimit;
2414 di->d_bcount = dm->dqb_curspace + dm->dqb_rsvspace; 2479 di->d_space = dm->dqb_curspace + dm->dqb_rsvspace;
2415 di->d_icount = dm->dqb_curinodes; 2480 di->d_ino_count = dm->dqb_curinodes;
2416 di->d_btimer = dm->dqb_btime; 2481 di->d_spc_timer = dm->dqb_btime;
2417 di->d_itimer = dm->dqb_itime; 2482 di->d_ino_timer = dm->dqb_itime;
2418 spin_unlock(&dq_data_lock); 2483 spin_unlock(&dq_data_lock);
2419} 2484}
2420 2485
2421int dquot_get_dqblk(struct super_block *sb, struct kqid qid, 2486int dquot_get_dqblk(struct super_block *sb, struct kqid qid,
2422 struct fs_disk_quota *di) 2487 struct qc_dqblk *di)
2423{ 2488{
2424 struct dquot *dquot; 2489 struct dquot *dquot;
2425 2490
@@ -2433,70 +2498,70 @@ int dquot_get_dqblk(struct super_block *sb, struct kqid qid,
2433} 2498}
2434EXPORT_SYMBOL(dquot_get_dqblk); 2499EXPORT_SYMBOL(dquot_get_dqblk);
2435 2500
2436#define VFS_FS_DQ_MASK \ 2501#define VFS_QC_MASK \
2437 (FS_DQ_BCOUNT | FS_DQ_BSOFT | FS_DQ_BHARD | \ 2502 (QC_SPACE | QC_SPC_SOFT | QC_SPC_HARD | \
2438 FS_DQ_ICOUNT | FS_DQ_ISOFT | FS_DQ_IHARD | \ 2503 QC_INO_COUNT | QC_INO_SOFT | QC_INO_HARD | \
2439 FS_DQ_BTIMER | FS_DQ_ITIMER) 2504 QC_SPC_TIMER | QC_INO_TIMER)
2440 2505
2441/* Generic routine for setting common part of quota structure */ 2506/* Generic routine for setting common part of quota structure */
2442static int do_set_dqblk(struct dquot *dquot, struct fs_disk_quota *di) 2507static int do_set_dqblk(struct dquot *dquot, struct qc_dqblk *di)
2443{ 2508{
2444 struct mem_dqblk *dm = &dquot->dq_dqb; 2509 struct mem_dqblk *dm = &dquot->dq_dqb;
2445 int check_blim = 0, check_ilim = 0; 2510 int check_blim = 0, check_ilim = 0;
2446 struct mem_dqinfo *dqi = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type]; 2511 struct mem_dqinfo *dqi = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type];
2447 2512
2448 if (di->d_fieldmask & ~VFS_FS_DQ_MASK) 2513 if (di->d_fieldmask & ~VFS_QC_MASK)
2449 return -EINVAL; 2514 return -EINVAL;
2450 2515
2451 if (((di->d_fieldmask & FS_DQ_BSOFT) && 2516 if (((di->d_fieldmask & QC_SPC_SOFT) &&
2452 (di->d_blk_softlimit > dqi->dqi_maxblimit)) || 2517 di->d_spc_softlimit > dqi->dqi_max_spc_limit) ||
2453 ((di->d_fieldmask & FS_DQ_BHARD) && 2518 ((di->d_fieldmask & QC_SPC_HARD) &&
2454 (di->d_blk_hardlimit > dqi->dqi_maxblimit)) || 2519 di->d_spc_hardlimit > dqi->dqi_max_spc_limit) ||
2455 ((di->d_fieldmask & FS_DQ_ISOFT) && 2520 ((di->d_fieldmask & QC_INO_SOFT) &&
2456 (di->d_ino_softlimit > dqi->dqi_maxilimit)) || 2521 (di->d_ino_softlimit > dqi->dqi_max_ino_limit)) ||
2457 ((di->d_fieldmask & FS_DQ_IHARD) && 2522 ((di->d_fieldmask & QC_INO_HARD) &&
2458 (di->d_ino_hardlimit > dqi->dqi_maxilimit))) 2523 (di->d_ino_hardlimit > dqi->dqi_max_ino_limit)))
2459 return -ERANGE; 2524 return -ERANGE;
2460 2525
2461 spin_lock(&dq_data_lock); 2526 spin_lock(&dq_data_lock);
2462 if (di->d_fieldmask & FS_DQ_BCOUNT) { 2527 if (di->d_fieldmask & QC_SPACE) {
2463 dm->dqb_curspace = di->d_bcount - dm->dqb_rsvspace; 2528 dm->dqb_curspace = di->d_space - dm->dqb_rsvspace;
2464 check_blim = 1; 2529 check_blim = 1;
2465 set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags); 2530 set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
2466 } 2531 }
2467 2532
2468 if (di->d_fieldmask & FS_DQ_BSOFT) 2533 if (di->d_fieldmask & QC_SPC_SOFT)
2469 dm->dqb_bsoftlimit = qbtos(di->d_blk_softlimit); 2534 dm->dqb_bsoftlimit = di->d_spc_softlimit;
2470 if (di->d_fieldmask & FS_DQ_BHARD) 2535 if (di->d_fieldmask & QC_SPC_HARD)
2471 dm->dqb_bhardlimit = qbtos(di->d_blk_hardlimit); 2536 dm->dqb_bhardlimit = di->d_spc_hardlimit;
2472 if (di->d_fieldmask & (FS_DQ_BSOFT | FS_DQ_BHARD)) { 2537 if (di->d_fieldmask & (QC_SPC_SOFT | QC_SPC_HARD)) {
2473 check_blim = 1; 2538 check_blim = 1;
2474 set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags); 2539 set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags);
2475 } 2540 }
2476 2541
2477 if (di->d_fieldmask & FS_DQ_ICOUNT) { 2542 if (di->d_fieldmask & QC_INO_COUNT) {
2478 dm->dqb_curinodes = di->d_icount; 2543 dm->dqb_curinodes = di->d_ino_count;
2479 check_ilim = 1; 2544 check_ilim = 1;
2480 set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags); 2545 set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags);
2481 } 2546 }
2482 2547
2483 if (di->d_fieldmask & FS_DQ_ISOFT) 2548 if (di->d_fieldmask & QC_INO_SOFT)
2484 dm->dqb_isoftlimit = di->d_ino_softlimit; 2549 dm->dqb_isoftlimit = di->d_ino_softlimit;
2485 if (di->d_fieldmask & FS_DQ_IHARD) 2550 if (di->d_fieldmask & QC_INO_HARD)
2486 dm->dqb_ihardlimit = di->d_ino_hardlimit; 2551 dm->dqb_ihardlimit = di->d_ino_hardlimit;
2487 if (di->d_fieldmask & (FS_DQ_ISOFT | FS_DQ_IHARD)) { 2552 if (di->d_fieldmask & (QC_INO_SOFT | QC_INO_HARD)) {
2488 check_ilim = 1; 2553 check_ilim = 1;
2489 set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags); 2554 set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
2490 } 2555 }
2491 2556
2492 if (di->d_fieldmask & FS_DQ_BTIMER) { 2557 if (di->d_fieldmask & QC_SPC_TIMER) {
2493 dm->dqb_btime = di->d_btimer; 2558 dm->dqb_btime = di->d_spc_timer;
2494 check_blim = 1; 2559 check_blim = 1;
2495 set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags); 2560 set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
2496 } 2561 }
2497 2562
2498 if (di->d_fieldmask & FS_DQ_ITIMER) { 2563 if (di->d_fieldmask & QC_INO_TIMER) {
2499 dm->dqb_itime = di->d_itimer; 2564 dm->dqb_itime = di->d_ino_timer;
2500 check_ilim = 1; 2565 check_ilim = 1;
2501 set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags); 2566 set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
2502 } 2567 }
@@ -2506,7 +2571,7 @@ static int do_set_dqblk(struct dquot *dquot, struct fs_disk_quota *di)
2506 dm->dqb_curspace < dm->dqb_bsoftlimit) { 2571 dm->dqb_curspace < dm->dqb_bsoftlimit) {
2507 dm->dqb_btime = 0; 2572 dm->dqb_btime = 0;
2508 clear_bit(DQ_BLKS_B, &dquot->dq_flags); 2573 clear_bit(DQ_BLKS_B, &dquot->dq_flags);
2509 } else if (!(di->d_fieldmask & FS_DQ_BTIMER)) 2574 } else if (!(di->d_fieldmask & QC_SPC_TIMER))
2510 /* Set grace only if user hasn't provided his own... */ 2575 /* Set grace only if user hasn't provided his own... */
2511 dm->dqb_btime = get_seconds() + dqi->dqi_bgrace; 2576 dm->dqb_btime = get_seconds() + dqi->dqi_bgrace;
2512 } 2577 }
@@ -2515,7 +2580,7 @@ static int do_set_dqblk(struct dquot *dquot, struct fs_disk_quota *di)
2515 dm->dqb_curinodes < dm->dqb_isoftlimit) { 2580 dm->dqb_curinodes < dm->dqb_isoftlimit) {
2516 dm->dqb_itime = 0; 2581 dm->dqb_itime = 0;
2517 clear_bit(DQ_INODES_B, &dquot->dq_flags); 2582 clear_bit(DQ_INODES_B, &dquot->dq_flags);
2518 } else if (!(di->d_fieldmask & FS_DQ_ITIMER)) 2583 } else if (!(di->d_fieldmask & QC_INO_TIMER))
2519 /* Set grace only if user hasn't provided his own... */ 2584 /* Set grace only if user hasn't provided his own... */
2520 dm->dqb_itime = get_seconds() + dqi->dqi_igrace; 2585 dm->dqb_itime = get_seconds() + dqi->dqi_igrace;
2521 } 2586 }
@@ -2531,7 +2596,7 @@ static int do_set_dqblk(struct dquot *dquot, struct fs_disk_quota *di)
2531} 2596}
2532 2597
2533int dquot_set_dqblk(struct super_block *sb, struct kqid qid, 2598int dquot_set_dqblk(struct super_block *sb, struct kqid qid,
2534 struct fs_disk_quota *di) 2599 struct qc_dqblk *di)
2535{ 2600{
2536 struct dquot *dquot; 2601 struct dquot *dquot;
2537 int rc; 2602 int rc;
@@ -2582,6 +2647,14 @@ int dquot_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
2582 goto out; 2647 goto out;
2583 } 2648 }
2584 mi = sb_dqopt(sb)->info + type; 2649 mi = sb_dqopt(sb)->info + type;
2650 if (ii->dqi_valid & IIF_FLAGS) {
2651 if (ii->dqi_flags & ~DQF_SETINFO_MASK ||
2652 (ii->dqi_flags & DQF_ROOT_SQUASH &&
2653 mi->dqi_format->qf_fmt_id != QFMT_VFS_OLD)) {
2654 err = -EINVAL;
2655 goto out;
2656 }
2657 }
2585 spin_lock(&dq_data_lock); 2658 spin_lock(&dq_data_lock);
2586 if (ii->dqi_valid & IIF_BGRACE) 2659 if (ii->dqi_valid & IIF_BGRACE)
2587 mi->dqi_bgrace = ii->dqi_bgrace; 2660 mi->dqi_bgrace = ii->dqi_bgrace;
@@ -2611,6 +2684,17 @@ const struct quotactl_ops dquot_quotactl_ops = {
2611}; 2684};
2612EXPORT_SYMBOL(dquot_quotactl_ops); 2685EXPORT_SYMBOL(dquot_quotactl_ops);
2613 2686
2687const struct quotactl_ops dquot_quotactl_sysfile_ops = {
2688 .quota_enable = dquot_quota_enable,
2689 .quota_disable = dquot_quota_disable,
2690 .quota_sync = dquot_quota_sync,
2691 .get_info = dquot_get_dqinfo,
2692 .set_info = dquot_set_dqinfo,
2693 .get_dqblk = dquot_get_dqblk,
2694 .set_dqblk = dquot_set_dqblk
2695};
2696EXPORT_SYMBOL(dquot_quotactl_sysfile_ops);
2697
2614static int do_proc_dqstats(struct ctl_table *table, int write, 2698static int do_proc_dqstats(struct ctl_table *table, int write,
2615 void __user *buffer, size_t *lenp, loff_t *ppos) 2699 void __user *buffer, size_t *lenp, loff_t *ppos)
2616{ 2700{
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 2aa4151f99d2..d14a799c7785 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -66,18 +66,40 @@ static int quota_sync_all(int type)
66 return ret; 66 return ret;
67} 67}
68 68
69unsigned int qtype_enforce_flag(int type)
70{
71 switch (type) {
72 case USRQUOTA:
73 return FS_QUOTA_UDQ_ENFD;
74 case GRPQUOTA:
75 return FS_QUOTA_GDQ_ENFD;
76 case PRJQUOTA:
77 return FS_QUOTA_PDQ_ENFD;
78 }
79 return 0;
80}
81
69static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id, 82static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id,
70 struct path *path) 83 struct path *path)
71{ 84{
72 if (!sb->s_qcop->quota_on && !sb->s_qcop->quota_on_meta) 85 if (!sb->s_qcop->quota_on && !sb->s_qcop->quota_enable)
73 return -ENOSYS; 86 return -ENOSYS;
74 if (sb->s_qcop->quota_on_meta) 87 if (sb->s_qcop->quota_enable)
75 return sb->s_qcop->quota_on_meta(sb, type, id); 88 return sb->s_qcop->quota_enable(sb, qtype_enforce_flag(type));
76 if (IS_ERR(path)) 89 if (IS_ERR(path))
77 return PTR_ERR(path); 90 return PTR_ERR(path);
78 return sb->s_qcop->quota_on(sb, type, id, path); 91 return sb->s_qcop->quota_on(sb, type, id, path);
79} 92}
80 93
94static int quota_quotaoff(struct super_block *sb, int type)
95{
96 if (!sb->s_qcop->quota_off && !sb->s_qcop->quota_disable)
97 return -ENOSYS;
98 if (sb->s_qcop->quota_disable)
99 return sb->s_qcop->quota_disable(sb, qtype_enforce_flag(type));
100 return sb->s_qcop->quota_off(sb, type);
101}
102
81static int quota_getfmt(struct super_block *sb, int type, void __user *addr) 103static int quota_getfmt(struct super_block *sb, int type, void __user *addr)
82{ 104{
83 __u32 fmt; 105 __u32 fmt;
@@ -118,17 +140,27 @@ static int quota_setinfo(struct super_block *sb, int type, void __user *addr)
118 return sb->s_qcop->set_info(sb, type, &info); 140 return sb->s_qcop->set_info(sb, type, &info);
119} 141}
120 142
121static void copy_to_if_dqblk(struct if_dqblk *dst, struct fs_disk_quota *src) 143static inline qsize_t qbtos(qsize_t blocks)
144{
145 return blocks << QIF_DQBLKSIZE_BITS;
146}
147
148static inline qsize_t stoqb(qsize_t space)
149{
150 return (space + QIF_DQBLKSIZE - 1) >> QIF_DQBLKSIZE_BITS;
151}
152
153static void copy_to_if_dqblk(struct if_dqblk *dst, struct qc_dqblk *src)
122{ 154{
123 memset(dst, 0, sizeof(*dst)); 155 memset(dst, 0, sizeof(*dst));
124 dst->dqb_bhardlimit = src->d_blk_hardlimit; 156 dst->dqb_bhardlimit = stoqb(src->d_spc_hardlimit);
125 dst->dqb_bsoftlimit = src->d_blk_softlimit; 157 dst->dqb_bsoftlimit = stoqb(src->d_spc_softlimit);
126 dst->dqb_curspace = src->d_bcount; 158 dst->dqb_curspace = src->d_space;
127 dst->dqb_ihardlimit = src->d_ino_hardlimit; 159 dst->dqb_ihardlimit = src->d_ino_hardlimit;
128 dst->dqb_isoftlimit = src->d_ino_softlimit; 160 dst->dqb_isoftlimit = src->d_ino_softlimit;
129 dst->dqb_curinodes = src->d_icount; 161 dst->dqb_curinodes = src->d_ino_count;
130 dst->dqb_btime = src->d_btimer; 162 dst->dqb_btime = src->d_spc_timer;
131 dst->dqb_itime = src->d_itimer; 163 dst->dqb_itime = src->d_ino_timer;
132 dst->dqb_valid = QIF_ALL; 164 dst->dqb_valid = QIF_ALL;
133} 165}
134 166
@@ -136,7 +168,7 @@ static int quota_getquota(struct super_block *sb, int type, qid_t id,
136 void __user *addr) 168 void __user *addr)
137{ 169{
138 struct kqid qid; 170 struct kqid qid;
139 struct fs_disk_quota fdq; 171 struct qc_dqblk fdq;
140 struct if_dqblk idq; 172 struct if_dqblk idq;
141 int ret; 173 int ret;
142 174
@@ -154,36 +186,36 @@ static int quota_getquota(struct super_block *sb, int type, qid_t id,
154 return 0; 186 return 0;
155} 187}
156 188
157static void copy_from_if_dqblk(struct fs_disk_quota *dst, struct if_dqblk *src) 189static void copy_from_if_dqblk(struct qc_dqblk *dst, struct if_dqblk *src)
158{ 190{
159 dst->d_blk_hardlimit = src->dqb_bhardlimit; 191 dst->d_spc_hardlimit = qbtos(src->dqb_bhardlimit);
160 dst->d_blk_softlimit = src->dqb_bsoftlimit; 192 dst->d_spc_softlimit = qbtos(src->dqb_bsoftlimit);
161 dst->d_bcount = src->dqb_curspace; 193 dst->d_space = src->dqb_curspace;
162 dst->d_ino_hardlimit = src->dqb_ihardlimit; 194 dst->d_ino_hardlimit = src->dqb_ihardlimit;
163 dst->d_ino_softlimit = src->dqb_isoftlimit; 195 dst->d_ino_softlimit = src->dqb_isoftlimit;
164 dst->d_icount = src->dqb_curinodes; 196 dst->d_ino_count = src->dqb_curinodes;
165 dst->d_btimer = src->dqb_btime; 197 dst->d_spc_timer = src->dqb_btime;
166 dst->d_itimer = src->dqb_itime; 198 dst->d_ino_timer = src->dqb_itime;
167 199
168 dst->d_fieldmask = 0; 200 dst->d_fieldmask = 0;
169 if (src->dqb_valid & QIF_BLIMITS) 201 if (src->dqb_valid & QIF_BLIMITS)
170 dst->d_fieldmask |= FS_DQ_BSOFT | FS_DQ_BHARD; 202 dst->d_fieldmask |= QC_SPC_SOFT | QC_SPC_HARD;
171 if (src->dqb_valid & QIF_SPACE) 203 if (src->dqb_valid & QIF_SPACE)
172 dst->d_fieldmask |= FS_DQ_BCOUNT; 204 dst->d_fieldmask |= QC_SPACE;
173 if (src->dqb_valid & QIF_ILIMITS) 205 if (src->dqb_valid & QIF_ILIMITS)
174 dst->d_fieldmask |= FS_DQ_ISOFT | FS_DQ_IHARD; 206 dst->d_fieldmask |= QC_INO_SOFT | QC_INO_HARD;
175 if (src->dqb_valid & QIF_INODES) 207 if (src->dqb_valid & QIF_INODES)
176 dst->d_fieldmask |= FS_DQ_ICOUNT; 208 dst->d_fieldmask |= QC_INO_COUNT;
177 if (src->dqb_valid & QIF_BTIME) 209 if (src->dqb_valid & QIF_BTIME)
178 dst->d_fieldmask |= FS_DQ_BTIMER; 210 dst->d_fieldmask |= QC_SPC_TIMER;
179 if (src->dqb_valid & QIF_ITIME) 211 if (src->dqb_valid & QIF_ITIME)
180 dst->d_fieldmask |= FS_DQ_ITIMER; 212 dst->d_fieldmask |= QC_INO_TIMER;
181} 213}
182 214
183static int quota_setquota(struct super_block *sb, int type, qid_t id, 215static int quota_setquota(struct super_block *sb, int type, qid_t id,
184 void __user *addr) 216 void __user *addr)
185{ 217{
186 struct fs_disk_quota fdq; 218 struct qc_dqblk fdq;
187 struct if_dqblk idq; 219 struct if_dqblk idq;
188 struct kqid qid; 220 struct kqid qid;
189 221
@@ -198,15 +230,26 @@ static int quota_setquota(struct super_block *sb, int type, qid_t id,
198 return sb->s_qcop->set_dqblk(sb, qid, &fdq); 230 return sb->s_qcop->set_dqblk(sb, qid, &fdq);
199} 231}
200 232
201static int quota_setxstate(struct super_block *sb, int cmd, void __user *addr) 233static int quota_enable(struct super_block *sb, void __user *addr)
234{
235 __u32 flags;
236
237 if (copy_from_user(&flags, addr, sizeof(flags)))
238 return -EFAULT;
239 if (!sb->s_qcop->quota_enable)
240 return -ENOSYS;
241 return sb->s_qcop->quota_enable(sb, flags);
242}
243
244static int quota_disable(struct super_block *sb, void __user *addr)
202{ 245{
203 __u32 flags; 246 __u32 flags;
204 247
205 if (copy_from_user(&flags, addr, sizeof(flags))) 248 if (copy_from_user(&flags, addr, sizeof(flags)))
206 return -EFAULT; 249 return -EFAULT;
207 if (!sb->s_qcop->set_xstate) 250 if (!sb->s_qcop->quota_disable)
208 return -ENOSYS; 251 return -ENOSYS;
209 return sb->s_qcop->set_xstate(sb, flags, cmd); 252 return sb->s_qcop->quota_disable(sb, flags);
210} 253}
211 254
212static int quota_getxstate(struct super_block *sb, void __user *addr) 255static int quota_getxstate(struct super_block *sb, void __user *addr)
@@ -247,10 +290,78 @@ static int quota_getxstatev(struct super_block *sb, void __user *addr)
247 return ret; 290 return ret;
248} 291}
249 292
293/*
294 * XFS defines BBTOB and BTOBB macros inside fs/xfs/ and we cannot move them
295 * out of there as xfsprogs rely on definitions being in that header file. So
296 * just define same functions here for quota purposes.
297 */
298#define XFS_BB_SHIFT 9
299
300static inline u64 quota_bbtob(u64 blocks)
301{
302 return blocks << XFS_BB_SHIFT;
303}
304
305static inline u64 quota_btobb(u64 bytes)
306{
307 return (bytes + (1 << XFS_BB_SHIFT) - 1) >> XFS_BB_SHIFT;
308}
309
310static void copy_from_xfs_dqblk(struct qc_dqblk *dst, struct fs_disk_quota *src)
311{
312 dst->d_spc_hardlimit = quota_bbtob(src->d_blk_hardlimit);
313 dst->d_spc_softlimit = quota_bbtob(src->d_blk_softlimit);
314 dst->d_ino_hardlimit = src->d_ino_hardlimit;
315 dst->d_ino_softlimit = src->d_ino_softlimit;
316 dst->d_space = quota_bbtob(src->d_bcount);
317 dst->d_ino_count = src->d_icount;
318 dst->d_ino_timer = src->d_itimer;
319 dst->d_spc_timer = src->d_btimer;
320 dst->d_ino_warns = src->d_iwarns;
321 dst->d_spc_warns = src->d_bwarns;
322 dst->d_rt_spc_hardlimit = quota_bbtob(src->d_rtb_hardlimit);
323 dst->d_rt_spc_softlimit = quota_bbtob(src->d_rtb_softlimit);
324 dst->d_rt_space = quota_bbtob(src->d_rtbcount);
325 dst->d_rt_spc_timer = src->d_rtbtimer;
326 dst->d_rt_spc_warns = src->d_rtbwarns;
327 dst->d_fieldmask = 0;
328 if (src->d_fieldmask & FS_DQ_ISOFT)
329 dst->d_fieldmask |= QC_INO_SOFT;
330 if (src->d_fieldmask & FS_DQ_IHARD)
331 dst->d_fieldmask |= QC_INO_HARD;
332 if (src->d_fieldmask & FS_DQ_BSOFT)
333 dst->d_fieldmask |= QC_SPC_SOFT;
334 if (src->d_fieldmask & FS_DQ_BHARD)
335 dst->d_fieldmask |= QC_SPC_HARD;
336 if (src->d_fieldmask & FS_DQ_RTBSOFT)
337 dst->d_fieldmask |= QC_RT_SPC_SOFT;
338 if (src->d_fieldmask & FS_DQ_RTBHARD)
339 dst->d_fieldmask |= QC_RT_SPC_HARD;
340 if (src->d_fieldmask & FS_DQ_BTIMER)
341 dst->d_fieldmask |= QC_SPC_TIMER;
342 if (src->d_fieldmask & FS_DQ_ITIMER)
343 dst->d_fieldmask |= QC_INO_TIMER;
344 if (src->d_fieldmask & FS_DQ_RTBTIMER)
345 dst->d_fieldmask |= QC_RT_SPC_TIMER;
346 if (src->d_fieldmask & FS_DQ_BWARNS)
347 dst->d_fieldmask |= QC_SPC_WARNS;
348 if (src->d_fieldmask & FS_DQ_IWARNS)
349 dst->d_fieldmask |= QC_INO_WARNS;
350 if (src->d_fieldmask & FS_DQ_RTBWARNS)
351 dst->d_fieldmask |= QC_RT_SPC_WARNS;
352 if (src->d_fieldmask & FS_DQ_BCOUNT)
353 dst->d_fieldmask |= QC_SPACE;
354 if (src->d_fieldmask & FS_DQ_ICOUNT)
355 dst->d_fieldmask |= QC_INO_COUNT;
356 if (src->d_fieldmask & FS_DQ_RTBCOUNT)
357 dst->d_fieldmask |= QC_RT_SPACE;
358}
359
250static int quota_setxquota(struct super_block *sb, int type, qid_t id, 360static int quota_setxquota(struct super_block *sb, int type, qid_t id,
251 void __user *addr) 361 void __user *addr)
252{ 362{
253 struct fs_disk_quota fdq; 363 struct fs_disk_quota fdq;
364 struct qc_dqblk qdq;
254 struct kqid qid; 365 struct kqid qid;
255 366
256 if (copy_from_user(&fdq, addr, sizeof(fdq))) 367 if (copy_from_user(&fdq, addr, sizeof(fdq)))
@@ -260,13 +371,44 @@ static int quota_setxquota(struct super_block *sb, int type, qid_t id,
260 qid = make_kqid(current_user_ns(), type, id); 371 qid = make_kqid(current_user_ns(), type, id);
261 if (!qid_valid(qid)) 372 if (!qid_valid(qid))
262 return -EINVAL; 373 return -EINVAL;
263 return sb->s_qcop->set_dqblk(sb, qid, &fdq); 374 copy_from_xfs_dqblk(&qdq, &fdq);
375 return sb->s_qcop->set_dqblk(sb, qid, &qdq);
376}
377
378static void copy_to_xfs_dqblk(struct fs_disk_quota *dst, struct qc_dqblk *src,
379 int type, qid_t id)
380{
381 memset(dst, 0, sizeof(*dst));
382 dst->d_version = FS_DQUOT_VERSION;
383 dst->d_id = id;
384 if (type == USRQUOTA)
385 dst->d_flags = FS_USER_QUOTA;
386 else if (type == PRJQUOTA)
387 dst->d_flags = FS_PROJ_QUOTA;
388 else
389 dst->d_flags = FS_GROUP_QUOTA;
390 dst->d_blk_hardlimit = quota_btobb(src->d_spc_hardlimit);
391 dst->d_blk_softlimit = quota_btobb(src->d_spc_softlimit);
392 dst->d_ino_hardlimit = src->d_ino_hardlimit;
393 dst->d_ino_softlimit = src->d_ino_softlimit;
394 dst->d_bcount = quota_btobb(src->d_space);
395 dst->d_icount = src->d_ino_count;
396 dst->d_itimer = src->d_ino_timer;
397 dst->d_btimer = src->d_spc_timer;
398 dst->d_iwarns = src->d_ino_warns;
399 dst->d_bwarns = src->d_spc_warns;
400 dst->d_rtb_hardlimit = quota_btobb(src->d_rt_spc_hardlimit);
401 dst->d_rtb_softlimit = quota_btobb(src->d_rt_spc_softlimit);
402 dst->d_rtbcount = quota_btobb(src->d_rt_space);
403 dst->d_rtbtimer = src->d_rt_spc_timer;
404 dst->d_rtbwarns = src->d_rt_spc_warns;
264} 405}
265 406
266static int quota_getxquota(struct super_block *sb, int type, qid_t id, 407static int quota_getxquota(struct super_block *sb, int type, qid_t id,
267 void __user *addr) 408 void __user *addr)
268{ 409{
269 struct fs_disk_quota fdq; 410 struct fs_disk_quota fdq;
411 struct qc_dqblk qdq;
270 struct kqid qid; 412 struct kqid qid;
271 int ret; 413 int ret;
272 414
@@ -275,8 +417,11 @@ static int quota_getxquota(struct super_block *sb, int type, qid_t id,
275 qid = make_kqid(current_user_ns(), type, id); 417 qid = make_kqid(current_user_ns(), type, id);
276 if (!qid_valid(qid)) 418 if (!qid_valid(qid))
277 return -EINVAL; 419 return -EINVAL;
278 ret = sb->s_qcop->get_dqblk(sb, qid, &fdq); 420 ret = sb->s_qcop->get_dqblk(sb, qid, &qdq);
279 if (!ret && copy_to_user(addr, &fdq, sizeof(fdq))) 421 if (ret)
422 return ret;
423 copy_to_xfs_dqblk(&fdq, &qdq, type, id);
424 if (copy_to_user(addr, &fdq, sizeof(fdq)))
280 return -EFAULT; 425 return -EFAULT;
281 return ret; 426 return ret;
282} 427}
@@ -317,9 +462,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
317 case Q_QUOTAON: 462 case Q_QUOTAON:
318 return quota_quotaon(sb, type, cmd, id, path); 463 return quota_quotaon(sb, type, cmd, id, path);
319 case Q_QUOTAOFF: 464 case Q_QUOTAOFF:
320 if (!sb->s_qcop->quota_off) 465 return quota_quotaoff(sb, type);
321 return -ENOSYS;
322 return sb->s_qcop->quota_off(sb, type);
323 case Q_GETFMT: 466 case Q_GETFMT:
324 return quota_getfmt(sb, type, addr); 467 return quota_getfmt(sb, type, addr);
325 case Q_GETINFO: 468 case Q_GETINFO:
@@ -335,8 +478,9 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
335 return -ENOSYS; 478 return -ENOSYS;
336 return sb->s_qcop->quota_sync(sb, type); 479 return sb->s_qcop->quota_sync(sb, type);
337 case Q_XQUOTAON: 480 case Q_XQUOTAON:
481 return quota_enable(sb, addr);
338 case Q_XQUOTAOFF: 482 case Q_XQUOTAOFF:
339 return quota_setxstate(sb, cmd, addr); 483 return quota_disable(sb, addr);
340 case Q_XQUOTARM: 484 case Q_XQUOTARM:
341 return quota_rmxquota(sb, addr); 485 return quota_rmxquota(sb, addr);
342 case Q_XGETQSTAT: 486 case Q_XGETQSTAT:
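The copy_{to,from}_xfs_dqblk() helpers above bridge two unit systems:
struct qc_dqblk counts space in bytes, while struct fs_disk_quota keeps
the XFS convention of 512-byte "basic blocks". A minimal sketch of the
conversion they rely on; the XQM_BBSHIFT name and the round-up direction
follow the quota_btobb()/quota_bbtob() macros in fs/quota/quota.c, so
treat this as illustrative rather than a drop-in:

	#define XQM_BBSHIFT 9			/* 512-byte basic blocks */

	static u64 sketch_btobb(u64 bytes)	/* bytes -> blocks, round up */
	{
		return (bytes + (1ULL << XQM_BBSHIFT) - 1) >> XQM_BBSHIFT;
	}

	static u64 sketch_bbtob(u64 blocks)	/* blocks -> bytes */
	{
		return blocks << XQM_BBSHIFT;
	}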
diff --git a/fs/quota/quota_v1.c b/fs/quota/quota_v1.c
index 469c6848b322..8fe79beced5c 100644
--- a/fs/quota/quota_v1.c
+++ b/fs/quota/quota_v1.c
@@ -169,8 +169,8 @@ static int v1_read_file_info(struct super_block *sb, int type)
169 } 169 }
170 ret = 0; 170 ret = 0;
171 /* limits are stored as unsigned 32-bit data */ 171 /* limits are stored as unsigned 32-bit data */
172 dqopt->info[type].dqi_maxblimit = 0xffffffff; 172 dqopt->info[type].dqi_max_spc_limit = 0xffffffffULL << QUOTABLOCK_BITS;
173 dqopt->info[type].dqi_maxilimit = 0xffffffff; 173 dqopt->info[type].dqi_max_ino_limit = 0xffffffff;
174 dqopt->info[type].dqi_igrace = 174 dqopt->info[type].dqi_igrace =
175 dqblk.dqb_itime ? dqblk.dqb_itime : MAX_IQ_TIME; 175 dqblk.dqb_itime ? dqblk.dqb_itime : MAX_IQ_TIME;
176 dqopt->info[type].dqi_bgrace = 176 dqopt->info[type].dqi_bgrace =
diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c
index 02751ec695c5..9cb10d7197f7 100644
--- a/fs/quota/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -117,16 +117,17 @@ static int v2_read_file_info(struct super_block *sb, int type)
117 qinfo = info->dqi_priv; 117 qinfo = info->dqi_priv;
118 if (version == 0) { 118 if (version == 0) {
119 /* limits are stored as unsigned 32-bit data */ 119 /* limits are stored as unsigned 32-bit data */
120 info->dqi_maxblimit = 0xffffffff; 120 info->dqi_max_spc_limit = 0xffffffffULL << QUOTABLOCK_BITS;
121 info->dqi_maxilimit = 0xffffffff; 121 info->dqi_max_ino_limit = 0xffffffff;
122 } else { 122 } else {
123 /* used space is stored as unsigned 64-bit value */ 123 /* used space is stored as unsigned 64-bit value in bytes */
124 info->dqi_maxblimit = 0xffffffffffffffffULL; /* 2^64-1 */ 124 info->dqi_max_spc_limit = 0xffffffffffffffffULL; /* 2^64-1 */
125 info->dqi_maxilimit = 0xffffffffffffffffULL; 125 info->dqi_max_ino_limit = 0xffffffffffffffffULL;
126 } 126 }
127 info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace); 127 info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace);
128 info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace); 128 info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace);
129 info->dqi_flags = le32_to_cpu(dinfo.dqi_flags); 129 /* No flags currently supported */
130 info->dqi_flags = 0;
130 qinfo->dqi_sb = sb; 131 qinfo->dqi_sb = sb;
131 qinfo->dqi_type = type; 132 qinfo->dqi_type = type;
132 qinfo->dqi_blocks = le32_to_cpu(dinfo.dqi_blocks); 133 qinfo->dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
@@ -157,7 +158,8 @@ static int v2_write_file_info(struct super_block *sb, int type)
157 info->dqi_flags &= ~DQF_INFO_DIRTY; 158 info->dqi_flags &= ~DQF_INFO_DIRTY;
158 dinfo.dqi_bgrace = cpu_to_le32(info->dqi_bgrace); 159 dinfo.dqi_bgrace = cpu_to_le32(info->dqi_bgrace);
159 dinfo.dqi_igrace = cpu_to_le32(info->dqi_igrace); 160 dinfo.dqi_igrace = cpu_to_le32(info->dqi_igrace);
160 dinfo.dqi_flags = cpu_to_le32(info->dqi_flags & DQF_MASK); 161 /* No flags currently supported */
162 dinfo.dqi_flags = cpu_to_le32(0);
161 spin_unlock(&dq_data_lock); 163 spin_unlock(&dq_data_lock);
162 dinfo.dqi_blocks = cpu_to_le32(qinfo->dqi_blocks); 164 dinfo.dqi_blocks = cpu_to_le32(qinfo->dqi_blocks);
163 dinfo.dqi_free_blk = cpu_to_le32(qinfo->dqi_free_blk); 165 dinfo.dqi_free_blk = cpu_to_le32(qinfo->dqi_free_blk);
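In both format versions above the maximum space limit changes meaning:
dqi_max_spc_limit is expressed in bytes, where the old dqi_maxblimit
counted quota blocks. With QUOTABLOCK_BITS = 10 (1 KiB quota blocks), a
quick check of the new v1/v2r0 cap:

	/* old cap: 0xffffffff quota blocks; new cap, in bytes: */
	u64 max_spc = 0xffffffffULL << 10;
	/* == 4398046510080 == 4 TiB - 1 KiB; the inode cap stays 32-bit */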
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index bbafbde3471a..f6ab41b39612 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -34,7 +34,14 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
34 unsigned long flags); 34 unsigned long flags);
35static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma); 35static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma);
36 36
37static unsigned ramfs_mmap_capabilities(struct file *file)
38{
39 return NOMMU_MAP_DIRECT | NOMMU_MAP_COPY | NOMMU_MAP_READ |
40 NOMMU_MAP_WRITE | NOMMU_MAP_EXEC;
41}
42
37const struct file_operations ramfs_file_operations = { 43const struct file_operations ramfs_file_operations = {
44 .mmap_capabilities = ramfs_mmap_capabilities,
38 .mmap = ramfs_nommu_mmap, 45 .mmap = ramfs_nommu_mmap,
39 .get_unmapped_area = ramfs_nommu_get_unmapped_area, 46 .get_unmapped_area = ramfs_nommu_get_unmapped_area,
40 .read = new_sync_read, 47 .read = new_sync_read,
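The new ->mmap_capabilities hook moves what used to be advertised
through a static backing_dev_info (removed in the fs/ramfs/inode.c hunk
below) onto the file itself. A hedged sketch of how a nommu caller
might consult it; the helper name is hypothetical and the fallback
value an assumption:

	static unsigned sketch_query_caps(struct file *file)
	{
		/* prefer the per-file hook when the fs provides one */
		if (file->f_op->mmap_capabilities)
			return file->f_op->mmap_capabilities(file);
		/* otherwise assume only private copies can be mapped */
		return NOMMU_MAP_COPY;
	}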
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index d365b1c4eb3c..889d558b4e05 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -50,14 +50,6 @@ static const struct address_space_operations ramfs_aops = {
50 .set_page_dirty = __set_page_dirty_no_writeback, 50 .set_page_dirty = __set_page_dirty_no_writeback,
51}; 51};
52 52
53static struct backing_dev_info ramfs_backing_dev_info = {
54 .name = "ramfs",
55 .ra_pages = 0, /* No readahead */
56 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK |
57 BDI_CAP_MAP_DIRECT | BDI_CAP_MAP_COPY |
58 BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP,
59};
60
61struct inode *ramfs_get_inode(struct super_block *sb, 53struct inode *ramfs_get_inode(struct super_block *sb,
62 const struct inode *dir, umode_t mode, dev_t dev) 54 const struct inode *dir, umode_t mode, dev_t dev)
63{ 55{
@@ -67,7 +59,6 @@ struct inode *ramfs_get_inode(struct super_block *sb,
67 inode->i_ino = get_next_ino(); 59 inode->i_ino = get_next_ino();
68 inode_init_owner(inode, dir, mode); 60 inode_init_owner(inode, dir, mode);
69 inode->i_mapping->a_ops = &ramfs_aops; 61 inode->i_mapping->a_ops = &ramfs_aops;
70 inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
71 mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); 62 mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
72 mapping_set_unevictable(inode->i_mapping); 63 mapping_set_unevictable(inode->i_mapping);
73 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 64 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
@@ -267,19 +258,9 @@ static struct file_system_type ramfs_fs_type = {
267int __init init_ramfs_fs(void) 258int __init init_ramfs_fs(void)
268{ 259{
269 static unsigned long once; 260 static unsigned long once;
270 int err;
271 261
272 if (test_and_set_bit(0, &once)) 262 if (test_and_set_bit(0, &once))
273 return 0; 263 return 0;
274 264 return register_filesystem(&ramfs_fs_type);
275 err = bdi_init(&ramfs_backing_dev_info);
276 if (err)
277 return err;
278
279 err = register_filesystem(&ramfs_fs_type);
280 if (err)
281 bdi_destroy(&ramfs_backing_dev_info);
282
283 return err;
284} 265}
285fs_initcall(init_ramfs_fs); 266fs_initcall(init_ramfs_fs);
diff --git a/fs/read_write.c b/fs/read_write.c
index c0805c93b6fa..8e1b68786d66 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -333,6 +333,52 @@ out_putf:
333} 333}
334#endif 334#endif
335 335
336ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos)
337{
338 struct kiocb kiocb;
339 ssize_t ret;
340
341 if (!file->f_op->read_iter)
342 return -EINVAL;
343
344 init_sync_kiocb(&kiocb, file);
345 kiocb.ki_pos = *ppos;
346 kiocb.ki_nbytes = iov_iter_count(iter);
347
348 iter->type |= READ;
349 ret = file->f_op->read_iter(&kiocb, iter);
350 if (ret == -EIOCBQUEUED)
351 ret = wait_on_sync_kiocb(&kiocb);
352
353 if (ret > 0)
354 *ppos = kiocb.ki_pos;
355 return ret;
356}
357EXPORT_SYMBOL(vfs_iter_read);
358
359ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos)
360{
361 struct kiocb kiocb;
362 ssize_t ret;
363
364 if (!file->f_op->write_iter)
365 return -EINVAL;
366
367 init_sync_kiocb(&kiocb, file);
368 kiocb.ki_pos = *ppos;
369 kiocb.ki_nbytes = iov_iter_count(iter);
370
371 iter->type |= WRITE;
372 ret = file->f_op->write_iter(&kiocb, iter);
373 if (ret == -EIOCBQUEUED)
374 ret = wait_on_sync_kiocb(&kiocb);
375
376 if (ret > 0)
377 *ppos = kiocb.ki_pos;
378 return ret;
379}
380EXPORT_SYMBOL(vfs_iter_write);
381
336/* 382/*
337 * rw_verify_area doesn't like huge counts. We limit 383 * rw_verify_area doesn't like huge counts. We limit
338 * them to something that fits in "int" so that others 384 * them to something that fits in "int" so that others
@@ -358,7 +404,7 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t
358 return retval; 404 return retval;
359 } 405 }
360 406
361 if (unlikely(inode->i_flock && mandatory_lock(inode))) { 407 if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
362 retval = locks_mandatory_area( 408 retval = locks_mandatory_area(
363 read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE, 409 read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
364 inode, file, pos, count); 410 inode, file, pos, count);
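A usage sketch for the freshly exported vfs_iter_read()/vfs_iter_write()
pair (the fs/splice.c conversion below is the in-tree consumer): wrap a
kernel buffer in an iov_iter and let the helper drive ->write_iter(),
including the -EIOCBQUEUED wait. The kvec setup assumes the
iov_iter_kvec() helper of this era; the function name is illustrative:

	static ssize_t sketch_kernel_write(struct file *file, void *buf,
					   size_t len, loff_t *pos)
	{
		struct kvec kv = { .iov_base = buf, .iov_len = len };
		struct iov_iter iter;

		iov_iter_kvec(&iter, ITER_KVEC | WRITE, &kv, 1, len);
		return vfs_iter_write(file, &iter, pos);
	}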
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index a7eec9888f10..e72401e1f995 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -2766,7 +2766,7 @@ static int reiserfs_write_begin(struct file *file,
2766 int old_ref = 0; 2766 int old_ref = 0;
2767 2767
2768 inode = mapping->host; 2768 inode = mapping->host;
2769 *fsdata = 0; 2769 *fsdata = NULL;
2770 if (flags & AOP_FLAG_CONT_EXPAND && 2770 if (flags & AOP_FLAG_CONT_EXPAND &&
2771 (pos & (inode->i_sb->s_blocksize - 1)) == 0) { 2771 (pos & (inode->i_sb->s_blocksize - 1)) == 0) {
2772 pos ++; 2772 pos ++;
diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c
index ea06c7554860..7da9e2153953 100644
--- a/fs/romfs/mmap-nommu.c
+++ b/fs/romfs/mmap-nommu.c
@@ -70,6 +70,15 @@ static int romfs_mmap(struct file *file, struct vm_area_struct *vma)
70 return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -ENOSYS; 70 return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -ENOSYS;
71} 71}
72 72
73static unsigned romfs_mmap_capabilities(struct file *file)
74{
75 struct mtd_info *mtd = file_inode(file)->i_sb->s_mtd;
76
77 if (!mtd)
78 return NOMMU_MAP_COPY;
79 return mtd_mmap_capabilities(mtd);
80}
81
73const struct file_operations romfs_ro_fops = { 82const struct file_operations romfs_ro_fops = {
74 .llseek = generic_file_llseek, 83 .llseek = generic_file_llseek,
75 .read = new_sync_read, 84 .read = new_sync_read,
@@ -77,4 +86,5 @@ const struct file_operations romfs_ro_fops = {
77 .splice_read = generic_file_splice_read, 86 .splice_read = generic_file_splice_read,
78 .mmap = romfs_mmap, 87 .mmap = romfs_mmap,
79 .get_unmapped_area = romfs_get_unmapped_area, 88 .get_unmapped_area = romfs_get_unmapped_area,
89 .mmap_capabilities = romfs_mmap_capabilities,
80}; 90};
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index e98dd88197d5..268733cda397 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -355,9 +355,6 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos)
355 case ROMFH_REG: 355 case ROMFH_REG:
356 i->i_fop = &romfs_ro_fops; 356 i->i_fop = &romfs_ro_fops;
357 i->i_data.a_ops = &romfs_aops; 357 i->i_data.a_ops = &romfs_aops;
358 if (i->i_sb->s_mtd)
359 i->i_data.backing_dev_info =
360 i->i_sb->s_mtd->backing_dev_info;
361 if (nextfh & ROMFH_EXEC) 358 if (nextfh & ROMFH_EXEC)
362 mode |= S_IXUGO; 359 mode |= S_IXUGO;
363 break; 360 break;
diff --git a/fs/select.c b/fs/select.c
index 467bb1cb3ea5..f684c750e08a 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -971,7 +971,7 @@ SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
971 if (ret == -EINTR) { 971 if (ret == -EINTR) {
972 struct restart_block *restart_block; 972 struct restart_block *restart_block;
973 973
974 restart_block = &current_thread_info()->restart_block; 974 restart_block = &current->restart_block;
975 restart_block->fn = do_restart_poll; 975 restart_block->fn = do_restart_poll;
976 restart_block->poll.ufds = ufds; 976 restart_block->poll.ufds = ufds;
977 restart_block->poll.nfds = nfds; 977 restart_block->poll.nfds = nfds;
diff --git a/fs/seq_file.c b/fs/seq_file.c
index dbf3a59c86bb..555f82155be8 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -539,38 +539,6 @@ int seq_dentry(struct seq_file *m, struct dentry *dentry, const char *esc)
539 return res; 539 return res;
540} 540}
541 541
542int seq_bitmap(struct seq_file *m, const unsigned long *bits,
543 unsigned int nr_bits)
544{
545 if (m->count < m->size) {
546 int len = bitmap_scnprintf(m->buf + m->count,
547 m->size - m->count, bits, nr_bits);
548 if (m->count + len < m->size) {
549 m->count += len;
550 return 0;
551 }
552 }
553 seq_set_overflow(m);
554 return -1;
555}
556EXPORT_SYMBOL(seq_bitmap);
557
558int seq_bitmap_list(struct seq_file *m, const unsigned long *bits,
559 unsigned int nr_bits)
560{
561 if (m->count < m->size) {
562 int len = bitmap_scnlistprintf(m->buf + m->count,
563 m->size - m->count, bits, nr_bits);
564 if (m->count + len < m->size) {
565 m->count += len;
566 return 0;
567 }
568 }
569 seq_set_overflow(m);
570 return -1;
571}
572EXPORT_SYMBOL(seq_bitmap_list);
573
574static void *single_start(struct seq_file *p, loff_t *pos) 542static void *single_start(struct seq_file *p, loff_t *pos)
575{ 543{
576 return NULL + (*pos == 0); 544 return NULL + (*pos == 0);
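seq_bitmap() and seq_bitmap_list() are deleted outright here; a
migration sketch for former callers, assuming the "%*pb"/"%*pbl"
vsnprintf extensions from the same development cycle are available:

	seq_printf(m, "%*pb",  nr_bits, bits);	/* hex mask, e.g. "f0f"     */
	seq_printf(m, "%*pbl", nr_bits, bits);	/* range list, e.g. "0-3,8" */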
diff --git a/fs/splice.c b/fs/splice.c
index 75c6058eabf2..7968da96bebb 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -961,7 +961,6 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
961 splice_from_pipe_begin(&sd); 961 splice_from_pipe_begin(&sd);
962 while (sd.total_len) { 962 while (sd.total_len) {
963 struct iov_iter from; 963 struct iov_iter from;
964 struct kiocb kiocb;
965 size_t left; 964 size_t left;
966 int n, idx; 965 int n, idx;
967 966
@@ -1005,29 +1004,15 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
1005 left -= this_len; 1004 left -= this_len;
1006 } 1005 }
1007 1006
1008 /* ... iov_iter */ 1007 iov_iter_bvec(&from, ITER_BVEC | WRITE, array, n,
1009 from.type = ITER_BVEC | WRITE; 1008 sd.total_len - left);
1010 from.bvec = array; 1009 ret = vfs_iter_write(out, &from, &sd.pos);
1011 from.nr_segs = n;
1012 from.count = sd.total_len - left;
1013 from.iov_offset = 0;
1014
1015 /* ... and iocb */
1016 init_sync_kiocb(&kiocb, out);
1017 kiocb.ki_pos = sd.pos;
1018 kiocb.ki_nbytes = sd.total_len - left;
1019
1020 /* now, send it */
1021 ret = out->f_op->write_iter(&kiocb, &from);
1022 if (-EIOCBQUEUED == ret)
1023 ret = wait_on_sync_kiocb(&kiocb);
1024
1025 if (ret <= 0) 1010 if (ret <= 0)
1026 break; 1011 break;
1027 1012
1028 sd.num_spliced += ret; 1013 sd.num_spliced += ret;
1029 sd.total_len -= ret; 1014 sd.total_len -= ret;
1030 *ppos = sd.pos = kiocb.ki_pos; 1015 *ppos = sd.pos;
1031 1016
1032 /* dismiss the fully eaten buffers, adjust the partial one */ 1017 /* dismiss the fully eaten buffers, adjust the partial one */
1033 while (ret) { 1018 while (ret) {
diff --git a/fs/super.c b/fs/super.c
index eae088f6aaae..65a53efc1cf4 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -36,8 +36,8 @@
36#include "internal.h" 36#include "internal.h"
37 37
38 38
39LIST_HEAD(super_blocks); 39static LIST_HEAD(super_blocks);
40DEFINE_SPINLOCK(sb_lock); 40static DEFINE_SPINLOCK(sb_lock);
41 41
42static char *sb_writers_name[SB_FREEZE_LEVELS] = { 42static char *sb_writers_name[SB_FREEZE_LEVELS] = {
43 "sb_writers", 43 "sb_writers",
@@ -75,10 +75,10 @@ static unsigned long super_cache_scan(struct shrinker *shrink,
75 return SHRINK_STOP; 75 return SHRINK_STOP;
76 76
77 if (sb->s_op->nr_cached_objects) 77 if (sb->s_op->nr_cached_objects)
78 fs_objects = sb->s_op->nr_cached_objects(sb, sc->nid); 78 fs_objects = sb->s_op->nr_cached_objects(sb, sc);
79 79
80 inodes = list_lru_count_node(&sb->s_inode_lru, sc->nid); 80 inodes = list_lru_shrink_count(&sb->s_inode_lru, sc);
81 dentries = list_lru_count_node(&sb->s_dentry_lru, sc->nid); 81 dentries = list_lru_shrink_count(&sb->s_dentry_lru, sc);
82 total_objects = dentries + inodes + fs_objects + 1; 82 total_objects = dentries + inodes + fs_objects + 1;
83 if (!total_objects) 83 if (!total_objects)
84 total_objects = 1; 84 total_objects = 1;
@@ -86,19 +86,23 @@ static unsigned long super_cache_scan(struct shrinker *shrink,
86 /* proportion the scan between the caches */ 86 /* proportion the scan between the caches */
87 dentries = mult_frac(sc->nr_to_scan, dentries, total_objects); 87 dentries = mult_frac(sc->nr_to_scan, dentries, total_objects);
88 inodes = mult_frac(sc->nr_to_scan, inodes, total_objects); 88 inodes = mult_frac(sc->nr_to_scan, inodes, total_objects);
89 fs_objects = mult_frac(sc->nr_to_scan, fs_objects, total_objects);
89 90
90 /* 91 /*
91 * prune the dcache first as the icache is pinned by it, then 92 * prune the dcache first as the icache is pinned by it, then
92 * prune the icache, followed by the filesystem specific caches 93 * prune the icache, followed by the filesystem specific caches
94 *
95 * Ensure that we always scan at least one object - memcg kmem
96 * accounting uses this to fully empty the caches.
93 */ 97 */
94 freed = prune_dcache_sb(sb, dentries, sc->nid); 98 sc->nr_to_scan = dentries + 1;
95 freed += prune_icache_sb(sb, inodes, sc->nid); 99 freed = prune_dcache_sb(sb, sc);
100 sc->nr_to_scan = inodes + 1;
101 freed += prune_icache_sb(sb, sc);
96 102
97 if (fs_objects) { 103 if (fs_objects) {
98 fs_objects = mult_frac(sc->nr_to_scan, fs_objects, 104 sc->nr_to_scan = fs_objects + 1;
99 total_objects); 105 freed += sb->s_op->free_cached_objects(sb, sc);
100 freed += sb->s_op->free_cached_objects(sb, fs_objects,
101 sc->nid);
102 } 106 }
103 107
104 drop_super(sb); 108 drop_super(sb);
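A worked instance of the proportioning in super_cache_scan() above,
with made-up counts (dentries = 600, inodes = 300, fs_objects = 100,
sc->nr_to_scan = 128, so total_objects = 1001):

	/*
	 * dentry target = mult_frac(128, 600, 1001) = 76   (then + 1)
	 * inode  target = mult_frac(128, 300, 1001) = 38   (then + 1)
	 * fs     target = mult_frac(128, 100, 1001) = 12   (then + 1)
	 *
	 * The "+ 1" guarantees at least one object is scanned per
	 * cache, which memcg kmem accounting relies on to be able to
	 * drain a cache completely.
	 */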
@@ -118,17 +122,14 @@ static unsigned long super_cache_count(struct shrinker *shrink,
118 * scalability bottleneck. The counts could get updated 122 * scalability bottleneck. The counts could get updated
119 * between super_cache_count and super_cache_scan anyway. 123 * between super_cache_count and super_cache_scan anyway.
120 * Call to super_cache_count with shrinker_rwsem held 124 * Call to super_cache_count with shrinker_rwsem held
121 * ensures the safety of call to list_lru_count_node() and 125 * ensures the safety of call to list_lru_shrink_count() and
122 * s_op->nr_cached_objects(). 126 * s_op->nr_cached_objects().
123 */ 127 */
124 if (sb->s_op && sb->s_op->nr_cached_objects) 128 if (sb->s_op && sb->s_op->nr_cached_objects)
125 total_objects = sb->s_op->nr_cached_objects(sb, 129 total_objects = sb->s_op->nr_cached_objects(sb, sc);
126 sc->nid);
127 130
128 total_objects += list_lru_count_node(&sb->s_dentry_lru, 131 total_objects += list_lru_shrink_count(&sb->s_dentry_lru, sc);
129 sc->nid); 132 total_objects += list_lru_shrink_count(&sb->s_inode_lru, sc);
130 total_objects += list_lru_count_node(&sb->s_inode_lru,
131 sc->nid);
132 133
133 total_objects = vfs_pressure_ratio(total_objects); 134 total_objects = vfs_pressure_ratio(total_objects);
134 return total_objects; 135 return total_objects;
@@ -185,15 +186,15 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
185 } 186 }
186 init_waitqueue_head(&s->s_writers.wait); 187 init_waitqueue_head(&s->s_writers.wait);
187 init_waitqueue_head(&s->s_writers.wait_unfrozen); 188 init_waitqueue_head(&s->s_writers.wait_unfrozen);
189 s->s_bdi = &noop_backing_dev_info;
188 s->s_flags = flags; 190 s->s_flags = flags;
189 s->s_bdi = &default_backing_dev_info;
190 INIT_HLIST_NODE(&s->s_instances); 191 INIT_HLIST_NODE(&s->s_instances);
191 INIT_HLIST_BL_HEAD(&s->s_anon); 192 INIT_HLIST_BL_HEAD(&s->s_anon);
192 INIT_LIST_HEAD(&s->s_inodes); 193 INIT_LIST_HEAD(&s->s_inodes);
193 194
194 if (list_lru_init(&s->s_dentry_lru)) 195 if (list_lru_init_memcg(&s->s_dentry_lru))
195 goto fail; 196 goto fail;
196 if (list_lru_init(&s->s_inode_lru)) 197 if (list_lru_init_memcg(&s->s_inode_lru))
197 goto fail; 198 goto fail;
198 199
199 init_rwsem(&s->s_umount); 200 init_rwsem(&s->s_umount);
@@ -229,7 +230,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
229 s->s_shrink.scan_objects = super_cache_scan; 230 s->s_shrink.scan_objects = super_cache_scan;
230 s->s_shrink.count_objects = super_cache_count; 231 s->s_shrink.count_objects = super_cache_count;
231 s->s_shrink.batch = 1024; 232 s->s_shrink.batch = 1024;
232 s->s_shrink.flags = SHRINKER_NUMA_AWARE; 233 s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE;
233 return s; 234 return s;
234 235
235fail: 236fail:
@@ -284,6 +285,14 @@ void deactivate_locked_super(struct super_block *s)
284 unregister_shrinker(&s->s_shrink); 285 unregister_shrinker(&s->s_shrink);
285 fs->kill_sb(s); 286 fs->kill_sb(s);
286 287
288 /*
289 * Since list_lru_destroy() may sleep, we cannot call it from
290 * put_super(), where we hold the sb_lock. Therefore we destroy
291 * the lru lists right now.
292 */
293 list_lru_destroy(&s->s_dentry_lru);
294 list_lru_destroy(&s->s_inode_lru);
295
287 put_filesystem(fs); 296 put_filesystem(fs);
288 put_super(s); 297 put_super(s);
289 } else { 298 } else {
@@ -706,9 +715,9 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
706 remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY); 715 remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
707 716
708 if (remount_ro) { 717 if (remount_ro) {
709 if (sb->s_pins.first) { 718 if (!hlist_empty(&sb->s_pins)) {
710 up_write(&sb->s_umount); 719 up_write(&sb->s_umount);
711 sb_pin_kill(sb); 720 group_pin_kill(&sb->s_pins);
712 down_write(&sb->s_umount); 721 down_write(&sb->s_umount);
713 if (!sb->s_root) 722 if (!sb->s_root)
714 return 0; 723 return 0;
@@ -863,10 +872,7 @@ EXPORT_SYMBOL(free_anon_bdev);
863 872
864int set_anon_super(struct super_block *s, void *data) 873int set_anon_super(struct super_block *s, void *data)
865{ 874{
866 int error = get_anon_bdev(&s->s_dev); 875 return get_anon_bdev(&s->s_dev);
867 if (!error)
868 s->s_bdi = &noop_backing_dev_info;
869 return error;
870} 876}
871 877
872EXPORT_SYMBOL(set_anon_super); 878EXPORT_SYMBOL(set_anon_super);
@@ -1111,7 +1117,6 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
1111 sb = root->d_sb; 1117 sb = root->d_sb;
1112 BUG_ON(!sb); 1118 BUG_ON(!sb);
1113 WARN_ON(!sb->s_bdi); 1119 WARN_ON(!sb->s_bdi);
1114 WARN_ON(sb->s_bdi == &default_backing_dev_info);
1115 sb->s_flags |= MS_BORN; 1120 sb->s_flags |= MS_BORN;
1116 1121
1117 error = security_sb_kern_mount(sb, flags, secdata); 1122 error = security_sb_kern_mount(sb, flags, secdata);
diff --git a/fs/sync.c b/fs/sync.c
index 01d9f18a70b5..fbc98ee62044 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -177,8 +177,16 @@ SYSCALL_DEFINE1(syncfs, int, fd)
177 */ 177 */
178int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync) 178int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
179{ 179{
180 struct inode *inode = file->f_mapping->host;
181
180 if (!file->f_op->fsync) 182 if (!file->f_op->fsync)
181 return -EINVAL; 183 return -EINVAL;
184 if (!datasync && (inode->i_state & I_DIRTY_TIME)) {
185 spin_lock(&inode->i_lock);
186 inode->i_state &= ~I_DIRTY_TIME;
187 spin_unlock(&inode->i_lock);
188 mark_inode_dirty_sync(inode);
189 }
182 return file->f_op->fsync(file, start, end, datasync); 190 return file->f_op->fsync(file, start, end, datasync);
183} 191}
184EXPORT_SYMBOL(vfs_fsync_range); 192EXPORT_SYMBOL(vfs_fsync_range);
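The new branch in vfs_fsync_range() covers lazytime inodes: I_DIRTY_TIME
marks an inode whose only dirt is its timestamps, so fsync() must
promote it to a truly dirty state before calling ->fsync(), while
fdatasync() may skip the promotion. The same test as a one-line
predicate (helper name hypothetical):

	static bool sketch_needs_time_writeback(struct inode *inode,
						int datasync)
	{
		return !datasync && (inode->i_state & I_DIRTY_TIME);
	}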
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index dfe928a9540f..7c2867b44141 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -295,7 +295,7 @@ int sysfs_add_file_mode_ns(struct kernfs_node *parent,
295 key = attr->key ?: (struct lock_class_key *)&attr->skey; 295 key = attr->key ?: (struct lock_class_key *)&attr->skey;
296#endif 296#endif
297 kn = __kernfs_create_file(parent, attr->name, mode & 0777, size, ops, 297 kn = __kernfs_create_file(parent, attr->name, mode & 0777, size, ops,
298 (void *)attr, ns, true, key); 298 (void *)attr, ns, key);
299 if (IS_ERR(kn)) { 299 if (IS_ERR(kn)) {
300 if (PTR_ERR(kn) == -EEXIST) 300 if (PTR_ERR(kn) == -EEXIST)
301 sysfs_warn_dup(parent, attr->name); 301 sysfs_warn_dup(parent, attr->name);
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 7d2a860ba788..2554d8835b48 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -99,7 +99,7 @@ static int internal_create_group(struct kobject *kobj, int update,
99 return -EINVAL; 99 return -EINVAL;
100 if (!grp->attrs && !grp->bin_attrs) { 100 if (!grp->attrs && !grp->bin_attrs) {
101 WARN(1, "sysfs: (bin_)attrs not set by subsystem for group: %s/%s\n", 101 WARN(1, "sysfs: (bin_)attrs not set by subsystem for group: %s/%s\n",
102 kobj->name, grp->name ? "" : grp->name); 102 kobj->name, grp->name ?: "");
103 return -EINVAL; 103 return -EINVAL;
104 } 104 }
105 if (grp->name) { 105 if (grp->name) {
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 7ed13e1e216a..4cfb3e82c56f 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -2032,6 +2032,8 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr,
2032 long long blk_offs; 2032 long long blk_offs;
2033 struct ubifs_data_node *dn = node; 2033 struct ubifs_data_node *dn = node;
2034 2034
2035 ubifs_assert(zbr->len >= UBIFS_DATA_NODE_SZ);
2036
2035 /* 2037 /*
2036 * Search the inode node this data node belongs to and insert 2038 * Search the inode node this data node belongs to and insert
2037 * it to the RB-tree of inodes. 2039 * it to the RB-tree of inodes.
@@ -2060,6 +2062,8 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr,
2060 struct ubifs_dent_node *dent = node; 2062 struct ubifs_dent_node *dent = node;
2061 struct fsck_inode *fscki1; 2063 struct fsck_inode *fscki1;
2062 2064
2065 ubifs_assert(zbr->len >= UBIFS_DENT_NODE_SZ);
2066
2063 err = ubifs_validate_entry(c, dent); 2067 err = ubifs_validate_entry(c, dent);
2064 if (err) 2068 if (err)
2065 goto out_dump; 2069 goto out_dump;
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index ea41649e4ca5..0fa6c803992e 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -108,8 +108,6 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
108 inode->i_mtime = inode->i_atime = inode->i_ctime = 108 inode->i_mtime = inode->i_atime = inode->i_ctime =
109 ubifs_current_time(inode); 109 ubifs_current_time(inode);
110 inode->i_mapping->nrpages = 0; 110 inode->i_mapping->nrpages = 0;
111 /* Disable readahead */
112 inode->i_mapping->backing_dev_info = &c->bdi;
113 111
114 switch (mode & S_IFMT) { 112 switch (mode & S_IFMT) {
115 case S_IFREG: 113 case S_IFREG:
@@ -272,6 +270,10 @@ static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
272 goto out_budg; 270 goto out_budg;
273 } 271 }
274 272
273 err = ubifs_init_security(dir, inode, &dentry->d_name);
274 if (err)
275 goto out_cancel;
276
275 mutex_lock(&dir_ui->ui_mutex); 277 mutex_lock(&dir_ui->ui_mutex);
276 dir->i_size += sz_change; 278 dir->i_size += sz_change;
277 dir_ui->ui_size = dir->i_size; 279 dir_ui->ui_size = dir->i_size;
@@ -728,6 +730,10 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
728 goto out_budg; 730 goto out_budg;
729 } 731 }
730 732
733 err = ubifs_init_security(dir, inode, &dentry->d_name);
734 if (err)
735 goto out_cancel;
736
731 mutex_lock(&dir_ui->ui_mutex); 737 mutex_lock(&dir_ui->ui_mutex);
732 insert_inode_hash(inode); 738 insert_inode_hash(inode);
733 inc_nlink(inode); 739 inc_nlink(inode);
@@ -808,6 +814,10 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry,
808 ui->data = dev; 814 ui->data = dev;
809 ui->data_len = devlen; 815 ui->data_len = devlen;
810 816
817 err = ubifs_init_security(dir, inode, &dentry->d_name);
818 if (err)
819 goto out_cancel;
820
811 mutex_lock(&dir_ui->ui_mutex); 821 mutex_lock(&dir_ui->ui_mutex);
812 dir->i_size += sz_change; 822 dir->i_size += sz_change;
813 dir_ui->ui_size = dir->i_size; 823 dir_ui->ui_size = dir->i_size;
@@ -884,6 +894,10 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry,
884 ui->data_len = len; 894 ui->data_len = len;
885 inode->i_size = ubifs_inode(inode)->ui_size = len; 895 inode->i_size = ubifs_inode(inode)->ui_size = len;
886 896
897 err = ubifs_init_security(dir, inode, &dentry->d_name);
898 if (err)
899 goto out_cancel;
900
887 mutex_lock(&dir_ui->ui_mutex); 901 mutex_lock(&dir_ui->ui_mutex);
888 dir->i_size += sz_change; 902 dir->i_size += sz_change;
889 dir_ui->ui_size = dir->i_size; 903 dir_ui->ui_size = dir->i_size;
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 538519ee37d9..e627c0acf626 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1536,7 +1536,6 @@ static const struct vm_operations_struct ubifs_file_vm_ops = {
1536 .fault = filemap_fault, 1536 .fault = filemap_fault,
1537 .map_pages = filemap_map_pages, 1537 .map_pages = filemap_map_pages,
1538 .page_mkwrite = ubifs_vm_page_mkwrite, 1538 .page_mkwrite = ubifs_vm_page_mkwrite,
1539 .remap_pages = generic_file_remap_pages,
1540}; 1539};
1541 1540
1542static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma) 1541static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma)
@@ -1574,6 +1573,10 @@ const struct inode_operations ubifs_symlink_inode_operations = {
1574 .follow_link = ubifs_follow_link, 1573 .follow_link = ubifs_follow_link,
1575 .setattr = ubifs_setattr, 1574 .setattr = ubifs_setattr,
1576 .getattr = ubifs_getattr, 1575 .getattr = ubifs_getattr,
1576 .setxattr = ubifs_setxattr,
1577 .getxattr = ubifs_getxattr,
1578 .listxattr = ubifs_listxattr,
1579 .removexattr = ubifs_removexattr,
1577}; 1580};
1578 1581
1579const struct file_operations ubifs_file_operations = { 1582const struct file_operations ubifs_file_operations = {
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 3187925e9879..9b40a1c5e160 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -1028,9 +1028,22 @@ int ubifs_replay_journal(struct ubifs_info *c)
1028 1028
1029 do { 1029 do {
1030 err = replay_log_leb(c, lnum, 0, c->sbuf); 1030 err = replay_log_leb(c, lnum, 0, c->sbuf);
1031 if (err == 1) 1031 if (err == 1) {
1032 /* We hit the end of the log */ 1032 if (lnum != c->lhead_lnum)
1033 break; 1033 /* We hit the end of the log */
1034 break;
1035
1036 /*
1037 * The head of the log must always start with the
1038 * "commit start" node on a properly formatted UBIFS.
1039 * But we found no nodes at all, which means that
 1040 * something went wrong and we cannot proceed mounting
1041 * the file-system.
1042 */
1043 ubifs_err("no UBIFS nodes found at the log head LEB %d:%d, possibly corrupted",
1044 lnum, 0);
1045 err = -EINVAL;
1046 }
1034 if (err) 1047 if (err)
1035 goto out; 1048 goto out;
1036 lnum = ubifs_next_log_lnum(c, lnum); 1049 lnum = ubifs_next_log_lnum(c, lnum);
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 106bf20629ce..93e946561c5c 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -156,9 +156,6 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum)
156 if (err) 156 if (err)
157 goto out_invalid; 157 goto out_invalid;
158 158
159 /* Disable read-ahead */
160 inode->i_mapping->backing_dev_info = &c->bdi;
161
162 switch (inode->i_mode & S_IFMT) { 159 switch (inode->i_mode & S_IFMT) {
163 case S_IFREG: 160 case S_IFREG:
164 inode->i_mapping->a_ops = &ubifs_file_address_operations; 161 inode->i_mapping->a_ops = &ubifs_file_address_operations;
@@ -2017,7 +2014,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
2017 * Read-ahead will be disabled because @c->bdi.ra_pages is 0. 2014 * Read-ahead will be disabled because @c->bdi.ra_pages is 0.
2018 */ 2015 */
2019 c->bdi.name = "ubifs", 2016 c->bdi.name = "ubifs",
2020 c->bdi.capabilities = BDI_CAP_MAP_COPY; 2017 c->bdi.capabilities = 0;
2021 err = bdi_init(&c->bdi); 2018 err = bdi_init(&c->bdi);
2022 if (err) 2019 if (err)
2023 goto out_close; 2020 goto out_close;
@@ -2039,6 +2036,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
2039 if (c->max_inode_sz > MAX_LFS_FILESIZE) 2036 if (c->max_inode_sz > MAX_LFS_FILESIZE)
2040 sb->s_maxbytes = c->max_inode_sz = MAX_LFS_FILESIZE; 2037 sb->s_maxbytes = c->max_inode_sz = MAX_LFS_FILESIZE;
2041 sb->s_op = &ubifs_super_operations; 2038 sb->s_op = &ubifs_super_operations;
2039 sb->s_xattr = ubifs_xattr_handlers;
2042 2040
2043 mutex_lock(&c->umount_mutex); 2041 mutex_lock(&c->umount_mutex);
2044 err = mount_ubifs(c); 2042 err = mount_ubifs(c);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index c4fe900c67ab..bc04b9c69891 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -36,6 +36,7 @@
36#include <linux/mtd/ubi.h> 36#include <linux/mtd/ubi.h>
37#include <linux/pagemap.h> 37#include <linux/pagemap.h>
38#include <linux/backing-dev.h> 38#include <linux/backing-dev.h>
39#include <linux/security.h>
39#include "ubifs-media.h" 40#include "ubifs-media.h"
40 41
41/* Version of this UBIFS implementation */ 42/* Version of this UBIFS implementation */
@@ -1465,6 +1466,7 @@ extern spinlock_t ubifs_infos_lock;
1465extern atomic_long_t ubifs_clean_zn_cnt; 1466extern atomic_long_t ubifs_clean_zn_cnt;
1466extern struct kmem_cache *ubifs_inode_slab; 1467extern struct kmem_cache *ubifs_inode_slab;
1467extern const struct super_operations ubifs_super_operations; 1468extern const struct super_operations ubifs_super_operations;
1469extern const struct xattr_handler *ubifs_xattr_handlers[];
1468extern const struct address_space_operations ubifs_file_address_operations; 1470extern const struct address_space_operations ubifs_file_address_operations;
1469extern const struct file_operations ubifs_file_operations; 1471extern const struct file_operations ubifs_file_operations;
1470extern const struct inode_operations ubifs_file_inode_operations; 1472extern const struct inode_operations ubifs_file_inode_operations;
@@ -1754,6 +1756,8 @@ ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf,
1754 size_t size); 1756 size_t size);
1755ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size); 1757ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size);
1756int ubifs_removexattr(struct dentry *dentry, const char *name); 1758int ubifs_removexattr(struct dentry *dentry, const char *name);
1759int ubifs_init_security(struct inode *dentry, struct inode *inode,
1760 const struct qstr *qstr);
1757 1761
1758/* super.c */ 1762/* super.c */
1759struct inode *ubifs_iget(struct super_block *sb, unsigned long inum); 1763struct inode *ubifs_iget(struct super_block *sb, unsigned long inum);
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 5e0a63b1b0d5..a92be244a6fb 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -100,24 +100,30 @@ static const struct file_operations empty_fops;
100static int create_xattr(struct ubifs_info *c, struct inode *host, 100static int create_xattr(struct ubifs_info *c, struct inode *host,
101 const struct qstr *nm, const void *value, int size) 101 const struct qstr *nm, const void *value, int size)
102{ 102{
103 int err; 103 int err, names_len;
104 struct inode *inode; 104 struct inode *inode;
105 struct ubifs_inode *ui, *host_ui = ubifs_inode(host); 105 struct ubifs_inode *ui, *host_ui = ubifs_inode(host);
106 struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, 106 struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
107 .new_ino_d = ALIGN(size, 8), .dirtied_ino = 1, 107 .new_ino_d = ALIGN(size, 8), .dirtied_ino = 1,
108 .dirtied_ino_d = ALIGN(host_ui->data_len, 8) }; 108 .dirtied_ino_d = ALIGN(host_ui->data_len, 8) };
109 109
110 if (host_ui->xattr_cnt >= MAX_XATTRS_PER_INODE) 110 if (host_ui->xattr_cnt >= MAX_XATTRS_PER_INODE) {
111 ubifs_err("inode %lu already has too many xattrs (%d), cannot create more",
112 host->i_ino, host_ui->xattr_cnt);
111 return -ENOSPC; 113 return -ENOSPC;
114 }
112 /* 115 /*
113 * Linux limits the maximum size of the extended attribute names list 116 * Linux limits the maximum size of the extended attribute names list
114 * to %XATTR_LIST_MAX. This means we should not allow creating more 117 * to %XATTR_LIST_MAX. This means we should not allow creating more
115 * extended attributes if the name list becomes larger. This limitation 118 * extended attributes if the name list becomes larger. This limitation
116 * is artificial for UBIFS, though. 119 * is artificial for UBIFS, though.
117 */ 120 */
118 if (host_ui->xattr_names + host_ui->xattr_cnt + 121 names_len = host_ui->xattr_names + host_ui->xattr_cnt + nm->len + 1;
119 nm->len + 1 > XATTR_LIST_MAX) 122 if (names_len > XATTR_LIST_MAX) {
123 ubifs_err("cannot add one more xattr name to inode %lu, total names length would become %d, max. is %d",
124 host->i_ino, names_len, XATTR_LIST_MAX);
120 return -ENOSPC; 125 return -ENOSPC;
126 }
121 127
122 err = ubifs_budget_space(c, &req); 128 err = ubifs_budget_space(c, &req);
123 if (err) 129 if (err)
@@ -293,18 +299,16 @@ static struct inode *iget_xattr(struct ubifs_info *c, ino_t inum)
293 return ERR_PTR(-EINVAL); 299 return ERR_PTR(-EINVAL);
294} 300}
295 301
296int ubifs_setxattr(struct dentry *dentry, const char *name, 302static int setxattr(struct inode *host, const char *name, const void *value,
297 const void *value, size_t size, int flags) 303 size_t size, int flags)
298{ 304{
299 struct inode *inode, *host = dentry->d_inode; 305 struct inode *inode;
300 struct ubifs_info *c = host->i_sb->s_fs_info; 306 struct ubifs_info *c = host->i_sb->s_fs_info;
301 struct qstr nm = QSTR_INIT(name, strlen(name)); 307 struct qstr nm = QSTR_INIT(name, strlen(name));
302 struct ubifs_dent_node *xent; 308 struct ubifs_dent_node *xent;
303 union ubifs_key key; 309 union ubifs_key key;
304 int err, type; 310 int err, type;
305 311
306 dbg_gen("xattr '%s', host ino %lu ('%pd'), size %zd", name,
307 host->i_ino, dentry, size);
308 ubifs_assert(mutex_is_locked(&host->i_mutex)); 312 ubifs_assert(mutex_is_locked(&host->i_mutex));
309 313
310 if (size > UBIFS_MAX_INO_DATA) 314 if (size > UBIFS_MAX_INO_DATA)
@@ -356,6 +360,15 @@ out_free:
356 return err; 360 return err;
357} 361}
358 362
363int ubifs_setxattr(struct dentry *dentry, const char *name,
364 const void *value, size_t size, int flags)
365{
366 dbg_gen("xattr '%s', host ino %lu ('%pd'), size %zd",
367 name, dentry->d_inode->i_ino, dentry, size);
368
369 return setxattr(dentry->d_inode, name, value, size, flags);
370}
371
359ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf, 372ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf,
360 size_t size) 373 size_t size)
361{ 374{
@@ -568,3 +581,84 @@ out_free:
568 kfree(xent); 581 kfree(xent);
569 return err; 582 return err;
570} 583}
584
585static size_t security_listxattr(struct dentry *d, char *list, size_t list_size,
586 const char *name, size_t name_len, int flags)
587{
588 const int prefix_len = XATTR_SECURITY_PREFIX_LEN;
589 const size_t total_len = prefix_len + name_len + 1;
590
591 if (list && total_len <= list_size) {
592 memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
593 memcpy(list + prefix_len, name, name_len);
594 list[prefix_len + name_len] = '\0';
595 }
596
597 return total_len;
598}
599
600static int security_getxattr(struct dentry *d, const char *name, void *buffer,
601 size_t size, int flags)
602{
603 return ubifs_getxattr(d, name, buffer, size);
604}
605
606static int security_setxattr(struct dentry *d, const char *name,
607 const void *value, size_t size, int flags,
608 int handler_flags)
609{
610 return ubifs_setxattr(d, name, value, size, flags);
611}
612
613static const struct xattr_handler ubifs_xattr_security_handler = {
614 .prefix = XATTR_SECURITY_PREFIX,
615 .list = security_listxattr,
616 .get = security_getxattr,
617 .set = security_setxattr,
618};
619
620const struct xattr_handler *ubifs_xattr_handlers[] = {
621 &ubifs_xattr_security_handler,
622 NULL,
623};
624
625static int init_xattrs(struct inode *inode, const struct xattr *xattr_array,
626 void *fs_info)
627{
628 const struct xattr *xattr;
629 char *name;
630 int err = 0;
631
632 for (xattr = xattr_array; xattr->name != NULL; xattr++) {
633 name = kmalloc(XATTR_SECURITY_PREFIX_LEN +
634 strlen(xattr->name) + 1, GFP_NOFS);
635 if (!name) {
636 err = -ENOMEM;
637 break;
638 }
639 strcpy(name, XATTR_SECURITY_PREFIX);
640 strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name);
641 err = setxattr(inode, name, xattr->value, xattr->value_len, 0);
642 kfree(name);
643 if (err < 0)
644 break;
645 }
646
647 return err;
648}
649
650int ubifs_init_security(struct inode *dentry, struct inode *inode,
651 const struct qstr *qstr)
652{
653 int err;
654
655 mutex_lock(&inode->i_mutex);
656 err = security_inode_init_security(inode, dentry, qstr,
657 &init_xattrs, 0);
658 mutex_unlock(&inode->i_mutex);
659
660 if (err)
661 ubifs_err("cannot initialize security for inode %lu, error %d",
662 inode->i_ino, err);
663 return err;
664}
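Call-flow sketch for the security-label support added above: at
create/mkdir/mknod/symlink time, ubifs_init_security() asks the active
LSM for the inode's initial xattrs and init_xattrs() stores each entry
under the "security." prefix via setxattr(). The array below mimics
what security_inode_init_security() might hand over; the label value is
purely illustrative:

	static const struct xattr demo[] = {
		{
			/* stored as "security.selinux" */
			.name      = "selinux",
			.value     = (void *)"system_u:object_r:unlabeled_t:s0",
			.value_len = sizeof("system_u:object_r:unlabeled_t:s0"),
		},
		{ .name = NULL },	/* array is NULL-terminated */
	};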
diff --git a/fs/udf/Kconfig b/fs/udf/Kconfig
index 0e0e99bd6bce..c6e17a744c3b 100644
--- a/fs/udf/Kconfig
+++ b/fs/udf/Kconfig
@@ -2,10 +2,12 @@ config UDF_FS
2 tristate "UDF file system support" 2 tristate "UDF file system support"
3 select CRC_ITU_T 3 select CRC_ITU_T
4 help 4 help
5 This is the new file system used on some CD-ROMs and DVDs. Say Y if 5 This is a file system used on some CD-ROMs and DVDs. Since the
6 you intend to mount DVD discs or CDRW's written in packet mode, or 6 file system is supported by multiple operating systems and is more
7 if written to by other UDF utilities, such as DirectCD. 7 compatible with standard unix file systems, it is also suitable for
8 Please read <file:Documentation/filesystems/udf.txt>. 8 removable USB disks. Say Y if you intend to mount DVD discs or CDRW's
9 written in packet mode, or if you want to use UDF for removable USB
10 disks. Please read <file:Documentation/filesystems/udf.txt>.
9 11
10 To compile this file system support as a module, choose M here: the 12 To compile this file system support as a module, choose M here: the
11 module will be called udf. 13 module will be called udf.
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index a012c51caffd..05e90edd1992 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -57,6 +57,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx)
57 sector_t offset; 57 sector_t offset;
58 int i, num, ret = 0; 58 int i, num, ret = 0;
59 struct extent_position epos = { NULL, 0, {0, 0} }; 59 struct extent_position epos = { NULL, 0, {0, 0} };
60 struct super_block *sb = dir->i_sb;
60 61
61 if (ctx->pos == 0) { 62 if (ctx->pos == 0) {
62 if (!dir_emit_dot(file, ctx)) 63 if (!dir_emit_dot(file, ctx))
@@ -76,16 +77,16 @@ static int udf_readdir(struct file *file, struct dir_context *ctx)
76 if (nf_pos == 0) 77 if (nf_pos == 0)
77 nf_pos = udf_ext0_offset(dir); 78 nf_pos = udf_ext0_offset(dir);
78 79
79 fibh.soffset = fibh.eoffset = nf_pos & (dir->i_sb->s_blocksize - 1); 80 fibh.soffset = fibh.eoffset = nf_pos & (sb->s_blocksize - 1);
80 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { 81 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
81 if (inode_bmap(dir, nf_pos >> dir->i_sb->s_blocksize_bits, 82 if (inode_bmap(dir, nf_pos >> sb->s_blocksize_bits,
82 &epos, &eloc, &elen, &offset) 83 &epos, &eloc, &elen, &offset)
83 != (EXT_RECORDED_ALLOCATED >> 30)) { 84 != (EXT_RECORDED_ALLOCATED >> 30)) {
84 ret = -ENOENT; 85 ret = -ENOENT;
85 goto out; 86 goto out;
86 } 87 }
87 block = udf_get_lb_pblock(dir->i_sb, &eloc, offset); 88 block = udf_get_lb_pblock(sb, &eloc, offset);
88 if ((++offset << dir->i_sb->s_blocksize_bits) < elen) { 89 if ((++offset << sb->s_blocksize_bits) < elen) {
89 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 90 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
90 epos.offset -= sizeof(struct short_ad); 91 epos.offset -= sizeof(struct short_ad);
91 else if (iinfo->i_alloc_type == 92 else if (iinfo->i_alloc_type ==
@@ -95,18 +96,18 @@ static int udf_readdir(struct file *file, struct dir_context *ctx)
95 offset = 0; 96 offset = 0;
96 } 97 }
97 98
98 if (!(fibh.sbh = fibh.ebh = udf_tread(dir->i_sb, block))) { 99 if (!(fibh.sbh = fibh.ebh = udf_tread(sb, block))) {
99 ret = -EIO; 100 ret = -EIO;
100 goto out; 101 goto out;
101 } 102 }
102 103
103 if (!(offset & ((16 >> (dir->i_sb->s_blocksize_bits - 9)) - 1))) { 104 if (!(offset & ((16 >> (sb->s_blocksize_bits - 9)) - 1))) {
104 i = 16 >> (dir->i_sb->s_blocksize_bits - 9); 105 i = 16 >> (sb->s_blocksize_bits - 9);
105 if (i + offset > (elen >> dir->i_sb->s_blocksize_bits)) 106 if (i + offset > (elen >> sb->s_blocksize_bits))
106 i = (elen >> dir->i_sb->s_blocksize_bits) - offset; 107 i = (elen >> sb->s_blocksize_bits) - offset;
107 for (num = 0; i > 0; i--) { 108 for (num = 0; i > 0; i--) {
108 block = udf_get_lb_pblock(dir->i_sb, &eloc, offset + i); 109 block = udf_get_lb_pblock(sb, &eloc, offset + i);
109 tmp = udf_tgetblk(dir->i_sb, block); 110 tmp = udf_tgetblk(sb, block);
110 if (tmp && !buffer_uptodate(tmp) && !buffer_locked(tmp)) 111 if (tmp && !buffer_uptodate(tmp) && !buffer_locked(tmp))
111 bha[num++] = tmp; 112 bha[num++] = tmp;
112 else 113 else
@@ -152,12 +153,12 @@ static int udf_readdir(struct file *file, struct dir_context *ctx)
152 } 153 }
153 154
154 if ((cfi.fileCharacteristics & FID_FILE_CHAR_DELETED) != 0) { 155 if ((cfi.fileCharacteristics & FID_FILE_CHAR_DELETED) != 0) {
155 if (!UDF_QUERY_FLAG(dir->i_sb, UDF_FLAG_UNDELETE)) 156 if (!UDF_QUERY_FLAG(sb, UDF_FLAG_UNDELETE))
156 continue; 157 continue;
157 } 158 }
158 159
159 if ((cfi.fileCharacteristics & FID_FILE_CHAR_HIDDEN) != 0) { 160 if ((cfi.fileCharacteristics & FID_FILE_CHAR_HIDDEN) != 0) {
160 if (!UDF_QUERY_FLAG(dir->i_sb, UDF_FLAG_UNHIDE)) 161 if (!UDF_QUERY_FLAG(sb, UDF_FLAG_UNHIDE))
161 continue; 162 continue;
162 } 163 }
163 164
@@ -167,12 +168,12 @@ static int udf_readdir(struct file *file, struct dir_context *ctx)
167 continue; 168 continue;
168 } 169 }
169 170
170 flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi); 171 flen = udf_get_filename(sb, nameptr, lfi, fname, UDF_NAME_LEN);
171 if (!flen) 172 if (!flen)
172 continue; 173 continue;
173 174
174 tloc = lelb_to_cpu(cfi.icb.extLocation); 175 tloc = lelb_to_cpu(cfi.icb.extLocation);
175 iblock = udf_get_lb_pblock(dir->i_sb, &tloc, 0); 176 iblock = udf_get_lb_pblock(sb, &tloc, 0);
176 if (!dir_emit(ctx, fname, flen, iblock, DT_UNKNOWN)) 177 if (!dir_emit(ctx, fname, flen, iblock, DT_UNKNOWN))
177 goto out; 178 goto out;
178 } /* end while */ 179 } /* end while */
diff --git a/fs/udf/file.c b/fs/udf/file.c
index bb15771b92ae..08f3555fbeac 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -224,7 +224,7 @@ out:
224static int udf_release_file(struct inode *inode, struct file *filp) 224static int udf_release_file(struct inode *inode, struct file *filp)
225{ 225{
226 if (filp->f_mode & FMODE_WRITE && 226 if (filp->f_mode & FMODE_WRITE &&
227 atomic_read(&inode->i_writecount) > 1) { 227 atomic_read(&inode->i_writecount) == 1) {
228 /* 228 /*
229 * Grab i_mutex to avoid races with writes changing i_size 229 * Grab i_mutex to avoid races with writes changing i_size
230 * while we are running. 230 * while we are running.
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index c9b4df5810d5..a445d599098d 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -750,7 +750,7 @@ static sector_t inode_getblk(struct inode *inode, sector_t block,
750 /* Are we beyond EOF? */ 750 /* Are we beyond EOF? */
751 if (etype == -1) { 751 if (etype == -1) {
752 int ret; 752 int ret;
753 isBeyondEOF = 1; 753 isBeyondEOF = true;
754 if (count) { 754 if (count) {
755 if (c) 755 if (c)
756 laarr[0] = laarr[1]; 756 laarr[0] = laarr[1];
@@ -792,7 +792,7 @@ static sector_t inode_getblk(struct inode *inode, sector_t block,
792 endnum = c + 1; 792 endnum = c + 1;
793 lastblock = 1; 793 lastblock = 1;
794 } else { 794 } else {
795 isBeyondEOF = 0; 795 isBeyondEOF = false;
796 endnum = startnum = ((count > 2) ? 2 : count); 796 endnum = startnum = ((count > 2) ? 2 : count);
797 797
798 /* if the current extent is in position 0, 798 /* if the current extent is in position 0,
@@ -1288,6 +1288,7 @@ static int udf_read_inode(struct inode *inode, bool hidden_inode)
1288 struct kernel_lb_addr *iloc = &iinfo->i_location; 1288 struct kernel_lb_addr *iloc = &iinfo->i_location;
1289 unsigned int link_count; 1289 unsigned int link_count;
1290 unsigned int indirections = 0; 1290 unsigned int indirections = 0;
1291 int bs = inode->i_sb->s_blocksize;
1291 int ret = -EIO; 1292 int ret = -EIO;
1292 1293
1293reread: 1294reread:
@@ -1374,38 +1375,35 @@ reread:
1374 if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_EFE)) { 1375 if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_EFE)) {
1375 iinfo->i_efe = 1; 1376 iinfo->i_efe = 1;
1376 iinfo->i_use = 0; 1377 iinfo->i_use = 0;
1377 ret = udf_alloc_i_data(inode, inode->i_sb->s_blocksize - 1378 ret = udf_alloc_i_data(inode, bs -
1378 sizeof(struct extendedFileEntry)); 1379 sizeof(struct extendedFileEntry));
1379 if (ret) 1380 if (ret)
1380 goto out; 1381 goto out;
1381 memcpy(iinfo->i_ext.i_data, 1382 memcpy(iinfo->i_ext.i_data,
1382 bh->b_data + sizeof(struct extendedFileEntry), 1383 bh->b_data + sizeof(struct extendedFileEntry),
1383 inode->i_sb->s_blocksize - 1384 bs - sizeof(struct extendedFileEntry));
1384 sizeof(struct extendedFileEntry));
1385 } else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_FE)) { 1385 } else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_FE)) {
1386 iinfo->i_efe = 0; 1386 iinfo->i_efe = 0;
1387 iinfo->i_use = 0; 1387 iinfo->i_use = 0;
1388 ret = udf_alloc_i_data(inode, inode->i_sb->s_blocksize - 1388 ret = udf_alloc_i_data(inode, bs - sizeof(struct fileEntry));
1389 sizeof(struct fileEntry));
1390 if (ret) 1389 if (ret)
1391 goto out; 1390 goto out;
1392 memcpy(iinfo->i_ext.i_data, 1391 memcpy(iinfo->i_ext.i_data,
1393 bh->b_data + sizeof(struct fileEntry), 1392 bh->b_data + sizeof(struct fileEntry),
1394 inode->i_sb->s_blocksize - sizeof(struct fileEntry)); 1393 bs - sizeof(struct fileEntry));
1395 } else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_USE)) { 1394 } else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_USE)) {
1396 iinfo->i_efe = 0; 1395 iinfo->i_efe = 0;
1397 iinfo->i_use = 1; 1396 iinfo->i_use = 1;
1398 iinfo->i_lenAlloc = le32_to_cpu( 1397 iinfo->i_lenAlloc = le32_to_cpu(
1399 ((struct unallocSpaceEntry *)bh->b_data)-> 1398 ((struct unallocSpaceEntry *)bh->b_data)->
1400 lengthAllocDescs); 1399 lengthAllocDescs);
1401 ret = udf_alloc_i_data(inode, inode->i_sb->s_blocksize - 1400 ret = udf_alloc_i_data(inode, bs -
1402 sizeof(struct unallocSpaceEntry)); 1401 sizeof(struct unallocSpaceEntry));
1403 if (ret) 1402 if (ret)
1404 goto out; 1403 goto out;
1405 memcpy(iinfo->i_ext.i_data, 1404 memcpy(iinfo->i_ext.i_data,
1406 bh->b_data + sizeof(struct unallocSpaceEntry), 1405 bh->b_data + sizeof(struct unallocSpaceEntry),
1407 inode->i_sb->s_blocksize - 1406 bs - sizeof(struct unallocSpaceEntry));
1408 sizeof(struct unallocSpaceEntry));
1409 return 0; 1407 return 0;
1410 } 1408 }
1411 1409
@@ -1489,6 +1487,28 @@ reread:
1489 } 1487 }
1490 inode->i_generation = iinfo->i_unique; 1488 inode->i_generation = iinfo->i_unique;
1491 1489
1490 /*
1491 * Sanity check length of allocation descriptors and extended attrs to
1492 * avoid integer overflows
1493 */
1494 if (iinfo->i_lenEAttr > bs || iinfo->i_lenAlloc > bs)
1495 goto out;
1496 /* Now do exact checks */
1497 if (udf_file_entry_alloc_offset(inode) + iinfo->i_lenAlloc > bs)
1498 goto out;
1499 /* Sanity checks for files in ICB so that we don't get confused later */
1500 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
1501 /*
1502 * For file in ICB data is stored in allocation descriptor
1503 * so sizes should match
1504 */
1505 if (iinfo->i_lenAlloc != inode->i_size)
1506 goto out;
1507 /* File in ICB has to fit in there... */
1508 if (inode->i_size > bs - udf_file_entry_alloc_offset(inode))
1509 goto out;
1510 }
1511
1492 switch (fe->icbTag.fileType) { 1512 switch (fe->icbTag.fileType) {
1493 case ICBTAG_FILE_TYPE_DIRECTORY: 1513 case ICBTAG_FILE_TYPE_DIRECTORY:
1494 inode->i_op = &udf_dir_inode_operations; 1514 inode->i_op = &udf_dir_inode_operations;
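Worked numbers for the new in-ICB sanity checks (illustrative; the real
offset depends on the entry type and EA length): with a 2048-byte block
and udf_file_entry_alloc_offset(inode) == 176,

	/*
	 * max in-ICB file size = bs - alloc_offset = 2048 - 176 = 1872
	 * require: i_lenAlloc == i_size  &&  i_size <= 1872
	 * anything else marks a corrupted (or hostile) image, and the
	 * inode is rejected before the sizes can confuse later math.
	 */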
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index c12e260fd6c4..33b246b82c98 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -159,18 +159,19 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
159 struct udf_inode_info *dinfo = UDF_I(dir); 159 struct udf_inode_info *dinfo = UDF_I(dir);
160 int isdotdot = child->len == 2 && 160 int isdotdot = child->len == 2 &&
161 child->name[0] == '.' && child->name[1] == '.'; 161 child->name[0] == '.' && child->name[1] == '.';
162 struct super_block *sb = dir->i_sb;
162 163
163 size = udf_ext0_offset(dir) + dir->i_size; 164 size = udf_ext0_offset(dir) + dir->i_size;
164 f_pos = udf_ext0_offset(dir); 165 f_pos = udf_ext0_offset(dir);
165 166
166 fibh->sbh = fibh->ebh = NULL; 167 fibh->sbh = fibh->ebh = NULL;
167 fibh->soffset = fibh->eoffset = f_pos & (dir->i_sb->s_blocksize - 1); 168 fibh->soffset = fibh->eoffset = f_pos & (sb->s_blocksize - 1);
168 if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { 169 if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
169 if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, &epos, 170 if (inode_bmap(dir, f_pos >> sb->s_blocksize_bits, &epos,
170 &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30)) 171 &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30))
171 goto out_err; 172 goto out_err;
172 block = udf_get_lb_pblock(dir->i_sb, &eloc, offset); 173 block = udf_get_lb_pblock(sb, &eloc, offset);
173 if ((++offset << dir->i_sb->s_blocksize_bits) < elen) { 174 if ((++offset << sb->s_blocksize_bits) < elen) {
174 if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 175 if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
175 epos.offset -= sizeof(struct short_ad); 176 epos.offset -= sizeof(struct short_ad);
176 else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 177 else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
@@ -178,7 +179,7 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
178 } else 179 } else
179 offset = 0; 180 offset = 0;
180 181
181 fibh->sbh = fibh->ebh = udf_tread(dir->i_sb, block); 182 fibh->sbh = fibh->ebh = udf_tread(sb, block);
182 if (!fibh->sbh) 183 if (!fibh->sbh)
183 goto out_err; 184 goto out_err;
184 } 185 }
@@ -217,12 +218,12 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
217 } 218 }
218 219
219 if ((cfi->fileCharacteristics & FID_FILE_CHAR_DELETED) != 0) { 220 if ((cfi->fileCharacteristics & FID_FILE_CHAR_DELETED) != 0) {
220 if (!UDF_QUERY_FLAG(dir->i_sb, UDF_FLAG_UNDELETE)) 221 if (!UDF_QUERY_FLAG(sb, UDF_FLAG_UNDELETE))
221 continue; 222 continue;
222 } 223 }
223 224
224 if ((cfi->fileCharacteristics & FID_FILE_CHAR_HIDDEN) != 0) { 225 if ((cfi->fileCharacteristics & FID_FILE_CHAR_HIDDEN) != 0) {
225 if (!UDF_QUERY_FLAG(dir->i_sb, UDF_FLAG_UNHIDE)) 226 if (!UDF_QUERY_FLAG(sb, UDF_FLAG_UNHIDE))
226 continue; 227 continue;
227 } 228 }
228 229
@@ -233,7 +234,7 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
233 if (!lfi) 234 if (!lfi)
234 continue; 235 continue;
235 236
236 flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi); 237 flen = udf_get_filename(sb, nameptr, lfi, fname, UDF_NAME_LEN);
237 if (flen && udf_match(flen, fname, child->len, child->name)) 238 if (flen && udf_match(flen, fname, child->len, child->name))
238 goto out_ok; 239 goto out_ok;
239 } 240 }
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 3ccb2f11fc76..f169411c4ea0 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -1599,7 +1599,7 @@ static noinline int udf_process_sequence(
1599 struct udf_vds_record *curr; 1599 struct udf_vds_record *curr;
1600 struct generic_desc *gd; 1600 struct generic_desc *gd;
1601 struct volDescPtr *vdp; 1601 struct volDescPtr *vdp;
1602 int done = 0; 1602 bool done = false;
1603 uint32_t vdsn; 1603 uint32_t vdsn;
1604 uint16_t ident; 1604 uint16_t ident;
1605 long next_s = 0, next_e = 0; 1605 long next_s = 0, next_e = 0;
@@ -1680,7 +1680,7 @@ static noinline int udf_process_sequence(
1680 lastblock = next_e; 1680 lastblock = next_e;
1681 next_s = next_e = 0; 1681 next_s = next_e = 0;
1682 } else 1682 } else
1683 done = 1; 1683 done = true;
1684 break; 1684 break;
1685 } 1685 }
1686 brelse(bh); 1686 brelse(bh);
@@ -2300,6 +2300,7 @@ static void udf_put_super(struct super_block *sb)
 	udf_close_lvid(sb);
 	brelse(sbi->s_lvid_bh);
 	udf_sb_free_partitions(sb);
+	mutex_destroy(&sbi->s_alloc_mutex);
 	kfree(sb->s_fs_info);
 	sb->s_fs_info = NULL;
 }
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index 6fb7945c1e6e..ac10ca939f26 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -30,49 +30,73 @@
 #include <linux/buffer_head.h>
 #include "udf_i.h"
 
-static void udf_pc_to_char(struct super_block *sb, unsigned char *from,
-			   int fromlen, unsigned char *to)
+static int udf_pc_to_char(struct super_block *sb, unsigned char *from,
+			  int fromlen, unsigned char *to, int tolen)
 {
 	struct pathComponent *pc;
 	int elen = 0;
+	int comp_len;
 	unsigned char *p = to;
 
+	/* Reserve one byte for terminating \0 */
+	tolen--;
 	while (elen < fromlen) {
 		pc = (struct pathComponent *)(from + elen);
+		elen += sizeof(struct pathComponent);
 		switch (pc->componentType) {
 		case 1:
 			/*
 			 * Symlink points to some place which should be agreed
 			 * upon between originator and receiver of the media. Ignore.
 			 */
-			if (pc->lengthComponentIdent > 0)
+			if (pc->lengthComponentIdent > 0) {
+				elen += pc->lengthComponentIdent;
 				break;
+			}
 			/* Fall through */
 		case 2:
+			if (tolen == 0)
+				return -ENAMETOOLONG;
 			p = to;
 			*p++ = '/';
+			tolen--;
 			break;
 		case 3:
+			if (tolen < 3)
+				return -ENAMETOOLONG;
 			memcpy(p, "../", 3);
 			p += 3;
+			tolen -= 3;
 			break;
 		case 4:
+			if (tolen < 2)
+				return -ENAMETOOLONG;
 			memcpy(p, "./", 2);
 			p += 2;
+			tolen -= 2;
 			/* that would be . - just ignore */
 			break;
 		case 5:
-			p += udf_get_filename(sb, pc->componentIdent, p,
-					      pc->lengthComponentIdent);
+			elen += pc->lengthComponentIdent;
+			if (elen > fromlen)
+				return -EIO;
+			comp_len = udf_get_filename(sb, pc->componentIdent,
+						    pc->lengthComponentIdent,
+						    p, tolen);
+			p += comp_len;
+			tolen -= comp_len;
+			if (tolen == 0)
+				return -ENAMETOOLONG;
 			*p++ = '/';
+			tolen--;
 			break;
 		}
-		elen += sizeof(struct pathComponent) + pc->lengthComponentIdent;
 	}
 	if (p > to + 1)
 		p[-1] = '\0';
 	else
 		p[0] = '\0';
+	return 0;
 }
 
 static int udf_symlink_filler(struct file *file, struct page *page)
@@ -80,11 +104,17 @@ static int udf_symlink_filler(struct file *file, struct page *page)
 	struct inode *inode = page->mapping->host;
 	struct buffer_head *bh = NULL;
 	unsigned char *symlink;
-	int err = -EIO;
+	int err;
 	unsigned char *p = kmap(page);
 	struct udf_inode_info *iinfo;
 	uint32_t pos;
 
+	/* We don't support symlinks longer than one block */
+	if (inode->i_size > inode->i_sb->s_blocksize) {
+		err = -ENAMETOOLONG;
+		goto out_unmap;
+	}
+
 	iinfo = UDF_I(inode);
 	pos = udf_block_map(inode, 0);
 
@@ -94,14 +124,18 @@ static int udf_symlink_filler(struct file *file, struct page *page)
 	} else {
 		bh = sb_bread(inode->i_sb, pos);
 
-		if (!bh)
-			goto out;
+		if (!bh) {
+			err = -EIO;
+			goto out_unlock_inode;
+		}
 
 		symlink = bh->b_data;
 	}
 
-	udf_pc_to_char(inode->i_sb, symlink, inode->i_size, p);
+	err = udf_pc_to_char(inode->i_sb, symlink, inode->i_size, p, PAGE_SIZE);
 	brelse(bh);
+	if (err)
+		goto out_unlock_inode;
 
 	up_read(&iinfo->i_data_sem);
 	SetPageUptodate(page);
@@ -109,9 +143,10 @@ static int udf_symlink_filler(struct file *file, struct page *page)
 	unlock_page(page);
 	return 0;
 
-out:
+out_unlock_inode:
 	up_read(&iinfo->i_data_sem);
 	SetPageError(page);
+out_unmap:
 	kunmap(page);
 	unlock_page(page);
 	return err;
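The symlink.c hunks above convert udf_pc_to_char() from an unchecked copy into a bounds-tracked translation that returns -ENAMETOOLONG (or -EIO for a corrupt source) instead of overrunning the destination page. A minimal userspace sketch of the same accounting pattern; append_component() and its parameters are hypothetical, not the kernel API:

#include <errno.h>
#include <string.h>

/*
 * Append a path component plus a '/' separator, decrementing *avail the
 * way the patched udf_pc_to_char() decrements tolen: every write is
 * preceded by a space check, so the destination buffer can never overflow.
 */
static int append_component(char **dst, int *avail,
			    const char *src, int srclen)
{
	if (*avail < srclen + 1)	/* component + '/' */
		return -ENAMETOOLONG;
	memcpy(*dst, src, srclen);
	*dst += srclen;
	*(*dst)++ = '/';
	*avail -= srclen + 1;
	return 0;
}

The caller seeds *avail with the buffer size minus one, mirroring the "reserve one byte for the terminating \0" step in the patch.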
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 1cc3c993ebd0..47bb3f5ca360 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -211,7 +211,8 @@ udf_get_lb_pblock(struct super_block *sb, struct kernel_lb_addr *loc,
 }
 
 /* unicode.c */
-extern int udf_get_filename(struct super_block *, uint8_t *, uint8_t *, int);
+extern int udf_get_filename(struct super_block *, uint8_t *, int, uint8_t *,
+			int);
 extern int udf_put_filename(struct super_block *, const uint8_t *, uint8_t *,
 			int);
 extern int udf_build_ustr(struct ustr *, dstring *, int);
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index afd470e588ff..b84fee372734 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -28,7 +28,8 @@
 
 #include "udf_sb.h"
 
-static int udf_translate_to_linux(uint8_t *, uint8_t *, int, uint8_t *, int);
+static int udf_translate_to_linux(uint8_t *, int, uint8_t *, int, uint8_t *,
+				  int);
 
 static int udf_char_to_ustr(struct ustr *dest, const uint8_t *src, int strlen)
 {
@@ -333,8 +334,8 @@ try_again:
 	return u_len + 1;
 }
 
-int udf_get_filename(struct super_block *sb, uint8_t *sname, uint8_t *dname,
-		     int flen)
+int udf_get_filename(struct super_block *sb, uint8_t *sname, int slen,
+		     uint8_t *dname, int dlen)
 {
 	struct ustr *filename, *unifilename;
 	int len = 0;
@@ -347,7 +348,7 @@ int udf_get_filename(struct super_block *sb, uint8_t *sname, uint8_t *dname,
 	if (!unifilename)
 		goto out1;
 
-	if (udf_build_ustr_exact(unifilename, sname, flen))
+	if (udf_build_ustr_exact(unifilename, sname, slen))
 		goto out2;
 
 	if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
@@ -366,7 +367,8 @@ int udf_get_filename(struct super_block *sb, uint8_t *sname, uint8_t *dname,
 	} else
 		goto out2;
 
-	len = udf_translate_to_linux(dname, filename->u_name, filename->u_len,
+	len = udf_translate_to_linux(dname, dlen,
+				     filename->u_name, filename->u_len,
 				     unifilename->u_name, unifilename->u_len);
 out2:
 	kfree(unifilename);
@@ -403,10 +405,12 @@ int udf_put_filename(struct super_block *sb, const uint8_t *sname,
 #define EXT_MARK '.'
 #define CRC_MARK '#'
 #define EXT_SIZE 5
+/* Number of chars we need to store generated CRC to make filename unique */
+#define CRC_LEN 5
 
-static int udf_translate_to_linux(uint8_t *newName, uint8_t *udfName,
-				  int udfLen, uint8_t *fidName,
-				  int fidNameLen)
+static int udf_translate_to_linux(uint8_t *newName, int newLen,
+				  uint8_t *udfName, int udfLen,
+				  uint8_t *fidName, int fidNameLen)
 {
 	int index, newIndex = 0, needsCRC = 0;
 	int extIndex = 0, newExtIndex = 0, hasExt = 0;
@@ -439,7 +443,7 @@ static int udf_translate_to_linux(uint8_t *newName, uint8_t *udfName,
 				newExtIndex = newIndex;
 			}
 		}
-		if (newIndex < 256)
+		if (newIndex < newLen)
 			newName[newIndex++] = curr;
 		else
 			needsCRC = 1;
@@ -467,13 +471,13 @@ static int udf_translate_to_linux(uint8_t *newName, uint8_t *udfName,
 			}
 			ext[localExtIndex++] = curr;
 		}
-		maxFilenameLen = 250 - localExtIndex;
+		maxFilenameLen = newLen - CRC_LEN - localExtIndex;
 		if (newIndex > maxFilenameLen)
 			newIndex = maxFilenameLen;
 		else
 			newIndex = newExtIndex;
-	} else if (newIndex > 250)
-		newIndex = 250;
+	} else if (newIndex > newLen - CRC_LEN)
+		newIndex = newLen - CRC_LEN;
 	newName[newIndex++] = CRC_MARK;
 	valueCRC = crc_itu_t(0, fidName, fidNameLen);
 	newName[newIndex++] = hex_asc_upper_hi(valueCRC >> 8);
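The unicode.c hunks parameterise udf_translate_to_linux() on the real destination size (newLen) in place of the hard-coded 256/250 limits, reserving CRC_LEN characters for the '#' marker plus a four-digit checksum that keeps truncated names unique. A self-contained sketch of that truncate-and-tag arithmetic; the toy crc16() merely stands in for the kernel's crc_itu_t(), and all names are hypothetical:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define CRC_MARK '#'
#define CRC_LEN  5	/* '#' plus four hex digits, as in the patch */

static uint16_t crc16(const uint8_t *p, size_t len)
{
	uint16_t crc = 0;

	while (len--)
		crc = (crc << 5) ^ (crc >> 11) ^ *p++;	/* toy mix, not ITU-T */
	return crc;
}

/*
 * Copy src into dst (dstlen bytes, dstlen > CRC_LEN + 1). If it does not
 * fit, keep a prefix and append CRC_MARK plus a checksum of the original,
 * so two long names that truncate identically still end up distinct.
 */
static void mangle_name(char *dst, int dstlen, const char *src)
{
	int len = (int)strlen(src);

	if (len < dstlen) {
		strcpy(dst, src);
		return;
	}
	len = dstlen - CRC_LEN - 1;		/* room for suffix + NUL */
	memcpy(dst, src, len);
	sprintf(dst + len, "%c%04X", CRC_MARK,
		(unsigned)crc16((const uint8_t *)src, strlen(src)));
}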
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index da73801301d5..8092d3759a5e 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -95,22 +95,18 @@
 
 void lock_ufs(struct super_block *sb)
 {
-#if defined(CONFIG_SMP) || defined (CONFIG_PREEMPT)
 	struct ufs_sb_info *sbi = UFS_SB(sb);
 
 	mutex_lock(&sbi->mutex);
 	sbi->mutex_owner = current;
-#endif
 }
 
 void unlock_ufs(struct super_block *sb)
 {
-#if defined(CONFIG_SMP) || defined (CONFIG_PREEMPT)
 	struct ufs_sb_info *sbi = UFS_SB(sb);
 
 	sbi->mutex_owner = NULL;
 	mutex_unlock(&sbi->mutex);
-#endif
 }
 
 static struct inode *ufs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation)
@@ -1415,9 +1411,11 @@ static struct kmem_cache * ufs_inode_cachep;
 static struct inode *ufs_alloc_inode(struct super_block *sb)
 {
 	struct ufs_inode_info *ei;
-	ei = (struct ufs_inode_info *)kmem_cache_alloc(ufs_inode_cachep, GFP_NOFS);
+
+	ei = kmem_cache_alloc(ufs_inode_cachep, GFP_NOFS);
 	if (!ei)
 		return NULL;
+
 	ei->vfs_inode.i_version = 1;
 	return &ei->vfs_inode;
 }
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index d61799949580..df6828570e87 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -121,3 +121,4 @@ xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o
 xfs-$(CONFIG_PROC_FS)		+= xfs_stats.o
 xfs-$(CONFIG_SYSCTL)		+= xfs_sysctl.o
 xfs-$(CONFIG_COMPAT)		+= xfs_ioctl32.o
+xfs-$(CONFIG_NFSD_PNFS)		+= xfs_pnfs.o
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index 53e95b2a1369..a7a3a63bb360 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -91,16 +91,6 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
 	return ptr;
 }
 
-void
-kmem_free(const void *ptr)
-{
-	if (!is_vmalloc_addr(ptr)) {
-		kfree(ptr);
-	} else {
-		vfree(ptr);
-	}
-}
-
 void *
 kmem_realloc(const void *ptr, size_t newsize, size_t oldsize,
 	     xfs_km_flags_t flags)
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index 64db0e53edea..cc6b768fc068 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -63,7 +63,10 @@ kmem_flags_convert(xfs_km_flags_t flags)
 extern void *kmem_alloc(size_t, xfs_km_flags_t);
 extern void *kmem_zalloc_large(size_t size, xfs_km_flags_t);
 extern void *kmem_realloc(const void *, size_t, size_t, xfs_km_flags_t);
-extern void  kmem_free(const void *);
+static inline void  kmem_free(const void *ptr)
+{
+	kvfree(ptr);
+}
 
 
 extern void *kmem_zalloc_greedy(size_t *, size_t, size_t);
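kmem_free() can collapse to a one-line kvfree() wrapper because the kernel classifies a pointer by its address and routes it to whichever allocator owns it. A userspace sketch of that single-entry-point dispatch, assuming a toy arena whose range test stands in for is_vmalloc_addr(); every name here is hypothetical:

#include <stdbool.h>
#include <stdlib.h>

static char arena[1 << 16];	/* pretend "vmalloc" region */
static size_t arena_used;

static void *arena_alloc(size_t size)	/* bump allocator, freed wholesale */
{
	void *p = &arena[arena_used];

	arena_used += size;
	return p;
}

/* Address-range test, the moral equivalent of is_vmalloc_addr(). */
static bool from_arena(const void *ptr)
{
	return (const char *)ptr >= arena &&
	       (const char *)ptr < arena + sizeof(arena);
}

/* One free for both allocators, the role kvfree() plays in the hunk. */
static void mem_free(void *ptr)
{
	if (!from_arena(ptr))
		free(ptr);	/* ordinary heap allocation */
	/* arena memory is reclaimed wholesale; nothing to do here */
}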
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 5d38e8b8a913..15105dbc9e28 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -403,7 +403,7 @@ xfs_sbversion_add_attr2(xfs_mount_t *mp, xfs_trans_t *tp)
 		if (!xfs_sb_version_hasattr2(&mp->m_sb)) {
 			xfs_sb_version_addattr2(&mp->m_sb);
 			spin_unlock(&mp->m_sb_lock);
-			xfs_mod_sb(tp, XFS_SB_VERSIONNUM | XFS_SB_FEATURES2);
+			xfs_log_sb(tp);
 		} else
 			spin_unlock(&mp->m_sb_lock);
 	}
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index b5eb4743f75a..61ec015dca16 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -973,7 +973,11 @@ xfs_bmap_local_to_extents(
 	*firstblock = args.fsbno;
 	bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
 
-	/* initialise the block and copy the data */
+	/*
+	 * Initialise the block and copy the data
+	 *
+	 * Note: init_fn must set the buffer log item type correctly!
+	 */
 	init_fn(tp, bp, ip, ifp);
 
 	/* account for the change in fork size and log everything */
@@ -1221,22 +1225,20 @@ xfs_bmap_add_attrfork(
 			goto bmap_cancel;
 		if (!xfs_sb_version_hasattr(&mp->m_sb) ||
 		   (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2)) {
-			__int64_t sbfields = 0;
+			bool log_sb = false;
 
 			spin_lock(&mp->m_sb_lock);
 			if (!xfs_sb_version_hasattr(&mp->m_sb)) {
 				xfs_sb_version_addattr(&mp->m_sb);
-				sbfields |= XFS_SB_VERSIONNUM;
+				log_sb = true;
 			}
 			if (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2) {
 				xfs_sb_version_addattr2(&mp->m_sb);
-				sbfields |= (XFS_SB_VERSIONNUM | XFS_SB_FEATURES2);
+				log_sb = true;
 			}
-			if (sbfields) {
-				spin_unlock(&mp->m_sb_lock);
-				xfs_mod_sb(tp, sbfields);
-			} else
-				spin_unlock(&mp->m_sb_lock);
+			spin_unlock(&mp->m_sb_lock);
+			if (log_sb)
+				xfs_log_sb(tp);
 		}
 
 	error = xfs_bmap_finish(&tp, &flist, &committed);
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 44db6db86402..b9d8a499d2c4 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -28,6 +28,37 @@ struct xfs_trans;
 extern kmem_zone_t	*xfs_bmap_free_item_zone;
 
 /*
+ * Argument structure for xfs_bmap_alloc.
+ */
+struct xfs_bmalloca {
+	xfs_fsblock_t		*firstblock; /* i/o first block allocated */
+	struct xfs_bmap_free	*flist;	/* bmap freelist */
+	struct xfs_trans	*tp;	/* transaction pointer */
+	struct xfs_inode	*ip;	/* incore inode pointer */
+	struct xfs_bmbt_irec	prev;	/* extent before the new one */
+	struct xfs_bmbt_irec	got;	/* extent after, or delayed */
+
+	xfs_fileoff_t		offset;	/* offset in file filling in */
+	xfs_extlen_t		length;	/* i/o length asked/allocated */
+	xfs_fsblock_t		blkno;	/* starting block of new extent */
+
+	struct xfs_btree_cur	*cur;	/* btree cursor */
+	xfs_extnum_t		idx;	/* current extent index */
+	int			nallocs;/* number of extents alloc'd */
+	int			logflags;/* flags for transaction logging */
+
+	xfs_extlen_t		total;	/* total blocks needed for xaction */
+	xfs_extlen_t		minlen;	/* minimum allocation size (blocks) */
+	xfs_extlen_t		minleft; /* amount must be left after alloc */
+	bool			eof;	/* set if allocating past last extent */
+	bool			wasdel;	/* replacing a delayed allocation */
+	bool			userdata;/* set if is user data */
+	bool			aeof;	/* allocated space at eof */
+	bool			conv;	/* overwriting unwritten extents */
+	int			flags;
+};
+
+/*
  * List of extents to be free "later".
  * The list is kept sorted on xbf_startblock.
  */
@@ -149,6 +180,8 @@ void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
 void	xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len,
 		struct xfs_bmap_free *flist, struct xfs_mount *mp);
 void	xfs_bmap_cancel(struct xfs_bmap_free *flist);
+int	xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist,
+			int *committed);
 void	xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork);
 int	xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip,
 		xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork);
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index fbd6da263571..8eb718979383 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -151,10 +151,13 @@ typedef struct xfs_sb {
 	__uint32_t	sb_features2;	/* additional feature bits */
 
 	/*
-	 * bad features2 field as a result of failing to pad the sb
-	 * structure to 64 bits. Some machines will be using this field
-	 * for features2 bits. Easiest just to mark it bad and not use
-	 * it for anything else.
+	 * bad features2 field as a result of failing to pad the sb structure to
+	 * 64 bits. Some machines will be using this field for features2 bits.
+	 * Easiest just to mark it bad and not use it for anything else.
+	 *
+	 * This is not kept up to date in memory; it is always overwritten by
+	 * the value in sb_features2 when formatting the incore superblock to
+	 * the disk buffer.
 	 */
 	__uint32_t	sb_bad_features2;
 
@@ -304,8 +307,8 @@ typedef enum {
 #define XFS_SB_ICOUNT		XFS_SB_MVAL(ICOUNT)
 #define XFS_SB_IFREE		XFS_SB_MVAL(IFREE)
 #define XFS_SB_FDBLOCKS		XFS_SB_MVAL(FDBLOCKS)
-#define XFS_SB_FEATURES2	XFS_SB_MVAL(FEATURES2)
-#define XFS_SB_BAD_FEATURES2	XFS_SB_MVAL(BAD_FEATURES2)
+#define XFS_SB_FEATURES2	(XFS_SB_MVAL(FEATURES2) | \
+				 XFS_SB_MVAL(BAD_FEATURES2))
 #define XFS_SB_FEATURES_COMPAT	XFS_SB_MVAL(FEATURES_COMPAT)
 #define XFS_SB_FEATURES_RO_COMPAT XFS_SB_MVAL(FEATURES_RO_COMPAT)
 #define XFS_SB_FEATURES_INCOMPAT XFS_SB_MVAL(FEATURES_INCOMPAT)
@@ -319,9 +322,9 @@ typedef enum {
 	 XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | \
 	 XFS_SB_QFLAGS | XFS_SB_SHARED_VN | XFS_SB_UNIT | XFS_SB_WIDTH | \
 	 XFS_SB_ICOUNT | XFS_SB_IFREE | XFS_SB_FDBLOCKS | XFS_SB_FEATURES2 | \
-	 XFS_SB_BAD_FEATURES2 | XFS_SB_FEATURES_COMPAT | \
-	 XFS_SB_FEATURES_RO_COMPAT | XFS_SB_FEATURES_INCOMPAT | \
-	 XFS_SB_FEATURES_LOG_INCOMPAT | XFS_SB_PQUOTINO)
+	 XFS_SB_FEATURES_COMPAT | XFS_SB_FEATURES_RO_COMPAT | \
+	 XFS_SB_FEATURES_INCOMPAT | XFS_SB_FEATURES_LOG_INCOMPAT | \
+	 XFS_SB_PQUOTINO)
 
 
 /*
@@ -453,13 +456,11 @@ static inline void xfs_sb_version_addattr2(struct xfs_sb *sbp)
 {
 	sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
 	sbp->sb_features2 |= XFS_SB_VERSION2_ATTR2BIT;
-	sbp->sb_bad_features2 |= XFS_SB_VERSION2_ATTR2BIT;
 }
 
 static inline void xfs_sb_version_removeattr2(struct xfs_sb *sbp)
 {
 	sbp->sb_features2 &= ~XFS_SB_VERSION2_ATTR2BIT;
-	sbp->sb_bad_features2 &= ~XFS_SB_VERSION2_ATTR2BIT;
 	if (!sbp->sb_features2)
 		sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT;
 }
@@ -475,7 +476,6 @@ static inline void xfs_sb_version_addprojid32bit(struct xfs_sb *sbp)
 {
 	sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
 	sbp->sb_features2 |= XFS_SB_VERSION2_PROJID32BIT;
-	sbp->sb_bad_features2 |= XFS_SB_VERSION2_PROJID32BIT;
 }
 
 /*
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 18dc721ca19f..18dc721ca19f 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 752915fa775a..b0a5fe95a3e2 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -40,69 +40,6 @@
  * Physical superblock buffer manipulations. Shared with libxfs in userspace.
  */
 
-static const struct {
-	short offset;
-	short type;	/* 0 = integer
-			 * 1 = binary / string (no translation)
-			 */
-} xfs_sb_info[] = {
-	{ offsetof(xfs_sb_t, sb_magicnum),	0 },
-	{ offsetof(xfs_sb_t, sb_blocksize),	0 },
-	{ offsetof(xfs_sb_t, sb_dblocks),	0 },
-	{ offsetof(xfs_sb_t, sb_rblocks),	0 },
-	{ offsetof(xfs_sb_t, sb_rextents),	0 },
-	{ offsetof(xfs_sb_t, sb_uuid),		1 },
-	{ offsetof(xfs_sb_t, sb_logstart),	0 },
-	{ offsetof(xfs_sb_t, sb_rootino),	0 },
-	{ offsetof(xfs_sb_t, sb_rbmino),	0 },
-	{ offsetof(xfs_sb_t, sb_rsumino),	0 },
-	{ offsetof(xfs_sb_t, sb_rextsize),	0 },
-	{ offsetof(xfs_sb_t, sb_agblocks),	0 },
-	{ offsetof(xfs_sb_t, sb_agcount),	0 },
-	{ offsetof(xfs_sb_t, sb_rbmblocks),	0 },
-	{ offsetof(xfs_sb_t, sb_logblocks),	0 },
-	{ offsetof(xfs_sb_t, sb_versionnum),	0 },
-	{ offsetof(xfs_sb_t, sb_sectsize),	0 },
-	{ offsetof(xfs_sb_t, sb_inodesize),	0 },
-	{ offsetof(xfs_sb_t, sb_inopblock),	0 },
-	{ offsetof(xfs_sb_t, sb_fname[0]),	1 },
-	{ offsetof(xfs_sb_t, sb_blocklog),	0 },
-	{ offsetof(xfs_sb_t, sb_sectlog),	0 },
-	{ offsetof(xfs_sb_t, sb_inodelog),	0 },
-	{ offsetof(xfs_sb_t, sb_inopblog),	0 },
-	{ offsetof(xfs_sb_t, sb_agblklog),	0 },
-	{ offsetof(xfs_sb_t, sb_rextslog),	0 },
-	{ offsetof(xfs_sb_t, sb_inprogress),	0 },
-	{ offsetof(xfs_sb_t, sb_imax_pct),	0 },
-	{ offsetof(xfs_sb_t, sb_icount),	0 },
-	{ offsetof(xfs_sb_t, sb_ifree),		0 },
-	{ offsetof(xfs_sb_t, sb_fdblocks),	0 },
-	{ offsetof(xfs_sb_t, sb_frextents),	0 },
-	{ offsetof(xfs_sb_t, sb_uquotino),	0 },
-	{ offsetof(xfs_sb_t, sb_gquotino),	0 },
-	{ offsetof(xfs_sb_t, sb_qflags),	0 },
-	{ offsetof(xfs_sb_t, sb_flags),		0 },
-	{ offsetof(xfs_sb_t, sb_shared_vn),	0 },
-	{ offsetof(xfs_sb_t, sb_inoalignmt),	0 },
-	{ offsetof(xfs_sb_t, sb_unit),		0 },
-	{ offsetof(xfs_sb_t, sb_width),		0 },
-	{ offsetof(xfs_sb_t, sb_dirblklog),	0 },
-	{ offsetof(xfs_sb_t, sb_logsectlog),	0 },
-	{ offsetof(xfs_sb_t, sb_logsectsize),	0 },
-	{ offsetof(xfs_sb_t, sb_logsunit),	0 },
-	{ offsetof(xfs_sb_t, sb_features2),	0 },
-	{ offsetof(xfs_sb_t, sb_bad_features2),	0 },
-	{ offsetof(xfs_sb_t, sb_features_compat),	0 },
-	{ offsetof(xfs_sb_t, sb_features_ro_compat),	0 },
-	{ offsetof(xfs_sb_t, sb_features_incompat),	0 },
-	{ offsetof(xfs_sb_t, sb_features_log_incompat),	0 },
-	{ offsetof(xfs_sb_t, sb_crc),		0 },
-	{ offsetof(xfs_sb_t, sb_pad),		0 },
-	{ offsetof(xfs_sb_t, sb_pquotino),	0 },
-	{ offsetof(xfs_sb_t, sb_lsn),		0 },
-	{ sizeof(xfs_sb_t),			0 }
-};
-
 /*
  * Reference counting access wrappers to the perag structures.
  * Because we never free per-ag structures, the only thing we
@@ -461,58 +398,49 @@ xfs_sb_from_disk(
 	__xfs_sb_from_disk(to, from, true);
 }
 
-static inline void
+static void
 xfs_sb_quota_to_disk(
-	xfs_dsb_t	*to,
-	xfs_sb_t	*from,
-	__int64_t	*fields)
+	struct xfs_dsb	*to,
+	struct xfs_sb	*from)
 {
 	__uint16_t	qflags = from->sb_qflags;
 
+	to->sb_uquotino = cpu_to_be64(from->sb_uquotino);
+	if (xfs_sb_version_has_pquotino(from)) {
+		to->sb_qflags = cpu_to_be16(from->sb_qflags);
+		to->sb_gquotino = cpu_to_be64(from->sb_gquotino);
+		to->sb_pquotino = cpu_to_be64(from->sb_pquotino);
+		return;
+	}
+
 	/*
-	 * We need to do these manipilations only if we are working
-	 * with an older version of on-disk superblock.
+	 * The in-core version of sb_qflags do not have XFS_OQUOTA_*
+	 * flags, whereas the on-disk version does. So, convert incore
+	 * XFS_{PG}QUOTA_* flags to on-disk XFS_OQUOTA_* flags.
 	 */
-	if (xfs_sb_version_has_pquotino(from))
-		return;
+	qflags &= ~(XFS_PQUOTA_ENFD | XFS_PQUOTA_CHKD |
+			XFS_GQUOTA_ENFD | XFS_GQUOTA_CHKD);
 
-	if (*fields & XFS_SB_QFLAGS) {
-		/*
-		 * The in-core version of sb_qflags do not have
-		 * XFS_OQUOTA_* flags, whereas the on-disk version
-		 * does. So, convert incore XFS_{PG}QUOTA_* flags
-		 * to on-disk XFS_OQUOTA_* flags.
-		 */
-		qflags &= ~(XFS_PQUOTA_ENFD | XFS_PQUOTA_CHKD |
-				XFS_GQUOTA_ENFD | XFS_GQUOTA_CHKD);
-
-		if (from->sb_qflags &
-				(XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD))
-			qflags |= XFS_OQUOTA_ENFD;
-		if (from->sb_qflags &
-				(XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD))
-			qflags |= XFS_OQUOTA_CHKD;
-		to->sb_qflags = cpu_to_be16(qflags);
-		*fields &= ~XFS_SB_QFLAGS;
-	}
+	if (from->sb_qflags &
+			(XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD))
+		qflags |= XFS_OQUOTA_ENFD;
+	if (from->sb_qflags &
+			(XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD))
+		qflags |= XFS_OQUOTA_CHKD;
+	to->sb_qflags = cpu_to_be16(qflags);
 
 	/*
-	 * GQUOTINO and PQUOTINO cannot be used together in versions of
-	 * superblock that do not have pquotino. from->sb_flags tells us which
-	 * quota is active and should be copied to disk. If neither are active,
-	 * make sure we write NULLFSINO to the sb_gquotino field as a quota
-	 * inode value of "0" is invalid when the XFS_SB_VERSION_QUOTA feature
-	 * bit is set.
+	 * GQUOTINO and PQUOTINO cannot be used together in versions
+	 * of superblock that do not have pquotino. from->sb_flags
+	 * tells us which quota is active and should be copied to
+	 * disk. If neither are active, we should NULL the inode.
 	 *
-	 * Note that we don't need to handle the sb_uquotino or sb_pquotino here
-	 * as they do not require any translation. Hence the main sb field loop
-	 * will write them appropriately from the in-core superblock.
+	 * In all cases, the separate pquotino must remain 0 because it
+	 * it beyond the "end" of the valid non-pquotino superblock.
 	 */
-	if ((*fields & XFS_SB_GQUOTINO) &&
-				(from->sb_qflags & XFS_GQUOTA_ACCT))
+	if (from->sb_qflags & XFS_GQUOTA_ACCT)
 		to->sb_gquotino = cpu_to_be64(from->sb_gquotino);
-	else if ((*fields & XFS_SB_PQUOTINO) &&
-				(from->sb_qflags & XFS_PQUOTA_ACCT))
+	else if (from->sb_qflags & XFS_PQUOTA_ACCT)
 		to->sb_gquotino = cpu_to_be64(from->sb_pquotino);
 	else {
 		/*
@@ -526,63 +454,78 @@ xfs_sb_quota_to_disk(
 		to->sb_gquotino = cpu_to_be64(NULLFSINO);
 	}
 
-	*fields &= ~(XFS_SB_PQUOTINO | XFS_SB_GQUOTINO);
+	to->sb_pquotino = 0;
 }
 
-/*
- * Copy in core superblock to ondisk one.
- *
- * The fields argument is mask of superblock fields to copy.
- */
 void
 xfs_sb_to_disk(
-	xfs_dsb_t	*to,
-	xfs_sb_t	*from,
-	__int64_t	fields)
+	struct xfs_dsb	*to,
+	struct xfs_sb	*from)
 {
-	xfs_caddr_t	to_ptr = (xfs_caddr_t)to;
-	xfs_caddr_t	from_ptr = (xfs_caddr_t)from;
-	xfs_sb_field_t	f;
-	int		first;
-	int		size;
-
-	ASSERT(fields);
-	if (!fields)
-		return;
+	xfs_sb_quota_to_disk(to, from);
 
-	/* We should never write the crc here, it's updated in the IO path */
-	fields &= ~XFS_SB_CRC;
-
-	xfs_sb_quota_to_disk(to, from, &fields);
-	while (fields) {
-		f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
-		first = xfs_sb_info[f].offset;
-		size = xfs_sb_info[f + 1].offset - first;
-
-		ASSERT(xfs_sb_info[f].type == 0 || xfs_sb_info[f].type == 1);
-
-		if (size == 1 || xfs_sb_info[f].type == 1) {
-			memcpy(to_ptr + first, from_ptr + first, size);
-		} else {
-			switch (size) {
-			case 2:
-				*(__be16 *)(to_ptr + first) =
-					cpu_to_be16(*(__u16 *)(from_ptr + first));
-				break;
-			case 4:
-				*(__be32 *)(to_ptr + first) =
-					cpu_to_be32(*(__u32 *)(from_ptr + first));
-				break;
-			case 8:
-				*(__be64 *)(to_ptr + first) =
-					cpu_to_be64(*(__u64 *)(from_ptr + first));
-				break;
-			default:
-				ASSERT(0);
-			}
-		}
-
-		fields &= ~(1LL << f);
+	to->sb_magicnum = cpu_to_be32(from->sb_magicnum);
+	to->sb_blocksize = cpu_to_be32(from->sb_blocksize);
+	to->sb_dblocks = cpu_to_be64(from->sb_dblocks);
+	to->sb_rblocks = cpu_to_be64(from->sb_rblocks);
+	to->sb_rextents = cpu_to_be64(from->sb_rextents);
+	memcpy(&to->sb_uuid, &from->sb_uuid, sizeof(to->sb_uuid));
+	to->sb_logstart = cpu_to_be64(from->sb_logstart);
+	to->sb_rootino = cpu_to_be64(from->sb_rootino);
+	to->sb_rbmino = cpu_to_be64(from->sb_rbmino);
+	to->sb_rsumino = cpu_to_be64(from->sb_rsumino);
+	to->sb_rextsize = cpu_to_be32(from->sb_rextsize);
+	to->sb_agblocks = cpu_to_be32(from->sb_agblocks);
+	to->sb_agcount = cpu_to_be32(from->sb_agcount);
+	to->sb_rbmblocks = cpu_to_be32(from->sb_rbmblocks);
+	to->sb_logblocks = cpu_to_be32(from->sb_logblocks);
+	to->sb_versionnum = cpu_to_be16(from->sb_versionnum);
+	to->sb_sectsize = cpu_to_be16(from->sb_sectsize);
+	to->sb_inodesize = cpu_to_be16(from->sb_inodesize);
+	to->sb_inopblock = cpu_to_be16(from->sb_inopblock);
+	memcpy(&to->sb_fname, &from->sb_fname, sizeof(to->sb_fname));
+	to->sb_blocklog = from->sb_blocklog;
+	to->sb_sectlog = from->sb_sectlog;
+	to->sb_inodelog = from->sb_inodelog;
+	to->sb_inopblog = from->sb_inopblog;
+	to->sb_agblklog = from->sb_agblklog;
+	to->sb_rextslog = from->sb_rextslog;
+	to->sb_inprogress = from->sb_inprogress;
+	to->sb_imax_pct = from->sb_imax_pct;
+	to->sb_icount = cpu_to_be64(from->sb_icount);
+	to->sb_ifree = cpu_to_be64(from->sb_ifree);
+	to->sb_fdblocks = cpu_to_be64(from->sb_fdblocks);
+	to->sb_frextents = cpu_to_be64(from->sb_frextents);
+
+	to->sb_flags = from->sb_flags;
+	to->sb_shared_vn = from->sb_shared_vn;
+	to->sb_inoalignmt = cpu_to_be32(from->sb_inoalignmt);
+	to->sb_unit = cpu_to_be32(from->sb_unit);
+	to->sb_width = cpu_to_be32(from->sb_width);
+	to->sb_dirblklog = from->sb_dirblklog;
+	to->sb_logsectlog = from->sb_logsectlog;
+	to->sb_logsectsize = cpu_to_be16(from->sb_logsectsize);
+	to->sb_logsunit = cpu_to_be32(from->sb_logsunit);
+
+	/*
+	 * We need to ensure that bad_features2 always matches features2.
+	 * Hence we enforce that here rather than having to remember to do it
+	 * everywhere else that updates features2.
+	 */
+	from->sb_bad_features2 = from->sb_features2;
+	to->sb_features2 = cpu_to_be32(from->sb_features2);
+	to->sb_bad_features2 = cpu_to_be32(from->sb_bad_features2);
+
+	if (xfs_sb_version_hascrc(from)) {
+		to->sb_features_compat = cpu_to_be32(from->sb_features_compat);
+		to->sb_features_ro_compat =
+				cpu_to_be32(from->sb_features_ro_compat);
+		to->sb_features_incompat =
+				cpu_to_be32(from->sb_features_incompat);
+		to->sb_features_log_incompat =
+				cpu_to_be32(from->sb_features_log_incompat);
+		to->sb_pad = 0;
+		to->sb_lsn = cpu_to_be64(from->sb_lsn);
 	}
 }
 
@@ -816,42 +759,51 @@ xfs_initialize_perag_data(
 }
 
 /*
- * xfs_mod_sb() can be used to copy arbitrary changes to the
- * in-core superblock into the superblock buffer to be logged.
- * It does not provide the higher level of locking that is
- * needed to protect the in-core superblock from concurrent
- * access.
+ * xfs_log_sb() can be used to copy arbitrary changes to the in-core superblock
+ * into the superblock buffer to be logged.  It does not provide the higher
+ * level of locking that is needed to protect the in-core superblock from
+ * concurrent access.
  */
 void
-xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
+xfs_log_sb(
+	struct xfs_trans	*tp)
 {
-	xfs_buf_t	*bp;
-	int		first;
-	int		last;
-	xfs_mount_t	*mp;
-	xfs_sb_field_t	f;
-
-	ASSERT(fields);
-	if (!fields)
-		return;
-	mp = tp->t_mountp;
-	bp = xfs_trans_getsb(tp, mp, 0);
-	first = sizeof(xfs_sb_t);
-	last = 0;
-
-	/* translate/copy */
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_buf		*bp = xfs_trans_getsb(tp, mp, 0);
 
-	xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, fields);
+	xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb);
+	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
+	xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb));
+}
 
-	/* find modified range */
-	f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields);
-	ASSERT((1LL << f) & XFS_SB_MOD_BITS);
-	last = xfs_sb_info[f + 1].offset - 1;
+/*
+ * xfs_sync_sb
+ *
+ * Sync the superblock to disk.
+ *
+ * Note that the caller is responsible for checking the frozen state of the
+ * filesystem. This procedure uses the non-blocking transaction allocator and
+ * thus will allow modifications to a frozen fs. This is required because this
+ * code can be called during the process of freezing where use of the high-level
+ * allocator would deadlock.
+ */
+int
+xfs_sync_sb(
+	struct xfs_mount	*mp,
+	bool			wait)
+{
+	struct xfs_trans	*tp;
+	int			error;
 
-	f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
-	ASSERT((1LL << f) & XFS_SB_MOD_BITS);
-	first = xfs_sb_info[f].offset;
+	tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_CHANGE, KM_SLEEP);
+	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0);
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		return error;
+	}
 
-	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
-	xfs_trans_log_buf(tp, bp, first, last);
+	xfs_log_sb(tp);
+	if (wait)
+		xfs_trans_set_sync(tp);
+	return xfs_trans_commit(tp, 0);
 }
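The xfs_sb.c rewrite above replaces the offset/size table walk with one explicit endian conversion per field, which is also what lets the sb_bad_features2 mirroring be enforced in a single place at format time. A reduced, compilable sketch of that style; the two structures are hypothetical stand-ins and the htobe32()/htobe64() helpers are the glibc ones from <endian.h>:

#include <endian.h>
#include <stdint.h>

struct sb_incore {		/* native-endian, in memory */
	uint32_t	magic;
	uint64_t	blocks;
	uint32_t	features2;
	uint32_t	bad_features2;
};

struct sb_disk {		/* big-endian, on disk */
	uint32_t	magic;
	uint64_t	blocks;
	uint32_t	features2;
	uint32_t	bad_features2;
};

static void sb_to_disk(struct sb_disk *to, struct sb_incore *from)
{
	/* one explicit conversion per field; nothing is skipped by a mask */
	to->magic  = htobe32(from->magic);
	to->blocks = htobe64(from->blocks);

	/* enforce the mirror at format time, as the patched function does */
	from->bad_features2 = from->features2;
	to->features2      = htobe32(from->features2);
	to->bad_features2  = htobe32(from->bad_features2);
}

The trade-off is that the whole buffer is always rewritten and logged, rather than just the modified byte range; the commit accepts that cost for simplicity.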
diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h
index 8eb1c54bafbf..b25bb9a343f3 100644
--- a/fs/xfs/libxfs/xfs_sb.h
+++ b/fs/xfs/libxfs/xfs_sb.h
@@ -27,11 +27,12 @@ extern struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *, xfs_agnumber_t,
 extern void	xfs_perag_put(struct xfs_perag *pag);
 extern int	xfs_initialize_perag_data(struct xfs_mount *, xfs_agnumber_t);
 
-extern void	xfs_sb_calc_crc(struct xfs_buf *);
-extern void	xfs_mod_sb(struct xfs_trans *, __int64_t);
-extern void	xfs_sb_mount_common(struct xfs_mount *, struct xfs_sb *);
-extern void	xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
-extern void	xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
+extern void	xfs_sb_calc_crc(struct xfs_buf *bp);
+extern void	xfs_log_sb(struct xfs_trans *tp);
+extern int	xfs_sync_sb(struct xfs_mount *mp, bool wait);
+extern void	xfs_sb_mount_common(struct xfs_mount *mp, struct xfs_sb *sbp);
+extern void	xfs_sb_from_disk(struct xfs_sb *to, struct xfs_dsb *from);
+extern void	xfs_sb_to_disk(struct xfs_dsb *to, struct xfs_sb *from);
 extern void	xfs_sb_quota_from_disk(struct xfs_sb *sbp);
 
 #endif	/* __XFS_SB_H__ */
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 82404da2ca67..8dda4b321343 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -82,7 +82,7 @@ extern const struct xfs_buf_ops xfs_symlink_buf_ops;
 #define XFS_TRANS_ATTR_RM		23
 #define XFS_TRANS_ATTR_FLAG		24
 #define XFS_TRANS_CLEAR_AGI_BUCKET	25
-#define XFS_TRANS_QM_SBCHANGE		26
+#define XFS_TRANS_SB_CHANGE		26
 /*
  * Dummy entries since we use the transaction type to index into the
  * trans_type[] in xlog_recover_print_trans_head()
@@ -95,17 +95,15 @@ extern const struct xfs_buf_ops xfs_symlink_buf_ops;
 #define XFS_TRANS_QM_DQCLUSTER		32
 #define XFS_TRANS_QM_QINOCREATE		33
 #define XFS_TRANS_QM_QUOTAOFF_END	34
-#define XFS_TRANS_SB_UNIT		35
-#define XFS_TRANS_FSYNC_TS		36
-#define XFS_TRANS_GROWFSRT_ALLOC	37
-#define XFS_TRANS_GROWFSRT_ZERO		38
-#define XFS_TRANS_GROWFSRT_FREE		39
-#define XFS_TRANS_SWAPEXT		40
-#define XFS_TRANS_SB_COUNT		41
-#define XFS_TRANS_CHECKPOINT		42
-#define XFS_TRANS_ICREATE		43
-#define XFS_TRANS_CREATE_TMPFILE	44
-#define XFS_TRANS_TYPE_MAX		44
+#define XFS_TRANS_FSYNC_TS		35
+#define XFS_TRANS_GROWFSRT_ALLOC	36
+#define XFS_TRANS_GROWFSRT_ZERO		37
+#define XFS_TRANS_GROWFSRT_FREE		38
+#define XFS_TRANS_SWAPEXT		39
+#define XFS_TRANS_CHECKPOINT		40
+#define XFS_TRANS_ICREATE		41
+#define XFS_TRANS_CREATE_TMPFILE	42
+#define XFS_TRANS_TYPE_MAX		43
 /* new transaction types need to be reflected in xfs_logprint(8) */
 
 #define XFS_TRANS_TYPES \
@@ -113,7 +111,6 @@ extern const struct xfs_buf_ops xfs_symlink_buf_ops;
 	{ XFS_TRANS_SETATTR_SIZE,	"SETATTR_SIZE" }, \
 	{ XFS_TRANS_INACTIVE,		"INACTIVE" }, \
 	{ XFS_TRANS_CREATE,		"CREATE" }, \
-	{ XFS_TRANS_CREATE_TMPFILE,	"CREATE_TMPFILE" }, \
 	{ XFS_TRANS_CREATE_TRUNC,	"CREATE_TRUNC" }, \
 	{ XFS_TRANS_TRUNCATE_FILE,	"TRUNCATE_FILE" }, \
 	{ XFS_TRANS_REMOVE,		"REMOVE" }, \
@@ -134,23 +131,23 @@ extern const struct xfs_buf_ops xfs_symlink_buf_ops;
 	{ XFS_TRANS_ATTR_RM,		"ATTR_RM" }, \
 	{ XFS_TRANS_ATTR_FLAG,		"ATTR_FLAG" }, \
 	{ XFS_TRANS_CLEAR_AGI_BUCKET,	"CLEAR_AGI_BUCKET" }, \
-	{ XFS_TRANS_QM_SBCHANGE,	"QM_SBCHANGE" }, \
+	{ XFS_TRANS_SB_CHANGE,		"SBCHANGE" }, \
+	{ XFS_TRANS_DUMMY1,		"DUMMY1" }, \
+	{ XFS_TRANS_DUMMY2,		"DUMMY2" }, \
 	{ XFS_TRANS_QM_QUOTAOFF,	"QM_QUOTAOFF" }, \
 	{ XFS_TRANS_QM_DQALLOC,		"QM_DQALLOC" }, \
 	{ XFS_TRANS_QM_SETQLIM,		"QM_SETQLIM" }, \
 	{ XFS_TRANS_QM_DQCLUSTER,	"QM_DQCLUSTER" }, \
 	{ XFS_TRANS_QM_QINOCREATE,	"QM_QINOCREATE" }, \
 	{ XFS_TRANS_QM_QUOTAOFF_END,	"QM_QOFF_END" }, \
-	{ XFS_TRANS_SB_UNIT,		"SB_UNIT" }, \
 	{ XFS_TRANS_FSYNC_TS,		"FSYNC_TS" }, \
 	{ XFS_TRANS_GROWFSRT_ALLOC,	"GROWFSRT_ALLOC" }, \
 	{ XFS_TRANS_GROWFSRT_ZERO,	"GROWFSRT_ZERO" }, \
 	{ XFS_TRANS_GROWFSRT_FREE,	"GROWFSRT_FREE" }, \
 	{ XFS_TRANS_SWAPEXT,		"SWAPEXT" }, \
-	{ XFS_TRANS_SB_COUNT,		"SB_COUNT" }, \
 	{ XFS_TRANS_CHECKPOINT,		"CHECKPOINT" }, \
-	{ XFS_TRANS_DUMMY1,		"DUMMY1" }, \
-	{ XFS_TRANS_DUMMY2,		"DUMMY2" }, \
+	{ XFS_TRANS_ICREATE,		"ICREATE" }, \
+	{ XFS_TRANS_CREATE_TMPFILE,	"CREATE_TMPFILE" }, \
 	{ XLOG_UNMOUNT_REC_TYPE,	"UNMOUNT" }
 
 /*
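The xfs_shared.h hunks renumber the transaction-type constants and reorder the matching name table by hand; the two can silently drift apart. A small sketch of one way to keep such tables in lockstep, using designated initializers keyed by the enum (hypothetical names, not the fs/xfs layout):

#include <stdio.h>

enum trans_type { T_SB_CHANGE, T_FSYNC_TS, T_SWAPEXT, T_TYPE_MAX };

/* Keyed by enumerator, so renumbering can never misalign the strings. */
static const char *const trans_name[T_TYPE_MAX] = {
	[T_SB_CHANGE]	= "SBCHANGE",
	[T_FSYNC_TS]	= "FSYNC_TS",
	[T_SWAPEXT]	= "SWAPEXT",
};

int main(void)
{
	for (int i = 0; i < T_TYPE_MAX; i++)
		printf("%d -> %s\n", i, trans_name[i]);
	return 0;
}

With this layout, deleting an enumerator leaves at worst a NULL slot to assert against, instead of every later entry printing the wrong name.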
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c
index c80c5236c3da..e7e26bd6468f 100644
--- a/fs/xfs/libxfs/xfs_symlink_remote.c
+++ b/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -178,6 +178,8 @@ xfs_symlink_local_to_remote(
 	struct xfs_mount	*mp = ip->i_mount;
 	char			*buf;
 
+	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SYMLINK_BUF);
+
 	if (!xfs_sb_version_hascrc(&mp->m_sb)) {
 		bp->b_ops = NULL;
 		memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index 6c1330f29050..68cb1e7bf2bb 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -716,17 +716,6 @@ xfs_calc_clear_agi_bucket_reservation(
 }
 
 /*
- * Clearing the quotaflags in the superblock.
- * the super block for changing quota flags: sector size
- */
-STATIC uint
-xfs_calc_qm_sbchange_reservation(
-	struct xfs_mount	*mp)
-{
-	return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
-}
-
-/*
  * Adjusting quota limits.
  * the xfs_disk_dquot_t: sizeof(struct xfs_disk_dquot)
  */
@@ -864,9 +853,6 @@ xfs_trans_resv_calc(
 	 * The following transactions are logged in logical format with
 	 * a default log count.
 	 */
-	resp->tr_qm_sbchange.tr_logres = xfs_calc_qm_sbchange_reservation(mp);
-	resp->tr_qm_sbchange.tr_logcount = XFS_DEFAULT_LOG_COUNT;
-
 	resp->tr_qm_setqlim.tr_logres = xfs_calc_qm_setqlim_reservation(mp);
 	resp->tr_qm_setqlim.tr_logcount = XFS_DEFAULT_LOG_COUNT;
 
diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h
index 1097d14cd583..2d5bdfce6d8f 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.h
+++ b/fs/xfs/libxfs/xfs_trans_resv.h
@@ -56,7 +56,6 @@ struct xfs_trans_resv {
 	struct xfs_trans_res	tr_growrtalloc;	/* grow realtime allocations */
 	struct xfs_trans_res	tr_growrtzero;	/* grow realtime zeroing */
 	struct xfs_trans_res	tr_growrtfree;	/* grow realtime freeing */
-	struct xfs_trans_res	tr_qm_sbchange;	/* change quota flags */
 	struct xfs_trans_res	tr_qm_setqlim;	/* adjust quota limits */
 	struct xfs_trans_res	tr_qm_dqalloc;	/* allocate quota on disk */
 	struct xfs_trans_res	tr_qm_quotaoff;	/* turn quota off */
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index b79dc66b2ecd..b79dc66b2ecd 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 18e2f3bbae5e..3a9b7a1b8704 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -135,30 +135,22 @@ xfs_setfilesize_trans_alloc(
  */
 STATIC int
 xfs_setfilesize(
-	struct xfs_ioend	*ioend)
+	struct xfs_inode	*ip,
+	struct xfs_trans	*tp,
+	xfs_off_t		offset,
+	size_t			size)
 {
-	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
-	struct xfs_trans	*tp = ioend->io_append_trans;
 	xfs_fsize_t		isize;
 
-	/*
-	 * The transaction may have been allocated in the I/O submission thread,
-	 * thus we need to mark ourselves as beeing in a transaction manually.
-	 * Similarly for freeze protection.
-	 */
-	current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
-	rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
-			   0, 1, _THIS_IP_);
-
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size);
+	isize = xfs_new_eof(ip, offset + size);
 	if (!isize) {
 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 		xfs_trans_cancel(tp, 0);
 		return 0;
 	}
 
-	trace_xfs_setfilesize(ip, ioend->io_offset, ioend->io_size);
+	trace_xfs_setfilesize(ip, offset, size);
 
 	ip->i_d.di_size = isize;
 	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
@@ -167,6 +159,25 @@ xfs_setfilesize(
 	return xfs_trans_commit(tp, 0);
 }
 
+STATIC int
+xfs_setfilesize_ioend(
+	struct xfs_ioend	*ioend)
+{
+	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
+	struct xfs_trans	*tp = ioend->io_append_trans;
+
+	/*
+	 * The transaction may have been allocated in the I/O submission thread,
+	 * thus we need to mark ourselves as being in a transaction manually.
+	 * Similarly for freeze protection.
+	 */
+	current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
+	rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
+			   0, 1, _THIS_IP_);
+
+	return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
+}
+
 /*
  * Schedule IO completion handling on the final put of an ioend.
  *
@@ -182,8 +193,7 @@ xfs_finish_ioend(
 
 	if (ioend->io_type == XFS_IO_UNWRITTEN)
 		queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
-	else if (ioend->io_append_trans ||
-		 (ioend->io_isdirect && xfs_ioend_is_append(ioend)))
+	else if (ioend->io_append_trans)
 		queue_work(mp->m_data_workqueue, &ioend->io_work);
 	else
 		xfs_destroy_ioend(ioend);
@@ -215,22 +225,8 @@ xfs_end_io(
 	if (ioend->io_type == XFS_IO_UNWRITTEN) {
 		error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
 						  ioend->io_size);
-	} else if (ioend->io_isdirect && xfs_ioend_is_append(ioend)) {
-		/*
-		 * For direct I/O we do not know if we need to allocate blocks
-		 * or not so we can't preallocate an append transaction as that
-		 * results in nested reservations and log space deadlocks. Hence
-		 * allocate the transaction here. While this is sub-optimal and
-		 * can block IO completion for some time, we're stuck with doing
-		 * it this way until we can pass the ioend to the direct IO
-		 * allocation callbacks and avoid nesting that way.
-		 */
-		error = xfs_setfilesize_trans_alloc(ioend);
-		if (error)
-			goto done;
-		error = xfs_setfilesize(ioend);
 	} else if (ioend->io_append_trans) {
-		error = xfs_setfilesize(ioend);
+		error = xfs_setfilesize_ioend(ioend);
 	} else {
 		ASSERT(!xfs_ioend_is_append(ioend));
 	}
@@ -242,17 +238,6 @@ done:
 }
 
 /*
- * Call IO completion handling in caller context on the final put of an ioend.
- */
-STATIC void
-xfs_finish_ioend_sync(
-	struct xfs_ioend	*ioend)
-{
-	if (atomic_dec_and_test(&ioend->io_remaining))
-		xfs_end_io(&ioend->io_work);
-}
-
-/*
  * Allocate and initialise an IO completion structure.
  * We need to track unwritten extent write completion here initially.
  * We'll need to extend this for updating the ondisk inode size later
@@ -273,7 +258,6 @@ xfs_alloc_ioend(
 	 * all the I/O from calling the completion routine too early.
 	 */
 	atomic_set(&ioend->io_remaining, 1);
-	ioend->io_isdirect = 0;
 	ioend->io_error = 0;
 	ioend->io_list = NULL;
 	ioend->io_type = type;
@@ -1459,11 +1443,7 @@ xfs_get_blocks_direct(
  *
  * If the private argument is non-NULL __xfs_get_blocks signals us that we
  * need to issue a transaction to convert the range from unwritten to written
- * extents. In case this is regular synchronous I/O we just call xfs_end_io
- * to do this and we are done. But in case this was a successful AIO
- * request this handler is called from interrupt context, from which we
- * can't start transactions. In that case offload the I/O completion to
- * the workqueues we also use for buffered I/O completion.
+ * extents.
  */
 STATIC void
 xfs_end_io_direct_write(
@@ -1472,7 +1452,12 @@ xfs_end_io_direct_write(
 	ssize_t			size,
 	void			*private)
 {
-	struct xfs_ioend	*ioend = iocb->private;
+	struct inode		*inode = file_inode(iocb->ki_filp);
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return;
 
 	/*
 	 * While the generic direct I/O code updates the inode size, it does
@@ -1480,22 +1465,33 @@ xfs_end_io_direct_write(
 	 * end_io handler thinks the on-disk size is outside the in-core
 	 * size. To prevent this just update it a little bit earlier here.
 	 */
-	if (offset + size > i_size_read(ioend->io_inode))
-		i_size_write(ioend->io_inode, offset + size);
+	if (offset + size > i_size_read(inode))
+		i_size_write(inode, offset + size);
 
 	/*
-	 * blockdev_direct_IO can return an error even after the I/O
-	 * completion handler was called. Thus we need to protect
-	 * against double-freeing.
+	 * For direct I/O we do not know if we need to allocate blocks or not,
+	 * so we can't preallocate an append transaction, as that results in
+	 * nested reservations and log space deadlocks. Hence allocate the
+	 * transaction here. While this is sub-optimal and can block IO
+	 * completion for some time, we're stuck with doing it this way until
+	 * we can pass the ioend to the direct IO allocation callbacks and
+	 * avoid nesting that way.
 	 */
-	iocb->private = NULL;
-
-	ioend->io_offset = offset;
-	ioend->io_size = size;
-	if (private && size > 0)
-		ioend->io_type = XFS_IO_UNWRITTEN;
+	if (private && size > 0) {
+		xfs_iomap_write_unwritten(ip, offset, size);
+	} else if (offset + size > ip->i_d.di_size) {
+		struct xfs_trans	*tp;
+		int			error;
+
+		tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
+		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
+		if (error) {
+			xfs_trans_cancel(tp, 0);
+			return;
+		}
 
-	xfs_finish_ioend_sync(ioend);
+		xfs_setfilesize(ip, tp, offset, size);
+	}
 }
 
 STATIC ssize_t
@@ -1507,39 +1503,16 @@ xfs_vm_direct_IO(
 {
 	struct inode		*inode = iocb->ki_filp->f_mapping->host;
 	struct block_device	*bdev = xfs_find_bdev_for_inode(inode);
-	struct xfs_ioend	*ioend = NULL;
-	ssize_t			ret;
 
 	if (rw & WRITE) {
-		size_t size = iov_iter_count(iter);
-
-		/*
-		 * We cannot preallocate a size update transaction here as we
-		 * don't know whether allocation is necessary or not. Hence we
-		 * can only tell IO completion that one is necessary if we are
-		 * not doing unwritten extent conversion.
-		 */
-		iocb->private = ioend = xfs_alloc_ioend(inode, XFS_IO_DIRECT);
-		if (offset + size > XFS_I(inode)->i_d.di_size)
-			ioend->io_isdirect = 1;
-
-		ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iter,
+		return __blockdev_direct_IO(rw, iocb, inode, bdev, iter,
 					    offset, xfs_get_blocks_direct,
 					    xfs_end_io_direct_write, NULL,
 					    DIO_ASYNC_EXTEND);
-		if (ret != -EIOCBQUEUED && iocb->private)
-			goto out_destroy_ioend;
-	} else {
-		ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iter,
-					   offset, xfs_get_blocks_direct,
-					   NULL, NULL, 0);
 	}
-
-	return ret;
-
-out_destroy_ioend:
-	xfs_destroy_ioend(ioend);
-	return ret;
+	return __blockdev_direct_IO(rw, iocb, inode, bdev, iter,
+				    offset, xfs_get_blocks_direct,
+				    NULL, NULL, 0);
 }
 
 /*
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index f94dd459dff9..ac644e0137a4 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -24,14 +24,12 @@ extern mempool_t *xfs_ioend_pool;
  * Types of I/O for bmap clustering and I/O completion tracking.
  */
 enum {
-	XFS_IO_DIRECT = 0,	/* special case for direct I/O ioends */
 	XFS_IO_DELALLOC,	/* covers delalloc region */
 	XFS_IO_UNWRITTEN,	/* covers allocated but uninitialized data */
 	XFS_IO_OVERWRITE,	/* covers already allocated extent */
 };
 
 #define XFS_IO_TYPES \
-	{ 0,			"" }, \
 	{ XFS_IO_DELALLOC,		"delalloc" }, \
 	{ XFS_IO_UNWRITTEN,		"unwritten" }, \
 	{ XFS_IO_OVERWRITE,		"overwrite" }
@@ -45,7 +43,6 @@ typedef struct xfs_ioend {
 	unsigned int		io_type;	/* delalloc / unwritten */
 	int			io_error;	/* I/O error code */
 	atomic_t		io_remaining;	/* hold count */
-	unsigned int		io_isdirect : 1;/* direct I/O */
 	struct inode		*io_inode;	/* file being written to */
 	struct buffer_head	*io_buffer_head;/* buffer linked list head */
 	struct buffer_head	*io_buffer_tail;/* buffer linked list tail */
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 2fdb72d2c908..736429a72a12 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -26,43 +26,8 @@ struct xfs_ifork;
 struct xfs_inode;
 struct xfs_mount;
 struct xfs_trans;
+struct xfs_bmalloca;

-/*
- * Argument structure for xfs_bmap_alloc.
- */
-struct xfs_bmalloca {
-	xfs_fsblock_t		*firstblock; /* i/o first block allocated */
-	struct xfs_bmap_free	*flist;	/* bmap freelist */
-	struct xfs_trans	*tp;	/* transaction pointer */
-	struct xfs_inode	*ip;	/* incore inode pointer */
-	struct xfs_bmbt_irec	prev;	/* extent before the new one */
-	struct xfs_bmbt_irec	got;	/* extent after, or delayed */
-
-	xfs_fileoff_t		offset;	/* offset in file filling in */
-	xfs_extlen_t		length;	/* i/o length asked/allocated */
-	xfs_fsblock_t		blkno;	/* starting block of new extent */
-
-	struct xfs_btree_cur	*cur;	/* btree cursor */
-	xfs_extnum_t		idx;	/* current extent index */
-	int			nallocs;/* number of extents alloc'd */
-	int			logflags;/* flags for transaction logging */
-
-	xfs_extlen_t		total;	/* total blocks needed for xaction */
-	xfs_extlen_t		minlen;	/* minimum allocation size (blocks) */
-	xfs_extlen_t		minleft; /* amount must be left after alloc */
-	bool			eof;	/* set if allocating past last extent */
-	bool			wasdel;	/* replacing a delayed allocation */
-	bool			userdata;/* set if is user data */
-	bool			aeof;	/* allocated space at eof */
-	bool			conv;	/* overwriting unwritten extents */
-	int			flags;
-	struct completion	*done;
-	struct work_struct	work;
-	int			result;
-};
-
-int	xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist,
-			int *committed);
 int	xfs_bmap_rtalloc(struct xfs_bmalloca *ap);
 int	xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff,
 		     int whichfork, int *eof);
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index bb502a391792..1790b00bea7a 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1488,6 +1488,7 @@ xfs_buf_iomove(
 static enum lru_status
 xfs_buftarg_wait_rele(
 	struct list_head	*item,
+	struct list_lru_one	*lru,
 	spinlock_t		*lru_lock,
 	void			*arg)

@@ -1509,7 +1510,7 @@ xfs_buftarg_wait_rele(
 	 */
 	atomic_set(&bp->b_lru_ref, 0);
 	bp->b_state |= XFS_BSTATE_DISPOSE;
-	list_move(item, dispose);
+	list_lru_isolate_move(lru, item, dispose);
 	spin_unlock(&bp->b_lock);
 	return LRU_REMOVED;
 }
@@ -1546,6 +1547,7 @@ xfs_wait_buftarg(
 static enum lru_status
 xfs_buftarg_isolate(
 	struct list_head	*item,
+	struct list_lru_one	*lru,
 	spinlock_t		*lru_lock,
 	void			*arg)
 {
@@ -1569,7 +1571,7 @@ xfs_buftarg_isolate(
 	}

 	bp->b_state |= XFS_BSTATE_DISPOSE;
-	list_move(item, dispose);
+	list_lru_isolate_move(lru, item, dispose);
 	spin_unlock(&bp->b_lock);
 	return LRU_REMOVED;
 }
@@ -1583,10 +1585,9 @@ xfs_buftarg_shrink_scan(
 					struct xfs_buftarg, bt_shrinker);
 	LIST_HEAD(dispose);
 	unsigned long		freed;
-	unsigned long		nr_to_scan = sc->nr_to_scan;

-	freed = list_lru_walk_node(&btp->bt_lru, sc->nid, xfs_buftarg_isolate,
-				   &dispose, &nr_to_scan);
+	freed = list_lru_shrink_walk(&btp->bt_lru, sc,
+				     xfs_buftarg_isolate, &dispose);

 	while (!list_empty(&dispose)) {
 		struct xfs_buf *bp;
@@ -1605,7 +1606,7 @@ xfs_buftarg_shrink_count(
 {
 	struct xfs_buftarg	*btp = container_of(shrink,
 					struct xfs_buftarg, bt_shrinker);
-	return list_lru_count_node(&btp->bt_lru, sc->nid);
+	return list_lru_shrink_count(&btp->bt_lru, sc);
 }

 void
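
The buftarg shrinker changes above track the memcg-aware list_lru rework in this cycle: isolate callbacks now receive the struct list_lru_one they are walking and must detach items through the list_lru_isolate helpers so the per-list item counts stay correct. A condensed sketch of the new callback contract (demo_isolate and the dispose-list argument are illustrative, not the XFS code itself):

#include <linux/list_lru.h>

static enum lru_status
demo_isolate(struct list_head *item, struct list_lru_one *lru,
	     spinlock_t *lru_lock, void *arg)
{
	struct list_head *dispose = arg;

	/* moves item off the LRU and decrements lru->nr_items */
	list_lru_isolate_move(lru, item, dispose);
	return LRU_REMOVED;
}
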
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 3f9bd58edec7..507d96a57ac7 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -319,6 +319,10 @@ xfs_buf_item_format(
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
 	ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
 	       (bip->bli_flags & XFS_BLI_STALE));
+	ASSERT((bip->bli_flags & XFS_BLI_STALE) ||
+	       (xfs_blft_from_flags(&bip->__bli_format) > XFS_BLFT_UNKNOWN_BUF
+	        && xfs_blft_from_flags(&bip->__bli_format) < XFS_BLFT_MAX_BUF));
+

 	/*
 	 * If it is an inode buffer, transfer the in-memory state to the
@@ -535,7 +539,7 @@ xfs_buf_item_push(
 	if ((bp->b_flags & XBF_WRITE_FAIL) &&
 	    ___ratelimit(&xfs_buf_write_fail_rl_state, "XFS:")) {
 		xfs_warn(bp->b_target->bt_mount,
-"Detected failing async write on buffer block 0x%llx. Retrying async write.\n",
+"Detected failing async write on buffer block 0x%llx. Retrying async write.",
 			 (long long)bp->b_bn);
 	}

diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index c24c67e22a2a..2f536f33cd26 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -86,7 +86,7 @@ static inline void xfs_dqflock(xfs_dquot_t *dqp)
 	wait_for_completion(&dqp->q_flush);
 }

-static inline int xfs_dqflock_nowait(xfs_dquot_t *dqp)
+static inline bool xfs_dqflock_nowait(xfs_dquot_t *dqp)
 {
 	return try_wait_for_completion(&dqp->q_flush);
 }
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 5eb4a14e0a0f..b97359ba2648 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -30,6 +30,7 @@
 #include "xfs_trace.h"
 #include "xfs_icache.h"
 #include "xfs_log.h"
+#include "xfs_pnfs.h"

 /*
  * Note that we only accept fileids which are long enough rather than allow
35 * Note that we only accept fileids which are long enough rather than allow 36 * Note that we only accept fileids which are long enough rather than allow
@@ -245,4 +246,9 @@ const struct export_operations xfs_export_operations = {
 	.fh_to_parent		= xfs_fs_fh_to_parent,
 	.get_parent		= xfs_fs_get_parent,
 	.commit_metadata	= xfs_fs_nfs_commit_metadata,
+#ifdef CONFIG_NFSD_PNFS
+	.get_uuid		= xfs_fs_get_uuid,
+	.map_blocks		= xfs_fs_map_blocks,
+	.commit_blocks		= xfs_fs_commit_blocks,
+#endif
 };
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 13e974e6a889..ce615d12fb44 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -36,6 +36,7 @@
 #include "xfs_trace.h"
 #include "xfs_log.h"
 #include "xfs_icache.h"
+#include "xfs_pnfs.h"

 #include <linux/aio.h>
 #include <linux/dcache.h>
@@ -127,6 +128,42 @@ xfs_iozero(
 	return (-status);
 }

+int
+xfs_update_prealloc_flags(
+	struct xfs_inode	*ip,
+	enum xfs_prealloc_flags	flags)
+{
+	struct xfs_trans	*tp;
+	int			error;
+
+	tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID);
+	error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0);
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		return error;
+	}
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+	if (!(flags & XFS_PREALLOC_INVISIBLE)) {
+		ip->i_d.di_mode &= ~S_ISUID;
+		if (ip->i_d.di_mode & S_IXGRP)
+			ip->i_d.di_mode &= ~S_ISGID;
+		xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	}
+
+	if (flags & XFS_PREALLOC_SET)
+		ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
+	if (flags & XFS_PREALLOC_CLEAR)
+		ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
+
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	if (flags & XFS_PREALLOC_SYNC)
+		xfs_trans_set_sync(tp);
+	return xfs_trans_commit(tp, 0);
+}
+
 /*
  * Fsync operations on directories are much simpler than on regular files,
  * as there is no file data to flush, and thus also no need for explicit
@@ -518,6 +555,10 @@ restart:
 	if (error)
 		return error;

+	error = xfs_break_layouts(inode, iolock);
+	if (error)
+		return error;
+
 	/*
 	 * If the offset is beyond the size of the file, we need to zero any
 	 * blocks that fall between the existing EOF and the start of this
@@ -699,7 +740,7 @@ xfs_file_buffered_aio_write(

 	iov_iter_truncate(from, count);
 	/* We can write back this queue in page reclaim */
-	current->backing_dev_info = mapping->backing_dev_info;
+	current->backing_dev_info = inode_to_bdi(inode);

 write_retry:
 	trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
@@ -784,8 +825,9 @@ xfs_file_fallocate(
 {
 	struct inode		*inode = file_inode(file);
 	struct xfs_inode	*ip = XFS_I(inode);
-	struct xfs_trans	*tp;
 	long			error;
+	enum xfs_prealloc_flags	flags = 0;
+	uint			iolock = XFS_IOLOCK_EXCL;
 	loff_t			new_size = 0;

 	if (!S_ISREG(inode->i_mode))
@@ -794,7 +836,11 @@ xfs_file_fallocate(
 		     FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
 		return -EOPNOTSUPP;

-	xfs_ilock(ip, XFS_IOLOCK_EXCL);
+	xfs_ilock(ip, iolock);
+	error = xfs_break_layouts(inode, &iolock);
+	if (error)
+		goto out_unlock;
+
 	if (mode & FALLOC_FL_PUNCH_HOLE) {
 		error = xfs_free_file_space(ip, offset, len);
 		if (error)
@@ -822,6 +868,8 @@ xfs_file_fallocate(
 		if (error)
 			goto out_unlock;
 	} else {
+		flags |= XFS_PREALLOC_SET;
+
 		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
 		    offset + len > i_size_read(inode)) {
 			new_size = offset + len;
@@ -839,28 +887,10 @@ xfs_file_fallocate(
 			goto out_unlock;
 	}

-	tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID);
-	error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0);
-	if (error) {
-		xfs_trans_cancel(tp, 0);
-		goto out_unlock;
-	}
-
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-	ip->i_d.di_mode &= ~S_ISUID;
-	if (ip->i_d.di_mode & S_IXGRP)
-		ip->i_d.di_mode &= ~S_ISGID;
-
-	if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE)))
-		ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
-
-	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-
 	if (file->f_flags & O_DSYNC)
-		xfs_trans_set_sync(tp);
-	error = xfs_trans_commit(tp, 0);
+		flags |= XFS_PREALLOC_SYNC;
+
+	error = xfs_update_prealloc_flags(ip, flags);
 	if (error)
 		goto out_unlock;

@@ -874,7 +904,7 @@ xfs_file_fallocate(
 	}

 out_unlock:
-	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+	xfs_iunlock(ip, iolock);
 	return error;
 }

@@ -1384,5 +1414,4 @@ static const struct vm_operations_struct xfs_file_vm_ops = {
 	.fault		= filemap_fault,
 	.map_pages	= filemap_map_pages,
 	.page_mkwrite	= xfs_vm_page_mkwrite,
-	.remap_pages	= generic_file_remap_pages,
 };
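
With the fallocate path now funnelled through xfs_update_prealloc_flags(), an O_DSYNC open translates into XFS_PREALLOC_SYNC and a synchronous prealloc transaction. A small userspace sketch of that path (the path and reservation size are hypothetical):

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int	fd = open("/mnt/xfs/prealloc.dat",
			  O_CREAT | O_WRONLY | O_DSYNC, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* KEEP_SIZE: reserve blocks, set the prealloc flag, leave i_size */
	if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20) < 0) {
		perror("fallocate");
		return 1;
	}
	return close(fd);
}
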
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index fdc64220fcb0..74efe5b760dc 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -488,6 +488,7 @@ xfs_growfs_data_private(
 	xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, nfree);
 	if (dpct)
 		xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct);
+	xfs_trans_set_sync(tp);
 	error = xfs_trans_commit(tp, 0);
 	if (error)
 		return error;
@@ -541,7 +542,7 @@ xfs_growfs_data_private(
 			saved_error = error;
 			continue;
 		}
-		xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, XFS_SB_ALL_BITS);
+		xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb);

 		error = xfs_bwrite(bp);
 		xfs_buf_relse(bp);
@@ -601,6 +602,12 @@ xfs_growfs_data(
 	if (!mutex_trylock(&mp->m_growlock))
 		return -EWOULDBLOCK;
 	error = xfs_growfs_data_private(mp, in);
+	/*
+	 * Increment the generation unconditionally, the error could be from
+	 * updating the secondary superblocks, in which case the new size
+	 * is live already.
+	 */
+	mp->m_generation++;
 	mutex_unlock(&mp->m_growlock);
 	return error;
 }
@@ -756,37 +763,6 @@ out:
 	return 0;
 }

-/*
- * Dump a transaction into the log that contains no real change. This is needed
- * to be able to make the log dirty or stamp the current tail LSN into the log
- * during the covering operation.
- *
- * We cannot use an inode here for this - that will push dirty state back up
- * into the VFS and then periodic inode flushing will prevent log covering from
- * making progress. Hence we log a field in the superblock instead and use a
- * synchronous transaction to ensure the superblock is immediately unpinned
- * and can be written back.
- */
-int
-xfs_fs_log_dummy(
-	xfs_mount_t	*mp)
-{
-	xfs_trans_t	*tp;
-	int		error;
-
-	tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP);
-	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0);
-	if (error) {
-		xfs_trans_cancel(tp, 0);
-		return error;
-	}
-
-	/* log the UUID because it is an unchanging field */
-	xfs_mod_sb(tp, XFS_SB_UUID);
-	xfs_trans_set_sync(tp);
-	return xfs_trans_commit(tp, 0);
-}
-
 int
 xfs_fs_goingdown(
 	xfs_mount_t	*mp,
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 41f804e740d7..daafa1f6d260 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1995,6 +1995,7 @@ xfs_iunlink(
 	agi->agi_unlinked[bucket_index] = cpu_to_be32(agino);
 	offset = offsetof(xfs_agi_t, agi_unlinked) +
 		 (sizeof(xfs_agino_t) * bucket_index);
+	xfs_trans_buf_set_type(tp, agibp, XFS_BLFT_AGI_BUF);
 	xfs_trans_log_buf(tp, agibp, offset,
 			  (offset + sizeof(xfs_agino_t) - 1));
 	return 0;
@@ -2086,6 +2087,7 @@ xfs_iunlink_remove(
 		agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino);
 		offset = offsetof(xfs_agi_t, agi_unlinked) +
 			 (sizeof(xfs_agino_t) * bucket_index);
+		xfs_trans_buf_set_type(tp, agibp, XFS_BLFT_AGI_BUF);
 		xfs_trans_log_buf(tp, agibp, offset,
 				  (offset + sizeof(xfs_agino_t) - 1));
 	} else {
@@ -2656,6 +2658,124 @@ xfs_sort_for_rename(
 }

 /*
+ * xfs_cross_rename()
+ *
+ * responsible for handling the RENAME_EXCHANGE flag in the renameat2()
+ * system call
+ */
+STATIC int
+xfs_cross_rename(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*dp1,
+	struct xfs_name		*name1,
+	struct xfs_inode	*ip1,
+	struct xfs_inode	*dp2,
+	struct xfs_name		*name2,
+	struct xfs_inode	*ip2,
+	struct xfs_bmap_free	*free_list,
+	xfs_fsblock_t		*first_block,
+	int			spaceres)
+{
+	int		error = 0;
+	int		ip1_flags = 0;
+	int		ip2_flags = 0;
+	int		dp2_flags = 0;
+
+	/* Swap inode number for dirent in first parent */
+	error = xfs_dir_replace(tp, dp1, name1,
+				ip2->i_ino,
+				first_block, free_list, spaceres);
+	if (error)
+		goto out;
+
+	/* Swap inode number for dirent in second parent */
+	error = xfs_dir_replace(tp, dp2, name2,
+				ip1->i_ino,
+				first_block, free_list, spaceres);
+	if (error)
+		goto out;
+
+	/*
+	 * If we're renaming one or more directories across different parents,
+	 * update the respective ".." entries (and link counts) to match the new
+	 * parents.
+	 */
+	if (dp1 != dp2) {
+		dp2_flags = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
+
+		if (S_ISDIR(ip2->i_d.di_mode)) {
+			error = xfs_dir_replace(tp, ip2, &xfs_name_dotdot,
+						dp1->i_ino, first_block,
+						free_list, spaceres);
+			if (error)
+				goto out;
+
+			/* transfer ip2 ".." reference to dp1 */
+			if (!S_ISDIR(ip1->i_d.di_mode)) {
+				error = xfs_droplink(tp, dp2);
+				if (error)
+					goto out;
+				error = xfs_bumplink(tp, dp1);
+				if (error)
+					goto out;
+			}
+
+			/*
+			 * Although ip1 isn't changed here, userspace needs
+			 * to be warned about the change, so that applications
+			 * relying on it (like backup ones) will be properly
+			 * notified of the change.
+			 */
+			ip1_flags |= XFS_ICHGTIME_CHG;
+			ip2_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
+		}
+
+		if (S_ISDIR(ip1->i_d.di_mode)) {
+			error = xfs_dir_replace(tp, ip1, &xfs_name_dotdot,
+						dp2->i_ino, first_block,
+						free_list, spaceres);
+			if (error)
+				goto out;
+
+			/* transfer ip1 ".." reference to dp2 */
+			if (!S_ISDIR(ip2->i_d.di_mode)) {
+				error = xfs_droplink(tp, dp1);
+				if (error)
+					goto out;
+				error = xfs_bumplink(tp, dp2);
+				if (error)
+					goto out;
+			}
+
+			/*
+			 * Although ip2 isn't changed here, userspace needs
+			 * to be warned about the change, so that applications
+			 * relying on it (like backup ones) will be properly
+			 * notified of the change.
+			 */
+			ip1_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
+			ip2_flags |= XFS_ICHGTIME_CHG;
+		}
+	}
+
+	if (ip1_flags) {
+		xfs_trans_ichgtime(tp, ip1, ip1_flags);
+		xfs_trans_log_inode(tp, ip1, XFS_ILOG_CORE);
+	}
+	if (ip2_flags) {
+		xfs_trans_ichgtime(tp, ip2, ip2_flags);
+		xfs_trans_log_inode(tp, ip2, XFS_ILOG_CORE);
+	}
+	if (dp2_flags) {
+		xfs_trans_ichgtime(tp, dp2, dp2_flags);
+		xfs_trans_log_inode(tp, dp2, XFS_ILOG_CORE);
+	}
+	xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE);
+out:
+	return error;
+}
+
+/*
  * xfs_rename
  */
 int
@@ -2665,7 +2785,8 @@ xfs_rename(
 	xfs_inode_t	*src_ip,
 	xfs_inode_t	*target_dp,
 	struct xfs_name	*target_name,
-	xfs_inode_t	*target_ip)
+	xfs_inode_t	*target_ip,
+	unsigned int	flags)
 {
 	xfs_trans_t	*tp = NULL;
 	xfs_mount_t	*mp = src_dp->i_mount;
@@ -2743,6 +2864,18 @@ xfs_rename(
 	}

 	/*
+	 * Handle RENAME_EXCHANGE flags
+	 */
+	if (flags & RENAME_EXCHANGE) {
+		error = xfs_cross_rename(tp, src_dp, src_name, src_ip,
+					 target_dp, target_name, target_ip,
+					 &free_list, &first_block, spaceres);
+		if (error)
+			goto abort_return;
+		goto finish_rename;
+	}
+
+	/*
 	 * Set up the target.
 	 */
 	if (target_ip == NULL) {
@@ -2881,6 +3014,7 @@ xfs_rename(
 	if (new_parent)
 		xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);

+finish_rename:
 	/*
 	 * If this is a synchronous mount, make sure that the
 	 * rename transaction goes to disk before returning to
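
xfs_cross_rename() is reached from the renameat2() system call with RENAME_EXCHANGE set. A minimal userspace sketch of triggering it (the paths are hypothetical, and glibc of this era has no renameat2 wrapper, hence the raw syscall):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef RENAME_EXCHANGE
#define RENAME_EXCHANGE	(1 << 1)	/* from linux/fs.h */
#endif

int main(void)
{
	/* both names must already exist; the swap is atomic */
	if (syscall(SYS_renameat2, AT_FDCWD, "/mnt/xfs/a",
		    AT_FDCWD, "/mnt/xfs/b", RENAME_EXCHANGE) < 0) {
		perror("renameat2");
		return 1;
	}
	return 0;
}
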
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 4ed2ba9342dc..86cd6b39bed7 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -338,7 +338,7 @@ int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
 int		xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name,
 			   struct xfs_inode *src_ip, struct xfs_inode *target_dp,
 			   struct xfs_name *target_name,
-			   struct xfs_inode *target_ip);
+			   struct xfs_inode *target_ip, unsigned int flags);

 void		xfs_ilock(xfs_inode_t *, uint);
 int		xfs_ilock_nowait(xfs_inode_t *, uint);
@@ -377,6 +377,15 @@ int xfs_droplink(struct xfs_trans *, struct xfs_inode *);
 int xfs_bumplink(struct xfs_trans *, struct xfs_inode *);

 /* from xfs_file.c */
+enum xfs_prealloc_flags {
+	XFS_PREALLOC_SET	= (1 << 1),
+	XFS_PREALLOC_CLEAR	= (1 << 2),
+	XFS_PREALLOC_SYNC	= (1 << 3),
+	XFS_PREALLOC_INVISIBLE	= (1 << 4),
+};
+
+int	xfs_update_prealloc_flags(struct xfs_inode *,
+		enum xfs_prealloc_flags);
 int	xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
 int	xfs_iozero(struct xfs_inode *, loff_t, size_t);

diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index a1831980a68e..bf70a2affb05 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -39,6 +39,7 @@
 #include "xfs_icache.h"
 #include "xfs_symlink.h"
 #include "xfs_trans.h"
+#include "xfs_pnfs.h"

 #include <linux/capability.h>
 #include <linux/dcache.h>
@@ -606,11 +607,9 @@ xfs_ioc_space(
 	unsigned int		cmd,
 	xfs_flock64_t		*bf)
 {
-	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_trans	*tp;
 	struct iattr		iattr;
-	bool			setprealloc = false;
-	bool			clrprealloc = false;
+	enum xfs_prealloc_flags	flags = 0;
+	uint			iolock = XFS_IOLOCK_EXCL;
 	int			error;

 	/*
@@ -630,11 +629,19 @@ xfs_ioc_space(
 	if (!S_ISREG(inode->i_mode))
 		return -EINVAL;

+	if (filp->f_flags & O_DSYNC)
+		flags |= XFS_PREALLOC_SYNC;
+	if (ioflags & XFS_IO_INVIS)
+		flags |= XFS_PREALLOC_INVISIBLE;
+
 	error = mnt_want_write_file(filp);
 	if (error)
 		return error;

-	xfs_ilock(ip, XFS_IOLOCK_EXCL);
+	xfs_ilock(ip, iolock);
+	error = xfs_break_layouts(inode, &iolock);
+	if (error)
+		goto out_unlock;

 	switch (bf->l_whence) {
 	case 0: /*SEEK_SET*/
@@ -673,25 +680,23 @@ xfs_ioc_space(
 	}

 	if (bf->l_start < 0 ||
-	    bf->l_start > mp->m_super->s_maxbytes ||
+	    bf->l_start > inode->i_sb->s_maxbytes ||
 	    bf->l_start + bf->l_len < 0 ||
-	    bf->l_start + bf->l_len >= mp->m_super->s_maxbytes) {
+	    bf->l_start + bf->l_len >= inode->i_sb->s_maxbytes) {
 		error = -EINVAL;
 		goto out_unlock;
 	}

 	switch (cmd) {
 	case XFS_IOC_ZERO_RANGE:
+		flags |= XFS_PREALLOC_SET;
 		error = xfs_zero_file_space(ip, bf->l_start, bf->l_len);
-		if (!error)
-			setprealloc = true;
 		break;
 	case XFS_IOC_RESVSP:
 	case XFS_IOC_RESVSP64:
+		flags |= XFS_PREALLOC_SET;
 		error = xfs_alloc_file_space(ip, bf->l_start, bf->l_len,
 						XFS_BMAPI_PREALLOC);
-		if (!error)
-			setprealloc = true;
 		break;
 	case XFS_IOC_UNRESVSP:
 	case XFS_IOC_UNRESVSP64:
@@ -701,6 +706,7 @@ xfs_ioc_space(
 	case XFS_IOC_ALLOCSP64:
 	case XFS_IOC_FREESP:
 	case XFS_IOC_FREESP64:
+		flags |= XFS_PREALLOC_CLEAR;
 		if (bf->l_start > XFS_ISIZE(ip)) {
 			error = xfs_alloc_file_space(ip, XFS_ISIZE(ip),
 					bf->l_start - XFS_ISIZE(ip), 0);
@@ -712,8 +718,6 @@ xfs_ioc_space(
 		iattr.ia_size = bf->l_start;

 		error = xfs_setattr_size(ip, &iattr);
-		if (!error)
-			clrprealloc = true;
 		break;
 	default:
 		ASSERT(0);
@@ -723,35 +727,10 @@ xfs_ioc_space(
 	if (error)
 		goto out_unlock;

-	tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
-	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_writeid, 0, 0);
-	if (error) {
-		xfs_trans_cancel(tp, 0);
-		goto out_unlock;
-	}
-
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-
-	if (!(ioflags & XFS_IO_INVIS)) {
-		ip->i_d.di_mode &= ~S_ISUID;
-		if (ip->i_d.di_mode & S_IXGRP)
-			ip->i_d.di_mode &= ~S_ISGID;
-		xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-	}
-
-	if (setprealloc)
-		ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
-	else if (clrprealloc)
-		ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
-
-	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-	if (filp->f_flags & O_DSYNC)
-		xfs_trans_set_sync(tp);
-	error = xfs_trans_commit(tp, 0);
+	error = xfs_update_prealloc_flags(ip, flags);

 out_unlock:
-	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+	xfs_iunlock(ip, iolock);
 	mnt_drop_write_file(filp);
 	return error;
 }
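
xfs_ioc_space() services the preallocation ioctls named in the switch above. A sketch of the caller side of XFS_IOC_RESVSP64, assuming the xfsprogs headers are installed and a hypothetical file (the reservation leaves i_size unchanged, so XFS_PREALLOC_SET is taken):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <xfs/xfs.h>

int main(void)
{
	struct xfs_flock64	fl;
	int			fd = open("/mnt/xfs/resv.dat",
					  O_CREAT | O_RDWR, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	memset(&fl, 0, sizeof(fl));
	fl.l_whence = 0;	/* SEEK_SET: l_start is absolute */
	fl.l_start = 0;
	fl.l_len = 16 << 20;	/* reserve 16 MiB */
	if (ioctl(fd, XFS_IOC_RESVSP64, &fl) < 0) {
		perror("XFS_IOC_RESVSP64");
		return 1;
	}
	return close(fd);
}
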
@@ -1013,20 +992,182 @@ xfs_diflags_to_linux(
 	inode->i_flags &= ~S_NOATIME;
 }

-#define FSX_PROJID	1
-#define FSX_EXTSIZE	2
-#define FSX_XFLAGS	4
-#define FSX_NONBLOCK	8
+static int
+xfs_ioctl_setattr_xflags(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	struct fsxattr		*fa)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+
+	/* Can't change realtime flag if any extents are allocated. */
+	if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
+	    XFS_IS_REALTIME_INODE(ip) != (fa->fsx_xflags & XFS_XFLAG_REALTIME))
+		return -EINVAL;
+
+	/* If realtime flag is set then must have realtime device */
+	if (fa->fsx_xflags & XFS_XFLAG_REALTIME) {
+		if (mp->m_sb.sb_rblocks == 0 || mp->m_sb.sb_rextsize == 0 ||
+		    (ip->i_d.di_extsize % mp->m_sb.sb_rextsize))
+			return -EINVAL;
+	}
+
+	/*
+	 * Can't modify an immutable/append-only file unless
+	 * we have appropriate permission.
+	 */
+	if (((ip->i_d.di_flags & (XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND)) ||
+	     (fa->fsx_xflags & (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) &&
+	    !capable(CAP_LINUX_IMMUTABLE))
+		return -EPERM;
+
+	xfs_set_diflags(ip, fa->fsx_xflags);
+	xfs_diflags_to_linux(ip);
+	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	XFS_STATS_INC(xs_ig_attrchg);
+	return 0;
+}
+
+/*
+ * Set up the transaction structure for the setattr operation, checking that we
+ * have permission to do so. On success, return a clean transaction and the
+ * inode locked exclusively ready for further operation specific checks. On
+ * failure, return an error without modifying or locking the inode.
+ */
+static struct xfs_trans *
+xfs_ioctl_setattr_get_trans(
+	struct xfs_inode	*ip)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_trans	*tp;
+	int			error;
+
+	if (mp->m_flags & XFS_MOUNT_RDONLY)
+		return ERR_PTR(-EROFS);
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return ERR_PTR(-EIO);
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
+	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
+	if (error)
+		goto out_cancel;
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+	/*
+	 * CAP_FOWNER overrides the following restrictions:
+	 *
+	 * The user ID of the calling process must be equal to the file owner
+	 * ID, except in cases where the CAP_FSETID capability is applicable.
+	 */
+	if (!inode_owner_or_capable(VFS_I(ip))) {
+		error = -EPERM;
+		goto out_cancel;
+	}
+
+	if (mp->m_flags & XFS_MOUNT_WSYNC)
+		xfs_trans_set_sync(tp);
+
+	return tp;
+
+out_cancel:
+	xfs_trans_cancel(tp, 0);
+	return ERR_PTR(error);
+}
+
+/*
+ * extent size hint validation is somewhat cumbersome. Rules are:
+ *
+ * 1. extent size hint is only valid for directories and regular files
+ * 2. XFS_XFLAG_EXTSIZE is only valid for regular files
+ * 3. XFS_XFLAG_EXTSZINHERIT is only valid for directories.
+ * 4. can only be changed on regular files if no extents are allocated
+ * 5. can be changed on directories at any time
+ * 6. extsize hint of 0 turns off hints, clears inode flags.
+ * 7. Extent size must be a multiple of the appropriate block size.
+ * 8. for non-realtime files, the extent size hint must be limited
+ *    to half the AG size to avoid alignment extending the extent beyond the
+ *    limits of the AG.
+ */
+static int
+xfs_ioctl_setattr_check_extsize(
+	struct xfs_inode	*ip,
+	struct fsxattr		*fa)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+
+	if ((fa->fsx_xflags & XFS_XFLAG_EXTSIZE) && !S_ISREG(ip->i_d.di_mode))
+		return -EINVAL;
+
+	if ((fa->fsx_xflags & XFS_XFLAG_EXTSZINHERIT) &&
+	    !S_ISDIR(ip->i_d.di_mode))
+		return -EINVAL;
+
+	if (S_ISREG(ip->i_d.di_mode) && ip->i_d.di_nextents &&
+	    ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) != fa->fsx_extsize))
+		return -EINVAL;
+
+	if (fa->fsx_extsize != 0) {
+		xfs_extlen_t	size;
+		xfs_fsblock_t	extsize_fsb;
+
+		extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize);
+		if (extsize_fsb > MAXEXTLEN)
+			return -EINVAL;
+
+		if (XFS_IS_REALTIME_INODE(ip) ||
+		    (fa->fsx_xflags & XFS_XFLAG_REALTIME)) {
+			size = mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog;
+		} else {
+			size = mp->m_sb.sb_blocksize;
+			if (extsize_fsb > mp->m_sb.sb_agblocks / 2)
+				return -EINVAL;
+		}
+
+		if (fa->fsx_extsize % size)
+			return -EINVAL;
+	} else
+		fa->fsx_xflags &= ~(XFS_XFLAG_EXTSIZE | XFS_XFLAG_EXTSZINHERIT);
+
+	return 0;
+}
+
+static int
+xfs_ioctl_setattr_check_projid(
+	struct xfs_inode	*ip,
+	struct fsxattr		*fa)
+{
+	/* Disallow 32bit project ids if projid32bit feature is not enabled. */
+	if (fa->fsx_projid > (__uint16_t)-1 &&
+	    !xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb))
+		return -EINVAL;
+
+	/*
+	 * Project Quota ID state is only allowed to change from within the init
+	 * namespace. Enforce that restriction only if we are trying to change
+	 * the quota ID state. Everything else is allowed in user namespaces.
+	 */
+	if (current_user_ns() == &init_user_ns)
+		return 0;
+
+	if (xfs_get_projid(ip) != fa->fsx_projid)
+		return -EINVAL;
+	if ((fa->fsx_xflags & XFS_XFLAG_PROJINHERIT) !=
+	    (ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT))
+		return -EINVAL;
+
+	return 0;
+}

 STATIC int
 xfs_ioctl_setattr(
 	xfs_inode_t		*ip,
-	struct fsxattr		*fa,
-	int			mask)
+	struct fsxattr		*fa)
 {
 	struct xfs_mount	*mp = ip->i_mount;
 	struct xfs_trans	*tp;
-	unsigned int		lock_flags = 0;
 	struct xfs_dquot	*udqp = NULL;
 	struct xfs_dquot	*pdqp = NULL;
 	struct xfs_dquot	*olddquot = NULL;
@@ -1034,17 +1175,9 @@ xfs_ioctl_setattr(

 	trace_xfs_ioctl_setattr(ip);

-	if (mp->m_flags & XFS_MOUNT_RDONLY)
-		return -EROFS;
-	if (XFS_FORCED_SHUTDOWN(mp))
-		return -EIO;
-
-	/*
-	 * Disallow 32bit project ids when projid32bit feature is not enabled.
-	 */
-	if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1) &&
-	    !xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb))
-		return -EINVAL;
+	code = xfs_ioctl_setattr_check_projid(ip, fa);
+	if (code)
+		return code;

 	/*
 	 * If disk quotas is on, we make sure that the dquots do exist on disk,
@@ -1054,7 +1187,7 @@ xfs_ioctl_setattr(
 	 * If the IDs do change before we take the ilock, we're covered
 	 * because the i_*dquot fields will get updated anyway.
 	 */
-	if (XFS_IS_QUOTA_ON(mp) && (mask & FSX_PROJID)) {
+	if (XFS_IS_QUOTA_ON(mp)) {
 		code = xfs_qm_vop_dqalloc(ip, ip->i_d.di_uid,
 					 ip->i_d.di_gid, fa->fsx_projid,
 					 XFS_QMOPT_PQUOTA, &udqp, NULL, &pdqp);
@@ -1062,175 +1195,49 @@ xfs_ioctl_setattr(
 			return code;
 	}

-	/*
-	 * For the other attributes, we acquire the inode lock and
-	 * first do an error checking pass.
-	 */
-	tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
-	code = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
-	if (code)
-		goto error_return;
-
-	lock_flags = XFS_ILOCK_EXCL;
-	xfs_ilock(ip, lock_flags);
-
-	/*
-	 * CAP_FOWNER overrides the following restrictions:
-	 *
-	 * The user ID of the calling process must be equal
-	 * to the file owner ID, except in cases where the
-	 * CAP_FSETID capability is applicable.
-	 */
-	if (!inode_owner_or_capable(VFS_I(ip))) {
-		code = -EPERM;
-		goto error_return;
-	}
-
-	/*
-	 * Do a quota reservation only if projid is actually going to change.
-	 * Only allow changing of projid from init_user_ns since it is a
-	 * non user namespace aware identifier.
-	 */
-	if (mask & FSX_PROJID) {
-		if (current_user_ns() != &init_user_ns) {
-			code = -EINVAL;
-			goto error_return;
-		}
-
-		if (XFS_IS_QUOTA_RUNNING(mp) &&
-		    XFS_IS_PQUOTA_ON(mp) &&
-		    xfs_get_projid(ip) != fa->fsx_projid) {
-			ASSERT(tp);
-			code = xfs_qm_vop_chown_reserve(tp, ip, udqp, NULL,
-						pdqp, capable(CAP_FOWNER) ?
-						XFS_QMOPT_FORCE_RES : 0);
-			if (code)	/* out of quota */
-				goto error_return;
-		}
+	tp = xfs_ioctl_setattr_get_trans(ip);
+	if (IS_ERR(tp)) {
+		code = PTR_ERR(tp);
+		goto error_free_dquots;
 	}

-	if (mask & FSX_EXTSIZE) {
-		/*
-		 * Can't change extent size if any extents are allocated.
-		 */
-		if (ip->i_d.di_nextents &&
-		    ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) !=
-		     fa->fsx_extsize)) {
-			code = -EINVAL;	/* EFBIG? */
-			goto error_return;
-		}
-
-		/*
-		 * Extent size must be a multiple of the appropriate block
-		 * size, if set at all. It must also be smaller than the
-		 * maximum extent size supported by the filesystem.
-		 *
-		 * Also, for non-realtime files, limit the extent size hint to
-		 * half the size of the AGs in the filesystem so alignment
-		 * doesn't result in extents larger than an AG.
-		 */
-		if (fa->fsx_extsize != 0) {
-			xfs_extlen_t	size;
-			xfs_fsblock_t	extsize_fsb;
-
-			extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize);
-			if (extsize_fsb > MAXEXTLEN) {
-				code = -EINVAL;
-				goto error_return;
-			}
-
-			if (XFS_IS_REALTIME_INODE(ip) ||
-			    ((mask & FSX_XFLAGS) &&
-			    (fa->fsx_xflags & XFS_XFLAG_REALTIME))) {
-				size = mp->m_sb.sb_rextsize <<
-				       mp->m_sb.sb_blocklog;
-			} else {
-				size = mp->m_sb.sb_blocksize;
-				if (extsize_fsb > mp->m_sb.sb_agblocks / 2) {
-					code = -EINVAL;
-					goto error_return;
-				}
-			}
-
-			if (fa->fsx_extsize % size) {
-				code = -EINVAL;
-				goto error_return;
-			}
-		}
+	if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp) &&
+	    xfs_get_projid(ip) != fa->fsx_projid) {
+		code = xfs_qm_vop_chown_reserve(tp, ip, udqp, NULL, pdqp,
+				capable(CAP_FOWNER) ? XFS_QMOPT_FORCE_RES : 0);
+		if (code)	/* out of quota */
+			goto error_trans_cancel;
 	}

+	code = xfs_ioctl_setattr_check_extsize(ip, fa);
+	if (code)
+		goto error_trans_cancel;

-	if (mask & FSX_XFLAGS) {
-		/*
-		 * Can't change realtime flag if any extents are allocated.
-		 */
-		if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
-		    (XFS_IS_REALTIME_INODE(ip)) !=
-		    (fa->fsx_xflags & XFS_XFLAG_REALTIME)) {
-			code = -EINVAL;	/* EFBIG? */
-			goto error_return;
-		}
-
-		/*
-		 * If realtime flag is set then must have realtime data.
-		 */
-		if ((fa->fsx_xflags & XFS_XFLAG_REALTIME)) {
-			if ((mp->m_sb.sb_rblocks == 0) ||
-			    (mp->m_sb.sb_rextsize == 0) ||
-			    (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) {
-				code = -EINVAL;
-				goto error_return;
-			}
-		}
-
-		/*
-		 * Can't modify an immutable/append-only file unless
-		 * we have appropriate permission.
-		 */
-		if ((ip->i_d.di_flags &
-				(XFS_DIFLAG_IMMUTABLE|XFS_DIFLAG_APPEND) ||
-		     (fa->fsx_xflags &
-				(XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) &&
-		    !capable(CAP_LINUX_IMMUTABLE)) {
-			code = -EPERM;
-			goto error_return;
-		}
-	}
-
-	xfs_trans_ijoin(tp, ip, 0);
+	code = xfs_ioctl_setattr_xflags(tp, ip, fa);
+	if (code)
+		goto error_trans_cancel;

 	/*
-	 * Change file ownership.  Must be the owner or privileged.
+	 * Change file ownership.  Must be the owner or privileged.  CAP_FSETID
+	 * overrides the following restrictions:
+	 *
+	 * The set-user-ID and set-group-ID bits of a file will be cleared upon
+	 * successful return from chown()
 	 */
-	if (mask & FSX_PROJID) {
-		/*
-		 * CAP_FSETID overrides the following restrictions:
-		 *
-		 * The set-user-ID and set-group-ID bits of a file will be
-		 * cleared upon successful return from chown()
-		 */
-		if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
-		    !capable_wrt_inode_uidgid(VFS_I(ip), CAP_FSETID))
-			ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
-
-		/*
-		 * Change the ownerships and register quota modifications
-		 * in the transaction.
-		 */
-		if (xfs_get_projid(ip) != fa->fsx_projid) {
-			if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) {
-				olddquot = xfs_qm_vop_chown(tp, ip,
-							&ip->i_pdquot, pdqp);
-			}
-			ASSERT(ip->i_d.di_version > 1);
-			xfs_set_projid(ip, fa->fsx_projid);
-		}
-	}

-	if (mask & FSX_XFLAGS) {
-		xfs_set_diflags(ip, fa->fsx_xflags);
-		xfs_diflags_to_linux(ip);
+	if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
+	    !capable_wrt_inode_uidgid(VFS_I(ip), CAP_FSETID))
+		ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
+
+	/* Change the ownerships and register project quota modifications */
+	if (xfs_get_projid(ip) != fa->fsx_projid) {
+		if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) {
+			olddquot = xfs_qm_vop_chown(tp, ip,
+						&ip->i_pdquot, pdqp);
+		}
+		ASSERT(ip->i_d.di_version > 1);
+		xfs_set_projid(ip, fa->fsx_projid);
 	}

 	/*
@@ -1238,34 +1245,12 @@ xfs_ioctl_setattr(
 	 * extent size hint should be set on the inode. If no extent size flags
 	 * are set on the inode then unconditionally clear the extent size hint.
 	 */
-	if (mask & FSX_EXTSIZE) {
-		int	extsize = 0;
-
-		if (ip->i_d.di_flags &
-				(XFS_DIFLAG_EXTSIZE | XFS_DIFLAG_EXTSZINHERIT))
-			extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog;
-		ip->i_d.di_extsize = extsize;
-	}
-
-	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
-	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-
-	XFS_STATS_INC(xs_ig_attrchg);
+	if (ip->i_d.di_flags & (XFS_DIFLAG_EXTSIZE | XFS_DIFLAG_EXTSZINHERIT))
+		ip->i_d.di_extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog;
+	else
+		ip->i_d.di_extsize = 0;

-	/*
-	 * If this is a synchronous mount, make sure that the
-	 * transaction goes to disk before returning to the user.
-	 * This is slightly sub-optimal in that truncates require
-	 * two sync transactions instead of one for wsync filesystems.
-	 * One for the truncate and one for the timestamps since we
-	 * don't want to change the timestamps unless we're sure the
-	 * truncate worked. Truncates are less than 1% of the laddis
-	 * mix so this probably isn't worth the trouble to optimize.
-	 */
-	if (mp->m_flags & XFS_MOUNT_WSYNC)
-		xfs_trans_set_sync(tp);
 	code = xfs_trans_commit(tp, 0);
-	xfs_iunlock(ip, lock_flags);

 	/*
 	 * Release any dquot(s) the inode had kept before chown.
@@ -1276,12 +1261,11 @@ xfs_ioctl_setattr(

 	return code;

-error_return:
+error_trans_cancel:
+	xfs_trans_cancel(tp, 0);
+error_free_dquots:
 	xfs_qm_dqrele(udqp);
 	xfs_qm_dqrele(pdqp);
-	xfs_trans_cancel(tp, 0);
-	if (lock_flags)
-		xfs_iunlock(ip, lock_flags);
 	return code;
 }

@@ -1292,20 +1276,15 @@ xfs_ioc_fssetxattr(
 	void			__user *arg)
 {
 	struct fsxattr		fa;
-	unsigned int		mask;
 	int			error;

 	if (copy_from_user(&fa, arg, sizeof(fa)))
 		return -EFAULT;

-	mask = FSX_XFLAGS | FSX_EXTSIZE | FSX_PROJID;
-	if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
-		mask |= FSX_NONBLOCK;
-
 	error = mnt_want_write_file(filp);
 	if (error)
 		return error;
-	error = xfs_ioctl_setattr(ip, &fa, mask);
+	error = xfs_ioctl_setattr(ip, &fa);
 	mnt_drop_write_file(filp);
 	return error;
 }
@@ -1325,14 +1304,14 @@ xfs_ioc_getxflags(

 STATIC int
 xfs_ioc_setxflags(
-	xfs_inode_t		*ip,
+	struct xfs_inode	*ip,
 	struct file		*filp,
 	void			__user *arg)
 {
+	struct xfs_trans	*tp;
 	struct fsxattr		fa;
 	unsigned int		flags;
-	unsigned int		mask;
 	int			error;

 	if (copy_from_user(&flags, arg, sizeof(flags)))
 		return -EFAULT;
@@ -1342,15 +1321,26 @@ xfs_ioc_setxflags(
 			  FS_SYNC_FL))
 		return -EOPNOTSUPP;

-	mask = FSX_XFLAGS;
-	if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
-		mask |= FSX_NONBLOCK;
 	fa.fsx_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip));

 	error = mnt_want_write_file(filp);
 	if (error)
 		return error;
-	error = xfs_ioctl_setattr(ip, &fa, mask);
+
+	tp = xfs_ioctl_setattr_get_trans(ip);
+	if (IS_ERR(tp)) {
+		error = PTR_ERR(tp);
+		goto out_drop_write;
+	}
+
+	error = xfs_ioctl_setattr_xflags(tp, ip, &fa);
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		goto out_drop_write;
+	}
+
+	error = xfs_trans_commit(tp, 0);
+out_drop_write:
 	mnt_drop_write_file(filp);
 	return error;
 }
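
The extent size hint rules enforced by xfs_ioctl_setattr_check_extsize() are reachable from userspace through the FSGETXATTR/FSSETXATTR ioctls. A sketch of setting a 1 MiB hint on an empty regular file (the path is hypothetical), which must satisfy rules 2, 4 and 7 from the comment above:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <xfs/xfs.h>

int main(void)
{
	struct fsxattr	fa;
	int		fd = open("/mnt/xfs/hinted.dat",
				  O_CREAT | O_RDWR, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (ioctl(fd, XFS_IOC_FSGETXATTR, &fa) < 0) {
		perror("FSGETXATTR");
		return 1;
	}
	fa.fsx_xflags |= XFS_XFLAG_EXTSIZE;	/* rule 2: regular file */
	fa.fsx_extsize = 1 << 20;		/* rule 7: blocksize multiple */
	if (ioctl(fd, XFS_IOC_FSSETXATTR, &fa) < 0) {
		perror("FSSETXATTR");
		return 1;
	}
	return close(fd);
}
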
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index ec6772866f3d..bfc7c7c8a0c8 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -423,7 +423,7 @@ xfs_compat_attrmulti_by_handle(

 	ops = memdup_user(compat_ptr(am_hreq.ops), size);
 	if (IS_ERR(ops)) {
-		error = -PTR_ERR(ops);
+		error = PTR_ERR(ops);
 		goto out_dput;
 	}

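
The one-character fix above matters because PTR_ERR() already yields a negative errno, so the extra negation turned, for example, -ENOMEM into a positive value and the failure went unreported. A condensed sketch of the intended idiom (demo_copy is illustrative, not kernel code):

#include <linux/err.h>
#include <linux/slab.h>
#include <linux/string.h>

static int demo_copy(const void __user *uptr, size_t size)
{
	void *buf = memdup_user(uptr, size);

	if (IS_ERR(buf))
		return PTR_ERR(buf);	/* already negative: -EFAULT/-ENOMEM */
	/* ... consume buf ... */
	kfree(buf);
	return 0;
}
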
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index c980e2a5086b..ccb1dd0d509e 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -802,7 +802,7 @@ int
 xfs_iomap_write_unwritten(
 	xfs_inode_t	*ip,
 	xfs_off_t	offset,
-	size_t		count)
+	xfs_off_t	count)
 {
 	xfs_mount_t	*mp = ip->i_mount;
 	xfs_fileoff_t	offset_fsb;
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 411fbb8919ef..8688e663d744 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -27,6 +27,6 @@ int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t,
 			struct xfs_bmbt_irec *);
 int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t,
 			struct xfs_bmbt_irec *);
-int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t);
+int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t);

 #endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index c50311cae1b1..d919ad7b16bf 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -37,6 +37,7 @@
 #include "xfs_da_btree.h"
 #include "xfs_dir2.h"
 #include "xfs_trans_space.h"
+#include "xfs_pnfs.h"

 #include <linux/capability.h>
 #include <linux/xattr.h>
@@ -380,18 +381,27 @@ xfs_vn_rename(
 	struct inode	*odir,
 	struct dentry	*odentry,
 	struct inode	*ndir,
-	struct dentry	*ndentry)
+	struct dentry	*ndentry,
+	unsigned int	flags)
 {
 	struct inode	*new_inode = ndentry->d_inode;
+	int		omode = 0;
 	struct xfs_name	oname;
 	struct xfs_name	nname;

-	xfs_dentry_to_name(&oname, odentry, 0);
+	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
+		return -EINVAL;
+
+	/* if we are exchanging files, we need to set i_mode of both files */
+	if (flags & RENAME_EXCHANGE)
+		omode = ndentry->d_inode->i_mode;
+
+	xfs_dentry_to_name(&oname, odentry, omode);
 	xfs_dentry_to_name(&nname, ndentry, odentry->d_inode->i_mode);

 	return xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode),
-			  XFS_I(ndir), &nname, new_inode ?
-						XFS_I(new_inode) : NULL);
+			  XFS_I(ndir), &nname,
+			  new_inode ? XFS_I(new_inode) : NULL, flags);
 }

 /*
@@ -496,7 +506,7 @@ xfs_setattr_mode(
 	inode->i_mode |= mode & ~S_IFMT;
 }

-static void
+void
 xfs_setattr_time(
 	struct xfs_inode	*ip,
 	struct iattr		*iattr)
@@ -970,9 +980,13 @@ xfs_vn_setattr(
 	int		error;

 	if (iattr->ia_valid & ATTR_SIZE) {
-		xfs_ilock(ip, XFS_IOLOCK_EXCL);
-		error = xfs_setattr_size(ip, iattr);
-		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+		uint	iolock = XFS_IOLOCK_EXCL;
+
+		xfs_ilock(ip, iolock);
+		error = xfs_break_layouts(dentry->d_inode, &iolock);
+		if (!error)
+			error = xfs_setattr_size(ip, iattr);
+		xfs_iunlock(ip, iolock);
 	} else {
 		error = xfs_setattr_nonsize(ip, iattr, 0);
 	}
@@ -1144,7 +1158,7 @@ static const struct inode_operations xfs_dir_inode_operations = {
 	 */
 	.rmdir			= xfs_vn_unlink,
 	.mknod			= xfs_vn_mknod,
-	.rename			= xfs_vn_rename,
+	.rename2		= xfs_vn_rename,
 	.get_acl		= xfs_get_acl,
 	.set_acl		= xfs_set_acl,
 	.getattr		= xfs_vn_getattr,
@@ -1172,7 +1186,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = {
 	 */
 	.rmdir			= xfs_vn_unlink,
 	.mknod			= xfs_vn_mknod,
-	.rename			= xfs_vn_rename,
+	.rename2		= xfs_vn_rename,
 	.get_acl		= xfs_get_acl,
 	.set_acl		= xfs_set_acl,
 	.getattr		= xfs_vn_getattr,
diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h
index 1c34e4335920..ea7a98e9cb70 100644
--- a/fs/xfs/xfs_iops.h
+++ b/fs/xfs/xfs_iops.h
@@ -32,6 +32,7 @@ extern void xfs_setup_inode(struct xfs_inode *);
  */
 #define XFS_ATTR_NOACL		0x01	/* Don't call posix_acl_chmod */

+extern void xfs_setattr_time(struct xfs_inode *ip, struct iattr *iattr);
 extern int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap,
 			       int flags);
 extern int xfs_setattr_size(struct xfs_inode *ip, struct iattr *vap);
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index e408bf5a3ff7..bcc7cfabb787 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -33,6 +33,7 @@
 #include "xfs_fsops.h"
 #include "xfs_cksum.h"
 #include "xfs_sysfs.h"
+#include "xfs_sb.h"

 kmem_zone_t	*xfs_log_ticket_zone;

@@ -1290,9 +1291,20 @@ xfs_log_worker(
 	struct xfs_mount	*mp = log->l_mp;

 	/* dgc: errors ignored - not fatal and nowhere to report them */
-	if (xfs_log_need_covered(mp))
-		xfs_fs_log_dummy(mp);
-	else
+	if (xfs_log_need_covered(mp)) {
+		/*
+		 * Dump a transaction into the log that contains no real change.
+		 * This is needed to stamp the current tail LSN into the log
+		 * during the covering operation.
+		 *
+		 * We cannot use an inode here for this - that will push dirty
+		 * state back up into the VFS and then periodic inode flushing
+		 * will prevent log covering from making progress. Hence we
+		 * synchronously log the superblock instead to ensure the
+		 * superblock is immediately unpinned and can be written back.
+		 */
+		xfs_sync_sb(mp, true);
+	} else
 		xfs_log_force(mp, 0);

 	/* start pushing all the metadata that is currently dirty */
@@ -1395,6 +1407,8 @@ xlog_alloc_log(
 	ASSERT(xfs_buf_islocked(bp));
 	xfs_buf_unlock(bp);

+	/* use high priority wq for log I/O completion */
+	bp->b_ioend_wq = mp->m_log_workqueue;
 	bp->b_iodone = xlog_iodone;
 	log->l_xbuf = bp;

@@ -1427,6 +1441,8 @@ xlog_alloc_log(
1427 ASSERT(xfs_buf_islocked(bp)); 1441 ASSERT(xfs_buf_islocked(bp));
1428 xfs_buf_unlock(bp); 1442 xfs_buf_unlock(bp);
1429 1443
1444 /* use high priority wq for log I/O completion */
1445 bp->b_ioend_wq = mp->m_log_workqueue;
1430 bp->b_iodone = xlog_iodone; 1446 bp->b_iodone = xlog_iodone;
1431 iclog->ic_bp = bp; 1447 iclog->ic_bp = bp;
1432 iclog->ic_data = bp->b_addr; 1448 iclog->ic_data = bp->b_addr;
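
/*
 * The workqueue assigned above is created at mount time; a sketch of
 * the allocation this relies on (exact flags are an assumption, but
 * WQ_HIGHPRI is the point of routing log buffer completion here):
 */
mp->m_log_workqueue = alloc_workqueue("xfs-log/%s",
                WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_HIGHPRI, 0,
                mp->m_fsname);
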
@@ -1806,8 +1822,6 @@ xlog_sync(
1806 XFS_BUF_ZEROFLAGS(bp); 1822 XFS_BUF_ZEROFLAGS(bp);
1807 XFS_BUF_ASYNC(bp); 1823 XFS_BUF_ASYNC(bp);
1808 bp->b_flags |= XBF_SYNCIO; 1824 bp->b_flags |= XBF_SYNCIO;
1809 /* use high priority completion wq */
1810 bp->b_ioend_wq = log->l_mp->m_log_workqueue;
1811 1825
1812 if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) { 1826 if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) {
1813 bp->b_flags |= XBF_FUA; 1827 bp->b_flags |= XBF_FUA;
@@ -1856,8 +1870,6 @@ xlog_sync(
1856 bp->b_flags |= XBF_SYNCIO; 1870 bp->b_flags |= XBF_SYNCIO;
1857 if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) 1871 if (log->l_mp->m_flags & XFS_MOUNT_BARRIER)
1858 bp->b_flags |= XBF_FUA; 1872 bp->b_flags |= XBF_FUA;
1859 /* use high priority completion wq */
1860 bp->b_ioend_wq = log->l_mp->m_log_workqueue;
1861 1873
1862 ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); 1874 ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
1863 ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); 1875 ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
@@ -2027,7 +2039,7 @@ xlog_print_tic_res(
2027 " total reg = %u bytes (o/flow = %u bytes)\n" 2039 " total reg = %u bytes (o/flow = %u bytes)\n"
2028 " ophdrs = %u (ophdr space = %u bytes)\n" 2040 " ophdrs = %u (ophdr space = %u bytes)\n"
2029 " ophdr + reg = %u bytes\n" 2041 " ophdr + reg = %u bytes\n"
2030 " num regions = %u\n", 2042 " num regions = %u",
2031 ((ticket->t_trans_type <= 0 || 2043 ((ticket->t_trans_type <= 0 ||
2032 ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ? 2044 ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ?
2033 "bad-trans-type" : trans_type_str[ticket->t_trans_type-1]), 2045 "bad-trans-type" : trans_type_str[ticket->t_trans_type-1]),
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index d3d38836f87f..4fa80e63eea2 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -408,11 +408,11 @@ xfs_update_alignment(xfs_mount_t *mp)
408 if (xfs_sb_version_hasdalign(sbp)) { 408 if (xfs_sb_version_hasdalign(sbp)) {
409 if (sbp->sb_unit != mp->m_dalign) { 409 if (sbp->sb_unit != mp->m_dalign) {
410 sbp->sb_unit = mp->m_dalign; 410 sbp->sb_unit = mp->m_dalign;
411 mp->m_update_flags |= XFS_SB_UNIT; 411 mp->m_update_sb = true;
412 } 412 }
413 if (sbp->sb_width != mp->m_swidth) { 413 if (sbp->sb_width != mp->m_swidth) {
414 sbp->sb_width = mp->m_swidth; 414 sbp->sb_width = mp->m_swidth;
415 mp->m_update_flags |= XFS_SB_WIDTH; 415 mp->m_update_sb = true;
416 } 416 }
417 } else { 417 } else {
418 xfs_warn(mp, 418 xfs_warn(mp,
@@ -583,38 +583,19 @@ int
583xfs_mount_reset_sbqflags( 583xfs_mount_reset_sbqflags(
584 struct xfs_mount *mp) 584 struct xfs_mount *mp)
585{ 585{
586 int error;
587 struct xfs_trans *tp;
588
589 mp->m_qflags = 0; 586 mp->m_qflags = 0;
590 587
591 /* 588 /* It is OK to look at sb_qflags in the mount path without m_sb_lock. */
592 * It is OK to look at sb_qflags here in mount path,
593 * without m_sb_lock.
594 */
595 if (mp->m_sb.sb_qflags == 0) 589 if (mp->m_sb.sb_qflags == 0)
596 return 0; 590 return 0;
597 spin_lock(&mp->m_sb_lock); 591 spin_lock(&mp->m_sb_lock);
598 mp->m_sb.sb_qflags = 0; 592 mp->m_sb.sb_qflags = 0;
599 spin_unlock(&mp->m_sb_lock); 593 spin_unlock(&mp->m_sb_lock);
600 594
601 /* 595 if (!xfs_fs_writable(mp, SB_FREEZE_WRITE))
602 * If the fs is readonly, let the incore superblock run
603 * with quotas off but don't flush the update out to disk
604 */
605 if (mp->m_flags & XFS_MOUNT_RDONLY)
606 return 0; 596 return 0;
607 597
608 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); 598 return xfs_sync_sb(mp, false);
609 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_sbchange, 0, 0);
610 if (error) {
611 xfs_trans_cancel(tp, 0);
612 xfs_alert(mp, "%s: Superblock update failed!", __func__);
613 return error;
614 }
615
616 xfs_mod_sb(tp, XFS_SB_QFLAGS);
617 return xfs_trans_commit(tp, 0);
618} 599}
619 600
620__uint64_t 601__uint64_t
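
/*
 * Sketch of the xfs_fs_writable() contract assumed by the call above:
 * the freeze level argument lets callers state how deep into the
 * freeze sequence writes are still acceptable. Body inferred, not
 * shown in this diff:
 */
bool
xfs_fs_writable(
        struct xfs_mount        *mp,
        int                     level)
{
        ASSERT(level > SB_UNFROZEN);
        if ((mp->m_super->s_writers.frozen >= level) ||
            XFS_FORCED_SHUTDOWN(mp) || (mp->m_flags & XFS_MOUNT_RDONLY))
                return false;
        return true;
}
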
@@ -659,26 +640,25 @@ xfs_mountfs(
659 xfs_sb_mount_common(mp, sbp); 640 xfs_sb_mount_common(mp, sbp);
660 641
661 /* 642 /*
662 * Check for mismatched features2 values. Older kernels 643 * Check for mismatched features2 values. Older kernels read & wrote
663 * read & wrote into the wrong sb offset for sb_features2 644 * into the wrong sb offset for sb_features2 on some platforms due to
664 * on some platforms due to xfs_sb_t not being 64bit size aligned 645 * xfs_sb_t not being 64bit size aligned when sb_features2 was added,
665 * when sb_features2 was added, which made older superblock 646 * which made older superblock reading/writing routines swap it as a
666 * reading/writing routines swap it as a 64-bit value. 647 * 64-bit value.
667 * 648 *
668 * For backwards compatibility, we make both slots equal. 649 * For backwards compatibility, we make both slots equal.
669 * 650 *
670 * If we detect a mismatched field, we OR the set bits into the 651 * If we detect a mismatched field, we OR the set bits into the existing
671 * existing features2 field in case it has already been modified; we 652 * features2 field in case it has already been modified; we don't want
672 * don't want to lose any features. We then update the bad location 653 * to lose any features. We then update the bad location with the ORed
673 * with the ORed value so that older kernels will see any features2 654 * value so that older kernels will see any features2 flags. The
674 * flags, and mark the two fields as needing updates once the 655 * superblock writeback code ensures the new sb_features2 is copied to
675 * transaction subsystem is online. 656 * sb_bad_features2 before it is logged or written to disk.
676 */ 657 */
677 if (xfs_sb_has_mismatched_features2(sbp)) { 658 if (xfs_sb_has_mismatched_features2(sbp)) {
678 xfs_warn(mp, "correcting sb_features alignment problem"); 659 xfs_warn(mp, "correcting sb_features alignment problem");
679 sbp->sb_features2 |= sbp->sb_bad_features2; 660 sbp->sb_features2 |= sbp->sb_bad_features2;
680 sbp->sb_bad_features2 = sbp->sb_features2; 661 mp->m_update_sb = true;
681 mp->m_update_flags |= XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2;
682 662
683 /* 663 /*
684 * Re-check for ATTR2 in case it was found in bad_features2 664 * Re-check for ATTR2 in case it was found in bad_features2
@@ -692,17 +672,17 @@ xfs_mountfs(
692 if (xfs_sb_version_hasattr2(&mp->m_sb) && 672 if (xfs_sb_version_hasattr2(&mp->m_sb) &&
693 (mp->m_flags & XFS_MOUNT_NOATTR2)) { 673 (mp->m_flags & XFS_MOUNT_NOATTR2)) {
694 xfs_sb_version_removeattr2(&mp->m_sb); 674 xfs_sb_version_removeattr2(&mp->m_sb);
695 mp->m_update_flags |= XFS_SB_FEATURES2; 675 mp->m_update_sb = true;
696 676
697 /* update sb_versionnum for the clearing of the morebits */ 677 /* update sb_versionnum for the clearing of the morebits */
698 if (!sbp->sb_features2) 678 if (!sbp->sb_features2)
699 mp->m_update_flags |= XFS_SB_VERSIONNUM; 679 mp->m_update_sb = true;
700 } 680 }
701 681
702 /* always use v2 inodes by default now */ 682 /* always use v2 inodes by default now */
703 if (!(mp->m_sb.sb_versionnum & XFS_SB_VERSION_NLINKBIT)) { 683 if (!(mp->m_sb.sb_versionnum & XFS_SB_VERSION_NLINKBIT)) {
704 mp->m_sb.sb_versionnum |= XFS_SB_VERSION_NLINKBIT; 684 mp->m_sb.sb_versionnum |= XFS_SB_VERSION_NLINKBIT;
705 mp->m_update_flags |= XFS_SB_VERSIONNUM; 685 mp->m_update_sb = true;
706 } 686 }
707 687
708 /* 688 /*
@@ -895,8 +875,8 @@ xfs_mountfs(
895 * the next remount into writeable mode. Otherwise we would never 875 * the next remount into writeable mode. Otherwise we would never
896 * perform the update e.g. for the root filesystem. 876 * perform the update e.g. for the root filesystem.
897 */ 877 */
898 if (mp->m_update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY)) { 878 if (mp->m_update_sb && !(mp->m_flags & XFS_MOUNT_RDONLY)) {
899 error = xfs_mount_log_sb(mp, mp->m_update_flags); 879 error = xfs_sync_sb(mp, false);
900 if (error) { 880 if (error) {
901 xfs_warn(mp, "failed to write sb changes"); 881 xfs_warn(mp, "failed to write sb changes");
902 goto out_rtunmount; 882 goto out_rtunmount;
@@ -1103,9 +1083,6 @@ xfs_fs_writable(
1103int 1083int
1104xfs_log_sbcount(xfs_mount_t *mp) 1084xfs_log_sbcount(xfs_mount_t *mp)
1105{ 1085{
1106 xfs_trans_t *tp;
1107 int error;
1108
1109 /* allow this to proceed during the freeze sequence... */ 1086 /* allow this to proceed during the freeze sequence... */
1110 if (!xfs_fs_writable(mp, SB_FREEZE_COMPLETE)) 1087 if (!xfs_fs_writable(mp, SB_FREEZE_COMPLETE))
1111 return 0; 1088 return 0;
@@ -1119,17 +1096,7 @@ xfs_log_sbcount(xfs_mount_t *mp)
1119 if (!xfs_sb_version_haslazysbcount(&mp->m_sb)) 1096 if (!xfs_sb_version_haslazysbcount(&mp->m_sb))
1120 return 0; 1097 return 0;
1121 1098
1122 tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT, KM_SLEEP); 1099 return xfs_sync_sb(mp, true);
1123 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0);
1124 if (error) {
1125 xfs_trans_cancel(tp, 0);
1126 return error;
1127 }
1128
1129 xfs_mod_sb(tp, XFS_SB_IFREE | XFS_SB_ICOUNT | XFS_SB_FDBLOCKS);
1130 xfs_trans_set_sync(tp);
1131 error = xfs_trans_commit(tp, 0);
1132 return error;
1133} 1100}
1134 1101
1135/* 1102/*
@@ -1423,34 +1390,6 @@ xfs_freesb(
1423} 1390}
1424 1391
1425/* 1392/*
1426 * Used to log changes to the superblock unit and width fields which could
1427 * be altered by the mount options, as well as any potential sb_features2
1428 * fixup. Only the first superblock is updated.
1429 */
1430int
1431xfs_mount_log_sb(
1432 xfs_mount_t *mp,
1433 __int64_t fields)
1434{
1435 xfs_trans_t *tp;
1436 int error;
1437
1438 ASSERT(fields & (XFS_SB_UNIT | XFS_SB_WIDTH | XFS_SB_UUID |
1439 XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2 |
1440 XFS_SB_VERSIONNUM));
1441
1442 tp = xfs_trans_alloc(mp, XFS_TRANS_SB_UNIT);
1443 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0);
1444 if (error) {
1445 xfs_trans_cancel(tp, 0);
1446 return error;
1447 }
1448 xfs_mod_sb(tp, fields);
1449 error = xfs_trans_commit(tp, 0);
1450 return error;
1451}
1452
1453/*
1454 * If the underlying (data/log/rt) device is readonly, there are some 1393 * If the underlying (data/log/rt) device is readonly, there are some
1455 * operations that cannot proceed. 1394 * operations that cannot proceed.
1456 */ 1395 */
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 22ccf69d4d3c..0d8abd6364d9 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -162,8 +162,7 @@ typedef struct xfs_mount {
162 struct delayed_work m_reclaim_work; /* background inode reclaim */ 162 struct delayed_work m_reclaim_work; /* background inode reclaim */
163 struct delayed_work m_eofblocks_work; /* background eof blocks 163 struct delayed_work m_eofblocks_work; /* background eof blocks
164 trimming */ 164 trimming */
165 __int64_t m_update_flags; /* sb flags we need to update 165 bool m_update_sb; /* sb needs update in mount */
166 on the next remount,rw */
167 int64_t m_low_space[XFS_LOWSP_MAX]; 166 int64_t m_low_space[XFS_LOWSP_MAX];
168 /* low free space thresholds */ 167 /* low free space thresholds */
169 struct xfs_kobj m_kobj; 168 struct xfs_kobj m_kobj;
@@ -175,6 +174,17 @@ typedef struct xfs_mount {
175 struct workqueue_struct *m_reclaim_workqueue; 174 struct workqueue_struct *m_reclaim_workqueue;
176 struct workqueue_struct *m_log_workqueue; 175 struct workqueue_struct *m_log_workqueue;
177 struct workqueue_struct *m_eofblocks_workqueue; 176 struct workqueue_struct *m_eofblocks_workqueue;
177
178 /*
179 * Generation of the filesystem layout. This is incremented by each
180 * growfs, and used by the pNFS server to ensure the client updates
181 * its view of the block device once it gets a layout that might
182 * reference the newly added blocks. Does not need to be persistent
183 * as long as we only allow file system size increments, but if we
184 * ever support shrinks it would have to be persisted in addition
185 * to various other kinds of pain inflicted on the pNFS server.
186 */
187 __uint32_t m_generation;
178} xfs_mount_t; 188} xfs_mount_t;
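
/*
 * Sketch of the producer side, assuming the growfs hook added
 * elsewhere in this series: the counter only needs to change across a
 * grow, so a bare increment under the grow lock is enough.
 */
int
xfs_growfs_data(
        struct xfs_mount        *mp,
        xfs_growfs_data_t       *in)
{
        int                     error;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
        if (!mutex_trylock(&mp->m_growlock))
                return -EWOULDBLOCK;
        error = xfs_growfs_data_private(mp, in);
        /* bump unconditionally; a layout may already be outstanding */
        mp->m_generation++;
        mutex_unlock(&mp->m_growlock);
        return error;
}
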
179 189
180/* 190/*
@@ -378,7 +388,7 @@ extern void xfs_unmountfs(xfs_mount_t *);
378extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int); 388extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int);
379extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *, 389extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *,
380 uint, int); 390 uint, int);
381extern int xfs_mount_log_sb(xfs_mount_t *, __int64_t); 391extern int xfs_mount_log_sb(xfs_mount_t *);
382extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int); 392extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int);
383extern int xfs_readsb(xfs_mount_t *, int); 393extern int xfs_readsb(xfs_mount_t *, int);
384extern void xfs_freesb(xfs_mount_t *); 394extern void xfs_freesb(xfs_mount_t *);
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
new file mode 100644
index 000000000000..4b33ef112400
--- /dev/null
+++ b/fs/xfs/xfs_pnfs.c
@@ -0,0 +1,322 @@
1/*
2 * Copyright (c) 2014 Christoph Hellwig.
3 */
4#include "xfs.h"
5#include "xfs_format.h"
6#include "xfs_log_format.h"
7#include "xfs_trans_resv.h"
8#include "xfs_sb.h"
9#include "xfs_mount.h"
10#include "xfs_inode.h"
11#include "xfs_trans.h"
12#include "xfs_log.h"
13#include "xfs_bmap.h"
14#include "xfs_bmap_util.h"
15#include "xfs_error.h"
16#include "xfs_iomap.h"
17#include "xfs_shared.h"
18#include "xfs_bit.h"
19#include "xfs_pnfs.h"
20
21/*
22 * Ensure that we do not have any outstanding pNFS layouts that can be used by
23 * clients to directly read from or write to this inode. This must be called
24 * before every operation that can remove blocks from the extent map.
25 * Additionally we call it during the write operation, where we aren't concerned
26 * about exposing unallocated blocks but just want to provide basic
27 * synchronization between a local writer and pNFS clients. mmap writes would
28 * also benefit from this sort of synchronization, but due to the tricky locking
29 * rules in the page fault path we don't bother.
30 */
31int
32xfs_break_layouts(
33 struct inode *inode,
34 uint *iolock)
35{
36 struct xfs_inode *ip = XFS_I(inode);
37 int error;
38
39 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL));
40
41 while ((error = break_layout(inode, false)) == -EWOULDBLOCK) {
42 xfs_iunlock(ip, *iolock);
43 error = break_layout(inode, true);
44 *iolock = XFS_IOLOCK_EXCL;
45 xfs_ilock(ip, *iolock);
46 }
47
48 return error;
49}
50
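
/*
 * Hypothetical caller sketch (ip and error are stand-ins): paths that
 * remove blocks take the iolock first, then break layouts before
 * touching the extent map. Note the helper may drop and retake the
 * lock, upgrading it to exclusive.
 */
uint    iolock = XFS_IOLOCK_EXCL;

xfs_ilock(ip, iolock);
error = xfs_break_layouts(VFS_I(ip), &iolock);
if (error) {
        xfs_iunlock(ip, iolock);
        return error;
}
/* now safe to punch holes or truncate extents */
xfs_iunlock(ip, iolock);
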
51/*
52 * Get a unique ID including its location so that the client can identify
53 * the exported device.
54 */
55int
56xfs_fs_get_uuid(
57 struct super_block *sb,
58 u8 *buf,
59 u32 *len,
60 u64 *offset)
61{
62 struct xfs_mount *mp = XFS_M(sb);
63
64 printk_once(KERN_NOTICE
65"XFS (%s): using experimental pNFS feature, use at your own risk!\n",
66 mp->m_fsname);
67
68 if (*len < sizeof(uuid_t))
69 return -EINVAL;
70
71 memcpy(buf, &mp->m_sb.sb_uuid, sizeof(uuid_t));
72 *len = sizeof(uuid_t);
73 *offset = offsetof(struct xfs_dsb, sb_uuid);
74 return 0;
75}
76
77static void
78xfs_bmbt_to_iomap(
79 struct xfs_inode *ip,
80 struct iomap *iomap,
81 struct xfs_bmbt_irec *imap)
82{
83 struct xfs_mount *mp = ip->i_mount;
84
85 if (imap->br_startblock == HOLESTARTBLOCK) {
86 iomap->blkno = IOMAP_NULL_BLOCK;
87 iomap->type = IOMAP_HOLE;
88 } else if (imap->br_startblock == DELAYSTARTBLOCK) {
89 iomap->blkno = IOMAP_NULL_BLOCK;
90 iomap->type = IOMAP_DELALLOC;
91 } else {
92 iomap->blkno =
93 XFS_FSB_TO_DADDR(ip->i_mount, imap->br_startblock);
94 if (imap->br_state == XFS_EXT_UNWRITTEN)
95 iomap->type = IOMAP_UNWRITTEN;
96 else
97 iomap->type = IOMAP_MAPPED;
98 }
99 iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
100 iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
101}
102
103/*
104 * Get a layout for the pNFS client.
105 */
106int
107xfs_fs_map_blocks(
108 struct inode *inode,
109 loff_t offset,
110 u64 length,
111 struct iomap *iomap,
112 bool write,
113 u32 *device_generation)
114{
115 struct xfs_inode *ip = XFS_I(inode);
116 struct xfs_mount *mp = ip->i_mount;
117 struct xfs_bmbt_irec imap;
118 xfs_fileoff_t offset_fsb, end_fsb;
119 loff_t limit;
120 int bmapi_flags = XFS_BMAPI_ENTIRE;
121 int nimaps = 1;
122 uint lock_flags;
123 int error = 0;
124
125 if (XFS_FORCED_SHUTDOWN(mp))
126 return -EIO;
127
128 /*
129 * We can't export inodes residing on the realtime device. The realtime
130 * device doesn't have a UUID to identify it, so the client has no way
131 * to find it.
132 */
133 if (XFS_IS_REALTIME_INODE(ip))
134 return -ENXIO;
135
136 /*
137 * Lock out any other I/O before we flush and invalidate the pagecache,
138 * and then hand out a layout to the remote system. This is very
139 * similar to direct I/O, except that the synchronization is much more
140 * complicated. See the comment near xfs_break_layouts for a detailed
141 * explanation.
142 */
143 xfs_ilock(ip, XFS_IOLOCK_EXCL);
144
145 error = -EINVAL;
146 limit = mp->m_super->s_maxbytes;
147 if (!write)
148 limit = max(limit, round_up(i_size_read(inode),
149 inode->i_sb->s_blocksize));
150 if (offset > limit)
151 goto out_unlock;
152 if (offset > limit - length)
153 length = limit - offset;
154
155 error = filemap_write_and_wait(inode->i_mapping);
156 if (error)
157 goto out_unlock;
158 error = invalidate_inode_pages2(inode->i_mapping);
159 if (WARN_ON_ONCE(error))
160 goto out_unlock;
161
162 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + length);
163 offset_fsb = XFS_B_TO_FSBT(mp, offset);
164
165 lock_flags = xfs_ilock_data_map_shared(ip);
166 error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
167 &imap, &nimaps, bmapi_flags);
168 xfs_iunlock(ip, lock_flags);
169
170 if (error)
171 goto out_unlock;
172
173 if (write) {
174 enum xfs_prealloc_flags flags = 0;
175
176 ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
177
178 if (!nimaps || imap.br_startblock == HOLESTARTBLOCK) {
179 error = xfs_iomap_write_direct(ip, offset, length,
180 &imap, nimaps);
181 if (error)
182 goto out_unlock;
183
184 /*
185 * Ensure the next transaction is committed
186 * synchronously so that the blocks allocated and
187 * handed out to the client are guaranteed to be
188 * present even after a server crash.
189 */
190 flags |= XFS_PREALLOC_SET | XFS_PREALLOC_SYNC;
191 }
192
193 error = xfs_update_prealloc_flags(ip, flags);
194 if (error)
195 goto out_unlock;
196 }
197 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
198
199 xfs_bmbt_to_iomap(ip, iomap, &imap);
200 *device_generation = mp->m_generation;
201 return error;
202out_unlock:
203 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
204 return error;
205}
206
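
/*
 * How these entry points are expected to be wired up (sketch; the
 * export_operations extension lives on the nfsd side of this series,
 * not in this file):
 */
static const struct export_operations xfs_export_operations = {
        /* ... existing fh_to_dentry/fh_to_parent/commit_metadata ... */
#ifdef CONFIG_NFSD_PNFS
        .get_uuid               = xfs_fs_get_uuid,
        .map_blocks             = xfs_fs_map_blocks,
        .commit_blocks          = xfs_fs_commit_blocks,
#endif
};
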
207/*
208 * Ensure the size update falls into a valid allocated block.
209 */
210static int
211xfs_pnfs_validate_isize(
212 struct xfs_inode *ip,
213 xfs_off_t isize)
214{
215 struct xfs_bmbt_irec imap;
216 int nimaps = 1;
217 int error = 0;
218
219 xfs_ilock(ip, XFS_ILOCK_SHARED);
220 error = xfs_bmapi_read(ip, XFS_B_TO_FSBT(ip->i_mount, isize - 1), 1,
221 &imap, &nimaps, 0);
222 xfs_iunlock(ip, XFS_ILOCK_SHARED);
223 if (error)
224 return error;
225
226 if (imap.br_startblock == HOLESTARTBLOCK ||
227 imap.br_startblock == DELAYSTARTBLOCK ||
228 imap.br_state == XFS_EXT_UNWRITTEN)
229 return -EIO;
230 return 0;
231}
232
233/*
234 * Make sure the blocks described by maps are stable on disk. This includes
235 * converting any unwritten extents, flushing the disk cache and updating the
236 * time stamps.
237 *
238 * Note that we rely on the caller to always send us a timestamp update so that
239 * we always commit a transaction here. If that stops being true we will have
240 * to manually flush the cache here similar to what the fsync code path does
241 * for datasyncs on files that have no dirty metadata.
242 */
243int
244xfs_fs_commit_blocks(
245 struct inode *inode,
246 struct iomap *maps,
247 int nr_maps,
248 struct iattr *iattr)
249{
250 struct xfs_inode *ip = XFS_I(inode);
251 struct xfs_mount *mp = ip->i_mount;
252 struct xfs_trans *tp;
253 bool update_isize = false;
254 int error, i;
255 loff_t size;
256
257 ASSERT(iattr->ia_valid & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME));
258
259 xfs_ilock(ip, XFS_IOLOCK_EXCL);
260
261 size = i_size_read(inode);
262 if ((iattr->ia_valid & ATTR_SIZE) && iattr->ia_size > size) {
263 update_isize = true;
264 size = iattr->ia_size;
265 }
266
267 for (i = 0; i < nr_maps; i++) {
268 u64 start, length, end;
269
270 start = maps[i].offset;
271 if (start > size)
272 continue;
273
274 end = start + maps[i].length;
275 if (end > size)
276 end = size;
277
278 length = end - start;
279 if (!length)
280 continue;
281
282 /*
283 * Make sure reads through the pagecache see the new data.
284 */
285 error = invalidate_inode_pages2_range(inode->i_mapping,
286 start >> PAGE_CACHE_SHIFT,
287 (end - 1) >> PAGE_CACHE_SHIFT);
288 WARN_ON_ONCE(error);
289
290 error = xfs_iomap_write_unwritten(ip, start, length);
291 if (error)
292 goto out_drop_iolock;
293 }
294
295 if (update_isize) {
296 error = xfs_pnfs_validate_isize(ip, size);
297 if (error)
298 goto out_drop_iolock;
299 }
300
301 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
302 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
303 if (error)
304 goto out_drop_iolock;
305
306 xfs_ilock(ip, XFS_ILOCK_EXCL);
307 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
308 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
309
310 xfs_setattr_time(ip, iattr);
311 if (update_isize) {
312 i_size_write(inode, iattr->ia_size);
313 ip->i_d.di_size = iattr->ia_size;
314 }
315
316 xfs_trans_set_sync(tp);
317 error = xfs_trans_commit(tp, 0);
318
319out_drop_iolock:
320 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
321 return error;
322}
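
/*
 * Sketch of the assumed nfsd-side usage (new_size, maps, nr_maps are
 * stand-ins): LAYOUTCOMMIT passes the committed ranges together with a
 * timestamp (and possibly size) update, which is what lets this
 * function always commit a transaction.
 */
struct iattr    iattr = { .ia_valid = ATTR_ATIME | ATTR_CTIME | ATTR_MTIME };

iattr.ia_atime = iattr.ia_ctime = iattr.ia_mtime =
        current_fs_time(inode->i_sb);
if (new_size > i_size_read(inode)) {
        iattr.ia_valid |= ATTR_SIZE;
        iattr.ia_size = new_size;
}
error = xfs_fs_commit_blocks(inode, maps, nr_maps, &iattr);
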
diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h
new file mode 100644
index 000000000000..b7fbfce660f6
--- /dev/null
+++ b/fs/xfs/xfs_pnfs.h
@@ -0,0 +1,18 @@
1#ifndef _XFS_PNFS_H
2#define _XFS_PNFS_H 1
3
4#ifdef CONFIG_NFSD_PNFS
5int xfs_fs_get_uuid(struct super_block *sb, u8 *buf, u32 *len, u64 *offset);
6int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length,
7 struct iomap *iomap, bool write, u32 *device_generation);
8int xfs_fs_commit_blocks(struct inode *inode, struct iomap *maps, int nr_maps,
9 struct iattr *iattr);
10
11int xfs_break_layouts(struct inode *inode, uint *iolock);
12#else
13static inline int xfs_break_layouts(struct inode *inode, uint *iolock)
14{
15 return 0;
16}
17#endif /* CONFIG_NFSD_PNFS */
18#endif /* _XFS_PNFS_H */
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 79fb19dd9c83..53cc2aaf8d2b 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -430,6 +430,7 @@ struct xfs_qm_isolate {
430static enum lru_status 430static enum lru_status
431xfs_qm_dquot_isolate( 431xfs_qm_dquot_isolate(
432 struct list_head *item, 432 struct list_head *item,
433 struct list_lru_one *lru,
433 spinlock_t *lru_lock, 434 spinlock_t *lru_lock,
434 void *arg) 435 void *arg)
435 __releases(lru_lock) __acquires(lru_lock) 436 __releases(lru_lock) __acquires(lru_lock)
@@ -450,7 +451,7 @@ xfs_qm_dquot_isolate(
450 XFS_STATS_INC(xs_qm_dqwants); 451 XFS_STATS_INC(xs_qm_dqwants);
451 452
452 trace_xfs_dqreclaim_want(dqp); 453 trace_xfs_dqreclaim_want(dqp);
453 list_del_init(&dqp->q_lru); 454 list_lru_isolate(lru, &dqp->q_lru);
454 XFS_STATS_DEC(xs_qm_dquot_unused); 455 XFS_STATS_DEC(xs_qm_dquot_unused);
455 return LRU_REMOVED; 456 return LRU_REMOVED;
456 } 457 }
@@ -494,7 +495,7 @@ xfs_qm_dquot_isolate(
494 xfs_dqunlock(dqp); 495 xfs_dqunlock(dqp);
495 496
496 ASSERT(dqp->q_nrefs == 0); 497 ASSERT(dqp->q_nrefs == 0);
497 list_move_tail(&dqp->q_lru, &isol->dispose); 498 list_lru_isolate_move(lru, &dqp->q_lru, &isol->dispose);
498 XFS_STATS_DEC(xs_qm_dquot_unused); 499 XFS_STATS_DEC(xs_qm_dquot_unused);
499 trace_xfs_dqreclaim_done(dqp); 500 trace_xfs_dqreclaim_done(dqp);
500 XFS_STATS_INC(xs_qm_dqreclaims); 501 XFS_STATS_INC(xs_qm_dqreclaims);
@@ -523,7 +524,6 @@ xfs_qm_shrink_scan(
523 struct xfs_qm_isolate isol; 524 struct xfs_qm_isolate isol;
524 unsigned long freed; 525 unsigned long freed;
525 int error; 526 int error;
526 unsigned long nr_to_scan = sc->nr_to_scan;
527 527
528 if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT)) 528 if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT))
529 return 0; 529 return 0;
@@ -531,8 +531,8 @@ xfs_qm_shrink_scan(
531 INIT_LIST_HEAD(&isol.buffers); 531 INIT_LIST_HEAD(&isol.buffers);
532 INIT_LIST_HEAD(&isol.dispose); 532 INIT_LIST_HEAD(&isol.dispose);
533 533
534 freed = list_lru_walk_node(&qi->qi_lru, sc->nid, xfs_qm_dquot_isolate, &isol, 534 freed = list_lru_shrink_walk(&qi->qi_lru, sc,
535 &nr_to_scan); 535 xfs_qm_dquot_isolate, &isol);
536 536
537 error = xfs_buf_delwri_submit(&isol.buffers); 537 error = xfs_buf_delwri_submit(&isol.buffers);
538 if (error) 538 if (error)
@@ -557,7 +557,7 @@ xfs_qm_shrink_count(
557 struct xfs_quotainfo *qi = container_of(shrink, 557 struct xfs_quotainfo *qi = container_of(shrink,
558 struct xfs_quotainfo, qi_shrinker); 558 struct xfs_quotainfo, qi_shrinker);
559 559
560 return list_lru_count_node(&qi->qi_lru, sc->nid); 560 return list_lru_shrink_count(&qi->qi_lru, sc);
561} 561}
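
/*
 * The memcg-aware list_lru interfaces this file converts to;
 * signatures as assumed from the call sites above:
 */
unsigned long list_lru_shrink_count(struct list_lru *lru,
                                    struct shrink_control *sc);
unsigned long list_lru_shrink_walk(struct list_lru *lru,
                                   struct shrink_control *sc,
                                   list_lru_walk_cb isolate, void *cb_arg);
void list_lru_isolate(struct list_lru_one *list, struct list_head *item);
void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item,
                           struct list_head *head);
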
562 562
563/* 563/*
@@ -714,7 +714,6 @@ STATIC int
714xfs_qm_qino_alloc( 714xfs_qm_qino_alloc(
715 xfs_mount_t *mp, 715 xfs_mount_t *mp,
716 xfs_inode_t **ip, 716 xfs_inode_t **ip,
717 __int64_t sbfields,
718 uint flags) 717 uint flags)
719{ 718{
720 xfs_trans_t *tp; 719 xfs_trans_t *tp;
@@ -777,11 +776,6 @@ xfs_qm_qino_alloc(
777 spin_lock(&mp->m_sb_lock); 776 spin_lock(&mp->m_sb_lock);
778 if (flags & XFS_QMOPT_SBVERSION) { 777 if (flags & XFS_QMOPT_SBVERSION) {
779 ASSERT(!xfs_sb_version_hasquota(&mp->m_sb)); 778 ASSERT(!xfs_sb_version_hasquota(&mp->m_sb));
780 ASSERT((sbfields & (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
781 XFS_SB_GQUOTINO | XFS_SB_PQUOTINO | XFS_SB_QFLAGS)) ==
782 (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
783 XFS_SB_GQUOTINO | XFS_SB_PQUOTINO |
784 XFS_SB_QFLAGS));
785 779
786 xfs_sb_version_addquota(&mp->m_sb); 780 xfs_sb_version_addquota(&mp->m_sb);
787 mp->m_sb.sb_uquotino = NULLFSINO; 781 mp->m_sb.sb_uquotino = NULLFSINO;
@@ -798,7 +792,7 @@ xfs_qm_qino_alloc(
798 else 792 else
799 mp->m_sb.sb_pquotino = (*ip)->i_ino; 793 mp->m_sb.sb_pquotino = (*ip)->i_ino;
800 spin_unlock(&mp->m_sb_lock); 794 spin_unlock(&mp->m_sb_lock);
801 xfs_mod_sb(tp, sbfields); 795 xfs_log_sb(tp);
802 796
803 if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) { 797 if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) {
804 xfs_alert(mp, "%s failed (error %d)!", __func__, error); 798 xfs_alert(mp, "%s failed (error %d)!", __func__, error);
@@ -1451,7 +1445,7 @@ xfs_qm_mount_quotas(
1451 spin_unlock(&mp->m_sb_lock); 1445 spin_unlock(&mp->m_sb_lock);
1452 1446
1453 if (sbf != (mp->m_qflags & XFS_MOUNT_QUOTA_ALL)) { 1447 if (sbf != (mp->m_qflags & XFS_MOUNT_QUOTA_ALL)) {
1454 if (xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS)) { 1448 if (xfs_sync_sb(mp, false)) {
1455 /* 1449 /*
1456 * We could only have been turning quotas off. 1450 * We could only have been turning quotas off.
1457 * We aren't in very good shape actually because 1451 * We aren't in very good shape actually because
@@ -1482,7 +1476,6 @@ xfs_qm_init_quotainos(
1482 struct xfs_inode *gip = NULL; 1476 struct xfs_inode *gip = NULL;
1483 struct xfs_inode *pip = NULL; 1477 struct xfs_inode *pip = NULL;
1484 int error; 1478 int error;
1485 __int64_t sbflags = 0;
1486 uint flags = 0; 1479 uint flags = 0;
1487 1480
1488 ASSERT(mp->m_quotainfo); 1481 ASSERT(mp->m_quotainfo);
@@ -1517,9 +1510,6 @@ xfs_qm_init_quotainos(
1517 } 1510 }
1518 } else { 1511 } else {
1519 flags |= XFS_QMOPT_SBVERSION; 1512 flags |= XFS_QMOPT_SBVERSION;
1520 sbflags |= (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
1521 XFS_SB_GQUOTINO | XFS_SB_PQUOTINO |
1522 XFS_SB_QFLAGS);
1523 } 1513 }
1524 1514
1525 /* 1515 /*
@@ -1530,7 +1520,6 @@ xfs_qm_init_quotainos(
1530 */ 1520 */
1531 if (XFS_IS_UQUOTA_ON(mp) && uip == NULL) { 1521 if (XFS_IS_UQUOTA_ON(mp) && uip == NULL) {
1532 error = xfs_qm_qino_alloc(mp, &uip, 1522 error = xfs_qm_qino_alloc(mp, &uip,
1533 sbflags | XFS_SB_UQUOTINO,
1534 flags | XFS_QMOPT_UQUOTA); 1523 flags | XFS_QMOPT_UQUOTA);
1535 if (error) 1524 if (error)
1536 goto error_rele; 1525 goto error_rele;
@@ -1539,7 +1528,6 @@ xfs_qm_init_quotainos(
1539 } 1528 }
1540 if (XFS_IS_GQUOTA_ON(mp) && gip == NULL) { 1529 if (XFS_IS_GQUOTA_ON(mp) && gip == NULL) {
1541 error = xfs_qm_qino_alloc(mp, &gip, 1530 error = xfs_qm_qino_alloc(mp, &gip,
1542 sbflags | XFS_SB_GQUOTINO,
1543 flags | XFS_QMOPT_GQUOTA); 1531 flags | XFS_QMOPT_GQUOTA);
1544 if (error) 1532 if (error)
1545 goto error_rele; 1533 goto error_rele;
@@ -1548,7 +1536,6 @@ xfs_qm_init_quotainos(
1548 } 1536 }
1549 if (XFS_IS_PQUOTA_ON(mp) && pip == NULL) { 1537 if (XFS_IS_PQUOTA_ON(mp) && pip == NULL) {
1550 error = xfs_qm_qino_alloc(mp, &pip, 1538 error = xfs_qm_qino_alloc(mp, &pip,
1551 sbflags | XFS_SB_PQUOTINO,
1552 flags | XFS_QMOPT_PQUOTA); 1539 flags | XFS_QMOPT_PQUOTA);
1553 if (error) 1540 if (error)
1554 goto error_rele; 1541 goto error_rele;
@@ -1587,32 +1574,6 @@ xfs_qm_dqfree_one(
1587 xfs_qm_dqdestroy(dqp); 1574 xfs_qm_dqdestroy(dqp);
1588} 1575}
1589 1576
1590/*
1591 * Start a transaction and write the incore superblock changes to
1592 * disk. flags parameter indicates which fields have changed.
1593 */
1594int
1595xfs_qm_write_sb_changes(
1596 xfs_mount_t *mp,
1597 __int64_t flags)
1598{
1599 xfs_trans_t *tp;
1600 int error;
1601
1602 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
1603 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_sbchange, 0, 0);
1604 if (error) {
1605 xfs_trans_cancel(tp, 0);
1606 return error;
1607 }
1608
1609 xfs_mod_sb(tp, flags);
1610 error = xfs_trans_commit(tp, 0);
1611
1612 return error;
1613}
1614
1615
1616/* --------------- utility functions for vnodeops ---------------- */ 1577/* --------------- utility functions for vnodeops ---------------- */
1617 1578
1618 1579
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 3a07a937e232..0d4d3590cf85 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -157,7 +157,6 @@ struct xfs_dquot_acct {
157#define XFS_QM_RTBWARNLIMIT 5 157#define XFS_QM_RTBWARNLIMIT 5
158 158
159extern void xfs_qm_destroy_quotainfo(struct xfs_mount *); 159extern void xfs_qm_destroy_quotainfo(struct xfs_mount *);
160extern int xfs_qm_write_sb_changes(struct xfs_mount *, __int64_t);
161 160
162/* dquot stuff */ 161/* dquot stuff */
163extern void xfs_qm_dqpurge_all(struct xfs_mount *, uint); 162extern void xfs_qm_dqpurge_all(struct xfs_mount *, uint);
@@ -166,9 +165,9 @@ extern void xfs_qm_dqrele_all_inodes(struct xfs_mount *, uint);
166/* quota ops */ 165/* quota ops */
167extern int xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint); 166extern int xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint);
168extern int xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t, 167extern int xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t,
169 uint, struct fs_disk_quota *); 168 uint, struct qc_dqblk *);
170extern int xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint, 169extern int xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint,
171 struct fs_disk_quota *); 170 struct qc_dqblk *);
172extern int xfs_qm_scall_getqstat(struct xfs_mount *, 171extern int xfs_qm_scall_getqstat(struct xfs_mount *,
173 struct fs_quota_stat *); 172 struct fs_quota_stat *);
174extern int xfs_qm_scall_getqstatv(struct xfs_mount *, 173extern int xfs_qm_scall_getqstatv(struct xfs_mount *,
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 74fca68e43b6..9b965db45800 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -39,7 +39,6 @@ STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint);
39STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *, 39STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *,
40 uint); 40 uint);
41STATIC uint xfs_qm_export_flags(uint); 41STATIC uint xfs_qm_export_flags(uint);
42STATIC uint xfs_qm_export_qtype_flags(uint);
43 42
44/* 43/*
45 * Turn off quota accounting and/or enforcement for all udquots and/or 44 * Turn off quota accounting and/or enforcement for all udquots and/or
@@ -92,8 +91,7 @@ xfs_qm_scall_quotaoff(
92 mutex_unlock(&q->qi_quotaofflock); 91 mutex_unlock(&q->qi_quotaofflock);
93 92
94 /* XXX what to do if error ? Revert back to old vals incore ? */ 93 /* XXX what to do if error ? Revert back to old vals incore ? */
95 error = xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS); 94 return xfs_sync_sb(mp, false);
96 return error;
97 } 95 }
98 96
99 dqtype = 0; 97 dqtype = 0;
@@ -314,7 +312,6 @@ xfs_qm_scall_quotaon(
314{ 312{
315 int error; 313 int error;
316 uint qf; 314 uint qf;
317 __int64_t sbflags;
318 315
319 flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD); 316 flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD);
320 /* 317 /*
@@ -322,30 +319,22 @@ xfs_qm_scall_quotaon(
322 */ 319 */
323 flags &= ~(XFS_ALL_QUOTA_ACCT); 320 flags &= ~(XFS_ALL_QUOTA_ACCT);
324 321
325 sbflags = 0;
326
327 if (flags == 0) { 322 if (flags == 0) {
328 xfs_debug(mp, "%s: zero flags, m_qflags=%x", 323 xfs_debug(mp, "%s: zero flags, m_qflags=%x",
329 __func__, mp->m_qflags); 324 __func__, mp->m_qflags);
330 return -EINVAL; 325 return -EINVAL;
331 } 326 }
332 327
333 /* No fs can turn on quotas with a delayed effect */
334 ASSERT((flags & XFS_ALL_QUOTA_ACCT) == 0);
335
336 /* 328 /*
337 * Can't enforce without accounting. We check the superblock 329 * Can't enforce without accounting. We check the superblock
338 * qflags here instead of m_qflags because rootfs can have 330 * qflags here instead of m_qflags because rootfs can have
339 * quota acct on disk without m_qflags knowing. 331 * quota acct on disk without m_qflags knowing.
340 */ 332 */
341 if (((flags & XFS_UQUOTA_ACCT) == 0 && 333 if (((mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 &&
342 (mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 &&
343 (flags & XFS_UQUOTA_ENFD)) || 334 (flags & XFS_UQUOTA_ENFD)) ||
344 ((flags & XFS_GQUOTA_ACCT) == 0 && 335 ((mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 &&
345 (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 &&
346 (flags & XFS_GQUOTA_ENFD)) || 336 (flags & XFS_GQUOTA_ENFD)) ||
347 ((flags & XFS_PQUOTA_ACCT) == 0 && 337 ((mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 &&
348 (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 &&
349 (flags & XFS_PQUOTA_ENFD))) { 338 (flags & XFS_PQUOTA_ENFD))) {
350 xfs_debug(mp, 339 xfs_debug(mp,
351 "%s: Can't enforce without acct, flags=%x sbflags=%x", 340 "%s: Can't enforce without acct, flags=%x sbflags=%x",
@@ -370,11 +359,11 @@ xfs_qm_scall_quotaon(
370 /* 359 /*
371 * There's nothing to change if it's the same. 360 * There's nothing to change if it's the same.
372 */ 361 */
373 if ((qf & flags) == flags && sbflags == 0) 362 if ((qf & flags) == flags)
374 return -EEXIST; 363 return -EEXIST;
375 sbflags |= XFS_SB_QFLAGS;
376 364
377 if ((error = xfs_qm_write_sb_changes(mp, sbflags))) 365 error = xfs_sync_sb(mp, false);
366 if (error)
378 return error; 367 return error;
379 /* 368 /*
380 * If we aren't trying to switch on quota enforcement, we are done. 369 * If we aren't trying to switch on quota enforcement, we are done.
@@ -384,8 +373,7 @@ xfs_qm_scall_quotaon(
384 ((mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) != 373 ((mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) !=
385 (mp->m_qflags & XFS_PQUOTA_ACCT)) || 374 (mp->m_qflags & XFS_PQUOTA_ACCT)) ||
386 ((mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) != 375 ((mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) !=
387 (mp->m_qflags & XFS_GQUOTA_ACCT)) || 376 (mp->m_qflags & XFS_GQUOTA_ACCT)))
388 (flags & XFS_ALL_QUOTA_ENFD) == 0)
389 return 0; 377 return 0;
390 378
391 if (! XFS_IS_QUOTA_RUNNING(mp)) 379 if (! XFS_IS_QUOTA_RUNNING(mp))
@@ -422,20 +410,12 @@ xfs_qm_scall_getqstat(
422 memset(out, 0, sizeof(fs_quota_stat_t)); 410 memset(out, 0, sizeof(fs_quota_stat_t));
423 411
424 out->qs_version = FS_QSTAT_VERSION; 412 out->qs_version = FS_QSTAT_VERSION;
425 if (!xfs_sb_version_hasquota(&mp->m_sb)) {
426 out->qs_uquota.qfs_ino = NULLFSINO;
427 out->qs_gquota.qfs_ino = NULLFSINO;
428 return 0;
429 }
430
431 out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags & 413 out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags &
432 (XFS_ALL_QUOTA_ACCT| 414 (XFS_ALL_QUOTA_ACCT|
433 XFS_ALL_QUOTA_ENFD)); 415 XFS_ALL_QUOTA_ENFD));
434 if (q) { 416 uip = q->qi_uquotaip;
435 uip = q->qi_uquotaip; 417 gip = q->qi_gquotaip;
436 gip = q->qi_gquotaip; 418 pip = q->qi_pquotaip;
437 pip = q->qi_pquotaip;
438 }
439 if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) { 419 if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) {
440 if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 420 if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
441 0, 0, &uip) == 0) 421 0, 0, &uip) == 0)
@@ -481,14 +461,13 @@ xfs_qm_scall_getqstat(
481 if (temppqip) 461 if (temppqip)
482 IRELE(pip); 462 IRELE(pip);
483 } 463 }
484 if (q) { 464 out->qs_incoredqs = q->qi_dquots;
485 out->qs_incoredqs = q->qi_dquots; 465 out->qs_btimelimit = q->qi_btimelimit;
486 out->qs_btimelimit = q->qi_btimelimit; 466 out->qs_itimelimit = q->qi_itimelimit;
487 out->qs_itimelimit = q->qi_itimelimit; 467 out->qs_rtbtimelimit = q->qi_rtbtimelimit;
488 out->qs_rtbtimelimit = q->qi_rtbtimelimit; 468 out->qs_bwarnlimit = q->qi_bwarnlimit;
489 out->qs_bwarnlimit = q->qi_bwarnlimit; 469 out->qs_iwarnlimit = q->qi_iwarnlimit;
490 out->qs_iwarnlimit = q->qi_iwarnlimit; 470
491 }
492 return 0; 471 return 0;
493} 472}
494 473
@@ -509,13 +488,6 @@ xfs_qm_scall_getqstatv(
509 bool tempgqip = false; 488 bool tempgqip = false;
510 bool temppqip = false; 489 bool temppqip = false;
511 490
512 if (!xfs_sb_version_hasquota(&mp->m_sb)) {
513 out->qs_uquota.qfs_ino = NULLFSINO;
514 out->qs_gquota.qfs_ino = NULLFSINO;
515 out->qs_pquota.qfs_ino = NULLFSINO;
516 return 0;
517 }
518
519 out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags & 491 out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags &
520 (XFS_ALL_QUOTA_ACCT| 492 (XFS_ALL_QUOTA_ACCT|
521 XFS_ALL_QUOTA_ENFD)); 493 XFS_ALL_QUOTA_ENFD));
@@ -523,11 +495,9 @@ xfs_qm_scall_getqstatv(
523 out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino; 495 out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino;
524 out->qs_pquota.qfs_ino = mp->m_sb.sb_pquotino; 496 out->qs_pquota.qfs_ino = mp->m_sb.sb_pquotino;
525 497
526 if (q) { 498 uip = q->qi_uquotaip;
527 uip = q->qi_uquotaip; 499 gip = q->qi_gquotaip;
528 gip = q->qi_gquotaip; 500 pip = q->qi_pquotaip;
529 pip = q->qi_pquotaip;
530 }
531 if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) { 501 if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) {
532 if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 502 if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
533 0, 0, &uip) == 0) 503 0, 0, &uip) == 0)
@@ -562,19 +532,18 @@ xfs_qm_scall_getqstatv(
562 if (temppqip) 532 if (temppqip)
563 IRELE(pip); 533 IRELE(pip);
564 } 534 }
565 if (q) { 535 out->qs_incoredqs = q->qi_dquots;
566 out->qs_incoredqs = q->qi_dquots; 536 out->qs_btimelimit = q->qi_btimelimit;
567 out->qs_btimelimit = q->qi_btimelimit; 537 out->qs_itimelimit = q->qi_itimelimit;
568 out->qs_itimelimit = q->qi_itimelimit; 538 out->qs_rtbtimelimit = q->qi_rtbtimelimit;
569 out->qs_rtbtimelimit = q->qi_rtbtimelimit; 539 out->qs_bwarnlimit = q->qi_bwarnlimit;
570 out->qs_bwarnlimit = q->qi_bwarnlimit; 540 out->qs_iwarnlimit = q->qi_iwarnlimit;
571 out->qs_iwarnlimit = q->qi_iwarnlimit; 541
572 }
573 return 0; 542 return 0;
574} 543}
575 544
576#define XFS_DQ_MASK \ 545#define XFS_QC_MASK \
577 (FS_DQ_LIMIT_MASK | FS_DQ_TIMER_MASK | FS_DQ_WARNS_MASK) 546 (QC_LIMIT_MASK | QC_TIMER_MASK | QC_WARNS_MASK)
578 547
579/* 548/*
580 * Adjust quota limits, and start/stop timers accordingly. 549 * Adjust quota limits, and start/stop timers accordingly.
@@ -584,7 +553,7 @@ xfs_qm_scall_setqlim(
584 struct xfs_mount *mp, 553 struct xfs_mount *mp,
585 xfs_dqid_t id, 554 xfs_dqid_t id,
586 uint type, 555 uint type,
587 fs_disk_quota_t *newlim) 556 struct qc_dqblk *newlim)
588{ 557{
589 struct xfs_quotainfo *q = mp->m_quotainfo; 558 struct xfs_quotainfo *q = mp->m_quotainfo;
590 struct xfs_disk_dquot *ddq; 559 struct xfs_disk_dquot *ddq;
@@ -593,9 +562,9 @@ xfs_qm_scall_setqlim(
593 int error; 562 int error;
594 xfs_qcnt_t hard, soft; 563 xfs_qcnt_t hard, soft;
595 564
596 if (newlim->d_fieldmask & ~XFS_DQ_MASK) 565 if (newlim->d_fieldmask & ~XFS_QC_MASK)
597 return -EINVAL; 566 return -EINVAL;
598 if ((newlim->d_fieldmask & XFS_DQ_MASK) == 0) 567 if ((newlim->d_fieldmask & XFS_QC_MASK) == 0)
599 return 0; 568 return 0;
600 569
601 /* 570 /*
@@ -633,11 +602,11 @@ xfs_qm_scall_setqlim(
633 /* 602 /*
634 * Make sure that hardlimits are >= soft limits before changing. 603 * Make sure that hardlimits are >= soft limits before changing.
635 */ 604 */
636 hard = (newlim->d_fieldmask & FS_DQ_BHARD) ? 605 hard = (newlim->d_fieldmask & QC_SPC_HARD) ?
637 (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_blk_hardlimit) : 606 (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_spc_hardlimit) :
638 be64_to_cpu(ddq->d_blk_hardlimit); 607 be64_to_cpu(ddq->d_blk_hardlimit);
639 soft = (newlim->d_fieldmask & FS_DQ_BSOFT) ? 608 soft = (newlim->d_fieldmask & QC_SPC_SOFT) ?
640 (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_blk_softlimit) : 609 (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_spc_softlimit) :
641 be64_to_cpu(ddq->d_blk_softlimit); 610 be64_to_cpu(ddq->d_blk_softlimit);
642 if (hard == 0 || hard >= soft) { 611 if (hard == 0 || hard >= soft) {
643 ddq->d_blk_hardlimit = cpu_to_be64(hard); 612 ddq->d_blk_hardlimit = cpu_to_be64(hard);
@@ -650,11 +619,11 @@ xfs_qm_scall_setqlim(
650 } else { 619 } else {
651 xfs_debug(mp, "blkhard %Ld < blksoft %Ld", hard, soft); 620 xfs_debug(mp, "blkhard %Ld < blksoft %Ld", hard, soft);
652 } 621 }
653 hard = (newlim->d_fieldmask & FS_DQ_RTBHARD) ? 622 hard = (newlim->d_fieldmask & QC_RT_SPC_HARD) ?
654 (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_hardlimit) : 623 (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_rt_spc_hardlimit) :
655 be64_to_cpu(ddq->d_rtb_hardlimit); 624 be64_to_cpu(ddq->d_rtb_hardlimit);
656 soft = (newlim->d_fieldmask & FS_DQ_RTBSOFT) ? 625 soft = (newlim->d_fieldmask & QC_RT_SPC_SOFT) ?
657 (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_softlimit) : 626 (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_rt_spc_softlimit) :
658 be64_to_cpu(ddq->d_rtb_softlimit); 627 be64_to_cpu(ddq->d_rtb_softlimit);
659 if (hard == 0 || hard >= soft) { 628 if (hard == 0 || hard >= soft) {
660 ddq->d_rtb_hardlimit = cpu_to_be64(hard); 629 ddq->d_rtb_hardlimit = cpu_to_be64(hard);
@@ -667,10 +636,10 @@ xfs_qm_scall_setqlim(
667 xfs_debug(mp, "rtbhard %Ld < rtbsoft %Ld", hard, soft); 636 xfs_debug(mp, "rtbhard %Ld < rtbsoft %Ld", hard, soft);
668 } 637 }
669 638
670 hard = (newlim->d_fieldmask & FS_DQ_IHARD) ? 639 hard = (newlim->d_fieldmask & QC_INO_HARD) ?
671 (xfs_qcnt_t) newlim->d_ino_hardlimit : 640 (xfs_qcnt_t) newlim->d_ino_hardlimit :
672 be64_to_cpu(ddq->d_ino_hardlimit); 641 be64_to_cpu(ddq->d_ino_hardlimit);
673 soft = (newlim->d_fieldmask & FS_DQ_ISOFT) ? 642 soft = (newlim->d_fieldmask & QC_INO_SOFT) ?
674 (xfs_qcnt_t) newlim->d_ino_softlimit : 643 (xfs_qcnt_t) newlim->d_ino_softlimit :
675 be64_to_cpu(ddq->d_ino_softlimit); 644 be64_to_cpu(ddq->d_ino_softlimit);
676 if (hard == 0 || hard >= soft) { 645 if (hard == 0 || hard >= soft) {
@@ -687,12 +656,12 @@ xfs_qm_scall_setqlim(
687 /* 656 /*
688 * Update warnings counter(s) if requested 657 * Update warnings counter(s) if requested
689 */ 658 */
690 if (newlim->d_fieldmask & FS_DQ_BWARNS) 659 if (newlim->d_fieldmask & QC_SPC_WARNS)
691 ddq->d_bwarns = cpu_to_be16(newlim->d_bwarns); 660 ddq->d_bwarns = cpu_to_be16(newlim->d_spc_warns);
692 if (newlim->d_fieldmask & FS_DQ_IWARNS) 661 if (newlim->d_fieldmask & QC_INO_WARNS)
693 ddq->d_iwarns = cpu_to_be16(newlim->d_iwarns); 662 ddq->d_iwarns = cpu_to_be16(newlim->d_ino_warns);
694 if (newlim->d_fieldmask & FS_DQ_RTBWARNS) 663 if (newlim->d_fieldmask & QC_RT_SPC_WARNS)
695 ddq->d_rtbwarns = cpu_to_be16(newlim->d_rtbwarns); 664 ddq->d_rtbwarns = cpu_to_be16(newlim->d_rt_spc_warns);
696 665
697 if (id == 0) { 666 if (id == 0) {
698 /* 667 /*
@@ -702,24 +671,24 @@ xfs_qm_scall_setqlim(
702 * soft and hard limit values (already done, above), and 671 * soft and hard limit values (already done, above), and
703 * for warnings. 672 * for warnings.
704 */ 673 */
705 if (newlim->d_fieldmask & FS_DQ_BTIMER) { 674 if (newlim->d_fieldmask & QC_SPC_TIMER) {
706 q->qi_btimelimit = newlim->d_btimer; 675 q->qi_btimelimit = newlim->d_spc_timer;
707 ddq->d_btimer = cpu_to_be32(newlim->d_btimer); 676 ddq->d_btimer = cpu_to_be32(newlim->d_spc_timer);
708 } 677 }
709 if (newlim->d_fieldmask & FS_DQ_ITIMER) { 678 if (newlim->d_fieldmask & QC_INO_TIMER) {
710 q->qi_itimelimit = newlim->d_itimer; 679 q->qi_itimelimit = newlim->d_ino_timer;
711 ddq->d_itimer = cpu_to_be32(newlim->d_itimer); 680 ddq->d_itimer = cpu_to_be32(newlim->d_ino_timer);
712 } 681 }
713 if (newlim->d_fieldmask & FS_DQ_RTBTIMER) { 682 if (newlim->d_fieldmask & QC_RT_SPC_TIMER) {
714 q->qi_rtbtimelimit = newlim->d_rtbtimer; 683 q->qi_rtbtimelimit = newlim->d_rt_spc_timer;
715 ddq->d_rtbtimer = cpu_to_be32(newlim->d_rtbtimer); 684 ddq->d_rtbtimer = cpu_to_be32(newlim->d_rt_spc_timer);
716 } 685 }
717 if (newlim->d_fieldmask & FS_DQ_BWARNS) 686 if (newlim->d_fieldmask & QC_SPC_WARNS)
718 q->qi_bwarnlimit = newlim->d_bwarns; 687 q->qi_bwarnlimit = newlim->d_spc_warns;
719 if (newlim->d_fieldmask & FS_DQ_IWARNS) 688 if (newlim->d_fieldmask & QC_INO_WARNS)
720 q->qi_iwarnlimit = newlim->d_iwarns; 689 q->qi_iwarnlimit = newlim->d_ino_warns;
721 if (newlim->d_fieldmask & FS_DQ_RTBWARNS) 690 if (newlim->d_fieldmask & QC_RT_SPC_WARNS)
722 q->qi_rtbwarnlimit = newlim->d_rtbwarns; 691 q->qi_rtbwarnlimit = newlim->d_rt_spc_warns;
723 } else { 692 } else {
724 /* 693 /*
725 * If the user is now over quota, start the timelimit. 694 * If the user is now over quota, start the timelimit.
@@ -801,7 +770,7 @@ xfs_qm_log_quotaoff(
801 mp->m_sb.sb_qflags = (mp->m_qflags & ~(flags)) & XFS_MOUNT_QUOTA_ALL; 770 mp->m_sb.sb_qflags = (mp->m_qflags & ~(flags)) & XFS_MOUNT_QUOTA_ALL;
802 spin_unlock(&mp->m_sb_lock); 771 spin_unlock(&mp->m_sb_lock);
803 772
804 xfs_mod_sb(tp, XFS_SB_QFLAGS); 773 xfs_log_sb(tp);
805 774
806 /* 775 /*
807 * We have to make sure that the transaction is secure on disk before we 776 * We have to make sure that the transaction is secure on disk before we
@@ -824,7 +793,7 @@ xfs_qm_scall_getquota(
824 struct xfs_mount *mp, 793 struct xfs_mount *mp,
825 xfs_dqid_t id, 794 xfs_dqid_t id,
826 uint type, 795 uint type,
827 struct fs_disk_quota *dst) 796 struct qc_dqblk *dst)
828{ 797{
829 struct xfs_dquot *dqp; 798 struct xfs_dquot *dqp;
830 int error; 799 int error;
@@ -848,28 +817,25 @@ xfs_qm_scall_getquota(
848 } 817 }
849 818
850 memset(dst, 0, sizeof(*dst)); 819 memset(dst, 0, sizeof(*dst));
851 dst->d_version = FS_DQUOT_VERSION; 820 dst->d_spc_hardlimit =
852 dst->d_flags = xfs_qm_export_qtype_flags(dqp->q_core.d_flags); 821 XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_blk_hardlimit));
853 dst->d_id = be32_to_cpu(dqp->q_core.d_id); 822 dst->d_spc_softlimit =
854 dst->d_blk_hardlimit = 823 XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_blk_softlimit));
855 XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_blk_hardlimit));
856 dst->d_blk_softlimit =
857 XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_blk_softlimit));
858 dst->d_ino_hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit); 824 dst->d_ino_hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit);
859 dst->d_ino_softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit); 825 dst->d_ino_softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit);
860 dst->d_bcount = XFS_FSB_TO_BB(mp, dqp->q_res_bcount); 826 dst->d_space = XFS_FSB_TO_B(mp, dqp->q_res_bcount);
861 dst->d_icount = dqp->q_res_icount; 827 dst->d_ino_count = dqp->q_res_icount;
862 dst->d_btimer = be32_to_cpu(dqp->q_core.d_btimer); 828 dst->d_spc_timer = be32_to_cpu(dqp->q_core.d_btimer);
863 dst->d_itimer = be32_to_cpu(dqp->q_core.d_itimer); 829 dst->d_ino_timer = be32_to_cpu(dqp->q_core.d_itimer);
864 dst->d_iwarns = be16_to_cpu(dqp->q_core.d_iwarns); 830 dst->d_ino_warns = be16_to_cpu(dqp->q_core.d_iwarns);
865 dst->d_bwarns = be16_to_cpu(dqp->q_core.d_bwarns); 831 dst->d_spc_warns = be16_to_cpu(dqp->q_core.d_bwarns);
866 dst->d_rtb_hardlimit = 832 dst->d_rt_spc_hardlimit =
867 XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_rtb_hardlimit)); 833 XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_rtb_hardlimit));
868 dst->d_rtb_softlimit = 834 dst->d_rt_spc_softlimit =
869 XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_rtb_softlimit)); 835 XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_rtb_softlimit));
870 dst->d_rtbcount = XFS_FSB_TO_BB(mp, dqp->q_res_rtbcount); 836 dst->d_rt_space = XFS_FSB_TO_B(mp, dqp->q_res_rtbcount);
871 dst->d_rtbtimer = be32_to_cpu(dqp->q_core.d_rtbtimer); 837 dst->d_rt_spc_timer = be32_to_cpu(dqp->q_core.d_rtbtimer);
872 dst->d_rtbwarns = be16_to_cpu(dqp->q_core.d_rtbwarns); 838 dst->d_rt_spc_warns = be16_to_cpu(dqp->q_core.d_rtbwarns);
873 839
874 /* 840 /*
875 * Internally, we don't reset all the timers when quota enforcement 841 * Internally, we don't reset all the timers when quota enforcement
@@ -882,23 +848,23 @@ xfs_qm_scall_getquota(
882 dqp->q_core.d_flags == XFS_DQ_GROUP) || 848 dqp->q_core.d_flags == XFS_DQ_GROUP) ||
883 (!XFS_IS_PQUOTA_ENFORCED(mp) && 849 (!XFS_IS_PQUOTA_ENFORCED(mp) &&
884 dqp->q_core.d_flags == XFS_DQ_PROJ)) { 850 dqp->q_core.d_flags == XFS_DQ_PROJ)) {
885 dst->d_btimer = 0; 851 dst->d_spc_timer = 0;
886 dst->d_itimer = 0; 852 dst->d_ino_timer = 0;
887 dst->d_rtbtimer = 0; 853 dst->d_rt_spc_timer = 0;
888 } 854 }
889 855
890#ifdef DEBUG 856#ifdef DEBUG
891 if (((XFS_IS_UQUOTA_ENFORCED(mp) && dst->d_flags == FS_USER_QUOTA) || 857 if (((XFS_IS_UQUOTA_ENFORCED(mp) && type == XFS_DQ_USER) ||
892 (XFS_IS_GQUOTA_ENFORCED(mp) && dst->d_flags == FS_GROUP_QUOTA) || 858 (XFS_IS_GQUOTA_ENFORCED(mp) && type == XFS_DQ_GROUP) ||
893 (XFS_IS_PQUOTA_ENFORCED(mp) && dst->d_flags == FS_PROJ_QUOTA)) && 859 (XFS_IS_PQUOTA_ENFORCED(mp) && type == XFS_DQ_PROJ)) &&
894 dst->d_id != 0) { 860 id != 0) {
895 if ((dst->d_bcount > dst->d_blk_softlimit) && 861 if ((dst->d_space > dst->d_spc_softlimit) &&
896 (dst->d_blk_softlimit > 0)) { 862 (dst->d_spc_softlimit > 0)) {
897 ASSERT(dst->d_btimer != 0); 863 ASSERT(dst->d_spc_timer != 0);
898 } 864 }
899 if ((dst->d_icount > dst->d_ino_softlimit) && 865 if ((dst->d_ino_count > dst->d_ino_softlimit) &&
900 (dst->d_ino_softlimit > 0)) { 866 (dst->d_ino_softlimit > 0)) {
901 ASSERT(dst->d_itimer != 0); 867 ASSERT(dst->d_ino_timer != 0);
902 } 868 }
903 } 869 }
904#endif 870#endif
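
/*
 * Unit-change worked example (hypothetical 4k-block filesystem):
 * fs_disk_quota carried space values in 512-byte basic blocks, while
 * qc_dqblk carries plain bytes, hence XFS_FSB_TO_BB -> XFS_FSB_TO_B:
 *
 *      hard = 16 filesystem blocks
 *      XFS_FSB_TO_BB(mp, 16) == 128    (512-byte blocks, old interface)
 *      XFS_FSB_TO_B(mp, 16)  == 65536  (bytes, new interface)
 */
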
@@ -908,26 +874,6 @@ out_put:
908} 874}
909 875
910STATIC uint 876STATIC uint
911xfs_qm_export_qtype_flags(
912 uint flags)
913{
914 /*
915 * Can't be more than one, or none.
916 */
917 ASSERT((flags & (FS_PROJ_QUOTA | FS_USER_QUOTA)) !=
918 (FS_PROJ_QUOTA | FS_USER_QUOTA));
919 ASSERT((flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)) !=
920 (FS_PROJ_QUOTA | FS_GROUP_QUOTA));
921 ASSERT((flags & (FS_USER_QUOTA | FS_GROUP_QUOTA)) !=
922 (FS_USER_QUOTA | FS_GROUP_QUOTA));
923 ASSERT((flags & (FS_PROJ_QUOTA|FS_USER_QUOTA|FS_GROUP_QUOTA)) != 0);
924
925 return (flags & XFS_DQ_USER) ?
926 FS_USER_QUOTA : (flags & XFS_DQ_PROJ) ?
927 FS_PROJ_QUOTA : FS_GROUP_QUOTA;
928}
929
930STATIC uint
931xfs_qm_export_flags( 877xfs_qm_export_flags(
932 uint flags) 878 uint flags)
933{ 879{
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index 7542bbeca6a1..6923905ab33d 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -64,19 +64,10 @@ xfs_fs_get_xstatev(
64 return xfs_qm_scall_getqstatv(mp, fqs); 64 return xfs_qm_scall_getqstatv(mp, fqs);
65} 65}
66 66
67STATIC int 67static unsigned int
68xfs_fs_set_xstate( 68xfs_quota_flags(unsigned int uflags)
69 struct super_block *sb,
70 unsigned int uflags,
71 int op)
72{ 69{
73 struct xfs_mount *mp = XFS_M(sb); 70 unsigned int flags = 0;
74 unsigned int flags = 0;
75
76 if (sb->s_flags & MS_RDONLY)
77 return -EROFS;
78 if (op != Q_XQUOTARM && !XFS_IS_QUOTA_RUNNING(mp))
79 return -ENOSYS;
80 71
81 if (uflags & FS_QUOTA_UDQ_ACCT) 72 if (uflags & FS_QUOTA_UDQ_ACCT)
82 flags |= XFS_UQUOTA_ACCT; 73 flags |= XFS_UQUOTA_ACCT;
@@ -91,16 +82,39 @@ xfs_fs_set_xstate(
91 if (uflags & FS_QUOTA_PDQ_ENFD) 82 if (uflags & FS_QUOTA_PDQ_ENFD)
92 flags |= XFS_PQUOTA_ENFD; 83 flags |= XFS_PQUOTA_ENFD;
93 84
94 switch (op) { 85 return flags;
95 case Q_XQUOTAON: 86}
96 return xfs_qm_scall_quotaon(mp, flags); 87
97 case Q_XQUOTAOFF: 88STATIC int
98 if (!XFS_IS_QUOTA_ON(mp)) 89xfs_quota_enable(
99 return -EINVAL; 90 struct super_block *sb,
100 return xfs_qm_scall_quotaoff(mp, flags); 91 unsigned int uflags)
101 } 92{
93 struct xfs_mount *mp = XFS_M(sb);
94
95 if (sb->s_flags & MS_RDONLY)
96 return -EROFS;
97 if (!XFS_IS_QUOTA_RUNNING(mp))
98 return -ENOSYS;
99
100 return xfs_qm_scall_quotaon(mp, xfs_quota_flags(uflags));
101}
102
103STATIC int
104xfs_quota_disable(
105 struct super_block *sb,
106 unsigned int uflags)
107{
108 struct xfs_mount *mp = XFS_M(sb);
109
110 if (sb->s_flags & MS_RDONLY)
111 return -EROFS;
112 if (!XFS_IS_QUOTA_RUNNING(mp))
113 return -ENOSYS;
114 if (!XFS_IS_QUOTA_ON(mp))
115 return -EINVAL;
102 116
103 return -EINVAL; 117 return xfs_qm_scall_quotaoff(mp, xfs_quota_flags(uflags));
104} 118}
105 119
106STATIC int 120STATIC int
@@ -131,7 +145,7 @@ STATIC int
131xfs_fs_get_dqblk( 145xfs_fs_get_dqblk(
132 struct super_block *sb, 146 struct super_block *sb,
133 struct kqid qid, 147 struct kqid qid,
134 struct fs_disk_quota *fdq) 148 struct qc_dqblk *qdq)
135{ 149{
136 struct xfs_mount *mp = XFS_M(sb); 150 struct xfs_mount *mp = XFS_M(sb);
137 151
@@ -141,14 +155,14 @@ xfs_fs_get_dqblk(
141 return -ESRCH; 155 return -ESRCH;
142 156
143 return xfs_qm_scall_getquota(mp, from_kqid(&init_user_ns, qid), 157 return xfs_qm_scall_getquota(mp, from_kqid(&init_user_ns, qid),
144 xfs_quota_type(qid.type), fdq); 158 xfs_quota_type(qid.type), qdq);
145} 159}
146 160
147STATIC int 161STATIC int
148xfs_fs_set_dqblk( 162xfs_fs_set_dqblk(
149 struct super_block *sb, 163 struct super_block *sb,
150 struct kqid qid, 164 struct kqid qid,
151 struct fs_disk_quota *fdq) 165 struct qc_dqblk *qdq)
152{ 166{
153 struct xfs_mount *mp = XFS_M(sb); 167 struct xfs_mount *mp = XFS_M(sb);
154 168
@@ -160,13 +174,14 @@ xfs_fs_set_dqblk(
160 return -ESRCH; 174 return -ESRCH;
161 175
162 return xfs_qm_scall_setqlim(mp, from_kqid(&init_user_ns, qid), 176 return xfs_qm_scall_setqlim(mp, from_kqid(&init_user_ns, qid),
163 xfs_quota_type(qid.type), fdq); 177 xfs_quota_type(qid.type), qdq);
164} 178}
165 179
166const struct quotactl_ops xfs_quotactl_operations = { 180const struct quotactl_ops xfs_quotactl_operations = {
167 .get_xstatev = xfs_fs_get_xstatev, 181 .get_xstatev = xfs_fs_get_xstatev,
168 .get_xstate = xfs_fs_get_xstate, 182 .get_xstate = xfs_fs_get_xstate,
169 .set_xstate = xfs_fs_set_xstate, 183 .quota_enable = xfs_quota_enable,
184 .quota_disable = xfs_quota_disable,
170 .rm_xquota = xfs_fs_rm_xquota, 185 .rm_xquota = xfs_fs_rm_xquota,
171 .get_dqblk = xfs_fs_get_dqblk, 186 .get_dqblk = xfs_fs_get_dqblk,
172 .set_dqblk = xfs_fs_set_dqblk, 187 .set_dqblk = xfs_fs_set_dqblk,
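
/*
 * Sketch of the VFS side this targets (assumed): the quotactl core now
 * dispatches Q_XQUOTAON/Q_XQUOTAOFF to the new callbacks instead of
 * funnelling both through ->set_xstate with an op code.
 */
static int quota_enable(struct super_block *sb, void __user *addr)
{
        unsigned int flags;

        if (copy_from_user(&flags, addr, sizeof(flags)))
                return -EFAULT;
        if (!sb->s_qcop->quota_enable)
                return -ENOSYS;
        return sb->s_qcop->quota_enable(sb, flags);
}
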
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 19cbda196369..8fcc4ccc5c79 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -685,7 +685,7 @@ xfs_blkdev_get(
685 mp); 685 mp);
686 if (IS_ERR(*bdevp)) { 686 if (IS_ERR(*bdevp)) {
687 error = PTR_ERR(*bdevp); 687 error = PTR_ERR(*bdevp);
688 xfs_warn(mp, "Invalid device [%s], error=%d\n", name, error); 688 xfs_warn(mp, "Invalid device [%s], error=%d", name, error);
689 } 689 }
690 690
691 return error; 691 return error;
@@ -1111,6 +1111,11 @@ xfs_fs_statfs(
1111 statp->f_files, 1111 statp->f_files,
1112 mp->m_maxicount); 1112 mp->m_maxicount);
1113 1113
1114 /* If sb_icount overshot maxicount, report actual allocation */
1115 statp->f_files = max_t(typeof(statp->f_files),
1116 statp->f_files,
1117 sbp->sb_icount);
1118
1114 /* make sure statp->f_ffree does not underflow */ 1119 /* make sure statp->f_ffree does not underflow */
1115 ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree); 1120 ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree);
1116 statp->f_ffree = max_t(__int64_t, ffree, 0); 1121 statp->f_ffree = max_t(__int64_t, ffree, 0);
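
/*
 * Worked example with hypothetical numbers: m_maxicount = 100,
 * sb_icount = 120 after an overshoot, sb_ifree = 5:
 *
 *      f_files = max(100, 120) = 120
 *      f_ffree = 120 - (120 - 5) = 5   (instead of clamping -15 to 0)
 */
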
@@ -1257,13 +1262,13 @@ xfs_fs_remount(
 		 * If this is the first remount to writeable state we
 		 * might have some superblock changes to update.
 		 */
-		if (mp->m_update_flags) {
-			error = xfs_mount_log_sb(mp, mp->m_update_flags);
+		if (mp->m_update_sb) {
+			error = xfs_sync_sb(mp, false);
 			if (error) {
 				xfs_warn(mp, "failed to write sb changes");
 				return error;
 			}
-			mp->m_update_flags = 0;
+			mp->m_update_sb = false;
 		}
 
 		/*
@@ -1293,8 +1298,9 @@ xfs_fs_remount(
 
 /*
  * Second stage of a freeze. The data is already frozen so we only
- * need to take care of the metadata. Once that's done write a dummy
- * record to dirty the log in case of a crash while frozen.
+ * need to take care of the metadata. Once that's done sync the superblock
+ * to the log to dirty it in case of a crash while frozen. This ensures that we
+ * will recover the unlinked inode lists on the next mount.
  */
 STATIC int
 xfs_fs_freeze(
@@ -1304,7 +1310,7 @@ xfs_fs_freeze(
 
 	xfs_save_resvblks(mp);
 	xfs_quiesce_attr(mp);
-	return xfs_fs_log_dummy(mp);
+	return xfs_sync_sb(mp, true);
 }
 
 STATIC int
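
xfs_sync_sb() now serves both the remount path (asynchronously, above) and the freeze path (synchronously, here), replacing the dummy log record. A plausible shape for such a helper, hedged: the reservation name (tr_sb) and the flag arguments to cancel/commit below are assumptions in the style of the era's internal APIs, and the real implementation lives outside this diff:

int
xfs_sync_sb(
	struct xfs_mount	*mp,
	bool			wait)
{
	struct xfs_trans	*tp;
	int			error;

	tp = xfs_trans_alloc(mp, XFS_TRANS_SB_CHANGE);
	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0);
	if (error) {
		xfs_trans_cancel(tp, 0);
		return error;
	}

	xfs_log_sb(tp);			/* log the whole superblock */
	if (wait)			/* freeze wants it on disk */
		xfs_trans_set_sync(tp);
	return xfs_trans_commit(tp, 0);
}
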
@@ -1531,7 +1537,7 @@ xfs_fs_mount(
 static long
 xfs_fs_nr_cached_objects(
 	struct super_block	*sb,
-	int			nid)
+	struct shrink_control	*sc)
 {
 	return xfs_reclaim_inodes_count(XFS_M(sb));
 }
@@ -1539,10 +1545,9 @@ xfs_fs_nr_cached_objects(
 static long
 xfs_fs_free_cached_objects(
 	struct super_block	*sb,
-	long			nr_to_scan,
-	int			nid)
+	struct shrink_control	*sc)
 {
-	return xfs_reclaim_inodes_nr(XFS_M(sb), nr_to_scan);
+	return xfs_reclaim_inodes_nr(XFS_M(sb), sc->nr_to_scan);
 }
 
 static const struct super_operations xfs_super_operations = {
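
The two hunks above switch the per-sb cache shrinker hooks from the old (nr_to_scan, nid) calling convention to taking struct shrink_control directly, which bundles nr_to_scan, the NUMA node id and the allocation context into one argument. A minimal sketch of a filesystem wiring up the same pair of super_operations under the new convention; all examplefs_* names are hypothetical stand-ins:

#include <linux/fs.h>
#include <linux/shrinker.h>

/* Hypothetical per-fs cache accounting, declared for the sketch. */
extern long examplefs_count_reclaimable(struct super_block *sb);
extern long examplefs_reclaim(struct super_block *sb, long nr);

static long
examplefs_nr_cached_objects(
	struct super_block	*sb,
	struct shrink_control	*sc)
{
	/* Report how many objects could be freed; sc->nid is available
	 * here if the cache keeps per-node counts. */
	return examplefs_count_reclaimable(sb);
}

static long
examplefs_free_cached_objects(
	struct super_block	*sb,
	struct shrink_control	*sc)
{
	/* Free at most sc->nr_to_scan objects, return the number freed. */
	return examplefs_reclaim(sb, sc->nr_to_scan);
}

static const struct super_operations examplefs_super_operations = {
	.nr_cached_objects	= examplefs_nr_cached_objects,
	.free_cached_objects	= examplefs_free_cached_objects,
};
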
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index 1743b9f8e23d..a0c8067cea6f 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -149,24 +149,6 @@ static struct ctl_table xfs_table[] = {
 		.extra2		= &xfs_params.inherit_noatim.max
 	},
 	{
-		.procname	= "xfsbufd_centisecs",
-		.data		= &xfs_params.xfs_buf_timer.val,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &xfs_params.xfs_buf_timer.min,
-		.extra2		= &xfs_params.xfs_buf_timer.max
-	},
-	{
-		.procname	= "age_buffer_centisecs",
-		.data		= &xfs_params.xfs_buf_age.val,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &xfs_params.xfs_buf_age.min,
-		.extra2		= &xfs_params.xfs_buf_age.max
-	},
-	{
 		.procname	= "inherit_nosymlinks",
 		.data		= &xfs_params.inherit_nosym.val,
 		.maxlen		= sizeof(int),
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index fa3135b9bf04..eb90cd59a0ec 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -472,6 +472,7 @@ xfs_trans_apply_sb_deltas(
 		whole = 1;
 	}
 
+	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
 	if (whole)
 		/*
 		 * Log the whole thing, the fields are noncontiguous.
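
The added xfs_trans_buf_set_type() call tags the superblock buffer as XFS_BLFT_SB_BUF before it is logged, so log recovery knows which verifier to apply when replaying the buffer. The usual tag-before-log pattern, sketched with the same calls this function uses (the whole-buffer byte range mirrors the "log the whole thing" branch above):

	struct xfs_buf	*bp;

	bp = xfs_trans_getsb(tp, mp, 0);	/* grab the sb buffer */
	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
	/* log the whole on-disk superblock; the fields are noncontiguous */
	xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb) - 1);
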
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 0a4d4ab6d9a9..75798412859a 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -327,9 +327,10 @@ xfs_trans_read_buf_map(
 		return -EIO;
 	}
 
-	if (tp)
+	if (tp) {
 		_xfs_trans_bjoin(tp, bp, 1);
-	trace_xfs_trans_read_buf(bp->b_fspriv);
+		trace_xfs_trans_read_buf(bp->b_fspriv);
+	}
 	*bpp = bp;
 	return 0;
 